/**
    Computes the residual norm ||A x - b|| for one or several right-hand sides.

    If b has as many rows as A, one norm over the first A.num_rows entries of
    the residual is stored in *res. Otherwise, if the number of entries of b
    is a multiple of A.num_rows, one norm per chunk of A.num_rows entries is
    written to res[0..num_vecs-1].

    @param[in]  A      system matrix
    @param[in]  b      right-hand side(s)
    @param[in]  x      solution approximation(s)
    @param[out] res    receives the residual norm(s); the caller has to
                       provide room for one value per chunk
    @param[in]  queue  queue handed to the sparse helper routines

    @return 0 on success, MAGMA_ERR_NOT_SUPPORTED if the dimensions of A and b
            do not fit together; otherwise the value left in info by a failing
            CHECK-wrapped call (CHECK is defined outside this block).
*/
extern "C" magma_int_t
magma_sresidual(
    magma_s_matrix A,
    magma_s_matrix b,
    magma_s_matrix x,
    float *res,
    magma_queue_t queue )
{
    magma_int_t info = 0;

    // remember the kernel stream of the old dense routines; it is restored
    // right before returning
    magma_queue_t orig_queue = NULL;
    magmablasGetKernelStream( &orig_queue );

    // some useful variables
    float zero = MAGMA_S_ZERO, one = MAGMA_S_ONE, mone = MAGMA_S_NEG_ONE;
    magma_int_t dofs = A.num_rows;
    // computed only once A.num_rows is known to be nonzero, so that an empty
    // matrix cannot trigger an integer division by zero
    magma_int_t num_vecs = 0;

    magma_s_matrix r = {Magma_CSR};

    if ( A.num_rows == b.num_rows ) {
        CHECK( magma_svinit( &r, Magma_DEV, A.num_rows, b.num_cols, zero, queue ));
        CHECK( magma_s_spmv( one, A, x, zero, r, queue ));      // r = A x
        magma_saxpy( dofs, mone, b.dval, 1, r.dval, 1 );        // r = r - b
        *res = magma_snrm2( dofs, r.dval, 1 );                  // res = ||r||
    }
    else if ( A.num_rows != 0 && (b.num_rows*b.num_cols) % A.num_rows == 0 ) {
        num_vecs = b.num_rows*b.num_cols / A.num_rows;
        CHECK( magma_svinit( &r, Magma_DEV, b.num_rows, b.num_cols, zero, queue ));
        CHECK( magma_s_spmv( one, A, x, zero, r, queue ));      // r = A x
        // b(i) and r(i) are macros defined outside this block; presumably
        // they address the i-th chunk of dofs entries -- confirm at the
        // macro definition
        for( magma_int_t i=0; i<num_vecs; i++ ) {
            magma_saxpy( dofs, mone, b(i), 1, r(i), 1 );        // r = r - b
            res[i] = magma_snrm2( dofs, r(i), 1 );              // res = ||r||
        }
    }
    else {
        printf("error: dimensions do not match.\n");
        info = MAGMA_ERR_NOT_SUPPORTED;
    }

cleanup:
    magma_smfree( &r, queue );
    magmablasSetKernelStream( orig_queue );
    return info;
}
/**
    Runs solver_par->maxiter sweeps of the fixed-point update
        x <- c - M * x.

    Every sweep writes the new iterate into a scratch vector and then swaps
    the scratch vector with *x, so *x always holds the most recent iterate.
    Whatever buffer ends up in the scratch slot is released on exit.

    @param[in]      M           iteration matrix
    @param[in]      c           constant vector added in every sweep
    @param[in,out]  x           current iterate, overwritten by the result
    @param[in,out]  solver_par  maxiter is read, info is written
    @param[in]      queue       queue handed to all called routines

    @return 0 on success, otherwise the value left in info by a failing
            CHECK-wrapped call.
*/
extern "C" magma_int_t
magma_sjacobiiter(
    magma_s_matrix M,
    magma_s_matrix c,
    magma_s_matrix *x,
    magma_s_solver_par *solver_par,
    magma_queue_t queue )
{
    magma_int_t info = 0;

    // scalar constants
    float zero_val    = MAGMA_S_ZERO;
    float one_val     = MAGMA_S_ONE;
    float neg_one_val = MAGMA_S_NEG_ONE;

    magma_int_t len = M.num_rows*x->num_cols;
    magma_s_matrix t = {Magma_CSR}, held = {Magma_CSR};
    magma_int_t sweep = 0;

    CHECK( magma_svinit( &t, Magma_DEV, M.num_rows, x->num_cols, zero_val, queue ));

    while ( sweep < solver_par->maxiter ) {
        // t = - M * x
        CHECK( magma_s_spmv( neg_one_val, M, *x, zero_val, t, queue ));
        // t = t + c
        magma_saxpy( len, one_val, c.dval, 1, t.dval, 1, queue );

        // exchange the two vectors: *x takes the fresh iterate, t becomes
        // the scratch space of the next sweep
        held = *x;
        *x   = t;
        t    = held;

        sweep++;
    }

cleanup:
    magma_smfree( &t, queue );
    solver_par->info = info;
    return info;
}   /* magma_sjacobiiter */
/**
    Computes the norm of the entries [start, end) of the residual A x - b,
    for one or several right-hand sides.

    If b has as many rows as A, a single value is stored in *res. Otherwise,
    if the number of entries of b is a multiple of A.num_rows, one value per
    chunk of A.num_rows entries is written to res[0..num_vecs-1].

    @param[in]  start  first residual entry that enters the norm
    @param[in]  end    one past the last residual entry that enters the norm
    @param[in]  A      system matrix
    @param[in]  b      right-hand side(s)
    @param[in]  x      solution approximation(s)
    @param[out] res    receives the norm(s); the caller has to provide room
                       for one value per chunk
    @param[in]  queue  queue handed to all called routines

    @return 0 on success, MAGMA_ERR_NOT_SUPPORTED if the dimensions of A and b
            do not fit together; otherwise the value left in info by a failing
            CHECK-wrapped call (CHECK is defined outside this block).
*/
extern "C" magma_int_t
magma_sresidual_slice(
    magma_int_t start,
    magma_int_t end,
    magma_s_matrix A,
    magma_s_matrix b,
    magma_s_matrix x,
    float *res,
    magma_queue_t queue )
{
    magma_int_t info = 0;

    // constants
    const float c_zero    = MAGMA_S_ZERO;
    const float c_one     = MAGMA_S_ONE;
    const float c_neg_one = MAGMA_S_NEG_ONE;

    // some useful variables
    magma_int_t dofs = A.num_rows;
    // computed only once A.num_rows is known to be nonzero, so that an empty
    // matrix cannot trigger an integer division by zero
    magma_int_t num_vecs = 0;

    magma_s_matrix r = {Magma_CSR};

    if ( A.num_rows == b.num_rows ) {
        CHECK( magma_svinit( &r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
        CHECK( magma_s_spmv( c_one, A, x, c_zero, r, queue ));              // r = A x
        magma_saxpy( dofs, c_neg_one, b.dval, 1, r.dval, 1, queue );        // r = r - b
        *res = magma_snrm2( end-start, r.dval+start, 1, queue );            // res = ||r(start:end)||
    }
    else if ( A.num_rows != 0 && (b.num_rows*b.num_cols) % A.num_rows == 0 ) {
        num_vecs = b.num_rows*b.num_cols / A.num_rows;
        CHECK( magma_svinit( &r, Magma_DEV, b.num_rows, b.num_cols, c_zero, queue ));
        CHECK( magma_s_spmv( c_one, A, x, c_zero, r, queue ));              // r = A x
        // b(i) and r(i) are macros defined outside this block; presumably
        // they address the i-th chunk of dofs entries -- confirm at the
        // macro definition
        for( magma_int_t i=0; i < num_vecs; i++ ) {
            magma_saxpy( dofs, c_neg_one, b(i), 1, r(i), 1, queue );        // r = r - b
            res[i] = magma_snrm2( end-start, r(i)+start, 1, queue );        // res = ||r(start:end)||
        }
    }
    else {
        printf("error: dimensions do not match.\n");
        info = MAGMA_ERR_NOT_SUPPORTED;
    }

cleanup:
    magma_smfree( &r, queue );
    return info;
}
/**
    Builds the vector c with entries b[k] / d[k].

    When b.memory_location is Magma_CPU the quotient is formed in a plain
    loop over the first b.num_rows entries and then transferred into *c.
    When it is Magma_DEV the work is delegated to
    magma_sjacobisetup_vector_gpu, which receives *c by value together with a
    zero-initialised scratch vector. Any other location leaves *c untouched.

    @param[in]      b      right-hand side
    @param[in]      d      vector the entries of b are divided by
    @param[in,out]  c      result vector
    @param[in]      queue  queue handed to all called routines

    @return 0 on success, otherwise the value left in info by a failing
            CHECK-wrapped call.
*/
extern "C" magma_int_t
magma_sjacobisetup_vector(
    magma_s_matrix b,
    magma_s_matrix d,
    magma_s_matrix *c,
    magma_queue_t queue )
{
    magma_int_t info = 0;

    magma_s_matrix d_host   = {Magma_CSR};
    magma_s_matrix quotient = {Magma_CSR};
    magma_s_matrix b_host   = {Magma_CSR};
    magma_s_matrix scratch  = {Magma_CSR};

    if ( b.memory_location == Magma_CPU ) {
        CHECK( magma_svinit( &quotient, Magma_CPU, b.num_rows, b.num_cols, MAGMA_S_ZERO, queue ));
        CHECK( magma_smtransfer( b, &b_host, b.memory_location, Magma_CPU, queue ));
        CHECK( magma_smtransfer( d, &d_host, b.memory_location, Magma_CPU, queue ));

        for( magma_int_t k = 0; k < b.num_rows; k++ ) {
            quotient.val[k] = b_host.val[k] / d_host.val[k];
        }

        CHECK( magma_smtransfer( quotient, c, Magma_CPU, b.memory_location, queue ));
    }
    else if ( b.memory_location == Magma_DEV ) {
        // zero-filled helper vector required by the GPU routine
        CHECK( magma_svinit( &scratch, Magma_DEV, b.num_rows, b.num_cols, MAGMA_S_ZERO, queue ));
        CHECK( magma_sjacobisetup_vector_gpu( b.num_rows, b, d, *c, &scratch, queue ));
    }

cleanup:
    magma_smfree( &scratch, queue );
    magma_smfree( &d_host, queue );
    magma_smfree( &quotient, queue );
    magma_smfree( &b_host, queue );
    return info;
}
/**
    Transposes the dense block stored in x into *y.

    For x.memory_location == Magma_DEV a fresh vector y with the dimensions
    of x is created, magmablas_stranspose is called and y->major is set to
    the opposite of x.major. For every other location x is transferred to
    Magma_DEV, this routine is applied to the copy, and the result is
    transferred back to Magma_CPU.

    @param[in]   x      input block
    @param[out]  y      transposed block; released again if an error occurred
    @param[in]   queue  queue handed to the sparse helper routines

    @return 0 on success, otherwise the value left in info by a failing
            CHECK-wrapped call.
*/
extern "C" magma_int_t
magma_svtranspose(
    magma_s_matrix x,
    magma_s_matrix *y,
    magma_queue_t queue )
{
    magma_int_t info = 0;

    magma_int_t rows = x.num_rows;
    magma_int_t cols = x.num_cols;

    // remember the kernel stream of the old dense routines; restored on exit
    magma_queue_t orig_queue = NULL;
    magmablasGetKernelStream( &orig_queue );

    magma_s_matrix x_dev = {Magma_CSR}, y_dev = {Magma_CSR};

    if ( x.memory_location != Magma_DEV ) {
        // round trip: copy over, transpose there, copy the result back
        CHECK( magma_smtransfer( x, &x_dev, Magma_CPU, Magma_DEV, queue ));
        CHECK( magma_svtranspose( x_dev, &y_dev, queue ));
        CHECK( magma_smtransfer( y_dev, y, Magma_DEV, Magma_CPU, queue ));
    }
    else {
        CHECK( magma_svinit( y, Magma_DEV, x.num_rows, x.num_cols, MAGMA_S_ZERO, queue ));
        y->num_rows     = x.num_rows;
        y->num_cols     = x.num_cols;
        y->storage_type = x.storage_type;

        if ( x.major == MagmaColMajor ) {
            y->major = MagmaRowMajor;
            magmablas_stranspose( rows, cols, x.val, rows, y->val, cols );
        }
        else {
            y->major = MagmaColMajor;
            magmablas_stranspose( cols, rows, x.val, cols, y->val, rows );
        }
    }

cleanup:
    if ( info != 0 ) {
        magma_smfree( y, queue );
    }
    magma_smfree( &x_dev, queue );
    magma_smfree( &y_dev, queue );
    magmablasSetKernelStream( orig_queue );
    return info;
}
/**
    Solves A x = b with the Conjugate Gradient Squared iteration.

    *x is used as the initial guess and is overwritten with the final
    iterate. The iteration stops when res/||b|| <= rtol or res <= atol, when
    rho becomes NaN/Inf, or after solver_par->maxiter iterations.

    @param[in]      A           system matrix
    @param[in]      b           right-hand side
    @param[in,out]  x           initial guess on entry, approximation on exit
    @param[in,out]  solver_par  reads rtol, atol, maxiter, verbose; receives
                                solver, numiter, spmv_count, init_res,
                                iter_res, final_res, runtime, info and, for
                                verbose > 0, res_vec / timing samples
    @param[in]      queue       queue handed to all called routines

    @return MAGMA_SUCCESS, MAGMA_SLOW_CONVERGENCE, MAGMA_DIVERGENCE, or the
            value left in info by a failing CHECK-wrapped call. The same
            value is stored in solver_par->info.
*/
extern "C" magma_int_t
magma_scgs(
    magma_s_matrix A,
    magma_s_matrix b,
    magma_s_matrix *x,
    magma_s_solver_par *solver_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;

    // prepare solver feedback
    solver_par->solver = Magma_CGS;
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;

    // constants
    const float c_zero    = MAGMA_S_ZERO;
    const float c_one     = MAGMA_S_ONE;
    const float c_neg_one = MAGMA_S_NEG_ONE;

    // solver variables
    float nom0, r0, res=0, nomb;
    float rho, rho_l = c_one, alpha, beta;

    magma_int_t dofs = A.num_rows* b.num_cols;

    // GPU workspace
    // NOTE(review): v, p_hat, q_hat and u_hat are allocated but never
    // referenced by the iteration below.
    magma_s_matrix r={Magma_CSR}, rt={Magma_CSR}, r_tld={Magma_CSR},
                   p={Magma_CSR}, q={Magma_CSR}, u={Magma_CSR}, v={Magma_CSR},
                   t={Magma_CSR}, p_hat={Magma_CSR}, q_hat={Magma_CSR},
                   u_hat={Magma_CSR}, v_hat={Magma_CSR};
    CHECK( magma_svinit( &r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &rt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &r_tld,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &p, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &p_hat, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &q, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &q_hat, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &u, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &u_hat, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &v, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &v_hat, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &t, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));

    // solver setup
    CHECK( magma_sresidualvec( A, b, *x, &r, &nom0, queue));
    magma_scopy( dofs, r.dval, 1, r_tld.dval, 1, queue );            // r_tld = r

    solver_par->init_res = nom0;
    nomb = magma_snrm2( dofs, b.dval, 1, queue );
    if ( nomb == 0.0 ){
        nomb=1.0;       // avoids dividing by zero in the relative criterion
    }
    if ( (r0 = nomb * solver_par->rtol) < ATOLERANCE ){
        r0 = ATOLERANCE;
    }
    solver_par->final_res = solver_par->init_res;
    solver_par->iter_res = solver_par->init_res;
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = (real_Double_t)nom0;
        solver_par->timing[0] = 0.0;
    }
    // the initial guess is already good enough
    if ( nom0 < r0 ) {
        info = MAGMA_SUCCESS;
        goto cleanup;
    }

    //Chronometry
    // (declared without initialiser: CHECK may jump past these to cleanup)
    real_Double_t tempo1, tempo2;
    tempo1 = magma_sync_wtime( queue );

    solver_par->numiter = 0;
    solver_par->spmv_count = 0;
    // start iteration
    do
    {
        solver_par->numiter++;

        rho = magma_sdot( dofs, r.dval, 1, r_tld.dval, 1, queue );   // rho = < r, r_tld >
        if( magma_s_isnan_inf( rho ) ){
            info = MAGMA_DIVERGENCE;
            break;
        }

        if ( solver_par->numiter > 1 ) {
            // direction vectors:
            //   u = r + beta*q
            //   p = u + beta*( q + beta*p )
            beta = rho / rho_l;
            magma_scopy( dofs, r.dval, 1, u.dval, 1, queue );          // u = r
            magma_saxpy( dofs, beta, q.dval, 1, u.dval, 1, queue );    // u = u + beta q
            magma_sscal( dofs, beta, p.dval, 1, queue );               // p = beta*p
            magma_saxpy( dofs, c_one, q.dval, 1, p.dval, 1, queue );   // p = q + beta*p
            magma_sscal( dofs, beta, p.dval, 1, queue );               // p = beta*(q + beta*p)
            magma_saxpy( dofs, c_one, u.dval, 1, p.dval, 1, queue );   // p = u + beta*(q + beta*p)
        }
        else{
            magma_scopy( dofs, r.dval, 1, u.dval, 1, queue );          // u = r
            magma_scopy( dofs, r.dval, 1, p.dval, 1, queue );          // p = r
        }

        CHECK( magma_s_spmv( c_one, A, p, c_zero, v_hat, queue ));     // v_hat = A p
        solver_par->spmv_count++;

        alpha = rho / magma_sdot( dofs, r_tld.dval, 1, v_hat.dval, 1, queue );

        magma_scopy( dofs, u.dval, 1, q.dval, 1, queue );              // q = u
        magma_saxpy( dofs, -alpha, v_hat.dval, 1, q.dval, 1, queue );  // q = u - alpha v_hat

        magma_scopy( dofs, u.dval, 1, t.dval, 1, queue );              // t = u
        magma_saxpy( dofs, c_one, q.dval, 1, t.dval, 1, queue );       // t = u + q

        CHECK( magma_s_spmv( c_one, A, t, c_zero, rt, queue ));        // rt = A t
        solver_par->spmv_count++;

        magma_saxpy( dofs, c_neg_one*alpha, rt.dval, 1, r.dval, 1, queue ); // r = r - alpha*rt
        magma_saxpy( dofs, alpha, t.dval, 1, x->dval, 1, queue );           // x = x + alpha*t
        rho_l = rho;

        res = magma_snrm2( dofs, r.dval, 1, queue );

        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        if ( res/nomb <= solver_par->rtol || res <= solver_par->atol ){
            // record the convergence; without this the success branch
            // after the loop could never be taken
            info = MAGMA_SUCCESS;
            break;
        }
    }
    while ( solver_par->numiter+1 <= solver_par->maxiter );

    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    float residual;
    CHECK( magma_sresidualvec( A, b, *x, &r, &residual, queue));
    solver_par->iter_res = res;
    solver_par->final_res = residual;

    if ( solver_par->numiter < solver_par->maxiter && info == MAGMA_SUCCESS ) {
        info = MAGMA_SUCCESS;
    } else if ( solver_par->init_res > solver_par->final_res ) {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_SLOW_CONVERGENCE;
        if( solver_par->iter_res < solver_par->rtol*solver_par->init_res ||
            solver_par->iter_res < solver_par->atol ) {
            info = MAGMA_SUCCESS;
        }
    }
    else {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_DIVERGENCE;
    }

cleanup:
    magma_smfree(&r, queue );
    magma_smfree(&rt, queue );
    magma_smfree(&r_tld, queue );
    magma_smfree(&p, queue );
    magma_smfree(&q, queue );
    magma_smfree(&u, queue );
    magma_smfree(&v, queue );
    magma_smfree(&t, queue );
    magma_smfree(&p_hat, queue );
    magma_smfree(&q_hat, queue );
    magma_smfree(&u_hat, queue );
    magma_smfree(&v_hat, queue );

    solver_par->info = info;
    return info;
}   /* magma_scgs */
/**
    Solves A x = b with a Conjugate Gradient iteration whose vector updates
    and reductions are carried out by the merged routines
    magma_scgmerge_spmv1 and magma_scgmerge_xrbeta.

    *x is overwritten with zero before the iteration starts, i.e. an initial
    guess passed in by the caller is not used.

    @param[in]      A           system matrix
    @param[in]      b           right-hand side
    @param[in,out]  x           receives the approximation
    @param[in,out]  solver_par  reads rtol, atol, maxiter, verbose; receives
                                solver, numiter, spmv_count, init_res,
                                iter_res, final_res, runtime, info and, for
                                verbose > 0, res_vec / timing samples
    @param[in]      queue       queue handed to all called routines

    @return MAGMA_SUCCESS, MAGMA_SLOW_CONVERGENCE, MAGMA_DIVERGENCE,
            MAGMA_NONSPD (d'*A*d is zero for the first direction), or the
            value left in info by a failing CHECK-wrapped call. The same
            value is stored in solver_par->info.
*/
extern "C" magma_int_t
magma_scg_merge(
    magma_s_matrix A,
    magma_s_matrix b,
    magma_s_matrix *x,
    magma_s_solver_par *solver_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;

    // prepare solver feedback
    solver_par->solver = Magma_CGMERGE;
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;

    // solver variables
    float alpha, beta, gamma, rho, tmp1, *skp_h = NULL;
    float nom, nom0, betanom, den, nomb;

    // some useful variables
    float c_zero = MAGMA_S_ZERO, c_one = MAGMA_S_ONE;
    magma_int_t dofs = A.num_rows*b.num_cols;

    // NOTE(review): B and C are never allocated or used; they are only
    // handed to magma_smfree at cleanup.
    magma_s_matrix r={Magma_CSR}, d={Magma_CSR}, z={Magma_CSR},
                   B={Magma_CSR}, C={Magma_CSR};
    float *d1=NULL, *d2=NULL, *skp=NULL;

    // GPU workspace
    CHECK( magma_svinit( &r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &d, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &z, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));

    CHECK( magma_smalloc( &d1, dofs*(1) ));
    CHECK( magma_smalloc( &d2, dofs*(1) ));
    // array for the parameters
    CHECK( magma_smalloc( &skp, 6 ));
    // skp = [alpha|beta|gamma|rho|tmp1|tmp2]

    // solver setup
    magma_sscal( dofs, c_zero, x->dval, 1, queue );                  // x = 0
    magma_scopy( dofs, b.dval, 1, r.dval, 1, queue );                // r = b
    magma_scopy( dofs, r.dval, 1, d.dval, 1, queue );                // d = r
    nom0 = betanom = magma_snrm2( dofs, r.dval, 1, queue );
    nom = nom0 * nom0;                                               // nom = r' * r
    CHECK( magma_s_spmv( c_one, A, d, c_zero, z, queue ));           // z = A d
    den = MAGMA_S_ABS( magma_sdot( dofs, d.dval, 1, z.dval, 1, queue ) ); // den = |d'* z|
    solver_par->init_res = nom0;

    nomb = magma_snrm2( dofs, b.dval, 1, queue );
    if ( nomb == 0.0 ){
        nomb=1.0;       // avoids dividing by zero in the relative criterion
    }

    // array on host for the parameters
    CHECK( magma_smalloc_cpu( &skp_h, 6 ));

    alpha = rho = gamma = tmp1 = c_one;
    beta = magma_sdot( dofs, r.dval, 1, r.dval, 1, queue );
    skp_h[0]=alpha;
    skp_h[1]=beta;
    skp_h[2]=gamma;
    skp_h[3]=rho;
    skp_h[4]=tmp1;
    skp_h[5]=MAGMA_S_MAKE(nom, 0.0);

    magma_ssetvector( 6, skp_h, 1, skp, 1, queue );

    if( nom0 < solver_par->atol ||
        nom0/nomb < solver_par->rtol ){
        info = MAGMA_SUCCESS;
        goto cleanup;
    }

    solver_par->final_res = solver_par->init_res;
    solver_par->iter_res = solver_par->init_res;
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = (real_Double_t) nom0;
        solver_par->timing[0] = 0.0;
    }
    // check positive definite
    if (den <= 0.0) {
        info = MAGMA_NONSPD;
        goto cleanup;
    }

    //Chronometry
    // (declared without initialiser: CHECK may jump past these to cleanup)
    real_Double_t tempo1, tempo2;
    tempo1 = magma_sync_wtime( queue );

    solver_par->numiter = 0;
    solver_par->spmv_count = 0;
    // start iteration
    do
    {
        solver_par->numiter++;

        // computes SpMV and dot product
        CHECK( magma_scgmerge_spmv1( A, d1, d2, d.dval, z.dval, skp, queue ));
        solver_par->spmv_count++;

        // updates x, r, computes scalars and updates d
        CHECK( magma_scgmerge_xrbeta( dofs, d1, d2, x->dval, r.dval, d.dval, z.dval, skp, queue ));

        // fetch skp[1] for the stopping criterion
        magma_sgetvector( 1 , skp+1, 1, skp_h+1, 1, queue );
        betanom = sqrt(MAGMA_S_ABS(skp_h[1]));

        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        if ( betanom < solver_par->atol ||
             betanom/nomb < solver_par->rtol ) {
            break;
        }
    }
    while ( solver_par->numiter+1 <= solver_par->maxiter );

    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    float residual;
    CHECK( magma_sresidualvec( A, b, *x, &r, &residual, queue));
    solver_par->iter_res = betanom;
    solver_par->final_res = residual;

    if ( solver_par->numiter < solver_par->maxiter ) {
        info = MAGMA_SUCCESS;
    } else if ( solver_par->init_res > solver_par->final_res ) {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_SLOW_CONVERGENCE;
        if( solver_par->iter_res < solver_par->atol ||
            solver_par->iter_res/solver_par->init_res < solver_par->rtol ){
            info = MAGMA_SUCCESS;
        }
    }
    else {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        // must go through the local info: solver_par->info is overwritten
        // with info at cleanup, so writing it directly would be lost
        info = MAGMA_DIVERGENCE;
    }

cleanup:
    magma_smfree(&r, queue );
    magma_smfree(&z, queue );
    magma_smfree(&d, queue );
    magma_smfree(&B, queue );
    magma_smfree(&C, queue );

    magma_free( d1 );
    magma_free( d2 );
    magma_free( skp );
    magma_free_cpu( skp_h );

    solver_par->info = info;
    return info;
}   /* magma_scg_merge */
/**
    Solves A x = b with the Biconjugate Gradient iteration.

    The conjugate transpose of A is built up front: A is transferred to the
    host, converted to CSR, transposed, converted back to A.storage_type and
    transferred to Magma_DEV again. *x is used as the initial guess and is
    overwritten with the final iterate. The iteration stops when
    res/||b|| <= rtol or res <= atol, when rho becomes NaN/Inf, or after
    solver_par->maxiter iterations.

    @param[in]      A           system matrix
    @param[in]      b           right-hand side
    @param[in,out]  x           initial guess on entry, approximation on exit
    @param[in,out]  solver_par  reads rtol, atol, maxiter, verbose; receives
                                solver, numiter, spmv_count, init_res,
                                iter_res, final_res, runtime, info and, for
                                verbose > 0, res_vec / timing samples
    @param[in]      queue       queue handed to all called routines

    @return MAGMA_SUCCESS, MAGMA_SLOW_CONVERGENCE, MAGMA_DIVERGENCE, or the
            value left in info by a failing CHECK-wrapped call. The same
            value is stored in solver_par->info.
*/
extern "C" magma_int_t
magma_sbicg(
    magma_s_matrix A,
    magma_s_matrix b,
    magma_s_matrix *x,
    magma_s_solver_par *solver_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;

    // prepare solver feedback
    solver_par->solver = Magma_BICG;
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;

    // some useful variables
    float c_zero    = MAGMA_S_ZERO;
    float c_one     = MAGMA_S_ONE;
    float c_neg_one = MAGMA_S_NEG_ONE;

    magma_int_t dofs = A.num_rows * b.num_cols;

    // workspace
    magma_s_matrix r={Magma_CSR}, rt={Magma_CSR}, p={Magma_CSR}, pt={Magma_CSR},
                   z={Magma_CSR}, zt={Magma_CSR}, q={Magma_CSR}, y={Magma_CSR},
                   yt={Magma_CSR}, qt={Magma_CSR};

    // need to transpose the matrix
    magma_s_matrix AT={Magma_CSR}, Ah1={Magma_CSR}, Ah2={Magma_CSR};

    CHECK( magma_svinit( &r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &rt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &p, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &pt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &q, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &qt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &y, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &yt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &z, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &zt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));

    // solver variables
    // (declared without initialiser: CHECK may jump past these to cleanup)
    float alpha, rho, beta, rho_new, ptq;
    float res, nomb, nom0, r0;

    // transpose the matrix; every step is checked so that a failed
    // transfer/conversion aborts instead of iterating with an invalid AT.
    // Intermediate copies still alive on failure are released at cleanup.
    CHECK( magma_smtransfer( A, &Ah1, Magma_DEV, Magma_CPU, queue ));
    CHECK( magma_smconvert( Ah1, &Ah2, A.storage_type, Magma_CSR, queue ));
    magma_smfree(&Ah1, queue );
    CHECK( magma_smtransposeconjugate( Ah2, &Ah1, queue ));
    magma_smfree(&Ah2, queue );
    Ah2.blocksize = A.blocksize;
    Ah2.alignment = A.alignment;
    CHECK( magma_smconvert( Ah1, &Ah2, Magma_CSR, A.storage_type, queue ));
    magma_smfree(&Ah1, queue );
    CHECK( magma_smtransfer( Ah2, &AT, Magma_CPU, Magma_DEV, queue ));
    magma_smfree(&Ah2, queue );

    // solver setup
    CHECK( magma_sresidualvec( A, b, *x, &r, &nom0, queue));
    res = nom0;
    solver_par->init_res = nom0;
    magma_scopy( dofs, r.dval, 1, rt.dval, 1, queue );                  // rt = r
    rho_new = magma_sdot( dofs, rt.dval, 1, r.dval, 1, queue );         // rho = <rt,r>
    rho = alpha = MAGMA_S_MAKE( 1.0, 0. );

    nomb = magma_snrm2( dofs, b.dval, 1, queue );
    if ( nomb == 0.0 ){
        nomb=1.0;       // avoids dividing by zero in the relative criterion
    }
    if ( (r0 = nomb * solver_par->rtol) < ATOLERANCE ){
        r0 = ATOLERANCE;
    }
    solver_par->final_res = solver_par->init_res;
    solver_par->iter_res = solver_par->init_res;
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = nom0;
        solver_par->timing[0] = 0.0;
    }
    // the initial guess is already good enough
    if ( nom0 < r0 ) {
        info = MAGMA_SUCCESS;
        goto cleanup;
    }

    //Chronometry
    real_Double_t tempo1, tempo2;
    tempo1 = magma_sync_wtime( queue );

    solver_par->numiter = 0;
    solver_par->spmv_count = 0;
    // start iteration
    do
    {
        solver_par->numiter++;

        magma_scopy( dofs, r.dval, 1 , y.dval, 1, queue );             // y = r
        magma_scopy( dofs, y.dval, 1 , z.dval, 1, queue );             // z = y
        magma_scopy( dofs, rt.dval, 1 , yt.dval, 1, queue );           // yt = rt
        magma_scopy( dofs, yt.dval, 1 , zt.dval, 1, queue );           // zt = yt

        rho = rho_new;
        rho_new = magma_sdot( dofs, rt.dval, 1, z.dval, 1, queue );    // rho = <rt,z>
        if( magma_s_isnan_inf( rho_new ) ){
            info = MAGMA_DIVERGENCE;
            break;
        }

        if( solver_par->numiter==1 ){
            magma_scopy( dofs, z.dval, 1 , p.dval, 1, queue );         // p = z
            magma_scopy( dofs, zt.dval, 1 , pt.dval, 1, queue );       // pt = zt
        } else {
            beta = rho_new/rho;
            magma_sscal( dofs, beta, p.dval, 1, queue );               // p = beta*p
            magma_saxpy( dofs, c_one , z.dval, 1 , p.dval, 1, queue ); // p = z+beta*p
            magma_sscal( dofs, MAGMA_S_CONJ(beta), pt.dval, 1, queue );   // pt = beta*pt
            magma_saxpy( dofs, c_one , zt.dval, 1 , pt.dval, 1, queue );  // pt = zt+beta*pt
        }

        CHECK( magma_s_spmv( c_one, A, p, c_zero, q, queue ));         // q = A p
        CHECK( magma_s_spmv( c_one, AT, pt, c_zero, qt, queue ));      // qt = AT pt
        solver_par->spmv_count++;
        solver_par->spmv_count++;

        ptq = magma_sdot( dofs, pt.dval, 1, q.dval, 1, queue );
        alpha = rho_new /ptq;

        magma_saxpy( dofs, alpha, p.dval, 1 , x->dval, 1, queue );                // x = x + alpha*p
        magma_saxpy( dofs, c_neg_one * alpha, q.dval, 1 , r.dval, 1, queue );     // r = r - alpha*q
        magma_saxpy( dofs, c_neg_one * MAGMA_S_CONJ(alpha), qt.dval, 1 , rt.dval, 1, queue ); // rt = rt - alpha*qt

        res = magma_snrm2( dofs, r.dval, 1, queue );

        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        if ( res/nomb <= solver_par->rtol || res <= solver_par->atol ){
            // record the convergence so that it can be told apart from the
            // NaN/Inf break above once the loop has been left
            info = MAGMA_SUCCESS;
            break;
        }
    }
    while ( solver_par->numiter+1 <= solver_par->maxiter );

    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    float residual;
    CHECK( magma_sresidualvec( A, b, *x, &r, &residual, queue));
    solver_par->iter_res = res;
    solver_par->final_res = residual;

    // an early exit counts as success only if it was the convergence break;
    // a NaN/Inf break is classified by the residual comparison below
    if ( solver_par->numiter < solver_par->maxiter && info == MAGMA_SUCCESS ) {
        info = MAGMA_SUCCESS;
    } else if ( solver_par->init_res > solver_par->final_res ) {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_SLOW_CONVERGENCE;
        if( solver_par->iter_res < solver_par->rtol*solver_par->init_res ||
            solver_par->iter_res < solver_par->atol ) {
            info = MAGMA_SUCCESS;
        }
    }
    else {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_DIVERGENCE;
    }

cleanup:
    magma_smfree(&r, queue );
    magma_smfree(&rt, queue );
    magma_smfree(&p, queue );
    magma_smfree(&pt, queue );
    magma_smfree(&q, queue );
    magma_smfree(&qt, queue );
    magma_smfree(&y, queue );
    magma_smfree(&yt, queue );
    magma_smfree(&z, queue );
    magma_smfree(&zt, queue );
    magma_smfree(&AT, queue );
    magma_smfree(&Ah1, queue );
    magma_smfree(&Ah2, queue );

    solver_par->info = info;
    return info;
}   /* magma_sbicg */
/**
    Solves A x = b with a preconditioned Conjugate Gradient iteration whose
    vector updates and reductions are carried out by merged routines.

    Depending on precond_par->solver one of three update paths is taken per
    iteration: Magma_JACOBI uses magma_sjcgmerge_xrbeta with precond_par->d,
    Magma_NONE uses magma_spcgmerge_xrbeta1/2 directly on r, and every other
    value applies the left and right preconditioner between the two merged
    updates (the time spent there is added to precond_par->runtime).

    @param[in]      A            system matrix
    @param[in]      b            right-hand side
    @param[in,out]  x            initial guess on entry, approximation on exit
    @param[in,out]  solver_par   reads rtol, atol, maxiter, verbose; receives
                                 solver, numiter, spmv_count, init_res,
                                 iter_res, final_res, runtime, info and, for
                                 verbose > 0, res_vec / timing samples
    @param[in,out]  precond_par  preconditioner description
    @param[in]      queue        queue handed to all called routines

    @return MAGMA_SUCCESS, MAGMA_SLOW_CONVERGENCE, MAGMA_DIVERGENCE,
            MAGMA_NONSPD (d'*A*d is zero for the first direction), or the
            value left in info by a failing CHECK-wrapped call. The same
            value is stored in solver_par->info.
*/
extern "C" magma_int_t
magma_spcg_merge(
    magma_s_matrix A,
    magma_s_matrix b,
    magma_s_matrix *x,
    magma_s_solver_par *solver_par,
    magma_s_preconditioner *precond_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;

    // prepare solver feedback
    solver_par->solver = Magma_PCGMERGE;
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;

    // solver variables
    float alpha, beta, gamma, rho, tmp1, *skp_h = NULL;
    float nom, nom0, r0, res, nomb;
    float den;

    // some useful variables
    float c_zero = MAGMA_S_ZERO, c_one = MAGMA_S_ONE;
    magma_int_t dofs = A.num_rows*b.num_cols;

    magma_s_matrix r={Magma_CSR}, d={Magma_CSR}, z={Magma_CSR},
                   h={Magma_CSR}, rt={Magma_CSR};
    float *d1=NULL, *d2=NULL, *skp=NULL;

    // GPU workspace
    CHECK( magma_svinit( &r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &d, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &z, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &rt, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &h, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));

    CHECK( magma_smalloc( &d1, dofs*(2) ));
    CHECK( magma_smalloc( &d2, dofs*(2) ));
    // array for the parameters
    CHECK( magma_smalloc( &skp, 7 ));
    // skp = [alpha|beta|gamma|rho|tmp1|tmp2|res]

    // solver setup
    CHECK( magma_sresidualvec( A, b, *x, &r, &nom0, queue));

    // preconditioner
    CHECK( magma_s_applyprecond_left( MagmaNoTrans, A, r, &rt, precond_par, queue ));
    CHECK( magma_s_applyprecond_right( MagmaNoTrans, A, rt, &h, precond_par, queue ));

    magma_scopy( dofs, h.dval, 1, d.dval, 1, queue );                // d = h
    nom = MAGMA_S_ABS( magma_sdot( dofs, r.dval, 1, h.dval, 1, queue )); // nom = |r'* h|
    CHECK( magma_s_spmv( c_one, A, d, c_zero, z, queue ));           // z = A d
    den = magma_sdot( dofs, d.dval, 1, z.dval, 1, queue );           // den = d'* z
    solver_par->init_res = nom0;

    nomb = magma_snrm2( dofs, b.dval, 1, queue );
    if ( nomb == 0.0 ){
        nomb=1.0;       // avoids dividing by zero in the relative criterion
    }
    if ( (r0 = nomb * solver_par->rtol) < ATOLERANCE ){
        r0 = ATOLERANCE;
    }
    solver_par->final_res = solver_par->init_res;
    solver_par->iter_res = solver_par->init_res;
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = (real_Double_t)nom0;
        solver_par->timing[0] = 0.0;
    }
    if ( nom < r0 ) {
        info = MAGMA_SUCCESS;
        goto cleanup;
    }
    // check positive definite
    if ( MAGMA_S_ABS(den) <= 0.0 ) {
        info = MAGMA_NONSPD;
        goto cleanup;
    }

    // array on host for the parameters
    CHECK( magma_smalloc_cpu( &skp_h, 7 ));

    alpha = rho = gamma = tmp1 = c_one;
    beta = magma_sdot( dofs, h.dval, 1, r.dval, 1, queue );
    skp_h[0]=alpha;
    skp_h[1]=beta;
    skp_h[2]=gamma;
    skp_h[3]=rho;
    skp_h[4]=tmp1;
    skp_h[5]=MAGMA_S_MAKE(nom, 0.0);
    skp_h[6]=MAGMA_S_MAKE(nom, 0.0);

    magma_ssetvector( 7, skp_h, 1, skp, 1, queue );

    //Chronometry
    // (declared without initialiser: CHECK may jump past these to cleanup)
    real_Double_t tempo1, tempo2, tempop1, tempop2;
    tempo1 = magma_sync_wtime( queue );

    solver_par->numiter = 0;
    solver_par->spmv_count = 0;
    // start iteration
    do
    {
        solver_par->numiter++;

        // computes SpMV and dot product
        CHECK( magma_scgmerge_spmv1( A, d1, d2, d.dval, z.dval, skp, queue ));
        solver_par->spmv_count++;

        if( precond_par->solver == Magma_JACOBI ){
            CHECK( magma_sjcgmerge_xrbeta( dofs, d1, d2, precond_par->d.dval,
                        x->dval, r.dval, d.dval, z.dval, h.dval, skp, queue ));
        }
        else if( precond_par->solver == Magma_NONE ){
            // updates x, r
            CHECK( magma_spcgmerge_xrbeta1( dofs, x->dval, r.dval, d.dval,
                        z.dval, skp, queue ));
            // computes scalars and updates d
            CHECK( magma_spcgmerge_xrbeta2( dofs, d1, d2, r.dval, r.dval,
                        d.dval, skp, queue ));
        }
        else {
            // updates x, r
            CHECK( magma_spcgmerge_xrbeta1( dofs, x->dval, r.dval, d.dval,
                        z.dval, skp, queue ));

            // preconditioner in between
            tempop1 = magma_sync_wtime( queue );
            CHECK( magma_s_applyprecond_left( MagmaNoTrans, A, r, &rt, precond_par, queue ));
            CHECK( magma_s_applyprecond_right( MagmaNoTrans, A, rt, &h, precond_par, queue ));
            tempop2 = magma_sync_wtime( queue );
            precond_par->runtime += tempop2-tempop1;

            // computes scalars and updates d
            CHECK( magma_spcgmerge_xrbeta2( dofs, d1, d2, h.dval, r.dval,
                        d.dval, skp, queue ));
        }

        // fetch skp[6] for the stopping criterion
        magma_sgetvector( 1 , skp+6, 1, skp_h+6, 1, queue );
        res = sqrt(MAGMA_S_ABS(skp_h[6]));

        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        if ( res/nomb <= solver_par->rtol || res <= solver_par->atol ){
            break;
        }
    }
    while ( solver_par->numiter+1 <= solver_par->maxiter );

    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    float residual;
    CHECK( magma_sresidualvec( A, b, *x, &r, &residual, queue));
    solver_par->iter_res = res;
    solver_par->final_res = residual;

    if ( solver_par->numiter < solver_par->maxiter ) {
        info = MAGMA_SUCCESS;
    } else if ( solver_par->init_res > solver_par->final_res ) {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_SLOW_CONVERGENCE;
        if( solver_par->iter_res < solver_par->atol ||
            solver_par->iter_res/solver_par->init_res < solver_par->rtol ){
            info = MAGMA_SUCCESS;
        }
    }
    else {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        // must go through the local info: solver_par->info is overwritten
        // with info at cleanup, so writing it directly would be lost
        info = MAGMA_DIVERGENCE;
    }

cleanup:
    magma_smfree(&r, queue );
    magma_smfree(&z, queue );
    magma_smfree(&d, queue );
    magma_smfree(&rt, queue );
    magma_smfree(&h, queue );

    magma_free( d1 );
    magma_free( d2 );
    magma_free( skp );
    magma_free_cpu( skp_h );

    solver_par->info = info;
    return info;
}   /* magma_spcg_merge */
/**
    Jacobi driver: converts A to CSR, builds the diagonal scaling with
    magma_sjacobisetup_diagscal and repeatedly calls magma_sjacobispmvupdate.

    With solver_par->verbose > 0 each pass performs `verbose` sweeps and the
    residual and accumulated runtime are recorded after every pass; otherwise
    a single pass performs solver_par->maxiter sweeps. Passes are repeated
    while numiter+1 <= maxiter.

    @param[in]      A           system matrix
    @param[in]      b           right-hand side
    @param[in,out]  x           initial guess on entry, approximation on exit
    @param[in,out]  solver_par  reads maxiter, verbose; receives solver,
                                numiter, spmv_count, init_res, final_res,
                                runtime, info and, for verbose > 0,
                                res_vec / timing samples
    @param[in]      queue       queue handed to all called routines

    @return MAGMA_SUCCESS if the final residual is smaller than the initial
            one, MAGMA_DIVERGENCE otherwise, or the value left in info by a
            failing CHECK-wrapped call. The same value is stored in
            solver_par->info.
*/
extern "C" magma_int_t
magma_sjacobi(
    magma_s_matrix A,
    magma_s_matrix b,
    magma_s_matrix *x,
    magma_s_solver_par *solver_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;

    float zero_val = MAGMA_S_ZERO;
    real_Double_t t_begin, t_end, elapsed = 0;
    float resnorm;

    magma_s_matrix resvec = {Magma_CSR}, dscal = {Magma_CSR}, A_csr = {Magma_CSR};
    // only the maxiter field of this parameter set is used
    magma_s_solver_par pass_par;

    CHECK( magma_smconvert( A, &A_csr, A.storage_type, Magma_CSR, queue ) );

    // prepare solver feedback
    solver_par->solver = Magma_JACOBI;
    solver_par->info = MAGMA_SUCCESS;

    // initial residual
    CHECK( magma_svinit( &resvec, Magma_DEV, A.num_rows, b.num_cols, zero_val, queue ));
    CHECK( magma_sresidualvec( A_csr, b, *x, &resvec, &resnorm, queue));
    solver_par->init_res = resnorm;
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = (real_Double_t) resnorm;
    }

    // Jacobi setup
    CHECK( magma_sjacobisetup_diagscal( A_csr, &dscal, queue ));

    // number of sweeps carried out per pass
    if ( solver_par->verbose > 0 ) {
        pass_par.maxiter = solver_par->verbose;
    }
    else {
        pass_par.maxiter = solver_par->maxiter;
    }

    solver_par->numiter = 0;
    solver_par->spmv_count = 0;

    // Jacobi iterator
    do
    {
        t_begin = magma_sync_wtime( queue );
        solver_par->numiter = solver_par->numiter + pass_par.maxiter;
        CHECK( magma_sjacobispmvupdate( pass_par.maxiter, A_csr, resvec, b, dscal, x, queue ));
        solver_par->spmv_count = solver_par->spmv_count + pass_par.maxiter;
        t_end = magma_sync_wtime( queue );
        elapsed += t_end - t_begin;

        if ( solver_par->verbose > 0 ) {
            // the residual evaluation is not part of the measured time
            CHECK( magma_sresidualvec( A_csr, b, *x, &resvec, &resnorm, queue));
            solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                    = (real_Double_t) resnorm;
            solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                    = (real_Double_t) elapsed;
        }
    }
    while ( solver_par->numiter+1 <= solver_par->maxiter );

    solver_par->runtime = (real_Double_t) elapsed;
    CHECK( magma_sresidualvec( A, b, *x, &resvec, &resnorm, queue));
    solver_par->final_res = resnorm;

    if ( solver_par->init_res > solver_par->final_res ) {
        info = MAGMA_SUCCESS;
    }
    else {
        info = MAGMA_DIVERGENCE;
    }

cleanup:
    magma_smfree( &resvec, queue );
    magma_smfree( &dscal, queue );
    magma_smfree( &A_csr, queue );

    solver_par->info = info;
    return info;
}   /* magma_sjacobi */
/**
    Prepares the Jacobi iteration for A x = b: builds the matrix M, which is
    A with every row divided by its diagonal entry and the diagonal entry
    itself set to zero, and the vector c with c[k] = b[k] / diag[k].

    The work is done on a Magma_CPU copy in CSR format; the results are
    converted back to A.storage_type and transferred to A.memory_location.

    A zero diagonal entry is only reported on stdout; the division is still
    carried out.

    @param[in]   A      system matrix
    @param[in]   b      right-hand side
    @param[out]  M      scaled matrix without diagonal
    @param[out]  c      scaled right-hand side
    @param[in]   queue  queue handed to all called routines

    @return 0 on success, otherwise the value left in info by a failing
            CHECK-wrapped call.
*/
extern "C" magma_int_t
magma_sjacobisetup(
    magma_s_matrix A,
    magma_s_matrix b,
    magma_s_matrix *M,
    magma_s_matrix *c,
    magma_queue_t queue )
{
    magma_int_t info = 0;

    magma_int_t i;

    magma_s_matrix A_h1={Magma_CSR}, A_h2={Magma_CSR}, B={Magma_CSR}, C={Magma_CSR};
    magma_s_matrix diag={Magma_CSR}, c_t={Magma_CSR}, b_h={Magma_CSR};
    CHECK( magma_svinit( &c_t, Magma_CPU, A.num_rows, b.num_cols, MAGMA_S_ZERO, queue ));
    CHECK( magma_svinit( &diag, Magma_CPU, A.num_rows, b.num_cols, MAGMA_S_ZERO, queue ));
    CHECK( magma_smtransfer( b, &b_h, A.memory_location, Magma_CPU, queue ));

    // B becomes a Magma_CPU copy of A in CSR format
    if ( A.storage_type != Magma_CSR ) {
        CHECK( magma_smtransfer( A, &A_h1, A.memory_location, Magma_CPU, queue ));
        CHECK( magma_smconvert( A_h1, &B, A_h1.storage_type, Magma_CSR, queue ));
    }
    else {
        CHECK( magma_smtransfer( A, &B, A.memory_location, Magma_CPU, queue ));
    }

    for( magma_int_t rowindex=0; rowindex<B.num_rows; rowindex++ ) {
        magma_int_t start = (B.drow[rowindex]);
        magma_int_t end = (B.drow[rowindex+1]);
        // first pass: pick up the diagonal entry of this row
        for( i=start; i<end; i++ ) {
            if ( B.dcol[i]==rowindex ) {
                diag.val[rowindex] = B.val[i];
                if ( MAGMA_S_REAL( diag.val[rowindex]) == 0 )
                    printf(" error: zero diagonal element in row %d!\n",
                                                                int(rowindex));
            }
        }
        // second pass: scale the row and zero out the diagonal entry
        for( i=start; i<end; i++ ) {
            B.val[i] = B.val[i] / diag.val[rowindex];
            if ( B.dcol[i]==rowindex ) {
                B.val[i] = MAGMA_S_MAKE( 0., 0. );
            }
        }
        c_t.val[rowindex] = b_h.val[rowindex] / diag.val[rowindex];
    }

    CHECK( magma_s_csr_compressor(&B.val, &B.drow, &B.dcol,
                           &C.val, &C.drow, &C.dcol, &B.num_rows, queue ));

    C.num_rows = B.num_rows;
    C.num_cols = B.num_cols;
    C.memory_location = B.memory_location;
    C.nnz = C.drow[B.num_rows];
    C.storage_type = B.storage_type;

    if ( A.storage_type != Magma_CSR) {
        A_h2.alignment = A.alignment;
        A_h2.blocksize = A.blocksize;
        CHECK( magma_smconvert( C, &A_h2, Magma_CSR, A_h1.storage_type, queue ));
        CHECK( magma_smtransfer( A_h2, M, Magma_CPU, A.memory_location, queue ));
    }
    else {
        CHECK( magma_smtransfer( C, M, Magma_CPU, A.memory_location, queue ));
    }
    CHECK( magma_smtransfer( c_t, c, Magma_CPU, A.memory_location, queue ));

cleanup:
    // A_h1 / A_h2 are released here rather than on the success path only,
    // so that they do not leak when one of the CHECKs above bails out
    magma_smfree( &A_h1, queue );
    magma_smfree( &A_h2, queue );
    magma_smfree( &B, queue );
    magma_smfree( &C, queue );
    magma_smfree( &diag, queue );
    magma_smfree( &c_t, queue );
    magma_smfree( &b_h, queue );
    return info;
}
/**
    Extracts the inverse of the main diagonal of A into a device vector.

    If A is not already a host CSR matrix, it is first transferred to the
    host and converted to CSR. For every row the first stored entry whose
    column index equals the row index is taken as the diagonal element, and
    1/element is stored. The resulting vector is transferred to the device.

    @param[in]   A      matrix whose diagonal is extracted
    @param[out]  d      vector of length A.num_rows holding the inverted diagonal
    @param[in]   queue  queue passed on to every MAGMA call

    @return 0 on success; MAGMA_ERR_BADPRECOND if a row has no diagonal entry
            or its diagonal entry is exactly zero; otherwise the error code of
            the first failing call.
*/
extern "C" magma_int_t
magma_sjacobisetup_diagscal(
    magma_s_matrix A,
    magma_s_matrix *d,
    magma_queue_t queue )
{
    magma_int_t info = 0;
    magma_int_t i;

    magma_s_matrix A_h1={Magma_CSR}, B={Magma_CSR};
    magma_s_matrix diag={Magma_CSR};

    // host CSR matrix the diagonal is read from: A itself, or its converted copy B
    const magma_s_matrix *csr = &A;

    CHECK( magma_svinit( &diag, Magma_CPU, A.num_rows, 1, MAGMA_S_ZERO, queue ));

    if ( A.storage_type != Magma_CSR || A.memory_location != Magma_CPU ) {
        CHECK( magma_smtransfer( A, &A_h1, A.memory_location, Magma_CPU, queue ));
        CHECK( magma_smconvert( A_h1, &B, A_h1.storage_type, Magma_CSR, queue ));
        csr = &B;
    }

    for( magma_int_t rowindex=0; rowindex<csr->num_rows; rowindex++ ) {
        magma_int_t start = (csr->drow[rowindex]);
        magma_int_t end = (csr->drow[rowindex+1]);
        float pivot = MAGMA_S_ZERO;   // stays zero if the row stores no diagonal entry

        for( i=start; i<end; i++ ) {
            if ( csr->dcol[i]==rowindex ) {
                pivot = csr->val[i];
                break;
            }
        }
        // Test the pivot *before* inverting it: checking 1/pivot against zero
        // would let an explicitly stored 0 through as inf.
        if ( pivot == MAGMA_S_ZERO ){
            printf(" error: zero diagonal element in row %d!\n",
                   int(rowindex));
            info = MAGMA_ERR_BADPRECOND;
            goto cleanup;
        }
        diag.val[rowindex] = 1.0/pivot;
    }

    CHECK( magma_smtransfer( diag, d, Magma_CPU, Magma_DEV, queue ));

cleanup:
    magma_smfree( &A_h1, queue );
    magma_smfree( &B, queue );
    magma_smfree( &diag, queue );
    return info;
}
/* //////////////////////////////////////////////////////////////////////////// -- testing any solver */ int main( int argc, char** argv ) { magma_int_t info = 0; TESTING_INIT(); magma_sopts zopts; magma_queue_t queue=NULL; magma_queue_create( &queue ); float one = MAGMA_S_MAKE(1.0, 0.0); float zero = MAGMA_S_MAKE(0.0, 0.0); magma_s_matrix A={Magma_CSR}, B={Magma_CSR}, B_d={Magma_CSR}; magma_s_matrix x={Magma_CSR}, b={Magma_CSR}; int i=1; CHECK( magma_sparse_opts( argc, argv, &zopts, &i, queue )); B.blocksize = zopts.blocksize; B.alignment = zopts.alignment; CHECK( magma_ssolverinfo_init( &zopts.solver_par, &zopts.precond_par, queue )); while( i < argc ) { if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) { // Laplace test i++; magma_int_t laplace_size = atoi( argv[i] ); CHECK( magma_sm_5stencil( laplace_size, &A, queue )); } else { // file-matrix test CHECK( magma_s_csr_mtx( &A, argv[i], queue )); } // for the eigensolver case zopts.solver_par.ev_length = A.num_cols; CHECK( magma_seigensolverinfo_init( &zopts.solver_par, queue )); // scale matrix CHECK( magma_smscale( &A, zopts.scaling, queue )); // preconditioner if ( zopts.solver_par.solver != Magma_ITERREF ) { CHECK( magma_s_precondsetup( A, b, &zopts.solver_par, &zopts.precond_par, queue ) ); } CHECK( magma_smconvert( A, &B, Magma_CSR, zopts.output_format, queue )); printf( "\n%% matrix info: %d-by-%d with %d nonzeros\n\n", int(A.num_rows), int(A.num_cols), int(A.nnz) ); printf("matrixinfo = [ \n"); printf("%% size (m x n) || nonzeros (nnz) || nnz/m || stored nnz\n"); printf("%%======================================================================" "======%%\n"); printf(" %8d %8d %10d %4d %10d\n", int(B.num_rows), int(B.num_cols), int(B.true_nnz), int(B.true_nnz/B.num_rows), int(B.nnz) ); printf("%%======================================================================" "======%%\n"); printf("];\n"); CHECK( magma_smtransfer( B, &B_d, Magma_CPU, Magma_DEV, queue )); // vectors and initial guess CHECK( 
magma_svinit( &b, Magma_DEV, A.num_rows, 1, one, queue )); //magma_svinit( &x, Magma_DEV, A.num_cols, 1, one, queue ); //magma_s_spmv( one, B_d, x, zero, b, queue ); // b = A x //magma_smfree(&x, queue ); CHECK( magma_svinit( &x, Magma_DEV, A.num_cols, 1, zero, queue )); info = magma_s_solver( B_d, b, &x, &zopts, queue ); if( info != 0 ) { printf("%%error: solver returned: %s (%d).\n", magma_strerror( info ), int(info) ); } printf("data = [\n"); magma_ssolverinfo( &zopts.solver_par, &zopts.precond_par, queue ); printf("];\n\n"); printf("precond_info = [\n"); printf("%% setup runtime\n"); printf(" %.6f %.6f\n", zopts.precond_par.setuptime, zopts.precond_par.runtime ); printf("];\n\n"); magma_smfree(&B_d, queue ); magma_smfree(&B, queue ); magma_smfree(&A, queue ); magma_smfree(&x, queue ); magma_smfree(&b, queue ); i++; } cleanup: magma_smfree(&B_d, queue ); magma_smfree(&B, queue ); magma_smfree(&A, queue ); magma_smfree(&x, queue ); magma_smfree(&b, queue ); magma_ssolverinfo_free( &zopts.solver_par, &zopts.precond_par, queue ); magma_queue_destroy( queue ); TESTING_FINALIZE(); return info; }
/**
    LSQR iteration for A x = b (also used for non-square A, where the
    minimization criterion on |A^T r| is checked in addition).

    The routine builds a transposed copy AT of A (host round trip: transfer,
    convert to CSR, magma_smtransposeconjugate, convert back, transfer to the
    device) and then runs the bidiagonalization loop, applying the
    preconditioner in precond_par unless its solver is Magma_NONE.

    @param[in]      A            system matrix
    @param[in]      b            right-hand side
    @param[in,out]  x            initial guess on entry, updated in place
    @param[in,out]  solver_par   reads rtol, atol, maxiter, verbose; receives
                                 solver, numiter, spmv_count, init_res,
                                 iter_res, final_res, runtime, info and (if
                                 verbose > 0) the res_vec / timing histories
    @param[in]      precond_par  preconditioner description
    @param[in]      queue        queue passed on to every MAGMA call

    @return MAGMA_SUCCESS, MAGMA_SLOW_CONVERGENCE, MAGMA_DIVERGENCE, or the
            error code of the first failing call.
*/
extern "C" magma_int_t
magma_slsqr(
    magma_s_matrix A,
    magma_s_matrix b,
    magma_s_matrix *x,
    magma_s_solver_par *solver_par,
    magma_s_preconditioner *precond_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;

    // prepare solver feedback
    solver_par->solver = Magma_LSQR;
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;

    magma_int_t m = A.num_rows * b.num_cols;
    magma_int_t n = A.num_cols * b.num_cols;

    // local variables
    float c_zero = MAGMA_S_ZERO, c_one = MAGMA_S_ONE;
    // solver variables
    float s, nom0, r0, res=0, nomb, phibar, beta, alpha, c, rho, rhot, phi, thet,
          normr, normar, norma, sumnormd2, normd;

    // need to transpose the matrix
    magma_s_matrix AT={Magma_CSR}, Ah1={Magma_CSR}, Ah2={Magma_CSR};

    // GPU workspace
    magma_s_matrix r={Magma_CSR}, v={Magma_CSR}, z={Magma_CSR}, zt={Magma_CSR},
                   d={Magma_CSR}, vt={Magma_CSR}, q={Magma_CSR}, w={Magma_CSR},
                   u={Magma_CSR};
    CHECK( magma_svinit( &r, Magma_DEV, A.num_cols, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &v, Magma_DEV, A.num_cols, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &z, Magma_DEV, A.num_cols, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &d, Magma_DEV, A.num_cols, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &vt,Magma_DEV, A.num_cols, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &q, Magma_DEV, A.num_cols, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &w, Magma_DEV, A.num_cols, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &u, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &zt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));

    // transpose the matrix; every step is checked so that a failure aborts
    // the solve instead of continuing with an empty AT
    CHECK( magma_smtransfer( A, &Ah1, Magma_DEV, Magma_CPU, queue ));
    CHECK( magma_smconvert( Ah1, &Ah2, A.storage_type, Magma_CSR, queue ));
    magma_smfree(&Ah1, queue );
    CHECK( magma_smtransposeconjugate( Ah2, &Ah1, queue ));
    magma_smfree(&Ah2, queue );
    Ah2.blocksize = A.blocksize;
    Ah2.alignment = A.alignment;
    CHECK( magma_smconvert( Ah1, &Ah2, Magma_CSR, A.storage_type, queue ));
    magma_smfree(&Ah1, queue );
    CHECK( magma_smtransfer( Ah2, &AT, Magma_CPU, Magma_DEV, queue ));
    magma_smfree(&Ah2, queue );

    // solver setup
    CHECK( magma_sresidualvec( A, b, *x, &r, &nom0, queue));
    solver_par->init_res = nom0;
    nomb = magma_snrm2( m, b.dval, 1, queue );
    if ( nomb == 0.0 ){
        nomb=1.0;   // avoid dividing by zero in the relative criterion
    }
    if ( (r0 = nomb * solver_par->rtol) < ATOLERANCE ){
        r0 = ATOLERANCE;
    }

    solver_par->final_res = solver_par->init_res;
    solver_par->iter_res = solver_par->init_res;
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = (real_Double_t)nom0;
        solver_par->timing[0] = 0.0;
    }
    if ( nom0 < r0 ) {
        // initial guess is already good enough
        info = MAGMA_SUCCESS;
        goto cleanup;
    }

    // u = b / |b|
    magma_scopy( m, b.dval, 1, u.dval, 1, queue );
    beta = magma_snrm2( m, u.dval, 1, queue );
    magma_sscal( m, MAGMA_S_MAKE(1./beta, 0.0 ), u.dval, 1, queue );
    normr = beta;
    c = 1.0;
    s = 0.0;
    phibar = beta;

    // v = AT u (optionally preconditioned), then normalized
    CHECK( magma_s_spmv( c_one, AT, u, c_zero, v, queue ));
    if( precond_par->solver != Magma_NONE ){
        CHECK( magma_s_applyprecond_right( MagmaTrans, A, v, &zt, precond_par, queue ));
        CHECK( magma_s_applyprecond_left( MagmaTrans, A, zt, &v, precond_par, queue ));
    }
    alpha = magma_snrm2( n, v.dval, 1, queue );
    magma_sscal( n, MAGMA_S_MAKE(1./alpha, 0.0 ), v.dval, 1, queue );
    normar = alpha * beta;
    norma = 0;
    sumnormd2 = 0;

    //Chronometry
    real_Double_t tempo1, tempo2;
    tempo1 = magma_sync_wtime( queue );

    solver_par->numiter = 0;
    // start iteration
    do {
        solver_par->numiter++;

        // z = v, or the preconditioned v for square systems
        if( precond_par->solver == Magma_NONE || A.num_rows != A.num_cols ) {
            magma_scopy( n, v.dval, 1 , z.dval, 1, queue );
        } else {
            CHECK( magma_s_applyprecond_left( MagmaNoTrans, A, v, &zt, precond_par, queue ));
            CHECK( magma_s_applyprecond_right( MagmaNoTrans, A, zt, &z, precond_par, queue ));
        }

        // u = A z - alpha u, then normalized
        CHECK( magma_s_spmv( c_one, A, z, c_zero, zt, queue ));
        magma_sscal( m, MAGMA_S_MAKE(-alpha, 0.0 ), u.dval, 1, queue );
        magma_saxpy( m, c_one, zt.dval, 1, u.dval, 1, queue );
        solver_par->spmv_count++;
        beta = magma_snrm2( m, u.dval, 1, queue );
        magma_sscal( m, MAGMA_S_MAKE(1./beta, 0.0 ), u.dval, 1, queue );

        // norma = norm([norma alpha beta]);
        norma = sqrt(norma*norma + alpha*alpha + beta*beta );

        // plane rotation
        thet = -s * alpha;
        rhot = c * alpha;
        rho = sqrt( rhot * rhot + beta * beta );
        c = rhot / rho;
        s = - beta / rho;
        phi = c * phibar;
        phibar = s * phibar;

        // d = (z - thet * d) / rho;
        magma_sscal( n, MAGMA_S_MAKE(-thet, 0.0 ), d.dval, 1, queue );
        magma_saxpy( n, c_one, z.dval, 1, d.dval, 1, queue );
        magma_sscal( n, MAGMA_S_MAKE(1./rho, 0.0 ), d.dval, 1, queue );
        normd = magma_snrm2( n, d.dval, 1, queue );
        sumnormd2 = sumnormd2 + normd*normd;

        // convergence check
        res = normr;
        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        // check for convergence in A*x=b
        if ( res/nomb <= solver_par->rtol || res <= solver_par->atol ){
            info = MAGMA_SUCCESS;
            break;
        }
        // check for convergence in min{|b-A*x|}
        if ( A.num_rows != A.num_cols &&
             ( normar/(norma*normr) <= solver_par->rtol || normar <= solver_par->atol ) ){
            printf("%% warning: quit from minimization convergence check.\n");
            info = MAGMA_SUCCESS;
            break;
        }

        // x = x + phi d
        magma_saxpy( n, MAGMA_S_MAKE( phi, 0.0 ), d.dval, 1, x->dval, 1, queue );
        normr = fabs(s) * normr;

        // v = AT u - beta v (optionally preconditioned), then normalized
        CHECK( magma_s_spmv( c_one, AT, u, c_zero, vt, queue ));
        solver_par->spmv_count++;
        if( precond_par->solver != Magma_NONE ){
            CHECK( magma_s_applyprecond_right( MagmaTrans, A, vt, &zt, precond_par, queue ));
            CHECK( magma_s_applyprecond_left( MagmaTrans, A, zt, &vt, precond_par, queue ));
        }
        magma_sscal( n, MAGMA_S_MAKE(-beta, 0.0 ), v.dval, 1, queue );
        magma_saxpy( n, c_one, vt.dval, 1, v.dval, 1, queue );
        alpha = magma_snrm2( n, v.dval, 1, queue );
        magma_sscal( n, MAGMA_S_MAKE(1./alpha, 0.0 ), v.dval, 1, queue );
        normar = alpha * fabs(s*phi);
    }
    while ( solver_par->numiter+1 <= solver_par->maxiter );

    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    float residual;
    CHECK( magma_sresidualvec( A, b, *x, &r, &residual, queue));
    solver_par->iter_res = res;
    solver_par->final_res = residual;

    if ( solver_par->numiter < solver_par->maxiter && info == MAGMA_SUCCESS ) {
        info = MAGMA_SUCCESS;
    } else if ( solver_par->init_res > solver_par->final_res ) {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_SLOW_CONVERGENCE;
        if( solver_par->iter_res < solver_par->rtol*solver_par->init_res ||
            solver_par->iter_res < solver_par->atol ) {
            info = MAGMA_SUCCESS;
        }
    }
    else {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_DIVERGENCE;
    }

cleanup:
    magma_smfree(&r, queue );
    magma_smfree(&v, queue );
    magma_smfree(&z, queue );
    magma_smfree(&zt, queue );
    magma_smfree(&d, queue );
    magma_smfree(&vt, queue );
    magma_smfree(&q, queue );
    magma_smfree(&u, queue );
    magma_smfree(&w, queue );
    magma_smfree(&AT, queue );
    magma_smfree(&Ah1, queue );
    magma_smfree(&Ah2, queue );

    solver_par->info = info;
    return info;
}   /* magma_slsqr */
/* //////////////////////////////////////////////////////////////////////////// -- testing any solver */ int main( int argc, char** argv ) { magma_int_t info = 0; TESTING_INIT(); magma_queue_t queue=NULL; magma_queue_create( 0, &queue ); float one = MAGMA_S_MAKE(1.0, 0.0); float zero = MAGMA_S_MAKE(0.0, 0.0); magma_s_matrix A={Magma_CSR}, B_d={Magma_CSR}; magma_s_matrix x={Magma_CSR}, b={Magma_CSR}; int i=1; while( i < argc ) { if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) { // Laplace test i++; magma_int_t laplace_size = atoi( argv[i] ); CHECK( magma_sm_5stencil( laplace_size, &A, queue )); } else { // file-matrix test CHECK( magma_s_csr_mtx( &A, argv[i], queue )); } printf( "\n# matrix info: %d-by-%d with %d nonzeros\n\n", int(A.num_rows), int(A.num_cols), int(A.nnz) ); magma_int_t n = A.num_rows; CHECK( magma_smtransfer( A, &B_d, Magma_CPU, Magma_DEV, queue )); // vectors and initial guess CHECK( magma_svinit( &b, Magma_DEV, A.num_cols, 1, zero, queue )); CHECK( magma_svinit( &x, Magma_DEV, A.num_cols, 1, one, queue )); CHECK( magma_sprint_vector( b, 90, 10, queue )); CHECK( magma_sprint_matrix( A, queue )); printf("\n\n\n"); CHECK( magma_sprint_matrix( B_d, queue )); float res; res = magma_snrm2(n, b.dval, 1, queue ); printf("norm0: %f\n", res); CHECK( magma_s_spmv( one, B_d, x, zero, b, queue )); // b = A x CHECK( magma_sprint_vector( b, 0, 100, queue )); CHECK( magma_sprint_vector( b, b.num_rows-10, 10, queue )); res = magma_snrm2( n, b.dval, 1, queue ); printf("norm: %f\n", res); CHECK( magma_sresidual( B_d, x, b, &res, queue)); printf("res: %f\n", res); magma_smfree(&B_d, queue ); magma_smfree(&A, queue ); magma_smfree(&x, queue ); magma_smfree(&b, queue ); i++; } cleanup: magma_smfree(&A, queue ); magma_smfree(&B_d, queue ); magma_smfree(&x, queue ); magma_smfree(&b, queue ); magma_queue_destroy( queue ); magma_finalize(); return info; }
/**
    Preconditioned CG for a block of right-hand sides stored one after the
    other in b (num_vecs = b.num_rows / A.num_rows vectors of length
    A.num_rows).

    The scalar recurrences (alpha, beta, gamma, ...) are kept per vector in
    host arrays; convergence, the reported residuals and the positive
    definiteness check use the first vector only. The per-vector residuals
    are printed to stdout at the end of a completed run.

    @param[in]      A            system matrix
    @param[in]      b            stacked right-hand sides
    @param[in,out]  x            stacked initial guesses, updated in place
    @param[in,out]  solver_par   reads rtol, maxiter, verbose; receives solver,
                                 numiter, spmv_count, init_res, iter_res,
                                 final_res, runtime, info and (if verbose > 0)
                                 the res_vec / timing histories
    @param[in]      precond_par  preconditioner description
    @param[in]      queue        queue passed on to every MAGMA call

    @return MAGMA_SUCCESS, MAGMA_SLOW_CONVERGENCE, MAGMA_DIVERGENCE,
            MAGMA_NONSPD, or the error code of the first failing call.
*/
extern "C" magma_int_t
magma_sbpcg(
    magma_s_matrix A,
    magma_s_matrix b,
    magma_s_matrix *x,
    magma_s_solver_par *solver_par,
    magma_s_preconditioner *precond_par,
    magma_queue_t queue )
{
    magma_int_t info = 0;

    magma_int_t i, num_vecs = b.num_rows/A.num_rows;

    // prepare solver feedback
    solver_par->solver = Magma_PCG;
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;
    solver_par->info = MAGMA_SUCCESS;

    // local variables
    float c_zero = MAGMA_S_ZERO, c_one = MAGMA_S_ONE;

    magma_int_t dofs = A.num_rows;

    // GPU workspace
    magma_s_matrix r={Magma_CSR}, rt={Magma_CSR}, p={Magma_CSR}, q={Magma_CSR}, h={Magma_CSR};

    // solver variables (one entry per right-hand side)
    float *alpha = NULL, *beta = NULL;
    float *nom = NULL, *nom0 = NULL, *r0 = NULL, *gammaold = NULL, *gammanew = NULL,
          *den = NULL, *res = NULL, *residual = NULL;

    CHECK( magma_smalloc_cpu(&alpha, num_vecs));
    CHECK( magma_smalloc_cpu(&beta, num_vecs));
    // residual is allocated exactly once; a second allocation into the same
    // pointer would leak the first buffer
    CHECK( magma_smalloc_cpu(&residual, num_vecs));
    CHECK( magma_smalloc_cpu(&nom, num_vecs));
    CHECK( magma_smalloc_cpu(&nom0, num_vecs));
    CHECK( magma_smalloc_cpu(&r0, num_vecs));
    CHECK( magma_smalloc_cpu(&gammaold, num_vecs));
    CHECK( magma_smalloc_cpu(&gammanew, num_vecs));
    CHECK( magma_smalloc_cpu(&den, num_vecs));
    CHECK( magma_smalloc_cpu(&res, num_vecs));

    CHECK( magma_svinit( &r, Magma_DEV, dofs*num_vecs, 1, c_zero, queue ));
    CHECK( magma_svinit( &rt, Magma_DEV, dofs*num_vecs, 1, c_zero, queue ));
    CHECK( magma_svinit( &p, Magma_DEV, dofs*num_vecs, 1, c_zero, queue ));
    CHECK( magma_svinit( &q, Magma_DEV, dofs*num_vecs, 1, c_zero, queue ));
    CHECK( magma_svinit( &h, Magma_DEV, dofs*num_vecs, 1, c_zero, queue ));

    // solver setup
    CHECK( magma_sresidualvec( A, b, *x, &r, nom0, queue));

    // preconditioner
    CHECK( magma_s_applyprecond_left( MagmaNoTrans, A, r, &rt, precond_par, queue ));
    CHECK( magma_s_applyprecond_right( MagmaNoTrans, A, rt, &h, precond_par, queue ));

    magma_scopy( dofs*num_vecs, h.dval, 1, p.dval, 1, queue );                 // p = h

    for( i=0; i<num_vecs; i++) {
        nom[i] = MAGMA_S_REAL( magma_sdot( dofs, r(i), 1, h(i), 1, queue ) );
        nom0[i] = magma_snrm2( dofs, r(i), 1, queue );
    }

    CHECK( magma_s_spmv( c_one, A, p, c_zero, q, queue ));                     // q = A p

    for( i=0; i<num_vecs; i++)
        den[i] = MAGMA_S_REAL( magma_sdot( dofs, p(i), 1, q(i), 1, queue ) );  // den = p dot q

    solver_par->init_res = nom0[0];

    if ( (r0[0] = nom[0] * solver_par->rtol) < ATOLERANCE )
        r0[0] = ATOLERANCE;
    // check positive definite
    if (den[0] <= 0.0) {
        printf("Operator A is not postive definite. (Ar,r) = %f\n", den[0]);
        info = MAGMA_NONSPD;
        goto cleanup;
    }
    if ( nom[0] < r0[0] ) {
        // initial guess is already good enough
        solver_par->final_res = solver_par->init_res;
        solver_par->iter_res = solver_par->init_res;
        goto cleanup;
    }

    //Chronometry
    real_Double_t tempo1, tempo2;
    tempo1 = magma_sync_wtime( queue );
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = (real_Double_t)nom0[0];
        solver_par->timing[0] = 0.0;
    }

    solver_par->numiter = 0;
    solver_par->spmv_count = 0;
    // start iteration
    do
    {
        solver_par->numiter++;
        // preconditioner
        CHECK( magma_s_applyprecond_left( MagmaNoTrans, A, r, &rt, precond_par, queue ));
        CHECK( magma_s_applyprecond_right( MagmaNoTrans, A, rt, &h, precond_par, queue ));

        for( i=0; i<num_vecs; i++)
            gammanew[i] = MAGMA_S_REAL( magma_sdot( dofs, r(i), 1, h(i), 1, queue ) );  // gn = < r,h>

        if ( solver_par->numiter==1 ) {
            magma_scopy( dofs*num_vecs, h.dval, 1, p.dval, 1, queue );                  // p = h
        } else {
            for( i=0; i<num_vecs; i++) {
                beta[i] = MAGMA_S_MAKE(gammanew[i]/gammaold[i], 0.);    // beta = gn/go
                magma_sscal( dofs, beta[i], p(i), 1, queue );           // p = beta*p
                magma_saxpy( dofs, c_one, h(i), 1, p(i), 1, queue );    // p = p + h
            }
        }

        CHECK( magma_s_spmv( c_one, A, p, c_zero, q, queue ));          // q = A p
        solver_par->spmv_count++;

        for( i=0; i<num_vecs; i++) {
            den[i] = MAGMA_S_REAL(magma_sdot( dofs, p(i), 1, q(i), 1, queue) );  // den = p dot q

            alpha[i] = MAGMA_S_MAKE(gammanew[i]/den[i], 0.);
            magma_saxpy( dofs, alpha[i], p(i), 1, x->dval+dofs*i, 1, queue );    // x = x + alpha p
            magma_saxpy( dofs, -alpha[i], q(i), 1, r(i), 1, queue );             // r = r - alpha q
            gammaold[i] = gammanew[i];

            res[i] = magma_snrm2( dofs, r(i), 1, queue );
        }

        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res[0];
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        // only the first right-hand side decides about termination
        if ( res[0]/nom0[0] < solver_par->rtol ) {
            break;
        }
    }
    while ( solver_par->numiter+1 <= solver_par->maxiter );

    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    CHECK( magma_sresidual( A, b, *x, residual, queue ));
    solver_par->iter_res = res[0];
    solver_par->final_res = residual[0];

    if ( solver_par->numiter < solver_par->maxiter ) {
        info = MAGMA_SUCCESS;
    } else if ( solver_par->init_res > solver_par->final_res ) {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res[0];
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_SLOW_CONVERGENCE;
        if( solver_par->iter_res < solver_par->rtol*solver_par->init_res ){
            info = MAGMA_SUCCESS;
        }
    }
    else {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res[0];
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_DIVERGENCE;
    }

    // iteration residuals, then recomputed residuals, one value per vector
    for( i=0; i<num_vecs; i++) {
        printf("%.4e ",res[i]);
    }
    printf("\n");
    for( i=0; i<num_vecs; i++) {
        printf("%.4e ",residual[i]);
    }
    printf("\n");

cleanup:
    magma_smfree(&r, queue );
    magma_smfree(&rt, queue );
    magma_smfree(&p, queue );
    magma_smfree(&q, queue );
    magma_smfree(&h, queue );

    magma_free_cpu(alpha);
    magma_free_cpu(beta);
    magma_free_cpu(nom);
    magma_free_cpu(nom0);
    magma_free_cpu(r0);
    magma_free_cpu(gammaold);
    magma_free_cpu(gammanew);
    magma_free_cpu(den);
    magma_free_cpu(res);
    magma_free_cpu(residual);   // was never released before

    solver_par->info = info;
    return info;
}   /* magma_sbpcg */
/**
    BiCGSTAB variant that keeps its scalar parameters in a small device array
    (skp) and performs the vector updates through the merged kernels
    magma_sbicgmerge1/2/4, magma_smdotc and magma_sbicgmerge_xrbeta.

    All six work vectors (rr, r, p, v, s, t) are slices of one device
    allocation q; freeing q releases all of them.

    @param[in]      A           system matrix
    @param[in]      b           right-hand side
    @param[in,out]  x           initial guess on entry, updated in place
    @param[in,out]  solver_par  reads rtol, atol, maxiter, verbose; receives
                                solver, numiter, spmv_count, init_res,
                                iter_res, final_res, runtime, info and (if
                                verbose > 0) the res_vec / timing histories
    @param[in]      queue       queue passed on to every MAGMA call

    @return MAGMA_SUCCESS, MAGMA_SLOW_CONVERGENCE, MAGMA_DIVERGENCE, or the
            error code of the first failing call.
*/
extern "C" magma_int_t
magma_sbicgstab_merge3(
    magma_s_matrix A,
    magma_s_matrix b,
    magma_s_matrix *x,
    magma_s_solver_par *solver_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;

    // prepare solver feedback
    solver_par->solver = Magma_BICGSTABMERGE;
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;

    // solver variables
    float alpha, beta, omega, rho_old, rho_new, *skp_h = NULL;
    float nom, nom0, betanom, nomb;

    // some useful variables
    float c_zero = MAGMA_S_ZERO, c_one = MAGMA_S_ONE;

    magma_int_t dofs = A.num_rows;

    // workspace
    magma_s_matrix q={Magma_CSR}, r={Magma_CSR}, rr={Magma_CSR}, p={Magma_CSR},
                   v={Magma_CSR}, s={Magma_CSR}, t={Magma_CSR};
    float *d1 = NULL, *d2 = NULL, *skp = NULL;

    CHECK( magma_smalloc( &d1, dofs*(2) ));
    CHECK( magma_smalloc( &d2, dofs*(2) ));
    // array for the parameters
    CHECK( magma_smalloc( &skp, 8 ));
    // skp = [alpha|beta|omega|rho_old|rho|nom|tmp1|tmp2]
    CHECK( magma_svinit( &q, Magma_DEV, dofs*6, 1, c_zero, queue ));

    // q = rr|r|p|v|s|t
    rr.memory_location = Magma_DEV; rr.dval = NULL; rr.num_rows = rr.nnz = dofs; rr.num_cols = 1; rr.storage_type = Magma_DENSE;
    r.memory_location = Magma_DEV; r.dval = NULL; r.num_rows = r.nnz = dofs; r.num_cols = 1; r.storage_type = Magma_DENSE;
    p.memory_location = Magma_DEV; p.dval = NULL; p.num_rows = p.nnz = dofs; p.num_cols = 1; p.storage_type = Magma_DENSE;
    v.memory_location = Magma_DEV; v.dval = NULL; v.num_rows = v.nnz = dofs; v.num_cols = 1; v.storage_type = Magma_DENSE;
    s.memory_location = Magma_DEV; s.dval = NULL; s.num_rows = s.nnz = dofs; s.num_cols = 1; s.storage_type = Magma_DENSE;
    t.memory_location = Magma_DEV; t.dval = NULL; t.num_rows = t.nnz = dofs; t.num_cols = 1; t.storage_type = Magma_DENSE;

    rr.dval = q(0);
    r.dval = q(1);
    p.dval = q(2);
    v.dval = q(3);
    s.dval = q(4);
    t.dval = q(5);

    // solver setup
    CHECK( magma_sresidualvec( A, b, *x, &r, &nom0, queue));
    magma_scopy( dofs, r.dval, 1, q(0), 1, queue );                     // rr = r
    magma_scopy( dofs, r.dval, 1, q(1), 1, queue );                     // q = r
    betanom = nom0;
    nom = nom0*nom0;
    rho_new = magma_sdot( dofs, r.dval, 1, r.dval, 1, queue );          // rho=<rr,r>
    rho_old = omega = alpha = MAGMA_S_MAKE( 1.0, 0. );
    beta = rho_new;
    solver_par->init_res = nom0;
    // array on host for the parameters
    CHECK( magma_smalloc_cpu( &skp_h, 8 ));

    // |b| is computed once; the zero guard must stay in effect for every
    // later division by nomb
    nomb = magma_snrm2( dofs, b.dval, 1, queue );
    if ( nomb == 0.0 ){
        nomb=1.0;
    }

    solver_par->final_res = solver_par->init_res;
    solver_par->iter_res = solver_par->init_res;
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = nom0;
        solver_par->timing[0] = 0.0;
    }

    skp_h[0]=alpha;
    skp_h[1]=beta;
    skp_h[2]=omega;
    skp_h[3]=rho_old;
    skp_h[4]=rho_new;
    skp_h[5]=MAGMA_S_MAKE(nom, 0.0);
    // tmp1/tmp2 slots: give them a defined value before the upload
    skp_h[6]=c_zero;
    skp_h[7]=c_zero;
    magma_ssetvector( 8, skp_h, 1, skp, 1, queue );

    CHECK( magma_s_spmv( c_one, A, r, c_zero, v, queue ));              // z = A r

    if( nom0 < solver_par->atol ||
        nom0/nomb < solver_par->rtol ){
        // initial guess is already good enough
        info = MAGMA_SUCCESS;
        goto cleanup;
    }

    //Chronometry
    real_Double_t tempo1, tempo2;
    tempo1 = magma_sync_wtime( queue );

    solver_par->numiter = 0;
    solver_par->spmv_count = 0;
    // start iteration
    do
    {
        solver_par->numiter++;

        // computes p=r+beta*(p-omega*v)
        CHECK( magma_sbicgmerge1( dofs, skp, v.dval, r.dval, p.dval, queue ));

        CHECK( magma_s_spmv( c_one, A, p, c_zero, v, queue ));          // v = Ap
        solver_par->spmv_count++;
        CHECK( magma_smdotc( dofs, 1, q.dval, v.dval, d1, d2, skp, queue ));
        CHECK( magma_sbicgmerge4( 1, skp, queue ));
        CHECK( magma_sbicgmerge2( dofs, skp, r.dval, v.dval, s.dval, queue )); // s=r-alpha*v

        CHECK( magma_s_spmv( c_one, A, s, c_zero, t, queue ));          // t=As
        solver_par->spmv_count++;
        CHECK( magma_smdotc( dofs, 2, q.dval+4*dofs, t.dval, d1, d2, skp+6, queue ));
        CHECK( magma_sbicgmerge4( 2, skp, queue ));

        CHECK( magma_sbicgmerge_xrbeta( dofs, d1, d2, q.dval, r.dval, p.dval,
                                        s.dval, t.dval, x->dval, skp, queue ));

        // check stopping criterion
        // blocking copy: the value is consumed on the host in the next statement
        magma_sgetvector( 1 , skp+5, 1, skp_h+5, 1, queue );
        betanom = sqrt(MAGMA_S_REAL(skp_h[5]));

        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        if ( betanom < solver_par->atol || betanom/nomb < solver_par->rtol ) {
            break;
        }
    }
    while ( solver_par->numiter+1 <= solver_par->maxiter );

    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    float residual;
    CHECK( magma_sresidualvec( A, b, *x, &r, &residual, queue));
    solver_par->iter_res = betanom;
    solver_par->final_res = residual;

    if ( solver_par->numiter < solver_par->maxiter ) {
        info = MAGMA_SUCCESS;
    } else if ( solver_par->init_res > solver_par->final_res ) {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_SLOW_CONVERGENCE;
        if( solver_par->iter_res < solver_par->atol ||
            solver_par->iter_res/solver_par->init_res < solver_par->rtol ){
            info = MAGMA_SUCCESS;
        }
    }
    else {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_DIVERGENCE;
    }

cleanup:
    magma_smfree(&q, queue );  // frees all vectors
    magma_free(d1);
    magma_free(d2);
    magma_free( skp );
    magma_free_cpu( skp_h );

    solver_par->info = info;
    return info;
}   /* sbicgstab_merge */
extern "C" magma_int_t magma_sidr_strms( magma_s_matrix A, magma_s_matrix b, magma_s_matrix *x, magma_s_solver_par *solver_par, magma_queue_t queue ) { magma_int_t info = MAGMA_NOTCONVERGED; // prepare solver feedback solver_par->solver = Magma_IDRMERGE; solver_par->numiter = 0; solver_par->spmv_count = 0; solver_par->init_res = 0.0; solver_par->final_res = 0.0; solver_par->iter_res = 0.0; solver_par->runtime = 0.0; // constants const float c_zero = MAGMA_S_ZERO; const float c_one = MAGMA_S_ONE; const float c_n_one = MAGMA_S_NEG_ONE; // internal user options const magma_int_t smoothing = 1; // 0 = disable, 1 = enable const float angle = 0.7; // [0-1] // local variables magma_int_t iseed[4] = {0, 0, 0, 1}; magma_int_t dof; magma_int_t s; magma_int_t distr; magma_int_t k, i, sk; magma_int_t innerflag; magma_int_t ldd; magma_int_t q; float residual; float nrm; float nrmb; float nrmr; float nrmt; float rho; float om; float gamma; // matrices and vectors magma_s_matrix dxs = {Magma_CSR}; magma_s_matrix dr = {Magma_CSR}, drs = {Magma_CSR}; magma_s_matrix dP = {Magma_CSR}, dP1 = {Magma_CSR}; magma_s_matrix dG = {Magma_CSR}, dGcol = {Magma_CSR}; magma_s_matrix dU = {Magma_CSR}; magma_s_matrix dM = {Magma_CSR}; magma_s_matrix df = {Magma_CSR}; magma_s_matrix dt = {Magma_CSR}, dtt = {Magma_CSR}; magma_s_matrix dc = {Magma_CSR}; magma_s_matrix dv = {Magma_CSR}; magma_s_matrix dskp = {Magma_CSR}; magma_s_matrix dalpha = {Magma_CSR}; magma_s_matrix dbeta = {Magma_CSR}; float *hMdiag = NULL; float *hskp = NULL; float *halpha = NULL; float *hbeta = NULL; float *d1 = NULL, *d2 = NULL; // queue variables const magma_int_t nqueues = 3; // number of queues magma_queue_t queues[nqueues]; // chronometry real_Double_t tempo1, tempo2; // create additional queues queues[0] = queue; for ( q = 1; q < nqueues; q++ ) { magma_queue_create( queue->device(), &(queues[q]) ); } // initial s space // TODO: add option for 's' (shadow space number) // Hack: uses '--restart' option as the shadow space 
number. // This is not a good idea because the default value of restart option is used to detect // if the user provided a custom restart. This means that if the default restart value // is changed then the code will think it was the user (unless the default value is // also updated in the 'if' statement below. s = 1; if ( solver_par->restart != 50 ) { if ( solver_par->restart > A.num_cols ) { s = A.num_cols; } else { s = solver_par->restart; } } solver_par->restart = s; // set max iterations solver_par->maxiter = min( 2 * A.num_cols, solver_par->maxiter ); // check if matrix A is square if ( A.num_rows != A.num_cols ) { //printf("Matrix A is not square.\n"); info = MAGMA_ERR_NOT_SUPPORTED; goto cleanup; } // |b| nrmb = magma_snrm2( b.num_rows, b.dval, 1, queue ); if ( nrmb == 0.0 ) { magma_sscal( x->num_rows, MAGMA_S_ZERO, x->dval, 1, queue ); info = MAGMA_SUCCESS; goto cleanup; } // t = 0 // make t twice as large to contain both, dt and dr ldd = magma_roundup( b.num_rows, 32 ); CHECK( magma_svinit( &dt, Magma_DEV, ldd, 2, c_zero, queue )); dt.num_rows = b.num_rows; dt.num_cols = 1; dt.nnz = dt.num_rows; // redirect the dr.dval to the second part of dt CHECK( magma_svinit( &dr, Magma_DEV, b.num_rows, 1, c_zero, queue )); magma_free( dr.dval ); dr.dval = dt.dval + ldd; // r = b - A x CHECK( magma_sresidualvec( A, b, *x, &dr, &nrmr, queue )); // |r| solver_par->init_res = nrmr; solver_par->final_res = solver_par->init_res; solver_par->iter_res = solver_par->init_res; if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = (real_Double_t)nrmr; } // check if initial is guess good enough if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { info = MAGMA_SUCCESS; goto cleanup; } // P = randn(n, s) // P = ortho(P) //--------------------------------------- // P = 0.0 CHECK( magma_svinit( &dP, Magma_CPU, A.num_cols, s, c_zero, queue )); // P = randn(n, s) distr = 3; // 1 = unif (0,1), 2 = unif (-1,1), 3 = normal (0,1) dof = dP.num_rows * dP.num_cols; 
lapackf77_slarnv( &distr, iseed, &dof, dP.val ); // transfer P to device CHECK( magma_smtransfer( dP, &dP1, Magma_CPU, Magma_DEV, queue )); magma_smfree( &dP, queue ); // P = ortho(P1) if ( dP1.num_cols > 1 ) { // P = magma_sqr(P1), QR factorization CHECK( magma_sqr( dP1.num_rows, dP1.num_cols, dP1, dP1.ld, &dP, NULL, queue )); } else { // P = P1 / |P1| nrm = magma_snrm2( dof, dP1.dval, 1, queue ); nrm = 1.0 / nrm; magma_sscal( dof, nrm, dP1.dval, 1, queue ); CHECK( magma_smtransfer( dP1, &dP, Magma_DEV, Magma_DEV, queue )); } magma_smfree( &dP1, queue ); //--------------------------------------- // allocate memory for the scalar products CHECK( magma_smalloc_pinned( &hskp, 5 )); CHECK( magma_svinit( &dskp, Magma_DEV, 4, 1, c_zero, queue )); CHECK( magma_smalloc_pinned( &halpha, s )); CHECK( magma_svinit( &dalpha, Magma_DEV, s, 1, c_zero, queue )); CHECK( magma_smalloc_pinned( &hbeta, s )); CHECK( magma_svinit( &dbeta, Magma_DEV, s, 1, c_zero, queue )); // workspace for merged dot product CHECK( magma_smalloc( &d1, max(2, s) * b.num_rows )); CHECK( magma_smalloc( &d2, max(2, s) * b.num_rows )); // smoothing enabled if ( smoothing > 0 ) { // set smoothing solution vector CHECK( magma_smtransfer( *x, &dxs, Magma_DEV, Magma_DEV, queue )); // tt = 0 // make tt twice as large to contain both, dtt and drs ldd = magma_roundup( b.num_rows, 32 ); CHECK( magma_svinit( &dtt, Magma_DEV, ldd, 2, c_zero, queue )); dtt.num_rows = dr.num_rows; dtt.num_cols = 1; dtt.nnz = dtt.num_rows; // redirect the drs.dval to the second part of dtt CHECK( magma_svinit( &drs, Magma_DEV, dr.num_rows, 1, c_zero, queue )); magma_free( drs.dval ); drs.dval = dtt.dval + ldd; // set smoothing residual vector magma_scopyvector( dr.num_rows, dr.dval, 1, drs.dval, 1, queue ); } // G(n,s) = 0 if ( s > 1 ) { ldd = magma_roundup( A.num_rows, 32 ); CHECK( magma_svinit( &dG, Magma_DEV, ldd, s, c_zero, queue )); dG.num_rows = A.num_rows; } else { CHECK( magma_svinit( &dG, Magma_DEV, A.num_rows, s, c_zero, 
queue )); } // dGcol represents a single column of dG, array pointer is set inside loop CHECK( magma_svinit( &dGcol, Magma_DEV, dG.num_rows, 1, c_zero, queue )); magma_free( dGcol.dval ); // U(n,s) = 0 if ( s > 1 ) { ldd = magma_roundup( A.num_cols, 32 ); CHECK( magma_svinit( &dU, Magma_DEV, ldd, s, c_zero, queue )); dU.num_rows = A.num_cols; } else { CHECK( magma_svinit( &dU, Magma_DEV, A.num_cols, s, c_zero, queue )); } // M(s,s) = I CHECK( magma_svinit( &dM, Magma_DEV, s, s, c_zero, queue )); CHECK( magma_smalloc_pinned( &hMdiag, s )); magmablas_slaset( MagmaFull, dM.num_rows, dM.num_cols, c_zero, c_one, dM.dval, dM.ld, queue ); // f = 0 CHECK( magma_svinit( &df, Magma_DEV, dP.num_cols, 1, c_zero, queue )); // c = 0 CHECK( magma_svinit( &dc, Magma_DEV, dM.num_cols, 1, c_zero, queue )); // v = r CHECK( magma_smtransfer( dr, &dv, Magma_DEV, Magma_DEV, queue )); //--------------START TIME--------------- // chronometry tempo1 = magma_sync_wtime( queue ); if ( solver_par->verbose > 0 ) { solver_par->timing[0] = 0.0; } cudaProfilerStart(); om = MAGMA_S_ONE; gamma = MAGMA_S_ZERO; innerflag = 0; // new RHS for small systems // f = P' r // Q1 magma_sgemvmdot_shfl( dP.num_rows, dP.num_cols, dP.dval, dr.dval, d1, d2, df.dval, queues[1] ); // skp[4] = f(k) // Q1 magma_sgetvector_async( 1, df.dval, 1, &hskp[4], 1, queues[1] ); // c(k:s) = f(k:s) // Q1 magma_scopyvector_async( s, df.dval, 1, dc.dval, 1, queues[1] ); // c(k:s) = M(k:s,k:s) \ f(k:s) // Q1 magma_strsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, s, dM.dval, dM.ld, dc.dval, 1, queues[1] ); // start iteration do { solver_par->numiter++; // shadow space loop for ( k = 0; k < s; ++k ) { sk = s - k; dGcol.dval = dG.dval + k * dG.ld; // v = r - G(:,k:s) c(k:s) // Q1 magmablas_sgemv( MagmaNoTrans, dG.num_rows, sk, c_n_one, dGcol.dval, dG.ld, &dc.dval[k], 1, c_one, dv.dval, 1, queues[1] ); // U(:,k) = om * v + U(:,k:s) c(k:s) // Q1 magmablas_sgemv( MagmaNoTrans, dU.num_rows, sk, c_one, &dU.dval[k*dU.ld], dU.ld, &dc.dval[k], 
1, om, dv.dval, 1, queues[1] ); // G(:,k) = A U(:,k) // Q1 CHECK( magma_s_spmv( c_one, A, dv, c_zero, dGcol, queues[1] )); solver_par->spmv_count++; // bi-orthogonalize the new basis vectors for ( i = 0; i < k; ++i ) { // alpha = P(:,i)' G(:,k) // Q1 halpha[i] = magma_sdot( dP.num_rows, &dP.dval[i*dP.ld], 1, dGcol.dval, 1, queues[1] ); // implicit sync Q1 --> alpha = P(:,i)' G(:,k) // alpha = alpha / M(i,i) halpha[i] = halpha[i] / hMdiag[i]; // G(:,k) = G(:,k) - alpha * G(:,i) // Q1 magma_saxpy( dG.num_rows, -halpha[i], &dG.dval[i*dG.ld], 1, dGcol.dval, 1, queues[1] ); } // sync Q1 --> G(:,k) = G(:,k) - alpha * G(:,i), skp[4] = f(k) magma_queue_sync( queues[1] ); // new column of M = P'G, first k-1 entries are zero // M(k:s,k) = P(:,k:s)' G(:,k) // Q2 magma_sgemvmdot_shfl( dP.num_rows, sk, &dP.dval[k*dP.ld], dGcol.dval, d1, d2, &dM.dval[k*dM.ld+k], queues[2] ); // non-first s iteration if ( k > 0 ) { // alpha = dalpha // Q0 magma_ssetvector_async( k, halpha, 1, dalpha.dval, 1, queues[0] ); // U update outside of loop using GEMV // U(:,k) = U(:,k) - U(:,1:k) * alpha(1:k) // Q0 magmablas_sgemv( MagmaNoTrans, dU.num_rows, k, c_n_one, dU.dval, dU.ld, dalpha.dval, 1, c_one, dv.dval, 1, queues[0] ); } // Mdiag(k) = M(k,k) // Q2 magma_sgetvector( 1, &dM.dval[k*dM.ld+k], 1, &hMdiag[k], 1, queues[2] ); // implicit sync Q2 --> Mdiag(k) = M(k,k) // U(:,k) = v // Q0 magma_scopyvector_async( dU.num_rows, dv.dval, 1, &dU.dval[k*dU.ld], 1, queues[0] ); // check M(k,k) == 0 if ( MAGMA_S_EQUAL(hMdiag[k], MAGMA_S_ZERO) ) { innerflag = 1; info = MAGMA_DIVERGENCE; break; } // beta = f(k) / M(k,k) hbeta[k] = hskp[4] / hMdiag[k]; // check for nan if ( magma_s_isnan( hbeta[k] ) || magma_s_isinf( hbeta[k] )) { innerflag = 1; info = MAGMA_DIVERGENCE; break; } // r = r - beta * G(:,k) // Q2 magma_saxpy( dr.num_rows, -hbeta[k], dGcol.dval, 1, dr.dval, 1, queues[2] ); // non-last s iteration if ( (k + 1) < s ) { // f(k+1:s) = f(k+1:s) - beta * M(k+1:s,k) // Q1 magma_saxpy( sk-1, -hbeta[k], 
&dM.dval[k*dM.ld+(k+1)], 1, &df.dval[k+1], 1, queues[1] ); // c(k+1:s) = f(k+1:s) // Q1 magma_scopyvector_async( sk-1, &df.dval[k+1], 1, &dc.dval[k+1], 1, queues[1] ); // c(k+1:s) = M(k+1:s,k+1:s) \ f(k+1:s) // Q1 magma_strsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, sk-1, &dM.dval[(k+1)*dM.ld+(k+1)], dM.ld, &dc.dval[k+1], 1, queues[1] ); // skp[4] = f(k+1) // Q1 magma_sgetvector_async( 1, &df.dval[k+1], 1, &hskp[4], 1, queues[1] ); } // smoothing disabled if ( smoothing <= 0 ) { // |r| // Q2 nrmr = magma_snrm2( dr.num_rows, dr.dval, 1, queues[2] ); // implicit sync Q2 --> |r| // smoothing enabled } else { // smoothing operation //--------------------------------------- // t = rs - r // Q2 magma_sidr_smoothing_1( drs.num_rows, drs.num_cols, drs.dval, dr.dval, dtt.dval, queues[2] ); // x = x + beta * U(:,k) // Q0 magma_saxpy( x->num_rows, hbeta[k], &dU.dval[k*dU.ld], 1, x->dval, 1, queues[0] ); // t't // t'rs // Q2 CHECK( magma_sgemvmdot_shfl( dt.ld, 2, dtt.dval, dtt.dval, d1, d2, &dskp.dval[2], queues[2] )); // skp[2-3] = dskp[2-3] // Q2 magma_sgetvector( 2, &dskp.dval[2], 1, &hskp[2], 1, queues[2] ); // implicit sync Q2 --> skp = dskp // gamma = (t' * rs) / (t' * t) gamma = hskp[3] / hskp[2]; // rs = rs - gamma * t // Q1 magma_saxpy( drs.num_rows, -gamma, dtt.dval, 1, drs.dval, 1, queues[1] ); // xs = xs - gamma * (xs - x) // Q0 magma_sidr_smoothing_2( dxs.num_rows, dxs.num_cols, -gamma, x->dval, dxs.dval, queues[0] ); // |rs| // Q1 nrmr = magma_snrm2( drs.num_rows, drs.dval, 1, queues[1] ); // implicit sync Q0 --> |r| //--------------------------------------- } // v = r // Q1 magma_scopyvector_async( dr.num_rows, dr.dval, 1, dv.dval, 1, queues[1] ); // last s iteration if ( (k + 1) == s ) { // t = A r // Q2 CHECK( magma_s_spmv( c_one, A, dr, c_zero, dt, queues[2] )); solver_par->spmv_count++; // t't // t'r // Q2 CHECK( magma_sgemvmdot_shfl( dt.ld, 2, dt.dval, dt.dval, d1, d2, dskp.dval, queues[2] )); } // store current timing and residual if ( solver_par->verbose 
> 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter) % solver_par->verbose == 0 ) { solver_par->res_vec[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)nrmr; solver_par->timing[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)tempo2 - tempo1; } } // check convergence or iteration limit if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { s = k + 1; // for the x-update outside the loop innerflag = 2; info = MAGMA_SUCCESS; break; } } // smoothing disabled if ( smoothing <= 0 && innerflag != 1 ) { // dbeta(1:s) = beta(1:s) // Q0 magma_ssetvector_async( s, hbeta, 1, dbeta.dval, 1, queues[0] ); // x = x + U(:,1:s) * beta(1:s) // Q0 magmablas_sgemv( MagmaNoTrans, dU.num_rows, s, c_one, dU.dval, dU.ld, dbeta.dval, 1, c_one, x->dval, 1, queues[0] ); } // check convergence or iteration limit or invalid result of inner loop if ( innerflag > 0 ) { break; } // computation of a new omega //--------------------------------------- // skp[0-2] = dskp[0-2] // Q2 magma_sgetvector( 2, dskp.dval, 1, hskp, 1, queues[2] ); // implicit sync Q2 --> skp = dskp // |t| nrmt = magma_ssqrt( MAGMA_S_REAL(hskp[0]) ); // rho = abs((t' * r) / (|t| * |r|)) rho = MAGMA_D_ABS( MAGMA_S_REAL(hskp[1]) / (nrmt * nrmr) ); // om = (t' * r) / (|t| * |t|) om = hskp[1] / hskp[0]; if ( rho < angle ) { om = (om * angle) / rho; } //--------------------------------------- if ( MAGMA_S_EQUAL(om, MAGMA_S_ZERO) ) { info = MAGMA_DIVERGENCE; break; } // sync Q1 --> v = r magma_queue_sync( queues[1] ); // r = r - om * t // Q2 magma_saxpy( dr.num_rows, -om, dt.dval, 1, dr.dval, 1, queues[2] ); // x = x + om * v // Q0 magma_saxpy( x->num_rows, om, dv.dval, 1, x->dval, 1, queues[0] ); // smoothing disabled if ( smoothing <= 0 ) { // |r| // Q2 nrmr = magma_snrm2( dr.num_rows, dr.dval, 1, queues[2] ); // implicit sync Q2 --> |r| // v = r // Q0 magma_scopyvector_async( dr.num_rows, dr.dval, 1, dv.dval, 1, queues[0] ); // new RHS for small systems // f = P' r // Q1 
magma_sgemvmdot_shfl( dP.num_rows, dP.num_cols, dP.dval, dr.dval, d1, d2, df.dval, queues[1] ); // skp[4] = f(k) // Q1 magma_sgetvector_async( 1, df.dval, 1, &hskp[4], 1, queues[1] ); // c(k:s) = f(k:s) // Q1 magma_scopyvector_async( s, df.dval, 1, dc.dval, 1, queues[1] ); // c(k:s) = M(k:s,k:s) \ f(k:s) // Q1 magma_strsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, s, dM.dval, dM.ld, dc.dval, 1, queues[1] ); // smoothing enabled } else { // smoothing operation //--------------------------------------- // t = rs - r // Q2 magma_sidr_smoothing_1( drs.num_rows, drs.num_cols, drs.dval, dr.dval, dtt.dval, queues[2] ); // t't // t'rs // Q2 CHECK( magma_sgemvmdot_shfl( dt.ld, 2, dtt.dval, dtt.dval, d1, d2, &dskp.dval[2], queues[2] )); // skp[2-3] = dskp[2-3] // Q2 magma_sgetvector( 2, &dskp.dval[2], 1, &hskp[2], 1, queues[2] ); // implicit sync Q2 --> skp = dskp // gamma = (t' * rs) / (t' * t) gamma = hskp[3] / hskp[2]; // rs = rs - gamma * (rs - r) // Q2 magma_saxpy( drs.num_rows, -gamma, dtt.dval, 1, drs.dval, 1, queues[2] ); // xs = xs - gamma * (xs - x) // Q0 magma_sidr_smoothing_2( dxs.num_rows, dxs.num_cols, -gamma, x->dval, dxs.dval, queues[0] ); // v = r // Q0 magma_scopyvector_async( dr.num_rows, dr.dval, 1, dv.dval, 1, queues[0] ); // new RHS for small systems // f = P' r // Q1 magma_sgemvmdot_shfl( dP.num_rows, dP.num_cols, dP.dval, dr.dval, d1, d2, df.dval, queues[1] ); // skp[4] = f(k) // Q1 magma_sgetvector_async( 1, df.dval, 1, &hskp[4], 1, queues[1] ); // c(k:s) = f(k:s) // Q1 magma_scopyvector_async( s, df.dval, 1, dc.dval, 1, queues[1] ); // |rs| // Q2 nrmr = magma_snrm2( drs.num_rows, drs.dval, 1, queues[2] ); // implicit sync Q2 --> |r| // c(k:s) = M(k:s,k:s) \ f(k:s) // Q1 magma_strsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, s, dM.dval, dM.ld, dc.dval, 1, queues[1] ); //--------------------------------------- } // store current timing and residual if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); magma_queue_sync( queue ); if ( 
(solver_par->numiter) % solver_par->verbose == 0 ) { solver_par->res_vec[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)nrmr; solver_par->timing[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)tempo2 - tempo1; } } // check convergence or iteration limit if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { info = MAGMA_SUCCESS; break; } // sync Q0 --> v = r magma_queue_sync( queues[0] ); } while ( solver_par->numiter + 1 <= solver_par->maxiter ); // sync all queues for ( q = 0; q < nqueues; q++ ) { magma_queue_sync( queues[q] ); } // smoothing enabled if ( smoothing > 0 ) { // x = xs magma_scopyvector_async( x->num_rows, dxs.dval, 1, x->dval, 1, queue ); // r = rs magma_scopyvector_async( dr.num_rows, drs.dval, 1, dr.dval, 1, queue ); } cudaProfilerStop(); // get last iteration timing tempo2 = magma_sync_wtime( queue ); magma_queue_sync( queue ); solver_par->runtime = (real_Double_t)tempo2 - tempo1; //--------------STOP TIME---------------- // get final stats solver_par->iter_res = nrmr; CHECK( magma_sresidualvec( A, b, *x, &dr, &residual, queue )); solver_par->final_res = residual; // set solver conclusion if ( info != MAGMA_SUCCESS && info != MAGMA_DIVERGENCE ) { if ( solver_par->init_res > solver_par->final_res ) { info = MAGMA_SLOW_CONVERGENCE; } } cleanup: // free resources // sync all queues, destory additional queues magma_queue_sync( queues[0] ); for ( q = 1; q < nqueues; q++ ) { magma_queue_sync( queues[q] ); magma_queue_destroy( queues[q] ); } // smoothing enabled if ( smoothing > 0 ) { drs.dval = NULL; // needed because its pointer is redirected to dtt magma_smfree( &dxs, queue ); magma_smfree( &drs, queue ); magma_smfree( &dtt, queue ); } dr.dval = NULL; // needed because its pointer is redirected to dt dGcol.dval = NULL; // needed because its pointer is redirected to dG magma_smfree( &dr, queue ); magma_smfree( &dP, queue ); magma_smfree( &dP1, queue ); magma_smfree( &dG, queue ); magma_smfree( &dGcol, queue ); 
magma_smfree( &dU, queue ); magma_smfree( &dM, queue ); magma_smfree( &df, queue ); magma_smfree( &dt, queue ); magma_smfree( &dc, queue ); magma_smfree( &dv, queue ); magma_smfree( &dskp, queue ); magma_smfree( &dalpha, queue ); magma_smfree( &dbeta, queue ); magma_free_pinned( hMdiag ); magma_free_pinned( hskp ); magma_free_pinned( halpha ); magma_free_pinned( hbeta ); magma_free( d1 ); magma_free( d2 ); solver_par->info = info; return info; /* magma_sidr_strms */ }
/* //////////////////////////////////////////////////////////////////////////// -- testing any solver */ int main( int argc, char** argv ) { magma_int_t info = 0; TESTING_INIT(); magma_sopts zopts; magma_queue_t queue=NULL; magma_queue_create( /*devices[ opts->device ],*/ &queue ); float one = MAGMA_S_MAKE(1.0, 0.0); float zero = MAGMA_S_MAKE(0.0, 0.0); magma_s_matrix A={Magma_CSR}, B={Magma_CSR}, B_d={Magma_CSR}; magma_s_matrix x={Magma_CSR}, b={Magma_CSR}; int i=1; CHECK( magma_sparse_opts( argc, argv, &zopts, &i, queue )); B.blocksize = zopts.blocksize; B.alignment = zopts.alignment; if ( zopts.solver_par.solver != Magma_PCG && zopts.solver_par.solver != Magma_PGMRES && zopts.solver_par.solver != Magma_PBICGSTAB && zopts.solver_par.solver != Magma_ITERREF && zopts.solver_par.solver != Magma_LOBPCG ) zopts.precond_par.solver = Magma_NONE; CHECK( magma_ssolverinfo_init( &zopts.solver_par, &zopts.precond_par, queue )); while( i < argc ) { if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) { // Laplace test i++; magma_int_t laplace_size = atoi( argv[i] ); CHECK( magma_sm_5stencil( laplace_size, &A, queue )); } else { // file-matrix test CHECK( magma_s_csr_mtx( &A, argv[i], queue )); } printf( "\n# matrix info: %d-by-%d with %d nonzeros\n\n", (int) A.num_rows,(int) A.num_cols,(int) A.nnz ); // for the eigensolver case zopts.solver_par.ev_length = A.num_rows; CHECK( magma_seigensolverinfo_init( &zopts.solver_par, queue )); // scale matrix CHECK( magma_smscale( &A, zopts.scaling, queue )); CHECK( magma_smconvert( A, &B, Magma_CSR, zopts.output_format, queue )); CHECK( magma_smtransfer( B, &B_d, Magma_CPU, Magma_DEV, queue )); // vectors and initial guess CHECK( magma_svinit( &b, Magma_DEV, A.num_cols, 1, one, queue )); //magma_svinit( &x, Magma_DEV, A.num_cols, 1, one, queue ); //magma_s_spmv( one, B_d, x, zero, b, queue ); // b = A x //magma_smfree(&x, queue ); CHECK( magma_svinit( &x, Magma_DEV, A.num_cols, 1, zero, queue )); info = magma_s_solver( B_d, b, &x, 
&zopts, queue ); if( info != 0 ){ printf("error: solver returned: %s (%d).\n", magma_strerror( info ), info ); } magma_ssolverinfo( &zopts.solver_par, &zopts.precond_par, queue ); magma_smfree(&B_d, queue ); magma_smfree(&B, queue ); magma_smfree(&A, queue ); magma_smfree(&x, queue ); magma_smfree(&b, queue ); i++; } cleanup: magma_smfree(&B_d, queue ); magma_smfree(&B, queue ); magma_smfree(&A, queue ); magma_smfree(&x, queue ); magma_smfree(&b, queue ); magma_ssolverinfo_free( &zopts.solver_par, &zopts.precond_par, queue ); magma_queue_destroy( queue ); TESTING_FINALIZE(); return info; }
/**
    Purpose
    -------
    Builds a triangular-solve preconditioner from a factor stored on disk.

    The matrix file "precondL.mtx" is read on the host and transferred to the
    device as precond->M. precond->L is a device copy of M and precond->U its
    transpose. The diagonals of L and U are extracted into precond->d and
    precond->d2, and two zero work vectors (precond->work1, precond->work2)
    with as many rows as the loaded matrix are allocated. Finally the
    cuSPARSE triangular-solve analysis is run twice on M: once as a
    non-transposed lower-triangular solve (precond->cuinfoL) and once as a
    transposed lower-triangular solve (precond->cuinfoU).

    Arguments
    ---------
    @param[in]     A        not referenced by this routine
    @param[in]     b        not referenced by this routine
    @param[in,out] precond  preconditioner structure that receives M, L, U,
                            d, d2, work1, work2, cuinfoL and cuinfoU
    @param[in]     queue    queue passed to all MAGMA calls

    @return 0 on success, otherwise the value stored in info by the failing
            CHECK / CHECK_CUSPARSE call.
*/
magma_int_t
magma_scustomicsetup(
    magma_s_matrix A,
    magma_s_matrix b,
    magma_s_preconditioner *precond,
    magma_queue_t queue )
{
    magma_int_t info = 0;

    cusparseHandle_t   handle     = NULL;
    cusparseMatDescr_t lowerDescr = NULL;
    cusparseMatDescr_t upperDescr = NULL;

    magma_s_matrix hostFactor = {Magma_CSR};
    char factorFile[255];

    // the factor is always read from this fixed file name
    snprintf( factorFile, sizeof(factorFile), "precondL.mtx" );
    CHECK( magma_s_csr_mtx( &hostFactor, factorFile, queue ));

    // device copy used by the cuSPARSE analysis below
    CHECK( magma_smtransfer( hostFactor, &precond->M,
                             Magma_CPU, Magma_DEV, queue ));

    // L is a copy of M, U is the transpose of L
    CHECK( magma_smtransfer( precond->M, &precond->L,
                             Magma_DEV, Magma_DEV, queue ));
    CHECK( magma_smtranspose( precond->L, &precond->U, queue ));

    // diagonal of L and first work vector
    CHECK( magma_sjacobisetup_diagscal( precond->L, &precond->d, queue ));
    CHECK( magma_svinit( &precond->work1, Magma_DEV,
                         hostFactor.num_rows, 1, MAGMA_S_ZERO, queue ));

    // diagonal of U and second work vector
    CHECK( magma_sjacobisetup_diagscal( precond->U, &precond->d2, queue ));
    CHECK( magma_svinit( &precond->work2, Magma_DEV,
                         hostFactor.num_rows, 1, MAGMA_S_ZERO, queue ));

    // cuSPARSE context
    CHECK_CUSPARSE( cusparseCreate( &handle ));

    // analysis for the solve with M itself (lower triangular, non-unit diagonal)
    CHECK_CUSPARSE( cusparseCreateMatDescr( &lowerDescr ));
    CHECK_CUSPARSE( cusparseSetMatType( lowerDescr,
                                        CUSPARSE_MATRIX_TYPE_TRIANGULAR ));
    CHECK_CUSPARSE( cusparseSetMatDiagType( lowerDescr,
                                            CUSPARSE_DIAG_TYPE_NON_UNIT ));
    CHECK_CUSPARSE( cusparseSetMatIndexBase( lowerDescr,
                                             CUSPARSE_INDEX_BASE_ZERO ));
    CHECK_CUSPARSE( cusparseSetMatFillMode( lowerDescr,
                                            CUSPARSE_FILL_MODE_LOWER ));
    CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &precond->cuinfoL ));
    CHECK_CUSPARSE( cusparseScsrsv_analysis(
                        handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                        precond->M.num_rows, precond->M.nnz, lowerDescr,
                        precond->M.val, precond->M.row, precond->M.col,
                        precond->cuinfoL ));

    // analysis for the solve with the transpose of M: same lower-triangular
    // storage, but the operation is the transposed one
    CHECK_CUSPARSE( cusparseCreateMatDescr( &upperDescr ));
    CHECK_CUSPARSE( cusparseSetMatType( upperDescr,
                                        CUSPARSE_MATRIX_TYPE_TRIANGULAR ));
    CHECK_CUSPARSE( cusparseSetMatDiagType( upperDescr,
                                            CUSPARSE_DIAG_TYPE_NON_UNIT ));
    CHECK_CUSPARSE( cusparseSetMatIndexBase( upperDescr,
                                             CUSPARSE_INDEX_BASE_ZERO ));
    CHECK_CUSPARSE( cusparseSetMatFillMode( upperDescr,
                                            CUSPARSE_FILL_MODE_LOWER ));
    CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &precond->cuinfoU ));
    CHECK_CUSPARSE( cusparseScsrsv_analysis(
                        handle, CUSPARSE_OPERATION_TRANSPOSE,
                        precond->M.num_rows, precond->M.nnz, upperDescr,
                        precond->M.val, precond->M.row, precond->M.col,
                        precond->cuinfoU ));

cleanup:
    // the context and descriptors are only needed during setup
    cusparseDestroy( handle );
    cusparseDestroyMatDescr( lowerDescr );
    cusparseDestroyMatDescr( upperDescr );
    handle     = NULL;
    lowerDescr = NULL;
    upperDescr = NULL;
    magma_smfree( &hostFactor, queue );

    return info;
}
/**
    Purpose
    -------
    Solves A * x = b with a preconditioned IDR(s)-type iteration (the solver
    is tagged Magma_PIDR), optionally with residual smoothing (enabled by the
    internal constant `smoothing`).

    Arguments
    ---------
    @param[in]     A            system matrix; must be square, otherwise
                                MAGMA_ERR_NOT_SUPPORTED is returned
    @param[in]     b            right-hand side (b.dval is read on the device)
    @param[in,out] x            on entry the initial guess, on exit the
                                computed approximation
    @param[in,out] solver_par   reads restart (used as the shadow space size
                                s), maxiter, atol, rtol and verbose; receives
                                solver, numiter, spmv_count, init_res,
                                iter_res, final_res, runtime, info and, when
                                verbose > 0, res_vec / timing entries.
                                restart and maxiter are overwritten with the
                                values actually used.
    @param[in]     precond_par  preconditioner handed to
                                magma_s_applyprecond_left / _right
    @param[in]     queue        queue all operations are issued on

    @return MAGMA_SUCCESS when a convergence test was met (or b is zero),
            MAGMA_DIVERGENCE on a zero M(k,k), a non-finite beta or a zero
            omega, MAGMA_SLOW_CONVERGENCE when the iteration limit was hit
            but the final residual is below the initial one,
            MAGMA_NOTCONVERGED otherwise, or the code stored in info by a
            failing CHECK'd call. The same value is stored in
            solver_par->info.
*/
extern "C" magma_int_t
magma_spidr(
    magma_s_matrix A, magma_s_matrix b, magma_s_matrix *x,
    magma_s_solver_par *solver_par,
    magma_s_preconditioner *precond_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;

    // prepare solver feedback
    solver_par->solver = Magma_PIDR;
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;
    solver_par->init_res = 0.0;
    solver_par->final_res = 0.0;
    solver_par->iter_res = 0.0;
    solver_par->runtime = 0.0;

    // constants
    const float c_zero = MAGMA_S_ZERO;
    const float c_one = MAGMA_S_ONE;
    const float c_n_one = MAGMA_S_NEG_ONE;

    // internal user parameters
    const magma_int_t smoothing = 1;   // 0 = disable, 1 = enable
    const float angle = 0.7;           // [0-1]

    // local variables
    magma_int_t iseed[4] = {0, 0, 0, 1};
    magma_int_t dof;
    magma_int_t s;
    magma_int_t distr;
    magma_int_t k, i, sk;
    magma_int_t innerflag;   // 0 = continue, 1 = breakdown, 2 = converged
    float residual;
    float nrm;
    float nrmb;
    float nrmr;
    float nrmt;
    float rho;
    float om;
    float tt;
    float tr;
    float gamma;
    float alpha;
    float mkk;
    float fk;

    // matrices and vectors
    magma_s_matrix dxs = {Magma_CSR};
    magma_s_matrix dr = {Magma_CSR}, drs = {Magma_CSR};
    magma_s_matrix dP = {Magma_CSR}, dP1 = {Magma_CSR};
    magma_s_matrix dG = {Magma_CSR};
    magma_s_matrix dU = {Magma_CSR};
    magma_s_matrix dM = {Magma_CSR};
    magma_s_matrix df = {Magma_CSR};
    magma_s_matrix dt = {Magma_CSR};
    magma_s_matrix dc = {Magma_CSR};
    magma_s_matrix dv = {Magma_CSR};
    magma_s_matrix dbeta = {Magma_CSR}, hbeta = {Magma_CSR};
    magma_s_matrix dlu = {Magma_CSR};

    // chronometry
    real_Double_t tempo1, tempo2;

    // initial s space
    // TODO: add option for 's' (shadow space number)
    // Hack: uses '--restart' option as the shadow space number.
    //       This is not a good idea because the default value of restart
    //       option is used to detect if the user provided a custom restart.
    //       This means that if the default restart value is changed then the
    //       code will think it was the user (unless the default value is
    //       also updated in the 'if' statement below).
    s = 1;
    if ( solver_par->restart != 50 ) {
        if ( solver_par->restart > A.num_cols ) {
            s = A.num_cols;
        } else {
            s = solver_par->restart;
        }
    }
    solver_par->restart = s;

    // set max iterations
    solver_par->maxiter = min( 2 * A.num_cols, solver_par->maxiter );

    // check if matrix A is square
    if ( A.num_rows != A.num_cols ) {
        info = MAGMA_ERR_NOT_SUPPORTED;
        goto cleanup;
    }

    // |b|; a zero right-hand side is answered with x = 0
    nrmb = magma_snrm2( b.num_rows, b.dval, 1, queue );
    if ( nrmb == 0.0 ) {
        magma_sscal( x->num_rows, MAGMA_S_ZERO, x->dval, 1, queue );
        info = MAGMA_SUCCESS;
        goto cleanup;
    }

    // r = b - A x
    CHECK( magma_svinit( &dr, Magma_DEV, b.num_rows, 1, c_zero, queue ));
    CHECK( magma_sresidualvec( A, b, *x, &dr, &nrmr, queue ));

    // |r|
    solver_par->init_res = nrmr;
    solver_par->final_res = solver_par->init_res;
    solver_par->iter_res = solver_par->init_res;
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = (real_Double_t)nrmr;
    }

    // check if initial guess is good enough
    if ( nrmr <= solver_par->atol ||
         nrmr/nrmb <= solver_par->rtol ) {
        info = MAGMA_SUCCESS;
        goto cleanup;
    }

    // P = randn(n, s)
    // P = ortho(P)
    //---------------------------------------
    // P = 0.0
    CHECK( magma_svinit( &dP, Magma_CPU, A.num_cols, s, c_zero, queue ));

    // P = randn(n, s)
    distr = 3;        // 1 = unif (0,1), 2 = unif (-1,1), 3 = normal (0,1)
    dof = dP.num_rows * dP.num_cols;
    lapackf77_slarnv( &distr, iseed, &dof, dP.val );

    // transfer P to device
    CHECK( magma_smtransfer( dP, &dP1, Magma_CPU, Magma_DEV, queue ));
    magma_smfree( &dP, queue );

    // P = ortho(P1)
    if ( dP1.num_cols > 1 ) {
        // P = magma_sqr(P1), QR factorization
        CHECK( magma_sqr( dP1.num_rows, dP1.num_cols, dP1, dP1.ld, &dP, NULL, queue ));
    } else {
        // P = P1 / |P1|
        nrm = magma_snrm2( dof, dP1.dval, 1, queue );
        nrm = 1.0 / nrm;
        magma_sscal( dof, nrm, dP1.dval, 1, queue );
        CHECK( magma_smtransfer( dP1, &dP, Magma_DEV, Magma_DEV, queue ));
    }
    magma_smfree( &dP1, queue );
    //---------------------------------------

    // allocate memory for the scalar products
    CHECK( magma_svinit( &hbeta, Magma_CPU, s, 1, c_zero, queue ));
    CHECK( magma_svinit( &dbeta, Magma_DEV, s, 1, c_zero, queue ));

    // smoothing enabled
    if ( smoothing > 0 ) {
        // set smoothing solution vector
        CHECK( magma_smtransfer( *x, &dxs, Magma_DEV, Magma_DEV, queue ));

        // set smoothing residual vector
        CHECK( magma_smtransfer( dr, &drs, Magma_DEV, Magma_DEV, queue ));
    }

    // G(n,s) = 0
    CHECK( magma_svinit( &dG, Magma_DEV, A.num_cols, s, c_zero, queue ));

    // U(n,s) = 0
    CHECK( magma_svinit( &dU, Magma_DEV, A.num_cols, s, c_zero, queue ));

    // M(s,s) = I
    CHECK( magma_svinit( &dM, Magma_DEV, s, s, c_zero, queue ));
    magmablas_slaset( MagmaFull, s, s, c_zero, c_one, dM.dval, s, queue );

    // f = 0
    CHECK( magma_svinit( &df, Magma_DEV, dP.num_cols, 1, c_zero, queue ));

    // t = 0
    CHECK( magma_svinit( &dt, Magma_DEV, dr.num_rows, 1, c_zero, queue ));

    // c = 0
    CHECK( magma_svinit( &dc, Magma_DEV, dM.num_cols, 1, c_zero, queue ));

    // v = 0
    CHECK( magma_svinit( &dv, Magma_DEV, dr.num_rows, 1, c_zero, queue ));

    // lu = 0
    CHECK( magma_svinit( &dlu, Magma_DEV, A.num_rows, 1, c_zero, queue ));

    //--------------START TIME---------------
    // chronometry
    tempo1 = magma_sync_wtime( queue );
    if ( solver_par->verbose > 0 ) {
        solver_par->timing[0] = 0.0;
    }

    om = MAGMA_S_ONE;
    innerflag = 0;

    // start iteration
    do
    {
        solver_par->numiter++;

        // new RHS for small systems
        // f = P' r
        magmablas_sgemv( MagmaConjTrans, dP.num_rows, dP.num_cols, c_one,
                         dP.dval, dP.ld, dr.dval, 1, c_zero, df.dval, 1, queue );

        // shadow space loop
        for ( k = 0; k < s; ++k ) {
            sk = s - k;

            // f(k:s) = M(k:s,k:s) c(k:s)
            magma_scopyvector( sk, &df.dval[k], 1, &dc.dval[k], 1, queue );
            magma_strsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, sk,
                         &dM.dval[k*dM.ld+k], dM.ld, &dc.dval[k], 1, queue );

            // v = r - G(:,k:s) c(k:s)
            magma_scopyvector( dr.num_rows, dr.dval, 1, dv.dval, 1, queue );
            magmablas_sgemv( MagmaNoTrans, dG.num_rows, sk, c_n_one,
                             &dG.dval[k*dG.ld], dG.ld, &dc.dval[k], 1,
                             c_one, dv.dval, 1, queue );

            // preconditioning operation
            // v = L \ v;
            // v = U \ v;
            CHECK( magma_s_applyprecond_left( MagmaNoTrans, A, dv, &dlu, precond_par, queue ));
            CHECK( magma_s_applyprecond_right( MagmaNoTrans, A, dlu, &dv, precond_par, queue ));

            // U(:,k) = om * v + U(:,k:s) c(k:s)
            magmablas_sgemv( MagmaNoTrans, dU.num_rows, sk, c_one,
                             &dU.dval[k*dU.ld], dU.ld, &dc.dval[k], 1,
                             om, dv.dval, 1, queue );
            magma_scopyvector( dU.num_rows, dv.dval, 1, &dU.dval[k*dU.ld], 1, queue );

            // G(:,k) = A U(:,k)
            // The product must not be written back into dv: with the same
            // vector as SpMV input and output, rows computed later could
            // read entries that were already overwritten. dt holds no live
            // data at this point, so it serves as the output buffer.
            CHECK( magma_s_spmv( c_one, A, dv, c_zero, dt, queue ));
            solver_par->spmv_count++;
            magma_scopyvector( dG.num_rows, dt.dval, 1, &dG.dval[k*dG.ld], 1, queue );

            // bi-orthogonalize the new basis vectors
            for ( i = 0; i < k; ++i ) {
                // alpha = P(:,i)' G(:,k)
                alpha = magma_sdot( dP.num_rows, &dP.dval[i*dP.ld], 1,
                                    &dG.dval[k*dG.ld], 1, queue );

                // alpha = alpha / M(i,i)
                magma_sgetvector( 1, &dM.dval[i*dM.ld+i], 1, &mkk, 1, queue );
                alpha = alpha / mkk;

                // G(:,k) = G(:,k) - alpha * G(:,i)
                magma_saxpy( dG.num_rows, -alpha, &dG.dval[i*dG.ld], 1,
                             &dG.dval[k*dG.ld], 1, queue );

                // U(:,k) = U(:,k) - alpha * U(:,i)
                magma_saxpy( dU.num_rows, -alpha, &dU.dval[i*dU.ld], 1,
                             &dU.dval[k*dU.ld], 1, queue );
            }

            // new column of M = P'G, first k-1 entries are zero
            // M(k:s,k) = P(:,k:s)' G(:,k)
            magmablas_sgemv( MagmaConjTrans, dP.num_rows, sk, c_one,
                             &dP.dval[k*dP.ld], dP.ld, &dG.dval[k*dG.ld], 1,
                             c_zero, &dM.dval[k*dM.ld+k], 1, queue );

            // check M(k,k) == 0
            magma_sgetvector( 1, &dM.dval[k*dM.ld+k], 1, &mkk, 1, queue );
            if ( MAGMA_S_EQUAL(mkk, MAGMA_S_ZERO) ) {
                innerflag = 1;
                info = MAGMA_DIVERGENCE;
                break;
            }

            // beta = f(k) / M(k,k)
            magma_sgetvector( 1, &df.dval[k], 1, &fk, 1, queue );
            hbeta.val[k] = fk / mkk;

            // check for nan
            if ( magma_s_isnan( hbeta.val[k] ) || magma_s_isinf( hbeta.val[k] )) {
                innerflag = 1;
                info = MAGMA_DIVERGENCE;
                break;
            }

            // r = r - beta * G(:,k)
            magma_saxpy( dr.num_rows, -hbeta.val[k], &dG.dval[k*dG.ld], 1,
                         dr.dval, 1, queue );

            // smoothing disabled
            if ( smoothing <= 0 ) {
                // |r|
                nrmr = magma_snrm2( dr.num_rows, dr.dval, 1, queue );

            // smoothing enabled
            } else {
                // x = x + beta * U(:,k)
                magma_saxpy( x->num_rows, hbeta.val[k], &dU.dval[k*dU.ld], 1,
                             x->dval, 1, queue );

                // smoothing operation
                //---------------------------------------
                // t = rs - r
                magma_scopyvector( drs.num_rows, drs.dval, 1, dt.dval, 1, queue );
                magma_saxpy( dt.num_rows, c_n_one, dr.dval, 1, dt.dval, 1, queue );

                // t't
                // t'rs
                tt = magma_sdot( dt.num_rows, dt.dval, 1, dt.dval, 1, queue );
                tr = magma_sdot( dt.num_rows, dt.dval, 1, drs.dval, 1, queue );

                // gamma = (t' * rs) / (t' * t)
                gamma = tr / tt;

                // rs = rs - gamma * (rs - r)
                magma_saxpy( drs.num_rows, -gamma, dt.dval, 1, drs.dval, 1, queue );

                // xs = xs - gamma * (xs - x)
                magma_scopyvector( dxs.num_rows, dxs.dval, 1, dt.dval, 1, queue );
                magma_saxpy( dt.num_rows, c_n_one, x->dval, 1, dt.dval, 1, queue );
                magma_saxpy( dxs.num_rows, -gamma, dt.dval, 1, dxs.dval, 1, queue );

                // |rs|
                nrmr = magma_snrm2( drs.num_rows, drs.dval, 1, queue );
                //---------------------------------------
            }

            // store current timing and residual
            if ( solver_par->verbose > 0 ) {
                tempo2 = magma_sync_wtime( queue );
                if ( (solver_par->numiter) % solver_par->verbose == 0 ) {
                    solver_par->res_vec[(solver_par->numiter) / solver_par->verbose]
                            = (real_Double_t)nrmr;
                    solver_par->timing[(solver_par->numiter) / solver_par->verbose]
                            = (real_Double_t)tempo2 - tempo1;
                }
            }

            // check convergence
            if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) {
                s = k + 1; // for the x-update outside the loop
                innerflag = 2;
                info = MAGMA_SUCCESS;
                break;
            }

            // non-last s iteration
            if ( (k + 1) < s ) {
                // f(k+1:s) = f(k+1:s) - beta * M(k+1:s,k)
                magma_saxpy( sk-1, -hbeta.val[k], &dM.dval[k*dM.ld+(k+1)], 1,
                             &df.dval[k+1], 1, queue );
            }
        }

        // smoothing disabled
        if ( smoothing <= 0 && innerflag != 1 ) {
            // update solution approximation x
            // x = x + U(:,1:s) * beta(1:s)
            magma_ssetvector( s, hbeta.val, 1, dbeta.dval, 1, queue );
            magmablas_sgemv( MagmaNoTrans, dU.num_rows, s, c_one,
                             dU.dval, dU.ld, dbeta.dval, 1,
                             c_one, x->dval, 1, queue );
        }

        // check convergence or iteration limit or invalid result of inner loop
        if ( innerflag > 0 ) {
            break;
        }

        // v = r
        magma_scopyvector( dr.num_rows, dr.dval, 1, dv.dval, 1, queue );

        // preconditioning operation
        // v = L \ v;
        // v = U \ v;
        CHECK( magma_s_applyprecond_left( MagmaNoTrans, A, dv, &dlu, precond_par, queue ));
        CHECK( magma_s_applyprecond_right( MagmaNoTrans, A, dlu, &dv, precond_par, queue ));

        // t = A v
        CHECK( magma_s_spmv( c_one, A, dv, c_zero, dt, queue ));
        solver_par->spmv_count++;

        // computation of a new omega
        //---------------------------------------
        // |t|
        nrmt = magma_snrm2( dt.num_rows, dt.dval, 1, queue );

        // t'r
        tr = magma_sdot( dt.num_rows, dt.dval, 1, dr.dval, 1, queue );

        // rho = abs(t' * r) / (|t| * |r|))
        rho = MAGMA_D_ABS( MAGMA_S_REAL(tr) / (nrmt * nrmr) );

        // om = (t' * r) / (|t| * |t|)
        om = tr / (nrmt * nrmt);
        if ( rho < angle ) {
            om = (om * angle) / rho;
        }
        //---------------------------------------
        if ( MAGMA_S_EQUAL(om, MAGMA_S_ZERO) ) {
            info = MAGMA_DIVERGENCE;
            break;
        }

        // update approximation vector
        // x = x + om * v
        magma_saxpy( x->num_rows, om, dv.dval, 1, x->dval, 1, queue );

        // update residual vector
        // r = r - om * t
        magma_saxpy( dr.num_rows, -om, dt.dval, 1, dr.dval, 1, queue );

        // smoothing disabled
        if ( smoothing <= 0 ) {
            // residual norm
            nrmr = magma_snrm2( b.num_rows, dr.dval, 1, queue );

        // smoothing enabled
        } else {
            // smoothing operation
            //---------------------------------------
            // t = rs - r
            magma_scopyvector( drs.num_rows, drs.dval, 1, dt.dval, 1, queue );
            magma_saxpy( dt.num_rows, c_n_one, dr.dval, 1, dt.dval, 1, queue );

            // t't
            // t'rs
            tt = magma_sdot( dt.num_rows, dt.dval, 1, dt.dval, 1, queue );
            tr = magma_sdot( dt.num_rows, dt.dval, 1, drs.dval, 1, queue );

            // gamma = (t' * rs) / (|t| * |t|)
            gamma = tr / tt;

            // rs = rs - gamma * (rs - r)
            magma_saxpy( drs.num_rows, -gamma, dt.dval, 1, drs.dval, 1, queue );

            // xs = xs - gamma * (xs - x)
            magma_scopyvector( dxs.num_rows, dxs.dval, 1, dt.dval, 1, queue );
            magma_saxpy( dt.num_rows, c_n_one, x->dval, 1, dt.dval, 1, queue );
            magma_saxpy( dxs.num_rows, -gamma, dt.dval, 1, dxs.dval, 1, queue );

            // |rs|
            nrmr = magma_snrm2( b.num_rows, drs.dval, 1, queue );
            //---------------------------------------
        }

        // store current timing and residual
        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter) % solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter) / solver_par->verbose]
                        = (real_Double_t)nrmr;
                solver_par->timing[(solver_par->numiter) / solver_par->verbose]
                        = (real_Double_t)tempo2 - tempo1;
            }
        }

        // check convergence
        if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) {
            info = MAGMA_SUCCESS;
            break;
        }
    }
    while ( solver_par->numiter + 1 <= solver_par->maxiter );

    // smoothing enabled: the smoothed iterates are the ones reported
    if ( smoothing > 0 ) {
        // x = xs
        magma_scopyvector( x->num_rows, dxs.dval, 1, x->dval, 1, queue );

        // r = rs
        magma_scopyvector( dr.num_rows, drs.dval, 1, dr.dval, 1, queue );
    }

    // get last iteration timing
    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t)tempo2 - tempo1;
    //--------------STOP TIME----------------

    // get final stats: iter_res is the recurrence residual, final_res the
    // residual recomputed from the returned x
    solver_par->iter_res = nrmr;
    CHECK( magma_sresidualvec( A, b, *x, &dr, &residual, queue ));
    solver_par->final_res = residual;

    // set solver conclusion
    if ( info != MAGMA_SUCCESS && info != MAGMA_DIVERGENCE ) {
        if ( solver_par->init_res > solver_par->final_res ) {
            info = MAGMA_SLOW_CONVERGENCE;
        }
    }

cleanup:
    // free resources
    // smoothing enabled
    if ( smoothing > 0 ) {
        magma_smfree( &dxs, queue );
        magma_smfree( &drs, queue );
    }
    magma_smfree( &dr, queue );
    magma_smfree( &dP, queue );
    magma_smfree( &dP1, queue );
    magma_smfree( &dG, queue );
    magma_smfree( &dU, queue );
    magma_smfree( &dM, queue );
    magma_smfree( &df, queue );
    magma_smfree( &dt, queue );
    magma_smfree( &dc, queue );
    magma_smfree( &dv, queue );
    magma_smfree( &dlu, queue );
    magma_smfree( &dbeta, queue );
    magma_smfree( &hbeta, queue );

    solver_par->info = info;
    return info;
    /* magma_spidr */
}
/**
    Purpose
    -------
    Iteratively solves A * x = b with a BiCGSTAB recurrence whose vector
    updates are carried out by the magma_sbicgstab_1/2/3 kernels.

    Arguments
    ---------
    @param[in]      A           magma_s_matrix
                                system matrix, applied through magma_s_spmv

    @param[in]      b           magma_s_matrix
                                right-hand side

    @param[in,out]  x           magma_s_matrix*
                                start vector on entry, last iterate on exit

    @param[in,out]  solver_par  magma_s_solver_par*
                                reads rtol, atol, maxiter and verbose; receives
                                solver, numiter, spmv_count, init_res, iter_res,
                                final_res, runtime, info and (when verbose > 0)
                                the res_vec / timing histories

    @param[in]      queue       magma_queue_t
                                queue all device operations are issued on

    @return  MAGMA_SUCCESS, MAGMA_SLOW_CONVERGENCE or MAGMA_DIVERGENCE as
             classified below, or whatever CHECK stored in info before
             jumping to cleanup.
*/
extern "C" magma_int_t
magma_sbicgstab_merge(
    magma_s_matrix A, magma_s_matrix b, magma_s_matrix *x,
    magma_s_solver_par *solver_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;

    // prepare solver feedback
    solver_par->solver = Magma_BICGSTAB;
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;

    // some useful variables
    float c_zero = MAGMA_S_ZERO;
    float c_one  = MAGMA_S_ONE;

    magma_int_t dofs = A.num_rows * b.num_cols;

    // workspace: only the six vectors the iteration actually touches
    magma_s_matrix r={Magma_CSR}, rr={Magma_CSR}, p={Magma_CSR},
                   v={Magma_CSR}, s={Magma_CSR}, t={Magma_CSR};
    CHECK( magma_svinit( &r,  Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &rr, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &p,  Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &v,  Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &s,  Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &t,  Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));

    // solver variables
    // (declared without initializers: CHECK above may already jump to cleanup)
    float alpha, beta, omega, rho_old, rho_new;
    float betanom, nom0, r0, res, nomb;
    res = 0;

    // solver setup
    CHECK( magma_sresidualvec( A, b, *x, &r, &nom0, queue));
    magma_scopy( dofs, r.dval, 1, rr.dval, 1, queue );                  // rr = r
    betanom = nom0;
    rho_new = magma_sdot( dofs, r.dval, 1, r.dval, 1, queue );          // rho = <rr,r>
    rho_old = omega = alpha = MAGMA_S_MAKE( 1.0, 0. );
    solver_par->init_res = nom0;

    CHECK( magma_s_spmv( c_one, A, r, c_zero, v, queue ));              // v = A r

    nomb = magma_snrm2( dofs, b.dval, 1, queue );
    if ( nomb == 0.0 ){
        nomb=1.0;       // avoid dividing by zero in the relative criterion
    }
    if ( (r0 = nomb * solver_par->rtol) < ATOLERANCE ){
        r0 = ATOLERANCE;
    }
    solver_par->final_res = solver_par->init_res;
    solver_par->iter_res = solver_par->init_res;
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = nom0;
        solver_par->timing[0] = 0.0;
    }
    // NOTE(review): this early exit compares ||b|| (nomb), not the initial
    // residual nom0, against r0 -- confirm that this is intended.
    if ( nomb < r0 ) {
        info = MAGMA_SUCCESS;
        goto cleanup;
    }

    //Chronometry
    real_Double_t tempo1, tempo2;
    tempo1 = magma_sync_wtime( queue );

    solver_par->numiter = 0;
    solver_par->spmv_count = 0;
    // start iteration
    do
    {
        solver_par->numiter++;
        rho_old = rho_new;                                              // rho_old = rho
        rho_new = magma_sdot( dofs, rr.dval, 1, r.dval, 1, queue );     // rho = <rr,r>
        beta = rho_new/rho_old * alpha/omega;       // beta = rho/rho_old * alpha/omega
        if( magma_s_isnan_inf( beta ) ){
            info = MAGMA_DIVERGENCE;
            break;
        }

        // p = r + beta * ( p - omega * v )
        magma_sbicgstab_1(
            r.num_rows,
            r.num_cols,
            beta,
            omega,
            r.dval,
            v.dval,
            p.dval,
            queue );

        CHECK( magma_s_spmv( c_one, A, p, c_zero, v, queue ));          // v = A p
        solver_par->spmv_count++;

        alpha = rho_new / magma_sdot( dofs, rr.dval, 1, v.dval, 1, queue );
        if( magma_s_isnan_inf( alpha ) ){
            info = MAGMA_DIVERGENCE;
            break;
        }

        // s = r - alpha v
        magma_sbicgstab_2(
            r.num_rows,
            r.num_cols,
            alpha,
            r.dval,
            v.dval,
            s.dval,
            queue );

        CHECK( magma_s_spmv( c_one, A, s, c_zero, t, queue ));          // t = A s
        solver_par->spmv_count++;

        // omega = <s,t> / <t,t>
        omega = magma_sdot( dofs, t.dval, 1, s.dval, 1, queue )
                   / magma_sdot( dofs, t.dval, 1, t.dval, 1, queue );

        // x = x + alpha * p + omega * s
        // r = s - omega * t
        magma_sbicgstab_3(
            r.num_rows,
            r.num_cols,
            alpha,
            omega,
            p.dval,
            s.dval,
            t.dval,
            x->dval,
            r.dval,
            queue );

        res = betanom = magma_snrm2( dofs, r.dval, 1, queue );

        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        if ( res/nomb <= solver_par->rtol || res <= solver_par->atol ){
            // Record the convergence: without this the post-loop test on
            // info == MAGMA_SUCCESS could never be satisfied.
            info = MAGMA_SUCCESS;
            break;
        }
    }
    while ( solver_par->numiter+1 <= solver_par->maxiter );

    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    float residual;
    CHECK( magma_sresidualvec( A, b, *x, &r, &residual, queue));
    solver_par->iter_res = res;
    solver_par->final_res = residual;

    if ( solver_par->numiter < solver_par->maxiter && info == MAGMA_SUCCESS ) {
        // stopping criterion met before the iteration limit: keep MAGMA_SUCCESS
    } else {
        // Iteration limit reached or breakdown detected: log the last
        // residual, then classify the run by comparing residuals.
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        if ( solver_par->init_res > solver_par->final_res ) {
            info = MAGMA_SLOW_CONVERGENCE;
            if( solver_par->iter_res < solver_par->rtol*solver_par->init_res ||
                solver_par->iter_res < solver_par->atol ) {
                info = MAGMA_SUCCESS;
            }
        } else {
            info = MAGMA_DIVERGENCE;
        }
    }

cleanup:
    magma_smfree(&r, queue );
    magma_smfree(&rr, queue );
    magma_smfree(&p, queue );
    magma_smfree(&v, queue );
    magma_smfree(&s, queue );
    magma_smfree(&t, queue );

    solver_par->info = info;
    return info;
}   /* magma_sbicgstab_merge */
/* //////////////////////////////////////////////////////////////////////////// -- testing sparse matrix vector product */ int main( int argc, char** argv ) { magma_int_t info = 0; TESTING_CHECK( magma_init() ); magma_print_environment(); magma_queue_t queue=NULL; magma_queue_create( 0, &queue ); magma_s_matrix hA={Magma_CSR}, hA_SELLP={Magma_CSR}, dA={Magma_CSR}, dA_SELLP={Magma_CSR}; magma_s_matrix hx={Magma_CSR}, hy={Magma_CSR}, dx={Magma_CSR}, dy={Magma_CSR}, hrefvec={Magma_CSR}, hcheck={Magma_CSR}; hA_SELLP.blocksize = 8; hA_SELLP.alignment = 8; real_Double_t start, end, res; #ifdef MAGMA_WITH_MKL magma_int_t *pntre=NULL; #endif cusparseHandle_t cusparseHandle = NULL; cusparseMatDescr_t descr = NULL; float c_one = MAGMA_S_MAKE(1.0, 0.0); float c_zero = MAGMA_S_MAKE(0.0, 0.0); float accuracy = 1e-10; #define PRECISION_s #if defined(PRECISION_c) accuracy = 1e-4; #endif #if defined(PRECISION_s) accuracy = 1e-4; #endif magma_int_t i, j; for( i = 1; i < argc; ++i ) { if ( strcmp("--blocksize", argv[i]) == 0 ) { hA_SELLP.blocksize = atoi( argv[++i] ); } else if ( strcmp("--alignment", argv[i]) == 0 ) { hA_SELLP.alignment = atoi( argv[++i] ); } else break; } printf("\n# usage: ./run_sspmm" " [ --blocksize %lld --alignment %lld (for SELLP) ] matrices\n\n", (long long) hA_SELLP.blocksize, (long long) hA_SELLP.alignment ); while( i < argc ) { if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) { // Laplace test i++; magma_int_t laplace_size = atoi( argv[i] ); TESTING_CHECK( magma_sm_5stencil( laplace_size, &hA, queue )); } else { // file-matrix test TESTING_CHECK( magma_s_csr_mtx( &hA, argv[i], queue )); } printf("%% matrix info: %lld-by-%lld with %lld nonzeros\n", (long long) hA.num_rows, (long long) hA.num_cols, (long long) hA.nnz ); real_Double_t FLOPS = 2.0*hA.nnz/1e9; // m - number of rows for the sparse matrix // n - number of vectors to be multiplied in the SpMM product magma_int_t m, n; m = hA.num_rows; n = 48; // init CPU vectors TESTING_CHECK( magma_svinit( 
&hx, Magma_CPU, m, n, c_one, queue )); TESTING_CHECK( magma_svinit( &hy, Magma_CPU, m, n, c_zero, queue )); // init DEV vectors TESTING_CHECK( magma_svinit( &dx, Magma_DEV, m, n, c_one, queue )); TESTING_CHECK( magma_svinit( &dy, Magma_DEV, m, n, c_zero, queue )); // calling MKL with CSR #ifdef MAGMA_WITH_MKL TESTING_CHECK( magma_imalloc_cpu( &pntre, m + 1 ) ); pntre[0] = 0; for (j=0; j < m; j++ ) { pntre[j] = hA.row[j+1]; } MKL_INT num_rows = hA.num_rows; MKL_INT num_cols = hA.num_cols; MKL_INT nnz = hA.nnz; MKL_INT num_vecs = n; MKL_INT *col; TESTING_CHECK( magma_malloc_cpu( (void**) &col, nnz * sizeof(MKL_INT) )); for( magma_int_t t=0; t < hA.nnz; ++t ) { col[ t ] = hA.col[ t ]; } MKL_INT *row; TESTING_CHECK( magma_malloc_cpu( (void**) &row, num_rows * sizeof(MKL_INT) )); for( magma_int_t t=0; t < hA.num_rows; ++t ) { row[ t ] = hA.col[ t ]; } // === Call MKL with consecutive SpMVs, using mkl_scsrmv === // warmp up mkl_scsrmv( "N", &num_rows, &num_cols, MKL_ADDR(&c_one), "GFNC", MKL_ADDR(hA.val), col, row, pntre, MKL_ADDR(hx.val), MKL_ADDR(&c_zero), MKL_ADDR(hy.val) ); start = magma_wtime(); for (j=0; j < 10; j++ ) { mkl_scsrmv( "N", &num_rows, &num_cols, MKL_ADDR(&c_one), "GFNC", MKL_ADDR(hA.val), col, row, pntre, MKL_ADDR(hx.val), MKL_ADDR(&c_zero), MKL_ADDR(hy.val) ); } end = magma_wtime(); printf( "\n > MKL SpMVs : %.2e seconds %.2e GFLOP/s (CSR).\n", (end-start)/10, FLOPS*10/(end-start) ); // === Call MKL with blocked SpMVs, using mkl_scsrmm === char transa = 'n'; MKL_INT ldb = n, ldc=n; char matdescra[6] = {'g', 'l', 'n', 'c', 'x', 'x'}; // warm up mkl_scsrmm( &transa, &num_rows, &num_vecs, &num_cols, MKL_ADDR(&c_one), matdescra, MKL_ADDR(hA.val), col, row, pntre, MKL_ADDR(hx.val), &ldb, MKL_ADDR(&c_zero), MKL_ADDR(hy.val), &ldc ); start = magma_wtime(); for (j=0; j < 10; j++ ) { mkl_scsrmm( &transa, &num_rows, &num_vecs, &num_cols, MKL_ADDR(&c_one), matdescra, MKL_ADDR(hA.val), col, row, pntre, MKL_ADDR(hx.val), &ldb, MKL_ADDR(&c_zero), MKL_ADDR(hy.val), 
&ldc ); } end = magma_wtime(); printf( "\n > MKL SpMM : %.2e seconds %.2e GFLOP/s (CSR).\n", (end-start)/10, FLOPS*10.*n/(end-start) ); magma_free_cpu( row ); magma_free_cpu( col ); row = NULL; col = NULL; #endif // MAGMA_WITH_MKL // copy matrix to GPU TESTING_CHECK( magma_smtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue )); // SpMV on GPU (CSR) start = magma_sync_wtime( queue ); for (j=0; j < 10; j++) { TESTING_CHECK( magma_s_spmv( c_one, dA, dx, c_zero, dy, queue )); } end = magma_sync_wtime( queue ); printf( " > MAGMA: %.2e seconds %.2e GFLOP/s (standard CSR).\n", (end-start)/10, FLOPS*10.*n/(end-start) ); TESTING_CHECK( magma_smtransfer( dy, &hrefvec , Magma_DEV, Magma_CPU, queue )); magma_smfree(&dA, queue ); // convert to SELLP and copy to GPU TESTING_CHECK( magma_smconvert( hA, &hA_SELLP, Magma_CSR, Magma_SELLP, queue )); TESTING_CHECK( magma_smtransfer( hA_SELLP, &dA_SELLP, Magma_CPU, Magma_DEV, queue )); magma_smfree(&hA_SELLP, queue ); magma_smfree( &dy, queue ); TESTING_CHECK( magma_svinit( &dy, Magma_DEV, dx.num_rows, dx.num_cols, c_zero, queue )); // SpMV on GPU (SELLP) start = magma_sync_wtime( queue ); for (j=0; j < 10; j++) { TESTING_CHECK( magma_s_spmv( c_one, dA_SELLP, dx, c_zero, dy, queue )); } end = magma_sync_wtime( queue ); printf( " > MAGMA: %.2e seconds %.2e GFLOP/s (SELLP).\n", (end-start)/10, FLOPS*10.*n/(end-start) ); TESTING_CHECK( magma_smtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue )); res = 0.0; for(magma_int_t k=0; k < hA.num_rows; k++ ) { res=res + MAGMA_S_REAL(hcheck.val[k]) - MAGMA_S_REAL(hrefvec.val[k]); } printf("%% |x-y|_F = %8.2e\n", res); if ( res < accuracy ) printf("%% tester spmm SELL-P: ok\n"); else printf("%% tester spmm SELL-P: failed\n"); magma_smfree( &hcheck, queue ); magma_smfree(&dA_SELLP, queue ); // SpMV on GPU (CUSPARSE - CSR) // CUSPARSE context // magma_smfree( &dy, queue ); TESTING_CHECK( magma_svinit( &dy, Magma_DEV, dx.num_rows, dx.num_cols, c_zero, queue )); //#ifdef PRECISION_d start = 
magma_sync_wtime( queue ); TESTING_CHECK( cusparseCreate( &cusparseHandle )); TESTING_CHECK( cusparseSetStream( cusparseHandle, magma_queue_get_cuda_stream(queue) )); TESTING_CHECK( cusparseCreateMatDescr( &descr )); TESTING_CHECK( cusparseSetMatType( descr, CUSPARSE_MATRIX_TYPE_GENERAL )); TESTING_CHECK( cusparseSetMatIndexBase( descr, CUSPARSE_INDEX_BASE_ZERO )); float alpha = c_one; float beta = c_zero; // copy matrix to GPU TESTING_CHECK( magma_smtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue) ); for (j=0; j < 10; j++) { cusparseScsrmm(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, dA.num_rows, n, dA.num_cols, dA.nnz, &alpha, descr, dA.dval, dA.drow, dA.dcol, dx.dval, dA.num_cols, &beta, dy.dval, dA.num_cols); } end = magma_sync_wtime( queue ); printf( " > CUSPARSE: %.2e seconds %.2e GFLOP/s (CSR).\n", (end-start)/10, FLOPS*10*n/(end-start) ); TESTING_CHECK( magma_smtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue )); res = 0.0; for(magma_int_t k=0; k < hA.num_rows; k++ ) { res = res + MAGMA_S_REAL(hcheck.val[k]) - MAGMA_S_REAL(hrefvec.val[k]); } printf("%% |x-y|_F = %8.2e\n", res); if ( res < accuracy ) printf("%% tester spmm cuSPARSE: ok\n"); else printf("%% tester spmm cuSPARSE: failed\n"); magma_smfree( &hcheck, queue ); cusparseDestroyMatDescr( descr ); cusparseDestroy( cusparseHandle ); descr = NULL; cusparseHandle = NULL; //#endif printf("\n\n"); // free CPU memory magma_smfree( &hA, queue ); magma_smfree( &hx, queue ); magma_smfree( &hy, queue ); magma_smfree( &hrefvec, queue ); // free GPU memory magma_smfree( &dx, queue ); magma_smfree( &dy, queue ); magma_smfree( &dA, queue); #ifdef MAGMA_WITH_MKL magma_free_cpu( pntre ); #endif i++; } magma_queue_destroy( queue ); TESTING_CHECK( magma_finalize() ); return info; }
/**
    Purpose
    -------
    Conjugate-gradient iteration for A * x = b in which the scalar
    <r,r> is recomputed from the residual vector in every step.

    Arguments
    ---------
    @param[in]      A           magma_s_matrix
                                system matrix, applied through magma_s_spmv

    @param[in]      b           magma_s_matrix
                                right-hand side

    @param[in,out]  x           magma_s_matrix*
                                start vector on entry, last iterate on exit

    @param[in,out]  solver_par  magma_s_solver_par*
                                reads rtol, atol, maxiter and verbose; receives
                                solver, numiter, spmv_count, init_res, iter_res,
                                final_res, runtime, info and (when verbose > 0)
                                the res_vec / timing histories

    @param[in]      queue       magma_queue_t
                                queue all device operations are issued on

    @return  MAGMA_SUCCESS, MAGMA_NONSPD, MAGMA_SLOW_CONVERGENCE or
             MAGMA_DIVERGENCE as classified below, or whatever CHECK stored
             in info before jumping to cleanup.
*/
extern "C" magma_int_t
magma_scg_res(
    magma_s_matrix A, magma_s_matrix b, magma_s_matrix *x,
    magma_s_solver_par *solver_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;

    // constants and problem size
    const float f_one  = MAGMA_S_ONE;
    const float f_zero = MAGMA_S_ZERO;
    const magma_int_t len = A.num_rows * b.num_cols;

    // iteration scalars; everything is declared up front so that no
    // initialization is skipped when CHECK jumps to cleanup
    float step, dir_scale;
    float res0, tol0, res_nrm, rhs_nrm, true_res;
    float curv, rr_new, rr_old = MAGMA_S_MAKE(1.0,0.0);
    real_Double_t t_begin, t_now;

    // device vectors: residual, search direction, A * direction
    magma_s_matrix resid={Magma_CSR}, dir={Magma_CSR}, Adir={Magma_CSR};

    // reset the solver feedback
    solver_par->solver     = Magma_CG;
    solver_par->numiter    = 0;
    solver_par->spmv_count = 0;

    CHECK( magma_svinit( &resid, Magma_DEV, A.num_rows, b.num_cols, f_zero, queue ));
    CHECK( magma_svinit( &dir,   Magma_DEV, A.num_rows, b.num_cols, f_zero, queue ));
    CHECK( magma_svinit( &Adir,  Magma_DEV, A.num_rows, b.num_cols, f_zero, queue ));

    // setup: initial residual, first direction and its curvature term
    CHECK( magma_sresidualvec( A, b, *x, &resid, &res0, queue));
    magma_scopy( len, resid.dval, 1, dir.dval, 1, queue );
    CHECK( magma_s_spmv( f_one, A, dir, f_zero, Adir, queue ));
    solver_par->spmv_count++;
    curv = magma_sdot( len, dir.dval, 1, Adir.dval, 1, queue );
    solver_par->init_res = res0;

    rhs_nrm = magma_snrm2( len, b.dval, 1, queue );
    if ( rhs_nrm == 0.0 ) {
        rhs_nrm = 1.0;
    }
    tol0 = rhs_nrm * solver_par->rtol;
    if ( tol0 < ATOLERANCE ) {
        tol0 = ATOLERANCE;
    }

    solver_par->final_res = solver_par->init_res;
    solver_par->iter_res  = solver_par->init_res;
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = (real_Double_t)res0;
        solver_par->timing[0]  = 0.0;
    }

    // early exits
    if ( rhs_nrm < tol0 ) {
        info = MAGMA_SUCCESS;
        goto cleanup;
    }
    if ( MAGMA_S_ABS(curv) <= 0.0 ) {
        // <p, A p> vanished for the first direction
        info = MAGMA_NONSPD;
        goto cleanup;
    }

    // start the clock and restart the counters for the iteration proper
    t_begin = magma_sync_wtime( queue );
    solver_par->numiter    = 0;
    solver_par->spmv_count = 0;

    for (;;) {
        solver_par->numiter++;

        rr_new = magma_sdot( len, resid.dval, 1, resid.dval, 1, queue );

        if ( solver_par->numiter != 1 ) {
            // dir = resid + (rr_new / rr_old) * dir
            dir_scale = (rr_new/rr_old);
            magma_sscal( len, dir_scale, dir.dval, 1, queue );
            magma_saxpy( len, f_one, resid.dval, 1, dir.dval, 1, queue );
        } else {
            // first step: dir = resid
            magma_scopy( len, resid.dval, 1, dir.dval, 1, queue );
        }

        CHECK( magma_s_spmv( f_one, A, dir, f_zero, Adir, queue ));
        solver_par->spmv_count++;

        curv = magma_sdot( len, dir.dval, 1, Adir.dval, 1, queue );
        step = rr_new / curv;

        magma_saxpy( len,  step, dir.dval,  1, x->dval,    1, queue );   // x += step * dir
        magma_saxpy( len, -step, Adir.dval, 1, resid.dval, 1, queue );   // resid -= step * Adir
        rr_old = rr_new;

        res_nrm = magma_snrm2( len, resid.dval, 1, queue );

        if ( solver_par->verbose > 0 ) {
            t_now = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res_nrm;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) t_now-t_begin;
            }
        }

        // stopping criterion
        if ( res_nrm/rhs_nrm <= solver_par->rtol || res_nrm <= solver_par->atol ) {
            break;
        }
        // iteration limit
        if ( solver_par->numiter+1 > solver_par->maxiter ) {
            break;
        }
    }

    t_now = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) t_now-t_begin;

    CHECK( magma_sresidualvec( A, b, *x, &resid, &true_res, queue));
    solver_par->iter_res  = res_nrm;
    solver_par->final_res = true_res;

    if ( solver_par->numiter < solver_par->maxiter ) {
        info = MAGMA_SUCCESS;
    } else {
        // iteration limit reached: log the last residual, then classify
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res_nrm;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) t_now-t_begin;
            }
        }
        if ( solver_par->init_res > solver_par->final_res ) {
            info = MAGMA_SLOW_CONVERGENCE;
            if( solver_par->iter_res < solver_par->rtol*solver_par->init_res ||
                solver_par->iter_res < solver_par->atol ) {
                info = MAGMA_SUCCESS;
            }
        } else {
            info = MAGMA_DIVERGENCE;
        }
    }

cleanup:
    magma_smfree(&resid, queue );
    magma_smfree(&dir, queue );
    magma_smfree(&Adir, queue );

    solver_par->info = info;
    return info;
}   /* magma_scg_res */
/* //////////////////////////////////////////////////////////////////////////// -- testing any solver */ int main( int argc, char** argv ) { magma_int_t info = 0; /* Initialize */ TESTING_INIT(); magma_queue_t queue=NULL; magma_queue_create( &queue ); magmablasSetKernelStream( queue ); magma_int_t j, n=1000000, FLOPS; float one = MAGMA_S_MAKE( 1.0, 0.0 ); float two = MAGMA_S_MAKE( 2.0, 0.0 ); magma_s_matrix a={Magma_CSR}, ad={Magma_CSR}, bd={Magma_CSR}, cd={Magma_CSR}; CHECK( magma_svinit( &a, Magma_CPU, n, 1, one, queue )); CHECK( magma_svinit( &bd, Magma_DEV, n, 1, two, queue )); CHECK( magma_svinit( &cd, Magma_DEV, n, 1, one, queue )); CHECK( magma_smtransfer( a, &ad, Magma_CPU, Magma_DEV, queue )); real_Double_t start, end, res; FLOPS = 2*n; start = magma_sync_wtime( queue ); for (j=0; j<100; j++) res = magma_snrm2(n, ad.dval, 1); end = magma_sync_wtime( queue ); printf( " > MAGMA nrm2: %.2e seconds %.2e GFLOP/s\n", (end-start)/100, FLOPS*100/1e9/(end-start) ); FLOPS = n; start = magma_sync_wtime( queue ); for (j=0; j<100; j++) magma_sscal( n, two, ad.dval, 1 ); end = magma_sync_wtime( queue ); printf( " > MAGMA scal: %.2e seconds %.2e GFLOP/s\n", (end-start)/100, FLOPS*100/1e9/(end-start) ); FLOPS = 2*n; start = magma_sync_wtime( queue ); for (j=0; j<100; j++) magma_saxpy( n, one, ad.dval, 1, bd.dval, 1 ); end = magma_sync_wtime( queue ); printf( " > MAGMA axpy: %.2e seconds %.2e GFLOP/s\n", (end-start)/100, FLOPS*100/1e9/(end-start) ); FLOPS = n; start = magma_sync_wtime( queue ); for (j=0; j<100; j++) magma_scopy( n, bd.dval, 1, ad.dval, 1 ); end = magma_sync_wtime( queue ); printf( " > MAGMA copy: %.2e seconds %.2e GFLOP/s\n", (end-start)/100, FLOPS*100/1e9/(end-start) ); FLOPS = 2*n; start = magma_sync_wtime( queue ); for (j=0; j<100; j++) res = MAGMA_S_REAL( magma_sdot(n, ad.dval, 1, bd.dval, 1) ); end = magma_sync_wtime( queue ); printf( " > MAGMA dotc: %.2e seconds %.2e GFLOP/s\n", (end-start)/100, FLOPS*100/1e9/(end-start) ); printf("# tester BLAS: 
ok\n"); magma_smfree( &a, queue); magma_smfree(&ad, queue); magma_smfree(&bd, queue); magma_smfree(&cd, queue); cleanup: magma_smfree( &a, queue); magma_smfree(&ad, queue); magma_smfree(&bd, queue); magma_smfree(&cd, queue); magmablasSetKernelStream( NULL ); magma_queue_destroy( queue ); magma_finalize(); return info; }
/* //////////////////////////////////////////////////////////////////////////// -- testing zdot */ int main( int argc, char** argv ) { magma_int_t info = 0; magma_queue_t queue=NULL; magma_queue_create( 0, &queue ); const float one = MAGMA_S_MAKE(1.0, 0.0); const float zero = MAGMA_S_MAKE(0.0, 0.0); float alpha; TESTING_INIT(); magma_s_matrix a={Magma_CSR}, b={Magma_CSR}, x={Magma_CSR}, y={Magma_CSR}, skp={Magma_CSR}; printf("%%=======================================================================================================================================================================\n"); printf("\n"); printf(" | runtime | GFLOPS\n"); printf("%% n num_vecs | CUDOT CUGEMV MAGMAGEMV MDOT MDGM MDGM_SHFL | CUDOT CUGEMV MAGMAGEMV MDOT MDGM MDGM_SHFL\n"); printf("%%------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n"); printf("\n"); for( magma_int_t num_vecs=1; num_vecs <= 32; num_vecs += 1 ) { for( magma_int_t n=500000; n < 500001; n += 10000 ) { int iters = 10; float computations = (2.* n * iters * num_vecs); #define ENABLE_TIMER #ifdef ENABLE_TIMER real_Double_t mdot1, mdot2, mdgm1, mdgm2, magmagemv1, magmagemv2, cugemv1, cugemv2, cudot1, cudot2; real_Double_t mdot_time, mdgm_time, mdgmshf_time, magmagemv_time, cugemv_time, cudot_time; #endif CHECK( magma_svinit( &a, Magma_DEV, n, num_vecs, one, queue )); CHECK( magma_svinit( &b, Magma_DEV, n, 1, one, queue )); CHECK( magma_svinit( &x, Magma_DEV, n, 8, one, queue )); CHECK( magma_svinit( &y, Magma_DEV, n, 8, one, queue )); CHECK( magma_svinit( &skp, Magma_DEV, 1, num_vecs, zero, queue )); // warm up CHECK( magma_sgemvmdot( n, num_vecs, a.dval, b.dval, x.dval, y.dval, skp.dval, queue )); // CUDOT #ifdef ENABLE_TIMER cudot1 = magma_sync_wtime( queue ); #endif for( int h=0; h < iters; h++) { for( int l=0; l<num_vecs; l++){ alpha = magma_sdot( n, a.dval+l*a.num_rows, 1, b.dval, 1, queue ); 
//cudaDeviceSynchronize(); } //cudaDeviceSynchronize(); } #ifdef ENABLE_TIMER cudot2 = magma_sync_wtime( queue ); cudot_time=cudot2-cudot1; #endif // CUGeMV #ifdef ENABLE_TIMER cugemv1 = magma_sync_wtime( queue ); #endif for( int h=0; h < iters; h++) { magma_sgemv( MagmaTrans, n, num_vecs, one, a.dval, n, b.dval, 1, zero, skp.dval, 1, queue ); } #ifdef ENABLE_TIMER cugemv2 = magma_sync_wtime( queue ); cugemv_time=cugemv2-cugemv1; #endif // MAGMAGeMV #ifdef ENABLE_TIMER magmagemv1 = magma_sync_wtime( queue ); #endif for( int h=0; h < iters; h++) { magmablas_sgemv( MagmaTrans, n, num_vecs, one, a.dval, n, b.dval, 1, zero, skp.dval, 1, queue ); } #ifdef ENABLE_TIMER magmagemv2 = magma_sync_wtime( queue ); magmagemv_time=magmagemv2-magmagemv1; #endif // MDOT #ifdef ENABLE_TIMER mdot1 = magma_sync_wtime( queue ); #endif for( int h=0; h < iters; h++) { for( int c = 0; c<num_vecs/2; c++ ){ CHECK( magma_smdotc( n, 2, a.dval, b.dval, x.dval, y.dval, skp.dval, queue )); } for( int c = 0; c<num_vecs%2; c++ ){ CHECK( magma_smdotc( n, 1, a.dval, b.dval, x.dval, y.dval, skp.dval, queue )); } //h++; } #ifdef ENABLE_TIMER mdot2 = magma_sync_wtime( queue ); mdot_time=mdot2-mdot1; #endif // MDGM #ifdef ENABLE_TIMER mdgm1 = magma_sync_wtime( queue ); #endif for( int h=0; h < iters; h++) { CHECK( magma_sgemvmdot( n, num_vecs, a.dval, b.dval, x.dval, y.dval, skp.dval, queue )); //h++; } #ifdef ENABLE_TIMER mdgm2 = magma_sync_wtime( queue ); mdgm_time=mdgm2-mdgm1; #endif // MDGM_shfl #ifdef ENABLE_TIMER mdgm1 = magma_sync_wtime( queue ); #endif for( int h=0; h < iters; h++) { CHECK( magma_sgemvmdot_shfl( n, num_vecs, a.dval, b.dval, x.dval, y.dval, skp.dval, queue )); } #ifdef ENABLE_TIMER mdgm2 = magma_sync_wtime( queue ); mdgmshf_time=mdgm2-mdgm1; #endif //magma_sprint_gpu(num_vecs,1,skp.dval,num_vecs); //Chronometry #ifdef ENABLE_TIMER printf("%d %d %e %e %e %e %e %e || %e %e %e %e %e %e\n", int(n), int(num_vecs), cudot_time/iters, (cugemv_time)/iters, (magmagemv_time)/iters, 
(mdot_time)/iters, (mdgm_time)/iters, (mdgmshf_time)/iters, computations/(cudot_time*1e9), computations/(cugemv_time*1e9), computations/(magmagemv_time*1e9), computations/(mdot_time*1e9), computations/(mdgm_time*1e9), computations/(mdgmshf_time*1e9) ); #endif magma_smfree(&a, queue ); magma_smfree(&b, queue ); magma_smfree(&x, queue ); magma_smfree(&y, queue ); magma_smfree(&skp, queue ); } //printf("%%================================================================================================================================================\n"); //printf("\n"); //printf("\n"); } // use alpha to silence compiler warnings if ( isnan( real( alpha ))) { info = -1; } cleanup: magma_queue_destroy( queue ); TESTING_FINALIZE(); return info; }