magma_int_t magma_dcg_merge( magma_d_sparse_matrix A, magma_d_vector b, magma_d_vector *x, magma_d_solver_par *solver_par ){ // prepare solver feedback solver_par->solver = Magma_CGMERGE; solver_par->numiter = 0; solver_par->info = 0; // some useful variables double c_zero = MAGMA_D_ZERO, c_one = MAGMA_D_ONE; magma_int_t dofs = A.num_rows; // GPU stream magma_queue_t stream[2]; magma_event_t event[1]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); magma_event_create( &event[0] ); // GPU workspace magma_d_vector r, d, z; magma_d_vinit( &r, Magma_DEV, dofs, c_zero ); magma_d_vinit( &d, Magma_DEV, dofs, c_zero ); magma_d_vinit( &z, Magma_DEV, dofs, c_zero ); double *d1, *d2, *skp; magma_dmalloc( &d1, dofs*(1) ); magma_dmalloc( &d2, dofs*(1) ); // array for the parameters magma_dmalloc( &skp, 6 ); // skp = [alpha|beta|gamma|rho|tmp1|tmp2] // solver variables double alpha, beta, gamma, rho, tmp1, *skp_h; double nom, nom0, r0, betanom, den; // solver setup magma_dscal( dofs, c_zero, x->val, 1) ; // x = 0 magma_dcopy( dofs, b.val, 1, r.val, 1 ); // r = b magma_dcopy( dofs, b.val, 1, d.val, 1 ); // d = b nom0 = betanom = magma_dnrm2( dofs, r.val, 1 ); nom = nom0 * nom0; // nom = r' * r magma_d_spmv( c_one, A, d, c_zero, z ); // z = A d den = MAGMA_D_REAL( magma_ddot(dofs, d.val, 1, z.val, 1) ); // den = d'* z solver_par->init_res = nom0; // array on host for the parameters magma_dmalloc_cpu( &skp_h, 6 ); alpha = rho = gamma = tmp1 = c_one; beta = magma_ddot(dofs, r.val, 1, r.val, 1); skp_h[0]=alpha; skp_h[1]=beta; skp_h[2]=gamma; skp_h[3]=rho; skp_h[4]=tmp1; skp_h[5]=MAGMA_D_MAKE(nom, 0.0); magma_dsetvector( 6, skp_h, 1, skp, 1 ); if ( (r0 = nom * solver_par->epsilon) < ATOLERANCE ) r0 = ATOLERANCE; if ( nom < r0 ) return MAGMA_SUCCESS; // check positive definite if (den <= 0.0) { printf("Operator A is not postive definite. 
(Ar,r) = %f\n", den); return -100; } //Chronometry real_Double_t tempo1, tempo2; magma_device_sync(); tempo1=magma_wtime(); if( solver_par->verbose > 0 ){ solver_par->res_vec[0] = (real_Double_t) nom0; solver_par->timing[0] = 0.0; } // start iteration for( solver_par->numiter= 1; solver_par->numiter<solver_par->maxiter; solver_par->numiter++ ){ magmablasSetKernelStream(stream[0]); // computes SpMV and dot product magma_dcgmerge_spmv1( A, d1, d2, d.val, z.val, skp ); // updates x, r, computes scalars and updates d magma_dcgmerge_xrbeta( dofs, d1, d2, x->val, r.val, d.val, z.val, skp ); // check stopping criterion (asynchronous copy) magma_dgetvector_async( 1 , skp+1, 1, skp_h+1, 1, stream[1] ); betanom = sqrt(MAGMA_D_REAL(skp_h[1])); if( solver_par->verbose > 0 ){ magma_device_sync(); tempo2=magma_wtime(); if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( betanom < r0 ) { break; } } magma_device_sync(); tempo2=magma_wtime(); solver_par->runtime = (real_Double_t) tempo2-tempo1; double residual; magma_dresidual( A, b, *x, &residual ); solver_par->iter_res = betanom; solver_par->final_res = residual; if( solver_par->numiter < solver_par->maxiter){ solver_par->info = 0; }else if( solver_par->init_res > solver_par->final_res ){ if( solver_par->verbose > 0 ){ if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = -2; } else{ if( solver_par->verbose > 0 ){ if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) 
tempo2-tempo1; } } solver_par->info = -1; } magma_d_vfree(&r); magma_d_vfree(&z); magma_d_vfree(&d); magma_free( d1 ); magma_free( d2 ); magma_free( skp ); magma_free_cpu( skp_h ); return MAGMA_SUCCESS; } /* magma_dcg_merge */
magma_int_t magma_dpgmres( magma_d_sparse_matrix A, magma_d_vector b, magma_d_vector *x, magma_d_solver_par *solver_par, magma_d_preconditioner *precond_par ){ // prepare solver feedback solver_par->solver = Magma_PGMRES; solver_par->numiter = 0; solver_par->info = 0; // local variables double c_zero = MAGMA_D_ZERO, c_one = MAGMA_D_ONE, c_mone = MAGMA_D_NEG_ONE; magma_int_t dofs = A.num_rows; magma_int_t i, j, k, m = 0; magma_int_t restart = min( dofs-1, solver_par->restart ); magma_int_t ldh = restart+1; double nom, rNorm, RNorm, nom0, betanom, r0 = 0.; // CPU workspace magma_setdevice(0); double *H, *HH, *y, *h1; magma_dmalloc_pinned( &H, (ldh+1)*ldh ); magma_dmalloc_pinned( &y, ldh ); magma_dmalloc_pinned( &HH, ldh*ldh ); magma_dmalloc_pinned( &h1, ldh ); // GPU workspace magma_d_vector r, q, q_t, z, z_t, t; magma_d_vinit( &t, Magma_DEV, dofs, c_zero ); magma_d_vinit( &r, Magma_DEV, dofs, c_zero ); magma_d_vinit( &q, Magma_DEV, dofs*(ldh+1), c_zero ); magma_d_vinit( &z, Magma_DEV, dofs*(ldh+1), c_zero ); magma_d_vinit( &z_t, Magma_DEV, dofs, c_zero ); q_t.memory_location = Magma_DEV; q_t.val = NULL; q_t.num_rows = q_t.nnz = dofs; double *dy, *dH = NULL; if (MAGMA_SUCCESS != magma_dmalloc( &dy, ldh )) return MAGMA_ERR_DEVICE_ALLOC; if (MAGMA_SUCCESS != magma_dmalloc( &dH, (ldh+1)*ldh )) return MAGMA_ERR_DEVICE_ALLOC; // GPU stream magma_queue_t stream[2]; magma_event_t event[1]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); magma_event_create( &event[0] ); magmablasSetKernelStream(stream[0]); magma_dscal( dofs, c_zero, x->val, 1 ); // x = 0 magma_dcopy( dofs, b.val, 1, r.val, 1 ); // r = b nom0 = betanom = magma_dnrm2( dofs, r.val, 1 ); // nom0= || r|| nom = nom0 * nom0; solver_par->init_res = nom0; H(1,0) = MAGMA_D_MAKE( nom0, 0. 
); magma_dsetvector(1, &H(1,0), 1, &dH(1,0), 1); if ( (r0 = nom * solver_par->epsilon) < ATOLERANCE ) r0 = ATOLERANCE; if ( nom < r0 ) return MAGMA_SUCCESS; //Chronometry real_Double_t tempo1, tempo2; magma_device_sync(); tempo1=magma_wtime(); if( solver_par->verbose > 0 ){ solver_par->res_vec[0] = nom0; solver_par->timing[0] = 0.0; } // start iteration for( solver_par->numiter= 1; solver_par->numiter<solver_par->maxiter; solver_par->numiter++ ){ magma_dcopy(dofs, r.val, 1, q(0), 1); // q[0] = 1.0/H(1,0) r magma_dscal(dofs, 1./H(1,0), q(0), 1); // (to be fused) for(k=1; k<=restart; k++) { q_t.val = q(k-1); magmablasSetKernelStream(stream[0]); // preconditioner // z[k] = M^(-1) q(k) magma_d_applyprecond_left( A, q_t, &t, precond_par ); magma_d_applyprecond_right( A, t, &z_t, precond_par ); magma_dcopy(dofs, z_t.val, 1, z(k-1), 1); // r = A q[k] magma_d_spmv( c_one, A, z_t, c_zero, r ); if (solver_par->ortho == Magma_MGS ) { // modified Gram-Schmidt magmablasSetKernelStream(stream[0]); for (i=1; i<=k; i++) { H(i,k) =magma_ddot(dofs, q(i-1), 1, r.val, 1); // H(i,k) = q[i] . r magma_daxpy(dofs,-H(i,k), q(i-1), 1, r.val, 1); // r = r - H(i,k) q[i] } H(k+1,k) = MAGMA_D_MAKE( magma_dnrm2(dofs, r.val, 1), 0. ); // H(k+1,k) = sqrt(r . r) if (k < restart) { magma_dcopy(dofs, r.val, 1, q(k), 1); // q[k] = 1.0/H[k][k-1] r magma_dscal(dofs, 1./H(k+1,k), q(k), 1); // (to be fused) } } else if (solver_par->ortho == Magma_FUSED_CGS ) { // fusing dgemv with dnrm2 in classical Gram-Schmidt magmablasSetKernelStream(stream[0]); magma_dcopy(dofs, r.val, 1, q(k), 1); // dH(1:k+1,k) = q[0:k] . 
r magmablas_dgemv(MagmaTrans, dofs, k+1, c_one, q(0), dofs, r.val, 1, c_zero, &dH(1,k), 1); // r = r - q[0:k-1] dH(1:k,k) magmablas_dgemv(MagmaNoTrans, dofs, k, c_mone, q(0), dofs, &dH(1,k), 1, c_one, r.val, 1); // 1) dH(k+1,k) = sqrt( dH(k+1,k) - dH(1:k,k) ) magma_dcopyscale( dofs, k, r.val, q(k), &dH(1,k) ); // 2) q[k] = q[k] / dH(k+1,k) magma_event_record( event[0], stream[0] ); magma_queue_wait_event( stream[1], event[0] ); magma_dgetvector_async(k+1, &dH(1,k), 1, &H(1,k), 1, stream[1]); // asynch copy dH(1:(k+1),k) to H(1:(k+1),k) } else { // classical Gram-Schmidt (default) // > explicitly calling magmabls magmablasSetKernelStream(stream[0]); magmablas_dgemv(MagmaTrans, dofs, k, c_one, q(0), dofs, r.val, 1, c_zero, &dH(1,k), 1); // dH(1:k,k) = q[0:k-1] . r #ifndef DNRM2SCALE // start copying dH(1:k,k) to H(1:k,k) magma_event_record( event[0], stream[0] ); magma_queue_wait_event( stream[1], event[0] ); magma_dgetvector_async(k, &dH(1,k), 1, &H(1,k), 1, stream[1]); #endif // r = r - q[0:k-1] dH(1:k,k) magmablas_dgemv(MagmaNoTrans, dofs, k, c_mone, q(0), dofs, &dH(1,k), 1, c_one, r.val, 1); #ifdef DNRM2SCALE magma_dcopy(dofs, r.val, 1, q(k), 1); // q[k] = r / H(k,k-1) magma_dnrm2scale(dofs, q(k), dofs, &dH(k+1,k) ); // dH(k+1,k) = sqrt(r . r) and r = r / dH(k+1,k) magma_event_record( event[0], stream[0] ); // start sending dH(1:k,k) to H(1:k,k) magma_queue_wait_event( stream[1], event[0] ); // can we keep H(k+1,k) on GPU and combine? magma_dgetvector_async(k+1, &dH(1,k), 1, &H(1,k), 1, stream[1]); #else H(k+1,k) = MAGMA_D_MAKE( magma_dnrm2(dofs, r.val, 1), 0. ); // H(k+1,k) = sqrt(r . 
r) if( k<solver_par->restart ){ magmablasSetKernelStream(stream[0]); magma_dcopy(dofs, r.val, 1, q(k), 1); // q[k] = 1.0/H[k][k-1] r magma_dscal(dofs, 1./H(k+1,k), q(k), 1); // (to be fused) } #endif } } magma_queue_sync( stream[1] ); for( k=1; k<=restart; k++ ){ /* Minimization of || b-Ax || in H_k */ for (i=1; i<=k; i++) { #if defined(PRECISION_z) || defined(PRECISION_c) cblas_ddot_sub( i+1, &H(1,k), 1, &H(1,i), 1, &HH(k,i) ); #else HH(k,i) = cblas_ddot(i+1, &H(1,k), 1, &H(1,i), 1); #endif } h1[k] = H(1,k)*H(1,0); if (k != 1) for (i=1; i<k; i++) { for (m=i+1; m<k; m++){ HH(k,m) -= HH(k,i) * HH(m,i); } HH(k,k) -= HH(k,i) * HH(k,i) / HH(i,i); HH(k,i) = HH(k,i)/HH(i,i); h1[k] -= h1[i] * HH(k,i); } y[k] = h1[k]/HH(k,k); if (k != 1) for (i=k-1; i>=1; i--) { y[i] = h1[i]/HH(i,i); for (j=i+1; j<=k; j++) y[i] -= y[j] * HH(j,i); } m = k; rNorm = fabs(MAGMA_D_REAL(H(k+1,k))); } magma_dsetmatrix_async(m, 1, y+1, m, dy, m, stream[0]); magmablasSetKernelStream(stream[0]); magma_dgemv(MagmaNoTrans, dofs, m, c_one, z(0), dofs, dy, 1, c_one, x->val, 1); magma_d_spmv( c_mone, A, *x, c_zero, r ); // r = - A * x magma_daxpy(dofs, c_one, b.val, 1, r.val, 1); // r = r + b H(1,0) = MAGMA_D_MAKE( magma_dnrm2(dofs, r.val, 1), 0. 
); // RNorm = H[1][0] = || r || RNorm = MAGMA_D_REAL( H(1,0) ); betanom = fabs(RNorm); if( solver_par->verbose > 0 ){ magma_device_sync(); tempo2=magma_wtime(); if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( betanom < r0 ) { break; } } magma_device_sync(); tempo2=magma_wtime(); solver_par->runtime = (real_Double_t) tempo2-tempo1; double residual; magma_dresidual( A, b, *x, &residual ); solver_par->iter_res = betanom; solver_par->final_res = residual; if( solver_par->numiter < solver_par->maxiter){ solver_par->info = 0; }else if( solver_par->init_res > solver_par->final_res ){ if( solver_par->verbose > 0 ){ if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = -2; } else{ if( solver_par->verbose > 0 ){ if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = -1; } // free pinned memory magma_free_pinned( H ); magma_free_pinned( y ); magma_free_pinned( HH ); magma_free_pinned( h1 ); // free GPU memory magma_free(dy); if (dH != NULL ) magma_free(dH); magma_d_vfree(&t); magma_d_vfree(&r); magma_d_vfree(&q); magma_d_vfree(&z); magma_d_vfree(&z_t); // free GPU streams and events magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_event_destroy( event[0] ); magmablasSetKernelStream(NULL); return MAGMA_SUCCESS; } /* magma_dgmres */
/**
 * Iterative refinement for A * x = b.
 *
 * x is reset to zero and r starts as a copy of b.  Each iteration scales the
 * residual by 1/nom, calls magma_d_precond() as the inner solver
 * (A * z = r), scales z back by nom, adds it to x and recomputes
 * r = b - A x.  nom starts as nom0 * nom0 and is || r || afterwards.
 *
 * @param A            system matrix
 * @param b            right-hand side(s); A.num_rows * b.num_cols entries
 * @param x            solution, overwritten
 * @param solver_par   reads atol, rtol, maxiter, verbose; writes solver,
 *                     numiter, spmv_count, init_res, iter_res, final_res,
 *                     runtime, info and, when verbose > 0, res_vec[] /
 *                     timing[]
 * @param precond_par  inner-solver configuration handed to magma_d_precond
 * @param queue        queue all device work is submitted to
 *
 * @return MAGMA_SUCCESS, MAGMA_SLOW_CONVERGENCE, MAGMA_DIVERGENCE, or the
 *         error code of the first failing call wrapped in CHECK.  The same
 *         value is stored in solver_par->info.
 */
extern "C" magma_int_t
magma_diterref(
    magma_d_matrix A, magma_d_matrix b, magma_d_matrix *x,
    magma_d_solver_par *solver_par, magma_d_preconditioner *precond_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;

    // some useful variables
    double c_zero    = MAGMA_D_ZERO;
    double c_one     = MAGMA_D_ONE;
    double c_neg_one = MAGMA_D_NEG_ONE;

    // prepare solver feedback
    solver_par->solver = Magma_ITERREF;
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;

    magma_int_t dofs = A.num_rows*b.num_cols;

    // solver variables (declared before the first CHECK so that the jump to
    // cleanup never crosses an initialization)
    double nom, nom0;
    double residual;
    real_Double_t tempo1, tempo2;

    // workspace
    magma_d_matrix r={Magma_CSR}, z={Magma_CSR};
    CHECK( magma_dvinit( &r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_dvinit( &z, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));

    // solver setup.  No residual is computed for the incoming x: x is zeroed
    // right here and init_res is defined as || b ||.
    magma_dscal( dofs, c_zero, x->dval, 1, queue );         // x = 0
    magma_dcopy( dofs, b.dval, 1, r.dval, 1, queue );       // r = b
    nom0 = magma_dnrm2( dofs, r.dval, 1, queue );           // nom0 = || r ||
    nom = nom0 * nom0;
    solver_par->init_res = nom0;

    // nom0 == 0 is tested explicitly: with atol == 0 the relative test would
    // evaluate 0/0 (NaN, never "< rtol") and the loop would divide by zero.
    if ( nom0 == 0.0
         || nom0 < solver_par->atol
         || nom0/solver_par->init_res < solver_par->rtol ) {
        solver_par->final_res = solver_par->init_res;
        solver_par->iter_res = solver_par->init_res;
        info = MAGMA_SUCCESS;
        goto cleanup;
    }

    // Chronometry
    tempo1 = magma_sync_wtime( queue );
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = nom0;
        solver_par->timing[0] = 0.0;
    }

    // start iteration
    for ( solver_par->numiter = 1;
          solver_par->numiter < solver_par->maxiter;
          solver_par->numiter++ ) {
        magma_dscal( dofs, MAGMA_D_MAKE(1./nom, 0.), r.dval, 1, queue );    // scale it
        CHECK( magma_d_precond( A, r, &z, precond_par, queue ));            // inner solver: A * z = r
        magma_dscal( dofs, MAGMA_D_MAKE(nom, 0.), z.dval, 1, queue );       // scale it
        magma_daxpy( dofs, c_one, z.dval, 1, x->dval, 1, queue );           // x = x + z
        CHECK( magma_d_spmv( c_neg_one, A, *x, c_zero, r, queue ));         // r = - A x
        solver_par->spmv_count++;
        magma_daxpy( dofs, c_one, b.dval, 1, r.dval, 1, queue );            // r = r + b
        nom = magma_dnrm2( dofs, r.dval, 1, queue );                        // nom = || r ||

        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter) % solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                    = (real_Double_t) nom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                    = (real_Double_t) tempo2-tempo1;
            }
        }

        if ( nom < solver_par->atol || nom/solver_par->init_res < solver_par->rtol ) {
            break;
        }
    }

    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    CHECK( magma_dresidualvec( A, b, *x, &r, &residual, queue ));
    solver_par->final_res = residual;
    solver_par->iter_res = nom;

    if ( solver_par->numiter < solver_par->maxiter ) {
        info = MAGMA_SUCCESS;
    } else {
        // iteration limit reached: log the last sample, then classify
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter) % solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                    = (real_Double_t) nom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                    = (real_Double_t) tempo2-tempo1;
            }
        }
        if ( solver_par->init_res > solver_par->final_res ) {
            info = MAGMA_SLOW_CONVERGENCE;
            // the last step may still have met the tolerance
            if ( solver_par->iter_res < solver_par->atol ||
                 solver_par->iter_res/solver_par->init_res < solver_par->rtol ) {
                info = MAGMA_SUCCESS;
            }
        } else {
            info = MAGMA_DIVERGENCE;
        }
    }

cleanup:
    magma_dmfree(&r, queue );
    magma_dmfree(&z, queue );

    solver_par->info = info;
    return info;
}   /* magma_diterref */
extern "C" magma_int_t magma_dpbicgstab( magma_d_sparse_matrix A, magma_d_vector b, magma_d_vector *x, magma_d_solver_par *solver_par, magma_d_preconditioner *precond_par, magma_queue_t queue ) { // set queue for old dense routines magma_queue_t orig_queue; magmablasGetKernelStream( &orig_queue ); // prepare solver feedback solver_par->solver = Magma_PBICGSTAB; solver_par->numiter = 0; solver_par->info = MAGMA_SUCCESS; // some useful variables double c_zero = MAGMA_D_ZERO, c_one = MAGMA_D_ONE, c_mone = MAGMA_D_NEG_ONE; magma_int_t dofs = A.num_rows; // workspace magma_d_vector r,rr,p,v,s,t,ms,mt,y,z; magma_d_vinit( &r, Magma_DEV, dofs, c_zero, queue ); magma_d_vinit( &rr, Magma_DEV, dofs, c_zero, queue ); magma_d_vinit( &p, Magma_DEV, dofs, c_zero, queue ); magma_d_vinit( &v, Magma_DEV, dofs, c_zero, queue ); magma_d_vinit( &s, Magma_DEV, dofs, c_zero, queue ); magma_d_vinit( &t, Magma_DEV, dofs, c_zero, queue ); magma_d_vinit( &ms, Magma_DEV, dofs, c_zero, queue ); magma_d_vinit( &mt, Magma_DEV, dofs, c_zero, queue ); magma_d_vinit( &y, Magma_DEV, dofs, c_zero, queue ); magma_d_vinit( &z, Magma_DEV, dofs, c_zero, queue ); // solver variables double alpha, beta, omega, rho_old, rho_new; double nom, betanom, nom0, r0, den, res; // solver setup magma_dscal( dofs, c_zero, x->dval, 1) ; // x = 0 magma_dcopy( dofs, b.dval, 1, r.dval, 1 ); // r = b magma_dcopy( dofs, b.dval, 1, rr.dval, 1 ); // rr = b nom0 = betanom = magma_dnrm2( dofs, r.dval, 1 ); // nom = || r || nom = nom0*nom0; rho_new = omega = alpha = MAGMA_D_MAKE( 1.0, 0. ); solver_par->init_res = nom0; magma_d_spmv( c_one, A, r, c_zero, v, queue ); // z = A r den = MAGMA_D_REAL( magma_ddot(dofs, v.dval, 1, r.dval, 1) ); // den = z' * r if ( (r0 = nom * solver_par->epsilon) < ATOLERANCE ) r0 = ATOLERANCE; if ( nom < r0 ) { magmablasSetKernelStream( orig_queue ); return MAGMA_SUCCESS; } // check positive definite if (den <= 0.0) { printf("Operator A is not postive definite. 
(Ar,r) = %f\n", den); magmablasSetKernelStream( orig_queue ); return MAGMA_NONSPD; } //Chronometry real_Double_t tempo1, tempo2; tempo1 = magma_sync_wtime( queue ); if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = nom0; solver_par->timing[0] = 0.0; } // start iteration for( solver_par->numiter= 1; solver_par->numiter<solver_par->maxiter; solver_par->numiter++ ) { rho_old = rho_new; // rho_old=rho rho_new = magma_ddot( dofs, rr.dval, 1, r.dval, 1 ); // rho=<rr,r> beta = rho_new/rho_old * alpha/omega; // beta=rho/rho_old *alpha/omega magma_dscal( dofs, beta, p.dval, 1 ); // p = beta*p magma_daxpy( dofs, c_mone * omega * beta, v.dval, 1 , p.dval, 1 ); // p = p-omega*beta*v magma_daxpy( dofs, c_one, r.dval, 1, p.dval, 1 ); // p = p+r // preconditioner magma_d_applyprecond_left( A, p, &mt, precond_par, queue ); magma_d_applyprecond_right( A, mt, &y, precond_par, queue ); magma_d_spmv( c_one, A, y, c_zero, v, queue ); // v = Ap alpha = rho_new / magma_ddot( dofs, rr.dval, 1, v.dval, 1 ); magma_dcopy( dofs, r.dval, 1 , s.dval, 1 ); // s=r magma_daxpy( dofs, c_mone * alpha, v.dval, 1 , s.dval, 1 ); // s=s-alpha*v // preconditioner magma_d_applyprecond_left( A, s, &ms, precond_par, queue ); magma_d_applyprecond_right( A, ms, &z, precond_par, queue ); magma_d_spmv( c_one, A, z, c_zero, t, queue ); // t=As // preconditioner magma_d_applyprecond_left( A, s, &ms, precond_par, queue ); magma_d_applyprecond_left( A, t, &mt, precond_par, queue ); // omega = <ms,mt>/<mt,mt> omega = magma_ddot( dofs, mt.dval, 1, ms.dval, 1 ) / magma_ddot( dofs, mt.dval, 1, mt.dval, 1 ); magma_daxpy( dofs, alpha, y.dval, 1 , x->dval, 1 ); // x=x+alpha*p magma_daxpy( dofs, omega, z.dval, 1 , x->dval, 1 ); // x=x+omega*s magma_dcopy( dofs, s.dval, 1 , r.dval, 1 ); // r=s magma_daxpy( dofs, c_mone * omega, t.dval, 1 , r.dval, 1 ); // r=r-omega*t res = betanom = magma_dnrm2( dofs, r.dval, 1 ); nom = betanom*betanom; if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( 
(solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) res; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( res/nom0 < solver_par->epsilon ) { break; } } tempo2 = magma_sync_wtime( queue ); solver_par->runtime = (real_Double_t) tempo2-tempo1; double residual; magma_dresidual( A, b, *x, &residual, queue ); solver_par->final_res = residual; solver_par->iter_res = res; if ( solver_par->numiter < solver_par->maxiter) { solver_par->info = MAGMA_SUCCESS; } else if ( solver_par->init_res > solver_par->final_res ) { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = MAGMA_SLOW_CONVERGENCE; } else { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = MAGMA_DIVERGENCE; } magma_d_vfree(&r, queue ); magma_d_vfree(&rr, queue ); magma_d_vfree(&p, queue ); magma_d_vfree(&v, queue ); magma_d_vfree(&s, queue ); magma_d_vfree(&t, queue ); magma_d_vfree(&ms, queue ); magma_d_vfree(&mt, queue ); magma_d_vfree(&y, queue ); magma_d_vfree(&z, queue ); magmablasSetKernelStream( orig_queue ); return MAGMA_SUCCESS; } /* magma_dbicgstab */
/**
 * BiCGSTAB solver for A * x = b using merged GPU kernels (variant 2).
 *
 * All six work vectors live in one device allocation q = rr|r|p|v|s|t and
 * are addressed through the q() accessor macro defined elsewhere in the
 * file; rr, r, p, v, s and t are views onto it and own no memory.  The
 * scalar recurrence values are kept in an eight-entry device array
 *     skp = [alpha|beta|omega|rho_old|rho|nom|tmp1|tmp2]
 * After every iteration entry 5 is copied back to the host and its square
 * root is used as the residual estimate.
 *
 * @param A           system matrix; A.num_rows is the problem size
 * @param b           right-hand side
 * @param x           solution vector, overwritten
 * @param solver_par  reads epsilon, maxiter, verbose; writes solver, numiter,
 *                    info, init_res, iter_res, final_res, runtime and, when
 *                    verbose > 0, res_vec[] / timing[]
 *
 * @return MAGMA_SUCCESS on the regular and the "already converged" path,
 *         -100 when (A r, r) <= 0 for the initial residual,
 *         MAGMA_ERR_DEVICE_ALLOC / MAGMA_ERR_HOST_ALLOC when a workspace
 *         allocation fails.
 *
 * Every exit path goes through the cleanup label, which releases all
 * workspace, the two streams and the event.
 */
magma_int_t
magma_dbicgstab_merge2( magma_d_sparse_matrix A, magma_d_vector b,
                        magma_d_vector *x, magma_d_solver_par *solver_par )
{
    magma_int_t status = MAGMA_SUCCESS;

    // prepare solver feedback
    solver_par->solver = Magma_BICGSTABMERGE2;
    solver_par->numiter = 0;
    solver_par->info = 0;

    // some useful variables
    double c_zero = MAGMA_D_ZERO, c_one = MAGMA_D_ONE;
    magma_int_t dofs = A.num_rows;

    // solver variables (declared before the first "goto cleanup")
    double alpha, beta, omega, rho_old, rho_new;
    double nom, nom0, betanom, r0, den, residual;
    real_Double_t tempo1, tempo2;
    double *d1 = NULL, *d2 = NULL, *skp = NULL, *skp_h = NULL;
    magma_d_vector q, r, rr, p, v, s, t;
    magma_queue_t stream[2];
    magma_event_t event[1];

    // GPU streams
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );
    magma_event_create( &event[0] );

    // workspace: one allocation, six views
    magma_d_vinit( &q, Magma_DEV, dofs*6, c_zero );     // q = rr|r|p|v|s|t

    rr.memory_location = Magma_DEV; rr.val = NULL; rr.num_rows = rr.nnz = dofs;
    r.memory_location  = Magma_DEV; r.val  = NULL; r.num_rows  = r.nnz  = dofs;
    p.memory_location  = Magma_DEV; p.val  = NULL; p.num_rows  = p.nnz  = dofs;
    v.memory_location  = Magma_DEV; v.val  = NULL; v.num_rows  = v.nnz  = dofs;
    s.memory_location  = Magma_DEV; s.val  = NULL; s.num_rows  = s.nnz  = dofs;
    t.memory_location  = Magma_DEV; t.val  = NULL; t.num_rows  = t.nnz  = dofs;

    rr.val = q(0);
    r.val  = q(1);
    p.val  = q(2);
    v.val  = q(3);
    s.val  = q(4);
    t.val  = q(5);

    // scratch buffers for the merged kernels and the parameter array
    // skp = [alpha|beta|omega|rho_old|rho|nom|tmp1|tmp2]
    if ( MAGMA_SUCCESS != magma_dmalloc( &d1, dofs*(2) ) ||
         MAGMA_SUCCESS != magma_dmalloc( &d2, dofs*(2) ) ||
         MAGMA_SUCCESS != magma_dmalloc( &skp, 8 ) ) {
        status = MAGMA_ERR_DEVICE_ALLOC;
        goto cleanup;
    }
    // host mirror of skp
    if ( MAGMA_SUCCESS != magma_dmalloc_cpu( &skp_h, 8 ) ) {
        status = MAGMA_ERR_HOST_ALLOC;
        goto cleanup;
    }

    // solver setup
    magma_dscal( dofs, c_zero, x->val, 1 );                 // x = 0
    magma_dcopy( dofs, b.val, 1, q(0), 1 );                 // rr = b
    magma_dcopy( dofs, b.val, 1, q(1), 1 );                 // r = b

    rho_new = magma_ddot( dofs, r.val, 1, r.val, 1 );       // rho=<rr,r>
    nom = MAGMA_D_REAL(magma_ddot( dofs, r.val, 1, r.val, 1 ));
    nom0 = betanom = sqrt(nom);                             // nom = || r ||
    rho_old = omega = alpha = MAGMA_D_MAKE( 1.0, 0. );
    beta = rho_new;
    solver_par->init_res = nom0;

    // upload the initial scalar parameters; the two tmp slots are zeroed so
    // that no indeterminate host memory is copied to the device
    skp_h[0] = alpha;
    skp_h[1] = beta;
    skp_h[2] = omega;
    skp_h[3] = rho_old;
    skp_h[4] = rho_new;
    skp_h[5] = MAGMA_D_MAKE(nom, 0.0);
    skp_h[6] = c_zero;
    skp_h[7] = c_zero;
    magma_dsetvector( 8, skp_h, 1, skp, 1 );

    magma_d_spmv( c_one, A, r, c_zero, v );                 // z = A r
    den = MAGMA_D_REAL( magma_ddot(dofs, v.val, 1, r.val, 1) ); // den = z dot r

    if ( (r0 = nom * solver_par->epsilon) < ATOLERANCE )
        r0 = ATOLERANCE;
    if ( nom < r0 )
        goto cleanup;                   // already converged, status == MAGMA_SUCCESS

    // check positive definite
    if ( den <= 0.0 ) {
        printf("Operator A is not postive definite. (Ar,r) = %f\n", den);
        status = -100;
        goto cleanup;
    }

    // Chronometry
    magma_device_sync();
    tempo1 = magma_wtime();
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = nom0;
        solver_par->timing[0] = 0.0;
    }

    // start iteration
    for ( solver_par->numiter = 1;
          solver_par->numiter < solver_par->maxiter;
          solver_par->numiter++ ) {
        magmablasSetKernelStream(stream[0]);

        // computes p=r+beta*(p-omega*v)
        magma_dbicgmerge1( dofs, skp, v.val, r.val, p.val );

        magma_dbicgmerge_spmv1( A, d1, d2, q(2), q(0), q(3), skp );
        magma_dbicgmerge2( dofs, skp, r.val, v.val, s.val );    // s=r-alpha*v
        magma_dbicgmerge_spmv2( A, d1, d2, q(4), q(5), skp );
        magma_dbicgmerge_xrbeta( dofs, d1, d2, q(0), q(1), q(2),
                                 q(4), q(5), x->val, skp );

        // Fetch the new squared residual for the stopping criterion.  The
        // copy runs on stream[1], so it is ordered behind the kernels on
        // stream[0] via the event, and the host waits for it before
        // reading skp_h[5].
        magma_event_record( event[0], stream[0] );
        magma_queue_wait_event( stream[1], event[0] );
        magma_dgetvector_async( 1, skp+5, 1, skp_h+5, 1, stream[1] );
        magma_queue_sync( stream[1] );
        betanom = sqrt(MAGMA_D_REAL(skp_h[5]));

        if ( solver_par->verbose > 0 ) {
            magma_device_sync();
            tempo2 = magma_wtime();
            if ( (solver_par->numiter) % solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                    = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                    = (real_Double_t) tempo2-tempo1;
            }
        }

        if ( betanom < r0 ) {
            break;
        }
    }

    magma_device_sync();
    tempo2 = magma_wtime();
    solver_par->runtime = (real_Double_t) tempo2-tempo1;

    magma_dresidual( A, b, *x, &residual );
    solver_par->iter_res = betanom;
    solver_par->final_res = residual;

    if ( solver_par->numiter < solver_par->maxiter ) {
        solver_par->info = 0;
    } else {
        // iteration limit reached: log the last sample, then classify
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter) % solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                    = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                    = (real_Double_t) tempo2-tempo1;
            }
        }
        // -2: residual went down but not far enough, -1: it did not go down
        solver_par->info = ( solver_par->init_res > solver_par->final_res ) ? -2 : -1;
    }

cleanup:
    magma_d_vfree(&q);      // frees all vectors
    if ( d1 != NULL )    magma_free( d1 );
    if ( d2 != NULL )    magma_free( d2 );
    if ( skp != NULL )   magma_free( skp );
    if ( skp_h != NULL ) magma_free_cpu( skp_h );

    // detach the BLAS kernel stream before its queue is destroyed
    magmablasSetKernelStream(NULL);
    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[1] );
    magma_event_destroy( event[0] );

    return status;
}   /* dbicgstab_merge2 */
magma_int_t magma_dpcg( magma_d_sparse_matrix A, magma_d_vector b, magma_d_vector *x, magma_d_solver_par *solver_par, magma_d_preconditioner *precond_par ){ // prepare solver feedback solver_par->solver = Magma_PCG; solver_par->numiter = 0; solver_par->info = 0; // local variables double c_zero = MAGMA_D_ZERO, c_one = MAGMA_D_ONE; magma_int_t dofs = A.num_rows; // GPU workspace magma_d_vector r, rt, p, q, h; magma_d_vinit( &r, Magma_DEV, dofs, c_zero ); magma_d_vinit( &rt, Magma_DEV, dofs, c_zero ); magma_d_vinit( &p, Magma_DEV, dofs, c_zero ); magma_d_vinit( &q, Magma_DEV, dofs, c_zero ); magma_d_vinit( &h, Magma_DEV, dofs, c_zero ); // solver variables double alpha, beta; double nom, nom0, r0, gammaold, gammanew, den, res; // solver setup magma_dscal( dofs, c_zero, x->val, 1) ; // x = 0 magma_dcopy( dofs, b.val, 1, r.val, 1 ); // r = b // preconditioner magma_d_applyprecond_left( A, r, &rt, precond_par ); magma_d_applyprecond_right( A, rt, &h, precond_par ); magma_dcopy( dofs, h.val, 1, p.val, 1 ); // p = h nom = MAGMA_D_REAL( magma_ddot(dofs, r.val, 1, h.val, 1) ); nom0 = magma_dnrm2( dofs, r.val, 1 ); magma_d_spmv( c_one, A, p, c_zero, q ); // q = A p den = MAGMA_D_REAL( magma_ddot(dofs, p.val, 1, q.val, 1) );// den = p dot q solver_par->init_res = nom0; if ( (r0 = nom * solver_par->epsilon) < ATOLERANCE ) r0 = ATOLERANCE; if ( nom < r0 ) return MAGMA_SUCCESS; // check positive definite if (den <= 0.0) { printf("Operator A is not postive definite. 
(Ar,r) = %f\n", den); return -100; } //Chronometry real_Double_t tempo1, tempo2; magma_device_sync(); tempo1=magma_wtime(); if( solver_par->verbose > 0 ){ solver_par->res_vec[0] = (real_Double_t)nom0; solver_par->timing[0] = 0.0; } // start iteration for( solver_par->numiter= 1; solver_par->numiter<solver_par->maxiter; solver_par->numiter++ ){ // preconditioner magma_d_applyprecond_left( A, r, &rt, precond_par ); magma_d_applyprecond_right( A, rt, &h, precond_par ); gammanew = MAGMA_D_REAL( magma_ddot(dofs, r.val, 1, h.val, 1) ); // gn = < r,h> if( solver_par->numiter==1 ){ magma_dcopy( dofs, h.val, 1, p.val, 1 ); // p = h }else{ beta = MAGMA_D_MAKE(gammanew/gammaold, 0.); // beta = gn/go magma_dscal(dofs, beta, p.val, 1); // p = beta*p magma_daxpy(dofs, c_one, h.val, 1, p.val, 1); // p = p + h } magma_d_spmv( c_one, A, p, c_zero, q ); // q = A p den = MAGMA_D_REAL(magma_ddot(dofs, p.val, 1, q.val, 1)); // den = p dot q alpha = MAGMA_D_MAKE(gammanew/den, 0.); magma_daxpy(dofs, alpha, p.val, 1, x->val, 1); // x = x + alpha p magma_daxpy(dofs, -alpha, q.val, 1, r.val, 1); // r = r - alpha q gammaold = gammanew; res = magma_dnrm2( dofs, r.val, 1 ); if( solver_par->verbose > 0 ){ magma_device_sync(); tempo2=magma_wtime(); if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) res; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( res/nom0 < solver_par->epsilon ) { break; } } magma_device_sync(); tempo2=magma_wtime(); solver_par->runtime = (real_Double_t) tempo2-tempo1; double residual; magma_dresidual( A, b, *x, &residual ); solver_par->iter_res = res; solver_par->final_res = residual; if( solver_par->numiter < solver_par->maxiter){ solver_par->info = 0; }else if( solver_par->init_res > solver_par->final_res ){ if( solver_par->verbose > 0 ){ if( (solver_par->numiter)%solver_par->verbose==0 ) { 
solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) res; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = -2; } else{ if( solver_par->verbose > 0 ){ if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) res; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = -1; } magma_d_vfree(&r); magma_d_vfree(&rt); magma_d_vfree(&p); magma_d_vfree(&q); magma_d_vfree(&h); return MAGMA_SUCCESS; } /* magma_dcg */
/**
    Block preconditioned Conjugate Gradient (legacy magma_d_vector interface).

    The right-hand side b is treated as num_vecs = b.num_rows / A.num_rows
    vectors of length A.num_rows stored back to back. The iterate x is reset
    to zero before the iteration starts. Only the first vector (index 0) is
    used for the stopping criterion and for the values reported in solver_par.

    @param[in]     A            system matrix
    @param[in]     b            right-hand side(s)
    @param[in,out] x            solution approximation (overwritten)
    @param[in,out] solver_par   solver parameters and feedback
                                (numiter, init_res, iter_res, final_res,
                                runtime, info, optionally res_vec/timing)
    @param[in]     precond_par  preconditioner handed to
                                magma_d_applyprecond_left/right
    @param[in]     queue        queue passed through to the sparse routines

    @return MAGMA_SUCCESS if the iteration ran (the convergence status is
            reported in solver_par->info), MAGMA_ERR_HOST_ALLOC if a host
            allocation failed, MAGMA_NONSPD if p'Ap <= 0 for the first vector.

    All exits go through one cleanup section so that the device workspace and
    the host arrays are released on every path.
*/
extern "C" magma_int_t
magma_dbpcg(
    magma_d_sparse_matrix A, magma_d_vector b, magma_d_vector *x,
    magma_d_solver_par *solver_par,
    magma_d_preconditioner *precond_par,
    magma_queue_t queue )
{
    // set queue for old dense routines
    magma_queue_t orig_queue;
    magmablasGetKernelStream( &orig_queue );

    // value handed back to the caller; the convergence status goes to solver_par->info
    magma_int_t info = MAGMA_SUCCESS;
    magma_int_t stat_cpu = 0;
    magma_int_t i, num_vecs = b.num_rows/A.num_rows;

    // declared up front so that no goto jumps across a declaration
    real_Double_t tempo1, tempo2;

    // prepare solver feedback
    solver_par->solver = Magma_PCG;
    solver_par->numiter = 0;
    solver_par->info = MAGMA_SUCCESS;

    // local variables
    double c_zero = MAGMA_D_ZERO, c_one = MAGMA_D_ONE;
    magma_int_t dofs = A.num_rows;

    // host arrays, one entry per right-hand side
    double *alpha = NULL, *beta = NULL;
    double *nom = NULL, *nom0 = NULL, *r0 = NULL;
    double *gammaold = NULL, *gammanew = NULL;
    double *den = NULL, *res = NULL, *residual = NULL;

    // GPU workspace
    magma_d_vector r, rt, p, q, h;
    magma_d_vinit( &r,  Magma_DEV, dofs*num_vecs, c_zero, queue );
    magma_d_vinit( &rt, Magma_DEV, dofs*num_vecs, c_zero, queue );
    magma_d_vinit( &p,  Magma_DEV, dofs*num_vecs, c_zero, queue );
    magma_d_vinit( &q,  Magma_DEV, dofs*num_vecs, c_zero, queue );
    magma_d_vinit( &h,  Magma_DEV, dofs*num_vecs, c_zero, queue );

    // solver variables (each array is allocated exactly once)
    stat_cpu += magma_dmalloc_cpu( &alpha,    num_vecs );
    stat_cpu += magma_dmalloc_cpu( &beta,     num_vecs );
    stat_cpu += magma_dmalloc_cpu( &nom,      num_vecs );
    stat_cpu += magma_dmalloc_cpu( &nom0,     num_vecs );
    stat_cpu += magma_dmalloc_cpu( &r0,       num_vecs );
    stat_cpu += magma_dmalloc_cpu( &gammaold, num_vecs );
    stat_cpu += magma_dmalloc_cpu( &gammanew, num_vecs );
    stat_cpu += magma_dmalloc_cpu( &den,      num_vecs );
    stat_cpu += magma_dmalloc_cpu( &res,      num_vecs );
    stat_cpu += magma_dmalloc_cpu( &residual, num_vecs );
    if ( stat_cpu != 0 ) {
        printf("error: memory allocation.\n");
        info = MAGMA_ERR_HOST_ALLOC;
        goto cleanup;
    }

    // solver setup
    magma_dscal( dofs*num_vecs, c_zero, x->dval, 1 );           // x = 0
    magma_dcopy( dofs*num_vecs, b.dval, 1, r.dval, 1 );         // r = b

    // preconditioner
    magma_d_applyprecond_left( A, r, &rt, precond_par, queue );
    magma_d_applyprecond_right( A, rt, &h, precond_par, queue );

    magma_dcopy( dofs*num_vecs, h.dval, 1, p.dval, 1 );         // p = h

    for( i=0; i<num_vecs; i++ ) {
        nom[i]  = MAGMA_D_REAL( magma_ddot(dofs, r(i), 1, h(i), 1) );
        nom0[i] = magma_dnrm2( dofs, r(i), 1 );
    }

    magma_d_spmv( c_one, A, p, c_zero, q, queue );              // q = A p

    for( i=0; i<num_vecs; i++ )
        den[i] = MAGMA_D_REAL( magma_ddot(dofs, p(i), 1, q(i), 1) );  // den = p dot q

    solver_par->init_res = nom0[0];

    if ( (r0[0] = nom[0] * solver_par->epsilon) < ATOLERANCE )
        r0[0] = ATOLERANCE;

    // check positive definite
    if ( den[0] <= 0.0 ) {
        printf("Operator A is not postive definite. (Ar,r) = %f\n", den[0]);
        solver_par->info = MAGMA_NONSPD;
        info = MAGMA_NONSPD;
        goto cleanup;
    }
    // initial guess already satisfies the tolerance
    if ( nom[0] < r0[0] ) {
        goto cleanup;
    }

    //Chronometry
    tempo1 = magma_sync_wtime( queue );
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = (real_Double_t)nom0[0];
        solver_par->timing[0] = 0.0;
    }

    // start iteration
    for( solver_par->numiter = 1; solver_par->numiter < solver_par->maxiter;
         solver_par->numiter++ )
    {
        // preconditioner
        magma_d_applyprecond_left( A, r, &rt, precond_par, queue );
        magma_d_applyprecond_right( A, rt, &h, precond_par, queue );

        for( i=0; i<num_vecs; i++ )
            gammanew[i] = MAGMA_D_REAL( magma_ddot(dofs, r(i), 1, h(i), 1) );  // gn = < r,h>

        if ( solver_par->numiter == 1 ) {
            magma_dcopy( dofs*num_vecs, h.dval, 1, p.dval, 1 );            // p = h
        } else {
            for( i=0; i<num_vecs; i++ ) {
                beta[i] = MAGMA_D_MAKE(gammanew[i]/gammaold[i], 0.);       // beta = gn/go
                magma_dscal(dofs, beta[i], p(i), 1);                       // p = beta*p
                magma_daxpy(dofs, c_one, h(i), 1, p(i), 1);                // p = p + h
            }
        }

        magma_d_spmv( c_one, A, p, c_zero, q, queue );                     // q = A p

        for( i=0; i<num_vecs; i++ ) {
            den[i] = MAGMA_D_REAL(magma_ddot(dofs, p(i), 1, q(i), 1));     // den = p dot q
            alpha[i] = MAGMA_D_MAKE(gammanew[i]/den[i], 0.);
            magma_daxpy(dofs,  alpha[i], p(i), 1, x->dval+dofs*i, 1);      // x = x + alpha p
            magma_daxpy(dofs, -alpha[i], q(i), 1, r(i), 1);                // r = r - alpha q
            gammaold[i] = gammanew[i];
            res[i] = magma_dnrm2( dofs, r(i), 1 );
        }

        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res[0];
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        // stopping criterion looks at the first right-hand side only
        if ( res[0]/nom0[0] < solver_par->epsilon ) {
            break;
        }
    }
    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    magma_dresidual( A, b, *x, residual, queue );
    solver_par->iter_res = res[0];
    solver_par->final_res = residual[0];

    if ( solver_par->numiter < solver_par->maxiter ) {
        solver_par->info = MAGMA_SUCCESS;
    } else {
        // iteration limit reached: record the last sample, then classify
        if ( solver_par->verbose > 0
             && (solver_par->numiter)%solver_par->verbose == 0 ) {
            solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                    = (real_Double_t) res[0];
            solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                    = (real_Double_t) tempo2-tempo1;
        }
        if ( solver_par->init_res > solver_par->final_res ) {
            solver_par->info = MAGMA_SLOW_CONVERGENCE;
        } else {
            solver_par->info = MAGMA_DIVERGENCE;
        }
    }

    for( i=0; i<num_vecs; i++ ) {
        printf("%.4e  ",res[i]);
    }
    printf("\n");
    for( i=0; i<num_vecs; i++ ) {
        printf("%.4e  ",residual[i]);
    }
    printf("\n");

cleanup:
    magma_d_vfree(&r, queue );
    magma_d_vfree(&rt, queue );
    magma_d_vfree(&p, queue );
    magma_d_vfree(&q, queue );
    magma_d_vfree(&h, queue );

    magma_free_cpu(alpha);
    magma_free_cpu(beta);
    magma_free_cpu(nom);
    magma_free_cpu(nom0);
    magma_free_cpu(r0);
    magma_free_cpu(gammaold);
    magma_free_cpu(gammanew);
    magma_free_cpu(den);
    magma_free_cpu(res);
    magma_free_cpu(residual);

    magmablasSetKernelStream( orig_queue );
    return info;
}   /* magma_dbpcg */
/**
    Iterative refinement: repeatedly solves A z = r with the inner solver
    reached through magma_d_precond and updates x = x + z, r = b - A x.

    The iterate x is reset to zero before the iteration starts. Before each
    inner solve the residual is scaled by 1/nom and the correction is scaled
    back by nom afterwards.

    @param[in]     A            system matrix
    @param[in]     b            right-hand side
    @param[in,out] x            solution approximation (overwritten)
    @param[in,out] solver_par   solver parameters and feedback
                                (numiter, init_res, iter_res, final_res,
                                runtime, info, optionally res_vec/timing)
    @param[in]     precond_par  inner solver handed to magma_d_precond
    @param[in]     queue        queue passed through to the sparse routines

    @return always MAGMA_SUCCESS; the convergence status is reported in
            solver_par->info.

    Every exit goes through one cleanup section so that the device vectors
    r and z are released even when the start residual is already below the
    tolerance.
*/
extern "C" magma_int_t
magma_diterref(
    magma_d_sparse_matrix A, magma_d_vector b, magma_d_vector *x,
    magma_d_solver_par *solver_par,
    magma_d_preconditioner *precond_par,
    magma_queue_t queue )
{
    // set queue for old dense routines
    magma_queue_t orig_queue;
    magmablasGetKernelStream( &orig_queue );

    // declared up front so that no goto jumps across a declaration
    real_Double_t tempo1, tempo2;
    double residual;
    double nom, nom0, r0;

    // prepare solver feedback
    solver_par->solver = Magma_ITERREF;
    solver_par->numiter = 0;
    solver_par->info = MAGMA_SUCCESS;

    // residual of the incoming x; init_res is overwritten with ||b|| below
    magma_dresidual( A, b, *x, &residual, queue );
    solver_par->init_res = residual;

    // some useful variables
    double c_zero = MAGMA_D_ZERO, c_one = MAGMA_D_ONE,
           c_mone = MAGMA_D_NEG_ONE;

    magma_int_t dofs = A.num_rows;

    // workspace
    magma_d_vector r, z;
    magma_d_vinit( &r, Magma_DEV, dofs, c_zero, queue );
    magma_d_vinit( &z, Magma_DEV, dofs, c_zero, queue );

    // solver setup
    magma_dscal( dofs, c_zero, x->dval, 1 );                    // x = 0
    magma_dcopy( dofs, b.dval, 1, r.dval, 1 );                  // r = b
    nom0 = magma_dnrm2(dofs, r.dval, 1);                        // nom0 = || r ||
    nom = nom0 * nom0;
    solver_par->init_res = nom0;

    if ( (r0 = nom * solver_par->epsilon) < ATOLERANCE )
        r0 = ATOLERANCE;
    // nothing to refine: release the workspace and leave
    if ( nom < r0 ) {
        goto cleanup;
    }

    //Chronometry
    tempo1 = magma_sync_wtime( queue );
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = nom0;
        solver_par->timing[0] = 0.0;
    }

    // start iteration
    for( solver_par->numiter = 1; solver_par->numiter < solver_par->maxiter;
         solver_par->numiter++ )
    {
        magma_dscal( dofs, MAGMA_D_MAKE(1./nom, 0.), r.dval, 1 );   // scale it
        magma_d_precond( A, r, &z, precond_par, queue );            // inner solver: A * z = r
        magma_dscal( dofs, MAGMA_D_MAKE(nom, 0.), z.dval, 1 );      // scale it
        magma_daxpy(dofs, c_one, z.dval, 1, x->dval, 1);            // x = x + z
        magma_d_spmv( c_mone, A, *x, c_zero, r, queue );            // r = - A x
        magma_daxpy(dofs, c_one, b.dval, 1, r.dval, 1);             // r = r + b
        nom = magma_dnrm2(dofs, r.dval, 1);                         // nom = || r ||

        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) nom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        if ( nom < r0 ) {
            break;
        }
    }
    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    magma_dresidual( A, b, *x, &residual, queue );
    solver_par->final_res = residual;
    solver_par->iter_res = nom;

    if ( solver_par->numiter < solver_par->maxiter ) {
        solver_par->info = MAGMA_SUCCESS;
    } else {
        // iteration limit reached: record the last sample, then classify
        if ( solver_par->verbose > 0
             && (solver_par->numiter)%solver_par->verbose == 0 ) {
            solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                    = (real_Double_t) nom;
            solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                    = (real_Double_t) tempo2-tempo1;
        }
        if ( solver_par->init_res > solver_par->final_res ) {
            solver_par->info = MAGMA_SLOW_CONVERGENCE;
        } else {
            solver_par->info = MAGMA_DIVERGENCE;
        }
    }

cleanup:
    magma_d_vfree(&r, queue );
    magma_d_vfree(&z, queue );

    magmablasSetKernelStream( orig_queue );
    return MAGMA_SUCCESS;
}   /* magma_diterref */
/**
    Restarted flexible GMRES with left/right preconditioning applied through
    magma_d_applyprecond_left / magma_d_applyprecond_right.

    Workspace:
      V  (device)  dofs*(restart+1) entries, Krylov basis vectors V(0..dim)
      W  (device)  dofs*restart entries, preconditioned vectors W(0..dim-1)
      H, s, cs, sn (pinned host)  Hessenberg matrix, right-hand side of the
                   least-squares problem and the plane-rotation coefficients

    @param[in]     A            system matrix
    @param[in]     b            right-hand side
    @param[in,out] x            initial guess on entry, approximation on exit
    @param[in,out] solver_par   solver parameters (restart, rtol, atol,
                                maxiter, verbose) and feedback
    @param[in]     precond_par  preconditioner
    @param[in]     queue        queue to execute in

    @return the value also stored in solver_par->info.

    solver_par->numiter is advanced once per inner (Arnoldi) step. It is
    therefore still 0 when the first residual is evaluated, which is what the
    `numiter == 0` branches below rely on to record init_res / resid0 and to
    leave early when the initial guess already satisfies the tolerance.

    NOTE(review): CHECK is defined outside this block; it appears to jump to
    `cleanup` on a non-zero return code - confirm against its definition.
*/
extern "C" magma_int_t
magma_dfgmres(
    magma_d_matrix A, magma_d_matrix b, magma_d_matrix *x,
    magma_d_solver_par *solver_par,
    magma_d_preconditioner *precond_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;

    magma_int_t dofs = A.num_rows;

    // prepare solver feedback
    solver_par->solver = Magma_PGMRES;
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;

    //Chronometry
    real_Double_t tempo1, tempo2;

    magma_int_t dim = solver_par->restart;
    magma_int_t m1 = dim+1; // used inside H macro
    magma_int_t i, j, k;
    double beta;

    double rel_resid, resid0=1, r0=0.0, betanom = 0.0, nom;
    double residual;

    magma_d_matrix v_t={Magma_CSR}, w_t={Magma_CSR}, t={Magma_CSR},
                   t2={Magma_CSR}, V={Magma_CSR}, W={Magma_CSR};
    // v_t / w_t are views onto columns of V / W; they never own memory
    v_t.memory_location = Magma_DEV;
    v_t.num_rows = dofs;
    v_t.num_cols = 1;
    v_t.dval = NULL;
    v_t.storage_type = Magma_DENSE;

    w_t.memory_location = Magma_DEV;
    w_t.num_rows = dofs;
    w_t.num_cols = 1;
    w_t.dval = NULL;
    w_t.storage_type = Magma_DENSE;

    double temp;

    double *H={0}, *s={0}, *cs={0}, *sn={0};

    CHECK( magma_dvinit( &t, Magma_DEV, dofs, 1, MAGMA_D_ZERO, queue ));
    CHECK( magma_dvinit( &t2, Magma_DEV, dofs, 1, MAGMA_D_ZERO, queue ));

    CHECK( magma_dmalloc_pinned( &H, (dim+1)*dim ));
    CHECK( magma_dmalloc_pinned( &s,  dim+1 ));
    CHECK( magma_dmalloc_pinned( &cs, dim ));
    CHECK( magma_dmalloc_pinned( &sn, dim ));

    CHECK( magma_dvinit( &V, Magma_DEV, dofs*(dim+1), 1, MAGMA_D_ZERO, queue ));
    CHECK( magma_dvinit( &W, Magma_DEV, dofs*dim, 1, MAGMA_D_ZERO, queue ));

    CHECK( magma_dresidual( A, b, *x, &nom, queue));

    solver_par->init_res = nom;

    if ( ( nom * solver_par->rtol) < ATOLERANCE )
        r0 = ATOLERANCE;

    solver_par->numiter = 0;
    solver_par->spmv_count = 0;

    tempo1 = magma_sync_wtime( queue );
    do
    {
        // compute initial residual and its norm
        // A.mult(n, 1, x, n, V(0), n);                        // V(0) = A*x
        CHECK( magma_d_spmv( MAGMA_D_ONE, A, *x, MAGMA_D_ZERO, t, queue ));
        solver_par->spmv_count++;
        magma_dcopy( dofs, t.dval, 1, V(0), 1, queue );
        temp = MAGMA_D_MAKE(-1.0, 0.0);
        magma_daxpy( dofs,temp, b.dval, 1, V(0), 1, queue );              // V(0) = V(0) - b
        beta = MAGMA_D_MAKE( magma_dnrm2( dofs, V(0), 1, queue ), 0.0 );  // beta = norm(V(0))
        if( magma_d_isnan_inf( beta ) ){
            info = MAGMA_DIVERGENCE;
            break;
        }

        // first pass only: no Arnoldi step has been counted yet
        if (solver_par->numiter == 0){
            solver_par->init_res = MAGMA_D_REAL( beta );
            resid0 = MAGMA_D_REAL( beta );

            r0 = resid0 * solver_par->rtol;
            if ( r0 < ATOLERANCE )
                r0 = ATOLERANCE;
            if ( resid0 < r0 ) {
                solver_par->final_res = solver_par->init_res;
                solver_par->iter_res = solver_par->init_res;
                info = MAGMA_SUCCESS;
                goto cleanup;
            }
        }
        if ( solver_par->verbose > 0 ) {
            solver_par->res_vec[0] = resid0;
            solver_par->timing[0] = 0.0;
        }
        temp = -1.0/beta;
        magma_dscal( dofs, temp, V(0), 1, queue );                 // V(0) = -V(0)/beta

        // save very first residual norm
        if (solver_par->numiter == 0)
            solver_par->init_res = MAGMA_D_REAL( beta );

        for (i = 1; i < dim+1; i++)
            s[i] = MAGMA_D_ZERO;
        s[0] = beta;

        i = -1;
        do {
            i++;
            // one Arnoldi step == one solver iteration
            solver_par->numiter++;

            // M.apply(n, 1, V(i), n, W(i), n);
            v_t.dval = V(i);
            CHECK( magma_d_applyprecond_left( MagmaNoTrans, A, v_t, &t, precond_par, queue ));
            CHECK( magma_d_applyprecond_right( MagmaNoTrans, A, t, &t2, precond_par, queue ));
            magma_dcopy( dofs, t2.dval, 1, W(i), 1, queue );

            // A.mult(n, 1, W(i), n, V(i+1), n);
            w_t.dval = W(i);
            CHECK( magma_d_spmv( MAGMA_D_ONE, A, w_t, MAGMA_D_ZERO, t, queue ));
            solver_par->spmv_count++;
            magma_dcopy( dofs, t.dval, 1, V(i+1), 1, queue );

            // orthogonalize V(i+1) against V(0..i)
            for (k = 0; k <= i; k++) {
                H(k, i) = magma_ddot( dofs, V(k), 1, V(i+1), 1, queue );
                temp = -H(k,i);
                // V(i+1) -= H(k, i) * V(k);
                magma_daxpy( dofs, temp, V(k), 1, V(i+1), 1, queue );
            }

            H(i+1, i) = MAGMA_D_MAKE( magma_dnrm2( dofs, V(i+1), 1, queue), 0. ); // H(i+1,i) = ||r||
            temp = 1.0 / H(i+1, i);
            // V(i+1) = V(i+1) / H(i+1, i)
            magma_dscal( dofs, temp, V(i+1), 1, queue );    //  (to be fused)

            // apply the previous rotations to the new column, then extend them
            for (k = 0; k < i; k++)
                ApplyPlaneRotation(&H(k,i), &H(k+1,i), cs[k], sn[k]);

            GeneratePlaneRotation(H(i,i), H(i+1,i), &cs[i], &sn[i]);
            ApplyPlaneRotation(&H(i,i), &H(i+1,i), cs[i], sn[i]);
            ApplyPlaneRotation(&s[i], &s[i+1], cs[i], sn[i]);

            betanom = MAGMA_D_ABS( s[i+1] );
            rel_resid = betanom / resid0;
            if ( solver_par->verbose > 0 ) {
                tempo2 = magma_sync_wtime( queue );
                if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                    solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                            = (real_Double_t) betanom;
                    solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                            = (real_Double_t) tempo2-tempo1;
                }
            }
            if (rel_resid <= solver_par->rtol || betanom <= solver_par->atol ){
                info = MAGMA_SUCCESS;
                break;
            }
        }
        while (i+1 < dim && solver_par->numiter+1 <= solver_par->maxiter);

        // solve upper triangular system in place
        for (j = i; j >= 0; j--) {
            s[j] /= H(j,j);
            for (k = j-1; k >= 0; k--)
                s[k] -= H(k,j) * s[j];
        }

        // update the solution
        for (j = 0; j <= i; j++) {
            // x = x + s[j] * W(j)
            magma_daxpy( dofs, s[j], W(j), 1, x->dval, 1, queue );
        }
    }
    while (rel_resid > solver_par->rtol
                && solver_par->numiter+1 <= solver_par->maxiter);

    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    CHECK( magma_dresidual( A, b, *x, &residual, queue ));
    solver_par->iter_res = betanom;
    solver_par->final_res = residual;

    if ( solver_par->numiter < solver_par->maxiter && info == MAGMA_SUCCESS ) {
        info = MAGMA_SUCCESS;
    } else if ( solver_par->init_res > solver_par->final_res ) {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_SLOW_CONVERGENCE;
        if( solver_par->iter_res < solver_par->rtol*solver_par->init_res ||
            solver_par->iter_res < solver_par->atol ) {
            info = MAGMA_SUCCESS;
        }
    }
    else {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_DIVERGENCE;
    }

cleanup:
    // free pinned memory
    magma_free_pinned(s);
    magma_free_pinned(cs);
    magma_free_pinned(sn);
    magma_free_pinned(H);

    //free DEV memory
    magma_dmfree( &V, queue);
    magma_dmfree( &W, queue);
    magma_dmfree( &t, queue);
    magma_dmfree( &t2, queue);

    solver_par->info = info;
    return info;
}   /* magma_dfgmres */
/**
    Block preconditioned Conjugate Gradient (magma_d_matrix interface).

    The right-hand side b is treated as num_vecs = b.num_rows / A.num_rows
    vectors of length A.num_rows stored back to back. The start residual is
    obtained from magma_dresidualvec using the incoming x. Only the first
    vector (index 0) is used for the stopping criterion and for the values
    reported in solver_par.

    @param[in]     A            system matrix
    @param[in]     b            right-hand side(s)
    @param[in,out] x            initial guess on entry, approximation on exit
    @param[in,out] solver_par   solver parameters (rtol, maxiter, verbose)
                                and feedback
    @param[in]     precond_par  preconditioner
    @param[in]     queue        queue to execute in

    @return the value also stored in solver_par->info.

    Each host array is allocated exactly once and released in `cleanup`,
    including `residual`.

    NOTE(review): CHECK is defined outside this block; it appears to jump to
    `cleanup` on a non-zero return code - confirm against its definition.
*/
extern "C" magma_int_t
magma_dbpcg(
    magma_d_matrix A, magma_d_matrix b, magma_d_matrix *x,
    magma_d_solver_par *solver_par,
    magma_d_preconditioner *precond_par,
    magma_queue_t queue )
{
    magma_int_t info = 0;

    magma_int_t i, num_vecs = b.num_rows/A.num_rows;

    // declared up front so that no goto jumps across a declaration
    real_Double_t tempo1, tempo2;

    // prepare solver feedback
    solver_par->solver = Magma_PCG;
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;
    solver_par->info = MAGMA_SUCCESS;

    // local variables
    double c_zero = MAGMA_D_ZERO, c_one = MAGMA_D_ONE;

    magma_int_t dofs = A.num_rows;

    // GPU workspace
    magma_d_matrix r={Magma_CSR}, rt={Magma_CSR}, p={Magma_CSR},
                   q={Magma_CSR}, h={Magma_CSR};

    // solver variables: host arrays, one entry per right-hand side
    double *alpha = NULL, *beta = NULL;
    double *nom = NULL, *nom0 = NULL, *r0 = NULL;
    double *gammaold = NULL, *gammanew = NULL;
    double *den = NULL, *res = NULL, *residual = NULL;

    CHECK( magma_dmalloc_cpu(&alpha, num_vecs));
    CHECK( magma_dmalloc_cpu(&beta, num_vecs));
    CHECK( magma_dmalloc_cpu(&residual, num_vecs));
    CHECK( magma_dmalloc_cpu(&nom, num_vecs));
    CHECK( magma_dmalloc_cpu(&nom0, num_vecs));
    CHECK( magma_dmalloc_cpu(&r0, num_vecs));
    CHECK( magma_dmalloc_cpu(&gammaold, num_vecs));
    CHECK( magma_dmalloc_cpu(&gammanew, num_vecs));
    CHECK( magma_dmalloc_cpu(&den, num_vecs));
    CHECK( magma_dmalloc_cpu(&res, num_vecs));

    CHECK( magma_dvinit( &r, Magma_DEV, dofs*num_vecs, 1, c_zero, queue ));
    CHECK( magma_dvinit( &rt, Magma_DEV, dofs*num_vecs, 1, c_zero, queue ));
    CHECK( magma_dvinit( &p, Magma_DEV, dofs*num_vecs, 1, c_zero, queue ));
    CHECK( magma_dvinit( &q, Magma_DEV, dofs*num_vecs, 1, c_zero, queue ));
    CHECK( magma_dvinit( &h, Magma_DEV, dofs*num_vecs, 1, c_zero, queue ));

    // solver setup
    CHECK( magma_dresidualvec( A, b, *x, &r, nom0, queue));

    // preconditioner
    CHECK( magma_d_applyprecond_left( MagmaNoTrans, A, r, &rt, precond_par, queue ));
    CHECK( magma_d_applyprecond_right( MagmaNoTrans, A, rt, &h, precond_par, queue ));

    magma_dcopy( dofs*num_vecs, h.dval, 1, p.dval, 1, queue );            // p = h

    for( i=0; i<num_vecs; i++ ) {
        nom[i]  = MAGMA_D_REAL( magma_ddot( dofs, r(i), 1, h(i), 1, queue ) );
        nom0[i] = magma_dnrm2( dofs, r(i), 1, queue );
    }

    CHECK( magma_d_spmv( c_one, A, p, c_zero, q, queue ));                // q = A p

    for( i=0; i<num_vecs; i++ )
        den[i] = MAGMA_D_REAL( magma_ddot( dofs, p(i), 1, q(i), 1, queue ) );  // den = p dot q

    solver_par->init_res = nom0[0];

    if ( (r0[0] = nom[0] * solver_par->rtol) < ATOLERANCE )
        r0[0] = ATOLERANCE;

    // check positive definite
    if (den[0] <= 0.0) {
        printf("Operator A is not postive definite. (Ar,r) = %f\n", den[0]);
        info = MAGMA_NONSPD;
        goto cleanup;
    }
    // initial guess already satisfies the tolerance
    if ( nom[0] < r0[0] ) {
        solver_par->final_res = solver_par->init_res;
        solver_par->iter_res = solver_par->init_res;
        goto cleanup;
    }

    //Chronometry
    tempo1 = magma_sync_wtime( queue );
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = (real_Double_t)nom0[0];
        solver_par->timing[0] = 0.0;
    }

    solver_par->numiter = 0;
    solver_par->spmv_count = 0;
    // start iteration
    do
    {
        solver_par->numiter++;
        // preconditioner
        CHECK( magma_d_applyprecond_left( MagmaNoTrans, A, r, &rt, precond_par, queue ));
        CHECK( magma_d_applyprecond_right( MagmaNoTrans, A, rt, &h, precond_par, queue ));

        for( i=0; i<num_vecs; i++ )
            gammanew[i] = MAGMA_D_REAL( magma_ddot( dofs, r(i), 1, h(i), 1, queue ) );  // gn = < r,h>

        if ( solver_par->numiter==1 ) {
            magma_dcopy( dofs*num_vecs, h.dval, 1, p.dval, 1, queue );             // p = h
        } else {
            for( i=0; i<num_vecs; i++ ) {
                beta[i] = MAGMA_D_MAKE(gammanew[i]/gammaold[i], 0.);               // beta = gn/go
                magma_dscal( dofs, beta[i], p(i), 1, queue );                      // p = beta*p
                magma_daxpy( dofs, c_one, h(i), 1, p(i), 1, queue );               // p = p + h
            }
        }

        CHECK( magma_d_spmv( c_one, A, p, c_zero, q, queue ));                     // q = A p
        solver_par->spmv_count++;

        for( i=0; i<num_vecs; i++ ) {
            den[i] = MAGMA_D_REAL(magma_ddot( dofs, p(i), 1, q(i), 1, queue) );    // den = p dot q
            alpha[i] = MAGMA_D_MAKE(gammanew[i]/den[i], 0.);
            magma_daxpy( dofs,  alpha[i], p(i), 1, x->dval+dofs*i, 1, queue );     // x = x + alpha p
            magma_daxpy( dofs, -alpha[i], q(i), 1, r(i), 1, queue );               // r = r - alpha q
            gammaold[i] = gammanew[i];
            res[i] = magma_dnrm2( dofs, r(i), 1, queue );
        }

        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res[0];
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        // stopping criterion looks at the first right-hand side only
        if ( res[0]/nom0[0] < solver_par->rtol ) {
            break;
        }
    }
    while ( solver_par->numiter+1 <= solver_par->maxiter );

    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    CHECK( magma_dresidual( A, b, *x, residual, queue ));
    solver_par->iter_res = res[0];
    solver_par->final_res = residual[0];

    if ( solver_par->numiter < solver_par->maxiter ) {
        solver_par->info = MAGMA_SUCCESS;
    } else if ( solver_par->init_res > solver_par->final_res ) {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res[0];
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_SLOW_CONVERGENCE;
        if( solver_par->iter_res < solver_par->rtol*solver_par->init_res ){
            info = MAGMA_SUCCESS;
        }
    }
    else {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res[0];
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_DIVERGENCE;
    }
    for( i=0; i<num_vecs; i++ ) {
        printf("%.4e  ",res[i]);
    }
    printf("\n");
    for( i=0; i<num_vecs; i++ ) {
        printf("%.4e  ",residual[i]);
    }
    printf("\n");

cleanup:
    magma_dmfree(&r, queue );
    magma_dmfree(&rt, queue );
    magma_dmfree(&p, queue );
    magma_dmfree(&q, queue );
    magma_dmfree(&h, queue );

    magma_free_cpu(alpha);
    magma_free_cpu(beta);
    magma_free_cpu(nom);
    magma_free_cpu(nom0);
    magma_free_cpu(r0);
    magma_free_cpu(gammaold);
    magma_free_cpu(gammanew);
    magma_free_cpu(den);
    magma_free_cpu(res);
    magma_free_cpu(residual);

    solver_par->info = info;
    return info;
}   /* magma_dbpcg */