/**
    Purpose
    -------
    Computes the residual r = b - A x of a linear system and the Euclidean
    norm of that residual.

    If b has exactly as many rows as A, one residual norm is written to *res.
    Otherwise, if the total number of entries of b is a multiple of the number
    of rows of A, b is treated as a sequence of vectors of length A.num_rows
    and one norm per vector is written to res[0], res[1], ...

    Arguments
    ---------
    @param[in]  A       system matrix
    @param[in]  b       right-hand side(s); accessed through b.dval
    @param[in]  x       current solution approximation
    @param[out] r       receives the residual; accessed through r->dval
    @param[out] res     receives the residual norm(s)
    @param[in]  queue   queue the operations are issued on

    @return 0 on success, MAGMA_ERR_NOT_SUPPORTED when the sizes of A and b
            are incompatible, or the error reported by magma_c_spmv.
*/
extern "C" magma_int_t
magma_cresidualvec(
    magma_c_matrix A, magma_c_matrix b, magma_c_matrix x,
    magma_c_matrix *r, float *res,
    magma_queue_t queue )
{
    magma_int_t info = 0;

    // scalar constants handed to the BLAS-style calls
    magmaFloatComplex zero = MAGMA_C_ZERO;
    magmaFloatComplex one  = MAGMA_C_ONE;
    magmaFloatComplex mone = MAGMA_C_NEG_ONE;

    // length of a single vector
    // NOTE(review): the b(i)/r(i) accessors are defined outside this function
    // and presumably use this name -- do not rename without checking them.
    magma_int_t dofs = A.num_rows;

    const int single_rhs = ( A.num_rows == b.num_rows );

    if ( !single_rhs && (b.num_rows*b.num_cols) % A.num_rows != 0 ) {
        printf("%%error: dimensions do not match.\n");
        info = MAGMA_ERR_NOT_SUPPORTED;
    }
    else {
        // r = -A x   (shared by both layouts)
        CHECK( magma_c_spmv( mone, A, x, zero, *r, queue ));

        if ( single_rhs ) {
            // r = r + b = b - A x
            magma_caxpy( dofs, one, b.dval, 1, r->dval, 1, queue );
            // res = ||r||
            *res = magma_scnrm2( dofs, r->dval, 1, queue );
        }
        else {
            magma_int_t num_vecs = (b.num_rows*b.num_cols) / A.num_rows;

            for ( magma_int_t vec = 0; vec < num_vecs; ++vec ) {
                // r_vec = r_vec + b_vec
                magma_caxpy( dofs, one, b(vec), 1, r(vec), 1, queue );
                // res[vec] = ||r_vec||
                res[vec] = magma_scnrm2( dofs, r(vec), 1, queue );
            }
        }
    }

cleanup:
    return info;
}
/**
    Purpose
    -------
    Restarted GMRES for A x = b using modified Gram-Schmidt
    orthogonalisation. The incoming content of x is discarded: the
    iteration starts from x = 0.

    Arguments
    ---------
    @param[in]      A           system matrix
    @param[in]      b           right-hand side
    @param[in,out]  x           on exit the solution approximation
    @param[in,out]  solver_par  reads restart, epsilon, maxiter and verbose;
                                receives solver, numiter, init_res, iter_res,
                                final_res, runtime, info and, when verbose > 0,
                                res_vec / timing
    @param[in]      queue       queue passed on to the sparse routines

    @return MAGMA_ERR_HOST_ALLOC or MAGMA_ERR_DEVICE_ALLOC when a workspace
            allocation fails, MAGMA_SUCCESS otherwise. The convergence
            status (success / slow convergence / divergence) is reported in
            solver_par->info, not in the return value.

    All workspaces, streams and the event are released on every exit path.
*/
extern "C" magma_int_t
magma_cgmres(
    magma_c_sparse_matrix A, magma_c_vector b, magma_c_vector *x,
    magma_c_solver_par *solver_par,
    magma_queue_t queue )
{
    // value handed back to the caller (allocation status only)
    magma_int_t info = MAGMA_SUCCESS;

    // set queue for old dense routines
    magma_queue_t orig_queue;
    magmablasGetKernelStream( &orig_queue );

    magma_int_t stat_cpu = 0, stat_dev = 0;

    // prepare solver feedback
    solver_par->solver = Magma_GMRES;
    solver_par->numiter = 0;
    solver_par->info = MAGMA_SUCCESS;

    // local variables
    magmaFloatComplex c_zero = MAGMA_C_ZERO, c_one = MAGMA_C_ONE,
                      c_mone = MAGMA_C_NEG_ONE;
    magma_int_t dofs = A.num_rows;
    magma_int_t i, j, k, m = 0;
    magma_int_t restart = min( dofs-1, solver_par->restart );
    magma_int_t ldh = restart+1;
    float nom, rNorm, RNorm, nom0, betanom, r0 = 0.;
    float residual;
    real_Double_t tempo1, tempo2;

    // Every resource handle is declared (and, where possible, nulled) before
    // the first goto so that the cleanup labels below are always safe to reach.
    magmaFloatComplex *H = NULL, *HH = NULL, *y = NULL, *h1 = NULL;
    magmaFloatComplex *dy = NULL, *dH = NULL;
    magma_c_vector r, q, q_t;
    magma_queue_t stream[2];
    magma_event_t event[1];

    // CPU workspace
    stat_cpu += magma_cmalloc_pinned( &H, (ldh+1)*ldh );
    stat_cpu += magma_cmalloc_pinned( &y, ldh );
    stat_cpu += magma_cmalloc_pinned( &HH, ldh*ldh );
    stat_cpu += magma_cmalloc_pinned( &h1, ldh );
    if ( stat_cpu != 0 ) {
        info = MAGMA_ERR_HOST_ALLOC;
        goto cleanup_host;
    }

    // GPU workspace
    magma_c_vinit( &r, Magma_DEV, dofs, c_zero, queue );
    magma_c_vinit( &q, Magma_DEV, dofs*(ldh+1), c_zero, queue );
    // q_t is a non-owning view onto one basis vector inside q
    q_t.memory_location = Magma_DEV;
    q_t.dval = NULL;
    q_t.num_rows = q_t.nnz = dofs;
    q_t.num_cols = 1;

    stat_dev += magma_cmalloc( &dy, ldh );
    stat_dev += magma_cmalloc( &dH, (ldh+1)*ldh );
    if ( stat_dev != 0 ) {
        info = MAGMA_ERR_DEVICE_ALLOC;
        goto cleanup_device;
    }

    // GPU streams and event
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );
    magma_event_create( &event[0] );

    magma_cscal( dofs, c_zero, x->dval, 1 );              //  x = 0
    magma_ccopy( dofs, b.dval, 1, r.dval, 1 );            //  r = b
    nom0 = betanom = magma_scnrm2( dofs, r.dval, 1 );     //  nom0 = || r ||
    nom = nom0 * nom0;
    solver_par->init_res = nom0;
    H(1,0) = MAGMA_C_MAKE( nom0, 0. );
    magma_csetvector(1, &H(1,0), 1, &dH(1,0), 1);

    if ( (r0 = nom0 * solver_par->epsilon ) < ATOLERANCE ) {
        r0 = solver_par->epsilon;
    }
    if ( nom < r0 ) {
        // already converged: nothing to iterate, but everything allocated
        // above still has to be released
        info = MAGMA_SUCCESS;
        goto cleanup_streams;
    }

    // Chronometry
    tempo1 = magma_sync_wtime( queue );
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = nom0;
        solver_par->timing[0] = 0.0;
    }

    // start iteration (one pass of the outer loop is one restart cycle)
    for ( solver_par->numiter = 1;
          solver_par->numiter < solver_par->maxiter;
          solver_par->numiter++ ) {
        for ( k = 1; k <= restart; k++ ) {
            // q[k-1] = r / H(k,k-1)
            magma_ccopy(dofs, r.dval, 1, q(k-1), 1);
            magma_cscal(dofs, 1./H(k,k-1), q(k-1), 1);    //  (to be fused)

            q_t.dval = q(k-1);
            magma_c_spmv( c_one, A, q_t, c_zero, r, queue ); //  r = A q[k-1]

            // modified Gram-Schmidt (the fused and classical Gram-Schmidt
            // variants that used to live here as commented-out code were
            // never enabled and have been dropped)
            for ( i = 1; i <= k; i++ ) {
                //  H(i,k) = q[i-1] . r
                H(i,k) = magma_cdotc(dofs, q(i-1), 1, r.dval, 1);
                //  r = r - H(i,k) q[i-1]
                magma_caxpy(dofs, -H(i,k), q(i-1), 1, r.dval, 1);
            }
            //  H(k+1,k) = ||r||
            H(k+1,k) = MAGMA_C_MAKE( magma_scnrm2(dofs, r.dval, 1), 0. );

            /*     Minimization of  || b-Ax ||  in H_k       */
            // HH(k,i) is the inner product of the leading i+1 entries of
            // columns k and i of H
            for ( i = 1; i <= k; i++ ) {
                HH(k,i) = magma_cblas_cdotc( i+1, &H(1,k), 1, &H(1,i), 1 );
            }
            h1[k] = H(1,k)*H(1,0);
            if ( k != 1 ) {
                for ( i = 1; i < k; i++ ) {
                    HH(k,i) = HH(k,i)/HH(i,i);
                    for ( m = i+1; m <= k; m++ ) {
                        HH(k,m) -= HH(k,i) * HH(m,i) * HH(i,i);
                    }
                    h1[k] -= h1[i] * HH(k,i);
                }
            }
            y[k] = h1[k]/HH(k,k);
            if ( k != 1 ) {
                // back substitution for y[k-1], ..., y[1]
                for ( i = k-1; i >= 1; i-- ) {
                    y[i] = h1[i]/HH(i,i);
                    for ( j = i+1; j <= k; j++ ) {
                        y[i] -= y[j] * HH(j,i);
                    }
                }
            }
            m = k;
            rNorm = fabs(MAGMA_C_REAL(H(k+1,k)));
        }
        /*     Minimization done       */

        // compute solution approximation: x = x + q[0:m-1] y(1:m)
        magma_csetmatrix(m, 1, y+1, m, dy, m );
        magma_cgemv(MagmaNoTrans, dofs, m, c_one, q(0), dofs, dy, 1,
                    c_one, x->dval, 1);

        // compute residual
        magma_c_spmv( c_mone, A, *x, c_zero, r, queue );  //  r = - A * x
        magma_caxpy(dofs, c_one, b.dval, 1, r.dval, 1);   //  r = r + b
        //  RNorm = H(1,0) = || r ||
        H(1,0) = MAGMA_C_MAKE( magma_scnrm2(dofs, r.dval, 1), 0. );
        RNorm = MAGMA_C_REAL( H(1,0) );
        betanom = fabs(RNorm);

        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        if ( betanom < r0 ) {
            break;
        }
    }

    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    magma_cresidual( A, b, *x, &residual, queue );
    solver_par->iter_res = betanom;
    solver_par->final_res = residual;

    if ( solver_par->numiter < solver_par->maxiter ) {
        solver_par->info = MAGMA_SUCCESS;
    }
    else if ( solver_par->init_res > solver_par->final_res ) {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        solver_par->info = MAGMA_SLOW_CONVERGENCE;
    }
    else {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        solver_par->info = MAGMA_DIVERGENCE;
    }

    // Staged cleanup: each label releases what was acquired since the
    // previous stage and then falls through to the earlier stages.
cleanup_streams:
    // free GPU streams and events
    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[1] );
    magma_event_destroy( event[0] );

cleanup_device:
    // free GPU memory (dy / dH stay NULL if their allocation failed)
    if ( dy != NULL ) magma_free( dy );
    if ( dH != NULL ) magma_free( dH );
    magma_c_vfree(&r, queue );
    magma_c_vfree(&q, queue );

cleanup_host:
    // free pinned memory
    magma_free_pinned( H );
    magma_free_pinned( y );
    magma_free_pinned( HH );
    magma_free_pinned( h1 );

    magmablasSetKernelStream( orig_queue );
    return info;
}   /* magma_cgmres */
/**
    Purpose
    -------
    IDR(s) solver for A x = b with residual smoothing (the smoothing switch
    is a compile-time constant inside the function). The incoming x is used
    as the initial guess.

    Arguments
    ---------
    @param[in]      A           system matrix; must be square
    @param[in]      b           right-hand side
    @param[in,out]  x           initial guess on entry, solution approximation
                                on exit
    @param[in,out]  solver_par  reads restart (used as the shadow space size
                                s, see the note in the body), maxiter, atol,
                                rtol and verbose; restart and maxiter are
                                overwritten; receives solver, numiter,
                                spmv_count, init_res, iter_res, final_res,
                                runtime, info and, when verbose > 0,
                                res_vec / timing
    @param[in]      queue       queue the operations are issued on

    @return MAGMA_SUCCESS, MAGMA_SLOW_CONVERGENCE, MAGMA_DIVERGENCE,
            MAGMA_NOTCONVERGED, MAGMA_ERR_NOT_SUPPORTED (A not square) or the
            error code of a failing helper; the same value is stored in
            solver_par->info.
*/
extern "C" magma_int_t
magma_cidr(
    magma_c_matrix A, magma_c_matrix b, magma_c_matrix *x,
    magma_c_solver_par *solver_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;

    // prepare solver feedback
    solver_par->solver = Magma_IDR;
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;
    solver_par->init_res = 0.0;
    solver_par->final_res = 0.0;
    solver_par->iter_res = 0.0;
    solver_par->runtime = 0.0;

    // constants
    const magmaFloatComplex c_zero = MAGMA_C_ZERO;
    const magmaFloatComplex c_one = MAGMA_C_ONE;
    const magmaFloatComplex c_n_one = MAGMA_C_NEG_ONE;

    // internal user parameters
    const magma_int_t smoothing = 1;   // 0 = disable, 1 = enable
    const float angle = 0.7;           // [0-1]

    // local variables
    magma_int_t iseed[4] = {0, 0, 0, 1};
    magma_int_t dof;
    magma_int_t s;
    magma_int_t distr;
    magma_int_t k, i, sk;
    magma_int_t innerflag;
    float residual;
    float nrm;
    float nrmb;
    float nrmr;
    float nrmt;
    float rho;
    magmaFloatComplex om;
    magmaFloatComplex tt;
    magmaFloatComplex tr;
    magmaFloatComplex gamma;
    magmaFloatComplex alpha;
    magmaFloatComplex mkk;
    magmaFloatComplex fk;

    // matrices and vectors
    magma_c_matrix dxs = {Magma_CSR};
    magma_c_matrix dr = {Magma_CSR}, drs = {Magma_CSR};
    magma_c_matrix dP = {Magma_CSR}, dP1 = {Magma_CSR};
    magma_c_matrix dG = {Magma_CSR};
    magma_c_matrix dU = {Magma_CSR};
    magma_c_matrix dM = {Magma_CSR};
    magma_c_matrix df = {Magma_CSR};
    magma_c_matrix dt = {Magma_CSR};
    magma_c_matrix dc = {Magma_CSR};
    magma_c_matrix dv = {Magma_CSR};
    magma_c_matrix dbeta = {Magma_CSR}, hbeta = {Magma_CSR};

    // chronometry
    real_Double_t tempo1, tempo2;

    // initial s space
    // TODO: add option for 's' (shadow space number)
    // Hack: uses the '--restart' option as the shadow space number.
    //       This is not a good idea because the default value of the restart
    //       option is used to detect whether the user provided a custom
    //       restart. If the default restart value is changed, the code will
    //       think it was set by the user (unless the default value is also
    //       updated in the 'if' statement below).
    s = 1;
    if ( solver_par->restart != 50 ) {
        if ( solver_par->restart > A.num_cols ) {
            s = A.num_cols;
        } else {
            s = solver_par->restart;
        }
    }
    solver_par->restart = s;

    // set max iterations
    solver_par->maxiter = min( 2 * A.num_cols, solver_par->maxiter );

    // check if matrix A is square
    if ( A.num_rows != A.num_cols ) {
        //printf("Matrix A is not square.\n");
        info = MAGMA_ERR_NOT_SUPPORTED;
        goto cleanup;
    }

    // |b|
    nrmb = magma_scnrm2( b.num_rows, b.dval, 1, queue );
    if ( nrmb == 0.0 ) {
        // b = 0  =>  x = 0 is the solution
        magma_cscal( x->num_rows, MAGMA_C_ZERO, x->dval, 1, queue );
        info = MAGMA_SUCCESS;
        goto cleanup;
    }

    // r = b - A x
    CHECK( magma_cvinit( &dr, Magma_DEV, b.num_rows, 1, c_zero, queue ));
    CHECK( magma_cresidualvec( A, b, *x, &dr, &nrmr, queue ));

    // |r|
    solver_par->init_res = nrmr;
    solver_par->final_res = solver_par->init_res;
    solver_par->iter_res = solver_par->init_res;
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = (real_Double_t)nrmr;
    }

    // check if the initial guess is good enough
    if ( nrmr <= solver_par->atol ||
         nrmr/nrmb <= solver_par->rtol ) {
        info = MAGMA_SUCCESS;
        goto cleanup;
    }

    // P = randn(n, s)
    // P = ortho(P)
    //---------------------------------------
    // P = 0.0
    CHECK( magma_cvinit( &dP, Magma_CPU, A.num_cols, s, c_zero, queue ));

    // P = randn(n, s)
    distr = 3;        // 1 = unif (0,1), 2 = unif (-1,1), 3 = normal (0,1)
    dof = dP.num_rows * dP.num_cols;
    lapackf77_clarnv( &distr, iseed, &dof, dP.val );

    // transfer P to device
    CHECK( magma_cmtransfer( dP, &dP1, Magma_CPU, Magma_DEV, queue ));
    magma_cmfree( &dP, queue );

    // P = ortho(P1)
    if ( dP1.num_cols > 1 ) {
        // P = magma_cqr(P1), QR factorization
        CHECK( magma_cqr( dP1.num_rows, dP1.num_cols, dP1, dP1.ld, &dP, NULL, queue ));
    } else {
        // P = P1 / |P1|
        nrm = magma_scnrm2( dof, dP1.dval, 1, queue );
        nrm = 1.0 / nrm;
        magma_csscal( dof, nrm, dP1.dval, 1, queue );
        CHECK( magma_cmtransfer( dP1, &dP, Magma_DEV, Magma_DEV, queue ));
    }
    magma_cmfree( &dP1, queue );
    //---------------------------------------

    // allocate memory for the scalar products
    CHECK( magma_cvinit( &hbeta, Magma_CPU, s, 1, c_zero, queue ));
    CHECK( magma_cvinit( &dbeta, Magma_DEV, s, 1, c_zero, queue ));

    // smoothing enabled
    if ( smoothing > 0 ) {
        // set smoothing solution vector
        CHECK( magma_cmtransfer( *x, &dxs, Magma_DEV, Magma_DEV, queue ));

        // set smoothing residual vector
        CHECK( magma_cmtransfer( dr, &drs, Magma_DEV, Magma_DEV, queue ));
    }

    // G(n,s) = 0
    CHECK( magma_cvinit( &dG, Magma_DEV, A.num_cols, s, c_zero, queue ));

    // U(n,s) = 0
    CHECK( magma_cvinit( &dU, Magma_DEV, A.num_cols, s, c_zero, queue ));

    // M(s,s) = I
    CHECK( magma_cvinit( &dM, Magma_DEV, s, s, c_zero, queue ));
    magmablas_claset( MagmaFull, s, s, c_zero, c_one, dM.dval, s, queue );

    // f = 0
    CHECK( magma_cvinit( &df, Magma_DEV, dP.num_cols, 1, c_zero, queue ));

    // t = 0
    CHECK( magma_cvinit( &dt, Magma_DEV, dr.num_rows, 1, c_zero, queue ));

    // c = 0
    CHECK( magma_cvinit( &dc, Magma_DEV, dM.num_cols, 1, c_zero, queue ));

    // v = 0
    CHECK( magma_cvinit( &dv, Magma_DEV, dr.num_rows, 1, c_zero, queue ));

    //--------------START TIME---------------
    // chronometry
    tempo1 = magma_sync_wtime( queue );
    if ( solver_par->verbose > 0 ) {
        solver_par->timing[0] = 0.0;
    }

    om = MAGMA_C_ONE;
    // 0 = shadow loop ran through, 1 = breakdown, 2 = converged inside it
    innerflag = 0;

    // start iteration
    do
    {
        solver_par->numiter++;

        // new RHS for small systems
        // f = P' r
        magmablas_cgemv( MagmaConjTrans, dP.num_rows, dP.num_cols, c_one, dP.dval, dP.ld, dr.dval, 1, c_zero, df.dval, 1, queue );

        // shadow space loop
        for ( k = 0; k < s; ++k ) {
            sk = s - k;

            // f(k:s) = M(k:s,k:s) c(k:s)
            magma_ccopyvector( sk, &df.dval[k], 1, &dc.dval[k], 1, queue );
            magma_ctrsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, sk, &dM.dval[k*dM.ld+k], dM.ld, &dc.dval[k], 1, queue );

            // v = r - G(:,k:s) c(k:s)
            magma_ccopyvector( dr.num_rows, dr.dval, 1, dv.dval, 1, queue );
            magmablas_cgemv( MagmaNoTrans, dG.num_rows, sk, c_n_one, &dG.dval[k*dG.ld], dG.ld, &dc.dval[k], 1, c_one, dv.dval, 1, queue );

            // U(:,k) = om * v + U(:,k:s) c(k:s)
            magmablas_cgemv( MagmaNoTrans, dU.num_rows, sk, c_one, &dU.dval[k*dU.ld], dU.ld, &dc.dval[k], 1, om, dv.dval, 1, queue );
            magma_ccopyvector( dU.num_rows, dv.dval, 1, &dU.dval[k*dU.ld], 1, queue );

            // G(:,k) = A U(:,k)
            // The product is written to dt rather than back into dv: using
            // the same vector as SpMV input and output would let the product
            // read entries it has already overwritten. dt holds no live data
            // here (every later use overwrites it first) and has the same
            // length as dv.
            CHECK( magma_c_spmv( c_one, A, dv, c_zero, dt, queue ));
            solver_par->spmv_count++;
            magma_ccopyvector( dG.num_rows, dt.dval, 1, &dG.dval[k*dG.ld], 1, queue );

            // bi-orthogonalize the new basis vectors
            for ( i = 0; i < k; ++i ) {
                // alpha = P(:,i)' G(:,k)
                alpha = magma_cdotc( dP.num_rows, &dP.dval[i*dP.ld], 1, &dG.dval[k*dG.ld], 1, queue );

                // alpha = alpha / M(i,i)
                magma_cgetvector( 1, &dM.dval[i*dM.ld+i], 1, &mkk, 1, queue );
                alpha = alpha / mkk;

                // G(:,k) = G(:,k) - alpha * G(:,i)
                magma_caxpy( dG.num_rows, -alpha, &dG.dval[i*dG.ld], 1, &dG.dval[k*dG.ld], 1, queue );

                // U(:,k) = U(:,k) - alpha * U(:,i)
                magma_caxpy( dU.num_rows, -alpha, &dU.dval[i*dU.ld], 1, &dU.dval[k*dU.ld], 1, queue );
            }

            // new column of M = P'G, first k-1 entries are zero
            // M(k:s,k) = P(:,k:s)' G(:,k)
            magmablas_cgemv( MagmaConjTrans, dP.num_rows, sk, c_one, &dP.dval[k*dP.ld], dP.ld, &dG.dval[k*dG.ld], 1, c_zero, &dM.dval[k*dM.ld+k], 1, queue );

            // check M(k,k) == 0
            magma_cgetvector( 1, &dM.dval[k*dM.ld+k], 1, &mkk, 1, queue );
            if ( MAGMA_C_EQUAL(mkk, MAGMA_C_ZERO) ) {
                innerflag = 1;
                info = MAGMA_DIVERGENCE;
                break;
            }

            // beta = f(k) / M(k,k)
            magma_cgetvector( 1, &df.dval[k], 1, &fk, 1, queue );
            hbeta.val[k] = fk / mkk;

            // check for nan
            if ( magma_c_isnan( hbeta.val[k] ) || magma_c_isinf( hbeta.val[k] )) {
                innerflag = 1;
                info = MAGMA_DIVERGENCE;
                break;
            }

            // r = r - beta * G(:,k)
            magma_caxpy( dr.num_rows, -hbeta.val[k], &dG.dval[k*dG.ld], 1, dr.dval, 1, queue );

            // smoothing disabled
            if ( smoothing <= 0 ) {
                // |r|
                nrmr = magma_scnrm2( dr.num_rows, dr.dval, 1, queue );

            // smoothing enabled
            } else {
                // x = x + beta * U(:,k)
                magma_caxpy( x->num_rows, hbeta.val[k], &dU.dval[k*dU.ld], 1, x->dval, 1, queue );

                // smoothing operation
                //---------------------------------------
                // t = rs - r
                magma_ccopyvector( drs.num_rows, drs.dval, 1, dt.dval, 1, queue );
                magma_caxpy( dt.num_rows, c_n_one, dr.dval, 1, dt.dval, 1, queue );

                // t't
                // t'rs
                tt = magma_cdotc( dt.num_rows, dt.dval, 1, dt.dval, 1, queue );
                tr = magma_cdotc( dt.num_rows, dt.dval, 1, drs.dval, 1, queue );

                // gamma = (t' * rs) / (t' * t)
                gamma = tr / tt;

                // rs = rs - gamma * (rs - r)
                magma_caxpy( drs.num_rows, -gamma, dt.dval, 1, drs.dval, 1, queue );

                // xs = xs - gamma * (xs - x)
                magma_ccopyvector( dxs.num_rows, dxs.dval, 1, dt.dval, 1, queue );
                magma_caxpy( dt.num_rows, c_n_one, x->dval, 1, dt.dval, 1, queue );
                magma_caxpy( dxs.num_rows, -gamma, dt.dval, 1, dxs.dval, 1, queue );

                // |rs|
                nrmr = magma_scnrm2( drs.num_rows, drs.dval, 1, queue );
                //---------------------------------------
            }

            // store current timing and residual
            if ( solver_par->verbose > 0 ) {
                tempo2 = magma_sync_wtime( queue );
                if ( (solver_par->numiter) % solver_par->verbose == 0 ) {
                    solver_par->res_vec[(solver_par->numiter) / solver_par->verbose]
                            = (real_Double_t)nrmr;
                    solver_par->timing[(solver_par->numiter) / solver_par->verbose]
                            = (real_Double_t)tempo2 - tempo1;
                }
            }

            // check convergence
            if ( nrmr <= solver_par->atol ||
                 nrmr/nrmb <= solver_par->rtol ) {
                s = k + 1; // for the x-update outside the loop
                innerflag = 2;
                info = MAGMA_SUCCESS;
                break;
            }

            // non-last s iteration
            if ( (k + 1) < s ) {
                // f(k+1:s) = f(k+1:s) - beta * M(k+1:s,k)
                magma_caxpy( sk-1, -hbeta.val[k], &dM.dval[k*dM.ld+(k+1)], 1, &df.dval[k+1], 1, queue );
            }
        }

        // smoothing disabled
        if ( smoothing <= 0 && innerflag != 1 ) {
            // update solution approximation x
            // x = x + U(:,1:s) * beta(1:s)
            magma_csetvector( s, hbeta.val, 1, dbeta.dval, 1, queue );
            magmablas_cgemv( MagmaNoTrans, dU.num_rows, s, c_one, dU.dval, dU.ld, dbeta.dval, 1, c_one, x->dval, 1, queue );
        }

        // check convergence or iteration limit or invalid result of inner loop
        if ( innerflag > 0 ) {
            break;
        }

        // t = A r
        CHECK( magma_c_spmv( c_one, A, dr, c_zero, dt, queue ));
        solver_par->spmv_count++;

        // computation of a new omega
        //---------------------------------------
        // |t|
        nrmt = magma_scnrm2( dt.num_rows, dt.dval, 1, queue );

        // t'r
        tr = magma_cdotc( dt.num_rows, dt.dval, 1, dr.dval, 1, queue );

        // rho = abs(t' * r) / (|t| * |r|)
        rho = MAGMA_D_ABS( MAGMA_C_REAL(tr) / (nrmt * nrmr) );

        // om = (t' * r) / (|t| * |t|)
        om = tr / (nrmt * nrmt);
        if ( rho < angle ) {
            om = (om * angle) / rho;
        }
        //---------------------------------------
        if ( MAGMA_C_EQUAL(om, MAGMA_C_ZERO) ) {
            info = MAGMA_DIVERGENCE;
            break;
        }

        // update approximation vector
        // x = x + om * r
        magma_caxpy( x->num_rows, om, dr.dval, 1, x->dval, 1, queue );

        // update residual vector
        // r = r - om * t
        magma_caxpy( dr.num_rows, -om, dt.dval, 1, dr.dval, 1, queue );

        // smoothing disabled
        if ( smoothing <= 0 ) {
            // residual norm
            nrmr = magma_scnrm2( b.num_rows, dr.dval, 1, queue );

        // smoothing enabled
        } else {
            // smoothing operation
            //---------------------------------------
            // t = rs - r
            magma_ccopyvector( drs.num_rows, drs.dval, 1, dt.dval, 1, queue );
            magma_caxpy( dt.num_rows, c_n_one, dr.dval, 1, dt.dval, 1, queue );

            // t't
            // t'rs
            tt = magma_cdotc( dt.num_rows, dt.dval, 1, dt.dval, 1, queue );
            tr = magma_cdotc( dt.num_rows, dt.dval, 1, drs.dval, 1, queue );

            // gamma = (t' * rs) / (|t| * |t|)
            gamma = tr / tt;

            // rs = rs - gamma * (rs - r)
            magma_caxpy( drs.num_rows, -gamma, dt.dval, 1, drs.dval, 1, queue );

            // xs = xs - gamma * (xs - x)
            magma_ccopyvector( dxs.num_rows, dxs.dval, 1, dt.dval, 1, queue );
            magma_caxpy( dt.num_rows, c_n_one, x->dval, 1, dt.dval, 1, queue );
            magma_caxpy( dxs.num_rows, -gamma, dt.dval, 1, dxs.dval, 1, queue );

            // |rs|
            nrmr = magma_scnrm2( b.num_rows, drs.dval, 1, queue );
            //---------------------------------------
        }

        // store current timing and residual
        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter) % solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter) / solver_par->verbose]
                        = (real_Double_t)nrmr;
                solver_par->timing[(solver_par->numiter) / solver_par->verbose]
                        = (real_Double_t)tempo2 - tempo1;
            }
        }

        // check convergence
        if ( nrmr <= solver_par->atol ||
             nrmr/nrmb <= solver_par->rtol ) {
            info = MAGMA_SUCCESS;
            break;
        }
    }
    while ( solver_par->numiter + 1 <= solver_par->maxiter );

    // smoothing enabled
    if ( smoothing > 0 ) {
        // x = xs
        magma_ccopyvector( x->num_rows, dxs.dval, 1, x->dval, 1, queue );

        // r = rs
        magma_ccopyvector( dr.num_rows, drs.dval, 1, dr.dval, 1, queue );
    }

    // get last iteration timing
    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t)tempo2 - tempo1;
    //--------------STOP TIME----------------

    // get final stats
    solver_par->iter_res = nrmr;
    CHECK( magma_cresidualvec( A, b, *x, &dr, &residual, queue ));
    solver_par->final_res = residual;

    // set solver conclusion
    if ( info != MAGMA_SUCCESS && info != MAGMA_DIVERGENCE ) {
        if ( solver_par->init_res > solver_par->final_res ) {
            info = MAGMA_SLOW_CONVERGENCE;
        }
    }

cleanup:
    // free resources
    // smoothing enabled
    if ( smoothing > 0 ) {
        magma_cmfree( &dxs, queue );
        magma_cmfree( &drs, queue );
    }
    magma_cmfree( &dr, queue );
    magma_cmfree( &dP, queue );
    magma_cmfree( &dP1, queue );
    magma_cmfree( &dG, queue );
    magma_cmfree( &dU, queue );
    magma_cmfree( &dM, queue );
    magma_cmfree( &df, queue );
    magma_cmfree( &dt, queue );
    magma_cmfree( &dc, queue );
    magma_cmfree( &dv, queue );
    magma_cmfree( &dbeta, queue );
    magma_cmfree( &hbeta, queue );

    solver_par->info = info;
    return info;
}   /* magma_cidr */
/**
    Purpose
    -------
    Conjugate Gradient solver for A x = b in which the vector updates and
    scalar products of one iteration are merged into the two helper kernels
    magma_ccgmerge_spmv1 and magma_ccgmerge_xrbeta. The incoming content of x
    is discarded: the iteration starts from x = 0.

    Arguments
    ---------
    @param[in]      A           system matrix
    @param[in]      b           right-hand side
    @param[in,out]  x           on exit the solution approximation
    @param[in,out]  solver_par  reads maxiter, atol, rtol and verbose;
                                receives solver, numiter, spmv_count,
                                init_res, iter_res, final_res, runtime, info
                                and, when verbose > 0, res_vec / timing
    @param[in]      queue       queue the operations are issued on

    @return MAGMA_SUCCESS, MAGMA_SLOW_CONVERGENCE, MAGMA_DIVERGENCE,
            MAGMA_NONSPD (d' A d == 0 for the first search direction),
            MAGMA_NOTCONVERGED or the error code of a failing helper; the
            same value is stored in solver_par->info.
*/
extern "C" magma_int_t
magma_ccg_merge(
    magma_c_matrix A, magma_c_matrix b, magma_c_matrix *x,
    magma_c_solver_par *solver_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;

    // prepare solver feedback
    solver_par->solver = Magma_CGMERGE;
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;

    // solver variables
    magmaFloatComplex alpha, beta, gamma, rho, tmp1, *skp_h = NULL;
    float nom, nom0, betanom, den, nomb;
    float residual;

    // chronometry
    real_Double_t tempo1, tempo2;

    // some useful variables
    magmaFloatComplex c_zero = MAGMA_C_ZERO, c_one = MAGMA_C_ONE;
    magma_int_t dofs = A.num_rows*b.num_cols;

    magma_c_matrix r={Magma_CSR}, d={Magma_CSR}, z={Magma_CSR};
    magmaFloatComplex *d1=NULL, *d2=NULL, *skp=NULL;

    // GPU workspace
    CHECK( magma_cvinit( &r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &d, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &z, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));

    CHECK( magma_cmalloc( &d1, dofs*(1) ));
    CHECK( magma_cmalloc( &d2, dofs*(1) ));
    // array for the parameters
    // skp = [alpha|beta|gamma|rho|tmp1|tmp2]
    CHECK( magma_cmalloc( &skp, 6 ));

    // solver setup
    magma_cscal( dofs, c_zero, x->dval, 1, queue );                // x = 0
    magma_ccopy( dofs, b.dval, 1, r.dval, 1, queue );              // r = b
    magma_ccopy( dofs, r.dval, 1, d.dval, 1, queue );              // d = r
    nom0 = betanom = magma_scnrm2( dofs, r.dval, 1, queue );
    nom = nom0 * nom0;                                             // nom = r' * r
    CHECK( magma_c_spmv( c_one, A, d, c_zero, z, queue ));         // z = A d
    // den = |d' * z|
    den = MAGMA_C_ABS( magma_cdotc( dofs, d.dval, 1, z.dval, 1, queue ) );
    solver_par->init_res = nom0;
    // report the initial residual up front so these fields are also valid
    // when the solver returns before the first iteration
    solver_par->final_res = solver_par->init_res;
    solver_par->iter_res = solver_par->init_res;

    nomb = magma_scnrm2( dofs, b.dval, 1, queue );
    if ( nomb == 0.0 ) {
        nomb = 1.0;
    }

    // array on host for the parameters
    CHECK( magma_cmalloc_cpu( &skp_h, 6 ));

    alpha = rho = gamma = tmp1 = c_one;
    beta = magma_cdotc( dofs, r.dval, 1, r.dval, 1, queue );
    skp_h[0] = alpha;
    skp_h[1] = beta;
    skp_h[2] = gamma;
    skp_h[3] = rho;
    skp_h[4] = tmp1;
    skp_h[5] = MAGMA_C_MAKE(nom, 0.0);

    magma_csetvector( 6, skp_h, 1, skp, 1, queue );

    if ( nom0 < solver_par->atol ||
         nom0/nomb < solver_par->rtol ) {
        info = MAGMA_SUCCESS;
        goto cleanup;
    }

    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = (real_Double_t) nom0;
        solver_par->timing[0] = 0.0;
    }

    // check positive definite
    if ( den <= 0.0 ) {
        info = MAGMA_NONSPD;
        goto cleanup;
    }

    // Chronometry
    tempo1 = magma_sync_wtime( queue );

    solver_par->numiter = 0;
    solver_par->spmv_count = 0;

    // start iteration
    do
    {
        solver_par->numiter++;

        // computes SpMV and dot product
        CHECK( magma_ccgmerge_spmv1( A, d1, d2, d.dval, z.dval, skp, queue ));
        solver_par->spmv_count++;

        // updates x, r, computes scalars and updates d
        CHECK( magma_ccgmerge_xrbeta( dofs, d1, d2, x->dval, r.dval, d.dval, z.dval, skp, queue ));

        // check stopping criterion: fetch skp[1] (beta) from the device
        magma_cgetvector( 1, skp+1, 1, skp_h+1, 1, queue );
        betanom = sqrt(MAGMA_C_ABS(skp_h[1]));

        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        if ( betanom < solver_par->atol
             || betanom/nomb < solver_par->rtol ) {
            break;
        }
    }
    while ( solver_par->numiter+1 <= solver_par->maxiter );

    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    CHECK( magma_cresidualvec( A, b, *x, &r, &residual, queue ));
    solver_par->iter_res = betanom;
    solver_par->final_res = residual;

    if ( solver_par->numiter < solver_par->maxiter ) {
        info = MAGMA_SUCCESS;
    }
    else if ( solver_par->init_res > solver_par->final_res ) {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_SLOW_CONVERGENCE;
        if ( solver_par->iter_res < solver_par->atol ||
             solver_par->iter_res/solver_par->init_res < solver_par->rtol ) {
            info = MAGMA_SUCCESS;
        }
    }
    else {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        // must go through 'info': solver_par->info is overwritten with
        // 'info' in the cleanup section below
        info = MAGMA_DIVERGENCE;
    }

cleanup:
    magma_cmfree(&r, queue );
    magma_cmfree(&z, queue );
    magma_cmfree(&d, queue );

    magma_free( d1 );
    magma_free( d2 );
    magma_free( skp );
    magma_free_cpu( skp_h );

    solver_par->info = info;
    return info;
}   /* magma_ccg_merge */
/**
    Purpose
    -------
    LSQR solver for A x = b. The conjugate transpose of A is built
    explicitly (via a host-side CSR copy) before the iteration starts. For
    non-square A an additional stopping test on the least-squares problem
    min |b - A x| is applied.

    Arguments
    ---------
    @param[in]      A            system matrix
    @param[in]      b            right-hand side
    @param[in,out]  x            solution approximation; the updates are
                                 added to the incoming content
    @param[in,out]  solver_par   reads maxiter, atol, rtol and verbose;
                                 receives solver, numiter, spmv_count,
                                 init_res, iter_res, final_res, runtime, info
                                 and, when verbose > 0, res_vec / timing
    @param[in]      precond_par  preconditioner; skipped when its solver
                                 field is Magma_NONE (and, inside the loop,
                                 when A is not square)
    @param[in]      queue        queue the operations are issued on

    @return MAGMA_SUCCESS, MAGMA_SLOW_CONVERGENCE, MAGMA_DIVERGENCE or the
            error code of a failing helper; the same value is stored in
            solver_par->info.
*/
extern "C" magma_int_t
magma_clsqr(
    magma_c_matrix A, magma_c_matrix b, magma_c_matrix *x,
    magma_c_solver_par *solver_par,
    magma_c_preconditioner *precond_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;

    // prepare solver feedback
    solver_par->solver = Magma_LSQR;
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;

    magma_int_t m = A.num_rows * b.num_cols;
    magma_int_t n = A.num_cols * b.num_cols;

    // local variables
    magmaFloatComplex c_zero = MAGMA_C_ZERO, c_one = MAGMA_C_ONE;
    // solver variables
    float s, nom0, r0, res=0, nomb, phibar, beta, alpha, c, rho, rhot, phi,
          thet, normr, normar, norma, sumnormd2, normd;
    float residual;

    // chronometry
    real_Double_t tempo1, tempo2;

    // need to transpose the matrix
    magma_c_matrix AT={Magma_CSR}, Ah1={Magma_CSR}, Ah2={Magma_CSR};

    // GPU workspace
    magma_c_matrix r={Magma_CSR}, v={Magma_CSR}, z={Magma_CSR},
                   zt={Magma_CSR}, d={Magma_CSR}, vt={Magma_CSR},
                   u={Magma_CSR};
    // r receives b - A x and therefore needs one entry per row of A
    CHECK( magma_cvinit( &r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &v, Magma_DEV, A.num_cols, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &z, Magma_DEV, A.num_cols, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &d, Magma_DEV, A.num_cols, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &vt,Magma_DEV, A.num_cols, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &u, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &zt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));

    // transpose the matrix:
    // device -> host, to CSR, conjugate-transpose, back to A's format,
    // host -> device. A failure in any step aborts the solve; Ah1, Ah2 and
    // AT are all released in the cleanup section.
    CHECK( magma_cmtransfer( A, &Ah1, Magma_DEV, Magma_CPU, queue ));
    CHECK( magma_cmconvert( Ah1, &Ah2, A.storage_type, Magma_CSR, queue ));
    magma_cmfree(&Ah1, queue );
    CHECK( magma_cmtransposeconjugate( Ah2, &Ah1, queue ));
    magma_cmfree(&Ah2, queue );
    Ah2.blocksize = A.blocksize;
    Ah2.alignment = A.alignment;
    CHECK( magma_cmconvert( Ah1, &Ah2, Magma_CSR, A.storage_type, queue ));
    magma_cmfree(&Ah1, queue );
    CHECK( magma_cmtransfer( Ah2, &AT, Magma_CPU, Magma_DEV, queue ));
    magma_cmfree(&Ah2, queue );

    // solver setup
    CHECK( magma_cresidualvec( A, b, *x, &r, &nom0, queue));
    solver_par->init_res = nom0;
    nomb = magma_scnrm2( m, b.dval, 1, queue );
    if ( nomb == 0.0 ) {
        nomb = 1.0;
    }
    if ( (r0 = nomb * solver_par->rtol) < ATOLERANCE ) {
        r0 = ATOLERANCE;
    }
    solver_par->final_res = solver_par->init_res;
    solver_par->iter_res = solver_par->init_res;
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = (real_Double_t)nom0;
        solver_par->timing[0] = 0.0;
    }
    if ( nom0 < r0 ) {
        info = MAGMA_SUCCESS;
        goto cleanup;
    }

    // u = b / |b|
    magma_ccopy( m, b.dval, 1, u.dval, 1, queue );
    beta = magma_scnrm2( m, u.dval, 1, queue );
    magma_cscal( m, MAGMA_C_MAKE(1./beta, 0.0 ), u.dval, 1, queue );
    normr = beta;
    c = 1.0;
    s = 0.0;
    phibar = beta;

    // v = A^H u (optionally preconditioned), then normalised
    CHECK( magma_c_spmv( c_one, AT, u, c_zero, v, queue ));
    if ( precond_par->solver != Magma_NONE ) {
        CHECK( magma_c_applyprecond_right( MagmaTrans, A, v, &zt, precond_par, queue ));
        CHECK( magma_c_applyprecond_left( MagmaTrans, A, zt, &v, precond_par, queue ));
    }
    alpha = magma_scnrm2( n, v.dval, 1, queue );
    magma_cscal( n, MAGMA_C_MAKE(1./alpha, 0.0 ), v.dval, 1, queue );
    normar = alpha * beta;
    norma = 0;
    sumnormd2 = 0;

    // Chronometry
    tempo1 = magma_sync_wtime( queue );

    solver_par->numiter = 0;
    // start iteration
    do
    {
        solver_par->numiter++;

        // z = v, or z = preconditioned v for square preconditioned systems
        if ( precond_par->solver == Magma_NONE || A.num_rows != A.num_cols ) {
            magma_ccopy( n, v.dval, 1, z.dval, 1, queue );
        } else {
            CHECK( magma_c_applyprecond_left( MagmaNoTrans, A, v, &zt, precond_par, queue ));
            CHECK( magma_c_applyprecond_right( MagmaNoTrans, A, zt, &z, precond_par, queue ));
        }

        // u = A z - alpha * u, then normalised
        CHECK( magma_c_spmv( c_one, A, z, c_zero, zt, queue ));
        magma_cscal( m, MAGMA_C_MAKE(-alpha, 0.0 ), u.dval, 1, queue );
        magma_caxpy( m, c_one, zt.dval, 1, u.dval, 1, queue );
        solver_par->spmv_count++;
        beta = magma_scnrm2( m, u.dval, 1, queue );
        magma_cscal( m, MAGMA_C_MAKE(1./beta, 0.0 ), u.dval, 1, queue );

        // norma = norm([norma alpha beta]);
        norma = sqrt(norma*norma + alpha*alpha + beta*beta );

        // update of the rotation (c, s) and of phi / phibar
        thet = -s * alpha;
        rhot = c * alpha;
        rho = sqrt( rhot * rhot + beta * beta );
        c = rhot / rho;
        s = - beta / rho;
        phi = c * phibar;
        phibar = s * phibar;

        // d = (z - thet * d) / rho;
        magma_cscal( n, MAGMA_C_MAKE(-thet, 0.0 ), d.dval, 1, queue );
        magma_caxpy( n, c_one, z.dval, 1, d.dval, 1, queue );
        magma_cscal( n, MAGMA_C_MAKE(1./rho, 0.0 ), d.dval, 1, queue );
        normd = magma_scnrm2( n, d.dval, 1, queue );
        sumnormd2 = sumnormd2 + normd*normd;

        // convergence check
        res = normr;
        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        // check for convergence in A*x=b
        if ( res/nomb <= solver_par->rtol ||
             res <= solver_par->atol ) {
            info = MAGMA_SUCCESS;
            break;
        }
        // check for convergence in min{|b-A*x|}
        if ( A.num_rows != A.num_cols &&
             ( normar/(norma*normr) <= solver_par->rtol
               || normar <= solver_par->atol ) ) {
            printf("%% warning: quit from minimization convergence check.\n");
            info = MAGMA_SUCCESS;
            break;
        }

        // x = x + phi * d
        magma_caxpy( n, MAGMA_C_MAKE( phi, 0.0 ), d.dval, 1, x->dval, 1, queue );
        normr = fabs(s) * normr;

        // vt = A^H u (optionally preconditioned)
        CHECK( magma_c_spmv( c_one, AT, u, c_zero, vt, queue ));
        solver_par->spmv_count++;
        if ( precond_par->solver != Magma_NONE ) {
            CHECK( magma_c_applyprecond_right( MagmaTrans, A, vt, &zt, precond_par, queue ));
            CHECK( magma_c_applyprecond_left( MagmaTrans, A, zt, &vt, precond_par, queue ));
        }

        // v = vt - beta * v, then normalised
        magma_cscal( n, MAGMA_C_MAKE(-beta, 0.0 ), v.dval, 1, queue );
        magma_caxpy( n, c_one, vt.dval, 1, v.dval, 1, queue );
        alpha = magma_scnrm2( n, v.dval, 1, queue );
        magma_cscal( n, MAGMA_C_MAKE(1./alpha, 0.0 ), v.dval, 1, queue );
        normar = alpha * fabs(s*phi);
    }
    while ( solver_par->numiter+1 <= solver_par->maxiter );

    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    CHECK( magma_cresidualvec( A, b, *x, &r, &residual, queue));
    solver_par->iter_res = res;
    solver_par->final_res = residual;

    if ( solver_par->numiter < solver_par->maxiter && info == MAGMA_SUCCESS ) {
        info = MAGMA_SUCCESS;
    }
    else if ( solver_par->init_res > solver_par->final_res ) {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_SLOW_CONVERGENCE;
        if ( solver_par->iter_res < solver_par->rtol*solver_par->init_res ||
             solver_par->iter_res < solver_par->atol ) {
            info = MAGMA_SUCCESS;
        }
    }
    else {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_DIVERGENCE;
    }

cleanup:
    magma_cmfree(&r, queue );
    magma_cmfree(&v, queue );
    magma_cmfree(&z, queue );
    magma_cmfree(&zt, queue );
    magma_cmfree(&d, queue );
    magma_cmfree(&vt, queue );
    magma_cmfree(&u, queue );
    magma_cmfree(&AT, queue );
    magma_cmfree(&Ah1, queue );
    magma_cmfree(&Ah2, queue );

    solver_par->info = info;
    return info;
}   /* magma_clsqr */
extern "C" magma_int_t magma_cbicgstab( magma_c_sparse_matrix A, magma_c_vector b, magma_c_vector *x, magma_c_solver_par *solver_par, magma_queue_t queue ) { // set queue for old dense routines magma_queue_t orig_queue; magmablasGetKernelStream( &orig_queue ); // prepare solver feedback solver_par->solver = Magma_BICGSTAB; solver_par->numiter = 0; solver_par->info = MAGMA_SUCCESS; // some useful variables magmaFloatComplex c_zero = MAGMA_C_ZERO, c_one = MAGMA_C_ONE, c_mone = MAGMA_C_NEG_ONE; magma_int_t dofs = A.num_rows; // workspace magma_c_vector r,rr,p,v,s,t; magma_c_vinit( &r, Magma_DEV, dofs, c_zero, queue ); magma_c_vinit( &rr, Magma_DEV, dofs, c_zero, queue ); magma_c_vinit( &p, Magma_DEV, dofs, c_zero, queue ); magma_c_vinit( &v, Magma_DEV, dofs, c_zero, queue ); magma_c_vinit( &s, Magma_DEV, dofs, c_zero, queue ); magma_c_vinit( &t, Magma_DEV, dofs, c_zero, queue ); // solver variables magmaFloatComplex alpha, beta, omega, rho_old, rho_new; float nom, betanom, nom0, r0, den, res; // solver setup magma_cscal( dofs, c_zero, x->dval, 1) ; // x = 0 magma_ccopy( dofs, b.dval, 1, r.dval, 1 ); // r = b magma_ccopy( dofs, b.dval, 1, rr.dval, 1 ); // rr = b nom0 = betanom = magma_scnrm2( dofs, r.dval, 1 ); // nom = || r || nom = nom0*nom0; rho_old = omega = alpha = MAGMA_C_MAKE( 1.0, 0. ); solver_par->init_res = nom0; magma_c_spmv( c_one, A, r, c_zero, v, queue ); // z = A r den = MAGMA_C_REAL( magma_cdotc(dofs, v.dval, 1, r.dval, 1) ); // den = z' * r if ( (r0 = nom * solver_par->epsilon) < ATOLERANCE ) r0 = ATOLERANCE; if ( nom < r0 ) { magmablasSetKernelStream( orig_queue ); return MAGMA_SUCCESS; } // check positive definite if (den <= 0.0) { printf("Operator A is not postive definite. 
(Ar,r) = %f\n", den); magmablasSetKernelStream( orig_queue ); return MAGMA_NONSPD; solver_par->info = MAGMA_NONSPD;; } //Chronometry real_Double_t tempo1, tempo2; tempo1 = magma_sync_wtime( queue ); if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = nom0; solver_par->timing[0] = 0.0; } // start iteration for( solver_par->numiter= 1; solver_par->numiter<solver_par->maxiter; solver_par->numiter++ ) { rho_new = magma_cdotc( dofs, rr.dval, 1, r.dval, 1 ); // rho=<rr,r> beta = rho_new/rho_old * alpha/omega; // beta=rho/rho_old *alpha/omega magma_cscal( dofs, beta, p.dval, 1 ); // p = beta*p magma_caxpy( dofs, c_mone * omega * beta, v.dval, 1 , p.dval, 1 ); // p = p-omega*beta*v magma_caxpy( dofs, c_one, r.dval, 1, p.dval, 1 ); // p = p+r magma_c_spmv( c_one, A, p, c_zero, v, queue ); // v = Ap alpha = rho_new / magma_cdotc( dofs, rr.dval, 1, v.dval, 1 ); magma_ccopy( dofs, r.dval, 1 , s.dval, 1 ); // s=r magma_caxpy( dofs, c_mone * alpha, v.dval, 1 , s.dval, 1 ); // s=s-alpha*v magma_c_spmv( c_one, A, s, c_zero, t, queue ); // t=As omega = magma_cdotc( dofs, t.dval, 1, s.dval, 1 ) // omega = <s,t>/<t,t> / magma_cdotc( dofs, t.dval, 1, t.dval, 1 ); magma_caxpy( dofs, alpha, p.dval, 1 , x->dval, 1 ); // x=x+alpha*p magma_caxpy( dofs, omega, s.dval, 1 , x->dval, 1 ); // x=x+omega*s magma_ccopy( dofs, s.dval, 1 , r.dval, 1 ); // r=s magma_caxpy( dofs, c_mone * omega, t.dval, 1 , r.dval, 1 ); // r=r-omega*t res = betanom = magma_scnrm2( dofs, r.dval, 1 ); nom = betanom*betanom; rho_old = rho_new; // rho_old=rho if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) res; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( res/nom0 < solver_par->epsilon ) { break; } } tempo2 = magma_sync_wtime( queue ); solver_par->runtime = (real_Double_t) tempo2-tempo1; float residual; 
magma_cresidual( A, b, *x, &residual, queue ); solver_par->iter_res = res; solver_par->final_res = residual; if ( solver_par->numiter < solver_par->maxiter) { solver_par->info = MAGMA_SUCCESS; } else if ( solver_par->init_res > solver_par->final_res ) { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = MAGMA_SLOW_CONVERGENCE; } else { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = MAGMA_DIVERGENCE; } magma_c_vfree(&r, queue ); magma_c_vfree(&rr, queue ); magma_c_vfree(&p, queue ); magma_c_vfree(&v, queue ); magma_c_vfree(&s, queue ); magma_c_vfree(&t, queue ); magmablasSetKernelStream( orig_queue ); return MAGMA_SUCCESS; } /* magma_cbicgstab */
/**
    Solves A * x = b with the preconditioned Conjugate Gradient iteration,
    starting from the initial guess passed in x.

    @param[in]     A             system matrix
    @param[in]     b             right-hand side(s); the vectors handled have
                                 A.num_rows * b.num_cols entries
    @param[in,out] x             initial guess on entry, updated in place
    @param[in,out] solver_par    reads rtol, atol, maxiter and verbose;
                                 receives solver, numiter, spmv_count,
                                 init_res, iter_res, final_res, runtime, info
                                 and, if verbose > 0, res_vec / timing
    @param[in]     precond_par   preconditioner passed to
                                 magma_c_applyprecond_left / _right
    @param[in]     queue         queue to execute in

    @return the same status that is stored in solver_par->info.
*/
extern "C" magma_int_t
magma_cpcg(
    magma_c_matrix A, magma_c_matrix b, magma_c_matrix *x,
    magma_c_solver_par *solver_par,
    magma_c_preconditioner *precond_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;

    // prepare solver feedback
    solver_par->solver     = Magma_PCG;
    solver_par->numiter    = 0;
    solver_par->spmv_count = 0;

    // solver variables (all declared ahead of the first possible
    // 'goto cleanup')
    magmaFloatComplex alpha, beta;
    float nom0, r0, res, nomb, residual;
    magmaFloatComplex den, gammanew, gammaold = MAGMA_C_MAKE(1.0,0.0);
    real_Double_t tempo1, tempo2;

    // local variables
    magmaFloatComplex c_zero = MAGMA_C_ZERO, c_one = MAGMA_C_ONE;

    magma_int_t dofs = A.num_rows * b.num_cols;

    // GPU workspace
    magma_c_matrix r={Magma_CSR}, rt={Magma_CSR}, p={Magma_CSR},
                   q={Magma_CSR}, h={Magma_CSR};
    CHECK( magma_cvinit( &r,  Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &rt, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &p,  Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &q,  Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &h,  Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));

    // solver setup: residual of the initial guess, nom0 = ||r||
    CHECK( magma_cresidualvec( A, b, *x, &r, &nom0, queue ));

    // preconditioner
    CHECK( magma_c_applyprecond_left( MagmaNoTrans, A, r, &rt, precond_par, queue ));
    CHECK( magma_c_applyprecond_right( MagmaNoTrans, A, rt, &h, precond_par, queue ));

    magma_ccopy( dofs, h.dval, 1, p.dval, 1, queue );            // p = h
    CHECK( magma_c_spmv( c_one, A, p, c_zero, q, queue ));       // q = A p
    den = magma_cdotc( dofs, p.dval, 1, q.dval, 1, queue );      // den = p dot q
    solver_par->init_res = nom0;

    nomb = magma_scnrm2( dofs, b.dval, 1, queue );
    if ( nomb == 0.0 ) {
        nomb = 1.0;
    }
    if ( (r0 = nomb * solver_par->rtol) < ATOLERANCE ) {
        r0 = ATOLERANCE;
    }
    solver_par->final_res = solver_par->init_res;
    solver_par->iter_res  = solver_par->init_res;
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = (real_Double_t)nom0;
        solver_par->timing[0]  = 0.0;
    }
    // The initial guess is good enough when its residual (not the norm of b)
    // is below the threshold. Testing the residual also keeps an exact
    // initial guess, for which p = 0 and therefore den = 0, from being
    // reported as MAGMA_NONSPD by the check below.
    if ( nom0 < r0 ) {
        info = MAGMA_SUCCESS;
        goto cleanup;
    }
    // check positive definite
    if ( MAGMA_C_ABS(den) <= 0.0 ) {
        info = MAGMA_NONSPD;
        goto cleanup;
    }

    // chronometry
    tempo1 = magma_sync_wtime( queue );

    // the setup work above is not counted
    solver_par->numiter    = 0;
    solver_par->spmv_count = 0;

    // start iteration
    do
    {
        solver_par->numiter++;

        // preconditioner
        CHECK( magma_c_applyprecond_left( MagmaNoTrans, A, r, &rt, precond_par, queue ));
        CHECK( magma_c_applyprecond_right( MagmaNoTrans, A, rt, &h, precond_par, queue ));

        gammanew = magma_cdotc( dofs, r.dval, 1, h.dval, 1, queue ); // gn = <r,h>

        if ( solver_par->numiter == 1 ) {
            magma_ccopy( dofs, h.dval, 1, p.dval, 1, queue );        // p = h
        } else {
            beta = (gammanew/gammaold);                              // beta = gn/go
            magma_cscal( dofs, beta, p.dval, 1, queue );             // p = beta*p
            magma_caxpy( dofs, c_one, h.dval, 1, p.dval, 1, queue ); // p = p + h
        }

        CHECK( magma_c_spmv( c_one, A, p, c_zero, q, queue ));       // q = A p
        solver_par->spmv_count++;
        den = magma_cdotc( dofs, p.dval, 1, q.dval, 1, queue );      // den = p dot q

        alpha = gammanew / den;
        magma_caxpy( dofs,  alpha, p.dval, 1, x->dval, 1, queue );   // x = x + alpha p
        magma_caxpy( dofs, -alpha, q.dval, 1, r.dval, 1, queue );    // r = r - alpha q
        gammaold = gammanew;

        res = magma_scnrm2( dofs, r.dval, 1, queue );
        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        if ( res/nomb <= solver_par->rtol || res <= solver_par->atol ) {
            break;
        }
    }
    while ( solver_par->numiter+1 <= solver_par->maxiter );

    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;

    // explicit residual of the returned iterate
    CHECK( magma_cresidualvec( A, b, *x, &r, &residual, queue ));
    solver_par->iter_res  = res;
    solver_par->final_res = residual;

    if ( solver_par->numiter < solver_par->maxiter ) {
        info = MAGMA_SUCCESS;
    } else if ( solver_par->init_res > solver_par->final_res ) {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_SLOW_CONVERGENCE;
        // the last admissible iteration may still have met the tolerance
        if ( solver_par->iter_res < solver_par->rtol*solver_par->init_res ||
             solver_par->iter_res < solver_par->atol ) {
            info = MAGMA_SUCCESS;
        }
    } else {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_DIVERGENCE;
    }

cleanup:
    magma_cmfree( &r,  queue );
    magma_cmfree( &rt, queue );
    magma_cmfree( &p,  queue );
    magma_cmfree( &q,  queue );
    magma_cmfree( &h,  queue );

    solver_par->info = info;
    return info;
}   /* magma_cpcg */
/**
    Solves A * x = b with the preconditioned Conjugate Gradient Squared
    iteration, using the merged vector-update kernels magma_ccgs_1 .. _4 and
    starting from the initial guess passed in x.

    @param[in]     A             system matrix
    @param[in]     b             right-hand side(s); the vectors handled have
                                 A.num_rows * b.num_cols entries
    @param[in,out] x             initial guess on entry, updated in place
    @param[in,out] solver_par    reads rtol, atol, maxiter and verbose;
                                 receives solver, numiter, spmv_count,
                                 init_res, iter_res, final_res, runtime, info
                                 and, if verbose > 0, res_vec / timing
    @param[in,out] precond_par   preconditioner; the time spent applying it is
                                 added to precond_par->runtime
    @param[in]     queue         queue to execute in

    @return the same status that is stored in solver_par->info.
*/
extern "C" magma_int_t
magma_cpcgs_merge(
    magma_c_matrix A, magma_c_matrix b, magma_c_matrix *x,
    magma_c_solver_par *solver_par,
    magma_c_preconditioner *precond_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;

    // prepare solver feedback
    solver_par->solver     = Magma_PCGS;
    solver_par->numiter    = 0;
    solver_par->spmv_count = 0;

    // local variables
    magmaFloatComplex c_zero = MAGMA_C_ZERO, c_one = MAGMA_C_ONE;

    // solver variables (all declared ahead of the first possible
    // 'goto cleanup')
    float nom0, r0, res, nomb, residual;
    magmaFloatComplex rho, rho_l = c_one, alpha, beta;
    real_Double_t tempo1, tempo2, tempop1, tempop2;

    magma_int_t dofs = A.num_rows * b.num_cols;

    // GPU workspace; only the vectors the iteration really touches are
    // allocated
    magma_c_matrix r={Magma_CSR}, rt={Magma_CSR}, r_tld={Magma_CSR},
                   p={Magma_CSR}, q={Magma_CSR}, u={Magma_CSR}, t={Magma_CSR},
                   p_hat={Magma_CSR}, u_hat={Magma_CSR}, v_hat={Magma_CSR};
    CHECK( magma_cvinit( &r,     Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &rt,    Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &r_tld, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &p,     Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &p_hat, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &q,     Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &u,     Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &u_hat, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &v_hat, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &t,     Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));

    // solver setup: residual of the initial guess, nom0 = ||r||
    CHECK( magma_cresidualvec( A, b, *x, &r, &nom0, queue ));
    magma_ccopy( dofs, r.dval, 1, r_tld.dval, 1, queue );        // r_tld = r

    solver_par->init_res = nom0;
    nomb = magma_scnrm2( dofs, b.dval, 1, queue );
    if ( nomb == 0.0 ) {
        nomb = 1.0;
    }
    if ( (r0 = nomb * solver_par->rtol) < ATOLERANCE ) {
        r0 = ATOLERANCE;
    }
    solver_par->final_res = solver_par->init_res;
    solver_par->iter_res  = solver_par->init_res;
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = (real_Double_t)nom0;
        solver_par->timing[0]  = 0.0;
    }
    // initial guess already good enough
    if ( nom0 < r0 ) {
        info = MAGMA_SUCCESS;
        goto cleanup;
    }

    // chronometry
    tempo1 = magma_sync_wtime( queue );

    solver_par->numiter    = 0;
    solver_par->spmv_count = 0;

    // start iteration
    do
    {
        solver_par->numiter++;

        rho = magma_cdotc( dofs, r.dval, 1, r_tld.dval, 1, queue ); // rho = <r,r_tld>
        if ( MAGMA_C_ABS(rho) == 0.0 ) {
            // Breakdown: the iteration cannot continue. Report it explicitly
            // instead of leaving whatever status is currently held in info.
            // NOTE(review): CHECK is defined outside this block; presumably
            // it stores the status of the wrapped call in info - confirm.
            info = MAGMA_DIVERGENCE;
            goto cleanup;
        }

        if ( solver_par->numiter > 1 ) {
            // direction vectors
            beta = rho / rho_l;
            magma_ccgs_1(
                r.num_rows, r.num_cols, beta,
                r.dval, q.dval, u.dval, p.dval,
                queue );
            // u = r + beta*q;
            // p = u + beta*( q + beta*p );
        } else {
            magma_ccgs_2(
                r.num_rows, r.num_cols,
                r.dval, u.dval, p.dval,
                queue );
            // u = r
            // p = r
        }

        // preconditioner: p_hat
        tempop1 = magma_sync_wtime( queue );
        CHECK( magma_c_applyprecond_left( MagmaNoTrans, A, p, &rt, precond_par, queue ));
        CHECK( magma_c_applyprecond_right( MagmaNoTrans, A, rt, &p_hat, precond_par, queue ));
        tempop2 = magma_sync_wtime( queue );
        precond_par->runtime += tempop2-tempop1;

        CHECK( magma_c_spmv( c_one, A, p_hat, c_zero, v_hat, queue )); // v_hat = A p_hat
        solver_par->spmv_count++;

        alpha = rho / magma_cdotc( dofs, r_tld.dval, 1, v_hat.dval, 1, queue );

        magma_ccgs_3(
            r.num_rows, r.num_cols, alpha,
            v_hat.dval, u.dval, q.dval, t.dval,
            queue );
        // q = u - alpha v_hat
        // t = u + q

        // preconditioner: u_hat
        tempop1 = magma_sync_wtime( queue );
        CHECK( magma_c_applyprecond_left( MagmaNoTrans, A, t, &rt, precond_par, queue ));
        CHECK( magma_c_applyprecond_right( MagmaNoTrans, A, rt, &u_hat, precond_par, queue ));
        tempop2 = magma_sync_wtime( queue );
        precond_par->runtime += tempop2-tempop1;

        CHECK( magma_c_spmv( c_one, A, u_hat, c_zero, t, queue ));     // t = A u_hat
        solver_par->spmv_count++;

        magma_ccgs_4(
            r.num_rows, r.num_cols, alpha,
            u_hat.dval, t.dval, x->dval, r.dval,
            queue );
        // r = r - alpha*A u_hat
        // x = x + alpha u_hat

        res = magma_scnrm2( dofs, r.dval, 1, queue );
        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        if ( res/nomb <= solver_par->rtol || res <= solver_par->atol ) {
            break;
        }
        rho_l = rho;
    }
    while ( solver_par->numiter+1 <= solver_par->maxiter );

    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;

    // explicit residual of the returned iterate
    CHECK( magma_cresidualvec( A, b, *x, &r, &residual, queue ));
    solver_par->iter_res  = res;
    solver_par->final_res = residual;

    if ( solver_par->numiter < solver_par->maxiter ) {
        info = MAGMA_SUCCESS;
    } else if ( solver_par->init_res > solver_par->final_res ) {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_SLOW_CONVERGENCE;
        // the last admissible iteration may still have met the tolerance
        if ( solver_par->iter_res < solver_par->rtol*solver_par->init_res ||
             solver_par->iter_res < solver_par->atol ) {
            info = MAGMA_SUCCESS;
        }
    } else {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_DIVERGENCE;
    }

cleanup:
    magma_cmfree( &r,     queue );
    magma_cmfree( &rt,    queue );
    magma_cmfree( &r_tld, queue );
    magma_cmfree( &p,     queue );
    magma_cmfree( &q,     queue );
    magma_cmfree( &u,     queue );
    magma_cmfree( &t,     queue );
    magma_cmfree( &p_hat, queue );
    magma_cmfree( &u_hat, queue );
    magma_cmfree( &v_hat, queue );

    solver_par->info = info;
    return info;
}   /* magma_cpcgs_merge */
/* //////////////////////////////////////////////////////////////////////////// -- testing any solver */ int main( int argc, char** argv ) { magma_int_t info = 0; TESTING_CHECK( magma_init() ); magma_print_environment(); magma_queue_t queue=NULL; magma_queue_create( 0, &queue ); magmaFloatComplex one = MAGMA_C_MAKE(1.0, 0.0); magmaFloatComplex zero = MAGMA_C_MAKE(0.0, 0.0); magma_c_matrix A={Magma_CSR}, B_d={Magma_CSR}; magma_c_matrix x={Magma_CSR}, b={Magma_CSR}; int i=1; while( i < argc ) { if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) { // Laplace test i++; magma_int_t laplace_size = atoi( argv[i] ); TESTING_CHECK( magma_cm_5stencil( laplace_size, &A, queue )); } else { // file-matrix test TESTING_CHECK( magma_c_csr_mtx( &A, argv[i], queue )); } printf( "\n# matrix info: %lld-by-%lld with %lld nonzeros\n\n", (long long) A.num_rows, (long long) A.num_cols, (long long) A.nnz ); magma_int_t n = A.num_rows; TESTING_CHECK( magma_cmtransfer( A, &B_d, Magma_CPU, Magma_DEV, queue )); // vectors and initial guess TESTING_CHECK( magma_cvinit( &b, Magma_DEV, A.num_cols, 1, zero, queue )); TESTING_CHECK( magma_cvinit( &x, Magma_DEV, A.num_cols, 1, one, queue )); TESTING_CHECK( magma_cprint_vector( b, 90, 10, queue )); TESTING_CHECK( magma_cprint_matrix( A, queue )); printf("\n\n\n"); TESTING_CHECK( magma_cprint_matrix( B_d, queue )); float res; res = magma_scnrm2( n, b.dval, 1, queue ); printf("norm0: %f\n", res); TESTING_CHECK( magma_c_spmv( one, B_d, x, zero, b, queue )); // b = A x TESTING_CHECK( magma_cprint_vector( b, 0, 100, queue )); TESTING_CHECK( magma_cprint_vector( b, b.num_rows-10, 10, queue )); res = magma_scnrm2( n, b.dval, 1, queue ); printf("norm: %f\n", res); TESTING_CHECK( magma_cresidual( B_d, x, b, &res, queue )); printf("res: %f\n", res); magma_cmfree(&B_d, queue ); magma_cmfree(&A, queue ); magma_cmfree(&x, queue ); magma_cmfree(&b, queue ); i++; } magma_queue_destroy( queue ); magma_finalize(); return info; }
/**
    Block version of the preconditioned Conjugate Gradient iteration: solves
    A * x_i = b_i for num_vecs = b.num_rows / A.num_rows right-hand sides that
    are stored one after another in b (and x), each of length A.num_rows.

    The iteration always starts from x = 0: whatever x->dval holds on entry
    is scaled to zero. Convergence is decided on the first system only
    (index 0).

    @param[in]     A             system matrix
    @param[in]     b             stacked right-hand sides, read through b.dval
    @param[in,out] x             stacked solutions, updated through x->dval
    @param[in,out] solver_par    reads maxiter, epsilon and verbose; receives
                                 solver, numiter, init_res, iter_res,
                                 final_res, runtime, info and, if verbose > 0,
                                 res_vec / timing
    @param[in]     precond_par   preconditioner passed to
                                 magma_c_applyprecond_left / _right
    @param[in]     queue         queue passed on to the sparse routines

    @return MAGMA_ERR_HOST_ALLOC if a host work array cannot be allocated,
            MAGMA_NONSPD if the initial (A p, p) of the first system is not
            positive, otherwise MAGMA_SUCCESS. The convergence state is
            reported through solver_par->info.
*/
extern "C" magma_int_t
magma_cbpcg(
    magma_c_sparse_matrix A, magma_c_vector b, magma_c_vector *x,
    magma_c_solver_par *solver_par,
    magma_c_preconditioner *precond_par,
    magma_queue_t queue )
{
    // value returned to the caller
    magma_int_t status = MAGMA_SUCCESS;

    // remember the kernel stream of the old dense routines so that it can be
    // restored on every exit path
    magma_queue_t orig_queue;
    magmablasGetKernelStream( &orig_queue );

    magma_int_t stat_cpu = 0;
    magma_int_t i, num_vecs = b.num_rows/A.num_rows;

    // prepare solver feedback
    solver_par->solver  = Magma_PCG;
    solver_par->numiter = 0;
    solver_par->info    = MAGMA_SUCCESS;

    // local variables
    // NOTE: the names r, h, p, q and dofs must stay as they are; the r(i),
    // h(i), p(i), q(i) accessors used below are defined outside this function
    // and presumably expand in terms of these names.
    magmaFloatComplex c_zero = MAGMA_C_ZERO, c_one = MAGMA_C_ONE;
    magma_int_t dofs = A.num_rows;

    // host work arrays, one entry per right-hand side; they start as NULL so
    // that the common cleanup can free them unconditionally
    magmaFloatComplex *alpha = NULL, *beta = NULL;
    float *nom = NULL, *nom0 = NULL, *r0 = NULL, *gammaold = NULL,
          *gammanew = NULL, *den = NULL, *res = NULL, *residual = NULL;

    // chronometry (declared ahead of the first 'goto cleanup')
    real_Double_t tempo1, tempo2;

    // GPU workspace
    // NOTE(review): the return values of magma_c_vinit are not checked here
    // (unchanged from the previous implementation).
    magma_c_vector r, rt, p, q, h;
    magma_c_vinit( &r,  Magma_DEV, dofs*num_vecs, c_zero, queue );
    magma_c_vinit( &rt, Magma_DEV, dofs*num_vecs, c_zero, queue );
    magma_c_vinit( &p,  Magma_DEV, dofs*num_vecs, c_zero, queue );
    magma_c_vinit( &q,  Magma_DEV, dofs*num_vecs, c_zero, queue );
    magma_c_vinit( &h,  Magma_DEV, dofs*num_vecs, c_zero, queue );

    // every host array is allocated exactly once
    stat_cpu += magma_cmalloc_cpu( &alpha,    num_vecs );
    stat_cpu += magma_cmalloc_cpu( &beta,     num_vecs );
    stat_cpu += magma_smalloc_cpu( &residual, num_vecs );
    stat_cpu += magma_smalloc_cpu( &nom,      num_vecs );
    stat_cpu += magma_smalloc_cpu( &nom0,     num_vecs );
    stat_cpu += magma_smalloc_cpu( &r0,       num_vecs );
    stat_cpu += magma_smalloc_cpu( &gammaold, num_vecs );
    stat_cpu += magma_smalloc_cpu( &gammanew, num_vecs );
    stat_cpu += magma_smalloc_cpu( &den,      num_vecs );
    stat_cpu += magma_smalloc_cpu( &res,      num_vecs );
    if ( stat_cpu != 0 ) {
        printf("error: memory allocation.\n");
        status = MAGMA_ERR_HOST_ALLOC;
        goto cleanup;
    }

    // solver setup
    magma_cscal( dofs*num_vecs, c_zero, x->dval, 1 );            // x = 0
    magma_ccopy( dofs*num_vecs, b.dval, 1, r.dval, 1 );          // r = b

    // preconditioner
    magma_c_applyprecond_left( A, r, &rt, precond_par, queue );
    magma_c_applyprecond_right( A, rt, &h, precond_par, queue );

    magma_ccopy( dofs*num_vecs, h.dval, 1, p.dval, 1 );          // p = h

    for ( i=0; i<num_vecs; i++ ) {
        nom[i]  = MAGMA_C_REAL( magma_cdotc(dofs, r(i), 1, h(i), 1) );
        nom0[i] = magma_scnrm2( dofs, r(i), 1 );
        // keeps res[] well defined when the loop body below never runs
        res[i]  = nom0[i];
    }

    magma_c_spmv( c_one, A, p, c_zero, q, queue );               // q = A p

    for ( i=0; i<num_vecs; i++ ) {
        den[i] = MAGMA_C_REAL( magma_cdotc(dofs, p(i), 1, q(i), 1) ); // den = p dot q
    }

    solver_par->init_res = nom0[0];

    if ( (r0[0] = nom[0] * solver_par->epsilon) < ATOLERANCE ) {
        r0[0] = ATOLERANCE;
    }
    // check positive definite
    if ( den[0] <= 0.0 ) {
        printf("Operator A is not postive definite. (Ar,r) = %f\n", den[0]);
        solver_par->info = MAGMA_NONSPD;
        status = MAGMA_NONSPD;
        goto cleanup;
    }
    // nothing to do: the starting residual is already below the threshold
    if ( nom[0] < r0[0] ) {
        goto cleanup;
    }

    // chronometry
    tempo1 = magma_sync_wtime( queue );
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = (real_Double_t)nom0[0];
        solver_par->timing[0]  = 0.0;
    }

    // start iteration
    for ( solver_par->numiter = 1;
          solver_par->numiter < solver_par->maxiter;
          solver_par->numiter++ )
    {
        // preconditioner
        magma_c_applyprecond_left( A, r, &rt, precond_par, queue );
        magma_c_applyprecond_right( A, rt, &h, precond_par, queue );

        for ( i=0; i<num_vecs; i++ ) {
            gammanew[i] = MAGMA_C_REAL( magma_cdotc(dofs, r(i), 1, h(i), 1) ); // gn = <r,h>
        }

        if ( solver_par->numiter==1 ) {
            magma_ccopy( dofs*num_vecs, h.dval, 1, p.dval, 1 );  // p = h
        } else {
            for ( i=0; i<num_vecs; i++ ) {
                beta[i] = MAGMA_C_MAKE(gammanew[i]/gammaold[i], 0.); // beta = gn/go
                magma_cscal(dofs, beta[i], p(i), 1);                 // p = beta*p
                magma_caxpy(dofs, c_one, h(i), 1, p(i), 1);          // p = p + h
            }
        }

        magma_c_spmv( c_one, A, p, c_zero, q, queue );           // q = A p
        // magma_c_bspmv_tuned( dofs, num_vecs, c_one, A, p.dval, c_zero, q.dval, queue );

        for ( i=0; i<num_vecs; i++ ) {
            den[i] = MAGMA_C_REAL(magma_cdotc(dofs, p(i), 1, q(i), 1)); // den = p dot q

            alpha[i] = MAGMA_C_MAKE(gammanew[i]/den[i], 0.);
            magma_caxpy(dofs,  alpha[i], p(i), 1, x->dval+dofs*i, 1); // x = x + alpha p
            magma_caxpy(dofs, -alpha[i], q(i), 1, r(i), 1);           // r = r - alpha q
            gammaold[i] = gammanew[i];

            res[i] = magma_scnrm2( dofs, r(i), 1 );
        }

        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res[0];
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        if ( res[0]/nom0[0] < solver_par->epsilon ) {
            break;
        }
    }
    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;

    // explicit residuals of the returned iterates
    magma_cresidual( A, b, *x, residual, queue );
    solver_par->iter_res  = res[0];
    solver_par->final_res = residual[0];

    if ( solver_par->numiter < solver_par->maxiter ) {
        solver_par->info = MAGMA_SUCCESS;
    } else if ( solver_par->init_res > solver_par->final_res ) {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res[0];
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        solver_par->info = MAGMA_SLOW_CONVERGENCE;
    } else {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res[0];
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        solver_par->info = MAGMA_DIVERGENCE;
    }

    // report the iteration and the explicit residual of every system
    for ( i=0; i<num_vecs; i++ ) {
        printf("%.4e  ",res[i]);
    }
    printf("\n");
    for ( i=0; i<num_vecs; i++ ) {
        printf("%.4e  ",residual[i]);
    }
    printf("\n");

cleanup:
    // single exit: release all workspace and restore the caller's stream
    magma_c_vfree( &r,  queue );
    magma_c_vfree( &rt, queue );
    magma_c_vfree( &p,  queue );
    magma_c_vfree( &q,  queue );
    magma_c_vfree( &h,  queue );

    magma_free_cpu( alpha );
    magma_free_cpu( beta );
    magma_free_cpu( nom );
    magma_free_cpu( nom0 );
    magma_free_cpu( r0 );
    magma_free_cpu( gammaold );
    magma_free_cpu( gammanew );
    magma_free_cpu( den );
    magma_free_cpu( res );
    magma_free_cpu( residual );

    magmablasSetKernelStream( orig_queue );
    return status;
}   /* magma_cbpcg */
extern "C" magma_int_t magma_cpidr_strms( magma_c_matrix A, magma_c_matrix b, magma_c_matrix *x, magma_c_solver_par *solver_par, magma_c_preconditioner *precond_par, magma_queue_t queue ) { magma_int_t info = MAGMA_NOTCONVERGED; // prepare solver feedback solver_par->solver = Magma_PIDRMERGE; solver_par->numiter = 0; solver_par->spmv_count = 0; solver_par->init_res = 0.0; solver_par->final_res = 0.0; solver_par->iter_res = 0.0; solver_par->runtime = 0.0; // constants const magmaFloatComplex c_zero = MAGMA_C_ZERO; const magmaFloatComplex c_one = MAGMA_C_ONE; const magmaFloatComplex c_n_one = MAGMA_C_NEG_ONE; // internal user options const magma_int_t smoothing = 1; // 0 = disable, 1 = enable const float angle = 0.7; // [0-1] // local variables magma_int_t iseed[4] = {0, 0, 0, 1}; magma_int_t dof; magma_int_t s; magma_int_t distr; magma_int_t k, i, sk; magma_int_t innerflag; magma_int_t ldd; magma_int_t q; float residual; float nrm; float nrmb; float nrmr; float nrmt; float rho; magmaFloatComplex om; magmaFloatComplex gamma; // matrices and vectors magma_c_matrix dxs = {Magma_CSR}; magma_c_matrix dr = {Magma_CSR}, drs = {Magma_CSR}; magma_c_matrix dP = {Magma_CSR}, dP1 = {Magma_CSR}; magma_c_matrix dG = {Magma_CSR}, dGcol = {Magma_CSR}; magma_c_matrix dU = {Magma_CSR}; magma_c_matrix dM = {Magma_CSR}; magma_c_matrix df = {Magma_CSR}; magma_c_matrix dt = {Magma_CSR}, dtt = {Magma_CSR}; magma_c_matrix dc = {Magma_CSR}; magma_c_matrix dv = {Magma_CSR}; magma_c_matrix dlu = {Magma_CSR}; magma_c_matrix dskp = {Magma_CSR}; magma_c_matrix dalpha = {Magma_CSR}; magma_c_matrix dbeta = {Magma_CSR}; magmaFloatComplex *hMdiag = NULL; magmaFloatComplex *hskp = NULL; magmaFloatComplex *halpha = NULL; magmaFloatComplex *hbeta = NULL; magmaFloatComplex *d1 = NULL, *d2 = NULL; // queue variables const magma_int_t nqueues = 3; // number of queues magma_queue_t queues[nqueues]; // chronometry real_Double_t tempo1, tempo2; // create additional queues queues[0] = queue; for ( q = 1; q < 
nqueues; q++ ) { magma_queue_create( queue->device(), &(queues[q]) ); } // initial s space // TODO: add option for 's' (shadow space number) // Hack: uses '--restart' option as the shadow space number. // This is not a good idea because the default value of restart option is used to detect // if the user provided a custom restart. This means that if the default restart value // is changed then the code will think it was the user (unless the default value is // also updated in the 'if' statement below. s = 1; if ( solver_par->restart != 50 ) { if ( solver_par->restart > A.num_cols ) { s = A.num_cols; } else { s = solver_par->restart; } } solver_par->restart = s; // set max iterations solver_par->maxiter = min( 2 * A.num_cols, solver_par->maxiter ); // check if matrix A is square if ( A.num_rows != A.num_cols ) { //printf("Matrix A is not square.\n"); info = MAGMA_ERR_NOT_SUPPORTED; goto cleanup; } // |b| nrmb = magma_scnrm2( b.num_rows, b.dval, 1, queue ); if ( nrmb == 0.0 ) { magma_cscal( x->num_rows, MAGMA_C_ZERO, x->dval, 1, queue ); info = MAGMA_SUCCESS; goto cleanup; } // t = 0 // make t twice as large to contain both, dt and dr ldd = magma_roundup( b.num_rows, 32 ); CHECK( magma_cvinit( &dt, Magma_DEV, ldd, 2, c_zero, queue )); dt.num_rows = b.num_rows; dt.num_cols = 1; dt.nnz = dt.num_rows; // redirect the dr.dval to the second part of dt CHECK( magma_cvinit( &dr, Magma_DEV, b.num_rows, 1, c_zero, queue )); magma_free( dr.dval ); dr.dval = dt.dval + ldd; // r = b - A x CHECK( magma_cresidualvec( A, b, *x, &dr, &nrmr, queue )); // |r| solver_par->init_res = nrmr; solver_par->final_res = solver_par->init_res; solver_par->iter_res = solver_par->init_res; if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = (real_Double_t)nrmr; } // check if initial is guess good enough if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { info = MAGMA_SUCCESS; goto cleanup; } // P = randn(n, s) // P = ortho(P) //--------------------------------------- // P = 
0.0 CHECK( magma_cvinit( &dP, Magma_CPU, A.num_cols, s, c_zero, queue )); // P = randn(n, s) distr = 3; // 1 = unif (0,1), 2 = unif (-1,1), 3 = normal (0,1) dof = dP.num_rows * dP.num_cols; lapackf77_clarnv( &distr, iseed, &dof, dP.val ); // transfer P to device CHECK( magma_cmtransfer( dP, &dP1, Magma_CPU, Magma_DEV, queue )); magma_cmfree( &dP, queue ); // P = ortho(P1) if ( dP1.num_cols > 1 ) { // P = magma_cqr(P1), QR factorization CHECK( magma_cqr( dP1.num_rows, dP1.num_cols, dP1, dP1.ld, &dP, NULL, queue )); } else { // P = P1 / |P1| nrm = magma_scnrm2( dof, dP1.dval, 1, queue ); nrm = 1.0 / nrm; magma_csscal( dof, nrm, dP1.dval, 1, queue ); CHECK( magma_cmtransfer( dP1, &dP, Magma_DEV, Magma_DEV, queue )); } magma_cmfree( &dP1, queue ); //--------------------------------------- // allocate memory for the scalar products CHECK( magma_cmalloc_pinned( &hskp, 5 )); CHECK( magma_cvinit( &dskp, Magma_DEV, 4, 1, c_zero, queue )); CHECK( magma_cmalloc_pinned( &halpha, s )); CHECK( magma_cvinit( &dalpha, Magma_DEV, s, 1, c_zero, queue )); CHECK( magma_cmalloc_pinned( &hbeta, s )); CHECK( magma_cvinit( &dbeta, Magma_DEV, s, 1, c_zero, queue )); // workspace for merged dot product CHECK( magma_cmalloc( &d1, max(2, s) * b.num_rows )); CHECK( magma_cmalloc( &d2, max(2, s) * b.num_rows )); // smoothing enabled if ( smoothing > 0 ) { // set smoothing solution vector CHECK( magma_cmtransfer( *x, &dxs, Magma_DEV, Magma_DEV, queue )); // tt = 0 // make tt twice as large to contain both, dtt and drs ldd = magma_roundup( b.num_rows, 32 ); CHECK( magma_cvinit( &dtt, Magma_DEV, ldd, 2, c_zero, queue )); dtt.num_rows = dr.num_rows; dtt.num_cols = 1; dtt.nnz = dtt.num_rows; // redirect the drs.dval to the second part of dtt CHECK( magma_cvinit( &drs, Magma_DEV, dr.num_rows, 1, c_zero, queue )); magma_free( drs.dval ); drs.dval = dtt.dval + ldd; // set smoothing residual vector magma_ccopyvector( dr.num_rows, dr.dval, 1, drs.dval, 1, queue ); } // G(n,s) = 0 if ( s > 1 ) { ldd = 
magma_roundup( A.num_rows, 32 ); CHECK( magma_cvinit( &dG, Magma_DEV, ldd, s, c_zero, queue )); dG.num_rows = A.num_rows; } else { CHECK( magma_cvinit( &dG, Magma_DEV, A.num_rows, s, c_zero, queue )); } // dGcol represents a single column of dG, array pointer is set inside loop CHECK( magma_cvinit( &dGcol, Magma_DEV, dG.num_rows, 1, c_zero, queue )); magma_free( dGcol.dval ); // U(n,s) = 0 if ( s > 1 ) { ldd = magma_roundup( A.num_cols, 32 ); CHECK( magma_cvinit( &dU, Magma_DEV, ldd, s, c_zero, queue )); dU.num_rows = A.num_cols; } else { CHECK( magma_cvinit( &dU, Magma_DEV, A.num_cols, s, c_zero, queue )); } // M(s,s) = I CHECK( magma_cvinit( &dM, Magma_DEV, s, s, c_zero, queue )); CHECK( magma_cmalloc_pinned( &hMdiag, s )); magmablas_claset( MagmaFull, dM.num_rows, dM.num_cols, c_zero, c_one, dM.dval, dM.ld, queue ); // f = 0 CHECK( magma_cvinit( &df, Magma_DEV, dP.num_cols, 1, c_zero, queue )); // c = 0 CHECK( magma_cvinit( &dc, Magma_DEV, dM.num_cols, 1, c_zero, queue )); // v = r CHECK( magma_cmtransfer( dr, &dv, Magma_DEV, Magma_DEV, queue )); // lu = 0 CHECK( magma_cvinit( &dlu, Magma_DEV, dr.num_rows, 1, c_zero, queue )); //--------------START TIME--------------- // chronometry tempo1 = magma_sync_wtime( queue ); if ( solver_par->verbose > 0 ) { solver_par->timing[0] = 0.0; } om = MAGMA_C_ONE; gamma = MAGMA_C_ZERO; innerflag = 0; // start iteration do { solver_par->numiter++; // new RHS for small systems // f = P' r // Q1 magma_cgemvmdot_shfl( dP.num_rows, dP.num_cols, dP.dval, dr.dval, d1, d2, df.dval, queues[1] ); // skp[4] = f(k) // Q1 magma_cgetvector_async( 1, df.dval, 1, &hskp[4], 1, queues[1] ); // c(k:s) = f(k:s) // Q1 magma_ccopyvector_async( s, df.dval, 1, dc.dval, 1, queues[1] ); // c(k:s) = M(k:s,k:s) \ f(k:s) // Q1 magma_ctrsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, s, dM.dval, dM.ld, dc.dval, 1, queues[1] ); // shadow space loop for ( k = 0; k < s; ++k ) { sk = s - k; dGcol.dval = dG.dval + k * dG.ld; // v = r - G(:,k:s) c(k:s) // Q1 
magmablas_cgemv( MagmaNoTrans, dG.num_rows, sk, c_n_one, dGcol.dval, dG.ld, &dc.dval[k], 1, c_one, dv.dval, 1, queues[1] ); // preconditioning operation // v = L \ v; // v = U \ v; // Q1 CHECK( magma_c_applyprecond_left( MagmaNoTrans, A, dv, &dlu, precond_par, queues[1] )); CHECK( magma_c_applyprecond_right( MagmaNoTrans, A, dlu, &dv, precond_par, queues[1] )); // sync Q0 --> U(:,k) = U(:,k) - U(:,1:k) * alpha(1:k) magma_queue_sync( queues[0] ); // U(:,k) = om * v + U(:,k:s) c(k:s) // Q1 magmablas_cgemv( MagmaNoTrans, dU.num_rows, sk, c_one, &dU.dval[k*dU.ld], dU.ld, &dc.dval[k], 1, om, dv.dval, 1, queues[1] ); // G(:,k) = A U(:,k) // Q1 CHECK( magma_c_spmv( c_one, A, dv, c_zero, dGcol, queues[1] )); solver_par->spmv_count++; // bi-orthogonalize the new basis vectors for ( i = 0; i < k; ++i ) { // alpha = P(:,i)' G(:,k) // Q1 halpha[i] = magma_cdotc( dP.num_rows, &dP.dval[i*dP.ld], 1, dGcol.dval, 1, queues[1] ); // implicit sync Q1 --> alpha = P(:,i)' G(:,k) // alpha = alpha / M(i,i) halpha[i] = halpha[i] / hMdiag[i]; // G(:,k) = G(:,k) - alpha * G(:,i) // Q1 magma_caxpy( dG.num_rows, -halpha[i], &dG.dval[i*dG.ld], 1, dGcol.dval, 1, queues[1] ); } // sync Q1 --> compute new G, skp[4] = f(k magma_queue_sync( queues[1] ); // new column of M = P'G, first k-1 entries are zero // M(k:s,k) = P(:,k:s)' G(:,k) // Q2 magma_cgemvmdot_shfl( dP.num_rows, sk, &dP.dval[k*dP.ld], dGcol.dval, d1, d2, &dM.dval[k*dM.ld+k], queues[2] ); // U(:,k) = v // Q0 magma_ccopyvector_async( dU.num_rows, dv.dval, 1, &dU.dval[k*dU.ld], 1, queues[0] ); // non-first s iteration if ( k > 0 ) { // alpha = dalpha // Q0 magma_csetvector_async( k, halpha, 1, dalpha.dval, 1, queues[0] ); // U update outside of loop using GEMV // U(:,k) = U(:,k) - U(:,1:k) * alpha(1:k) // Q0 magmablas_cgemv( MagmaNoTrans, dU.num_rows, k, c_n_one, dU.dval, dU.ld, dalpha.dval, 1, c_one, &dU.dval[k*dU.ld], 1, queues[0] ); } // Mdiag(k) = M(k,k) // Q2 magma_cgetvector( 1, &dM.dval[k*dM.ld+k], 1, &hMdiag[k], 1, queues[2] ); 
// implicit sync Q2 --> Mdiag(k) = M(k,k) // check M(k,k) == 0 if ( MAGMA_C_EQUAL(hMdiag[k], MAGMA_C_ZERO) ) { innerflag = 1; info = MAGMA_DIVERGENCE; break; } // beta = f(k) / M(k,k) hbeta[k] = hskp[4] / hMdiag[k]; // check for nan if ( magma_c_isnan( hbeta[k] ) || magma_c_isinf( hbeta[k] )) { innerflag = 1; info = MAGMA_DIVERGENCE; break; } // non-last s iteration if ( (k + 1) < s ) { // f(k+1:s) = f(k+1:s) - beta * M(k+1:s,k) // Q1 magma_caxpy( sk-1, -hbeta[k], &dM.dval[k*dM.ld+(k+1)], 1, &df.dval[k+1], 1, queues[1] ); // c(k+1:s) = f(k+1:s) // Q1 magma_ccopyvector_async( sk-1, &df.dval[k+1], 1, &dc.dval[k+1], 1, queues[1] ); // c(k+1:s) = M(k+1:s,k+1:s) \ f(k+1:s) // Q1 magma_ctrsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, sk-1, &dM.dval[(k+1)*dM.ld+(k+1)], dM.ld, &dc.dval[k+1], 1, queues[1] ); // skp[4] = f(k+1) // Q1 magma_cgetvector_async( 1, &df.dval[k+1], 1, &hskp[4], 1, queues[1] ); } // r = r - beta * G(:,k) // Q2 magma_caxpy( dr.num_rows, -hbeta[k], dGcol.dval, 1, dr.dval, 1, queues[2] ); // smoothing disabled if ( smoothing <= 0 ) { // |r| // Q2 nrmr = magma_scnrm2( dr.num_rows, dr.dval, 1, queues[2] ); // implicit sync Q2 --> |r| // v = r // Q1 magma_ccopyvector_async( dr.num_rows, dr.dval, 1, dv.dval, 1, queues[1] ); // smoothing enabled } else { // x = x + beta * U(:,k) // Q0 magma_caxpy( x->num_rows, hbeta[k], &dU.dval[k*dU.ld], 1, x->dval, 1, queues[0] ); // smoothing operation //--------------------------------------- // t = rs - r // Q2 magma_cidr_smoothing_1( drs.num_rows, drs.num_cols, drs.dval, dr.dval, dtt.dval, queues[2] ); // t't // t'rs // Q2 CHECK( magma_cgemvmdot_shfl( dt.ld, 2, dtt.dval, dtt.dval, d1, d2, &dskp.dval[2], queues[2] )); // skp[2-3] = dskp[2-3] // Q2 magma_cgetvector( 2, &dskp.dval[2], 1, &hskp[2], 1, queues[2] ); // implicit sync Q2 --> skp = dskp // gamma = (t' * rs) / (t' * t) gamma = hskp[3] / hskp[2]; // xs = xs - gamma * (xs - x) // Q0 magma_cidr_smoothing_2( dxs.num_rows, dxs.num_cols, -gamma, x->dval, dxs.dval, 
queues[0] ); // v = r // Q1 magma_ccopyvector_async( dr.num_rows, dr.dval, 1, dv.dval, 1, queues[1] ); // rs = rs - gamma * t // Q2 magma_caxpy( drs.num_rows, -gamma, dtt.dval, 1, drs.dval, 1, queues[2] ); // |rs| // Q2 nrmr = magma_scnrm2( drs.num_rows, drs.dval, 1, queues[2] ); // implicit sync Q2 --> |r| //--------------------------------------- } // store current timing and residual if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter) % solver_par->verbose == 0 ) { solver_par->res_vec[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)nrmr; solver_par->timing[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)tempo2 - tempo1; } } // check convergence or iteration limit if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { s = k + 1; // for the x-update outside the loop innerflag = 2; info = MAGMA_SUCCESS; break; } } // smoothing disabled if ( smoothing <= 0 && innerflag != 1 ) { // dbeta(1:s) = beta(1:s) // Q0 magma_csetvector_async( s, hbeta, 1, dbeta.dval, 1, queues[0] ); // x = x + U(:,1:s) * beta(1:s) // Q0 magmablas_cgemv( MagmaNoTrans, dU.num_rows, s, c_one, dU.dval, dU.ld, dbeta.dval, 1, c_one, x->dval, 1, queues[0] ); } // check convergence or iteration limit or invalid result of inner loop if ( innerflag > 0 ) { break; } // preconditioning operation // v = L \ v; // v = U \ v; // Q2 CHECK( magma_c_applyprecond_left( MagmaNoTrans, A, dv, &dlu, precond_par, queues[2] )); CHECK( magma_c_applyprecond_right( MagmaNoTrans, A, dlu, &dv, precond_par, queues[2] )); // t = A v // Q2 CHECK( magma_c_spmv( c_one, A, dv, c_zero, dt, queues[2] )); solver_par->spmv_count++; // computation of a new omega //--------------------------------------- // t't // t'r // Q2 CHECK( magma_cgemvmdot_shfl( dt.ld, 2, dt.dval, dt.dval, d1, d2, dskp.dval, queues[2] )); // skp[0-2] = dskp[0-2] // Q2 magma_cgetvector( 2, dskp.dval, 1, hskp, 1, queues[2] ); // implicit sync Q2 --> skp = dskp // |t| nrmt = 
magma_ssqrt( MAGMA_C_REAL(hskp[0]) ); // rho = abs((t' * r) / (|t| * |r|)) rho = MAGMA_D_ABS( MAGMA_C_REAL(hskp[1]) / (nrmt * nrmr) ); // om = (t' * r) / (|t| * |t|) om = hskp[1] / hskp[0]; if ( rho < angle ) { om = (om * angle) / rho; } //--------------------------------------- if ( MAGMA_C_EQUAL(om, MAGMA_C_ZERO) ) { info = MAGMA_DIVERGENCE; break; } // sync Q1 --> v = r magma_queue_sync( queues[1] ); // r = r - om * t // Q2 magma_caxpy( dr.num_rows, -om, dt.dval, 1, dr.dval, 1, queues[2] ); // x = x + om * v // Q0 magma_caxpy( x->num_rows, om, dv.dval, 1, x->dval, 1, queues[0] ); // smoothing disabled if ( smoothing <= 0 ) { // |r| // Q2 nrmr = magma_scnrm2( dr.num_rows, dr.dval, 1, queues[2] ); // implicit sync Q2 --> |r| // v = r // Q1 magma_ccopyvector_async( dr.num_rows, dr.dval, 1, dv.dval, 1, queues[1] ); // smoothing enabled } else { // smoothing operation //--------------------------------------- // t = rs - r // Q2 magma_cidr_smoothing_1( drs.num_rows, drs.num_cols, drs.dval, dr.dval, dtt.dval, queues[2] ); // t't // t'rs // Q2 CHECK( magma_cgemvmdot_shfl( dt.ld, 2, dtt.dval, dtt.dval, d1, d2, &dskp.dval[2], queues[2] )); // skp[2-3] = dskp[2-3] // Q2 magma_cgetvector( 2, &dskp.dval[2], 1, &hskp[2], 1, queues[2] ); // implicit sync Q2 --> skp = dskp // gamma = (t' * rs) / (t' * t) gamma = hskp[3] / hskp[2]; // xs = xs - gamma * (xs - x) // Q0 magma_cidr_smoothing_2( dxs.num_rows, dxs.num_cols, -gamma, x->dval, dxs.dval, queues[0] ); // v = r // Q1 magma_ccopyvector_async( dr.num_rows, dr.dval, 1, dv.dval, 1, queues[1] ); // rs = rs - gamma * (rs - r) // Q2 magma_caxpy( drs.num_rows, -gamma, dtt.dval, 1, drs.dval, 1, queues[2] ); // |rs| // Q2 nrmr = magma_scnrm2( drs.num_rows, drs.dval, 1, queues[2] ); // implicit sync Q2 --> |r| //--------------------------------------- } // store current timing and residual if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); magma_queue_sync( queue ); if ( (solver_par->numiter) % solver_par->verbose 
== 0 ) { solver_par->res_vec[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)nrmr; solver_par->timing[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)tempo2 - tempo1; } } // check convergence or iteration limit if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { info = MAGMA_SUCCESS; break; } } while ( solver_par->numiter + 1 <= solver_par->maxiter ); // sync all queues for ( q = 0; q < nqueues; q++ ) { magma_queue_sync( queues[q] ); } // smoothing enabled if ( smoothing > 0 ) { // x = xs magma_ccopyvector_async( x->num_rows, dxs.dval, 1, x->dval, 1, queue ); // r = rs magma_ccopyvector_async( dr.num_rows, drs.dval, 1, dr.dval, 1, queue ); } // get last iteration timing tempo2 = magma_sync_wtime( queue ); magma_queue_sync( queue ); solver_par->runtime = (real_Double_t)tempo2 - tempo1; //--------------STOP TIME---------------- // get final stats solver_par->iter_res = nrmr; CHECK( magma_cresidualvec( A, b, *x, &dr, &residual, queue )); solver_par->final_res = residual; // set solver conclusion if ( info != MAGMA_SUCCESS && info != MAGMA_DIVERGENCE ) { if ( solver_par->init_res > solver_par->final_res ) { info = MAGMA_SLOW_CONVERGENCE; } } cleanup: // free resources // sync all queues, destory additional queues magma_queue_sync( queues[0] ); for ( q = 1; q < nqueues; q++ ) { magma_queue_sync( queues[q] ); magma_queue_destroy( queues[q] ); } // smoothing enabled if ( smoothing > 0 ) { drs.dval = NULL; // needed because its pointer is redirected to dtt magma_cmfree( &dxs, queue ); magma_cmfree( &drs, queue ); magma_cmfree( &dtt, queue ); } dr.dval = NULL; // needed because its pointer is redirected to dt dGcol.dval = NULL; // needed because its pointer is redirected to dG magma_cmfree( &dr, queue ); magma_cmfree( &dP, queue ); magma_cmfree( &dP1, queue ); magma_cmfree( &dG, queue ); magma_cmfree( &dGcol, queue ); magma_cmfree( &dU, queue ); magma_cmfree( &dM, queue ); magma_cmfree( &df, queue ); magma_cmfree( &dt, queue ); 
magma_cmfree( &dc, queue ); magma_cmfree( &dv, queue ); magma_cmfree( &dlu, queue ); magma_cmfree( &dskp, queue ); magma_cmfree( &dalpha, queue ); magma_cmfree( &dbeta, queue ); magma_free_pinned( hMdiag ); magma_free_pinned( hskp ); magma_free_pinned( halpha ); magma_free_pinned( hbeta ); magma_free( d1 ); magma_free( d2 ); solver_par->info = info; return info; /* magma_cpidr_strms */ }
/**
    Purpose
    -------
    Iterative solver for A * x = b, registered in solver_par as Magma_TFQMR.
    x holds the initial guess on entry and is updated in place.

    Each pass of the loop updates w, d, Ad, x and r and performs one SpMV.
    Odd passes (numiter % 2 == 1) compute a new alpha and
    u_mp1 = u_m - alpha*v; even passes compute a new rho/beta,
    u_mp1 = w + beta*u_m, and refresh v.

    The loop stops when res/nomb <= rtol or res <= atol, or after maxiter
    passes. nomb is ||b||, replaced by 1 when ||b|| == 0.

    Arguments
    ---------
    @param[in]
    A           magma_c_matrix
                system matrix, applied through magma_c_spmv

    @param[in]
    b           magma_c_matrix
                right-hand side

    @param[in,out]
    x           magma_c_matrix*
                initial guess on entry, last iterate on exit

    @param[in,out]
    solver_par  magma_c_solver_par*
                reads rtol, atol, maxiter, verbose; receives solver, numiter,
                spmv_count, init_res, iter_res, final_res, runtime, info and,
                when verbose > 0, samples in res_vec / timing

    @param[in]
    queue       magma_queue_t
                queue handed to every MAGMA call made here

    @return MAGMA_SUCCESS, MAGMA_SLOW_CONVERGENCE or MAGMA_DIVERGENCE; a
            failing CHECK() presumably stores its status in info and jumps to
            cleanup (macro defined outside this block).
*/
extern "C" magma_int_t
magma_ctfqmr(
    magma_c_matrix A, magma_c_matrix b, magma_c_matrix *x,
    magma_c_solver_par *solver_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;

    // prepare solver feedback
    solver_par->solver = Magma_TFQMR;
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;

    // local variables
    magmaFloatComplex c_zero = MAGMA_C_ZERO, c_one = MAGMA_C_ONE;

    // solver variables; declared up front so that no goto crosses a declaration
    float nom0, r0, res = 0.0, nomb, residual;
    real_Double_t tempo1, tempo2;
    magmaFloatComplex rho = c_one, rho_l = c_one, eta = c_zero, c = c_zero,
                      theta = c_zero, tau = c_zero, alpha = c_one,
                      beta = c_zero, sigma = c_zero;

    magma_int_t dofs = A.num_rows * b.num_cols;

    // GPU workspace
    magma_c_matrix r={Magma_CSR}, r_tld={Magma_CSR}, pu_m={Magma_CSR},
                   d={Magma_CSR}, w={Magma_CSR}, v={Magma_CSR},
                   u_mp1={Magma_CSR}, u_m={Magma_CSR}, Au={Magma_CSR},
                   Ad={Magma_CSR}, Au_new={Magma_CSR};
    CHECK( magma_cvinit( &r,      Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &u_mp1,  Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &r_tld,  Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &u_m,    Magma_DEV, A.num_rows, b.num_cols, c_one,  queue ));
    CHECK( magma_cvinit( &pu_m,   Magma_DEV, A.num_rows, b.num_cols, c_one,  queue ));
    CHECK( magma_cvinit( &v,      Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &d,      Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &w,      Magma_DEV, A.num_rows, b.num_cols, c_one,  queue ));
    CHECK( magma_cvinit( &Ad,     Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &Au_new, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &Au,     Magma_DEV, A.num_rows, b.num_cols, c_one,  queue ));

    // solver setup: r = residual of the initial guess, nom0 = its norm
    CHECK( magma_cresidualvec( A, b, *x, &r, &nom0, queue ));
    solver_par->init_res = nom0;
    magma_ccopy( dofs, r.dval, 1, r_tld.dval, 1, queue );       // r_tld = r
    magma_ccopy( dofs, r.dval, 1, w.dval, 1, queue );           // w = r
    magma_ccopy( dofs, r.dval, 1, u_m.dval, 1, queue );         // u_m = r
    magma_ccopy( dofs, u_m.dval, 1, pu_m.dval, 1, queue );      // pu_m = u_m
    CHECK( magma_c_spmv( c_one, A, pu_m, c_zero, v, queue ));   // v = A u
    magma_ccopy( dofs, v.dval, 1, Au.dval, 1, queue );          // Au = v

    nomb = magma_scnrm2( dofs, b.dval, 1, queue );
    if ( nomb == 0.0 ) {
        nomb = 1.0;     // keeps res/nomb finite for a zero right-hand side
    }
    if ( (r0 = nomb * solver_par->rtol) < ATOLERANCE ) {
        r0 = ATOLERANCE;
    }
    solver_par->final_res = solver_par->init_res;
    solver_par->iter_res = solver_par->init_res;
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = (real_Double_t)nom0;
        solver_par->timing[0] = 0.0;
    }
    if ( nom0 < r0 ) {
        // initial guess already good enough
        info = MAGMA_SUCCESS;
        goto cleanup;
    }

    // one reduction serves both quantities: rho = <r, r_tld>, tau = sqrt(rho)
    rho = magma_cdotc( dofs, r.dval, 1, r_tld.dval, 1, queue );
    tau = magma_csqrt( rho );
    rho_l = rho;

    // Chronometry
    tempo1 = magma_sync_wtime( queue );

    solver_par->numiter = 0;
    solver_par->spmv_count = 0;
    // start iteration
    do
    {
        solver_par->numiter++;
        if ( solver_par->numiter%2 == 1 ) {
            alpha = rho / magma_cdotc( dofs, v.dval, 1, r_tld.dval, 1, queue );
            magma_ccopy( dofs, u_m.dval, 1, u_mp1.dval, 1, queue );
            magma_caxpy( dofs, -alpha, v.dval, 1, u_mp1.dval, 1, queue );   // u_mp1 = u_m - alpha*v
        }
        magma_caxpy( dofs, -alpha, Au.dval, 1, w.dval, 1, queue );          // w = w - alpha*Au
        sigma = theta * theta / alpha * eta;
        magma_cscal( dofs, sigma, d.dval, 1, queue );
        magma_caxpy( dofs, c_one, pu_m.dval, 1, d.dval, 1, queue );         // d = pu_m + sigma*d
        magma_cscal( dofs, sigma, Ad.dval, 1, queue );
        magma_caxpy( dofs, c_one, Au.dval, 1, Ad.dval, 1, queue );          // Ad = Au + sigma*Ad

        theta = magma_csqrt( magma_cdotc(dofs, w.dval, 1, w.dval, 1, queue) ) / tau;
        c = c_one / magma_csqrt( c_one + theta*theta );
        tau = tau * theta *c;
        eta = c * c * alpha;

        magma_caxpy( dofs, eta, d.dval, 1, x->dval, 1, queue );             // x = x + eta * d
        magma_caxpy( dofs, -eta, Ad.dval, 1, r.dval, 1, queue );            // r = r - eta * Ad
        res = magma_scnrm2( dofs, r.dval, 1, queue );

        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        if ( res/nomb <= solver_par->rtol || res <= solver_par->atol ) {
            info = MAGMA_SUCCESS;
            break;
        }

        if ( solver_par->numiter%2 == 0 ) {
            rho = magma_cdotc( dofs, w.dval, 1, r_tld.dval, 1, queue );
            beta = rho / rho_l;
            rho_l = rho;
            magma_ccopy( dofs, w.dval, 1, u_mp1.dval, 1, queue );
            magma_caxpy( dofs, beta, u_m.dval, 1, u_mp1.dval, 1, queue );   // u_mp1 = w + beta*u_m
        }

        magma_ccopy( dofs, u_mp1.dval, 1, pu_m.dval, 1, queue );
        CHECK( magma_c_spmv( c_one, A, pu_m, c_zero, Au_new, queue ));      // Au_new = A pu_m
        solver_par->spmv_count++;
        if ( solver_par->numiter%2 == 0 ) {
            magma_cscal( dofs, beta*beta, v.dval, 1, queue );
            magma_caxpy( dofs, beta, Au.dval, 1, v.dval, 1, queue );
            magma_caxpy( dofs, c_one, Au_new.dval, 1, v.dval, 1, queue );   // v = Au_new + beta*(Au+beta*v)
        }
        magma_ccopy( dofs, Au_new.dval, 1, Au.dval, 1, queue );
        magma_ccopy( dofs, u_mp1.dval, 1, u_m.dval, 1, queue );
    }
    while ( solver_par->numiter+1 <= solver_par->maxiter );

    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;

    // true residual of the returned x (r is reused as scratch)
    CHECK( magma_cresidualvec( A, b, *x, &r, &residual, queue ));
    solver_par->iter_res = res;
    solver_par->final_res = residual;

    if ( solver_par->numiter < solver_par->maxiter ) {
        info = MAGMA_SUCCESS;
    }
    else {
        // iteration budget exhausted: record the last sample, then classify
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        if ( solver_par->init_res > solver_par->final_res ) {
            info = MAGMA_SLOW_CONVERGENCE;
            if ( solver_par->iter_res < solver_par->rtol*solver_par->init_res ||
                 solver_par->iter_res < solver_par->atol ) {
                info = MAGMA_SUCCESS;
            }
        }
        else {
            info = MAGMA_DIVERGENCE;
        }
    }

cleanup:
    // every workspace vector is released exactly once
    magma_cmfree( &r, queue );
    magma_cmfree( &r_tld, queue );
    magma_cmfree( &d, queue );
    magma_cmfree( &w, queue );
    magma_cmfree( &v, queue );
    magma_cmfree( &pu_m, queue );
    magma_cmfree( &u_m, queue );
    magma_cmfree( &u_mp1, queue );
    magma_cmfree( &Au, queue );
    magma_cmfree( &Au_new, queue );
    magma_cmfree( &Ad, queue );

    solver_par->info = info;
    return info;
}   /* magma_ctfqmr */
/***************************************************************************//**
    Purpose
    -------
    CLAQPS computes a step of QR factorization with column pivoting
    of a complex M-by-N matrix A by using Blas-3.  It tries to factorize
    NB columns from A starting from the row OFFSET+1, and updates all
    of the matrix with Blas-3 xGEMM.

    In some cases, due to catastrophic cancellations, it cannot
    factorize NB columns.  Hence, the actual number of factorized
    columns is returned in KB.

    Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized.

    The routine creates its own queue on the current device, uses it for all
    transfers and GPU BLAS calls, and destroys it before returning.
    It always returns MAGMA_SUCCESS.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A. N >= 0

    @param[in]
    offset  INTEGER
            The number of rows of A that have been factorized in
            previous steps.

    @param[in]
    nb      INTEGER
            The number of columns to factorize.

    @param[out]
    kb      INTEGER
            The number of columns actually factorized.

    @param[in,out]
    A       COMPLEX array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, block A(OFFSET+1:M,1:KB) is the triangular
            factor obtained and block A(1:OFFSET,1:N) has been
            accordingly pivoted, but not factorized.
            The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has
            been updated.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A. LDA >= max(1,M).

    @param[in,out]
    dA      COMPLEX array, dimension (LDDA,N)
            Copy of A on the GPU.
            Portions of  A are updated on the CPU;
            portions of dA are updated on the GPU. See code for details.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA. LDDA >= max(1,M).

    @param[in,out]
    jpvt    INTEGER array, dimension (N)
            JPVT(I) = K <==> Column K of the full matrix A has been
            permuted into position I in AP.

    @param[out]
    tau     COMPLEX array, dimension (KB)
            The scalar factors of the elementary reflectors.

    @param[in,out]
    vn1     REAL array, dimension (N)
            The vector with the partial column norms.

    @param[in,out]
    vn2     REAL array, dimension (N)
            The vector with the exact column norms.

    @param[in,out]
    auxv    COMPLEX array, dimension (NB)
            Auxiliary vector.

    @param[in,out]
    F       COMPLEX array, dimension (LDF,NB)
            Matrix F' = L*Y'*A.

    @param[in]
    ldf     INTEGER
            The leading dimension of the array F. LDF >= max(1,N).

    @param[in,out]
    dF      COMPLEX array, dimension (LDDF,NB)
            Copy of F on the GPU. See code for details.

    @param[in]
    lddf    INTEGER
            The leading dimension of the array dF. LDDF >= max(1,N).

    @ingroup magma_laqps
*******************************************************************************/
extern "C" magma_int_t
magma_claqps(
    magma_int_t m, magma_int_t n, magma_int_t offset,
    magma_int_t nb, magma_int_t *kb,
    magmaFloatComplex *A, magma_int_t lda,
    magmaFloatComplex_ptr dA, magma_int_t ldda,
    magma_int_t *jpvt, magmaFloatComplex *tau, float *vn1, float *vn2,
    magmaFloatComplex *auxv,
    magmaFloatComplex *F, magma_int_t ldf,
    magmaFloatComplex_ptr dF, magma_int_t lddf)
{
/* Column-major element addressing; each macro yields a pointer, 0-based (i,j). */
#define A(i, j) (A + (i) + (j)*(lda ))
#define dA(i, j) (dA + (i) + (j)*(ldda))
#define F(i, j) (F + (i) + (j)*(ldf ))
#define dF(i, j) (dF + (i) + (j)*(lddf))

    magmaFloatComplex c_zero = MAGMA_C_MAKE( 0.,0.);
    magmaFloatComplex c_one = MAGMA_C_MAKE( 1.,0.);
    magmaFloatComplex c_neg_one = MAGMA_C_MAKE(-1.,0.);
    magma_int_t ione = 1;

    /* scratch variables reused as by-reference arguments of the Fortran BLAS calls */
    magma_int_t i__1, i__2;
    float d__1;
    magmaFloatComplex z__1;

    magma_int_t j, k, rk;
    magmaFloatComplex Akk;
    magma_int_t pvt;
    float temp, temp2, tol3z;
    magma_int_t itemp;

    /* head of a linked list of columns whose norm must be recomputed; the
       next index is stored in vn2[]. 0 means the list is empty. */
    magma_int_t lsticc;
    magma_int_t lastrk;

    lastrk = min( m, n + offset );
    tol3z = magma_ssqrt( lapackf77_slamch("Epsilon"));

    /* private queue on the current device, destroyed before returning */
    magma_queue_t queue;
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );

    lsticc = 0;
    k = 0;
    /* factor one column per pass; stop early as soon as some column has been
       flagged for norm recomputation (lsticc != 0) */
    while( k < nb && lsticc == 0 ) {
        rk = offset + k;

        /* Determine ith pivot column and swap if necessary */
        // subtract 1 from Fortran isamax; pvt, k are 0-based.
        i__1 = n-k;
        pvt = k + blasf77_isamax( &i__1, &vn1[k], &ione ) - 1;

        if (pvt != k) {
            if (pvt >= nb) {
                /* 1. Start copy from GPU */
                magma_cgetmatrix_async( m - offset - nb, 1,
                                        dA(offset + nb, pvt), ldda,
                                        A (offset + nb, pvt), lda, queue );
            }

            /* F gets swapped so F must be sent at the end to GPU */
            i__1 = k;
            blasf77_cswap( &i__1, F(pvt,0), &ldf, F(k,0), &ldf );
            itemp = jpvt[pvt];
            jpvt[pvt] = jpvt[k];
            jpvt[k] = itemp;
            /* only the norms of column k are moved to slot pvt; slot k is not rewritten here */
            vn1[pvt] = vn1[k];
            vn2[pvt] = vn2[k];

            if (pvt < nb) {
                /* no need of transfer if pivot is within the panel */
                blasf77_cswap( &m, A(0, pvt), &ione, A(0, k), &ione );
            }
            else {
                /* 1. Finish copy from GPU */
                magma_queue_sync( queue );

                /* 2. Swap as usual on CPU */
                blasf77_cswap(&m, A(0, pvt), &ione, A(0, k), &ione);

                /* 3. Restore the GPU */
                magma_csetmatrix_async( m - offset - nb, 1,
                                        A (offset + nb, pvt), lda,
                                        dA(offset + nb, pvt), ldda, queue );
            }
        }

        /* Apply previous Householder reflectors to column K:
           A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'.
           Optimization: multiply with beta=0; wait for vector and subtract */
        if (k > 0) {
            /* row k of F is conjugated in place for the gemv, then restored below */
            #ifdef COMPLEX
            for (j = 0; j < k; ++j) {
                *F(k,j) = MAGMA_C_CONJ( *F(k,j) );
            }
            #endif

            i__1 = m - rk;
            i__2 = k;
            blasf77_cgemv( MagmaNoTransStr, &i__1, &i__2,
                           &c_neg_one, A(rk, 0), &lda,
                                       F(k, 0), &ldf,
                           &c_one, A(rk, k), &ione );

            #ifdef COMPLEX
            for (j = 0; j < k; ++j) {
                *F(k,j) = MAGMA_C_CONJ( *F(k,j) );
            }
            #endif
        }

        /* Generate elementary reflector H(k). */
        if (rk < m-1) {
            i__1 = m - rk;
            lapackf77_clarfg( &i__1, A(rk, k), A(rk + 1, k), &ione, &tau[k] );
        } else {
            lapackf77_clarfg( &ione, A(rk, k), A(rk, k), &ione, &tau[k] );
        }

        /* save the diagonal entry and put 1 there for the products below;
           it is restored at the end of this pass */
        Akk = *A(rk, k);
        *A(rk, k) = c_one;

        /* Compute Kth column of F:
           Compute F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */
        if (k < n-1) {
            i__1 = m - rk;
            i__2 = n - k - 1;

            /* Send the vector to the GPU */
            magma_csetmatrix( i__1, 1, A(rk, k), lda, dA(rk,k), ldda, queue );

            /* Multiply on GPU */
            // was CALL CGEMV( 'Conjugate transpose', M-RK+1, N-K,
            //                 TAU( K ), A( RK, K+1 ), LDA,
            //                           A( RK, K ), 1,
            //                 CZERO,    F( K+1, K ), 1 )
            /* The product is split three ways: the GPU gemv skips the first
               i__5 rows and first i__3 columns; the first CPU gemv covers the
               first i__3 columns; after the sync, the second CPU gemv adds the
               first i__5 rows of the remaining i__4 columns (beta = 1) onto
               the result fetched from the GPU. */
            magma_int_t i__3 = nb-k-1;
            magma_int_t i__4 = i__2 - i__3;
            magma_int_t i__5 = nb-k;
            magma_cgemv( MagmaConjTrans, i__1 - i__5, i__2 - i__3,
                         tau[k], dA(rk +i__5, k+1+i__3), ldda,
                                 dA(rk +i__5, k ), ione,
                         c_zero, dF(k+1+i__3, k ), ione, queue );

            magma_cgetmatrix_async( i__2-i__3, 1,
                                    dF(k + 1 +i__3, k), i__2,
                                    F (k + 1 +i__3, k), i__2, queue );

            blasf77_cgemv( MagmaConjTransStr, &i__1, &i__3,
                           &tau[k], A(rk, k+1), &lda,
                                    A(rk, k ), &ione,
                           &c_zero, F(k+1, k ), &ione );

            /* wait for the GPU part of F(:,k) before accumulating into it */
            magma_queue_sync( queue );
            blasf77_cgemv( MagmaConjTransStr, &i__5, &i__4,
                           &tau[k], A(rk, k+1+i__3), &lda,
                                    A(rk, k ), &ione,
                           &c_one, F(k+1+i__3, k ), &ione );
        }

        /* Padding F(1:K,K) with zeros. */
        for (j = 0; j < k; ++j) {
            *F(j, k) = c_zero;
        }

        /* Incremental updating of F:
           F(1:N,K) := F(1:N,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K). */
        if (k > 0) {
            i__1 = m - rk;
            i__2 = k;
            z__1 = MAGMA_C_NEGATE( tau[k] );
            blasf77_cgemv( MagmaConjTransStr, &i__1, &i__2,
                           &z__1, A(rk, 0), &lda,
                                  A(rk, k), &ione,
                           &c_zero, auxv, &ione );

            i__1 = k;
            blasf77_cgemv( MagmaNoTransStr, &n, &i__1,
                           &c_one, F(0,0), &ldf,
                                   auxv, &ione,
                           &c_one, F(0,k), &ione );
        }

        /* Optimization: On the last iteration start sending F back to the GPU */

        /* Update the current row of A:
           A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'. */
        if (k < n-1) {
            i__1 = n - k - 1;
            i__2 = k + 1;
            blasf77_cgemm( MagmaNoTransStr, MagmaConjTransStr, &ione, &i__1, &i__2,
                           &c_neg_one, A(rk, 0 ), &lda,
                                       F(k+1,0 ), &ldf,
                           &c_one, A(rk, k+1), &lda );
        }

        /* Update partial column norms. */
        if (rk < lastrk) {
            for (j = k + 1; j < n; ++j) {
                if (vn1[j] != 0.) {
                    /* NOTE: The following 4 lines follow from the analysis in
                       Lapack Working Note 176. */
                    temp = MAGMA_C_ABS( *A(rk,j) ) / vn1[j];
                    temp = max( 0., ((1. + temp) * (1. - temp)) );

                    d__1 = vn1[j] / vn2[j];
                    temp2 = temp * (d__1 * d__1);

                    if (temp2 <= tol3z) {
                        /* downdate not trusted: push column j onto the
                           recompute list (previous head stored in vn2[j]) */
                        vn2[j] = (float) lsticc;
                        lsticc = j;
                    } else {
                        vn1[j] *= magma_ssqrt(temp);
                    }
                }
            }
        }

        *A(rk, k) = Akk;

        ++k;
    }
    // leave k as the last column done
    --k;
    *kb = k + 1;
    rk = offset + *kb - 1;

    /* Apply the block reflector to the rest of the matrix:
       A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) - A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)' */
    if (*kb < min(n, m - offset)) {
        i__1 = m - rk - 1;
        i__2 = n - *kb;

        /* Send F to the GPU */
        /* note: dF is written with leading dimension i__2 here, not lddf */
        magma_csetmatrix( i__2, *kb,
                          F (*kb, 0), ldf,
                          dF(*kb, 0), i__2, queue );

        magma_cgemm( MagmaNoTrans, MagmaConjTrans, i__1, i__2, *kb,
                     c_neg_one, dA(rk+1, 0 ), ldda,
                                dF(*kb, 0 ), i__2,
                     c_one, dA(rk+1, *kb), ldda, queue );
    }

    /* Recomputation of difficult columns. */
    while( lsticc > 0 ) {
        /* vn2[lsticc] holds the next list index as a float; round it back to an integer */
        itemp = (magma_int_t)(vn2[lsticc] >= 0. ? floor(vn2[lsticc] + .5) : -floor(.5 - vn2[lsticc]));
        i__1 = m - rk - 1;
        if (lsticc <= nb) {
            vn1[lsticc] = magma_cblas_scnrm2( i__1, A(rk+1,lsticc), ione );
        }
        else {
            /* Where is the data, CPU or GPU ? */
            /* The norm is assembled from a CPU part (r1) and a GPU part (r2).
               NOTE(review): r1 spans nb-k rows from rk+1 and r2 spans
               m-offset-nb rows from offset+nb+1; with rk = offset+k that is
               m-offset-k rows in total, one more than i__1 = m-rk-1, and the
               last row index reached is m. Confirm the intended split. */
            float r1, r2;

            r1 = magma_cblas_scnrm2( nb-k, A(rk+1,lsticc), ione );
            r2 = magma_scnrm2( m-offset-nb, dA(offset + nb + 1, lsticc), ione, queue );

            //vn1[lsticc] = magma_scnrm2( i__1, dA(rk + 1, lsticc), ione, queue );
            vn1[lsticc] = magma_ssqrt(r1*r1 + r2*r2);
        }

        /* NOTE: The computation of VN1( LSTICC ) relies on the fact that
           SNRM2 does not fail on vectors with norm below the value of SQRT(SLAMCH('S')) */
        vn2[lsticc] = vn1[lsticc];
        lsticc = itemp;
    }

    magma_queue_destroy( queue );

    return MAGMA_SUCCESS;
} /* magma_claqps */
/**
    Purpose
    -------
    Iterative solver for A * x = b, registered in solver_par as
    Magma_BICGSTABMERGE. x holds the initial guess on entry and is updated in
    place.

    The six work vectors rr|r|p|v|s|t are slices of one device buffer q.
    The scalar parameters live in the 8-entry device array
    skp = [alpha|beta|omega|rho_old|rho|nom|tmp1|tmp2]; the host only reads
    back skp[5], whose square root is used as the residual norm estimate.

    The loop stops when betanom < atol or betanom/nomb < rtol, or after
    maxiter passes. nomb is ||b||, replaced by 1 when ||b|| == 0.

    Arguments
    ---------
    @param[in]
    A           magma_c_matrix
                system matrix, applied through magma_c_spmv

    @param[in]
    b           magma_c_matrix
                right-hand side

    @param[in,out]
    x           magma_c_matrix*
                initial guess on entry, last iterate on exit

    @param[in,out]
    solver_par  magma_c_solver_par*
                reads rtol, atol, maxiter, verbose; receives solver, numiter,
                spmv_count, init_res, iter_res, final_res, runtime, info and,
                when verbose > 0, samples in res_vec / timing

    @param[in]
    queue       magma_queue_t
                queue handed to every MAGMA call made here

    @return MAGMA_SUCCESS, MAGMA_SLOW_CONVERGENCE or MAGMA_DIVERGENCE; a
            failing CHECK() presumably stores its status in info and jumps to
            cleanup (macro defined outside this block).
*/
extern "C" magma_int_t
magma_cbicgstab_merge3(
    magma_c_matrix A, magma_c_matrix b, magma_c_matrix *x,
    magma_c_solver_par *solver_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;

    // prepare solver feedback
    solver_par->solver = Magma_BICGSTABMERGE;
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;

    // solver variables; declared up front so that no goto crosses a declaration
    magmaFloatComplex alpha, beta, omega, rho_old, rho_new, *skp_h = NULL;
    float nom, nom0, betanom, nomb, residual;
    real_Double_t tempo1, tempo2;

    // some useful variables
    magmaFloatComplex c_zero = MAGMA_C_ZERO, c_one = MAGMA_C_ONE;

    magma_int_t dofs = A.num_rows;

    // workspace
    magma_c_matrix q={Magma_CSR}, r={Magma_CSR}, rr={Magma_CSR}, p={Magma_CSR},
                   v={Magma_CSR}, s={Magma_CSR}, t={Magma_CSR};
    magmaFloatComplex *d1=NULL, *d2=NULL, *skp=NULL;
    CHECK( magma_cmalloc( &d1, dofs*(2) ));
    CHECK( magma_cmalloc( &d2, dofs*(2) ));
    // array for the parameters
    CHECK( magma_cmalloc( &skp, 8 ));
    // skp = [alpha|beta|omega|rho_old|rho|nom|tmp1|tmp2]
    CHECK( magma_cvinit( &q, Magma_DEV, dofs*6, 1, c_zero, queue ));
    // q = rr|r|p|v|s|t

    // the vectors below own no memory: their dval aliases a slice of q
    rr.memory_location = Magma_DEV; rr.dval = NULL; rr.num_rows = rr.nnz = dofs;
    rr.num_cols = 1; rr.storage_type = Magma_DENSE;
    r.memory_location = Magma_DEV; r.dval = NULL; r.num_rows = r.nnz = dofs;
    r.num_cols = 1; r.storage_type = Magma_DENSE;
    p.memory_location = Magma_DEV; p.dval = NULL; p.num_rows = p.nnz = dofs;
    p.num_cols = 1; p.storage_type = Magma_DENSE;
    v.memory_location = Magma_DEV; v.dval = NULL; v.num_rows = v.nnz = dofs;
    v.num_cols = 1; v.storage_type = Magma_DENSE;
    s.memory_location = Magma_DEV; s.dval = NULL; s.num_rows = s.nnz = dofs;
    s.num_cols = 1; s.storage_type = Magma_DENSE;
    t.memory_location = Magma_DEV; t.dval = NULL; t.num_rows = t.nnz = dofs;
    t.num_cols = 1; t.storage_type = Magma_DENSE;

    rr.dval = q(0);
    r.dval = q(1);
    p.dval = q(2);
    v.dval = q(3);
    s.dval = q(4);
    t.dval = q(5);

    // solver setup
    CHECK( magma_cresidualvec( A, b, *x, &r, &nom0, queue ));
    magma_ccopy( dofs, r.dval, 1, q(0), 1, queue );                 // rr = r
    magma_ccopy( dofs, r.dval, 1, q(1), 1, queue );                 // q = r (r.dval is q(1))
    betanom = nom0;
    nom = nom0*nom0;
    rho_new = magma_cdotc( dofs, r.dval, 1, r.dval, 1, queue );     // rho=<rr,r>
    rho_old = omega = alpha = MAGMA_C_MAKE( 1.0, 0. );
    beta = rho_new;
    solver_par->init_res = nom0;
    // array on host for the parameters
    CHECK( magma_cmalloc_cpu( &skp_h, 8 ));

    nomb = magma_scnrm2( dofs, b.dval, 1, queue );
    if ( nomb == 0.0 ) {
        // keeps the relative tests below finite for a zero right-hand side;
        // nomb must not be recomputed after this point
        nomb = 1.0;
    }
    solver_par->final_res = solver_par->init_res;
    solver_par->iter_res = solver_par->init_res;
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = nom0;
        solver_par->timing[0] = 0.0;
    }

    skp_h[0]=alpha;
    skp_h[1]=beta;
    skp_h[2]=omega;
    skp_h[3]=rho_old;
    skp_h[4]=rho_new;
    skp_h[5]=MAGMA_C_MAKE(nom, 0.0);
    // tmp1/tmp2: give them a defined value before the upload
    skp_h[6]=c_zero;
    skp_h[7]=c_zero;
    magma_csetvector( 8, skp_h, 1, skp, 1, queue );

    CHECK( magma_c_spmv( c_one, A, r, c_zero, v, queue ));          // z = A r

    if ( nom0 < solver_par->atol ||
         nom0/nomb < solver_par->rtol ) {
        // initial guess already good enough
        info = MAGMA_SUCCESS;
        goto cleanup;
    }

    // Chronometry
    tempo1 = magma_sync_wtime( queue );

    solver_par->numiter = 0;
    solver_par->spmv_count = 0;
    // start iteration
    do
    {
        solver_par->numiter++;

        // computes p=r+beta*(p-omega*v)
        CHECK( magma_cbicgmerge1( dofs, skp, v.dval, r.dval, p.dval, queue ));

        CHECK( magma_c_spmv( c_one, A, p, c_zero, v, queue ));      // v = Ap
        solver_par->spmv_count++;
        CHECK( magma_cmdotc( dofs, 1, q.dval, v.dval, d1, d2, skp, queue ));
        CHECK( magma_cbicgmerge4( 1, skp, queue ));
        CHECK( magma_cbicgmerge2( dofs, skp, r.dval, v.dval, s.dval, queue )); // s=r-alpha*v

        CHECK( magma_c_spmv( c_one, A, s, c_zero, t, queue ));      // t=As
        solver_par->spmv_count++;
        CHECK( magma_cmdotc( dofs, 2, q.dval+4*dofs, t.dval, d1, d2, skp+6, queue ));
        CHECK( magma_cbicgmerge4( 2, skp, queue ));

        CHECK( magma_cbicgmerge_xrbeta( dofs, d1, d2, q.dval, r.dval, p.dval,
                                        s.dval, t.dval, x->dval, skp, queue ));

        // check stopping criterion: the value is consumed on the host right
        // away, so the transfer has to be the blocking variant
        magma_cgetvector( 1, skp+5, 1, skp_h+5, 1, queue );
        betanom = sqrt(MAGMA_C_REAL(skp_h[5]));

        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        if ( betanom < solver_par->atol || betanom/nomb < solver_par->rtol ) {
            break;
        }
    }
    while ( solver_par->numiter+1 <= solver_par->maxiter );

    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;

    // true residual of the returned x (r is reused as scratch)
    CHECK( magma_cresidualvec( A, b, *x, &r, &residual, queue ));
    solver_par->iter_res = betanom;
    solver_par->final_res = residual;

    if ( solver_par->numiter < solver_par->maxiter ) {
        info = MAGMA_SUCCESS;
    }
    else {
        // iteration budget exhausted: record the last sample, then classify
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        if ( solver_par->init_res > solver_par->final_res ) {
            info = MAGMA_SLOW_CONVERGENCE;
            if ( solver_par->iter_res < solver_par->atol ||
                 solver_par->iter_res/solver_par->init_res < solver_par->rtol ) {
                info = MAGMA_SUCCESS;
            }
        }
        else {
            info = MAGMA_DIVERGENCE;
        }
    }

cleanup:
    magma_cmfree( &q, queue );  // frees all vectors
    magma_free( d1 );
    magma_free( d2 );
    magma_free( skp );
    magma_free_cpu( skp_h );

    solver_par->info = info;
    return info;
}   /* cbicgstab_merge */
/**
    Purpose
    -------
    Iterative-refinement solver for A * x = b, registered in solver_par as
    Magma_ITERREF. The incoming x is only used to evaluate an initial
    residual; it is then reset to zero and the iteration starts from r = b.

    Each pass scales r by 1/nom, calls magma_c_precond as the inner solver
    (A * z = r), scales z back by nom, adds z to x, and recomputes
    r = b - A x and nom = ||r||. The loop stops when nom < r0, where
    r0 = max( ||b||^2 * epsilon, ATOLERANCE ), or when numiter reaches maxiter.

    The dense BLAS calls here take no queue argument, so the global kernel
    stream is pointed at `queue` for the duration of the call and restored
    on every exit path.

    Arguments
    ---------
    @param[in]
    A            magma_c_matrix
                 system matrix

    @param[in]
    b            magma_c_matrix
                 right-hand side

    @param[in,out]
    x            magma_c_matrix*
                 overwritten with the computed solution

    @param[in,out]
    solver_par   magma_c_solver_par*
                 reads epsilon, maxiter, verbose; receives solver, numiter,
                 init_res, iter_res, final_res, runtime, info and, when
                 verbose > 0, samples in res_vec / timing

    @param[in]
    precond_par  magma_c_preconditioner*
                 passed through to magma_c_precond (the inner solver)

    @param[in]
    queue        magma_queue_t
                 queue to execute in

    @return MAGMA_SUCCESS, MAGMA_SLOW_CONVERGENCE or MAGMA_DIVERGENCE; a
            failing CHECK() presumably stores its status in info and jumps to
            cleanup (macro defined outside this block).
*/
extern "C" magma_int_t
magma_citerref(
    magma_c_matrix A, magma_c_matrix b, magma_c_matrix *x,
    magma_c_solver_par *solver_par, magma_c_preconditioner *precond_par,
    magma_queue_t queue )
{
    magma_int_t info = 0;

    // set queue for old dense routines: remember the current kernel stream
    // and route the queue-less BLAS calls below through `queue`, so they are
    // ordered with the SpMV / preconditioner calls that take `queue` directly
    magma_queue_t orig_queue=NULL;
    magmablasGetKernelStream( &orig_queue );
    magmablasSetKernelStream( queue );

    // some useful variables
    magmaFloatComplex c_zero = MAGMA_C_ZERO, c_one = MAGMA_C_ONE,
                      c_mone = MAGMA_C_NEG_ONE;

    // prepare solver feedback
    solver_par->solver = Magma_ITERREF;
    solver_par->numiter = 0;
    solver_par->info = MAGMA_SUCCESS;

    magma_int_t dofs = A.num_rows*b.num_cols;

    // solver variables; declared up front so that no goto crosses a declaration
    float nom, nom0, r0, residual;
    real_Double_t tempo1, tempo2;

    // workspace
    magma_c_matrix r={Magma_CSR}, z={Magma_CSR};
    CHECK( magma_cvinit( &r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &z, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));

    // residual of the incoming x; init_res is overwritten with ||b|| below
    CHECK( magma_cresidual( A, b, *x, &residual, queue ));
    solver_par->init_res = residual;

    // solver setup
    magma_cscal( dofs, c_zero, x->dval, 1 );                    // x = 0
    magma_ccopy( dofs, b.dval, 1, r.dval, 1 );                  // r = b
    nom0 = magma_scnrm2( dofs, r.dval, 1 );                     // nom0 = || r ||
    nom = nom0 * nom0;
    solver_par->init_res = nom0;

    if ( (r0 = nom * solver_par->epsilon) < ATOLERANCE ) {
        r0 = ATOLERANCE;
    }
    if ( nom < r0 ) {
        // nothing to do: report the initial residual and leave
        solver_par->final_res = solver_par->init_res;
        solver_par->iter_res = solver_par->init_res;
        goto cleanup;
    }

    // Chronometry
    tempo1 = magma_sync_wtime( queue );
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = nom0;
        solver_par->timing[0] = 0.0;
    }

    // start iteration
    for ( solver_par->numiter = 1; solver_par->numiter < solver_par->maxiter;
          solver_par->numiter++ )
    {
        magma_cscal( dofs, MAGMA_C_MAKE(1./nom, 0.), r.dval, 1 );   // scale it
        CHECK( magma_c_precond( A, r, &z, precond_par, queue ));    // inner solver: A * z = r
        magma_cscal( dofs, MAGMA_C_MAKE(nom, 0.), z.dval, 1 );      // scale it
        magma_caxpy( dofs, c_one, z.dval, 1, x->dval, 1 );          // x = x + z
        CHECK( magma_c_spmv( c_mone, A, *x, c_zero, r, queue ));    // r = - A x
        magma_caxpy( dofs, c_one, b.dval, 1, r.dval, 1 );           // r = r + b
        nom = magma_scnrm2( dofs, r.dval, 1 );                      // nom = || r ||

        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) nom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        if ( nom < r0 ) {
            break;
        }
    }
    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;

    // true residual of the returned x (r is reused as scratch)
    CHECK( magma_cresidualvec( A, b, *x, &r, &residual, queue ));
    solver_par->final_res = residual;
    solver_par->iter_res = nom;

    if ( solver_par->numiter < solver_par->maxiter ) {
        info = MAGMA_SUCCESS;
    }
    else {
        // iteration budget exhausted: record the last sample, then classify
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) nom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        if ( solver_par->init_res > solver_par->final_res ) {
            info = MAGMA_SLOW_CONVERGENCE;
            if ( solver_par->iter_res < solver_par->epsilon*solver_par->init_res ) {
                info = MAGMA_SUCCESS;
            }
        }
        else {
            info = MAGMA_DIVERGENCE;
        }
    }

cleanup:
    magma_cmfree( &r, queue );
    magma_cmfree( &z, queue );

    // hand the kernel stream back to the caller
    magmablasSetKernelStream( orig_queue );
    solver_par->info = info;
    return info;
}   /* magma_citerref */
extern "C" magma_int_t magma_cbombard( magma_c_matrix A, magma_c_matrix b, magma_c_matrix *x, magma_c_solver_par *solver_par, magma_queue_t queue ) { magma_int_t info = MAGMA_NOTCONVERGED; // 1=QMR, 2=CGS, 3+BiCGSTAB magma_int_t flag = 0; // prepare solver feedback solver_par->solver = Magma_BOMBARD; solver_par->numiter = 0; solver_par->spmv_count = 0; // local variables magmaFloatComplex c_zero = MAGMA_C_ZERO, c_one = MAGMA_C_ONE; // solver variables float nom0, r0, res, Q_res, T_res, C_res, B_res, nomb; //QMR magmaFloatComplex Q_rho = c_one, Q_rho1 = c_one, Q_eta = -c_one , Q_pds = c_one, Q_thet = c_one, Q_thet1 = c_one, Q_epsilon = c_one, Q_beta = c_one, Q_delta = c_one, Q_pde = c_one, Q_rde = c_one, Q_gamm = c_one, Q_gamm1 = c_one, Q_psi = c_one; //TFQMR magmaFloatComplex T_rho = c_one, T_rho_l = c_one, T_eta = c_zero , T_c = c_zero , T_theta = c_zero , T_tau = c_zero, T_alpha = c_one, T_beta = c_zero, T_sigma = c_zero; //CGS magmaFloatComplex C_rho, C_rho_l = c_one, C_alpha, C_beta = c_zero; //BiCGSTAB magmaFloatComplex B_alpha, B_beta, B_omega, B_rho_old, B_rho_new; magma_int_t dofs = A.num_rows* b.num_cols; // need to transpose the matrix // GPU workspace // QMR magma_c_matrix AT = {Magma_CSR}, Ah1 = {Magma_CSR}, Ah2 = {Magma_CSR}, Q_r={Magma_CSR}, r_tld={Magma_CSR}, Q_x={Magma_CSR}, Q_v={Magma_CSR}, Q_w={Magma_CSR}, Q_wt={Magma_CSR}, Q_d={Magma_CSR}, Q_s={Magma_CSR}, Q_z={Magma_CSR}, Q_q={Magma_CSR}, Q_p={Magma_CSR}, Q_pt={Magma_CSR}, Q_y={Magma_CSR}, d1={Magma_CSR}, d2={Magma_CSR}; //TFQMR // GPU workspace magma_c_matrix T_r={Magma_CSR}, T_pu_m={Magma_CSR}, T_x={Magma_CSR}, T_d={Magma_CSR}, T_w={Magma_CSR}, T_v={Magma_CSR}, T_u_mp1={Magma_CSR}, T_u_m={Magma_CSR}, T_Au={Magma_CSR}, T_Ad={Magma_CSR}, T_Au_new={Magma_CSR}; // CGS magma_c_matrix C_r={Magma_CSR}, C_rt={Magma_CSR}, C_x={Magma_CSR}, C_p={Magma_CSR}, C_q={Magma_CSR}, C_u={Magma_CSR}, C_v={Magma_CSR}, C_t={Magma_CSR}, C_p_hat={Magma_CSR}, C_q_hat={Magma_CSR}, C_u_hat={Magma_CSR}, 
C_v_hat={Magma_CSR}; //BiCGSTAB magma_c_matrix B_r={Magma_CSR}, B_x={Magma_CSR}, B_p={Magma_CSR}, B_v={Magma_CSR}, B_s={Magma_CSR}, B_t={Magma_CSR}; CHECK( magma_cvinit( &r_tld, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &d1, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &d2, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); // QMR CHECK( magma_cvinit( &Q_r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &Q_v, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &Q_w, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &Q_wt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &Q_d, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &Q_s, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &Q_z, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &Q_q, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &Q_p, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &Q_pt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &Q_y, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &Q_x, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); // TFQMR CHECK( magma_cvinit( &T_r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &T_u_mp1,Magma_DEV, A.num_rows, b.num_cols, c_one, queue )); CHECK( magma_cvinit( &T_u_m, Magma_DEV, A.num_rows, b.num_cols, c_one, queue )); CHECK( magma_cvinit( &T_pu_m, Magma_DEV, A.num_rows, b.num_cols, c_one, queue )); CHECK( magma_cvinit( &T_v, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &T_d, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &T_w, Magma_DEV, A.num_rows, b.num_cols, c_one, queue )); CHECK( magma_cvinit( &T_Ad, Magma_DEV, A.num_rows, b.num_cols, c_zero, 
queue )); CHECK( magma_cvinit( &T_Au_new, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &T_Au, Magma_DEV, A.num_rows, b.num_cols, c_one, queue )); CHECK( magma_cvinit( &T_x, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); // CGS CHECK( magma_cvinit( &C_r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &C_rt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &C_x,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &C_p, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &C_p_hat, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &C_q, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &C_q_hat, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &C_u, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &C_u_hat, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &C_v, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &C_v_hat, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &C_t, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); // BiCGSTAB CHECK( magma_cvinit( &B_r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &B_x,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &B_p, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &B_v, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &B_s, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &B_t, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); // solver setup CHECK( magma_cresidualvec( A, b, *x, &r_tld, &nom0, queue)); solver_par->init_res = nom0; res = nom0; // QMR magma_ccopy( dofs, r_tld.dval, 1, Q_r.dval, 1, queue ); magma_ccopy( dofs, r_tld.dval, 1, Q_y.dval, 1, queue ); magma_ccopy( dofs, r_tld.dval, 1, 
Q_v.dval, 1, queue ); magma_ccopy( dofs, r_tld.dval, 1, Q_wt.dval, 1, queue ); magma_ccopy( dofs, r_tld.dval, 1, Q_z.dval, 1, queue ); magma_ccopy( dofs, x->dval, 1, Q_x.dval, 1, queue ); // transpose the matrix // transpose the matrix magma_cmtransfer( A, &Ah1, Magma_DEV, Magma_CPU, queue ); magma_cmconvert( Ah1, &Ah2, A.storage_type, Magma_CSR, queue ); magma_cmfree(&Ah1, queue ); magma_cmtransposeconjugate( Ah2, &Ah1, queue ); magma_cmfree(&Ah2, queue ); Ah2.blocksize = A.blocksize; Ah2.alignment = A.alignment; magma_cmconvert( Ah1, &Ah2, Magma_CSR, A.storage_type, queue ); magma_cmfree(&Ah1, queue ); magma_cmtransfer( Ah2, &AT, Magma_CPU, Magma_DEV, queue ); magma_cmfree(&Ah2, queue ); // TFQMR solver_par->init_res = nom0; magma_ccopy( dofs, r_tld.dval, 1, T_r.dval, 1, queue ); magma_ccopy( dofs, T_r.dval, 1, T_w.dval, 1, queue ); magma_ccopy( dofs, T_r.dval, 1, T_u_m.dval, 1, queue ); magma_ccopy( dofs, T_r.dval, 1, T_u_mp1.dval, 1, queue ); magma_ccopy( dofs, T_u_m.dval, 1, T_pu_m.dval, 1, queue ); CHECK( magma_c_spmv( c_one, A, T_pu_m, c_zero, T_v, queue )); magma_ccopy( dofs, T_v.dval, 1, T_Au.dval, 1, queue ); // CGS magma_ccopy( dofs, r_tld.dval, 1, C_r.dval, 1, queue ); magma_ccopy( dofs, x->dval, 1, C_x.dval, 1, queue ); // BiCGSTAB magma_ccopy( dofs, r_tld.dval, 1, B_r.dval, 1, queue ); magma_ccopy( dofs, x->dval, 1, B_x.dval, 1, queue ); CHECK( magma_c_spmv( c_one, A, B_r, c_zero, B_v, queue )); nomb = magma_scnrm2( dofs, b.dval, 1, queue ); if ( nomb == 0.0 ){ nomb=1.0; } if ( (r0 = nomb * solver_par->rtol) < ATOLERANCE ){ r0 = ATOLERANCE; } solver_par->final_res = solver_par->init_res; solver_par->iter_res = solver_par->init_res; if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = (real_Double_t)nom0; solver_par->timing[0] = 0.0; } if ( nom0 < r0 ) { info = MAGMA_SUCCESS; goto cleanup; } T_tau = magma_csqrt( magma_cdotc( dofs, T_r.dval, 1, r_tld.dval, 1, queue) ); T_rho = magma_cdotc( dofs, T_r.dval, 1, r_tld.dval, 1, queue ); T_rho_l = T_rho; 
Q_psi = magma_csqrt( magma_cdotc( dofs, Q_z.dval, 1, Q_z.dval, 1, queue )); Q_rho = magma_csqrt( magma_cdotc( dofs, Q_y.dval, 1, Q_y.dval, 1, queue )); // BiCGSTAB B_rho_new = magma_cdotc( dofs, B_r.dval, 1, B_r.dval, 1, queue ); B_rho_old = B_omega = B_alpha = MAGMA_C_MAKE( 1.0, 0. ); // v = y / rho // y = y / rho // w = wt / psi // z = z / psi magma_cqmr_1( b.num_rows, b.num_cols, Q_rho, Q_psi, Q_y.dval, Q_z.dval, Q_v.dval, Q_w.dval, queue ); //Chronometry real_Double_t tempo1, tempo2; tempo1 = magma_sync_wtime( queue ); solver_par->numiter = 0; solver_par->spmv_count = 0; // start iteration do { solver_par->numiter++; //QMR: delta = z' * y; Q_delta = magma_cdotc( dofs, Q_z.dval, 1, Q_y.dval, 1, queue ); // TFQMR T_alpha = T_rho / magma_cdotc( dofs, T_v.dval, 1, r_tld.dval, 1, queue ); T_sigma = T_theta * T_theta / T_alpha * T_eta; //CGS: rho = r' * r_tld C_rho = magma_cdotc( dofs, C_r.dval, 1, r_tld.dval, 1, queue ); // BiCGSTAB B_rho_old = B_rho_new; B_rho_new = magma_cdotc( dofs, r_tld.dval, 1, B_r.dval, 1, queue ); // rho=<rr,r> B_beta = B_rho_new/B_rho_old * B_alpha/B_omega; // beta=rho/rho_old *alpha/omega if( solver_par->numiter == 1 ){ //QMR: p = y; //QMR: q = z; magma_ccopy( dofs, Q_y.dval, 1, Q_p.dval, 1, queue ); magma_ccopy( dofs, Q_z.dval, 1, Q_q.dval, 1, queue ); //QMR: u = r; //QMR: p = r; magma_ccgs_2( b.num_rows, b.num_cols, C_r.dval, C_u.dval, C_p.dval, queue ); } else{ Q_pde = Q_psi * Q_delta / Q_epsilon; Q_rde = Q_rho * MAGMA_C_CONJ(Q_delta/Q_epsilon); C_beta = C_rho / C_rho_l; //QMR p = y - pde * p //QMR q = z - rde * q magma_cqmr_2( b.num_rows, b.num_cols, Q_pde, Q_rde, Q_y.dval, Q_z.dval, Q_p.dval, Q_q.dval, queue ); //CGS: u = r + beta*q; //CGS: p = u + beta*( q + beta*p ); magma_ccgs_1( b.num_rows, b.num_cols, C_beta, C_r.dval, C_q.dval, C_u.dval, C_p.dval, queue ); } // TFQMR magma_ctfqmr_1( b.num_rows, b.num_cols, T_alpha, T_sigma, T_v.dval, T_Au.dval, T_u_m.dval, T_pu_m.dval, T_u_mp1.dval, T_w.dval, T_d.dval, T_Ad.dval, queue ); 
T_theta = magma_csqrt( magma_cdotc(dofs, T_w.dval, 1, T_w.dval, 1, queue) ) / T_tau; T_c = c_one / magma_csqrt( c_one + T_theta*T_theta ); T_tau = T_tau * T_theta *T_c; T_eta = T_c * T_c * T_alpha; T_sigma = T_theta * T_theta / T_alpha * T_eta; magma_ctfqmr_2( b.num_rows, b.num_cols, T_eta, T_d.dval, T_Ad.dval, T_x.dval, T_r.dval, queue ); magma_ccopy( dofs, T_u_mp1.dval, 1, T_pu_m.dval, 1, queue ); // BiCGSTAB: p = r + beta * ( p - omega * v ) magma_cbicgstab_1( b.num_rows, b.num_cols, B_beta, B_omega, B_r.dval, B_v.dval, B_p.dval, queue ); //QMR CHECK( magma_c_spmv( c_one, A, Q_p, c_zero, Q_pt, queue )); //TFQMR CHECK( magma_c_spmv( c_one, A, T_pu_m, c_zero, T_Au_new, queue )); //CGS CHECK( magma_c_spmv( c_one, A, C_p, c_zero, C_v_hat, queue )); // BiCGSTAB CHECK( magma_c_spmv( c_one, A, B_p, c_zero, B_v, queue )); // v = Ap solver_par->spmv_count++; //QMR: epsilon = q' * pt; Q_epsilon = magma_cdotc( dofs, Q_q.dval, 1, Q_pt.dval, 1, queue ); Q_beta = Q_epsilon / Q_delta; //TFQMR magma_ccopy( dofs, T_Au_new.dval, 1, T_Au.dval, 1, queue ); magma_ccopy( dofs, T_u_mp1.dval, 1, T_u_m.dval, 1, queue ); //CGS: alpha = r_tld' * v_hat C_alpha = C_rho / magma_cdotc( dofs, r_tld.dval, 1, C_v_hat.dval, 1, queue ); //BiCGSTAB B_alpha = B_rho_new / magma_cdotc( dofs, r_tld.dval, 1, B_v.dval, 1, queue ); //QMR: v = pt - beta * v //QMR: y = v magma_cqmr_3( b.num_rows, b.num_cols, Q_beta, Q_pt.dval, Q_v.dval, Q_y.dval, queue ); // TFQMR magma_ctfqmr_5( b.num_rows, b.num_cols, T_alpha, T_sigma, T_v.dval, T_Au.dval, T_pu_m.dval, T_w.dval, T_d.dval, T_Ad.dval, queue ); // TFQMR T_sigma = T_theta * T_theta / T_alpha * T_eta; T_theta = magma_csqrt( magma_cdotc(dofs, T_w.dval, 1, T_w.dval, 1, queue) ) / T_tau; T_c = c_one / magma_csqrt( c_one + T_theta*T_theta ); T_tau = T_tau * T_theta *T_c; T_eta = T_c * T_c * T_alpha; // TFQMR magma_ctfqmr_2( b.num_rows, b.num_cols, T_eta, T_d.dval, T_Ad.dval, T_x.dval, T_r.dval, queue ); T_rho = magma_cdotc( dofs, T_w.dval, 1, r_tld.dval, 1, queue 
); T_beta = T_rho / T_rho_l; T_rho_l = T_rho; magma_ctfqmr_3( b.num_rows, b.num_cols, T_beta, T_w.dval, T_u_m.dval, T_u_mp1.dval, queue ); magma_ccopy( dofs, T_u_mp1.dval, 1, T_pu_m.dval, 1, queue ); //CGS: q = u - alpha v_hat //CGS: t = u + q magma_ccgs_3( b.num_rows, b.num_cols, C_alpha, C_v_hat.dval, C_u.dval, C_q.dval, C_t.dval, queue ); // BiCGSTAB: s = r - alpha v magma_cbicgstab_2( b.num_rows, b.num_cols, B_alpha, B_r.dval, B_v.dval, B_s.dval, queue ); Q_rho1 = Q_rho; //QMR rho = norm(y); Q_rho = magma_csqrt( magma_cdotc( dofs, Q_y.dval, 1, Q_y.dval, 1, queue ) ); //QMR wt = A' * q - beta' * w; CHECK( magma_c_spmv( c_one, AT, Q_q, c_zero, Q_wt, queue )); //TFQMR CHECK( magma_c_spmv( c_one, A, T_pu_m, c_zero, T_Au_new, queue )); //CGS t = A u_hat CHECK( magma_c_spmv( c_one, A, C_t, c_zero, C_rt, queue )); //BiCGSTAB CHECK( magma_c_spmv( c_one, A, B_s, c_zero, B_t, queue )); // t=As solver_par->spmv_count++; //BiCGSTAB B_omega = magma_cdotc( dofs, B_t.dval, 1, B_s.dval, 1, queue ) // omega = <s,t>/<t,t> / magma_cdotc( dofs, B_t.dval, 1, B_t.dval, 1, queue ); // QMR magma_caxpy( dofs, - MAGMA_C_CONJ( Q_beta ), Q_w.dval, 1, Q_wt.dval, 1, queue ); // no precond: z = wt magma_ccopy( dofs, Q_wt.dval, 1, Q_z.dval, 1, queue ); //TFQMR magma_ctfqmr_4( b.num_rows, b.num_cols, T_beta, T_Au_new.dval, T_v.dval, T_Au.dval, queue ); magma_ccopy( dofs, T_u_mp1.dval, 1, T_u_m.dval, 1, queue ); // QMR Q_thet1 = Q_thet; Q_thet = Q_rho / (Q_gamm * MAGMA_C_MAKE( MAGMA_C_ABS(Q_beta), 0.0 )); Q_gamm1 = Q_gamm; Q_gamm = c_one / magma_csqrt(c_one + Q_thet*Q_thet); Q_eta = - Q_eta * Q_rho1 * Q_gamm * Q_gamm / (Q_beta * Q_gamm1 * Q_gamm1); if ( solver_par->numiter == 1 ) { //QMR: d = eta * p + pds * d; //QMR: s = eta * pt + pds * d; //QMR: x = x + d; //QMR: r = r - s; magma_cqmr_4( b.num_rows, b.num_cols, Q_eta, Q_p.dval, Q_pt.dval, Q_d.dval, Q_s.dval, Q_x.dval, Q_r.dval, queue ); } else { Q_pds = (Q_thet1 * Q_gamm) * (Q_thet1 * Q_gamm); // d = eta * p + pds * d; // s = eta * pt + pds 
* d; // x = x + d; // r = r - s; magma_cqmr_5( b.num_rows, b.num_cols, Q_eta, Q_pds, Q_p.dval, Q_pt.dval, Q_d.dval, Q_s.dval, Q_x.dval, Q_r.dval, queue ); } // CGS: r = r -alpha*A u_hat // CGS: x = x + alpha u_hat magma_ccgs_4( b.num_rows, b.num_cols, C_alpha, C_t.dval, C_rt.dval, C_x.dval, C_r.dval, queue ); C_rho_l = C_rho; // BiCGSTAB: x = x + alpha * p + omega * s // BiCGSTAB: r = s - omega * t magma_cbicgstab_3( b.num_rows, b.num_cols, B_alpha, B_omega, B_p.dval, B_s.dval, B_t.dval, B_x.dval, B_r.dval, queue ); //QMR: psi = norm(z); Q_psi = magma_csqrt( magma_cdotc( dofs, Q_z.dval, 1, Q_z.dval, 1, queue ) ); //QMR: v = y / rho //QMR: y = y / rho //QMR: w = wt / psi //QMR: z = z / psi magma_cqmr_1( b.num_rows, b.num_cols, Q_rho, Q_psi, Q_y.dval, Q_z.dval, Q_v.dval, Q_w.dval, queue ); Q_res = magma_scnrm2( dofs, Q_r.dval, 1, queue ); T_res = magma_scnrm2( dofs, T_r.dval, 1, queue ); C_res = magma_scnrm2( dofs, C_r.dval, 1, queue ); B_res = magma_scnrm2( dofs, B_r.dval, 1, queue ); // printf(" %e %e %e\n", Q_res, C_res, B_res); if( Q_res < res ){ res = Q_res; flag = 1; } if( T_res < res ){ res = Q_res; flag = 2; } if( C_res < res ){ res = C_res; flag = 3; } if( B_res < res ){ res = B_res; flag = 4; } if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter)%solver_par->verbose == c_zero ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) res; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( res/nomb <= solver_par->rtol || res <= solver_par->atol ){ info = MAGMA_SUCCESS; break; } if( magma_c_isnan_inf( Q_beta ) && magma_c_isnan_inf( C_beta ) && magma_c_isnan_inf( B_beta ) ){ info = MAGMA_DIVERGENCE; break; } } while ( solver_par->numiter+1 <= solver_par->maxiter ); // copy back the best solver switch ( flag ) { case 1: printf("%% QMR fastest solver.\n"); magma_ccopy( dofs, Q_x.dval, 1, x->dval, 1, queue ); break; case 2: printf("%% TFQMR 
fastest solver.\n"); magma_ccopy( dofs, T_x.dval, 1, x->dval, 1, queue ); break; case 3: printf("%% CGS fastest solver.\n"); magma_ccopy( dofs, C_x.dval, 1, x->dval, 1, queue ); break; case 4: printf("%% BiCGSTAB fastest solver.\n"); magma_ccopy( dofs, B_x.dval, 1, x->dval, 1, queue ); break; } tempo2 = magma_sync_wtime( queue ); solver_par->runtime = (real_Double_t) tempo2-tempo1; float residual; CHECK( magma_cresidualvec( A, b, *x, &r_tld, &residual, queue)); solver_par->iter_res = res; solver_par->final_res = residual; if ( solver_par->numiter < solver_par->maxiter && info == MAGMA_SUCCESS ) { info = MAGMA_SUCCESS; } else if ( solver_par->init_res > solver_par->final_res ) { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose == c_zero ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) res; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } info = MAGMA_SLOW_CONVERGENCE; if( solver_par->iter_res < solver_par->rtol*solver_par->init_res || solver_par->iter_res < solver_par->atol ) { info = MAGMA_SUCCESS; } } else { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose == c_zero ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) res; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } info = MAGMA_DIVERGENCE; } cleanup: magma_cmfree(&r_tld, queue ); magma_cmfree(&d1, queue ); magma_cmfree(&d2, queue ); magma_cmfree(&AT, queue ); // QMR magma_cmfree(&Q_r, queue ); magma_cmfree(&Q_v, queue ); magma_cmfree(&Q_w, queue ); magma_cmfree(&Q_wt, queue ); magma_cmfree(&Q_d, queue ); magma_cmfree(&Q_s, queue ); magma_cmfree(&Q_z, queue ); magma_cmfree(&Q_q, queue ); magma_cmfree(&Q_p, queue ); magma_cmfree(&Q_pt, queue ); magma_cmfree(&Q_y, queue ); magma_cmfree(&Q_x, queue ); magma_cmfree(&Ah1, queue ); magma_cmfree(&Ah2, queue ); // TFQMR magma_cmfree(&T_r, 
queue ); magma_cmfree(&T_x, queue ); magma_cmfree(&T_d, queue ); magma_cmfree(&T_w, queue ); magma_cmfree(&T_v, queue ); magma_cmfree(&T_u_m, queue ); magma_cmfree(&T_u_mp1, queue ); magma_cmfree(&T_pu_m, queue ); magma_cmfree(&T_d, queue ); magma_cmfree(&T_Au, queue ); magma_cmfree(&T_Au_new, queue ); magma_cmfree(&T_Ad, queue ); // CGS magma_cmfree(&C_r, queue ); magma_cmfree(&C_rt, queue ); magma_cmfree(&C_x, queue ); magma_cmfree(&C_p, queue ); magma_cmfree(&C_q, queue ); magma_cmfree(&C_u, queue ); magma_cmfree(&C_v, queue ); magma_cmfree(&C_t, queue ); magma_cmfree(&C_p_hat, queue ); magma_cmfree(&C_q_hat, queue ); magma_cmfree(&C_u_hat, queue ); magma_cmfree(&C_v_hat, queue ); // BiCGSTAB magma_cmfree(&B_r, queue ); magma_cmfree(&B_x, queue ); magma_cmfree(&B_p, queue ); magma_cmfree(&B_v, queue ); magma_cmfree(&B_s, queue ); magma_cmfree(&B_t, queue ); solver_par->info = info; return info; } /* magma_cbombard */
/**
    Purpose
    -------
    Preconditioned QMR iteration for A * x = b in single-complex precision,
    using the merged magma_cqmr_* kernels for the vector updates. The
    conjugate transpose of A is built once up front; the preconditioner is
    applied from the left and right, in both MagmaNoTrans and MagmaTrans
    mode, through magma_c_applyprecond_left / _right.

    Arguments
    ---------
    @param[in]      A            system matrix
    @param[in]      b            right-hand side; A.num_rows * b.num_cols
                                 entries are processed
    @param[in,out]  x            initial guess on entry, updated in place
    @param[in,out]  solver_par   reads rtol, atol, maxiter, verbose; receives
                                 solver, numiter, spmv_count, init_res,
                                 iter_res, final_res, runtime, info and, when
                                 verbose > 0, res_vec / timing samples
    @param[in]      precond_par  passed through to the preconditioner calls
    @param[in]      queue        queue handed to all MAGMA calls

    @return MAGMA_SUCCESS, MAGMA_SLOW_CONVERGENCE, MAGMA_DIVERGENCE, or the
            non-zero code of a failing call wrapped in CHECK (CHECK is
            defined elsewhere; presumably it stores the code in info and
            jumps to cleanup).
*/
extern "C" magma_int_t
magma_cpqmr_merge(
    magma_c_matrix A,
    magma_c_matrix b,
    magma_c_matrix *x,
    magma_c_solver_par *solver_par,
    magma_c_preconditioner *precond_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;

    // prepare solver feedback
    solver_par->solver = Magma_QMR;
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;

    // local variables
    magmaFloatComplex c_zero = MAGMA_C_ZERO, c_one = MAGMA_C_ONE;

    // solver variables
    // (residual / tempo* carry no initializer so the CHECK gotos may jump
    // over them)
    float nom0, r0, res=0.0, nomb, residual;
    real_Double_t tempo1, tempo2;
    magmaFloatComplex rho = c_one, rho1 = c_one, eta = -c_one, pds = c_one,
                      thet = c_one, thet1 = c_one, epsilon = c_one,
                      beta = c_one, delta = c_one, pde = c_one, rde = c_one,
                      gamm = c_one, gamm1 = c_one, psi = c_one;

    magma_int_t dofs = A.num_rows* b.num_cols;

    // transposed system matrix and conversion temporaries
    magma_c_matrix AT={Magma_CSR}, Ah1={Magma_CSR}, Ah2={Magma_CSR};

    // GPU workspace
    magma_c_matrix r={Magma_CSR}, r_tld={Magma_CSR},
                   v={Magma_CSR}, w={Magma_CSR}, wt={Magma_CSR},
                   d={Magma_CSR}, s={Magma_CSR}, z={Magma_CSR}, q={Magma_CSR},
                   p={Magma_CSR}, pt={Magma_CSR}, y={Magma_CSR},
                   vt={Magma_CSR}, yt={Magma_CSR}, zt={Magma_CSR};
    CHECK( magma_cvinit( &r,     Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &r_tld, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &v,     Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &w,     Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &wt,    Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &d,     Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &s,     Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &z,     Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &q,     Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &p,     Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &pt,    Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &y,     Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &yt,    Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &vt,    Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &zt,    Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));

    // solver setup: r = residual of the incoming x, nom0 = its norm
    CHECK( magma_cresidualvec( A, b, *x, &r, &nom0, queue));
    solver_par->init_res = nom0;
    magma_ccopy( dofs, r.dval, 1, r_tld.dval, 1, queue );
    magma_ccopy( dofs, r.dval, 1, vt.dval, 1, queue );
    magma_ccopy( dofs, r.dval, 1, wt.dval, 1, queue );

    // Build AT = conjugate transpose of A in A's storage format:
    // device -> host, convert to CSR, transpose, convert back, host -> device.
    // Every step is checked so that a failed conversion cannot leave AT
    // empty for the A' * q product in the loop; cleanup frees Ah1/Ah2.
    CHECK( magma_cmtransfer( A, &Ah1, Magma_DEV, Magma_CPU, queue ));
    CHECK( magma_cmconvert( Ah1, &Ah2, A.storage_type, Magma_CSR, queue ));
    magma_cmfree(&Ah1, queue );
    CHECK( magma_cmtransposeconjugate( Ah2, &Ah1, queue ));
    magma_cmfree(&Ah2, queue );
    Ah2.blocksize = A.blocksize;
    Ah2.alignment = A.alignment;
    CHECK( magma_cmconvert( Ah1, &Ah2, Magma_CSR, A.storage_type, queue ));
    magma_cmfree(&Ah1, queue );
    CHECK( magma_cmtransfer( Ah2, &AT, Magma_CPU, Magma_DEV, queue ));
    magma_cmfree(&Ah2, queue );

    // || b ||, replaced by 1 for a zero right-hand side to keep res/nomb finite
    nomb = magma_scnrm2( dofs, b.dval, 1, queue );
    if ( nomb == 0.0 ){
        nomb=1.0;
    }
    if ( (r0 = nomb * solver_par->rtol) < ATOLERANCE ){
        r0 = ATOLERANCE;
    }

    solver_par->final_res = solver_par->init_res;
    solver_par->iter_res = solver_par->init_res;
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = (real_Double_t)nom0;
        solver_par->timing[0] = 0.0;
    }
    // initial guess already good enough
    if ( nom0 < r0 ) {
        info = MAGMA_SUCCESS;
        goto cleanup;
    }

    // y = M_left^{-1} vt, z = M_right^{-T} wt
    CHECK( magma_c_applyprecond_left( MagmaNoTrans, A, vt, &y, precond_par, queue ));
    CHECK( magma_c_applyprecond_right( MagmaTrans, A, wt, &z, precond_par, queue ));

    psi = magma_csqrt( magma_cdotc( dofs, z.dval, 1, z.dval, 1, queue ));
    rho = magma_csqrt( magma_cdotc( dofs, y.dval, 1, y.dval, 1, queue ));

    // v = vt / rho
    // y = y / rho
    // w = wt / psi
    // z = z / psi
    magma_cqmr_8( r.num_rows, r.num_cols, rho, psi,
                  vt.dval, wt.dval, y.dval, z.dval, v.dval, w.dval, queue );

    // chronometry
    tempo1 = magma_sync_wtime( queue );

    solver_par->numiter = 0;
    // start iteration
    do
    {
        solver_par->numiter++;
        if( magma_c_isnan_inf( rho ) || magma_c_isnan_inf( psi ) ){
            info = MAGMA_DIVERGENCE;
            break;
        }

        // delta = z' * y;
        delta = magma_cdotc( dofs, z.dval, 1, y.dval, 1, queue );
        if( magma_c_isnan_inf( delta ) ){
            info = MAGMA_DIVERGENCE;
            break;
        }

        // yt = M_right^{-1} y, zt = M_left^{-T} z
        CHECK( magma_c_applyprecond_right( MagmaNoTrans, A, y, &yt, precond_par, queue ));
        CHECK( magma_c_applyprecond_left( MagmaTrans, A, z, &zt, precond_par, queue ));

        if( solver_par->numiter == 1 ){
            // p = yt;
            // q = zt;
            magma_ccopy( dofs, yt.dval, 1, p.dval, 1, queue );
            magma_ccopy( dofs, zt.dval, 1, q.dval, 1, queue );
        }
        else{
            pde = psi * delta / epsilon;
            rde = rho * MAGMA_C_CONJ(delta/epsilon);

            // p = yt - pde * p
            // q = zt - rde * q
            magma_cqmr_2( r.num_rows, r.num_cols, pde, rde,
                          yt.dval, zt.dval, p.dval, q.dval, queue );
        }
        if( magma_c_isnan_inf( rho ) || magma_c_isnan_inf( psi ) ){
            info = MAGMA_DIVERGENCE;
            break;
        }

        // pt = A * p
        CHECK( magma_c_spmv( c_one, A, p, c_zero, pt, queue ));
        solver_par->spmv_count++;

        // epsilon = q' * pt;
        epsilon = magma_cdotc( dofs, q.dval, 1, pt.dval, 1, queue );
        beta = epsilon / delta;
        if( magma_c_isnan_inf( epsilon ) || magma_c_isnan_inf( beta ) ){
            info = MAGMA_DIVERGENCE;
            break;
        }

        // vt = pt - beta * v;
        magma_cqmr_7( r.num_rows, r.num_cols, beta,
                      pt.dval, v.dval, vt.dval, queue );
        // NOTE(review): the three BLAS calls below recompute
        // vt = pt - beta * v from scratch. If magma_cqmr_7 does what its
        // comment says, one of the two is redundant - confirm before
        // removing either.
        magma_ccopy( dofs, v.dval, 1, vt.dval, 1, queue );
        magma_cscal( dofs, -beta, vt.dval, 1, queue );
        magma_caxpy( dofs, c_one, pt.dval, 1, vt.dval, 1, queue );

        // y = M_left^{-1} vt
        CHECK( magma_c_applyprecond_left( MagmaNoTrans, A, vt, &y, precond_par, queue ));

        rho1 = rho;
        // rho = norm(y);
        rho = magma_csqrt( magma_cdotc( dofs, y.dval, 1, y.dval, 1, queue ));

        // wt = A' * q - beta' * w;
        CHECK( magma_c_spmv( c_one, AT, q, c_zero, wt, queue ));
        solver_par->spmv_count++;
        magma_caxpy( dofs, - MAGMA_C_CONJ( beta ), w.dval, 1, wt.dval, 1, queue );

        // z = M_right^{-T} wt
        CHECK( magma_c_applyprecond_right( MagmaTrans, A, wt, &z, precond_par, queue ));

        thet1 = thet;
        thet = rho / (gamm * MAGMA_C_MAKE( MAGMA_C_ABS(beta), 0.0 ));
        gamm1 = gamm;

        gamm = c_one / magma_csqrt(c_one + thet*thet);
        eta = - eta * rho1 * gamm * gamm / (beta * gamm1 * gamm1);

        if( magma_c_isnan_inf( thet ) || magma_c_isnan_inf( gamm )
            || magma_c_isnan_inf( eta ) ){
            info = MAGMA_DIVERGENCE;
            break;
        }

        if( solver_par->numiter == 1 ){
            // d = eta * p + pds * d;
            // s = eta * pt + pds * d;
            // x = x + d;
            // r = r - s;
            magma_cqmr_4( r.num_rows, r.num_cols, eta,
                          p.dval, pt.dval, d.dval, s.dval,
                          x->dval, r.dval, queue );
        }
        else{
            // pds = (thet1 * gamm)^2;
            pds = (thet1 * gamm) * (thet1 * gamm);

            // d = eta * p + pds * d;
            // s = eta * pt + pds * d;
            // x = x + d;
            // r = r - s;
            magma_cqmr_5( r.num_rows, r.num_cols, eta, pds,
                          p.dval, pt.dval, d.dval, s.dval,
                          x->dval, r.dval, queue );
        }

        // psi = norm(z);
        psi = magma_csqrt( magma_cdotc( dofs, z.dval, 1, z.dval, 1, queue ) );

        res = magma_scnrm2( dofs, r.dval, 1, queue );

        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        // v = vt / rho
        // y = y / rho
        // w = wt / psi
        // z = z / psi
        magma_cqmr_8( r.num_rows, r.num_cols, rho, psi,
                      vt.dval, wt.dval, y.dval, z.dval, v.dval, w.dval, queue );

        if ( res/nomb <= solver_par->rtol || res <= solver_par->atol ){
            // record the convergence; without this the status logic below
            // re-classifies a converged run from init_res / final_res
            info = MAGMA_SUCCESS;
            break;
        }
    }
    while ( solver_par->numiter+1 <= solver_par->maxiter );

    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    CHECK( magma_cresidualvec( A, b, *x, &r, &residual, queue));
    solver_par->iter_res = res;
    solver_par->final_res = residual;

    // a run that converged before exhausting maxiter keeps MAGMA_SUCCESS;
    // everything else is classified from the measured residuals
    if ( solver_par->numiter >= solver_par->maxiter || info != MAGMA_SUCCESS ) {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        if ( solver_par->init_res > solver_par->final_res ) {
            info = MAGMA_SLOW_CONVERGENCE;
            if( solver_par->iter_res < solver_par->rtol*solver_par->init_res ||
                solver_par->iter_res < solver_par->atol ) {
                info = MAGMA_SUCCESS;
            }
        }
        else {
            info = MAGMA_DIVERGENCE;
        }
    }

cleanup:
    magma_cmfree(&r, queue );
    magma_cmfree(&r_tld, queue );
    magma_cmfree(&v, queue );
    magma_cmfree(&w, queue );
    magma_cmfree(&wt, queue );
    magma_cmfree(&d, queue );
    magma_cmfree(&s, queue );
    magma_cmfree(&z, queue );
    magma_cmfree(&q, queue );
    magma_cmfree(&p, queue );
    magma_cmfree(&zt, queue );
    magma_cmfree(&vt, queue );
    magma_cmfree(&yt, queue );
    magma_cmfree(&pt, queue );
    magma_cmfree(&y, queue );
    magma_cmfree(&AT, queue );
    magma_cmfree(&Ah1, queue );
    magma_cmfree(&Ah2, queue );

    solver_par->info = info;
    return info;
}   /* magma_cpqmr_merge */
/**
    Preconditioned BiCGSTAB iteration for A * x = b whose vector updates are
    carried out by the fused routines magma_cbicgstab_1, magma_cbicgstab_2
    and magma_cbicgstab_4.

    @param[in]      A            system matrix
    @param[in]      b            right-hand side; b.num_cols columns are
                                 treated as one long vector of
                                 A.num_rows * b.num_cols entries
    @param[in,out]  x            initial guess on entry, updated in place
    @param[in,out]  solver_par   read:    rtol, atol, maxiter, verbose
                                 written: solver, numiter, spmv_count,
                                          init_res, iter_res, final_res,
                                          runtime, info and, when
                                          verbose > 0, res_vec[] / timing[]
    @param[in]      precond_par  forwarded to magma_c_applyprecond_left and
                                 magma_c_applyprecond_right
    @param[in]      queue        queue handed to every MAGMA call made here

    @return  the value that is also stored in solver_par->info. This block
             itself assigns MAGMA_SUCCESS, MAGMA_SLOW_CONVERGENCE or
             MAGMA_DIVERGENCE; info starts as MAGMA_NOTCONVERGED.

    NOTE(review): CHECK is defined outside this block. It presumably records
    the callee's error code in info and jumps to the cleanup label - confirm.
    Because of those jumps, locals declared after the first CHECK carry no
    initializer.
*/
extern "C" magma_int_t
magma_cpbicgstab_merge(
    magma_c_matrix A, magma_c_matrix b, magma_c_matrix *x,
    magma_c_solver_par *solver_par,
    magma_c_preconditioner *precond_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;

    // prepare solver feedback
    solver_par->solver = Magma_BICGSTAB;
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;

    // some useful variables
    magmaFloatComplex c_zero = MAGMA_C_ZERO;
    magmaFloatComplex c_one = MAGMA_C_ONE;

    // number of vector entries: rows of A times columns of b
    magma_int_t dofs = A.num_rows * b.num_cols;

    // workspace vectors (created with Magma_DEV); only vectors that are
    // actually referenced by the iteration are allocated
    magma_c_matrix r={Magma_CSR}, rr={Magma_CSR}, p={Magma_CSR}, v={Magma_CSR},
                   z={Magma_CSR}, y={Magma_CSR}, ms={Magma_CSR}, mt={Magma_CSR},
                   s={Magma_CSR}, t={Magma_CSR};
    CHECK( magma_cvinit( &r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &rr,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &p, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &v, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &s, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &t, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &ms,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &mt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &y, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &z, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));

    // solver variables
    magmaFloatComplex alpha, beta, omega, rho_old, rho_new;
    float betanom, nom0, r0, res, nomb;
    res=0;

    // solver setup: r receives the residual vector, nom0 its norm
    CHECK( magma_cresidualvec( A, b, *x, &r, &nom0, queue));
    magma_ccopy( dofs, r.dval, 1, rr.dval, 1, queue );                  // rr = r
    betanom = nom0;
    rho_new = magma_cdotc( dofs, r.dval, 1, r.dval, 1, queue );         // rho = <r,r>
    rho_old = omega = alpha = MAGMA_C_MAKE( 1.0, 0. );
    solver_par->init_res = nom0;

    CHECK( magma_c_spmv( c_one, A, r, c_zero, v, queue ));              // v = A r

    // reference norm for the relative stopping test; a zero right-hand side
    // falls back to 1 so that res/nomb below stays finite
    nomb = magma_scnrm2( dofs, b.dval, 1, queue );
    if ( nomb == 0.0 ){
        nomb=1.0;
    }
    if ( (r0 = nomb * solver_par->rtol) < ATOLERANCE ){
        r0 = ATOLERANCE;
    }
    solver_par->final_res = solver_par->init_res;
    solver_par->iter_res = solver_par->init_res;
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = nom0;
        solver_par->timing[0] = 0.0;
    }
    // NOTE(review): this early exit compares the norm of b, not the initial
    // residual nom0, against r0 - confirm that this is intended.
    if ( nomb < r0 ) {
        info = MAGMA_SUCCESS;
        goto cleanup;
    }

    //Chronometry
    real_Double_t tempo1, tempo2;
    tempo1 = magma_sync_wtime( queue );

    solver_par->numiter = 0;
    solver_par->spmv_count = 0;
    // start iteration
    do
    {
        solver_par->numiter++;
        rho_old = rho_new;                                              // rho_old = rho
        rho_new = magma_cdotc( dofs, rr.dval, 1, r.dval, 1, queue );    // rho = <rr,r>
        beta = rho_new/rho_old * alpha/omega;           // beta = rho/rho_old * alpha/omega
        if( magma_c_isnan_inf( beta ) ){
            info = MAGMA_DIVERGENCE;
            break;
        }

        // p = r + beta * ( p - omega * v )
        magma_cbicgstab_1(
        r.num_rows,
        r.num_cols,
        beta,
        omega,
        r.dval,
        v.dval,
        p.dval,
        queue );

        // preconditioner: p -> mt -> y
        CHECK( magma_c_applyprecond_left( MagmaNoTrans, A, p, &mt, precond_par, queue ));
        CHECK( magma_c_applyprecond_right( MagmaNoTrans, A, mt, &y, precond_par, queue ));

        CHECK( magma_c_spmv( c_one, A, y, c_zero, v, queue ));          // v = A y
        solver_par->spmv_count++;
        alpha = rho_new / magma_cdotc( dofs, rr.dval, 1, v.dval, 1, queue );
        if( magma_c_isnan_inf( alpha ) ){
            info = MAGMA_DIVERGENCE;
            break;
        }

        // s = r - alpha v
        magma_cbicgstab_2(
        r.num_rows,
        r.num_cols,
        alpha,
        r.dval,
        v.dval,
        s.dval,
        queue );

        // preconditioner: s -> ms -> z
        CHECK( magma_c_applyprecond_left( MagmaNoTrans, A, s, &ms, precond_par, queue ));
        CHECK( magma_c_applyprecond_right( MagmaNoTrans, A, ms, &z, precond_par, queue ));

        CHECK( magma_c_spmv( c_one, A, z, c_zero, t, queue ));          // t = A z
        solver_par->spmv_count++;

        // omega = cdotc(t,s) / cdotc(t,t)
        omega = magma_cdotc( dofs, t.dval, 1, s.dval, 1, queue )
                   / magma_cdotc( dofs, t.dval, 1, t.dval, 1, queue );

        // x = x + alpha * y + omega * z
        // r = s - omega * t
        magma_cbicgstab_4(
        r.num_rows,
        r.num_cols,
        alpha,
        omega,
        y.dval,
        z.dval,
        s.dval,
        t.dval,
        x->dval,
        r.dval,
        queue );

        res = betanom = magma_scnrm2( dofs, r.dval, 1, queue );

        // every verbose-th iteration is logged
        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        if ( res/nomb <= solver_par->rtol || res <= solver_par->atol ){
            break;
        }
    }
    while ( solver_par->numiter+1 <= solver_par->maxiter );

    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;

    // final_res is recomputed through magma_cresidualvec, iter_res is the
    // norm of the recurrence residual r from the last iteration
    float residual;
    CHECK( magma_cresidualvec( A, b, *x, &r, &residual, queue));
    solver_par->final_res = residual;
    solver_par->iter_res = res;

    // NOTE(review): nothing visible in this block assigns MAGMA_SUCCESS to
    // info inside the loop; whether the first test can hold depends on what
    // CHECK writes to info - confirm.
    if ( solver_par->numiter < solver_par->maxiter && info == MAGMA_SUCCESS ) {
        info = MAGMA_SUCCESS;
    } else if ( solver_par->init_res > solver_par->final_res ) {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_SLOW_CONVERGENCE;
        if( solver_par->iter_res < solver_par->rtol*solver_par->init_res ||
            solver_par->iter_res < solver_par->atol ) {
            info = MAGMA_SUCCESS;
        }
    }
    else {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_DIVERGENCE;
    }

cleanup:
    magma_cmfree(&r, queue );
    magma_cmfree(&rr, queue );
    magma_cmfree(&p, queue );
    magma_cmfree(&v, queue );
    magma_cmfree(&s, queue );
    magma_cmfree(&y, queue );
    magma_cmfree(&z, queue );
    magma_cmfree(&t, queue );
    magma_cmfree(&ms, queue );
    magma_cmfree(&mt, queue );

    solver_par->info = info;
    return info;
}   /* magma_cpbicgstab_merge */
/**
    Preconditioned BiCGSTAB iteration for A * x = b built from individual
    BLAS-1 calls (magma_cscal / magma_caxpy / magma_ccopy / magma_cdotc /
    magma_scnrm2) and magma_c_spmv.

    @param[in]      A            system matrix
    @param[in]      b            right-hand side; b.num_cols columns are
                                 treated as one long vector of
                                 A.num_rows * b.num_cols entries
    @param[in,out]  x            initial guess on entry, updated in place
    @param[in,out]  solver_par   read:    epsilon, maxiter, verbose
                                 written: solver, numiter, init_res,
                                          iter_res, final_res, runtime, info
                                          and, when verbose > 0,
                                          res_vec[] / timing[]
    @param[in]      precond_par  forwarded to magma_c_applyprecond_left and
                                 magma_c_applyprecond_right
    @param[in]      queue        queue handed to the sparse routines; the
                                 BLAS-1 calls in this block take no queue

    @return  the value that is also stored in solver_par->info. This block
             itself assigns MAGMA_SUCCESS, MAGMA_SLOW_CONVERGENCE or
             MAGMA_DIVERGENCE.

    NOTE(review): CHECK is defined outside this block. It presumably records
    the callee's error code in info and jumps to the cleanup label - confirm.
    Because of those jumps, locals declared after the first CHECK carry no
    initializer.
*/
extern "C" magma_int_t
magma_cpbicgstab(
    magma_c_matrix A, magma_c_matrix b, magma_c_matrix *x,
    magma_c_solver_par *solver_par,
    magma_c_preconditioner *precond_par,
    magma_queue_t queue )
{
    magma_int_t info = 0;

    // remember the current kernel stream; it is restored in cleanup
    magma_queue_t orig_queue=NULL;
    magmablasGetKernelStream( &orig_queue );

    // prepare solver feedback
    solver_par->solver = Magma_PBICGSTAB;
    solver_par->numiter = 0;
    solver_par->info = MAGMA_SUCCESS;

    // some useful variables
    magmaFloatComplex c_zero = MAGMA_C_ZERO, c_one = MAGMA_C_ONE,
                      c_mone = MAGMA_C_NEG_ONE;

    // number of vector entries: rows of A times columns of b
    magma_int_t dofs = A.num_rows*b.num_cols;

    // workspace vectors (created with Magma_DEV)
    magma_c_matrix r={Magma_CSR}, rr={Magma_CSR}, p={Magma_CSR}, v={Magma_CSR},
                   s={Magma_CSR}, t={Magma_CSR}, ms={Magma_CSR}, mt={Magma_CSR},
                   y={Magma_CSR}, z={Magma_CSR};
    CHECK( magma_cvinit( &r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &rr,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &p, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &v, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &s, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &t, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &ms,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &mt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &y, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cvinit( &z, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));

    // solver variables
    magmaFloatComplex alpha, beta, omega, rho_old, rho_new;
    float nom, betanom, nom0, r0, res;

    // solver setup: r receives the residual vector, nom0 its norm
    CHECK( magma_cresidualvec( A, b, *x, &r, &nom0, queue));
    magma_ccopy( dofs, r.dval, 1, rr.dval, 1 );                  // rr = r
    betanom = nom0;
    nom = nom0*nom0;
    rho_new = omega = alpha = MAGMA_C_MAKE( 1.0, 0. );
    solver_par->init_res = nom0;

    // v is read by the first p-update inside the loop
    CHECK( magma_c_spmv( c_one, A, r, c_zero, v, queue ));       // v = A r

    // early exit on the squared initial residual norm
    r0 = nom * solver_par->epsilon;
    if ( r0 < ATOLERANCE ) {
        r0 = ATOLERANCE;
    }
    if ( nom < r0 ) {
        solver_par->final_res = solver_par->init_res;
        solver_par->iter_res = solver_par->init_res;
        goto cleanup;
    }

    //Chronometry
    real_Double_t tempo1, tempo2;
    tempo1 = magma_sync_wtime( queue );
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = nom0;
        solver_par->timing[0] = 0.0;
    }

    solver_par->numiter = 0;
    // start iteration
    do
    {
        solver_par->numiter++;
        rho_old = rho_new;                                      // rho_old = rho
        rho_new = magma_cdotc( dofs, rr.dval, 1, r.dval, 1 );   // rho = <rr,r>
        beta = rho_new/rho_old * alpha/omega;   // beta = rho/rho_old * alpha/omega
        magma_cscal( dofs, beta, p.dval, 1 );                   // p = beta*p
        magma_caxpy( dofs, c_mone * omega * beta, v.dval, 1 , p.dval, 1 );
                                                                // p = p - omega*beta*v
        magma_caxpy( dofs, c_one, r.dval, 1, p.dval, 1 );       // p = p + r

        // preconditioner: p -> mt -> y
        CHECK( magma_c_applyprecond_left( A, p, &mt, precond_par, queue ));
        CHECK( magma_c_applyprecond_right( A, mt, &y, precond_par, queue ));

        CHECK( magma_c_spmv( c_one, A, y, c_zero, v, queue ));  // v = A y

        alpha = rho_new / magma_cdotc( dofs, rr.dval, 1, v.dval, 1 );
        magma_ccopy( dofs, r.dval, 1 , s.dval, 1 );             // s = r
        magma_caxpy( dofs, c_mone * alpha, v.dval, 1 , s.dval, 1 ); // s = s - alpha*v

        // preconditioner: s -> ms -> z
        CHECK( magma_c_applyprecond_left( A, s, &ms, precond_par, queue ));
        CHECK( magma_c_applyprecond_right( A, ms, &z, precond_par, queue ));

        CHECK( magma_c_spmv( c_one, A, z, c_zero, t, queue ));  // t = A z

        // left-preconditioned s and t for the omega quotient
        // NOTE(review): ms was already produced from the same s a few lines
        // above; this first call looks redundant unless the helper has side
        // effects - confirm before removing.
        CHECK( magma_c_applyprecond_left( A, s, &ms, precond_par, queue ));
        CHECK( magma_c_applyprecond_left( A, t, &mt, precond_par, queue ));

        // omega = cdotc(mt,ms) / cdotc(mt,mt)
        omega = magma_cdotc( dofs, mt.dval, 1, ms.dval, 1 )
                   / magma_cdotc( dofs, mt.dval, 1, mt.dval, 1 );

        magma_caxpy( dofs, alpha, y.dval, 1 , x->dval, 1 );     // x = x + alpha*y
        magma_caxpy( dofs, omega, z.dval, 1 , x->dval, 1 );     // x = x + omega*z

        magma_ccopy( dofs, s.dval, 1 , r.dval, 1 );             // r = s
        magma_caxpy( dofs, c_mone * omega, t.dval, 1 , r.dval, 1 ); // r = r - omega*t
        res = betanom = magma_scnrm2( dofs, r.dval, 1 );

        // every verbose-th iteration is logged
        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        // relative stopping test against the initial residual norm
        if ( res/nom0 < solver_par->epsilon ) {
            break;
        }
    }
    while ( solver_par->numiter+1 <= solver_par->maxiter );

    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;

    // final_res is recomputed through magma_cresidualvec, iter_res is the
    // norm of the recurrence residual r from the last iteration
    float residual;
    CHECK( magma_cresidualvec( A, b, *x, &r, &residual, queue));
    solver_par->final_res = residual;
    solver_par->iter_res = res;

    if ( solver_par->numiter < solver_par->maxiter ) {
        info = MAGMA_SUCCESS;
    } else if ( solver_par->init_res > solver_par->final_res ) {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_SLOW_CONVERGENCE;
        if( solver_par->iter_res < solver_par->epsilon*solver_par->init_res ){
            info = MAGMA_SUCCESS;
        }
    }
    else {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_DIVERGENCE;
    }

cleanup:
    magma_cmfree(&r, queue );
    magma_cmfree(&rr, queue );
    magma_cmfree(&p, queue );
    magma_cmfree(&v, queue );
    magma_cmfree(&s, queue );
    magma_cmfree(&t, queue );
    magma_cmfree(&ms, queue );
    magma_cmfree(&mt, queue );
    magma_cmfree(&y, queue );
    magma_cmfree(&z, queue );

    magmablasSetKernelStream( orig_queue );
    solver_par->info = info;
    return info;
}   /* magma_cpbicgstab */
extern "C" magma_int_t magma_ccg( magma_c_matrix A, magma_c_matrix b, magma_c_matrix *x, magma_c_solver_par *solver_par, magma_queue_t queue ) { magma_int_t info = 0; // set queue for old dense routines magma_queue_t orig_queue=NULL; magmablasGetKernelStream( &orig_queue ); // prepare solver feedback solver_par->solver = Magma_CG; solver_par->numiter = 0; solver_par->info = MAGMA_SUCCESS; // local variables magmaFloatComplex c_zero = MAGMA_C_ZERO, c_one = MAGMA_C_ONE; magma_int_t dofs = A.num_rows * b.num_cols; // GPU workspace magma_c_matrix r={Magma_CSR}, p={Magma_CSR}, q={Magma_CSR}; CHECK( magma_cvinit( &r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &p, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_cvinit( &q, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); // solver variables magmaFloatComplex alpha, beta; float nom, nom0, r0, betanom, betanomsq, den; // solver setup CHECK( magma_cresidualvec( A, b, *x, &r, &nom0, queue)); magma_ccopy( dofs, r.dval, 1, p.dval, 1 ); // p = r betanom = nom0; nom = nom0 * nom0; // nom = r' * r CHECK( magma_c_spmv( c_one, A, p, c_zero, q, queue )); // q = A p den = MAGMA_C_REAL( magma_cdotc(dofs, p.dval, 1, q.dval, 1) );// den = p dot q solver_par->init_res = nom0; if ( (r0 = nom * solver_par->epsilon) < ATOLERANCE ) r0 = ATOLERANCE; if ( nom < r0 ) { solver_par->final_res = solver_par->init_res; solver_par->iter_res = solver_par->init_res; goto cleanup; } // check positive definite if (den <= 0.0) { printf("Operator A is not postive definite. 
(Ar,r) = %f\n", den); magmablasSetKernelStream( orig_queue ); info = MAGMA_NONSPD; goto cleanup; } //Chronometry real_Double_t tempo1, tempo2; tempo1 = magma_sync_wtime( queue ); if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = (real_Double_t)nom0; solver_par->timing[0] = 0.0; } solver_par->numiter = 0; // start iteration do { solver_par->numiter++; alpha = MAGMA_C_MAKE(nom/den, 0.); magma_caxpy(dofs, alpha, p.dval, 1, x->dval, 1); // x = x + alpha p magma_caxpy(dofs, -alpha, q.dval, 1, r.dval, 1); // r = r - alpha q betanom = magma_scnrm2(dofs, r.dval, 1); // betanom = || r || betanomsq = betanom * betanom; // betanoms = r' * r if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( betanom < r0 ) { break; } beta = MAGMA_C_MAKE(betanomsq/nom, 0.); // beta = betanoms/nom magma_cscal(dofs, beta, p.dval, 1); // p = beta*p magma_caxpy(dofs, c_one, r.dval, 1, p.dval, 1); // p = p + r CHECK( magma_c_spmv( c_one, A, p, c_zero, q, queue )); // q = A p den = MAGMA_C_REAL(magma_cdotc(dofs, p.dval, 1, q.dval, 1)); // den = p dot q nom = betanomsq; } while ( solver_par->numiter+1 <= solver_par->maxiter ); tempo2 = magma_sync_wtime( queue ); solver_par->runtime = (real_Double_t) tempo2-tempo1; float residual; CHECK( magma_cresidualvec( A, b, *x, &r, &residual, queue)); solver_par->final_res = residual; if ( solver_par->numiter < solver_par->maxiter ) { solver_par->info = MAGMA_SUCCESS; } else if ( solver_par->init_res > solver_par->final_res ) { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; 
} } info = MAGMA_SLOW_CONVERGENCE; if( solver_par->iter_res < solver_par->epsilon*solver_par->init_res ){ info = MAGMA_SUCCESS; } } else { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } info = MAGMA_DIVERGENCE; } cleanup: magma_cmfree(&r, queue ); magma_cmfree(&p, queue ); magma_cmfree(&q, queue ); magmablasSetKernelStream( orig_queue ); solver_par->info = info; return info; } /* magma_ccg */