/**
    Quicksort of a CSR-style triplet (val, col, row) by magnitude of the values.

    Sorts x[first..last] in ascending order of MAGMA_Z_ABS, applying every
    exchange simultaneously to the companion col[] and row[] index arrays so
    that the (value, col, row) triplets stay aligned.

    @param[in,out] x      value array being sorted
    @param[in,out] col    column indices, permuted along with x
    @param[in,out] row    row indices, permuted along with x
    @param[in]     first  index of the first element of the range (inclusive)
    @param[in]     last   index of the last element of the range (inclusive)
    @param[in]     queue  magma queue (passed through to recursive calls)
*/
magma_int_t
magma_zmsort(
    magmaDoubleComplex *x,
    magma_index_t *col,
    magma_index_t *row,
    magma_int_t first,
    magma_int_t last,
    magma_queue_t queue )
{
    magma_int_t info = 0;

    magmaDoubleComplex temp;
    magma_index_t pivot, j, i, tmpcol, tmprow;

    if( first < last ){
        pivot = first;
        i = first;
        j = last;

        while( i < j ){
            // advance i past elements not larger than the pivot
            while( MAGMA_Z_ABS(x[i]) <= MAGMA_Z_ABS(x[pivot]) && i < last )
                i++;
            // retreat j past elements larger than the pivot
            while( MAGMA_Z_ABS(x[j]) > MAGMA_Z_ABS(x[pivot]) )
                j--;
            if( i < j ){
                // swap the value together with its (col,row) companions
                temp = x[i];
                x[i] = x[j];
                x[j] = temp;
                tmpcol = col[i];
                col[i] = col[j];
                col[j] = tmpcol;
                tmprow = row[i];
                row[i] = row[j];
                row[j] = tmprow;
            }
        }

        // move the pivot into its final position.
        // BUG FIX: the col/row companions must be swapped here as well,
        // otherwise the index arrays lose correspondence with the values.
        temp = x[pivot];
        x[pivot] = x[j];
        x[j] = temp;
        tmpcol = col[pivot];
        col[pivot] = col[j];
        col[j] = tmpcol;
        tmprow = row[pivot];
        row[pivot] = row[j];
        row[j] = tmprow;

        CHECK( magma_zmsort( x, col, row, first, j-1, queue ));
        CHECK( magma_zmsort( x, col, row, j+1, last, queue ));
    }
cleanup:
    return info;
}
/**
    In-place quicksort of a complex array by magnitude.

    Recursively sorts x[first..last] into ascending order of MAGMA_Z_ABS.
    The queue argument is carried along for interface uniformity with the
    other sort routines.

    @param[in,out] x      array being sorted
    @param[in]     first  first index of the range (inclusive)
    @param[in]     last   last index of the range (inclusive)
    @param[in]     queue  magma queue (forwarded to recursive calls)
*/
magma_int_t
magma_zsort(
    magmaDoubleComplex *x,
    magma_int_t first,
    magma_int_t last,
    magma_queue_t queue )
{
    magma_int_t info = 0;

    magmaDoubleComplex swap;
    magma_index_t piv, lo, hi;

    if( first < last ){
        piv = first;
        lo  = first;
        hi  = last;

        while( lo < hi ){
            // scan forward over elements not larger than the pivot
            while( MAGMA_Z_ABS(x[lo]) <= MAGMA_Z_ABS(x[piv]) && lo < last )
                lo++;
            // scan backward over elements larger than the pivot
            while( MAGMA_Z_ABS(x[hi]) > MAGMA_Z_ABS(x[piv]) )
                hi--;
            if( lo < hi ){
                swap  = x[lo];
                x[lo] = x[hi];
                x[hi] = swap;
            }
        }

        // place the pivot into its final position
        swap   = x[piv];
        x[piv] = x[hi];
        x[hi]  = swap;

        CHECK( magma_zsort( x, first, hi-1, queue ));
        CHECK( magma_zsort( x, hi+1, last, queue ));
    }
cleanup:
    return info;
}
/**
    Preconditioned CG with merged kernels (fewer synchronization points).

    Solves A x = b for Hermitian positive definite A, using fused
    SpMV/dot/update kernels and a small device-side scalar array skp.
    Convergence status and residual history are reported through solver_par.

    @param[in]     A            system matrix (device)
    @param[in]     b            right-hand side (device)
    @param[in,out] x            solution vector, updated in place
    @param[in,out] solver_par   solver parameters / convergence feedback
    @param[in,out] precond_par  preconditioner (Jacobi, none, or general)
    @param[in]     queue        magma queue
*/
extern "C" magma_int_t
magma_zpcg_merge(
    magma_z_matrix A, magma_z_matrix b, magma_z_matrix *x,
    magma_z_solver_par *solver_par,
    magma_z_preconditioner *precond_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;

    // prepare solver feedback
    solver_par->solver = Magma_PCGMERGE;
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;

    // solver variables
    magmaDoubleComplex alpha, beta, gamma, rho, tmp1, *skp_h={0};
    double nom, nom0, r0, res, nomb;
    magmaDoubleComplex den;

    // some useful variables
    magmaDoubleComplex c_zero = MAGMA_Z_ZERO, c_one = MAGMA_Z_ONE;
    magma_int_t dofs = A.num_rows*b.num_cols;

    magma_z_matrix r={Magma_CSR}, d={Magma_CSR}, z={Magma_CSR},
                   h={Magma_CSR}, rt={Magma_CSR};
    magmaDoubleComplex *d1=NULL, *d2=NULL, *skp=NULL;

    // GPU workspace
    CHECK( magma_zvinit( &r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &d, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &z, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &rt, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &h, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zmalloc( &d1, dofs*(2) ));
    CHECK( magma_zmalloc( &d2, dofs*(2) ));
    // array for the parameters: skp = [alpha|beta|gamma|rho|tmp1|tmp2|res]
    CHECK( magma_zmalloc( &skp, 7 ));

    // solver setup
    CHECK( magma_zresidualvec( A, b, *x, &r, &nom0, queue));

    // preconditioner
    CHECK( magma_z_applyprecond_left( MagmaNoTrans, A, r, &rt, precond_par, queue ));
    CHECK( magma_z_applyprecond_right( MagmaNoTrans, A, rt, &h, precond_par, queue ));

    magma_zcopy( dofs, h.dval, 1, d.dval, 1, queue );
    nom = MAGMA_Z_ABS( magma_zdotc( dofs, r.dval, 1, h.dval, 1, queue ));
    CHECK( magma_z_spmv( c_one, A, d, c_zero, z, queue ));          // z = A d
    den = magma_zdotc( dofs, d.dval, 1, z.dval, 1, queue );         // den = d'* z
    solver_par->init_res = nom0;

    nomb = magma_dznrm2( dofs, b.dval, 1, queue );
    if ( nomb == 0.0 ){
        nomb = 1.0;
    }
    if ( (r0 = nomb * solver_par->rtol) < ATOLERANCE ){
        r0 = ATOLERANCE;
    }
    solver_par->final_res = solver_par->init_res;
    solver_par->iter_res = solver_par->init_res;
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = (real_Double_t)nom0;
        solver_par->timing[0] = 0.0;
    }
    if ( nom < r0 ) {
        info = MAGMA_SUCCESS;
        goto cleanup;
    }
    // check positive definite
    if ( MAGMA_Z_ABS(den) <= 0.0 ) {
        info = MAGMA_NONSPD;
        goto cleanup;
    }

    // array on host for the parameters
    CHECK( magma_zmalloc_cpu( &skp_h, 7 ));

    alpha = rho = gamma = tmp1 = c_one;
    beta = magma_zdotc( dofs, h.dval, 1, r.dval, 1, queue );
    skp_h[0]=alpha;
    skp_h[1]=beta;
    skp_h[2]=gamma;
    skp_h[3]=rho;
    skp_h[4]=tmp1;
    skp_h[5]=MAGMA_Z_MAKE(nom, 0.0);
    skp_h[6]=MAGMA_Z_MAKE(nom, 0.0);
    magma_zsetvector( 7, skp_h, 1, skp, 1, queue );

    //Chronometry
    real_Double_t tempo1, tempo2, tempop1, tempop2;
    tempo1 = magma_sync_wtime( queue );

    solver_par->numiter = 0;
    solver_par->spmv_count = 0;
    // start iteration
    do {
        solver_par->numiter++;

        // computes SpMV and dot product
        CHECK( magma_zcgmerge_spmv1( A, d1, d2, d.dval, z.dval, skp, queue ));
        solver_par->spmv_count++;

        if( precond_par->solver == Magma_JACOBI ){
            // fused Jacobi-preconditioned update of x, r, d and scalars
            CHECK( magma_zjcgmerge_xrbeta( dofs, d1, d2, precond_par->d.dval,
                x->dval, r.dval, d.dval, z.dval, h.dval, skp, queue ));
        }
        else if( precond_par->solver == Magma_NONE ){
            // updates x, r
            CHECK( magma_zpcgmerge_xrbeta1( dofs, x->dval, r.dval,
                d.dval, z.dval, skp, queue ));
            // computes scalars and updates d
            CHECK( magma_zpcgmerge_xrbeta2( dofs, d1, d2, r.dval, r.dval,
                d.dval, skp, queue ));
        }
        else {
            // updates x, r
            CHECK( magma_zpcgmerge_xrbeta1( dofs, x->dval, r.dval,
                d.dval, z.dval, skp, queue ));
            // preconditioner in between (timed separately)
            tempop1 = magma_sync_wtime( queue );
            CHECK( magma_z_applyprecond_left( MagmaNoTrans, A, r, &rt, precond_par, queue ));
            CHECK( magma_z_applyprecond_right( MagmaNoTrans, A, rt, &h, precond_par, queue ));
            tempop2 = magma_sync_wtime( queue );
            precond_par->runtime += tempop2-tempop1;
            // computes scalars and updates d
            CHECK( magma_zpcgmerge_xrbeta2( dofs, d1, d2, h.dval, r.dval,
                d.dval, skp, queue ));
        }

        // check stopping criterion (asynchronous copy)
        magma_zgetvector( 1 , skp+6, 1, skp_h+6, 1, queue );
        res = sqrt(MAGMA_Z_ABS(skp_h[6]));

        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        if ( res/nomb <= solver_par->rtol || res <= solver_par->atol ){
            break;
        }
    }
    while ( solver_par->numiter+1 <= solver_par->maxiter );

    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    double residual;
    CHECK( magma_zresidualvec( A, b, *x, &r, &residual, queue));
    solver_par->iter_res = res;
    solver_par->final_res = residual;

    if ( solver_par->numiter < solver_par->maxiter ) {
        info = MAGMA_SUCCESS;
    } else if ( solver_par->init_res > solver_par->final_res ) {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_SLOW_CONVERGENCE;
        if( solver_par->iter_res < solver_par->atol ||
            solver_par->iter_res/solver_par->init_res < solver_par->rtol ){
            info = MAGMA_SUCCESS;
        }
    }
    else {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        // BUG FIX: was `solver_par->info = MAGMA_DIVERGENCE;` which the
        // `solver_par->info = info;` in cleanup immediately overwrote with
        // MAGMA_NOTCONVERGED. Set the local status, as the sibling solvers do.
        info = MAGMA_DIVERGENCE;
    }

cleanup:
    magma_zmfree(&r, queue );
    magma_zmfree(&z, queue );
    magma_zmfree(&d, queue );
    magma_zmfree(&rt, queue );
    magma_zmfree(&h, queue );
    magma_free( d1 );
    magma_free( d2 );
    magma_free( skp );
    magma_free_cpu( skp_h );

    solver_par->info = info;
    return info;
}   /* magma_zpcg_merge */
/**
    Quasi-Minimal Residual (QMR) method with merged kernels, no preconditioner.

    Solves A x = b for a general matrix A. Requires A' (conjugate transpose),
    which is built once up front via a CPU round-trip. Iteration scalars that
    become NaN/Inf abort with MAGMA_DIVERGENCE.

    @param[in]     A           system matrix (device)
    @param[in]     b           right-hand side (device)
    @param[in,out] x           solution vector, updated in place
    @param[in,out] solver_par  solver parameters / convergence feedback
    @param[in]     queue       magma queue
*/
extern "C" magma_int_t
magma_zqmr_merge(
    magma_z_matrix A, magma_z_matrix b,
    magma_z_matrix *x, magma_z_solver_par *solver_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;

    // prepare solver feedback
    solver_par->solver = Magma_QMRMERGE;
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;

    // local variables
    magmaDoubleComplex c_zero = MAGMA_Z_ZERO, c_one = MAGMA_Z_ONE;
    // solver variables
    double nom0, r0, res=0, nomb;
    magmaDoubleComplex rho = c_one, rho1 = c_one, eta = -c_one , pds = c_one,
                        thet = c_one, thet1 = c_one, epsilon = c_one,
                        beta = c_one, delta = c_one, pde = c_one, rde = c_one,
                        gamm = c_one, gamm1 = c_one, psi = c_one;

    magma_int_t dofs = A.num_rows* b.num_cols;

    // need to transpose the matrix
    magma_z_matrix AT={Magma_CSR}, Ah1={Magma_CSR}, Ah2={Magma_CSR};

    // GPU workspace
    magma_z_matrix r={Magma_CSR}, r_tld={Magma_CSR}, v={Magma_CSR},
                   w={Magma_CSR}, wt={Magma_CSR}, d={Magma_CSR}, s={Magma_CSR},
                   z={Magma_CSR}, q={Magma_CSR}, p={Magma_CSR}, pt={Magma_CSR},
                   y={Magma_CSR};
    CHECK( magma_zvinit( &r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &r_tld, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &v, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &w, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &wt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &d, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &s, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &z, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &q, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &p, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &pt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &y, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));

    // solver setup
    CHECK( magma_zresidualvec( A, b, *x, &r, &nom0, queue));
    solver_par->init_res = nom0;
    magma_zcopy( dofs, r.dval, 1, r_tld.dval, 1, queue );
    magma_zcopy( dofs, r.dval, 1, y.dval, 1, queue );
    magma_zcopy( dofs, r.dval, 1, v.dval, 1, queue );
    magma_zcopy( dofs, r.dval, 1, wt.dval, 1, queue );
    magma_zcopy( dofs, r.dval, 1, z.dval, 1, queue );

    // transpose the matrix: AT = conj(A)^T, preserving A's storage format
    magma_zmtransfer( A, &Ah1, Magma_DEV, Magma_CPU, queue );
    magma_zmconvert( Ah1, &Ah2, A.storage_type, Magma_CSR, queue );
    magma_zmfree(&Ah1, queue );
    magma_zmtransposeconjugate( Ah2, &Ah1, queue );
    magma_zmfree(&Ah2, queue );
    Ah2.blocksize = A.blocksize;
    Ah2.alignment = A.alignment;
    magma_zmconvert( Ah1, &Ah2, Magma_CSR, A.storage_type, queue );
    magma_zmfree(&Ah1, queue );
    magma_zmtransfer( Ah2, &AT, Magma_CPU, Magma_DEV, queue );
    magma_zmfree(&Ah2, queue );

    nomb = magma_dznrm2( dofs, b.dval, 1, queue );
    if ( nomb == 0.0 ){
        nomb=1.0;
    }
    if ( (r0 = nomb * solver_par->rtol) < ATOLERANCE ){
        r0 = ATOLERANCE;
    }
    solver_par->final_res = solver_par->init_res;
    solver_par->iter_res = solver_par->init_res;
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = (real_Double_t)nom0;
        solver_par->timing[0] = 0.0;
    }
    if ( nom0 < r0 ) {
        info = MAGMA_SUCCESS;
        goto cleanup;
    }

    psi = magma_zsqrt( magma_zdotc( dofs, z.dval, 1, z.dval, 1, queue ));
    rho = magma_zsqrt( magma_zdotc( dofs, y.dval, 1, y.dval, 1, queue ));

    // v = y / rho;  y = y / rho;  w = wt / psi;  z = z / psi
    magma_zqmr_1( r.num_rows, r.num_cols, rho, psi,
                  y.dval, z.dval, v.dval, w.dval, queue );

    //Chronometry
    real_Double_t tempo1, tempo2;
    tempo1 = magma_sync_wtime( queue );

    solver_par->numiter = 0;
    solver_par->spmv_count = 0;
    // start iteration
    do {
        solver_par->numiter++;
        if( magma_z_isnan_inf( rho ) || magma_z_isnan_inf( psi ) ){
            info = MAGMA_DIVERGENCE;
            break;
        }
        // delta = z' * y;
        delta = magma_zdotc( dofs, z.dval, 1, y.dval, 1, queue );
        if( magma_z_isnan_inf( delta ) ){
            info = MAGMA_DIVERGENCE;
            break;
        }
        // no precond: yt = y, zt = z
        if( solver_par->numiter == 1 ){
            // p = y;  q = z;
            magma_zcopy( dofs, y.dval, 1, p.dval, 1, queue );
            magma_zcopy( dofs, z.dval, 1, q.dval, 1, queue );
        }
        else{
            pde = psi * delta / epsilon;
            rde = rho * MAGMA_Z_CONJ(delta/epsilon);
            // p = y - pde * p;  q = z - rde * q
            magma_zqmr_2( r.num_rows, r.num_cols, pde, rde,
                          y.dval, z.dval, p.dval, q.dval, queue );
        }
        if( magma_z_isnan_inf( rho ) || magma_z_isnan_inf( psi ) ){
            info = MAGMA_DIVERGENCE;
            break;
        }

        CHECK( magma_z_spmv( c_one, A, p, c_zero, pt, queue ));
        solver_par->spmv_count++;
        // epsilon = q' * pt;
        epsilon = magma_zdotc( dofs, q.dval, 1, pt.dval, 1, queue );
        beta = epsilon / delta;
        if( magma_z_isnan_inf( epsilon ) || magma_z_isnan_inf( beta ) ){
            info = MAGMA_DIVERGENCE;
            break;
        }
        // v = pt - beta * v;  y = v
        magma_zqmr_3( r.num_rows, r.num_cols, beta,
                      pt.dval, v.dval, y.dval, queue );

        rho1 = rho;
        // rho = norm(y);
        rho = magma_zsqrt( magma_zdotc( dofs, y.dval, 1, y.dval, 1, queue ));

        // wt = A' * q - beta' * w;
        CHECK( magma_z_spmv( c_one, AT, q, c_zero, wt, queue ));
        solver_par->spmv_count++;
        magma_zaxpy( dofs, - MAGMA_Z_CONJ( beta ), w.dval, 1, wt.dval, 1, queue );

        // no precond: z = wt
        magma_zcopy( dofs, wt.dval, 1, z.dval, 1, queue );

        thet1 = thet;
        thet = rho / (gamm * MAGMA_Z_MAKE( MAGMA_Z_ABS(beta), 0.0 ));
        gamm1 = gamm;
        gamm = c_one / magma_zsqrt(c_one + thet*thet);
        eta = - eta * rho1 * gamm * gamm / (beta * gamm1 * gamm1);
        if( magma_z_isnan_inf( thet ) || magma_z_isnan_inf( gamm ) ||
            magma_z_isnan_inf( eta ) ){
            info = MAGMA_DIVERGENCE;
            break;
        }

        if( solver_par->numiter == 1 ){
            // d = eta * p;  s = eta * pt;  x = x + d;  r = r - s;
            magma_zqmr_4( r.num_rows, r.num_cols, eta,
                          p.dval, pt.dval, d.dval, s.dval,
                          x->dval, r.dval, queue );
        }
        else{
            pds = (thet1 * gamm) * (thet1 * gamm);
            // d = eta * p + pds * d;  s = eta * pt + pds * s;
            // x = x + d;  r = r - s;
            magma_zqmr_5( r.num_rows, r.num_cols, eta, pds,
                          p.dval, pt.dval, d.dval, s.dval,
                          x->dval, r.dval, queue );
        }
        // psi = norm(z);
        psi = magma_zsqrt( magma_zdotc( dofs, z.dval, 1, z.dval, 1, queue ) );

        res = magma_dznrm2( dofs, r.dval, 1, queue );

        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            // FIX: compare against the integer 0, not the complex constant
            // c_zero (numiter % verbose is a magma_int_t), matching the other
            // solvers in this file.
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        // v = y / rho;  y = y / rho;  w = wt / psi;  z = z / psi
        magma_zqmr_1( r.num_rows, r.num_cols, rho, psi,
                      y.dval, z.dval, v.dval, w.dval, queue );

        if ( res/nomb <= solver_par->rtol || res <= solver_par->atol ){
            break;
        }
    }
    while ( solver_par->numiter+1 <= solver_par->maxiter );

    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    double residual;
    CHECK( magma_zresidualvec( A, b, *x, &r, &residual, queue));
    solver_par->iter_res = res;
    solver_par->final_res = residual;

    // NOTE(review): info is never MAGMA_SUCCESS here unless set above, so a
    // run that converged via the residual break is classified through the
    // SLOW_CONVERGENCE branch below — kept as-is to preserve behavior.
    if ( solver_par->numiter < solver_par->maxiter && info == MAGMA_SUCCESS ) {
        info = MAGMA_SUCCESS;
    } else if ( solver_par->init_res > solver_par->final_res ) {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_SLOW_CONVERGENCE;
        if( solver_par->iter_res < solver_par->rtol*solver_par->init_res ||
            solver_par->iter_res < solver_par->atol ) {
            info = MAGMA_SUCCESS;
        }
    }
    else {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_DIVERGENCE;
    }

cleanup:
    magma_zmfree(&r, queue );
    magma_zmfree(&r_tld, queue );
    magma_zmfree(&v, queue );
    magma_zmfree(&w, queue );
    magma_zmfree(&wt, queue );
    magma_zmfree(&d, queue );
    magma_zmfree(&s, queue );
    magma_zmfree(&z, queue );
    magma_zmfree(&q, queue );
    magma_zmfree(&p, queue );
    magma_zmfree(&pt, queue );
    magma_zmfree(&y, queue );
    magma_zmfree(&AT, queue );
    magma_zmfree(&Ah1, queue );
    magma_zmfree(&Ah2, queue );

    solver_par->info = info;
    return info;
}   /* magma_zqmr_merge */
/**
    Quickselect: finds the k-th order statistic (by magnitude) of val[0..length-1].

    Partitions around val[length-1] as pivot, then recurses into the side
    containing position k. On return, *element holds the selected value and
    val is partially reordered. Aborts with MAGMA_ERR_NAN if a NaN/Inf entry
    is encountered (note: the pivot element val[length-1] itself is not
    NaN-checked by the loop).

    @param[in,out] val      array being partially reordered
    @param[in]     length   number of elements in val
    @param[in]     k        rank of the element to select (0-based)
    @param[in]     r        direction flag: 0 selects in ascending magnitude
                            order, nonzero in descending order
    @param[out]    element  receives the selected value
    @param[in]     queue    magma queue (forwarded to recursive calls)

    NOTE(review): SWAP is a macro defined elsewhere in this file — presumably
    exchanging val entries via the local `tmp`; confirm its definition.
*/
magma_int_t
magma_zorderstatistics(
    magmaDoubleComplex *val,
    magma_int_t length,
    magma_int_t k,
    magma_int_t r,
    magmaDoubleComplex *element,
    magma_queue_t queue )
{
    magma_int_t info = 0;

    magma_int_t i, st;
    magmaDoubleComplex tmp;   // scratch for the SWAP macro

    if( r == 0 ){
        // ascending order: move elements <= pivot to the front
        for ( st = i = 0; i < length - 1; i++ ) {
            if ( magma_z_isnan_inf( val[i]) ) {
                printf("error: array contains %f + %fi.\n", MAGMA_Z_REAL(val[i]), MAGMA_Z_IMAG(val[i]) );
                info = MAGMA_ERR_NAN;
                goto cleanup;
            }
            if ( MAGMA_Z_ABS(val[i]) > MAGMA_Z_ABS(val[length-1]) ){
                continue;
            }
            SWAP(i, st);
            st++;
        }
        // place the pivot at its final position st
        SWAP(length-1, st);

        if ( k == st ){
            *element = val[st];
        }
        else if ( st > k ) {
            // k-th element lies in the left partition
            CHECK( magma_zorderstatistics( val, st, k, r, element, queue ));
        }
        else {
            // k-th element lies in the right partition; shift base and rank
            CHECK( magma_zorderstatistics( val+st, length-st, k-st, r, element, queue ));
        }
    } else {
        // descending order: move elements >= pivot to the front
        for ( st = i = 0; i < length - 1; i++ ) {
            if ( magma_z_isnan_inf( val[i]) ) {
                printf("error: array contains %f + %fi.\n", MAGMA_Z_REAL(val[i]), MAGMA_Z_IMAG(val[i]) );
                info = MAGMA_ERR_NAN;
                goto cleanup;
            }
            if ( MAGMA_Z_ABS(val[i]) < MAGMA_Z_ABS(val[length-1]) ){
                continue;
            }
            SWAP(i, st);
            st++;
        }
        SWAP(length-1, st);

        if ( k == st ){
            *element = val[st];
        }
        else if ( st > k ) {
            CHECK( magma_zorderstatistics( val, st, k, r, element, queue ));
        }
        else {
            CHECK( magma_zorderstatistics( val+st, length-st, k-st, r, element, queue ));
        }
    }

cleanup:
    return info;
}
/**
    Purpose
    -------
    ZLAQPS computes a step of QR factorization with column pivoting
    of a complex M-by-N matrix A by using Blas-3. It tries to factorize
    NB columns from A starting from the row OFFSET+1, and updates all
    of the matrix with Blas-3 xGEMM.

    In some cases, due to catastrophic cancellations, it cannot
    factorize NB columns. Hence, the actual number of factorized
    columns is returned in KB.

    Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A. N >= 0

    @param[in]
    offset  INTEGER
            The number of rows of A that have been factorized in
            previous steps.

    @param[in]
    nb      INTEGER
            The number of columns to factorize.

    @param[out]
    kb      INTEGER
            The number of columns actually factorized.

    @param[in,out]
    A       COMPLEX_16 array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, block A(OFFSET+1:M,1:KB) is the triangular
            factor obtained and block A(1:OFFSET,1:N) has been
            accordingly pivoted, but not factorized.
            The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has
            been updated.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A. LDA >= max(1,M).

    @param[in,out]
    jpvt    INTEGER array, dimension (N)
            JPVT(I) = K <==> Column K of the full matrix A has been
            permuted into position I in AP.

    @param[out]
    tau     COMPLEX_16 array, dimension (KB)
            The scalar factors of the elementary reflectors.

    @param[in,out]
    vn1     DOUBLE PRECISION array, dimension (N)
            The vector with the partial column norms.

    @param[in,out]
    vn2     DOUBLE PRECISION array, dimension (N)
            The vector with the exact column norms.

    @param[in,out]
    auxv    COMPLEX_16 array, dimension (NB)
            Auxiliary vector.

    @param[in,out]
    F       COMPLEX_16 array, dimension (LDF,NB)
            Matrix F' = L*Y'*A.

    @param[in]
    ldf     INTEGER
            The leading dimension of the array F. LDF >= max(1,N).

    @ingroup magma_zgeqp3_aux
    ********************************************************************/
extern "C" magma_int_t
magma_zlaqps(
    magma_int_t m, magma_int_t n, magma_int_t offset,
    magma_int_t nb, magma_int_t *kb,
    magmaDoubleComplex *A,  magma_int_t lda,
    magmaDoubleComplex_ptr dA, magma_int_t ldda,
    magma_int_t *jpvt, magmaDoubleComplex *tau, double *vn1, double *vn2,
    magmaDoubleComplex *auxv,
    magmaDoubleComplex *F,  magma_int_t ldf,
    magmaDoubleComplex_ptr dF, magma_int_t lddf)
{
    // 0-based element-address macros over the host (A, F) and device (dA, dF)
    // copies of the panel data
#define A(i, j) (A + (i) + (j)*(lda ))
#define dA(i, j) (dA + (i) + (j)*(ldda))
#define F(i, j) (F + (i) + (j)*(ldf ))
#define dF(i, j) (dF + (i) + (j)*(lddf))

    magmaDoubleComplex c_zero = MAGMA_Z_MAKE( 0.,0.);
    magmaDoubleComplex c_one = MAGMA_Z_MAKE( 1.,0.);
    magmaDoubleComplex c_neg_one = MAGMA_Z_MAKE(-1.,0.);
    magma_int_t ione = 1;

    magma_int_t i__1, i__2;       // scratch dimensions for BLAS calls
    double d__1;
    magmaDoubleComplex z__1;

    magma_int_t j, k, rk;
    magmaDoubleComplex Akk;
    magma_int_t pvt;
    double temp, temp2, tol3z;
    magma_int_t itemp;

    magma_int_t lsticc;           // first column whose norm estimate collapsed
    magma_int_t lastrk;

    lastrk = min( m, n + offset );
    tol3z = magma_dsqrt( lapackf77_dlamch("Epsilon"));

    magma_queue_t queue;
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );

    lsticc = 0;
    k = 0;
    while( k < nb && lsticc == 0 ) {
        rk = offset + k;

        /* Determine ith pivot column and swap if necessary */
        // subtract 1 from Fortran idamax; pvt, k are 0-based.
        i__1 = n-k;
        pvt = k + blasf77_idamax( &i__1, &vn1[k], &ione ) - 1;

        if (pvt != k) {
            if (pvt >= nb) {
                /* 1. Start copy from GPU */
                magma_zgetmatrix_async( m - offset - nb, 1,
                                        dA(offset + nb, pvt), ldda,
                                        A (offset + nb, pvt), lda, queue );
            }

            /* F gets swapped so F must be sent at the end to GPU */
            i__1 = k;
            blasf77_zswap( &i__1, F(pvt,0), &ldf, F(k,0), &ldf );
            itemp = jpvt[pvt];
            jpvt[pvt] = jpvt[k];
            jpvt[k] = itemp;
            vn1[pvt] = vn1[k];
            vn2[pvt] = vn2[k];

            if (pvt < nb) {
                /* no need of transfer if pivot is within the panel */
                blasf77_zswap( &m, A(0, pvt), &ione, A(0, k), &ione );
            }
            else {
                /* 1. Finish copy from GPU */
                magma_queue_sync( queue );
                /* 2. Swap as usual on CPU */
                blasf77_zswap(&m, A(0, pvt), &ione, A(0, k), &ione);
                /* 3. Restore the GPU */
                magma_zsetmatrix_async( m - offset - nb, 1,
                                        A (offset + nb, pvt), lda,
                                        dA(offset + nb, pvt), ldda, queue );
            }
        }

        /* Apply previous Householder reflectors to column K:
           A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'.
           Optimization: multiply with beta=0; wait for vector and subtract */
        if (k > 0) {
            // conjugate row k of F before/after the gemv (complex case only)
#ifdef COMPLEX
            for (j = 0; j < k; ++j) {
                *F(k,j) = MAGMA_Z_CONJ( *F(k,j) );
            }
#endif
            i__1 = m - rk;
            i__2 = k;
            blasf77_zgemv( MagmaNoTransStr, &i__1, &i__2,
                           &c_neg_one, A(rk, 0), &lda,
                                       F(k, 0), &ldf,
                           &c_one,     A(rk, k), &ione );
#ifdef COMPLEX
            for (j = 0; j < k; ++j) {
                *F(k,j) = MAGMA_Z_CONJ( *F(k,j) );
            }
#endif
        }

        /* Generate elementary reflector H(k). */
        if (rk < m-1) {
            i__1 = m - rk;
            lapackf77_zlarfg( &i__1, A(rk, k), A(rk + 1, k), &ione, &tau[k] );
        }
        else {
            lapackf77_zlarfg( &ione, A(rk, k), A(rk, k), &ione, &tau[k] );
        }

        // stash the diagonal entry and set it to 1 for the rank-1 updates
        Akk = *A(rk, k);
        *A(rk, k) = c_one;

        /* Compute Kth column of F:
           Compute F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */
        if (k < n-1) {
            i__1 = m - rk;
            i__2 = n - k - 1;

            /* Send the vector to the GPU */
            magma_zsetmatrix( i__1, 1, A(rk, k), lda, dA(rk,k), ldda, queue );

            /* Multiply on GPU */
            // was CALL ZGEMV( 'Conjugate transpose', M-RK+1, N-K,
            //                 TAU( K ), A( RK,  K+1 ), LDA,
            //                           A( RK,  K   ), 1,
            //                 CZERO,    F( K+1, K   ), 1 )
            // split between GPU (trailing part) and CPU (panel part):
            magma_int_t i__3 = nb-k-1;
            magma_int_t i__4 = i__2 - i__3;
            magma_int_t i__5 = nb-k;
            magma_zgemv( MagmaConjTrans, i__1 - i__5, i__2 - i__3,
                         tau[k], dA(rk +i__5, k+1+i__3), ldda,
                                 dA(rk +i__5, k ), ione,
                         c_zero, dF(k+1+i__3, k ), ione, queue );

            magma_zgetmatrix_async( i__2-i__3, 1,
                                    dF(k + 1 +i__3, k), i__2,
                                    F (k + 1 +i__3, k), i__2, queue );

            blasf77_zgemv( MagmaConjTransStr, &i__1, &i__3,
                           &tau[k], A(rk, k+1), &lda,
                                    A(rk, k  ), &ione,
                           &c_zero, F(k+1, k  ), &ione );

            magma_queue_sync( queue );

            blasf77_zgemv( MagmaConjTransStr, &i__5, &i__4,
                           &tau[k], A(rk, k+1+i__3), &lda,
                                    A(rk, k  ), &ione,
                           &c_one,  F(k+1+i__3, k  ), &ione );
        }

        /* Padding F(1:K,K) with zeros. */
        for (j = 0; j < k; ++j) {
            *F(j, k) = c_zero;
        }

        /* Incremental updating of F:
           F(1:N,K) := F(1:N,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K). */
        if (k > 0) {
            i__1 = m - rk;
            i__2 = k;
            z__1 = MAGMA_Z_NEGATE( tau[k] );
            blasf77_zgemv( MagmaConjTransStr, &i__1, &i__2,
                           &z__1,   A(rk, 0), &lda,
                                    A(rk, k), &ione,
                           &c_zero, auxv, &ione );

            i__1 = k;
            blasf77_zgemv( MagmaNoTransStr, &n, &i__1,
                           &c_one, F(0,0), &ldf,
                                   auxv, &ione,
                           &c_one, F(0,k), &ione );
        }

        /* Optimization: On the last iteration start sending F back to the GPU */

        /* Update the current row of A:
           A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'. */
        if (k < n-1) {
            i__1 = n - k - 1;
            i__2 = k + 1;
            blasf77_zgemm( MagmaNoTransStr, MagmaConjTransStr, &ione, &i__1, &i__2,
                           &c_neg_one, A(rk, 0  ), &lda,
                                       F(k+1,0  ), &ldf,
                           &c_one,     A(rk, k+1), &lda );
        }

        /* Update partial column norms. */
        if (rk < lastrk) {
            for (j = k + 1; j < n; ++j) {
                if (vn1[j] != 0.) {
                    /* NOTE: The following 4 lines follow from the analysis in
                       Lapack Working Note 176. */
                    temp = MAGMA_Z_ABS( *A(rk,j) ) / vn1[j];
                    temp = max( 0., ((1. + temp) * (1. - temp)) );

                    d__1 = vn1[j] / vn2[j];
                    temp2 = temp * (d__1 * d__1);

                    if (temp2 <= tol3z) {
                        // norm estimate collapsed: chain the column into the
                        // recomputation list via vn2, flag with lsticc
                        vn2[j] = (double) lsticc;
                        lsticc = j;
                    }
                    else {
                        vn1[j] *= magma_dsqrt(temp);
                    }
                }
            }
        }

        // restore the stashed diagonal entry
        *A(rk, k) = Akk;

        ++k;
    }
    // leave k as the last column done
    --k;
    *kb = k + 1;
    rk = offset + *kb - 1;

    /* Apply the block reflector to the rest of the matrix:
       A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) -
                                  A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)' */
    if (*kb < min(n, m - offset)) {
        i__1 = m - rk - 1;
        i__2 = n - *kb;

        /* Send F to the GPU */
        magma_zsetmatrix( i__2, *kb,
                          F (*kb, 0), ldf,
                          dF(*kb, 0), i__2, queue );

        magma_zgemm( MagmaNoTrans, MagmaConjTrans, i__1, i__2, *kb,
                     c_neg_one, dA(rk+1, 0  ), ldda,
                                dF(*kb,  0  ), i__2,
                     c_one,     dA(rk+1, *kb), ldda, queue );
    }

    /* Recomputation of difficult columns. */
    while( lsticc > 0 ) {
        // vn2[lsticc] holds the (rounded) index of the next column in the
        // chain; recover it before overwriting
        itemp = (magma_int_t)(vn2[lsticc] >= 0. ? floor(vn2[lsticc] + .5) : -floor(.5 - vn2[lsticc]));
        i__1 = m - rk - 1;
        if (lsticc <= nb) {
            // column is entirely on the CPU
            vn1[lsticc] = magma_cblas_dznrm2( i__1, A(rk+1,lsticc), ione );
        }
        else {
            /* Where is the data, CPU or GPU ? */
            // split norm: head on CPU, tail on GPU
            double r1, r2;

            r1 = magma_cblas_dznrm2( nb-k, A(rk+1,lsticc), ione );
            r2 = magma_dznrm2( m-offset-nb, dA(offset + nb + 1, lsticc), ione, queue );

            //vn1[lsticc] = magma_dznrm2( i__1, dA(rk + 1, lsticc), ione, queue );
            vn1[lsticc] = magma_dsqrt(r1*r1 + r2*r2);
        }

        /* NOTE: The computation of VN1( LSTICC ) relies on the fact that
           SNRM2 does not fail on vectors with norm below the value of
           SQRT(DLAMCH('S')) */
        vn2[lsticc] = vn1[lsticc];
        lsticc = itemp;
    }

    magma_queue_destroy( queue );

    return MAGMA_SUCCESS;
} /* magma_zlaqps */
/**
    Conjugate Gradient method (residual-based stopping criterion).

    Solves A x = b for Hermitian positive definite A. Reports convergence
    status, residual history, and timings through solver_par.

    @param[in]     A           system matrix (device)
    @param[in]     b           right-hand side (device)
    @param[in,out] x           solution vector, updated in place
    @param[in,out] solver_par  solver parameters / convergence feedback
    @param[in]     queue       magma queue
*/
extern "C" magma_int_t
magma_zcg_res(
    magma_z_matrix A, magma_z_matrix b, magma_z_matrix *x,
    magma_z_solver_par *solver_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;

    // prepare solver feedback
    solver_par->solver = Magma_CG;
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;

    // solver variables
    magmaDoubleComplex alpha, beta;
    double nom, nom0, r0, res, nomb;
    magmaDoubleComplex den, gammanew, gammaold = MAGMA_Z_MAKE(1.0,0.0);

    // local variables
    magmaDoubleComplex c_zero = MAGMA_Z_ZERO, c_one = MAGMA_Z_ONE;
    magma_int_t dofs = A.num_rows* b.num_cols;

    // GPU workspace: residual, search direction, A*p
    magma_z_matrix r={Magma_CSR}, p={Magma_CSR}, q={Magma_CSR};
    CHECK( magma_zvinit( &r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &p, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &q, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));

    // solver setup
    CHECK( magma_zresidualvec( A, b, *x, &r, &nom0, queue));
    magma_zcopy( dofs, r.dval, 1, p.dval, 1, queue );                    // p = h
    // nom is <r,r> (squared residual norm)
    nom = MAGMA_Z_ABS( magma_zdotc( dofs, r.dval, 1, r.dval, 1, queue) );
    CHECK( magma_z_spmv( c_one, A, p, c_zero, q, queue ));               // q = A p
    solver_par->spmv_count++;
    den = magma_zdotc( dofs, p.dval, 1, q.dval, 1, queue );              // den = p dot q
    solver_par->init_res = nom0;

    nomb = magma_dznrm2( dofs, b.dval, 1, queue );
    if ( nomb == 0.0 ){
        nomb=1.0;
    }
    if ( (r0 = nomb * solver_par->rtol) < ATOLERANCE ){
        r0 = ATOLERANCE;
    }
    solver_par->final_res = solver_par->init_res;
    solver_par->iter_res = solver_par->init_res;
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = (real_Double_t)nom0;
        solver_par->timing[0] = 0.0;
    }
    // early exit if already converged
    if ( nom < r0 ) {
        info = MAGMA_SUCCESS;
        goto cleanup;
    }
    // check positive definite
    if ( MAGMA_Z_ABS(den) <= 0.0 ) {
        info = MAGMA_NONSPD;
        goto cleanup;
    }

    //Chronometry
    real_Double_t tempo1, tempo2;
    tempo1 = magma_sync_wtime( queue );

    solver_par->numiter = 0;
    solver_par->spmv_count = 0;
    // start iteration
    do {
        solver_par->numiter++;

        gammanew = magma_zdotc( dofs, r.dval, 1, r.dval, 1, queue );     // gn = < r,r>

        if ( solver_par->numiter == 1 ) {
            magma_zcopy( dofs, r.dval, 1, p.dval, 1, queue );            // p = r
        }
        else {
            beta = (gammanew/gammaold);                                  // beta = gn/go
            magma_zscal( dofs, beta, p.dval, 1, queue );                 // p = beta*p
            magma_zaxpy( dofs, c_one, r.dval, 1, p.dval, 1, queue );     // p = p + r
        }
        CHECK( magma_z_spmv( c_one, A, p, c_zero, q, queue ));           // q = A p
        solver_par->spmv_count++;
        den = magma_zdotc( dofs, p.dval, 1, q.dval, 1, queue );          // den = p dot q
        alpha = gammanew / den;
        magma_zaxpy( dofs, alpha, p.dval, 1, x->dval, 1, queue );        // x = x + alpha p
        magma_zaxpy( dofs, -alpha, q.dval, 1, r.dval, 1, queue );        // r = r - alpha q
        gammaold = gammanew;

        res = magma_dznrm2( dofs, r.dval, 1, queue );

        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        if ( res/nomb <= solver_par->rtol || res <= solver_par->atol ){
            break;
        }
    }
    while ( solver_par->numiter+1 <= solver_par->maxiter );

    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    double residual;
    CHECK( magma_zresidualvec( A, b, *x, &r, &residual, queue));
    solver_par->iter_res = res;
    solver_par->final_res = residual;

    // classify the outcome: success / slow convergence / divergence
    if ( solver_par->numiter < solver_par->maxiter ) {
        info = MAGMA_SUCCESS;
    } else if ( solver_par->init_res > solver_par->final_res ) {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_SLOW_CONVERGENCE;
        if( solver_par->iter_res < solver_par->rtol*solver_par->init_res ||
            solver_par->iter_res < solver_par->atol ) {
            info = MAGMA_SUCCESS;
        }
    }
    else {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_DIVERGENCE;
    }

cleanup:
    magma_zmfree(&r, queue );
    magma_zmfree(&p, queue );
    magma_zmfree(&q, queue );

    solver_par->info = info;
    return info;
} /* magma_zcg */
/**
    Extracts a row slice of a square CSR matrix for a domain-decomposition
    setup.

    For slice `slice` out of `num_slices`, produces:
      - B:     A with identity rows outside [start,end) (full size),
      - ALOC:  the local diagonal block (size x size, 0-based local columns),
      - ANLOC: the non-local couplings of the slice (size x num_cols),
      - comm_i/comm_v: communication plan — flag and weight for every
        column outside the slice that the slice touches,
      - start/end: global row range covered by the slice.

    Only CPU/CSR input is supported.

    @param[in]  num_slices  total number of slices
    @param[in]  slice       index of the slice to extract
    @param[in]  A           input matrix (CPU, CSR, square)
    @param[out] B           sliced matrix padded with identity rows
    @param[out] ALOC        local block of the slice
    @param[out] ANLOC       non-local part of the slice
    @param[out] comm_i      communication flags (length A.num_rows)
    @param[out] comm_v      communication weights (length A.num_rows)
    @param[out] start       first global row of the slice
    @param[out] end         one past the last global row of the slice
    @param[in]  queue       magma queue
*/
extern "C" magma_int_t
magma_zmslice(
    magma_int_t num_slices,
    magma_int_t slice,
    magma_z_matrix A,
    magma_z_matrix *B,
    magma_z_matrix *ALOC,
    magma_z_matrix *ANLOC,
    magma_index_t *comm_i,
    magmaDoubleComplex *comm_v,
    magma_int_t *start,
    magma_int_t *end,
    magma_queue_t queue )
{
    magma_int_t info = 0;

    if( A.num_rows != A.num_cols ){
        printf("%% error: only supported for square matrices.\n");
        info = MAGMA_ERR_NOT_SUPPORTED;
        goto cleanup;
    }

    if ( A.memory_location == Magma_CPU && A.storage_type == Magma_CSR ){
        // clone A's metadata/row structure, then rebuild col/val per output
        CHECK( magma_zmconvert( A, B, Magma_CSR, Magma_CSR, queue ) );
        magma_free_cpu( B->col );
        magma_free_cpu( B->val );
        CHECK( magma_zmconvert( A, ALOC, Magma_CSR, Magma_CSR, queue ) );
        magma_free_cpu( ALOC->col );
        magma_free_cpu( ALOC->row );
        magma_free_cpu( ALOC->val );
        CHECK( magma_zmconvert( A, ANLOC, Magma_CSR, Magma_CSR, queue ) );
        magma_free_cpu( ANLOC->col );
        magma_free_cpu( ANLOC->row );
        magma_free_cpu( ANLOC->val );

        magma_int_t i,j,k, nnz, nnz_loc=0, loc_row = 0, nnz_nloc = 0;
        magma_index_t col;
        magma_int_t size = magma_ceildiv( A.num_rows, num_slices );
        magma_int_t lstart = slice*size;
        magma_int_t lend = min( (slice+1)*size, A.num_rows );
        // correct size for last slice
        size = lend-lstart;
        CHECK( magma_index_malloc_cpu( &ALOC->row, size+1 ) );
        CHECK( magma_index_malloc_cpu( &ANLOC->row, size+1 ) );

        // count elements for slice - identity for rest
        nnz = A.row[ lend ] - A.row[ lstart ] + ( A.num_rows - size );
        CHECK( magma_index_malloc_cpu( &B->col, nnz ) );
        CHECK( magma_zmalloc_cpu( &B->val, nnz ) );

        // for the communication plan
        for( i=0; i<A.num_rows; i++ ) {
            comm_i[i] = 0;
            comm_v[i] = MAGMA_Z_ZERO;
        }

        k=0;
        // BUG FIX: was `B->row[i] = 0;` with i == A.num_rows after the loop
        // above, i.e. it wrote the LAST row-pointer entry; the clear intent
        // is to initialize the first one (it only worked because zmconvert
        // had already left B->row[0] == 0).
        B->row[0] = 0;
        ALOC->row[0] = 0;
        ANLOC->row[0] = 0;

        // identity above slice
        for( i=0; i<lstart; i++ ) {
            B->row[i+1] = B->row[i]+1;
            B->val[k] = MAGMA_Z_ONE;
            B->col[k] = i;
            k++;
        }

        // slice: copy rows and build the communication plan
        for( i=lstart; i<lend; i++ ) {
            B->row[i+1] = B->row[i] + (A.row[i+1]-A.row[i]);
            for( j=A.row[i]; j<A.row[i+1]; j++ ){
                B->val[k] = A.val[j];
                col = A.col[j];
                B->col[k] = col;
                // communication plan: accumulate |a_ij| per external column
                if( col<lstart || col>=lend ){
                    comm_i[ col ] = 1;
                    comm_v[ col ] = comm_v[ col ]
                            + MAGMA_Z_MAKE( MAGMA_Z_ABS( A.val[j] ), 0.0 );
                    nnz_nloc++;
                } else {
                    nnz_loc++;
                }
                k++;
            }
            loc_row++;
            ALOC->row[ loc_row ] = nnz_loc;
            ANLOC->row[ loc_row ] = nnz_nloc;
        }
        CHECK( magma_index_malloc_cpu( &ALOC->col, nnz_loc ) );
        CHECK( magma_zmalloc_cpu( &ALOC->val, nnz_loc ) );
        ALOC->num_rows = size;
        ALOC->num_cols = size;
        ALOC->nnz = nnz_loc;

        CHECK( magma_index_malloc_cpu( &ANLOC->col, nnz_nloc ) );
        CHECK( magma_zmalloc_cpu( &ANLOC->val, nnz_nloc ) );
        ANLOC->num_rows = size;
        ANLOC->num_cols = A.num_cols;
        ANLOC->nnz = nnz_nloc;

        nnz_loc = 0;
        nnz_nloc = 0;
        // second pass: scatter entries into local/nonlocal matrices
        for( i=lstart; i<lend; i++ ) {
            for( j=A.row[i]; j<A.row[i+1]; j++ ){
                col = A.col[j];
                // insert only in local part in ALOC, nonlocal in ANLOC
                if( col<lstart || col>=lend ){
                    ANLOC->val[ nnz_nloc ] = A.val[j];
                    ANLOC->col[ nnz_nloc ] = col;
                    nnz_nloc++;
                } else {
                    ALOC->val[ nnz_loc ] = A.val[j];
                    ALOC->col[ nnz_loc ] = col-lstart;   // 0-based local column
                    nnz_loc++;
                }
            }
        }

        // identity below slice
        for( i=lend; i<A.num_rows; i++ ) {
            B->row[i+1] = B->row[i]+1;
            B->val[k] = MAGMA_Z_ONE;
            B->col[k] = i;
            k++;
        }
        B->nnz = k;

        *start = lstart;
        *end = lend;
    }
    else {
        printf("error: mslice only supported for CSR matrices on the CPU: %d %d.\n",
                int(A.memory_location), int(A.storage_type) );
        info = MAGMA_ERR_NOT_SUPPORTED;
    }

cleanup:
    return info;
}
extern "C" magma_int_t magma_zbombard_merge( magma_z_matrix A, magma_z_matrix b, magma_z_matrix *x, magma_z_solver_par *solver_par, magma_queue_t queue ) { magma_int_t info = MAGMA_NOTCONVERGED; // queue variables const magma_queue_t squeue = 0; // synchronous kernel queues const magma_int_t nqueues = 3; // number of queues magma_queue_t queues[nqueues]; magma_int_t q1flag = 0; // set asynchronous kernel queues //printf("%% Kernel queues: (orig, queue) = (%p, %p)\n", (void *)orig_queue, (void *)queue); magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[0] ); if ( queue != squeue ) { queues[1] = queue; q1flag = 0; } else { magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &(queues[1]) ); queue = queues[1]; q1flag = 1; } magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &(queues[2]) ); for (int i = 0; i < nqueues; ++i ) { ; //printf("Kernel queue #%d = %p\n", i, (void *)queues[i]); } // 1=QMR, 2=CGS, 3+BiCGSTAB magma_int_t flag = 0; int mdot = 1; // prepare solver feedback solver_par->solver = Magma_BOMBARD; solver_par->numiter = 0; // constants const magmaDoubleComplex c_zero = MAGMA_Z_ZERO; const magmaDoubleComplex c_one = MAGMA_Z_ONE; // solver variables double nom0, r0, res, Q_res, C_res, B_res, nomb; //QMR magmaDoubleComplex Q_rho = c_one, Q_rho1 = c_one, Q_eta = -c_one , Q_pds = c_one, Q_thet = c_one, Q_thet1 = c_one, Q_epsilon = c_one, Q_beta = c_one, Q_delta = c_one, Q_pde = c_one, Q_rde = c_one, Q_gamm = c_one, Q_gamm1 = c_one, Q_psi = c_one; //CGS magmaDoubleComplex C_rho, C_rho_l = c_one, C_alpha, C_beta; //BiCGSTAB magmaDoubleComplex B_alpha, B_beta, B_omega, B_rho_old, B_rho_new; magma_int_t dofs = A.num_rows* b.num_cols; // need to transpose the matrix // GPU workspace // overall initial residual magma_z_matrix r_tld={Magma_CSR}; // QMR magma_z_matrix AT = {Magma_CSR}, Q_r={Magma_CSR}, Q_x={Magma_CSR}, Q_v={Magma_CSR}, Q_w={Magma_CSR}, Q_wt={Magma_CSR}, Q_d={Magma_CSR}, 
Q_s={Magma_CSR}, Q_z={Magma_CSR}, Q_q={Magma_CSR}, Q_p={Magma_CSR}, Q_pt={Magma_CSR}, Q_y={Magma_CSR}; // CGS magma_z_matrix C_r={Magma_CSR}, C_rt={Magma_CSR}, C_x={Magma_CSR}, C_p={Magma_CSR}, C_q={Magma_CSR}, C_u={Magma_CSR}, C_v={Magma_CSR}, C_t={Magma_CSR}, C_p_hat={Magma_CSR}, C_q_hat={Magma_CSR}, C_u_hat={Magma_CSR}, C_v_hat={Magma_CSR}; //BiCGSTAB magma_z_matrix B_r={Magma_CSR}, B_x={Magma_CSR}, B_p={Magma_CSR}, B_v={Magma_CSR}, B_s={Magma_CSR}, B_t={Magma_CSR}; // multi-vector for block-SpMV magma_z_matrix SpMV_in_1={Magma_CSR}, SpMV_out_1={Magma_CSR}, SpMV_in_2={Magma_CSR}, SpMV_out_2={Magma_CSR}; // workspace for zmzdotc magma_z_matrix d1={Magma_CSR}, d2={Magma_CSR}, skp={Magma_CSR}; CHECK( magma_zvinit( &r_tld, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &SpMV_in_1, Magma_DEV, A.num_rows, b.num_cols * 3, c_zero, queue )); CHECK( magma_zvinit( &SpMV_out_1, Magma_DEV, A.num_rows, b.num_cols * 3, c_zero, queue )); CHECK( magma_zvinit( &SpMV_in_2, Magma_DEV, A.num_rows, b.num_cols * 3, c_zero, queue )); CHECK( magma_zvinit( &SpMV_out_2, Magma_DEV, A.num_rows, b.num_cols * 3, c_zero, queue )); CHECK( magma_zvinit( &d1, Magma_DEV, A.num_rows, b.num_cols * 4, c_zero, queue )); CHECK( magma_zvinit( &d2, Magma_DEV, A.num_rows, b.num_cols * 4, c_zero, queue )); CHECK( magma_zvinit( &skp, Magma_CPU, 4, 1, c_zero, queue )); // QMR CHECK( magma_zvinit( &Q_r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &Q_v, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &Q_w, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &Q_wt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &Q_d, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &Q_s, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &Q_z, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &Q_q, Magma_DEV, A.num_rows, 
b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &Q_p, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &Q_pt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &Q_y, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &Q_x, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); // QMR CHECK( magma_zvinit( &C_r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &C_rt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &C_x,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &C_p, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &C_p_hat, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &C_q, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &C_q_hat, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &C_u, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &C_u_hat, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &C_v, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &C_v_hat, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &C_t, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); // BiCGSTAB CHECK( magma_zvinit( &B_r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &B_x,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &B_p, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &B_v, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &B_s, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &B_t, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); // solver setup CHECK( magma_zresidualvec( A, b, *x, &r_tld, &nom0, queue)); solver_par->init_res = nom0; res = nom0; // QMR magma_zcopy( dofs, r_tld.dval, 1, Q_r.dval, 1, 
queue ); magma_zcopy( dofs, r_tld.dval, 1, Q_y.dval, 1, queue ); magma_zcopy( dofs, r_tld.dval, 1, Q_v.dval, 1, queue ); magma_zcopy( dofs, r_tld.dval, 1, Q_wt.dval, 1, queue ); magma_zcopy( dofs, r_tld.dval, 1, Q_z.dval, 1, queue ); magma_zcopy( dofs, x->dval, 1, Q_x.dval, 1, queue ); // transpose the matrix magma_zmtransposeconjugate( A, &AT, queue ); // CGS magma_zcopy( dofs, r_tld.dval, 1, C_r.dval, 1, queue ); magma_zcopy( dofs, x->dval, 1, C_x.dval, 1, queue ); // BiCGSTAB magma_zcopy( dofs, r_tld.dval, 1, B_r.dval, 1, queue ); magma_zcopy( dofs, x->dval, 1, B_x.dval, 1, queue ); CHECK( magma_z_spmv( c_one, A, B_r, c_zero, B_v, queue )); magma_zcopy( dofs, B_v.dval, 1, SpMV_out_1.dval+2*dofs, 1, queue ); nomb = magma_dznrm2( dofs, b.dval, 1, queue ); if ( nomb == 0.0 ){ nomb=1.0; } if ( (r0 = nomb * solver_par->rtol) < ATOLERANCE ){ r0 = ATOLERANCE; } solver_par->final_res = solver_par->init_res; solver_par->iter_res = solver_par->init_res; if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = (real_Double_t)nom0; solver_par->timing[0] = 0.0; } if ( nom0 < r0 ) { info = MAGMA_SUCCESS; goto cleanup; } Q_psi = magma_zsqrt( magma_zdotc( dofs, Q_z.dval, 1, Q_z.dval, 1), queue ); Q_rho = magma_zsqrt( magma_zdotc( dofs, Q_y.dval, 1, Q_y.dval, 1), queue ); // BiCGSTAB B_rho_new = magma_zdotc( dofs, B_r.dval, 1, B_r.dval, 1, queue ); B_rho_old = B_omega = B_alpha = MAGMA_Z_MAKE( 1.0, 0. 
); // v = y / rho // y = y / rho // w = wt / psi // z = z / psi magma_zqmr_1( b.num_rows, b.num_cols, Q_rho, Q_psi, Q_y.dval, Q_z.dval, Q_v.dval, Q_w.dval, queue ); //Chronometry real_Double_t tempo1, tempo2; tempo1 = magma_sync_wtime( queue ); solver_par->numiter = 0; // start iteration do { solver_par->numiter++; if(mdot == 0){ //QMR: delta = z' * y; Q_delta = magma_zdotc( dofs, Q_z.dval, 1, Q_y.dval, 1, queue ); //CGS: rho = r' * r_tld C_rho = magma_zdotc( dofs, C_r.dval, 1, r_tld.dval, 1, queue ); // BiCGSTAB B_rho_old = B_rho_new; B_rho_new = magma_zdotc( dofs, r_tld.dval, 1, B_r.dval, 1, queue ); // rho=<rr,r> B_beta = B_rho_new/B_rho_old * B_alpha/B_omega; // beta=rho/rho_old *alpha/omega }else{ B_rho_old = B_rho_new; magma_zmdotc4( b.num_rows, Q_z.dval, Q_y.dval, r_tld.dval, C_r.dval, r_tld.dval, B_r.dval, r_tld.dval, B_r.dval, d1.dval, d2.dval, skp.val, queue ); Q_delta = skp.val[0]; C_rho = skp.val[1]; B_rho_new = skp.val[2]; B_beta = B_rho_new/B_rho_old * B_alpha/B_omega; } if( solver_par->numiter == 1 ){ //QMR: p = y; //QMR: q = z; magma_zcopy( dofs, Q_y.dval, 1, SpMV_in_1.dval, 1, queue ); magma_zcopy( dofs, Q_z.dval, 1, SpMV_in_2.dval, 1, queue ); //QMR: u = r; //QMR: p = r; magma_zcgs_2( b.num_rows, b.num_cols, C_r.dval, C_u.dval, SpMV_in_1.dval+dofs, queue ); } else{ Q_pde = Q_psi * Q_delta / Q_epsilon; Q_rde = Q_rho * MAGMA_Z_CNJG(Q_delta/Q_epsilon); C_beta = C_rho / C_rho_l; //QMR p = y - pde * p //QMR q = z - rde * q magma_zqmr_2( b.num_rows, b.num_cols, Q_pde, Q_rde, Q_y.dval, Q_z.dval, SpMV_in_1.dval, SpMV_in_2.dval, queue ); //CGS: u = r + beta*q; //CGS: p = u + beta*( q + beta*p ); magma_zcgs_1( b.num_rows, b.num_cols, C_beta, C_r.dval, C_q.dval, C_u.dval, SpMV_in_1.dval+dofs, queue ); } // BiCGSTAB: p = r + beta * ( p - omega * v ) magma_zbicgstab_1( b.num_rows, b.num_cols, B_beta, B_omega, B_r.dval, SpMV_out_1.dval+2*dofs, SpMV_in_1.dval+2*dofs, queue ); /* //QMR CHECK( magma_z_spmv( c_one, A, Q_p, c_zero, Q_pt, queue )); //CGS CHECK( 
magma_z_spmv( c_one, A, C_p, c_zero, C_v_hat, queue )); // BiCGSTAB CHECK( magma_z_spmv( c_one, A, B_p, c_zero, B_v, queue )); */ // gather everything for a block-SpMV //magma_zcopy( dofs, Q_p.dval, 1, SpMV_in_1.dval , 1, queue ); //magma_zcopy( dofs, C_p.dval, 1, SpMV_in_1.dval+dofs , 1, queue ); //magma_zcopy( dofs, B_p.dval, 1, SpMV_in_1.dval+2*dofs , 1, queue ); // block SpMV CHECK( magma_z_spmv( c_one, A, SpMV_in_1, c_zero, SpMV_out_1, queue )); // scatter results //magma_zcopy( dofs, SpMV_out_1.dval , 1, Q_pt.dval, 1, queue ); //magma_zcopy( dofs, SpMV_out_1.dval+dofs , 1, C_v_hat.dval, 1, queue ); //magma_zcopy( dofs, SpMV_out_1.dval+2*dofs , 1, B_v.dval, 1, queue ); if( mdot == 0 ) { //QMR: epsilon = q' * pt; Q_epsilon = magma_zdotc( dofs, SpMV_in_2.dval, 1, SpMV_out_1.dval, 1, queue ); Q_beta = Q_epsilon / Q_delta; //CGS: alpha = r_tld' * v_hat C_alpha = C_rho / magma_zdotc( dofs, r_tld.dval, 1, SpMV_out_1.dval+dofs, 1, queue ); C_rho_l = C_rho; //BiCGSTAB B_alpha = B_rho_new / magma_zdotc( dofs, r_tld.dval, 1, SpMV_out_1.dval+2*dofs, 1, queue ); }else{ magma_zmdotc4( b.num_rows, SpMV_in_2.dval, SpMV_out_1.dval, r_tld.dval, SpMV_out_1.dval+dofs, r_tld.dval, SpMV_out_1.dval+2*dofs, r_tld.dval, SpMV_out_1.dval+2*dofs, d1.dval, d2.dval, skp.val, queue ); Q_epsilon = skp.val[0]; Q_beta = Q_epsilon / Q_delta; C_alpha = C_rho / skp.val[1]; C_rho_l = C_rho; B_alpha = B_rho_new / skp.val[2]; } //QMR: v = pt - beta * v //QMR: y = v magma_zqmr_3( b.num_rows, b.num_cols, Q_beta, SpMV_out_1.dval, Q_v.dval, Q_y.dval, queue ); //CGS: q = u - alpha v_hat //CGS: t = u + q magma_zcgs_3( b.num_rows, b.num_cols, C_alpha, SpMV_out_1.dval+dofs, C_u.dval, C_q.dval, SpMV_in_2.dval+dofs, queue ); // BiCGSTAB: s = r - alpha v magma_zbicgstab_2( b.num_rows, b.num_cols, B_alpha, B_r.dval, SpMV_out_1.dval+2*dofs, SpMV_in_2.dval+2*dofs, queue ); // gather everything for a block-SpMV //magma_zcopy( dofs, Q_q.dval, 1, SpMV_in_2.dval , 1, queue ); //magma_zcopy( dofs, C_t.dval, 1, 
SpMV_in_2.dval+dofs , 1, queue ); //magma_zcopy( dofs, B_s.dval, 1, SpMV_in_2.dval+2*dofs , 1, queue ); // block SpMV CHECK( magma_z_spmv( c_one, A, SpMV_in_2, c_zero, SpMV_out_2, queue )); // scatter results //magma_zcopy( dofs, SpMV_out_2.dval , 1, Q_wt.dval, 1, queue ); //magma_zcopy( dofs, SpMV_out_2.dval+dofs , 1, C_rt.dval, 1, queue ); //magma_zcopy( dofs, SpMV_out_2.dval+2*dofs , 1, B_t.dval, 1, queue ); Q_rho1 = Q_rho; if( mdot == 0 ) { //QMR rho = norm(y); Q_rho = magma_zsqrt( magma_zdotc( dofs, Q_y.dval, 1, Q_y.dval, 1), queue ); // BiCGSTAB B_omega = magma_zdotc( dofs, SpMV_out_2.dval+2*dofs, 1, SpMV_in_2.dval+2*dofs, 1 ) // omega = <s,t>/<t,t> / magma_zdotc( dofs, SpMV_out_2.dval+2*dofs, 1, SpMV_out_2.dval+2*dofs, 1, queue ); }else{ magma_zmdotc4( b.num_rows, Q_y.dval, Q_y.dval, Q_y.dval, Q_y.dval, SpMV_out_2.dval+2*dofs, SpMV_in_2.dval+2*dofs, SpMV_out_2.dval+2*dofs, SpMV_out_2.dval+2*dofs, d1.dval, d2.dval, skp.val, queue ); Q_rho = magma_zsqrt(skp.val[0]); B_omega = skp.val[2]/skp.val[3]; } // QMR Q_thet1 = Q_thet; Q_thet = Q_rho / (Q_gamm * MAGMA_Z_MAKE( MAGMA_Z_ABS(Q_beta), 0.0 )); Q_gamm1 = Q_gamm; Q_gamm = c_one / magma_zsqrt(c_one + Q_thet*Q_thet); Q_eta = - Q_eta * Q_rho1 * Q_gamm * Q_gamm / (Q_beta * Q_gamm1 * Q_gamm1); if( solver_par->numiter == 1 ){ //QMR: d = eta * p + pds * d; //QMR: s = eta * pt + pds * d; //QMR: x = x + d; //QMR: r = r - s; magma_zqmr_4( b.num_rows, b.num_cols, Q_eta, SpMV_in_1.dval, SpMV_out_1.dval, Q_d.dval, Q_s.dval, Q_x.dval, Q_r.dval, queue ); } else{ Q_pds = (Q_thet1 * Q_gamm) * (Q_thet1 * Q_gamm); //QMR: d = eta * p + pds * d; //QMR: s = eta * pt + pds * d; //QMR: x = x + d; //QMR: r = r - s; magma_zqmr_5( b.num_rows, b.num_cols, Q_eta, Q_pds, SpMV_in_1.dval, SpMV_out_1.dval, Q_d.dval, Q_s.dval, Q_x.dval, Q_r.dval, queue ); } // CGS: r = r -alpha*A u_hat // CGS: x = x + alpha u_hat magma_zcgs_4( b.num_rows, b.num_cols, C_alpha, SpMV_in_2.dval+dofs, SpMV_out_2.dval+dofs, C_x.dval, C_r.dval, queue ); // BiCGSTAB: x 
= x + alpha * p + omega * s // BiCGSTAB: r = s - omega * t magma_zbicgstab_3( b.num_rows, b.num_cols, B_alpha, B_omega, SpMV_in_1.dval+2*dofs, SpMV_in_2.dval+2*dofs, SpMV_out_2.dval+2*dofs, B_x.dval, B_r.dval, queue ); if( mdot == 0 ){ Q_res = magma_dznrm2( dofs, Q_r.dval, 1, queue ); C_res = magma_dznrm2( dofs, C_r.dval, 1, queue ); B_res = magma_dznrm2( dofs, B_r.dval, 1, queue ); //QMR: psi = norm(z); Q_psi = magma_zsqrt( magma_zdotc( dofs, Q_z.dval, 1, Q_z.dval, 1), queue ); }else{ magma_zmdotc4( b.num_rows, Q_r.dval, Q_r.dval, C_r.dval, C_r.dval, B_r.dval, B_r.dval, Q_z.dval, Q_z.dval, d1.dval, d2.dval, skp.val, queue ); Q_res = MAGMA_Z_ABS(magma_zsqrt(skp.val[0])); C_res = MAGMA_Z_ABS(magma_zsqrt(skp.val[1])); B_res = MAGMA_Z_ABS(magma_zsqrt(skp.val[2])); //QMR: psi = norm(z); Q_psi = magma_zsqrt(skp.val[3]); } //QMR: v = y / rho //QMR: y = y / rho //QMR: w = wt / psi //QMR: z = z / psi //QMR: wt = A' * q - beta' * w //QMR: no precond: z = wt magma_zqmr_6( b.num_rows, b.num_cols, Q_beta, Q_rho, Q_psi, Q_y.dval, Q_z.dval, Q_v.dval, Q_w.dval, SpMV_out_2.dval, queue ); // printf(" %e %e %e\n", Q_res, C_res, B_res); if( Q_res < res ){ res = Q_res; flag = 1; } if( C_res < res ){ res = C_res; flag = 2; } if( B_res < res ){ res = B_res; flag = 3; } if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter)%solver_par->verbose == c_zero ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) res; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( res/nomb <= solver_par->rtol || res <= solver_par->atol ){ break; } } while ( solver_par->numiter+1 <= solver_par->maxiter ); // copy back the best solver switch ( flag ) { case 1: printf("%% QMR fastest solver.\n"); magma_zcopy( dofs, Q_x.dval, 1, x->dval, 1, queue ); break; case 2: printf("%% CGS fastest solver.\n"); magma_zcopy( dofs, C_x.dval, 1, x->dval, 1, queue ); break; case 3: printf("%% BiCGSTAB 
fastest solver.\n"); magma_zcopy( dofs, B_x.dval, 1, x->dval, 1, queue ); break; } tempo2 = magma_sync_wtime( queue ); solver_par->runtime = (real_Double_t) tempo2-tempo1; double residual; CHECK( magma_zresidualvec( A, b, *x, &r_tld, &residual, queue)); solver_par->iter_res = res; solver_par->final_res = residual; if ( solver_par->numiter < solver_par->maxiter ) { info = MAGMA_SUCCESS; } else if ( solver_par->init_res > solver_par->final_res ) { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose == c_zero ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) res; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } info = MAGMA_SLOW_CONVERGENCE; if( solver_par->iter_res < solver_par->rtol*solver_par->init_res || solver_par->iter_res < solver_par->atol ) { info = MAGMA_SUCCESS; } } else { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose == c_zero ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) res; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } info = MAGMA_DIVERGENCE; } cleanup: magma_zmfree(&r_tld, queue ); magma_zmfree(&SpMV_in_1, queue ); magma_zmfree(&SpMV_out_1, queue ); magma_zmfree(&SpMV_in_2, queue ); magma_zmfree(&SpMV_out_2, queue ); magma_zmfree(&d1, queue ); magma_zmfree(&d2, queue ); magma_zmfree(&skp, queue ); magma_zmfree(&AT, queue ); // QMR magma_zmfree(&Q_r, queue ); magma_zmfree(&Q_v, queue ); magma_zmfree(&Q_w, queue ); magma_zmfree(&Q_wt, queue ); magma_zmfree(&Q_d, queue ); magma_zmfree(&Q_s, queue ); magma_zmfree(&Q_z, queue ); magma_zmfree(&Q_q, queue ); magma_zmfree(&Q_p, queue ); magma_zmfree(&Q_pt, queue ); magma_zmfree(&Q_y, queue ); magma_zmfree(&Q_x, queue ); // CGS magma_zmfree(&C_r, queue ); magma_zmfree(&C_rt, queue ); magma_zmfree(&C_x, queue ); magma_zmfree(&C_p, queue ); magma_zmfree(&C_q, queue ); 
magma_zmfree(&C_u, queue ); magma_zmfree(&C_v, queue ); magma_zmfree(&C_t, queue ); magma_zmfree(&C_p_hat, queue ); magma_zmfree(&C_q_hat, queue ); magma_zmfree(&C_u_hat, queue ); magma_zmfree(&C_v_hat, queue ); // BiCGSTAB magma_zmfree(&B_r, queue ); magma_zmfree(&B_x, queue ); magma_zmfree(&B_p, queue ); magma_zmfree(&B_v, queue ); magma_zmfree(&B_s, queue ); magma_zmfree(&B_t, queue ); // destroy asynchronous queues magma_queue_destroy( queues[0] ); if ( q1flag == 1 ) { magma_queue_destroy( queues[1] ); } magma_queue_destroy( queues[2] ); solver_par->info = info; return info; } /* magma_zbombard_merge */