/**
    Purpose
    -------
    Applies the preconditioner stored in precond to the vector b and
    returns the result in x.  Legacy interface (no queue argument).

    Arguments
    ---------
    @param A        system matrix; only A.num_rows is used here
    @param b        input vector
    @param x        output vector, x = M^{-1} b
    @param precond  preconditioner descriptor selecting the method

    @return MAGMA_SUCCESS on success,
            MAGMA_ERR_NOT_SUPPORTED for unhandled preconditioner types.
*/
magma_int_t
magma_c_applyprecond( magma_c_sparse_matrix A, magma_c_vector b,
                      magma_c_vector *x, magma_c_preconditioner *precond )
{
    if ( precond->solver == Magma_JACOBI ) {
        // x = D^{-1} b with the diagonal stored in precond->d
        magma_cjacobi_diagscal( A.num_rows, precond->d.val, b.val, x->val );
        return MAGMA_SUCCESS;
    }
    else if ( precond->solver == Magma_PASTIX ) {
        magma_capplypastix( b, x, precond );
        return MAGMA_SUCCESS;
    }
    else if ( precond->solver == Magma_ILU ) {
        // NOTE(review): the ILU triangular solves are commented out upstream,
        // so this branch is currently a silent no-op that still reports
        // success and leaves x untouched.  The former device scratch vector
        // (vinit/vfree on every call) was dead work and has been removed.
        // magma_capplycuilu_l( b, &tmp, precond );
        // magma_capplycuilu_r( tmp, x, precond );
        return MAGMA_SUCCESS;
    }
    else if ( precond->solver == Magma_ICC ) {
        // NOTE(review): same situation as the ILU branch above -- the
        // triangular solves are disabled, so this is a silent no-op.
        // magma_ctrisv_l_nu( precond->L, b, &tmp );
        // magma_ctrisv_r_nu( precond->L, tmp, x );
        return MAGMA_SUCCESS;
    }
    else {
        printf( "error: preconditioner type not yet supported.\n" );
        return MAGMA_ERR_NOT_SUPPORTED;
    }
}
/**
    Purpose
    -------
    Applies the preconditioner stored in precond to the vector b and
    returns the result in x (queue-aware interface).

    Arguments
    ---------
    @param A        system matrix; only A.num_rows is used here
    @param b        input vector
    @param x        output vector, x = M^{-1} b
    @param precond  preconditioner descriptor selecting the method
    @param queue    magma queue the kernels run on

    @return MAGMA_SUCCESS on success,
            MAGMA_ERR_NOT_SUPPORTED for unhandled preconditioner types.
*/
extern "C" magma_int_t
magma_c_applyprecond( magma_c_sparse_matrix A, magma_c_vector b,
                      magma_c_vector *x, magma_c_preconditioner *precond,
                      magma_queue_t queue )
{
    // save the caller's kernel stream so the old dense routines can be
    // redirected and the original stream restored on every exit path
    magma_queue_t orig_queue;
    magmablasGetKernelStream( &orig_queue );

    if ( precond->solver == Magma_JACOBI ) {
        // x = D^{-1} b with the diagonal stored in precond->d
        magma_cjacobi_diagscal( A.num_rows, precond->d, b, x, queue );
    }
    else if ( precond->solver == Magma_PASTIX ) {
        magma_capplypastix( b, x, precond, queue );
    }
    else if ( precond->solver == Magma_ILU ) {
        // NOTE(review): no-op branch -- the ILU application is not wired up
        // here; x is left untouched but success is reported.  The former
        // device scratch vector (vinit/vfree on every call) was dead work
        // and has been removed.
    }
    else if ( precond->solver == Magma_ICC ) {
        // NOTE(review): no-op branch, see the ILU case above.
    }
    else if ( precond->solver == Magma_NONE ) {
        magma_ccopy( b.num_rows, b.dval, 1, x->dval, 1 );      //  x = b
    }
    else {
        printf( "error: preconditioner type not yet supported.\n" );
        magmablasSetKernelStream( orig_queue );
        return MAGMA_ERR_NOT_SUPPORTED;
    }
    magmablasSetKernelStream( orig_queue );
    return MAGMA_SUCCESS;
}
/* ////////////////////////////////////////////////////////////////////////////
   -- testing any solver
*/
// Test driver: for each matrix given on the command line (or a generated
// Laplace stencil), builds b = A*x for x = ones, then solves A x = b with
// the solver/preconditioner selected via the parsed options and prints the
// convergence information.
int main( int argc, char** argv )
{
    TESTING_INIT();

    magma_copts zopts;

    magma_queue_t queue;
    magma_queue_create( /*devices[ opts->device ],*/ &queue );

    // i is advanced past the recognized options by the parser; the remaining
    // argv entries are matrix files (or the LAPLACE2D keyword + size)
    int i=1;
    magma_cparse_opts( argc, argv, &zopts, &i, queue );

    magmaFloatComplex one = MAGMA_C_MAKE(1.0, 0.0);
    magmaFloatComplex zero = MAGMA_C_MAKE(0.0, 0.0);

    magma_c_sparse_matrix A, B, B_d;    // host CSR, converted host copy, device copy
    magma_c_vector x, b;

    B.blocksize = zopts.blocksize;
    B.alignment = zopts.alignment;

    // preconditioning only makes sense for the preconditioned solvers;
    // otherwise force it off
    if ( zopts.solver_par.solver != Magma_PCG &&
         zopts.solver_par.solver != Magma_PGMRES &&
         zopts.solver_par.solver != Magma_PBICGSTAB &&
         zopts.solver_par.solver != Magma_ITERREF )
        zopts.precond_par.solver = Magma_NONE;

    magma_csolverinfo_init( &zopts.solver_par, &zopts.precond_par, queue );

    while( i < argc ) {
        if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) {   // Laplace test
            i++;
            magma_int_t laplace_size = atoi( argv[i] );
            magma_cm_5stencil( laplace_size, &A, queue );
        } else {                        // file-matrix test
            magma_c_csr_mtx( &A, argv[i], queue );
        }

        printf( "\n# matrix info: %d-by-%d with %d nonzeros\n\n",
                (int) A.num_rows,(int) A.num_cols,(int) A.nnz );

        // for the eigensolver case
        zopts.solver_par.ev_length = A.num_rows;
        magma_ceigensolverinfo_init( &zopts.solver_par, queue );

        // scale matrix
        magma_cmscale( &A, zopts.scaling, queue );

        // convert to the requested storage format and move to the device
        magma_c_mconvert( A, &B, Magma_CSR, zopts.output_format, queue );
        magma_c_mtransfer( B, &B_d, Magma_CPU, Magma_DEV, queue );

        // vectors and initial guess: build a consistent RHS from x = ones,
        // then restart from x = 0
        magma_c_vinit( &b, Magma_DEV, A.num_cols, one, queue );
        magma_c_vinit( &x, Magma_DEV, A.num_cols, one, queue );
        magma_c_spmv( one, B_d, x, zero, b, queue );                 //  b = A x
        magma_c_vfree(&x, queue );
        magma_c_vinit( &x, Magma_DEV, A.num_cols, zero, queue );

        magma_c_solver( B_d, b, &x, &zopts, queue );

        magma_csolverinfo( &zopts.solver_par, &zopts.precond_par, queue );

        // release everything belonging to this matrix before the next one
        magma_c_mfree(&B_d, queue );
        magma_c_mfree(&B, queue );
        magma_c_mfree(&A, queue );
        magma_c_vfree(&x, queue );
        magma_c_vfree(&b, queue );

        i++;
    }

    magma_csolverinfo_free( &zopts.solver_par, &zopts.precond_par, queue );

    magma_queue_destroy( queue );
    TESTING_FINALIZE();
    return 0;
}
magma_int_t magma_cpastixsetup( magma_c_sparse_matrix A, magma_c_vector b, magma_c_preconditioner *precond ){ #if defined(HAVE_PASTIX) #if defined(PRECISION_d) pastix_data_t *pastix_data = NULL; /* Pointer to a storage structure needed by pastix */ pastix_int_t ncol; /* Size of the matrix */ pastix_int_t *colptr = NULL; /* Indexes of first element of each column in row and values */ pastix_int_t *rows = NULL; /* Row of each element of the matrix */ pastix_float_t *values = NULL; /* Value of each element of the matrix */ pastix_float_t *rhs = NULL; /* right hand side */ pastix_int_t *iparm = NULL; /* integer parameters for pastix */ float *dparm = NULL; /* floating parameters for pastix */ pastix_int_t *perm = NULL; /* Permutation tabular */ pastix_int_t *invp = NULL; /* Reverse permutation tabular */ pastix_int_t mat_type; magma_c_sparse_matrix A_h1, B; magma_c_vector diag, c_t, b_h; magma_c_vinit( &c_t, Magma_CPU, A.num_rows, MAGMA_C_ZERO ); magma_c_vinit( &diag, Magma_CPU, A.num_rows, MAGMA_C_ZERO ); magma_c_vtransfer( b, &b_h, A.memory_location, Magma_CPU); if( A.storage_type != Magma_CSR ){ magma_c_mtransfer( A, &A_h1, A.memory_location, Magma_CPU); magma_c_mconvert( A_h1, &B, A_h1.storage_type, Magma_CSR); } else{ magma_c_mtransfer( A, &B, A.memory_location, Magma_CPU); } rhs = (pastix_float_t*) b_h.val; ncol = B.num_rows; colptr = B.row; rows = B.col; values = (pastix_float_t*) B.val; mat_type = API_SYM_NO; iparm = (pastix_int_t*)malloc(IPARM_SIZE*sizeof(pastix_int_t)); dparm = (pastix_float_t*)malloc(DPARM_SIZE*sizeof(pastix_float_t)); /*******************************************/ /* Initialize parameters to default values */ /*******************************************/ iparm[IPARM_MODIFY_PARAMETER] = API_NO; pastix(&pastix_data, MPI_COMM_WORLD, ncol, colptr, rows, values, perm, invp, rhs, 1, iparm, dparm); iparm[IPARM_THREAD_NBR] = 16; iparm[IPARM_SYM] = mat_type; iparm[IPARM_FACTORIZATION] = API_FACT_LU; iparm[IPARM_VERBOSE] = API_VERBOSE_YES; 
iparm[IPARM_ORDERING] = API_ORDER_SCOTCH; iparm[IPARM_INCOMPLETE] = API_NO; iparm[IPARM_RHS_MAKING] = API_RHS_B; //iparm[IPARM_AMALGAMATION] = 5; iparm[IPARM_LEVEL_OF_FILL] = 0; /* if (incomplete == 1) { dparm[DPARM_EPSILON_REFINEMENT] = 1e-7; } */ /* * Matrix needs : * - to be in fortran numbering * - to have only the lower triangular part in symmetric case * - to have a graph with a symmetric structure in unsymmetric case * If those criteria are not matched, the csc will be reallocated and changed. */ iparm[IPARM_MATRIX_VERIFICATION] = API_YES; perm = (pastix_int_t*)malloc(ncol*sizeof(pastix_int_t)); invp = (pastix_int_t*)malloc(ncol*sizeof(pastix_int_t)); /*******************************************/ /* Step 1 - Ordering / Scotch */ /* Perform it only when the pattern of */ /* matrix change. */ /* eg: mesh refinement */ /* In many cases users can simply go from */ /* API_TASK_ORDERING to API_TASK_ANALYSE */ /* in one call. */ /*******************************************/ /*******************************************/ /* Step 2 - Symbolic factorization */ /* Perform it only when the pattern of */ /* matrix change. */ /*******************************************/ /*******************************************/ /* Step 3 - Mapping and Compute scheduling */ /* Perform it only when the pattern of */ /* matrix change. */ /*******************************************/ /*******************************************/ /* Step 4 - Numerical Factorisation */ /* Perform it each time the values of the */ /* matrix changed. 
*/ /*******************************************/ iparm[IPARM_START_TASK] = API_TASK_ORDERING; iparm[IPARM_END_TASK] = API_TASK_NUMFACT; pastix(&pastix_data, MPI_COMM_WORLD, ncol, colptr, rows, values, perm, invp, NULL, 1, iparm, dparm); precond->int_array_1 = (magma_int_t*) perm; precond->int_array_2 = (magma_int_t*) invp; precond->M.val = (magmaFloatComplex*) values; precond->M.col = (magma_int_t*) colptr; precond->M.row = (magma_int_t*) rows; precond->M.num_rows = A.num_rows; precond->M.num_cols = A.num_cols; precond->M.memory_location = Magma_CPU; precond->pastix_data = pastix_data; precond->iparm = iparm; precond->dparm = dparm; if( A.storage_type != Magma_CSR){ magma_c_mfree( &A_h1 ); } magma_c_vfree( &b_h); magma_c_mfree( &B ); #else printf( "error: only real supported yet.\n"); #endif #else printf( "error: pastix not available.\n"); #endif return MAGMA_SUCCESS; }
/**
    Purpose
    -------
    Applies the PaStiX preconditioner prepared by magma_cpastixsetup():
    solves M z = b with the stored factorization (solve + iterative
    refinement tasks) and returns z in x.

    @param b        input vector (any memory location; copied to host)
    @param x        output vector, overwritten with the solution
    @param precond  preconditioner holding the PaStiX state

    @return MAGMA_SUCCESS
*/
magma_int_t magma_capplypastix( magma_c_vector b, magma_c_vector *x,
                        magma_c_preconditioner *precond ){
#if defined(HAVE_PASTIX)
#if defined(PRECISION_d)

    pastix_int_t    ncol;               /* Size of the matrix            */
    pastix_int_t   *colptr      = NULL; /* column pointer array          */
    pastix_int_t   *rows        = NULL; /* row index array               */
    pastix_float_t *values      = NULL; /* matrix values                 */
    pastix_int_t   *iparm;              /* integer parameters for pastix */
    float          *dparm;              /* floating parameters for pastix*/
    pastix_int_t   *perm        = NULL; /* Permutation tabular           */
    pastix_int_t   *invp        = NULL; /* Reverse permutation tabular   */

    magma_c_vector b_h;

    // PaStiX solves on the host, in place of the right-hand side
    magma_c_vtransfer( b, &b_h, b.memory_location, Magma_CPU);

    ncol = precond->M.num_rows;
    colptr = (pastix_int_t*) precond->M.col;
    rows = (pastix_int_t*) precond->M.row;
    values = (pastix_float_t*) precond->M.val;

    iparm = precond->iparm;
    dparm = precond->dparm;

    perm = (pastix_int_t*)precond->int_array_1;
    // BUG FIX: the inverse permutation was previously read from int_array_1
    // as well; magma_cpastixsetup() stores invp in int_array_2.
    invp = (pastix_int_t*)precond->int_array_2;

    /*******************************************/
    /* Step 5 - Solve (+ iterative refinement) */
    /* For each one of your Right-hand-side    */
    /* members.                                */
    /*******************************************/
    iparm[IPARM_START_TASK] = API_TASK_SOLVE;
    iparm[IPARM_END_TASK] = API_TASK_REFINEMENT;

    pastix(&(precond->pastix_data), MPI_COMM_WORLD,
           ncol, colptr, rows, values,
           perm, invp, b_h.val, 1, iparm, dparm);

    // TODO(review): x is freed and re-allocated on every application; for
    // many iterations a magma_csetvector( ncol, b_h.val, 1, x->val, 1 )
    // into the existing buffer would be faster.
    magma_c_vfree( x );
    magma_c_vtransfer( b_h, x, Magma_CPU, b.memory_location);

    magma_c_vfree( &b_h);

#else
    printf( "error: only real supported yet.\n");
#endif
#else
    printf( "error: pastix not available.\n");
#endif
    return MAGMA_SUCCESS;
}
/**
    Purpose
    -------
    Solves A x = b with the restarted GMRES method (modified Gram-Schmidt
    orthogonalization).  Uses the H/HH/dH/q accessor macros defined earlier
    in this file for the Hessenberg matrix and the Krylov basis.

    @param A           system matrix (device)
    @param b           right-hand side (device)
    @param x           solution vector, overwritten (device)
    @param solver_par  solver parameters; receives residuals/timings/info
    @param queue       magma queue the sparse kernels run on

    @return MAGMA_SUCCESS, or an allocation error code.
*/
extern "C" magma_int_t
magma_cgmres( magma_c_sparse_matrix A, magma_c_vector b, magma_c_vector *x,
              magma_c_solver_par *solver_par, magma_queue_t queue )
{
    // set queue for old dense routines; restored on every exit path
    magma_queue_t orig_queue;
    magmablasGetKernelStream( &orig_queue );

    magma_int_t stat_cpu = 0, stat_dev = 0;

    // prepare solver feedback
    solver_par->solver = Magma_GMRES;
    solver_par->numiter = 0;
    solver_par->info = MAGMA_SUCCESS;

    // local variables
    magmaFloatComplex c_zero = MAGMA_C_ZERO, c_one = MAGMA_C_ONE,
                      c_mone = MAGMA_C_NEG_ONE;
    magma_int_t dofs = A.num_rows;
    magma_int_t i, j, k, m = 0;
    magma_int_t restart = min( dofs-1, solver_par->restart );
    magma_int_t ldh = restart+1;
    float nom, rNorm, RNorm, nom0, betanom, r0 = 0.;

    // CPU workspace (pinned, for async transfers of the Hessenberg data)
    //magma_setdevice(0);
    magmaFloatComplex *H, *HH, *y, *h1;
    stat_cpu += magma_cmalloc_pinned( &H, (ldh+1)*ldh );
    stat_cpu += magma_cmalloc_pinned( &y, ldh );
    stat_cpu += magma_cmalloc_pinned( &HH, ldh*ldh );
    stat_cpu += magma_cmalloc_pinned( &h1, ldh );
    if( stat_cpu != 0){
        magma_free_pinned( H );
        magma_free_pinned( y );
        magma_free_pinned( HH );
        magma_free_pinned( h1 );
        magmablasSetKernelStream( orig_queue );
        return MAGMA_ERR_HOST_ALLOC;
    }

    // GPU workspace: residual r and the Krylov basis q (ldh+1 columns);
    // q_t is a view into q, its dval is repointed inside the iteration
    magma_c_vector r, q, q_t;
    magma_c_vinit( &r, Magma_DEV, dofs, c_zero, queue );
    magma_c_vinit( &q, Magma_DEV, dofs*(ldh+1), c_zero, queue );
    q_t.memory_location = Magma_DEV;
    q_t.dval = NULL;
    q_t.num_rows = q_t.nnz = dofs;
    q_t.num_cols = 1;

    magmaFloatComplex *dy = NULL, *dH = NULL;
    stat_dev += magma_cmalloc( &dy, ldh );
    stat_dev += magma_cmalloc( &dH, (ldh+1)*ldh );
    if( stat_dev != 0){
        magma_free_pinned( H );
        magma_free_pinned( y );
        magma_free_pinned( HH );
        magma_free_pinned( h1 );
        // BUG FIX: dH and dy were each freed TWICE here (double free)
        magma_free( dH );
        magma_free( dy );
        // FIX(review): the device vectors were leaked on this path
        magma_c_vfree( &r, queue );
        magma_c_vfree( &q, queue );
        magmablasSetKernelStream( orig_queue );
        return MAGMA_ERR_DEVICE_ALLOC;
    }

    // GPU stream
    magma_queue_t stream[2];
    magma_event_t event[1];
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );
    magma_event_create( &event[0] );
    //magmablasSetKernelStream(stream[0]);

    magma_cscal( dofs, c_zero, x->dval, 1 );              //  x = 0
    magma_ccopy( dofs, b.dval, 1, r.dval, 1 );            //  r = b
    nom0 = betanom = magma_scnrm2( dofs, r.dval, 1 );     //  nom0 = || r ||
    nom = nom0 * nom0;
    solver_par->init_res = nom0;
    H(1,0) = MAGMA_C_MAKE( nom0, 0. );
    magma_csetvector(1, &H(1,0), 1, &dH(1,0), 1);

    if ( (r0 = nom0 * solver_par->epsilon ) < ATOLERANCE ){
        r0 = solver_par->epsilon;
    }
    if ( nom < r0 ) {
        // already converged -- FIX(review): all workspace used to be leaked
        // on this early exit; release it before returning
        magma_free_pinned( H );
        magma_free_pinned( y );
        magma_free_pinned( HH );
        magma_free_pinned( h1 );
        magma_free( dy );
        magma_free( dH );
        magma_c_vfree( &r, queue );
        magma_c_vfree( &q, queue );
        magma_queue_destroy( stream[0] );
        magma_queue_destroy( stream[1] );
        magma_event_destroy( event[0] );
        magmablasSetKernelStream( orig_queue );
        return MAGMA_SUCCESS;
    }

    //Chronometry
    real_Double_t tempo1, tempo2;
    tempo1 = magma_sync_wtime( queue );
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = nom0;
        solver_par->timing[0] = 0.0;
    }

    // start iteration (outer restart loop)
    for( solver_par->numiter= 1; solver_par->numiter<solver_par->maxiter;
                                                    solver_par->numiter++ ) {
        for(k=1; k<=restart; k++) {
            magma_ccopy(dofs, r.dval, 1, q(k-1), 1);       //  q[k-1] = r
            magma_cscal(dofs, 1./H(k,k-1), q(k-1), 1);     //  normalize (to be fused)
            q_t.dval = q(k-1);
            //magmablasSetKernelStream(stream[0]);
            magma_c_spmv( c_one, A, q_t, c_zero, r, queue ); //  r = A q[k]

            // modified Gram-Schmidt orthogonalization against q[0..k-1]
            // (the classical / fused-CGS variants that used to live here as
            // a large commented-out region have been removed)
            for (i=1; i<=k; i++) {
                H(i,k) = magma_cdotc(dofs, q(i-1), 1, r.dval, 1);
                    // H(i,k) = q[i] . r
                magma_caxpy(dofs,-H(i,k), q(i-1), 1, r.dval, 1);
                    // r = r - H(i,k) q[i]
            }
            H(k+1,k) = MAGMA_C_MAKE( magma_scnrm2(dofs, r.dval, 1), 0. );
                    // H(k+1,k) = ||r||

            /*     Minimization of  || b-Ax ||  in H_k       */
            for (i=1; i<=k; i++) {
                HH(k,i) = magma_cblas_cdotc( i+1, &H(1,k), 1, &H(1,i), 1 );
            }
            h1[k] = H(1,k)*H(1,0);
            if (k != 1) {
                for (i=1; i<k; i++) {
                    HH(k,i) = HH(k,i)/HH(i,i);
                    for (m=i+1; m<=k; m++) {
                        HH(k,m) -= HH(k,i) * HH(m,i) * HH(i,i);
                    }
                    h1[k] -= h1[i] * HH(k,i);
                }
            }
            y[k] = h1[k]/HH(k,k);
            if (k != 1)
                for (i=k-1; i>=1; i--) {
                    y[i] = h1[i]/HH(i,i);
                    for (j=i+1; j<=k; j++)
                        y[i] -= y[j] * HH(j,i);
                }
            m = k;
            rNorm = fabs(MAGMA_C_REAL(H(k+1,k)));
        }/*     Minimization done       */

        // compute solution approximation:  x = x + Q(:,1:m) y
        magma_csetmatrix(m, 1, y+1, m, dy, m );
        magma_cgemv(MagmaNoTrans, dofs, m, c_one, q(0), dofs,
                    dy, 1, c_one, x->dval, 1);

        // compute residual
        magma_c_spmv( c_mone, A, *x, c_zero, r, queue );      //  r = - A * x
        magma_caxpy(dofs, c_one, b.dval, 1, r.dval, 1);       //  r = r + b
        H(1,0) = MAGMA_C_MAKE( magma_scnrm2(dofs, r.dval, 1), 0. );
                    //  RNorm = H[1][0] = || r ||
        RNorm = MAGMA_C_REAL( H(1,0) );
        betanom = fabs(RNorm);

        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        if ( betanom < r0 ) {
            break;
        }
    }

    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    float residual;
    magma_cresidual( A, b, *x, &residual, queue );
    solver_par->iter_res = betanom;
    solver_par->final_res = residual;

    // classify the outcome for the caller
    if ( solver_par->numiter < solver_par->maxiter) {
        solver_par->info = MAGMA_SUCCESS;
    } else if ( solver_par->init_res > solver_par->final_res ) {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        solver_par->info = MAGMA_SLOW_CONVERGENCE;
    }
    else {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        solver_par->info = MAGMA_DIVERGENCE;
    }

    // free pinned memory
    magma_free_pinned( H );
    magma_free_pinned( y );
    magma_free_pinned( HH );
    magma_free_pinned( h1 );
    // free GPU memory
    magma_free(dy);
    if (dH != NULL ) magma_free(dH);
    magma_c_vfree(&r, queue );
    magma_c_vfree(&q, queue );
    // free GPU streams and events
    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[1] );
    magma_event_destroy( event[0] );
    //magmablasSetKernelStream(NULL);
    magmablasSetKernelStream( orig_queue );
    return MAGMA_SUCCESS;
}   /* magma_cgmres */
extern "C" magma_int_t magma_cbicgstab_merge( magma_c_sparse_matrix A, magma_c_vector b, magma_c_vector *x, magma_c_solver_par *solver_par, magma_queue_t queue ) { // set queue for old dense routines magma_queue_t orig_queue; magmablasGetKernelStream( &orig_queue ); // prepare solver feedback solver_par->solver = Magma_BICGSTABMERGE; solver_par->numiter = 0; solver_par->info = MAGMA_SUCCESS; // some useful variables magmaFloatComplex c_zero = MAGMA_C_ZERO, c_one = MAGMA_C_ONE; magma_int_t dofs = A.num_rows; // GPU stream magma_queue_t stream[2]; magma_event_t event[1]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); magma_event_create( &event[0] ); // workspace magma_c_vector q, r,rr,p,v,s,t; magmaFloatComplex *d1, *d2, *skp; d1 = NULL; d2 = NULL; skp = NULL; magma_int_t stat_dev = 0, stat_cpu = 0; stat_dev += magma_cmalloc( &d1, dofs*(2) ); stat_dev += magma_cmalloc( &d2, dofs*(2) ); // array for the parameters stat_dev += magma_cmalloc( &skp, 8 ); if( stat_dev != 0 ){ magma_free( d1 ); magma_free( d2 ); magma_free( skp ); printf("error: memory allocation.\n"); return MAGMA_ERR_DEVICE_ALLOC; } // skp = [alpha|beta|omega|rho_old|rho|nom|tmp1|tmp2] magma_c_vinit( &q, Magma_DEV, dofs*6, c_zero, queue ); // q = rr|r|p|v|s|t rr.memory_location = Magma_DEV; rr.dval = NULL; rr.num_rows = rr.nnz = dofs; rr.num_cols = 1; r.memory_location = Magma_DEV; r.dval = NULL; r.num_rows = r.nnz = dofs; r.num_cols = 1; p.memory_location = Magma_DEV; p.dval = NULL; p.num_rows = p.nnz = dofs; p.num_cols = 1; v.memory_location = Magma_DEV; v.dval = NULL; v.num_rows = v.nnz = dofs; v.num_cols = 1; s.memory_location = Magma_DEV; s.dval = NULL; s.num_rows = s.nnz = dofs; s.num_cols = 1; t.memory_location = Magma_DEV; t.dval = NULL; t.num_rows = t.nnz = dofs; t.num_cols = 1; rr.dval = q(0); r.dval = q(1); p.dval = q(2); v.dval = q(3); s.dval = q(4); t.dval = q(5); // solver variables magmaFloatComplex alpha, beta, omega, rho_old, rho_new, *skp_h; float nom, nom0, 
betanom, r0, den; // solver setup magma_cscal( dofs, c_zero, x->dval, 1) ; // x = 0 magma_ccopy( dofs, b.dval, 1, q(0), 1 ); // rr = b magma_ccopy( dofs, b.dval, 1, q(1), 1 ); // r = b rho_new = magma_cdotc( dofs, r.dval, 1, r.dval, 1 ); // rho=<rr,r> nom = MAGMA_C_REAL(magma_cdotc( dofs, r.dval, 1, r.dval, 1 )); nom0 = betanom = sqrt(nom); // nom = || r || rho_old = omega = alpha = MAGMA_C_MAKE( 1.0, 0. ); beta = rho_new; solver_par->init_res = nom0; // array on host for the parameters stat_cpu = magma_cmalloc_cpu( &skp_h, 8 ); if( stat_cpu != 0 ){ magma_free( d1 ); magma_free( d2 ); magma_free( skp ); magma_free_cpu( skp_h ); printf("error: memory allocation.\n"); return MAGMA_ERR_HOST_ALLOC; } skp_h[0]=alpha; skp_h[1]=beta; skp_h[2]=omega; skp_h[3]=rho_old; skp_h[4]=rho_new; skp_h[5]=MAGMA_C_MAKE(nom, 0.0); magma_csetvector( 8, skp_h, 1, skp, 1 ); magma_c_spmv( c_one, A, r, c_zero, v, queue ); // z = A r den = MAGMA_C_REAL( magma_cdotc(dofs, v.dval, 1, r.dval, 1) );// den = z dot r if ( (r0 = nom * solver_par->epsilon) < ATOLERANCE ) r0 = ATOLERANCE; if ( nom < r0 ) { magmablasSetKernelStream( orig_queue ); return MAGMA_SUCCESS; } // check positive definite if (den <= 0.0) { printf("Operator A is not postive definite. 
(Ar,r) = %f\n", den); magmablasSetKernelStream( orig_queue ); return MAGMA_NONSPD; solver_par->info = MAGMA_NONSPD;; } //Chronometry real_Double_t tempo1, tempo2; tempo1 = magma_sync_wtime( queue ); if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = nom0; solver_par->timing[0] = 0.0; } // start iteration for( solver_par->numiter= 1; solver_par->numiter<solver_par->maxiter; solver_par->numiter++ ) { magmablasSetKernelStream(stream[0]); // computes p=r+beta*(p-omega*v) magma_cbicgmerge1( dofs, skp, v.dval, r.dval, p.dval, queue ); magma_c_spmv( c_one, A, p, c_zero, v, queue ); // v = Ap magma_cmdotc( dofs, 1, q.dval, v.dval, d1, d2, skp, queue ); magma_cbicgmerge4( 1, skp, queue ); magma_cbicgmerge2( dofs, skp, r.dval, v.dval, s.dval, queue ); // s=r-alpha*v magma_c_spmv( c_one, A, s, c_zero, t, queue ); // t=As magma_cmdotc( dofs, 2, q.dval+4*dofs, t.dval, d1, d2, skp+6, queue ); magma_cbicgmerge4( 2, skp, queue ); magma_cbicgmerge_xrbeta( dofs, d1, d2, q.dval, r.dval, p.dval, s.dval, t.dval, x->dval, skp, queue ); // check stopping criterion (asynchronous copy) magma_cgetvector_async( 1 , skp+5, 1, skp_h+5, 1, stream[1] ); betanom = sqrt(MAGMA_C_REAL(skp_h[5])); if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( betanom < r0 ) { break; } } tempo2 = magma_sync_wtime( queue ); solver_par->runtime = (real_Double_t) tempo2-tempo1; float residual; magma_cresidual( A, b, *x, &residual, queue ); solver_par->iter_res = betanom; solver_par->final_res = residual; if ( solver_par->numiter < solver_par->maxiter) { solver_par->info = MAGMA_SUCCESS; } else if ( solver_par->init_res > solver_par->final_res ) { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { 
solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = MAGMA_SLOW_CONVERGENCE; } else { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = MAGMA_DIVERGENCE; } magma_c_vfree(&q, queue ); // frees all vectors magma_free(d1); magma_free(d2); magma_free( skp ); magma_free_cpu( skp_h ); magmablasSetKernelStream( orig_queue ); return MAGMA_SUCCESS; } /* cbicgstab_merge */
/* //////////////////////////////////////////////////////////////////////////// -- testing sparse matrix vector product */ int main( int argc, char** argv ) { TESTING_INIT(); magma_queue_t queue; magma_queue_create( /*devices[ opts->device ],*/ &queue ); magma_c_sparse_matrix hA, hA_SELLP, hA_ELL, dA, dA_SELLP, dA_ELL; hA_SELLP.blocksize = 8; hA_SELLP.alignment = 8; real_Double_t start, end, res; magma_int_t *pntre; magmaFloatComplex c_one = MAGMA_C_MAKE(1.0, 0.0); magmaFloatComplex c_zero = MAGMA_C_MAKE(0.0, 0.0); magma_int_t i, j; for( i = 1; i < argc; ++i ) { if ( strcmp("--blocksize", argv[i]) == 0 ) { hA_SELLP.blocksize = atoi( argv[++i] ); } else if ( strcmp("--alignment", argv[i]) == 0 ) { hA_SELLP.alignment = atoi( argv[++i] ); } else break; } printf( "\n# usage: ./run_cspmv" " [ --blocksize %d --alignment %d (for SELLP) ]" " matrices \n\n", (int) hA_SELLP.blocksize, (int) hA_SELLP.alignment ); while( i < argc ) { if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) { // Laplace test i++; magma_int_t laplace_size = atoi( argv[i] ); magma_cm_5stencil( laplace_size, &hA, queue ); } else { // file-matrix test magma_c_csr_mtx( &hA, argv[i], queue ); } printf( "\n# matrix info: %d-by-%d with %d nonzeros\n\n", (int) hA.num_rows,(int) hA.num_cols,(int) hA.nnz ); real_Double_t FLOPS = 2.0*hA.nnz/1e9; magma_c_vector hx, hy, dx, dy, hrefvec, hcheck; // init CPU vectors magma_c_vinit( &hx, Magma_CPU, hA.num_rows, c_zero, queue ); magma_c_vinit( &hy, Magma_CPU, hA.num_rows, c_zero, queue ); // init DEV vectors magma_c_vinit( &dx, Magma_DEV, hA.num_rows, c_one, queue ); magma_c_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue ); #ifdef MAGMA_WITH_MKL // calling MKL with CSR pntre = (magma_int_t*)malloc( (hA.num_rows+1)*sizeof(magma_int_t) ); pntre[0] = 0; for (j=0; j<hA.num_rows; j++ ) { pntre[j] = hA.row[j+1]; } MKL_INT num_rows = hA.num_rows; MKL_INT num_cols = hA.num_cols; MKL_INT nnz = hA.nnz; MKL_INT *col; TESTING_MALLOC_CPU( col, MKL_INT, nnz ); for( 
magma_int_t t=0; t < hA.nnz; ++t ) { col[ t ] = hA.col[ t ]; } MKL_INT *row; TESTING_MALLOC_CPU( row, MKL_INT, num_rows ); for( magma_int_t t=0; t < hA.num_rows; ++t ) { row[ t ] = hA.col[ t ]; } start = magma_wtime(); for (j=0; j<10; j++ ) { mkl_ccsrmv( "N", &num_rows, &num_cols, MKL_ADDR(&c_one), "GFNC", MKL_ADDR(hA.val), col, row, pntre, MKL_ADDR(hx.val), MKL_ADDR(&c_zero), MKL_ADDR(hy.val) ); } end = magma_wtime(); printf( "\n > MKL : %.2e seconds %.2e GFLOP/s (CSR).\n", (end-start)/10, FLOPS*10/(end-start) ); TESTING_FREE_CPU( row ); TESTING_FREE_CPU( col ); free(pntre); #endif // MAGMA_WITH_MKL // copy matrix to GPU magma_c_mtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue ); // SpMV on GPU (CSR) -- this is the reference! start = magma_sync_wtime( queue ); for (j=0; j<10; j++) magma_c_spmv( c_one, dA, dx, c_zero, dy, queue ); end = magma_sync_wtime( queue ); printf( " > MAGMA: %.2e seconds %.2e GFLOP/s (standard CSR).\n", (end-start)/10, FLOPS*10/(end-start) ); magma_c_mfree(&dA, queue ); magma_c_vtransfer( dy, &hrefvec , Magma_DEV, Magma_CPU, queue ); // convert to ELL and copy to GPU magma_c_mconvert( hA, &hA_ELL, Magma_CSR, Magma_ELL, queue ); magma_c_mtransfer( hA_ELL, &dA_ELL, Magma_CPU, Magma_DEV, queue ); magma_c_mfree(&hA_ELL, queue ); magma_c_vfree( &dy, queue ); magma_c_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue ); // SpMV on GPU (ELL) start = magma_sync_wtime( queue ); for (j=0; j<10; j++) magma_c_spmv( c_one, dA_ELL, dx, c_zero, dy, queue ); end = magma_sync_wtime( queue ); printf( " > MAGMA: %.2e seconds %.2e GFLOP/s (standard ELL).\n", (end-start)/10, FLOPS*10/(end-start) ); magma_c_mfree(&dA_ELL, queue ); magma_c_vtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue ); res = 0.0; for(magma_int_t k=0; k<hA.num_rows; k++ ) res=res + MAGMA_C_REAL(hcheck.val[k]) - MAGMA_C_REAL(hrefvec.val[k]); if ( res < .000001 ) printf("# tester spmv ELL: ok\n"); else printf("# tester spmv ELL: failed\n"); magma_c_vfree( &hcheck, queue ); // convert to SELLP 
and copy to GPU magma_c_mconvert( hA, &hA_SELLP, Magma_CSR, Magma_SELLP, queue ); magma_c_mtransfer( hA_SELLP, &dA_SELLP, Magma_CPU, Magma_DEV, queue ); magma_c_mfree(&hA_SELLP, queue ); magma_c_vfree( &dy, queue ); magma_c_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue ); // SpMV on GPU (SELLP) start = magma_sync_wtime( queue ); for (j=0; j<10; j++) magma_c_spmv( c_one, dA_SELLP, dx, c_zero, dy, queue ); end = magma_sync_wtime( queue ); printf( " > MAGMA: %.2e seconds %.2e GFLOP/s (SELLP).\n", (end-start)/10, FLOPS*10/(end-start) ); magma_c_vtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue ); res = 0.0; for(magma_int_t k=0; k<hA.num_rows; k++ ) res=res + MAGMA_C_REAL(hcheck.val[k]) - MAGMA_C_REAL(hrefvec.val[k]); printf("# |x-y|_F = %8.2e\n", res); if ( res < .000001 ) printf("# tester spmv SELL-P: ok\n"); else printf("# tester spmv SELL-P: failed\n"); magma_c_vfree( &hcheck, queue ); magma_c_mfree(&dA_SELLP, queue ); // SpMV on GPU (CUSPARSE - CSR) // CUSPARSE context // cusparseHandle_t cusparseHandle = 0; cusparseStatus_t cusparseStatus; cusparseStatus = cusparseCreate(&cusparseHandle); cusparseSetStream( cusparseHandle, queue ); cusparseMatDescr_t descr = 0; cusparseStatus = cusparseCreateMatDescr(&descr); cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO); magmaFloatComplex alpha = c_one; magmaFloatComplex beta = c_zero; magma_c_vfree( &dy, queue ); magma_c_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue ); // copy matrix to GPU magma_c_mtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue ); start = magma_sync_wtime( queue ); for (j=0; j<10; j++) cusparseStatus = cusparseCcsrmv(cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, hA.num_rows, hA.num_cols, hA.nnz, &alpha, descr, dA.dval, dA.drow, dA.dcol, dx.dval, &beta, dy.dval); end = magma_sync_wtime( queue ); if (cusparseStatus != 0) printf("error in cuSPARSE CSR\n"); printf( " > CUSPARSE: %.2e seconds %.2e GFLOP/s (CSR).\n", (end-start)/10, 
FLOPS*10/(end-start) ); cusparseMatDescr_t descrA; cusparseStatus = cusparseCreateMatDescr(&descrA); if (cusparseStatus != 0) printf("error\n"); cusparseHybMat_t hybA; cusparseStatus = cusparseCreateHybMat( &hybA ); if (cusparseStatus != 0) printf("error\n"); magma_c_vtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue ); res = 0.0; for(magma_int_t k=0; k<hA.num_rows; k++ ) res=res + MAGMA_C_REAL(hcheck.val[k]) - MAGMA_C_REAL(hrefvec.val[k]); printf("# |x-y|_F = %8.2e\n", res); if ( res < .000001 ) printf("# tester spmv cuSPARSE CSR: ok\n"); else printf("# tester spmv cuSPARSE CSR: failed\n"); magma_c_vfree( &hcheck, queue ); magma_c_vfree( &dy, queue ); magma_c_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue ); cusparseCcsr2hyb(cusparseHandle, hA.num_rows, hA.num_cols, descrA, dA.dval, dA.drow, dA.dcol, hybA, 0, CUSPARSE_HYB_PARTITION_AUTO); start = magma_sync_wtime( queue ); for (j=0; j<10; j++) cusparseStatus = cusparseChybmv( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA, hybA, dx.dval, &beta, dy.dval); end = magma_sync_wtime( queue ); if (cusparseStatus != 0) printf("error in cuSPARSE HYB\n"); printf( " > CUSPARSE: %.2e seconds %.2e GFLOP/s (HYB).\n", (end-start)/10, FLOPS*10/(end-start) ); magma_c_vtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue ); res = 0.0; for(magma_int_t k=0; k<hA.num_rows; k++ ) res=res + MAGMA_C_REAL(hcheck.val[k]) - MAGMA_C_REAL(hrefvec.val[k]); printf("# |x-y|_F = %8.2e\n", res); if ( res < .000001 ) printf("# tester spmv cuSPARSE HYB: ok\n"); else printf("# tester spmv cuSPARSE HYB: failed\n"); magma_c_vfree( &hcheck, queue ); cusparseDestroyMatDescr( descrA ); cusparseDestroyHybMat( hybA ); cusparseDestroy( cusparseHandle ); magma_c_mfree(&dA, queue ); printf("\n\n"); // free CPU memory magma_c_mfree(&hA, queue ); magma_c_vfree(&hx, queue ); magma_c_vfree(&hy, queue ); magma_c_vfree(&hrefvec, queue ); // free GPU memory magma_c_vfree(&dx, queue ); magma_c_vfree(&dy, queue ); i++; } magma_queue_destroy( queue 
); TESTING_FINALIZE(); return 0; }
extern "C" magma_int_t magma_cbicgstab( magma_c_sparse_matrix A, magma_c_vector b, magma_c_vector *x, magma_c_solver_par *solver_par, magma_queue_t queue ) { // set queue for old dense routines magma_queue_t orig_queue; magmablasGetKernelStream( &orig_queue ); // prepare solver feedback solver_par->solver = Magma_BICGSTAB; solver_par->numiter = 0; solver_par->info = MAGMA_SUCCESS; // some useful variables magmaFloatComplex c_zero = MAGMA_C_ZERO, c_one = MAGMA_C_ONE, c_mone = MAGMA_C_NEG_ONE; magma_int_t dofs = A.num_rows; // workspace magma_c_vector r,rr,p,v,s,t; magma_c_vinit( &r, Magma_DEV, dofs, c_zero, queue ); magma_c_vinit( &rr, Magma_DEV, dofs, c_zero, queue ); magma_c_vinit( &p, Magma_DEV, dofs, c_zero, queue ); magma_c_vinit( &v, Magma_DEV, dofs, c_zero, queue ); magma_c_vinit( &s, Magma_DEV, dofs, c_zero, queue ); magma_c_vinit( &t, Magma_DEV, dofs, c_zero, queue ); // solver variables magmaFloatComplex alpha, beta, omega, rho_old, rho_new; float nom, betanom, nom0, r0, den, res; // solver setup magma_cscal( dofs, c_zero, x->dval, 1) ; // x = 0 magma_ccopy( dofs, b.dval, 1, r.dval, 1 ); // r = b magma_ccopy( dofs, b.dval, 1, rr.dval, 1 ); // rr = b nom0 = betanom = magma_scnrm2( dofs, r.dval, 1 ); // nom = || r || nom = nom0*nom0; rho_old = omega = alpha = MAGMA_C_MAKE( 1.0, 0. ); solver_par->init_res = nom0; magma_c_spmv( c_one, A, r, c_zero, v, queue ); // z = A r den = MAGMA_C_REAL( magma_cdotc(dofs, v.dval, 1, r.dval, 1) ); // den = z' * r if ( (r0 = nom * solver_par->epsilon) < ATOLERANCE ) r0 = ATOLERANCE; if ( nom < r0 ) { magmablasSetKernelStream( orig_queue ); return MAGMA_SUCCESS; } // check positive definite if (den <= 0.0) { printf("Operator A is not postive definite. 
(Ar,r) = %f\n", den); magmablasSetKernelStream( orig_queue ); return MAGMA_NONSPD; solver_par->info = MAGMA_NONSPD;; } //Chronometry real_Double_t tempo1, tempo2; tempo1 = magma_sync_wtime( queue ); if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = nom0; solver_par->timing[0] = 0.0; } // start iteration for( solver_par->numiter= 1; solver_par->numiter<solver_par->maxiter; solver_par->numiter++ ) { rho_new = magma_cdotc( dofs, rr.dval, 1, r.dval, 1 ); // rho=<rr,r> beta = rho_new/rho_old * alpha/omega; // beta=rho/rho_old *alpha/omega magma_cscal( dofs, beta, p.dval, 1 ); // p = beta*p magma_caxpy( dofs, c_mone * omega * beta, v.dval, 1 , p.dval, 1 ); // p = p-omega*beta*v magma_caxpy( dofs, c_one, r.dval, 1, p.dval, 1 ); // p = p+r magma_c_spmv( c_one, A, p, c_zero, v, queue ); // v = Ap alpha = rho_new / magma_cdotc( dofs, rr.dval, 1, v.dval, 1 ); magma_ccopy( dofs, r.dval, 1 , s.dval, 1 ); // s=r magma_caxpy( dofs, c_mone * alpha, v.dval, 1 , s.dval, 1 ); // s=s-alpha*v magma_c_spmv( c_one, A, s, c_zero, t, queue ); // t=As omega = magma_cdotc( dofs, t.dval, 1, s.dval, 1 ) // omega = <s,t>/<t,t> / magma_cdotc( dofs, t.dval, 1, t.dval, 1 ); magma_caxpy( dofs, alpha, p.dval, 1 , x->dval, 1 ); // x=x+alpha*p magma_caxpy( dofs, omega, s.dval, 1 , x->dval, 1 ); // x=x+omega*s magma_ccopy( dofs, s.dval, 1 , r.dval, 1 ); // r=s magma_caxpy( dofs, c_mone * omega, t.dval, 1 , r.dval, 1 ); // r=r-omega*t res = betanom = magma_scnrm2( dofs, r.dval, 1 ); nom = betanom*betanom; rho_old = rho_new; // rho_old=rho if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) res; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( res/nom0 < solver_par->epsilon ) { break; } } tempo2 = magma_sync_wtime( queue ); solver_par->runtime = (real_Double_t) tempo2-tempo1; float residual; 
magma_cresidual( A, b, *x, &residual, queue ); solver_par->iter_res = res; solver_par->final_res = residual; if ( solver_par->numiter < solver_par->maxiter) { solver_par->info = MAGMA_SUCCESS; } else if ( solver_par->init_res > solver_par->final_res ) { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = MAGMA_SLOW_CONVERGENCE; } else { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = MAGMA_DIVERGENCE; } magma_c_vfree(&r, queue ); magma_c_vfree(&rr, queue ); magma_c_vfree(&p, queue ); magma_c_vfree(&v, queue ); magma_c_vfree(&s, queue ); magma_c_vfree(&t, queue ); magmablasSetKernelStream( orig_queue ); return MAGMA_SUCCESS; } /* magma_cbicgstab */
extern "C" magma_int_t
magma_cbpcg( magma_c_sparse_matrix A, magma_c_vector b, magma_c_vector *x,
             magma_c_solver_par *solver_par,
             magma_c_preconditioner *precond_par, magma_queue_t queue )
{
    /*  Block preconditioned CG: solves A X = B for num_vecs right-hand
        sides stacked in b (num_vecs = b.num_rows / A.num_rows).

        A            sparse SPD system matrix (device)
        b            stacked right-hand sides (device)
        x            in/out stacked solutions (device); reset to 0 on entry
        solver_par   in/out iteration count, residual history, status
        precond_par  preconditioner applied as M^{-1} = right * left
        queue        magma queue the sparse kernels are executed on

        Returns MAGMA_SUCCESS, MAGMA_NONSPD, or MAGMA_ERR_HOST_ALLOC.

        Fixes vs. the previous version:
          - `residual` was allocated twice (first buffer leaked) and never
            freed on the success path.
          - solver_par->info = MAGMA_NONSPD was dead code after the return.
          - early returns and the host-alloc error path leaked the five
            device workspace vectors; all exits share one cleanup path.
          - removed the unused local `stat_dev`.
    */

    // set queue for old dense routines
    magma_queue_t orig_queue;
    magmablasGetKernelStream( &orig_queue );

    magma_int_t retval = MAGMA_SUCCESS;
    magma_int_t stat_cpu = 0;
    magma_int_t i, num_vecs = b.num_rows/A.num_rows;  // number of RHS vectors

    // prepare solver feedback
    solver_par->solver = Magma_PCG;
    solver_par->numiter = 0;
    solver_par->info = MAGMA_SUCCESS;

    // local variables
    magmaFloatComplex c_zero = MAGMA_C_ZERO, c_one = MAGMA_C_ONE;
    magma_int_t dofs = A.num_rows;

    // GPU workspace (one stacked vector per quantity)
    magma_c_vector r, rt, p, q, h;
    magma_c_vinit( &r,  Magma_DEV, dofs*num_vecs, c_zero, queue );
    magma_c_vinit( &rt, Magma_DEV, dofs*num_vecs, c_zero, queue );
    magma_c_vinit( &p,  Magma_DEV, dofs*num_vecs, c_zero, queue );
    magma_c_vinit( &q,  Magma_DEV, dofs*num_vecs, c_zero, queue );
    magma_c_vinit( &h,  Magma_DEV, dofs*num_vecs, c_zero, queue );

    // per-RHS solver scalars on the host
    magmaFloatComplex *alpha = NULL, *beta = NULL;
    stat_cpu += magma_cmalloc_cpu( &alpha, num_vecs );
    stat_cpu += magma_cmalloc_cpu( &beta,  num_vecs );

    float *nom = NULL, *nom0 = NULL, *r0 = NULL, *gammaold = NULL,
          *gammanew = NULL, *den = NULL, *res = NULL, *residual = NULL;
    stat_cpu += magma_smalloc_cpu( &residual, num_vecs );  // allocated once (was duplicated)
    stat_cpu += magma_smalloc_cpu( &nom,      num_vecs );
    stat_cpu += magma_smalloc_cpu( &nom0,     num_vecs );
    stat_cpu += magma_smalloc_cpu( &r0,       num_vecs );
    stat_cpu += magma_smalloc_cpu( &gammaold, num_vecs );
    stat_cpu += magma_smalloc_cpu( &gammanew, num_vecs );
    stat_cpu += magma_smalloc_cpu( &den,      num_vecs );
    stat_cpu += magma_smalloc_cpu( &res,      num_vecs );

    // chronometry -- declared before the gotos below so no initialized
    // declaration is jumped over
    real_Double_t tempo1 = 0.0, tempo2 = 0.0;

    if ( stat_cpu != 0 ) {
        printf("error: memory allocation.\n");
        retval = MAGMA_ERR_HOST_ALLOC;
        goto cleanup;   // also releases the device workspace
    }

    // solver setup
    magma_cscal( dofs*num_vecs, c_zero, x->dval, 1 );       // x = 0
    magma_ccopy( dofs*num_vecs, b.dval, 1, r.dval, 1 );     // r = b

    // preconditioner: h = M^{-1} r
    magma_c_applyprecond_left( A, r, &rt, precond_par, queue );
    magma_c_applyprecond_right( A, rt, &h, precond_par, queue );

    magma_ccopy( dofs*num_vecs, h.dval, 1, p.dval, 1 );     // p = h

    for( i=0; i<num_vecs; i++ ) {
        nom[i]  = MAGMA_C_REAL( magma_cdotc( dofs, r(i), 1, h(i), 1 ) );
        nom0[i] = magma_scnrm2( dofs, r(i), 1 );
    }
    magma_c_spmv( c_one, A, p, c_zero, q, queue );          // q = A p
    for( i=0; i<num_vecs; i++ )
        den[i] = MAGMA_C_REAL( magma_cdotc( dofs, p(i), 1, q(i), 1 ) ); // den = p dot q

    solver_par->init_res = nom0[0];

    if ( (r0[0] = nom[0] * solver_par->epsilon) < ATOLERANCE )
        r0[0] = ATOLERANCE;
    // check positive definite
    if ( den[0] <= 0.0 ) {
        printf("Operator A is not positive definite. (Ar,r) = %f\n", den[0]);
        solver_par->info = MAGMA_NONSPD;    // record failure BEFORE returning
        retval = MAGMA_NONSPD;
        goto cleanup;
    }
    if ( nom[0] < r0[0] ) {
        // initial guess already accurate enough; retval stays MAGMA_SUCCESS
        goto cleanup;
    }

    tempo1 = magma_sync_wtime( queue );
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = (real_Double_t) nom0[0];
        solver_par->timing[0] = 0.0;
    }

    // start iteration
    for( solver_par->numiter = 1; solver_par->numiter < solver_par->maxiter;
         solver_par->numiter++ ) {

        // preconditioner: h = M^{-1} r
        magma_c_applyprecond_left( A, r, &rt, precond_par, queue );
        magma_c_applyprecond_right( A, rt, &h, precond_par, queue );

        for( i=0; i<num_vecs; i++ )
            gammanew[i] = MAGMA_C_REAL( magma_cdotc( dofs, r(i), 1, h(i), 1 ) ); // gn = <r,h>

        if ( solver_par->numiter == 1 ) {
            magma_ccopy( dofs*num_vecs, h.dval, 1, p.dval, 1 );     // p = h
        } else {
            for( i=0; i<num_vecs; i++ ) {
                beta[i] = MAGMA_C_MAKE( gammanew[i]/gammaold[i], 0. ); // beta = gn/go
                magma_cscal( dofs, beta[i], p(i), 1 );              // p = beta*p
                magma_caxpy( dofs, c_one, h(i), 1, p(i), 1 );       // p = p + h
            }
        }
        magma_c_spmv( c_one, A, p, c_zero, q, queue );              // q = A p
        // magma_c_bspmv_tuned( dofs, num_vecs, c_one, A, p.dval, c_zero, q.dval, queue );

        for( i=0; i<num_vecs; i++ ) {
            den[i]   = MAGMA_C_REAL( magma_cdotc( dofs, p(i), 1, q(i), 1 ) ); // den = p dot q
            alpha[i] = MAGMA_C_MAKE( gammanew[i]/den[i], 0. );
            magma_caxpy( dofs,  alpha[i], p(i), 1, x->dval+dofs*i, 1 ); // x = x + alpha p
            magma_caxpy( dofs, -alpha[i], q(i), 1, r(i), 1 );           // r = r - alpha q
            gammaold[i] = gammanew[i];
            res[i] = magma_scnrm2( dofs, r(i), 1 );
        }

        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res[0];
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        if ( res[0]/nom0[0] < solver_par->epsilon ) {
            break;
        }
    }
    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    magma_cresidual( A, b, *x, residual, queue );
    solver_par->iter_res = res[0];
    solver_par->final_res = residual[0];

    // classify the run: converged / slow convergence / divergence
    if ( solver_par->numiter < solver_par->maxiter ) {
        solver_par->info = MAGMA_SUCCESS;
    } else if ( solver_par->init_res > solver_par->final_res ) {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res[0];
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        solver_par->info = MAGMA_SLOW_CONVERGENCE;
    } else {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res[0];
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        solver_par->info = MAGMA_DIVERGENCE;
    }

    // per-RHS diagnostics
    for( i=0; i<num_vecs; i++ ) {
        printf("%.4e ", res[i]);
    }
    printf("\n");
    for( i=0; i<num_vecs; i++ ) {
        printf("%.4e ", residual[i]);
    }
    printf("\n");

cleanup:
    magma_c_vfree( &r, queue );
    magma_c_vfree( &rt, queue );
    magma_c_vfree( &p, queue );
    magma_c_vfree( &q, queue );
    magma_c_vfree( &h, queue );
    magma_free_cpu( alpha );
    magma_free_cpu( beta );
    magma_free_cpu( nom );
    magma_free_cpu( nom0 );
    magma_free_cpu( r0 );
    magma_free_cpu( gammaold );
    magma_free_cpu( gammanew );
    magma_free_cpu( den );
    magma_free_cpu( res );
    magma_free_cpu( residual );     // was never freed on the success path
    magmablasSetKernelStream( orig_queue );
    return retval;
}   /* magma_cbpcg */
/* //////////////////////////////////////////////////////////////////////////// -- testing any solver */ int main( int argc, char** argv) { TESTING_INIT(); magma_copts zopts; int i=1; magma_cparse_opts( argc, argv, &zopts, &i); magmaFloatComplex one = MAGMA_C_MAKE(1.0, 0.0); magmaFloatComplex zero = MAGMA_C_MAKE(0.0, 0.0); magma_c_sparse_matrix A, B, B_d; magma_c_vector x, b; B.blocksize = zopts.blocksize; B.alignment = zopts.alignment; if ( zopts.solver_par.solver != Magma_PCG && zopts.solver_par.solver != Magma_PGMRES && zopts.solver_par.solver != Magma_PBICGSTAB && zopts.solver_par.solver != Magma_ITERREF ) zopts.precond_par.solver = Magma_NONE; magma_csolverinfo_init( &zopts.solver_par, &zopts.precond_par ); while( i < argc ){ magma_c_csr_mtx( &A, argv[i] ); printf( "\n# matrix info: %d-by-%d with %d nonzeros\n\n", (int) A.num_rows,(int) A.num_cols,(int) A.nnz ); // scale matrix magma_cmscale( &A, zopts.scaling ); magma_c_mconvert( A, &B, Magma_CSR, zopts.output_format ); magma_c_mtransfer( B, &B_d, Magma_CPU, Magma_DEV ); // vectors and initial guess magma_c_vinit( &b, Magma_DEV, A.num_cols, one ); magma_c_vinit( &x, Magma_DEV, A.num_cols, one ); magma_c_spmv( one, B_d, x, zero, b ); // b = A x magma_c_vfree(&x); magma_c_vinit( &x, Magma_DEV, A.num_cols, zero ); magma_c_solver( B_d, b, &x, &zopts ); magma_csolverinfo( &zopts.solver_par, &zopts.precond_par ); magma_c_mfree(&B_d); magma_c_mfree(&B); magma_c_mfree(&A); magma_c_vfree(&x); magma_c_vfree(&b); i++; } magma_csolverinfo_free( &zopts.solver_par, &zopts.precond_par ); TESTING_FINALIZE(); return 0; }
extern "C" magma_int_t magma_ccg_res( magma_c_sparse_matrix A, magma_c_vector b, magma_c_vector *x, magma_c_solver_par *solver_par, magma_queue_t queue ) { // set queue for old dense routines magma_queue_t orig_queue; magmablasGetKernelStream( &orig_queue ); // prepare solver feedback solver_par->solver = Magma_CG; solver_par->numiter = 0; solver_par->info = MAGMA_SUCCESS; // local variables magmaFloatComplex c_zero = MAGMA_C_ZERO, c_one = MAGMA_C_ONE; magma_int_t dofs = A.num_rows; // GPU workspace magma_c_vector r, p, q; magma_c_vinit( &r, Magma_DEV, dofs, c_zero, queue ); magma_c_vinit( &p, Magma_DEV, dofs, c_zero, queue ); magma_c_vinit( &q, Magma_DEV, dofs, c_zero, queue ); // solver variables magmaFloatComplex alpha, beta; float nom, nom0, r0, betanom, betanomsq, den, res; // solver setup magma_cscal( dofs, c_zero, x->dval, 1) ; // x = 0 magma_ccopy( dofs, b.dval, 1, r.dval, 1 ); // r = b magma_ccopy( dofs, b.dval, 1, p.dval, 1 ); // p = b nom0 = betanom = magma_scnrm2( dofs, r.dval, 1 ); nom = nom0 * nom0; // nom = r' * r magma_c_spmv( c_one, A, p, c_zero, q, queue ); // q = A p den = MAGMA_C_REAL( magma_cdotc(dofs, p.dval, 1, q.dval, 1) );// den = p dot q solver_par->init_res = nom0; if ( (r0 = nom * solver_par->epsilon) < ATOLERANCE ) r0 = ATOLERANCE; if ( nom < r0 ) { magmablasSetKernelStream( orig_queue ); return MAGMA_SUCCESS; } // check positive definite if (den <= 0.0) { printf("Operator A is not postive definite. 
(Ar,r) = %f\n", den); magmablasSetKernelStream( orig_queue ); return MAGMA_NONSPD; solver_par->info = MAGMA_NONSPD;; } //Chronometry real_Double_t tempo1, tempo2; tempo1 = magma_sync_wtime( queue ); if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = (real_Double_t)nom0; solver_par->timing[0] = 0.0; } // start iteration for( solver_par->numiter= 1; solver_par->numiter<solver_par->maxiter; solver_par->numiter++ ) { alpha = MAGMA_C_MAKE(nom/den, 0.); magma_caxpy(dofs, alpha, p.dval, 1, x->dval, 1); // x = x + alpha p magma_caxpy(dofs, -alpha, q.dval, 1, r.dval, 1); // r = r - alpha q res = betanom = magma_scnrm2(dofs, r.dval, 1); // betanom = || r || betanomsq = betanom * betanom; // betanoms = r' * r if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) res; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( res/nom0 < solver_par->epsilon ) { break; } beta = MAGMA_C_MAKE(betanomsq/nom, 0.); // beta = betanoms/nom magma_cscal(dofs, beta, p.dval, 1); // p = beta*p magma_caxpy(dofs, c_one, r.dval, 1, p.dval, 1); // p = p + r magma_c_spmv( c_one, A, p, c_zero, q, queue ); // q = A p den = MAGMA_C_REAL(magma_cdotc(dofs, p.dval, 1, q.dval, 1)); // den = p dot q nom = betanomsq; } tempo2 = magma_sync_wtime( queue ); solver_par->runtime = (real_Double_t) tempo2-tempo1; float residual; magma_cresidual( A, b, *x, &residual, queue ); solver_par->iter_res = res; solver_par->final_res = residual; if ( solver_par->numiter < solver_par->maxiter) { solver_par->info = MAGMA_SUCCESS; } else if ( solver_par->init_res > solver_par->final_res ) { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] 
= (real_Double_t) tempo2-tempo1; } } solver_par->info = MAGMA_SLOW_CONVERGENCE; } else { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = MAGMA_DIVERGENCE; } magma_c_vfree(&r, queue ); magma_c_vfree(&p, queue ); magma_c_vfree(&q, queue ); magmablasSetKernelStream( orig_queue ); return MAGMA_SUCCESS; } /* magma_ccg */