/** Purpose ------- SPOTRS solves a system of linear equations A*X = B with a symmetric positive definite matrix A using the Cholesky factorization A = U**H*U or A = L*L**H computed by SPOTRF. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in] dA REAL array on the GPU, dimension (LDDA,N) The triangular factor U or L from the Cholesky factorization A = U**H*U or A = L*L**H, as computed by SPOTRF. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,N). @param[in,out] dB REAL array on the GPU, dimension (LDDB,NRHS) On entry, the right hand side matrix B. On exit, the solution matrix X. @param[in] lddb INTEGER The leading dimension of the array B. LDDB >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_sposv_comp ********************************************************************/ extern "C" magma_int_t magma_spotrs_gpu(magma_uplo_t uplo, magma_int_t n, magma_int_t nrhs, float *dA, magma_int_t ldda, float *dB, magma_int_t lddb, magma_int_t *info) { float c_one = MAGMA_S_ONE; *info = 0; if ( uplo != MagmaUpper && uplo != MagmaLower ) *info = -1; if ( n < 0 ) *info = -2; if ( nrhs < 0) *info = -3; if ( ldda < max(1, n) ) *info = -5; if ( lddb < max(1, n) ) *info = -7; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if ( (n == 0) || (nrhs == 0) ) { return *info; } if ( uplo == MagmaUpper ) { if ( nrhs == 1) { magma_strsv(MagmaUpper, MagmaConjTrans, MagmaNonUnit, n, dA, ldda, dB, 1 ); magma_strsv(MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, dA, ldda, dB, 1 ); } else { magma_strsm(MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb); magma_strsm(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb); } } else { if ( nrhs == 1) { magma_strsv(MagmaLower, MagmaNoTrans, MagmaNonUnit, n, dA, ldda, dB, 1 ); magma_strsv(MagmaLower, MagmaConjTrans, MagmaNonUnit, n, dA, ldda, dB, 1 ); } else { magma_strsm(MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb); magma_strsm(MagmaLeft, MagmaLower, MagmaConjTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb); } } return *info; }
/** Purpose ------- Solves a system of linear equations A * X = B, A**T * X = B, or A**T * X = B with a general N-by-N matrix A using the LU factorization computed by SGETRF_GPU. Arguments --------- @param[in] trans magma_trans_t Specifies the form of the system of equations: - = MagmaNoTrans: A * X = B (No transpose) - = MagmaTrans: A**T * X = B (Transpose) - = MagmaTrans: A**T * X = B (Conjugate transpose) @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in] dA REAL array on the GPU, dimension (LDA,N) The factors L and U from the factorization A = P*L*U as computed by SGETRF_GPU. @param[in] ldda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[in] ipiv INTEGER array, dimension (N) The pivot indices from SGETRF; for 1 <= i <= N, row i of the matrix was interchanged with row IPIV(i). @param[in,out] dB REAL array on the GPU, dimension (LDB,NRHS) On entry, the right hand side matrix B. On exit, the solution matrix X. @param[in] lddb INTEGER The leading dimension of the array B. LDB >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_sgesv_comp ********************************************************************/ extern "C" magma_int_t magma_sgetrs_gpu(magma_trans_t trans, magma_int_t n, magma_int_t nrhs, float *dA, magma_int_t ldda, magma_int_t *ipiv, float *dB, magma_int_t lddb, magma_int_t *info) { float c_one = MAGMA_S_ONE; float *work = NULL; int notran = (trans == MagmaNoTrans); magma_int_t i1, i2, inc; *info = 0; if ( (! notran) && (trans != MagmaTrans) && (trans != MagmaTrans) ) { *info = -1; } else if (n < 0) { *info = -2; } else if (nrhs < 0) { *info = -3; } else if (ldda < max(1,n)) { *info = -5; } else if (lddb < max(1,n)) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (n == 0 || nrhs == 0) { return *info; } magma_smalloc_cpu( &work, n * nrhs ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } i1 = 1; i2 = n; if (notran) { inc = 1; /* Solve A * X = B. */ magma_sgetmatrix( n, nrhs, dB, lddb, work, n ); lapackf77_slaswp(&nrhs, work, &n, &i1, &i2, ipiv, &inc); magma_ssetmatrix( n, nrhs, work, n, dB, lddb ); if ( nrhs == 1) { magma_strsv(MagmaLower, MagmaNoTrans, MagmaUnit, n, dA, ldda, dB, 1 ); magma_strsv(MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, dA, ldda, dB, 1 ); } else { magma_strsm(MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, n, nrhs, c_one, dA, ldda, dB, lddb ); magma_strsm(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb ); } } else { inc = -1; /* Solve A**T * X = B or A**T * X = B. */ if ( nrhs == 1) { magma_strsv(MagmaUpper, trans, MagmaNonUnit, n, dA, ldda, dB, 1 ); magma_strsv(MagmaLower, trans, MagmaUnit, n, dA, ldda, dB, 1 ); } else { magma_strsm(MagmaLeft, MagmaUpper, trans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb ); magma_strsm(MagmaLeft, MagmaLower, trans, MagmaUnit, n, nrhs, c_one, dA, ldda, dB, lddb ); } magma_sgetmatrix( n, nrhs, dB, lddb, work, n ); lapackf77_slaswp(&nrhs, work, &n, &i1, &i2, ipiv, &inc); magma_ssetmatrix( n, nrhs, work, n, dB, lddb ); } magma_free_cpu(work); return *info; }
extern "C" magma_int_t magma_sgetrs_gpu(char trans, magma_int_t n, magma_int_t nrhs, float *dA, magma_int_t ldda, magma_int_t *ipiv, float *dB, magma_int_t lddb, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= Solves a system of linear equations A * X = B or A' * X = B with a general N-by-N matrix A using the LU factorization computed by SGETRF_GPU. Arguments ========= TRANS (input) CHARACTER*1 Specifies the form of the system of equations: = 'N': A * X = B (No transpose) = 'T': A'* X = B (Transpose) = 'C': A'* X = B (Conjugate transpose = Transpose) N (input) INTEGER The order of the matrix A. N >= 0. NRHS (input) INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. A (input) REAL array on the GPU, dimension (LDA,N) The factors L and U from the factorization A = P*L*U as computed by SGETRF_GPU. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). IPIV (input) INTEGER array, dimension (N) The pivot indices from SGETRF; for 1<=i<=N, row i of the matrix was interchanged with row IPIV(i). B (input/output) REAL array on the GPU, dimension (LDB,NRHS) On entry, the right hand side matrix B. On exit, the solution matrix X. LDB (input) INTEGER The leading dimension of the array B. LDB >= max(1,N). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value HWORK (workspace) REAL array, dimension N*NRHS ===================================================================== */ float c_one = MAGMA_S_ONE; float *work = NULL; char trans_[2] = {trans, 0}; int notran = lapackf77_lsame(trans_, "N"); magma_int_t i1, i2, inc; *info = 0; if ( (! notran) && (! lapackf77_lsame(trans_, "T")) && (! lapackf77_lsame(trans_, "C")) ) { *info = -1; } else if (n < 0) { *info = -2; } else if (nrhs < 0) { *info = -3; } else if (ldda < max(1,n)) { *info = -5; } else if (lddb < max(1,n)) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (n == 0 || nrhs == 0) { return *info; } magma_smalloc_cpu( &work, n * nrhs ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } i1 = 1; i2 = n; if (notran) { inc = 1; /* Solve A * X = B. */ magma_sgetmatrix( n, nrhs, dB, lddb, work, n ); lapackf77_slaswp(&nrhs, work, &n, &i1, &i2, ipiv, &inc); magma_ssetmatrix( n, nrhs, work, n, dB, lddb ); if ( nrhs == 1) { magma_strsv(MagmaLower, MagmaNoTrans, MagmaUnit, n, dA, ldda, dB, 1 ); magma_strsv(MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, dA, ldda, dB, 1 ); } else { magma_strsm(MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, n, nrhs, c_one, dA, ldda, dB, lddb ); magma_strsm(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb ); } } else { inc = -1; /* Solve A' * X = B. */ if ( nrhs == 1) { magma_strsv(MagmaUpper, trans, MagmaNonUnit, n, dA, ldda, dB, 1 ); magma_strsv(MagmaLower, trans, MagmaUnit, n, dA, ldda, dB, 1 ); } else { magma_strsm(MagmaLeft, MagmaUpper, trans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb ); magma_strsm(MagmaLeft, MagmaLower, trans, MagmaUnit, n, nrhs, c_one, dA, ldda, dB, lddb ); } magma_sgetmatrix( n, nrhs, dB, lddb, work, n ); lapackf77_slaswp(&nrhs, work, &n, &i1, &i2, ipiv, &inc); magma_ssetmatrix( n, nrhs, work, n, dB, lddb ); } magma_free_cpu(work); return *info; }
extern "C" magma_int_t magma_spidr( magma_s_matrix A, magma_s_matrix b, magma_s_matrix *x, magma_s_solver_par *solver_par, magma_s_preconditioner *precond_par, magma_queue_t queue ) { magma_int_t info = MAGMA_NOTCONVERGED; // prepare solver feedback solver_par->solver = Magma_PIDR; solver_par->numiter = 0; solver_par->spmv_count = 0; solver_par->init_res = 0.0; solver_par->final_res = 0.0; solver_par->iter_res = 0.0; solver_par->runtime = 0.0; // constants const float c_zero = MAGMA_S_ZERO; const float c_one = MAGMA_S_ONE; const float c_n_one = MAGMA_S_NEG_ONE; // internal user parameters const magma_int_t smoothing = 1; // 0 = disable, 1 = enable const float angle = 0.7; // [0-1] // local variables magma_int_t iseed[4] = {0, 0, 0, 1}; magma_int_t dof; magma_int_t s; magma_int_t distr; magma_int_t k, i, sk; magma_int_t innerflag; float residual; float nrm; float nrmb; float nrmr; float nrmt; float rho; float om; float tt; float tr; float gamma; float alpha; float mkk; float fk; // matrices and vectors magma_s_matrix dxs = {Magma_CSR}; magma_s_matrix dr = {Magma_CSR}, drs = {Magma_CSR}; magma_s_matrix dP = {Magma_CSR}, dP1 = {Magma_CSR}; magma_s_matrix dG = {Magma_CSR}; magma_s_matrix dU = {Magma_CSR}; magma_s_matrix dM = {Magma_CSR}; magma_s_matrix df = {Magma_CSR}; magma_s_matrix dt = {Magma_CSR}; magma_s_matrix dc = {Magma_CSR}; magma_s_matrix dv = {Magma_CSR}; magma_s_matrix dbeta = {Magma_CSR}, hbeta = {Magma_CSR}; magma_s_matrix dlu = {Magma_CSR}; // chronometry real_Double_t tempo1, tempo2; // initial s space // TODO: add option for 's' (shadow space number) // Hack: uses '--restart' option as the shadow space number. // This is not a good idea because the default value of restart option is used to detect // if the user provided a custom restart. This means that if the default restart value // is changed then the code will think it was the user (unless the default value is // also updated in the 'if' statement below. s = 1; if ( solver_par->restart != 50 ) { if ( solver_par->restart > A.num_cols ) { s = A.num_cols; } else { s = solver_par->restart; } } solver_par->restart = s; // set max iterations solver_par->maxiter = min( 2 * A.num_cols, solver_par->maxiter ); // check if matrix A is square if ( A.num_rows != A.num_cols ) { //printf("Matrix A is not square.\n"); info = MAGMA_ERR_NOT_SUPPORTED; goto cleanup; } // |b| nrmb = magma_snrm2( b.num_rows, b.dval, 1, queue ); if ( nrmb == 0.0 ) { magma_sscal( x->num_rows, MAGMA_S_ZERO, x->dval, 1, queue ); info = MAGMA_SUCCESS; goto cleanup; } // r = b - A x CHECK( magma_svinit( &dr, Magma_DEV, b.num_rows, 1, c_zero, queue )); CHECK( magma_sresidualvec( A, b, *x, &dr, &nrmr, queue )); // |r| solver_par->init_res = nrmr; solver_par->final_res = solver_par->init_res; solver_par->iter_res = solver_par->init_res; if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = (real_Double_t)nrmr; } // check if initial is guess good enough if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { info = MAGMA_SUCCESS; goto cleanup; } // P = randn(n, s) // P = ortho(P) //--------------------------------------- // P = 0.0 CHECK( magma_svinit( &dP, Magma_CPU, A.num_cols, s, c_zero, queue )); // P = randn(n, s) distr = 3; // 1 = unif (0,1), 2 = unif (-1,1), 3 = normal (0,1) dof = dP.num_rows * dP.num_cols; lapackf77_slarnv( &distr, iseed, &dof, dP.val ); // transfer P to device CHECK( magma_smtransfer( dP, &dP1, Magma_CPU, Magma_DEV, queue )); magma_smfree( &dP, queue ); // P = ortho(P1) if ( dP1.num_cols > 1 ) { // P = magma_sqr(P1), QR factorization CHECK( magma_sqr( dP1.num_rows, dP1.num_cols, dP1, dP1.ld, &dP, NULL, queue )); } else { // P = P1 / |P1| nrm = magma_snrm2( dof, dP1.dval, 1, queue ); nrm = 1.0 / nrm; magma_sscal( dof, nrm, dP1.dval, 1, queue ); CHECK( magma_smtransfer( dP1, &dP, Magma_DEV, Magma_DEV, queue )); } magma_smfree( &dP1, queue ); //--------------------------------------- // allocate memory for the scalar products CHECK( magma_svinit( &hbeta, Magma_CPU, s, 1, c_zero, queue )); CHECK( magma_svinit( &dbeta, Magma_DEV, s, 1, c_zero, queue )); // smoothing enabled if ( smoothing > 0 ) { // set smoothing solution vector CHECK( magma_smtransfer( *x, &dxs, Magma_DEV, Magma_DEV, queue )); // set smoothing residual vector CHECK( magma_smtransfer( dr, &drs, Magma_DEV, Magma_DEV, queue )); } // G(n,s) = 0 CHECK( magma_svinit( &dG, Magma_DEV, A.num_cols, s, c_zero, queue )); // U(n,s) = 0 CHECK( magma_svinit( &dU, Magma_DEV, A.num_cols, s, c_zero, queue )); // M(s,s) = I CHECK( magma_svinit( &dM, Magma_DEV, s, s, c_zero, queue )); magmablas_slaset( MagmaFull, s, s, c_zero, c_one, dM.dval, s, queue ); // f = 0 CHECK( magma_svinit( &df, Magma_DEV, dP.num_cols, 1, c_zero, queue )); // t = 0 CHECK( magma_svinit( &dt, Magma_DEV, dr.num_rows, 1, c_zero, queue )); // c = 0 CHECK( magma_svinit( &dc, Magma_DEV, dM.num_cols, 1, c_zero, queue )); // v = 0 CHECK( magma_svinit( &dv, Magma_DEV, dr.num_rows, 1, c_zero, queue )); // lu = 0 CHECK( magma_svinit( &dlu, Magma_DEV, A.num_rows, 1, c_zero, queue )); //--------------START TIME--------------- // chronometry tempo1 = magma_sync_wtime( queue ); if ( solver_par->verbose > 0 ) { solver_par->timing[0] = 0.0; } om = MAGMA_S_ONE; innerflag = 0; // start iteration do { solver_par->numiter++; // new RHS for small systems // f = P' r magmablas_sgemv( MagmaConjTrans, dP.num_rows, dP.num_cols, c_one, dP.dval, dP.ld, dr.dval, 1, c_zero, df.dval, 1, queue ); // shadow space loop for ( k = 0; k < s; ++k ) { sk = s - k; // f(k:s) = M(k:s,k:s) c(k:s) magma_scopyvector( sk, &df.dval[k], 1, &dc.dval[k], 1, queue ); magma_strsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, sk, &dM.dval[k*dM.ld+k], dM.ld, &dc.dval[k], 1, queue ); // v = r - G(:,k:s) c(k:s) magma_scopyvector( dr.num_rows, dr.dval, 1, dv.dval, 1, queue ); magmablas_sgemv( MagmaNoTrans, dG.num_rows, sk, c_n_one, &dG.dval[k*dG.ld], dG.ld, &dc.dval[k], 1, c_one, dv.dval, 1, queue ); // preconditioning operation // v = L \ v; // v = U \ v; CHECK( magma_s_applyprecond_left( MagmaNoTrans, A, dv, &dlu, precond_par, queue )); CHECK( magma_s_applyprecond_right( MagmaNoTrans, A, dlu, &dv, precond_par, queue )); // U(:,k) = om * v + U(:,k:s) c(k:s) magmablas_sgemv( MagmaNoTrans, dU.num_rows, sk, c_one, &dU.dval[k*dU.ld], dU.ld, &dc.dval[k], 1, om, dv.dval, 1, queue ); magma_scopyvector( dU.num_rows, dv.dval, 1, &dU.dval[k*dU.ld], 1, queue ); // G(:,k) = A U(:,k) CHECK( magma_s_spmv( c_one, A, dv, c_zero, dv, queue )); solver_par->spmv_count++; magma_scopyvector( dG.num_rows, dv.dval, 1, &dG.dval[k*dG.ld], 1, queue ); // bi-orthogonalize the new basis vectors for ( i = 0; i < k; ++i ) { // alpha = P(:,i)' G(:,k) alpha = magma_sdot( dP.num_rows, &dP.dval[i*dP.ld], 1, &dG.dval[k*dG.ld], 1, queue ); // alpha = alpha / M(i,i) magma_sgetvector( 1, &dM.dval[i*dM.ld+i], 1, &mkk, 1, queue ); alpha = alpha / mkk; // G(:,k) = G(:,k) - alpha * G(:,i) magma_saxpy( dG.num_rows, -alpha, &dG.dval[i*dG.ld], 1, &dG.dval[k*dG.ld], 1, queue ); // U(:,k) = U(:,k) - alpha * U(:,i) magma_saxpy( dU.num_rows, -alpha, &dU.dval[i*dU.ld], 1, &dU.dval[k*dU.ld], 1, queue ); } // new column of M = P'G, first k-1 entries are zero // M(k:s,k) = P(:,k:s)' G(:,k) magmablas_sgemv( MagmaConjTrans, dP.num_rows, sk, c_one, &dP.dval[k*dP.ld], dP.ld, &dG.dval[k*dG.ld], 1, c_zero, &dM.dval[k*dM.ld+k], 1, queue ); // check M(k,k) == 0 magma_sgetvector( 1, &dM.dval[k*dM.ld+k], 1, &mkk, 1, queue ); if ( MAGMA_S_EQUAL(mkk, MAGMA_S_ZERO) ) { innerflag = 1; info = MAGMA_DIVERGENCE; break; } // beta = f(k) / M(k,k) magma_sgetvector( 1, &df.dval[k], 1, &fk, 1, queue ); hbeta.val[k] = fk / mkk; // check for nan if ( magma_s_isnan( hbeta.val[k] ) || magma_s_isinf( hbeta.val[k] )) { innerflag = 1; info = MAGMA_DIVERGENCE; break; } // r = r - beta * G(:,k) magma_saxpy( dr.num_rows, -hbeta.val[k], &dG.dval[k*dG.ld], 1, dr.dval, 1, queue ); // smoothing disabled if ( smoothing <= 0 ) { // |r| nrmr = magma_snrm2( dr.num_rows, dr.dval, 1, queue ); // smoothing enabled } else { // x = x + beta * U(:,k) magma_saxpy( x->num_rows, hbeta.val[k], &dU.dval[k*dU.ld], 1, x->dval, 1, queue ); // smoothing operation //--------------------------------------- // t = rs - r magma_scopyvector( drs.num_rows, drs.dval, 1, dt.dval, 1, queue ); magma_saxpy( dt.num_rows, c_n_one, dr.dval, 1, dt.dval, 1, queue ); // t't // t'rs tt = magma_sdot( dt.num_rows, dt.dval, 1, dt.dval, 1, queue ); tr = magma_sdot( dt.num_rows, dt.dval, 1, drs.dval, 1, queue ); // gamma = (t' * rs) / (t' * t) gamma = tr / tt; // rs = rs - gamma * (rs - r) magma_saxpy( drs.num_rows, -gamma, dt.dval, 1, drs.dval, 1, queue ); // xs = xs - gamma * (xs - x) magma_scopyvector( dxs.num_rows, dxs.dval, 1, dt.dval, 1, queue ); magma_saxpy( dt.num_rows, c_n_one, x->dval, 1, dt.dval, 1, queue ); magma_saxpy( dxs.num_rows, -gamma, dt.dval, 1, dxs.dval, 1, queue ); // |rs| nrmr = magma_snrm2( drs.num_rows, drs.dval, 1, queue ); //--------------------------------------- } // store current timing and residual if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter) % solver_par->verbose == 0 ) { solver_par->res_vec[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)nrmr; solver_par->timing[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)tempo2 - tempo1; } } // check convergence if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { s = k + 1; // for the x-update outside the loop innerflag = 2; info = MAGMA_SUCCESS; break; } // non-last s iteration if ( (k + 1) < s ) { // f(k+1:s) = f(k+1:s) - beta * M(k+1:s,k) magma_saxpy( sk-1, -hbeta.val[k], &dM.dval[k*dM.ld+(k+1)], 1, &df.dval[k+1], 1, queue ); } } // smoothing disabled if ( smoothing <= 0 && innerflag != 1 ) { // update solution approximation x // x = x + U(:,1:s) * beta(1:s) magma_ssetvector( s, hbeta.val, 1, dbeta.dval, 1, queue ); magmablas_sgemv( MagmaNoTrans, dU.num_rows, s, c_one, dU.dval, dU.ld, dbeta.dval, 1, c_one, x->dval, 1, queue ); } // check convergence or iteration limit or invalid result of inner loop if ( innerflag > 0 ) { break; } // v = r magma_scopyvector( dr.num_rows, dr.dval, 1, dv.dval, 1, queue ); // preconditioning operation // v = L \ v; // v = U \ v; CHECK( magma_s_applyprecond_left( MagmaNoTrans, A, dv, &dlu, precond_par, queue )); CHECK( magma_s_applyprecond_right( MagmaNoTrans, A, dlu, &dv, precond_par, queue )); // t = A v CHECK( magma_s_spmv( c_one, A, dv, c_zero, dt, queue )); solver_par->spmv_count++; // computation of a new omega //--------------------------------------- // |t| nrmt = magma_snrm2( dt.num_rows, dt.dval, 1, queue ); // t'r tr = magma_sdot( dt.num_rows, dt.dval, 1, dr.dval, 1, queue ); // rho = abs(t' * r) / (|t| * |r|)) rho = MAGMA_D_ABS( MAGMA_S_REAL(tr) / (nrmt * nrmr) ); // om = (t' * r) / (|t| * |t|) om = tr / (nrmt * nrmt); if ( rho < angle ) { om = (om * angle) / rho; } //--------------------------------------- if ( MAGMA_S_EQUAL(om, MAGMA_S_ZERO) ) { info = MAGMA_DIVERGENCE; break; } // update approximation vector // x = x + om * v magma_saxpy( x->num_rows, om, dv.dval, 1, x->dval, 1, queue ); // update residual vector // r = r - om * t magma_saxpy( dr.num_rows, -om, dt.dval, 1, dr.dval, 1, queue ); // smoothing disabled if ( smoothing <= 0 ) { // residual norm nrmr = magma_snrm2( b.num_rows, dr.dval, 1, queue ); // smoothing enabled } else { // smoothing operation //--------------------------------------- // t = rs - r magma_scopyvector( drs.num_rows, drs.dval, 1, dt.dval, 1, queue ); magma_saxpy( dt.num_rows, c_n_one, dr.dval, 1, dt.dval, 1, queue ); // t't // t'rs tt = magma_sdot( dt.num_rows, dt.dval, 1, dt.dval, 1, queue ); tr = magma_sdot( dt.num_rows, dt.dval, 1, drs.dval, 1, queue ); // gamma = (t' * rs) / (|t| * |t|) gamma = tr / tt; // rs = rs - gamma * (rs - r) magma_saxpy( drs.num_rows, -gamma, dt.dval, 1, drs.dval, 1, queue ); // xs = xs - gamma * (xs - x) magma_scopyvector( dxs.num_rows, dxs.dval, 1, dt.dval, 1, queue ); magma_saxpy( dt.num_rows, c_n_one, x->dval, 1, dt.dval, 1, queue ); magma_saxpy( dxs.num_rows, -gamma, dt.dval, 1, dxs.dval, 1, queue ); // |rs| nrmr = magma_snrm2( b.num_rows, drs.dval, 1, queue ); //--------------------------------------- } // store current timing and residual if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter) % solver_par->verbose == 0 ) { solver_par->res_vec[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)nrmr; solver_par->timing[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)tempo2 - tempo1; } } // check convergence if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { info = MAGMA_SUCCESS; break; } } while ( solver_par->numiter + 1 <= solver_par->maxiter ); // smoothing enabled if ( smoothing > 0 ) { // x = xs magma_scopyvector( x->num_rows, dxs.dval, 1, x->dval, 1, queue ); // r = rs magma_scopyvector( dr.num_rows, drs.dval, 1, dr.dval, 1, queue ); } // get last iteration timing tempo2 = magma_sync_wtime( queue ); solver_par->runtime = (real_Double_t)tempo2 - tempo1; //--------------STOP TIME---------------- // get final stats solver_par->iter_res = nrmr; CHECK( magma_sresidualvec( A, b, *x, &dr, &residual, queue )); solver_par->final_res = residual; // set solver conclusion if ( info != MAGMA_SUCCESS && info != MAGMA_DIVERGENCE ) { if ( solver_par->init_res > solver_par->final_res ) { info = MAGMA_SLOW_CONVERGENCE; } } cleanup: // free resources // smoothing enabled if ( smoothing > 0 ) { magma_smfree( &dxs, queue ); magma_smfree( &drs, queue ); } magma_smfree( &dr, queue ); magma_smfree( &dP, queue ); magma_smfree( &dP1, queue ); magma_smfree( &dG, queue ); magma_smfree( &dU, queue ); magma_smfree( &dM, queue ); magma_smfree( &df, queue ); magma_smfree( &dt, queue ); magma_smfree( &dc, queue ); magma_smfree( &dv, queue ); magma_smfree(&dlu, queue); magma_smfree( &dbeta, queue ); magma_smfree( &hbeta, queue ); solver_par->info = info; return info; /* magma_spidr */ }
/** Purpose ------- SGETRS solves a system of linear equations A * X = B, A**T * X = B, or A**H * X = B with a general N-by-N matrix A using the LU factorization computed by SGETRF_GPU. Arguments --------- @param[in] trans magma_trans_t Specifies the form of the system of equations: - = MagmaNoTrans: A * X = B (No transpose) - = MagmaTrans: A**T * X = B (Transpose) - = MagmaConjTrans: A**H * X = B (Conjugate transpose) @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in] dA REAL array on the GPU, dimension (LDDA,N) The factors L and U from the factorization A = P*L*U as computed by SGETRF_GPU. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,N). @param[in] ipiv INTEGER array, dimension (N) The pivot indices from SGETRF; for 1 <= i <= N, row i of the matrix was interchanged with row IPIV(i). @param[in,out] dB REAL array on the GPU, dimension (LDDB,NRHS) On entry, the right hand side matrix B. On exit, the solution matrix X. @param[in] lddb INTEGER The leading dimension of the array B. LDDB >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_sgesv_comp ********************************************************************/ extern "C" magma_int_t magma_sgetrs_gpu( magma_trans_t trans, magma_int_t n, magma_int_t nrhs, magmaFloat_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magmaFloat_ptr dB, magma_int_t lddb, magma_int_t *info) { // Constants const float c_one = MAGMA_S_ONE; // Local variables float *work = NULL; bool notran = (trans == MagmaNoTrans); magma_int_t i1, i2, inc; *info = 0; if ( (! notran) && (trans != MagmaTrans) && (trans != MagmaConjTrans) ) { *info = -1; } else if (n < 0) { *info = -2; } else if (nrhs < 0) { *info = -3; } else if (ldda < max(1,n)) { *info = -5; } else if (lddb < max(1,n)) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (n == 0 || nrhs == 0) { return *info; } magma_smalloc_cpu( &work, n * nrhs ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_queue_t queue = NULL; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); i1 = 1; i2 = n; if (notran) { inc = 1; /* Solve A * X = B. */ magma_sgetmatrix( n, nrhs, dB, lddb, work, n, queue ); lapackf77_slaswp( &nrhs, work, &n, &i1, &i2, ipiv, &inc ); magma_ssetmatrix( n, nrhs, work, n, dB, lddb, queue ); if ( nrhs == 1) { magma_strsv( MagmaLower, MagmaNoTrans, MagmaUnit, n, dA, ldda, dB, 1, queue ); magma_strsv( MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, dA, ldda, dB, 1, queue ); } else { magma_strsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, n, nrhs, c_one, dA, ldda, dB, lddb, queue ); magma_strsm( MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb, queue ); } } else { inc = -1; /* Solve A**T * X = B or A**H * X = B. */ if ( nrhs == 1) { magma_strsv( MagmaUpper, trans, MagmaNonUnit, n, dA, ldda, dB, 1, queue ); magma_strsv( MagmaLower, trans, MagmaUnit, n, dA, ldda, dB, 1, queue ); } else { magma_strsm( MagmaLeft, MagmaUpper, trans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb, queue ); magma_strsm( MagmaLeft, MagmaLower, trans, MagmaUnit, n, nrhs, c_one, dA, ldda, dB, lddb, queue ); } magma_sgetmatrix( n, nrhs, dB, lddb, work, n, queue ); lapackf77_slaswp( &nrhs, work, &n, &i1, &i2, ipiv, &inc ); magma_ssetmatrix( n, nrhs, work, n, dB, lddb, queue ); } magma_queue_destroy( queue ); magma_free_cpu( work ); return *info; }
extern "C" magma_int_t magma_spotrs_gpu(char uplo, magma_int_t n, magma_int_t nrhs, float *dA, magma_int_t ldda, float *dB, magma_int_t lddb, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= SPOTRS solves a system of linear equations A*X = B with a symmetric positive definite matrix A using the Cholesky factorization A = U**T*U or A = L*L**T computed by SPOTRF. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. NRHS (input) INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. dA (input) REAL array on the GPU, dimension (LDDA,N) The triangular factor U or L from the Cholesky factorization A = U**T*U or A = L*L**T, as computed by SPOTRF. LDDA (input) INTEGER The leading dimension of the array A. LDDA >= max(1,N). dB (input/output) REAL array on the GPU, dimension (LDDB,NRHS) On entry, the right hand side matrix B. On exit, the solution matrix X. LDDB (input) INTEGER The leading dimension of the array B. LDDB >= max(1,N). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ float c_one = MAGMA_S_ONE; *info = 0 ; if( (uplo != 'U') && (uplo != 'u') && (uplo != 'L') && (uplo != 'l') ) *info = -1; if( n < 0 ) *info = -2; if( nrhs < 0) *info = -3; if ( ldda < max(1, n) ) *info = -5; if ( lddb < max(1, n) ) *info = -7; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if ( (n == 0) || (nrhs == 0) ) { return *info; } if( (uplo=='U') || (uplo=='u') ){ if ( nrhs == 1) { magma_strsv(MagmaUpper, MagmaTrans, MagmaNonUnit, n, dA, ldda, dB, 1 ); magma_strsv(MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, dA, ldda, dB, 1 ); } else { magma_strsm(MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb); magma_strsm(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb); } } else{ if ( nrhs == 1) { magma_strsv(MagmaLower, MagmaNoTrans, MagmaNonUnit, n, dA, ldda, dB, 1 ); magma_strsv(MagmaLower, MagmaTrans, MagmaNonUnit, n, dA, ldda, dB, 1 ); } else { magma_strsm(MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb); magma_strsm(MagmaLeft, MagmaLower, MagmaTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb); } } return *info; }
extern "C" magma_int_t magma_sgeqrs3_gpu(magma_int_t m, magma_int_t n, magma_int_t nrhs, float *dA, magma_int_t ldda, float *tau, float *dT, float *dB, magma_int_t lddb, float *hwork, magma_int_t lwork, magma_int_t *info) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= Solves the least squares problem min || A*X - C || using the QR factorization A = Q*R computed by SGEQRF3_GPU. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. M >= N >= 0. NRHS (input) INTEGER The number of columns of the matrix C. NRHS >= 0. A (input) REAL array on the GPU, dimension (LDDA,N) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,n, as returned by SGEQRF3_GPU in the first n columns of its array argument A. LDDA (input) INTEGER The leading dimension of the array A, LDDA >= M. TAU (input) REAL array, dimension (N) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by MAGMA_SGEQRF_GPU. DB (input/output) REAL array on the GPU, dimension (LDDB,NRHS) On entry, the M-by-NRHS matrix C. On exit, the N-by-NRHS solution matrix X. DT (input) REAL array that is the output (the 6th argument) of magma_sgeqrf_gpu of size 2*MIN(M, N)*NB + ((N+31)/32*32 )* MAX(NB, NRHS). The array starts with a block of size MIN(M,N)*NB that stores the triangular T matrices used in the QR factorization, followed by MIN(M,N)*NB block storing the diagonal block matrices for the R matrix, followed by work space of size ((N+31)/32*32 )* MAX(NB, NRHS). LDDB (input) INTEGER The leading dimension of the array DB. LDDB >= M. HWORK (workspace/output) REAL array, dimension (LWORK) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK, LWORK >= max(1,NRHS). For optimum performance LWORK >= (M-N+NB)*(NRHS + 2*NB), where NB is the blocksize given by magma_get_sgeqrf_nb( M ). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the WORK array. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ #define a_ref(a_1,a_2) (dA+(a_2)*(ldda) + (a_1)) #define d_ref(a_1) (dT+(lddwork+(a_1))*nb) float c_one = MAGMA_S_ONE; magma_int_t k, lddwork; magma_int_t nb = magma_get_sgeqrf_nb(m); magma_int_t lwkopt = (m-n+nb)*(nrhs+2*nb); int lquery = (lwork == -1); hwork[0] = MAGMA_S_MAKE( (float)lwkopt, 0. ); *info = 0; if (m < 0) *info = -1; else if (n < 0 || m < n) *info = -2; else if (nrhs < 0) *info = -3; else if (ldda < max(1,m)) *info = -5; else if (lddb < max(1,m)) *info = -8; else if (lwork < lwkopt && ! lquery) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { hwork[0] = c_one; return *info; } lddwork= k; /* B := Q' * B */ magma_sormqr_gpu( MagmaLeft, MagmaTrans, m, nrhs, n, a_ref(0,0), ldda, tau, dB, lddb, hwork, lwork, dT, nb, info ); if ( *info != 0 ) { return *info; } /* Solve R*X = B(1:n,:) 1. Move the block diagonal submatrices from d_ref to R 2. Solve 3. Restore the data format moving data from R back to d_ref */ magmablas_sswapdblk(k, nb, a_ref(0,0), ldda, 1, d_ref(0), nb, 0); if ( nrhs == 1 ) { magma_strsv(MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, a_ref(0,0), ldda, dB, 1); } else { magma_strsm(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, a_ref(0,0), ldda, dB, lddb); } magmablas_sswapdblk(k, nb, d_ref(0), nb, 0, a_ref(0,0), ldda, 1); return *info; }
/** Purpose ------- Solves the least squares problem min || A*X - C || using the QR factorization A = Q*R computed by SGEQRF3_GPU. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. M >= N >= 0. @param[in] nrhs INTEGER The number of columns of the matrix C. NRHS >= 0. @param[in] dA REAL array on the GPU, dimension (LDDA,N) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,n, as returned by SGEQRF3_GPU in the first n columns of its array argument A. @param[in] ldda INTEGER The leading dimension of the array A, LDDA >= M. @param[in] tau REAL array, dimension (N) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by MAGMA_SGEQRF_GPU. @param[in,out] dB REAL array on the GPU, dimension (LDDB,NRHS) On entry, the M-by-NRHS matrix C. On exit, the N-by-NRHS solution matrix X. @param[in] dT REAL array that is the output (the 6th argument) of magma_sgeqrf_gpu of size 2*MIN(M, N)*NB + ((N+31)/32*32 )* MAX(NB, NRHS). The array starts with a block of size MIN(M,N)*NB that stores the triangular T matrices used in the QR factorization, followed by MIN(M,N)*NB block storing the diagonal block matrices for the R matrix, followed by work space of size ((N+31)/32*32 )* MAX(NB, NRHS). @param[in] lddb INTEGER The leading dimension of the array dB. LDDB >= M. @param[out] hwork (workspace) REAL array, dimension (LWORK) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK, LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB, where NB is the blocksize given by magma_get_sgeqrf_nb( M ). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the WORK array. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_sgels_comp ********************************************************************/ extern "C" magma_int_t magma_sgeqrs3_gpu( magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaFloat_ptr dA, magma_int_t ldda, float *tau, magmaFloat_ptr dT, magmaFloat_ptr dB, magma_int_t lddb, float *hwork, magma_int_t lwork, magma_int_t *info) { #define dA(a_1,a_2) (dA + (a_2)*(ldda) + (a_1)) #define dT(a_1) (dT + (lddwork+(a_1))*nb) float c_one = MAGMA_S_ONE; magma_int_t k, lddwork; magma_int_t nb = magma_get_sgeqrf_nb(m); magma_int_t lwkopt = (m - n + nb)*(nrhs + nb) + nrhs*nb; int lquery = (lwork == -1); hwork[0] = MAGMA_S_MAKE( (float)lwkopt, 0. ); *info = 0; if (m < 0) *info = -1; else if (n < 0 || m < n) *info = -2; else if (nrhs < 0) *info = -3; else if (ldda < max(1,m)) *info = -5; else if (lddb < max(1,m)) *info = -8; else if (lwork < lwkopt && ! lquery) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { hwork[0] = c_one; return *info; } lddwork = k; /* B := Q' * B */ magma_sormqr_gpu( MagmaLeft, MagmaTrans, m, nrhs, n, dA(0,0), ldda, tau, dB, lddb, hwork, lwork, dT, nb, info ); if ( *info != 0 ) { return *info; } /* Solve R*X = B(1:n,:) 1. Move the (k-1)/nb block diagonal submatrices from dT to R 2. Solve 3. Restore the data format moving data from R back to dT */ magmablas_sswapdblk(k-1, nb, dA(0,0), ldda, 1, dT(0), nb, 0); if ( nrhs == 1 ) { magma_strsv(MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, dA(0,0), ldda, dB, 1); } else { magma_strsm(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, dA(0,0), ldda, dB, lddb); } magmablas_sswapdblk(k-1, nb, dT(0), nb, 0, dA(0,0), ldda, 1); return *info; }
int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, t1, t2; float c_neg_one = MAGMA_S_NEG_ONE; magma_int_t ione = 1; const char trans[] = { 'N', 'C', 'T' }; const char uplo[] = { 'L', 'U' }; const char diag[] = { 'U', 'N' }; const char side[] = { 'L', 'R' }; float *A, *B, *C, *C2, *LU; float *dA, *dB, *dC1, *dC2; float alpha = MAGMA_S_MAKE( 0.5, 0.1 ); float beta = MAGMA_S_MAKE( 0.7, 0.2 ); float dalpha = 0.6; float dbeta = 0.8; float work[1], error, total_error; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t m, n, k, size, maxn, ld, info; magma_int_t *piv; magma_err_t err; magma_opts opts; parse_opts( argc, argv, &opts ); printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" ); total_error = 0.; for( int i = 0; i < opts.ntest; ++i ) { m = opts.msize[i]; n = opts.nsize[i]; k = opts.ksize[i]; printf("=========================================================================\n"); printf( "M %d, N %d, K %d\n", (int) m, (int) n, (int) k ); // allocate matrices // over-allocate so they can be any combination of {m,n,k} x {m,n,k}. maxn = max( max( m, n ), k ); ld = maxn; size = maxn*maxn; err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) ); assert( err == 0 ); err = magma_smalloc_pinned( &A, size ); assert( err == 0 ); err = magma_smalloc_pinned( &B, size ); assert( err == 0 ); err = magma_smalloc_pinned( &C, size ); assert( err == 0 ); err = magma_smalloc_pinned( &C2, size ); assert( err == 0 ); err = magma_smalloc_pinned( &LU, size ); assert( err == 0 ); err = magma_smalloc( &dA, size ); assert( err == 0 ); err = magma_smalloc( &dB, size ); assert( err == 0 ); err = magma_smalloc( &dC1, size ); assert( err == 0 ); err = magma_smalloc( &dC2, size ); assert( err == 0 ); // initialize matrices size = maxn*maxn; lapackf77_slarnv( &ione, ISEED, &size, A ); lapackf77_slarnv( &ione, ISEED, &size, B ); lapackf77_slarnv( &ione, ISEED, &size, C ); printf( "========== Level 1 BLAS ==========\n" ); // ----- test SSWAP // swap 2nd and 3rd columns of dA, then copy to C2 and compare with A assert( n >= 4 ); magma_ssetmatrix( m, n, A, ld, dA, ld ); magma_ssetmatrix( m, n, A, ld, dB, ld ); magma_sswap( m, dA(0,1), 1, dA(0,2), 1 ); magma_sswap( m, dB(0,1), 1, dB(0,2), 1 ); // check results, storing diff between magma and cuda calls in C2 cublasSaxpy( ld*n, c_neg_one, dA, 1, dB, 1 ); magma_sgetmatrix( m, n, dB, ld, C2, ld ); error = lapackf77_slange( "F", &m, &k, C2, &ld, work ); total_error += error; printf( "sswap diff %.2g\n", error ); // ----- test ISAMAX // get argmax of column of A magma_ssetmatrix( m, k, A, ld, dA, ld ); error = 0; for( int j = 0; j < k; ++j ) { magma_int_t i1 = magma_isamax( m, dA(0,j), 1 ); magma_int_t i2 = cublasIsamax( m, dA(0,j), 1 ); assert( i1 == i2 ); error += abs( i1 - i2 ); } total_error += error; gflops = (float)m * k / 1e9; printf( "isamax diff %.2g\n", error ); printf( "\n" ); printf( "========== Level 2 BLAS ==========\n" ); // ----- test SGEMV // c = alpha*A*b + beta*c, with A m*n; b,c m or n-vectors // try no-trans/trans for( int ia = 0; ia < 3; ++ia ) { magma_ssetmatrix( m, n, A, ld, dA, ld ); magma_ssetvector( maxn, B, 1, dB, 1 ); magma_ssetvector( maxn, C, 1, dC1, 1 ); magma_ssetvector( maxn, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_sgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasSgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 size = (trans[ia] == 'N' ? m : n); cublasSaxpy( size, c_neg_one, dC1, 1, dC2, 1 ); magma_sgetvector( size, dC2, 1, C2, 1 ); error = lapackf77_slange( "F", &size, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_SGEMV( m, n ) / 1e9; printf( "sgemv( %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", trans[ia], error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test SSYMV // c = alpha*A*b + beta*c, with A m*m symmetric; b,c m-vectors // try upper/lower for( int iu = 0; iu < 2; ++iu ) { magma_ssetmatrix( m, m, A, ld, dA, ld ); magma_ssetvector( m, B, 1, dB, 1 ); magma_ssetvector( m, C, 1, dC1, 1 ); magma_ssetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_ssymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasSsymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasSaxpy( m, c_neg_one, dC1, 1, dC2, 1 ); magma_sgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_slange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_SSYMV( m ) / 1e9; printf( "ssymv( %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test STRSV // solve A*c = c, with A m*m triangular; c m-vector // try upper/lower, no-trans/trans, unit/non-unit diag // Factor A into LU to get well-conditioned triangles, else solve yields garbage. // Still can give garbage if solves aren't consistent with LU factors, // e.g., using unit diag for U, so copy lower triangle to upper triangle. // Also used for trsm later. lapackf77_slacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld ); lapackf77_sgetrf( &maxn, &maxn, LU, &ld, piv, &info ); for( int j = 0; j < maxn; ++j ) { for( int i = 0; i < j; ++i ) { *LU(i,j) = *LU(j,i); } } for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { magma_ssetmatrix( m, m, LU, ld, dA, ld ); magma_ssetvector( m, C, 1, dC1, 1 ); magma_ssetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_strsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasStrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasSaxpy( m, c_neg_one, dC1, 1, dC2, 1 ); magma_sgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_slange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_STRSM( MagmaLeft, m, 1 ) / 1e9; printf( "strsv( %c, %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], diag[id], error, gflops/t1, gflops/t2 ); }}} printf( "\n" ); printf( "========== Level 3 BLAS ==========\n" ); // ----- test SGEMM // C = alpha*A*B + beta*C, with A m*k or k*m; B k*n or n*k; C m*n // try combinations of no-trans/trans for( int ia = 0; ia < 3; ++ia ) { for( int ib = 0; ib < 3; ++ib ) { bool nta = (trans[ia] == 'N'); bool ntb = (trans[ib] == 'N'); magma_ssetmatrix( (nta ? m : k), (nta ? m : k), A, ld, dA, ld ); magma_ssetmatrix( (ntb ? k : n), (ntb ? n : k), B, ld, dB, ld ); magma_ssetmatrix( m, n, C, ld, dC1, ld ); magma_ssetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_sgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasSgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasSaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_sgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_slange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_SGEMM( m, n, k ) / 1e9; printf( "sgemm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", trans[ia], trans[ib], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test SSYMM // C = alpha*A*B + beta*C (left) with A m*m symmetric; B,C m*n; or // C = alpha*B*A + beta*C (right) with A n*n symmetric; B,C m*n // try left/right, upper/lower for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { magma_ssetmatrix( m, m, A, ld, dA, ld ); magma_ssetmatrix( m, n, B, ld, dB, ld ); magma_ssetmatrix( m, n, C, ld, dC1, ld ); magma_ssetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_ssymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasSsymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasSaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_sgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_slange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_SSYMM( side[is], m, n ) / 1e9; printf( "ssymm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", side[is], uplo[iu], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test SSYRK // C = alpha*A*A^H + beta*C (no-trans) with A m*k and C m*m symmetric; or // C = alpha*A^H*A + beta*C (trans) with A k*m and C m*m symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { magma_ssetmatrix( n, k, A, ld, dA, ld ); magma_ssetmatrix( n, n, C, ld, dC1, ld ); magma_ssetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_ssyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasSsyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasSaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_sgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_slange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_SSYRK( k, n ) / 1e9; printf( "ssyrk( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test SSYR2K // C = alpha*A*B^H + ^alpha*B*A^H + beta*C (no-trans) with A,B n*k; C n*n symmetric; or // C = alpha*A^H*B + ^alpha*B^H*A + beta*C (trans) with A,B k*n; C n*n symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { bool nt = (trans[it] == 'N'); magma_ssetmatrix( (nt ? n : k), (nt ? n : k), A, ld, dA, ld ); magma_ssetmatrix( n, n, C, ld, dC1, ld ); magma_ssetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_ssyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasSsyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasSaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_sgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_slange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_SSYR2K( k, n ) / 1e9; printf( "ssyr2k( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test STRMM // C = alpha*A*C (left) with A m*m triangular; C m*n; or // C = alpha*C*A (right) with A n*n triangular; C m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == 'L'); magma_ssetmatrix( (left ? m : n), (left ? m : n), A, ld, dA, ld ); magma_ssetmatrix( m, n, C, ld, dC1, ld ); magma_ssetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_strmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasStrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasSaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_sgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_slange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_STRMM( side[is], m, n ) / 1e9; printf( "strmm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // ----- test STRSM // solve A*X = alpha*B (left) with A m*m triangular; B m*n; or // solve X*A = alpha*B (right) with A n*n triangular; B m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == 'L'); magma_ssetmatrix( (left ? m : n), (left ? m : n), LU, ld, dA, ld ); magma_ssetmatrix( m, n, C, ld, dC1, ld ); magma_ssetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_strsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasStrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasSaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_sgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_slange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_STRSM( side[is], m, n ) / 1e9; printf( "strsm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // cleanup magma_free_cpu( piv ); magma_free_pinned( A ); magma_free_pinned( B ); magma_free_pinned( C ); magma_free_pinned( C2 ); magma_free_pinned( LU ); magma_free( dA ); magma_free( dB ); magma_free( dC1 ); magma_free( dC2 ); } if ( total_error != 0. ) { printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n", total_error ); } else { printf( "all tests passed\n" ); } TESTING_FINALIZE(); return 0; }
extern "C" magma_int_t magma_sidr_strms( magma_s_matrix A, magma_s_matrix b, magma_s_matrix *x, magma_s_solver_par *solver_par, magma_queue_t queue ) { magma_int_t info = MAGMA_NOTCONVERGED; // prepare solver feedback solver_par->solver = Magma_IDRMERGE; solver_par->numiter = 0; solver_par->spmv_count = 0; solver_par->init_res = 0.0; solver_par->final_res = 0.0; solver_par->iter_res = 0.0; solver_par->runtime = 0.0; // constants const float c_zero = MAGMA_S_ZERO; const float c_one = MAGMA_S_ONE; const float c_n_one = MAGMA_S_NEG_ONE; // internal user options const magma_int_t smoothing = 1; // 0 = disable, 1 = enable const float angle = 0.7; // [0-1] // local variables magma_int_t iseed[4] = {0, 0, 0, 1}; magma_int_t dof; magma_int_t s; magma_int_t distr; magma_int_t k, i, sk; magma_int_t innerflag; magma_int_t ldd; magma_int_t q; float residual; float nrm; float nrmb; float nrmr; float nrmt; float rho; float om; float gamma; // matrices and vectors magma_s_matrix dxs = {Magma_CSR}; magma_s_matrix dr = {Magma_CSR}, drs = {Magma_CSR}; magma_s_matrix dP = {Magma_CSR}, dP1 = {Magma_CSR}; magma_s_matrix dG = {Magma_CSR}, dGcol = {Magma_CSR}; magma_s_matrix dU = {Magma_CSR}; magma_s_matrix dM = {Magma_CSR}; magma_s_matrix df = {Magma_CSR}; magma_s_matrix dt = {Magma_CSR}, dtt = {Magma_CSR}; magma_s_matrix dc = {Magma_CSR}; magma_s_matrix dv = {Magma_CSR}; magma_s_matrix dskp = {Magma_CSR}; magma_s_matrix dalpha = {Magma_CSR}; magma_s_matrix dbeta = {Magma_CSR}; float *hMdiag = NULL; float *hskp = NULL; float *halpha = NULL; float *hbeta = NULL; float *d1 = NULL, *d2 = NULL; // queue variables const magma_int_t nqueues = 3; // number of queues magma_queue_t queues[nqueues]; // chronometry real_Double_t tempo1, tempo2; // create additional queues queues[0] = queue; for ( q = 1; q < nqueues; q++ ) { magma_queue_create( queue->device(), &(queues[q]) ); } // initial s space // TODO: add option for 's' (shadow space number) // Hack: uses '--restart' option as the shadow space number. // This is not a good idea because the default value of restart option is used to detect // if the user provided a custom restart. This means that if the default restart value // is changed then the code will think it was the user (unless the default value is // also updated in the 'if' statement below. s = 1; if ( solver_par->restart != 50 ) { if ( solver_par->restart > A.num_cols ) { s = A.num_cols; } else { s = solver_par->restart; } } solver_par->restart = s; // set max iterations solver_par->maxiter = min( 2 * A.num_cols, solver_par->maxiter ); // check if matrix A is square if ( A.num_rows != A.num_cols ) { //printf("Matrix A is not square.\n"); info = MAGMA_ERR_NOT_SUPPORTED; goto cleanup; } // |b| nrmb = magma_snrm2( b.num_rows, b.dval, 1, queue ); if ( nrmb == 0.0 ) { magma_sscal( x->num_rows, MAGMA_S_ZERO, x->dval, 1, queue ); info = MAGMA_SUCCESS; goto cleanup; } // t = 0 // make t twice as large to contain both, dt and dr ldd = magma_roundup( b.num_rows, 32 ); CHECK( magma_svinit( &dt, Magma_DEV, ldd, 2, c_zero, queue )); dt.num_rows = b.num_rows; dt.num_cols = 1; dt.nnz = dt.num_rows; // redirect the dr.dval to the second part of dt CHECK( magma_svinit( &dr, Magma_DEV, b.num_rows, 1, c_zero, queue )); magma_free( dr.dval ); dr.dval = dt.dval + ldd; // r = b - A x CHECK( magma_sresidualvec( A, b, *x, &dr, &nrmr, queue )); // |r| solver_par->init_res = nrmr; solver_par->final_res = solver_par->init_res; solver_par->iter_res = solver_par->init_res; if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = (real_Double_t)nrmr; } // check if initial is guess good enough if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { info = MAGMA_SUCCESS; goto cleanup; } // P = randn(n, s) // P = ortho(P) //--------------------------------------- // P = 0.0 CHECK( magma_svinit( &dP, Magma_CPU, A.num_cols, s, c_zero, queue )); // P = randn(n, s) distr = 3; // 1 = unif (0,1), 2 = unif (-1,1), 3 = normal (0,1) dof = dP.num_rows * dP.num_cols; lapackf77_slarnv( &distr, iseed, &dof, dP.val ); // transfer P to device CHECK( magma_smtransfer( dP, &dP1, Magma_CPU, Magma_DEV, queue )); magma_smfree( &dP, queue ); // P = ortho(P1) if ( dP1.num_cols > 1 ) { // P = magma_sqr(P1), QR factorization CHECK( magma_sqr( dP1.num_rows, dP1.num_cols, dP1, dP1.ld, &dP, NULL, queue )); } else { // P = P1 / |P1| nrm = magma_snrm2( dof, dP1.dval, 1, queue ); nrm = 1.0 / nrm; magma_sscal( dof, nrm, dP1.dval, 1, queue ); CHECK( magma_smtransfer( dP1, &dP, Magma_DEV, Magma_DEV, queue )); } magma_smfree( &dP1, queue ); //--------------------------------------- // allocate memory for the scalar products CHECK( magma_smalloc_pinned( &hskp, 5 )); CHECK( magma_svinit( &dskp, Magma_DEV, 4, 1, c_zero, queue )); CHECK( magma_smalloc_pinned( &halpha, s )); CHECK( magma_svinit( &dalpha, Magma_DEV, s, 1, c_zero, queue )); CHECK( magma_smalloc_pinned( &hbeta, s )); CHECK( magma_svinit( &dbeta, Magma_DEV, s, 1, c_zero, queue )); // workspace for merged dot product CHECK( magma_smalloc( &d1, max(2, s) * b.num_rows )); CHECK( magma_smalloc( &d2, max(2, s) * b.num_rows )); // smoothing enabled if ( smoothing > 0 ) { // set smoothing solution vector CHECK( magma_smtransfer( *x, &dxs, Magma_DEV, Magma_DEV, queue )); // tt = 0 // make tt twice as large to contain both, dtt and drs ldd = magma_roundup( b.num_rows, 32 ); CHECK( magma_svinit( &dtt, Magma_DEV, ldd, 2, c_zero, queue )); dtt.num_rows = dr.num_rows; dtt.num_cols = 1; dtt.nnz = dtt.num_rows; // redirect the drs.dval to the second part of dtt CHECK( magma_svinit( &drs, Magma_DEV, dr.num_rows, 1, c_zero, queue )); magma_free( drs.dval ); drs.dval = dtt.dval + ldd; // set smoothing residual vector magma_scopyvector( dr.num_rows, dr.dval, 1, drs.dval, 1, queue ); } // G(n,s) = 0 if ( s > 1 ) { ldd = magma_roundup( A.num_rows, 32 ); CHECK( magma_svinit( &dG, Magma_DEV, ldd, s, c_zero, queue )); dG.num_rows = A.num_rows; } else { CHECK( magma_svinit( &dG, Magma_DEV, A.num_rows, s, c_zero, queue )); } // dGcol represents a single column of dG, array pointer is set inside loop CHECK( magma_svinit( &dGcol, Magma_DEV, dG.num_rows, 1, c_zero, queue )); magma_free( dGcol.dval ); // U(n,s) = 0 if ( s > 1 ) { ldd = magma_roundup( A.num_cols, 32 ); CHECK( magma_svinit( &dU, Magma_DEV, ldd, s, c_zero, queue )); dU.num_rows = A.num_cols; } else { CHECK( magma_svinit( &dU, Magma_DEV, A.num_cols, s, c_zero, queue )); } // M(s,s) = I CHECK( magma_svinit( &dM, Magma_DEV, s, s, c_zero, queue )); CHECK( magma_smalloc_pinned( &hMdiag, s )); magmablas_slaset( MagmaFull, dM.num_rows, dM.num_cols, c_zero, c_one, dM.dval, dM.ld, queue ); // f = 0 CHECK( magma_svinit( &df, Magma_DEV, dP.num_cols, 1, c_zero, queue )); // c = 0 CHECK( magma_svinit( &dc, Magma_DEV, dM.num_cols, 1, c_zero, queue )); // v = r CHECK( magma_smtransfer( dr, &dv, Magma_DEV, Magma_DEV, queue )); //--------------START TIME--------------- // chronometry tempo1 = magma_sync_wtime( queue ); if ( solver_par->verbose > 0 ) { solver_par->timing[0] = 0.0; } cudaProfilerStart(); om = MAGMA_S_ONE; gamma = MAGMA_S_ZERO; innerflag = 0; // new RHS for small systems // f = P' r // Q1 magma_sgemvmdot_shfl( dP.num_rows, dP.num_cols, dP.dval, dr.dval, d1, d2, df.dval, queues[1] ); // skp[4] = f(k) // Q1 magma_sgetvector_async( 1, df.dval, 1, &hskp[4], 1, queues[1] ); // c(k:s) = f(k:s) // Q1 magma_scopyvector_async( s, df.dval, 1, dc.dval, 1, queues[1] ); // c(k:s) = M(k:s,k:s) \ f(k:s) // Q1 magma_strsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, s, dM.dval, dM.ld, dc.dval, 1, queues[1] ); // start iteration do { solver_par->numiter++; // shadow space loop for ( k = 0; k < s; ++k ) { sk = s - k; dGcol.dval = dG.dval + k * dG.ld; // v = r - G(:,k:s) c(k:s) // Q1 magmablas_sgemv( MagmaNoTrans, dG.num_rows, sk, c_n_one, dGcol.dval, dG.ld, &dc.dval[k], 1, c_one, dv.dval, 1, queues[1] ); // U(:,k) = om * v + U(:,k:s) c(k:s) // Q1 magmablas_sgemv( MagmaNoTrans, dU.num_rows, sk, c_one, &dU.dval[k*dU.ld], dU.ld, &dc.dval[k], 1, om, dv.dval, 1, queues[1] ); // G(:,k) = A U(:,k) // Q1 CHECK( magma_s_spmv( c_one, A, dv, c_zero, dGcol, queues[1] )); solver_par->spmv_count++; // bi-orthogonalize the new basis vectors for ( i = 0; i < k; ++i ) { // alpha = P(:,i)' G(:,k) // Q1 halpha[i] = magma_sdot( dP.num_rows, &dP.dval[i*dP.ld], 1, dGcol.dval, 1, queues[1] ); // implicit sync Q1 --> alpha = P(:,i)' G(:,k) // alpha = alpha / M(i,i) halpha[i] = halpha[i] / hMdiag[i]; // G(:,k) = G(:,k) - alpha * G(:,i) // Q1 magma_saxpy( dG.num_rows, -halpha[i], &dG.dval[i*dG.ld], 1, dGcol.dval, 1, queues[1] ); } // sync Q1 --> G(:,k) = G(:,k) - alpha * G(:,i), skp[4] = f(k) magma_queue_sync( queues[1] ); // new column of M = P'G, first k-1 entries are zero // M(k:s,k) = P(:,k:s)' G(:,k) // Q2 magma_sgemvmdot_shfl( dP.num_rows, sk, &dP.dval[k*dP.ld], dGcol.dval, d1, d2, &dM.dval[k*dM.ld+k], queues[2] ); // non-first s iteration if ( k > 0 ) { // alpha = dalpha // Q0 magma_ssetvector_async( k, halpha, 1, dalpha.dval, 1, queues[0] ); // U update outside of loop using GEMV // U(:,k) = U(:,k) - U(:,1:k) * alpha(1:k) // Q0 magmablas_sgemv( MagmaNoTrans, dU.num_rows, k, c_n_one, dU.dval, dU.ld, dalpha.dval, 1, c_one, dv.dval, 1, queues[0] ); } // Mdiag(k) = M(k,k) // Q2 magma_sgetvector( 1, &dM.dval[k*dM.ld+k], 1, &hMdiag[k], 1, queues[2] ); // implicit sync Q2 --> Mdiag(k) = M(k,k) // U(:,k) = v // Q0 magma_scopyvector_async( dU.num_rows, dv.dval, 1, &dU.dval[k*dU.ld], 1, queues[0] ); // check M(k,k) == 0 if ( MAGMA_S_EQUAL(hMdiag[k], MAGMA_S_ZERO) ) { innerflag = 1; info = MAGMA_DIVERGENCE; break; } // beta = f(k) / M(k,k) hbeta[k] = hskp[4] / hMdiag[k]; // check for nan if ( magma_s_isnan( hbeta[k] ) || magma_s_isinf( hbeta[k] )) { innerflag = 1; info = MAGMA_DIVERGENCE; break; } // r = r - beta * G(:,k) // Q2 magma_saxpy( dr.num_rows, -hbeta[k], dGcol.dval, 1, dr.dval, 1, queues[2] ); // non-last s iteration if ( (k + 1) < s ) { // f(k+1:s) = f(k+1:s) - beta * M(k+1:s,k) // Q1 magma_saxpy( sk-1, -hbeta[k], &dM.dval[k*dM.ld+(k+1)], 1, &df.dval[k+1], 1, queues[1] ); // c(k+1:s) = f(k+1:s) // Q1 magma_scopyvector_async( sk-1, &df.dval[k+1], 1, &dc.dval[k+1], 1, queues[1] ); // c(k+1:s) = M(k+1:s,k+1:s) \ f(k+1:s) // Q1 magma_strsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, sk-1, &dM.dval[(k+1)*dM.ld+(k+1)], dM.ld, &dc.dval[k+1], 1, queues[1] ); // skp[4] = f(k+1) // Q1 magma_sgetvector_async( 1, &df.dval[k+1], 1, &hskp[4], 1, queues[1] ); } // smoothing disabled if ( smoothing <= 0 ) { // |r| // Q2 nrmr = magma_snrm2( dr.num_rows, dr.dval, 1, queues[2] ); // implicit sync Q2 --> |r| // smoothing enabled } else { // smoothing operation //--------------------------------------- // t = rs - r // Q2 magma_sidr_smoothing_1( drs.num_rows, drs.num_cols, drs.dval, dr.dval, dtt.dval, queues[2] ); // x = x + beta * U(:,k) // Q0 magma_saxpy( x->num_rows, hbeta[k], &dU.dval[k*dU.ld], 1, x->dval, 1, queues[0] ); // t't // t'rs // Q2 CHECK( magma_sgemvmdot_shfl( dt.ld, 2, dtt.dval, dtt.dval, d1, d2, &dskp.dval[2], queues[2] )); // skp[2-3] = dskp[2-3] // Q2 magma_sgetvector( 2, &dskp.dval[2], 1, &hskp[2], 1, queues[2] ); // implicit sync Q2 --> skp = dskp // gamma = (t' * rs) / (t' * t) gamma = hskp[3] / hskp[2]; // rs = rs - gamma * t // Q1 magma_saxpy( drs.num_rows, -gamma, dtt.dval, 1, drs.dval, 1, queues[1] ); // xs = xs - gamma * (xs - x) // Q0 magma_sidr_smoothing_2( dxs.num_rows, dxs.num_cols, -gamma, x->dval, dxs.dval, queues[0] ); // |rs| // Q1 nrmr = magma_snrm2( drs.num_rows, drs.dval, 1, queues[1] ); // implicit sync Q0 --> |r| //--------------------------------------- } // v = r // Q1 magma_scopyvector_async( dr.num_rows, dr.dval, 1, dv.dval, 1, queues[1] ); // last s iteration if ( (k + 1) == s ) { // t = A r // Q2 CHECK( magma_s_spmv( c_one, A, dr, c_zero, dt, queues[2] )); solver_par->spmv_count++; // t't // t'r // Q2 CHECK( magma_sgemvmdot_shfl( dt.ld, 2, dt.dval, dt.dval, d1, d2, dskp.dval, queues[2] )); } // store current timing and residual if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter) % solver_par->verbose == 0 ) { solver_par->res_vec[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)nrmr; solver_par->timing[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)tempo2 - tempo1; } } // check convergence or iteration limit if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { s = k + 1; // for the x-update outside the loop innerflag = 2; info = MAGMA_SUCCESS; break; } } // smoothing disabled if ( smoothing <= 0 && innerflag != 1 ) { // dbeta(1:s) = beta(1:s) // Q0 magma_ssetvector_async( s, hbeta, 1, dbeta.dval, 1, queues[0] ); // x = x + U(:,1:s) * beta(1:s) // Q0 magmablas_sgemv( MagmaNoTrans, dU.num_rows, s, c_one, dU.dval, dU.ld, dbeta.dval, 1, c_one, x->dval, 1, queues[0] ); } // check convergence or iteration limit or invalid result of inner loop if ( innerflag > 0 ) { break; } // computation of a new omega //--------------------------------------- // skp[0-2] = dskp[0-2] // Q2 magma_sgetvector( 2, dskp.dval, 1, hskp, 1, queues[2] ); // implicit sync Q2 --> skp = dskp // |t| nrmt = magma_ssqrt( MAGMA_S_REAL(hskp[0]) ); // rho = abs((t' * r) / (|t| * |r|)) rho = MAGMA_D_ABS( MAGMA_S_REAL(hskp[1]) / (nrmt * nrmr) ); // om = (t' * r) / (|t| * |t|) om = hskp[1] / hskp[0]; if ( rho < angle ) { om = (om * angle) / rho; } //--------------------------------------- if ( MAGMA_S_EQUAL(om, MAGMA_S_ZERO) ) { info = MAGMA_DIVERGENCE; break; } // sync Q1 --> v = r magma_queue_sync( queues[1] ); // r = r - om * t // Q2 magma_saxpy( dr.num_rows, -om, dt.dval, 1, dr.dval, 1, queues[2] ); // x = x + om * v // Q0 magma_saxpy( x->num_rows, om, dv.dval, 1, x->dval, 1, queues[0] ); // smoothing disabled if ( smoothing <= 0 ) { // |r| // Q2 nrmr = magma_snrm2( dr.num_rows, dr.dval, 1, queues[2] ); // implicit sync Q2 --> |r| // v = r // Q0 magma_scopyvector_async( dr.num_rows, dr.dval, 1, dv.dval, 1, queues[0] ); // new RHS for small systems // f = P' r // Q1 magma_sgemvmdot_shfl( dP.num_rows, dP.num_cols, dP.dval, dr.dval, d1, d2, df.dval, queues[1] ); // skp[4] = f(k) // Q1 magma_sgetvector_async( 1, df.dval, 1, &hskp[4], 1, queues[1] ); // c(k:s) = f(k:s) // Q1 magma_scopyvector_async( s, df.dval, 1, dc.dval, 1, queues[1] ); // c(k:s) = M(k:s,k:s) \ f(k:s) // Q1 magma_strsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, s, dM.dval, dM.ld, dc.dval, 1, queues[1] ); // smoothing enabled } else { // smoothing operation //--------------------------------------- // t = rs - r // Q2 magma_sidr_smoothing_1( drs.num_rows, drs.num_cols, drs.dval, dr.dval, dtt.dval, queues[2] ); // t't // t'rs // Q2 CHECK( magma_sgemvmdot_shfl( dt.ld, 2, dtt.dval, dtt.dval, d1, d2, &dskp.dval[2], queues[2] )); // skp[2-3] = dskp[2-3] // Q2 magma_sgetvector( 2, &dskp.dval[2], 1, &hskp[2], 1, queues[2] ); // implicit sync Q2 --> skp = dskp // gamma = (t' * rs) / (t' * t) gamma = hskp[3] / hskp[2]; // rs = rs - gamma * (rs - r) // Q2 magma_saxpy( drs.num_rows, -gamma, dtt.dval, 1, drs.dval, 1, queues[2] ); // xs = xs - gamma * (xs - x) // Q0 magma_sidr_smoothing_2( dxs.num_rows, dxs.num_cols, -gamma, x->dval, dxs.dval, queues[0] ); // v = r // Q0 magma_scopyvector_async( dr.num_rows, dr.dval, 1, dv.dval, 1, queues[0] ); // new RHS for small systems // f = P' r // Q1 magma_sgemvmdot_shfl( dP.num_rows, dP.num_cols, dP.dval, dr.dval, d1, d2, df.dval, queues[1] ); // skp[4] = f(k) // Q1 magma_sgetvector_async( 1, df.dval, 1, &hskp[4], 1, queues[1] ); // c(k:s) = f(k:s) // Q1 magma_scopyvector_async( s, df.dval, 1, dc.dval, 1, queues[1] ); // |rs| // Q2 nrmr = magma_snrm2( drs.num_rows, drs.dval, 1, queues[2] ); // implicit sync Q2 --> |r| // c(k:s) = M(k:s,k:s) \ f(k:s) // Q1 magma_strsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, s, dM.dval, dM.ld, dc.dval, 1, queues[1] ); //--------------------------------------- } // store current timing and residual if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); magma_queue_sync( queue ); if ( (solver_par->numiter) % solver_par->verbose == 0 ) { solver_par->res_vec[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)nrmr; solver_par->timing[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)tempo2 - tempo1; } } // check convergence or iteration limit if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { info = MAGMA_SUCCESS; break; } // sync Q0 --> v = r magma_queue_sync( queues[0] ); } while ( solver_par->numiter + 1 <= solver_par->maxiter ); // sync all queues for ( q = 0; q < nqueues; q++ ) { magma_queue_sync( queues[q] ); } // smoothing enabled if ( smoothing > 0 ) { // x = xs magma_scopyvector_async( x->num_rows, dxs.dval, 1, x->dval, 1, queue ); // r = rs magma_scopyvector_async( dr.num_rows, drs.dval, 1, dr.dval, 1, queue ); } cudaProfilerStop(); // get last iteration timing tempo2 = magma_sync_wtime( queue ); magma_queue_sync( queue ); solver_par->runtime = (real_Double_t)tempo2 - tempo1; //--------------STOP TIME---------------- // get final stats solver_par->iter_res = nrmr; CHECK( magma_sresidualvec( A, b, *x, &dr, &residual, queue )); solver_par->final_res = residual; // set solver conclusion if ( info != MAGMA_SUCCESS && info != MAGMA_DIVERGENCE ) { if ( solver_par->init_res > solver_par->final_res ) { info = MAGMA_SLOW_CONVERGENCE; } } cleanup: // free resources // sync all queues, destory additional queues magma_queue_sync( queues[0] ); for ( q = 1; q < nqueues; q++ ) { magma_queue_sync( queues[q] ); magma_queue_destroy( queues[q] ); } // smoothing enabled if ( smoothing > 0 ) { drs.dval = NULL; // needed because its pointer is redirected to dtt magma_smfree( &dxs, queue ); magma_smfree( &drs, queue ); magma_smfree( &dtt, queue ); } dr.dval = NULL; // needed because its pointer is redirected to dt dGcol.dval = NULL; // needed because its pointer is redirected to dG magma_smfree( &dr, queue ); magma_smfree( &dP, queue ); magma_smfree( &dP1, queue ); magma_smfree( &dG, queue ); magma_smfree( &dGcol, queue ); magma_smfree( &dU, queue ); magma_smfree( &dM, queue ); magma_smfree( &df, queue ); magma_smfree( &dt, queue ); magma_smfree( &dc, queue ); magma_smfree( &dv, queue ); magma_smfree( &dskp, queue ); magma_smfree( &dalpha, queue ); magma_smfree( &dbeta, queue ); magma_free_pinned( hMdiag ); magma_free_pinned( hskp ); magma_free_pinned( halpha ); magma_free_pinned( hbeta ); magma_free( d1 ); magma_free( d2 ); solver_par->info = info; return info; /* magma_sidr_strms */ }
/** Purpose ------- SGEQRS solves the least squares problem min || A*X - C || using the QR factorization A = Q*R computed by SGEQRF3_GPU. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. M >= N >= 0. @param[in] nrhs INTEGER The number of columns of the matrix C. NRHS >= 0. @param[in] dA REAL array on the GPU, dimension (LDDA,N) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,n, as returned by SGEQRF3_GPU in the first n columns of its array argument A. dA is modified by the routine but restored on exit. @param[in] ldda INTEGER The leading dimension of the array A, LDDA >= M. @param[in] tau REAL array, dimension (N) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by MAGMA_SGEQRF_GPU. @param[in,out] dB REAL array on the GPU, dimension (LDDB,NRHS) On entry, the M-by-NRHS matrix C. On exit, the N-by-NRHS solution matrix X. @param[in,out] dT REAL array that is the output (the 6th argument) of magma_sgeqrf_gpu of size 2*MIN(M, N)*NB + ceil(N/32)*32 )* MAX(NB, NRHS). The array starts with a block of size MIN(M,N)*NB that stores the triangular T matrices used in the QR factorization, followed by MIN(M,N)*NB block storing the diagonal block matrices for the R matrix, followed by work space of size (ceil(N/32)*32)* MAX(NB, NRHS). @param[in] lddb INTEGER The leading dimension of the array dB. LDDB >= M. @param[out] hwork (workspace) REAL array, dimension (LWORK) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK, LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB, where NB is the blocksize given by magma_get_sgeqrf_nb( M, N ). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the WORK array. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_sgels_comp ********************************************************************/ extern "C" magma_int_t magma_sgeqrs3_gpu( magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaFloat_ptr dA, magma_int_t ldda, float const *tau, magmaFloat_ptr dT, magmaFloat_ptr dB, magma_int_t lddb, float *hwork, magma_int_t lwork, magma_int_t *info) { #define dA(i_,j_) (dA + (i_) + (j_)*ldda) #define dT(i_) (dT + (lddwork + (i_))*nb) float c_one = MAGMA_S_ONE; magma_int_t min_mn, lddwork; magma_int_t nb = magma_get_sgeqrf_nb( m, n ); magma_int_t lwkopt = (m - n + nb)*(nrhs + nb) + nrhs*nb; bool lquery = (lwork == -1); hwork[0] = magma_smake_lwork( lwkopt ); *info = 0; if (m < 0) *info = -1; else if (n < 0 || m < n) *info = -2; else if (nrhs < 0) *info = -3; else if (ldda < max(1,m)) *info = -5; else if (lddb < max(1,m)) *info = -8; else if (lwork < lwkopt && ! lquery) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; min_mn = min(m,n); if (min_mn == 0) { hwork[0] = c_one; return *info; } lddwork = min_mn; magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); /* B := Q^H * B */ magma_sormqr_gpu( MagmaLeft, MagmaTrans, m, nrhs, n, dA(0,0), ldda, tau, dB, lddb, hwork, lwork, dT, nb, info ); if ( *info != 0 ) { magma_queue_destroy( queue ); return *info; } /* Solve R*X = B(1:n,:) 1. Move the (min_mn - 1)/nb block diagonal submatrices from dT to R 2. Solve 3. Restore the data format moving data from R back to dT */ magmablas_sswapdblk( min_mn-1, nb, dA(0,0), ldda, 1, dT(0), nb, 0, queue ); if ( nrhs == 1 ) { magma_strsv( MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, dA(0,0), ldda, dB, 1, queue ); } else { magma_strsm( MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, dA(0,0), ldda, dB, lddb, queue ); } magmablas_sswapdblk( min_mn-1, nb, dT(0), nb, 0, dA(0,0), ldda, 1, queue ); magma_queue_destroy( queue ); return *info; }