/** Purpose ------- ZPOSV computes the solution to a complex system of linear equations A * X = B, where A is an N-by-N Hermitian positive definite matrix and X and B are N-by-NRHS matrices. The Cholesky decomposition is used to factor A as A = U**H * U, if UPLO = MagmaUpper, or A = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is a lower triangular matrix. The factored form of A is then used to solve the system of equations A * X = B. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in,out] dA COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the Hermitian matrix dA. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H*U or dA = L*L**H. @param[in] ldda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[in,out] dB COMPLEX_16 array on the GPU, dimension (LDB,NRHS) On entry, the right hand side matrix B. On exit, the solution matrix X. @param[in] lddb INTEGER The leading dimension of the array B. LDB >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_zposv_driver ********************************************************************/ extern "C" magma_int_t magma_zposv_gpu( magma_uplo_t uplo, magma_int_t n, magma_int_t nrhs, magmaDoubleComplex_ptr dA, magma_int_t ldda, magmaDoubleComplex_ptr dB, magma_int_t lddb, magma_int_t *info ) { *info = 0; if ( uplo != MagmaUpper && uplo != MagmaLower ) *info = -1; if ( n < 0 ) *info = -2; if ( nrhs < 0 ) *info = -3; if ( ldda < max(1, n) ) *info = -5; if ( lddb < max(1, n) ) *info = -7; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if ( (n == 0) || (nrhs == 0) ) { return *info; } magma_zpotrf_gpu( uplo, n, dA, ldda, info ); if ( *info == 0 ) { magma_zpotrs_gpu( uplo, n, nrhs, dA, ldda, dB, lddb, info ); } return *info; }
void MAGMAF_ZPOTRS_GPU( char *uplo, magma_int_t *n, magma_int_t *nrhs, devptr_t *dA, magma_int_t *ldda, devptr_t *dB, magma_int_t *lddb, magma_int_t *info) { magma_zpotrs_gpu( uplo[0], *n, *nrhs, DEVPTR(dA), *ldda, DEVPTR(dB), *lddb, info); }
extern "C" magma_int_t magma_zcposv_gpu(char uplo, magma_int_t n, magma_int_t nrhs, magmaDoubleComplex *dA, magma_int_t ldda, magmaDoubleComplex *dB, magma_int_t lddb, magmaDoubleComplex *dX, magma_int_t lddx, magmaDoubleComplex *dworkd, magmaFloatComplex *dworks, magma_int_t *iter, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= ZCPOSV computes the solution to a complex system of linear equations A * X = B, where A is an N-by-N Hermitian positive definite matrix and X and B are N-by-NRHS matrices. ZCPOSV first attempts to factorize the matrix in complex SINGLE PRECISION and use this factorization within an iterative refinement procedure to produce a solution with complex DOUBLE PRECISION norm-wise backward error quality (see below). If the approach fails the method switches to a complex DOUBLE PRECISION factorization and solve. The iterative refinement is not going to be a winning strategy if the ratio complex SINGLE PRECISION performance over complex DOUBLE PRECISION performance is too small. A reasonable strategy should take the number of right-hand sides and the size of the matrix into account. This might be done with a call to ILAENV in the future. Up to now, we always try iterative refinement. The iterative refinement process is stopped if ITER > ITERMAX or for all the RHS we have: RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX where o ITER is the number of the current iteration in the iterative refinement process o RNRM is the infinity-norm of the residual o XNRM is the infinity-norm of the solution o ANRM is the infinity-operator-norm of the matrix A o EPS is the machine epsilon returned by DLAMCH('Epsilon') The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively. Arguments ========= UPLO (input) CHARACTER = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The number of linear equations, i.e., the order of the matrix A. N >= 0. NRHS (input) INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. dA (input or input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if iterative refinement has been successfully used (INFO.EQ.0 and ITER.GE.0, see description below), then A is unchanged, if double factorization has been used (INFO.EQ.0 and ITER.LT.0, see description below), then the array dA contains the factor U or L from the Cholesky factorization A = U**T*U or A = L*L**T. LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,N). dB (input) COMPLEX_16 array on the GPU, dimension (LDDB,NRHS) The N-by-NRHS right hand side matrix B. LDDB (input) INTEGER The leading dimension of the array dB. LDDB >= max(1,N). dX (output) COMPLEX_16 array on the GPU, dimension (LDDX,NRHS) If INFO = 0, the N-by-NRHS solution matrix X. LDDX (input) INTEGER The leading dimension of the array dX. LDDX >= max(1,N). dworkd (workspace) COMPLEX_16 array on the GPU, dimension (N*NRHS) This array is used to hold the residual vectors. dworks (workspace) COMPLEX array on the GPU, dimension (N*(N+NRHS)) This array is used to store the complex single precision matrix and the right-hand sides or solutions in single precision. ITER (output) INTEGER < 0: iterative refinement has failed, double precision factorization has been performed -1 : the routine fell back to full precision for implementation- or machine-specific reasons -2 : narrowing the precision induced an overflow, the routine fell back to full precision -3 : failure of SPOTRF -31: stop the iterative refinement after the 30th iteration > 0: iterative refinement has been successfully used. Returns the number of iterations INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, the leading minor of order i of (DOUBLE PRECISION) A is not positive definite, so the factorization could not be completed, and the solution has not been computed. ===================================================================== */ #define dB(i,j) (dB + (i) + (j)*lddb) #define dX(i,j) (dX + (i) + (j)*lddx) #define dR(i,j) (dR + (i) + (j)*lddr) #define dSX(i,j) (dSX + (i) + (j)*lddsx) magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex c_one = MAGMA_Z_ONE; magma_int_t ione = 1; magmaDoubleComplex *dR; magmaFloatComplex *dSA, *dSX; magmaDoubleComplex Xnrmv, Rnrmv; double Anrm, Xnrm, Rnrm, cte, eps; magma_int_t i, j, iiter, lddsa, lddsx, lddr; /* Check arguments */ *iter = 0; *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -2; else if ( ldda < max(1,n)) *info = -4; else if ( lddb < max(1,n)) *info = -7; else if ( lddx < max(1,n)) *info = -9; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if ( n == 0 || nrhs == 0 ) return *info; lddsa = n; lddsx = n; lddr = n; dSA = dworks; dSX = dSA + lddsa*n; dR = dworkd; eps = lapackf77_dlamch("Epsilon"); Anrm = magmablas_zlanhe('I', uplo, n, dA, ldda, (double*)dworkd ); cte = Anrm * eps * pow((double)n, 0.5) * BWDMAX; /* * Convert to single precision */ magmablas_zlag2c( n, nrhs, dB, lddb, dSX, lddsx, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } magmablas_zlat2c( uplo, n, dA, ldda, dSA, lddsa, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // factor dSA in single precision magma_cpotrf_gpu( uplo, n, dSA, lddsa, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // solve dSA*dSX = dB in single precision magma_cpotrs_gpu( uplo, n, nrhs, dSA, lddsa, dSX, lddsx, info ); // residual dR = dB - dA*dX in double precision magmablas_clag2z( n, nrhs, dSX, lddsx, dX, lddx, info ); magmablas_zlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dR, lddr ); if ( nrhs == 1 ) { magma_zhemv( uplo, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_zhemm( MagmaLeft, uplo, n, nrhs, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } // TODO: use MAGMA_Z_ABS( dX(i,j) ) instead of zlange? for( j=0; j < nrhs; j++ ) { i = magma_izamax( n, dX(0,j), 1) - 1; magma_zgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_izamax ( n, dR(0,j), 1 ) - 1; magma_zgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto REFINEMENT; } } *iter = 0; return *info; REFINEMENT: for( iiter=1; iiter < ITERMAX; ) { *info = 0; // convert residual dR to single precision dSX magmablas_zlag2c( n, nrhs, dR, lddr, dSX, lddsx, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // solve dSA*dSX = R in single precision magma_cpotrs_gpu( uplo, n, nrhs, dSA, lddsa, dSX, lddsx, info ); // Add correction and setup residual // dX += dSX [including conversion] --and-- // dR = dB for( j=0; j < nrhs; j++ ) { magmablas_zcaxpycp( n, dSX(0,j), dX(0,j), dB(0,j), dR(0,j) ); } // residual dR = dB - dA*dX in double precision if ( nrhs == 1 ) { magma_zhemv( uplo, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_zhemm( MagmaLeft, uplo, n, nrhs, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } /* Check whether the nrhs normwise backward errors satisfy the * stopping criterion. If yes, set ITER=IITER>0 and return. */ for( j=0; j < nrhs; j++ ) { i = magma_izamax( n, dX(0,j), 1) - 1; magma_zgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_izamax ( n, dR(0,j), 1 ) - 1; magma_zgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto L20; } } /* If we are here, the nrhs normwise backward errors satisfy * the stopping criterion, we are good to exit. */ *iter = iiter; return *info; L20: iiter++; } /* If we are at this place of the code, this is because we have * performed ITER=ITERMAX iterations and never satisified the * stopping criterion. Set up the ITER flag accordingly and follow * up on double precision routine. */ *iter = -ITERMAX - 1; FALLBACK: /* Single-precision iterative refinement failed to converge to a * satisfactory solution, so we resort to double precision. */ magma_zpotrf_gpu( uplo, n, dA, ldda, info ); if (*info == 0) { magmablas_zlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dX, lddx ); magma_zpotrs_gpu( uplo, n, nrhs, dA, ldda, dX, lddx, info ); } return *info; }
int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflopsF, gflopsS, gpu_perf, gpu_time /*cpu_perf, cpu_time*/; real_Double_t gpu_perfdf, gpu_perfds; real_Double_t gpu_perfsf, gpu_perfss; double error, Rnorm, Anorm; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_B, *h_X; magmaDoubleComplex *d_A, *d_B, *d_X, *d_workd; magmaFloatComplex *d_As, *d_Bs, *d_works; double *h_workd; magma_int_t lda, ldb, ldx; magma_int_t N, nrhs, posv_iter, info, size; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; printf("Epsilon(double): %8.6e\n" "Epsilon(single): %8.6e\n\n", lapackf77_dlamch("Epsilon"), lapackf77_slamch("Epsilon") ); magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); nrhs = opts.nrhs; printf("using: uplo = %s\n", lapack_uplo_const(opts.uplo)); printf(" N NRHS DP-Factor DP-Solve SP-Factor SP-Solve MP-Solve Iter |b-Ax|/|A|\n"); printf("=====================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; ldb = ldx = lda = N; gflopsF = FLOPS_ZPOTRF( N ) / 1e9; gflopsS = gflopsF + FLOPS_ZPOTRS( N, nrhs ) / 1e9; TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( h_B, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC_CPU( h_X, magmaDoubleComplex, ldx*nrhs ); TESTING_MALLOC_CPU( h_workd, double, N ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC_DEV( d_B, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC_DEV( d_X, magmaDoubleComplex, ldx*nrhs ); TESTING_MALLOC_DEV( d_works, magmaFloatComplex, lda*(N+nrhs) ); TESTING_MALLOC_DEV( d_workd, magmaDoubleComplex, N*nrhs ); /* Initialize the matrix */ size = lda * N ; lapackf77_zlarnv( &ione, ISEED, &size, h_A ); magma_zmake_hpd( N, h_A, lda ); size = ldb * nrhs ; lapackf77_zlarnv( &ione, ISEED, &size, h_B ); magma_zsetmatrix( N, N, h_A, lda, d_A, lda ); magma_zsetmatrix( N, nrhs, h_B, ldb, d_B, ldb ); //===================================================================== // Mixed Precision Iterative Refinement - GPU //===================================================================== gpu_time = magma_wtime(); magma_zcposv_gpu(opts.uplo, N, nrhs, d_A, lda, d_B, ldb, d_X, ldx, d_workd, d_works, &posv_iter, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflopsS / gpu_time; if (info != 0) printf("magma_zcposv returned error %d: %s.\n", (int) info, magma_strerror( info )); //===================================================================== // Error Computation //===================================================================== magma_zgetmatrix( N, nrhs, d_X, ldx, h_X, ldx ) ; Anorm = lapackf77_zlanhe( "I", lapack_uplo_const(opts.uplo), &N, h_A, &N, h_workd); blasf77_zhemm( "L", lapack_uplo_const(opts.uplo), &N, &nrhs, &c_one, h_A, &lda, h_X, &ldx, &c_neg_one, h_B, &ldb); Rnorm = lapackf77_zlange( "I", &N, &nrhs, h_B, &ldb, h_workd); error = Rnorm / Anorm; //===================================================================== // Double Precision Factor //===================================================================== magma_zsetmatrix( N, N, h_A, lda, d_A, lda ); gpu_time = magma_wtime(); magma_zpotrf_gpu(opts.uplo, N, d_A, lda, &info); gpu_time = magma_wtime() - gpu_time; gpu_perfdf = gflopsF / gpu_time; if (info != 0) printf("magma_zpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); //===================================================================== // Double Precision Solve //===================================================================== magma_zsetmatrix( N, N, h_A, lda, d_A, lda ); magma_zsetmatrix( N, nrhs, h_B, ldb, d_B, ldb ); gpu_time = magma_wtime(); magma_zpotrf_gpu(opts.uplo, N, d_A, lda, &info); magma_zpotrs_gpu(opts.uplo, N, nrhs, d_A, lda, d_B, ldb, &info); gpu_time = magma_wtime() - gpu_time; gpu_perfds = gflopsS / gpu_time; if (info != 0) printf("magma_zpotrs returned error %d: %s.\n", (int) info, magma_strerror( info )); //===================================================================== // Single Precision Factor //===================================================================== d_As = d_works; d_Bs = d_works + lda*N; magma_zsetmatrix( N, N, h_A, lda, d_A, lda ); magma_zsetmatrix( N, nrhs, h_B, ldb, d_B, ldb ); magmablas_zlag2c( N, N, d_A, lda, d_As, N, &info ); magmablas_zlag2c( N, nrhs, d_B, ldb, d_Bs, N, &info ); gpu_time = magma_wtime(); magma_cpotrf_gpu(opts.uplo, N, d_As, N, &info); gpu_time = magma_wtime() - gpu_time; gpu_perfsf = gflopsF / gpu_time; if (info != 0) printf("magma_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); //===================================================================== // Single Precision Solve //===================================================================== magmablas_zlag2c(N, N, d_A, lda, d_As, N, &info ); magmablas_zlag2c(N, nrhs, d_B, ldb, d_Bs, N, &info ); gpu_time = magma_wtime(); magma_cpotrf_gpu(opts.uplo, N, d_As, lda, &info); magma_cpotrs_gpu(opts.uplo, N, nrhs, d_As, N, d_Bs, N, &info); gpu_time = magma_wtime() - gpu_time; gpu_perfss = gflopsS / gpu_time; if (info != 0) printf("magma_cpotrs returned error %d: %s.\n", (int) info, magma_strerror( info )); printf("%5d %5d %7.2f %7.2f %7.2f %7.2f %7.2f %4d %8.2e %s\n", (int) N, (int) nrhs, gpu_perfdf, gpu_perfds, gpu_perfsf, gpu_perfss, gpu_perf, (int) posv_iter, error, (error < tol ? "ok" : "failed")); status += ! (error < tol); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( h_workd ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( d_X ); TESTING_FREE_DEV( d_works ); TESTING_FREE_DEV( d_workd ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
void MAGMA_ZPOTRS_GPU( char *uplo, magma_int_t *n, magma_int_t *nrhs, double2 *A, magma_int_t *lda, double2 *b, magma_int_t *ldb, magma_int_t *info) { magma_zpotrs_gpu( uplo[0], *n, *nrhs, A, *lda, b, *ldb, info); }