/** Purpose ------- CPOSV computes the solution to a complex system of linear equations A * X = B, where A is an N-by-N Hermitian positive definite matrix and X and B are N-by-NRHS matrices. The Cholesky decomposition is used to factor A as A = U**H * U, if UPLO = MagmaUpper, or A = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is a lower triangular matrix. The factored form of A is then used to solve the system of equations A * X = B. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in,out] dA COMPLEX array on the GPU, dimension (LDDA,N) On entry, the Hermitian matrix dA. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H*U or dA = L*L**H. @param[in] ldda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[in,out] dB COMPLEX array on the GPU, dimension (LDB,NRHS) On entry, the right hand side matrix B. On exit, the solution matrix X. @param[in] lddb INTEGER The leading dimension of the array B. LDB >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_cposv_driver ********************************************************************/ extern "C" magma_int_t magma_cposv_gpu( magma_uplo_t uplo, magma_int_t n, magma_int_t nrhs, magmaFloatComplex_ptr dA, magma_int_t ldda, magmaFloatComplex_ptr dB, magma_int_t lddb, magma_int_t *info ) { *info = 0; if ( uplo != MagmaUpper && uplo != MagmaLower ) *info = -1; if ( n < 0 ) *info = -2; if ( nrhs < 0 ) *info = -3; if ( ldda < max(1, n) ) *info = -5; if ( lddb < max(1, n) ) *info = -7; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if ( (n == 0) || (nrhs == 0) ) { return *info; } magma_cpotrf_gpu( uplo, n, dA, ldda, info ); if ( *info == 0 ) { magma_cpotrs_gpu( uplo, n, nrhs, dA, ldda, dB, lddb, info ); } return *info; }
extern "C" magma_int_t magma_zcposv_gpu(char uplo, magma_int_t n, magma_int_t nrhs, magmaDoubleComplex *dA, magma_int_t ldda, magmaDoubleComplex *dB, magma_int_t lddb, magmaDoubleComplex *dX, magma_int_t lddx, magmaDoubleComplex *dworkd, magmaFloatComplex *dworks, magma_int_t *iter, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= ZCPOSV computes the solution to a complex system of linear equations A * X = B, where A is an N-by-N Hermitian positive definite matrix and X and B are N-by-NRHS matrices. ZCPOSV first attempts to factorize the matrix in complex SINGLE PRECISION and use this factorization within an iterative refinement procedure to produce a solution with complex DOUBLE PRECISION norm-wise backward error quality (see below). If the approach fails the method switches to a complex DOUBLE PRECISION factorization and solve. The iterative refinement is not going to be a winning strategy if the ratio complex SINGLE PRECISION performance over complex DOUBLE PRECISION performance is too small. A reasonable strategy should take the number of right-hand sides and the size of the matrix into account. This might be done with a call to ILAENV in the future. Up to now, we always try iterative refinement. The iterative refinement process is stopped if ITER > ITERMAX or for all the RHS we have: RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX where o ITER is the number of the current iteration in the iterative refinement process o RNRM is the infinity-norm of the residual o XNRM is the infinity-norm of the solution o ANRM is the infinity-operator-norm of the matrix A o EPS is the machine epsilon returned by DLAMCH('Epsilon') The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively. Arguments ========= UPLO (input) CHARACTER = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The number of linear equations, i.e., the order of the matrix A. N >= 0. NRHS (input) INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. dA (input or input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if iterative refinement has been successfully used (INFO.EQ.0 and ITER.GE.0, see description below), then A is unchanged, if double factorization has been used (INFO.EQ.0 and ITER.LT.0, see description below), then the array dA contains the factor U or L from the Cholesky factorization A = U**T*U or A = L*L**T. LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,N). dB (input) COMPLEX_16 array on the GPU, dimension (LDDB,NRHS) The N-by-NRHS right hand side matrix B. LDDB (input) INTEGER The leading dimension of the array dB. LDDB >= max(1,N). dX (output) COMPLEX_16 array on the GPU, dimension (LDDX,NRHS) If INFO = 0, the N-by-NRHS solution matrix X. LDDX (input) INTEGER The leading dimension of the array dX. LDDX >= max(1,N). dworkd (workspace) COMPLEX_16 array on the GPU, dimension (N*NRHS) This array is used to hold the residual vectors. dworks (workspace) COMPLEX array on the GPU, dimension (N*(N+NRHS)) This array is used to store the complex single precision matrix and the right-hand sides or solutions in single precision. ITER (output) INTEGER < 0: iterative refinement has failed, double precision factorization has been performed -1 : the routine fell back to full precision for implementation- or machine-specific reasons -2 : narrowing the precision induced an overflow, the routine fell back to full precision -3 : failure of SPOTRF -31: stop the iterative refinement after the 30th iteration > 0: iterative refinement has been successfully used. Returns the number of iterations INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, the leading minor of order i of (DOUBLE PRECISION) A is not positive definite, so the factorization could not be completed, and the solution has not been computed. ===================================================================== */ #define dB(i,j) (dB + (i) + (j)*lddb) #define dX(i,j) (dX + (i) + (j)*lddx) #define dR(i,j) (dR + (i) + (j)*lddr) #define dSX(i,j) (dSX + (i) + (j)*lddsx) magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex c_one = MAGMA_Z_ONE; magma_int_t ione = 1; magmaDoubleComplex *dR; magmaFloatComplex *dSA, *dSX; magmaDoubleComplex Xnrmv, Rnrmv; double Anrm, Xnrm, Rnrm, cte, eps; magma_int_t i, j, iiter, lddsa, lddsx, lddr; /* Check arguments */ *iter = 0; *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -2; else if ( ldda < max(1,n)) *info = -4; else if ( lddb < max(1,n)) *info = -7; else if ( lddx < max(1,n)) *info = -9; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if ( n == 0 || nrhs == 0 ) return *info; lddsa = n; lddsx = n; lddr = n; dSA = dworks; dSX = dSA + lddsa*n; dR = dworkd; eps = lapackf77_dlamch("Epsilon"); Anrm = magmablas_zlanhe('I', uplo, n, dA, ldda, (double*)dworkd ); cte = Anrm * eps * pow((double)n, 0.5) * BWDMAX; /* * Convert to single precision */ magmablas_zlag2c( n, nrhs, dB, lddb, dSX, lddsx, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } magmablas_zlat2c( uplo, n, dA, ldda, dSA, lddsa, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // factor dSA in single precision magma_cpotrf_gpu( uplo, n, dSA, lddsa, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // solve dSA*dSX = dB in single precision magma_cpotrs_gpu( uplo, n, nrhs, dSA, lddsa, dSX, lddsx, info ); // residual dR = dB - dA*dX in double precision magmablas_clag2z( n, nrhs, dSX, lddsx, dX, lddx, info ); magmablas_zlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dR, lddr ); if ( nrhs == 1 ) { magma_zhemv( uplo, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_zhemm( MagmaLeft, uplo, n, nrhs, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } // TODO: use MAGMA_Z_ABS( dX(i,j) ) instead of zlange? for( j=0; j < nrhs; j++ ) { i = magma_izamax( n, dX(0,j), 1) - 1; magma_zgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_izamax ( n, dR(0,j), 1 ) - 1; magma_zgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto REFINEMENT; } } *iter = 0; return *info; REFINEMENT: for( iiter=1; iiter < ITERMAX; ) { *info = 0; // convert residual dR to single precision dSX magmablas_zlag2c( n, nrhs, dR, lddr, dSX, lddsx, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // solve dSA*dSX = R in single precision magma_cpotrs_gpu( uplo, n, nrhs, dSA, lddsa, dSX, lddsx, info ); // Add correction and setup residual // dX += dSX [including conversion] --and-- // dR = dB for( j=0; j < nrhs; j++ ) { magmablas_zcaxpycp( n, dSX(0,j), dX(0,j), dB(0,j), dR(0,j) ); } // residual dR = dB - dA*dX in double precision if ( nrhs == 1 ) { magma_zhemv( uplo, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_zhemm( MagmaLeft, uplo, n, nrhs, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } /* Check whether the nrhs normwise backward errors satisfy the * stopping criterion. If yes, set ITER=IITER>0 and return. */ for( j=0; j < nrhs; j++ ) { i = magma_izamax( n, dX(0,j), 1) - 1; magma_zgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_izamax ( n, dR(0,j), 1 ) - 1; magma_zgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto L20; } } /* If we are here, the nrhs normwise backward errors satisfy * the stopping criterion, we are good to exit. */ *iter = iiter; return *info; L20: iiter++; } /* If we are at this place of the code, this is because we have * performed ITER=ITERMAX iterations and never satisified the * stopping criterion. Set up the ITER flag accordingly and follow * up on double precision routine. */ *iter = -ITERMAX - 1; FALLBACK: /* Single-precision iterative refinement failed to converge to a * satisfactory solution, so we resort to double precision. */ magma_zpotrf_gpu( uplo, n, dA, ldda, info ); if (*info == 0) { magmablas_zlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dX, lddx ); magma_zpotrs_gpu( uplo, n, nrhs, dA, ldda, dX, lddx, info ); } return *info; }
extern "C" magma_int_t magma_cposv_gpu( magma_uplo_t uplo, magma_int_t n, magma_int_t nrhs, magmaFloatComplex_ptr dA, size_t dA_offset, magma_int_t ldda, magmaFloatComplex_ptr dB, size_t dB_offset, magma_int_t lddb, magma_queue_t queue, magma_int_t *info ) { /* -- clMagma (version 0.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= CPOSV computes the solution to a complex system of linear equations A * X = B, where A is an N-by-N Hermitian positive definite matrix and X and B are N-by-NRHS matrices. The Cholesky decomposition is used to factor A as A = U**H * U, if UPLO = 'U', or A = L * L**H, if UPLO = 'L', where U is an upper triangular matrix and L is a lower triangular matrix. The factored form of A is then used to solve the system of equations A * X = B. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. NRHS (input) INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. dA (input/output) COMPLEX array on the GPU, dimension (LDDA,N) On entry, the Hermitian matrix dA. If UPLO = 'U', the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H*U or dA = L*L**H. LDDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). dB (input/output) COMPLEX array on the GPU, dimension (LDB,NRHS) On entry, the right hand side matrix B. On exit, the solution matrix X. LDDB (input) INTEGER The leading dimension of the array B. LDB >= max(1,N). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ *info = 0; if ( uplo != MagmaUpper && uplo != MagmaLower ) *info = -1; if ( n < 0 ) *info = -2; if ( nrhs < 0 ) *info = -3; if ( ldda < max(1, n) ) *info = -5; if ( lddb < max(1, n) ) *info = -7; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if ( (n == 0) || (nrhs == 0) ) { return *info; } magma_cpotrf_gpu( uplo, n, dA, 0, ldda, queue, info ); if ( *info == 0 ) { magma_cpotrs_gpu( uplo, n, nrhs, dA, 0, ldda, dB, 0, lddb, queue, info ); } return *info; }
/** Purpose ------- CPOSV computes the solution to a complex system of linear equations A * X = B, where A is an N-by-N Hermitian positive definite matrix and X and B are N-by-NRHS matrices. The Cholesky decomposition is used to factor A as A = U**H * U, if UPLO = MagmaUpper, or A = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is a lower triangular matrix. The factored form of A is then used to solve the system of equations A * X = B. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization A = U**H*U or A = L*L**H. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[in,out] B COMPLEX array, dimension (LDB,NRHS) On entry, the right hand side matrix B. On exit, the solution matrix X. @param[in] ldb INTEGER The leading dimension of the array B. LDB >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_cposv_driver ********************************************************************/ extern "C" magma_int_t magma_cposv( magma_uplo_t uplo, magma_int_t n, magma_int_t nrhs, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *B, magma_int_t ldb, magma_int_t *info ) { magma_int_t num_gpus, ldda, lddb; *info = 0; if ( uplo != MagmaUpper && uplo != MagmaLower ) *info = -1; if ( n < 0 ) *info = -2; if ( nrhs < 0) *info = -3; if ( lda < max(1, n) ) *info = -5; if ( ldb < max(1, n) ) *info = -7; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if ( (n == 0) || (nrhs == 0) ) { return *info; } /* If single-GPU and allocation suceeds, use GPU interface. */ num_gpus = magma_num_gpus(); magmaFloatComplex *dA, *dB; if ( num_gpus > 1 ) { goto CPU_INTERFACE; } ldda = ((n+31)/32)*32; lddb = ldda; if ( MAGMA_SUCCESS != magma_cmalloc( &dA, ldda*n )) { goto CPU_INTERFACE; } if ( MAGMA_SUCCESS != magma_cmalloc( &dB, lddb*nrhs )) { magma_free( dA ); goto CPU_INTERFACE; } magma_csetmatrix( n, n, A, lda, dA, ldda ); magma_cpotrf_gpu( uplo, n, dA, ldda, info ); if ( *info == MAGMA_ERR_DEVICE_ALLOC ) { magma_free( dA ); magma_free( dB ); goto CPU_INTERFACE; } magma_cgetmatrix( n, n, dA, ldda, A, lda ); if ( *info == 0 ) { magma_csetmatrix( n, nrhs, B, ldb, dB, lddb ); magma_cpotrs_gpu( uplo, n, nrhs, dA, ldda, dB, lddb, info ); magma_cgetmatrix( n, nrhs, dB, lddb, B, ldb ); } magma_free( dA ); magma_free( dB ); return *info; CPU_INTERFACE: /* If multi-GPU or allocation failed, use CPU interface and LAPACK. * Faster to use LAPACK for potrs than to copy A to GPU. */ magma_cpotrf( uplo, n, A, lda, info ); if ( *info == 0 ) { lapackf77_cpotrs( lapack_uplo_const(uplo), &n, &nrhs, A, &lda, B, &ldb, info ); } return *info; }
int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflopsF, gflopsS, gpu_perf, gpu_time /*cpu_perf, cpu_time*/; real_Double_t gpu_perfdf, gpu_perfds; real_Double_t gpu_perfsf, gpu_perfss; double error, Rnorm, Anorm; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_B, *h_X; magmaDoubleComplex *d_A, *d_B, *d_X, *d_workd; magmaFloatComplex *d_As, *d_Bs, *d_works; double *h_workd; magma_int_t lda, ldb, ldx; magma_int_t N, nrhs, posv_iter, info, size; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; printf("Epsilon(double): %8.6e\n" "Epsilon(single): %8.6e\n\n", lapackf77_dlamch("Epsilon"), lapackf77_slamch("Epsilon") ); magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); nrhs = opts.nrhs; printf("using: uplo = %s\n", lapack_uplo_const(opts.uplo)); printf(" N NRHS DP-Factor DP-Solve SP-Factor SP-Solve MP-Solve Iter |b-Ax|/|A|\n"); printf("=====================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; ldb = ldx = lda = N; gflopsF = FLOPS_ZPOTRF( N ) / 1e9; gflopsS = gflopsF + FLOPS_ZPOTRS( N, nrhs ) / 1e9; TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( h_B, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC_CPU( h_X, magmaDoubleComplex, ldx*nrhs ); TESTING_MALLOC_CPU( h_workd, double, N ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC_DEV( d_B, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC_DEV( d_X, magmaDoubleComplex, ldx*nrhs ); TESTING_MALLOC_DEV( d_works, magmaFloatComplex, lda*(N+nrhs) ); TESTING_MALLOC_DEV( d_workd, magmaDoubleComplex, N*nrhs ); /* Initialize the matrix */ size = lda * N ; lapackf77_zlarnv( &ione, ISEED, &size, h_A ); magma_zmake_hpd( N, h_A, lda ); size = ldb * nrhs ; lapackf77_zlarnv( &ione, ISEED, &size, h_B ); magma_zsetmatrix( N, N, h_A, lda, d_A, lda ); magma_zsetmatrix( N, nrhs, h_B, ldb, d_B, ldb ); //===================================================================== // Mixed Precision Iterative Refinement - GPU //===================================================================== gpu_time = magma_wtime(); magma_zcposv_gpu(opts.uplo, N, nrhs, d_A, lda, d_B, ldb, d_X, ldx, d_workd, d_works, &posv_iter, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflopsS / gpu_time; if (info != 0) printf("magma_zcposv returned error %d: %s.\n", (int) info, magma_strerror( info )); //===================================================================== // Error Computation //===================================================================== magma_zgetmatrix( N, nrhs, d_X, ldx, h_X, ldx ) ; Anorm = lapackf77_zlanhe( "I", lapack_uplo_const(opts.uplo), &N, h_A, &N, h_workd); blasf77_zhemm( "L", lapack_uplo_const(opts.uplo), &N, &nrhs, &c_one, h_A, &lda, h_X, &ldx, &c_neg_one, h_B, &ldb); Rnorm = lapackf77_zlange( "I", &N, &nrhs, h_B, &ldb, h_workd); error = Rnorm / Anorm; //===================================================================== // Double Precision Factor //===================================================================== magma_zsetmatrix( N, N, h_A, lda, d_A, lda ); gpu_time = magma_wtime(); magma_zpotrf_gpu(opts.uplo, N, d_A, lda, &info); gpu_time = magma_wtime() - gpu_time; gpu_perfdf = gflopsF / gpu_time; if (info != 0) printf("magma_zpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); //===================================================================== // Double Precision Solve //===================================================================== magma_zsetmatrix( N, N, h_A, lda, d_A, lda ); magma_zsetmatrix( N, nrhs, h_B, ldb, d_B, ldb ); gpu_time = magma_wtime(); magma_zpotrf_gpu(opts.uplo, N, d_A, lda, &info); magma_zpotrs_gpu(opts.uplo, N, nrhs, d_A, lda, d_B, ldb, &info); gpu_time = magma_wtime() - gpu_time; gpu_perfds = gflopsS / gpu_time; if (info != 0) printf("magma_zpotrs returned error %d: %s.\n", (int) info, magma_strerror( info )); //===================================================================== // Single Precision Factor //===================================================================== d_As = d_works; d_Bs = d_works + lda*N; magma_zsetmatrix( N, N, h_A, lda, d_A, lda ); magma_zsetmatrix( N, nrhs, h_B, ldb, d_B, ldb ); magmablas_zlag2c( N, N, d_A, lda, d_As, N, &info ); magmablas_zlag2c( N, nrhs, d_B, ldb, d_Bs, N, &info ); gpu_time = magma_wtime(); magma_cpotrf_gpu(opts.uplo, N, d_As, N, &info); gpu_time = magma_wtime() - gpu_time; gpu_perfsf = gflopsF / gpu_time; if (info != 0) printf("magma_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); //===================================================================== // Single Precision Solve //===================================================================== magmablas_zlag2c(N, N, d_A, lda, d_As, N, &info ); magmablas_zlag2c(N, nrhs, d_B, ldb, d_Bs, N, &info ); gpu_time = magma_wtime(); magma_cpotrf_gpu(opts.uplo, N, d_As, lda, &info); magma_cpotrs_gpu(opts.uplo, N, nrhs, d_As, N, d_Bs, N, &info); gpu_time = magma_wtime() - gpu_time; gpu_perfss = gflopsS / gpu_time; if (info != 0) printf("magma_cpotrs returned error %d: %s.\n", (int) info, magma_strerror( info )); printf("%5d %5d %7.2f %7.2f %7.2f %7.2f %7.2f %4d %8.2e %s\n", (int) N, (int) nrhs, gpu_perfdf, gpu_perfds, gpu_perfsf, gpu_perfss, gpu_perf, (int) posv_iter, error, (error < tol ? "ok" : "failed")); status += ! (error < tol); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( h_workd ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( d_X ); TESTING_FREE_DEV( d_works ); TESTING_FREE_DEV( d_workd ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/***************************************************************************//** Purpose ------- CPOSV computes the solution to a complex system of linear equations A * X = B, where A is an N-by-N Hermitian positive definite matrix and X and B are N-by-NRHS matrices. The Cholesky decomposition is used to factor A as A = U**H * U, if UPLO = MagmaUpper, or A = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is a lower triangular matrix. The factored form of A is then used to solve the system of equations A * X = B. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization A = U**H*U or A = L*L**H. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[in,out] B COMPLEX array, dimension (LDB,NRHS) On entry, the right hand side matrix B. On exit, the solution matrix X. @param[in] ldb INTEGER The leading dimension of the array B. LDB >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_posv *******************************************************************************/ extern "C" magma_int_t magma_cposv( magma_uplo_t uplo, magma_int_t n, magma_int_t nrhs, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *B, magma_int_t ldb, magma_int_t *info ) { #ifdef HAVE_clBLAS #define dA(i_, j_) dA, ((i_) + (j_)*ldda) #define dB(i_, j_) dB, ((i_) + (j_)*lddb) #else #define dA(i_, j_) (dA + (i_) + (j_)*ldda) #define dB(i_, j_) (dB + (i_) + (j_)*lddb) #endif magma_int_t ngpu, ldda, lddb; magma_queue_t queue = NULL; magma_device_t cdev; *info = 0; if ( uplo != MagmaUpper && uplo != MagmaLower ) *info = -1; if ( n < 0 ) *info = -2; if ( nrhs < 0) *info = -3; if ( lda < max(1, n) ) *info = -5; if ( ldb < max(1, n) ) *info = -7; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (n == 0 || nrhs == 0) { return *info; } /* If single-GPU and allocation suceeds, use GPU interface. */ ngpu = magma_num_gpus(); magmaFloatComplex_ptr dA, dB; if ( ngpu > 1 ) { goto CPU_INTERFACE; } ldda = magma_roundup( n, 32 ); lddb = ldda; if ( MAGMA_SUCCESS != magma_cmalloc( &dA, ldda*n )) { goto CPU_INTERFACE; } if ( MAGMA_SUCCESS != magma_cmalloc( &dB, lddb*nrhs )) { magma_free( dA ); goto CPU_INTERFACE; } magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); magma_csetmatrix( n, n, A, lda, dA(0,0), ldda, queue ); magma_cpotrf_gpu( uplo, n, dA(0,0), ldda, info ); if ( *info == MAGMA_ERR_DEVICE_ALLOC ) { magma_queue_destroy( queue ); magma_free( dA ); magma_free( dB ); goto CPU_INTERFACE; } magma_cgetmatrix( n, n, dA(0,0), ldda, A, lda, queue ); if ( *info == 0 ) { magma_csetmatrix( n, nrhs, B, ldb, dB(0,0), lddb, queue ); magma_cpotrs_gpu( uplo, n, nrhs, dA(0,0), ldda, dB(0,0), lddb, info ); magma_cgetmatrix( n, nrhs, dB(0,0), lddb, B, ldb, queue ); } magma_queue_destroy( queue ); magma_free( dA ); magma_free( dB ); return *info; CPU_INTERFACE: /* If multi-GPU or allocation failed, use CPU interface and LAPACK. * Faster to use LAPACK for potrs than to copy A to GPU. */ magma_cpotrf( uplo, n, A, lda, info ); if ( *info == 0 ) { lapackf77_cpotrs( lapack_uplo_const(uplo), &n, &nrhs, A, &lda, B, &ldb, info ); } return *info; }