/** Purpose ------- CPOSV computes the solution to a complex system of linear equations A * X = B, where A is an N-by-N Hermitian positive definite matrix and X and B are N-by-NRHS matrices. The Cholesky decomposition is used to factor A as A = U**H * U, if UPLO = MagmaUpper, or A = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is a lower triangular matrix. The factored form of A is then used to solve the system of equations A * X = B. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in,out] dA COMPLEX array on the GPU, dimension (LDDA,N) On entry, the Hermitian matrix dA. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H*U or dA = L*L**H. @param[in] ldda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[in,out] dB COMPLEX array on the GPU, dimension (LDB,NRHS) On entry, the right hand side matrix B. On exit, the solution matrix X. @param[in] lddb INTEGER The leading dimension of the array B. LDB >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_cposv_driver ********************************************************************/ extern "C" magma_int_t magma_cposv_gpu( magma_uplo_t uplo, magma_int_t n, magma_int_t nrhs, magmaFloatComplex_ptr dA, magma_int_t ldda, magmaFloatComplex_ptr dB, magma_int_t lddb, magma_int_t *info ) { *info = 0; if ( uplo != MagmaUpper && uplo != MagmaLower ) *info = -1; if ( n < 0 ) *info = -2; if ( nrhs < 0 ) *info = -3; if ( ldda < max(1, n) ) *info = -5; if ( lddb < max(1, n) ) *info = -7; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if ( (n == 0) || (nrhs == 0) ) { return *info; } magma_cpotrf_gpu( uplo, n, dA, ldda, info ); if ( *info == 0 ) { magma_cpotrs_gpu( uplo, n, nrhs, dA, ldda, dB, lddb, info ); } return *info; }
extern "C" magma_int_t magma_zcposv_gpu(char uplo, magma_int_t n, magma_int_t nrhs, magmaDoubleComplex *dA, magma_int_t ldda, magmaDoubleComplex *dB, magma_int_t lddb, magmaDoubleComplex *dX, magma_int_t lddx, magmaDoubleComplex *dworkd, magmaFloatComplex *dworks, magma_int_t *iter, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= ZCPOSV computes the solution to a complex system of linear equations A * X = B, where A is an N-by-N Hermitian positive definite matrix and X and B are N-by-NRHS matrices. ZCPOSV first attempts to factorize the matrix in complex SINGLE PRECISION and use this factorization within an iterative refinement procedure to produce a solution with complex DOUBLE PRECISION norm-wise backward error quality (see below). If the approach fails the method switches to a complex DOUBLE PRECISION factorization and solve. The iterative refinement is not going to be a winning strategy if the ratio complex SINGLE PRECISION performance over complex DOUBLE PRECISION performance is too small. A reasonable strategy should take the number of right-hand sides and the size of the matrix into account. This might be done with a call to ILAENV in the future. Up to now, we always try iterative refinement. The iterative refinement process is stopped if ITER > ITERMAX or for all the RHS we have: RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX where o ITER is the number of the current iteration in the iterative refinement process o RNRM is the infinity-norm of the residual o XNRM is the infinity-norm of the solution o ANRM is the infinity-operator-norm of the matrix A o EPS is the machine epsilon returned by DLAMCH('Epsilon') The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively. Arguments ========= UPLO (input) CHARACTER = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The number of linear equations, i.e., the order of the matrix A. N >= 0. NRHS (input) INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. dA (input or input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if iterative refinement has been successfully used (INFO.EQ.0 and ITER.GE.0, see description below), then A is unchanged, if double factorization has been used (INFO.EQ.0 and ITER.LT.0, see description below), then the array dA contains the factor U or L from the Cholesky factorization A = U**T*U or A = L*L**T. LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,N). dB (input) COMPLEX_16 array on the GPU, dimension (LDDB,NRHS) The N-by-NRHS right hand side matrix B. LDDB (input) INTEGER The leading dimension of the array dB. LDDB >= max(1,N). dX (output) COMPLEX_16 array on the GPU, dimension (LDDX,NRHS) If INFO = 0, the N-by-NRHS solution matrix X. LDDX (input) INTEGER The leading dimension of the array dX. LDDX >= max(1,N). dworkd (workspace) COMPLEX_16 array on the GPU, dimension (N*NRHS) This array is used to hold the residual vectors. dworks (workspace) COMPLEX array on the GPU, dimension (N*(N+NRHS)) This array is used to store the complex single precision matrix and the right-hand sides or solutions in single precision. ITER (output) INTEGER < 0: iterative refinement has failed, double precision factorization has been performed -1 : the routine fell back to full precision for implementation- or machine-specific reasons -2 : narrowing the precision induced an overflow, the routine fell back to full precision -3 : failure of SPOTRF -31: stop the iterative refinement after the 30th iteration > 0: iterative refinement has been successfully used. Returns the number of iterations INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, the leading minor of order i of (DOUBLE PRECISION) A is not positive definite, so the factorization could not be completed, and the solution has not been computed. ===================================================================== */ #define dB(i,j) (dB + (i) + (j)*lddb) #define dX(i,j) (dX + (i) + (j)*lddx) #define dR(i,j) (dR + (i) + (j)*lddr) #define dSX(i,j) (dSX + (i) + (j)*lddsx) magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex c_one = MAGMA_Z_ONE; magma_int_t ione = 1; magmaDoubleComplex *dR; magmaFloatComplex *dSA, *dSX; magmaDoubleComplex Xnrmv, Rnrmv; double Anrm, Xnrm, Rnrm, cte, eps; magma_int_t i, j, iiter, lddsa, lddsx, lddr; /* Check arguments */ *iter = 0; *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -2; else if ( ldda < max(1,n)) *info = -4; else if ( lddb < max(1,n)) *info = -7; else if ( lddx < max(1,n)) *info = -9; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if ( n == 0 || nrhs == 0 ) return *info; lddsa = n; lddsx = n; lddr = n; dSA = dworks; dSX = dSA + lddsa*n; dR = dworkd; eps = lapackf77_dlamch("Epsilon"); Anrm = magmablas_zlanhe('I', uplo, n, dA, ldda, (double*)dworkd ); cte = Anrm * eps * pow((double)n, 0.5) * BWDMAX; /* * Convert to single precision */ magmablas_zlag2c( n, nrhs, dB, lddb, dSX, lddsx, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } magmablas_zlat2c( uplo, n, dA, ldda, dSA, lddsa, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // factor dSA in single precision magma_cpotrf_gpu( uplo, n, dSA, lddsa, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // solve dSA*dSX = dB in single precision magma_cpotrs_gpu( uplo, n, nrhs, dSA, lddsa, dSX, lddsx, info ); // residual dR = dB - dA*dX in double precision magmablas_clag2z( n, nrhs, dSX, lddsx, dX, lddx, info ); magmablas_zlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dR, lddr ); if ( nrhs == 1 ) { magma_zhemv( uplo, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_zhemm( MagmaLeft, uplo, n, nrhs, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } // TODO: use MAGMA_Z_ABS( dX(i,j) ) instead of zlange? for( j=0; j < nrhs; j++ ) { i = magma_izamax( n, dX(0,j), 1) - 1; magma_zgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_izamax ( n, dR(0,j), 1 ) - 1; magma_zgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto REFINEMENT; } } *iter = 0; return *info; REFINEMENT: for( iiter=1; iiter < ITERMAX; ) { *info = 0; // convert residual dR to single precision dSX magmablas_zlag2c( n, nrhs, dR, lddr, dSX, lddsx, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // solve dSA*dSX = R in single precision magma_cpotrs_gpu( uplo, n, nrhs, dSA, lddsa, dSX, lddsx, info ); // Add correction and setup residual // dX += dSX [including conversion] --and-- // dR = dB for( j=0; j < nrhs; j++ ) { magmablas_zcaxpycp( n, dSX(0,j), dX(0,j), dB(0,j), dR(0,j) ); } // residual dR = dB - dA*dX in double precision if ( nrhs == 1 ) { magma_zhemv( uplo, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_zhemm( MagmaLeft, uplo, n, nrhs, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } /* Check whether the nrhs normwise backward errors satisfy the * stopping criterion. If yes, set ITER=IITER>0 and return. */ for( j=0; j < nrhs; j++ ) { i = magma_izamax( n, dX(0,j), 1) - 1; magma_zgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_izamax ( n, dR(0,j), 1 ) - 1; magma_zgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto L20; } } /* If we are here, the nrhs normwise backward errors satisfy * the stopping criterion, we are good to exit. */ *iter = iiter; return *info; L20: iiter++; } /* If we are at this place of the code, this is because we have * performed ITER=ITERMAX iterations and never satisified the * stopping criterion. Set up the ITER flag accordingly and follow * up on double precision routine. */ *iter = -ITERMAX - 1; FALLBACK: /* Single-precision iterative refinement failed to converge to a * satisfactory solution, so we resort to double precision. */ magma_zpotrf_gpu( uplo, n, dA, ldda, info ); if (*info == 0) { magmablas_zlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dX, lddx ); magma_zpotrs_gpu( uplo, n, nrhs, dA, ldda, dX, lddx, info ); } return *info; }
extern "C" magma_int_t magma_cposv_gpu( magma_uplo_t uplo, magma_int_t n, magma_int_t nrhs, magmaFloatComplex_ptr dA, size_t dA_offset, magma_int_t ldda, magmaFloatComplex_ptr dB, size_t dB_offset, magma_int_t lddb, magma_queue_t queue, magma_int_t *info ) { /* -- clMagma (version 0.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= CPOSV computes the solution to a complex system of linear equations A * X = B, where A is an N-by-N Hermitian positive definite matrix and X and B are N-by-NRHS matrices. The Cholesky decomposition is used to factor A as A = U**H * U, if UPLO = 'U', or A = L * L**H, if UPLO = 'L', where U is an upper triangular matrix and L is a lower triangular matrix. The factored form of A is then used to solve the system of equations A * X = B. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. NRHS (input) INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. dA (input/output) COMPLEX array on the GPU, dimension (LDDA,N) On entry, the Hermitian matrix dA. If UPLO = 'U', the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H*U or dA = L*L**H. LDDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). dB (input/output) COMPLEX array on the GPU, dimension (LDB,NRHS) On entry, the right hand side matrix B. On exit, the solution matrix X. LDDB (input) INTEGER The leading dimension of the array B. LDB >= max(1,N). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ *info = 0; if ( uplo != MagmaUpper && uplo != MagmaLower ) *info = -1; if ( n < 0 ) *info = -2; if ( nrhs < 0 ) *info = -3; if ( ldda < max(1, n) ) *info = -5; if ( lddb < max(1, n) ) *info = -7; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if ( (n == 0) || (nrhs == 0) ) { return *info; } magma_cpotrf_gpu( uplo, n, dA, 0, ldda, queue, info ); if ( *info == 0 ) { magma_cpotrs_gpu( uplo, n, nrhs, dA, 0, ldda, dB, 0, lddb, queue, info ); } return *info; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cpotrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaFloatComplex *h_A, *h_R; magmaFloatComplex_ptr d_A; magma_int_t N, n2, lda, ldda, info; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float work[1], error; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) float tol = opts.tolerance * lapackf77_slamch("E"); printf("uplo = %s, version = %d\n", lapack_uplo_const(opts.uplo), opts.version ); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma - R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; gflops = FLOPS_CPOTRF( N ) / 1e9; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2 ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); magma_cmake_hpd( N, h_A, lda ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_csetmatrix( N, N, h_A, lda, d_A, 0, ldda, opts.queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); if ( opts.version == 1 ) { magma_cpotrf_gpu( opts.uplo, N, d_A, 0, ldda, opts.queue, &info ); } else if ( opts.version == 2 ) { magma_cpotrf2_gpu( opts.uplo, N, d_A, 0, ldda, opts.queues2, &info ); } else { printf( "Unknown version %d\n", opts.version ); exit(1); } gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cpotrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cpotrf( lapack_uplo_const(opts.uplo), &N, h_A, &lda, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_cgetmatrix( N, N, d_A, 0, ldda, h_R, lda, opts.queue ); error = lapackf77_clange("f", &N, &N, h_A, &lda, work); blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_clange("f", &N, &N, h_R, &lda, work) / error; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); } else { printf("%5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) N, gpu_perf, gpu_time ); } TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflopsF, gflopsS, gpu_perf, gpu_time /*cpu_perf, cpu_time*/; real_Double_t gpu_perfdf, gpu_perfds; real_Double_t gpu_perfsf, gpu_perfss; double error, Rnorm, Anorm; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_B, *h_X; magmaDoubleComplex *d_A, *d_B, *d_X, *d_workd; magmaFloatComplex *d_As, *d_Bs, *d_works; double *h_workd; magma_int_t lda, ldb, ldx; magma_int_t N, nrhs, posv_iter, info, size; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; printf("Epsilon(double): %8.6e\n" "Epsilon(single): %8.6e\n\n", lapackf77_dlamch("Epsilon"), lapackf77_slamch("Epsilon") ); magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); nrhs = opts.nrhs; printf("using: uplo = %s\n", lapack_uplo_const(opts.uplo)); printf(" N NRHS DP-Factor DP-Solve SP-Factor SP-Solve MP-Solve Iter |b-Ax|/|A|\n"); printf("=====================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; ldb = ldx = lda = N; gflopsF = FLOPS_ZPOTRF( N ) / 1e9; gflopsS = gflopsF + FLOPS_ZPOTRS( N, nrhs ) / 1e9; TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( h_B, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC_CPU( h_X, magmaDoubleComplex, ldx*nrhs ); TESTING_MALLOC_CPU( h_workd, double, N ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC_DEV( d_B, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC_DEV( d_X, magmaDoubleComplex, ldx*nrhs ); TESTING_MALLOC_DEV( d_works, magmaFloatComplex, lda*(N+nrhs) ); TESTING_MALLOC_DEV( d_workd, magmaDoubleComplex, N*nrhs ); /* Initialize the matrix */ size = lda * N ; lapackf77_zlarnv( &ione, ISEED, &size, h_A ); magma_zmake_hpd( N, h_A, lda ); size = ldb * nrhs ; lapackf77_zlarnv( &ione, ISEED, &size, h_B ); magma_zsetmatrix( N, N, h_A, lda, d_A, lda ); magma_zsetmatrix( N, nrhs, h_B, ldb, d_B, ldb ); //===================================================================== // Mixed Precision Iterative Refinement - GPU //===================================================================== gpu_time = magma_wtime(); magma_zcposv_gpu(opts.uplo, N, nrhs, d_A, lda, d_B, ldb, d_X, ldx, d_workd, d_works, &posv_iter, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflopsS / gpu_time; if (info != 0) printf("magma_zcposv returned error %d: %s.\n", (int) info, magma_strerror( info )); //===================================================================== // Error Computation //===================================================================== magma_zgetmatrix( N, nrhs, d_X, ldx, h_X, ldx ) ; Anorm = lapackf77_zlanhe( "I", lapack_uplo_const(opts.uplo), &N, h_A, &N, h_workd); blasf77_zhemm( "L", lapack_uplo_const(opts.uplo), &N, &nrhs, &c_one, h_A, &lda, h_X, &ldx, &c_neg_one, h_B, &ldb); Rnorm = lapackf77_zlange( "I", &N, &nrhs, h_B, &ldb, h_workd); error = Rnorm / Anorm; //===================================================================== // Double Precision Factor //===================================================================== magma_zsetmatrix( N, N, h_A, lda, d_A, lda ); gpu_time = magma_wtime(); magma_zpotrf_gpu(opts.uplo, N, d_A, lda, &info); gpu_time = magma_wtime() - gpu_time; gpu_perfdf = gflopsF / gpu_time; if (info != 0) printf("magma_zpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); //===================================================================== // Double Precision Solve //===================================================================== magma_zsetmatrix( N, N, h_A, lda, d_A, lda ); magma_zsetmatrix( N, nrhs, h_B, ldb, d_B, ldb ); gpu_time = magma_wtime(); magma_zpotrf_gpu(opts.uplo, N, d_A, lda, &info); magma_zpotrs_gpu(opts.uplo, N, nrhs, d_A, lda, d_B, ldb, &info); gpu_time = magma_wtime() - gpu_time; gpu_perfds = gflopsS / gpu_time; if (info != 0) printf("magma_zpotrs returned error %d: %s.\n", (int) info, magma_strerror( info )); //===================================================================== // Single Precision Factor //===================================================================== d_As = d_works; d_Bs = d_works + lda*N; magma_zsetmatrix( N, N, h_A, lda, d_A, lda ); magma_zsetmatrix( N, nrhs, h_B, ldb, d_B, ldb ); magmablas_zlag2c( N, N, d_A, lda, d_As, N, &info ); magmablas_zlag2c( N, nrhs, d_B, ldb, d_Bs, N, &info ); gpu_time = magma_wtime(); magma_cpotrf_gpu(opts.uplo, N, d_As, N, &info); gpu_time = magma_wtime() - gpu_time; gpu_perfsf = gflopsF / gpu_time; if (info != 0) printf("magma_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); //===================================================================== // Single Precision Solve //===================================================================== magmablas_zlag2c(N, N, d_A, lda, d_As, N, &info ); magmablas_zlag2c(N, nrhs, d_B, ldb, d_Bs, N, &info ); gpu_time = magma_wtime(); magma_cpotrf_gpu(opts.uplo, N, d_As, lda, &info); magma_cpotrs_gpu(opts.uplo, N, nrhs, d_As, N, d_Bs, N, &info); gpu_time = magma_wtime() - gpu_time; gpu_perfss = gflopsS / gpu_time; if (info != 0) printf("magma_cpotrs returned error %d: %s.\n", (int) info, magma_strerror( info )); printf("%5d %5d %7.2f %7.2f %7.2f %7.2f %7.2f %4d %8.2e %s\n", (int) N, (int) nrhs, gpu_perfdf, gpu_perfds, gpu_perfsf, gpu_perfss, gpu_perf, (int) posv_iter, error, (error < tol ? "ok" : "failed")); status += ! (error < tol); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( h_workd ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( d_X ); TESTING_FREE_DEV( d_works ); TESTING_FREE_DEV( d_workd ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/** Purpose ------- CPOSV computes the solution to a complex system of linear equations A * X = B, where A is an N-by-N Hermitian positive definite matrix and X and B are N-by-NRHS matrices. The Cholesky decomposition is used to factor A as A = U**H * U, if UPLO = MagmaUpper, or A = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is a lower triangular matrix. The factored form of A is then used to solve the system of equations A * X = B. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization A = U**H*U or A = L*L**H. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[in,out] B COMPLEX array, dimension (LDB,NRHS) On entry, the right hand side matrix B. On exit, the solution matrix X. @param[in] ldb INTEGER The leading dimension of the array B. LDB >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_cposv_driver ********************************************************************/ extern "C" magma_int_t magma_cposv( magma_uplo_t uplo, magma_int_t n, magma_int_t nrhs, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *B, magma_int_t ldb, magma_int_t *info ) { magma_int_t num_gpus, ldda, lddb; *info = 0; if ( uplo != MagmaUpper && uplo != MagmaLower ) *info = -1; if ( n < 0 ) *info = -2; if ( nrhs < 0) *info = -3; if ( lda < max(1, n) ) *info = -5; if ( ldb < max(1, n) ) *info = -7; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if ( (n == 0) || (nrhs == 0) ) { return *info; } /* If single-GPU and allocation suceeds, use GPU interface. */ num_gpus = magma_num_gpus(); magmaFloatComplex *dA, *dB; if ( num_gpus > 1 ) { goto CPU_INTERFACE; } ldda = ((n+31)/32)*32; lddb = ldda; if ( MAGMA_SUCCESS != magma_cmalloc( &dA, ldda*n )) { goto CPU_INTERFACE; } if ( MAGMA_SUCCESS != magma_cmalloc( &dB, lddb*nrhs )) { magma_free( dA ); goto CPU_INTERFACE; } magma_csetmatrix( n, n, A, lda, dA, ldda ); magma_cpotrf_gpu( uplo, n, dA, ldda, info ); if ( *info == MAGMA_ERR_DEVICE_ALLOC ) { magma_free( dA ); magma_free( dB ); goto CPU_INTERFACE; } magma_cgetmatrix( n, n, dA, ldda, A, lda ); if ( *info == 0 ) { magma_csetmatrix( n, nrhs, B, ldb, dB, lddb ); magma_cpotrs_gpu( uplo, n, nrhs, dA, ldda, dB, lddb, info ); magma_cgetmatrix( n, nrhs, dB, lddb, B, ldb ); } magma_free( dA ); magma_free( dB ); return *info; CPU_INTERFACE: /* If multi-GPU or allocation failed, use CPU interface and LAPACK. * Faster to use LAPACK for potrs than to copy A to GPU. */ magma_cpotrf( uplo, n, A, lda, info ); if ( *info == 0 ) { lapackf77_cpotrs( lapack_uplo_const(uplo), &n, &nrhs, A, &lda, B, &ldb, info ); } return *info; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cpotrf */ int main( int argc, char** argv) { TESTING_CUDA_INIT(); magma_timestr_t start, end; float flops, gpu_perf, cpu_perf; cuFloatComplex *h_A, *h_R; cuFloatComplex *d_A; magma_int_t N = 0, n2, lda, ldda; magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112}; magma_int_t i, info; const char *uplo = MagmaUpperStr; cuFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float work[1], matnorm; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); } if (N>0) size[0] = size[9] = N; else exit(1); } else { printf("\nUsage: \n"); printf(" testing_cpotri_gpu -N %d\n\n", 1024); } /* Allocate host memory for the matrix */ n2 = size[9] * size[9]; ldda = ((size[9]+31)/32) * 32; TESTING_MALLOC( h_A, cuFloatComplex, n2); TESTING_HOSTALLOC( h_R, cuFloatComplex, n2); TESTING_DEVALLOC( d_A, cuFloatComplex, ldda*size[9] ); printf(" N CPU GFlop/s GPU GFlop/s ||R||_F / ||A||_F\n"); printf("========================================================\n"); for(i=0; i<10; i++){ N = size[i]; lda = N; n2 = lda*N; flops = FLOPS_CPOTRI( (float)N ) / 1000000; ldda = ((N+31)/32)*32; /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); /* Symmetrize and increase the diagonal */ { magma_int_t i, j; for(i=0; i<N; i++) { MAGMA_C_SET2REAL( h_A[i*lda+i], ( MAGMA_C_REAL(h_A[i*lda+i]) + 1.*N ) ); for(j=0; j<i; j++) h_A[i*lda+j] = cuConjf(h_A[j*lda+i]); } } lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ //cublasSetMatrix( N, N, sizeof(cuFloatComplex), h_A, lda, d_A, ldda); //magma_cpotrf_gpu(uplo[0], N, d_A, ldda, &info); /* factorize matrix */ magma_csetmatrix( N, N, h_A, lda, d_A, ldda ); magma_cpotrf_gpu(uplo[0], N, d_A, ldda, &info); // check for exact singularity //magma_cgetmatrix( N, N, d_A, ldda, h_R, lda ); //h_R[ 10 + 10*lda ] = MAGMA_C_MAKE( 0.0, 0.0 ); //magma_csetmatrix( N, N, h_R, lda, d_A, ldda ); start = get_current_time(); magma_cpotri_gpu(uplo[0], N, d_A, ldda, &info); end = get_current_time(); if (info != 0) printf("magma_cpotri_gpu returned error %d\n", (int) info); gpu_perf = flops / GetTimerValue(start, end); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_cpotrf(uplo, &N, h_A, &lda, &info); start = get_current_time(); lapackf77_cpotri(uplo, &N, h_A, &lda, &info); end = get_current_time(); if (info != 0) printf("lapackf77_cpotri returned error %d\n", (int) info); cpu_perf = flops / GetTimerValue(start, end); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_cgetmatrix( N, N, d_A, ldda, h_R, lda ); matnorm = lapackf77_clange("f", &N, &N, h_A, &lda, work); blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); printf("%5d %6.2f %6.2f %e\n", (int) size[i], cpu_perf, gpu_perf, lapackf77_clange("f", &N, &N, h_R, &lda, work) / matnorm); if (argc != 1) break; } /* Memory clean up */ TESTING_FREE( h_A ); TESTING_HOSTFREE( h_R ); TESTING_DEVFREE( d_A ); /* Shutdown */ TESTING_CUDA_FINALIZE(); }
/** Purpose ------- CHEGVX computes selected eigenvalues, and optionally, eigenvectors of a complex generalized Hermitian-definite eigenproblem, of the form A*x=(lambda)*B*x, A*Bx=(lambda)*x, or B*A*x=(lambda)*x. Here A and B are assumed to be Hermitian and B is also positive definite. Eigenvalues and eigenvectors can be selected by specifying either a range of values or a range of indices for the desired eigenvalues. Arguments --------- @param[in] itype INTEGER Specifies the problem type to be solved: = 1: A*x = (lambda)*B*x = 2: A*B*x = (lambda)*x = 3: B*A*x = (lambda)*x @param[in] jobz magma_vec_t - = MagmaNoVec: Compute eigenvalues only; - = MagmaVec: Compute eigenvalues and eigenvectors. @param[in] range magma_range_t - = MagmaRangeAll: all eigenvalues will be found. - = MagmaRangeV: all eigenvalues in the half-open interval (VL,VU] will be found. - = MagmaRangeI: the IL-th through IU-th eigenvalues will be found. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangles of A and B are stored; - = MagmaLower: Lower triangles of A and B are stored. @param[in] n INTEGER The order of the matrices A and B. N >= 0. @param[in,out] A COMPLEX array, dimension (LDA, N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. \n On exit, the lower triangle (if UPLO=MagmaLower) or the upper triangle (if UPLO=MagmaUpper) of A, including the diagonal, is destroyed. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[in,out] B COMPLEX array, dimension (LDB, N) On entry, the Hermitian matrix B. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of B contains the upper triangular part of the matrix B. If UPLO = MagmaLower, the leading N-by-N lower triangular part of B contains the lower triangular part of the matrix B. \n On exit, if INFO <= N, the part of B containing the matrix is overwritten by the triangular factor U or L from the Cholesky factorization B = U**H*U or B = L*L**H. @param[in] ldb INTEGER The leading dimension of the array B. LDB >= max(1,N). @param[in] vl REAL @param[in] vu REAL If RANGE=MagmaRangeV, the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = MagmaRangeAll or MagmaRangeI. @param[in] il INTEGER @param[in] iu INTEGER If RANGE=MagmaRangeI, the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = MagmaRangeAll or MagmaRangeV. @param[in] abstol REAL The absolute error tolerance for the eigenvalues. An approximate eigenvalue is accepted as converged when it is determined to lie in an interval [a,b] of width less than or equal to \n ABSTOL + EPS * max( |a|,|b| ), \n where EPS is the machine precision. If ABSTOL is less than or equal to zero, then EPS*|T| will be used in its place, where |T| is the 1-norm of the tridiagonal matrix obtained by reducing A to tridiagonal form. \n Eigenvalues will be computed most accurately when ABSTOL is set to twice the underflow threshold 2*SLAMCH('S'), not zero. If this routine returns with INFO > 0, indicating that some eigenvectors did not converge, try setting ABSTOL to 2*SLAMCH('S'). @param[out] m INTEGER The total number of eigenvalues found. 0 <= M <= N. If RANGE = MagmaRangeAll, M = N, and if RANGE = MagmaRangeI, M = IU-IL+1. @param[out] w REAL array, dimension (N) The first M elements contain the selected eigenvalues in ascending order. @param[out] Z COMPLEX array, dimension (LDZ, max(1,M)) If JOBZ = MagmaNoVec, then Z is not referenced. If JOBZ = MagmaVec, then if INFO = 0, the first M columns of Z contain the orthonormal eigenvectors of the matrix A corresponding to the selected eigenvalues, with the i-th column of Z holding the eigenvector associated with W(i). The eigenvectors are normalized as follows: if ITYPE = 1 or 2, Z**T*B*Z = I; if ITYPE = 3, Z**T*inv(B)*Z = I. \n If an eigenvector fails to converge, then that column of Z contains the latest approximation to the eigenvector, and the index of the eigenvector is returned in IFAIL. Note: the user must ensure that at least max(1,M) columns are supplied in the array Z; if RANGE = MagmaRangeV, the exact value of M is not known in advance and an upper bound must be used. @param[in] ldz INTEGER The leading dimension of the array Z. LDZ >= 1, and if JOBZ = MagmaVec, LDZ >= max(1,N). @param[out] work (workspace) COMPLEX array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. LWORK >= max(1,2*N). For optimal efficiency, LWORK >= (NB+1)*N, where NB is the blocksize for CHETRD returned by ILAENV. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param rwork (workspace) REAL array, dimension (7*N) @param iwork (workspace) INTEGER array, dimension (5*N) @param[out] ifail INTEGER array, dimension (N) If JOBZ = MagmaVec, then if INFO = 0, the first M elements of IFAIL are zero. If INFO > 0, then IFAIL contains the indices of the eigenvectors that failed to converge. If JOBZ = MagmaNoVec, then IFAIL is not referenced. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: CPOTRF or CHEEVX returned an error code: <= N: if INFO = i, CHEEVX failed to converge; i eigenvectors failed to converge. Their indices are stored in array IFAIL. > N: if INFO = N + i, for 1 <= i <= N, then the leading minor of order i of B is not positive definite. The factorization of B could not be completed and no eigenvalues or eigenvectors were computed. Further Details --------------- Based on contributions by Mark Fahey, Department of Mathematics, Univ. of Kentucky, USA @ingroup magma_chegv_driver ********************************************************************/ extern "C" magma_int_t magma_chegvx( magma_int_t itype, magma_vec_t jobz, magma_range_t range, magma_uplo_t uplo, magma_int_t n, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *B, magma_int_t ldb, float vl, float vu, magma_int_t il, magma_int_t iu, float abstol, magma_int_t *m, float *w, magmaFloatComplex *Z, magma_int_t ldz, magmaFloatComplex *work, magma_int_t lwork, float *rwork, magma_int_t *iwork, magma_int_t *ifail, magma_int_t *info) { magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex *dA; magmaFloatComplex *dB; magmaFloatComplex *dZ; magma_int_t ldda = n; magma_int_t lddb = n; magma_int_t lddz = n; magma_int_t lower; magma_trans_t trans; magma_int_t wantz; magma_int_t lquery; magma_int_t alleig, valeig, indeig; magma_int_t lwmin; magma_queue_t stream; magma_queue_create( &stream ); wantz = (jobz == MagmaVec); lower = (uplo == MagmaLower); alleig = (range == MagmaRangeAll); valeig = (range == MagmaRangeV); indeig = (range == MagmaRangeI); lquery = (lwork == -1); *info = 0; if (itype < 1 || itype > 3) { *info = -1; } else if (! (alleig || valeig || indeig)) { *info = -2; } else if (! (wantz || (jobz == MagmaNoVec))) { *info = -3; } else if (! (lower || (uplo == MagmaUpper))) { *info = -4; } else if (n < 0) { *info = -5; } else if (lda < max(1,n)) { *info = -7; } else if (ldb < max(1,n)) { *info = -9; } else if (ldz < 1 || (wantz && ldz < n)) { *info = -18; } else { if (valeig) { if (n > 0 && vu <= vl) { *info = -11; } } else if (indeig) { if (il < 1 || il > max(1,n)) { *info = -12; } else if (iu < min(n,il) || iu > n) { *info = -13; } } } magma_int_t nb = magma_get_chetrd_nb(n); lwmin = n * (nb + 1); work[0] = MAGMA_C_MAKE( lwmin, 0 ); if (lwork < lwmin && ! lquery) { *info = -20; } if (*info != 0) { magma_xerbla( __func__, -(*info)); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } if (MAGMA_SUCCESS != magma_cmalloc( &dA, n*ldda ) || MAGMA_SUCCESS != magma_cmalloc( &dB, n*lddb ) || MAGMA_SUCCESS != magma_cmalloc( &dZ, n*lddz )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* Form a Cholesky factorization of B. */ magma_csetmatrix( n, n, B, ldb, dB, lddb ); magma_csetmatrix_async( n, n, A, lda, dA, ldda, stream ); magma_cpotrf_gpu(uplo, n, dB, lddb, info); if (*info != 0) { *info = n + *info; return *info; } magma_queue_sync( stream ); magma_cgetmatrix_async( n, n, dB, lddb, B, ldb, stream ); /* Transform problem to standard eigenvalue problem and solve. */ magma_chegst_gpu(itype, uplo, n, dA, ldda, dB, lddb, info); magma_cheevx_gpu(jobz, range, uplo, n, dA, ldda, vl, vu, il, iu, abstol, m, w, dZ, lddz, A, lda, Z, ldz, work, lwork, rwork, iwork, ifail, info); if (wantz && *info == 0) { /* Backtransform eigenvectors to the original problem. */ if (itype == 1 || itype == 2) { /* For A*x=(lambda)*B*x and A*B*x=(lambda)*x; backtransform eigenvectors: x = inv(L)'*y or inv(U)*y */ if (lower) { trans = MagmaConjTrans; } else { trans = MagmaNoTrans; } magma_ctrsm(MagmaLeft, uplo, trans, MagmaNonUnit, n, *m, c_one, dB, lddb, dZ, lddz); } else if (itype == 3) { /* For B*A*x=(lambda)*x; backtransform eigenvectors: x = L*y or U'*y */ if (lower) { trans = MagmaNoTrans; } else { trans = MagmaConjTrans; } magma_ctrmm(MagmaLeft, uplo, trans, MagmaNonUnit, n, *m, c_one, dB, lddb, dZ, lddz); } magma_cgetmatrix( n, *m, dZ, lddz, Z, ldz ); } magma_queue_sync( stream ); magma_queue_destroy( stream ); magma_free( dA ); magma_free( dB ); magma_free( dZ ); return *info; } /* magma_chegvx */
int main( int argc, char** argv ) { magma_init(); cublasHandle_t handle; cudaSetDevice( 0 ); cublasCreate( &handle ); magmaFloatComplex *A, *B, *C; magmaFloatComplex *dA, *dB, *dC; float error, work[1]; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = { 1, 2, 3, 4 }; magma_int_t n = 10; magma_int_t lda = n; magma_int_t ldda = ((n+31)/32)*32; magma_int_t size = lda*n; magma_int_t info; magma_cmalloc_cpu( &A, lda*n ); magma_cmalloc_cpu( &B, lda*n ); magma_cmalloc_cpu( &C, lda*n ); magma_cmalloc( &dA, ldda*n ); magma_cmalloc( &dB, ldda*n ); magma_cmalloc( &dC, ldda*n ); // initialize matrices lapackf77_clarnv( &ione, ISEED, &size, A ); lapackf77_clarnv( &ione, ISEED, &size, B ); lapackf77_clarnv( &ione, ISEED, &size, C ); // increase diagonal to be SPD for( int i=0; i < n; ++i ) { C[i+i*lda] = MAGMA_C_ADD( C[i+i*lda], MAGMA_C_MAKE( n*n, 0 )); } magma_csetmatrix( n, n, A, lda, dA, ldda ); magma_csetmatrix( n, n, B, lda, dB, ldda ); magma_csetmatrix( n, n, C, lda, dC, ldda ); // compute with cublas cublasCgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, &c_neg_one, dA, ldda, dB, ldda, &c_one, dC, ldda ); magma_cpotrf_gpu( MagmaLower, n, dC, ldda, &info ); if (info != 0) printf("magma_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); // compute with LAPACK blasf77_cgemm( MagmaNoTransStr, MagmaNoTransStr, &n, &n, &n, &c_neg_one, A, &lda, B, &lda, &c_one, C, &lda ); lapackf77_cpotrf( MagmaLowerStr, &n, C, &lda, &info ); if (info != 0) printf("lapackf77_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); // compute difference magma_cgetmatrix( n, n, dC, ldda, A, lda ); blasf77_caxpy( &size, &c_neg_one, C, &ione, A, &ione ); error = lapackf77_clange( "F", &n, &n, A, &lda, work ); printf( "n %d, error %8.2e\n", (int) n, error ); magma_free( dA ); magma_free( dB ); magma_free( dC ); magma_free_cpu( A ); magma_free_cpu( B ); magma_free_cpu( C ); cublasDestroy( handle ); magma_finalize(); return 0; }
int main( int argc, char** argv ) { magma_init(); cublasHandle_t handle; cudaSetDevice( 0 ); cublasCreate( &handle ); magmaFloatComplex *A, *B, *C; magmaFloatComplex *dA, *dB, *dC; float error, work[1]; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = { 1, 2, 3, 4 }; magma_int_t n, lda, ldda, size, info; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf(" N |dC - C|/|C|\n"); printf("====================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { // for this simple case, all matrices are N-by-N n = opts.nsize[itest]; lda = n; ldda = ((n+31)/32)*32; magma_cmalloc_cpu( &A, lda*n ); magma_cmalloc_cpu( &B, lda*n ); magma_cmalloc_cpu( &C, lda*n ); magma_cmalloc( &dA, ldda*n ); magma_cmalloc( &dB, ldda*n ); magma_cmalloc( &dC, ldda*n ); // initialize matrices size = lda*n; lapackf77_clarnv( &ione, ISEED, &size, A ); lapackf77_clarnv( &ione, ISEED, &size, B ); lapackf77_clarnv( &ione, ISEED, &size, C ); // increase diagonal to be SPD for( int i=0; i < n; ++i ) { C[i+i*lda] = MAGMA_C_ADD( C[i+i*lda], MAGMA_C_MAKE( n*n, 0 )); } magma_csetmatrix( n, n, A, lda, dA, ldda ); magma_csetmatrix( n, n, B, lda, dB, ldda ); magma_csetmatrix( n, n, C, lda, dC, ldda ); // compute with cublas cublasCgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, &c_neg_one, dA, ldda, dB, ldda, &c_one, dC, ldda ); magma_cpotrf_gpu( MagmaLower, n, dC, ldda, &info ); if (info != 0) printf("magma_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); // compute with LAPACK blasf77_cgemm( MagmaNoTransStr, MagmaNoTransStr, &n, &n, &n, &c_neg_one, A, &lda, B, &lda, &c_one, C, &lda ); lapackf77_cpotrf( MagmaLowerStr, &n, C, &lda, &info ); if (info != 0) printf("lapackf77_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); // compute difference, |dC - C| / |C| magma_cgetmatrix( n, n, dC, ldda, A, lda ); blasf77_caxpy( &size, &c_neg_one, C, &ione, A, &ione ); error = lapackf77_clange( "F", &n, &n, A, &lda, work ) / lapackf77_clange( "F", &n, &n, C, &lda, work ); printf( "%5d %8.2e %s\n", (int) n, error, (error < tol ? "ok" : "failed")); status += ! (error < tol); magma_free( dA ); magma_free( dB ); magma_free( dC ); magma_free_cpu( A ); magma_free_cpu( B ); magma_free_cpu( C ); fflush( stdout ); } } cublasDestroy( handle ); magma_finalize(); return status; }
extern "C" magma_int_t magma_chegvr(magma_int_t itype, char jobz, char range, char uplo, magma_int_t n, magmaFloatComplex *a, magma_int_t lda, magmaFloatComplex *b, magma_int_t ldb, float vl, float vu, magma_int_t il, magma_int_t iu, float abstol, magma_int_t *m, float *w, magmaFloatComplex *z, magma_int_t ldz, magma_int_t *isuppz, magmaFloatComplex *work, magma_int_t lwork, float *rwork, magma_int_t lrwork, magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= CHEGVR computes all the eigenvalues, and optionally, the eigenvectors of a complex generalized Hermitian-definite eigenproblem, of the form A*x=(lambda)*B*x, A*Bx=(lambda)*x, or B*A*x=(lambda)*x. Here A and B are assumed to be Hermitian and B is also positive definite. Whenever possible, CHEEVR calls CSTEGR to compute the eigenspectrum using Relatively Robust Representations. CSTEGR computes eigenvalues by the dqds algorithm, while orthogonal eigenvectors are computed from various "good" L D L^T representations (also known as Relatively Robust Representations). Gram-Schmidt orthogonalization is avoided as far as possible. More specifically, the various steps of the algorithm are as follows. For the i-th unreduced block of T, (a) Compute T - sigma_i = L_i D_i L_i^T, such that L_i D_i L_i^T is a relatively robust representation, (b) Compute the eigenvalues, lambda_j, of L_i D_i L_i^T to high relative accuracy by the dqds algorithm, (c) If there is a cluster of close eigenvalues, "choose" sigma_i close to the cluster, and go to step (a), (d) Given the approximate eigenvalue lambda_j of L_i D_i L_i^T, compute the corresponding eigenvector by forming a rank-revealing twisted factorization. The desired accuracy of the output can be specified by the input parameter ABSTOL. For more details, see "A new O(n^2) algorithm for the symmetric tridiagonal eigenvalue/eigenvector problem", by Inderjit Dhillon, Computer Science Division Technical Report No. UCB//CSD-97-971, UC Berkeley, May 1997. Note 1 : CHEEVR calls CSTEGR when the full spectrum is requested on machines which conform to the ieee-754 floating point standard. CHEEVR calls SSTEBZ and CSTEIN on non-ieee machines and when partial spectrum requests are made. Normal execution of CSTEGR may create NaNs and infinities and hence may abort due to a floating point exception in environments which do not handle NaNs and infinities in the ieee standard default manner. Arguments ========= ITYPE (input) INTEGER Specifies the problem type to be solved: = 1: A*x = (lambda)*B*x = 2: A*B*x = (lambda)*x = 3: B*A*x = (lambda)*x RANGE (input) CHARACTER*1 = 'A': all eigenvalues will be found. = 'V': all eigenvalues in the half-open interval (VL,VU] will be found. = 'I': the IL-th through IU-th eigenvalues will be found. JOBZ (input) CHARACTER*1 = 'N': Compute eigenvalues only; = 'V': Compute eigenvalues and eigenvectors. UPLO (input) CHARACTER*1 = 'U': Upper triangles of A and B are stored; = 'L': Lower triangles of A and B are stored. N (input) INTEGER The order of the matrices A and B. N >= 0. A (input/output) COMPLEX array, dimension (LDA, N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. On exit, if JOBZ = 'V', then if INFO = 0, A contains the matrix Z of eigenvectors. The eigenvectors are normalized as follows: if ITYPE = 1 or 2, Z**H*B*Z = I; if ITYPE = 3, Z**H*inv(B)*Z = I. If JOBZ = 'N', then on exit the upper triangle (if UPLO='U') or the lower triangle (if UPLO='L') of A, including the diagonal, is destroyed. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). B (input/output) COMPLEX array, dimension (LDB, N) On entry, the Hermitian matrix B. If UPLO = 'U', the leading N-by-N upper triangular part of B contains the upper triangular part of the matrix B. If UPLO = 'L', the leading N-by-N lower triangular part of B contains the lower triangular part of the matrix B. On exit, if INFO <= N, the part of B containing the matrix is overwritten by the triangular factor U or L from the Cholesky factorization B = U**H*U or B = L*L**H. LDB (input) INTEGER The leading dimension of the array B. LDB >= max(1,N). VL (input) REAL VU (input) REAL If RANGE='V', the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = 'A' or 'I'. IL (input) INTEGER IU (input) INTEGER If RANGE='I', the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = 'A' or 'V'. ABSTOL (input) REAL The absolute error tolerance for the eigenvalues. An approximate eigenvalue is accepted as converged when it is determined to lie in an interval [a,b] of width less than or equal to ABSTOL + EPS * max( |a|,|b| ) , where EPS is the machine precision. If ABSTOL is less than or equal to zero, then EPS*|T| will be used in its place, where |T| is the 1-norm of the tridiagonal matrix obtained by reducing A to tridiagonal form. See "Computing Small Singular Values of Bidiagonal Matrices with Guaranteed High Relative Accuracy," by Demmel and Kahan, LAPACK Working Note #3. If high relative accuracy is important, set ABSTOL to SLAMCH( 'Safe minimum' ). Doing so will guarantee that eigenvalues are computed to high relative accuracy when possible in future releases. The current code does not make any guarantees about high relative accuracy, but furutre releases will. See J. Barlow and J. Demmel, "Computing Accurate Eigensystems of Scaled Diagonally Dominant Matrices", LAPACK Working Note #7, for a discussion of which matrices define their eigenvalues to high relative accuracy. M (output) INTEGER The total number of eigenvalues found. 0 <= M <= N. If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1. W (output) REAL array, dimension (N) If INFO = 0, the eigenvalues in ascending order. Z (output) COMPLEX array, dimension (LDZ, max(1,M)) If JOBZ = 'V', then if INFO = 0, the first M columns of Z contain the orthonormal eigenvectors of the matrix A corresponding to the selected eigenvalues, with the i-th column of Z holding the eigenvector associated with W(i). If JOBZ = 'N', then Z is not referenced. Note: the user must ensure that at least max(1,M) columns are supplied in the array Z; if RANGE = 'V', the exact value of M is not known in advance and an upper bound must be used. LDZ (input) INTEGER The leading dimension of the array Z. LDZ >= 1, and if JOBZ = 'V', LDZ >= max(1,N). ISUPPZ (output) INTEGER ARRAY, dimension ( 2*max(1,M) ) The support of the eigenvectors in Z, i.e., the indices indicating the nonzero elements in Z. The i-th eigenvector is nonzero only in elements ISUPPZ( 2*i-1 ) through ISUPPZ( 2*i ). ********* Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1 WORK (workspace/output) COMPLEX array, dimension (LWORK) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The length of the array WORK. LWORK >= max(1,2*N). For optimal efficiency, LWORK >= (NB+1)*N, where NB is the max of the blocksize for CHETRD and for CUNMTR as returned by ILAENV. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. RWORK (workspace/output) REAL array, dimension (LRWORK) On exit, if INFO = 0, RWORK(1) returns the optimal (and minimal) LRWORK. LRWORK (input) INTEGER The length of the array RWORK. LRWORK >= max(1,24*N). If LRWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the RWORK array, returns this value as the first entry of the RWORK array, and no error message related to LRWORK is issued by XERBLA. IWORK (workspace/output) INTEGER array, dimension (LIWORK) On exit, if INFO = 0, IWORK(1) returns the optimal (and minimal) LIWORK. LIWORK (input) INTEGER The dimension of the array IWORK. LIWORK >= max(1,10*N). If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the IWORK array, returns this value as the first entry of the IWORK array, and no error message related to LIWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: Internal error Further Details =============== Based on contributions by Inderjit Dhillon, IBM Almaden, USA Osni Marques, LBNL/NERSC, USA Ken Stanley, Computer Science Division, University of California at Berkeley, USA ===================================================================== */ char uplo_[2] = {uplo, 0}; char jobz_[2] = {jobz, 0}; char range_[2] = {range, 0}; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex *da; magmaFloatComplex *db; magmaFloatComplex *dz; magma_int_t ldda = n; magma_int_t lddb = n; magma_int_t lddz = n; magma_int_t lower; char trans[1]; magma_int_t wantz; magma_int_t lquery; magma_int_t alleig, valeig, indeig; magma_int_t lwmin, lrwmin, liwmin; magma_queue_t stream; magma_queue_create( &stream ); wantz = lapackf77_lsame(jobz_, MagmaVecStr); lower = lapackf77_lsame(uplo_, MagmaLowerStr); alleig = lapackf77_lsame(range_, "A"); valeig = lapackf77_lsame(range_, "V"); indeig = lapackf77_lsame(range_, "I"); lquery = lwork == -1; *info = 0; if (itype < 1 || itype > 3) { *info = -1; } else if (! (alleig || valeig || indeig)) { *info = -2; } else if (! (wantz || lapackf77_lsame(jobz_, MagmaNoVecStr))) { *info = -3; } else if (! (lower || lapackf77_lsame(uplo_, MagmaUpperStr))) { *info = -4; } else if (n < 0) { *info = -5; } else if (lda < max(1,n)) { *info = -7; } else if (ldb < max(1,n)) { *info = -9; } else if (ldz < 1 || (wantz && ldz < n)) { *info = -18; } else { if (valeig) { if (n > 0 && vu <= vl) { *info = -11; } } else if (indeig) { if (il < 1 || il > max(1,n)) { *info = -12; } else if (iu < min(n,il) || iu > n) { *info = -13; } } } magma_int_t nb = magma_get_chetrd_nb(n); lwmin = n * (nb + 1); lrwmin = 24 * n; liwmin = 10 * n; work[0] = MAGMA_C_MAKE( lwmin, 0 ); rwork[0] = lrwmin; iwork[0] = liwmin; if (lwork < lwmin && ! lquery) { *info = -21; } else if ((lrwork < lrwmin) && ! lquery) { *info = -23; } else if ((liwork < liwmin) && ! lquery) { *info = -25; } if (*info != 0) { magma_xerbla( __func__, -(*info)); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } if (MAGMA_SUCCESS != magma_cmalloc( &da, n*ldda ) || MAGMA_SUCCESS != magma_cmalloc( &db, n*lddb ) || MAGMA_SUCCESS != magma_cmalloc( &dz, n*lddz )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* Form a Cholesky factorization of B. */ magma_csetmatrix( n, n, b, ldb, db, lddb ); magma_csetmatrix_async( n, n, a, lda, da, ldda, stream ); magma_cpotrf_gpu(uplo_[0], n, db, lddb, info); if (*info != 0) { *info = n + *info; return *info; } magma_queue_sync( stream ); magma_cgetmatrix_async( n, n, db, lddb, b, ldb, stream ); /* Transform problem to standard eigenvalue problem and solve. */ magma_chegst_gpu(itype, uplo, n, da, ldda, db, lddb, info); magma_cheevr_gpu(jobz, range, uplo, n, da, ldda, vl, vu, il, iu, abstol, m, w, dz, lddz, isuppz, a, lda, z, ldz, work, lwork, rwork, lrwork, iwork, liwork, info); if (wantz && *info == 0) { /* Backtransform eigenvectors to the original problem. */ if (itype == 1 || itype == 2) { /* For A*x=(lambda)*B*x and A*B*x=(lambda)*x; backtransform eigenvectors: x = inv(L)'*y or inv(U)*y */ if (lower) { *(unsigned char *)trans = MagmaConjTrans; } else { *(unsigned char *)trans = MagmaNoTrans; } magma_ctrsm(MagmaLeft, uplo, *trans, MagmaNonUnit, n, *m, c_one, db, lddb, dz, lddz); } else if (itype == 3) { /* For B*A*x=(lambda)*x; backtransform eigenvectors: x = L*y or U'*y */ if (lower) { *(unsigned char *)trans = MagmaNoTrans; } else { *(unsigned char *)trans = MagmaConjTrans; } magma_ctrmm(MagmaLeft, uplo, *trans, MagmaNonUnit, n, *m, c_one, db, lddb, dz, lddz); } magma_cgetmatrix( n, *m, dz, lddz, z, ldz ); } magma_queue_sync( stream ); magma_queue_destroy( stream ); magma_free( da ); magma_free( db ); magma_free( dz ); return *info; } /* chegvr */
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; magmaFloatComplex *hA, *hR; magmaFloatComplex_ptr dA; magma_int_t N = 0, n2, lda, ldda; magma_int_t size[10] = { 1024, 2048, 3072, 4032, 5184, 6048, 7200, 8064, 8928, 10560 }; magma_int_t i, info; magmaFloatComplex mz_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float work[1], matnorm, diffnorm; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); } if (N>0) size[0] = size[9] = N; else exit(1); } else { printf("\nUsage: \n"); printf(" testing_cpotrf_gpu -N %d\n\n", 1024); } /* Initialize */ magma_queue_t queue; magma_device_t device; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( &device, 1, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device, &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } /* Allocate memory for the largest matrix */ N = size[9]; n2 = N * N; ldda = ((N+31)/32) * 32; TESTING_MALLOC( hA, magmaFloatComplex, n2 ); TESTING_MALLOC_HOST( hR, magmaFloatComplex, n2 ); TESTING_MALLOC_DEV( dA, magmaFloatComplex, ldda*N ); printf("\n\n"); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma-R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================================================\n"); for(i=0; i<10; i++){ N = size[i]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; gflops = FLOPS( (float)N ) * 1e-9; /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, hA ); /* Symmetrize and increase the diagonal */ for( int i = 0; i < N; ++i ) { MAGMA_C_SET2REAL( hA(i,i), MAGMA_C_REAL(hA(i,i)) + N ); for( int j = 0; j < i; ++j ) { hA(i, j) = MAGMA_C_CNJG( hA(j,i) ); } } lapackf77_clacpy( MagmaFullStr, &N, &N, hA, &lda, hR, &lda ); /* Warm up to measure the performance */ magma_csetmatrix( N, N, hA, 0, lda, dA, 0, ldda, queue ); magma_cpotrf_gpu( MagmaUpper, N, dA, 0, ldda, &info, queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_csetmatrix( N, N, hA, 0, lda, dA, 0, ldda, queue ); gpu_time = get_time(); magma_cpotrf_gpu( MagmaUpper, N, dA, 0, ldda, &info, queue ); gpu_time = get_time() - gpu_time; if (info != 0) printf( "magma_cpotrf had error %d.\n", info ); gpu_perf = gflops / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = get_time(); lapackf77_cpotrf( MagmaUpperStr, &N, hA, &lda, &info ); cpu_time = get_time() - cpu_time; if (info != 0) printf( "lapackf77_cpotrf had error %d.\n", info ); cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result compared to LAPACK |R_magma - R_lapack| / |R_lapack| =================================================================== */ magma_cgetmatrix( N, N, dA, 0, ldda, hR, 0, lda, queue ); matnorm = lapackf77_clange("f", &N, &N, hA, &lda, work); blasf77_caxpy(&n2, &mz_one, hA, &ione, hR, &ione); diffnorm = lapackf77_clange("f", &N, &N, hR, &lda, work); printf( "%5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", N, cpu_perf, cpu_time, gpu_perf, gpu_time, diffnorm / matnorm ); if (argc != 1) break; } /* clean up */ TESTING_FREE( hA ); TESTING_FREE_HOST( hR ); TESTING_FREE_DEV( dA ); magma_queue_destroy( queue ); magma_finalize(); }
/***************************************************************************//** Purpose ------- CPOSV computes the solution to a complex system of linear equations A * X = B, where A is an N-by-N Hermitian positive definite matrix and X and B are N-by-NRHS matrices. The Cholesky decomposition is used to factor A as A = U**H * U, if UPLO = MagmaUpper, or A = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is a lower triangular matrix. The factored form of A is then used to solve the system of equations A * X = B. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization A = U**H*U or A = L*L**H. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[in,out] B COMPLEX array, dimension (LDB,NRHS) On entry, the right hand side matrix B. On exit, the solution matrix X. @param[in] ldb INTEGER The leading dimension of the array B. LDB >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_posv *******************************************************************************/ extern "C" magma_int_t magma_cposv( magma_uplo_t uplo, magma_int_t n, magma_int_t nrhs, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *B, magma_int_t ldb, magma_int_t *info ) { #ifdef HAVE_clBLAS #define dA(i_, j_) dA, ((i_) + (j_)*ldda) #define dB(i_, j_) dB, ((i_) + (j_)*lddb) #else #define dA(i_, j_) (dA + (i_) + (j_)*ldda) #define dB(i_, j_) (dB + (i_) + (j_)*lddb) #endif magma_int_t ngpu, ldda, lddb; magma_queue_t queue = NULL; magma_device_t cdev; *info = 0; if ( uplo != MagmaUpper && uplo != MagmaLower ) *info = -1; if ( n < 0 ) *info = -2; if ( nrhs < 0) *info = -3; if ( lda < max(1, n) ) *info = -5; if ( ldb < max(1, n) ) *info = -7; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (n == 0 || nrhs == 0) { return *info; } /* If single-GPU and allocation suceeds, use GPU interface. */ ngpu = magma_num_gpus(); magmaFloatComplex_ptr dA, dB; if ( ngpu > 1 ) { goto CPU_INTERFACE; } ldda = magma_roundup( n, 32 ); lddb = ldda; if ( MAGMA_SUCCESS != magma_cmalloc( &dA, ldda*n )) { goto CPU_INTERFACE; } if ( MAGMA_SUCCESS != magma_cmalloc( &dB, lddb*nrhs )) { magma_free( dA ); goto CPU_INTERFACE; } magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); magma_csetmatrix( n, n, A, lda, dA(0,0), ldda, queue ); magma_cpotrf_gpu( uplo, n, dA(0,0), ldda, info ); if ( *info == MAGMA_ERR_DEVICE_ALLOC ) { magma_queue_destroy( queue ); magma_free( dA ); magma_free( dB ); goto CPU_INTERFACE; } magma_cgetmatrix( n, n, dA(0,0), ldda, A, lda, queue ); if ( *info == 0 ) { magma_csetmatrix( n, nrhs, B, ldb, dB(0,0), lddb, queue ); magma_cpotrs_gpu( uplo, n, nrhs, dA(0,0), ldda, dB(0,0), lddb, info ); magma_cgetmatrix( n, nrhs, dB(0,0), lddb, B, ldb, queue ); } magma_queue_destroy( queue ); magma_free( dA ); magma_free( dB ); return *info; CPU_INTERFACE: /* If multi-GPU or allocation failed, use CPU interface and LAPACK. * Faster to use LAPACK for potrs than to copy A to GPU. */ magma_cpotrf( uplo, n, A, lda, info ); if ( *info == 0 ) { lapackf77_cpotrs( lapack_uplo_const(uplo), &n, &nrhs, A, &lda, B, &ldb, info ); } return *info; }