/** Purpose ------- DSPOSV computes the solution to a real system of linear equations A * X = B, where A is an N-by-N symmetric positive definite matrix and X and B are N-by-NRHS matrices. DSPOSV first attempts to factorize the matrix in real SINGLE PRECISION and use this factorization within an iterative refinement procedure to produce a solution with real DOUBLE PRECISION norm-wise backward error quality (see below). If the approach fails the method switches to a real DOUBLE PRECISION factorization and solve. The iterative refinement is not going to be a winning strategy if the ratio real SINGLE PRECISION performance over real DOUBLE PRECISION performance is too small. A reasonable strategy should take the number of right-hand sides and the size of the matrix into account. This might be done with a call to ILAENV in the future. Up to now, we always try iterative refinement. The iterative refinement process is stopped if ITER > ITERMAX or for all the RHS we have: RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX where o ITER is the number of the current iteration in the iterative refinement process o RNRM is the infinity-norm of the residual o XNRM is the infinity-norm of the solution o ANRM is the infinity-operator-norm of the matrix A o EPS is the machine epsilon returned by DLAMCH('Epsilon') The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The number of linear equations, i.e., the order of the matrix A. N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in,out] dA DOUBLE PRECISION array on the GPU, dimension (LDDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if iterative refinement has been successfully used (INFO.EQ.0 and ITER.GE.0, see description below), then A is unchanged, if double factorization has been used (INFO.EQ.0 and ITER.LT.0, see description below), then the array dA contains the factor U or L from the Cholesky factorization A = U**T*U or A = L*L**T. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,N). @param[in] dB DOUBLE PRECISION array on the GPU, dimension (LDDB,NRHS) The N-by-NRHS right hand side matrix B. @param[in] lddb INTEGER The leading dimension of the array dB. LDDB >= max(1,N). @param[out] dX DOUBLE PRECISION array on the GPU, dimension (LDDX,NRHS) If INFO = 0, the N-by-NRHS solution matrix X. @param[in] lddx INTEGER The leading dimension of the array dX. LDDX >= max(1,N). @param dworkd (workspace) DOUBLE PRECISION array on the GPU, dimension (N*NRHS) This array is used to hold the residual vectors. @param dworks (workspace) SINGLE PRECISION array on the GPU, dimension (N*(N+NRHS)) This array is used to store the real single precision matrix and the right-hand sides or solutions in single precision. @param[out] iter INTEGER - < 0: iterative refinement has failed, double precision factorization has been performed + -1 : the routine fell back to full precision for implementation- or machine-specific reasons + -2 : narrowing the precision induced an overflow, the routine fell back to full precision + -3 : failure of SPOTRF + -31: stop the iterative refinement after the 30th iteration - > 0: iterative refinement has been successfully used. Returns the number of iterations @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, the leading minor of order i of (DOUBLE PRECISION) A is not positive definite, so the factorization could not be completed, and the solution has not been computed. @ingroup magma_dposv_driver ********************************************************************/ extern "C" magma_int_t magma_dsposv_gpu( magma_uplo_t uplo, magma_int_t n, magma_int_t nrhs, magmaDouble_ptr dA, magma_int_t ldda, magmaDouble_ptr dB, magma_int_t lddb, magmaDouble_ptr dX, magma_int_t lddx, magmaDouble_ptr dworkd, magmaFloat_ptr dworks, magma_int_t *iter, magma_int_t *info) { #define dB(i,j) (dB + (i) + (j)*lddb) #define dX(i,j) (dX + (i) + (j)*lddx) #define dR(i,j) (dR + (i) + (j)*lddr) #define dSX(i,j) (dSX + (i) + (j)*lddsx) // Constants const double BWDMAX = 1.0; const magma_int_t ITERMAX = 30; const double c_neg_one = MAGMA_D_NEG_ONE; const double c_one = MAGMA_D_ONE; const magma_int_t ione = 1; // Local variables magmaDouble_ptr dR; magmaFloat_ptr dSA, dSX; double Xnrmv, Rnrmv; double Anrm, Xnrm, Rnrm, cte, eps; magma_int_t i, j, iiter, lddsa, lddsx, lddr; /* Check arguments */ *iter = 0; *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -2; else if ( ldda < max(1,n)) *info = -4; else if ( lddb < max(1,n)) *info = -7; else if ( lddx < max(1,n)) *info = -9; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if ( n == 0 || nrhs == 0 ) return *info; lddsa = n; lddsx = n; lddr = n; dSA = dworks; dSX = dSA + lddsa*n; dR = dworkd; magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); eps = lapackf77_dlamch("Epsilon"); Anrm = magmablas_dlansy( MagmaInfNorm, uplo, n, dA, ldda, (double*)dworkd, n*nrhs, queue ); cte = Anrm * eps * magma_dsqrt( n ) * BWDMAX; /* * Convert to single precision */ magmablas_dlag2s( n, nrhs, dB, lddb, dSX, lddsx, queue, info ); if (*info != 0) { *iter = -2; goto fallback; } magmablas_dlat2s( uplo, n, dA, ldda, dSA, lddsa, queue, info ); if (*info != 0) { *iter = -2; goto fallback; } // factor dSA in single precision magma_spotrf_gpu( uplo, n, dSA, lddsa, info ); if (*info != 0) { *iter = -3; goto fallback; } // solve dSA*dSX = dB in single precision magma_spotrs_gpu( uplo, n, nrhs, dSA, lddsa, dSX, lddsx, info ); // residual dR = dB - dA*dX in double precision magmablas_slag2d( n, nrhs, dSX, lddsx, dX, lddx, queue, info ); magmablas_dlacpy( MagmaFull, n, nrhs, dB, lddb, dR, lddr, queue ); if ( nrhs == 1 ) { magma_dsymv( uplo, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1, queue ); } else { magma_dsymm( MagmaLeft, uplo, n, nrhs, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr, queue ); } // TODO: use MAGMA_D_ABS( dX(i,j) ) instead of dlange? for( j=0; j < nrhs; j++ ) { i = magma_idamax( n, dX(0,j), 1, queue ) - 1; magma_dgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1, queue ); Xnrm = lapackf77_dlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_idamax( n, dR(0,j), 1, queue ) - 1; magma_dgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1, queue ); Rnrm = lapackf77_dlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto refinement; } } *iter = 0; goto cleanup; //return *info; refinement: for( iiter=1; iiter < ITERMAX; ) { *info = 0; // convert residual dR to single precision dSX magmablas_dlag2s( n, nrhs, dR, lddr, dSX, lddsx, queue, info ); if (*info != 0) { *iter = -2; goto fallback; } // solve dSA*dSX = R in single precision magma_spotrs_gpu( uplo, n, nrhs, dSA, lddsa, dSX, lddsx, info ); // Add correction and setup residual // dX += dSX [including conversion] --and-- // dR = dB for( j=0; j < nrhs; j++ ) { magmablas_dsaxpycp( n, dSX(0,j), dX(0,j), dB(0,j), dR(0,j), queue ); } // residual dR = dB - dA*dX in double precision if ( nrhs == 1 ) { magma_dsymv( uplo, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1, queue ); } else { magma_dsymm( MagmaLeft, uplo, n, nrhs, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr, queue ); } // TODO: use MAGMA_D_ABS( dX(i,j) ) instead of dlange? /* Check whether the nrhs normwise backward errors satisfy the * stopping criterion. If yes, set ITER=IITER > 0 and return. */ for( j=0; j < nrhs; j++ ) { i = magma_idamax( n, dX(0,j), 1, queue ) - 1; magma_dgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1, queue ); Xnrm = lapackf77_dlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_idamax( n, dR(0,j), 1, queue ) - 1; magma_dgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1, queue ); Rnrm = lapackf77_dlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto L20; } } /* If we are here, the nrhs normwise backward errors satisfy * the stopping criterion, we are good to exit. */ *iter = iiter; goto cleanup; //return *info; L20: iiter++; } /* If we are at this place of the code, this is because we have * performed ITER=ITERMAX iterations and never satisified the * stopping criterion. Set up the ITER flag accordingly and follow * up on double precision routine. */ *iter = -ITERMAX - 1; fallback: /* Single-precision iterative refinement failed to converge to a * satisfactory solution, so we resort to double precision. */ magma_dpotrf_gpu( uplo, n, dA, ldda, info ); if (*info == 0) { magmablas_dlacpy( MagmaFull, n, nrhs, dB, lddb, dX, lddx, queue ); magma_dpotrs_gpu( uplo, n, nrhs, dA, ldda, dX, lddx, info ); } cleanup: magma_queue_destroy( queue ); return *info; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dlat2s and slat2d */ int main( int argc, char** argv ) { #define A(i_,j_) ( A + (i_) + (j_)*lda) #define SA(i_,j_) (SA + (i_) + (j_)*lda) TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; float serror, swork[1]; double c_neg_one = MAGMA_D_NEG_ONE; float s_neg_one = MAGMA_S_NEG_ONE; magma_int_t ione = 1; magma_int_t n, lda, ldda, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; float *SA, *SR; double *A, *R; float *dSA; double *dA; magma_opts opts; parse_opts( argc, argv, &opts ); magma_uplo_t uplo[] = { MagmaLower, MagmaUpper }; printf("func uplo N CPU GB/s (ms) GPU GB/s (ms) ||R||_F\n"); printf("=====================================================================\n"); for( int iuplo = 0; iuplo < 2; ++iuplo ) { for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { n = opts.nsize[itest]; lda = n; ldda = ((n+31)/32)*32; // 0.5*(n+1)*n double-real loads and 0.5*(n+1)*n single-real stores (and vice-versa for slat2d) gbytes = (real_Double_t) 0.5*(n+1)*n * (sizeof(double) + sizeof(float)) / 1e9; size = ldda*n; // ldda >= lda TESTING_MALLOC_CPU( SA, float, size ); TESTING_MALLOC_CPU( A, double, size ); TESTING_MALLOC_CPU( SR, float, size ); TESTING_MALLOC_CPU( R, double, size ); TESTING_MALLOC_DEV( dSA, float, size ); TESTING_MALLOC_DEV( dA, double, size ); lapackf77_dlarnv( &ione, ISEED, &size, A ); lapackf77_slarnv( &ione, ISEED, &size, SA ); magma_dsetmatrix( n, n, A, lda, dA, ldda ); magma_ssetmatrix( n, n, SA, lda, dSA, ldda ); /* ===================================================================== Performs operation using LAPACK dlat2s =================================================================== */ info = 0; cpu_time = magma_wtime(); lapackf77_dlat2s( lapack_uplo_const(uplo[iuplo]), &n, A, &lda, SA, &lda, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; if (info != 0) printf("lapackf77_dlat2s returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ==================================================================== Performs operation using MAGMA dlat2s =================================================================== */ gpu_time = magma_sync_wtime(0); magmablas_dlat2s( uplo[iuplo], n, dA, ldda, dSA, ldda, &info ); gpu_time = magma_sync_wtime(0) - gpu_time; gpu_perf = gbytes / gpu_time; if (info != 0) printf("magmablas_dlat2s returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_sgetmatrix( n, n, dSA, ldda, SR, lda ); if ( opts.verbose ) { printf( "A= " ); magma_dprint( n, n, A, lda ); printf( "SA= " ); magma_sprint( n, n, SA, lda ); printf( "dA= " ); magma_dprint_gpu( n, n, dA, ldda ); printf( "dSA=" ); magma_sprint_gpu( n, n, dSA, ldda ); } /* ===================================================================== compute error |SA_magma - SA_lapack| should be zero if both are IEEE compliant =================================================================== */ blasf77_saxpy( &size, &s_neg_one, SA, &ione, SR, &ione ); serror = lapackf77_slange( "Fro", &n, &n, SR, &lda, swork ); printf( "dlat2s %5s %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", lapack_uplo_const(uplo[iuplo]), (int) n, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., serror, (serror == 0 ? "ok" : "failed") ); status += ! (serror == 0); /* ===================================================================== Reset matrices =================================================================== */ lapackf77_dlarnv( &ione, ISEED, &size, A ); lapackf77_slarnv( &ione, ISEED, &size, SA ); magma_dsetmatrix( n, n, A, lda, dA, ldda ); magma_ssetmatrix( n, n, SA, lda, dSA, ldda ); /* ===================================================================== Performs operation using LAPACK slat2d LAPACK doesn't implement slat2d; use our own simple implementation. =================================================================== */ cpu_time = magma_wtime(); if ( uplo[iuplo] == MagmaLower ) { for( int j=0; j < n; ++j ) { for( int i=j; i < n; ++i ) { *A(i,j) = MAGMA_D_MAKE( real(*SA(i,j)), imag(*SA(i,j)) ); } } } else { // upper for( int j=0; j < n; ++j ) { for( int i=0; i <= j; ++i ) { *A(i,j) = MAGMA_D_MAKE( real(*SA(i,j)), imag(*SA(i,j)) ); } } } cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; if (info != 0) printf("lapackf77_slat2d returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ==================================================================== Performs operation using MAGMA slat2d =================================================================== */ magma_ssetmatrix( n, n, SA, lda, dSA, ldda ); gpu_time = magma_sync_wtime(0); magmablas_slat2d( uplo[iuplo], n, dSA, ldda, dA, ldda, &info ); gpu_time = magma_sync_wtime(0) - gpu_time; gpu_perf = gbytes / gpu_time; if (info != 0) printf("magmablas_slat2d returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_dgetmatrix( n, n, dA, ldda, R, lda ); if ( opts.verbose ) { printf( "A= " ); magma_dprint( n, n, A, lda ); printf( "SA= " ); magma_sprint( n, n, SA, lda ); printf( "dA= " ); magma_dprint_gpu( n, n, dA, ldda ); printf( "dSA=" ); magma_sprint_gpu( n, n, dSA, ldda ); } /* ===================================================================== compute error |A_magma - A_lapack| should be zero if both are IEEE compliant =================================================================== */ blasf77_daxpy( &size, &c_neg_one, A, &ione, R, &ione ); error = lapackf77_dlange( "Fro", &n, &n, R, &lda, work ); printf( "slat2d %5s %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", lapack_uplo_const(uplo[iuplo]), (int) n, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., error, (error == 0 ? "ok" : "failed") ); status += ! (error == 0); TESTING_FREE_CPU( SA ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( SR ); TESTING_FREE_CPU( R ); TESTING_FREE_DEV( dSA ); TESTING_FREE_DEV( dA ); printf( "\n" ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } printf( "\n" ); } TESTING_FINALIZE(); return status; }