extern "C" double magmablas_dlansy( magma_norm_t norm, magma_uplo_t uplo, magma_int_t n, magmaDouble_const_ptr dA, size_t dA_offset, magma_int_t ldda, magmaDouble_ptr dwork, size_t dwork_offset, magma_queue_t queue ) { magma_int_t info = 0; // 1-norm == inf-norm since A is symmetric bool inf_norm = (norm == MagmaInfNorm || norm == MagmaOneNorm); bool max_norm = (norm == MagmaMaxNorm); // inf_norm Double-Complex requires > 16 KB shared data (arch >= 200) const bool inf_implemented = true; if ( ! (max_norm || (inf_norm && inf_implemented)) ) info = -1; else if ( uplo != MagmaUpper && uplo != MagmaLower ) info = -2; else if ( n < 0 ) info = -3; else if ( ldda < n ) info = -5; if ( info != 0 ) { magma_xerbla( __func__, -(info) ); return info; } /* Quick return */ if ( n == 0 ) return 0; double res = 0; if ( inf_norm ) { dlansy_inf( uplo, n, dA,dA_offset, ldda, dwork,dwork_offset, queue ); } else { dlansy_max( uplo, n, dA,dA_offset, ldda, dwork,dwork_offset, queue ); } int i = magma_idamax( n, dwork,dwork_offset, 1, queue ) - 1; magma_dgetvector(1, dwork,dwork_offset+i, 1, &res, 1, queue ); return res; }
extern "C" magma_int_t magma_dsgesv_gpu(char trans, magma_int_t n, magma_int_t nrhs, double *dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *dipiv, double *dB, magma_int_t lddb, double *dX, magma_int_t lddx, double *dworkd, float *dworks, magma_int_t *iter, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= DSGESV computes the solution to a real system of linear equations A * X = B or A' * X = B where A is an N-by-N matrix and X and B are N-by-NRHS matrices. DSGESV first attempts to factorize the matrix in real SINGLE PRECISION and use this factorization within an iterative refinement procedure to produce a solution with real DOUBLE PRECISION norm-wise backward error quality (see below). If the approach fails the method switches to a real DOUBLE PRECISION factorization and solve. The iterative refinement is not going to be a winning strategy if the ratio real SINGLE PRECISION performance over real DOUBLE PRECISION performance is too small. A reasonable strategy should take the number of right-hand sides and the size of the matrix into account. This might be done with a call to ILAENV in the future. Up to now, we always try iterative refinement. The iterative refinement process is stopped if ITER > ITERMAX or for all the RHS we have: RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX where o ITER is the number of the current iteration in the iterative refinement process o RNRM is the infinity-norm of the residual o XNRM is the infinity-norm of the solution o ANRM is the infinity-operator-norm of the matrix A o EPS is the machine epsilon returned by DLAMCH('Epsilon') The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively. Arguments ========= TRANS (input) CHARACTER*1 Specifies the form of the system of equations: = 'N': A * X = B (No transpose) = 'T': A'* X = B (Transpose) = 'C': A'* X = B (Conjugate transpose = Transpose) N (input) INTEGER The number of linear equations, i.e., the order of the matrix A. N >= 0. NRHS (input) INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. dA (input or input/output) DOUBLE PRECISION array on the GPU, dimension (ldda,N) On entry, the N-by-N coefficient matrix A. On exit, if iterative refinement has been successfully used (info.EQ.0 and ITER.GE.0, see description below), A is unchanged. If double precision factorization has been used (info.EQ.0 and ITER.LT.0, see description below), then the array dA contains the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. ldda (input) INTEGER The leading dimension of the array dA. ldda >= max(1,N). IPIV (output) INTEGER array, dimension (N) The pivot indices that define the permutation matrix P; row i of the matrix was interchanged with row IPIV(i). Corresponds either to the single precision factorization (if info.EQ.0 and ITER.GE.0) or the double precision factorization (if info.EQ.0 and ITER.LT.0). dIPIV (output) INTEGER array on the GPU, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was moved to row IPIV(i). dB (input) DOUBLE PRECISION array on the GPU, dimension (lddb,NRHS) The N-by-NRHS right hand side matrix B. lddb (input) INTEGER The leading dimension of the array dB. lddb >= max(1,N). dX (output) DOUBLE PRECISION array on the GPU, dimension (lddx,NRHS) If info = 0, the N-by-NRHS solution matrix X. lddx (input) INTEGER The leading dimension of the array dX. lddx >= max(1,N). dworkd (workspace) DOUBLE PRECISION array on the GPU, dimension (N*NRHS) This array is used to hold the residual vectors. dworks (workspace) SINGLE PRECISION array on the GPU, dimension (N*(N+NRHS)) This array is used to store the real single precision matrix and the right-hand sides or solutions in single precision. iter (output) INTEGER < 0: iterative refinement has failed, double precision factorization has been performed -1 : the routine fell back to full precision for implementation- or machine-specific reasons -2 : narrowing the precision induced an overflow, the routine fell back to full precision -3 : failure of SGETRF -31: stop the iterative refinement after the 30th iteration > 0: iterative refinement has been successfully used. Returns the number of iterations info (output) INTEGER = 0: successful exit < 0: if info = -i, the i-th argument had an illegal value > 0: if info = i, U(i,i) computed in DOUBLE PRECISION is exactly zero. The factorization has been completed, but the factor U is exactly singular, so the solution could not be computed. ===================================================================== */ #define dB(i,j) (dB + (i) + (j)*lddb) #define dX(i,j) (dX + (i) + (j)*lddx) #define dR(i,j) (dR + (i) + (j)*lddr) double c_neg_one = MAGMA_D_NEG_ONE; double c_one = MAGMA_D_ONE; magma_int_t ione = 1; double *dR; float *dSA, *dSX; double Xnrmv, Rnrmv; double Anrm, Xnrm, Rnrm, cte, eps; magma_int_t i, j, iiter, lddsa, lddr; /* Check arguments */ *iter = 0; *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -2; else if ( ldda < max(1,n)) *info = -4; else if ( lddb < max(1,n)) *info = -8; else if ( lddx < max(1,n)) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if ( n == 0 || nrhs == 0 ) return *info; lddsa = n; lddr = n; dSA = dworks; dSX = dSA + lddsa*n; dR = dworkd; eps = lapackf77_dlamch("Epsilon"); Anrm = magmablas_dlange('I', n, n, dA, ldda, (double*)dworkd ); cte = Anrm * eps * pow((double)n, 0.5) * BWDMAX; /* * Convert to single precision */ //magmablas_dlag2s( n, nrhs, dB, lddb, dSX, lddsx, info ); // done inside dsgetrs with pivots if (*info != 0) { *iter = -2; goto FALLBACK; } magmablas_dlag2s( n, n, dA, ldda, dSA, lddsa, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // factor dSA in single precision magma_sgetrf_gpu( n, n, dSA, lddsa, ipiv, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // Generate parallel pivots { magma_int_t *newipiv; magma_imalloc_cpu( &newipiv, n ); if ( newipiv == NULL ) { *iter = -3; goto FALLBACK; } swp2pswp( trans, n, ipiv, newipiv ); magma_setvector( n, sizeof(magma_int_t), newipiv, 1, dipiv, 1 ); magma_free_cpu( newipiv ); } // solve dSA*dSX = dB in single precision // converts dB to dSX and applies pivots, solves, then converts result back to dX magma_dsgetrs_gpu( trans, n, nrhs, dSA, lddsa, dipiv, dB, lddb, dX, lddx, dSX, info ); // residual dR = dB - dA*dX in double precision magmablas_dlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dR, lddr ); if ( nrhs == 1 ) { magma_dgemv( trans, n, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_dgemm( trans, MagmaNoTrans, n, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } // TODO: use MAGMA_D_ABS( dX(i,j) ) instead of dlange? for( j=0; j < nrhs; j++ ) { i = magma_idamax( n, dX(0,j), 1) - 1; magma_dgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_dlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_idamax ( n, dR(0,j), 1 ) - 1; magma_dgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_dlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto REFINEMENT; } } *iter = 0; return *info; REFINEMENT: for( iiter=1; iiter < ITERMAX; ) { *info = 0; // convert residual dR to single precision dSX // solve dSA*dSX = R in single precision // convert result back to double precision dR // it's okay that dR is used for both dB input and dX output. magma_dsgetrs_gpu( trans, n, nrhs, dSA, lddsa, dipiv, dR, lddr, dR, lddr, dSX, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // Add correction and setup residual // dX += dR --and-- // dR = dB // This saves going through dR a second time (if done with one more kernel). // -- not really: first time is read, second time is write. for( j=0; j < nrhs; j++ ) { magmablas_daxpycp( n, dR(0,j), dX(0,j), dB(0,j) ); } // residual dR = dB - dA*dX in double precision if ( nrhs == 1 ) { magma_dgemv( trans, n, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_dgemm( trans, MagmaNoTrans, n, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } /* Check whether the nrhs normwise backward errors satisfy the * stopping criterion. If yes, set ITER=IITER>0 and return. */ for( j=0; j < nrhs; j++ ) { i = magma_idamax( n, dX(0,j), 1) - 1; magma_dgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_dlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_idamax ( n, dR(0,j), 1 ) - 1; magma_dgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_dlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto L20; } } /* If we are here, the nrhs normwise backward errors satisfy * the stopping criterion, we are good to exit. */ *iter = iiter; return *info; L20: iiter++; } /* If we are at this place of the code, this is because we have * performed ITER=ITERMAX iterations and never satisified the * stopping criterion. Set up the ITER flag accordingly and follow * up on double precision routine. */ *iter = -ITERMAX - 1; FALLBACK: /* Single-precision iterative refinement failed to converge to a * satisfactory solution, so we resort to double precision. */ magma_dgetrf_gpu( n, n, dA, ldda, ipiv, info ); if (*info == 0) { magmablas_dlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dX, lddx ); magma_dgetrs_gpu( trans, n, nrhs, dA, ldda, ipiv, dX, lddx, info ); } return *info; }
int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, t1, t2; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; const char trans[] = { 'N', 'C', 'T' }; const char uplo[] = { 'L', 'U' }; const char diag[] = { 'U', 'N' }; const char side[] = { 'L', 'R' }; double *A, *B, *C, *C2, *LU; double *dA, *dB, *dC1, *dC2; double alpha = MAGMA_D_MAKE( 0.5, 0.1 ); double beta = MAGMA_D_MAKE( 0.7, 0.2 ); double dalpha = 0.6; double dbeta = 0.8; double work[1], error, total_error; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t m, n, k, size, maxn, ld, info; magma_int_t *piv; magma_err_t err; magma_opts opts; parse_opts( argc, argv, &opts ); printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" ); total_error = 0.; for( int i = 0; i < opts.ntest; ++i ) { m = opts.msize[i]; n = opts.nsize[i]; k = opts.ksize[i]; printf("=========================================================================\n"); printf( "M %d, N %d, K %d\n", (int) m, (int) n, (int) k ); // allocate matrices // over-allocate so they can be any combination of {m,n,k} x {m,n,k}. maxn = max( max( m, n ), k ); ld = maxn; size = maxn*maxn; err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) ); assert( err == 0 ); err = magma_dmalloc_pinned( &A, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &B, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &C, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &C2, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &LU, size ); assert( err == 0 ); err = magma_dmalloc( &dA, size ); assert( err == 0 ); err = magma_dmalloc( &dB, size ); assert( err == 0 ); err = magma_dmalloc( &dC1, size ); assert( err == 0 ); err = magma_dmalloc( &dC2, size ); assert( err == 0 ); // initialize matrices size = maxn*maxn; lapackf77_dlarnv( &ione, ISEED, &size, A ); lapackf77_dlarnv( &ione, ISEED, &size, B ); lapackf77_dlarnv( &ione, ISEED, &size, C ); printf( "========== Level 1 BLAS ==========\n" ); // ----- test DSWAP // swap 2nd and 3rd columns of dA, then copy to C2 and compare with A assert( n >= 4 ); magma_dsetmatrix( m, n, A, ld, dA, ld ); magma_dsetmatrix( m, n, A, ld, dB, ld ); magma_dswap( m, dA(0,1), 1, dA(0,2), 1 ); magma_dswap( m, dB(0,1), 1, dB(0,2), 1 ); // check results, storing diff between magma and cuda calls in C2 cublasDaxpy( ld*n, c_neg_one, dA, 1, dB, 1 ); magma_dgetmatrix( m, n, dB, ld, C2, ld ); error = lapackf77_dlange( "F", &m, &k, C2, &ld, work ); total_error += error; printf( "dswap diff %.2g\n", error ); // ----- test IDAMAX // get argmax of column of A magma_dsetmatrix( m, k, A, ld, dA, ld ); error = 0; for( int j = 0; j < k; ++j ) { magma_int_t i1 = magma_idamax( m, dA(0,j), 1 ); magma_int_t i2 = cublasIdamax( m, dA(0,j), 1 ); assert( i1 == i2 ); error += abs( i1 - i2 ); } total_error += error; gflops = (double)m * k / 1e9; printf( "idamax diff %.2g\n", error ); printf( "\n" ); printf( "========== Level 2 BLAS ==========\n" ); // ----- test DGEMV // c = alpha*A*b + beta*c, with A m*n; b,c m or n-vectors // try no-trans/trans for( int ia = 0; ia < 3; ++ia ) { magma_dsetmatrix( m, n, A, ld, dA, ld ); magma_dsetvector( maxn, B, 1, dB, 1 ); magma_dsetvector( maxn, C, 1, dC1, 1 ); magma_dsetvector( maxn, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_dgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 size = (trans[ia] == 'N' ? m : n); cublasDaxpy( size, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetvector( size, dC2, 1, C2, 1 ); error = lapackf77_dlange( "F", &size, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_DGEMV( m, n ) / 1e9; printf( "dgemv( %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", trans[ia], error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test DSYMV // c = alpha*A*b + beta*c, with A m*m symmetric; b,c m-vectors // try upper/lower for( int iu = 0; iu < 2; ++iu ) { magma_dsetmatrix( m, m, A, ld, dA, ld ); magma_dsetvector( m, B, 1, dB, 1 ); magma_dsetvector( m, C, 1, dC1, 1 ); magma_dsetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_dsymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( m, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_dlange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYMV( m ) / 1e9; printf( "dsymv( %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test DTRSV // solve A*c = c, with A m*m triangular; c m-vector // try upper/lower, no-trans/trans, unit/non-unit diag // Factor A into LU to get well-conditioned triangles, else solve yields garbage. // Still can give garbage if solves aren't consistent with LU factors, // e.g., using unit diag for U, so copy lower triangle to upper triangle. // Also used for trsm later. lapackf77_dlacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld ); lapackf77_dgetrf( &maxn, &maxn, LU, &ld, piv, &info ); for( int j = 0; j < maxn; ++j ) { for( int i = 0; i < j; ++i ) { *LU(i,j) = *LU(j,i); } } for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { magma_dsetmatrix( m, m, LU, ld, dA, ld ); magma_dsetvector( m, C, 1, dC1, 1 ); magma_dsetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_dtrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDtrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( m, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_dlange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_DTRSM( MagmaLeft, m, 1 ) / 1e9; printf( "dtrsv( %c, %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], diag[id], error, gflops/t1, gflops/t2 ); }}} printf( "\n" ); printf( "========== Level 3 BLAS ==========\n" ); // ----- test DGEMM // C = alpha*A*B + beta*C, with A m*k or k*m; B k*n or n*k; C m*n // try combinations of no-trans/trans for( int ia = 0; ia < 3; ++ia ) { for( int ib = 0; ib < 3; ++ib ) { bool nta = (trans[ia] == 'N'); bool ntb = (trans[ib] == 'N'); magma_dsetmatrix( (nta ? m : k), (nta ? m : k), A, ld, dA, ld ); magma_dsetmatrix( (ntb ? k : n), (ntb ? n : k), B, ld, dB, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DGEMM( m, n, k ) / 1e9; printf( "dgemm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", trans[ia], trans[ib], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DSYMM // C = alpha*A*B + beta*C (left) with A m*m symmetric; B,C m*n; or // C = alpha*B*A + beta*C (right) with A n*n symmetric; B,C m*n // try left/right, upper/lower for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { magma_dsetmatrix( m, m, A, ld, dA, ld ); magma_dsetmatrix( m, n, B, ld, dB, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dsymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYMM( side[is], m, n ) / 1e9; printf( "dsymm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", side[is], uplo[iu], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DSYRK // C = alpha*A*A^H + beta*C (no-trans) with A m*k and C m*m symmetric; or // C = alpha*A^H*A + beta*C (trans) with A k*m and C m*m symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { magma_dsetmatrix( n, k, A, ld, dA, ld ); magma_dsetmatrix( n, n, C, ld, dC1, ld ); magma_dsetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dsyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYRK( k, n ) / 1e9; printf( "dsyrk( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DSYR2K // C = alpha*A*B^H + ^alpha*B*A^H + beta*C (no-trans) with A,B n*k; C n*n symmetric; or // C = alpha*A^H*B + ^alpha*B^H*A + beta*C (trans) with A,B k*n; C n*n symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { bool nt = (trans[it] == 'N'); magma_dsetmatrix( (nt ? n : k), (nt ? n : k), A, ld, dA, ld ); magma_dsetmatrix( n, n, C, ld, dC1, ld ); magma_dsetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dsyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYR2K( k, n ) / 1e9; printf( "dsyr2k( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DTRMM // C = alpha*A*C (left) with A m*m triangular; C m*n; or // C = alpha*C*A (right) with A n*n triangular; C m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == 'L'); magma_dsetmatrix( (left ? m : n), (left ? m : n), A, ld, dA, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dtrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDtrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DTRMM( side[is], m, n ) / 1e9; printf( "dtrmm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // ----- test DTRSM // solve A*X = alpha*B (left) with A m*m triangular; B m*n; or // solve X*A = alpha*B (right) with A n*n triangular; B m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == 'L'); magma_dsetmatrix( (left ? m : n), (left ? m : n), LU, ld, dA, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dtrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDtrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DTRSM( side[is], m, n ) / 1e9; printf( "dtrsm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // cleanup magma_free_cpu( piv ); magma_free_pinned( A ); magma_free_pinned( B ); magma_free_pinned( C ); magma_free_pinned( C2 ); magma_free_pinned( LU ); magma_free( dA ); magma_free( dB ); magma_free( dC1 ); magma_free( dC2 ); } if ( total_error != 0. ) { printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n", total_error ); } else { printf( "all tests passed\n" ); } TESTING_FINALIZE(); return 0; }
extern "C" magma_int_t magma_zlaqps_gpu(magma_int_t m, magma_int_t n, magma_int_t offset, magma_int_t nb, magma_int_t *kb, magmaDoubleComplex *A, magma_int_t lda, magma_int_t *jpvt, magmaDoubleComplex *tau, double *vn1, double *vn2, magmaDoubleComplex *auxv, magmaDoubleComplex *F, magma_int_t ldf) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= ZLAQPS computes a step of QR factorization with column pivoting of a complex M-by-N matrix A by using Blas-3. It tries to factorize NB columns from A starting from the row OFFSET+1, and updates all of the matrix with Blas-3 xGEMM. In some cases, due to catastrophic cancellations, it cannot factorize NB columns. Hence, the actual number of factorized columns is returned in KB. Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0 OFFSET (input) INTEGER The number of rows of A that have been factorized in previous steps. NB (input) INTEGER The number of columns to factorize. KB (output) INTEGER The number of columns actually factorized. A (input/output) COMPLEX*16 array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, block A(OFFSET+1:M,1:KB) is the triangular factor obtained and block A(1:OFFSET,1:N) has been accordingly pivoted, but no factorized. The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has been updated. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). JPVT (input/output) INTEGER array, dimension (N) JPVT(I) = K <==> Column K of the full matrix A has been permuted into position I in AP. TAU (output) COMPLEX*16 array, dimension (KB) The scalar factors of the elementary reflectors. VN1 (input/output) DOUBLE PRECISION array, dimension (N) The vector with the partial column norms. VN2 (input/output) DOUBLE PRECISION array, dimension (N) The vector with the exact column norms. AUXV (input/output) COMPLEX*16 array, dimension (NB) Auxiliar vector. F (input/output) COMPLEX*16 array, dimension (LDF,NB) Matrix F' = L*Y'*A. LDF (input) INTEGER The leading dimension of the array F. LDF >= max(1,N). ===================================================================== */ #define A(i, j) (A + (i) + (j)*(lda )) #define F(i, j) (F + (i) + (j)*(ldf )) magmaDoubleComplex c_zero = MAGMA_Z_MAKE( 0.,0.); magmaDoubleComplex c_one = MAGMA_Z_MAKE( 1.,0.); magmaDoubleComplex c_neg_one = MAGMA_Z_MAKE(-1.,0.); magma_int_t ione = 1; magma_int_t i__1, i__2; //double d__1; magmaDoubleComplex z__1; //magma_int_t j; magma_int_t k, rk; //magmaDoubleComplex Akk; magmaDoubleComplex *Aks; magmaDoubleComplex tauk; magma_int_t pvt; //double temp, temp2; double tol3z; magma_int_t itemp; double lsticc, *lsticcs; magma_int_t lastrk; magma_dmalloc( &lsticcs, 1+256*(n+255)/256 ); lastrk = min( m, n + offset ); tol3z = magma_dsqrt( lapackf77_dlamch("Epsilon")); lsticc = 0; k = 0; magma_zmalloc( &Aks, nb ); while( k < nb && lsticc == 0 ) { rk = offset + k; /* Determine ith pivot column and swap if necessary */ // Fortran: pvt, k, idamax are all 1-based; subtract 1 from k. // C: pvt, k, idamax are all 0-based; don't subtract 1. pvt = k - 1 + magma_idamax( n-k, &vn1[k], ione ); if (pvt != k) { /*if (pvt >= nb) { // 1. Start copy from GPU magma_zgetmatrix_async( m - offset - nb, 1, dA(offset + nb, pvt), ldda, A (offset + nb, pvt), lda, stream ); }*/ /* F gets swapped so F must be sent at the end to GPU */ i__1 = k; /*if (pvt < nb){ // no need of transfer if pivot is within the panel blasf77_zswap( &m, A(0, pvt), &ione, A(0, k), &ione ); } else { // 1. Finish copy from GPU magma_queue_sync( stream ); // 2. Swap as usual on CPU blasf77_zswap(&m, A(0, pvt), &ione, A(0, k), &ione); // 3. Restore the GPU magma_zsetmatrix_async( m - offset - nb, 1, A (offset + nb, pvt), lda, dA(offset + nb, pvt), ldda, stream); }*/ magmablas_zswap( m, A(0, pvt), ione, A(0, k), ione ); //blasf77_zswap( &i__1, F(pvt,0), &ldf, F(k,0), &ldf ); magmablas_zswap( i__1, F(pvt, 0), ldf, F(k, 0), ldf); itemp = jpvt[pvt]; jpvt[pvt] = jpvt[k]; jpvt[k] = itemp; //vn1[pvt] = vn1[k]; //vn2[pvt] = vn2[k]; #if defined(PRECISION_d) || defined(PRECISION_z) //magma_dswap( 1, &vn1[pvt], 1, &vn1[k], 1 ); //magma_dswap( 1, &vn2[pvt], 1, &vn2[k], 1 ); magma_dswap( 2, &vn1[pvt], n+offset, &vn1[k], n+offset ); #else //magma_sswap( 1, &vn1[pvt], 1, &vn1[k], 1 ); //magma_sswap( 1, &vn2[pvt], 1, &vn2[k], 1 ); magma_sswap(2, &vn1[pvt], n+offset, &vn1[k], n+offset); #endif } /* Apply previous Householder reflectors to column K: A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'. Optimization: multiply with beta=0; wait for vector and subtract */ if (k > 0) { /*#if (defined(PRECISION_c) || defined(PRECISION_z)) for (j = 0; j < k; ++j){ *F(k,j) = MAGMA_Z_CNJG( *F(k,j) ); } #endif*/ //#define RIGHT_UPDATE #ifdef RIGHT_UPDATE i__1 = m - offset - nb; i__2 = k; magma_zgemv( MagmaNoTrans, i__1, i__2, c_neg_one, A(offset+nb, 0), lda, F(k, 0), ldf, c_one, A(offset+nb, k), ione ); #else i__1 = m - rk; i__2 = k; /*blasf77_zgemv( MagmaNoTransStr, &i__1, &i__2, &c_neg_one, A(rk, 0), &lda, F(k, 0), &ldf, &c_one, A(rk, k), &ione );*/ magma_zgemv( MagmaNoTrans, i__1, i__2, c_neg_one, A(rk, 0), lda, F(k, 0), ldf, c_one, A(rk, k), ione ); #endif /*#if (defined(PRECISION_c) || defined(PRECISION_z)) for (j = 0; j < k; ++j) { *F(k,j) = MAGMA_Z_CNJG( *F(k,j) ); } #endif*/ } /* Generate elementary reflector H(k). */ magma_zlarfg_gpu(m-rk, A(rk, k), A(rk + 1, k), &tau[k], &vn1[k], &Aks[k]); //Akk = *A(rk, k); //*A(rk, k) = c_one; //magma_zgetvector( 1, &Aks[k], 1, &Akk, 1 ); /* needed to avoid the race condition */ if (k == 0) magma_zsetvector( 1, &c_one, 1, A(rk, k), 1 ); else magma_zcopymatrix( 1, 1, A(offset, 0), 1, A(rk, k), 1 ); /* Compute Kth column of F: Compute F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */ if (k < n-1 || k > 0) magma_zgetvector( 1, &tau[k], 1, &tauk, 1 ); if (k < n-1) { i__1 = m - rk; i__2 = n - k - 1; /* Send the vector to the GPU */ //magma_zsetmatrix( i__1, 1, A(rk, k), lda, dA(rk,k), ldda ); /* Multiply on GPU */ // was CALL ZGEMV( 'Conjugate transpose', M-RK+1, N-K, // TAU( K ), A( RK, K+1 ), LDA, // A( RK, K ), 1, // CZERO, F( K+1, K ), 1 ) //magma_zgetvector( 1, &tau[k], 1, &tauk, 1 ); magma_zgemv( MagmaConjTrans, m-rk, n-k-1, tauk, A( rk, k+1 ), lda, A( rk, k ), 1, c_zero, F( k+1, k ), 1 ); //magma_zscal( m-rk, tau[k], F( k+1, k), 1 ); //magma_int_t i__3 = nb-k-1; //magma_int_t i__4 = i__2 - i__3; //magma_int_t i__5 = nb-k; //magma_zgemv( MagmaConjTrans, i__1 - i__5, i__2 - i__3, // tau[k], dA(rk +i__5, k+1+i__3), ldda, // dA(rk +i__5, k ), ione, // c_zero, dF(k+1+i__3, k ), ione ); //magma_zgetmatrix_async( i__2-i__3, 1, // dF(k + 1 +i__3, k), i__2, // F (k + 1 +i__3, k), i__2, stream ); //blasf77_zgemv( MagmaConjTransStr, &i__1, &i__3, // &tau[k], A(rk, k+1), &lda, // A(rk, k ), &ione, // &c_zero, F(k+1, k ), &ione ); //magma_queue_sync( stream ); //blasf77_zgemv( MagmaConjTransStr, &i__5, &i__4, // &tau[k], A(rk, k+1+i__3), &lda, // A(rk, k ), &ione, // &c_one, F(k+1+i__3, k ), &ione ); } /* Padding F(1:K,K) with zeros. for (j = 0; j <= k; ++j) { magma_zsetvector( 1, &c_zero, 1, F(j, k), 1 ); }*/ /* Incremental updating of F: F(1:N,K) := F(1:N,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K). F(1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K) := tau(K)(A(RK:M,K+1:N)' - F(1:N,1:K-1)*A(RK:M,1:K-1)') A(RK:M,K) so, F is (updated A)*V */ //if (k > 0 && k<n-1) { if (k > 0) { //magma_zgetvector( 1, &tau[k], 1, &tauk, 1 ); z__1 = MAGMA_Z_NEGATE( tauk ); #ifdef RIGHT_UPDATE i__1 = m - offset - nb; i__2 = k; magma_zgemv( MagmaConjTrans, i__1, i__2, z__1, A(offset+nb, 0), lda, A(offset+nb, k), ione, c_zero, auxv, ione ); i__1 = k; magma_zgemv( MagmaNoTrans, n-k-1, i__1, c_one, F(k+1,0), ldf, auxv, ione, c_one, F(k+1,k), ione ); #else i__1 = m - rk; i__2 = k; //blasf77_zgemv( MagmaConjTransStr, &i__1, &i__2, // &z__1, A(rk, 0), &lda, // A(rk, k), &ione, // &c_zero, auxv, &ione ); magma_zgemv( MagmaConjTrans, i__1, i__2, z__1, A(rk, 0), lda, A(rk, k), ione, c_zero, auxv, ione ); //i__1 = k; //blasf77_zgemv( MagmaNoTransStr, &n, &i__1, // &c_one, F(0,0), &ldf, // auxv, &ione, // &c_one, F(0,k), &ione ); /*magma_zgemv( MagmaNoTrans, n, i__1, c_one, F(0,0), ldf, auxv, ione, c_one, F(0,k), ione );*/ /* I think we only need stricly lower-triangular part :) */ magma_zgemv( MagmaNoTrans, n-k-1, i__2, c_one, F(k+1,0), ldf, auxv, ione, c_one, F(k+1,k), ione ); #endif } /* Optimization: On the last iteration start sending F back to the GPU */ /* Update the current row of A: A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'. */ if (k < n-1) { i__1 = n - k - 1; i__2 = k + 1; //blasf77_zgemm( MagmaNoTransStr, MagmaConjTransStr, &ione, &i__1, &i__2, // &c_neg_one, A(rk, 0 ), &lda, // F(k+1,0 ), &ldf, // &c_one, A(rk, k+1), &lda ); #ifdef RIGHT_UPDATE /* right-looking update of rows, */ magma_zgemm( MagmaNoTrans, MagmaConjTrans, nb-k, i__1, ione, c_neg_one, A(rk, k ), lda, F(k+1, k ), ldf, c_one, A(rk, k+1), lda ); #else /* left-looking update of rows, * * since F=A'v with original A, so no right-looking */ magma_zgemm( MagmaNoTrans, MagmaConjTrans, ione, i__1, i__2, c_neg_one, A(rk, 0 ), lda, F(k+1,0 ), ldf, c_one, A(rk, k+1), lda ); #endif } /* Update partial column norms. */ if (rk < min(m, n+offset)-1 ) { magmablas_dznrm2_row_check_adjust(n-k-1, tol3z, &vn1[k+1], &vn2[k+1], A(rk,k+1), lda, lsticcs); magma_device_sync(); #if defined(PRECISION_d) || defined(PRECISION_z) magma_dgetvector( 1, &lsticcs[0], 1, &lsticc, 1 ); #else magma_sgetvector( 1, &lsticcs[0], 1, &lsticc, 1 ); #endif } /*if (rk < lastrk) { for (j = k + 1; j < n; ++j) { if (vn1[j] != 0.) { // NOTE: The following 4 lines follow from the analysis in // Lapack Working Note 176. temp = MAGMA_Z_ABS( *A(rk,j) ) / vn1[j]; temp = max( 0., ((1. + temp) * (1. - temp)) ); d__1 = vn1[j] / vn2[j]; temp2 = temp * (d__1 * d__1); if (temp2 <= tol3z) { vn2[j] = (double) lsticc; lsticc = j; } else { vn1[j] *= magma_dsqrt(temp); } } } }*/ //*A(rk, k) = Akk; //magma_zsetvector( 1, &Akk, 1, A(rk, k), 1 ); //magma_zswap( 1, &Aks[k], 1, A(rk, k), 1 ); ++k; } magma_zcopymatrix( 1, k, Aks, 1, A(offset, 0), lda+1 ); // leave k as the last column done --k; *kb = k + 1; rk = offset + *kb - 1; /* Apply the block reflector to the rest of the matrix: A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) - A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)' */ if (*kb < min(n, m - offset)) { i__1 = m - rk - 1; i__2 = n - *kb; /* Send F to the GPU magma_zsetmatrix( i__2, *kb, F (*kb, 0), ldf, dF(*kb, 0), i__2 );*/ magma_zgemm( MagmaNoTrans, MagmaConjTrans, i__1, i__2, *kb, c_neg_one, A(rk+1, 0 ), lda, F(*kb, 0 ), ldf, c_one, A(rk+1, *kb), lda ); } /* Recomputation of difficult columns. */ if( lsticc > 0 ) { printf( " -- recompute dnorms --\n" ); magmablas_dznrm2_check(m-rk-1, n-*kb, A(rk+1,*kb), lda, &vn1[*kb], lsticcs); #if defined(PRECISION_d) || defined(PRECISION_z) magma_dcopymatrix( n-*kb, 1, &vn1[*kb], *kb, &vn2[*kb], *kb); #else magma_scopymatrix( n-*kb, 1, &vn1[*kb], *kb, &vn2[*kb], *kb); #endif /*while( lsticc > 0 ) { itemp = (magma_int_t)(vn2[lsticc] >= 0. ? floor(vn2[lsticc] + .5) : -floor(.5 - vn2[lsticc])); i__1 = m - rk - 1; if (lsticc <= nb) vn1[lsticc] = cblas_dznrm2(i__1, A(rk + 1, lsticc), ione); else { // Where is the data, CPU or GPU ? double r1, r2; r1 = cblas_dznrm2(nb-k, A(rk + 1, lsticc), ione); r2 = magma_dznrm2(m-offset-nb, dA(offset + nb + 1, lsticc), ione); vn1[lsticc] = magma_dsqrt(r1*r1+r2*r2); } // NOTE: The computation of VN1( LSTICC ) relies on the fact that // SNRM2 does not fail on vectors with norm below the value of SQRT(DLAMCH('S')) vn2[lsticc] = vn1[lsticc]; lsticc = itemp;*/ } magma_free(Aks); magma_free(lsticcs); return MAGMA_SUCCESS; } /* magma_zlaqps */
/** Purpose ------- DSPOSV computes the solution to a real system of linear equations A * X = B, where A is an N-by-N symmetric positive definite matrix and X and B are N-by-NRHS matrices. DSPOSV first attempts to factorize the matrix in real SINGLE PRECISION and use this factorization within an iterative refinement procedure to produce a solution with real DOUBLE PRECISION norm-wise backward error quality (see below). If the approach fails the method switches to a real DOUBLE PRECISION factorization and solve. The iterative refinement is not going to be a winning strategy if the ratio real SINGLE PRECISION performance over real DOUBLE PRECISION performance is too small. A reasonable strategy should take the number of right-hand sides and the size of the matrix into account. This might be done with a call to ILAENV in the future. Up to now, we always try iterative refinement. The iterative refinement process is stopped if ITER > ITERMAX or for all the RHS we have: RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX where o ITER is the number of the current iteration in the iterative refinement process o RNRM is the infinity-norm of the residual o XNRM is the infinity-norm of the solution o ANRM is the infinity-operator-norm of the matrix A o EPS is the machine epsilon returned by DLAMCH('Epsilon') The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The number of linear equations, i.e., the order of the matrix A. N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in,out] dA DOUBLE PRECISION array on the GPU, dimension (LDDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if iterative refinement has been successfully used (INFO.EQ.0 and ITER.GE.0, see description below), then A is unchanged, if double factorization has been used (INFO.EQ.0 and ITER.LT.0, see description below), then the array dA contains the factor U or L from the Cholesky factorization A = U**T*U or A = L*L**T. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,N). @param[in] dB DOUBLE PRECISION array on the GPU, dimension (LDDB,NRHS) The N-by-NRHS right hand side matrix B. @param[in] lddb INTEGER The leading dimension of the array dB. LDDB >= max(1,N). @param[out] dX DOUBLE PRECISION array on the GPU, dimension (LDDX,NRHS) If INFO = 0, the N-by-NRHS solution matrix X. @param[in] lddx INTEGER The leading dimension of the array dX. LDDX >= max(1,N). @param dworkd (workspace) DOUBLE PRECISION array on the GPU, dimension (N*NRHS) This array is used to hold the residual vectors. @param dworks (workspace) SINGLE PRECISION array on the GPU, dimension (N*(N+NRHS)) This array is used to store the real single precision matrix and the right-hand sides or solutions in single precision. @param[out] iter INTEGER - < 0: iterative refinement has failed, double precision factorization has been performed + -1 : the routine fell back to full precision for implementation- or machine-specific reasons + -2 : narrowing the precision induced an overflow, the routine fell back to full precision + -3 : failure of SPOTRF + -31: stop the iterative refinement after the 30th iteration - > 0: iterative refinement has been successfully used. Returns the number of iterations @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, the leading minor of order i of (DOUBLE PRECISION) A is not positive definite, so the factorization could not be completed, and the solution has not been computed. @ingroup magma_dposv_driver ********************************************************************/ extern "C" magma_int_t magma_dsposv_gpu( magma_uplo_t uplo, magma_int_t n, magma_int_t nrhs, magmaDouble_ptr dA, magma_int_t ldda, magmaDouble_ptr dB, magma_int_t lddb, magmaDouble_ptr dX, magma_int_t lddx, magmaDouble_ptr dworkd, magmaFloat_ptr dworks, magma_int_t *iter, magma_int_t *info) { #define dB(i,j) (dB + (i) + (j)*lddb) #define dX(i,j) (dX + (i) + (j)*lddx) #define dR(i,j) (dR + (i) + (j)*lddr) #define dSX(i,j) (dSX + (i) + (j)*lddsx) // Constants const double BWDMAX = 1.0; const magma_int_t ITERMAX = 30; const double c_neg_one = MAGMA_D_NEG_ONE; const double c_one = MAGMA_D_ONE; const magma_int_t ione = 1; // Local variables magmaDouble_ptr dR; magmaFloat_ptr dSA, dSX; double Xnrmv, Rnrmv; double Anrm, Xnrm, Rnrm, cte, eps; magma_int_t i, j, iiter, lddsa, lddsx, lddr; /* Check arguments */ *iter = 0; *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -2; else if ( ldda < max(1,n)) *info = -4; else if ( lddb < max(1,n)) *info = -7; else if ( lddx < max(1,n)) *info = -9; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if ( n == 0 || nrhs == 0 ) return *info; lddsa = n; lddsx = n; lddr = n; dSA = dworks; dSX = dSA + lddsa*n; dR = dworkd; magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); eps = lapackf77_dlamch("Epsilon"); Anrm = magmablas_dlansy( MagmaInfNorm, uplo, n, dA, ldda, (double*)dworkd, n*nrhs, queue ); cte = Anrm * eps * magma_dsqrt( n ) * BWDMAX; /* * Convert to single precision */ magmablas_dlag2s( n, nrhs, dB, lddb, dSX, lddsx, queue, info ); if (*info != 0) { *iter = -2; goto fallback; } magmablas_dlat2s( uplo, n, dA, ldda, dSA, lddsa, queue, info ); if (*info != 0) { *iter = -2; goto fallback; } // factor dSA in single precision magma_spotrf_gpu( uplo, n, dSA, lddsa, info ); if (*info != 0) { *iter = -3; goto fallback; } // solve dSA*dSX = dB in single precision magma_spotrs_gpu( uplo, n, nrhs, dSA, lddsa, dSX, lddsx, info ); // residual dR = dB - dA*dX in double precision magmablas_slag2d( n, nrhs, dSX, lddsx, dX, lddx, queue, info ); magmablas_dlacpy( MagmaFull, n, nrhs, dB, lddb, dR, lddr, queue ); if ( nrhs == 1 ) { magma_dsymv( uplo, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1, queue ); } else { magma_dsymm( MagmaLeft, uplo, n, nrhs, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr, queue ); } // TODO: use MAGMA_D_ABS( dX(i,j) ) instead of dlange? for( j=0; j < nrhs; j++ ) { i = magma_idamax( n, dX(0,j), 1, queue ) - 1; magma_dgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1, queue ); Xnrm = lapackf77_dlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_idamax( n, dR(0,j), 1, queue ) - 1; magma_dgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1, queue ); Rnrm = lapackf77_dlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto refinement; } } *iter = 0; goto cleanup; //return *info; refinement: for( iiter=1; iiter < ITERMAX; ) { *info = 0; // convert residual dR to single precision dSX magmablas_dlag2s( n, nrhs, dR, lddr, dSX, lddsx, queue, info ); if (*info != 0) { *iter = -2; goto fallback; } // solve dSA*dSX = R in single precision magma_spotrs_gpu( uplo, n, nrhs, dSA, lddsa, dSX, lddsx, info ); // Add correction and setup residual // dX += dSX [including conversion] --and-- // dR = dB for( j=0; j < nrhs; j++ ) { magmablas_dsaxpycp( n, dSX(0,j), dX(0,j), dB(0,j), dR(0,j), queue ); } // residual dR = dB - dA*dX in double precision if ( nrhs == 1 ) { magma_dsymv( uplo, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1, queue ); } else { magma_dsymm( MagmaLeft, uplo, n, nrhs, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr, queue ); } // TODO: use MAGMA_D_ABS( dX(i,j) ) instead of dlange? /* Check whether the nrhs normwise backward errors satisfy the * stopping criterion. If yes, set ITER=IITER > 0 and return. */ for( j=0; j < nrhs; j++ ) { i = magma_idamax( n, dX(0,j), 1, queue ) - 1; magma_dgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1, queue ); Xnrm = lapackf77_dlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_idamax( n, dR(0,j), 1, queue ) - 1; magma_dgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1, queue ); Rnrm = lapackf77_dlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto L20; } } /* If we are here, the nrhs normwise backward errors satisfy * the stopping criterion, we are good to exit. */ *iter = iiter; goto cleanup; //return *info; L20: iiter++; } /* If we are at this place of the code, this is because we have * performed ITER=ITERMAX iterations and never satisified the * stopping criterion. Set up the ITER flag accordingly and follow * up on double precision routine. */ *iter = -ITERMAX - 1; fallback: /* Single-precision iterative refinement failed to converge to a * satisfactory solution, so we resort to double precision. */ magma_dpotrf_gpu( uplo, n, dA, ldda, info ); if (*info == 0) { magmablas_dlacpy( MagmaFull, n, nrhs, dB, lddb, dX, lddx, queue ); magma_dpotrs_gpu( uplo, n, nrhs, dA, ldda, dX, lddx, info ); } cleanup: magma_queue_destroy( queue ); return *info; }
/** Purpose ------- DSGEQRSV solves the least squares problem min || A*X - B ||, where A is an M-by-N matrix and X and B are M-by-NRHS matrices. DSGEQRSV first attempts to factorize the matrix in real SINGLE PRECISION and use this factorization within an iterative refinement procedure to produce a solution with real DOUBLE PRECISION norm-wise backward error quality (see below). If the approach fails the method switches to a real DOUBLE PRECISION factorization and solve. The iterative refinement is not going to be a winning strategy if the ratio real SINGLE PRECISION performance over real DOUBLE PRECISION performance is too small. A reasonable strategy should take the number of right-hand sides and the size of the matrix into account. This might be done with a call to ILAENV in the future. Up to now, we always try iterative refinement. The iterative refinement process is stopped if ITER > ITERMAX or for all the RHS we have: RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX where o ITER is the number of the current iteration in the iterative refinement process o RNRM is the infinity-norm of the residual o XNRM is the infinity-norm of the solution o ANRM is the infinity-operator-norm of the matrix A o EPS is the machine epsilon returned by DLAMCH('Epsilon') The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. M >= N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in,out] dA DOUBLE PRECISION array on the GPU, dimension (LDDA,N) On entry, the M-by-N coefficient matrix A. On exit, if iterative refinement has been successfully used (info.EQ.0 and ITER.GE.0, see description below), A is unchanged. If double precision factorization has been used (info.EQ.0 and ITER.LT.0, see description below), then the array dA contains the QR factorization of A as returned by function DGEQRF_GPU. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). @param[in,out] dB DOUBLE PRECISION array on the GPU, dimension (LDDB,NRHS) The M-by-NRHS right hand side matrix B. May be overwritten (e.g., if refinement fails). @param[in] lddb INTEGER The leading dimension of the array dB. LDDB >= max(1,M). @param[out] dX DOUBLE PRECISION array on the GPU, dimension (LDDX,NRHS) If info = 0, the N-by-NRHS solution matrix X. @param[in] lddx INTEGER The leading dimension of the array dX. LDDX >= max(1,N). @param[out] iter INTEGER - < 0: iterative refinement has failed, double precision factorization has been performed + -1 : the routine fell back to full precision for implementation- or machine-specific reasons + -2 : narrowing the precision induced an overflow, the routine fell back to full precision + -3 : failure of SGEQRF + -31: stop the iterative refinement after the 30th iteration - > 0: iterative refinement has been successfully used. Returns the number of iterations @param[out] info INTEGER - = 0: successful exit - < 0: if info = -i, the i-th argument had an illegal value @ingroup magma_dgels_driver ********************************************************************/ extern "C" magma_int_t magma_dsgeqrsv_gpu( magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaDouble_ptr dA, magma_int_t ldda, magmaDouble_ptr dB, magma_int_t lddb, magmaDouble_ptr dX, magma_int_t lddx, magma_int_t *iter, magma_int_t *info) { #define dB(i,j) (dB + (i) + (j)*lddb) #define dX(i,j) (dX + (i) + (j)*lddx) #define dR(i,j) (dR + (i) + (j)*lddr) #define dSX(i,j) (dSX + (i) + (j)*lddsx) double c_neg_one = MAGMA_D_NEG_ONE; double c_one = MAGMA_D_ONE; magma_int_t ione = 1; double *hworkd; float *hworks; double *tau; float *stau; magmaDouble_ptr dworkd; magmaFloat_ptr dworks; magmaDouble_ptr dR, dT; magmaFloat_ptr dSA, dSX, dST; double Xnrmv, Rnrmv; double Anrm, Xnrm, Rnrm, cte, eps; magma_int_t i, j, iiter, lddsa, lddsx, lddr, nb, lhwork, minmn, size, ldworkd; /* Check arguments */ *iter = 0; *info = 0; if ( m < 0 ) *info = -1; else if ( n < 0 || n > m ) *info = -2; else if ( nrhs < 0 ) *info = -3; else if ( ldda < max(1,m)) *info = -5; else if ( lddb < max(1,m)) *info = -7; else if ( lddx < max(1,n)) *info = -9; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if ( m == 0 || n == 0 || nrhs == 0 ) return *info; nb = magma_get_sgeqrf_nb(m); minmn= min(m, n); /* dSX contains both B and X, so must be max(m or lddb,n). */ lddsa = ldda; lddsx = max(lddb,n); lddr = lddb; /* * Allocate temporary buffers */ /* dworks(dSA + dSX + dST) */ size = lddsa*n + lddsx*nrhs + ( 2*minmn + ((n+31)/32)*32 )*nb; if (MAGMA_SUCCESS != magma_smalloc( &dworks, size )) { fprintf(stderr, "Allocation of dworks failed (%d)\n", (int) size); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dSA = dworks; dSX = dSA + lddsa*n; dST = dSX + lddsx*nrhs; /* dworkd(dR) = lddr*nrhs */ ldworkd = lddr*nrhs; if (MAGMA_SUCCESS != magma_dmalloc( &dworkd, ldworkd )) { magma_free( dworks ); fprintf(stderr, "Allocation of dworkd failed\n"); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dR = dworkd; /* hworks(workspace for cgeqrs + stau) = min(m,n) + lhworks */ lhwork = (m - n + nb)*(nrhs + nb) + nrhs*nb; size = lhwork + minmn; magma_smalloc_cpu( &hworks, size ); if ( hworks == NULL ) { magma_free( dworks ); magma_free( dworkd ); fprintf(stderr, "Allocation of hworks failed\n"); *info = MAGMA_ERR_HOST_ALLOC; return *info; } stau = hworks + lhwork; eps = lapackf77_dlamch("Epsilon"); Anrm = magmablas_dlange(MagmaInfNorm, m, n, dA, ldda, (double*)dworkd ); cte = Anrm * eps * pow((double)n, 0.5) * BWDMAX; /* * Convert to single precision */ magmablas_dlag2s( m, nrhs, dB, lddb, dSX, lddsx, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } magmablas_dlag2s( m, n, dA, ldda, dSA, lddsa, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // factor dSA in single precision magma_sgeqrf_gpu( m, n, dSA, lddsa, stau, dST, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // solve dSA*dSX = dB in single precision magma_sgeqrs_gpu( m, n, nrhs, dSA, lddsa, stau, dST, dSX, lddsx, hworks, lhwork, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // residual dR = dB - dA*dX in double precision magmablas_slag2d( n, nrhs, dSX, lddsx, dX, lddx, info ); magmablas_dlacpy( MagmaUpperLower, m, nrhs, dB, lddb, dR, lddr ); if ( nrhs == 1 ) { magma_dgemv( MagmaNoTrans, m, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_dgemm( MagmaNoTrans, MagmaNoTrans, m, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } // TODO: use MAGMA_D_ABS( dX(i,j) ) instead of dlange? for( j=0; j < nrhs; j++ ) { i = magma_idamax( n, dX(0,j), 1) - 1; magma_dgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_dlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_idamax ( m, dR(0,j), 1 ) - 1; magma_dgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_dlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto REFINEMENT; } } *iter = 0; /* Free workspaces */ magma_free( dworks ); magma_free( dworkd ); magma_free_cpu( hworks ); return *info; REFINEMENT: /* TODO: this iterative refinement algorithm works only for compatibile * systems (B in colspan of A). * See Matrix Computations (3rd ed) p. 267 for correct algorithm. */ for( iiter=1; iiter < ITERMAX; ) { *info = 0; // convert residual dR to single precision dSX magmablas_dlag2s( m, nrhs, dR, lddr, dSX, lddsx, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // solve dSA*dSX = R in single precision magma_sgeqrs_gpu( m, n, nrhs, dSA, lddsa, stau, dST, dSX, lddsx, hworks, lhwork, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // Add correction and setup residual // dX += dSX [including conversion] --and-- // dR[1:n] = dB[1:n] (only n rows, not whole m rows! -- useless if m > n) for( j=0; j < nrhs; j++ ) { magmablas_dsaxpycp( n, dSX(0,j), dX(0,j), dB(0,j), dR(0,j) ); } // dR = dB (whole m rows) magmablas_dlacpy( MagmaUpperLower, m, nrhs, dB, lddb, dR, lddr ); // residual dR = dB - dA*dX in double precision if ( nrhs == 1 ) { magma_dgemv( MagmaNoTrans, m, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_dgemm( MagmaNoTrans, MagmaNoTrans, m, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } /* Check whether the nrhs normwise backward errors satisfy the * stopping criterion. If yes, set ITER=IITER > 0 and return. */ for( j=0; j < nrhs; j++ ) { i = magma_idamax( n, dX(0,j), 1) - 1; magma_dgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_dlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_idamax ( m, dR(0,j), 1 ) - 1; magma_dgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_dlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto L20; } } /* If we are here, the nrhs normwise backward errors satisfy * the stopping criterion, we are good to exit. */ *iter = iiter; /* Free workspaces */ magma_free( dworks ); magma_free( dworkd ); magma_free_cpu( hworks ); return *info; L20: iiter++; } /* If we are at this place of the code, this is because we have * performed ITER=ITERMAX iterations and never satisified the * stopping criterion. Set up the ITER flag accordingly and follow * up on double precision routine. */ *iter = -ITERMAX - 1; FALLBACK: /* Single-precision iterative refinement failed to converge to a * satisfactory solution, so we resort to double precision. */ magma_free( dworks ); magma_free_cpu( hworks ); /* * Allocate temporary buffers */ /* dworkd = dT for dgeqrf */ nb = magma_get_dgeqrf_nb( m ); size = (2*min(m, n) + (n+31)/32*32 )*nb; if ( size > ldworkd ) { magma_free( dworkd ); if (MAGMA_SUCCESS != magma_dmalloc( &dworkd, size )) { fprintf(stderr, "Allocation of dworkd2 failed\n"); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } dT = dworkd; /* hworkd(dtau + workspace for dgeqrs) = min(m,n) + lhwork */ size = lhwork + minmn; magma_dmalloc_cpu( &hworkd, size ); if ( hworkd == NULL ) { magma_free( dworkd ); fprintf(stderr, "Allocation of hworkd2 failed\n"); *info = MAGMA_ERR_HOST_ALLOC; return *info; } tau = hworkd + lhwork; magma_dgeqrf_gpu( m, n, dA, ldda, tau, dT, info ); if (*info == 0) { // if m > n, then dB won't fit in dX, so solve with dB and copy n rows to dX magma_dgeqrs_gpu( m, n, nrhs, dA, ldda, tau, dT, dB, lddb, hworkd, lhwork, info ); magmablas_dlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dX, lddx ); } magma_free( dworkd ); magma_free_cpu( hworkd ); return *info; }
/** @deprecated Purpose ------- ZLAQPS computes a step of QR factorization with column pivoting of a complex M-by-N matrix A by using Blas-3. It tries to factorize NB columns from A starting from the row OFFSET+1, and updates all of the matrix with Blas-3 xGEMM. In some cases, due to catastrophic cancellations, it cannot factorize NB columns. Hence, the actual number of factorized columns is returned in KB. Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0 @param[in] offset INTEGER The number of rows of A that have been factorized in previous steps. @param[in] nb INTEGER The number of columns to factorize. @param[out] kb INTEGER The number of columns actually factorized. @param[in,out] dA COMPLEX_16 array, dimension (LDDA,N), on the GPU. On entry, the M-by-N matrix A. On exit, block A(OFFSET+1:M,1:KB) is the triangular factor obtained and block A(1:OFFSET,1:N) has been accordingly pivoted, but no factorized. The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has been updated. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,M). @param[in,out] jpvt INTEGER array, dimension (N) JPVT(I) = K <==> Column K of the full matrix A has been permuted into position I in AP. @param[out] tau COMPLEX_16 array, dimension (KB) The scalar factors of the elementary reflectors. @param[in,out] vn1 DOUBLE PRECISION array, dimension (N) The vector with the partial column norms. @param[in,out] vn2 DOUBLE PRECISION array, dimension (N) The vector with the exact column norms. @param[in,out] dauxv COMPLEX_16 array, dimension (NB), on the GPU Auxiliary vector. @param[in,out] dF COMPLEX_16 array, dimension (LDDF,NB), on the GPU Matrix F' = L*Y'*A. @param[in] lddf INTEGER The leading dimension of the array F. LDDF >= max(1,N). @ingroup magma_zgeqp3_aux ********************************************************************/ extern "C" magma_int_t magma_zlaqps_gpu( magma_int_t m, magma_int_t n, magma_int_t offset, magma_int_t nb, magma_int_t *kb, magmaDoubleComplex_ptr dA, magma_int_t ldda, magma_int_t *jpvt, magmaDoubleComplex *tau, double *vn1, double *vn2, magmaDoubleComplex_ptr dauxv, magmaDoubleComplex_ptr dF, magma_int_t lddf) { #define dA(i, j) (dA + (i) + (j)*(ldda)) #define dF(i, j) (dF + (i) + (j)*(lddf)) magmaDoubleComplex c_zero = MAGMA_Z_MAKE( 0.,0.); magmaDoubleComplex c_one = MAGMA_Z_MAKE( 1.,0.); magmaDoubleComplex c_neg_one = MAGMA_Z_MAKE(-1.,0.); magma_int_t ione = 1; magma_int_t i__1, i__2; //double d__1; magmaDoubleComplex z__1; //magma_int_t j; magma_int_t k, rk; //magmaDoubleComplex Akk; magmaDoubleComplex_ptr dAks; magmaDoubleComplex tauk = MAGMA_Z_ZERO; magma_int_t pvt; //double temp, temp2; double tol3z; magma_int_t itemp; double lsticc; magmaDouble_ptr dlsticcs; magma_dmalloc( &dlsticcs, 1+256*(n+255)/256 ); //lastrk = min( m, n + offset ); tol3z = magma_dsqrt( lapackf77_dlamch("Epsilon")); lsticc = 0; k = 0; magma_zmalloc( &dAks, nb ); while( k < nb && lsticc == 0 ) { rk = offset + k; /* Determine ith pivot column and swap if necessary */ // subtract 1 from Fortran/CUBLAS idamax; pvt, k are 0-based. pvt = k + magma_idamax( n-k, &vn1[k], ione ) - 1; if (pvt != k) { /*if (pvt >= nb) { // 1. Start copy from GPU magma_zgetmatrix_async( m - offset - nb, 1, dA(offset + nb, pvt), ldda, A (offset + nb, pvt), lda, stream ); }*/ /* F gets swapped so F must be sent at the end to GPU */ i__1 = k; /*if (pvt < nb) { // no need of transfer if pivot is within the panel blasf77_zswap( &m, A(0, pvt), &ione, A(0, k), &ione ); } else { // 1. Finish copy from GPU magma_queue_sync( stream ); // 2. Swap as usual on CPU blasf77_zswap(&m, A(0, pvt), &ione, A(0, k), &ione); // 3. Restore the GPU magma_zsetmatrix_async( m - offset - nb, 1, A (offset + nb, pvt), lda, dA(offset + nb, pvt), ldda, stream); }*/ magmablas_zswap( m, dA(0, pvt), ione, dA(0, k), ione ); //blasf77_zswap( &i__1, F(pvt,0), &ldf, F(k,0), &ldf ); magmablas_zswap( i__1, dF(pvt, 0), lddf, dF(k, 0), lddf); itemp = jpvt[pvt]; jpvt[pvt] = jpvt[k]; jpvt[k] = itemp; //vn1[pvt] = vn1[k]; //vn2[pvt] = vn2[k]; #if defined(PRECISION_d) || defined(PRECISION_z) //magma_dswap( 1, &vn1[pvt], 1, &vn1[k], 1 ); //magma_dswap( 1, &vn2[pvt], 1, &vn2[k], 1 ); magma_dswap( 2, &vn1[pvt], n+offset, &vn1[k], n+offset ); #else //magma_sswap( 1, &vn1[pvt], 1, &vn1[k], 1 ); //magma_sswap( 1, &vn2[pvt], 1, &vn2[k], 1 ); magma_sswap(2, &vn1[pvt], n+offset, &vn1[k], n+offset); #endif } /* Apply previous Householder reflectors to column K: A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'. Optimization: multiply with beta=0; wait for vector and subtract */ if (k > 0) { /*#if (defined(PRECISION_c) || defined(PRECISION_z)) for (j = 0; j < k; ++j) { *F(k,j) = MAGMA_Z_CNJG( *F(k,j) ); } #endif*/ //#define RIGHT_UPDATE #ifdef RIGHT_UPDATE i__1 = m - offset - nb; i__2 = k; magma_zgemv( MagmaNoTrans, i__1, i__2, c_neg_one, A(offset+nb, 0), lda, F(k, 0), ldf, c_one, A(offset+nb, k), ione ); #else i__1 = m - rk; i__2 = k; /*blasf77_zgemv( MagmaNoTransStr, &i__1, &i__2, &c_neg_one, A(rk, 0), &lda, F(k, 0), &ldf, &c_one, A(rk, k), &ione ); */ magma_zgemv( MagmaNoTrans, i__1, i__2, c_neg_one, dA(rk, 0), ldda, dF(k, 0), lddf, c_one, dA(rk, k), ione ); #endif /*#if (defined(PRECISION_c) || defined(PRECISION_z)) for (j = 0; j < k; ++j) { *F(k,j) = MAGMA_Z_CNJG( *F(k,j) ); } #endif*/ } /* Generate elementary reflector H(k). */ magma_zlarfg_gpu( m-rk, dA(rk, k), dA(rk + 1, k), &tau[k], &vn1[k], &dAks[k]); //Akk = *A(rk, k); //*A(rk, k) = c_one; //magma_zgetvector( 1, &dAks[k], 1, &Akk, 1 ); /* needed to avoid the race condition */ if (k == 0) magma_zsetvector( 1, &c_one, 1, dA(rk, k), 1 ); else magma_zcopymatrix( 1, 1, dA(offset, 0), 1, dA(rk, k), 1 ); /* Compute Kth column of F: Compute F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */ if (k < n-1 || k > 0) magma_zgetvector( 1, &tau[k], 1, &tauk, 1 ); if (k < n-1) { i__1 = m - rk; i__2 = n - k - 1; /* Send the vector to the GPU */ //magma_zsetmatrix( i__1, 1, A(rk, k), lda, dA(rk,k), ldda ); /* Multiply on GPU */ // was CALL ZGEMV( 'Conjugate transpose', M-RK+1, N-K, // TAU( K ), A( RK, K+1 ), LDA, // A( RK, K ), 1, // CZERO, F( K+1, K ), 1 ) //magma_zgetvector( 1, &tau[k], 1, &tauk, 1 ); magma_zgemv( MagmaConjTrans, m-rk, n-k-1, tauk, dA( rk, k+1 ), ldda, dA( rk, k ), 1, c_zero, dF( k+1, k ), 1 ); //magma_zscal( m-rk, tau[k], F( k+1, k), 1 ); //magma_int_t i__3 = nb-k-1; //magma_int_t i__4 = i__2 - i__3; //magma_int_t i__5 = nb-k; //magma_zgemv( MagmaConjTrans, i__1 - i__5, i__2 - i__3, // tau[k], dA(rk +i__5, k+1+i__3), ldda, // dA(rk +i__5, k ), ione, // c_zero, dF(k+1+i__3, k ), ione ); //magma_zgetmatrix_async( i__2-i__3, 1, // dF(k + 1 +i__3, k), i__2, // F (k + 1 +i__3, k), i__2, stream ); //blasf77_zgemv( MagmaConjTransStr, &i__1, &i__3, // &tau[k], A(rk, k+1), &lda, // A(rk, k ), &ione, // &c_zero, F(k+1, k ), &ione ); //magma_queue_sync( stream ); //blasf77_zgemv( MagmaConjTransStr, &i__5, &i__4, // &tau[k], A(rk, k+1+i__3), &lda, // A(rk, k ), &ione, // &c_one, F(k+1+i__3, k ), &ione ); } /* Padding F(1:K,K) with zeros. for (j = 0; j <= k; ++j) { magma_zsetvector( 1, &c_zero, 1, F(j, k), 1 ); }*/ /* Incremental updating of F: F(1:N,K) := F(1:N,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K). F(1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K) := tau(K)(A(RK:M,K+1:N)' - F(1:N,1:K-1)*A(RK:M,1:K-1)') A(RK:M,K) so, F is (updated A)*V */ //if (k > 0 && k < n-1) { if (k > 0) { //magma_zgetvector( 1, &tau[k], 1, &tauk, 1 ); z__1 = MAGMA_Z_NEGATE( tauk ); #ifdef RIGHT_UPDATE i__1 = m - offset - nb; i__2 = k; magma_zgemv( MagmaConjTrans, i__1, i__2, z__1, dA(offset+nb, 0), lda, dA(offset+nb, k), ione, c_zero, dauxv, ione ); i__1 = k; magma_zgemv( MagmaNoTrans, n-k-1, i__1, c_one, F(k+1,0), ldf, dauxv, ione, c_one, F(k+1,k), ione ); #else i__1 = m - rk; i__2 = k; //blasf77_zgemv( MagmaConjTransStr, &i__1, &i__2, // &z__1, A(rk, 0), &lda, // A(rk, k), &ione, // &c_zero, auxv, &ione ); magma_zgemv( MagmaConjTrans, i__1, i__2, z__1, dA(rk, 0), ldda, dA(rk, k), ione, c_zero, dauxv, ione ); //i__1 = k; //blasf77_zgemv( MagmaNoTransStr, &n, &i__1, // &c_one, F(0,0), &ldf, // auxv, &ione, // &c_one, F(0,k), &ione ); /*magma_zgemv( MagmaNoTrans, n, i__1, c_one, F(0,0), ldf, auxv, ione, c_one, F(0,k), ione ); */ /* I think we only need stricly lower-triangular part :) */ magma_zgemv( MagmaNoTrans, n-k-1, i__2, c_one, dF(k+1,0), lddf, dauxv, ione, c_one, dF(k+1,k), ione ); #endif } /* Optimization: On the last iteration start sending F back to the GPU */ /* Update the current row of A: A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'. */ if (k < n-1) { i__1 = n - k - 1; i__2 = k + 1; //blasf77_zgemm( MagmaNoTransStr, MagmaConjTransStr, &ione, &i__1, &i__2, // &c_neg_one, A(rk, 0 ), &lda, // F(k+1,0 ), &ldf, // &c_one, A(rk, k+1), &lda ); #ifdef RIGHT_UPDATE /* right-looking update of rows, */ magma_zgemm( MagmaNoTrans, MagmaConjTrans, nb-k, i__1, ione, c_neg_one, dA(rk, k ), ldda, dF(k+1, k ), lddf, c_one, dA(rk, k+1), ldda ); #else /* left-looking update of rows, * * since F=A'v with original A, so no right-looking */ magma_zgemm( MagmaNoTrans, MagmaConjTrans, ione, i__1, i__2, c_neg_one, dA(rk, 0 ), ldda, dF(k+1,0 ), lddf, c_one, dA(rk, k+1), ldda ); #endif } /* Update partial column norms. */ if (rk < min(m, n+offset)-1 ) { magmablas_dznrm2_row_check_adjust(n-k-1, tol3z, &vn1[k+1], &vn2[k+1], dA(rk,k+1), ldda, dlsticcs); magma_device_sync(); #if defined(PRECISION_d) || defined(PRECISION_z) magma_dgetvector( 1, &dlsticcs[0], 1, &lsticc, 1 ); #else magma_sgetvector( 1, &dlsticcs[0], 1, &lsticc, 1 ); #endif } /*if (rk < lastrk) { for (j = k + 1; j < n; ++j) { if (vn1[j] != 0.) { // NOTE: The following 4 lines follow from the analysis in // Lapack Working Note 176. temp = MAGMA_Z_ABS( *A(rk,j) ) / vn1[j]; temp = max( 0., ((1. + temp) * (1. - temp)) ); d__1 = vn1[j] / vn2[j]; temp2 = temp * (d__1 * d__1); if (temp2 <= tol3z) { vn2[j] = (double) lsticc; lsticc = j; } else { vn1[j] *= magma_dsqrt(temp); } } } }*/ //*A(rk, k) = Akk; //magma_zsetvector( 1, &Akk, 1, A(rk, k), 1 ); //magma_zswap( 1, &dAks[k], 1, A(rk, k), 1 ); ++k; } magma_zcopymatrix( 1, k, dAks, 1, dA(offset, 0), ldda+1 ); // leave k as the last column done --k; *kb = k + 1; rk = offset + *kb - 1; /* Apply the block reflector to the rest of the matrix: A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) - A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)' */ if (*kb < min(n, m - offset)) { i__1 = m - rk - 1; i__2 = n - *kb; /* Send F to the GPU magma_zsetmatrix( i__2, *kb, F (*kb, 0), ldf, dF(*kb, 0), i__2 ); */ magma_zgemm( MagmaNoTrans, MagmaConjTrans, i__1, i__2, *kb, c_neg_one, dA(rk+1, 0 ), ldda, dF(*kb, 0 ), lddf, c_one, dA(rk+1, *kb), ldda ); } /* Recomputation of difficult columns. */ if ( lsticc > 0 ) { // printf( " -- recompute dnorms --\n" ); magmablas_dznrm2_check( m-rk-1, n-*kb, dA(rk+1,*kb), ldda, &vn1[*kb], dlsticcs ); magma_dcopymatrix( n-*kb, 1, &vn1[*kb], *kb, &vn2[*kb], *kb ); /*while( lsticc > 0 ) { itemp = (magma_int_t)(vn2[lsticc] >= 0. ? floor(vn2[lsticc] + .5) : -floor(.5 - vn2[lsticc])); i__1 = m - rk - 1; if (lsticc <= nb) vn1[lsticc] = magma_cblas_dznrm2( i__1, A(rk+1,lsticc), ione ); else { // Where is the data, CPU or GPU ? double r1, r2; r1 = magma_cblas_dznrm2( nb-k, A(rk+1,lsticc), ione ); r2 = magma_dznrm2(m-offset-nb, dA(offset + nb + 1, lsticc), ione); vn1[lsticc] = magma_dsqrt(r1*r1+r2*r2); } // NOTE: The computation of VN1( LSTICC ) relies on the fact that // SNRM2 does not fail on vectors with norm below the value of SQRT(DLAMCH('S')) vn2[lsticc] = vn1[lsticc]; lsticc = itemp; */ } magma_free(dAks); magma_free(dlsticcs); return MAGMA_SUCCESS; } /* magma_zlaqps */
/** @deprecated Purpose ------- ZLAQPS computes a step of QR factorization with column pivoting of a complex M-by-N matrix A by using Blas-3. It tries to factorize NB columns from A starting from the row OFFSET+1, and updates all of the matrix with Blas-3 xGEMM. In some cases, due to catastrophic cancellations, it cannot factorize NB columns. Hence, the actual number of factorized columns is returned in KB. Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0 @param[in] offset INTEGER The number of rows of A that have been factorized in previous steps. @param[in] nb INTEGER The number of columns to factorize. @param[out] kb INTEGER The number of columns actually factorized. @param[in,out] dA COMPLEX_16 array, dimension (LDDA,N), on the GPU. On entry, the M-by-N matrix A. On exit, block A(OFFSET+1:M,1:KB) is the triangular factor obtained and block A(1:OFFSET,1:N) has been accordingly pivoted, but no factorized. The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has been updated. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,M). @param[in,out] jpvt INTEGER array, dimension (N) JPVT(I) = K <==> Column K of the full matrix A has been permuted into position I in AP. @param[out] tau COMPLEX_16 array, dimension (KB) The scalar factors of the elementary reflectors. @param[in,out] vn1 DOUBLE PRECISION array, dimension (N) The vector with the partial column norms. @param[in,out] vn2 DOUBLE PRECISION array, dimension (N) The vector with the exact column norms. @param[in,out] dauxv COMPLEX_16 array, dimension (NB), on the GPU Auxiliary vector. @param[in,out] dF COMPLEX_16 array, dimension (LDDF,NB), on the GPU Matrix F' = L*Y'*A. @param[in] lddf INTEGER The leading dimension of the array F. LDDF >= max(1,N). @ingroup magma_zgeqp3_aux ********************************************************************/ extern "C" magma_int_t magma_zlaqps_gpu( magma_int_t m, magma_int_t n, magma_int_t offset, magma_int_t nb, magma_int_t *kb, magmaDoubleComplex_ptr dA, magma_int_t ldda, magma_int_t *jpvt, magmaDoubleComplex *tau, double *vn1, double *vn2, magmaDoubleComplex_ptr dauxv, magmaDoubleComplex_ptr dF, magma_int_t lddf) { #define dA(i, j) (dA + (i) + (j)*(ldda)) #define dF(i, j) (dF + (i) + (j)*(lddf)) magmaDoubleComplex c_zero = MAGMA_Z_MAKE( 0.,0.); magmaDoubleComplex c_one = MAGMA_Z_MAKE( 1.,0.); magmaDoubleComplex c_neg_one = MAGMA_Z_MAKE(-1.,0.); magma_int_t ione = 1; magma_int_t i__1, i__2; magmaDoubleComplex z__1; magma_int_t k, rk; magmaDoubleComplex_ptr dAks; magmaDoubleComplex tauk = MAGMA_Z_ZERO; magma_int_t pvt; double tol3z; magma_int_t itemp; double lsticc; magmaDouble_ptr dlsticcs; magma_dmalloc( &dlsticcs, 1+256*(n+255)/256 ); tol3z = magma_dsqrt( lapackf77_dlamch("Epsilon")); lsticc = 0; k = 0; magma_zmalloc( &dAks, nb ); magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); while( k < nb && lsticc == 0 ) { rk = offset + k; /* Determine ith pivot column and swap if necessary */ // subtract 1 from Fortran/CUBLAS idamax; pvt, k are 0-based. pvt = k + magma_idamax( n-k, &vn1[k], ione, queue ) - 1; if (pvt != k) { /* F gets swapped so F must be sent at the end to GPU */ i__1 = k; magmablas_zswap( m, dA(0, pvt), ione, dA(0, k), ione, queue ); magmablas_zswap( i__1, dF(pvt, 0), lddf, dF(k, 0), lddf, queue ); itemp = jpvt[pvt]; jpvt[pvt] = jpvt[k]; jpvt[k] = itemp; magma_dswap( 2, &vn1[pvt], n+offset, &vn1[k], n+offset, queue ); } /* Apply previous Householder reflectors to column K: A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'. Optimization: multiply with beta=0; wait for vector and subtract */ if (k > 0) { //#define RIGHT_UPDATE #ifdef RIGHT_UPDATE i__1 = m - offset - nb; i__2 = k; magma_zgemv( MagmaNoTrans, i__1, i__2, c_neg_one, A(offset+nb, 0), lda, F(k, 0), ldf, c_one, A(offset+nb, k), ione, queue ); #else i__1 = m - rk; i__2 = k; magma_zgemv( MagmaNoTrans, i__1, i__2, c_neg_one, dA(rk, 0), ldda, dF(k, 0), lddf, c_one, dA(rk, k), ione, queue ); #endif } /* Generate elementary reflector H(k). */ magma_zlarfg_gpu( m-rk, dA(rk, k), dA(rk + 1, k), &tau[k], &vn1[k], &dAks[k], queue ); /* needed to avoid the race condition */ if (k == 0) magma_zsetvector( 1, &c_one, 1, dA(rk, k), 1, queue ); else magma_zcopymatrix( 1, 1, dA(offset, 0), 1, dA(rk, k), 1, queue ); /* Compute Kth column of F: Compute F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */ if (k < n-1 || k > 0) magma_zgetvector( 1, &tau[k], 1, &tauk, 1, queue ); if (k < n-1) { i__1 = m - rk; i__2 = n - k - 1; /* Multiply on GPU */ magma_zgemv( MagmaConjTrans, m-rk, n-k-1, tauk, dA( rk, k+1 ), ldda, dA( rk, k ), 1, c_zero, dF( k+1, k ), 1, queue ); } /* Incremental updating of F: F(1:N,K) := F(1:N,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K). F(1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K) := tau(K)(A(RK:M,K+1:N)' - F(1:N,1:K-1)*A(RK:M,1:K-1)') A(RK:M,K) so, F is (updated A)*V */ if (k > 0) { z__1 = MAGMA_Z_NEGATE( tauk ); #ifdef RIGHT_UPDATE i__1 = m - offset - nb; i__2 = k; magma_zgemv( MagmaConjTrans, i__1, i__2, z__1, dA(offset+nb, 0), lda, dA(offset+nb, k), ione, c_zero, dauxv, ione, queue ); i__1 = k; magma_zgemv( MagmaNoTrans, n-k-1, i__1, c_one, F(k+1,0), ldf, dauxv, ione, c_one, F(k+1,k), ione, queue ); #else i__1 = m - rk; i__2 = k; magma_zgemv( MagmaConjTrans, i__1, i__2, z__1, dA(rk, 0), ldda, dA(rk, k), ione, c_zero, dauxv, ione, queue ); /* I think we only need stricly lower-triangular part :) */ magma_zgemv( MagmaNoTrans, n-k-1, i__2, c_one, dF(k+1,0), lddf, dauxv, ione, c_one, dF(k+1,k), ione, queue ); #endif } /* Optimization: On the last iteration start sending F back to the GPU */ /* Update the current row of A: A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'. */ if (k < n-1) { i__1 = n - k - 1; i__2 = k + 1; #ifdef RIGHT_UPDATE /* right-looking update of rows, */ magma_zgemm( MagmaNoTrans, MagmaConjTrans, nb-k, i__1, ione, c_neg_one, dA(rk, k ), ldda, dF(k+1, k ), lddf, c_one, dA(rk, k+1), ldda, queue ); #else /* left-looking update of rows, * * since F=A'v with original A, so no right-looking */ magma_zgemm( MagmaNoTrans, MagmaConjTrans, ione, i__1, i__2, c_neg_one, dA(rk, 0 ), ldda, dF(k+1,0 ), lddf, c_one, dA(rk, k+1), ldda, queue ); #endif } /* Update partial column norms. */ if (rk < min(m, n+offset)-1 ) { magmablas_dznrm2_row_check_adjust( n-k-1, tol3z, &vn1[k+1], &vn2[k+1], dA(rk,k+1), ldda, dlsticcs, queue ); //magma_device_sync(); magma_dgetvector( 1, &dlsticcs[0], 1, &lsticc, 1, queue ); } ++k; } magma_zcopymatrix( 1, k, dAks, 1, dA(offset, 0), ldda+1, queue ); // leave k as the last column done --k; *kb = k + 1; rk = offset + *kb - 1; /* Apply the block reflector to the rest of the matrix: A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) - A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)' */ if (*kb < min(n, m - offset)) { i__1 = m - rk - 1; i__2 = n - *kb; magma_zgemm( MagmaNoTrans, MagmaConjTrans, i__1, i__2, *kb, c_neg_one, dA(rk+1, 0 ), ldda, dF(*kb, 0 ), lddf, c_one, dA(rk+1, *kb), ldda, queue ); } /* Recomputation of difficult columns. */ if ( lsticc > 0 ) { // printf( " -- recompute dnorms --\n" ); magmablas_dznrm2_check( m-rk-1, n-*kb, dA(rk+1,*kb), ldda, &vn1[*kb], dlsticcs, queue ); magma_dcopymatrix( n-*kb, 1, &vn1[*kb], *kb, &vn2[*kb], *kb, queue ); } magma_free( dAks ); magma_free( dlsticcs ); magma_queue_destroy( queue ); return MAGMA_SUCCESS; } /* magma_zlaqps */
/** Purpose ------- DSGESV computes the solution to a real system of linear equations A * X = B, A**T * X = B, or A**H * X = B, where A is an N-by-N matrix and X and B are N-by-NRHS matrices. DSGESV first attempts to factorize the matrix in real SINGLE PRECISION and use this factorization within an iterative refinement procedure to produce a solution with real DOUBLE PRECISION norm-wise backward error quality (see below). If the approach fails the method switches to a real DOUBLE PRECISION factorization and solve. The iterative refinement is not going to be a winning strategy if the ratio real SINGLE PRECISION performance over real DOUBLE PRECISION performance is too small. A reasonable strategy should take the number of right-hand sides and the size of the matrix into account. This might be done with a call to ILAENV in the future. Up to now, we always try iterative refinement. The iterative refinement process is stopped if ITER > ITERMAX or for all the RHS we have: RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX where o ITER is the number of the current iteration in the iterative refinement process o RNRM is the infinity-norm of the residual o XNRM is the infinity-norm of the solution o ANRM is the infinity-operator-norm of the matrix A o EPS is the machine epsilon returned by DLAMCH('Epsilon') The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively. Arguments --------- @param[in] trans magma_trans_t Specifies the form of the system of equations: - = MagmaNoTrans: A * X = B (No transpose) - = MagmaTrans: A**T * X = B (Transpose) - = MagmaConjTrans: A**H * X = B (Conjugate transpose) @param[in] n INTEGER The number of linear equations, i.e., the order of the matrix A. N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in,out] dA DOUBLE PRECISION array on the GPU, dimension (ldda,N) On entry, the N-by-N coefficient matrix A. On exit, if iterative refinement has been successfully used (info.EQ.0 and ITER.GE.0, see description below), A is unchanged. If double precision factorization has been used (info.EQ.0 and ITER.LT.0, see description below), then the array dA contains the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. @param[in] ldda INTEGER The leading dimension of the array dA. ldda >= max(1,N). @param[out] ipiv INTEGER array, dimension (N) The pivot indices that define the permutation matrix P; row i of the matrix was interchanged with row IPIV(i). Corresponds either to the single precision factorization (if info.EQ.0 and ITER.GE.0) or the double precision factorization (if info.EQ.0 and ITER.LT.0). @param[out] dipiv INTEGER array on the GPU, dimension (N) The pivot indices; for 1 <= i <= N, after permuting, row i of the matrix was moved to row dIPIV(i). Note this is different than IPIV, where interchanges are applied one-after-another. @param[in] dB DOUBLE PRECISION array on the GPU, dimension (lddb,NRHS) The N-by-NRHS right hand side matrix B. @param[in] lddb INTEGER The leading dimension of the array dB. lddb >= max(1,N). @param[out] dX DOUBLE PRECISION array on the GPU, dimension (lddx,NRHS) If info = 0, the N-by-NRHS solution matrix X. @param[in] lddx INTEGER The leading dimension of the array dX. lddx >= max(1,N). @param dworkd (workspace) DOUBLE PRECISION array on the GPU, dimension (N*NRHS) This array is used to hold the residual vectors. @param dworks (workspace) SINGLE PRECISION array on the GPU, dimension (N*(N+NRHS)) This array is used to store the real single precision matrix and the right-hand sides or solutions in single precision. @param[out] iter INTEGER - < 0: iterative refinement has failed, double precision factorization has been performed + -1 : the routine fell back to full precision for implementation- or machine-specific reasons + -2 : narrowing the precision induced an overflow, the routine fell back to full precision + -3 : failure of SGETRF + -31: stop the iterative refinement after the 30th iteration - > 0: iterative refinement has been successfully used. Returns the number of iterations @param[out] info INTEGER - = 0: successful exit - < 0: if info = -i, the i-th argument had an illegal value - > 0: if info = i, U(i,i) computed in DOUBLE PRECISION is exactly zero. The factorization has been completed, but the factor U is exactly singular, so the solution could not be computed. @ingroup magma_dgesv_driver ********************************************************************/ extern "C" magma_int_t magma_dsgesv_gpu(magma_trans_t trans, magma_int_t n, magma_int_t nrhs, double *dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *dipiv, double *dB, magma_int_t lddb, double *dX, magma_int_t lddx, double *dworkd, float *dworks, magma_int_t *iter, magma_int_t *info) { #define dB(i,j) (dB + (i) + (j)*lddb) #define dX(i,j) (dX + (i) + (j)*lddx) #define dR(i,j) (dR + (i) + (j)*lddr) double c_neg_one = MAGMA_D_NEG_ONE; double c_one = MAGMA_D_ONE; magma_int_t ione = 1; double *dR; float *dSA, *dSX; double Xnrmv, Rnrmv; double Anrm, Xnrm, Rnrm, cte, eps; magma_int_t i, j, iiter, lddsa, lddr; /* Check arguments */ *iter = 0; *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -2; else if ( ldda < max(1,n)) *info = -4; else if ( lddb < max(1,n)) *info = -8; else if ( lddx < max(1,n)) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if ( n == 0 || nrhs == 0 ) return *info; lddsa = n; lddr = n; dSA = dworks; dSX = dSA + lddsa*n; dR = dworkd; eps = lapackf77_dlamch("Epsilon"); Anrm = magmablas_dlange(MagmaInfNorm, n, n, dA, ldda, (double*)dworkd ); cte = Anrm * eps * pow((double)n, 0.5) * BWDMAX; /* * Convert to single precision */ //magmablas_dlag2s( n, nrhs, dB, lddb, dSX, lddsx, info ); // done inside dsgetrs with pivots if (*info != 0) { *iter = -2; goto FALLBACK; } magmablas_dlag2s( n, n, dA, ldda, dSA, lddsa, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // factor dSA in single precision magma_sgetrf_gpu( n, n, dSA, lddsa, ipiv, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // Generate parallel pivots { magma_int_t *newipiv; magma_imalloc_cpu( &newipiv, n ); if ( newipiv == NULL ) { *iter = -3; goto FALLBACK; } swp2pswp( trans, n, ipiv, newipiv ); magma_setvector( n, sizeof(magma_int_t), newipiv, 1, dipiv, 1 ); magma_free_cpu( newipiv ); } // solve dSA*dSX = dB in single precision // converts dB to dSX and applies pivots, solves, then converts result back to dX magma_dsgetrs_gpu( trans, n, nrhs, dSA, lddsa, dipiv, dB, lddb, dX, lddx, dSX, info ); // residual dR = dB - dA*dX in double precision magmablas_dlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dR, lddr ); if ( nrhs == 1 ) { magma_dgemv( trans, n, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_dgemm( trans, MagmaNoTrans, n, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } // TODO: use MAGMA_D_ABS( dX(i,j) ) instead of dlange? for( j=0; j < nrhs; j++ ) { i = magma_idamax( n, dX(0,j), 1) - 1; magma_dgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_dlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_idamax ( n, dR(0,j), 1 ) - 1; magma_dgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_dlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto REFINEMENT; } } *iter = 0; return *info; REFINEMENT: for( iiter=1; iiter < ITERMAX; ) { *info = 0; // convert residual dR to single precision dSX // solve dSA*dSX = R in single precision // convert result back to double precision dR // it's okay that dR is used for both dB input and dX output. magma_dsgetrs_gpu( trans, n, nrhs, dSA, lddsa, dipiv, dR, lddr, dR, lddr, dSX, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // Add correction and setup residual // dX += dR --and-- // dR = dB // This saves going through dR a second time (if done with one more kernel). // -- not really: first time is read, second time is write. for( j=0; j < nrhs; j++ ) { magmablas_daxpycp( n, dR(0,j), dX(0,j), dB(0,j) ); } // residual dR = dB - dA*dX in double precision if ( nrhs == 1 ) { magma_dgemv( trans, n, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_dgemm( trans, MagmaNoTrans, n, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } /* Check whether the nrhs normwise backward errors satisfy the * stopping criterion. If yes, set ITER=IITER > 0 and return. */ for( j=0; j < nrhs; j++ ) { i = magma_idamax( n, dX(0,j), 1) - 1; magma_dgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_dlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_idamax ( n, dR(0,j), 1 ) - 1; magma_dgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_dlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto L20; } } /* If we are here, the nrhs normwise backward errors satisfy * the stopping criterion, we are good to exit. */ *iter = iiter; return *info; L20: iiter++; } /* If we are at this place of the code, this is because we have * performed ITER=ITERMAX iterations and never satisified the * stopping criterion. Set up the ITER flag accordingly and follow * up on double precision routine. */ *iter = -ITERMAX - 1; FALLBACK: /* Single-precision iterative refinement failed to converge to a * satisfactory solution, so we resort to double precision. */ magma_dgetrf_gpu( n, n, dA, ldda, ipiv, info ); if (*info == 0) { magmablas_dlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dX, lddx ); magma_dgetrs_gpu( trans, n, nrhs, dA, ldda, ipiv, dX, lddx, info ); } return *info; }