void MAGMAF_ZGEQRF_GPU( magma_int_t *m, magma_int_t *n, devptr_t *dA, magma_int_t *ldda, cuDoubleComplex *tau, devptr_t *dT, magma_int_t *info) { magma_zgeqrf_gpu( *m, *n, DEVPTR(dA), *ldda, tau, DEVPTR(dT), info); }
extern "C" magma_int_t magma_zgels_gpu( char trans, magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaDoubleComplex *dA, magma_int_t ldda, magmaDoubleComplex *dB, magma_int_t lddb, magmaDoubleComplex *hwork, magma_int_t lwork, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= Solves the overdetermined, least squares problem min || A*X - C || using the QR factorization A. The underdetermined problem (m < n) is not currently handled. Arguments ========= TRANS (input) CHARACTER*1 = 'N': the linear system involves A. Only trans='N' is currently handled. M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. M >= N >= 0. NRHS (input) INTEGER The number of columns of the matrix C. NRHS >= 0. DA (input/output) COMPLEX_16 array on the GPU, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, A is overwritten by details of its QR factorization as returned by ZGEQRF. LDDA (input) INTEGER The leading dimension of the array A, LDDA >= M. DB (input/output) COMPLEX_16 array on the GPU, dimension (LDDB,NRHS) On entry, the M-by-NRHS matrix C. On exit, the N-by-NRHS solution matrix X. LDDB (input) INTEGER The leading dimension of the array DB. LDDB >= M. HWORK (workspace/output) COMPLEX_16 array, dimension MAX(1,LWORK). On exit, if INFO = 0, HWORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array HWORK, LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB, where NB is the blocksize given by magma_get_zgeqrf_nb( M ). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the HWORK array. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ magmaDoubleComplex *dT, *tau; magma_int_t k; magma_int_t nb = magma_get_zgeqrf_nb(m); magma_int_t lwkopt = (m - n + nb)*(nrhs + nb) + nrhs*nb; int lquery = (lwork == -1); hwork[0] = MAGMA_Z_MAKE( (double)lwkopt, 0. ); *info = 0; /* For now, N is the only case working */ if ( (trans != 'N') && (trans != 'n' ) ) *info = -1; else if (m < 0) *info = -2; else if (n < 0 || m < n) /* LQ is not handle for now*/ *info = -3; else if (nrhs < 0) *info = -4; else if (ldda < max(1,m)) *info = -6; else if (lddb < max(1,m)) *info = -8; else if (lwork < lwkopt && ! lquery) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { hwork[0] = MAGMA_Z_ONE; return *info; } /* * Allocate temporary buffers */ int ldtwork = ( 2*k + ((n+31)/32)*32 )*nb; if (nb < nrhs) ldtwork = ( 2*k + ((n+31)/32)*32 )*nrhs; if (MAGMA_SUCCESS != magma_zmalloc( &dT, ldtwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_zmalloc_cpu( &tau, k ); if ( tau == NULL ) { magma_free( dT ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_zgeqrf_gpu( m, n, dA, ldda, tau, dT, info ); if ( *info == 0 ) { magma_zgeqrs_gpu( m, n, nrhs, dA, ldda, tau, dT, dB, lddb, hwork, lwork, info ); } magma_free( dT ); magma_free_cpu(tau); return *info; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zungqr_gpu */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double Anorm, error, work[1]; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *hA, *hR, *tau, *h_work; magmaDoubleComplex_ptr dA, dT; magma_int_t m, n, k; magma_int_t n2, lda, ldda, lwork, min_mn, nb, info; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); opts.lapack |= opts.check; // check (-c) implies lapack (-l) printf(" m n k CPU GFlop/s (sec) GPU GFlop/s (sec) ||R|| / ||A||\n"); printf("=========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; if ( m < n || n < k ) { printf( "%5d %5d %5d skipping because m < n or n < k\n", (int) m, (int) n, (int) k ); continue; } lda = m; ldda = ((m + 31)/32)*32; n2 = lda*n; min_mn = min(m, n); nb = magma_get_zgeqrf_nb( m ); lwork = (m + 2*n+nb)*nb; gflops = FLOPS_ZUNGQR( m, n, k ) / 1e9; TESTING_MALLOC_PIN( hA, magmaDoubleComplex, lda*n ); TESTING_MALLOC_PIN( h_work, magmaDoubleComplex, lwork ); TESTING_MALLOC_CPU( hR, magmaDoubleComplex, lda*n ); TESTING_MALLOC_CPU( tau, magmaDoubleComplex, min_mn ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, ldda*n ); TESTING_MALLOC_DEV( dT, magmaDoubleComplex, ( 2*min_mn + ((n + 31)/32)*32 )*nb ); lapackf77_zlarnv( &ione, ISEED, &n2, hA ); lapackf77_zlacpy( MagmaFullStr, &m, &n, hA, &lda, hR, &lda ); Anorm = lapackf77_zlange("f", &m, &n, hA, &lda, work ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // first, get QR factors in both hA and dA // okay that magma_zgeqrf_gpu has special structure for R; R isn't used here. magma_zsetmatrix( m, n, hA, lda, dA, ldda ); magma_zgeqrf_gpu( m, n, dA, ldda, tau, dT, &info ); if (info != 0) printf("magma_zgeqrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_zgetmatrix( m, n, dA, ldda, hA, lda ); gpu_time = magma_wtime(); magma_zungqr_gpu( m, n, k, dA, ldda, tau, dT, nb, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zungqr_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); // Get dA back to the CPU to compare with the CPU result. magma_zgetmatrix( m, n, dA, ldda, hR, lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_zungqr( &m, &n, &k, hA, &lda, tau, h_work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zungqr returned error %d: %s.\n", (int) info, magma_strerror( info )); // compute relative error |R|/|A| := |Q_magma - Q_lapack|/|A| blasf77_zaxpy( &n2, &c_neg_one, hA, &ione, hR, &ione ); error = lapackf77_zlange("f", &m, &n, hR, &lda, work) / Anorm; bool okay = (error < tol); status += ! okay; printf("%5d %5d %5d %7.1f (%7.2f) %7.1f (%7.2f) %8.2e %s\n", (int) m, (int) n, (int) k, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (okay ? "ok" : "failed")); } else { printf("%5d %5d %5d --- ( --- ) %7.1f (%7.2f) --- \n", (int) m, (int) n, (int) k, gpu_perf, gpu_time ); } TESTING_FREE_PIN( hA ); TESTING_FREE_PIN( h_work ); TESTING_FREE_CPU( hR ); TESTING_FREE_CPU( tau ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dT ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_zqr( magma_int_t m, magma_int_t n, magma_z_matrix A, magma_int_t lda, magma_z_matrix *Q, magma_z_matrix *R, magma_queue_t queue ) { magma_int_t info = 0; // local constants const magmaDoubleComplex c_zero = MAGMA_Z_ZERO; // local variables magma_int_t inc = 1; magma_int_t k = min(m,n); magma_int_t ldt; magma_int_t nb; magmaDoubleComplex *tau = NULL; magmaDoubleComplex *dT = NULL; magmaDoubleComplex *dA = NULL; magma_z_matrix dR1 = {Magma_CSR}; // allocate CPU resources CHECK( magma_zmalloc_pinned( &tau, k ) ); // query number of blocks required for QR factorization nb = magma_get_zgeqrf_nb( m, n ); ldt = (2 * k + magma_roundup(n, 32)) * nb; CHECK( magma_zmalloc( &dT, ldt ) ); // get copy of matrix array if ( A.memory_location == Magma_DEV ) { dA = A.dval; } else { CHECK( magma_zmalloc( &dA, lda * n ) ); magma_zsetvector( lda * n, A.val, inc, dA, inc, queue ); } // QR factorization magma_zgeqrf_gpu( m, n, dA, lda, tau, dT, &info ); // construct R matrix if ( R != NULL ) { if ( A.memory_location == Magma_DEV ) { CHECK( magma_zvinit( R, Magma_DEV, lda, n, c_zero, queue ) ); magmablas_zlacpy( MagmaUpper, k, n, dA, lda, R->dval, lda, queue ); } else { CHECK( magma_zvinit( &dR1, Magma_DEV, lda, n, c_zero, queue ) ); magmablas_zlacpy( MagmaUpper, k, n, dA, lda, dR1.dval, lda, queue ); CHECK( magma_zvinit( R, Magma_CPU, lda, n, c_zero, queue ) ); magma_zgetvector( lda * n, dR1.dval, inc, R->val, inc, queue ); } } // construct Q matrix if ( Q != NULL ) { magma_zungqr_gpu( m, n, k, dA, lda, tau, dT, nb, &info ); if ( A.memory_location == Magma_DEV ) { CHECK( magma_zvinit( Q, Magma_DEV, lda, n, c_zero, queue ) ); magma_zcopyvector( lda * n, dA, inc, Q->dval, inc, queue ); } else { CHECK( magma_zvinit( Q, Magma_CPU, lda, n, c_zero, queue ) ); magma_zgetvector( lda * n, dA, inc, Q->val, inc, queue ); } } cleanup: if( info != 0 ){ magma_zmfree( Q, queue ); magma_zmfree( R, queue ); magma_zmfree( &dR1, queue ); } // free resources magma_free_pinned( tau ); magma_free( dT ); if ( A.memory_location == Magma_CPU ) { magma_free( dA ); } return info; }
/** Purpose ------- ZCGEQRSV solves the least squares problem min || A*X - B ||, where A is an M-by-N matrix and X and B are M-by-NRHS matrices. ZCGEQRSV first attempts to factorize the matrix in complex SINGLE PRECISION and use this factorization within an iterative refinement procedure to produce a solution with complex DOUBLE PRECISION norm-wise backward error quality (see below). If the approach fails the method switches to a complex DOUBLE PRECISION factorization and solve. The iterative refinement is not going to be a winning strategy if the ratio complex SINGLE PRECISION performance over complex DOUBLE PRECISION performance is too small. A reasonable strategy should take the number of right-hand sides and the size of the matrix into account. This might be done with a call to ILAENV in the future. Up to now, we always try iterative refinement. The iterative refinement process is stopped if ITER > ITERMAX or for all the RHS we have: RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX where o ITER is the number of the current iteration in the iterative refinement process o RNRM is the infinity-norm of the residual o XNRM is the infinity-norm of the solution o ANRM is the infinity-operator-norm of the matrix A o EPS is the machine epsilon returned by DLAMCH('Epsilon') The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. M >= N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in,out] dA COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the M-by-N coefficient matrix A. On exit, if iterative refinement has been successfully used (info.EQ.0 and ITER.GE.0, see description below), A is unchanged. If double precision factorization has been used (info.EQ.0 and ITER.LT.0, see description below), then the array dA contains the QR factorization of A as returned by function DGEQRF_GPU. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). @param[in,out] dB COMPLEX_16 array on the GPU, dimension (LDDB,NRHS) The M-by-NRHS right hand side matrix B. May be overwritten (e.g., if refinement fails). @param[in] lddb INTEGER The leading dimension of the array dB. LDDB >= max(1,M). @param[out] dX COMPLEX_16 array on the GPU, dimension (LDDX,NRHS) If info = 0, the N-by-NRHS solution matrix X. @param[in] lddx INTEGER The leading dimension of the array dX. LDDX >= max(1,N). @param[out] iter INTEGER - < 0: iterative refinement has failed, double precision factorization has been performed + -1 : the routine fell back to full precision for implementation- or machine-specific reasons + -2 : narrowing the precision induced an overflow, the routine fell back to full precision + -3 : failure of SGEQRF + -31: stop the iterative refinement after the 30th iteration - > 0: iterative refinement has been successfully used. Returns the number of iterations @param[out] info INTEGER - = 0: successful exit - < 0: if info = -i, the i-th argument had an illegal value @ingroup magma_zgels_driver ********************************************************************/ extern "C" magma_int_t magma_zcgeqrsv_gpu( magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaDoubleComplex_ptr dA, magma_int_t ldda, magmaDoubleComplex_ptr dB, magma_int_t lddb, magmaDoubleComplex_ptr dX, magma_int_t lddx, magma_int_t *iter, magma_int_t *info) { #define dB(i,j) (dB + (i) + (j)*lddb) #define dX(i,j) (dX + (i) + (j)*lddx) #define dR(i,j) (dR + (i) + (j)*lddr) #define dSX(i,j) (dSX + (i) + (j)*lddsx) magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex c_one = MAGMA_Z_ONE; magma_int_t ione = 1; magmaDoubleComplex *hworkd; magmaFloatComplex *hworks; magmaDoubleComplex *tau; magmaFloatComplex *stau; magmaDoubleComplex_ptr dworkd; magmaFloatComplex_ptr dworks; magmaDoubleComplex_ptr dR, dT; magmaFloatComplex_ptr dSA, dSX, dST; magmaDoubleComplex Xnrmv, Rnrmv; double Anrm, Xnrm, Rnrm, cte, eps; magma_int_t i, j, iiter, lddsa, lddsx, lddr, nb, lhwork, minmn, size, ldworkd; /* Check arguments */ *iter = 0; *info = 0; if ( m < 0 ) *info = -1; else if ( n < 0 || n > m ) *info = -2; else if ( nrhs < 0 ) *info = -3; else if ( ldda < max(1,m)) *info = -5; else if ( lddb < max(1,m)) *info = -7; else if ( lddx < max(1,n)) *info = -9; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if ( m == 0 || n == 0 || nrhs == 0 ) return *info; nb = magma_get_cgeqrf_nb(m); minmn= min(m, n); /* dSX contains both B and X, so must be max(m or lddb,n). */ lddsa = ldda; lddsx = max(lddb,n); lddr = lddb; /* * Allocate temporary buffers */ /* dworks(dSA + dSX + dST) */ size = lddsa*n + lddsx*nrhs + ( 2*minmn + ((n+31)/32)*32 )*nb; if (MAGMA_SUCCESS != magma_cmalloc( &dworks, size )) { fprintf(stderr, "Allocation of dworks failed (%d)\n", (int) size); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dSA = dworks; dSX = dSA + lddsa*n; dST = dSX + lddsx*nrhs; /* dworkd(dR) = lddr*nrhs */ ldworkd = lddr*nrhs; if (MAGMA_SUCCESS != magma_zmalloc( &dworkd, ldworkd )) { magma_free( dworks ); fprintf(stderr, "Allocation of dworkd failed\n"); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dR = dworkd; /* hworks(workspace for cgeqrs + stau) = min(m,n) + lhworks */ lhwork = (m - n + nb)*(nrhs + nb) + nrhs*nb; size = lhwork + minmn; magma_cmalloc_cpu( &hworks, size ); if ( hworks == NULL ) { magma_free( dworks ); magma_free( dworkd ); fprintf(stderr, "Allocation of hworks failed\n"); *info = MAGMA_ERR_HOST_ALLOC; return *info; } stau = hworks + lhwork; eps = lapackf77_dlamch("Epsilon"); Anrm = magmablas_zlange(MagmaInfNorm, m, n, dA, ldda, (double*)dworkd ); cte = Anrm * eps * pow((double)n, 0.5) * BWDMAX; /* * Convert to single precision */ magmablas_zlag2c( m, nrhs, dB, lddb, dSX, lddsx, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } magmablas_zlag2c( m, n, dA, ldda, dSA, lddsa, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // factor dSA in single precision magma_cgeqrf_gpu( m, n, dSA, lddsa, stau, dST, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // solve dSA*dSX = dB in single precision magma_cgeqrs_gpu( m, n, nrhs, dSA, lddsa, stau, dST, dSX, lddsx, hworks, lhwork, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // residual dR = dB - dA*dX in double precision magmablas_clag2z( n, nrhs, dSX, lddsx, dX, lddx, info ); magmablas_zlacpy( MagmaUpperLower, m, nrhs, dB, lddb, dR, lddr ); if ( nrhs == 1 ) { magma_zgemv( MagmaNoTrans, m, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } // TODO: use MAGMA_Z_ABS( dX(i,j) ) instead of zlange? for( j=0; j < nrhs; j++ ) { i = magma_izamax( n, dX(0,j), 1) - 1; magma_zgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_izamax ( m, dR(0,j), 1 ) - 1; magma_zgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto REFINEMENT; } } *iter = 0; /* Free workspaces */ magma_free( dworks ); magma_free( dworkd ); magma_free_cpu( hworks ); return *info; REFINEMENT: /* TODO: this iterative refinement algorithm works only for compatibile * systems (B in colspan of A). * See Matrix Computations (3rd ed) p. 267 for correct algorithm. */ for( iiter=1; iiter < ITERMAX; ) { *info = 0; // convert residual dR to single precision dSX magmablas_zlag2c( m, nrhs, dR, lddr, dSX, lddsx, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // solve dSA*dSX = R in single precision magma_cgeqrs_gpu( m, n, nrhs, dSA, lddsa, stau, dST, dSX, lddsx, hworks, lhwork, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // Add correction and setup residual // dX += dSX [including conversion] --and-- // dR[1:n] = dB[1:n] (only n rows, not whole m rows! -- useless if m > n) for( j=0; j < nrhs; j++ ) { magmablas_zcaxpycp( n, dSX(0,j), dX(0,j), dB(0,j), dR(0,j) ); } // dR = dB (whole m rows) magmablas_zlacpy( MagmaUpperLower, m, nrhs, dB, lddb, dR, lddr ); // residual dR = dB - dA*dX in double precision if ( nrhs == 1 ) { magma_zgemv( MagmaNoTrans, m, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } /* Check whether the nrhs normwise backward errors satisfy the * stopping criterion. If yes, set ITER=IITER > 0 and return. */ for( j=0; j < nrhs; j++ ) { i = magma_izamax( n, dX(0,j), 1) - 1; magma_zgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_izamax ( m, dR(0,j), 1 ) - 1; magma_zgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto L20; } } /* If we are here, the nrhs normwise backward errors satisfy * the stopping criterion, we are good to exit. */ *iter = iiter; /* Free workspaces */ magma_free( dworks ); magma_free( dworkd ); magma_free_cpu( hworks ); return *info; L20: iiter++; } /* If we are at this place of the code, this is because we have * performed ITER=ITERMAX iterations and never satisified the * stopping criterion. Set up the ITER flag accordingly and follow * up on double precision routine. */ *iter = -ITERMAX - 1; FALLBACK: /* Single-precision iterative refinement failed to converge to a * satisfactory solution, so we resort to double precision. */ magma_free( dworks ); magma_free_cpu( hworks ); /* * Allocate temporary buffers */ /* dworkd = dT for zgeqrf */ nb = magma_get_zgeqrf_nb( m ); size = (2*min(m, n) + (n+31)/32*32 )*nb; if ( size > ldworkd ) { magma_free( dworkd ); if (MAGMA_SUCCESS != magma_zmalloc( &dworkd, size )) { fprintf(stderr, "Allocation of dworkd2 failed\n"); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } dT = dworkd; /* hworkd(dtau + workspace for zgeqrs) = min(m,n) + lhwork */ size = lhwork + minmn; magma_zmalloc_cpu( &hworkd, size ); if ( hworkd == NULL ) { magma_free( dworkd ); fprintf(stderr, "Allocation of hworkd2 failed\n"); *info = MAGMA_ERR_HOST_ALLOC; return *info; } tau = hworkd + lhwork; magma_zgeqrf_gpu( m, n, dA, ldda, tau, dT, info ); if (*info == 0) { // if m > n, then dB won't fit in dX, so solve with dB and copy n rows to dX magma_zgeqrs_gpu( m, n, nrhs, dA, ldda, tau, dT, dB, lddb, hworkd, lhwork, info ); magmablas_zlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dX, lddx ); } magma_free( dworkd ); magma_free_cpu( hworkd ); return *info; }
extern "C" magma_int_t magma_zcgeqrsv_gpu(magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaDoubleComplex *dA, magma_int_t ldda, magmaDoubleComplex *dB, magma_int_t lddb, magmaDoubleComplex *dX, magma_int_t lddx, magma_int_t *iter, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= ZCGEQRSV solves the least squares problem min || A*X - B ||, where A is an M-by-N matrix and X and B are M-by-NRHS matrices. ZCGEQRSV first attempts to factorize the matrix in complex SINGLE PRECISION and use this factorization within an iterative refinement procedure to produce a solution with complex DOUBLE PRECISION norm-wise backward error quality (see below). If the approach fails the method switches to a complex DOUBLE PRECISION factorization and solve. The iterative refinement is not going to be a winning strategy if the ratio complex SINGLE PRECISION performance over complex DOUBLE PRECISION performance is too small. A reasonable strategy should take the number of right-hand sides and the size of the matrix into account. This might be done with a call to ILAENV in the future. Up to now, we always try iterative refinement. The iterative refinement process is stopped if ITER > ITERMAX or for all the RHS we have: RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX where o ITER is the number of the current iteration in the iterative refinement process o RNRM is the infinity-norm of the residual o XNRM is the infinity-norm of the solution o ANRM is the infinity-operator-norm of the matrix A o EPS is the machine epsilon returned by DLAMCH('Epsilon') The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. M >= N >= 0. NRHS (input) INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. dA (input or input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the M-by-N coefficient matrix A. On exit, if iterative refinement has been successfully used (info.EQ.0 and ITER.GE.0, see description below), A is unchanged. If double precision factorization has been used (info.EQ.0 and ITER.LT.0, see description below), then the array dA contains the QR factorization of A as returned by function DGEQRF_GPU. LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). dB (input or input/output) COMPLEX_16 array on the GPU, dimension (LDDB,NRHS) The M-by-NRHS right hand side matrix B. May be overwritten (e.g., if refinement fails). LDDB (input) INTEGER The leading dimension of the array dB. LDDB >= max(1,M). dX (output) COMPLEX_16 array on the GPU, dimension (LDDX,NRHS) If info = 0, the N-by-NRHS solution matrix X. LDDX (input) INTEGER The leading dimension of the array dX. LDDX >= max(1,N). ITER (output) INTEGER < 0: iterative refinement has failed, double precision factorization has been performed -1 : the routine fell back to full precision for implementation- or machine-specific reasons -2 : narrowing the precision induced an overflow, the routine fell back to full precision -3 : failure of SGEQRF -31: stop the iterative refinement after the 30th iteration > 0: iterative refinement has been successfully used. Returns the number of iterations INFO (output) INTEGER = 0: successful exit < 0: if info = -i, the i-th argument had an illegal value ===================================================================== */ #define dB(i,j) (dB + (i) + (j)*lddb) #define dX(i,j) (dX + (i) + (j)*lddx) #define dR(i,j) (dR + (i) + (j)*lddr) #define dSX(i,j) (dSX + (i) + (j)*lddsx) magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex c_one = MAGMA_Z_ONE; magma_int_t ione = 1; magmaDoubleComplex *dworkd, *hworkd; magmaFloatComplex *dworks, *hworks; magmaDoubleComplex *dR, *tau, *dT; magmaFloatComplex *dSA, *dSX, *dST, *stau; magmaDoubleComplex Xnrmv, Rnrmv; double Anrm, Xnrm, Rnrm, cte, eps; magma_int_t i, j, iiter, lddsa, lddsx, lddr, nb, lhwork, minmn, size, ldworkd; /* Check arguments */ *iter = 0; *info = 0; if ( m < 0 ) *info = -1; else if ( n < 0 || n > m ) *info = -2; else if ( nrhs < 0 ) *info = -3; else if ( ldda < max(1,m)) *info = -5; else if ( lddb < max(1,m)) *info = -7; else if ( lddx < max(1,n)) *info = -9; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if ( m == 0 || n == 0 || nrhs == 0 ) return *info; nb = magma_get_cgeqrf_nb(m); minmn= min(m, n); /* dSX contains both B and X, so must be max(m or lddb,n). */ lddsa = ldda; lddsx = max(lddb,n); lddr = lddb; /* * Allocate temporary buffers */ /* dworks(dSA + dSX + dST) */ size = lddsa*n + lddsx*nrhs + ( 2*minmn + ((n+31)/32)*32 )*nb; if (MAGMA_SUCCESS != magma_cmalloc( &dworks, size )) { fprintf(stderr, "Allocation of dworks failed (%d)\n", (int) size); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dSA = dworks; dSX = dSA + lddsa*n; dST = dSX + lddsx*nrhs; /* dworkd(dR) = lddr*nrhs */ ldworkd = lddr*nrhs; if (MAGMA_SUCCESS != magma_zmalloc( &dworkd, ldworkd )) { magma_free( dworks ); fprintf(stderr, "Allocation of dworkd failed\n"); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dR = dworkd; /* hworks(workspace for cgeqrs + stau) = min(m,n) + lhworks */ lhwork = (m - n + nb)*(nrhs + nb) + nrhs*nb; size = lhwork + minmn; magma_cmalloc_cpu( &hworks, size ); if ( hworks == NULL ) { magma_free( dworks ); magma_free( dworkd ); fprintf(stderr, "Allocation of hworks failed\n"); *info = MAGMA_ERR_HOST_ALLOC; return *info; } stau = hworks + lhwork; eps = lapackf77_dlamch("Epsilon"); Anrm = magmablas_zlange('I', m, n, dA, ldda, (double*)dworkd ); cte = Anrm * eps * pow((double)n, 0.5) * BWDMAX; /* * Convert to single precision */ magmablas_zlag2c( m, nrhs, dB, lddb, dSX, lddsx, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } magmablas_zlag2c( m, n, dA, ldda, dSA, lddsa, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // factor dSA in single precision magma_cgeqrf_gpu( m, n, dSA, lddsa, stau, dST, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // solve dSA*dSX = dB in single precision magma_cgeqrs_gpu( m, n, nrhs, dSA, lddsa, stau, dST, dSX, lddsx, hworks, lhwork, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // residual dR = dB - dA*dX in double precision magmablas_clag2z( n, nrhs, dSX, lddsx, dX, lddx, info ); magmablas_zlacpy( MagmaUpperLower, m, nrhs, dB, lddb, dR, lddr ); if ( nrhs == 1 ) { magma_zgemv( MagmaNoTrans, m, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } // TODO: use MAGMA_Z_ABS( dX(i,j) ) instead of zlange? for( j=0; j < nrhs; j++ ) { i = magma_izamax( n, dX(0,j), 1) - 1; magma_zgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_izamax ( m, dR(0,j), 1 ) - 1; magma_zgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto REFINEMENT; } } *iter = 0; /* Free workspaces */ magma_free( dworks ); magma_free( dworkd ); magma_free_cpu( hworks ); return *info; REFINEMENT: /* TODO: this iterative refinement algorithm works only for compatibile * systems (B in colspan of A). * See Matrix Computations (3rd ed) p. 267 for correct algorithm. */ for( iiter=1; iiter < ITERMAX; ) { *info = 0; // convert residual dR to single precision dSX magmablas_zlag2c( m, nrhs, dR, lddr, dSX, lddsx, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // solve dSA*dSX = R in single precision magma_cgeqrs_gpu( m, n, nrhs, dSA, lddsa, stau, dST, dSX, lddsx, hworks, lhwork, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // Add correction and setup residual // dX += dSX [including conversion] --and-- // dR[1:n] = dB[1:n] (only n rows, not whole m rows! -- useless if m > n) for( j=0; j < nrhs; j++ ) { magmablas_zcaxpycp( n, dSX(0,j), dX(0,j), dB(0,j), dR(0,j) ); } // dR = dB (whole m rows) magmablas_zlacpy( MagmaUpperLower, m, nrhs, dB, lddb, dR, lddr ); // residual dR = dB - dA*dX in double precision if ( nrhs == 1 ) { magma_zgemv( MagmaNoTrans, m, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } /* Check whether the nrhs normwise backward errors satisfy the * stopping criterion. If yes, set ITER=IITER>0 and return. */ for( j=0; j < nrhs; j++ ) { i = magma_izamax( n, dX(0,j), 1) - 1; magma_zgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_izamax ( m, dR(0,j), 1 ) - 1; magma_zgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto L20; } } /* If we are here, the nrhs normwise backward errors satisfy * the stopping criterion, we are good to exit. */ *iter = iiter; /* Free workspaces */ magma_free( dworks ); magma_free( dworkd ); magma_free_cpu( hworks ); return *info; L20: iiter++; } /* If we are at this place of the code, this is because we have * performed ITER=ITERMAX iterations and never satisified the * stopping criterion. Set up the ITER flag accordingly and follow * up on double precision routine. */ *iter = -ITERMAX - 1; FALLBACK: /* Single-precision iterative refinement failed to converge to a * satisfactory solution, so we resort to double precision. */ magma_free( dworks ); magma_free_cpu( hworks ); /* * Allocate temporary buffers */ /* dworkd = dT for zgeqrf */ nb = magma_get_zgeqrf_nb( m ); size = (2*min(m, n) + (n+31)/32*32 )*nb; if ( size > ldworkd ) { magma_free( dworkd ); if (MAGMA_SUCCESS != magma_zmalloc( &dworkd, size )) { fprintf(stderr, "Allocation of dworkd2 failed\n"); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } dT = dworkd; /* hworkd(dtau + workspace for zgeqrs) = min(m,n) + lhwork */ size = lhwork + minmn; magma_zmalloc_cpu( &hworkd, size ); if ( hworkd == NULL ) { magma_free( dworkd ); fprintf(stderr, "Allocation of hworkd2 failed\n"); *info = MAGMA_ERR_HOST_ALLOC; return *info; } tau = hworkd + lhwork; magma_zgeqrf_gpu( m, n, dA, ldda, tau, dT, info ); if (*info == 0) { // if m > n, then dB won't fit in dX, so solve with dB and copy n rows to dX magma_zgeqrs_gpu( m, n, nrhs, dA, ldda, tau, dT, dB, lddb, hworkd, lhwork, info ); magmablas_zlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dX, lddx ); } magma_free( dworkd ); magma_free_cpu( hworkd ); return *info; }
void MAGMA_ZGEQRF_GPU(magma_int_t *m, magma_int_t *n, double2 *A, magma_int_t *lda, double2 *tau, double2 *dwork, magma_int_t *info) { magma_zgeqrf_gpu(*m, *n, A, *lda, tau, dwork, info); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zungqr_gpu */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *hA, *hR, *tau, *h_work; magmaDoubleComplex *dA, *dT; magma_int_t m, n, k; magma_int_t n2, lda, ldda, lwork, min_mn, nb, info; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) printf(" m n k CPU GFlop/s (sec) GPU GFlop/s (sec) ||R|| / ||A||\n"); printf("=========================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[i]; n = opts.nsize[i]; k = opts.ksize[i]; if ( m < n || n < k ) { printf( "skipping m %d, n %d, k %d because m < n or n < k\n", (int) m, (int) n, (int) k ); continue; } lda = m; ldda = ((m + 31)/32)*32; n2 = lda*n; min_mn = min(m, n); nb = magma_get_zgeqrf_nb( m ); lwork = (m + 2*n+nb)*nb; gflops = FLOPS_ZUNGQR( m, n, k ) / 1e9; TESTING_MALLOC_PIN( hA, magmaDoubleComplex, lda*n ); TESTING_MALLOC_PIN( h_work, magmaDoubleComplex, lwork ); TESTING_MALLOC_CPU( hR, magmaDoubleComplex, lda*n ); TESTING_MALLOC_CPU( tau, magmaDoubleComplex, min_mn ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, ldda*n ); TESTING_MALLOC_DEV( dT, magmaDoubleComplex, ( 2*min_mn + ((n + 31)/32)*32 )*nb ); lapackf77_zlarnv( &ione, ISEED, &n2, hA ); lapackf77_zlacpy( MagmaUpperLowerStr, &m, &n, hA, &lda, hR, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zsetmatrix( m, n, hA, lda, dA, ldda ); magma_zgeqrf_gpu( m, n, dA, ldda, tau, dT, &info ); if (info != 0) printf("magma_zgeqrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); gpu_time = magma_wtime(); magma_zungqr_gpu( m, n, k, dA, ldda, tau, dT, nb, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zungqr_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); // Get dA back to the CPU to compare with the CPU result. magma_zgetmatrix( m, n, dA, ldda, hR, lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { error = lapackf77_zlange("f", &m, &n, hA, &lda, work ); lapackf77_zgeqrf( &m, &n, hA, &lda, tau, h_work, &lwork, &info ); if (info != 0) printf("lapackf77_zgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); cpu_time = magma_wtime(); lapackf77_zungqr( &m, &n, &k, hA, &lda, tau, h_work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zungqr returned error %d: %s.\n", (int) info, magma_strerror( info )); // compute relative error |R|/|A| := |Q_magma - Q_lapack|/|A| blasf77_zaxpy( &n2, &c_neg_one, hA, &ione, hR, &ione ); error = lapackf77_zlange("f", &m, &n, hR, &lda, work) / error; printf("%5d %5d %5d %7.1f (%7.2f) %7.1f (%7.2f) %8.2e\n", (int) m, (int) n, (int) k, cpu_perf, cpu_time, gpu_perf, gpu_time, error ); } else { printf("%5d %5d %5d --- ( --- ) %7.1f (%7.2f) --- \n", (int) m, (int) n, (int) k, gpu_perf, gpu_time ); } TESTING_FREE_PIN( hA ); TESTING_FREE_PIN( h_work ); TESTING_FREE_CPU( hR ); TESTING_FREE_CPU( tau ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dT ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
/** Purpose ------- Solves the overdetermined, least squares problem min || A*X - C || using the QR factorization A. The underdetermined problem (m < n) is not currently handled. Arguments --------- @param[in] trans magma_trans_t - = MagmaNoTrans: the linear system involves A. Only TRANS=MagmaNoTrans is currently handled. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. M >= N >= 0. @param[in] nrhs INTEGER The number of columns of the matrix C. NRHS >= 0. @param[in,out] dA COMPLEX_16 array on the GPU, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, A is overwritten by details of its QR factorization as returned by ZGEQRF. @param[in] ldda INTEGER The leading dimension of the array A, LDDA >= M. @param[in,out] dB COMPLEX_16 array on the GPU, dimension (LDDB,NRHS) On entry, the M-by-NRHS matrix C. On exit, the N-by-NRHS solution matrix X. @param[in] lddb INTEGER The leading dimension of the array dB. LDDB >= M. @param[out] hwork (workspace) COMPLEX_16 array, dimension MAX(1,LWORK). On exit, if INFO = 0, HWORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array HWORK, LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB, where NB is the blocksize given by magma_get_zgeqrf_nb( M ). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the HWORK array. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_zgels_driver ********************************************************************/ extern "C" magma_int_t magma_zgels_gpu( magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaDoubleComplex_ptr dA, magma_int_t ldda, magmaDoubleComplex_ptr dB, magma_int_t lddb, magmaDoubleComplex *hwork, magma_int_t lwork, magma_int_t *info) { magmaDoubleComplex_ptr dT; magmaDoubleComplex *tau; magma_int_t k; magma_int_t nb = magma_get_zgeqrf_nb(m); magma_int_t lwkopt = (m - n + nb)*(nrhs + nb) + nrhs*nb; int lquery = (lwork == -1); hwork[0] = MAGMA_Z_MAKE( (double)lwkopt, 0. ); *info = 0; /* For now, N is the only case working */ if ( trans != MagmaNoTrans ) *info = -1; else if (m < 0) *info = -2; else if (n < 0 || m < n) /* LQ is not handle for now*/ *info = -3; else if (nrhs < 0) *info = -4; else if (ldda < max(1,m)) *info = -6; else if (lddb < max(1,m)) *info = -8; else if (lwork < lwkopt && ! lquery) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { hwork[0] = MAGMA_Z_ONE; return *info; } /* * Allocate temporary buffers */ int ldtwork = ( 2*k + ((n+31)/32)*32 )*nb; if (nb < nrhs) ldtwork = ( 2*k + ((n+31)/32)*32 )*nrhs; if (MAGMA_SUCCESS != magma_zmalloc( &dT, ldtwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_zmalloc_cpu( &tau, k ); if ( tau == NULL ) { magma_free( dT ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_zgeqrf_gpu( m, n, dA, ldda, tau, dT, info ); if ( *info == 0 ) { magma_zgeqrs_gpu( m, n, nrhs, dA, ldda, tau, dT, dB, lddb, hwork, lwork, info ); } magma_free( dT ); magma_free_cpu(tau); return *info; }
extern "C" magma_int_t magma_zcgeqrsv_gpu(magma_int_t M, magma_int_t N, magma_int_t NRHS, cuDoubleComplex *dA, magma_int_t ldda, cuDoubleComplex *dB, magma_int_t lddb, cuDoubleComplex *dX, magma_int_t lddx, magma_int_t *iter, magma_int_t *info) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= ZCGEQRSV solves the least squares problem min || A*X - B ||, where A is an M-by-N matrix and X and B are M-by-NRHS matrices. ZCGEQRSV first attempts to factorize the matrix in SINGLE PRECISION and use this factorization within an iterative refinement procedure to produce a solution with DOUBLE PRECISION norm-wise backward error quality (see below). If the approach fails the method switches to a DOUBLE PRECISION factorization and solve. The iterative refinement is not going to be a winning strategy if the ratio SINGLE PRECISION performance over DOUBLE PRECISION performance is too small. A reasonable strategy should take the number of right-hand sides and the size of the matrix into account. This might be done with a call to ILAENV in the future. Up to now, we always try iterative refinement. The iterative refinement process is stopped if ITER > ITERMAX or for all the RHS we have: RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX where o ITER is the number of the current iteration in the iterative refinement process o RNRM is the infinity-norm of the residual o XNRM is the infinity-norm of the solution o ANRM is the infinity-operator-norm of the matrix A o EPS is the machine epsilon returned by DLAMCH('Epsilon') The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. M >= N >= 0. NRHS (input) INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. A (input or input/output) DOUBLE PRECISION array, dimension (ldda,N) On entry, the M-by-N coefficient matrix A. On exit, if iterative refinement has been successfully used (info.EQ.0 and ITER.GE.0, see description below), A is unchanged. If double precision factorization has been used (info.EQ.0 and ITER.LT.0, see description below), then the array A contains the QR factorization of A as returned by function DGEQRF_GPU. ldda (input) INTEGER The leading dimension of the array A. ldda >= max(1,M). B (input) DOUBLE PRECISION array, dimension (LDB,NRHS) The M-by-NRHS right hand side matrix B. LDB (input) INTEGER The leading dimension of the array B. LDB >= max(1,M). X (output) DOUBLE PRECISION array, dimension (LDX,NRHS) If info = 0, the N-by-NRHS solution matrix X. LDX (input) INTEGER The leading dimension of the array X. LDX >= max(1,N). WORK (workspace) DOUBLE PRECISION array, dimension (N*NRHS) This array is used to hold the residual vectors. SWORK (workspace) REAL array, dimension (M*(N+NRHS)) This array is used to store the single precision matrix and the right-hand sides or solutions in single precision. ITER (output) INTEGER < 0: iterative refinement has failed, double precision factorization has been performed -1 : the routine fell back to full precision for implementation- or machine-specific reasons -2 : narrowing the precision induced an overflow, the routine fell back to full precision -3 : failure of SGETRF -31: stop the iterative refinement after the 30th iterations > 0: iterative refinement has been successfully used. Returns the number of iterations info (output) INTEGER = 0: successful exit < 0: if info = -i, the i-th argument had an illegal value TAU (output) REAL array, dimension (N) On exit, TAU(i) contains the scalar factor of the elementary reflector H(i), as returned by magma_cgeqrf_gpu. LWORK (input) INTEGER The dimension of the array H_WORK. LWORK >= (M+N+NB)*NB, where NB can be obtained through magma_get_sgeqrf_nb(M). H_WORK (workspace/output) REAL array, dimension (MAX(1,LWORK)) Higher performance is achieved if H_WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. D_WORK (workspace/output) REAL array on the GPU, dimension 2*N*NB, where NB can be obtained through magma_get_sgeqrf_nb(M). It starts with NB*NB blocks that store the triangular T matrices, followed by the NB*NB blocks of the diagonal inverses for the R matrix. TAU_D (output) DOUBLE REAL array, dimension (N) On exit, if the matrix had to be factored in double precision, TAU(i) contains the scalar factor of the elementary reflector H(i), as returned by magma_zgeqrf_gpu. LWORK_D (input) INTEGER The dimension of the array H_WORK_D. LWORK_D >= (M+N+NB)*NB, where NB can be obtained through magma_get_dgeqrf_nb(M). H_WORK_D (workspace/output) DOUBLE REAL array, dimension (MAX(1,LWORK_D)) This memory is unattached if the iterative refinement worked, otherwise it is used as workspace to factor the matrix in double precision. Higher performance is achieved if H_WORK_D is in pinned memory, e.g. allocated using magma_malloc_pinned. D_WORK_D (workspace/output) DOUBLE REAL array on the GPU, dimension 2*N*NB, where NB can be obtained through magma_get_dgeqrf_nb(M). This memory is unattached if the iterative refinement worked, otherwise it is used as workspace to factor the matrix in double precision. It starts with NB*NB blocks that store the triangular T matrices, followed by the NB*NB blocks of the diagonal inverses for the R matrix. ===================================================================== */ cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; cuDoubleComplex c_one = MAGMA_Z_ONE; magma_int_t ione = 1; cuDoubleComplex *dworkd, *hworkd; cuFloatComplex *dworks, *hworks; cuDoubleComplex *dR, *tau, *dT; cuFloatComplex *dSA, *dSX, *dST, *stau; cuDoubleComplex Xnrmv, Rnrmv; double Anrm, Xnrm, Rnrm, cte, eps; magma_int_t i, j, iiter, nb, lhwork, minmn, size; /* Check The Parameters. */ *iter = 0 ; *info = 0 ; if ( N < 0 ) *info = -1; else if(NRHS<0) *info = -3; else if( ldda < max(1,N)) *info = -5; else if( lddb < max(1,N)) *info = -7; else if( lddx < max(1,N)) *info = -9; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if( N == 0 || NRHS == 0 ) return *info; nb = magma_get_cgeqrf_nb(M); minmn= min(M, N); /* * Allocate temporary buffers */ /* dworks(dSA + dSX + dST) */ size = ldda*N + N*NRHS + ( 2*minmn + ((N+31)/32)*32 )*nb; if (MAGMA_SUCCESS != magma_cmalloc( &dworks, size )) { fprintf(stderr, "Allocation of dworks failed (%d)\n", (int) size); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dSA = dworks; dSX = dSA + ldda*N; dST = dSX + N*NRHS; /* dworkd(dR) = N*NRHS */ size = N*NRHS; if (MAGMA_SUCCESS != magma_zmalloc( &dworkd, size )) { magma_free( dworks ); fprintf(stderr, "Allocation of dworkd failed\n"); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dR = dworkd; /* hworks(stau + workspace for cgeqrs) = min(M,N) + lhworks */ lhwork = nb*max((M-N+nb+2*(NRHS)), 1); lhwork = max(lhwork, N*nb); /* We hope that magma nb is bigger than lapack nb to have enough memory in workspace */ size = minmn + lhwork; magma_cmalloc_cpu( &hworks, size ); if ( hworks == NULL ) { magma_free( dworks ); magma_free( dworkd ); fprintf(stderr, "Allocation of hworks failed\n"); *info = MAGMA_ERR_HOST_ALLOC; return *info; } stau = hworks + lhwork; eps = lapackf77_dlamch("Epsilon"); Anrm = magmablas_zlange('I', M, N, dA, ldda, (double*)dworkd ); cte = Anrm * eps * pow((double)N, 0.5) * BWDMAX ; /* * Convert to single precision */ magmablas_zlag2c(N, NRHS, dB, lddb, dSX, N, info ); if( *info != 0 ) { *iter = -2; goto L40; } magmablas_zlag2c(N, N, dA, ldda, dSA, ldda, info ); if(*info !=0){ *iter = -2; goto L40; } // In an ideal version these variables should come from user. magma_cgeqrf_gpu(M, N, dSA, ldda, stau, dST, info); if( *info != 0 ) { *iter = -3; goto L40; } magma_cgeqrs_gpu(M, N, NRHS, dSA, ldda, stau, dST, dSX, N, hworks, lhwork, info); // dX = dSX magmablas_clag2z(N, NRHS, dSX, N, dX, lddx, info); // dR = dB magmablas_zlacpy(MagmaUpperLower, N, NRHS, dB, lddb, dR, N); // dR = dB - dA * dX if( NRHS == 1 ) magma_zgemv( MagmaNoTrans, N, N, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1); else magma_zgemm( MagmaNoTrans, MagmaNoTrans, N, NRHS, N, c_neg_one, dA, ldda, dX, lddx, c_one, dR, N ); for(i=0; i<NRHS; i++){ j = magma_izamax( N, dX+i*N, 1); magma_zgetmatrix( 1, 1, dX+i*N+j-1, 1, &Xnrmv, 1 ); Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); j = magma_izamax ( N, dR+i*N, 1 ); magma_zgetmatrix( 1, 1, dR+i*N+j-1, 1, &Rnrmv, 1 ); Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if( Rnrm > Xnrm *cte ) goto L10; } *iter = 0; /* Free workspaces */ magma_free( dworks ); magma_free( dworkd ); magma_free_cpu( hworks ); return *info; L10: for(iiter=1; iiter<ITERMAX; ) { *info = 0 ; /* Convert R from double precision to single precision and store the result in SX. Solve the system SA*SX = SR. -- These two Tasks are merged here. */ // make SWORK = WORK ... residuals... magmablas_zlag2c( N, NRHS, dR, N, dSX, N, info ); magma_cgeqrs_gpu( M, N, NRHS, dSA, ldda, stau, dST, dSX, N, hworks, lhwork, info); if( *info != 0 ){ *iter = -3; goto L40; } for(i=0; i<NRHS; i++) { magmablas_zcaxpycp( dSX+i*N, dX+i*lddx, N, dB+i*lddb, dR+i*N ); } /* unnecessary may be */ magmablas_zlacpy(MagmaUpperLower, N, NRHS, dB, lddb, dR, N); if( NRHS == 1 ) magma_zgemv( MagmaNoTrans, N, N, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1); else magma_zgemm( MagmaNoTrans, MagmaNoTrans, N, NRHS, N, c_neg_one, dA, ldda, dX, lddx, c_one, dR, N); /* Check whether the NRHS normwise backward errors satisfy the stopping criterion. If yes, set ITER=IITER>0 and return. */ for(i=0;i<NRHS;i++) { j = magma_izamax( N, dX+i*N, 1); magma_zgetmatrix( 1, 1, dX+i*N+j-1, 1, &Xnrmv, 1 ); Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); j = magma_izamax ( N, dR+i*N, 1 ); magma_zgetmatrix( 1, 1, dR+i*N+j-1, 1, &Rnrmv, 1 ); Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if( Rnrm > Xnrm *cte ) goto L20; } /* If we are here, the NRHS normwise backward errors satisfy the stopping criterion, we are good to exit. */ *iter = iiter ; /* Free workspaces */ magma_free( dworks ); magma_free( dworkd ); magma_free_cpu( hworks ); return *info; L20: iiter++; } /* If we are at this place of the code, this is because we have performed ITER=ITERMAX iterations and never satisified the stopping criterion, set up the ITER flag accordingly and follow up on double precision routine. */ *iter = -ITERMAX - 1 ; L40: magma_free( dworks ); /* * Allocate temporary buffers */ /* dworkd(dT + tau) = min_mn + min_mn*nb*3 */ nb = magma_get_zgeqrf_nb(M); size = minmn * (3 * nb + 1); if ( size > (N*NRHS) ) { magma_free( dworkd ); if (MAGMA_SUCCESS != magma_zmalloc( &dworkd, size )) { fprintf(stderr, "Allocation of dworkd2 failed\n"); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } tau = dworkd; dT = tau + minmn; /* hworks(stau + workspace for cgeqrs) = min(M,N) + lhworks */ /* re-use hworks memory for hworkd if possible, else re-allocate. */ if ( (2*lhwork) <= (minmn+lhwork) ) { hworkd = (cuDoubleComplex*) hworks; } else { magma_free_cpu( hworks ); magma_zmalloc_cpu( &hworkd, lhwork ); if ( hworkd == NULL ) { magma_free( dworkd ); fprintf(stderr, "Allocation of hworkd2 failed\n"); *info = MAGMA_ERR_HOST_ALLOC; return *info; } } /* Single-precision iterative refinement failed to converge to a satisfactory solution, so we resort to double precision. */ magma_zgeqrf_gpu(M, N, dA, ldda, tau, dT, info); if ( *info == 0 ) { magmablas_zlacpy(MagmaUpperLower, N, NRHS, dB, lddb, dX, lddx); magma_zgeqrs_gpu(M, N, NRHS, dA, ldda, tau, dT, dX, lddx, hworkd, lhwork, info); } magma_free( dworkd ); magma_free_cpu( hworkd ); return *info; }