/*
 * By-reference wrapper around magma_dgeqrf_gpu (presumably the Fortran
 * binding, judging by the magmaf_ prefix -- confirm against the build).
 *
 * Scalars m, n and ldda are received as pointers and dereferenced here.
 * The device arrays dA and dT are received as devptr_t handles and are
 * converted with magma_ddevptr() before being forwarded.  tau and info are
 * passed through unchanged.
 */
void magmaf_dgeqrf_gpu(
    magma_int_t *m, magma_int_t *n,
    devptr_t *dA, magma_int_t *ldda,
    double *tau,
    devptr_t *dT,
    magma_int_t *info )
{
    const magma_int_t m_val    = *m;
    const magma_int_t n_val    = *n;
    const magma_int_t ldda_val = *ldda;

    magma_dgeqrf_gpu(
        m_val, n_val,
        magma_ddevptr(dA), ldda_val,
        tau,
        magma_ddevptr(dT),
        info );
}
/* //////////////////////////////////////////////////////////////////////////// -- Testing dormqr_gpu */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double Cnorm, error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; magma_int_t mm, m, n, k, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t nb, ldc, lda, lwork, lwork_max, dt_size; double *C, *R, *A, *hwork, *tau; magmaDouble_ptr dC, dA, dT; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); // need slightly looser bound (60*eps instead of 30*eps) for some tests opts.tolerance = max( 60., opts.tolerance ); double tol = opts.tolerance * lapackf77_dlamch("E"); // test all combinations of input parameters magma_side_t side [] = { MagmaLeft, MagmaRight }; magma_trans_t trans[] = { MagmaTrans, MagmaNoTrans }; printf("%% M N K side trans CPU Gflop/s (sec) GPU Gflop/s (sec) ||R||_F / ||QC||_F\n"); printf("%%==============================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; ldc = magma_roundup( m, opts.align ); // multiple of 32 by default // A is m x k (left) or n x k (right) mm = (side[iside] == MagmaLeft ? 
m : n); nb = magma_get_dgeqrf_nb( mm, k ); lda = magma_roundup( mm, opts.align ); // multiple of 32 by default gflops = FLOPS_DORMQR( m, n, k, side[iside] ) / 1e9; if ( side[iside] == MagmaLeft && m < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=left and m < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } if ( side[iside] == MagmaRight && n < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=right and n < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } if ( side[iside] == MagmaLeft ) { // side = left lwork_max = (m - k + nb)*(n + nb) + n*nb; dt_size = ( 2*min(m,k) + magma_roundup( max(m,n), 32) )*nb; } else { // side = right lwork_max = (n - k + nb)*(m + nb) + m*nb; dt_size = ( 2*min(n,k) + magma_roundup( max(m,n), 32 ) )*nb; } // this rounds it up slightly if needed to agree with lwork query below lwork_max = int( real( magma_dmake_lwork( lwork_max ))); TESTING_MALLOC_CPU( C, double, ldc*n ); TESTING_MALLOC_CPU( R, double, ldc*n ); TESTING_MALLOC_CPU( A, double, lda*k ); TESTING_MALLOC_CPU( hwork, double, lwork_max ); TESTING_MALLOC_CPU( tau, double, k ); TESTING_MALLOC_DEV( dC, double, ldc*n ); TESTING_MALLOC_DEV( dA, double, lda*k ); TESTING_MALLOC_DEV( dT, double, dt_size ); // C is full, m x n size = ldc*n; lapackf77_dlarnv( &ione, ISEED, &size, C ); magma_dsetmatrix( m, n, C, ldc, dC, ldc ); // A is m x k (left) or n x k (right) size = lda*k; lapackf77_dlarnv( &ione, ISEED, &size, A ); // compute QR factorization to get Householder vectors in dA, tau, dT magma_dsetmatrix( mm, k, A, lda, dA, lda ); magma_dgeqrf_gpu( mm, k, dA, lda, tau, dT, &info ); magma_dgetmatrix( mm, k, dA, lda, A, lda ); if (info != 0) { printf("magma_dgeqrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Performs operation using LAPACK 
=================================================================== */ cpu_time = magma_wtime(); lapackf77_dormqr( lapack_side_const( side[iside] ), lapack_trans_const( trans[itran] ), &m, &n, &k, A, &lda, tau, C, &ldc, hwork, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_dormqr returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for workspace size lwork = -1; magma_dormqr_gpu( side[iside], trans[itran], m, n, k, dA, lda, tau, dC, ldc, hwork, lwork, dT, nb, &info ); if (info != 0) { printf("magma_dormqr_gpu (lwork query) returned error %d: %s.\n", (int) info, magma_strerror( info )); } lwork = (magma_int_t) MAGMA_D_REAL( hwork[0] ); if ( lwork < 0 || lwork > lwork_max ) { printf("Warning: optimal lwork %d > allocated lwork_max %d\n", (int) lwork, (int) lwork_max ); lwork = lwork_max; } // dormqr2 takes a copy of dA in CPU memory if ( opts.version == 2 ) { magma_dgetmatrix( mm, k, dA, lda, A, lda ); } magmablasSetKernelStream( opts.queue ); gpu_time = magma_sync_wtime( opts.queue ); // sync needed for L,N and R,T cases if ( opts.version == 1 ) { magma_dormqr_gpu( side[iside], trans[itran], m, n, k, dA, lda, tau, dC, ldc, hwork, lwork, dT, nb, &info ); } else if ( opts.version == 2 ) { magma_dormqr2_gpu( side[iside], trans[itran], m, n, k, dA, lda, tau, dC, ldc, A, lda, &info ); } gpu_time = magma_sync_wtime( opts.queue ) - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_dormqr_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } magma_dgetmatrix( m, n, dC, ldc, R, ldc ); /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| 
=================================================================== */ size = ldc*n; blasf77_daxpy( &size, &c_neg_one, C, &ione, R, &ione ); Cnorm = lapackf77_dlange( "Fro", &m, &n, C, &ldc, work ); error = lapackf77_dlange( "Fro", &m, &n, R, &ldc, work ) / (magma_dsqrt(m*n) * Cnorm); printf( "%5d %5d %5d %4c %5c %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ), cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); TESTING_FREE_CPU( C ); TESTING_FREE_CPU( R ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( hwork ); TESTING_FREE_CPU( tau ); TESTING_FREE_DEV( dC ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dT ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } }} // end iside, itran printf( "\n" ); } opts.cleanup(); TESTING_FINALIZE(); return status; }
extern "C" magma_int_t
magma_dgels_gpu(
    magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t nrhs,
    magmaDouble_ptr dA, size_t dA_offset, magma_int_t ldda,
    magmaDouble_ptr dB, size_t dB_offset, magma_int_t lddb,
    double *hwork, magma_int_t lwork,
    magma_int_t *info,
    magma_queue_t queue )
{
/*  -- clMagma (version 0.1) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       April 2012

    Purpose
    =======
    Solves the overdetermined least squares problem
           min || A*X - C ||
    by QR-factoring A (magma_dgeqrf_gpu) and then calling magma_dgeqrs_gpu.
    The underdetermined problem (m < n) is rejected.

    Arguments
    =========
    TRANS   (input) magma_trans_t
            Must be MagmaNoTrans; anything else is rejected.

    M       (input) INTEGER
            The number of rows of the matrix A. M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A. M >= N >= 0.

    NRHS    (input) INTEGER
            The number of columns of the matrix C. NRHS >= 0.

    DA      (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N)
            On entry, the M-by-N matrix A.
            On exit, A is overwritten by details of its QR
            factorization as returned by DGEQRF.

    DA_OFFSET (input) size_t
            Offset into DA, forwarded unchanged to the factor and solve calls.

    LDDA    (input) INTEGER
            The leading dimension of the array DA, LDDA >= max(1,M).

    DB      (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDB,NRHS)
            On entry, the M-by-NRHS matrix C.
            On exit, the N-by-NRHS solution matrix X.

    DB_OFFSET (input) size_t
            Offset into DB, forwarded unchanged to the solve call.

    LDDB    (input) INTEGER
            The leading dimension of the array DB. LDDB >= max(1,M).

    HWORK   (workspace/output) DOUBLE_PRECISION array, dimension MAX(1,LWORK).
            HWORK(1) is set to the optimal LWORK on entry to the routine
            (before argument checking), and to one when min(M,N) == 0.

    LWORK   (input) INTEGER
            The dimension of the array HWORK.
            Unless LWORK = -1, LWORK >= (M-N+NB)*(NRHS + 2*NB) is required,
            where NB is the blocksize given by magma_get_dgeqrf_nb( M ).

            If LWORK = -1, a workspace query is assumed; the routine
            only stores the optimal size in HWORK(1) and returns.

    INFO    (output) INTEGER
            = 0:  successful exit
            = -1: TRANS, -2: M, -3: N, -4: NRHS, -6: LDDA, -8: LDDB,
              -10: LWORK had an illegal value
            = MAGMA_ERR_DEVICE_ALLOC / MAGMA_ERR_HOST_ALLOC: a temporary
              buffer could not be allocated
            NOTE(review): -6, -8 and -10 do not match the positions of
            LDDA, LDDB and LWORK in this signature (the *_offset arguments
            shift them) -- confirm whether callers rely on these values.

    QUEUE   (input) magma_queue_t
            Queue forwarded to the factor and solve calls.
    =====================================================================    */

    #define a_ref(a_1,a_2)  dA, (dA_offset+(a_1)+(a_2)*(ldda))

    magmaDouble_ptr dT;
    double *tau;
    magma_int_t k;

    const magma_int_t nb     = magma_get_dgeqrf_nb(m);
    const magma_int_t lwkopt = (m-n+nb)*(nrhs+2*nb);
    const int         lquery = (lwork == -1);

    // the optimal workspace size is published before the arguments are checked
    hwork[0] = MAGMA_D_MAKE( (double)lwkopt, 0. );

    /* Argument checks; the first failing test determines *info.
       For now, N is the only case working */
    *info = 0;
    if ( trans != MagmaNoTrans ) {
        *info = -1;
    }
    else if ( m < 0 ) {
        *info = -2;
    }
    else if ( n < 0 || m < n ) {   /* LQ is not handled for now */
        *info = -3;
    }
    else if ( nrhs < 0 ) {
        *info = -4;
    }
    else if ( ldda < max(1,m) ) {
        *info = -6;
    }
    else if ( lddb < max(1,m) ) {
        *info = -8;
    }
    else if ( lwork < lwkopt && ! lquery ) {
        *info = -10;
    }

    if ( *info != 0 ) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    if ( lquery ) {
        return *info;
    }

    // quick return for an empty problem
    k = min(m,n);
    if ( k == 0 ) {
        hwork[0] = MAGMA_D_ONE;
        return *info;
    }

    /*
     * Allocate temporary buffers: dT on the device, tau on the host.
     * dT is sized with the larger of nb and nrhs.
     */
    int ldtwork = ( 2*k + ((n+31)/32)*32 ) * ( nb < nrhs ? nrhs : nb );
    if ( MAGMA_SUCCESS != magma_malloc( &dT, ldtwork*sizeof(double) ) ) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    tau = (double*) malloc( k * sizeof(double) );
    if ( tau == NULL ) {
        magma_free( dT );
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }

    // factor, then solve only if the factorization reported success
    size_t dT_offset = 0;
    magma_dgeqrf_gpu( m, n, dA, dA_offset, ldda, tau, dT, dT_offset, info, queue );
    if ( *info == 0 ) {
        magma_dgeqrs_gpu( m, n, nrhs,
                          dA, dA_offset, ldda, tau, dT, dT_offset,
                          dB, dB_offset, lddb, hwork, lwork, info, queue );
    }

    magma_free( dT );
    free(tau);
    return *info;
}
/** Purpose ------- DSGEQRSV solves the least squares problem min || A*X - B ||, where A is an M-by-N matrix and X and B are M-by-NRHS matrices. DSGEQRSV first attempts to factorize the matrix in real SINGLE PRECISION and use this factorization within an iterative refinement procedure to produce a solution with real DOUBLE PRECISION norm-wise backward error quality (see below). If the approach fails the method switches to a real DOUBLE PRECISION factorization and solve. The iterative refinement is not going to be a winning strategy if the ratio real SINGLE PRECISION performance over real DOUBLE PRECISION performance is too small. A reasonable strategy should take the number of right-hand sides and the size of the matrix into account. This might be done with a call to ILAENV in the future. Up to now, we always try iterative refinement. The iterative refinement process is stopped if ITER > ITERMAX or for all the RHS we have: RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX where o ITER is the number of the current iteration in the iterative refinement process o RNRM is the infinity-norm of the residual o XNRM is the infinity-norm of the solution o ANRM is the infinity-operator-norm of the matrix A o EPS is the machine epsilon returned by DLAMCH('Epsilon') The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. M >= N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in,out] dA DOUBLE PRECISION array on the GPU, dimension (LDDA,N) On entry, the M-by-N coefficient matrix A. On exit, if iterative refinement has been successfully used (info.EQ.0 and ITER.GE.0, see description below), A is unchanged. 
If double precision factorization has been used (info.EQ.0 and ITER.LT.0, see description below), then the array dA contains the QR factorization of A as returned by function DGEQRF_GPU. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). @param[in,out] dB DOUBLE PRECISION array on the GPU, dimension (LDDB,NRHS) The M-by-NRHS right hand side matrix B. May be overwritten (e.g., if refinement fails). @param[in] lddb INTEGER The leading dimension of the array dB. LDDB >= max(1,M). @param[out] dX DOUBLE PRECISION array on the GPU, dimension (LDDX,NRHS) If info = 0, the N-by-NRHS solution matrix X. @param[in] lddx INTEGER The leading dimension of the array dX. LDDX >= max(1,N). @param[out] iter INTEGER - < 0: iterative refinement has failed, double precision factorization has been performed + -1 : the routine fell back to full precision for implementation- or machine-specific reasons + -2 : narrowing the precision induced an overflow, the routine fell back to full precision + -3 : failure of SGEQRF + -31: stop the iterative refinement after the 30th iteration - > 0: iterative refinement has been successfully used. 
Returns the number of iterations @param[out] info INTEGER - = 0: successful exit - < 0: if info = -i, the i-th argument had an illegal value @ingroup magma_dgels_driver ********************************************************************/ extern "C" magma_int_t magma_dsgeqrsv_gpu( magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaDouble_ptr dA, magma_int_t ldda, magmaDouble_ptr dB, magma_int_t lddb, magmaDouble_ptr dX, magma_int_t lddx, magma_int_t *iter, magma_int_t *info) { #define dB(i,j) (dB + (i) + (j)*lddb) #define dX(i,j) (dX + (i) + (j)*lddx) #define dR(i,j) (dR + (i) + (j)*lddr) #define dSX(i,j) (dSX + (i) + (j)*lddsx) double c_neg_one = MAGMA_D_NEG_ONE; double c_one = MAGMA_D_ONE; magma_int_t ione = 1; double *hworkd; float *hworks; double *tau; float *stau; magmaDouble_ptr dworkd; magmaFloat_ptr dworks; magmaDouble_ptr dR, dT; magmaFloat_ptr dSA, dSX, dST; double Xnrmv, Rnrmv; double Anrm, Xnrm, Rnrm, cte, eps; magma_int_t i, j, iiter, lddsa, lddsx, lddr, nb, lhwork, minmn, size, ldworkd; /* Check arguments */ *iter = 0; *info = 0; if ( m < 0 ) *info = -1; else if ( n < 0 || n > m ) *info = -2; else if ( nrhs < 0 ) *info = -3; else if ( ldda < max(1,m)) *info = -5; else if ( lddb < max(1,m)) *info = -7; else if ( lddx < max(1,n)) *info = -9; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if ( m == 0 || n == 0 || nrhs == 0 ) return *info; nb = magma_get_sgeqrf_nb(m); minmn= min(m, n); /* dSX contains both B and X, so must be max(m or lddb,n). 
*/ lddsa = ldda; lddsx = max(lddb,n); lddr = lddb; /* * Allocate temporary buffers */ /* dworks(dSA + dSX + dST) */ size = lddsa*n + lddsx*nrhs + ( 2*minmn + ((n+31)/32)*32 )*nb; if (MAGMA_SUCCESS != magma_smalloc( &dworks, size )) { fprintf(stderr, "Allocation of dworks failed (%d)\n", (int) size); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dSA = dworks; dSX = dSA + lddsa*n; dST = dSX + lddsx*nrhs; /* dworkd(dR) = lddr*nrhs */ ldworkd = lddr*nrhs; if (MAGMA_SUCCESS != magma_dmalloc( &dworkd, ldworkd )) { magma_free( dworks ); fprintf(stderr, "Allocation of dworkd failed\n"); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dR = dworkd; /* hworks(workspace for cgeqrs + stau) = min(m,n) + lhworks */ lhwork = (m - n + nb)*(nrhs + nb) + nrhs*nb; size = lhwork + minmn; magma_smalloc_cpu( &hworks, size ); if ( hworks == NULL ) { magma_free( dworks ); magma_free( dworkd ); fprintf(stderr, "Allocation of hworks failed\n"); *info = MAGMA_ERR_HOST_ALLOC; return *info; } stau = hworks + lhwork; eps = lapackf77_dlamch("Epsilon"); Anrm = magmablas_dlange(MagmaInfNorm, m, n, dA, ldda, (double*)dworkd ); cte = Anrm * eps * pow((double)n, 0.5) * BWDMAX; /* * Convert to single precision */ magmablas_dlag2s( m, nrhs, dB, lddb, dSX, lddsx, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } magmablas_dlag2s( m, n, dA, ldda, dSA, lddsa, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // factor dSA in single precision magma_sgeqrf_gpu( m, n, dSA, lddsa, stau, dST, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // solve dSA*dSX = dB in single precision magma_sgeqrs_gpu( m, n, nrhs, dSA, lddsa, stau, dST, dSX, lddsx, hworks, lhwork, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // residual dR = dB - dA*dX in double precision magmablas_slag2d( n, nrhs, dSX, lddsx, dX, lddx, info ); magmablas_dlacpy( MagmaUpperLower, m, nrhs, dB, lddb, dR, lddr ); if ( nrhs == 1 ) { magma_dgemv( MagmaNoTrans, m, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { 
magma_dgemm( MagmaNoTrans, MagmaNoTrans, m, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } // TODO: use MAGMA_D_ABS( dX(i,j) ) instead of dlange? for( j=0; j < nrhs; j++ ) { i = magma_idamax( n, dX(0,j), 1) - 1; magma_dgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_dlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_idamax ( m, dR(0,j), 1 ) - 1; magma_dgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_dlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto REFINEMENT; } } *iter = 0; /* Free workspaces */ magma_free( dworks ); magma_free( dworkd ); magma_free_cpu( hworks ); return *info; REFINEMENT: /* TODO: this iterative refinement algorithm works only for compatibile * systems (B in colspan of A). * See Matrix Computations (3rd ed) p. 267 for correct algorithm. */ for( iiter=1; iiter < ITERMAX; ) { *info = 0; // convert residual dR to single precision dSX magmablas_dlag2s( m, nrhs, dR, lddr, dSX, lddsx, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // solve dSA*dSX = R in single precision magma_sgeqrs_gpu( m, n, nrhs, dSA, lddsa, stau, dST, dSX, lddsx, hworks, lhwork, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // Add correction and setup residual // dX += dSX [including conversion] --and-- // dR[1:n] = dB[1:n] (only n rows, not whole m rows! -- useless if m > n) for( j=0; j < nrhs; j++ ) { magmablas_dsaxpycp( n, dSX(0,j), dX(0,j), dB(0,j), dR(0,j) ); } // dR = dB (whole m rows) magmablas_dlacpy( MagmaUpperLower, m, nrhs, dB, lddb, dR, lddr ); // residual dR = dB - dA*dX in double precision if ( nrhs == 1 ) { magma_dgemv( MagmaNoTrans, m, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_dgemm( MagmaNoTrans, MagmaNoTrans, m, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } /* Check whether the nrhs normwise backward errors satisfy the * stopping criterion. If yes, set ITER=IITER > 0 and return. 
*/ for( j=0; j < nrhs; j++ ) { i = magma_idamax( n, dX(0,j), 1) - 1; magma_dgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_dlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_idamax ( m, dR(0,j), 1 ) - 1; magma_dgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_dlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto L20; } } /* If we are here, the nrhs normwise backward errors satisfy * the stopping criterion, we are good to exit. */ *iter = iiter; /* Free workspaces */ magma_free( dworks ); magma_free( dworkd ); magma_free_cpu( hworks ); return *info; L20: iiter++; } /* If we are at this place of the code, this is because we have * performed ITER=ITERMAX iterations and never satisified the * stopping criterion. Set up the ITER flag accordingly and follow * up on double precision routine. */ *iter = -ITERMAX - 1; FALLBACK: /* Single-precision iterative refinement failed to converge to a * satisfactory solution, so we resort to double precision. 
*/ magma_free( dworks ); magma_free_cpu( hworks ); /* * Allocate temporary buffers */ /* dworkd = dT for dgeqrf */ nb = magma_get_dgeqrf_nb( m ); size = (2*min(m, n) + (n+31)/32*32 )*nb; if ( size > ldworkd ) { magma_free( dworkd ); if (MAGMA_SUCCESS != magma_dmalloc( &dworkd, size )) { fprintf(stderr, "Allocation of dworkd2 failed\n"); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } dT = dworkd; /* hworkd(dtau + workspace for dgeqrs) = min(m,n) + lhwork */ size = lhwork + minmn; magma_dmalloc_cpu( &hworkd, size ); if ( hworkd == NULL ) { magma_free( dworkd ); fprintf(stderr, "Allocation of hworkd2 failed\n"); *info = MAGMA_ERR_HOST_ALLOC; return *info; } tau = hworkd + lhwork; magma_dgeqrf_gpu( m, n, dA, ldda, tau, dT, info ); if (*info == 0) { // if m > n, then dB won't fit in dX, so solve with dB and copy n rows to dX magma_dgeqrs_gpu( m, n, nrhs, dA, ldda, tau, dT, dB, lddb, hworkd, lhwork, info ); magmablas_dlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dX, lddx ); } magma_free( dworkd ); magma_free_cpu( hworkd ); return *info; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; double error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *d_A, *h_R, *tau, *dT, *h_work, tmp[1]; magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn, nb, size; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}, ISEED2[4]; magma_opts opts; parse_opts( argc, argv, &opts ); magma_int_t status = 0; double tol; opts.lapack |= (opts.version == 2 && opts.check == 2); // check (-c2) implies lapack (-l) if ( opts.version != 2 && opts.check == 1 ) { printf( " ===================================================================\n" " NOTE: -c check for this version will be wrong\n" " because tester ignores the special structure of MAGMA dgeqrf resuls.\n" " We reset it to -c2.\n" " ===================================================================\n\n"); opts.check = 2; } if ( opts.version == 2 ) { if ( opts.check == 1 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R-Q'A||_1 / (M*||A||_1*eps) ||I-Q'Q||_1 / (M*eps)\n"); printf("=========================================================================================================\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("=======================================================================\n"); } tol = 1.0; } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||Ax-b||_F/(N*||A||_F*||x||_F)\n"); printf("====================================================================================\n"); tol = opts.tolerance * lapackf77_dlamch("E"); } for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_DGEQRF( M, N ) / 1e9; // query for workspace size lwork = -1; 
lapackf77_dgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); TESTING_MALLOC_CPU( tau, double, min_mn ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_CPU( h_work, double, lwork ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); /* Initialize the matrix */ for ( int j=0; j<4; j++ ) ISEED2[j] = ISEED[j]; // saving seeds lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_dsetmatrix( M, N, h_R, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); if ( opts.version == 2 ) { magma_dgeqrf2_gpu( M, N, d_A, ldda, tau, &info); } else { nb = magma_get_dgeqrf_nb( M ); size = (2*min(M, N) + (N+31)/32*32 )*nb; TESTING_MALLOC_DEV( dT, double, size ); if ( opts.version == 3 ) { magma_dgeqrf3_gpu( M, N, d_A, ldda, tau, dT, &info); } else { magma_dgeqrf_gpu( M, N, d_A, ldda, tau, dT, &info); } } gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ double *tau2; TESTING_MALLOC_CPU( tau2, double, min_mn ); cpu_time = magma_wtime(); lapackf77_dgeqrf(&M, &N, h_A, &lda, tau2, h_work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE_CPU( tau2 ); } if ( opts.check == 1 ) { /* ===================================================================== Check the result 
=================================================================== */ magma_int_t lwork = n2+N; double *h_W1, *h_W2, *h_W3; double *h_RW, results[2]; magma_dgetmatrix( M, N, d_A, ldda, h_R, M ); TESTING_MALLOC_CPU( h_W1, double, n2 ); // Q TESTING_MALLOC_CPU( h_W2, double, n2 ); // R TESTING_MALLOC_CPU( h_W3, double, lwork ); // WORK TESTING_MALLOC_CPU( h_RW, double, M ); // RWORK lapackf77_dlarnv( &ione, ISEED2, &n2, h_A ); lapackf77_dqrt02( &M, &N, &min_mn, h_A, h_R, h_W1, h_W2, &lda, tau, h_W3, &lwork, h_RW, results ); if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, results[0], results[1] ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, gpu_perf, gpu_time, results[0], results[1] ); } // todo also check results[1] < tol? printf(" %s\n", (results[0] < tol ? "ok" : "failed")); status += ! (results[0] < tol); TESTING_FREE_CPU( h_W1 ); TESTING_FREE_CPU( h_W2 ); TESTING_FREE_CPU( h_W3 ); TESTING_FREE_CPU( h_RW ); } else if ( opts.check == 2 ) { if ( opts.version == 2 ) { /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_dgetmatrix( M, N, d_A, ldda, h_R, M ); error = lapackf77_dlange("f", &M, &N, h_A, &lda, work); blasf77_daxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_dlange("f", &M, &N, h_R, &lda, work) / error; if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e", (int) M, (int) N, gpu_perf, gpu_time, error ); } printf(" %s\n", (error < tol ? "ok" : "failed")); status += ! 
(error < tol); } else if ( M >= N ) { magma_int_t lwork; double *x, *b, *d_B, *hwork; const double c_zero = MAGMA_D_ZERO; const double c_one = MAGMA_D_ONE; const double c_neg_one = MAGMA_D_NEG_ONE; const magma_int_t ione = 1; // initialize RHS, b = A*random TESTING_MALLOC_CPU( x, double, N ); TESTING_MALLOC_CPU( b, double, M ); lapackf77_dlarnv( &ione, ISEED, &N, x ); blasf77_dgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_zero, b, &ione ); // copy to GPU TESTING_MALLOC_DEV( d_B, double, M ); magma_dsetvector( M, b, 1, d_B, 1 ); if ( opts.version == 1 ) { // allocate hwork magma_dgeqrs_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, tmp, -1, &info ); lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); TESTING_MALLOC_CPU( hwork, double, lwork ); // solve linear system magma_dgeqrs_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, hwork, lwork, &info ); if (info != 0) printf("magma_dgeqrs returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE_CPU( hwork ); } else { // allocate hwork magma_dgeqrs3_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, tmp, -1, &info ); lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); TESTING_MALLOC_CPU( hwork, double, lwork ); // solve linear system magma_dgeqrs3_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, hwork, lwork, &info ); if (info != 0) printf("magma_dgeqrs3 returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE_CPU( hwork ); } magma_dgetvector( N, d_B, 1, x, 1 ); // compute r = Ax - b, saved in b lapackf77_dlarnv( &ione, ISEED2, &n2, h_A ); blasf77_dgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_neg_one, b, &ione ); // compute residual |Ax - b| / (n*|A|*|x|) double norm_x, norm_A, norm_r, work[1]; norm_A = lapackf77_dlange( "F", &M, &N, h_A, &lda, work ); norm_r = lapackf77_dlange( "F", &M, &ione, b, &M, work ); norm_x = lapackf77_dlange( "F", &N, &ione, x, &N, work ); TESTING_FREE_CPU( x ); TESTING_FREE_CPU( b ); TESTING_FREE_DEV( d_B ); error = norm_r / (N * norm_A * norm_x); if ( opts.lapack ) { 
printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e", (int) M, (int) N, gpu_perf, gpu_time, error ); } printf(" %s\n", (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) --- ", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- ", (int) M, (int) N, gpu_perf, gpu_time); } printf("%s\n", (opts.check != 0 ? " (error check only for M >= N)" : "")); } } else { if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) ---\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, gpu_time); } } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); if ( opts.version != 2 ) TESTING_FREE_DEV( dT ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrf */ int main( int argc, char** argv) { TESTING_INIT(); const double d_neg_one = MAGMA_D_NEG_ONE; const double d_one = MAGMA_D_ONE; const double c_neg_one = MAGMA_D_NEG_ONE; const double c_one = MAGMA_D_ONE; const double c_zero = MAGMA_D_ZERO; const magma_int_t ione = 1; real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; double Anorm, error=0, error2=0; double *h_A, *h_R, *tau, *h_work, tmp[1]; magmaDouble_ptr d_A, dT; magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn, nb, size; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; parse_opts( argc, argv, &opts ); magma_int_t status = 0; double tol = opts.tolerance * lapackf77_dlamch("E"); printf( "version %d\n", (int) opts.version ); if ( opts.version == 2 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |R - Q^H*A| |I - Q^H*Q|\n"); printf("===============================================================================\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |b - A*x|\n"); printf("================================================================\n"); } for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_DGEQRF( M, N ) / 1e9; // query for workspace size lwork = -1; lapackf77_dgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); TESTING_MALLOC_CPU( tau, double, min_mn ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_CPU( h_work, double, lwork ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_dsetmatrix( M, N, h_R, lda, d_A, ldda ); /* 
==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); if ( opts.version == 2 ) { // LAPACK complaint arguments magma_dgeqrf2_gpu( M, N, d_A, ldda, tau, &info ); } else { nb = magma_get_dgeqrf_nb( M ); size = (2*min(M, N) + (N+31)/32*32 )*nb; TESTING_MALLOC_DEV( dT, double, size ); if ( opts.version == 1 ) { // stores dT, V blocks have zeros, R blocks inverted & stored in dT magma_dgeqrf_gpu( M, N, d_A, ldda, tau, dT, &info ); } #ifdef HAVE_CUBLAS else if ( opts.version == 3 ) { // stores dT, V blocks have zeros, R blocks stored in dT magma_dgeqrf3_gpu( M, N, d_A, ldda, tau, dT, &info ); } #endif else { printf( "Unknown version %d\n", (int) opts.version ); exit(1); } } gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.check && opts.version == 2 ) { /* ===================================================================== Check the result, following zqrt01 except using the reduced Q. This works for any M,N (square, tall, wide). Only for version 2, which has LAPACK complaint output. 
=================================================================== */ magma_dgetmatrix( M, N, d_A, ldda, h_R, lda ); magma_int_t ldq = M; magma_int_t ldr = min_mn; double *Q, *R; double *work; TESTING_MALLOC_CPU( Q, double, ldq*min_mn ); // M by K TESTING_MALLOC_CPU( R, double, ldr*N ); // K by N TESTING_MALLOC_CPU( work, double, min_mn ); // generate M by K matrix Q, where K = min(M,N) lapackf77_dlacpy( "Lower", &M, &min_mn, h_R, &lda, Q, &ldq ); lapackf77_dorgqr( &M, &min_mn, &min_mn, Q, &ldq, tau, h_work, &lwork, &info ); assert( info == 0 ); // copy K by N matrix R lapackf77_dlaset( "Lower", &min_mn, &N, &c_zero, &c_zero, R, &ldr ); lapackf77_dlacpy( "Upper", &min_mn, &N, h_R, &lda, R, &ldr ); // error = || R - Q^H*A || / (N * ||A||) blasf77_dgemm( "Conj", "NoTrans", &min_mn, &N, &M, &c_neg_one, Q, &ldq, h_A, &lda, &c_one, R, &ldr ); Anorm = lapackf77_dlange( "1", &M, &N, h_A, &lda, work ); error = lapackf77_dlange( "1", &min_mn, &N, R, &ldr, work ); if ( N > 0 && Anorm > 0 ) error /= (N*Anorm); // set R = I (K by K identity), then R = I - Q^H*Q // error = || I - Q^H*Q || / N lapackf77_dlaset( "Upper", &min_mn, &min_mn, &c_zero, &c_one, R, &ldr ); blasf77_dsyrk( "Upper", "Conj", &min_mn, &M, &d_neg_one, Q, &ldq, &d_one, R, &ldr ); error2 = lapackf77_dlansy( "1", "Upper", &min_mn, R, &ldr, work ); if ( N > 0 ) error2 /= N; TESTING_FREE_CPU( Q ); Q = NULL; TESTING_FREE_CPU( R ); R = NULL; TESTING_FREE_CPU( work ); work = NULL; } else if ( opts.check && M >= N ) { /* ===================================================================== Check the result by solving consistent linear system, A*x = b. Only for versions 1 & 3 with M >= N. 
=================================================================== */ magma_int_t lwork; double *x, *b, *hwork; magmaDouble_ptr d_B; const double c_zero = MAGMA_D_ZERO; const double c_one = MAGMA_D_ONE; const double c_neg_one = MAGMA_D_NEG_ONE; const magma_int_t ione = 1; // initialize RHS, b = A*random TESTING_MALLOC_CPU( x, double, N ); TESTING_MALLOC_CPU( b, double, M ); lapackf77_dlarnv( &ione, ISEED, &N, x ); blasf77_dgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_zero, b, &ione ); // copy to GPU TESTING_MALLOC_DEV( d_B, double, M ); magma_dsetvector( M, b, 1, d_B, 1 ); if ( opts.version == 1 ) { // allocate hwork magma_dgeqrs_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, tmp, -1, &info ); lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); TESTING_MALLOC_CPU( hwork, double, lwork ); // solve linear system magma_dgeqrs_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, hwork, lwork, &info ); if (info != 0) printf("magma_dgeqrs returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE_CPU( hwork ); } #ifdef HAVE_CUBLAS else if ( opts.version == 3 ) { // allocate hwork magma_dgeqrs3_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, tmp, -1, &info ); lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); TESTING_MALLOC_CPU( hwork, double, lwork ); // solve linear system magma_dgeqrs3_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, hwork, lwork, &info ); if (info != 0) printf("magma_dgeqrs3 returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE_CPU( hwork ); } #endif else { printf( "Unknown version %d\n", (int) opts.version ); exit(1); } magma_dgetvector( N, d_B, 1, x, 1 ); // compute r = Ax - b, saved in b blasf77_dgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_neg_one, b, &ione ); // compute residual |Ax - b| / (n*|A|*|x|) double norm_x, norm_A, norm_r, work[1]; norm_A = lapackf77_dlange( "F", &M, &N, h_A, &lda, work ); norm_r = lapackf77_dlange( "F", &M, &ione, b, &M, work ); norm_x = lapackf77_dlange( "F", &N, &ione, x, &N, work ); 
TESTING_FREE_CPU( x ); TESTING_FREE_CPU( b ); TESTING_FREE_DEV( d_B ); error = norm_r / (N * norm_A * norm_x); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_dgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Print performance and error. =================================================================== */ printf("%5d %5d ", (int) M, (int) N ); if ( opts.lapack ) { printf( "%7.2f (%7.2f)", cpu_perf, cpu_time ); } else { printf(" --- ( --- )" ); } printf( " %7.2f (%7.2f) ", gpu_perf, gpu_time ); if ( opts.check ) { if ( opts.version == 2 ) { bool okay = (error < tol && error2 < tol); status += ! okay; printf( "%11.2e %11.2e %s\n", error, error2, (okay ? "ok" : "failed") ); } else if ( M >= N ) { bool okay = (error < tol); status += ! okay; printf( "%10.2e %s\n", error, (okay ? "ok" : "failed") ); } else { printf( "(error check only for M >= N)\n" ); } } else { printf( " ---\n" ); } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); if ( opts.version != 2 ) TESTING_FREE_DEV( dT ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }