/** Purpose ------- SGEGQR orthogonalizes the N vectors given by a real M-by-N matrix A: A = Q * R. On exit, if successful, the orthogonal vectors Q overwrite A and R is given in work (on the CPU memory). The routine is designed for tall-and-skinny matrices: M >> N, N <= 128. This version uses normal equations and SVD in an iterative process that makes the computation numerically accurate. Arguments --------- @param[in] ikind INTEGER Several versions are implemented indiceted by the ikind value: 1: This version uses normal equations and SVD in an iterative process that makes the computation numerically accurate. 2: This version uses a standard LAPACK-based orthogonalization through MAGMA's QR panel factorization (magma_sgeqr2x3_gpu) and magma_sorgqr 3: MGS 4. Cholesky QR [ Note: this method uses the normal equations which squares the condition number of A, therefore ||I - Q'Q|| < O(eps cond(A)^2) ] @param[in] m INTEGER The number of rows of the matrix A. m >= n >= 0. @param[in] n INTEGER The number of columns of the matrix A. 128 >= n >= 0. @param[in,out] dA REAL array on the GPU, dimension (ldda,n) On entry, the m-by-n matrix A. On exit, the m-by-n matrix Q with orthogonal columns. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,m). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param dwork (GPU workspace) REAL array, dimension: n^2 for ikind = 1 3 n^2 + min(m, n) + 2 for ikind = 2 0 (not used) for ikind = 3 n^2 for ikind = 4 @param[out] work (CPU workspace) REAL array, dimension 3 n^2. On exit, work(1:n^2) holds the rectangular matrix R. Preferably, for higher performance, work should be in pinned memory. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. @ingroup magma_sgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_sgegqr_gpu( magma_int_t ikind, magma_int_t m, magma_int_t n, float *dA, magma_int_t ldda, float *dwork, float *work, magma_int_t *info ) { #define work(i_,j_) (work + (i_) + (j_)*n) #define dA(i_,j_) (dA + (i_) + (j_)*ldda) magma_int_t i = 0, j, k, n2 = n*n; magma_int_t ione = 1; float c_zero = MAGMA_S_ZERO; float c_one = MAGMA_S_ONE; float cn = 200., mins, maxs; /* check arguments */ *info = 0; if (ikind < 1 || ikind > 4) { *info = -1; } else if (m < 0 || m < n) { *info = -2; } else if (n < 0 || n > 128) { *info = -3; } else if (ldda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (ikind == 1) { // === Iterative, based on SVD ============================================================ float *U, *VT, *vt, *R, *G, *hwork, *tau; float *S; R = work; // Size n * n G = R + n*n; // Size n * n VT = G + n*n; // Size n * n magma_smalloc_cpu( &hwork, 32 + 2*n*n + 2*n); if ( hwork == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_int_t lwork=n*n+32; // First part f hwork; used as workspace in svd U = hwork + n*n + 32; // Size n*n S = (float *)(U+n*n); // Size n tau = U + n*n + n; // Size n #if defined(PRECISION_c) || defined(PRECISION_z) float *rwork; magma_smalloc_cpu( &rwork, 5*n); if ( rwork == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } #endif do { i++; magma_sgemm(MagmaConjTrans, MagmaNoTrans, n, n, m, c_one, dA, ldda, dA, ldda, c_zero, dwork, n ); magma_sgetmatrix(n, n, dwork, n, G, n); #if defined(PRECISION_s) || defined(PRECISION_d) lapackf77_sgesvd("n", "a", &n, &n, G, &n, S, U, &n, VT, &n, hwork, &lwork, info); #else lapackf77_sgesvd("n", "a", &n, &n, G, &n, S, U, &n, VT, &n, hwork, &lwork, rwork, info); #endif mins = 100.f, maxs = 0.f; for (k=0; k < n; k++) { S[k] = magma_ssqrt( S[k] ); if (S[k] < mins) mins = S[k]; if (S[k] > maxs) maxs = S[k]; } for (k=0; k < n; k++) { vt = VT + k*n; for (j=0; j < n; j++) vt[j] *= S[j]; } lapackf77_sgeqrf(&n, &n, VT, &n, tau, hwork, &lwork, info); if (i == 1) blasf77_scopy(&n2, VT, &ione, R, &ione); else blasf77_strmm("l", "u", "n", "n", &n, &n, &c_one, VT, &n, R, &n); magma_ssetmatrix(n, n, VT, n, dwork, n); magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, m, n, c_one, dwork, n, dA, ldda); if (mins > 0.00001f) cn = maxs/mins; //fprintf(stderr, "Iteration %d, cond num = %f \n", i, cn); } while (cn > 10.f); magma_free_cpu( hwork ); #if defined(PRECISION_c) || defined(PRECISION_z) magma_free_cpu( rwork ); #endif // ================== end of ikind == 1 =================================================== } else if (ikind == 2) { // ================== LAPACK based =================================================== magma_int_t min_mn = min(m, n); magma_int_t nb = n; float *dtau = dwork + 2*n*n, *d_T = dwork, *ddA = dwork + n*n; float *tau = work+n*n; magmablas_slaset( MagmaFull, n, n, c_zero, c_zero, d_T, n ); magma_sgeqr2x3_gpu(m, n, dA, ldda, dtau, d_T, ddA, (float *)(dwork+min_mn+2*n*n), info); magma_sgetmatrix( min_mn, 1, dtau, min_mn, tau, min_mn); magma_sgetmatrix( n, n, ddA, n, work, n); magma_sorgqr_gpu( m, n, n, dA, ldda, tau, d_T, nb, info ); // ================== end of ikind == 2 =================================================== } else if (ikind == 3) { // ================== MGS =================================================== for(magma_int_t j = 0; j<n; j++){ for(magma_int_t i = 0; i<j; i++){ *work(i, j) = magma_sdot(m, dA(0,i), 1, dA(0,j), 1); magma_saxpy(m, -(*work(i,j)), dA(0,i), 1, dA(0,j), 1); } for(magma_int_t i = j; i<n; i++) *work(i, j) = MAGMA_S_ZERO; //*work(j,j) = MAGMA_S_MAKE( magma_snrm2(m, dA(0,j), 1), 0. ); *work(j,j) = magma_sdot(m, dA(0,j), 1, dA(0,j), 1); *work(j,j) = MAGMA_S_MAKE( sqrt(MAGMA_S_REAL( *work(j,j) )), 0.); magma_sscal(m, 1./ *work(j,j), dA(0,j), 1); } // ================== end of ikind == 3 =================================================== } else if (ikind == 4) { // ================== Cholesky QR =================================================== magma_sgemm(MagmaConjTrans, MagmaNoTrans, n, n, m, c_one, dA, ldda, dA, ldda, c_zero, dwork, n ); magma_sgetmatrix(n, n, dwork, n, work, n); lapackf77_spotrf("u", &n, work, &n, info); magma_ssetmatrix(n, n, work, n, dwork, n); magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, m, n, c_one, dwork, n, dA, ldda); // ================== end of ikind == 4 =================================================== } return *info; } /* magma_sgegqr_gpu */
/* //////////////////////////////////////////////////////////////////////////// -- Testing strmm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cublas_perf, cublas_time, cpu_perf, cpu_time; float cublas_error, Cnorm, work[1]; magma_int_t M, N; magma_int_t Ak; magma_int_t sizeA, sizeB; magma_int_t lda, ldb, ldda, lddb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float *h_A, *h_B, *h_Bcublas; float *d_A, *d_B; float c_neg_one = MAGMA_S_NEG_ONE; float alpha = MAGMA_S_MAKE( 0.29, -0.86 ); magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) float tol = opts.tolerance * lapackf77_slamch("E"); printf("If running lapack (option --lapack), CUBLAS error is computed\n" "relative to CPU BLAS result.\n\n"); printf("side = %s, uplo = %s, transA = %s, diag = %s \n", lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag) ); printf(" M N CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n"); printf("==================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; gflops = FLOPS_STRMM(opts.side, M, N) / 1e9; if ( opts.side == MagmaLeft ) { lda = M; Ak = M; } else { lda = N; Ak = N; } ldb = M; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; sizeA = lda*Ak; sizeB = ldb*N; TESTING_MALLOC_CPU( h_A, float, lda*Ak ); TESTING_MALLOC_CPU( h_B, float, ldb*N ); TESTING_MALLOC_CPU( h_Bcublas, float, ldb*N ); TESTING_MALLOC_DEV( d_A, float, ldda*Ak ); TESTING_MALLOC_DEV( d_B, float, lddb*N ); /* Initialize the matrices */ lapackf77_slarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_slarnv( &ione, ISEED, &sizeB, h_B ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_ssetmatrix( Ak, Ak, h_A, lda, d_A, ldda ); magma_ssetmatrix( M, N, h_B, ldb, d_B, lddb ); // note cublas does trmm out-of-place (i.e., adds output matrix C), // but allows C=B to do in-place. cublas_time = magma_sync_wtime( NULL ); cublasStrmm( handle, cublas_side_const(opts.side), cublas_uplo_const(opts.uplo), cublas_trans_const(opts.transA), cublas_diag_const(opts.diag), M, N, &alpha, d_A, ldda, d_B, lddb, d_B, lddb ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_sgetmatrix( M, N, d_B, lddb, h_Bcublas, ldb ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_strmm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &alpha, h_A, &lda, h_B, &ldb ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma & cublas, relative to lapack, // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_slange( "M", &M, &N, h_B, &ldb, work ); blasf77_saxpy( &sizeB, &c_neg_one, h_B, &ione, h_Bcublas, &ione ); cublas_error = lapackf77_slange( "M", &M, &N, h_Bcublas, &ldb, work ) / Cnorm; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, cublas_error, (cublas_error < tol ? "ok" : "failed")); status += ! (cublas_error < tol); } else { printf("%5d %5d %7.2f (%7.2f) --- ( --- ) --- ---\n", (int) M, (int) N, cublas_perf, 1000.*cublas_time); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_Bcublas ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgegqr */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float e1, e2, e3, e4, e5, *work; float c_neg_one = MAGMA_S_NEG_ONE; float c_one = MAGMA_S_ONE; float c_zero = MAGMA_S_ZERO; float *h_A, *h_R, *tau, *dtau, *h_work, *h_rwork, tmp[1]; float *d_A, *dwork; magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn; magma_int_t ione = 1, ldwork; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) // versions 1...4 are valid if (opts.version < 1 || opts.version > 4) { printf("Unknown version %d; exiting\n", opts.version ); return -1; } float tol, eps = lapackf77_slamch("E"); tol = 10* opts.tolerance * eps; printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) ||I-Q'Q||_F / M ||I-Q'Q||_I / M ||A-Q R||_I\n"); printf(" MAGMA / LAPACK MAGMA / LAPACK\n"); printf("==========================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; if (N > 128) { printf("%5d %5d skipping because sgegqr requires N <= 128\n", (int) M, (int) N); continue; } if (M < N) { printf("%5d %5d skipping because sgegqr requires M >= N\n", (int) M, (int) N); continue; } min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_SGEQRF( M, N ) / 1e9 + FLOPS_SORGQR( M, N, N ) / 1e9; // query for workspace size lwork = -1; lapackf77_sgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_S_REAL( tmp[0] ); lwork = max(lwork, 3*N*N); ldwork = N*N; if (opts.version == 2) { ldwork = 3*N*N + min_mn; } TESTING_MALLOC_PIN( tau, float, min_mn ); TESTING_MALLOC_PIN( h_work, float, lwork ); TESTING_MALLOC_PIN(h_rwork, float, lwork ); TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_CPU( h_R, float, n2 ); TESTING_MALLOC_CPU( work, float, M ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); TESTING_MALLOC_DEV( dtau, float, min_mn ); TESTING_MALLOC_DEV( dwork, float, ldwork ); /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_ssetmatrix( M, N, h_R, lda, d_A, ldda ); // warmup magma_sgegqr_gpu( 1, M, N, d_A, ldda, dwork, h_work, &info ); magma_ssetmatrix( M, N, h_R, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_sync_wtime( 0 ); magma_sgegqr_gpu( opts.version, M, N, d_A, ldda, dwork, h_rwork, &info ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_sgegqr returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_sgetmatrix( M, N, d_A, ldda, h_R, M ); // Regenerate R // blasf77_sgemm("t", "n", &N, &N, &M, &c_one, h_R, &M, h_A, &M, &c_zero, h_rwork, &N); // magma_sprint(N, N, h_work, N); blasf77_strmm("r", "u", "n", "n", &M, &N, &c_one, h_rwork, &N, h_R, &M); blasf77_saxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); e5 = lapackf77_slange("i", &M, &N, h_R, &M, work) / lapackf77_slange("i", &M, &N, h_A, &lda, work); magma_sgetmatrix( M, N, d_A, ldda, h_R, M ); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); /* Orthogonalize on the CPU */ lapackf77_sgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); lapackf77_sorgqr(&M, &N, &N, h_A, &lda, tau, h_work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_sorgqr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ blasf77_sgemm("t", "n", &N, &N, &M, &c_one, h_R, &M, h_R, &M, &c_zero, h_work, &N); for(int ii = 0; ii < N*N; ii += N+1 ) { h_work[ii] = MAGMA_S_SUB(h_work[ii], c_one); } e1 = lapackf77_slange("f", &N, &N, h_work, &N, work) / N; e3 = lapackf77_slange("i", &N, &N, h_work, &N, work) / N; blasf77_sgemm("t", "n", &N, &N, &M, &c_one, h_A, &M, h_A, &M, &c_zero, h_work, &N); for(int ii = 0; ii < N*N; ii += N+1 ) { h_work[ii] = MAGMA_S_SUB(h_work[ii], c_one); } e2 = lapackf77_slange("f", &N, &N, h_work, &N, work) / N; e4 = lapackf77_slange("i", &N, &N, h_work, &N, work) / N; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e / %8.2e %8.2e / %8.2e %8.2e %s\n", (int) M, (int) N, cpu_perf, 1000.*cpu_time, gpu_perf, 1000.*gpu_time, e1, e2, e3, e4, e5, (e1 < tol ? "ok" : "failed")); status += ! (e1 < tol); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, 1000.*gpu_time ); } TESTING_FREE_PIN( tau ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( h_rwork ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_R ); TESTING_FREE_CPU( work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( dtau ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing strmm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cublas_perf, cublas_time, cpu_perf, cpu_time; float cublas_error, Cnorm, work[1]; magma_int_t M, N; magma_int_t Ak; magma_int_t sizeA, sizeB; magma_int_t lda, ldb, ldda, lddb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float *h_A, *h_B, *h_Bcublas; float *d_A, *d_B; float c_neg_one = MAGMA_S_NEG_ONE; float alpha = MAGMA_S_MAKE( 0.29, -0.86 ); magma_opts opts; parse_opts( argc, argv, &opts ); printf("If running lapack (option --lapack), MAGMA and CUBLAS error are both computed\n" "relative to CPU BLAS result. Else, MAGMA error is computed relative to CUBLAS result.\n\n" "side = %c, uplo = %c, transA = %c, diag = %c \n", opts.side, opts.uplo, opts.transA, opts.diag ); printf(" M N CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n"); printf("==================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; gflops = FLOPS_STRMM(opts.side, M, N) / 1e9; if ( opts.side == MagmaLeft ) { lda = M; Ak = M; } else { lda = N; Ak = N; } ldb = M; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; sizeA = lda*Ak; sizeB = ldb*N; TESTING_MALLOC( h_A, float, lda*Ak ); TESTING_MALLOC( h_B, float, ldb*N ); TESTING_MALLOC( h_Bcublas, float, ldb*N ); TESTING_DEVALLOC( d_A, float, ldda*Ak ); TESTING_DEVALLOC( d_B, float, lddb*N ); /* Initialize the matrices */ lapackf77_slarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_slarnv( &ione, ISEED, &sizeB, h_B ); /* ===================================================================== Performs operation using CUDA-BLAS =================================================================== */ magma_ssetmatrix( Ak, Ak, h_A, lda, d_A, ldda ); magma_ssetmatrix( M, N, h_B, ldb, d_B, lddb ); cublas_time = magma_sync_wtime( NULL ); cublasStrmm( opts.side, opts.uplo, opts.transA, opts.diag, M, N, alpha, d_A, ldda, d_B, lddb ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_sgetmatrix( M, N, d_B, lddb, h_Bcublas, ldb ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_strmm( &opts.side, &opts.uplo, &opts.transA, &opts.diag, &M, &N, &alpha, h_A, &lda, h_B, &ldb ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma & cublas, relative to lapack, // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_slange( "M", &M, &N, h_B, &ldb, work ); blasf77_saxpy( &sizeB, &c_neg_one, h_B, &ione, h_Bcublas, &ione ); cublas_error = lapackf77_slange( "M", &M, &N, h_Bcublas, &ldb, work ) / Cnorm; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e\n", (int) M, (int) N, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, cublas_error ); } else { printf("%5d %5d %7.2f (%7.2f) --- ( --- ) --- ---\n", (int) M, (int) N, cublas_perf, 1000.*cublas_time); } TESTING_FREE( h_A ); TESTING_FREE( h_B ); TESTING_FREE( h_Bcublas ); TESTING_DEVFREE( d_A ); TESTING_DEVFREE( d_B ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }