/* //////////////////////////////////////////////////////////////////////////// -- Testing cgegqr */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float e1, e2, work[1]; magmaFloatComplex *h_A, *h_R, *tau, *dtau, *h_work, tmp[1]; magmaFloatComplex *d_A, *dwork, *ddA, *d_T; magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) ||I - Q'Q||_F \n"); printf("=======================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_CGEQRF( M, N ) / 1e9 + FLOPS_CUNGQR( M, N, N ) / 1e9; // query for workspace size lwork = -1; lapackf77_cgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_C_REAL( tmp[0] ); lwork = max(lwork, 3*N*N); TESTING_MALLOC_PIN( tau, magmaFloatComplex, min_mn ); TESTING_MALLOC_PIN( h_work, magmaFloatComplex, lwork ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_R, magmaFloatComplex, n2 ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( dtau, magmaFloatComplex, min_mn ); TESTING_MALLOC_DEV( dwork, magmaFloatComplex, N*N ); TESTING_MALLOC_DEV( ddA, magmaFloatComplex, N*N ); TESTING_MALLOC_DEV( d_T, magmaFloatComplex, N*N ); cudaMemset( ddA, 0, N*N*sizeof(magmaFloatComplex) ); cudaMemset( d_T, 0, N*N*sizeof(magmaFloatComplex) ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_csetmatrix( M, N, h_R, lda, d_A, ldda ); // warmup magma_cgegqr_gpu( M, N, d_A, ldda, dwork, h_work, &info ); magma_csetmatrix( M, N, h_R, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_sync_wtime( 0 ); if (opts.version == 2) { int min_mn = min(M, N); int nb = N; cuFloatComplex *dtau = dwork; magma_cgeqr2x3_gpu(&M, &N, d_A, &ldda, dtau, d_T, ddA, (float *)(dwork+min_mn), &info); magma_cgetmatrix( min_mn, 1, dtau, min_mn, tau, min_mn); magma_cungqr_gpu( M, N, N, d_A, ldda, tau, d_T, nb, &info ); } else magma_cgegqr_gpu( M, N, d_A, ldda, dwork, h_work, &info ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cgegqr returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); /* Orthogonalize on the CPU */ lapackf77_cgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); lapackf77_cungqr(&M, &N, &N, h_A, &lda, tau, h_work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cungqr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_cgetmatrix( M, N, d_A, ldda, h_R, M ); magmaFloatComplex one = MAGMA_C_ONE, zero = MAGMA_C_ZERO; blasf77_cgemm("t", "n", &N, &N, &M, &one, h_R, &M, h_R, &M, &zero, h_work, &N); for(int ii=0; ii<N*N; ii+=(N+1)) h_work[ii] = MAGMA_C_SUB(h_work[ii], one); e1 = lapackf77_clange("f", &N, &N, h_work, &N, work); blasf77_cgemm("t", "n", &N, &N, &M, &one, h_A, &M, h_A, &M, &zero, h_work, &N); for(int ii=0; ii<N*N; ii+=(N+1)) h_work[ii] = MAGMA_C_SUB(h_work[ii], one); e2 = lapackf77_clange("f", &N, &N, h_work, &N, work); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e\n", (int) M, (int) N, cpu_perf, 1000.*cpu_time, gpu_perf, 1000.*gpu_time, e1, e2 ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, 1000.*gpu_time ); } TESTING_FREE_PIN( tau ); TESTING_FREE_PIN( h_work ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( dtau ); TESTING_FREE_DEV( dwork ); TESTING_FREE_DEV( ddA ); TESTING_FREE_DEV( d_T ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
/** Purpose ------- CGEGQR orthogonalizes the N vectors given by a complex M-by-N matrix A: A = Q * R. On exit, if successful, the orthogonal vectors Q overwrite A and R is given in work (on the CPU memory). The routine is designed for tall-and-skinny matrices: M >> N, N <= 128. This version uses normal equations and SVD in an iterative process that makes the computation numerically accurate. Arguments --------- @param[in] ikind INTEGER Several versions are implemented indiceted by the ikind value: 1: This version uses normal equations and SVD in an iterative process that makes the computation numerically accurate. 2: This version uses a standard LAPACK-based orthogonalization through MAGMA's QR panel factorization (magma_cgeqr2x3_gpu) and magma_cungqr 3: MGS 4. Cholesky QR @param[in] m INTEGER The number of rows of the matrix A. m >= n >= 0. @param[in] n INTEGER The number of columns of the matrix A. 128 >= n >= 0. @param[in,out] dA COMPLEX array on the GPU, dimension (ldda,n) On entry, the m-by-n matrix A. On exit, the m-by-n matrix Q with orthogonal columns. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,m). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param dwork (GPU workspace) COMPLEX array, dimension: n^2 for ikind = 1 3 n^2 + min(m, n) for ikind = 2 0 (not used) for ikind = 3 n^2 for ikind = 4 @param[out] work (CPU workspace) COMPLEX array, dimension 3 n^2. On exit, work(1:n^2) holds the rectangular matrix R. Preferably, for higher performance, work should be in pinned memory. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. @ingroup magma_cgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_cgegqr_gpu( magma_int_t ikind, magma_int_t m, magma_int_t n, magmaFloatComplex *dA, magma_int_t ldda, magmaFloatComplex *dwork, magmaFloatComplex *work, magma_int_t *info ) { #define work(i_,j_) (work + (i_) + (j_)*n) #define dA(i_,j_) (dA + (i_) + (j_)*ldda) magma_int_t i = 0, j, k, n2 = n*n; magma_int_t ione = 1; magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_one = MAGMA_C_ONE; float cn = 200., mins, maxs; /* check arguments */ *info = 0; if (ikind < 1 || ikind > 4) { *info = -1; } else if (m < 0 || m < n) { *info = -2; } else if (n < 0 || n > 128) { *info = -3; } else if (ldda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (ikind == 1) { // === Iterative, based on SVD ============================================================ magmaFloatComplex *U, *VT, *vt, *R, *G, *hwork, *tau; float *S; R = work; // Size n * n G = R + n*n; // Size n * n VT = G + n*n; // Size n * n magma_cmalloc_cpu( &hwork, 32 + 2*n*n + 2*n); if ( hwork == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_int_t lwork=n*n+32; // First part f hwork; used as workspace in svd U = hwork + n*n + 32; // Size n*n S = (float *)(U+n*n); // Size n tau = U + n*n + n; // Size n #if defined(PRECISION_c) || defined(PRECISION_z) float *rwork; magma_smalloc_cpu( &rwork, 5*n); if ( rwork == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } #endif do { i++; magma_cgemm(MagmaConjTrans, MagmaNoTrans, n, n, m, c_one, dA, ldda, dA, ldda, c_zero, dwork, n ); magma_cgetmatrix(n, n, dwork, n, G, n); #if defined(PRECISION_s) || defined(PRECISION_d) lapackf77_cgesvd("n", "a", &n, &n, G, &n, S, U, &n, VT, &n, hwork, &lwork, info); #else lapackf77_cgesvd("n", "a", &n, &n, G, &n, S, U, &n, VT, &n, hwork, &lwork, rwork, info); #endif mins = 100.f, maxs = 0.f; for (k=0; k < n; k++) { S[k] = magma_ssqrt( S[k] ); if (S[k] < mins) mins = S[k]; if (S[k] > maxs) maxs = S[k]; } for (k=0; k < n; k++) { vt = VT + k*n; for (j=0; j < n; j++) vt[j] *= S[j]; } lapackf77_cgeqrf(&n, &n, VT, &n, tau, hwork, &lwork, info); if (i == 1) blasf77_ccopy(&n2, VT, &ione, R, &ione); else blasf77_ctrmm("l", "u", "n", "n", &n, &n, &c_one, VT, &n, R, &n); magma_csetmatrix(n, n, VT, n, dwork, n); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, m, n, c_one, dwork, n, dA, ldda); if (mins > 0.00001f) cn = maxs/mins; //fprintf(stderr, "Iteration %d, cond num = %f \n", i, cn); } while (cn > 10.f); magma_free_cpu( hwork ); #if defined(PRECISION_c) || defined(PRECISION_z) magma_free_cpu( rwork ); #endif // ================== end of ikind == 1 =================================================== } else if (ikind == 2) { // ================== LAPACK based =================================================== magma_int_t min_mn = min(m, n); magma_int_t nb = n; magmaFloatComplex *dtau = dwork + 2*n*n, *d_T = dwork, *ddA = dwork + n*n; magmaFloatComplex *tau = work+n*n; magmablas_claset( MagmaFull, n, n, c_zero, c_zero, d_T, n ); magma_cgeqr2x3_gpu(&m, &n, dA, &ldda, dtau, d_T, ddA, (float *)(dwork+min_mn+2*n*n), info); magma_cgetmatrix( min_mn, 1, dtau, min_mn, tau, min_mn); magma_cgetmatrix( n, n, ddA, n, work, n); magma_cungqr_gpu( m, n, n, dA, ldda, tau, d_T, nb, info ); // ================== end of ikind == 2 =================================================== } else if (ikind == 3) { // ================== MGS =================================================== for(magma_int_t j = 0; j<n; j++){ for(magma_int_t i = 0; i<j; i++){ *work(i, j) = magma_cdotc(m, dA(0,i), 1, dA(0,j), 1); magma_caxpy(m, -(*work(i,j)), dA(0,i), 1, dA(0,j), 1); } for(magma_int_t i = j; i<n; i++) *work(i, j) = MAGMA_C_ZERO; //*work(j,j) = MAGMA_C_MAKE( magma_scnrm2(m, dA(0,j), 1), 0. ); *work(j,j) = magma_cdotc(m, dA(0,j), 1, dA(0,j), 1); *work(j,j) = MAGMA_C_MAKE( sqrt(MAGMA_C_REAL( *work(j,j) )), 0.); magma_cscal(m, 1./ *work(j,j), dA(0,j), 1); } // ================== end of ikind == 3 =================================================== } else if (ikind == 4) { // ================== Cholesky QR =================================================== magma_cgemm(MagmaConjTrans, MagmaNoTrans, n, n, m, c_one, dA, ldda, dA, ldda, c_zero, dwork, n ); magma_cgetmatrix(n, n, dwork, n, work, n); lapackf77_cpotrf("u", &n, work, &n, info); magma_csetmatrix(n, n, work, n, dwork, n); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, m, n, c_one, dwork, n, dA, ldda); // ================== end of ikind == 4 =================================================== } return *info; } /* magma_cgegqr_gpu */
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgeqrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_T, *h_R, *tau, *h_work, tmp[1]; magmaFloatComplex *d_A, *d_T, *ddA, *dtau; magmaFloatComplex *d_A2, *d_T2, *ddA2, *dtau2; float *dwork, *dwork2; magma_int_t M, N, lda, ldda, lwork, n2, info, min_mn; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; #define BLOCK_SIZE 64 magma_opts opts; parse_opts( argc, argv, &opts ); float tol = 10. * opts.tolerance * lapackf77_slamch("E"); magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); printf("version %d\n", (int) opts.version ); printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) ||R||_F/||A||_F ||R_T||\n"); printf("=============================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; if (N > 128) { printf("%5d %5d skipping because cgeqr2x requires N <= 128\n", (int) M, (int) N); continue; } if (M < N) { printf("%5d %5d skipping because cgeqr2x requires M >= N\n", (int) M, (int) N); continue; } min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = (FLOPS_CGEQRF( M, N ) + FLOPS_CGEQRT( M, N )) / 1e9; /* Allocate memory for the matrix */ TESTING_MALLOC_CPU( tau, magmaFloatComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_T, magmaFloatComplex, N*N ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2 ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( d_T, magmaFloatComplex, N*N ); TESTING_MALLOC_DEV( ddA, magmaFloatComplex, N*N ); TESTING_MALLOC_DEV( dtau, magmaFloatComplex, min_mn ); TESTING_MALLOC_DEV( d_A2, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( d_T2, magmaFloatComplex, N*N ); TESTING_MALLOC_DEV( ddA2, magmaFloatComplex, N*N ); TESTING_MALLOC_DEV( dtau2, magmaFloatComplex, min_mn ); TESTING_MALLOC_DEV( dwork, float, max(5*min_mn, (BLOCK_SIZE*2+2)*min_mn) ); TESTING_MALLOC_DEV( dwork2, float, max(5*min_mn, (BLOCK_SIZE*2+2)*min_mn) ); // todo replace with magma_claset cudaMemset(ddA, 0, N*N*sizeof(magmaFloatComplex)); cudaMemset(d_T, 0, N*N*sizeof(magmaFloatComplex)); cudaMemset(ddA2, 0, N*N*sizeof(magmaFloatComplex)); cudaMemset(d_T2, 0, N*N*sizeof(magmaFloatComplex)); lwork = -1; lapackf77_cgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_C_REAL( tmp[0] ); lwork = max(lwork, N*N); TESTING_MALLOC_CPU( h_work, magmaFloatComplex, lwork ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_csetmatrix( M, N, h_R, lda, d_A, ldda ); magma_csetmatrix( M, N, h_R, lda, d_A2, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_sync_wtime(0); if (opts.version == 1) magma_cgeqr2x_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info); else if (opts.version == 2) magma_cgeqr2x2_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info); else if (opts.version == 3) magma_cgeqr2x3_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info); else { printf( "call magma_cgeqr2x4_gpu\n" ); /* Going through NULL stream is faster Going through any stream is slower Doing two streams in parallel is slower than doing them sequentially Queuing happens on the NULL stream - user defined buffers are smaller? */ magma_cgeqr2x4_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info, NULL); //magma_cgeqr2x4_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info, stream[1]); //magma_cgeqr2x4_gpu(M, N, d_A2, ldda, dtau2, d_T2, ddA2, dwork2, &info, stream[0]); //magma_cgeqr2x4_gpu(M, N, d_A2, ldda, dtau2, d_T2, ddA2, dwork2, &info, NULL); //gflops *= 2; } gpu_time = magma_sync_wtime(0) - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_cgeqr2x_gpu version %d returned error %d: %s.\n", (int) opts.version, (int) info, magma_strerror( info )); } else { if ( opts.check ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr, &M, &N, h_A, &lda, tau, h_work, &N); //magma_cgeqr2(&M, &N, h_A, &lda, tau, h_work, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_cgetmatrix( M, N, d_A, ldda, h_R, M ); magma_cgetmatrix( N, N, ddA, N, h_T, N ); // Restore the upper triangular part of A before the check for(int col=0; col < N; col++){ for(int row=0; row <= col; row++) h_R[row + col*M] = h_T[row + col*N]; } error = lapackf77_clange("M", &M, &N, h_A, &lda, work); blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_clange("M", &M, &N, h_R, &lda, work) / (N * error); // Check if T is the same magma_cgetmatrix( N, N, d_T, N, h_T, N ); float terr = 0.; for(int col=0; col < N; col++) for(int row=0; row <= col; row++) terr += ( MAGMA_C_ABS(h_work[row + col*N] - h_T[row + col*N])* MAGMA_C_ABS(h_work[row + col*N] - h_T[row + col*N]) ); terr = magma_ssqrt(terr); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, cpu_perf, 1000.*cpu_time, gpu_perf, 1000.*gpu_time, error, terr, (error < tol ? "ok" : "failed") ); status += ! (error < tol); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, 1000.*gpu_time); } } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_T ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_T ); TESTING_FREE_DEV( ddA ); TESTING_FREE_DEV( dtau ); TESTING_FREE_DEV( dwork ); TESTING_FREE_DEV( d_A2 ); TESTING_FREE_DEV( d_T2 ); TESTING_FREE_DEV( ddA2 ); TESTING_FREE_DEV( dtau2 ); TESTING_FREE_DEV( dwork2 ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); TESTING_FINALIZE(); return status; }