/* //////////////////////////////////////////////////////////////////////////// -- Testing cgeqrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float error, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_R, *tau, *h_work, tmp[1]; magmaFloatComplex_ptr d_A, dT; magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn, nb, size; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}, ISEED2[4]; magma_opts opts; parse_opts( argc, argv, &opts ); magma_int_t status = 0; float tol; opts.lapack |= (opts.version == 2 && opts.check == 2); // check (-c2) implies lapack (-l) if ( opts.version != 2 && opts.check == 1 ) { printf( "NOTE: version %d requires -c2 check due to the special structure of the\n" "MAGMA cgeqrf results; using -c2.\n\n", (int) opts.version ); opts.check = 2; } printf( "version %d\n", (int) opts.version ); if ( opts.version == 2 ) { if ( opts.check == 1 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R-Q'A||_1 / (M*||A||_1*eps) ||I-Q'Q||_1 / (M*eps)\n"); printf("=========================================================================================================\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("=======================================================================\n"); } tol = 1.0; } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||Ax-b||_F/(N*||A||_F*||x||_F)\n"); printf("====================================================================================\n"); tol = opts.tolerance * lapackf77_slamch("E"); } for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_CGEQRF( M, N ) / 1e9; // query for workspace size lwork = -1; lapackf77_cgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_C_REAL( tmp[0] ); TESTING_MALLOC_CPU( tau, magmaFloatComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_work, magmaFloatComplex, lwork ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2 ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); /* Initialize the matrix */ for ( int j=0; j<4; j++ ) ISEED2[j] = ISEED[j]; // save seeds lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_csetmatrix( M, N, h_R, lda, d_A, 0, ldda, opts.queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); if ( opts.version == 2 ) { magma_cgeqrf2_gpu( M, N, d_A, 0, ldda, tau, opts.queues2, &info ); } else { nb = magma_get_cgeqrf_nb( M ); size = (2*min(M, N) + (N+31)/32*32 )*nb; TESTING_MALLOC_DEV( dT, magmaFloatComplex, size ); if ( opts.version == 1 ) { magma_cgeqrf_gpu( M, N, d_A, 0, ldda, tau, dT, 0, opts.queue, &info ); } #ifdef HAVE_CUBLAS else if ( opts.version == 3 ) { magma_cgeqrf3_gpu( M, N, d_A, 0, ldda, tau, dT, opts.queue, &info ); } #endif else { printf( "Unknown version %d\n", opts.version ); exit(1); } } gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ magmaFloatComplex *tau2; TESTING_MALLOC_CPU( tau2, magmaFloatComplex, min_mn ); cpu_time = magma_wtime(); lapackf77_cgeqrf(&M, &N, h_A, &lda, tau2, h_work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE_CPU( tau2 ); } if ( opts.check == 1 && M >= N ) { /* ===================================================================== Check the result -- only version 1, cqrt02 requires M >= N =================================================================== */ magma_int_t lwork = n2+N; magmaFloatComplex *h_W1, *h_W2, *h_W3; float *h_RW, results[2]; magma_cgetmatrix( M, N, d_A, 0, ldda, h_R, M, opts.queue ); TESTING_MALLOC_CPU( h_W1, magmaFloatComplex, n2 ); // Q TESTING_MALLOC_CPU( h_W2, magmaFloatComplex, n2 ); // R TESTING_MALLOC_CPU( h_W3, magmaFloatComplex, lwork ); // WORK TESTING_MALLOC_CPU( h_RW, float, M ); // RWORK lapackf77_clarnv( &ione, ISEED2, &n2, h_A ); lapackf77_cqrt02( &M, &N, &min_mn, h_A, h_R, h_W1, h_W2, &lda, tau, h_W3, &lwork, h_RW, results ); if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, results[0], results[1] ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, gpu_perf, gpu_time, results[0], results[1] ); } // todo also check results[1] < tol? printf(" %s\n", (results[0] < tol ? "ok" : "failed")); status += ! (results[0] < tol); TESTING_FREE_CPU( h_W1 ); TESTING_FREE_CPU( h_W2 ); TESTING_FREE_CPU( h_W3 ); TESTING_FREE_CPU( h_RW ); } else if ( opts.check == 2 && opts.version == 2 ) { /* ===================================================================== Check the result compared to LAPACK -- only version 2 =================================================================== */ magma_cgetmatrix( M, N, d_A, 0, ldda, h_R, M, opts.queue ); error = lapackf77_clange("f", &M, &N, h_A, &lda, work); blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_clange("f", &M, &N, h_R, &lda, work) / error; if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e", (int) M, (int) N, gpu_perf, gpu_time, error ); } printf(" %s\n", (error < tol ? "ok" : "failed")); status += ! (error < tol); } else if ( opts.check == 2 && M >= N ) { /* ===================================================================== Check the result by solving linear system -- only versions 1 & 3, M >= N =================================================================== */ magma_int_t lwork; magmaFloatComplex *x, *b, *hwork; magmaFloatComplex_ptr d_B; const magmaFloatComplex c_zero = MAGMA_C_ZERO; const magmaFloatComplex c_one = MAGMA_C_ONE; const magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; const magma_int_t ione = 1; // initialize RHS, b = A*random TESTING_MALLOC_CPU( x, magmaFloatComplex, N ); TESTING_MALLOC_CPU( b, magmaFloatComplex, M ); lapackf77_clarnv( &ione, ISEED, &N, x ); blasf77_cgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_zero, b, &ione ); // copy to GPU TESTING_MALLOC_DEV( d_B, magmaFloatComplex, M ); magma_csetvector( M, b, 1, d_B, 0, 1, opts.queue ); if ( opts.version == 1 ) { // allocate hwork magma_cgeqrs_gpu( M, N, 1, d_A, 0, ldda, tau, dT, 0, d_B, 0, M, tmp, -1, opts.queue, &info ); lwork = (magma_int_t)MAGMA_C_REAL( tmp[0] ); TESTING_MALLOC_CPU( hwork, magmaFloatComplex, lwork ); // solve linear system magma_cgeqrs_gpu( M, N, 1, d_A, 0, ldda, tau, dT, 0, d_B, 0, M, hwork, lwork, opts.queue, &info ); if (info != 0) printf("magma_cgeqrs returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE_CPU( hwork ); } #ifdef HAVE_CUBLAS else if ( opts.version == 3 ) { // allocate hwork magma_cgeqrs3_gpu( M, N, 1, d_A, 0, ldda, tau, dT, 0, d_B, 0, M, tmp, -1, opts.queue, &info ); lwork = (magma_int_t)MAGMA_C_REAL( tmp[0] ); TESTING_MALLOC_CPU( hwork, magmaFloatComplex, lwork ); // solve linear system magma_cgeqrs3_gpu( M, N, 1, d_A, 0, ldda, tau, dT, 0, d_B, 0, M, hwork, lwork, opts.queue, &info ); if (info != 0) printf("magma_cgeqrs3 returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE_CPU( hwork ); } #endif else { printf( "Unknown version %d\n", opts.version ); exit(1); } magma_cgetvector( N, d_B, 0, 1, x, 1, opts.queue ); // compute r = Ax - b, saved in b lapackf77_clarnv( &ione, ISEED2, &n2, h_A ); blasf77_cgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_neg_one, b, &ione ); // compute residual |Ax - b| / (n*|A|*|x|) float norm_x, norm_A, norm_r, work[1]; norm_A = lapackf77_clange( "F", &M, &N, h_A, &lda, work ); norm_r = lapackf77_clange( "F", &M, &ione, b, &M, work ); norm_x = lapackf77_clange( "F", &N, &ione, x, &N, work ); TESTING_FREE_CPU( x ); TESTING_FREE_CPU( b ); TESTING_FREE_DEV( d_B ); error = norm_r / (N * norm_A * norm_x); if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e", (int) M, (int) N, gpu_perf, gpu_time, error ); } printf(" %s\n", (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) ---", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) ---", (int) M, (int) N, gpu_perf, gpu_time); } printf("%s\n", (opts.check != 0 ? " (error check only for M >= N)" : "")); } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); if ( opts.version != 2 ) TESTING_FREE_DEV( dT ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/** Purpose ------- Solves the overdetermined, least squares problem min || A*X - C || using the QR factorization A. The underdetermined problem (m < n) is not currently handled. Arguments --------- @param[in] trans magma_trans_t - = MagmaNoTrans: the linear system involves A. Only TRANS=MagmaNoTrans is currently handled. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. M >= N >= 0. @param[in] nrhs INTEGER The number of columns of the matrix C. NRHS >= 0. @param[in,out] dA COMPLEX array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, A is overwritten by details of its QR factorization as returned by CGEQRF3. @param[in] ldda INTEGER The leading dimension of the array A, LDDA >= M. @param[in,out] dB COMPLEX array on the GPU, dimension (LDDB,NRHS) On entry, the M-by-NRHS matrix C. On exit, the N-by-NRHS solution matrix X. @param[in] lddb INTEGER The leading dimension of the array dB. LDDB >= M. @param[out] hwork (workspace) COMPLEX array, dimension MAX(1,LWORK). On exit, if INFO = 0, HWORK(1) returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array HWORK, LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB, where NB is the blocksize given by magma_get_cgeqrf_nb( M ). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the HWORK array. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_cgels_driver ********************************************************************/ extern "C" magma_int_t magma_cgels3_gpu( magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaFloatComplex *dA, magma_int_t ldda, magmaFloatComplex *dB, magma_int_t lddb, magmaFloatComplex *hwork, magma_int_t lwork, magma_int_t *info) { magmaFloatComplex *dT, *tau; magma_int_t k; magma_int_t nb = magma_get_cgeqrf_nb(m); magma_int_t lwkopt = (m - n + nb)*(nrhs + nb) + nrhs*nb; int lquery = (lwork == -1); hwork[0] = MAGMA_C_MAKE( (float)lwkopt, 0. ); *info = 0; /* For now, N is the only case working */ if ( trans != MagmaNoTrans ) *info = -1; else if (m < 0) *info = -2; else if (n < 0 || m < n) /* LQ is not handle for now*/ *info = -3; else if (nrhs < 0) *info = -4; else if (ldda < max(1,m)) *info = -6; else if (lddb < max(1,m)) *info = -8; else if (lwork < lwkopt && ! lquery) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { hwork[0] = MAGMA_C_ONE; return *info; } /* * Allocate temporary buffers */ int ldtwork = ( 2*k + ((n+31)/32)*32 )*nb; if (nb < nrhs) ldtwork = ( 2*k + ((n+31)/32)*32 )*nrhs; if (MAGMA_SUCCESS != magma_cmalloc( &dT, ldtwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_cmalloc_cpu( &tau, k ); if ( tau == NULL ) { magma_free( dT ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_cgeqrf3_gpu( m, n, dA, ldda, tau, dT, info ); if ( *info == 0 ) { magma_cgeqrs3_gpu( m, n, nrhs, dA, ldda, tau, dT, dB, lddb, hwork, lwork, info ); } magma_free( dT ); magma_free_cpu(tau); return *info; }