/* //////////////////////////////////////////////////////////////////////////// -- Testing cunmlq */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float Cnorm, error, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t mm, m, n, k, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t nb, ldc, lda, lwork, lwork_max; magmaFloatComplex *C, *R, *A, *W, *tau; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); // need slightly looser bound (60*eps instead of 30*eps) for some tests opts.tolerance = max( 60., opts.tolerance ); float tol = opts.tolerance * lapackf77_slamch("E"); // test all combinations of input parameters magma_side_t side [] = { MagmaLeft, MagmaRight }; magma_trans_t trans[] = { Magma_ConjTrans, MagmaNoTrans }; printf("%% M N K side trans CPU Gflop/s (sec) GPU Gflop/s (sec) ||R||_F / ||QC||_F\n"); printf("%%==============================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; nb = magma_get_cgelqf_nb( m, n ); ldc = m; // A is k x m (left) or k x n (right) mm = (side[iside] == MagmaLeft ? m : n); lda = k; gflops = FLOPS_CUNMLQ( m, n, k, side[iside] ) / 1e9; if ( side[iside] == MagmaLeft && m < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=left and m < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } if ( side[iside] == MagmaRight && n < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=right and n < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } // need at least 2*nb*nb for gelqf lwork_max = max( max( m*nb, n*nb ), 2*nb*nb ); // this rounds it up slightly if needed to agree with lwork query lwork_max = int( real( magma_cmake_lwork( lwork_max ))); TESTING_MALLOC_CPU( C, magmaFloatComplex, ldc*n ); TESTING_MALLOC_CPU( R, magmaFloatComplex, ldc*n ); TESTING_MALLOC_CPU( A, magmaFloatComplex, lda*mm ); TESTING_MALLOC_CPU( W, magmaFloatComplex, lwork_max ); TESTING_MALLOC_CPU( tau, magmaFloatComplex, k ); // C is full, m x n size = ldc*n; lapackf77_clarnv( &ione, ISEED, &size, C ); lapackf77_clacpy( "Full", &m, &n, C, &ldc, R, &ldc ); size = lda*mm; lapackf77_clarnv( &ione, ISEED, &size, A ); // compute LQ factorization to get Householder vectors in A, tau magma_cgelqf( k, mm, A, lda, tau, W, lwork_max, &info ); if (info != 0) { printf("magma_cgelqf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cunmlq( lapack_side_const( side[iside] ), lapack_trans_const( trans[itran] ), &m, &n, &k, A, &lda, tau, C, &ldc, W, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_cunmlq returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for workspace size lwork = -1; magma_cunmlq( side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); if (info != 0) { printf("magma_cunmlq (lwork query) returned error %d: %s.\n", (int) info, magma_strerror( info )); } lwork = (magma_int_t) MAGMA_C_REAL( W[0] ); if ( lwork < 0 || lwork > lwork_max ) { printf("Warning: optimal lwork %d > allocated lwork_max %d\n", (int) lwork, (int) lwork_max ); lwork = lwork_max; } gpu_time = magma_wtime(); magma_cunmlq( side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_cunmlq returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ size = ldc*n; blasf77_caxpy( &size, &c_neg_one, C, &ione, R, &ione ); Cnorm = lapackf77_clange( "Fro", &m, &n, C, &ldc, work ); error = lapackf77_clange( "Fro", &m, &n, R, &ldc, work ) / (magma_ssqrt(m*n) * Cnorm); printf( "%5d %5d %5d %4c %5c %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ), cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); TESTING_FREE_CPU( C ); TESTING_FREE_CPU( R ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( W ); TESTING_FREE_CPU( tau ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } }} // end iside, itran printf( "\n" ); } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgelqf */ int main( int argc, char** argv) { TESTING_CUDA_INIT(); magma_timestr_t start, end; float flops, gpu_perf, cpu_perf; float matnorm, work[1]; cuFloatComplex c_neg_one = MAGMA_C_NEG_ONE; cuFloatComplex *h_A, *h_R, *tau, *h_work, tmp[1]; /* Matrix size */ magma_int_t M = 0, N = 0, n2, lda, lwork; magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112}; magma_int_t i, info, min_mn, nb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); } if ( M == 0 ) { M = N; } if ( N == 0 ) { N = M; } if (N>0 && M>0) printf(" testing_cgelqf -M %d -N %d\n\n", (int) M, (int) N); else { printf("\nUsage: \n"); printf(" testing_cgelqf -M %d -N %d\n\n", (int) M, (int) N); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_cgelqf -M %d -N %d\n\n", 1024, 1024); M = N = size[9]; } n2 = M * N; min_mn = min(M, N); nb = magma_get_cgeqrf_nb(M); TESTING_MALLOC( tau, cuFloatComplex, min_mn ); TESTING_MALLOC( h_A, cuFloatComplex, n2 ); TESTING_HOSTALLOC( h_R, cuFloatComplex, n2 ); lwork = -1; lapackf77_cgelqf(&M, &N, h_A, &M, tau, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_C_REAL( tmp[0] ); lwork = max( lwork, M*nb ); TESTING_HOSTALLOC( h_work, cuFloatComplex, lwork ); printf(" M N CPU GFlop/s GPU GFlop/s ||R||_F / ||A||_F\n"); printf("==========================================================\n"); for(i=0; i<10; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); lda = M; n2 = lda*N; flops = FLOPS( (float)M, (float)N ) / 1000000; /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ start = get_current_time(); magma_cgelqf( M, N, h_R, lda, tau, h_work, lwork, &info); end = get_current_time(); if (info < 0) printf("Argument %d of magma_cgelqf had an illegal value.\n", (int) -info); gpu_perf = flops / GetTimerValue(start, end); /* ===================================================================== Performs operation using LAPACK =================================================================== */ start = get_current_time(); lapackf77_cgelqf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); end = get_current_time(); if (info < 0) printf("Argument %d of lapack_cgelqf had an illegal value.\n", (int) -info); cpu_perf = flops / GetTimerValue(start, end); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ matnorm = lapackf77_clange("f", &M, &N, h_A, &lda, work); blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); printf("%5d %5d %6.2f %6.2f %e\n", (int) M, (int) N, cpu_perf, gpu_perf, lapackf77_clange("f", &M, &N, h_R, &lda, work) / matnorm); if (argc != 1) break; } /* Memory clean up */ TESTING_FREE( tau ); TESTING_FREE( h_A ); TESTING_HOSTFREE( h_R ); TESTING_HOSTFREE( h_work ); /* Shutdown */ TESTING_CUDA_FINALIZE(); }