void MAGMAF_ZUNMQR( char *side, char *trans, magma_int_t *m, magma_int_t *n, magma_int_t *k, cuDoubleComplex *a, magma_int_t *lda, cuDoubleComplex *tau, cuDoubleComplex *c, magma_int_t *ldc, cuDoubleComplex *work, magma_int_t *lwork, magma_int_t *info) { magma_zunmqr( side[0], trans[0], *m, *n, *k, a, *lda, tau, c, *ldc, work, *lwork, info); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zunmqr */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_int_t mm, m, n, k, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t nb, ldc, lda, lwork, lwork_max; magmaDoubleComplex *C, *R, *A, *W, *tau; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); // need slightly looser bound (60*eps instead of 30*eps) for some tests opts.tolerance = max( 60., opts.tolerance ); double tol = opts.tolerance * lapackf77_dlamch("E"); // test all combinations of input parameters magma_side_t side [] = { MagmaLeft, MagmaRight }; magma_trans_t trans[] = { MagmaConjTrans, MagmaNoTrans }; printf(" M N K side trans CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||QC||_F\n"); printf("===============================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; nb = magma_get_zgeqrf_nb( m ); ldc = m; // A is m x k (left) or n x k (right) mm = (side[iside] == MagmaLeft ? m : n); lda = mm; gflops = FLOPS_ZUNMQR( m, n, k, side[iside] ) / 1e9; if ( side[iside] == MagmaLeft && m < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=left and m < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } if ( side[iside] == MagmaRight && n < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=right and n < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } // need at least 2*nb*nb for geqrf lwork_max = max( max( m*nb, n*nb ), 2*nb*nb ); TESTING_MALLOC_CPU( C, magmaDoubleComplex, ldc*n ); TESTING_MALLOC_CPU( R, magmaDoubleComplex, ldc*n ); TESTING_MALLOC_CPU( A, magmaDoubleComplex, lda*k ); TESTING_MALLOC_CPU( W, magmaDoubleComplex, lwork_max ); TESTING_MALLOC_CPU( tau, magmaDoubleComplex, k ); // C is full, m x n size = ldc*n; lapackf77_zlarnv( &ione, ISEED, &size, C ); lapackf77_zlacpy( "Full", &m, &n, C, &ldc, R, &ldc ); size = lda*k; lapackf77_zlarnv( &ione, ISEED, &size, A ); // compute QR factorization to get Householder vectors in A, tau magma_zgeqrf( mm, k, A, lda, tau, W, lwork_max, &info ); if (info != 0) printf("magma_zgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_zunmqr( lapack_side_const( side[iside] ), lapack_trans_const( trans[itran] ), &m, &n, &k, A, &lda, tau, C, &ldc, W, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zunmqr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for workspace size lwork = -1; magma_zunmqr( side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); if (info != 0) printf("magma_zunmqr (lwork query) returned error %d: %s.\n", (int) info, magma_strerror( info )); lwork = (magma_int_t) MAGMA_Z_REAL( W[0] ); if ( lwork < 0 || lwork > lwork_max ) { printf("optimal lwork %d > lwork_max %d\n", (int) lwork, (int) lwork_max ); lwork = lwork_max; } gpu_time = magma_wtime(); magma_zunmqr( side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zunmqr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ error = lapackf77_zlange( "Fro", &m, &n, C, &ldc, work ); size = ldc*n; blasf77_zaxpy( &size, &c_neg_one, C, &ione, R, &ione ); error = lapackf77_zlange( "Fro", &m, &n, R, &ldc, work ) / error; printf( "%5d %5d %5d %4c %5c %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ), cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); TESTING_FREE_CPU( C ); TESTING_FREE_CPU( R ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( W ); TESTING_FREE_CPU( tau ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } }} // end iside, itran printf( "\n" ); } TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_zunmtr(char side, char uplo, char trans, magma_int_t m, magma_int_t n, cuDoubleComplex *a, magma_int_t lda, cuDoubleComplex *tau, cuDoubleComplex *c, magma_int_t ldc, cuDoubleComplex *work, magma_int_t lwork, magma_int_t *info) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= ZUNMTR overwrites the general complex M-by-N matrix C with SIDE = 'L' SIDE = 'R' TRANS = 'N': Q * C C * Q TRANS = 'T': Q**H * C C * Q**H where Q is a complex orthogonal matrix of order nq, with nq = m if SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of nq-1 elementary reflectors, as returned by SSYTRD: if UPLO = 'U', Q = H(nq-1) . . . H(2) H(1); if UPLO = 'L', Q = H(1) H(2) . . . H(nq-1). Arguments ========= SIDE (input) CHARACTER*1 = 'L': apply Q or Q**H from the Left; = 'R': apply Q or Q**H from the Right. UPLO (input) CHARACTER*1 = 'U': Upper triangle of A contains elementary reflectors from SSYTRD; = 'L': Lower triangle of A contains elementary reflectors from SSYTRD. TRANS (input) CHARACTER*1 = 'N': No transpose, apply Q; = 'T': Transpose, apply Q**H. M (input) INTEGER The number of rows of the matrix C. M >= 0. N (input) INTEGER The number of columns of the matrix C. N >= 0. A (input) COMPLEX_16 array, dimension (LDA,M) if SIDE = 'L' (LDA,N) if SIDE = 'R' The vectors which define the elementary reflectors, as returned by SSYTRD. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M) if SIDE = 'L'; LDA >= max(1,N) if SIDE = 'R'. TAU (input) COMPLEX_16 array, dimension (M-1) if SIDE = 'L' (N-1) if SIDE = 'R' TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by SSYTRD. C (input/output) COMPLEX_16 array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H * C or C * Q**H or C*Q. LDC (input) INTEGER The leading dimension of the array C. LDC >= max(1,M). WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. If SIDE = 'L', LWORK >= max(1,N); if SIDE = 'R', LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = 'L', and LWORK >= M*NB if SIDE = 'R', where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ cuDoubleComplex c_one = MAGMA_Z_ONE; char side_[2] = {side, 0}; char uplo_[2] = {uplo, 0}; char trans_[2] = {trans, 0}; magma_int_t i__2; magma_int_t i1, i2, nb, mi, ni, nq, nw; int left, upper, lquery; magma_int_t iinfo; magma_int_t lwkopt; *info = 0; left = lapackf77_lsame(side_, "L"); upper = lapackf77_lsame(uplo_, "U"); lquery = lwork == -1; /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if (! left && ! lapackf77_lsame(side_, "R")) { *info = -1; } else if (! upper && ! lapackf77_lsame(uplo_, "L")) { *info = -2; } else if (! lapackf77_lsame(trans_, "N") && ! lapackf77_lsame(trans_, "C")) { *info = -3; } else if (m < 0) { *info = -4; } else if (n < 0) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } if (*info == 0) { nb = 32; lwkopt = max(1,nw) * nb; MAGMA_Z_SET2REAL( work[0], lwkopt ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || nq == 1) { work[0] = c_one; return *info; } if (left) { mi = m - 1; ni = n; } else { mi = m; ni = n - 1; } if (upper) { /* Q was determined by a call to SSYTRD with UPLO = 'U' */ i__2 = nq - 1; //lapackf77_zunmql(side_, trans_, &mi, &ni, &i__2, &a[lda], &lda, // tau, c, &ldc, work, &lwork, &iinfo); magma_zunmql(side, trans, mi, ni, i__2, &a[lda], lda, tau, c, ldc, work, lwork, &iinfo); } else { /* Q was determined by a call to SSYTRD with UPLO = 'L' */ if (left) { i1 = 1; i2 = 0; } else { i1 = 0; i2 = 1; } i__2 = nq - 1; magma_zunmqr(side, trans, mi, ni, i__2, &a[1], lda, tau, &c[i1 + i2 * ldc], ldc, work, lwork, &iinfo); } MAGMA_Z_SET2REAL( work[0], lwkopt ); return *info; } /* magma_zunmtr */