/***************************************************************************//** Purpose ------- CUNMTR overwrites the general complex M-by-N matrix C with SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q * C C * Q TRANS = Magma_ConjTrans: Q**H * C C * Q**H where Q is a complex unitary matrix of order nq, with nq = m if SIDE = MagmaLeft and nq = n if SIDE = MagmaRight. Q is defined as the product of nq-1 elementary reflectors, as returned by CHETRD: if UPLO = MagmaUpper, Q = H(nq-1) . . . H(2) H(1); if UPLO = MagmaLower, Q = H(1) H(2) . . . H(nq-1). Arguments --------- @param[in] side magma_side_t - = MagmaLeft: apply Q or Q**H from the Left; - = MagmaRight: apply Q or Q**H from the Right. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A contains elementary reflectors from CHETRD; - = MagmaLower: Lower triangle of A contains elementary reflectors from CHETRD. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q; - = Magma_ConjTrans: Conjugate transpose, apply Q**H. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] A COMPLEX array, dimension (LDA,M) if SIDE = MagmaLeft (LDA,N) if SIDE = MagmaRight The vectors which define the elementary reflectors, as returned by CHETRD. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M) if SIDE = MagmaLeft; LDA >= max(1,N) if SIDE = MagmaRight. @param[in] tau COMPLEX array, dimension (M-1) if SIDE = MagmaLeft (N-1) if SIDE = MagmaRight TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by CHETRD. @param[in,out] C COMPLEX array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H * C or C * Q**H or C*Q. @param[in] ldc INTEGER The leading dimension of the array C. LDC >= max(1,M). @param[out] work (workspace) COMPLEX array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. If SIDE = MagmaLeft, LWORK >= max(1,N); if SIDE = MagmaRight, LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = MagmaLeft, and LWORK >= M*NB if SIDE = MagmaRight, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_unmtr *******************************************************************************/ extern "C" magma_int_t magma_cunmtr( magma_side_t side, magma_uplo_t uplo, magma_trans_t trans, magma_int_t m, magma_int_t n, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex *C, magma_int_t ldc, magmaFloatComplex *work, magma_int_t lwork, magma_int_t *info) { #define A(i_,j_) (A + (i_) + (j_)*lda) #define C(i_,j_) (C + (i_) + (j_)*ldc) magmaFloatComplex c_one = MAGMA_C_ONE; magma_int_t i__2; magma_int_t i1, i2, nb, mi, ni, nq, nw; magma_int_t iinfo; magma_int_t lwkopt; *info = 0; bool left = (side == MagmaLeft); bool upper = (uplo == MagmaUpper); bool lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if (! left && side != MagmaRight) { *info = -1; } else if (! upper && uplo != MagmaLower) { *info = -2; } else if (trans != MagmaNoTrans && trans != Magma_ConjTrans) { *info = -3; } else if (m < 0) { *info = -4; } else if (n < 0) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } nb = 32; lwkopt = max(1,nw) * nb; if (*info == 0) { work[0] = magma_cmake_lwork( lwkopt ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || nq == 1) { work[0] = c_one; return *info; } if (left) { mi = m - 1; ni = n; } else { mi = m; ni = n - 1; } if (upper) { /* Q was determined by a call to CHETRD with UPLO = MagmaUpper */ i__2 = nq - 1; //lapackf77_cunmql(side_, trans_, &mi, &ni, &i__2, A(0,1), &lda, // tau, C, &ldc, work, &lwork, &iinfo); magma_cunmql(side, trans, mi, ni, i__2, A(0,1), lda, tau, C, ldc, work, lwork, &iinfo); } else { /* Q was determined by a call to CHETRD with UPLO = MagmaLower */ if (left) { i1 = 1; i2 = 0; } else { i1 = 0; i2 = 1; } i__2 = nq - 1; magma_cunmqr(side, trans, mi, ni, i__2, A(1,0), lda, tau, C(i1,i2), ldc, work, lwork, &iinfo); } work[0] = magma_cmake_lwork( lwkopt ); return *info; } /* magma_cunmtr */
/* //////////////////////////////////////////////////////////////////////////// -- Testing cunmqr */ int main( int argc, char** argv ) { TESTING_CUDA_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; cuFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; /* Matrix size */ magma_int_t m, n, k; const int MAXTESTS = 10; magma_int_t msize[MAXTESTS] = { 1024, 2048, 3072, 4032, 5184, 6016, 7040, 8064, 9088, 10112 }; magma_int_t nsize[MAXTESTS] = { 1024, 2048, 3072, 4032, 5184, 6016, 7040, 8064, 9088, 10112 }; magma_int_t ksize[MAXTESTS] = { 1024, 2048, 3072, 4032, 5184, 6016, 7040, 8064, 9088, 10112 }; magma_int_t size; magma_int_t info; magma_int_t iseed[4] = {0,0,0,1}; printf( "Usage: %s -N m,n,k -c\n" " -N can be repeated %d times. m > 0, n > 0, k > 0 is required.\n" " If only m,n is given, then n=k. If only m is given, then m=n=k.\n" " -c or setting $MAGMA_TESTINGS_CHECK runs LAPACK and checks result.\n\n", argv[0], MAXTESTS ); int checkres = (getenv("MAGMA_TESTINGS_CHECK") != NULL); int ntest = 0; magma_int_t nmax = 0; magma_int_t mmax = 0; magma_int_t kmax = 0; for( int i = 1; i < argc; i++ ) { if ( strcmp("-N", argv[i]) == 0 && i+1 < argc ) { magma_assert( ntest < MAXTESTS, "error: -N repeated more than maximum %d tests\n", MAXTESTS ); info = sscanf( argv[++i], "%d,%d,%d", &m, &n, &k ); if ( info == 3 && m > 0 && n > 0 && k > 0 ) { msize[ ntest ] = m; nsize[ ntest ] = n; ksize[ ntest ] = k; } else if ( info == 2 && m > 0 && n > 0 ) { msize[ ntest ] = m; nsize[ ntest ] = n; ksize[ ntest ] = n; // implicitly } else if ( info == 1 && m > 0 ) { msize[ ntest ] = m; nsize[ ntest ] = m; // implicitly ksize[ ntest ] = m; // implicitly } else { printf( "error: -N %s is invalid; ensure m > 0, n > 0, k > 0.\n", argv[i] ); exit(1); } mmax = max( mmax, msize[ntest] ); nmax = max( nmax, nsize[ntest] ); kmax = max( kmax, ksize[ntest] ); ntest++; } else if ( strcmp("-c", argv[i]) == 0 ) { checkres = true; } else { printf( "invalid argument: %s\n", argv[i] ); exit(1); } } if ( ntest == 0 ) { ntest = MAXTESTS; nmax = nsize[ntest-1]; mmax = msize[ntest-1]; kmax = ksize[ntest-1]; } m = mmax; n = nmax; k = kmax; assert( n > 0 && m > 0 && k > 0 ); magma_int_t nb = magma_get_cgeqrf_nb( m ); magma_int_t ldc = m; magma_int_t lda = max(m,n); ldc = ((ldc+31)/32)*32; lda = ((lda+31)/32)*32; // Allocate memory for matrices cuFloatComplex *C, *R, *A, *W, *tau; magma_int_t lwork = max( m*nb, n*nb ); magma_int_t lwork_max = lwork; TESTING_MALLOC( C, cuFloatComplex, ldc*n ); TESTING_MALLOC( R, cuFloatComplex, ldc*n ); TESTING_MALLOC( A, cuFloatComplex, lda*k ); TESTING_MALLOC( W, cuFloatComplex, lwork_max ); TESTING_MALLOC( tau, cuFloatComplex, k ); // test all combinations of input parameters const char* side[] = { MagmaLeftStr, MagmaRightStr }; const char* trans[] = { MagmaConjTransStr, MagmaNoTransStr }; printf(" M N K side trans CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||QC||_F\n"); printf("===============================================================================================\n"); for( int i = 0; i < ntest; ++i ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { m = msize[i]; n = nsize[i]; k = ksize[i]; if ( *side[iside] == 'L' && m < k ) { printf( "%5d %5d %5d %-5s %-9s skipping because side=left and m < k\n", (int) m, (int) n, (int) k, side[iside], trans[itran] ); continue; } if ( *side[iside] == 'R' && n < k ) { printf( "%5d %5d %5d %-5s %-9s skipping because side=right and n < k\n", (int) m, (int) n, (int) k, side[iside], trans[itran] ); continue; } gflops = FLOPS_CUNMQR( m, n, k, *side[iside] ) / 1e9; // C is full, m x n size = ldc*n; lapackf77_clarnv( &ione, iseed, &size, C ); lapackf77_clacpy( "Full", &m, &n, C, &ldc, R, &ldc ); //magma_csetmatrix( m, n, C, ldc, dC, ldc ); // A is m x k (left) or n x k (right) lda = (*side[iside] == 'L' ? m : n); size = lda*k; lapackf77_clarnv( &ione, iseed, &size, A ); // compute QR factorization to get Householder vectors in A, tau magma_cgeqrf( lda, k, A, lda, tau, W, lwork_max, &info ); if ( info != 0 ) printf("magma_cgeqrf returned error %d\n", info); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cunmqr( side[iside], trans[itran], &m, &n, &k, A, &lda, tau, C, &ldc, W, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cunmqr returned error %d.\n", (int) info); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for work size lwork = -1; magma_cunmqr( *side[iside], *trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); if (info != 0) printf("magma_cunmqr returned error %d (lwork query).\n", (int) info); lwork = (magma_int_t) MAGMA_C_REAL( W[0] ); if ( lwork < 0 || lwork > lwork_max ) printf("invalid lwork %d, lwork_max %d\n", lwork, lwork_max ); gpu_time = magma_wtime(); magma_cunmqr( *side[iside], *trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cunmqr returned error %d.\n", (int) info); //magma_cgetmatrix( m, n, dC, ldc, R, ldc ); /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ error = lapackf77_clange( "Fro", &m, &n, C, &ldc, work ); size = ldc*n; blasf77_caxpy( &size, &c_neg_one, C, &ione, R, &ione ); error = lapackf77_clange( "Fro", &m, &n, R, &ldc, work ) / error; printf( "%5d %5d %5d %-5s %-9s %7.2f (%7.2f) %7.2f (%7.2f) %8.2e\n", (int) m, (int) n, (int) k, side[iside], trans[itran], cpu_perf, cpu_time, gpu_perf, gpu_time, error ); }} // end iside, itran printf( "\n" ); } // end i // Memory clean up TESTING_FREE( C ); TESTING_FREE( R ); TESTING_FREE( A ); TESTING_FREE( W ); TESTING_FREE( tau ); // Shutdown TESTING_CUDA_FINALIZE(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cunmqr */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t m, n, k, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t nb, ldc, lda, lwork, lwork_max; magmaFloatComplex *C, *R, *A, *W, *tau; magma_opts opts; parse_opts( argc, argv, &opts ); // test all combinations of input parameters const char* side[] = { MagmaLeftStr, MagmaRightStr }; const char* trans[] = { MagmaConjTransStr, MagmaNoTransStr }; printf(" M N K side trans CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||QC||_F\n"); printf("===============================================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { m = opts.msize[i]; n = opts.nsize[i]; k = opts.ksize[i]; nb = magma_get_cgeqrf_nb( m ); ldc = ((m + 31)/32)*32; lda = ((max(m,n) + 31)/32)*32; gflops = FLOPS_CUNMQR( m, n, k, *side[iside] ) / 1e9; if ( *side[iside] == 'L' && m < k ) { printf( "%5d %5d %5d %-5s %-9s skipping because side=left and m < k\n", (int) m, (int) n, (int) k, side[iside], trans[itran] ); continue; } if ( *side[iside] == 'R' && n < k ) { printf( "%5d %5d %5d %-5s %-9s skipping because side=right and n < k\n", (int) m, (int) n, (int) k, side[iside], trans[itran] ); continue; } lwork_max = max( m*nb, n*nb ); TESTING_MALLOC( C, magmaFloatComplex, ldc*n ); TESTING_MALLOC( R, magmaFloatComplex, ldc*n ); TESTING_MALLOC( A, magmaFloatComplex, lda*k ); TESTING_MALLOC( W, magmaFloatComplex, lwork_max ); TESTING_MALLOC( tau, magmaFloatComplex, k ); // C is full, m x n size = ldc*n; lapackf77_clarnv( &ione, ISEED, &size, C ); lapackf77_clacpy( "Full", &m, &n, C, &ldc, R, &ldc ); //magma_csetmatrix( m, n, C, ldc, dC, ldc ); // A is m x k (left) or n x k (right) lda = (*side[iside] == 'L' ? m : n); size = lda*k; lapackf77_clarnv( &ione, ISEED, &size, A ); // compute QR factorization to get Householder vectors in A, tau magma_cgeqrf( lda, k, A, lda, tau, W, lwork_max, &info ); if (info != 0) printf("magma_cgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cunmqr( side[iside], trans[itran], &m, &n, &k, A, &lda, tau, C, &ldc, W, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cunmqr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for work size lwork = -1; magma_cunmqr( *side[iside], *trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); if (info != 0) printf("magma_cunmqr (lwork query) returned error %d: %s.\n", (int) info, magma_strerror( info )); lwork = (magma_int_t) MAGMA_C_REAL( W[0] ); if ( lwork < 0 || lwork > lwork_max ) printf("invalid lwork %d, lwork_max %d\n", (int) lwork, (int) lwork_max ); gpu_time = magma_wtime(); magma_cunmqr( *side[iside], *trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cunmqr returned error %d: %s.\n", (int) info, magma_strerror( info )); //magma_cgetmatrix( m, n, dC, ldc, R, ldc ); /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ error = lapackf77_clange( "Fro", &m, &n, C, &ldc, work ); size = ldc*n; blasf77_caxpy( &size, &c_neg_one, C, &ione, R, &ione ); error = lapackf77_clange( "Fro", &m, &n, R, &ldc, work ) / error; printf( "%5d %5d %5d %-5s %-9s %7.2f (%7.2f) %7.2f (%7.2f) %8.2e\n", (int) m, (int) n, (int) k, side[iside], trans[itran], cpu_perf, cpu_time, gpu_perf, gpu_time, error ); TESTING_FREE( C ); TESTING_FREE( R ); TESTING_FREE( A ); TESTING_FREE( W ); TESTING_FREE( tau ); }} // end iside, itran printf( "\n" ); } TESTING_FINALIZE(); return 0; }
/** Purpose ------- If VECT = MagmaQ, CUNMBR overwrites the general complex M-by-N matrix C with SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q*C C*Q TRANS = Magma_ConjTrans: Q**H*C C*Q**H If VECT = MagmaP, CUNMBR overwrites the general complex M-by-N matrix C with SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: P*C C*P TRANS = Magma_ConjTrans: P**H*C C*P**H Here Q and P**H are the unitary matrices determined by CGEBRD when reducing A complex matrix A to bidiagonal form: A = Q*B * P**H. Q and P**H are defined as products of elementary reflectors H(i) and G(i) respectively. Let nq = m if SIDE = MagmaLeft and nq = n if SIDE = MagmaRight. Thus nq is the order of the unitary matrix Q or P**H that is applied. If VECT = MagmaQ, A is assumed to have been an NQ-by-K matrix: if nq >= k, Q = H(1) H(2) . . . H(k); if nq < k, Q = H(1) H(2) . . . H(nq-1). If VECT = MagmaP, A is assumed to have been A K-by-NQ matrix: if k < nq, P = G(1) G(2) . . . G(k); if k >= nq, P = G(1) G(2) . . . G(nq-1). Arguments --------- @param[in] vect magma_vect_t - = MagmaQ: apply Q or Q**H; - = MagmaP: apply P or P**H. @param[in] side magma_side_t - = MagmaLeft: apply Q, Q**H, P or P**H from the Left; - = MagmaRight: apply Q, Q**H, P or P**H from the Right. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q or P; - = Magma_ConjTrans: Conjugate transpose, apply Q**H or P**H. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] k INTEGER If VECT = MagmaQ, the number of columns in the original matrix reduced by CGEBRD. If VECT = MagmaP, the number of rows in the original matrix reduced by CGEBRD. K >= 0. @param[in] A COMPLEX array, dimension (LDA,min(nq,K)) if VECT = MagmaQ (LDA,nq) if VECT = MagmaP The vectors which define the elementary reflectors H(i) and G(i), whose products determine the matrices Q and P, as returned by CGEBRD. @param[in] lda INTEGER The leading dimension of the array A. If VECT = MagmaQ, LDA >= max(1,nq); if VECT = MagmaP, LDA >= max(1,min(nq,K)). @param[in] tau COMPLEX array, dimension (min(nq,K)) TAU(i) must contain the scalar factor of the elementary reflector H(i) or G(i) which determines Q or P, as returned by CGEBRD in the array argument TAUQ or TAUP. @param[in,out] C COMPLEX array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q or P*C or P**H*C or C*P or C*P**H. @param[in] ldc INTEGER The leading dimension of the array C. LDC >= max(1,M). @param[out] work (workspace) COMPLEX array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. If SIDE = MagmaLeft, LWORK >= max(1,N); if SIDE = MagmaRight, LWORK >= max(1,M); if N = 0 or M = 0, LWORK >= 1. For optimum performance if SIDE = MagmaLeft, LWORK >= max(1,N*NB); if SIDE = MagmaRight, LWORK >= max(1,M*NB), where NB is the optimal blocksize. (NB = 0 if M = 0 or N = 0.) \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_cgesvd_comp ********************************************************************/ extern "C" magma_int_t magma_cunmbr( magma_vect_t vect, magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex *C, magma_int_t ldc, magmaFloatComplex *work, magma_int_t lwork, magma_int_t *info) { #define A(i,j) (A + (i) + (j)*lda) #define C(i,j) (C + (i) + (j)*ldc) magma_int_t i1, i2, nb, mi, ni, nq, nq_1, nw, iinfo, lwkopt; magma_int_t left, notran, applyq, lquery; magma_trans_t transt; *info = 0; applyq = (vect == MagmaQ); left = (side == MagmaLeft); notran = (trans == MagmaNoTrans); lquery = (lwork == -1); /* NQ is the order of Q or P and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if (m == 0 || n == 0) { nw = 0; } /* check arguments */ if (! applyq && vect != MagmaP) { *info = -1; } else if (! left && side != MagmaRight) { *info = -2; } else if (! notran && trans != Magma_ConjTrans) { *info = -3; } else if (m < 0) { *info = -4; } else if (n < 0) { *info = -5; } else if (k < 0) { *info = -6; } else if ( ( applyq && lda < max(1,nq) ) || ( ! applyq && lda < max(1,min(nq,k)) ) ) { *info = -8; } else if (ldc < max(1,m)) { *info = -11; } else if (lwork < max(1,nw) && ! lquery) { *info = -13; } if (*info == 0) { if (nw > 0) { // TODO have get_cunmqr_nb and get_cunmlq_nb routines? see original LAPACK cunmbr. // TODO make them dependent on m, n, and k? nb = magma_get_cgebrd_nb( min( m, n )); lwkopt = max(1, nw*nb); } else { lwkopt = 1; } work[0] = MAGMA_C_MAKE( lwkopt, 0 ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0) { return *info; } if (applyq) { /* Apply Q */ if (nq >= k) { /* Q was determined by a call to CGEBRD with nq >= k */ #if VERSION == 1 lapackf77_cunmqr( lapack_side_const(side), lapack_trans_const(trans), &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, &iinfo); #else magma_cunmqr( side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, &iinfo); #endif } else if (nq > 1) { /* Q was determined by a call to CGEBRD with nq < k */ if (left) { mi = m - 1; ni = n; i1 = 1; i2 = 0; } else { mi = m; ni = n - 1; i1 = 0; i2 = 1; } nq_1 = nq - 1; #if VERSION == 1 lapackf77_cunmqr( lapack_side_const(side), lapack_trans_const(trans), &mi, &ni, &nq_1, A(1,0), &lda, tau, C(i1,i2), &ldc, work, &lwork, &iinfo); #else magma_cunmqr( side, trans, mi, ni, nq-1, A(1,0), lda, tau, C(i1,i2), ldc, work, lwork, &iinfo); #endif } } else { /* Apply P */ if (notran) { transt = Magma_ConjTrans; } else { transt = MagmaNoTrans; } if (nq > k) { /* P was determined by a call to CGEBRD with nq > k */ #if VERSION == 1 lapackf77_cunmlq( lapack_side_const(side), lapack_trans_const(transt), &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, &iinfo); #else magma_cunmlq( side, transt, m, n, k, A, lda, tau, C, ldc, work, lwork, &iinfo); #endif } else if (nq > 1) { /* P was determined by a call to CGEBRD with nq <= k */ if (left) { mi = m - 1; ni = n; i1 = 1; i2 = 0; } else { mi = m; ni = n - 1; i1 = 0; i2 = 1; } nq_1 = nq - 1; #if VERSION == 1 lapackf77_cunmlq( lapack_side_const(side), lapack_trans_const(transt), &mi, &ni, &nq_1, A(0,1), &lda, tau, C(i1,i2), &ldc, work, &lwork, &iinfo); #else magma_cunmlq( side, transt, mi, ni, nq-1, A(0,1), lda, tau, C(i1,i2), ldc, work, lwork, &iinfo); #endif } } work[0] = MAGMA_C_MAKE( lwkopt, 0 ); return *info; } /* magma_cunmbr */
/* //////////////////////////////////////////////////////////////////////////// -- Testing cunmqr */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t mm, m, n, k, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t nb, ldc, lda, lwork, lwork_max; magmaFloatComplex *C, *R, *A, *W, *tau; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); // need slightly looser bound (60*eps instead of 30*eps) for some tests opts.tolerance = max( 60., opts.tolerance ); float tol = opts.tolerance * lapackf77_slamch("E"); // test all combinations of input parameters magma_side_t side [] = { MagmaLeft, MagmaRight }; magma_trans_t trans[] = { MagmaConjTrans, MagmaNoTrans }; printf(" M N K side trans CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||QC||_F\n"); printf("===============================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; nb = magma_get_cgeqrf_nb( m ); ldc = m; // A is m x k (left) or n x k (right) mm = (side[iside] == MagmaLeft ? m : n); lda = mm; gflops = FLOPS_CUNMQR( m, n, k, side[iside] ) / 1e9; if ( side[iside] == MagmaLeft && m < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=left and m < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } if ( side[iside] == MagmaRight && n < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=right and n < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } // need at least 2*nb*nb for geqrf lwork_max = max( max( m*nb, n*nb ), 2*nb*nb ); TESTING_MALLOC_CPU( C, magmaFloatComplex, ldc*n ); TESTING_MALLOC_CPU( R, magmaFloatComplex, ldc*n ); TESTING_MALLOC_CPU( A, magmaFloatComplex, lda*k ); TESTING_MALLOC_CPU( W, magmaFloatComplex, lwork_max ); TESTING_MALLOC_CPU( tau, magmaFloatComplex, k ); // C is full, m x n size = ldc*n; lapackf77_clarnv( &ione, ISEED, &size, C ); lapackf77_clacpy( "Full", &m, &n, C, &ldc, R, &ldc ); size = lda*k; lapackf77_clarnv( &ione, ISEED, &size, A ); // compute QR factorization to get Householder vectors in A, tau magma_cgeqrf( mm, k, A, lda, tau, W, lwork_max, &info ); if (info != 0) printf("magma_cgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cunmqr( lapack_side_const( side[iside] ), lapack_trans_const( trans[itran] ), &m, &n, &k, A, &lda, tau, C, &ldc, W, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cunmqr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for workspace size lwork = -1; magma_cunmqr( side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); if (info != 0) printf("magma_cunmqr (lwork query) returned error %d: %s.\n", (int) info, magma_strerror( info )); lwork = (magma_int_t) MAGMA_C_REAL( W[0] ); if ( lwork < 0 || lwork > lwork_max ) { printf("optimal lwork %d > lwork_max %d\n", (int) lwork, (int) lwork_max ); lwork = lwork_max; } gpu_time = magma_wtime(); magma_cunmqr( side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cunmqr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ error = lapackf77_clange( "Fro", &m, &n, C, &ldc, work ); size = ldc*n; blasf77_caxpy( &size, &c_neg_one, C, &ione, R, &ione ); error = lapackf77_clange( "Fro", &m, &n, R, &ldc, work ) / error; printf( "%5d %5d %5d %4c %5c %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ), cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); TESTING_FREE_CPU( C ); TESTING_FREE_CPU( R ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( W ); TESTING_FREE_CPU( tau ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } }} // end iside, itran printf( "\n" ); } TESTING_FINALIZE(); return status; }