/** Purpose ------- CUNGHR generates a COMPLEX unitary matrix Q which is defined as the product of IHI-ILO elementary reflectors of order N, as returned by CGEHRD: Q = H(ilo) H(ilo+1) . . . H(ihi-1). Arguments --------- @param[in] n INTEGER The order of the matrix Q. N >= 0. @param[in] ilo INTEGER @param[in] ihi INTEGER ILO and IHI must have the same values as in the previous call of CGEHRD. Q is equal to the unit matrix except in the submatrix Q(ilo+1:ihi,ilo+1:ihi). 1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the vectors which define the elementary reflectors, as returned by CGEHRD. On exit, the N-by-N unitary matrix Q. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[in] tau COMPLEX array, dimension (N-1) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by CGEHRD. @param[in] T COMPLEX array on the GPU device. T contains the T matrices used in blocking the elementary reflectors H(i), e.g., this can be the 9th argument of magma_cgehrd. @param[in] nb INTEGER This is the block size used in CGEHRD, and correspondingly the size of the T matrices, used in the factorization, and stored in T. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_cgeev_comp ********************************************************************/ extern "C" magma_int_t magma_cunghr_m( magma_int_t n, magma_int_t ilo, magma_int_t ihi, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex *T, magma_int_t nb, magma_int_t *info) { #define A(i,j) (A + (i) + (j)*lda) magma_int_t i, j, nh, iinfo; *info = 0; nh = ihi - ilo; if (n < 0) *info = -1; else if (ilo < 1 || ilo > max(1,n)) *info = -2; else if (ihi < min(ilo,n) || ihi > n) *info = -3; else if (lda < max(1,n)) *info = -5; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (n == 0) return *info; /* Shift the vectors which define the elementary reflectors one column to the right, and set the first ilo and the last n-ihi rows and columns to those of the unit matrix */ for (j = ihi-1; j >= ilo; --j) { for (i = 0; i < j; ++i) *A(i, j) = MAGMA_C_ZERO; for (i = j+1; i < ihi; ++i) *A(i, j) = *A(i, j - 1); for (i = ihi; i < n; ++i) *A(i, j) = MAGMA_C_ZERO; } for (j = 0; j < ilo; ++j) { for (i = 0; i < n; ++i) *A(i, j) = MAGMA_C_ZERO; *A(j, j) = MAGMA_C_ONE; } for (j = ihi; j < n; ++j) { for (i = 0; i < n; ++i) *A(i, j) = MAGMA_C_ZERO; *A(j, j) = MAGMA_C_ONE; } if (nh > 0) { /* Generate Q(ilo+1:ihi,ilo+1:ihi) */ magma_cungqr_m( nh, nh, nh, A(ilo, ilo), lda, tau+ilo-1, T, nb, &iinfo ); } return *info; } /* magma_cunghr */
/* //////////////////////////////////////////////////////////////////////////// -- Testing cungqr */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *hA, *hR, *hT, *tau, *h_work; magmaFloatComplex *dA, *dT; magma_int_t m, n, k; magma_int_t n2, lda, ldda, lwork, min_mn, nb, info; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) float tol = opts.tolerance * lapackf77_slamch("E"); printf(" m n k CPU GFlop/s (sec) GPU GFlop/s (sec) ||R|| / ||A||\n"); printf("=========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; if ( m < n || n < k ) { printf( "%5d %5d %5d skipping because m < n or n < k\n", (int) m, (int) n, (int) k ); continue; } lda = m; ldda = ((m + 31)/32)*32; n2 = lda*n; min_mn = min(m, n); nb = magma_get_cgeqrf_nb( m ); lwork = (m + 2*n+nb)*nb; gflops = FLOPS_CUNGQR( m, n, k ) / 1e9; TESTING_MALLOC_PIN( hA, magmaFloatComplex, lda*n ); TESTING_MALLOC_PIN( h_work, magmaFloatComplex, lwork ); TESTING_MALLOC_CPU( hR, magmaFloatComplex, lda*n ); TESTING_MALLOC_CPU( hT, magmaFloatComplex, min_mn*nb ); TESTING_MALLOC_CPU( tau, magmaFloatComplex, min_mn ); TESTING_MALLOC_DEV( dA, magmaFloatComplex, ldda*n ); TESTING_MALLOC_DEV( dT, magmaFloatComplex, ( 2*min_mn + ((n + 31)/32)*32 )*nb ); lapackf77_clarnv( &ione, ISEED, &n2, hA ); lapackf77_clacpy( MagmaUpperLowerStr, &m, &n, hA, &lda, hR, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // first, get QR factors magma_csetmatrix( m, n, hA, lda, dA, ldda ); magma_cgeqrf_gpu( m, n, dA, ldda, tau, dT, &info ); if ( info != 0 ) printf("magma_cgeqrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_cgetmatrix( m, n, dA, ldda, hR, lda ); magma_cgetmatrix( nb, min_mn, dT, nb, hT, nb ); gpu_time = magma_wtime(); magma_cungqr_m( m, n, k, hR, lda, tau, hT, nb, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if ( info != 0 ) printf("magma_cungqr_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { error = lapackf77_clange("f", &m, &n, hA, &lda, work ); lapackf77_cgeqrf( &m, &n, hA, &lda, tau, h_work, &lwork, &info ); if ( info != 0 ) printf("lapackf77_cgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); cpu_time = magma_wtime(); lapackf77_cungqr( &m, &n, &k, hA, &lda, tau, h_work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if ( info != 0 ) printf("lapackf77_cungqr returned error %d: %s.\n", (int) info, magma_strerror( info )); // compute relative error |R|/|A| := |Q_magma - Q_lapack|/|A| blasf77_caxpy( &n2, &c_neg_one, hA, &ione, hR, &ione ); error = lapackf77_clange("f", &m, &n, hR, &lda, work) / error; printf("%5d %5d %5d %7.1f (%7.2f) %7.1f (%7.2f) %8.2e %s\n", (int) m, (int) n, (int) k, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); } else { printf("%5d %5d %5d --- ( --- ) %7.1f (%7.2f) --- \n", (int) m, (int) n, (int) k, gpu_perf, gpu_time ); } TESTING_FREE_PIN( hA ); TESTING_FREE_PIN( h_work ); TESTING_FREE_CPU( hR ); TESTING_FREE_CPU( hT ); TESTING_FREE_CPU( tau ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dT ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }