/* //////////////////////////////////////////////////////////////////////////// -- Testing zgeqlf */ int main( int argc, char** argv) { TESTING_INIT(); const double d_neg_one = MAGMA_D_NEG_ONE; const double d_one = MAGMA_D_ONE; const magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; const magmaDoubleComplex c_one = MAGMA_Z_ONE; const magmaDoubleComplex c_zero = MAGMA_Z_ZERO; const magma_int_t ione = 1; real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; double Anorm, error=0, error2=0; magmaDoubleComplex *h_A, *h_R, *tau, *h_work, tmp[1]; magma_int_t M, N, n2, lda, lwork, info, min_mn, nb; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |L - Q^H*A| |I - Q^H*Q|\n"); printf("===============================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; nb = magma_get_zgeqlf_nb(M); gflops = FLOPS_ZGEQLF( M, N ) / 1e9; // query for workspace size lwork = -1; lapackf77_zgeqlf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_Z_REAL( tmp[0] ); lwork = max( lwork, N*nb ); lwork = max( lwork, 2*nb*nb); TESTING_MALLOC_CPU( tau, magmaDoubleComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_work, magmaDoubleComplex, lwork ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_zgeqlf( M, N, h_R, lda, tau, h_work, lwork, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zgeqlf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result, following zqlt01 except using the reduced Q. This works for any M,N (square, tall, wide). =================================================================== */ if ( opts.check ) { magma_int_t ldq = M; magma_int_t ldl = min_mn; magmaDoubleComplex *Q, *L; double *work; TESTING_MALLOC_CPU( Q, magmaDoubleComplex, ldq*min_mn ); // M by K TESTING_MALLOC_CPU( L, magmaDoubleComplex, ldl*N ); // K by N TESTING_MALLOC_CPU( work, double, min_mn ); // copy M by K matrix V to Q (copying diagonal, which isn't needed) and // copy K by N matrix L lapackf77_zlaset( "Full", &min_mn, &N, &c_zero, &c_zero, L, &ldl ); if ( M >= N ) { // for M=5, N=3: A = [ V V V ] <= V full block (M-N by K) // K=N [ V V V ] // [ ----- ] // [ L V V ] <= V triangle (N by K, copying diagonal too) // [ L L V ] <= L triangle (K by N) // [ L L L ] magma_int_t M_N = M - N; lapackf77_zlacpy( "Full", &M_N, &min_mn, h_R, &lda, Q, &ldq ); lapackf77_zlacpy( "Upper", &N, &min_mn, &h_R[M_N], &lda, &Q[M_N], &ldq ); lapackf77_zlacpy( "Lower", &min_mn, &N, &h_R[M_N], &lda, L, &ldl ); } else { // for M=3, N=5: A = [ L L | L V V ] <= V triangle (K by K) // K=M [ L L | L L V ] <= L triangle (K by M) // [ L L | L L L ] // ^^^============= L full block (K by N-M) magma_int_t N_M = N - M; lapackf77_zlacpy( "Upper", &M, &min_mn, &h_R[N_M*lda], &lda, Q, &ldq ); lapackf77_zlacpy( "Full", &min_mn, &N_M, h_R, &lda, L, &ldl ); lapackf77_zlacpy( "Lower", &min_mn, &M, &h_R[N_M*lda], &lda, &L[N_M*ldl], &ldl ); } // generate M by K matrix Q, where K = min(M,N) lapackf77_zungql( &M, &min_mn, &min_mn, Q, &ldq, tau, h_work, &lwork, &info ); assert( info == 0 ); // error = || L - Q^H*A || / (N * ||A||) blasf77_zgemm( "Conj", "NoTrans", &min_mn, &N, &M, &c_neg_one, Q, &ldq, h_A, &lda, &c_one, L, &ldl ); Anorm = lapackf77_zlange( "1", &M, &N, h_A, &lda, work ); error = lapackf77_zlange( "1", &min_mn, &N, L, &ldl, work ); if ( N > 0 && Anorm > 0 ) error /= (N*Anorm); // set L = I (K by K identity), then L = I - Q^H*Q // error = || I - Q^H*Q || / N lapackf77_zlaset( "Upper", &min_mn, &min_mn, &c_zero, &c_one, L, &ldl ); blasf77_zherk( "Upper", "Conj", &min_mn, &M, &d_neg_one, Q, &ldq, &d_one, L, &ldl ); error2 = lapackf77_zlanhe( "1", "Upper", &min_mn, L, &ldl, work ); if ( N > 0 ) error2 /= N; TESTING_FREE_CPU( Q ); Q = NULL; TESTING_FREE_CPU( L ); L = NULL; TESTING_FREE_CPU( work ); work = NULL; } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_zgeqlf( &M, &N, h_A, &lda, tau, h_work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapack_zgeqlf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Print performance and error. =================================================================== */ printf("%5d %5d ", (int) M, (int) N ); if ( opts.lapack ) { printf( "%7.2f (%7.2f)", cpu_perf, cpu_time ); } else { printf(" --- ( --- )" ); } printf( " %7.2f (%7.2f) ", gpu_perf, gpu_time ); if ( opts.check ) { bool okay = (error < tol && error2 < tol); status += ! okay; printf( "%11.2e %11.2e %s\n", error, error2, (okay ? "ok" : "failed") ); } else { printf( " ---\n" ); } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgeqlf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_R, *tau, *h_work, tmp[1]; magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn, nb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = 2. * opts.tolerance * lapackf77_dlamch("E"); printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("=======================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; nb = magma_get_zgeqrf_nb(M); gflops = FLOPS_ZGEQLF( M, N ) / 1e9; // query for workspace size lwork = -1; lapackf77_zgeqlf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_Z_REAL( tmp[0] ); lwork = max( lwork, N*nb ); lwork = max( lwork, 2*nb*nb); TESTING_MALLOC_CPU( tau, magmaDoubleComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_work, magmaDoubleComplex, lwork ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_zgeqlf( M, N, h_R, lda, tau, h_work, lwork, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zgeqlf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_zgeqlf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapack_zgeqlf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = lapackf77_zlange("f", &M, &N, h_A, &lda, work); blasf77_zaxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_zlange("f", &M, &N, h_R, &lda, work) / error; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); status += ! (error < tol); TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_zgeqlf(magma_int_t m, magma_int_t n, magmaDoubleComplex *a, magma_int_t lda, magmaDoubleComplex *tau, magmaDoubleComplex *work, magma_int_t lwork, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= SGEQLF computes a QL factorization of a COMPLEX_16 M-by-N matrix A: A = Q * L. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX_16 array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, if m >= n, the lower triangle of the subarray A(m-n+1:m,1:n) contains the N-by-N lower triangular matrix L; if m <= n, the elements on and below the (n-m)-th superdiagonal contain the M-by-N lower trapezoidal matrix L; the remaining elements, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors (see Further Details). Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). TAU (output) COMPLEX_16 array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= max(1,N). For optimum performance LWORK >= N*NB, where NB can be obtained through magma_get_zgeqlf_nb(M). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(k) . . . H(2) H(1), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(m-k+i+1:m) = 0 and v(m-k+i) = 1; v(1:m-k+i-1) is stored on exit in A(1:m-k+i-1,n-k+i), and tau in TAU(i). ===================================================================== */ #define a_ref(a_1,a_2) ( a+(a_2)*(lda) + (a_1)) #define da_ref(a_1,a_2) (da+(a_2)*ldda + (a_1)) magmaDoubleComplex *da, *dwork; magmaDoubleComplex c_one = MAGMA_Z_ONE; magma_int_t i, k, lddwork, old_i, old_ib, nb; magma_int_t rows, cols; magma_int_t ib, ki, kk, mu, nu, iinfo, ldda; int lquery; nb = magma_get_zgeqlf_nb(m); *info = 0; lquery = (lwork == -1); // silence "uninitialized" warnings old_ib = nb; old_i = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } if (*info == 0) { k = min(m,n); if (k == 0) work[0] = c_one; else { work[0] = MAGMA_Z_MAKE( n*nb, 0 ); } if (lwork < max(1,n) && ! lquery) *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; /* Quick return if possible */ if (k == 0) return *info; lddwork = ((n+31)/32)*32; ldda = ((m+31)/32)*32; if (MAGMA_SUCCESS != magma_zmalloc( &da, (n)*ldda + nb*lddwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dwork = da + ldda*(n); magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); if ( (nb > 1) && (nb < k) ) { /* Use blocked code initially. The last kk columns are handled by the block method. First, copy the matrix on the GPU except the last kk columns */ magma_zsetmatrix_async( (m), (n-nb), a_ref(0, 0), lda, da_ref(0, 0), ldda, stream[0] ); ki = ((k - nb - 1) / nb) * nb; kk = min(k, ki + nb); for (i = k - kk + ki; i >= k -kk; i -= nb) { ib = min(k-i,nb); if (i < k - kk + ki) { /* 1. Copy asynchronously the current panel to the CPU. 2. Copy asynchronously the submatrix below the panel to the CPU) */ rows = m - k + i + ib; magma_zgetmatrix_async( rows, ib, da_ref(0, n-k+i), ldda, a_ref(0, n-k+i), lda, stream[1] ); magma_zgetmatrix_async( (m-rows), ib, da_ref(rows, n-k+i), ldda, a_ref(rows, n-k+i), lda, stream[0] ); /* Apply H' to A(1:m-k+i+ib-1,1:n-k+i-1) from the left in two steps - implementing the lookahead techniques. This is the main update from the lookahead techniques. */ rows = m - k + old_i + old_ib; cols = n - k + old_i - old_ib; magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaBackward, MagmaColumnwise, rows, cols, old_ib, da_ref(0, cols+old_ib), ldda, dwork, lddwork, da_ref(0, 0 ), ldda, dwork+old_ib, lddwork); } magma_queue_sync( stream[1] ); /* Compute the QL factorization of the current block A(1:m-k+i+ib-1,n-k+i:n-k+i+ib-1) */ rows = m - k + i + ib; cols = n - k + i; lapackf77_zgeqlf(&rows,&ib, a_ref(0,cols), &lda, tau+i, work, &lwork, &iinfo); if (cols > 0) { /* Form the triangular factor of the block reflector H = H(i+ib-1) . . . H(i+1) H(i) */ lapackf77_zlarft( MagmaBackwardStr, MagmaColumnwiseStr, &rows, &ib, a_ref(0, cols), &lda, tau + i, work, &ib); zpanel_to_q( MagmaLower, ib, a_ref(rows-ib,cols), lda, work+ib*ib); magma_zsetmatrix( rows, ib, a_ref(0,cols), lda, da_ref(0,cols), ldda ); zq_to_panel( MagmaLower, ib, a_ref(rows-ib,cols), lda, work+ib*ib); // Send the triangular part on the GPU magma_zsetmatrix( ib, ib, work, ib, dwork, lddwork ); /* Apply H' to A(1:m-k+i+ib-1,1:n-k+i-1) from the left in two steps - implementing the lookahead techniques. This is the update of first ib columns. */ if (i-ib >= k -kk) magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaBackward, MagmaColumnwise, rows, ib, ib, da_ref(0, cols), ldda, dwork, lddwork, da_ref(0,cols-ib), ldda, dwork+ib, lddwork); else{ magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaBackward, MagmaColumnwise, rows, cols, ib, da_ref(0, cols), ldda, dwork, lddwork, da_ref(0, 0 ), ldda, dwork+ib, lddwork); } old_i = i; old_ib = ib; } } mu = m - k + i + nb; nu = n - k + i + nb; magma_zgetmatrix( m, nu, da_ref(0,0), ldda, a_ref(0,0), lda ); } else { mu = m; nu = n; } /* Use unblocked code to factor the last or only block */ if (mu > 0 && nu > 0) lapackf77_zgeqlf(&mu, &nu, a_ref(0,0), &lda, tau, work, &lwork, &iinfo); magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free( da ); return *info; } /* magma_zgeqlf */