/* //////////////////////////////////////////////////////////////////////////// -- Testing sgeqrf */ int main( int argc, char** argv) { TESTING_INIT(); const float d_neg_one = MAGMA_D_NEG_ONE; const float d_one = MAGMA_D_ONE; const float c_neg_one = MAGMA_S_NEG_ONE; const float c_one = MAGMA_S_ONE; const float c_zero = MAGMA_S_ZERO; const magma_int_t ione = 1; real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float Anorm, error=0, error2=0; float *h_A, *h_R, *tau, *h_work, tmp[1]; magmaFloat_ptr d_A, dT; magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn, nb, size; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; opts.parse_opts( argc, argv ); magma_int_t status = 0; float tol = opts.tolerance * lapackf77_slamch("E"); // version 3 can do either check if (opts.check == 1 && opts.version == 1) { opts.check = 2; printf( "%% version 1 requires check 2 (solve A*x=b)\n" ); } if (opts.check == 2 && opts.version == 2) { opts.check = 1; printf( "%% version 2 requires check 1 (R - Q^H*A)\n" ); } printf( "%% version %d\n", (int) opts.version ); if ( opts.check == 1 ) { printf("%% M N CPU Gflop/s (sec) GPU Gflop/s (sec) |R - Q^H*A| |I - Q^H*Q|\n"); printf("%%==============================================================================\n"); } else { printf("%% M N CPU Gflop/s (sec) GPU Gflop/s (sec) |b - A*x|\n"); printf("%%===============================================================\n"); } for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min( M, N ); lda = M; n2 = lda*N; ldda = magma_roundup( M, opts.align ); // multiple of 32 by default nb = magma_get_sgeqrf_nb( M, N ); gflops = FLOPS_SGEQRF( M, N ) / 1e9; // query for workspace size lwork = -1; lapackf77_sgeqrf( &M, &N, NULL, &M, NULL, tmp, &lwork, &info ); lwork = (magma_int_t)MAGMA_S_REAL( tmp[0] ); TESTING_MALLOC_CPU( tau, float, min_mn ); TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_CPU( h_work, float, lwork ); TESTING_MALLOC_PIN( h_R, float, n2 ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); if ( opts.version == 1 || opts.version == 3 ) { size = (2*min(M, N) + magma_roundup( N, 32 ) )*nb; TESTING_MALLOC_DEV( dT, float, size ); magmablas_slaset( MagmaFull, size, 1, c_zero, c_zero, dT, size ); } /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slacpy( MagmaFullStr, &M, &N, h_A, &lda, h_R, &lda ); magma_ssetmatrix( M, N, h_R, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ nb = magma_get_sgeqrf_nb( M, N ); gpu_time = magma_wtime(); if ( opts.version == 1 ) { // stores dT, V blocks have zeros, R blocks inverted & stored in dT magma_sgeqrf_gpu( M, N, d_A, ldda, tau, dT, &info ); } else if ( opts.version == 2 ) { // LAPACK complaint arguments magma_sgeqrf2_gpu( M, N, d_A, ldda, tau, &info ); } #ifdef HAVE_CUBLAS else if ( opts.version == 3 ) { // stores dT, V blocks have zeros, R blocks stored in dT magma_sgeqrf3_gpu( M, N, d_A, ldda, tau, dT, &info ); } #endif else { printf( "Unknown version %d\n", (int) opts.version ); return -1; } gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_sgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } if ( opts.check == 1 && (opts.version == 2 || opts.version == 3) ) { if ( opts.version == 3 ) { // copy diagonal blocks of R back to A for( int i=0; i < min_mn-nb; i += nb ) { magma_int_t ib = min( min_mn-i, nb ); magmablas_slacpy( MagmaUpper, ib, ib, &dT[min_mn*nb + i*nb], nb, &d_A[ i + i*ldda ], ldda ); } } /* ===================================================================== Check the result, following zqrt01 except using the reduced Q. This works for any M,N (square, tall, wide). Only for version 2, which has LAPACK complaint output. Or for version 3, after restoring diagonal blocks of A above. =================================================================== */ magma_sgetmatrix( M, N, d_A, ldda, h_R, lda ); magma_int_t ldq = M; magma_int_t ldr = min_mn; float *Q, *R; float *work; TESTING_MALLOC_CPU( Q, float, ldq*min_mn ); // M by K TESTING_MALLOC_CPU( R, float, ldr*N ); // K by N TESTING_MALLOC_CPU( work, float, min_mn ); // generate M by K matrix Q, where K = min(M,N) lapackf77_slacpy( "Lower", &M, &min_mn, h_R, &lda, Q, &ldq ); lapackf77_sorgqr( &M, &min_mn, &min_mn, Q, &ldq, tau, h_work, &lwork, &info ); assert( info == 0 ); // copy K by N matrix R lapackf77_slaset( "Lower", &min_mn, &N, &c_zero, &c_zero, R, &ldr ); lapackf77_slacpy( "Upper", &min_mn, &N, h_R, &lda, R, &ldr ); // error = || R - Q^H*A || / (N * ||A||) blasf77_sgemm( "Conj", "NoTrans", &min_mn, &N, &M, &c_neg_one, Q, &ldq, h_A, &lda, &c_one, R, &ldr ); Anorm = lapackf77_slange( "1", &M, &N, h_A, &lda, work ); error = lapackf77_slange( "1", &min_mn, &N, R, &ldr, work ); if ( N > 0 && Anorm > 0 ) error /= (N*Anorm); // set R = I (K by K identity), then R = I - Q^H*Q // error = || I - Q^H*Q || / N lapackf77_slaset( "Upper", &min_mn, &min_mn, &c_zero, &c_one, R, &ldr ); blasf77_ssyrk( "Upper", "Conj", &min_mn, &M, &d_neg_one, Q, &ldq, &d_one, R, &ldr ); error2 = safe_lapackf77_slansy( "1", "Upper", &min_mn, R, &ldr, work ); if ( N > 0 ) error2 /= N; TESTING_FREE_CPU( Q ); Q = NULL; TESTING_FREE_CPU( R ); R = NULL; TESTING_FREE_CPU( work ); work = NULL; } else if ( opts.check == 2 && M >= N && (opts.version == 1 || opts.version == 3) ) { /* ===================================================================== Check the result by solving consistent linear system, A*x = b. Only for versions 1 & 3 with M >= N. =================================================================== */ magma_int_t lwork2; float *x, *b, *hwork; magmaFloat_ptr d_B; // initialize RHS, b = A*random TESTING_MALLOC_CPU( x, float, N ); TESTING_MALLOC_CPU( b, float, M ); lapackf77_slarnv( &ione, ISEED, &N, x ); blasf77_sgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_zero, b, &ione ); // copy to GPU TESTING_MALLOC_DEV( d_B, float, M ); magma_ssetvector( M, b, 1, d_B, 1 ); if ( opts.version == 1 ) { // allocate hwork magma_sgeqrs_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, tmp, -1, &info ); lwork2 = (magma_int_t)MAGMA_S_REAL( tmp[0] ); TESTING_MALLOC_CPU( hwork, float, lwork2 ); // solve linear system magma_sgeqrs_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, hwork, lwork2, &info ); if (info != 0) { printf("magma_sgeqrs returned error %d: %s.\n", (int) info, magma_strerror( info )); } TESTING_FREE_CPU( hwork ); } #ifdef HAVE_CUBLAS else if ( opts.version == 3 ) { // allocate hwork magma_sgeqrs3_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, tmp, -1, &info ); lwork2 = (magma_int_t)MAGMA_S_REAL( tmp[0] ); TESTING_MALLOC_CPU( hwork, float, lwork2 ); // solve linear system magma_sgeqrs3_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, hwork, lwork2, &info ); if (info != 0) { printf("magma_sgeqrs3 returned error %d: %s.\n", (int) info, magma_strerror( info )); } TESTING_FREE_CPU( hwork ); } #endif else { printf( "Unknown version %d\n", (int) opts.version ); return -1; } magma_sgetvector( N, d_B, 1, x, 1 ); // compute r = Ax - b, saved in b blasf77_sgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_neg_one, b, &ione ); // compute residual |Ax - b| / (max(m,n)*|A|*|x|) float norm_x, norm_A, norm_r, work[1]; norm_A = lapackf77_slange( "F", &M, &N, h_A, &lda, work ); norm_r = lapackf77_slange( "F", &M, &ione, b, &M, work ); norm_x = lapackf77_slange( "F", &N, &ione, x, &N, work ); TESTING_FREE_CPU( x ); TESTING_FREE_CPU( b ); TESTING_FREE_DEV( d_B ); error = norm_r / (max(M,N) * norm_A * norm_x); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_sgeqrf( &M, &N, h_A, &lda, tau, h_work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_sgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } } /* ===================================================================== Print performance and error. =================================================================== */ printf("%5d %5d ", (int) M, (int) N ); if ( opts.lapack ) { printf( "%7.2f (%7.2f)", cpu_perf, cpu_time ); } else { printf(" --- ( --- )" ); } printf( " %7.2f (%7.2f) ", gpu_perf, gpu_time ); if ( opts.check == 1 ) { bool okay = (error < tol && error2 < tol); status += ! okay; printf( "%11.2e %11.2e %s\n", error, error2, (okay ? "ok" : "failed") ); } else if ( opts.check == 2 ) { if ( M >= N ) { bool okay = (error < tol); status += ! okay; printf( "%10.2e %s\n", error, (okay ? "ok" : "failed") ); } else { printf( "(error check only for M >= N)\n" ); } } else { printf( " ---\n" ); } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); if ( opts.version == 1 || opts.version == 3 ) { TESTING_FREE_DEV( dT ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; float matnorm, work[1]; float mzone = MAGMA_S_NEG_ONE; float *h_A, *h_R, *tau, *hwork, tmp[1]; magmaFloat_ptr d_A; /* Matrix size */ magma_int_t M = 0, N = 0, n2, lda, ldda, lhwork; magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10176}; magma_int_t i, info, min_mn; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); } if ( M == 0 ) { M = N; } if ( N == 0 ) { N = M; } if (M>0 && N>0) printf(" testing_sgeqrf_gpu -M %d -N %d\n\n", M, N); else { printf("\nUsage: \n"); printf(" testing_sgeqrf_gpu -M %d -N %d\n\n", 1024, 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_sgeqrf_gpu -M %d -N %d\n\n", 1024, 1024); M = N = size[7]; } /* Initialize */ magma_queue_t queue1, queue2; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue1 ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue2 ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } magma_queue_t queues[2] = {queue1, queue2}; ldda = ((M+31)/32)*32; n2 = M * N; min_mn = min(M, N); /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( tau, float, min_mn ); TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_PIN( h_R, float, n2 ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); lhwork = -1; lapackf77_sgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info); lhwork = (magma_int_t)MAGMA_S_REAL( tmp[0] ); TESTING_MALLOC_CPU( hwork, float, lhwork ); printf("\n\n"); printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("======================================================================\n"); for(i=0; i<8; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS( (float)M, (float)N ) * 1e-9; /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_sgeqrf(&M, &N, h_A, &M, tau, hwork, &lhwork, &info); cpu_time = magma_wtime() - cpu_time; if (info < 0) printf("Argument %d of lapack_sgeqrf had an illegal value.\n", -info); cpu_perf = gflops / cpu_time; /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_ssetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue1 ); magma_sgeqrf2_gpu( M, N, d_A, 0, ldda, tau, &info, queues); magma_ssetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue1 ); clFinish(queue1); clFinish(queue2); gpu_time = magma_wtime(); magma_sgeqrf2_gpu( M, N, d_A, 0, ldda, tau, &info, queues); gpu_time = magma_wtime() - gpu_time; if (info < 0) printf("Argument %d of magma_sgeqrf2 had an illegal value.\n", -info); gpu_perf = gflops / gpu_time; /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_sgetmatrix( M, N, d_A, 0, ldda, h_R, 0, M, queue1 ); matnorm = lapackf77_slange("f", &M, &N, h_A, &M, work); blasf77_saxpy(&n2, &mzone, h_A, &ione, h_R, &ione); printf("%5d %5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", M, N, cpu_perf, cpu_time, gpu_perf, gpu_time, lapackf77_slange("f", &M, &N, h_R, &M, work) / matnorm); if (argc != 1) break; } /* clean up */ TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( hwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); magma_queue_destroy( queue1 ); magma_queue_destroy( queue2 ); magma_finalize(); }
/***************************************************************************//** Purpose ------- SGELQF computes an LQ factorization of a REAL M-by-N matrix dA: dA = L * Q. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA REAL array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix dA. On exit, the elements on and below the diagonal of the array contain the m-by-min(m,n) lower trapezoidal matrix L (L is lower triangular if m <= n); the elements above the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors (see Further Details). @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). @param[out] tau REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. \n Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= max(1,M). For optimum performance LWORK >= M*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(k) . . . H(2) H(1), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:n) is stored on exit in A(i,i+1:n), and tau in TAU(i). @ingroup magma_gelqf *******************************************************************************/ extern "C" magma_int_t magma_sgelqf_gpu( magma_int_t m, magma_int_t n, magmaFloat_ptr dA, magma_int_t ldda, float *tau, float *work, magma_int_t lwork, magma_int_t *info) { /* Constants */ const float c_one = MAGMA_S_ONE; const magma_int_t ione = 1; MAGMA_UNUSED( ione ); // used only for real /* Local variables */ magmaFloat_ptr dAT=NULL; magma_int_t min_mn, maxm, maxn, nb; magma_int_t iinfo; *info = 0; nb = magma_get_sgelqf_nb( m, n ); min_mn = min( m, n ); work[0] = magma_smake_lwork( m*nb ); bool lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } else if (lwork < max(1,m) && ! lquery) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (min_mn == 0) { work[0] = c_one; return *info; } maxm = magma_roundup( m, 32 ); maxn = magma_roundup( n, 32 ); magma_int_t lddat = maxn; magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); if ( m == n ) { dAT = dA; lddat = ldda; magmablas_stranspose_inplace( m, dAT, ldda, queue ); } else { if (MAGMA_SUCCESS != magma_smalloc( &dAT, maxm*maxn ) ) { *info = MAGMA_ERR_DEVICE_ALLOC; goto cleanup; } magmablas_stranspose( m, n, dA, ldda, dAT, lddat, queue ); } magma_sgeqrf2_gpu( n, m, dAT, lddat, tau, &iinfo ); assert( iinfo >= 0 ); if ( iinfo > 0 ) { *info = iinfo; } // conjugate tau #ifdef COMPLEX lapackf77_slacgv( &min_mn, tau, &ione ); #endif if ( m == n ) { magmablas_stranspose_inplace( m, dAT, lddat, queue ); } else { magmablas_stranspose( n, m, dAT, lddat, dA, ldda, queue ); magma_free( dAT ); } cleanup: magma_queue_destroy( queue ); return *info; } /* magma_sgelqf_gpu */
extern "C" magma_int_t magma_sgelqf( magma_int_t m, magma_int_t n, float *a, magma_int_t lda, float *tau, float *work, magma_int_t lwork, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= SGELQF computes an LQ factorization of a REAL M-by-N matrix A: A = L * Q. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) REAL array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and below the diagonal of the array contain the m-by-min(m,n) lower trapezoidal matrix L (L is lower triangular if m <= n); the elements above the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors (see Further Details). Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). TAU (output) REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). WORK (workspace/output) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= max(1,M). For optimum performance LWORK >= M*NB, where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value if INFO = -10 internal GPU memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(k) . . . H(2) H(1), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:n) is stored on exit in A(i,i+1:n), and tau in TAU(i). ===================================================================== */ #define a_ref(a_1,a_2) ( a+(a_2)*(lda) + (a_1)) float *dA, *dAT; float c_one = MAGMA_S_ONE; magma_int_t maxm, maxn, maxdim, nb; magma_int_t iinfo, ldda; int lquery; /* Function Body */ *info = 0; nb = magma_get_sgelqf_nb(m); work[0] = MAGMA_S_MAKE( (float)(m*nb), 0 ); lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < max(1,m) && ! lquery) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (min(m, n) == 0) { work[0] = c_one; return *info; } maxm = ((m + 31)/32)*32; maxn = ((n + 31)/32)*32; maxdim = max(maxm, maxn); if (maxdim*maxdim < 2*maxm*maxn) { ldda = maxdim; if (MAGMA_SUCCESS != magma_smalloc( &dA, maxdim*maxdim )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_ssetmatrix( m, n, a, lda, dA, ldda ); dAT = dA; magmablas_stranspose_inplace( ldda, dAT, ldda ); } else { ldda = maxn; if (MAGMA_SUCCESS != magma_smalloc( &dA, 2*maxn*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_ssetmatrix( m, n, a, lda, dA, maxm ); dAT = dA + maxn * maxm; magmablas_stranspose2( dAT, ldda, dA, maxm, m, n ); } magma_sgeqrf2_gpu(n, m, dAT, ldda, tau, &iinfo); if (maxdim*maxdim < 2*maxm*maxn) { magmablas_stranspose_inplace( ldda, dAT, ldda ); magma_sgetmatrix( m, n, dA, ldda, a, lda ); } else { magmablas_stranspose2( dA, maxm, dAT, ldda, n, m ); magma_sgetmatrix( m, n, dA, maxm, a, lda ); } magma_free( dA ); return *info; } /* magma_sgelqf */
/** Purpose ------- SGEQRF_OOC computes a QR factorization of a REAL M-by-N matrix A: A = Q * R. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. This is an out-of-core (ooc) version that is similar to magma_sgeqrf but the difference is that this version can use a GPU even if the matrix does not fit into the GPU memory at once. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] tau REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. \n Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= N*NB, where NB can be obtained through magma_get_sgeqrf_nb(M). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_sgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_sgeqrf_ooc( magma_int_t m, magma_int_t n, float *A, magma_int_t lda, float *tau, float *work, magma_int_t lwork, magma_int_t *info ) { #define A(a_1,a_2) ( A + (a_2)*(lda) + (a_1)) #define dA(a_1,a_2) (dA + (a_2)*ldda + (a_1)) float *dA, *dwork; float c_one = MAGMA_S_ONE; int k, lddwork, ldda; *info = 0; int nb = magma_get_sgeqrf_nb(min(m, n)); int lwkopt = n * nb; work[0] = MAGMA_S_MAKE( (float)lwkopt, 0 ); int lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < max(1,n) && ! lquery) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); /* Check how much memory do we have */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(float); magma_int_t IB, NB = (magma_int_t)(0.8*freeMem/m); NB = (NB / nb) * nb; if (NB >= n) return magma_sgeqrf(m, n, A, lda, tau, work, lwork, info); k = min(m,n); if (k == 0) { work[0] = c_one; return *info; } lddwork = ((NB+31)/32)*32+nb; ldda = ((m+31)/32)*32; if (MAGMA_SUCCESS != magma_smalloc( &dA, (NB + nb)*ldda + nb*lddwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); // magmablasSetKernelStream(stream[1]); float *ptr = dA + ldda * NB; dwork = dA + ldda*(NB + nb); /* start the main loop over the blocks that fit in the GPU memory */ for (int i=0; i < n; i += NB) { IB = min(n-i, NB); //printf("Processing %5d columns -- %5d to %5d ... \n", IB, i, i+IB); /* 1. Copy the next part of the matrix to the GPU */ magma_ssetmatrix_async( (m), IB, A(0,i), lda, dA(0,0), ldda, stream[0] ); magma_queue_sync( stream[0] ); /* 2. Update it with the previous transformations */ for (int j=0; j < min(i,k); j += nb) { magma_int_t ib = min(k-j, nb); /* Get a panel in ptr. */ // 1. Form the triangular factor of the block reflector // 2. Send it to the GPU. // 3. Put 0s in the upper triangular part of V. // 4. Send V to the GPU in ptr. // 5. Update the matrix. // 6. Restore the upper part of V. magma_int_t rows = m-j; lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, A(j,j), &lda, tau+j, work, &ib); magma_ssetmatrix_async( ib, ib, work, ib, dwork, lddwork, stream[1] ); spanel_to_q(MagmaUpper, ib, A(j,j), lda, work+ib*ib); magma_ssetmatrix_async( rows, ib, A(j,j), lda, ptr, rows, stream[1] ); magma_queue_sync( stream[1] ); magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, IB, ib, ptr, rows, dwork, lddwork, dA(j, 0), ldda, dwork+ib, lddwork); sq_to_panel(MagmaUpper, ib, A(j,j), lda, work+ib*ib); } /* 3. Do a QR on the current part */ if (i < k) magma_sgeqrf2_gpu(m-i, IB, dA(i,0), ldda, tau+i, info); /* 4. Copy the current part back to the CPU */ magma_sgetmatrix_async( (m), IB, dA(0,0), ldda, A(0,i), lda, stream[0] ); } magma_queue_sync( stream[0] ); magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free( dA ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_sgeqrf_ooc */
/***************************************************************************//** Purpose ------- SGELQF computes an LQ factorization of a REAL M-by-N matrix A: A = L * Q. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and below the diagonal of the array contain the m-by-min(m,n) lower trapezoidal matrix L (L is lower triangular if m <= n); the elements above the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors (see Further Details). \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] tau REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= max(1,M). For optimum performance LWORK >= M*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. \n TODO: work is currently unused. sgeqrf2 allocates its own work of (m + n)*nb. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(k) . . . H(2) H(1), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:n) is stored on exit in A(i,i+1:n), and tau in TAU(i). @ingroup magma_gelqf *******************************************************************************/ extern "C" magma_int_t magma_sgelqf( magma_int_t m, magma_int_t n, float *A, magma_int_t lda, float *tau, float *work, magma_int_t lwork, magma_int_t *info) { #define dA(i_, j_) (dA + (i_) + (j_)*ldda) #define dAT(i_, j_) (dAT + (i_) + (j_)*ldda) /* Constants */ const float c_one = MAGMA_S_ONE; const magma_int_t ione = 1; MAGMA_UNUSED( ione ); // used only for real /* Local variables */ magmaFloat_ptr dA=NULL, dAT=NULL; magma_int_t min_mn, maxm, maxn, maxdim, nb; magma_int_t iinfo, ldda, lddat; /* Function Body */ *info = 0; nb = magma_get_sgelqf_nb( m, n ); min_mn = min( m, n ); work[0] = magma_smake_lwork( m*nb ); bool lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < max(1,m) && ! lquery) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (min_mn == 0) { work[0] = c_one; return *info; } maxm = magma_roundup( m, 32 ); maxn = magma_roundup( n, 32 ); maxdim = max( maxm, maxn ); magma_queue_t queue = NULL; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); // copy to GPU and transpose if (maxdim*maxdim < 2*maxm*maxn) { // close to square, do everything in-place ldda = maxdim; lddat = maxdim; if (MAGMA_SUCCESS != magma_smalloc( &dA, maxdim*maxdim )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto cleanup; } magma_ssetmatrix( m, n, A, lda, dA(0,0), ldda, queue ); dAT = dA; magmablas_stranspose_inplace( lddat, dAT(0,0), lddat, queue ); } else { // rectangular, do everything out-of-place ldda = maxm; lddat = maxn; if (MAGMA_SUCCESS != magma_smalloc( &dA, 2*maxn*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto cleanup; } magma_ssetmatrix( m, n, A, lda, dA(0,0), ldda, queue ); dAT = dA + maxn * maxm; magmablas_stranspose( m, n, dA(0,0), ldda, dAT(0,0), lddat, queue ); } // factor QR magma_sgeqrf2_gpu( n, m, dAT(0,0), lddat, tau, &iinfo ); assert( iinfo >= 0 ); if ( iinfo > 0 ) { *info = iinfo; } // conjugate tau #ifdef COMPLEX lapackf77_slacgv( &min_mn, tau, &ione ); #endif // undo transpose if (maxdim*maxdim < 2*maxm*maxn) { magmablas_stranspose_inplace( lddat, dAT(0,0), lddat, queue ); magma_sgetmatrix( m, n, dA(0,0), ldda, A, lda, queue ); } else { magmablas_stranspose( n, m, dAT(0,0), lddat, dA(0,0), ldda, queue ); magma_sgetmatrix( m, n, dA(0,0), ldda, A, lda, queue ); } cleanup: magma_queue_destroy( queue ); magma_free( dA ); return *info; } /* magma_sgelqf */