/* //////////////////////////////////////////////////////////////////////////// -- Testing sgeqrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; float c_neg_one = MAGMA_S_NEG_ONE; float *h_A, *h_R, *tau, *dtau, *h_work, tmp[1]; float *d_A; float *dwork; magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); opts.lapack |= opts.check; // check (-c) implies lapack (-l) printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) ||R||_F / ||A||_F\n"); printf("=======================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_SGEQRF( M, N ) / 1e9; // query for workspace size lwork = -1; lapackf77_sgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_S_REAL( tmp[0] ); TESTING_MALLOC_CPU( tau, float, min_mn ); TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_CPU( h_work, float, lwork ); TESTING_MALLOC_PIN( h_R, float, n2 ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); TESTING_MALLOC_DEV( dtau, float, min_mn ); TESTING_MALLOC_DEV( dwork, float, min_mn ); /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_ssetmatrix( M, N, h_R, lda, d_A, ldda ); // warmup if ( opts.warmup ) { magma_sgeqr2_gpu( M, N, d_A, ldda, dtau, dwork, &info ); magma_ssetmatrix( M, N, h_R, lda, d_A, ldda ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_sync_wtime( 0 ); magma_sgeqr2_gpu( M, N, d_A, ldda, dtau, dwork, &info ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_sgeqr2_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_sgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_sgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_sgetmatrix( M, N, d_A, ldda, h_R, M ); error = lapackf77_slange("f", &M, &N, h_A, &lda, work); blasf77_saxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_slange("f", &M, &N, h_R, &lda, work) / error; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, cpu_perf, 1000.*cpu_time, gpu_perf, 1000.*gpu_time, error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, 1000.*gpu_time ); } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( dtau ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; float matnorm, work[1]; float mzone = MAGMA_S_NEG_ONE; float *h_A, *h_R, *tau, *hwork, tmp[1]; magmaFloat_ptr d_A; /* Matrix size */ magma_int_t M = 0, N = 0, n2, lda, ldda, lhwork; magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10176}; magma_int_t i, info, min_mn; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); } if ( M == 0 ) { M = N; } if ( N == 0 ) { N = M; } if (M>0 && N>0) printf(" testing_sgeqrf_gpu -M %d -N %d\n\n", M, N); else { printf("\nUsage: \n"); printf(" testing_sgeqrf_gpu -M %d -N %d\n\n", 1024, 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_sgeqrf_gpu -M %d -N %d\n\n", 1024, 1024); M = N = size[7]; } /* Initialize */ magma_queue_t queue1, queue2; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue1 ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue2 ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } magma_queue_t queues[2] = {queue1, queue2}; ldda = ((M+31)/32)*32; n2 = M * N; min_mn = min(M, N); /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( tau, float, min_mn ); TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_PIN( h_R, float, n2 ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); lhwork = -1; lapackf77_sgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info); lhwork = (magma_int_t)MAGMA_S_REAL( tmp[0] ); TESTING_MALLOC_CPU( hwork, float, lhwork ); printf("\n\n"); printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("======================================================================\n"); for(i=0; i<8; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS( (float)M, (float)N ) * 1e-9; /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_sgeqrf(&M, &N, h_A, &M, tau, hwork, &lhwork, &info); cpu_time = magma_wtime() - cpu_time; if (info < 0) printf("Argument %d of lapack_sgeqrf had an illegal value.\n", -info); cpu_perf = gflops / cpu_time; /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_ssetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue1 ); magma_sgeqrf2_gpu( M, N, d_A, 0, ldda, tau, &info, queues); magma_ssetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue1 ); clFinish(queue1); clFinish(queue2); gpu_time = magma_wtime(); magma_sgeqrf2_gpu( M, N, d_A, 0, ldda, tau, &info, queues); gpu_time = magma_wtime() - gpu_time; if (info < 0) printf("Argument %d of magma_sgeqrf2 had an illegal value.\n", -info); gpu_perf = gflops / gpu_time; /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_sgetmatrix( M, N, d_A, 0, ldda, h_R, 0, M, queue1 ); matnorm = lapackf77_slange("f", &M, &N, h_A, &M, work); blasf77_saxpy(&n2, &mzone, h_A, &ione, h_R, &ione); printf("%5d %5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", M, N, cpu_perf, cpu_time, gpu_perf, gpu_time, lapackf77_slange("f", &M, &N, h_R, &M, work) / matnorm); if (argc != 1) break; } /* clean up */ TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( hwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); magma_queue_destroy( queue1 ); magma_queue_destroy( queue2 ); magma_finalize(); }
/** Purpose ------- SGEQRF computes a QR factorization of a REAL M-by-N matrix A: A = Q * R. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. If the current stream is NULL, this version replaces it with user defined stream to overlap computation with communication. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] tau REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. \n Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= max( N*NB, 2*NB*NB ), where NB can be obtained through magma_get_sgeqrf_nb(M). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_sgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_sgeqrf(magma_int_t m, magma_int_t n, float *A, magma_int_t lda, float *tau, float *work, magma_int_t lwork, magma_int_t *info ) { #define A(i,j) ( A + (i) + (j)*lda ) #define dA(i,j) (dA + (i) + (j)*ldda) float *dA, *dwork, *dT; float c_one = MAGMA_S_ONE; magma_int_t i, k, lddwork, old_i, old_ib; magma_int_t ib, ldda; /* Function Body */ *info = 0; magma_int_t nb = magma_get_sgeqrf_nb(min(m, n)); // need 2*nb*nb to store T and upper triangle of V simultaneously magma_int_t lwkopt = max(n*nb, 2*nb*nb); work[0] = MAGMA_S_MAKE( (float)lwkopt, 0 ); int lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < max(1, lwkopt) && ! lquery) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { work[0] = c_one; return *info; } // largest N for larfb is n-nb (trailing matrix lacks 1st panel) lddwork = ((n+31)/32)*32 - nb; ldda = ((m+31)/32)*32; magma_int_t num_gpus = magma_num_gpus(); if ( num_gpus > 1 ) { /* call multiple-GPU interface */ return magma_sgeqrf4(num_gpus, m, n, A, lda, tau, work, lwork, info); } // allocate space for dA, dwork, and dT if (MAGMA_SUCCESS != magma_smalloc( &dA, n*ldda + nb*lddwork + nb*nb )) { /* Switch to the "out-of-core" (out of GPU-memory) version */ return magma_sgeqrf_ooc(m, n, A, lda, tau, work, lwork, info); } /* Define user stream if current stream is NULL */ magma_queue_t stream[2], current_stream; magmablasGetKernelStream(¤t_stream); magma_queue_create( &stream[0] ); if (current_stream == NULL) { magma_queue_create( &stream[1] ); magmablasSetKernelStream(stream[1]); } else { stream[1] = current_stream; } dwork = dA + n*ldda; dT = dA + n*ldda + nb*lddwork; if ( (nb > 1) && (nb < k) ) { /* Use blocked code initially. Asynchronously send the matrix to the GPU except the first panel. */ magma_ssetmatrix_async( m, n-nb, A(0,nb), lda, dA(0,nb), ldda, stream[0] ); old_i = 0; old_ib = nb; for (i = 0; i < k-nb; i += nb) { ib = min(k-i, nb); if (i > 0) { /* download i-th panel */ magma_queue_sync( stream[1] ); magma_sgetmatrix_async( m-i, ib, dA(i,i), ldda, A(i,i), lda, stream[0] ); /* Apply H' to A(i:m,i+2*ib:n) from the left */ magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, m-old_i, n-old_i-2*old_ib, old_ib, dA(old_i, old_i), ldda, dT, nb, dA(old_i, old_i+2*old_ib), ldda, dwork, lddwork); magma_sgetmatrix_async( i, ib, dA(0,i), ldda, A(0,i), lda, stream[1] ); magma_queue_sync( stream[0] ); } magma_int_t rows = m-i; lapackf77_sgeqrf(&rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, A(i,i), &lda, tau+i, work, &ib); spanel_to_q(MagmaUpper, ib, A(i,i), lda, work+ib*ib); /* download the i-th V matrix */ magma_ssetmatrix_async( rows, ib, A(i,i), lda, dA(i,i), ldda, stream[0] ); /* download the T matrix */ magma_queue_sync( stream[1] ); magma_ssetmatrix_async( ib, ib, work, ib, dT, nb, stream[0] ); magma_queue_sync( stream[0] ); if (i + ib < n) { if (i+ib < k-nb) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left (look-ahead) */ magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dT, nb, dA(i, i+ib), ldda, dwork, lddwork); sq_to_panel(MagmaUpper, ib, A(i,i), lda, work+ib*ib); } else { /* After last panel, update whole trailing matrix. */ /* Apply H' to A(i:m,i+ib:n) from the left */ magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, n-i-ib, ib, dA(i, i ), ldda, dT, nb, dA(i, i+ib), ldda, dwork, lddwork); sq_to_panel(MagmaUpper, ib, A(i,i), lda, work+ib*ib); } old_i = i; old_ib = ib; } } } else { i = 0; } /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; if (i != 0) { magma_sgetmatrix_async( m, ib, dA(0,i), ldda, A(0,i), lda, stream[1] ); magma_queue_sync( stream[1] ); } magma_int_t rows = m-i; lapackf77_sgeqrf(&rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info); } magma_queue_destroy( stream[0] ); if (current_stream == NULL) { magma_queue_destroy( stream[1] ); magmablasSetKernelStream(NULL); } magma_free( dA ); return *info; } /* magma_sgeqrf */
/** Purpose ------- SGEQRF computes a QR factorization of a REAL M-by-N matrix A: A = Q * R. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. This uses 2 queues to overlap communication and computation. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] tau REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. \n Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= max( N*NB, 2*NB*NB ), where NB can be obtained through magma_get_sgeqrf_nb( M, N ). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_sgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_sgeqrf( magma_int_t m, magma_int_t n, float *A, magma_int_t lda, float *tau, float *work, magma_int_t lwork, magma_int_t *info ) { #define A(i_,j_) (A + (i_) + (j_)*lda) #ifdef HAVE_clBLAS #define dA(i_,j_) dA, ((i_) + (j_)*ldda + dA_offset) #define dT(i_,j_) dT, ((i_) + (j_)*nb + dT_offset) #define dwork(i_) dwork, ((i_) + dwork_offset) #else #define dA(i_,j_) (dA + (i_) + (j_)*ldda) #define dT(i_,j_) (dT + (i_) + (j_)*nb) #define dwork(i_) (dwork + (i_)) #endif /* Constants */ const float c_one = MAGMA_S_ONE; /* Local variables */ magmaFloat_ptr dA, dT, dwork; magma_int_t i, ib, min_mn, ldda, lddwork, old_i, old_ib; /* Function Body */ *info = 0; magma_int_t nb = magma_get_sgeqrf_nb( m, n ); // need 2*nb*nb to store T and upper triangle of V simultaneously magma_int_t lwkopt = max( n*nb, 2*nb*nb ); work[0] = magma_smake_lwork( lwkopt ); bool lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < max(1, lwkopt) && ! lquery) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; min_mn = min( m, n ); if (min_mn == 0) { work[0] = c_one; return *info; } // largest N for larfb is n-nb (trailing matrix lacks 1st panel) lddwork = magma_roundup( n, 32 ) - nb; ldda = magma_roundup( m, 32 ); magma_int_t ngpu = magma_num_gpus(); if ( ngpu > 1 ) { /* call multiple-GPU interface */ return magma_sgeqrf_m( ngpu, m, n, A, lda, tau, work, lwork, info ); } // allocate space for dA, dwork, and dT if (MAGMA_SUCCESS != magma_smalloc( &dA, n*ldda + nb*lddwork + nb*nb )) { /* alloc failed so call non-GPU-resident version */ return magma_sgeqrf_ooc( m, n, A, lda, tau, work, lwork, info ); } dwork = dA + n*ldda; dT = dA + n*ldda + nb*lddwork; magma_queue_t queues[2]; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[0] ); magma_queue_create( cdev, &queues[1] ); if ( (nb > 1) && (nb < min_mn) ) { /* Use blocked code initially. Asynchronously send the matrix to the GPU except the first panel. */ magma_ssetmatrix_async( m, n-nb, A(0,nb), lda, dA(0,nb), ldda, queues[0] ); old_i = 0; old_ib = nb; for (i = 0; i < min_mn-nb; i += nb) { ib = min( min_mn-i, nb ); if (i > 0) { /* get i-th panel from device */ magma_queue_sync( queues[1] ); magma_sgetmatrix_async( m-i, ib, dA(i,i), ldda, A(i,i), lda, queues[0] ); /* Apply H' to A(i:m,i+2*ib:n) from the left */ magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, n-old_i-2*old_ib, old_ib, dA(old_i, old_i), ldda, dT(0,0), nb, dA(old_i, old_i+2*old_ib), ldda, dwork(0), lddwork, queues[1] ); magma_sgetmatrix_async( i, ib, dA(0,i), ldda, A(0,i), lda, queues[1] ); magma_queue_sync( queues[0] ); } magma_int_t rows = m-i; lapackf77_sgeqrf( &rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info ); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, A(i,i), &lda, tau+i, work, &ib ); magma_spanel_to_q( MagmaUpper, ib, A(i,i), lda, work+ib*ib ); /* put i-th V matrix onto device */ magma_ssetmatrix_async( rows, ib, A(i,i), lda, dA(i,i), ldda, queues[0] ); /* put T matrix onto device */ magma_queue_sync( queues[1] ); magma_ssetmatrix_async( ib, ib, work, ib, dT(0,0), nb, queues[0] ); magma_queue_sync( queues[0] ); if (i + ib < n) { if (i+ib < min_mn-nb) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left (look-ahead) */ magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dT(0,0), nb, dA(i, i+ib), ldda, dwork(0), lddwork, queues[1] ); magma_sq_to_panel( MagmaUpper, ib, A(i,i), lda, work+ib*ib ); } else { /* After last panel, update whole trailing matrix. */ /* Apply H' to A(i:m,i+ib:n) from the left */ magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n-i-ib, ib, dA(i, i ), ldda, dT(0,0), nb, dA(i, i+ib), ldda, dwork(0), lddwork, queues[1] ); magma_sq_to_panel( MagmaUpper, ib, A(i,i), lda, work+ib*ib ); } old_i = i; old_ib = ib; } } } else { i = 0; } /* Use unblocked code to factor the last or only block. */ if (i < min_mn) { ib = n-i; if (i != 0) { magma_sgetmatrix( m, ib, dA(0,i), ldda, A(0,i), lda, queues[1] ); } magma_int_t rows = m-i; lapackf77_sgeqrf( &rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info ); } magma_queue_destroy( queues[0] ); magma_queue_destroy( queues[1] ); magma_free( dA ); return *info; } /* magma_sgeqrf */
extern "C" magma_int_t magma_sgeqrf3_gpu( magma_int_t m, magma_int_t n, float *dA, magma_int_t ldda, float *tau, float *dT, magma_int_t *info ) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= SGEQRF3 computes a QR factorization of a real M-by-N matrix A: A = Q * R. This version stores the triangular dT matrices used in the block QR factorization so that they can be applied directly (i.e., without being recomputed) later. As a result, the application of Q is much faster. Also, the upper triangular matrices for V have 0s in them and the corresponding parts of the upper triangular R are stored separately in dT. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. dA (input/output) REAL array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be dividable by 16. TAU (output) REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). dT (workspace/output) REAL array on the GPU, dimension (2*MIN(M, N) + (N+31)/32*32 )*NB, where NB can be obtained through magma_get_sgeqrf_nb(M). It starts with MIN(M,N)*NB block that store the triangular T matrices, followed by the MIN(M,N)*NB block of the diagonal matrices for the R matrix. The rest of the array is used as workspace. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define a_ref(a_1,a_2) (dA+(a_2)*(ldda) + (a_1)) #define t_ref(a_1) (dT+(a_1)*nb) #define d_ref(a_1) (dT+(minmn+(a_1))*nb) #define dd_ref(a_1) (dT+(2*minmn+(a_1))*nb) #define work_ref(a_1) ( work + (a_1)) #define hwork ( work + (nb)*(m)) magma_int_t i, k, minmn, old_i, old_ib, rows, cols; magma_int_t ib, nb; magma_int_t ldwork, lddwork, lwork, lhwork; float *work, *ut; /* check arguments */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = minmn = min(m,n); if (k == 0) return *info; nb = magma_get_sgeqrf_nb(m); lwork = (m + n + nb)*nb; lhwork = lwork - m*nb; if (MAGMA_SUCCESS != magma_smalloc_pinned( &work, lwork )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } ut = hwork+nb*(n); memset( ut, 0, nb*nb*sizeof(float)); magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); ldwork = m; lddwork= n; if ( (nb > 1) && (nb < k) ) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; i < k-nb; i += nb) { ib = min(k-i, nb); rows = m -i; magma_sgetmatrix_async( rows, ib, a_ref(i,i), ldda, work_ref(i), ldwork, stream[1] ); if (i>0){ /* Apply H' to A(i:m,i+2*ib:n) from the left */ cols = n-old_i-2*old_ib; magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, m-old_i, cols, old_ib, a_ref(old_i, old_i ), ldda, t_ref(old_i), nb, a_ref(old_i, old_i+2*old_ib), ldda, dd_ref(0), lddwork); /* store the diagonal */ magma_ssetmatrix_async( old_ib, old_ib, ut, old_ib, d_ref(old_i), old_ib, stream[0] ); } magma_queue_sync( stream[1] ); lapackf77_sgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &ib); /* Put 0s in the upper triangular part of a panel (and 1s on the diagonal); copy the upper triangular in ut. */ magma_queue_sync( stream[0] ); ssplit_diag_block3(ib, work_ref(i), ldwork, ut); magma_ssetmatrix( rows, ib, work_ref(i), ldwork, a_ref(i,i), ldda ); if (i + ib < n) { /* Send the triangular factor T to the GPU */ magma_ssetmatrix( ib, ib, hwork, ib, t_ref(i), nb ); if (i+nb < k-nb){ /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, a_ref(i, i ), ldda, t_ref(i), nb, a_ref(i, i+ib), ldda, dd_ref(0), lddwork); } else { cols = n-i-ib; magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, cols, ib, a_ref(i, i ), ldda, t_ref(i), nb, a_ref(i, i+ib), ldda, dd_ref(0), lddwork); /* Fix the diagonal block */ magma_ssetmatrix( ib, ib, ut, ib, d_ref(i), ib ); } old_i = i; old_ib = ib; } } } else { i = 0; } /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; magma_sgetmatrix( rows, ib, a_ref(i, i), ldda, work, rows ); lhwork = lwork - rows*ib; lapackf77_sgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info); magma_ssetmatrix( rows, ib, work, rows, a_ref(i, i), ldda ); } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free_pinned( work ); return *info; /* End of MAGMA_SGEQRF */ } /* magma_sgeqrf */
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgeqrf */ int main( int argc, char** argv) { TESTING_INIT(); const float d_neg_one = MAGMA_D_NEG_ONE; const float d_one = MAGMA_D_ONE; const float c_neg_one = MAGMA_S_NEG_ONE; const float c_one = MAGMA_S_ONE; const float c_zero = MAGMA_S_ZERO; const magma_int_t ione = 1; real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float Anorm, error=0, error2=0; float *h_A, *h_R, *tau, *h_work, tmp[1]; magmaFloat_ptr d_A, dT; magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn, nb, size; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; opts.parse_opts( argc, argv ); magma_int_t status = 0; float tol = opts.tolerance * lapackf77_slamch("E"); // version 3 can do either check if (opts.check == 1 && opts.version == 1) { opts.check = 2; printf( "%% version 1 requires check 2 (solve A*x=b)\n" ); } if (opts.check == 2 && opts.version == 2) { opts.check = 1; printf( "%% version 2 requires check 1 (R - Q^H*A)\n" ); } printf( "%% version %d\n", (int) opts.version ); if ( opts.check == 1 ) { printf("%% M N CPU Gflop/s (sec) GPU Gflop/s (sec) |R - Q^H*A| |I - Q^H*Q|\n"); printf("%%==============================================================================\n"); } else { printf("%% M N CPU Gflop/s (sec) GPU Gflop/s (sec) |b - A*x|\n"); printf("%%===============================================================\n"); } for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min( M, N ); lda = M; n2 = lda*N; ldda = magma_roundup( M, opts.align ); // multiple of 32 by default nb = magma_get_sgeqrf_nb( M, N ); gflops = FLOPS_SGEQRF( M, N ) / 1e9; // query for workspace size lwork = -1; lapackf77_sgeqrf( &M, &N, NULL, &M, NULL, tmp, &lwork, &info ); lwork = (magma_int_t)MAGMA_S_REAL( tmp[0] ); TESTING_MALLOC_CPU( tau, float, min_mn ); TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_CPU( h_work, float, lwork ); TESTING_MALLOC_PIN( h_R, float, n2 ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); if ( opts.version == 1 || opts.version == 3 ) { size = (2*min(M, N) + magma_roundup( N, 32 ) )*nb; TESTING_MALLOC_DEV( dT, float, size ); magmablas_slaset( MagmaFull, size, 1, c_zero, c_zero, dT, size ); } /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slacpy( MagmaFullStr, &M, &N, h_A, &lda, h_R, &lda ); magma_ssetmatrix( M, N, h_R, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ nb = magma_get_sgeqrf_nb( M, N ); gpu_time = magma_wtime(); if ( opts.version == 1 ) { // stores dT, V blocks have zeros, R blocks inverted & stored in dT magma_sgeqrf_gpu( M, N, d_A, ldda, tau, dT, &info ); } else if ( opts.version == 2 ) { // LAPACK complaint arguments magma_sgeqrf2_gpu( M, N, d_A, ldda, tau, &info ); } #ifdef HAVE_CUBLAS else if ( opts.version == 3 ) { // stores dT, V blocks have zeros, R blocks stored in dT magma_sgeqrf3_gpu( M, N, d_A, ldda, tau, dT, &info ); } #endif else { printf( "Unknown version %d\n", (int) opts.version ); return -1; } gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_sgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } if ( opts.check == 1 && (opts.version == 2 || opts.version == 3) ) { if ( opts.version == 3 ) { // copy diagonal blocks of R back to A for( int i=0; i < min_mn-nb; i += nb ) { magma_int_t ib = min( min_mn-i, nb ); magmablas_slacpy( MagmaUpper, ib, ib, &dT[min_mn*nb + i*nb], nb, &d_A[ i + i*ldda ], ldda ); } } /* ===================================================================== Check the result, following zqrt01 except using the reduced Q. This works for any M,N (square, tall, wide). Only for version 2, which has LAPACK complaint output. Or for version 3, after restoring diagonal blocks of A above. =================================================================== */ magma_sgetmatrix( M, N, d_A, ldda, h_R, lda ); magma_int_t ldq = M; magma_int_t ldr = min_mn; float *Q, *R; float *work; TESTING_MALLOC_CPU( Q, float, ldq*min_mn ); // M by K TESTING_MALLOC_CPU( R, float, ldr*N ); // K by N TESTING_MALLOC_CPU( work, float, min_mn ); // generate M by K matrix Q, where K = min(M,N) lapackf77_slacpy( "Lower", &M, &min_mn, h_R, &lda, Q, &ldq ); lapackf77_sorgqr( &M, &min_mn, &min_mn, Q, &ldq, tau, h_work, &lwork, &info ); assert( info == 0 ); // copy K by N matrix R lapackf77_slaset( "Lower", &min_mn, &N, &c_zero, &c_zero, R, &ldr ); lapackf77_slacpy( "Upper", &min_mn, &N, h_R, &lda, R, &ldr ); // error = || R - Q^H*A || / (N * ||A||) blasf77_sgemm( "Conj", "NoTrans", &min_mn, &N, &M, &c_neg_one, Q, &ldq, h_A, &lda, &c_one, R, &ldr ); Anorm = lapackf77_slange( "1", &M, &N, h_A, &lda, work ); error = lapackf77_slange( "1", &min_mn, &N, R, &ldr, work ); if ( N > 0 && Anorm > 0 ) error /= (N*Anorm); // set R = I (K by K identity), then R = I - Q^H*Q // error = || I - Q^H*Q || / N lapackf77_slaset( "Upper", &min_mn, &min_mn, &c_zero, &c_one, R, &ldr ); blasf77_ssyrk( "Upper", "Conj", &min_mn, &M, &d_neg_one, Q, &ldq, &d_one, R, &ldr ); error2 = safe_lapackf77_slansy( "1", "Upper", &min_mn, R, &ldr, work ); if ( N > 0 ) error2 /= N; TESTING_FREE_CPU( Q ); Q = NULL; TESTING_FREE_CPU( R ); R = NULL; TESTING_FREE_CPU( work ); work = NULL; } else if ( opts.check == 2 && M >= N && (opts.version == 1 || opts.version == 3) ) { /* ===================================================================== Check the result by solving consistent linear system, A*x = b. Only for versions 1 & 3 with M >= N. =================================================================== */ magma_int_t lwork2; float *x, *b, *hwork; magmaFloat_ptr d_B; // initialize RHS, b = A*random TESTING_MALLOC_CPU( x, float, N ); TESTING_MALLOC_CPU( b, float, M ); lapackf77_slarnv( &ione, ISEED, &N, x ); blasf77_sgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_zero, b, &ione ); // copy to GPU TESTING_MALLOC_DEV( d_B, float, M ); magma_ssetvector( M, b, 1, d_B, 1 ); if ( opts.version == 1 ) { // allocate hwork magma_sgeqrs_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, tmp, -1, &info ); lwork2 = (magma_int_t)MAGMA_S_REAL( tmp[0] ); TESTING_MALLOC_CPU( hwork, float, lwork2 ); // solve linear system magma_sgeqrs_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, hwork, lwork2, &info ); if (info != 0) { printf("magma_sgeqrs returned error %d: %s.\n", (int) info, magma_strerror( info )); } TESTING_FREE_CPU( hwork ); } #ifdef HAVE_CUBLAS else if ( opts.version == 3 ) { // allocate hwork magma_sgeqrs3_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, tmp, -1, &info ); lwork2 = (magma_int_t)MAGMA_S_REAL( tmp[0] ); TESTING_MALLOC_CPU( hwork, float, lwork2 ); // solve linear system magma_sgeqrs3_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, hwork, lwork2, &info ); if (info != 0) { printf("magma_sgeqrs3 returned error %d: %s.\n", (int) info, magma_strerror( info )); } TESTING_FREE_CPU( hwork ); } #endif else { printf( "Unknown version %d\n", (int) opts.version ); return -1; } magma_sgetvector( N, d_B, 1, x, 1 ); // compute r = Ax - b, saved in b blasf77_sgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_neg_one, b, &ione ); // compute residual |Ax - b| / (max(m,n)*|A|*|x|) float norm_x, norm_A, norm_r, work[1]; norm_A = lapackf77_slange( "F", &M, &N, h_A, &lda, work ); norm_r = lapackf77_slange( "F", &M, &ione, b, &M, work ); norm_x = lapackf77_slange( "F", &N, &ione, x, &N, work ); TESTING_FREE_CPU( x ); TESTING_FREE_CPU( b ); TESTING_FREE_DEV( d_B ); error = norm_r / (max(M,N) * norm_A * norm_x); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_sgeqrf( &M, &N, h_A, &lda, tau, h_work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_sgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } } /* ===================================================================== Print performance and error. =================================================================== */ printf("%5d %5d ", (int) M, (int) N ); if ( opts.lapack ) { printf( "%7.2f (%7.2f)", cpu_perf, cpu_time ); } else { printf(" --- ( --- )" ); } printf( " %7.2f (%7.2f) ", gpu_perf, gpu_time ); if ( opts.check == 1 ) { bool okay = (error < tol && error2 < tol); status += ! okay; printf( "%11.2e %11.2e %s\n", error, error2, (okay ? "ok" : "failed") ); } else if ( opts.check == 2 ) { if ( M >= N ) { bool okay = (error < tol); status += ! okay; printf( "%10.2e %s\n", error, (okay ? "ok" : "failed") ); } else { printf( "(error check only for M >= N)\n" ); } } else { printf( " ---\n" ); } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); if ( opts.version == 1 || opts.version == 3 ) { TESTING_FREE_DEV( dT ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_sgeqrf2_mgpu( magma_int_t num_gpus, magma_int_t m, magma_int_t n, float **dlA, magma_int_t ldda, float *tau, magma_int_t *info ) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= SGEQRF2_MGPU computes a QR factorization of a real M-by-N matrix A: A = Q * R. This is a GPU interface of the routine. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. dA (input/output) REAL array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix dA. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be dividable by 16. TAU (output) REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define dlA(dev, i, j) (dlA[dev] + (i) + (j)*(ldda)) #define hpanel(i) (hpanel + (i)) // set to NULL to make cleanup easy: free(NULL) does nothing. float *dwork[MagmaMaxGPUs]={NULL}, *dpanel[MagmaMaxGPUs]={NULL}; float *hwork=NULL, *hpanel=NULL; magma_queue_t stream[MagmaMaxGPUs][2]={{NULL}}; magma_event_t panel_event[MagmaMaxGPUs]={NULL}; magma_int_t i, j, min_mn, dev, ldhpanel, lddwork, rows; magma_int_t ib, nb; magma_int_t lhwork, lwork; magma_int_t panel_dev, i_local, i_nb_local, n_local[MagmaMaxGPUs], la_dev, dpanel_offset; magma_queue_t cqueue; magmablasGetKernelStream( &cqueue ); magma_device_t cdevice; magma_getdevice( &cdevice ); *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } min_mn = min(m,n); if (min_mn == 0) return *info; nb = magma_get_sgeqrf_nb( m ); /* dwork is (n*nb) --- for T (nb*nb) and slarfb work ((n-nb)*nb) --- * + dpanel (ldda*nb), on each GPU. * I think slarfb work could be smaller, max(n_local[:]). * Oddly, T and slarfb work get stacked on top of each other, both with lddwork=n. * on GPU that owns panel, set dpanel = dlA(dev,i,i_local). * on other GPUs, set dpanel = dwork[dev] + dpanel_offset. */ lddwork = n; dpanel_offset = lddwork*nb; for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); if ( MAGMA_SUCCESS != magma_smalloc( &(dwork[dev]), (lddwork + ldda)*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto CLEANUP; } } /* hwork is MAX( workspace for sgeqrf (n*nb), two copies of T (2*nb*nb) ) * + hpanel (m*nb). * for last block, need 2*n*nb total. */ ldhpanel = m; lhwork = max( n*nb, 2*nb*nb ); lwork = max( lhwork + ldhpanel*nb, 2*n*nb ); if ( MAGMA_SUCCESS != magma_smalloc_pinned( &hwork, lwork )) { *info = MAGMA_ERR_HOST_ALLOC; goto CLEANUP; } hpanel = hwork + lhwork; /* Set the number of local n for each GPU */ for( dev=0; dev < num_gpus; dev++ ) { n_local[dev] = ((n/nb)/num_gpus)*nb; if (dev < (n/nb) % num_gpus) n_local[dev] += nb; else if (dev == (n/nb) % num_gpus) n_local[dev] += n % nb; } for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magma_queue_create( &stream[dev][0] ); magma_queue_create( &stream[dev][1] ); magma_event_create( &panel_event[dev] ); } if ( nb < min_mn ) { /* Use blocked code initially */ // Note: as written, ib cannot be < nb. for( i = 0; i < min_mn-nb; i += nb ) { /* Set the GPU number that holds the current panel */ panel_dev = (i/nb) % num_gpus; /* Set the local index where the current panel is (j==i) */ i_local = i/(nb*num_gpus)*nb; ib = min(min_mn-i, nb); rows = m-i; /* Send current panel to the CPU, after panel_event indicates it has been updated */ magma_setdevice( panel_dev ); magma_queue_wait_event( stream[panel_dev][1], panel_event[panel_dev] ); magma_sgetmatrix_async( rows, ib, dlA(panel_dev, i, i_local), ldda, hpanel(i), ldhpanel, stream[panel_dev][1] ); magma_queue_sync( stream[panel_dev][1] ); // Factor panel lapackf77_sgeqrf( &rows, &ib, hpanel(i), &ldhpanel, tau+i, hwork, &lhwork, info ); if ( *info != 0 ) { fprintf( stderr, "error %d\n", (int) *info ); } // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . H(i+ib-1) lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, hpanel(i), &ldhpanel, tau+i, hwork, &ib ); spanel_to_q( MagmaUpper, ib, hpanel(i), ldhpanel, hwork + ib*ib ); // Send the current panel back to the GPUs for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); if (dev == panel_dev) dpanel[dev] = dlA(dev, i, i_local); else dpanel[dev] = dwork[dev] + dpanel_offset; magma_ssetmatrix_async( rows, ib, hpanel(i), ldhpanel, dpanel[dev], ldda, stream[dev][0] ); } for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magma_queue_sync( stream[dev][0] ); } // TODO: if spanel_to_q copied whole block, wouldn't need to restore // -- just send the copy to the GPUs. // TODO: also, could zero out the lower triangle and use Azzam's larfb w/ gemm. /* Restore the panel */ sq_to_panel( MagmaUpper, ib, hpanel(i), ldhpanel, hwork + ib*ib ); if (i + ib < n) { /* Send the T matrix to the GPU. */ for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magma_ssetmatrix_async( ib, ib, hwork, ib, dwork[dev], lddwork, stream[dev][0] ); } la_dev = (panel_dev+1) % num_gpus; for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magmablasSetKernelStream( stream[dev][0] ); if (dev == la_dev && i+nb < min_mn-nb) { // If not last panel, // for look-ahead panel, apply H' to A(i:m,i+ib:i+2*ib) i_nb_local = (i+nb)/(nb*num_gpus)*nb; magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local), ldda, // C dwork[dev]+ib, lddwork ); // work magma_event_record( panel_event[dev], stream[dev][0] ); // for trailing matrix, apply H' to A(i:m,i+2*ib:n) magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, n_local[dev]-(i_nb_local+ib), ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local+ib), ldda, // C dwork[dev]+ib, lddwork ); // work } else { // for trailing matrix, apply H' to A(i:m,i+ib:n) i_nb_local = i_local; if (dev <= panel_dev) { i_nb_local += ib; } magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, n_local[dev]-i_nb_local, ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local), ldda, // C dwork[dev]+ib, lddwork ); // work } } // Restore top of panel (after larfb is done) magma_setdevice( panel_dev ); magma_ssetmatrix_async( ib, ib, hpanel(i), ldhpanel, dlA(panel_dev, i, i_local), ldda, stream[panel_dev][0] ); } } } else { i = 0; } /* Use unblocked code to factor the last or only block row. */ if (i < min_mn) { rows = m-i; for( j=i; j < n; j += nb ) { panel_dev = (j/nb) % num_gpus; i_local = j/(nb*num_gpus)*nb; ib = min( n-j, nb ); magma_setdevice( panel_dev ); magma_sgetmatrix( rows, ib, dlA(panel_dev, i, i_local), ldda, hwork + (j-i)*rows, rows ); } // needs lwork >= 2*n*nb: // needs (m-i)*(n-i) for last block row, bounded by nb*n. // needs (n-i)*nb for sgeqrf work, bounded by n*nb. ib = n-i; // total columns in block row lhwork = lwork - ib*rows; lapackf77_sgeqrf( &rows, &ib, hwork, &rows, tau+i, hwork + ib*rows, &lhwork, info ); if ( *info != 0 ) { fprintf( stderr, "error %d\n", (int) *info ); } for( j=i; j < n; j += nb ) { panel_dev = (j/nb) % num_gpus; i_local = j/(nb*num_gpus)*nb; ib = min( n-j, nb ); magma_setdevice( panel_dev ); magma_ssetmatrix( rows, ib, hwork + (j-i)*rows, rows, dlA(panel_dev, i, i_local), ldda ); } } CLEANUP: // free(NULL) does nothing. // check that queues and events are non-zero before destroying them, though. for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); if ( stream[dev][0] ) { magma_queue_destroy( stream[dev][0] ); } if ( stream[dev][1] ) { magma_queue_destroy( stream[dev][1] ); } if ( panel_event[dev] ) { magma_event_destroy( panel_event[dev] ); } magma_free( dwork[dev] ); } magma_free_pinned( hwork ); magma_setdevice( cdevice ); magmablasSetKernelStream( cqueue ); return *info; } /* magma_sgeqrf2_mgpu */
/** Purpose ------- SGEQRF computes a QR factorization of a real M-by-N matrix A: A = Q * R. This version has LAPACK-complaint arguments. If the current stream is NULL, this version replaces it with a new stream to overlap computation with communication. Other versions (magma_sgeqrf_gpu and magma_sgeqrf3_gpu) store the intermediate T matrices. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA REAL array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] tau REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_sgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_sgeqrf2_gpu( magma_int_t m, magma_int_t n, magmaFloat_ptr dA, magma_int_t ldda, float *tau, magma_int_t *info ) { #define dA(a_1,a_2) ( dA+(a_2)*(ldda) + (a_1)) #define work_ref(a_1) ( work + (a_1)) #define hwork ( work + (nb)*(m)) magmaFloat_ptr dwork; float *work; magma_int_t i, k, ldwork, lddwork, old_i, old_ib, rows; magma_int_t nbmin, nx, ib, nb; magma_int_t lhwork, lwork; /* Function Body */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = min(m,n); if (k == 0) return *info; nb = magma_get_sgeqrf_nb(m); lwork = (m+n) * nb; lhwork = lwork - (m)*nb; if (MAGMA_SUCCESS != magma_smalloc( &dwork, n*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } if (MAGMA_SUCCESS != magma_smalloc_pinned( &work, lwork )) { magma_free( dwork ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* Define user stream if current stream is NULL */ magma_queue_t stream[2]; magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); magma_queue_create( &stream[0] ); if (orig_stream == NULL) { magma_queue_create( &stream[1] ); magmablasSetKernelStream(stream[1]); } else { stream[1] = orig_stream; } nbmin = 2; nx = nb; ldwork = m; lddwork= n; if (nb >= nbmin && nb < k && nx < k) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; i < k-nx; i += nb) { ib = min(k-i, nb); rows = m -i; /* download i-th panel */ magma_queue_sync( stream[1] ); magma_sgetmatrix_async( rows, ib, dA(i,i), ldda, work_ref(i), ldwork, stream[0] ); if (i > 0) { /* Apply H' to A(i:m,i+2*ib:n) from the left */ magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, n-old_i-2*old_ib, old_ib, dA(old_i, old_i ), ldda, dwork, lddwork, dA(old_i, old_i+2*old_ib), ldda, dwork+old_ib, lddwork); magma_ssetmatrix_async( old_ib, old_ib, work_ref(old_i), ldwork, dA(old_i, old_i), ldda, stream[1] ); } magma_queue_sync( stream[0] ); lapackf77_sgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &ib); spanel_to_q( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib ); /* download the i-th V matrix */ magma_ssetmatrix_async( rows, ib, work_ref(i), ldwork, dA(i,i), ldda, stream[0] ); /* download the T matrix */ magma_queue_sync( stream[1] ); magma_ssetmatrix_async( ib, ib, hwork, ib, dwork, lddwork, stream[0] ); magma_queue_sync( stream[0] ); if (i + ib < n) { if (i+nb < k-nx) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dwork, lddwork, dA(i, i+ib), ldda, dwork+ib, lddwork); sq_to_panel( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib ); } else { magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n-i-ib, ib, dA(i, i ), ldda, dwork, lddwork, dA(i, i+ib), ldda, dwork+ib, lddwork); sq_to_panel( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib ); magma_ssetmatrix_async( ib, ib, work_ref(i), ldwork, dA(i,i), ldda, stream[1] ); } old_i = i; old_ib = ib; } } } else { i = 0; } magma_free( dwork ); /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; magma_sgetmatrix_async( rows, ib, dA(i, i), ldda, work, rows, stream[1] ); magma_queue_sync( stream[1] ); lhwork = lwork - rows*ib; lapackf77_sgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info); magma_ssetmatrix_async( rows, ib, work, rows, dA(i, i), ldda, stream[1] ); } magma_free_pinned( work ); magma_queue_destroy( stream[0] ); if (orig_stream == NULL) { magma_queue_destroy( stream[1] ); } magmablasSetKernelStream( orig_stream ); return *info; } /* magma_sgeqrf2_gpu */
/** Purpose ------- SGEQRF4 computes a QR factorization of a REAL M-by-N matrix A: A = Q * R using multiple GPUs. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. Arguments --------- @param[in] num_gpus INTEGER The number of GPUs to be used for the factorization. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] tau REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. \n Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= N*NB, where NB can be obtained through magma_get_sgeqrf_nb(M). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_sgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_sgeqrf4(magma_int_t num_gpus, magma_int_t m, magma_int_t n, float *A, magma_int_t lda, float *tau, float *work, magma_int_t lwork, magma_int_t *info ) { float *da[MagmaMaxGPUs]; float c_one = MAGMA_S_ONE; int i, k, ldda; *info = 0; int nb = magma_get_sgeqrf_nb(min(m, n)); int lwkopt = n * nb; work[0] = MAGMA_S_MAKE( (float)lwkopt, 0 ); int lquery = (lwork == -1); if (num_gpus < 0 || num_gpus > MagmaMaxGPUs) { *info = -1; } else if (m < 0) { *info = -2; } else if (n < 0) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } else if (lwork < max(1,n) && ! lquery) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { work[0] = c_one; return *info; } ldda = ((m+31)/32)*32; magma_int_t n_local[MagmaMaxGPUs]; for (i=0; i < num_gpus; i++) { n_local[i] = ((n/nb)/num_gpus)*nb; if (i < (n/nb)%num_gpus) n_local[i] += nb; else if (i == (n/nb)%num_gpus) n_local[i] += n%nb; magma_setdevice(i); // TODO on failure, free previously allocated memory if (MAGMA_SUCCESS != magma_smalloc( &da[i], ldda*n_local[i] )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } if (m > nb && n > nb) { /* Copy the matrix to the GPUs in 1D block cyclic distribution */ magma_ssetmatrix_1D_col_bcyclic(m, n, A, lda, da, ldda, num_gpus, nb); /* Factor using the GPU interface */ magma_sgeqrf2_mgpu( num_gpus, m, n, da, ldda, tau, info); /* Copy the matrix back from the GPUs to the CPU */ magma_sgetmatrix_1D_col_bcyclic(m, n, da, ldda, A, lda, num_gpus, nb); } else { lapackf77_sgeqrf(&m, &n, A, &lda, tau, work, &lwork, info); } /* Free the allocated GPU memory */ for (i=0; i < num_gpus; i++) { magma_setdevice(i); magma_free( da[i] ); } return *info; } /* magma_sgeqrf4 */
/** Purpose ------- SGEQRF computes a QR factorization of a REAL M-by-N matrix A: A = Q * R using multiple GPUs. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] tau REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. \n Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= N*NB, where NB can be obtained through magma_get_sgeqrf_nb( M, N ). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_sgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_sgeqrf_m( magma_int_t ngpu, magma_int_t m, magma_int_t n, float *A, magma_int_t lda, float *tau, float *work, magma_int_t lwork, magma_int_t *info ) { float *da[MagmaMaxGPUs]; float c_one = MAGMA_S_ONE; magma_int_t i, min_mn, ldda; *info = 0; magma_int_t nb = magma_get_sgeqrf_nb( m, n ); magma_int_t lwkopt = n * nb; work[0] = magma_smake_lwork( lwkopt ); bool lquery = (lwork == -1); if (ngpu < 0 || ngpu > MagmaMaxGPUs) { *info = -1; } else if (m < 0) { *info = -2; } else if (n < 0) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } else if (lwork < max(1,n) && ! lquery) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; min_mn = min(m,n); if (min_mn == 0) { work[0] = c_one; return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); ldda = magma_roundup( m, 32 ); magma_int_t n_local[MagmaMaxGPUs]; for (i=0; i < ngpu; i++) { n_local[i] = ((n/nb)/ngpu)*nb; if (i < (n/nb)%ngpu) n_local[i] += nb; else if (i == (n/nb)%ngpu) n_local[i] += n%nb; magma_setdevice(i); // TODO on failure, free previously allocated memory if (MAGMA_SUCCESS != magma_smalloc( &da[i], ldda*n_local[i] )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } if (m > nb && n > nb) { magma_queue_t queues[MagmaMaxGPUs]; for( magma_int_t dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); magma_queue_create( dev, &queues[dev] ); } /* Copy the matrix to the GPUs in 1D block cyclic distribution */ magma_ssetmatrix_1D_col_bcyclic(m, n, A, lda, da, ldda, ngpu, nb, queues); for( magma_int_t dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); magma_queue_sync( queues[dev] ); } /* Factor using the GPU interface */ magma_sgeqrf2_mgpu( ngpu, m, n, da, ldda, tau, info); /* Copy the matrix back from the GPUs to the CPU */ magma_sgetmatrix_1D_col_bcyclic(m, n, da, ldda, A, lda, ngpu, nb, queues); for( magma_int_t dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); magma_queue_sync( queues[dev] ); magma_queue_destroy( queues[dev] ); } } else { lapackf77_sgeqrf(&m, &n, A, &lda, tau, work, &lwork, info); } /* Free the allocated GPU memory */ for (i=0; i < ngpu; i++) { magma_setdevice(i); magma_free( da[i] ); } magma_setdevice( orig_dev ); return *info; } /* magma_sgeqrf4 */
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgeqrf */ int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float error, work[1]; float c_neg_one = MAGMA_S_NEG_ONE; float *h_A, *h_R, *tau, *h_work, tmp[1]; magma_int_t M, N, n2, lda, lwork, info, min_mn, nb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}, ISEED2[4]; /* Initialize */ magma_queue_t queue[2]; magma_device_t devices[MagmaMaxGPUs]; int num = 0; magma_err_t err; magma_init(); magma_opts opts; parse_opts( argc, argv, &opts ); magma_int_t status = 0; float tol, eps = lapackf77_slamch("E"); tol = opts.tolerance * eps; opts.lapack |= ( opts.check == 2 ); // check (-c2) implies lapack (-l) err = magma_get_devices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } // Create two queues on device opts.device err = magma_queue_create( devices[opts.device], &queue[0] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( devices[opts.device], &queue[1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } printf("ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 1 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R-Q'A||_1 / (M*||A||_1) ||I-Q'Q||_1 / M\n"); printf("===============================================================================================\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("=======================================================================\n"); } for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; min_mn = min(M, N); lda = M; n2 = lda*N; nb = magma_get_sgeqrf_nb(M); gflops = FLOPS_SGEQRF( M, N ) / 1e9; lwork = -1; lapackf77_sgeqrf(&M, &N, h_A, &M, tau, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_S_REAL( tmp[0] ); lwork = max( lwork, max( N*nb, 2*nb*nb )); TESTING_MALLOC_CPU( tau, float, min_mn ); TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_PIN( h_R, float, n2 ); TESTING_MALLOC_CPU( h_work, float, lwork ); /* Initialize the matrix */ for ( int j=0; j<4; j++ ) ISEED2[j] = ISEED[j]; // saving seeds lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_sgeqrf(M, N, h_R, lda, tau, h_work, lwork, &info, queue); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_sgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ float *tau; TESTING_MALLOC_CPU( tau, float, min_mn ); cpu_time = magma_wtime(); lapackf77_sgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_sgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE_CPU( tau ); } if ( opts.check == 1 ) { /* ===================================================================== Check the result =================================================================== */ magma_int_t lwork = n2+N; float *h_W1, *h_W2, *h_W3; float *h_RW, results[2]; TESTING_MALLOC_CPU( h_W1, float, n2 ); // Q TESTING_MALLOC_CPU( h_W2, float, n2 ); // R TESTING_MALLOC_CPU( h_W3, float, lwork ); // WORK TESTING_MALLOC_CPU( h_RW, float, M ); // RWORK lapackf77_slarnv( &ione, ISEED2, &n2, h_A ); lapackf77_sqrt02( &M, &N, &min_mn, h_A, h_R, h_W1, h_W2, &lda, tau, h_W3, &lwork, h_RW, results ); results[0] *= eps; results[1] *= eps; if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, results[0],results[1] ); printf("%s\n", (results[0] < tol ? "" : " failed")); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, gpu_perf, gpu_time, results[0],results[1] ); printf("%s\n", (results[0] < tol ? "" : " failed")); } status |= ! (results[0] < tol); TESTING_FREE_CPU( h_W1 ); TESTING_FREE_CPU( h_W2 ); TESTING_FREE_CPU( h_W3 ); TESTING_FREE_CPU( h_RW ); } else if ( opts.check == 2 ) { /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = lapackf77_slange("f", &M, &N, h_A, &lda, work); blasf77_saxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_slange("f", &M, &N, h_R, &lda, work) / error; if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e", (int) M, (int) N, gpu_perf, gpu_time, error ); } printf("%s\n", (error < tol ? "" : " failed")); status |= ! (error < tol); } else { if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) ---\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, gpu_time); } } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); } if ( opts.niter > 1 ) { printf( "\n" ); } } magma_queue_destroy( queue[0] ); magma_queue_destroy( queue[1] ); magma_finalize(); return status; }
/** Purpose ------- SGEQP3 computes a QR factorization with column pivoting of a matrix A: A*P = Q*R using Level 3 BLAS. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the upper triangle of the array contains the min(M,N)-by-N upper trapezoidal matrix R; the elements below the diagonal, together with the array TAU, represent the unitary matrix Q as a product of min(M,N) elementary reflectors. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[in,out] jpvt INTEGER array, dimension (N) On entry, if JPVT(J).ne.0, the J-th column of A is permuted to the front of A*P (a leading column); if JPVT(J)=0, the J-th column of A is a free column. On exit, if JPVT(J)=K, then the J-th column of A*P was the the K-th column of A. @param[out] tau REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors. @param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO=0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. For [sd]geqp3, LWORK >= (N+1)*NB + 2*N; for [cz]geqp3, LWORK >= (N+1)*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param rwork (workspace, for [cz]geqp3 only) REAL array, dimension (2*N) @param[out] info INTEGER - = 0: successful exit. - < 0: if INFO = -i, the i-th argument had an illegal value. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_sgeqp3_comp ********************************************************************/ extern "C" magma_int_t magma_sgeqp3( magma_int_t m, magma_int_t n, float *A, magma_int_t lda, magma_int_t *jpvt, float *tau, float *work, magma_int_t lwork, #ifdef COMPLEX float *rwork, #endif magma_int_t *info ) { #define A(i, j) (A + (i) + (j)*(lda )) #define dA(i, j) (dwork + (i) + (j)*(ldda)) float *dwork, *df; magma_int_t ione = 1; magma_int_t n_j, ldda, ldwork; magma_int_t j, jb, na, nb, sm, sn, fjb, nfxd, minmn; magma_int_t topbmn, sminmn, lwkopt=0, lquery; *info = 0; lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } nb = magma_get_sgeqp3_nb(min(m, n)); minmn = min(m,n); if (*info == 0) { if (minmn == 0) { lwkopt = 1; } else { lwkopt = (n + 1)*nb; #ifdef REAL lwkopt += 2*n; #endif } work[0] = MAGMA_S_MAKE( lwkopt, 0. ); if (lwork < lwkopt && ! lquery) { *info = -8; } } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } if (minmn == 0) return *info; #ifdef REAL float *rwork = work + (n + 1)*nb; #endif ldda = ((m+31)/32)*32; ldwork = n*ldda + (n+1)*nb; if (MAGMA_SUCCESS != magma_smalloc( &dwork, ldwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } df = dwork + n*ldda; // dwork used for dA magma_queue_t stream; magma_queue_create( &stream ); /* Move initial columns up front. * Note jpvt uses 1-based indices for historical compatibility. */ nfxd = 0; for (j = 0; j < n; ++j) { if (jpvt[j] != 0) { if (j != nfxd) { blasf77_sswap(&m, A(0, j), &ione, A(0, nfxd), &ione); jpvt[j] = jpvt[nfxd]; jpvt[nfxd] = j + 1; } else { jpvt[j] = j + 1; } ++nfxd; } else { jpvt[j] = j + 1; } } /* Factorize fixed columns ======================= Compute the QR factorization of fixed columns and update remaining columns. */ if (nfxd > 0) { na = min(m,nfxd); lapackf77_sgeqrf(&m, &na, A, &lda, tau, work, &lwork, info); if (na < n) { n_j = n - na; lapackf77_sormqr( MagmaLeftStr, MagmaConjTransStr, &m, &n_j, &na, A, &lda, tau, A(0, na), &lda, work, &lwork, info ); } } /* Factorize free columns */ if (nfxd < minmn) { sm = m - nfxd; sn = n - nfxd; sminmn = minmn - nfxd; if (nb < sminmn) { j = nfxd; // Set the original matrix to the GPU magma_ssetmatrix_async( m, sn, A (0,j), lda, dA(0,j), ldda, stream ); } /* Initialize partial column norms. */ for (j = nfxd; j < n; ++j) { rwork[j] = magma_cblas_snrm2( sm, A(nfxd,j), ione ); rwork[n + j] = rwork[j]; } j = nfxd; if (nb < sminmn) { /* Use blocked code initially. */ magma_queue_sync( stream ); /* Compute factorization: while loop. */ topbmn = minmn - nb; while(j < topbmn) { jb = min(nb, topbmn - j); /* Factorize JB columns among columns J:N. */ n_j = n - j; if (j > nfxd) { // Get panel to the CPU magma_sgetmatrix( m-j, jb, dA(j,j), ldda, A (j,j), lda ); // Get the rows magma_sgetmatrix( jb, n_j - jb, dA(j,j + jb), ldda, A (j,j + jb), lda ); } magma_slaqps( m, n_j, j, jb, &fjb, A (0, j), lda, dA(0, j), ldda, &jpvt[j], &tau[j], &rwork[j], &rwork[n + j], work, &work[jb], n_j, &df[jb], n_j ); j += fjb; /* fjb is actual number of columns factored */ } } /* Use unblocked code to factor the last or only block. */ if (j < minmn) { n_j = n - j; if (j > nfxd) { magma_sgetmatrix( m-j, n_j, dA(j,j), ldda, A (j,j), lda ); } lapackf77_slaqp2(&m, &n_j, &j, A(0, j), &lda, &jpvt[j], &tau[j], &rwork[j], &rwork[n+j], work ); } } work[0] = MAGMA_S_MAKE( lwkopt, 0. ); magma_free( dwork ); magma_queue_destroy( stream ); return *info; } /* magma_sgeqp3 */
/** Purpose ------- SGEQRF computes a QR factorization of a real M-by-N matrix A: A = Q * R. This version stores the triangular dT matrices used in the block QR factorization so that they can be applied directly (i.e., without being recomputed) later. As a result, the application of Q is much faster. Also, the upper triangular matrices for V have 0s in them. The corresponding parts of the upper triangular R are inverted and stored separately in dT. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA REAL array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] tau REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] dT (workspace) REAL array on the GPU, dimension (2*MIN(M, N) + (N+31)/32*32 )*NB, where NB can be obtained through magma_get_sgeqrf_nb(M). It starts with MIN(M,N)*NB block that store the triangular T matrices, followed by the MIN(M,N)*NB block of the diagonal inverses for the R matrix. The rest of the array is used as workspace. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_sgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_sgeqrf_gpu( magma_int_t m, magma_int_t n, magmaFloat_ptr dA, magma_int_t ldda, float *tau, magmaFloat_ptr dT, magma_int_t *info ) { #define dA(a_1,a_2) (dA + (a_2)*(ldda) + (a_1)) #define dT(a_1) (dT + (a_1)*nb) #define d_ref(a_1) (dT + ( minmn+(a_1))*nb) #define dd_ref(a_1) (dT + (2*minmn+(a_1))*nb) #define work(a_1) (work + (a_1)) #define hwork (work + (nb)*(m)) magma_int_t i, k, minmn, old_i, old_ib, rows, cols; magma_int_t ib, nb; magma_int_t ldwork, lddwork, lwork, lhwork; float *work, *ut; /* check arguments */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = minmn = min(m,n); if (k == 0) return *info; nb = magma_get_sgeqrf_nb(m); lwork = (m + n + nb)*nb; lhwork = lwork - m*nb; if (MAGMA_SUCCESS != magma_smalloc_pinned( &work, lwork )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } ut = hwork+nb*(n); memset( ut, 0, nb*nb*sizeof(float)); magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); ldwork = m; lddwork= n; if ( (nb > 1) && (nb < k) ) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; i < k-nb; i += nb) { ib = min(k-i, nb); rows = m -i; magma_sgetmatrix_async( rows, ib, dA(i,i), ldda, work(i), ldwork, stream[1] ); if (i > 0) { /* Apply H' to A(i:m,i+2*ib:n) from the left */ cols = n-old_i-2*old_ib; magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, cols, old_ib, dA(old_i, old_i ), ldda, dT(old_i), nb, dA(old_i, old_i+2*old_ib), ldda, dd_ref(0), lddwork); /* store the diagonal */ magma_ssetmatrix_async( old_ib, old_ib, ut, old_ib, d_ref(old_i), old_ib, stream[0] ); } magma_queue_sync( stream[1] ); lapackf77_sgeqrf(&rows, &ib, work(i), &ldwork, tau+i, hwork, &lhwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, work(i), &ldwork, tau+i, hwork, &ib); /* Put 0s in the upper triangular part of a panel (and 1s on the diagonal); copy the upper triangular in ut and invert it. */ magma_queue_sync( stream[0] ); ssplit_diag_block(ib, work(i), ldwork, ut); magma_ssetmatrix( rows, ib, work(i), ldwork, dA(i,i), ldda ); if (i + ib < n) { /* Send the triangular factor T to the GPU */ magma_ssetmatrix( ib, ib, hwork, ib, dT(i), nb ); if (i+nb < k-nb) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dT(i), nb, dA(i, i+ib), ldda, dd_ref(0), lddwork); } else { cols = n-i-ib; magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, cols, ib, dA(i, i ), ldda, dT(i), nb, dA(i, i+ib), ldda, dd_ref(0), lddwork); /* Fix the diagonal block */ magma_ssetmatrix( ib, ib, ut, ib, d_ref(i), ib ); } old_i = i; old_ib = ib; } } } else { i = 0; } /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; magma_sgetmatrix( rows, ib, dA(i, i), ldda, work, rows ); lhwork = lwork - rows*ib; lapackf77_sgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info); magma_ssetmatrix( rows, ib, work, rows, dA(i, i), ldda ); } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free_pinned( work ); return *info; } /* magma_sgeqrf_gpu */
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgeqrs_gpu */ int main( int argc, char** argv) { //#if defined(PRECISION_s) /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float matnorm, work[1]; float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; float *h_A, *h_A2, *h_B, *h_X, *h_R, *tau, *hwork, tmp[1]; magmaFloat_ptr d_A, d_B; /* Matrix size */ magma_int_t M = 0, N = 0, n2; magma_int_t lda, ldb, ldda, lddb, lworkgpu, lhwork; magma_int_t size[7] = {1024,2048,3072,4032,5184,6016,7000}; magma_int_t i, info, min_mn, nb, l1, l2; magma_int_t ione = 1; magma_int_t nrhs = 3; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-nrhs", argv[i])==0) nrhs = atoi(argv[++i]); } if (N>0 && M>0 && M >= N) printf(" testing_sgeqrs_gpu -nrhs %d -M %d -N %d\n\n", nrhs, M, N); else { printf("\nUsage: \n"); printf(" testing_sgeqrs_gpu -nrhs %d -M %d -N %d\n\n", nrhs, M, N); printf(" M has to be >= N, exit.\n"); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_sgeqrs_gpu -nrhs %d -M %d -N %d\n\n", nrhs, 1024, 1024); M = N = size[6]; } ldda = ((M+31)/32)*32; lddb = ldda; n2 = M * N; min_mn = min(M, N); nb = magma_get_sgeqrf_nb(M); lda = ldb = M; lworkgpu = (M-N + nb)*(nrhs+2*nb); /* Allocate host memory for the matrix */ TESTING_MALLOC_PIN( tau, float, min_mn ); TESTING_MALLOC_PIN( h_A, float, lda*N ); TESTING_MALLOC_PIN( h_A2, float, lda*N ); TESTING_MALLOC_PIN( h_B, float, ldb*nrhs ); TESTING_MALLOC_PIN( h_X, float, ldb*nrhs ); TESTING_MALLOC_PIN( h_R, float, ldb*nrhs ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); TESTING_MALLOC_DEV( d_B, float, lddb*nrhs ); /* * Get size for host workspace */ lhwork = -1; lapackf77_sgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info); l1 = (magma_int_t)MAGMA_S_REAL( tmp[0] ); lhwork = -1; lapackf77_sormqr( MagmaLeftStr, MagmaTransStr, &M, &nrhs, &min_mn, h_A, &lda, tau, h_X, &ldb, tmp, &lhwork, &info); l2 = (magma_int_t)MAGMA_S_REAL( tmp[0] ); lhwork = max( max( l1, l2 ), lworkgpu ); TESTING_MALLOC_PIN( hwork, float, lhwork ); printf("\n"); printf(" ||b-Ax|| / (N||A||)\n"); printf(" M N CPU GFlop/s GPU GFlop/s CPU GPU \n"); printf("============================================================\n"); for(i=0; i<7; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); ldb = lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = (FLOPS_GEQRF( (float)M, (float)N ) + FLOPS_GEQRS( (float)M, (float)N, (float)nrhs )) / 1e9; /* Initialize the matrices */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_A2, &lda ); n2 = M*nrhs; lapackf77_slarnv( &ione, ISEED, &n2, h_B ); lapackf77_slacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ /* Warm up to measure the performance */ magma_ssetmatrix( M, N, h_A, 0, lda, d_A, 0, ldda, queue ); magma_ssetmatrix( M, nrhs, h_B, 0, ldb, d_B, 0, lddb, queue ); magma_sgels_gpu( MagmaNoTrans, M, N, nrhs, d_A, 0, ldda, d_B, 0, lddb, hwork, lworkgpu, &info, queue); magma_ssetmatrix( M, N, h_A, 0, lda, d_A, 0, ldda, queue ); magma_ssetmatrix( M, nrhs, h_B, 0, ldb, d_B, 0, lddb, queue ); gpu_time = magma_wtime(); magma_sgels_gpu( MagmaNoTrans, M, N, nrhs, d_A, 0, ldda, d_B, 0, lddb, hwork, lworkgpu, &info, queue); gpu_time = magma_wtime() - gpu_time; if (info < 0) printf("Argument %d of magma_sgels had an illegal value.\n", -info); gpu_perf = gflops / gpu_time; // Get the solution in h_X magma_sgetmatrix( N, nrhs, d_B, 0, lddb, h_X, 0, ldb, queue ); // compute the residual blasf77_sgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A, &lda, h_X, &ldb, &c_one, h_R, &ldb); matnorm = lapackf77_slange("f", &M, &N, h_A, &lda, work); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_slacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_sgels( MagmaNoTransStr, &M, &N, &nrhs, h_A, &lda, h_X, &ldb, hwork, &lhwork, &info); cpu_time = magma_wtime()-cpu_time; cpu_perf = gflops / cpu_time; if (info < 0) printf("Argument %d of lapackf77_sgels had an illegal value.\n", -info); blasf77_sgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A2, &lda, h_X, &ldb, &c_one, h_B, &ldb); printf("%5d %5d %6.1f %6.1f %7.2e %7.2e\n", M, N, cpu_perf, gpu_perf, lapackf77_slange("f", &M, &nrhs, h_B, &M, work)/(min_mn*matnorm), lapackf77_slange("f", &M, &nrhs, h_R, &M, work)/(min_mn*matnorm) ); if (argc != 1) break; } /* Memory clean up */ TESTING_FREE_PIN( tau ); TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_A2 ); TESTING_FREE_PIN( h_B ); TESTING_FREE_PIN( h_X ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( hwork ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); /* Shutdown */ magma_queue_destroy( queue ); magma_finalize(); }
extern "C" magma_int_t magma_sgeqrf_msub( magma_int_t num_subs, magma_int_t num_gpus, magma_int_t m, magma_int_t n, magmaFloat_ptr *dlA, magma_int_t ldda, float *tau, magma_queue_t *queues, magma_int_t *info) { /* -- clMAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= SGEQRF2_MGPU computes a QR factorization of a real M-by-N matrix A: A = Q * R. This is a GPU interface of the routine. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. dA (input/output) REAL array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix dA. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be divisible by 16. TAU (output) REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define dlA(gpu,a_1,a_2) dlA[gpu], ((a_2)*(ldda) + (a_1)) #define dlA_offset(a_1, a_2) ((a_2)*(ldda) + (a_1)) #define work_ref(a_1) ( work + (a_1)) #define hwork ( work + (nb)*(m)) #define hwrk(a_1) ( local_work + (a_1)) #define lhwrk ( local_work + (nb)*(m)) magmaFloat_ptr dwork[MagmaMaxGPUs], panel[MagmaMaxGPUs]; size_t panel_offset[MagmaMaxGPUs]; float *local_work = NULL; magma_int_t i, j, k, ldwork, lddwork, old_i, old_ib, rows; magma_int_t nbmin, nx, ib, nb; magma_int_t lhwork, lwork; int panel_id = -1, i_local, n_local[MagmaMaxGPUs * MagmaMaxSubs], la_id, displacement, tot_subs = num_gpus * num_subs; *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = min(m,n); if (k == 0) return *info; nb = magma_get_sgeqrf_nb(m); displacement = n * nb; lwork = (m+n+64) * nb; lhwork = lwork - (m)*nb; for (i=0; i<num_gpus; i++) { if (MAGMA_SUCCESS != magma_smalloc( &(dwork[i]), (n + ldda)*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } /* Set the number of local n for each GPU */ for (i=0; i<tot_subs; i++) { n_local[i] = ((n/nb)/tot_subs)*nb; if (i < (n/nb)%tot_subs) n_local[i] += nb; else if (i == (n/nb)%tot_subs) n_local[i] += n%nb; } #ifdef USE_PINNED_CLMEMORY cl_mem buffer = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(float)*lwork, NULL, NULL); for (j=0; j<num_gpus; j++) { local_work = (float*)clEnqueueMapBuffer(queues[2*j], buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(float)*lwork, 0, NULL, NULL, NULL); } #else if (MAGMA_SUCCESS != magma_smalloc_cpu( (&local_work), lwork )) { *info = -9; for (i=0; i<num_gpus; i++) { magma_free( dwork[i] ); } *info = MAGMA_ERR_HOST_ALLOC; return *info; } #endif nbmin = 2; nx = nb; ldwork = m; lddwork= n; if (nb >= nbmin && nb < k && nx < k) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; i < k-nx; i += nb) { /* Set the GPU number that holds the current panel */ panel_id = (i/nb)%tot_subs; /* Set the local index where the current panel is */ i_local = i/(nb*tot_subs)*nb; ib = min(k-i, nb); rows = m -i; /* Send current panel to the CPU */ magma_queue_sync(queues[2*(panel_id%num_gpus)]); magma_sgetmatrix_async( rows, ib, dlA(panel_id, i, i_local), ldda, hwrk(i), ldwork, queues[2*(panel_id%num_gpus)+1], NULL ); if (i > 0) { /* Apply H' to A(i:m,i+2*ib:n) from the left; this is the look-ahead application to the trailing matrix */ la_id = panel_id; /* only the GPU that has next panel is done look-ahead */ magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, n_local[la_id]-i_local-old_ib, old_ib, panel[la_id%num_gpus], panel_offset[la_id%num_gpus], ldda, dwork[la_id%num_gpus], 0, lddwork, dlA(la_id, old_i, i_local+old_ib), ldda, dwork[la_id%num_gpus], old_ib, lddwork, queues[2*(la_id%num_gpus)]); la_id = ((i-nb)/nb)%tot_subs; magma_ssetmatrix_async( old_ib, old_ib, hwrk(old_i), ldwork, panel[la_id%num_gpus], panel_offset[la_id%num_gpus], ldda, queues[2*(la_id%num_gpus)], NULL ); } magma_queue_sync( queues[2*(panel_id%num_gpus)+1] ); lapackf77_sgeqrf(&rows, &ib, hwrk(i), &ldwork, tau+i, lhwrk, &lhwork, info); // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . H(i+ib-1) lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, hwrk(i), &ldwork, tau+i, lhwrk, &ib); spanel_to_q( MagmaUpper, ib, hwrk(i), ldwork, lhwrk+ib*ib ); // Send the current panel back to the GPUs // Has to be done with asynchronous copies for (j=0; j<num_gpus; j++) { if (j == panel_id%num_gpus){ panel[j] = dlA(panel_id, i, i_local); panel_offset[j] = dlA_offset(i, i_local); } else { panel[j] = dwork[j]; panel_offset[j] = displacement; } magma_queue_sync( queues[2*j] ); magma_ssetmatrix_async( rows, ib, hwrk(i), ldwork, panel[j], panel_offset[j], ldda, queues[2*j+1], NULL ); /* Send the T matrix to the GPU. Has to be done with asynchronous copies */ magma_ssetmatrix_async( ib, ib, lhwrk, ib, dwork[j], 0, lddwork, queues[2*j+1], NULL ); } for(j=0; j<num_gpus; j++) { magma_queue_sync( queues[2*j+1] ); } if (i + ib < n) { if (i+nb < k-nx) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left; This is update for the next panel; part of the look-ahead */ la_id = (panel_id+1)%tot_subs; int i_loc = (i+nb)/(nb*tot_subs)*nb; for (j=0; j<tot_subs; j++) { if (j == la_id) magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, panel[j%num_gpus], panel_offset[j%num_gpus], ldda, dwork[j%num_gpus], 0, lddwork, dlA(j, i, i_loc), ldda, dwork[j%num_gpus], ib, lddwork, queues[2*(j%num_gpus)]); else if (j <= panel_id) magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n_local[j]-i_local-ib, ib, panel[j%num_gpus], panel_offset[j%num_gpus], ldda, dwork[j%num_gpus], 0, lddwork, dlA(j, i, i_local+ib), ldda, dwork[j%num_gpus], ib, lddwork, queues[2*(j%num_gpus)]); else magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n_local[j]-i_local, ib, panel[j%num_gpus], panel_offset[j%num_gpus], ldda, dwork[j%num_gpus], 0, lddwork, dlA(j, i, i_local), ldda, dwork[j%num_gpus], ib, lddwork, queues[2*(j%num_gpus)]); } /* Restore the panel */ sq_to_panel( MagmaUpper, ib, hwrk(i), ldwork, lhwrk+ib*ib ); } else { /* do the entire update as we exit and there would be no lookahead */ la_id = (panel_id+1)%tot_subs; int i_loc = (i+nb)/(nb*tot_subs)*nb; magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n_local[la_id]-i_loc, ib, panel[la_id%num_gpus], panel_offset[la_id%num_gpus], ldda, dwork[la_id%num_gpus], 0, lddwork, dlA(la_id, i, i_loc), ldda, dwork[la_id%num_gpus], ib, lddwork, queues[2*(la_id%num_gpus)]); /* Restore the panel */ sq_to_panel( MagmaUpper, ib, hwrk(i), ldwork, lhwrk+ib*ib ); magma_ssetmatrix( ib, ib, hwrk(i), ldwork, dlA(panel_id, i, i_local), ldda, queues[2*(panel_id%num_gpus)]); } old_i = i; old_ib = ib; } } } else { i = 0; } for (j=0; j<num_gpus; j++) { magma_free( dwork[j] ); } /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; lhwork = lwork - rows*ib; panel_id = (panel_id+1)%tot_subs; int i_loc = (i)/(nb*tot_subs)*nb; magma_sgetmatrix( rows, ib, dlA(panel_id, i, i_loc), ldda, lhwrk, rows, queues[2*(panel_id%num_gpus)]); lhwork = lwork - rows*ib; lapackf77_sgeqrf(&rows, &ib, lhwrk, &rows, tau+i, lhwrk+ib*rows, &lhwork, info); magma_ssetmatrix( rows, ib, lhwrk, rows, dlA(panel_id, i, i_loc), ldda, queues[2*(panel_id%num_gpus)]); } #ifdef USE_PINNED_CLMEMORY #else magma_free_cpu( local_work ); #endif return *info; } /* magma_sgeqrf_msub */
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgegqr */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float e1, e2, e3, e4, e5, *work; float c_neg_one = MAGMA_S_NEG_ONE; float c_one = MAGMA_S_ONE; float c_zero = MAGMA_S_ZERO; float *h_A, *h_R, *tau, *dtau, *h_work, *h_rwork, tmp[1]; float *d_A, *dwork; magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn; magma_int_t ione = 1, ldwork; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) // versions 1...4 are valid if (opts.version < 1 || opts.version > 4) { printf("Unknown version %d; exiting\n", opts.version ); return -1; } float tol, eps = lapackf77_slamch("E"); tol = 10* opts.tolerance * eps; printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) ||I-Q'Q||_F / M ||I-Q'Q||_I / M ||A-Q R||_I\n"); printf(" MAGMA / LAPACK MAGMA / LAPACK\n"); printf("==========================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; if (N > 128) { printf("%5d %5d skipping because sgegqr requires N <= 128\n", (int) M, (int) N); continue; } if (M < N) { printf("%5d %5d skipping because sgegqr requires M >= N\n", (int) M, (int) N); continue; } min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_SGEQRF( M, N ) / 1e9 + FLOPS_SORGQR( M, N, N ) / 1e9; // query for workspace size lwork = -1; lapackf77_sgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_S_REAL( tmp[0] ); lwork = max(lwork, 3*N*N); ldwork = N*N; if (opts.version == 2) { ldwork = 3*N*N + min_mn; } TESTING_MALLOC_PIN( tau, float, min_mn ); TESTING_MALLOC_PIN( h_work, float, lwork ); TESTING_MALLOC_PIN(h_rwork, float, lwork ); TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_CPU( h_R, float, n2 ); TESTING_MALLOC_CPU( work, float, M ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); TESTING_MALLOC_DEV( dtau, float, min_mn ); TESTING_MALLOC_DEV( dwork, float, ldwork ); /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_ssetmatrix( M, N, h_R, lda, d_A, ldda ); // warmup magma_sgegqr_gpu( 1, M, N, d_A, ldda, dwork, h_work, &info ); magma_ssetmatrix( M, N, h_R, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_sync_wtime( 0 ); magma_sgegqr_gpu( opts.version, M, N, d_A, ldda, dwork, h_rwork, &info ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_sgegqr returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_sgetmatrix( M, N, d_A, ldda, h_R, M ); // Regenerate R // blasf77_sgemm("t", "n", &N, &N, &M, &c_one, h_R, &M, h_A, &M, &c_zero, h_rwork, &N); // magma_sprint(N, N, h_work, N); blasf77_strmm("r", "u", "n", "n", &M, &N, &c_one, h_rwork, &N, h_R, &M); blasf77_saxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); e5 = lapackf77_slange("i", &M, &N, h_R, &M, work) / lapackf77_slange("i", &M, &N, h_A, &lda, work); magma_sgetmatrix( M, N, d_A, ldda, h_R, M ); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); /* Orthogonalize on the CPU */ lapackf77_sgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); lapackf77_sorgqr(&M, &N, &N, h_A, &lda, tau, h_work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_sorgqr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ blasf77_sgemm("t", "n", &N, &N, &M, &c_one, h_R, &M, h_R, &M, &c_zero, h_work, &N); for(int ii = 0; ii < N*N; ii += N+1 ) { h_work[ii] = MAGMA_S_SUB(h_work[ii], c_one); } e1 = lapackf77_slange("f", &N, &N, h_work, &N, work) / N; e3 = lapackf77_slange("i", &N, &N, h_work, &N, work) / N; blasf77_sgemm("t", "n", &N, &N, &M, &c_one, h_A, &M, h_A, &M, &c_zero, h_work, &N); for(int ii = 0; ii < N*N; ii += N+1 ) { h_work[ii] = MAGMA_S_SUB(h_work[ii], c_one); } e2 = lapackf77_slange("f", &N, &N, h_work, &N, work) / N; e4 = lapackf77_slange("i", &N, &N, h_work, &N, work) / N; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e / %8.2e %8.2e / %8.2e %8.2e %s\n", (int) M, (int) N, cpu_perf, 1000.*cpu_time, gpu_perf, 1000.*gpu_time, e1, e2, e3, e4, e5, (e1 < tol ? "ok" : "failed")); status += ! (e1 < tol); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, 1000.*gpu_time ); } TESTING_FREE_PIN( tau ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( h_rwork ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_R ); TESTING_FREE_CPU( work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( dtau ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_sgeqrf2_gpu( magma_int_t m, magma_int_t n, magmaFloat_ptr dA, size_t dA_offset, magma_int_t ldda, float *tau, magma_queue_t* queue, magma_int_t *info) { /* -- clMAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= SGEQRF computes a QR factorization of a real M-by-N matrix A: A = Q * R. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. dA (input/output) REAL array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be divisible by 16. TAU (output) REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define dA(a_1,a_2) dA, (dA_offset + (a_1) + (a_2)*(ldda)) #define work(a_1) ( work + (a_1)) #define hwork ( work + (nb)*(m)) magmaFloat_ptr dwork; float *work; magma_int_t i, k, ldwork, lddwork, old_i, old_ib, rows; magma_int_t nbmin, nx, ib, nb; magma_int_t lhwork, lwork; *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = min(m,n); if (k == 0) return *info; nb = magma_get_sgeqrf_nb(m); lwork = (m+n) * nb; lhwork = lwork - (m)*nb; if ( MAGMA_SUCCESS != magma_smalloc( &dwork, n*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* if ( MAGMA_SUCCESS != magma_smalloc_cpu( &work, lwork ) ) { *info = MAGMA_ERR_HOST_ALLOC; magma_free( dwork ); return *info; } */ cl_mem buffer = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(float)*lwork, NULL, NULL); work = (float*)clEnqueueMapBuffer(queue[0], buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, lwork*sizeof(float), 0, NULL, NULL, NULL); nbmin = 2; nx = nb; ldwork = m; lddwork= n; if (nb >= nbmin && nb < k && nx < k) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; i < k-nx; i += nb) { ib = min(k-i, nb); rows = m -i; magma_queue_sync( queue[1] ); magma_sgetmatrix_async(rows, ib, dA(i, i), ldda, work(i), ldwork, queue[0], NULL); if (i > 0) { /* Apply H' to A(i:m,i+2*ib:n) from the left */ magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, n-old_i-2*old_ib, old_ib, dA(old_i, old_i ), ldda, dwork,0, lddwork, dA(old_i, old_i+2*old_ib), ldda, dwork,old_ib, lddwork, queue[1]); magma_ssetmatrix_async( old_ib, old_ib, work(old_i), ldwork, dA(old_i, old_i), ldda, queue[1], NULL); } magma_queue_sync(queue[0]); lapackf77_sgeqrf(&rows, &ib, work(i), &ldwork, tau+i, hwork, &lhwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, work(i), &ldwork, tau+i, hwork, &ib); spanel_to_q( MagmaUpper, ib, work(i), ldwork, hwork+ib*ib ); /* download the i-th V matrix */ magma_ssetmatrix_async(rows, ib, work(i), ldwork, dA(i,i), ldda, queue[0], NULL); /* download the T matrix */ magma_queue_sync( queue[1] ); magma_ssetmatrix_async( ib, ib, hwork, ib, dwork, 0, lddwork, queue[0], NULL); magma_queue_sync( queue[0] ); if (i + ib < n) { if (i+nb < k-nx) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dwork,0, lddwork, dA(i, i+ib), ldda, dwork,ib, lddwork, queue[1]); sq_to_panel( MagmaUpper, ib, work(i), ldwork, hwork+ib*ib ); } else { magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n-i-ib, ib, dA(i, i ), ldda, dwork,0, lddwork, dA(i, i+ib), ldda, dwork,ib, lddwork, queue[1]); sq_to_panel( MagmaUpper, ib, work(i), ldwork, hwork+ib*ib ); magma_ssetmatrix_async(ib, ib, work(i), ldwork, dA(i,i), ldda, queue[1], NULL); } old_i = i; old_ib = ib; } } } else { i = 0; } magma_free(dwork); /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; magma_sgetmatrix_async(rows, ib, dA(i, i), ldda, work, rows, queue[1], NULL); magma_queue_sync(queue[1]); lhwork = lwork - rows*ib; lapackf77_sgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info); magma_ssetmatrix_async(rows, ib, work, rows, dA(i, i), ldda, queue[1], NULL); } magma_queue_sync(queue[0]); magma_queue_sync(queue[1]); // magma_free_cpu(work); clEnqueueUnmapMemObject(queue[0], buffer, work, 0, NULL, NULL); clReleaseMemObject(buffer); return *info; } /* magma_sgeqrf2_gpu */
/** Purpose ------- SGEGQR orthogonalizes the N vectors given by a real M-by-N matrix A: A = Q * R. On exit, if successful, the orthogonal vectors Q overwrite A and R is given in work (on the CPU memory). The routine is designed for tall-and-skinny matrices: M >> N, N <= 128. This version uses normal equations and SVD in an iterative process that makes the computation numerically accurate. Arguments --------- @param[in] ikind INTEGER Several versions are implemented indiceted by the ikind value: 1: This version uses normal equations and SVD in an iterative process that makes the computation numerically accurate. 2: This version uses a standard LAPACK-based orthogonalization through MAGMA's QR panel factorization (magma_sgeqr2x3_gpu) and magma_sorgqr 3: MGS 4. Cholesky QR [ Note: this method uses the normal equations which squares the condition number of A, therefore ||I - Q'Q|| < O(eps cond(A)^2) ] @param[in] m INTEGER The number of rows of the matrix A. m >= n >= 0. @param[in] n INTEGER The number of columns of the matrix A. 128 >= n >= 0. @param[in,out] dA REAL array on the GPU, dimension (ldda,n) On entry, the m-by-n matrix A. On exit, the m-by-n matrix Q with orthogonal columns. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,m). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param dwork (GPU workspace) REAL array, dimension: n^2 for ikind = 1 3 n^2 + min(m, n) + 2 for ikind = 2 0 (not used) for ikind = 3 n^2 for ikind = 4 @param[out] work (CPU workspace) REAL array, dimension 3 n^2. On exit, work(1:n^2) holds the rectangular matrix R. Preferably, for higher performance, work should be in pinned memory. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. @ingroup magma_sgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_sgegqr_gpu( magma_int_t ikind, magma_int_t m, magma_int_t n, float *dA, magma_int_t ldda, float *dwork, float *work, magma_int_t *info ) { #define work(i_,j_) (work + (i_) + (j_)*n) #define dA(i_,j_) (dA + (i_) + (j_)*ldda) magma_int_t i = 0, j, k, n2 = n*n; magma_int_t ione = 1; float c_zero = MAGMA_S_ZERO; float c_one = MAGMA_S_ONE; float cn = 200., mins, maxs; /* check arguments */ *info = 0; if (ikind < 1 || ikind > 4) { *info = -1; } else if (m < 0 || m < n) { *info = -2; } else if (n < 0 || n > 128) { *info = -3; } else if (ldda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (ikind == 1) { // === Iterative, based on SVD ============================================================ float *U, *VT, *vt, *R, *G, *hwork, *tau; float *S; R = work; // Size n * n G = R + n*n; // Size n * n VT = G + n*n; // Size n * n magma_smalloc_cpu( &hwork, 32 + 2*n*n + 2*n); if ( hwork == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_int_t lwork=n*n+32; // First part f hwork; used as workspace in svd U = hwork + n*n + 32; // Size n*n S = (float *)(U+n*n); // Size n tau = U + n*n + n; // Size n #if defined(PRECISION_c) || defined(PRECISION_z) float *rwork; magma_smalloc_cpu( &rwork, 5*n); if ( rwork == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } #endif do { i++; magma_sgemm(MagmaConjTrans, MagmaNoTrans, n, n, m, c_one, dA, ldda, dA, ldda, c_zero, dwork, n ); magma_sgetmatrix(n, n, dwork, n, G, n); #if defined(PRECISION_s) || defined(PRECISION_d) lapackf77_sgesvd("n", "a", &n, &n, G, &n, S, U, &n, VT, &n, hwork, &lwork, info); #else lapackf77_sgesvd("n", "a", &n, &n, G, &n, S, U, &n, VT, &n, hwork, &lwork, rwork, info); #endif mins = 100.f, maxs = 0.f; for (k=0; k < n; k++) { S[k] = magma_ssqrt( S[k] ); if (S[k] < mins) mins = S[k]; if (S[k] > maxs) maxs = S[k]; } for (k=0; k < n; k++) { vt = VT + k*n; for (j=0; j < n; j++) vt[j] *= S[j]; } lapackf77_sgeqrf(&n, &n, VT, &n, tau, hwork, &lwork, info); if (i == 1) blasf77_scopy(&n2, VT, &ione, R, &ione); else blasf77_strmm("l", "u", "n", "n", &n, &n, &c_one, VT, &n, R, &n); magma_ssetmatrix(n, n, VT, n, dwork, n); magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, m, n, c_one, dwork, n, dA, ldda); if (mins > 0.00001f) cn = maxs/mins; //fprintf(stderr, "Iteration %d, cond num = %f \n", i, cn); } while (cn > 10.f); magma_free_cpu( hwork ); #if defined(PRECISION_c) || defined(PRECISION_z) magma_free_cpu( rwork ); #endif // ================== end of ikind == 1 =================================================== } else if (ikind == 2) { // ================== LAPACK based =================================================== magma_int_t min_mn = min(m, n); magma_int_t nb = n; float *dtau = dwork + 2*n*n, *d_T = dwork, *ddA = dwork + n*n; float *tau = work+n*n; magmablas_slaset( MagmaFull, n, n, c_zero, c_zero, d_T, n ); magma_sgeqr2x3_gpu(m, n, dA, ldda, dtau, d_T, ddA, (float *)(dwork+min_mn+2*n*n), info); magma_sgetmatrix( min_mn, 1, dtau, min_mn, tau, min_mn); magma_sgetmatrix( n, n, ddA, n, work, n); magma_sorgqr_gpu( m, n, n, dA, ldda, tau, d_T, nb, info ); // ================== end of ikind == 2 =================================================== } else if (ikind == 3) { // ================== MGS =================================================== for(magma_int_t j = 0; j<n; j++){ for(magma_int_t i = 0; i<j; i++){ *work(i, j) = magma_sdot(m, dA(0,i), 1, dA(0,j), 1); magma_saxpy(m, -(*work(i,j)), dA(0,i), 1, dA(0,j), 1); } for(magma_int_t i = j; i<n; i++) *work(i, j) = MAGMA_S_ZERO; //*work(j,j) = MAGMA_S_MAKE( magma_snrm2(m, dA(0,j), 1), 0. ); *work(j,j) = magma_sdot(m, dA(0,j), 1, dA(0,j), 1); *work(j,j) = MAGMA_S_MAKE( sqrt(MAGMA_S_REAL( *work(j,j) )), 0.); magma_sscal(m, 1./ *work(j,j), dA(0,j), 1); } // ================== end of ikind == 3 =================================================== } else if (ikind == 4) { // ================== Cholesky QR =================================================== magma_sgemm(MagmaConjTrans, MagmaNoTrans, n, n, m, c_one, dA, ldda, dA, ldda, c_zero, dwork, n ); magma_sgetmatrix(n, n, dwork, n, work, n); lapackf77_spotrf("u", &n, work, &n, info); magma_ssetmatrix(n, n, work, n, dwork, n); magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, m, n, c_one, dwork, n, dA, ldda); // ================== end of ikind == 4 =================================================== } return *info; } /* magma_sgegqr_gpu */
extern "C" magma_int_t magma_sgeqrf2_mgpu( magma_int_t num_gpus, magma_int_t m, magma_int_t n, float **dlA, magma_int_t ldda, float *tau, magma_int_t *info ) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= SGEQRF2_MGPU computes a QR factorization of a real M-by-N matrix A: A = Q * R. This is a GPU interface of the routine. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. dA (input/output) REAL array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix dA. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be dividable by 16. TAU (output) REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define dlA(gpu,a_1,a_2) ( dlA[gpu]+(a_2)*(ldda) + (a_1)) #define work_ref(a_1) ( work + (a_1)) #define hwork ( work + (nb)*(m)) #define hwrk_ref(a_1) ( local_work + (a_1)) #define lhwrk ( local_work + (nb)*(m)) float *dwork[4], *panel[4], *local_work; magma_int_t i, j, k, ldwork, lddwork, old_i, old_ib, rows; magma_int_t nbmin, nx, ib, nb; magma_int_t lhwork, lwork; magma_device_t cdevice; magma_getdevice(&cdevice); int panel_gpunum, i_local, n_local[4], la_gpu, displacement; *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = min(m,n); if (k == 0) return *info; nb = magma_get_sgeqrf_nb(m); displacement = n * nb; lwork = (m+n+64) * nb; lhwork = lwork - (m)*nb; for(i=0; i<num_gpus; i++){ #ifdef MultiGPUs magma_setdevice(i); #endif if (MAGMA_SUCCESS != magma_smalloc( &(dwork[i]), (n + ldda)*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } /* Set the number of local n for each GPU */ for(i=0; i<num_gpus; i++){ n_local[i] = ((n/nb)/num_gpus)*nb; if (i < (n/nb)%num_gpus) n_local[i] += nb; else if (i == (n/nb)%num_gpus) n_local[i] += n%nb; } if (MAGMA_SUCCESS != magma_smalloc_pinned( &local_work, lwork )) { *info = -9; for(i=0; i<num_gpus; i++){ #ifdef MultiGPUs magma_setdevice(i); #endif magma_free( dwork[i] ); } *info = MAGMA_ERR_HOST_ALLOC; return *info; } cudaStream_t streaml[4][2]; for(i=0; i<num_gpus; i++){ #ifdef MultiGPUs magma_setdevice(i); #endif magma_queue_create( &streaml[i][0] ); magma_queue_create( &streaml[i][1] ); } nbmin = 2; nx = nb; ldwork = m; lddwork= n; if (nb >= nbmin && nb < k && nx < k) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; i < k-nx; i += nb) { /* Set the GPU number that holds the current panel */ panel_gpunum = (i/nb)%num_gpus; /* Set the local index where the current panel is */ i_local = i/(nb*num_gpus)*nb; ib = min(k-i, nb); rows = m -i; /* Send current panel to the CPU */ #ifdef MultiGPUs magma_setdevice(panel_gpunum); #endif magma_sgetmatrix_async( rows, ib, dlA(panel_gpunum, i, i_local), ldda, hwrk_ref(i), ldwork, streaml[panel_gpunum][1] ); if (i>0){ /* Apply H' to A(i:m,i+2*ib:n) from the left; this is the look-ahead application to the trailing matrix */ la_gpu = panel_gpunum; /* only the GPU that has next panel is done look-ahead */ #ifdef MultiGPUs magma_setdevice(la_gpu); #endif magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, m-old_i, n_local[la_gpu]-i_local-old_ib, old_ib, panel[la_gpu], ldda, dwork[la_gpu], lddwork, dlA(la_gpu, old_i, i_local+old_ib), ldda, dwork[la_gpu]+old_ib, lddwork); la_gpu = ((i-nb)/nb)%num_gpus; #ifdef MultiGPUs magma_setdevice(la_gpu); #endif magma_ssetmatrix_async( old_ib, old_ib, hwrk_ref(old_i), ldwork, panel[la_gpu], ldda, streaml[la_gpu][0] ); } #ifdef MultiGPUs magma_setdevice(panel_gpunum); #endif magma_queue_sync( streaml[panel_gpunum][1] ); lapackf77_sgeqrf(&rows, &ib, hwrk_ref(i), &ldwork, tau+i, lhwrk, &lhwork, info); // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . H(i+ib-1) lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, hwrk_ref(i), &ldwork, tau+i, lhwrk, &ib); spanel_to_q( MagmaUpper, ib, hwrk_ref(i), ldwork, lhwrk+ib*ib ); // Send the current panel back to the GPUs // Has to be done with asynchronous copies for(j=0; j<num_gpus; j++) { #ifdef MultiGPUs magma_setdevice(j); #endif if (j == panel_gpunum) panel[j] = dlA(j, i, i_local); else panel[j] = dwork[j]+displacement; magma_ssetmatrix_async( rows, ib, hwrk_ref(i), ldwork, panel[j], ldda, streaml[j][0] ); } for(j=0; j<num_gpus; j++) { #ifdef MultiGPUs magma_setdevice(j); #endif magma_queue_sync( streaml[j][0] ); } /* Restore the panel */ sq_to_panel( MagmaUpper, ib, hwrk_ref(i), ldwork, lhwrk+ib*ib ); if (i + ib < n) { /* Send the T matrix to the GPU. Has to be done with asynchronous copies */ for(j=0; j<num_gpus; j++) { #ifdef MultiGPUs magma_setdevice(j); #endif magma_ssetmatrix_async( ib, ib, lhwrk, ib, dwork[j], lddwork, streaml[j][0] ); } if (i+nb < k-nx) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left; This is update for the next panel; part of the look-ahead */ la_gpu = (panel_gpunum+1)%num_gpus; int i_loc = (i+nb)/(nb*num_gpus)*nb; for(j=0; j<num_gpus; j++){ #ifdef MultiGPUs magma_setdevice(j); #endif //magma_queue_sync( streaml[j][0] ); if (j==la_gpu) magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, panel[j], ldda, dwork[j], lddwork, dlA(j, i, i_loc), ldda, dwork[j]+ib, lddwork); else if (j<=panel_gpunum) magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, n_local[j]-i_local-ib, ib, panel[j], ldda, dwork[j], lddwork, dlA(j, i, i_local+ib), ldda, dwork[j]+ib, lddwork); else magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, n_local[j]-i_local, ib, panel[j], ldda, dwork[j], lddwork, dlA(j, i, i_local), ldda, dwork[j]+ib, lddwork); } } else { /* do the entire update as we exit and there would be no lookahead */ la_gpu = (panel_gpunum+1)%num_gpus; int i_loc = (i+nb)/(nb*num_gpus)*nb; #ifdef MultiGPUs magma_setdevice(la_gpu); #endif magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, n_local[la_gpu]-i_loc, ib, panel[la_gpu], ldda, dwork[la_gpu], lddwork, dlA(la_gpu, i, i_loc), ldda, dwork[la_gpu]+ib, lddwork); #ifdef MultiGPUs magma_setdevice(panel_gpunum); #endif magma_ssetmatrix( ib, ib, hwrk_ref(i), ldwork, dlA(panel_gpunum, i, i_local), ldda ); } old_i = i; old_ib = ib; } } } else { i = 0; } for(j=0; j<num_gpus; j++){ #ifdef MultiGPUs magma_setdevice(j); #endif magma_free( dwork[j] ); } /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; lhwork = lwork - rows*ib; panel_gpunum = (panel_gpunum+1)%num_gpus; int i_loc = (i)/(nb*num_gpus)*nb; #ifdef MultiGPUs magma_setdevice(panel_gpunum); #endif magma_sgetmatrix( rows, ib, dlA(panel_gpunum, i, i_loc), ldda, lhwrk, rows ); lhwork = lwork - rows*ib; lapackf77_sgeqrf(&rows, &ib, lhwrk, &rows, tau+i, lhwrk+ib*rows, &lhwork, info); magma_ssetmatrix( rows, ib, lhwrk, rows, dlA(panel_gpunum, i, i_loc), ldda ); } for(i=0; i<num_gpus; i++){ #ifdef MultiGPUs magma_setdevice(i); #endif magma_queue_destroy( streaml[i][0] ); magma_queue_destroy( streaml[i][1] ); } magma_setdevice(cdevice); magma_free_pinned( local_work ); return *info; } /* magma_sgeqrf2_mgpu */
extern "C" magma_int_t magma_ssytrd_sy2sb( char uplo, magma_int_t n, magma_int_t nb, float *a, magma_int_t lda, float *tau, float *work, magma_int_t lwork, float *dT, magma_int_t threads, magma_int_t *info) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= SSYTRD_HE2HB reduces a real symmetric matrix A to real symmetric band-diagonal form T by an orthogonal similarity transformation: Q**T * A * Q = T. This version stores the triangular matrices T used in the accumulated Householder transformations (I - V T V'). Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) REAL array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if UPLO = 'U', the Upper band-diagonal of A is overwritten by the corresponding elements of the band-diagonal matrix T, and the elements above the band diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; if UPLO = 'L', the the Lower band-diagonal of A is overwritten by the corresponding elements of the band-diagonal matrix T, and the elements below the band-diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). TAU (output) REAL array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). WORK (workspace/output) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= 1. For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. dT (output) REAL array on the GPU, dimension N*NB, where NB is the optimal blocksize. On exit dT holds the upper triangular matrices T from the accumulated Householder transformations (I - V T V') used in the factorization. The nb x nb matrices T are ordered consecutively in memory one after another. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value Further Details =============== If UPLO = 'U', the matrix Q is represented as a product of elementary reflectors Q = H(n-1) . . . H(2) H(1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in A(1:i-1,i+1), and tau in TAU(i). If UPLO = 'L', the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(n-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i), and tau in TAU(i). The contents of A on exit are illustrated by the following examples with n = 5: if UPLO = 'U': if UPLO = 'L': ( d e v2 v3 v4 ) ( d ) ( d e v3 v4 ) ( e d ) ( d e v4 ) ( v1 e d ) ( d e ) ( v1 v2 e d ) ( d ) ( v1 v2 v3 e d ) where d and e denote diagonal and off-diagonal elements of T, and vi denotes an element of the vector defining H(i). ===================================================================== */ #define a_ref(a_1,a_2) ( a + ((a_2)-1)*( lda) + (a_1)-1) #define da_ref(a_1,a_2) (da + ((a_2)-1)*(ldda) + (a_1)-1) #define tau_ref(a_1) (tau + (a_1)-1) #define t_ref(a_1) (dT + ((a_1)-1)*(lddt)) char uplo_[2] = {uplo, 0}; int ldda = ((n+31)/32)*32; int lddt = nb; float c_neg_one = MAGMA_S_NEG_ONE; float c_neg_half = MAGMA_S_NEG_HALF; float c_one = MAGMA_S_ONE ; float c_zero = MAGMA_S_ZERO; float d_one = MAGMA_D_ONE; magma_int_t pm, pn, indi, indj, pk; magma_int_t pm_old=0, pn_old=0, indi_old=0, indj_old=0; int i; int lwkopt; int lquery; *info = 0; int upper = lapackf77_lsame(uplo_, "U"); lquery = lwork == -1; if (! upper && ! lapackf77_lsame(uplo_, "L")) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } else if (lwork < 1 && ! lquery) { *info = -9; } if (*info == 0) { /* Determine the block size. */ lwkopt = n * nb; MAGMA_S_SET2REAL( work[0], lwkopt ); } if (*info != 0) return *info; else if (lquery) return *info; /* Quick return if possible */ if (n == 0) { work[0] = c_one; return *info; } float *da; if (MAGMA_SUCCESS != magma_smalloc( &da, (n + 2*nb)*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_int_t mklth = min(threads,12); #if defined(USEMKL) mkl_set_num_threads(mklth); #endif #if defined(USEACML) omp_set_num_threads(mklth); #endif /* Use the first panel of da as work space */ float *dwork = da+n*ldda; float *dW = dwork + nb*ldda; #ifdef TRACING char buf[80]; #endif cudaStream_t stream[3]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); stream[2] = 0; // default stream trace_init( 1, 1, 3, stream ); float *hT = work + lwork - nb*nb; lwork -= nb*nb; memset( hT, 0, nb*nb*sizeof(float)); magmablasSetKernelStream( stream[0] ); cudaEvent_t Pupdate_event; cudaEventCreateWithFlags(&Pupdate_event,cudaEventDisableTiming); //cudaEventCreate(&Pupdate_event); if (upper) { printf("SSYTRD_HE2HB is not yet implemented for upper matrix storage. Exit.\n"); exit(1); }else { /* Copy the matrix to the GPU */ if (1 <= n-nb){ trace_gpu_start( 0, 0, "set", "set A" ); magma_ssetmatrix_async( (n-nb), (n-nb), a_ref(nb+1, nb+1), lda, da_ref(nb+1, nb+1), ldda, stream[0] ); trace_gpu_end( 0, 0 ); } /* Reduce the lower triangle of A */ for (i = 1; i <= n-nb; i += nb) { indi = i+nb; indj = i; pm = n - i - nb + 1; //pn = min(i+nb-1, n-nb) -i + 1; pn = nb; /* Get the current panel (no need for the 1st iteration) */ if (i > 1 ){ // spanel_to_q copy the upper oof diagonal part of // the matrix to work to be restored later. acctually // the zero's and one's putted are not used this is only // because we don't have a function that copy only the // upper part of A to be restored after copying the // lookahead panel that has been computted from GPU to CPU. spanel_to_q(MagmaUpper, pn-1, a_ref(i, i+1), lda, work); trace_gpu_start( 0, 1, "get", "get panel" ); //magma_queue_sync( stream[0] ); cudaStreamWaitEvent(stream[1], Pupdate_event, 0); magma_sgetmatrix_async( (pm+pn), pn, da_ref( i, i), ldda, a_ref ( i, i), lda, stream[1] ); trace_gpu_end( 0, 1 ); trace_gpu_start( 0, 2, "syr2k", "syr2k" ); magma_ssyr2k(MagmaLower, MagmaNoTrans, pm_old-pn_old, pn_old, c_neg_one, da_ref(indi_old+pn_old, indj_old), ldda, dW + pn_old , pm_old, d_one, da_ref(indi_old+pn_old, indi_old+pn_old), ldda); trace_gpu_end( 0, 2 ); trace_cpu_start( 0, "sync", "sync on 1" ); magma_queue_sync( stream[1] ); trace_cpu_end( 0 ); sq_to_panel(MagmaUpper, pn-1, a_ref(i, i+1), lda, work); } /* ========================================================== QR factorization on a panel starting nb off of the diagonal. Prepare the V and T matrices. ========================================================== */ #ifdef TRACING snprintf( buf, sizeof(buf), "panel %d", i ); #endif trace_cpu_start( 0, "geqrf", buf ); lapackf77_sgeqrf(&pm, &pn, a_ref(indi, indj), &lda, tau_ref(i), work, &lwork, info); /* Form the matrix T */ pk=min(pm,pn); lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr, &pm, &pk, a_ref(indi, indj), &lda, tau_ref(i), hT, &nb); /* Prepare V - put 0s in the upper triangular part of the panel (and 1s on the diagonal), temporaly storing the original in work */ spanel_to_q(MagmaUpper, pk, a_ref(indi, indj), lda, work); trace_cpu_end( 0 ); /* Send V from the CPU to the GPU */ trace_gpu_start( 0, 0, "set", "set V and T" ); magma_ssetmatrix_async( pm, pk, a_ref(indi, indj), lda, da_ref(indi, indj), ldda, stream[0] ); /* Send the triangular factor T to the GPU */ magma_ssetmatrix_async( pk, pk, hT, nb, t_ref(i), lddt, stream[0] ); trace_gpu_end( 0, 0 ); /* ========================================================== Compute W: 1. X = A (V T) 2. W = X - 0.5* V * (T' * (V' * X)) ========================================================== */ /* dwork = V T */ trace_cpu_start( 0, "sync", "sync on 0" ); // this sync is done here to be sure that the copy has been finished // because below we made a restore sq_to_panel and this restore need // to ensure that the copy has been finished. we did it here to allow // overlapp of restore with next gemm and symm. magma_queue_sync( stream[0] ); trace_cpu_end( 0 ); trace_gpu_start( 0, 2, "gemm", "work = V*T" ); magma_sgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk, c_one, da_ref(indi, indj), ldda, t_ref(i), lddt, c_zero, dwork, pm); trace_gpu_end( 0, 2 ); /* dW = X = A*V*T. dW = A*dwork */ trace_gpu_start( 0, 2, "symm", "X = A*work" ); magma_ssymm(MagmaLeft, uplo, pm, pk, c_one, da_ref(indi, indi), ldda, dwork, pm, c_zero, dW, pm); trace_gpu_end( 0, 2 ); /* restore the panel */ sq_to_panel(MagmaUpper, pk, a_ref(indi, indj), lda, work); /* dwork = V*T already ==> dwork' = T'*V' * compute T'*V'*X ==> dwork'*W ==> * dwork + pm*nb = ((T' * V') * X) = dwork' * X = dwork' * W */ trace_gpu_start( 0, 2, "gemm", "work = T'*V'*X" ); magma_sgemm(MagmaTrans, MagmaNoTrans, pk, pk, pm, c_one, dwork, pm, dW, pm, c_zero, dwork + pm*nb, nb); trace_gpu_end( 0, 2 ); /* W = X - 0.5 * V * T'*V'*X * = X - 0.5 * V * (dwork + pm*nb) = W - 0.5 * V * (dwork + pm*nb) */ trace_gpu_start( 0, 2, "gemm", "W = X - 0.5*V*(T'*V'*X)" ); magma_sgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk, c_neg_half, da_ref(indi, indj), ldda, dwork + pm*nb, nb, c_one, dW, pm); trace_gpu_end( 0, 2 ); /* ========================================================== Update the unreduced submatrix A(i+ib:n,i+ib:n), using an update of the form: A := A - V*W' - W*V' ========================================================== */ if (i + nb <= n-nb){ /* There would be next iteration; do lookahead - update the next panel */ trace_gpu_start( 0, 2, "gemm", "gemm 4 next panel left" ); magma_sgemm(MagmaNoTrans, MagmaTrans, pm, pn, pn, c_neg_one, da_ref(indi, indj), ldda, dW , pm, c_one, da_ref(indi, indi), ldda); trace_gpu_end( 0, 2 ); trace_gpu_start( 0, 2, "gemm", "gemm 5 next panel right" ); magma_sgemm(MagmaNoTrans, MagmaTrans, pm, pn, pn, c_neg_one, dW , pm, da_ref(indi, indj), ldda, c_one, da_ref(indi, indi), ldda); trace_gpu_end( 0, 2 ); cudaEventRecord(Pupdate_event, stream[0]); } else { /* no look-ahead as this is last iteration */ trace_gpu_start( 0, 2, "syr2k", "syr2k last iteration" ); magma_ssyr2k(MagmaLower, MagmaNoTrans, pk, pk, c_neg_one, da_ref(indi, indj), ldda, dW , pm, d_one, da_ref(indi, indi), ldda); trace_gpu_end( 0, 2 ); } indi_old = indi; indj_old = indj; pm_old = pm; pn_old = pn; } // end loop for(i) /* Send the last block to the CPU */ pk = min(pm,pn); if (1 <= n-nb){ spanel_to_q(MagmaUpper, pk-1, a_ref(n-pk+1, n-pk+2), lda, work); trace_gpu_start( 0, 2, "get", "get last block" ); magma_sgetmatrix( pk, pk, da_ref(n-pk+1, n-pk+1), ldda, a_ref(n-pk+1, n-pk+1), lda ); trace_gpu_end( 0, 2 ); sq_to_panel(MagmaUpper, pk-1, a_ref(n-pk+1, n-pk+2), lda, work); } }// end of LOWER trace_finalize( "ssytrd_sy2sb.svg", "trace.css" ); cudaEventDestroy(Pupdate_event); magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free( da ); MAGMA_S_SET2REAL( work[0], lwkopt ); magmablasSetKernelStream( 0 ); #if defined(USEMKL) mkl_set_num_threads(1); #endif #if defined(USEACML) omp_set_num_threads(1); #endif return *info; } /* ssytrd_sy2sb_ */