/** Purpose ------- SGEQRF4 computes a QR factorization of a REAL M-by-N matrix A: A = Q * R using multiple GPUs. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. Arguments --------- @param[in] num_gpus INTEGER The number of GPUs to be used for the factorization. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] tau REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. \n Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= N*NB, where NB can be obtained through magma_get_sgeqrf_nb(M). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_sgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_sgeqrf4(magma_int_t num_gpus, magma_int_t m, magma_int_t n, float *A, magma_int_t lda, float *tau, float *work, magma_int_t lwork, magma_int_t *info ) { float *da[MagmaMaxGPUs]; float c_one = MAGMA_S_ONE; int i, k, ldda; *info = 0; int nb = magma_get_sgeqrf_nb(min(m, n)); int lwkopt = n * nb; work[0] = MAGMA_S_MAKE( (float)lwkopt, 0 ); int lquery = (lwork == -1); if (num_gpus < 0 || num_gpus > MagmaMaxGPUs) { *info = -1; } else if (m < 0) { *info = -2; } else if (n < 0) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } else if (lwork < max(1,n) && ! lquery) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { work[0] = c_one; return *info; } ldda = ((m+31)/32)*32; magma_int_t n_local[MagmaMaxGPUs]; for (i=0; i < num_gpus; i++) { n_local[i] = ((n/nb)/num_gpus)*nb; if (i < (n/nb)%num_gpus) n_local[i] += nb; else if (i == (n/nb)%num_gpus) n_local[i] += n%nb; magma_setdevice(i); // TODO on failure, free previously allocated memory if (MAGMA_SUCCESS != magma_smalloc( &da[i], ldda*n_local[i] )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } if (m > nb && n > nb) { /* Copy the matrix to the GPUs in 1D block cyclic distribution */ magma_ssetmatrix_1D_col_bcyclic(m, n, A, lda, da, ldda, num_gpus, nb); /* Factor using the GPU interface */ magma_sgeqrf2_mgpu( num_gpus, m, n, da, ldda, tau, info); /* Copy the matrix back from the GPUs to the CPU */ magma_sgetmatrix_1D_col_bcyclic(m, n, da, ldda, A, lda, num_gpus, nb); } else { lapackf77_sgeqrf(&m, &n, A, &lda, tau, work, &lwork, info); } /* Free the allocated GPU memory */ for (i=0; i < num_gpus; i++) { magma_setdevice(i); magma_free( da[i] ); } return *info; } /* magma_sgeqrf4 */
/** Purpose ------- SGEQRF computes a QR factorization of a REAL M-by-N matrix A: A = Q * R using multiple GPUs. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] tau REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. \n Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= N*NB, where NB can be obtained through magma_get_sgeqrf_nb( M, N ). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_sgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_sgeqrf_m( magma_int_t ngpu, magma_int_t m, magma_int_t n, float *A, magma_int_t lda, float *tau, float *work, magma_int_t lwork, magma_int_t *info ) { float *da[MagmaMaxGPUs]; float c_one = MAGMA_S_ONE; magma_int_t i, min_mn, ldda; *info = 0; magma_int_t nb = magma_get_sgeqrf_nb( m, n ); magma_int_t lwkopt = n * nb; work[0] = magma_smake_lwork( lwkopt ); bool lquery = (lwork == -1); if (ngpu < 0 || ngpu > MagmaMaxGPUs) { *info = -1; } else if (m < 0) { *info = -2; } else if (n < 0) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } else if (lwork < max(1,n) && ! lquery) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; min_mn = min(m,n); if (min_mn == 0) { work[0] = c_one; return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); ldda = magma_roundup( m, 32 ); magma_int_t n_local[MagmaMaxGPUs]; for (i=0; i < ngpu; i++) { n_local[i] = ((n/nb)/ngpu)*nb; if (i < (n/nb)%ngpu) n_local[i] += nb; else if (i == (n/nb)%ngpu) n_local[i] += n%nb; magma_setdevice(i); // TODO on failure, free previously allocated memory if (MAGMA_SUCCESS != magma_smalloc( &da[i], ldda*n_local[i] )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } if (m > nb && n > nb) { magma_queue_t queues[MagmaMaxGPUs]; for( magma_int_t dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); magma_queue_create( dev, &queues[dev] ); } /* Copy the matrix to the GPUs in 1D block cyclic distribution */ magma_ssetmatrix_1D_col_bcyclic(m, n, A, lda, da, ldda, ngpu, nb, queues); for( magma_int_t dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); magma_queue_sync( queues[dev] ); } /* Factor using the GPU interface */ magma_sgeqrf2_mgpu( ngpu, m, n, da, ldda, tau, info); /* Copy the matrix back from the GPUs to the CPU */ magma_sgetmatrix_1D_col_bcyclic(m, n, da, ldda, A, lda, ngpu, nb, queues); for( magma_int_t dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); magma_queue_sync( queues[dev] ); magma_queue_destroy( queues[dev] ); } } else { lapackf77_sgeqrf(&m, &n, A, &lda, tau, work, &lwork, info); } /* Free the allocated GPU memory */ for (i=0; i < ngpu; i++) { magma_setdevice(i); magma_free( da[i] ); } magma_setdevice( orig_dev ); return *info; } /* magma_sgeqrf4 */
/***************************************************************************//** Purpose ------- SGEHRD reduces a REAL general matrix A to upper Hessenberg form H by an orthogonal similarity transformation: Q' * A * Q = H . This version stores the triangular matrices used in the factorization so that they can be applied directly (i.e., without being recomputed) later. As a result, the application of Q is much faster. Arguments --------- @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] ilo INTEGER @param[in] ihi INTEGER It is assumed that A is already upper triangular in rows and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally set by a previous call to SGEBAL; otherwise they should be set to 1 and N respectively. See Further Details. 1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the N-by-N general matrix to be reduced. On exit, the upper triangle and the first subdiagonal of A are overwritten with the upper Hessenberg matrix H, and the elements below the first subdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] tau REAL array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). Elements 1:ILO-1 and IHI:N-1 of TAU are set to zero. @param[out] work (workspace) REAL array, dimension (LWORK) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. LWORK >= N*NB. where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] T REAL array, dimension NB*N, where NB is the optimal blocksize. It stores the NB*NB blocks of the triangular T matrices used in the reduction. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value. Further Details --------------- The matrix Q is represented as a product of (ihi-ilo) elementary reflectors Q = H(ilo) H(ilo+1) . . . H(ihi-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on exit in A(i+2:ihi,i), and tau in TAU(i). The contents of A are illustrated by the following example, with n = 7, ilo = 2 and ihi = 6: @verbatim on entry, on exit, ( a a a a a a a ) ( a a h h h h a ) ( a a a a a a ) ( a h h h h a ) ( a a a a a a ) ( h h h h h h ) ( a a a a a a ) ( v2 h h h h h ) ( a a a a a a ) ( v2 v3 h h h h ) ( a a a a a a ) ( v2 v3 v4 h h h ) ( a ) ( a ) @endverbatim where a denotes an element of the original matrix A, h denotes a modified element of the upper Hessenberg matrix H, and vi denotes an element of the vector defining H(i). This implementation follows the hybrid algorithm and notations described in S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. This version stores the T matrices, for later use in magma_sorghr. @ingroup magma_gehrd *******************************************************************************/ extern "C" magma_int_t magma_sgehrd_m( magma_int_t n, magma_int_t ilo, magma_int_t ihi, float *A, magma_int_t lda, float *tau, float *work, magma_int_t lwork, float *T, magma_int_t *info) { #define A( i, j ) (A + (i) + (j)*lda) #define dA( dev, i, j ) (data.dA[dev] + (i) + (j)*ldda) float c_one = MAGMA_S_ONE; float c_zero = MAGMA_S_ZERO; magma_int_t nb = magma_get_sgehrd_nb(n); magma_int_t nh, iws, ldda, min_lblocks, max_lblocks, last_dev, dev; magma_int_t dpanel, di, nlocal, i, i2, ib, ldwork; magma_int_t iinfo; magma_int_t lquery; struct sgehrd_data data; magma_int_t ngpu = magma_num_gpus(); *info = 0; iws = n*(nb + nb*ngpu); work[0] = magma_smake_lwork( iws ); lquery = (lwork == -1); if (n < 0) { *info = -1; } else if (ilo < 1 || ilo > max(1,n)) { *info = -2; } else if (ihi < min(ilo,n) || ihi > n) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if (lwork < iws && ! lquery) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; // Adjust from 1-based indexing ilo -= 1; // Quick return if possible nh = ihi - ilo; if (nh <= 1) { work[0] = c_one; return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); // Set elements 0:ILO-1 and IHI-1:N-2 of TAU to zero for (i = 0; i < ilo; ++i) tau[i] = c_zero; for (i = max(0,ihi-1); i < n-1; ++i) tau[i] = c_zero; // set T to zero lapackf77_slaset( "Full", &nb, &n, &c_zero, &c_zero, T, &nb ); // set to null, to simplify cleanup code for( dev = 0; dev < ngpu; ++dev ) { data.dA[dev] = NULL; data.queues[dev] = NULL; } // Now requires lwork >= iws; else dT won't be computed in unblocked code. // If not enough workspace, use unblocked code //if ( lwork < iws ) { // nb = 1; //} if (nb == 1 || nb >= nh) { // Use unblocked code below i = ilo; } else { // Use blocked code // allocate memory on GPUs for A and workspaces ldda = magma_roundup( n, 32 ); min_lblocks = (n / nb) / ngpu; max_lblocks = ((n-1) / nb) / ngpu + 1; last_dev = (n / nb) % ngpu; // V and Vd need to be padded for copying in mslahr2 data.ngpu = ngpu; data.ldda = ldda; data.ldv = nb*max_lblocks*ngpu; data.ldvd = nb*max_lblocks; for( dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); nlocal = min_lblocks*nb; if ( dev < last_dev ) { nlocal += nb; } else if ( dev == last_dev ) { nlocal += (n % nb); } ldwork = nlocal*ldda // A + nb*data.ldv // V + nb*data.ldvd // Vd + nb*ldda // Y + nb*ldda // W + nb*nb; // Ti if ( MAGMA_SUCCESS != magma_smalloc( &data.dA[dev], ldwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto CLEANUP; } data.dV [dev] = data.dA [dev] + nlocal*ldda; data.dVd[dev] = data.dV [dev] + nb*data.ldv; data.dY [dev] = data.dVd[dev] + nb*data.ldvd; data.dW [dev] = data.dY [dev] + nb*ldda; data.dTi[dev] = data.dW [dev] + nb*ldda; magma_queue_create( dev, &data.queues[dev] ); } // Copy the matrix to GPUs magma_ssetmatrix_1D_col_bcyclic( ngpu, n, n, nb, A, lda, data.dA, ldda, data.queues ); // round ilo down to block boundary ilo = (ilo/nb)*nb; for (i = ilo; i < ihi - 1 - nb; i += nb) { // Reduce columns i:i+nb-1 to Hessenberg form, returning the // matrices V and T of the block reflector H = I - V*T*V' // which performs the reduction, and also the matrix Y = A*V*T // Get the current panel (no need for the 1st iteration) dpanel = (i / nb) % ngpu; di = ((i / nb) / ngpu) * nb; if ( i > ilo ) { magma_setdevice( dpanel ); magma_sgetmatrix( ihi-i, nb, dA(dpanel, i, di), ldda, A(i,i), lda, data.queues[dpanel] ); } // add 1 to i for 1-based index magma_slahr2_m( ihi, i+1, nb, A(0,i), lda, &tau[i], &T[i*nb], nb, work, n, &data ); magma_slahru_m( n, ihi, i, nb, A, lda, &data ); // copy first i rows above panel to host magma_setdevice( dpanel ); magma_sgetmatrix_async( i, nb, dA(dpanel, 0, di), ldda, A(0,i), lda, data.queues[dpanel] ); } // Copy remainder to host, block-by-block for( i2 = i; i2 < n; i2 += nb ) { ib = min( nb, n-i2 ); dev = (i2 / nb) % ngpu; di = (i2 / nb) / ngpu * nb; magma_setdevice( dev ); magma_sgetmatrix( n, ib, dA(dev, 0, di), ldda, A(0,i2), lda, data.queues[dev] ); } } // Use unblocked code to reduce the rest of the matrix // add 1 to i for 1-based index i += 1; lapackf77_sgehd2(&n, &i, &ihi, A, &lda, tau, work, &iinfo); work[0] = magma_smake_lwork( iws ); CLEANUP: for( dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_free( data.dA[dev] ); magma_queue_destroy( data.queues[dev] ); } magma_setdevice( orig_dev ); return *info; } /* magma_sgehrd */
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgetrf_mgpu */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float error; float *h_A; float *d_lA[ MagmaMaxGPUs ]; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, n_local, ngpu; magma_int_t info, min_mn, nb, ldn_local; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 2 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |PA-LU|/(N*|A|)\n"); } printf("=========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; nb = magma_get_sgetrf_nb( M ); gflops = FLOPS_SGETRF( M, N ) / 1e9; // ngpu must be at least the number of blocks ngpu = min( opts.ngpu, int((N+nb-1)/nb) ); if ( ngpu < opts.ngpu ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); } // Allocate host memory for the matrix TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_CPU( h_A, float, n2 ); // Allocate device memory for( int dev=0; dev < ngpu; dev++){ n_local = ((N/nb)/ngpu)*nb; if (dev < (N/nb) % ngpu) n_local += nb; else if (dev == (N/nb) % ngpu) n_local += N % nb; ldn_local = ((n_local+31)/32)*32; // TODO why? magma_setdevice( dev ); TESTING_MALLOC_DEV( d_lA[dev], float, ldda*ldn_local ); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { init_matrix( M, N, h_A, lda ); cpu_time = magma_wtime(); lapackf77_sgetrf( &M, &N, h_A, &lda, ipiv, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_sgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( M, N, h_A, lda ); magma_ssetmatrix_1D_col_bcyclic( M, N, h_A, lda, d_lA, ldda, ngpu, nb ); gpu_time = magma_wtime(); magma_sgetrf_mgpu( ngpu, M, N, d_lA, ldda, ipiv, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_sgetrf_mgpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_sgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_A, lda, ngpu, nb ); /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 2 ) { error = get_residual( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else if ( opts.check ) { error = get_LU_error( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { printf( " ---\n" ); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); for( int dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); TESTING_FREE_DEV( d_lA[dev] ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
int main(int argc, char **argv) { #if (GPUSHMEM >= 200) TESTING_INIT(); magma_setdevice(0); magma_timestr_t start, end; float flops, magma_perf, cuda_perf, error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float c_neg_one = MAGMA_S_NEG_ONE; magma_int_t n_local[4]; FILE *fp ; magma_int_t N, m, i, j, lda, LDA, M; magma_int_t matsize; magma_int_t vecsize; magma_int_t istart = 64; magma_int_t incx = 1; char uplo = MagmaLower; float alpha = MAGMA_S_MAKE(1., 0.); // MAGMA_S_MAKE( 1.5, -2.3 ); float beta = MAGMA_S_MAKE(0., 0.); // MAGMA_S_MAKE( -0.6, 0.8 ); float *A, *X, *Y[4], *Ycublas, *Ymagma; float *dA, *dX[4], *dY[4], *d_lA[4], *dYcublas ; magma_queue_t stream[4][10]; float *C_work; float *dC_work[4]; int max_num_gpus; magma_int_t num_gpus = 1, nb; magma_int_t blocks, workspace; magma_int_t offset = 0; M = 0; N = 0; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) { N = atoi(argv[++i]); istart = N; } else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-NGPU", argv[i])==0) num_gpus = atoi(argv[++i]); else if (strcmp("-offset", argv[i])==0) offset = atoi(argv[++i]); } if ( M == 0 ) { M = N; } if ( N == 0 ) { N = M; } if (M>0 && N>0) { printf(" testing_ssymv_mgpu -M %d -N %d -NGPU %d\n\n", (int) M, (int) N, (int) num_gpus); printf(" in %c side \n", uplo); } else { printf("\nUsage: \n"); printf(" testing_ssymv_mgpu -M %d -N %d -NGPU %d\n\n", 1024, 1024, 1); exit(1); } } else { #if defined(PRECISION_z) M = N = 8000; #else M = N = 12480; #endif num_gpus = 2; offset = 0; printf("\nUsage: \n"); printf(" testing_ssymv_mgpu -M %d -N %d -NGPU %d\n\n", (int) M, (int) N, (int) num_gpus); } /////////////////////////////////////////////////////////////////////////////////////// cudaGetDeviceCount(&max_num_gpus); if (num_gpus > max_num_gpus){ printf("More GPUs requested than available. Have to change it.\n"); num_gpus = max_num_gpus; } printf("Number of GPUs to be used = %d\n", (int) num_gpus); for(int i=0; i< num_gpus; i++) { magma_queue_create(&stream[i][0]); } LDA = ((N+31)/32)*32; matsize = N*LDA; vecsize = N*incx; nb = 32; //nb = 64; printf("block size = %d\n", (int) nb); TESTING_MALLOC( A, float, matsize ); TESTING_MALLOC( X, float, vecsize ); TESTING_MALLOC( Ycublas, float, vecsize ); TESTING_MALLOC( Ymagma, float, vecsize ); for(i=0; i<num_gpus; i++) { TESTING_MALLOC( Y[i], float, vecsize ); } magma_setdevice(0); TESTING_DEVALLOC( dA, float, matsize ); TESTING_DEVALLOC( dYcublas, float, vecsize ); for(i=0; i<num_gpus; i++) { n_local[i] = ((N/nb)/num_gpus)*nb; if (i < (N/nb)%num_gpus) n_local[i] += nb; else if (i == (N/nb)%num_gpus) n_local[i] += N%nb; magma_setdevice(i); TESTING_DEVALLOC( d_lA[i], float, LDA*n_local[i] );// potentially bugged TESTING_DEVALLOC( dX[i], float, vecsize ); TESTING_DEVALLOC( dY[i], float, vecsize ); printf("device %2d n_local = %4d\n", (int) i, (int) n_local[i]); } magma_setdevice(0); /////////////////////////////////////////////////////////////////////// /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &matsize, A ); /* Make A symmetric */ { magma_int_t i, j; for(i=0; i<N; i++) { A[i*LDA+i] = MAGMA_S_MAKE( MAGMA_S_REAL(A[i*LDA+i]), 0. ); for(j=0; j<i; j++) A[i*LDA+j] = (A[j*LDA+i]); } } blocks = N / nb + (N % nb != 0); workspace = LDA * (blocks + 1); TESTING_MALLOC( C_work, float, workspace ); for(i=0; i<num_gpus; i++){ magma_setdevice(i); TESTING_DEVALLOC( dC_work[i], float, workspace ); //fillZero(dC_work[i], workspace); } magma_setdevice(0); ////////////////////////////////////////////////////////////////////////////////////////////// fp = fopen ("results_ssymv_mgpu.csv", "w") ; if( fp == NULL ){ printf("Couldn't open output file\n"); exit(1);} printf("SSYMV float precision\n\n"); printf( " n CUBLAS,Gflop/s MAGMABLAS,Gflop/s \"error\"\n" "==============================================================\n"); fprintf(fp, " n CUBLAS,Gflop/s MAGMABLAS,Gflop/s \"error\"\n" "==============================================================\n"); // for( offset = 0; offset< N; offset ++ ) for(int size = istart ; size <= N ; size += 128) { // printf("offset = %d ", offset); m = size ; // m = N; // lda = ((m+31)/32)*32;// lda = LDA; flops = FLOPS( (float)m ) / 1e6; printf( "N %5d ", (int) m ); fprintf( fp, "%5d, ", (int) m ); vecsize = m * incx; lapackf77_slarnv( &ione, ISEED, &vecsize, X ); lapackf77_slarnv( &ione, ISEED, &vecsize, Y[0] ); /* ===================================================================== Performs operation using CUDA-BLAS =================================================================== */ magma_setdevice(0); magma_ssetmatrix_1D_col_bcyclic(m, m, A, LDA, d_lA, lda, num_gpus, nb); magma_setdevice(0); magma_ssetmatrix( m, m, A, LDA, dA, lda ); magma_ssetvector( m, Y[0], incx, dYcublas, incx ); for(i=0; i<num_gpus; i++){ magma_setdevice(i); magma_ssetvector( m, X, incx, dX[i], incx ); magma_ssetvector( m, Y[0], incx, dY[i], incx ); blocks = m / nb + (m % nb != 0); magma_ssetmatrix( lda, blocks, C_work, LDA, dC_work[i], lda ); } magma_setdevice(0); start = get_current_time(); cublasSsymv( uplo, m-offset, alpha, dA + offset + offset * lda, lda, dX[0] + offset, incx, beta, dYcublas + offset, incx ); end = get_current_time(); magma_sgetvector( m, dYcublas, incx, Ycublas, incx ); cuda_perf = flops / GetTimerValue(start,end); printf( "%11.2f", cuda_perf ); fprintf(fp, "%11.2f,", cuda_perf ); magma_setdevice(0); start = get_current_time(); if(nb == 32) { magmablas_ssymv2_mgpu_32_offset( uplo, m, alpha, d_lA, lda, dX, incx, beta, dY, incx, dC_work, workspace, num_gpus, nb, offset); } else // nb = 64 { magmablas_ssymv2_mgpu_offset( uplo, m, alpha, d_lA, lda, dX, incx, beta, dY, incx, dC_work, workspace, num_gpus, nb, offset); } for(i=1; i<num_gpus; i++) { magma_setdevice(i); cudaDeviceSynchronize(); } end = get_current_time(); magma_perf = flops / GetTimerValue(start,end); printf( "%11.2f", magma_perf ); fprintf(fp, "%11.2f,", magma_perf ); for(i=0; i<num_gpus; i++) { magma_setdevice(i); magma_sgetvector( m, dY[i], incx, Y[i], incx ); } magma_setdevice(0); #ifdef validate for( j= offset;j<m;j++) { for(i=1; i<num_gpus; i++) { // printf("Y[%d][%d] = %15.14f\n", i, j, Y[i][j].x); #if defined(PRECISION_z) || defined(PRECISION_c) Y[0][j].x = Y[0][j].x + Y[i][j].x; Y[0][j].y = Y[0][j].y + Y[i][j].y; #else Y[0][j] = Y[0][j] + Y[i][j]; #endif } } /* #if defined(PRECISION_z) || defined(PRECISION_c) for( j=offset;j<m;j++) { if(Y[0][j].x != Ycublas[j].x) { printf("Y-multi[%d] = %f, %f\n", j, Y[0][j].x, Y[0][j].y ); printf("Ycublas[%d] = %f, %f\n", j, Ycublas[j].x, Ycublas[j].y); } } #else for( j=offset;j<m;j++) { if(Y[0][j] != Ycublas[j]) { printf("Y-multi[%d] = %f\n", j, Y[0][j] ); printf("Ycublas[%d] = %f\n", j, Ycublas[j]); } } #endif */ /* ===================================================================== Computing the Difference Cublas VS Magma =================================================================== */ magma_int_t nw = m - offset ; blasf77_saxpy( &nw, &c_neg_one, Y[0] + offset, &incx, Ycublas + offset, &incx); error = lapackf77_slange( "M", &nw, &ione, Ycublas + offset, &nw, work ); #if 0 printf( "\t\t %8.6e", error / m ); fprintf( fp, "\t\t %8.6e", error / m ); /* * Extra check with cblas vs magma */ cblas_scopy( m, Y, incx, Ycublas, incx ); cblas_ssymv( CblasColMajor, CblasLower, m, (alpha), A, LDA, X, incx, (beta), Ycublas, incx ); blasf77_saxpy( &m, &c_neg_one, Ymagma, &incx, Ycublas, &incx); error = lapackf77_slange( "M", &m, &ione, Ycublas, &m, work ); #endif printf( "\t\t %8.6e", error / m ); fprintf( fp, "\t\t %8.6e", error / m ); #endif printf("\n"); fprintf(fp, "\n"); } fclose( fp ) ; /* Free Memory */ TESTING_FREE( A ); TESTING_FREE( X ); TESTING_FREE( Ycublas ); TESTING_FREE( Ymagma ); TESTING_FREE( C_work ); TESTING_DEVFREE( dA ); TESTING_DEVFREE( dYcublas ); for(i=0; i<num_gpus; i++) { TESTING_FREE( Y[i] ); magma_setdevice(i); TESTING_DEVFREE( d_lA[i] ); TESTING_DEVFREE( dX[i] ); TESTING_DEVFREE( dY[i] ); TESTING_DEVFREE( dC_work[i] ); } magma_setdevice(0); /////////////////////////////////////////////////////////// /* Free device */ TESTING_FINALIZE(); #endif return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing magma_ssymm_mgpu */ int main( int argc, char** argv) { TESTING_INIT(); float c_neg_one = MAGMA_S_NEG_ONE; float calpha = MAGMA_S_MAKE( 3.456, 5.678 ); float cbeta = MAGMA_S_MAKE( 1.234, 2.456 ); real_Double_t gflops, gpu_perf=0., cpu_perf=0., gpu_time=0., cpu_time=0.; real_Double_t gpu_perf2=0., gpu_time2=0.; float error=0., errorbis=0., work[1]; float *hA, *hX, *hB, *hR; float *dA[MagmaMaxGPUs], *dX[MagmaMaxGPUs], *dB[MagmaMaxGPUs], *dwork[MagmaMaxGPUs], *hwork[MagmaMaxGPUs+1]; float *dA2; magma_int_t M, N, size, lda, ldda, msize, nb, nstream; magma_int_t ione = 1; magma_int_t iseed[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); // default values nb = (opts.nb > 0 ? opts.nb : 64); nstream = (opts.nstream > 0 ? opts.nstream : 2); magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2]; magma_int_t nbcmplx = 0; magma_buildconnection_mgpu(gnode, &nbcmplx, opts.ngpu); printf("Initializing communication pattern... GPU-ncmplx %d\n\n", (int) nbcmplx); for (int i=0; i < nbcmplx; ++i) { int myngpu = gnode[i][MagmaMaxGPUs]; printf("cmplx %d has %d gpu ", i, myngpu); for(int j=0; j < myngpu; ++j) printf(" %d", (int) gnode[i][j]); printf("\n"); } magma_int_t nbevents = 2; magma_queue_t streams[MagmaMaxGPUs][20]; magma_event_t redevents[MagmaMaxGPUs][20]; magma_event_t redevents2[MagmaMaxGPUs][MagmaMaxGPUs*MagmaMaxGPUs+10]; for( int d = 0; d < opts.ngpu; ++d ) { for( magma_int_t i = 0; i < nstream; ++i ) { magma_queue_create( &streams[d][i] ); } for( magma_int_t i = 0; i < nbevents; ++i ) { cudaEventCreateWithFlags(&redevents[d][i], cudaEventDisableTiming); cudaEventCreateWithFlags(&redevents2[d][i], cudaEventDisableTiming); } } printf( "nb %d, ngpu %d, nstream %d version %d\n", (int) nb, (int) opts.ngpu, (int) nstream, (int) opts.version ); printf(" M N nb offset CPU GFlop/s (sec) GPU GFlop/s (sec) CUBLAS hemm (sec) ||R|| / ||A||*||X||\n"); printf("=========================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { M = opts.msize[itest]; N = opts.nsize[itest]; for( int offset = 0; offset < N; offset += min(N,nb) ) { for( int iter = 0; iter < opts.niter; ++iter ) { msize = M - offset; lda = M; ldda = ((M + 31)/32)*32; size = lda*M; gflops = FLOPS_SSYMM( MagmaLeft, (float)msize, (float)N ) / 1e9; magma_int_t dworksiz = ldda*N*3; magma_int_t hworksiz = lda*N; TESTING_MALLOC_CPU( hA, float, lda*M ); TESTING_MALLOC_CPU( hX, float, lda*N ); TESTING_MALLOC_CPU( hB, float, lda*N ); TESTING_MALLOC_PIN( hR, float, lda*N ); for( int d = 0; d < opts.ngpu; ++d ) { magma_int_t mlocal = ((M / nb) / opts.ngpu + 1) * nb; magma_setdevice( d ); TESTING_MALLOC_DEV( dA[d], float, ldda*mlocal ); TESTING_MALLOC_DEV( dX[d], float, ldda*N ); TESTING_MALLOC_DEV( dB[d], float, ldda*N ); TESTING_MALLOC_DEV( dwork[d], float, dworksiz ); TESTING_MALLOC_PIN( hwork[d], float, hworksiz ); } TESTING_MALLOC_PIN( hwork[opts.ngpu], float, lda*N ); if ( opts.check ) { magma_setdevice( 0 ); TESTING_MALLOC_DEV( dA2, float, ldda*M ); } lapackf77_slarnv( &ione, iseed, &size, hA ); magma_smake_symmetric( M, hA, lda ); size = lda*N; lapackf77_slarnv( &ione, iseed, &size, hX ); lapackf77_slarnv( &ione, iseed, &size, hB ); lapackf77_slacpy( "Full", &M, &N, hB, &lda, hR, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_ssetmatrix_1D_col_bcyclic( M, M, hA, lda, dA, ldda, opts.ngpu, nb ); for( int d = 0; d < opts.ngpu; ++d ) { magma_setdevice( d ); //magmablasSetKernelStream( streams[ d ][ 0 ] ); magma_ssetmatrix( M, N, hX, lda, dX[d], ldda ); //if (d == 0) magma_ssetmatrix( M, N, hB, lda, dB[d], ldda ); // this is wrong coz when offset != 0 the gpu who do the beta*C may be not 0 so this should be related to stdev(starting device who own i=0 first col) magma_ssetmatrix( M, N, hB, lda, dB[d], ldda ); } //memset(hR, 0, lda*N*sizeof(float)); //trace_init( 1, opts.ngpu, nstream, (magma_queue_t*) streams ); //magma_int_t offset = 0; //nb; gpu_time = magma_sync_wtime(0); magmablas_ssymm_mgpu_com( MagmaLeft, MagmaLower, msize, N, calpha, dA, ldda, offset, dX, ldda, cbeta, dB, ldda, dwork, dworksiz, hR, lda, hwork, hworksiz, opts.ngpu, nb, streams, nstream, redevents2, nbevents, gnode, nbcmplx); gpu_time = magma_sync_wtime(0) - gpu_time; gpu_perf = gflops / gpu_time; #ifdef TRACING char buf[80]; snprintf( buf, sizeof(buf), "ssymm-m%d-n%d-nb%d-stream%d-ngpu%d-run%d.svg", (int) M, (int) N, (int) nb, (int) nstream, (int) opts.ngpu, (int) j ); trace_finalize( buf, "trace.css" ); #endif /* ==================================================================== Performs operation using CUBLAS =================================================================== */ if ( opts.check && iter == 0 ) { magma_setdevice( 0 ); magmablasSetKernelStream( 0 ); magma_ssetmatrix( M, M, hA, lda, dA2, ldda ); magma_ssetmatrix( M, N, hX, lda, dX[0], ldda ); magma_ssetmatrix( M, N, hB, lda, dwork[0], ldda ); gpu_time2 = magma_sync_wtime(0); magma_ssymm( MagmaLeft, MagmaLower, msize, N, calpha, dA2+offset*ldda+offset, ldda, dX[0], ldda, cbeta, dwork[0], ldda ); gpu_time2 = magma_sync_wtime(0) - gpu_time2; gpu_perf2 = gflops / gpu_time2; } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.check ) { // store ||A||*||X|| errorbis = lapackf77_slange("fro", &msize, &msize, hA+offset*lda+offset, &lda, work ); errorbis *= lapackf77_slange("fro", &msize, &N, hX, &lda, work ); //printf( "A =" ); magma_sprint( M, M, hA, lda ); //printf( "X =" ); magma_sprint( M, N, hX, lda ); //printf( "B =" ); magma_sprint( M, N, hB, lda ); cpu_time = magma_wtime(); blasf77_ssymm( "Left", "Lower", &msize, &N, &calpha, hA+offset*lda+offset, &lda, hX, &lda, &cbeta, hB, &lda ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* trace_file = fopen("AJETE/C", "w"); for (int j = 0; j < N; j++) for (int i = 0; i < siz; i++) fprintf(trace_file, "%10d%10d%40.30e\n", i+1, j+1, hB[j*lda+i]); fclose(trace_file); */ magma_int_t firstprint=0; for(magma_int_t dev=0; dev < opts.ngpu; ++dev) { magma_setdevice( dev ); magma_sgetmatrix( M, N, dB[dev], ldda, hR, lda ); // compute relative error ||R||/||A||*||X||, where R := B_magma - B_lapack = R - B size = lda*N; blasf77_saxpy( &size, &c_neg_one, hB, &ione, hR, &ione ); error = lapackf77_slange("fro", &msize, &N, hR, &lda, work) / errorbis; //printf( "R =" ); magma_sprint( M, N, hR, lda ); if (firstprint == 0) { printf( "%5d %5d %5d %5d %7.1f (%7.4f) %7.1f (%7.4f) %7.1f (%7.4f) %8.2e %s\n", (int) M, (int) N, (int) nb, (int) offset, cpu_perf, cpu_time, gpu_perf, gpu_time, gpu_perf2, gpu_time2, error, (error < tol ? "ok" : "failed") ); } else { printf( "%89s %8.2e %s\n", " ", error, (error < tol ? "ok" : "failed") ); } status += ! (error < tol); firstprint =1; } } else { printf( "%5d %5d %5d %5d --- ( --- ) %7.1f (%7.4f) --- ( --- ) ---\n", (int) M, (int) N, (int) nb, (int) offset, gpu_perf, gpu_time ); } TESTING_FREE_CPU( hA ); TESTING_FREE_CPU( hX ); TESTING_FREE_CPU( hB ); TESTING_FREE_PIN( hR ); for( int d = 0; d < opts.ngpu; ++d ) { magma_setdevice( d ); TESTING_FREE_DEV( dA[d] ); TESTING_FREE_DEV( dX[d] ); TESTING_FREE_DEV( dB[d] ); TESTING_FREE_DEV( dwork[d] ); TESTING_FREE_PIN( hwork[d] ); } TESTING_FREE_PIN( hwork[opts.ngpu] ); if ( opts.check ) { magma_setdevice( 0 ); TESTING_FREE_DEV( dA2 ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } // offset printf( "\n" ); } for( int d = 0; d < opts.ngpu; ++d ) { magma_setdevice( d ); for( magma_int_t i = 0; i < nstream; ++i ) { magma_queue_destroy( streams[d][i] ); } for( magma_int_t i = 0; i < nbevents; ++i ) { magma_event_destroy( redevents[d][i] ); magma_event_destroy( redevents2[d][i] ); } } TESTING_FINALIZE(); return status; }