/** Purpose ------- CGETRF_m computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix may exceed the GPU memory. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Note: The factorization of big panel is done calling multiple-gpu-interface. Pivots are applied on GPU within the big panel. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] ipiv INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_cgesv_comp ********************************************************************/ extern "C" magma_int_t magma_cgetrf_m( magma_int_t ngpu, magma_int_t m, magma_int_t n, magmaFloatComplex *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info) { #define A(i,j) (A + (j)*lda + (i)) #define dAT(d,i,j) (dAT[d] + (i)*nb*ldn_local + (j)*nb) #define dPT(d,i,j) (dPT[d] + (i)*nb*nb + (j)*nb*maxm) magma_timer_t time=0, time_total=0, time_alloc=0, time_set=0, time_get=0, time_comp=0; timer_start( time_total ); real_Double_t flops; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *dAT[MagmaMaxGPUs], *dA[MagmaMaxGPUs], *dPT[MagmaMaxGPUs]; magma_int_t iinfo = 0, nb, nbi, maxm, n_local[MagmaMaxGPUs], ldn_local; magma_int_t N, M, NB, NBk, I, d, ngpu0 = ngpu; magma_int_t ii, jj, h, offset, ib, rows; magma_queue_t stream[MagmaMaxGPUs][2]; magma_event_t event[MagmaMaxGPUs][2]; *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); /* initialize nb */ nb = magma_get_cgetrf_nb(m); maxm = ((m + 31)/32)*32; /* figure out NB */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(magmaFloatComplex); /* number of columns in the big panel */ h = 1+(2+ngpu0); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); const char* ngr_nb_char = getenv("MAGMA_NGR_NB"); if ( ngr_nb_char != NULL ) NB = max( nb, min( NB, atoi(ngr_nb_char) ) ); //NB = 5*max(nb,32); if ( ngpu0 > ceil((float)NB/nb) ) { ngpu = (int)ceil((float)NB/nb); h = 1+(2+ngpu); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); } else { ngpu = ngpu0; } if ( ngpu*NB >= n ) { #ifdef CHECK_CGETRF_OOC printf( " * still fit in GPU memory.\n" ); #endif NB = n; } else { #ifdef CHECK_CGETRF_OOC printf( " * don't fit in GPU memory.\n" ); #endif NB = ngpu*NB; NB = max( nb, (NB / nb) * nb); /* making sure it's devisable by nb (x64) */ } #ifdef CHECK_CGETRF_OOC if ( NB != n ) printf( " * running in out-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n", n, NB, nb, (float)freeMem ); else printf( " * running in in-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n", n, NB, nb, (float)freeMem ); #endif if ( (nb <= 1) || (nb >= min(m,n)) ) { /* Use CPU code for scalar of one tile. */ lapackf77_cgetrf(&m, &n, A, &lda, ipiv, info); } else { /* Use hybrid blocked code. */ /* allocate memory on GPU to store the big panel */ timer_start( time_alloc ); n_local[0] = (NB/nb)/ngpu; if ( NB%(nb*ngpu) != 0 ) n_local[0]++; n_local[0] *= nb; ldn_local = ((n_local[0]+31)/32)*32; for( d=0; d < ngpu; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_cmalloc( &dA[d], (ldn_local+h*nb)*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dPT[d] = dA[d] + nb*maxm; /* for storing the previous panel from CPU */ dAT[d] = dA[d] + h*nb*maxm; /* for storing the big panel */ magma_queue_create( &stream[d][0] ); magma_queue_create( &stream[d][1] ); magma_event_create( &event[d][0] ); magma_event_create( &event[d][1] ); } //magma_setdevice(0); timer_stop( time_alloc ); for( I=0; I < n; I += NB ) { M = m; N = min( NB, n-I ); /* number of columns in this big panel */ //s = min( max(m-I,0), N )/nb; /* number of small block-columns in this big panel */ maxm = ((M + 31)/32)*32; if ( ngpu0 > ceil((float)N/nb) ) { ngpu = (int)ceil((float)N/nb); } else { ngpu = ngpu0; } for( d=0; d < ngpu; d++ ) { n_local[d] = ((N/nb)/ngpu)*nb; if (d < (N/nb)%ngpu) n_local[d] += nb; else if (d == (N/nb)%ngpu) n_local[d] += N%nb; } ldn_local = ((n_local[0]+31)/32)*32; /* upload the next big panel into GPU, transpose (A->A'), and pivot it */ timer_start( time ); magmablas_csetmatrix_transpose_mgpu(ngpu, stream, A(0,I), lda, dAT, ldn_local, dA, maxm, M, N, nb); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } time_set += timer_stop( time ); timer_start( time ); /* == --------------------------------------------------------------- == */ /* == loop around the previous big-panels to update the new big-panel == */ for( offset = 0; offset < min(m,I); offset += NB ) { NBk = min( m-offset, NB ); /* start sending the first tile from the previous big-panels to gpus */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); nbi = min( nb, NBk ); magma_csetmatrix_async( (M-offset), nbi, A(offset,offset), lda, dA[d], (maxm-offset), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][0] ); /* transpose */ magmablas_ctranspose( M-offset, nbi, dA[d], maxm-offset, dPT(d,0,0), nb ); } /* applying the pivot from the previous big-panel */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magmablas_claswp_q( ldn_local, dAT(d,0,0), ldn_local, offset+1, offset+NBk, ipiv, 1, stream[d][1] ); } /* == going through each block-column of previous big-panels == */ for( jj=0, ib=offset/nb; jj < NBk; jj += nb, ib++ ) { ii = offset+jj; rows = maxm - ii; nbi = min( nb, NBk-jj ); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); /* wait for a block-column on GPU */ magma_queue_sync( stream[d][0] ); /* start sending next column */ if ( jj+nb < NBk ) { magma_csetmatrix_async( (M-ii-nb), min(nb,NBk-jj-nb), A(ii+nb,ii+nb), lda, dA[d], (rows-nb), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][(1+jj/nb)%2] ); /* transpose next column */ magmablas_ctranspose( M-ii-nb, nb, dA[d], rows-nb, dPT(d,0,(1+jj/nb)%2), nb ); } /* update with the block column */ magmablasSetKernelStream(stream[d][1]); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n_local[d], nbi, c_one, dPT(d,0,(jj/nb)%2), nb, dAT(d,ib,0), ldn_local ); if ( M > ii+nb ) { magma_cgemm( MagmaNoTrans, MagmaNoTrans, n_local[d], M-(ii+nb), nbi, c_neg_one, dAT(d,ib,0), ldn_local, dPT(d,1,(jj/nb)%2), nb, c_one, dAT(d,ib+1,0), ldn_local ); } magma_event_record( event[d][(jj/nb)%2], stream[d][1] ); } /* end of for each block-columns in a big-panel */ } } /* end of for each previous big-panels */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } /* calling magma-gpu interface to panel-factorize the big panel */ if ( M > I ) { magma_cgetrf2_mgpu(ngpu, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, A(0,I), lda, stream, &iinfo); if ( iinfo < 0 ) { *info = iinfo; break; } else if ( iinfo != 0 ) { *info = iinfo + I * NB; //break; } /* adjust pivots */ for( ii=I; ii < min(I+N,m); ii++ ) ipiv[ii] += I; } time_comp += timer_stop( time ); /* download the current big panel to CPU */ timer_start( time ); magmablas_cgetmatrix_transpose_mgpu(ngpu, stream, dAT, ldn_local, A(0,I), lda, dA, maxm, M, N, nb); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } time_get += timer_stop( time ); } /* end of for */ timer_stop( time_total ); flops = FLOPS_CGETRF( m, n ) / 1e9; timer_printf(" memory-allocation time: %e\n", time_alloc ); timer_printf(" NB=%d nb=%d\n", (int) NB, (int) nb ); timer_printf(" memcopy and transpose %e seconds\n", time_set ); timer_printf(" total time %e seconds\n", time_total ); timer_printf(" Performance %f GFlop/s, %f seconds without htod and dtoh\n", flops / (time_comp), time_comp ); timer_printf(" Performance %f GFlop/s, %f seconds with htod\n", flops / (time_comp + time_set), time_comp + time_set ); timer_printf(" Performance %f GFlop/s, %f seconds with dtoh\n", flops / (time_comp + time_get), time_comp + time_get ); timer_printf(" Performance %f GFlop/s, %f seconds without memory-allocation\n", flops / (time_total - time_alloc), time_total - time_alloc ); for( d=0; d < ngpu0; d++ ) { magma_setdevice(d); magma_free( dA[d] ); magma_event_destroy( event[d][0] ); magma_event_destroy( event[d][1] ); magma_queue_destroy( stream[d][0] ); magma_queue_destroy( stream[d][1] ); } magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); } if ( *info >= 0 ) magma_cgetrf_piv(m, n, NB, A, lda, ipiv, info); return *info; } /* magma_cgetrf_m */
/* //////////////////////////////////////////////////////////////////////////// -- Testing ctrsm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time=0, cublas_perf, cublas_time, cpu_perf=0, cpu_time=0; float magma_error, cublas_error, work[1]; magma_int_t M, N, info; magma_int_t Ak; magma_int_t sizeA, sizeB; magma_int_t lda, ldb, ldda, lddb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t *ipiv; magmaFloatComplex *h_A, *h_B, *h_Bcublas, *h_Bmagma, *h_B1, *h_X1, *h_X2; magmaFloatComplex *d_A, *d_B; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex alpha = MAGMA_C_MAKE( 0.29, -0.86 ); magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("side = %s, uplo = %s, transA = %s, diag = %s \n", lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag) ); printf(" M N MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA error CUBLAS error\n"); printf("==================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; gflops = FLOPS_CTRSM(opts.side, M, N) / 1e9; if ( opts.side == MagmaLeft ) { lda = M; Ak = M; } else { lda = N; Ak = N; } ldb = M; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; sizeA = lda*Ak; sizeB = ldb*N; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*Ak ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( h_B1, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( h_X1, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( h_X2, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( h_Bcublas, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( h_Bmagma, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, Ak ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*Ak ); TESTING_MALLOC_DEV( d_B, magmaFloatComplex, lddb*N ); /* Initialize the matrices */ /* Factor A into LU to get well-conditioned triangular matrix. * Copy L to U, since L seems okay when used with non-unit diagonal * (i.e., from U), while U fails when used with unit diagonal. */ lapackf77_clarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_cgetrf( &Ak, &Ak, h_A, &lda, ipiv, &info ); for( int j = 0; j < Ak; ++j ) { for( int i = 0; i < j; ++i ) { *h_A(i,j) = *h_A(j,i); } } lapackf77_clarnv( &ione, ISEED, &sizeB, h_B ); memcpy(h_B1, h_B, sizeB*sizeof(magmaFloatComplex)); /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ magma_csetmatrix( Ak, Ak, h_A, lda, d_A, ldda ); magma_csetmatrix( M, N, h_B, ldb, d_B, lddb ); magma_time = magma_sync_wtime( NULL ); magmablas_ctrsm( opts.side, opts.uplo, opts.transA, opts.diag, M, N, alpha, d_A, ldda, d_B, lddb ); magma_time = magma_sync_wtime( NULL ) - magma_time; magma_perf = gflops / magma_time; magma_cgetmatrix( M, N, d_B, lddb, h_Bmagma, ldb ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_csetmatrix( M, N, h_B, ldb, d_B, lddb ); cublas_time = magma_sync_wtime( NULL ); cublasCtrsm( handle, cublas_side_const(opts.side), cublas_uplo_const(opts.uplo), cublas_trans_const(opts.transA), cublas_diag_const(opts.diag), M, N, &alpha, d_A, ldda, d_B, lddb ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_cgetmatrix( M, N, d_B, lddb, h_Bcublas, ldb ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_ctrsm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &alpha, h_A, &lda, h_B, &ldb ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ // ||b - Ax|| / (||A||*||x||) memcpy(h_X1, h_Bmagma, sizeB*sizeof(magmaFloatComplex)); magmaFloatComplex alpha2 = MAGMA_C_DIV( c_one, alpha ); blasf77_ctrmm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &alpha2, h_A, &lda, h_X1, &ldb ); blasf77_caxpy( &sizeB, &c_neg_one, h_B1, &ione, h_X1, &ione ); float norm1 = lapackf77_clange( "M", &M, &N, h_X1, &ldb, work ); float normx = lapackf77_clange( "M", &M, &N, h_Bmagma, &ldb, work ); float normA = lapackf77_clange( "M", &Ak, &Ak, h_A, &lda, work ); magma_error = norm1/(normx*normA); memcpy(h_X2, h_Bcublas, sizeB*sizeof(magmaFloatComplex)); blasf77_ctrmm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &alpha2, h_A, &lda, h_X2, &ldb ); blasf77_caxpy( &sizeB, &c_neg_one, h_B1, &ione, h_X2, &ione ); norm1 = lapackf77_clange( "M", &M, &N, h_X2, &ldb, work ); normx = lapackf77_clange( "M", &M, &N, h_Bcublas, &ldb, work ); normA = lapackf77_clange( "M", &Ak, &Ak, h_A, &lda, work ); cublas_error = norm1/(normx*normA); if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error, (magma_error < tol && cublas_error < tol? "ok" : "failed")); status += ! (magma_error < tol && cublas_error < tol); } else { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) --- ( --- ) %8.2e %8.2e %s\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, magma_error, cublas_error, (magma_error < tol && cublas_error < tol? "ok" : "failed")); status += ! (magma_error < tol && cublas_error < tol); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_B1 ); TESTING_FREE_CPU( h_X1 ); TESTING_FREE_CPU( h_X2 ); TESTING_FREE_CPU( h_Bcublas ); TESTING_FREE_CPU( h_Bmagma ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgetrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float error; magmaFloatComplex *h_A; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, info, min_mn; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 2 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |PA-LU|/(N*|A|)\n"); } printf("=========================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_CGETRF( M, N ) / 1e9; TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_PIN( h_A, magmaFloatComplex, n2 ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { init_matrix( M, N, h_A, lda ); cpu_time = magma_wtime(); lapackf77_cgetrf(&M, &N, h_A, &lda, ipiv, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( M, N, h_A, lda ); gpu_time = magma_wtime(); magma_cgetrf( M, N, h_A, lda, ipiv, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 2 ) { error = get_residual( M, N, h_A, lda, ipiv ); printf(" %8.2e%s\n", error, (error < tol ? "" : " failed")); status |= ! (error < tol); } else if ( opts.check ) { error = get_LU_error( M, N, h_A, lda, ipiv ); printf(" %8.2e%s\n", error, (error < tol ? "" : " failed")); status |= ! (error < tol); } else { printf(" --- \n"); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_PIN( h_A ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_cgetrf_mgpu(magma_int_t num_gpus, magma_int_t m, magma_int_t n, magmaFloatComplex **d_lA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments ========= NUM_GPUS (input) INTEGER The number of GPUS to be used for the factorization. M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. LDDA (input) INTEGER The leading dimension of the array A. LDDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ #define inAT(id,i,j) (d_lAT[(id)] + (i)*nb*lddat + (j)*nb) magma_int_t nb, n_local[MagmaMaxGPUs]; magma_int_t maxm, mindim; magma_int_t i, j, d, lddat, lddwork; magmaFloatComplex *d_lAT[MagmaMaxGPUs]; magmaFloatComplex *d_panel[MagmaMaxGPUs], *work; magma_queue_t streaml[MagmaMaxGPUs][2]; /* Check arguments */ *info = 0; if (m < 0) *info = -2; else if (n < 0) *info = -3; else if (ldda < max(1,m)) *info = -5; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); nb = magma_get_cgetrf_nb(m); if (nb <= 1 || nb >= n) { /* Use CPU code. */ magma_cmalloc_cpu( &work, m * n ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_cgetmatrix( m, n, d_lA[0], ldda, work, m ); lapackf77_cgetrf(&m, &n, work, &m, ipiv, info); magma_csetmatrix( m, n, work, m, d_lA[0], ldda ); magma_free_cpu(work); } else { /* Use hybrid blocked code. */ maxm = ((m + 31)/32)*32; if( num_gpus > ceil((float)n/nb) ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) num_gpus ); *info = -1; return *info; } /* allocate workspace for each GPU */ lddat = ((((((n+nb-1)/nb)/num_gpus)*nb)+31)/32)*32; lddat = (n+nb-1)/nb; /* number of block columns */ lddat = (lddat+num_gpus-1)/num_gpus; /* number of block columns per GPU */ lddat = nb*lddat; /* number of columns per GPU */ lddat = ((lddat+31)/32)*32; /* make it a multiple of 32 */ for(i=0; i<num_gpus; i++){ magma_setdevice(i); /* local-n and local-ld */ n_local[i] = ((n/nb)/num_gpus)*nb; if (i < (n/nb)%num_gpus) n_local[i] += nb; else if (i == (n/nb)%num_gpus) n_local[i] += n%nb; /* workspaces */ if (MAGMA_SUCCESS != magma_cmalloc( &d_panel[i], (3+num_gpus)*nb*maxm )) { for( j=0; j<=i; j++ ) { magma_setdevice(j); } for( j=0; j<i; j++ ) { magma_setdevice(j); magma_free( d_panel[j] ); magma_free( d_lAT[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* local-matrix storage */ if (MAGMA_SUCCESS != magma_cmalloc( &d_lAT[i], lddat*maxm )) { for( j=0; j<=i; j++ ) { magma_setdevice(j); magma_free( d_panel[j] ); } for( j=0; j<i; j++ ) { magma_setdevice(j); magma_free( d_lAT[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* create the streams */ magma_queue_create( &streaml[i][0] ); magma_queue_create( &streaml[i][1] ); magmablasSetKernelStream(streaml[i][1]); magmablas_ctranspose2( d_lAT[i], lddat, d_lA[i], ldda, m, n_local[i] ); } for(i=0; i<num_gpus; i++){ magma_setdevice(i); cudaStreamSynchronize(streaml[i][0]); magmablasSetKernelStream(NULL); } magma_setdevice(0); /* cpu workspace */ lddwork = maxm; if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, lddwork*nb*num_gpus )) { for(i=0; i<num_gpus; i++ ) { magma_setdevice(i); magma_free( d_panel[i] ); magma_free( d_lAT[i] ); } *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* calling multi-gpu interface with allocated workspaces and streams */ //magma_cgetrf1_mgpu( num_gpus, m, n, nb, 0, d_lAT, lddat, ipiv, d_panel, work, maxm, // (magma_queue_t **)streaml, info ); magma_cgetrf2_mgpu(num_gpus, m, n, nb, 0, d_lAT, lddat, ipiv, d_panel, work, maxm, streaml, info); /* clean up */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); /* save on output */ magmablas_ctranspose2( d_lA[d], ldda, d_lAT[d], lddat, n_local[d], m ); magma_device_sync(); magma_free( d_lAT[d] ); magma_free( d_panel[d] ); magma_queue_destroy( streaml[d][0] ); magma_queue_destroy( streaml[d][1] ); magmablasSetKernelStream(NULL); } /* end of for d=1,..,num_gpus */ magma_setdevice(0); magma_free_pinned( work ); } return *info; }
/** Purpose ------- CGETRF_INCPIV computes an LU factorization of a general M-by-N tile A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 2.5 BLAS version of the algorithm. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in] ib INTEGER The inner-blocking size. IB >= 0. @param[in,out] hA COMPLEX array, dimension(LDHA, N), on cpu. On entry, only the M-by-IB first panel needs to be identical to dA(1..M, 1..IB). On exit, the content is incomplete. Shouldn't be used. @param[in] ldha INTEGER The leading dimension of the array hA. LDHA >= max(1,M). @param[in,out] dA COMPLEX array, dimension(LDDA, N), on gpu. On entry, the M-by-N tile to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). @param[out] hL COMPLEX array, dimension(LDHL, min(M,N)), on vpu. On exit, contains in the upper part the IB-by-K lower triangular tile, and in the lower part IB-by-min(M,N) the inverse of the top part. @param[in] ldhl INTEGER The leading dimension of the array hL. LDHL >= max(1,2*IB). @param[out] dL COMPLEX array, dimension(LDDL, K), on gpu. On exit, contains in the upper part the IB-by-min(M,N) lower triangular tile, and in the lower part IB-by-min(M,N) the inverse of the top part. @param[in] lddl INTEGER The leading dimension of the array dL. LDDL >= max(1,2*IB). @param[out] ipiv INTEGER array, dimension min(M,N), on the cpu. The pivot indices array. @param[out] dWORK COMPLEX array, dimension(LDDWORK, 2*IB), on gpu. Workspace. @param[in] lddwork INTEGER The leading dimension of the array dWORK. LDDWORK >= max(NB, 1). @param[out] info INTEGER - PLASMA_SUCCESS successful exit - < 0 if INFO = -k, the k-th argument had an illegal value - > 0 if INFO = k, U(k,k) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_cgesv_comp ********************************************************************/ extern "C" magma_int_t magma_cgetrf_incpiv_gpu( magma_order_t order, magma_int_t m, magma_int_t n, magma_int_t ib, magmaFloatComplex *hA, magma_int_t ldha, magmaFloatComplex *dA, magma_int_t ldda, magmaFloatComplex *hL, magma_int_t ldhl, magmaFloatComplex *dL, magma_int_t lddl, magma_int_t *ipiv, magmaFloatComplex *dwork, magma_int_t lddwork, magma_int_t *info) { #define AT(i,j) (dAT + (i)*ib*ldda + (j)*ib) #define hA(i,j) (hA + (i)*ib + (j)*ib*ldha) #define hL(j) (hL + (j)*ib*ldhl ) #define hL2(j) (hL2 + (j)*ib*ldhl ) #define dL(j) (dL + (j)*ib*lddl ) #define dL2(j) (dL2 + (j)*ib*lddl ) magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t iinfo; magma_int_t maxm, mindim; magma_int_t i, rows, cols, s, ii, sb; magmaFloatComplex *dAT; #ifndef WITHOUTTRTRI magmaFloatComplex *dL2 = dL + ib; magmaFloatComplex *hL2 = hL + ib; #endif /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); s = mindim / ib; if ( ib >= mindim ) { /* Use CPU code. */ lapackf77_cgetrf(&m, &n, hA, &ldha, ipiv, info); #ifndef WITHOUTTRTRI CORE_clacpy(PlasmaUpperLower, mindim, mindim, (PLASMA_Complex32_t*)hA, ldha, (PLASMA_Complex32_t*)hL2, ldhl ); CORE_ctrtri( PlasmaLower, PlasmaUnit, mindim, (PLASMA_Complex32_t*)hL2, ldhl, info ); if (*info != 0 ) { fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info); } magma_csetmatrix( mindim, mindim, hL2, ldhl, dL2, lddl ); #endif if ( order == MagmaRowMajor ) { magma_csetmatrix( m, n, hA, ldha, dwork, lddwork ); magmablas_ctranspose( m, n, dwork, lddwork, dA, ldda ); } else { magma_csetmatrix( m, n, hA, ldha, dA, ldda ); } } else { /* Use hybrid blocked code. */ maxm = ((m + 31)/32)*32; if ( order == MagmaColMajor ) { magmablas_cgetmo_in( dA, dAT, ldda, m, n ); } else { dAT = dA; } for( i=0; i < s; i++ ) { ii = i * ib; sb = min(ib, mindim-ii); cols = maxm - ii; if ( i > 0 ) { // download i-th panel magmablas_ctranspose( sb, m, AT(0,i), ldda, dwork, maxm ); magma_cgetmatrix( m, sb, dwork, maxm, hA(0, i), ldha ); // make sure that gpu queue is empty //magma_device_sync(); #ifndef WITHOUTTRTRI magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n - (ii+sb), ib, c_one, dL2(i-1), lddl, AT(i-1,i+1), ldda ); #else magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n - (ii+sb), ib, c_one, AT(i-1,i-1), ldda, AT(i-1,i+1), ldda ); #endif magma_cgemm( MagmaNoTrans, MagmaNoTrans, n-(ii+sb), m-ii, ib, c_neg_one, AT(i-1,i+1), ldda, AT(i, i-1), ldda, c_one, AT(i, i+1), ldda ); } // do the cpu part rows = m - ii; lapackf77_cgetrf( &rows, &sb, hA(i, i), &ldha, ipiv+ii, &iinfo); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + ii; { int j; int fin = ii + sb; for (j=ii; j < fin; j++) { ipiv[j] = ii + ipiv[j]; } } magmablas_claswp( n-ii, AT(0, i), ldda, ii+1, ii+sb, ipiv, 1 ); #ifndef WITHOUTTRTRI CORE_clacpy(PlasmaLower, sb, sb, (PLASMA_Complex32_t*)hA(i, i), ldha, (PLASMA_Complex32_t*)hL2(i), ldhl ); CORE_ctrtri( PlasmaLower, PlasmaUnit, sb, (PLASMA_Complex32_t*)hL2(i), ldhl, info ); if (*info != 0 ) { fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info); } magma_csetmatrix( sb, sb, hL2(i), ldhl, dL2(i), lddl ); #endif // upload i-th panel magma_csetmatrix( rows, sb, hA(i, i), ldha, dwork, cols ); magmablas_ctranspose( rows, sb, dwork, cols, AT(i,i), ldda ); // do the small non-parallel computations if ( s > (i+1) ) { #ifndef WITHOUTTRTRI magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, sb, sb, c_one, dL2(i), lddl, AT(i, i+1), ldda); #else magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, sb, sb, c_one, AT(i, i ), ldda, AT(i, i+1), ldda); #endif magma_cgemm( MagmaNoTrans, MagmaNoTrans, sb, m-(ii+sb), sb, c_neg_one, AT(i, i+1), ldda, AT(i+1, i ), ldda, c_one, AT(i+1, i+1), ldda ); } else { /* Update of the last panel */ #ifndef WITHOUTTRTRI magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n-mindim, sb, c_one, dL2(i), lddl, AT(i, i+1), ldda); #else magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-mindim, sb, c_one, AT(i, i ), ldda, AT(i, i+1), ldda); #endif /* m-(ii+sb) should be always 0 */ magma_cgemm( MagmaNoTrans, MagmaNoTrans, n-mindim, m-(ii+sb), sb, c_neg_one, AT(i, i+1), ldda, AT(i+1, i ), ldda, c_one, AT(i+1, i+1), ldda ); } } if ( order == MagmaColMajor ) { magmablas_cgetmo_out( dA, dAT, ldda, m, n ); } } return *info; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing ctrsm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cublas_perf, cublas_time, cpu_perf=0, cpu_time=0; float cublas_error, normA, normx, normr, work[1]; magma_int_t N, info; magma_int_t sizeA; magma_int_t lda, ldda; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t *ipiv; magmaFloatComplex *h_A, *h_b, *h_x, *h_xcublas; magmaFloatComplex *d_A, *d_x; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_opts opts; parse_opts( argc, argv, &opts ); printf("uplo = %c, transA = %c, diag = %c\n", opts.uplo, opts.transA, opts.diag ); printf(" N CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n"); printf("============================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; gflops = FLOPS_CTRSM(opts.side, N, 1) / 1e9; lda = N; ldda = ((lda+31)/32)*32; sizeA = lda*N; TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*N ); TESTING_MALLOC_CPU( h_b, magmaFloatComplex, N ); TESTING_MALLOC_CPU( h_x, magmaFloatComplex, N ); TESTING_MALLOC_CPU( h_xcublas, magmaFloatComplex, N ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( d_x, magmaFloatComplex, N ); /* Initialize the matrices */ /* Factor A into LU to get well-conditioned triangular matrix. * Copy L to U, since L seems okay when used with non-unit diagonal * (i.e., from U), while U fails when used with unit diagonal. */ lapackf77_clarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_cgetrf( &N, &N, h_A, &lda, ipiv, &info ); for( int j = 0; j < N; ++j ) { for( int i = 0; i < j; ++i ) { *h_A(i,j) = *h_A(j,i); } } lapackf77_clarnv( &ione, ISEED, &N, h_b ); blasf77_ccopy( &N, h_b, &ione, h_x, &ione ); /* ===================================================================== Performs operation using CUDA-BLAS =================================================================== */ magma_csetmatrix( N, N, h_A, lda, d_A, ldda ); magma_csetvector( N, h_x, 1, d_x, 1 ); cublas_time = magma_sync_wtime( NULL ); cublasCtrsv( opts.uplo, opts.transA, opts.diag, N, d_A, ldda, d_x, 1 ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_cgetvector( N, d_x, 1, h_xcublas, 1 ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_ctrsv( &opts.uplo, &opts.transA, &opts.diag, &N, h_A, &lda, h_x, &ione ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ // ||b - Ax|| / (||A||*||x||) // error for CUBLAS normA = lapackf77_clange( "F", &N, &N, h_A, &lda, work ); normx = lapackf77_clange( "F", &N, &ione, h_xcublas, &ione, work ); blasf77_ctrmv( &opts.uplo, &opts.transA, &opts.diag, &N, h_A, &lda, h_xcublas, &ione ); blasf77_caxpy( &N, &c_neg_one, h_b, &ione, h_xcublas, &ione ); normr = lapackf77_clange( "F", &N, &ione, h_xcublas, &N, work ); cublas_error = normr / (normA*normx); if ( opts.lapack ) { printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e\n", (int) N, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, cublas_error ); } else { printf("%5d %7.2f (%7.2f) --- ( --- ) %8.2e\n", (int) N, cublas_perf, 1000.*cublas_time, cublas_error ); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_b ); TESTING_FREE_CPU( h_x ); TESTING_FREE_CPU( h_xcublas ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_x ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing ctrsm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, magma_perf=0, magma_time=0, cublas_perf, cublas_time, cpu_perf=0, cpu_time=0; float magma_error=0, cublas_error, lapack_error, work[1]; magma_int_t M, N, info; magma_int_t Ak; magma_int_t sizeA, sizeB; magma_int_t lda, ldb, ldda, lddb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t *ipiv; magmaFloatComplex *h_A, *h_B, *h_Bcublas, *h_Bmagma, *h_Blapack, *h_X; magmaFloatComplex_ptr d_A, d_B; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex alpha = MAGMA_C_MAKE( 0.29, -0.86 ); magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); float tol = opts.tolerance * lapackf77_slamch("E"); // pass ngpu = -1 to test multi-GPU code using 1 gpu magma_int_t abs_ngpu = abs( opts.ngpu ); printf("%% side = %s, uplo = %s, transA = %s, diag = %s, ngpu = %d\n", lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), int(abs_ngpu) ); printf("%% M N MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA CUBLAS LAPACK error\n"); printf("%%============================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; gflops = FLOPS_CTRSM(opts.side, M, N) / 1e9; if ( opts.side == MagmaLeft ) { lda = M; Ak = M; } else { lda = N; Ak = N; } ldb = M; ldda = magma_roundup( lda, opts.align ); // multiple of 32 by default lddb = magma_roundup( ldb, opts.align ); // multiple of 32 by default sizeA = lda*Ak; sizeB = ldb*N; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*Ak ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( h_X, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( h_Blapack, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( h_Bcublas, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( h_Bmagma, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, Ak ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*Ak ); TESTING_MALLOC_DEV( d_B, magmaFloatComplex, lddb*N ); /* Initialize the matrices */ /* Factor A into LU to get well-conditioned triangular matrix. * Copy L to U, since L seems okay when used with non-unit diagonal * (i.e., from U), while U fails when used with unit diagonal. */ lapackf77_clarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_cgetrf( &Ak, &Ak, h_A, &lda, ipiv, &info ); for( int j = 0; j < Ak; ++j ) { for( int i = 0; i < j; ++i ) { *h_A(i,j) = *h_A(j,i); } } lapackf77_clarnv( &ione, ISEED, &sizeB, h_B ); memcpy( h_Blapack, h_B, sizeB*sizeof(magmaFloatComplex) ); magma_csetmatrix( Ak, Ak, h_A, lda, d_A, ldda, opts.queue ); /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ #if defined(HAVE_CUBLAS) magma_csetmatrix( M, N, h_B, ldb, d_B, lddb, opts.queue ); magma_time = magma_sync_wtime( opts.queue ); if (opts.ngpu == 1) { magmablas_ctrsm( opts.side, opts.uplo, opts.transA, opts.diag, M, N, alpha, d_A, ldda, d_B, lddb, opts.queue ); } else { magma_ctrsm_m( abs_ngpu, opts.side, opts.uplo, opts.transA, opts.diag, M, N, alpha, d_A, ldda, d_B, lddb ); } magma_time = magma_sync_wtime( opts.queue ) - magma_time; magma_perf = gflops / magma_time; magma_cgetmatrix( M, N, d_B, lddb, h_Bmagma, ldb, opts.queue ); #endif /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_csetmatrix( M, N, h_B, ldb, d_B, lddb, opts.queue ); cublas_time = magma_sync_wtime( opts.queue ); #if defined(HAVE_CUBLAS) // opts.handle also uses opts.queue cublasCtrsm( opts.handle, cublas_side_const(opts.side), cublas_uplo_const(opts.uplo), cublas_trans_const(opts.transA), cublas_diag_const(opts.diag), M, N, &alpha, d_A, ldda, d_B, lddb ); #elif defined(HAVE_clBLAS) clblasCtrsm( clblasColumnMajor, clblas_side_const(opts.side), clblas_uplo_const(opts.uplo), clblas_trans_const(opts.transA), clblas_diag_const(opts.diag), M, N, alpha, d_A, 0, ldda, d_B, 0, lddb, 1, &opts.queue, 0, NULL, NULL ); #endif cublas_time = magma_sync_wtime( opts.queue ) - cublas_time; cublas_perf = gflops / cublas_time; magma_cgetmatrix( M, N, d_B, lddb, h_Bcublas, ldb, opts.queue ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_ctrsm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &alpha, h_A, &lda, h_Blapack, &ldb ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ // ||b - 1/alpha*A*x|| / (||A||*||x||) magmaFloatComplex inv_alpha = MAGMA_C_DIV( c_one, alpha ); float normR, normX, normA; normA = lapackf77_clange( "M", &Ak, &Ak, h_A, &lda, work ); #if defined(HAVE_CUBLAS) // check magma memcpy( h_X, h_Bmagma, sizeB*sizeof(magmaFloatComplex) ); blasf77_ctrmm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &inv_alpha, h_A, &lda, h_X, &ldb ); blasf77_caxpy( &sizeB, &c_neg_one, h_B, &ione, h_X, &ione ); normR = lapackf77_clange( "M", &M, &N, h_X, &ldb, work ); normX = lapackf77_clange( "M", &M, &N, h_Bmagma, &ldb, work ); magma_error = normR/(normX*normA); #endif // check cublas memcpy( h_X, h_Bcublas, sizeB*sizeof(magmaFloatComplex) ); blasf77_ctrmm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &inv_alpha, h_A, &lda, h_X, &ldb ); blasf77_caxpy( &sizeB, &c_neg_one, h_B, &ione, h_X, &ione ); normR = lapackf77_clange( "M", &M, &N, h_X, &ldb, work ); normX = lapackf77_clange( "M", &M, &N, h_Bcublas, &ldb, work ); cublas_error = normR/(normX*normA); if ( opts.lapack ) { // check lapack // this verifies that the matrix wasn't so bad that it couldn't be solved accurately. memcpy( h_X, h_Blapack, sizeB*sizeof(magmaFloatComplex) ); blasf77_ctrmm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &inv_alpha, h_A, &lda, h_X, &ldb ); blasf77_caxpy( &sizeB, &c_neg_one, h_B, &ione, h_X, &ione ); normR = lapackf77_clange( "M", &M, &N, h_X, &ldb, work ); normX = lapackf77_clange( "M", &M, &N, h_Blapack, &ldb, work ); lapack_error = normR/(normX*normA); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %8.2e %s\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error, lapack_error, (magma_error < tol && cublas_error < tol? "ok" : "failed")); status += ! (magma_error < tol && cublas_error < tol); } else { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) --- ( --- ) %8.2e %8.2e --- %s\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, magma_error, cublas_error, (magma_error < tol && cublas_error < tol ? "ok" : "failed")); status += ! (magma_error < tol && cublas_error < tol); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( h_Blapack ); TESTING_FREE_CPU( h_Bcublas ); TESTING_FREE_CPU( h_Bmagma ); TESTING_FREE_CPU( ipiv ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_cgetrf2_msub( magma_int_t num_subs, magma_int_t ngpu, magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t offset, magmaFloatComplex_ptr *d_lAT, size_t dlAT_offset, magma_int_t lddat, magma_int_t *ipiv, magmaFloatComplex_ptr *d_panel, magmaFloatComplex_ptr *d_lAP, size_t dlAP_offset, magmaFloatComplex *w, magma_int_t ldw, magma_queue_t *queues, magma_int_t *info) { /* -- clMAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Use two buffer to send panels.. Arguments ========= NUM_GPUS (input) INTEGER The number of GPUS to be used for the factorization. M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. LDDA (input) INTEGER The leading dimension of the array A. LDDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ #define d_lAT(id,i,j) d_lAT[(id)], (((offset)+(i)*nb)*lddat + (j)*nb) #define d_lAT_offset(i, j) (((offset)+(i)*nb)*lddat + (j)*nb) #define W(j) (w +((j)%(1+ngpu))*nb*ldw) magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t tot_subs = num_subs * ngpu; magma_int_t block_size = 32; magma_int_t iinfo, maxm, mindim; magma_int_t i, j, d, dd, rows, cols, s; magma_int_t id, j_local, j_local2, nb0, nb1; /* local submatrix info */ magma_int_t ldpan[MagmaMaxSubs * MagmaMaxGPUs], n_local[MagmaMaxSubs * MagmaMaxGPUs]; size_t dpanel_local_offset[MagmaMaxSubs * MagmaMaxGPUs]; magmaFloatComplex_ptr dpanel_local[MagmaMaxSubs * MagmaMaxGPUs]; /* Check arguments */ *info = 0; if (m < 0) *info = -2; else if (n < 0) *info = -3; else if (tot_subs*lddat < max(1,n)) *info = -5; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); if (tot_subs > ceil((float)n/nb)) { *info = -1; return *info; } else { /* Use hybrid blocked code. */ maxm = ((m + block_size-1)/block_size)*block_size; /* some initializations */ for (i=0; i < tot_subs; i++) { n_local[i] = ((n/nb)/tot_subs)*nb; if (i < (n/nb)%tot_subs) n_local[i] += nb; else if (i == (n/nb)%tot_subs) n_local[i] += n%nb; } /* start sending the first panel to cpu */ nb0 = min(mindim, nb); magmablas_ctranspose( nb0, maxm, d_lAT(0,0,0), lddat, d_lAP[0], dlAP_offset, maxm, queues[2*0+1] ); magma_cgetmatrix_async( m, nb0, d_lAP[0], dlAP_offset, maxm, W(0), ldw, queues[2*0+1], NULL ); clFlush(queues[2*0+1]); /* ------------------------------------------------------------------------------------- */ s = mindim / nb; for (j=0; j < s; j++) { /* Set the submatrix ID that holds the current panel */ id = j%tot_subs; /* Set the local index where the current panel is */ j_local = j/tot_subs; // cols for gpu panel cols = maxm - j*nb; // rows for cpu panel rows = m - j*nb; /* synchrnoize j-th panel from id-th gpu into work */ magma_queue_sync( queues[2*(id%ngpu)+1] ); /* j-th panel factorization */ lapackf77_cgetrf( &rows, &nb, W(j), &ldw, ipiv+j*nb, &iinfo); if ((*info == 0) && (iinfo > 0)) { *info = iinfo + j*nb; //break; } /* start sending the panel to all the gpus */ d = (j+1)%ngpu; for (dd=0; dd < ngpu; dd++) { magma_csetmatrix_async( rows, nb, W(j), ldw, d_lAP[d], dlAP_offset+(j%(2+ngpu))*nb*maxm, maxm, queues[2*d+1], NULL ); d = (d+1)%ngpu; } /* apply the pivoting */ for( i=j*nb; i < j*nb + nb; ++i ) { ipiv[i] += j*nb; } d = (j+1)%tot_subs; for (dd=0; dd < tot_subs; dd++) { magmablas_claswp( lddat, d_lAT(d,0,0), lddat, j*nb + 1, j*nb + nb, ipiv, 1, queues[2*(d%ngpu)] ); d = (d+1)%tot_subs; } /* update the trailing-matrix/look-ahead */ d = (j+1)%tot_subs; for (dd=0; dd < tot_subs; dd++) { /* storage for panel */ if (d%ngpu == id%ngpu) { /* the panel belond to this gpu */ dpanel_local[d] = d_lAT[id]; dpanel_local_offset[d] = d_lAT_offset(j, j_local); ldpan[d] = lddat; /* next column */ j_local2 = j_local; if ( d <= id ) j_local2++; } else { /* the panel belong to another gpu */ dpanel_local[d] = d_panel[d%ngpu]; dpanel_local_offset[d] = (j%(2+ngpu))*nb*maxm; ldpan[d] = nb; /* next column */ j_local2 = j_local; if ( d < id ) j_local2++; } /* the size of the next column */ if (s > (j+1)) { nb0 = nb; } else { nb0 = n_local[d]-nb*(s/tot_subs); if (d < s%tot_subs) nb0 -= nb; } if (d == (j+1)%tot_subs) { /* owns the next column, look-ahead the column */ nb1 = nb0; } else { /* update the entire trailing matrix */ nb1 = n_local[d] - j_local2*nb; } /* gpu updating the trailing matrix */ if (d == (j+1)%tot_subs) { /* look-ahead, this is executed first (j.e., dd=0) */ magma_queue_sync(queues[2*(d%ngpu)]); /* pivoting done? (overwrite with panel) */ magmablas_ctranspose( cols, nb, d_lAP[d%ngpu], dlAP_offset+(j%(2+ngpu))*nb*maxm, maxm, dpanel_local[d], dpanel_local_offset[d], ldpan[d], queues[2*(d%ngpu)+1] ); magma_queue_sync(queues[2*(d%ngpu)+1]); /* panel arrived and transposed for remaining update ? */ magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb1, nb, c_one, dpanel_local[d], dpanel_local_offset[d], ldpan[d], d_lAT(d, j, j_local2), lddat, queues[2*(d%ngpu)+1]); magma_cgemm( MagmaNoTrans, MagmaNoTrans, nb1, m-(j+1)*nb, nb, c_neg_one, d_lAT(d, j, j_local2), lddat, dpanel_local[d], dpanel_local_offset[d]+nb*ldpan[d], ldpan[d], c_one, d_lAT(d, j+1, j_local2), lddat, queues[2*(d%ngpu)+1]); } else { /* no look-ahead */ if (dd < ngpu) { /* synch and transpose only the first time */ magma_queue_sync(queues[2*(d%ngpu)+1]); /* panel arrived? */ magmablas_ctranspose( cols, nb, d_lAP[d%ngpu], dlAP_offset+(j%(2+ngpu))*nb*maxm, maxm, dpanel_local[d], dpanel_local_offset[d], ldpan[d], queues[2*(d%ngpu)] ); } magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb1, nb, c_one, dpanel_local[d], dpanel_local_offset[d], ldpan[d], d_lAT(d, j, j_local2), lddat, queues[2*(d%ngpu)]); magma_cgemm( MagmaNoTrans, MagmaNoTrans, nb1, m-(j+1)*nb, nb, c_neg_one, d_lAT(d, j, j_local2), lddat, dpanel_local[d], dpanel_local_offset[d]+nb*ldpan[d], ldpan[d], c_one, d_lAT(d, j+1, j_local2), lddat, queues[2*(d%ngpu)]); } if (d == (j+1)%tot_subs) { /* Set the local index where the current panel is */ int loff = j+1; int j_local = (j+1)/tot_subs; int ldda = maxm - (j+1)*nb; int cols = m - (j+1)*nb; nb0 = min(nb, mindim - (j+1)*nb); /* size of the diagonal block */ if (nb0 > 0) { /* transpose the panel for sending it to cpu */ magmablas_ctranspose( nb0, ldda, d_lAT(d,loff,j_local), lddat, d_lAP[d%ngpu], dlAP_offset + ((j+1)%(2+ngpu))*nb*maxm, ldda, queues[2*(d%ngpu)+1] ); /* send the panel to cpu */ magma_cgetmatrix_async( cols, nb0, d_lAP[d%ngpu], dlAP_offset + ((j+1)%(2+ngpu))*nb*maxm, ldda, W(j+1), ldw, queues[2*(d%ngpu)+1], NULL ); } } else { //trace_gpu_end( d, 0 ); } d = (d+1)%tot_subs; } /* update the remaining matrix by gpu owning the next panel */ if ((j+1) < s) { d = (j+1)%tot_subs; int j_local = (j+1)/tot_subs; int rows = m - (j+1)*nb; magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n_local[d] - (j_local+1)*nb, nb, c_one, dpanel_local[d], dpanel_local_offset[d], ldpan[d], d_lAT(d,j,j_local+1), lddat, queues[2*(d%ngpu)] ); magma_cgemm( MagmaNoTrans, MagmaNoTrans, n_local[d]-(j_local+1)*nb, rows, nb, c_neg_one, d_lAT(d,j,j_local+1), lddat, dpanel_local[d], dpanel_local_offset[d]+nb*ldpan[d], ldpan[d], c_one, d_lAT(d,j+1, j_local+1), lddat, queues[2*(d%ngpu)] ); } } /* end of for j=1..s */ /* ------------------------------------------------------------------------------ */ /* Set the GPU number that holds the last panel */ id = s%tot_subs; /* Set the local index where the last panel is */ j_local = s/tot_subs; /* size of the last diagonal-block */ nb0 = min(m - s*nb, n - s*nb); rows = m - s*nb; cols = maxm - s*nb; if (nb0 > 0) { /* wait for the last panel on cpu */ magma_queue_sync( queues[2*(id%ngpu)+1] ); /* factor on cpu */ lapackf77_cgetrf( &rows, &nb0, W(s), &ldw, ipiv+s*nb, &iinfo ); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + s*nb; /* send the factor to gpus */ for (d=0; d < ngpu; d++) { magma_csetmatrix_async( rows, nb0, W(s), ldw, d_lAP[d], dlAP_offset+(s%(2+ngpu))*nb*maxm, cols, queues[2*d+1], NULL ); } for( i=s*nb; i < s*nb + nb0; ++i ) { ipiv[i] += s*nb; } for (d=0; d < tot_subs; d++) { magmablas_claswp( lddat, d_lAT(d,0,0), lddat, s*nb + 1, s*nb + nb0, ipiv, 1, queues[2*(d%ngpu)] ); } d = id; for (dd=0; dd < tot_subs; dd++) { /* wait for the pivoting to be done */ if (dd < ngpu) { /* synch only the first time */ magma_queue_sync( queues[2*(d%ngpu)] ); } j_local2 = j_local; if (d%ngpu == id%ngpu) { /* the panel belond to this gpu */ dpanel_local[d] = d_lAT[id]; dpanel_local_offset[d] = d_lAT_offset(s, j_local); if (dd < ngpu) { magmablas_ctranspose( rows, nb0, d_lAP[d%ngpu], dlAP_offset+(s%(2+ngpu))*nb*maxm, cols, dpanel_local[d], dpanel_local_offset[d], lddat, queues[2*(d%ngpu)+1] ); } /* size of the "extra" block */ if (d == id) { /* the last diagonal block belongs to this submatrix */ nb1 = nb0; } else if (d < id) { nb1 = nb; } else { nb1 = 0; } if (n_local[d] > j_local*nb+nb1) { magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n_local[d] - (j_local*nb+nb1), nb0, c_one, dpanel_local[d], dpanel_local_offset[d], lddat, d_lAT(d, s, j_local)+nb1, lddat, queues[2*(d%ngpu)+1]); } } else if (n_local[d] > j_local2*nb) { /* the panel belong to another gpu */ dpanel_local[d] = d_panel[d%ngpu]; dpanel_local_offset[d] = (s%(2+ngpu))*nb*maxm; /* next column */ if (d < ngpu) { /* transpose only the first time */ magmablas_ctranspose( rows, nb0, d_lAP[d%ngpu], dlAP_offset+(s%(2+ngpu))*nb*maxm, cols, dpanel_local[d], dpanel_local_offset[d], nb, queues[2*(d%ngpu)+1] ); } if (d < id) j_local2++; nb1 = n_local[d] - j_local2*nb; magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb1, nb0, c_one, dpanel_local[d], dpanel_local_offset[d], nb, d_lAT(d,s,j_local2), lddat, queues[2*(d%ngpu)+1]); } d = (d+1)%tot_subs; } } /* if( nb0 > 0 ) */ /* clean up */ for (d=0; d < ngpu; d++) { magma_queue_sync( queues[2*d] ); magma_queue_sync( queues[2*d+1] ); } } return *info; /* End of MAGMA_CGETRF2_MSUB */ }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgetrf_batched */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cublas_perf=0., cublas_time=0., cpu_perf=0, cpu_time=0; float error; magma_int_t cublas_enable = 0; magmaFloatComplex *h_A, *h_R; magmaFloatComplex *dA_magma; magmaFloatComplex **dA_array = NULL; magma_int_t **dipiv_array = NULL; magma_int_t *ipiv, *cpu_info; magma_int_t *dipiv_magma, *dinfo_magma; magma_int_t M, N, n2, lda, ldda, min_mn, info; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t batchCount; magma_int_t status = 0; magma_opts opts( MagmaOptsBatched ); opts.parse_opts( argc, argv ); //opts.lapack |= opts.check; batchCount = opts.batchcount; magma_int_t columns; float tol = opts.tolerance * lapackf77_slamch("E"); printf("%% BatchCount M N CPU Gflop/s (ms) MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) ||PA-LU||/(||A||*N)\n"); printf("%%==========================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N * batchCount; ldda = magma_roundup( M, opts.align ); // multiple of 32 by default gflops = FLOPS_CGETRF( M, N ) / 1e9 * batchCount; TESTING_MALLOC_CPU( cpu_info, magma_int_t, batchCount ); TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn * batchCount ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_R, magmaFloatComplex, n2 ); TESTING_MALLOC_DEV( dA_magma, magmaFloatComplex, ldda*N * batchCount ); TESTING_MALLOC_DEV( dipiv_magma, magma_int_t, min_mn * batchCount ); TESTING_MALLOC_DEV( dinfo_magma, magma_int_t, batchCount ); TESTING_MALLOC_DEV( dA_array, magmaFloatComplex*, batchCount ); TESTING_MALLOC_DEV( dipiv_array, magma_int_t*, batchCount ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); // make A diagonally dominant, to not need pivoting for( int s=0; s < batchCount; ++s ) { for( int i=0; i < min_mn; ++i ) { h_A[ i + i*lda + s*lda*N ] = MAGMA_C_MAKE( MAGMA_C_REAL( h_A[ i + i*lda + s*lda*N ] ) + N, MAGMA_C_IMAG( h_A[ i + i*lda + s*lda*N ] )); } } columns = N * batchCount; lapackf77_clacpy( MagmaFullStr, &M, &columns, h_A, &lda, h_R, &lda ); magma_csetmatrix( M, columns, h_R, lda, dA_magma, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_cset_pointer( dA_array, dA_magma, ldda, 0, 0, ldda*N, batchCount, opts.queue ); magma_time = magma_sync_wtime( opts.queue ); info = magma_cgetrf_nopiv_batched( M, N, dA_array, ldda, dinfo_magma, batchCount, opts.queue); magma_time = magma_sync_wtime( opts.queue ) - magma_time; magma_perf = gflops / magma_time; // check correctness of results throught "dinfo_magma" and correctness of argument throught "info" magma_getvector( batchCount, sizeof(magma_int_t), dinfo_magma, 1, cpu_info, 1); for (int i=0; i < batchCount; i++) { if (cpu_info[i] != 0 ) { printf("magma_cgetrf_batched matrix %d returned internal error %d\n", i, (int)cpu_info[i] ); } } if (info != 0) { printf("magma_cgetrf_batched returned argument error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); for (int i=0; i < batchCount; i++) { lapackf77_cgetrf(&M, &N, h_A + i*lda*N, &lda, ipiv + i * min_mn, &info); assert( info == 0 ); } cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_cgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } } /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%10d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f)", (int) batchCount, (int) M, (int) N, cpu_perf, cpu_time*1000., magma_perf, magma_time*1000., cublas_perf*cublas_enable, cublas_time*1000.*cublas_enable ); } else { printf("%10d %5d %5d --- ( --- ) %7.2f (%7.2f) %7.2f (%7.2f)", (int) batchCount, (int) M, (int) N, magma_perf, magma_time*1000., cublas_perf*cublas_enable, cublas_time*1000.*cublas_enable ); } if ( opts.check ) { // initialize ipiv to 1, 2, 3, ... for (int i=0; i < batchCount; i++) { for (int k=0; k < min_mn; k++) { ipiv[i*min_mn+k] = k+1; } } magma_cgetmatrix( M, N*batchCount, dA_magma, ldda, h_A, lda ); error = 0; for (int i=0; i < batchCount; i++) { float err; err = get_LU_error( M, N, h_R + i * lda*N, lda, h_A + i * lda*N, ipiv + i * min_mn); if ( isnan(err) || isinf(err) ) { error = err; break; } error = max( err, error ); } bool okay = (error < tol); status += ! okay; printf(" %8.2e %s\n", error, (okay ? "ok" : "failed") ); } else { printf(" --- \n"); } TESTING_FREE_CPU( cpu_info ); TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( dA_magma ); TESTING_FREE_DEV( dinfo_magma ); TESTING_FREE_DEV( dipiv_magma ); TESTING_FREE_DEV( dipiv_array ); TESTING_FREE_DEV( dA_array ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_cgetrf_m(magma_int_t num_gpus0, magma_int_t m, magma_int_t n, magmaFloatComplex *a, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CGETRF_m computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix may not fit entirely in the GPU memory. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Note: The factorization of big panel is done calling multiple-gpu-interface. Pivots are applied on GPU within the big panel. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ #define A(i,j) (a + (j)*lda + (i)) #define inAT(d,i,j) (dAT[d] + (i)*nb*ldn_local + (j)*nb) #define inPT(d,i,j) (dPT[d] + (i)*nb*nb + (j)*nb*maxm) //#define PROFILE #ifdef PROFILE float flops, time_rmajor = 0, time_rmajor2 = 0, time_rmajor3 = 0, time_mem = 0; magma_timestr_t start, start1, start2, end1, end, start0 = get_current_time(); #endif magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *dAT[MagmaMaxGPUs], *dA[MagmaMaxGPUs], *dPT[MagmaMaxGPUs]; magma_int_t iinfo = 0, nb, nbi, maxm, n_local[MagmaMaxGPUs], ldn_local; magma_int_t N, M, NB, NBk, I, d, num_gpus; magma_int_t ii, jj, h, offset, ib, rows, s; magma_queue_t stream[MagmaMaxGPUs][2]; magma_event_t event[MagmaMaxGPUs][2]; *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* initialize nb */ nb = magma_get_cgetrf_nb(m); maxm = ((m + 31)/32)*32; /* figure out NB */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(magmaFloatComplex); /* number of columns in the big panel */ h = 1+(2+num_gpus0); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); char * ngr_nb_char = getenv("MAGMA_NGR_NB"); if( ngr_nb_char != NULL ) NB = max( nb, min( NB, atoi(ngr_nb_char) ) ); //NB = 5*max(nb,32); if( num_gpus0 > ceil((float)NB/nb) ) { num_gpus = (int)ceil((float)NB/nb); h = 1+(2+num_gpus); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); } else { num_gpus = num_gpus0; } if( num_gpus*NB >= n ) { #ifdef CHECK_CGETRF_OOC printf( " * still fit in GPU memory.\n" ); #endif NB = n; } else { #ifdef CHECK_CGETRF_OOC printf( " * don't fit in GPU memory.\n" ); #endif NB = num_gpus*NB; NB = max(nb,(NB / nb) * nb); /* making sure it's devisable by nb (x64) */ } #ifdef CHECK_CGETRF_OOC if( NB != n ) printf( " * running in out-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n",n,NB,nb,(float)freeMem ); else printf( " * running in in-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n",n,NB,nb,(float)freeMem ); #endif if ( (nb <= 1) || (nb >= min(m,n)) ) { /* Use CPU code for scalar of one tile. */ lapackf77_cgetrf(&m, &n, a, &lda, ipiv, info); } else { /* Use hybrid blocked code. */ /* allocate memory on GPU to store the big panel */ #ifdef PROFILE start = get_current_time(); #endif n_local[0] = (NB/nb)/num_gpus; if( NB%(nb*num_gpus) != 0 ) n_local[0] ++; n_local[0] *= nb; ldn_local = ((n_local[0]+31)/32)*32; for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_cmalloc( &dA[d], (ldn_local+h*nb)*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dPT[d] = dA[d] + nb*maxm; /* for storing the previous panel from CPU */ dAT[d] = dA[d] + h*nb*maxm; /* for storing the big panel */ magma_queue_create( &stream[d][0] ); magma_queue_create( &stream[d][1] ); magma_event_create( &event[d][0] ); magma_event_create( &event[d][1] ); } //magma_setdevice(0); #ifdef PROFILE end = get_current_time(); printf( " memory-allocation time: %e\n",GetTimerValue(start, end)/1000.0 ); start = get_current_time(); #endif for( I=0; I<n; I+=NB ) { M = m; N = min( NB, n-I ); /* number of columns in this big panel */ s = min(max(m-I,0),N)/nb; /* number of small block-columns in this big panel */ maxm = ((M + 31)/32)*32; if( num_gpus0 > ceil((float)N/nb) ) { num_gpus = (int)ceil((float)N/nb); } else { num_gpus = num_gpus0; } for( d=0; d<num_gpus; d++ ) { n_local[d] = ((N/nb)/num_gpus)*nb; if (d < (N/nb)%num_gpus) n_local[d] += nb; else if (d == (N/nb)%num_gpus) n_local[d] += N%nb; } ldn_local = ((n_local[0]+31)/32)*32; #ifdef PROFILE start2 = get_current_time(); #endif /* upload the next big panel into GPU, transpose (A->A'), and pivot it */ magmablas_csetmatrix_transpose_mgpu(num_gpus, stream, A(0,I), lda, dAT, ldn_local, dA, maxm, M, N, nb); for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } #ifdef PROFILE start1 = get_current_time(); #endif /* == --------------------------------------------------------------- == */ /* == loop around the previous big-panels to update the new big-panel == */ for( offset = 0; offset<min(m,I); offset+=NB ) { NBk = min( m-offset, NB ); /* start sending the first tile from the previous big-panels to gpus */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); nbi = min( nb, NBk ); magma_csetmatrix_async( (M-offset), nbi, A(offset,offset), lda, dA[d], (maxm-offset), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][0] ); /* transpose */ magmablas_ctranspose2( inPT(d,0,0), nb, dA[d], maxm-offset, M-offset, nbi); } /* applying the pivot from the previous big-panel */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][1]); magmablas_cpermute_long3( inAT(d,0,0), ldn_local, ipiv, NBk, offset ); } /* == going through each block-column of previous big-panels == */ for( jj=0, ib=offset/nb; jj<NBk; jj+=nb, ib++ ) { ii = offset+jj; rows = maxm - ii; nbi = min( nb, NBk-jj ); for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); /* wait for a block-column on GPU */ magma_queue_sync( stream[d][0] ); /* start sending next column */ if( jj+nb < NBk ) { magma_csetmatrix_async( (M-ii-nb), min(nb,NBk-jj-nb), A(ii+nb,ii+nb), lda, dA[d], (rows-nb), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][(1+jj/nb)%2] ); /* transpose next column */ magmablas_ctranspose2( inPT(d,0,(1+jj/nb)%2), nb, dA[d], rows-nb, M-ii-nb, nb); } /* update with the block column */ magmablasSetKernelStream(stream[d][1]); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n_local[d], nbi, c_one, inPT(d,0,(jj/nb)%2), nb, inAT(d,ib,0), ldn_local ); if( M > ii+nb ) { magma_cgemm( MagmaNoTrans, MagmaNoTrans, n_local[d], M-(ii+nb), nbi, c_neg_one, inAT(d,ib,0), ldn_local, inPT(d,1,(jj/nb)%2), nb, c_one, inAT(d,ib+1,0), ldn_local ); } magma_event_record( event[d][(jj/nb)%2], stream[d][1] ); } /* end of for each block-columns in a big-panel */ } } /* end of for each previous big-panels */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } /* calling magma-gpu interface to panel-factorize the big panel */ if( M > I ) { //magma_cgetrf1_mgpu(num_gpus, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, &a[I*lda], lda, // (magma_queue_t **)stream, &iinfo); magma_cgetrf2_mgpu(num_gpus, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, A(0,I), lda, stream, &iinfo); if( iinfo < 0 ) { *info = iinfo; break; } else if( iinfo != 0 ) { *info = iinfo + I * NB; //break; } /* adjust pivots */ for( ii=I; ii<min(I+N,m); ii++ ) ipiv[ii] += I; } #ifdef PROFILE end1 = get_current_time(); time_rmajor += GetTimerValue(start1, end1); time_rmajor3 += GetTimerValue(start2, end1); time_mem += (GetTimerValue(start2, end1)-GetTimerValue(start1, end1))/1000.0; #endif /* download the current big panel to CPU */ magmablas_cgetmatrix_transpose_mgpu(num_gpus, stream, dAT, ldn_local, A(0,I), lda, dA, maxm, M, N, nb); for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } #ifdef PROFILE end1 = get_current_time(); time_rmajor2 += GetTimerValue(start1, end1); #endif } /* end of for */ #ifdef PROFILE end = get_current_time(); flops = FLOPS_CGETRF( m, n ) / 1000000; printf(" NB=%d nb=%d\n",NB,nb); printf(" memcopy and transpose %e seconds\n",time_mem ); printf(" total time %e seconds\n",GetTimerValue(start0,end)/1000.0); printf(" Performance %f GFlop/s, %f seconds without htod and dtoh\n", flops / time_rmajor, time_rmajor /1000.0); printf(" Performance %f GFlop/s, %f seconds with htod\n", flops / time_rmajor3, time_rmajor3/1000.0); printf(" Performance %f GFlop/s, %f seconds with dtoh\n", flops / time_rmajor2, time_rmajor2/1000.0); printf(" Performance %f GFlop/s, %f seconds without memory-allocation\n", flops / GetTimerValue(start, end), GetTimerValue(start,end)/1000.0); #endif for( d=0; d<num_gpus0; d++ ) { magma_setdevice(d); magma_free( dA[d] ); magma_event_destroy( event[d][0] ); magma_event_destroy( event[d][1] ); magma_queue_destroy( stream[d][0] ); magma_queue_destroy( stream[d][1] ); magmablasSetKernelStream(NULL); } magma_setdevice(0); } if( *info >= 0 ) magma_cgetrf_piv(m, n, NB, a, lda, ipiv, info); return *info; } /* magma_cgetrf_m */
/** Purpose ------- CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] d_lA COMPLEX array of pointers on the GPU, dimension (ngpu). On entry, the M-by-N matrix A distributed over GPUs (d_lA[d] points to the local matrix on d-th GPU). It uses 1D block column cyclic format with the block size of nb, and each local matrix is stored by column. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. @param[in] ldda INTEGER The leading dimension of the array d_lA. LDDA >= max(1,M). @param[out] ipiv INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_cgesv_comp ********************************************************************/ extern "C" magma_int_t magma_cgetrf_mgpu( magma_int_t ngpu, magma_int_t m, magma_int_t n, magmaFloatComplex_ptr d_lA[], magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info) { magma_int_t nb, n_local[MagmaMaxGPUs]; magma_int_t maxm; magma_int_t i, j, d, lddat, lddwork; magmaFloatComplex *d_lAT[MagmaMaxGPUs]; magmaFloatComplex *d_panel[MagmaMaxGPUs], *work; magma_queue_t queues[MagmaMaxGPUs][2]; /* Check arguments */ *info = 0; if (m < 0) *info = -2; else if (n < 0) *info = -3; else if (ldda < max(1,m)) *info = -5; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* create the queues */ for( d=0; d < ngpu; d++ ) { magma_queue_create( d, &queues[d][0] ); magma_queue_create( d, &queues[d][1] ); } /* Function Body */ nb = magma_get_cgetrf_nb( m, n ); if (nb <= 1 || nb >= n) { /* Use CPU code. */ magma_cmalloc_cpu( &work, m * n ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_cgetmatrix( m, n, d_lA[0], ldda, work, m, queues[0][0] ); lapackf77_cgetrf(&m, &n, work, &m, ipiv, info); magma_csetmatrix( m, n, work, m, d_lA[0], ldda, queues[0][0] ); magma_free_cpu(work); } else { /* Use hybrid blocked code. */ magma_device_t orig_dev; magma_getdevice( &orig_dev ); maxm = magma_roundup( m, 32 ); if ( ngpu > ceil((float)n/nb) ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); *info = -1; return *info; } /* allocate workspace for each GPU */ lddat = magma_roundup( ((magma_ceildiv( n, nb )/ngpu)*nb), 32 ); lddat = magma_ceildiv( n, nb ); /* number of block columns */ lddat = magma_ceildiv( lddat, ngpu ); /* number of block columns per GPU */ lddat = nb*lddat; /* number of columns per GPU */ lddat = magma_roundup( lddat, 32 ); /* make it a multiple of 32 */ for (i=0; i < ngpu; i++) { magma_setdevice(i); /* local-n and local-ld */ n_local[i] = ((n/nb)/ngpu)*nb; if (i < (n/nb)%ngpu) n_local[i] += nb; else if (i == (n/nb)%ngpu) n_local[i] += n%nb; /* workspaces */ if (MAGMA_SUCCESS != magma_cmalloc( &d_panel[i], (3+ngpu)*nb*maxm )) { for( j=0; j <= i; j++ ) { magma_setdevice(j); } for( j=0; j < i; j++ ) { magma_setdevice(j); magma_free( d_panel[j] ); magma_free( d_lAT[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* local-matrix storage */ if (MAGMA_SUCCESS != magma_cmalloc( &d_lAT[i], lddat*maxm )) { for( j=0; j <= i; j++ ) { magma_setdevice(j); magma_free( d_panel[j] ); } for( j=0; j < i; j++ ) { magma_setdevice(j); magma_free( d_lAT[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magmablas_ctranspose( m, n_local[i], d_lA[i], ldda, d_lAT[i], lddat, queues[i][1] ); } for (i=0; i < ngpu; i++) { magma_setdevice(i); magma_queue_sync(queues[i][0]); } magma_setdevice(0); /* cpu workspace */ lddwork = maxm; if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, lddwork*nb*ngpu )) { for (i=0; i < ngpu; i++ ) { magma_setdevice(i); magma_free( d_panel[i] ); magma_free( d_lAT[i] ); } *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* calling multi-gpu interface with allocated workspaces and queues */ magma_cgetrf2_mgpu(ngpu, m, n, nb, 0, d_lAT, lddat, ipiv, d_panel, work, maxm, queues, info); /* clean up */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); /* save on output */ magmablas_ctranspose( n_local[d], m, d_lAT[d], lddat, d_lA[d], ldda, queues[d][0] ); magma_queue_sync(queues[d][0]); magma_queue_sync(queues[d][1]); magma_free( d_lAT[d] ); magma_free( d_panel[d] ); } /* end of for d=1,..,ngpu */ magma_setdevice( orig_dev ); magma_free_pinned( work ); } /* clean up */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_destroy( queues[d][0] ); magma_queue_destroy( queues[d][1] ); } return *info; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgetrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float error; magmaFloatComplex *h_A; magma_int_t *ipiv; magma_int_t M, N, n2, lda, info, min_mn; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("%% ngpu %d, version %d\n", (int) opts.ngpu, (int) opts.version ); if ( opts.check == 2 ) { printf("%% M N CPU Gflop/s (sec) GPU Gflop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); } else { printf("%% M N CPU Gflop/s (sec) GPU Gflop/s (sec) |PA-LU|/(N*|A|)\n"); } printf("%%========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; gflops = FLOPS_CGETRF( M, N ) / 1e9; TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_PIN( h_A, magmaFloatComplex, n2 ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { init_matrix( opts, M, N, h_A, lda ); cpu_time = magma_wtime(); lapackf77_cgetrf( &M, &N, h_A, &lda, ipiv, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_cgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( opts, M, N, h_A, lda ); if ( opts.version == 2 || opts.version == 3 ) { // no pivoting versions, so set ipiv to identity for (magma_int_t i=0; i < min_mn; ++i ) { ipiv[i] = i+1; } } gpu_time = magma_wtime(); if ( opts.version == 1 ) { magma_cgetrf( M, N, h_A, lda, ipiv, &info ); } else if ( opts.version == 2 ) { magma_cgetrf_nopiv( M, N, h_A, lda, &info ); } else if ( opts.version == 3 ) { magma_cgetf2_nopiv( M, N, h_A, lda, &info ); } gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_cgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 2 ) { error = get_residual( opts, M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else if ( opts.check ) { error = get_LU_error( opts, M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { printf(" --- \n"); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_PIN( h_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing ctrsm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf=0, cpu_time=0; float magma_error, cublas_error, work[1]; magma_int_t M, N, info; magma_int_t Ak; magma_int_t sizeA, sizeB; magma_int_t lda, ldb, ldda, lddb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t *piv; magma_err_t err; magmaFloatComplex *h_A, *h_B, *h_Bcublas, *h_Bmagma, *h_B1, *h_X1, *h_X2, *LU, *LUT; magmaFloatComplex *d_A, *d_B; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex alpha = MAGMA_C_MAKE( 0.29, -0.86 ); magma_opts opts; parse_opts( argc, argv, &opts ); printf("If running lapack (option --lapack), MAGMA and CUBLAS error are both computed\n" "relative to CPU BLAS result. Else, MAGMA error is computed relative to CUBLAS result.\n\n" "side = %c, uplo = %c, transA = %c, diag = %c \n", opts.side, opts.uplo, opts.transA, opts.diag ); printf(" M N MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA error CUBLAS error\n"); printf("==================================================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; gflops = FLOPS_CTRSM(opts.side, M, N) / 1e9; if ( opts.side == MagmaLeft ) { lda = M; Ak = M; } else { lda = N; Ak = N; } ldb = M; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; sizeA = lda*Ak; sizeB = ldb*N; TESTING_MALLOC( h_A, magmaFloatComplex, lda*Ak ); TESTING_MALLOC( LU, magmaFloatComplex, lda*Ak ); TESTING_MALLOC( LUT, magmaFloatComplex, lda*Ak ); TESTING_MALLOC( h_B, magmaFloatComplex, ldb*N ); TESTING_MALLOC( h_B1, magmaFloatComplex, ldb*N ); TESTING_MALLOC( h_X1, magmaFloatComplex, ldb*N ); TESTING_MALLOC( h_X2, magmaFloatComplex, ldb*N ); TESTING_MALLOC( h_Bcublas, magmaFloatComplex, ldb*N ); TESTING_MALLOC( h_Bmagma, magmaFloatComplex, ldb*N ); TESTING_DEVALLOC( d_A, magmaFloatComplex, ldda*Ak ); TESTING_DEVALLOC( d_B, magmaFloatComplex, lddb*N ); /* Initialize the matrices */ lapackf77_clarnv( &ione, ISEED, &sizeA, LU ); err = magma_malloc_cpu( (void**) &piv, Ak*sizeof(magma_int_t) ); assert( err == 0 ); lapackf77_cgetrf( &Ak, &Ak, LU, &lda, piv, &info ); int i, j; for(i=0;i<Ak;i++){ for(j=0;j<Ak;j++){ LUT[j+i*lda] = LU[i+j*lda]; } } lapackf77_clacpy(MagmaUpperStr, &Ak, &Ak, LUT, &lda, LU, &lda); if(opts.uplo == MagmaLower){ lapackf77_clacpy(MagmaLowerStr, &Ak, &Ak, LU, &lda, h_A, &lda); }else{ lapackf77_clacpy(MagmaUpperStr, &Ak, &Ak, LU, &lda, h_A, &lda); } lapackf77_clarnv( &ione, ISEED, &sizeB, h_B ); memcpy(h_B1, h_B, sizeB*sizeof(magmaFloatComplex)); /* ===================================================================== Performs operation using MAGMA-BLAS =================================================================== */ magma_csetmatrix( Ak, Ak, h_A, lda, d_A, ldda ); magma_csetmatrix( M, N, h_B, ldb, d_B, lddb ); magma_time = magma_sync_wtime( NULL ); magmablas_ctrsm( opts.side, opts.uplo, opts.transA, opts.diag, M, N, alpha, d_A, ldda, d_B, lddb ); magma_time = magma_sync_wtime( NULL ) - magma_time; magma_perf = gflops / magma_time; magma_cgetmatrix( M, N, d_B, lddb, h_Bmagma, ldb ); /* ===================================================================== Performs operation using CUDA-BLAS =================================================================== */ magma_csetmatrix( M, N, h_B, ldb, d_B, lddb ); cublas_time = magma_sync_wtime( NULL ); cublasCtrsm( opts.side, opts.uplo, opts.transA, opts.diag, M, N, alpha, d_A, ldda, d_B, lddb ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_cgetmatrix( M, N, d_B, lddb, h_Bcublas, ldb ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_ctrsm( &opts.side, &opts.uplo, &opts.transA, &opts.diag, &M, &N, &alpha, h_A, &lda, h_B, &ldb ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ // ||b - Ax|| / (||A||*||x||) memcpy(h_X1, h_Bmagma, sizeB*sizeof(magmaFloatComplex)); magmaFloatComplex alpha2 = MAGMA_C_DIV( c_one, alpha ); blasf77_ctrmm( &opts.side, &opts.uplo, &opts.transA, &opts.diag, &M, &N, &alpha2, h_A, &lda, h_X1, &ldb ); blasf77_caxpy( &sizeB, &c_neg_one, h_B1, &ione, h_X1, &ione ); float norm1 = lapackf77_clange( "M", &M, &N, h_X1, &ldb, work ); float normx = lapackf77_clange( "M", &M, &N, h_Bmagma, &ldb, work ); float normA = lapackf77_clange( "M", &Ak, &Ak, h_A, &lda, work ); magma_error = norm1/(normx*normA); memcpy(h_X2, h_Bcublas, sizeB*sizeof(magmaFloatComplex)); blasf77_ctrmm( &opts.side, &opts.uplo, &opts.transA, &opts.diag, &M, &N, &alpha2, h_A, &lda, h_X2, &ldb ); blasf77_caxpy( &sizeB, &c_neg_one, h_B1, &ione, h_X2, &ione ); norm1 = lapackf77_clange( "M", &M, &N, h_X2, &ldb, work ); normx = lapackf77_clange( "M", &M, &N, h_Bcublas, &ldb, work ); normA = lapackf77_clange( "M", &Ak, &Ak, h_A, &lda, work ); cublas_error = norm1/(normx*normA); if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error ); } else { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) --- ( --- ) %8.2e %8.2e\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, magma_error, cublas_error ); } TESTING_FREE( h_A ); TESTING_FREE( LU ); TESTING_FREE( LUT ); TESTING_FREE( h_B ); TESTING_FREE( h_Bcublas ); TESTING_FREE( h_Bmagma ); TESTING_FREE( h_B1 ); TESTING_FREE( h_X1 ); TESTING_FREE( h_X2 ); TESTING_DEVFREE( d_A ); TESTING_DEVFREE( d_B ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
/** Purpose ------- CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA COMPLEX array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,M). @param[out] ipiv INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_cgesv_comp ********************************************************************/ extern "C" magma_int_t magma_cgetrf_gpu( magma_int_t m, magma_int_t n, magmaFloatComplex_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info ) { #ifdef HAVE_clBLAS #define dA(i_, j_) dA, (dA_offset + (i_) + (j_)*ldda) #define dAT(i_, j_) dAT, (dAT_offset + (i_)*lddat + (j_)) #define dAP(i_, j_) dAP, ( (i_) + (j_)*maxm) #else #define dA(i_, j_) (dA + (i_) + (j_)*ldda) #define dAT(i_, j_) (dAT + (i_)*lddat + (j_)) #define dAP(i_, j_) (dAP + (i_) + (j_)*maxm) #endif magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t iinfo, nb; magma_int_t maxm, maxn, minmn; magma_int_t i, j, jb, rows, lddat, ldwork; magmaFloatComplex_ptr dAT=NULL, dAP=NULL; magmaFloatComplex *work=NULL; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ minmn = min( m, n ); nb = magma_get_cgetrf_nb( m, n ); magma_queue_t queues[2] = { NULL }; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[0] ); magma_queue_create( cdev, &queues[1] ); if (nb <= 1 || nb >= min(m,n)) { /* Use CPU code. */ if ( MAGMA_SUCCESS != magma_cmalloc_cpu( &work, m*n )) { *info = MAGMA_ERR_HOST_ALLOC; goto cleanup; } magma_cgetmatrix( m, n, dA(0,0), ldda, work, m, queues[0] ); lapackf77_cgetrf( &m, &n, work, &m, ipiv, info ); magma_csetmatrix( m, n, work, m, dA(0,0), ldda, queues[0] ); magma_free_cpu( work ); work=NULL; } else { /* Use hybrid blocked code. */ maxm = magma_roundup( m, 32 ); maxn = magma_roundup( n, 32 ); if (MAGMA_SUCCESS != magma_cmalloc( &dAP, nb*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto cleanup; } // square matrices can be done in place; // rectangular requires copy to transpose if ( m == n ) { dAT = dA; lddat = ldda; magmablas_ctranspose_inplace( m, dAT(0,0), lddat, queues[0] ); } else { lddat = maxn; // N-by-M if (MAGMA_SUCCESS != magma_cmalloc( &dAT, lddat*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto cleanup; } magmablas_ctranspose( m, n, dA(0,0), ldda, dAT(0,0), lddat, queues[0] ); } magma_queue_sync( queues[0] ); // finish transpose ldwork = maxm; if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, ldwork*nb )) { *info = MAGMA_ERR_HOST_ALLOC; goto cleanup; } for( j=0; j < minmn-nb; j += nb ) { // get j-th panel from device magmablas_ctranspose( nb, m-j, dAT(j,j), lddat, dAP(0,0), maxm, queues[1] ); magma_queue_sync( queues[1] ); // wait for transpose magma_cgetmatrix_async( m-j, nb, dAP(0,0), maxm, work, ldwork, queues[0] ); if ( j > 0 ) { magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-(j+nb), nb, c_one, dAT(j-nb, j-nb), lddat, dAT(j-nb, j+nb), lddat, queues[1] ); magma_cgemm( MagmaNoTrans, MagmaNoTrans, n-(j+nb), m-j, nb, c_neg_one, dAT(j-nb, j+nb), lddat, dAT(j, j-nb), lddat, c_one, dAT(j, j+nb), lddat, queues[1] ); } // do the cpu part rows = m - j; magma_queue_sync( queues[0] ); // wait to get work lapackf77_cgetrf( &rows, &nb, work, &ldwork, ipiv+j, &iinfo ); if ( *info == 0 && iinfo > 0 ) *info = iinfo + j; // send j-th panel to device magma_csetmatrix_async( m-j, nb, work, ldwork, dAP, maxm, queues[0] ); for( i=j; i < j + nb; ++i ) { ipiv[i] += j; } magmablas_claswp( n, dAT(0,0), lddat, j + 1, j + nb, ipiv, 1, queues[1] ); magma_queue_sync( queues[0] ); // wait to set dAP magmablas_ctranspose( m-j, nb, dAP(0,0), maxm, dAT(j,j), lddat, queues[1] ); // do the small non-parallel computations (next panel update) if ( j + nb < minmn - nb ) { magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb, nb, c_one, dAT(j, j ), lddat, dAT(j, j+nb), lddat, queues[1] ); magma_cgemm( MagmaNoTrans, MagmaNoTrans, nb, m-(j+nb), nb, c_neg_one, dAT(j, j+nb), lddat, dAT(j+nb, j ), lddat, c_one, dAT(j+nb, j+nb), lddat, queues[1] ); } else { magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-(j+nb), nb, c_one, dAT(j, j ), lddat, dAT(j, j+nb), lddat, queues[1] ); magma_cgemm( MagmaNoTrans, MagmaNoTrans, n-(j+nb), m-(j+nb), nb, c_neg_one, dAT(j, j+nb), lddat, dAT(j+nb, j ), lddat, c_one, dAT(j+nb, j+nb), lddat, queues[1] ); } } jb = min( m-j, n-j ); if ( jb > 0 ) { rows = m - j; magmablas_ctranspose( jb, rows, dAT(j,j), lddat, dAP(0,0), maxm, queues[1] ); magma_cgetmatrix( rows, jb, dAP(0,0), maxm, work, ldwork, queues[1] ); // do the cpu part lapackf77_cgetrf( &rows, &jb, work, &ldwork, ipiv+j, &iinfo ); if ( *info == 0 && iinfo > 0 ) *info = iinfo + j; for( i=j; i < j + jb; ++i ) { ipiv[i] += j; } magmablas_claswp( n, dAT(0,0), lddat, j + 1, j + jb, ipiv, 1, queues[1] ); // send j-th panel to device magma_csetmatrix( rows, jb, work, ldwork, dAP(0,0), maxm, queues[1] ); magmablas_ctranspose( rows, jb, dAP(0,0), maxm, dAT(j,j), lddat, queues[1] ); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-j-jb, jb, c_one, dAT(j,j), lddat, dAT(j,j+jb), lddat, queues[1] ); } // undo transpose if ( m == n ) { magmablas_ctranspose_inplace( m, dAT(0,0), lddat, queues[1] ); } else { magmablas_ctranspose( n, m, dAT(0,0), lddat, dA(0,0), ldda, queues[1] ); } } cleanup: magma_queue_destroy( queues[0] ); magma_queue_destroy( queues[1] ); magma_free( dAP ); if (m != n) { magma_free( dAT ); } magma_free_pinned( work ); return *info; } /* magma_cgetrf_gpu */
extern "C" magma_int_t magma_cgetrf_gpu(magma_int_t m, magma_int_t n, magmaFloatComplex *dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. If the current stream is NULL, this version replaces it with user defined stream to overlap computation with communication. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. LDDA (input) INTEGER The leading dimension of the array A. LDDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ #define dAT(i,j) (dAT + (i)*nb*lddat + (j)*nb) magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t iinfo, nb; magma_int_t maxm, maxn, mindim; magma_int_t i, rows, cols, s, lddat, lddwork; magmaFloatComplex *dAT, *dAP, *work; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); nb = magma_get_cgetrf_nb(m); s = mindim / nb; if (nb <= 1 || nb >= min(m,n)) { /* Use CPU code. */ magma_cmalloc_cpu( &work, m * n ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_cgetmatrix( m, n, dA, ldda, work, m ); lapackf77_cgetrf(&m, &n, work, &m, ipiv, info); magma_csetmatrix( m, n, work, m, dA, ldda ); magma_free_cpu(work); } else { /* Use hybrid blocked code. */ maxm = ((m + 31)/32)*32; maxn = ((n + 31)/32)*32; lddat = maxn; lddwork = maxm; dAT = dA; if (MAGMA_SUCCESS != magma_cmalloc( &dAP, nb*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } if ( m == n ) { lddat = ldda; magmablas_ctranspose_inplace( m, dAT, ldda ); } else { if (MAGMA_SUCCESS != magma_cmalloc( &dAT, maxm*maxn )) { magma_free( dAP ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magmablas_ctranspose2( dAT, lddat, dA, ldda, m, n ); } if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, maxm*nb )) { magma_free( dAP ); if ( ! (m == n)) magma_free( dAT ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* Define user stream if current stream is NULL */ cudaStream_t stream[2], current_stream; magmablasGetKernelStream(¤t_stream); magma_queue_create( &stream[0] ); if (current_stream == NULL) { magma_queue_create( &stream[1] ); magmablasSetKernelStream(stream[1]); } else stream[1] = current_stream; for( i=0; i<s; i++ ) { // download i-th panel cols = maxm - i*nb; //magmablas_ctranspose( dAP, cols, dAT(i,i), lddat, nb, cols ); magmablas_ctranspose2( dAP, cols, dAT(i,i), lddat, nb, m-i*nb ); // make sure that that the transpose has completed magma_queue_sync( stream[1] ); magma_cgetmatrix_async( m-i*nb, nb, dAP, cols, work, lddwork, stream[0]); if ( i>0 ){ magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n - (i+1)*nb, nb, c_one, dAT(i-1,i-1), lddat, dAT(i-1,i+1), lddat ); magma_cgemm( MagmaNoTrans, MagmaNoTrans, n-(i+1)*nb, m-i*nb, nb, c_neg_one, dAT(i-1,i+1), lddat, dAT(i, i-1), lddat, c_one, dAT(i, i+1), lddat ); } // do the cpu part rows = m - i*nb; magma_queue_sync( stream[0] ); lapackf77_cgetrf( &rows, &nb, work, &lddwork, ipiv+i*nb, &iinfo); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + i*nb; // upload i-th panel magma_csetmatrix_async( m-i*nb, nb, work, lddwork, dAP, maxm, stream[0]); magmablas_cpermute_long2( n, dAT, lddat, ipiv, nb, i*nb ); magma_queue_sync( stream[0] ); //magmablas_ctranspose(dAT(i,i), lddat, dAP, maxm, cols, nb); magmablas_ctranspose2(dAT(i,i), lddat, dAP, maxm, m-i*nb, nb); // do the small non-parallel computations (next panel update) if ( s > (i+1) ) { magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb, nb, c_one, dAT(i, i ), lddat, dAT(i, i+1), lddat); magma_cgemm( MagmaNoTrans, MagmaNoTrans, nb, m-(i+1)*nb, nb, c_neg_one, dAT(i, i+1), lddat, dAT(i+1, i ), lddat, c_one, dAT(i+1, i+1), lddat ); } else { magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb, nb, c_one, dAT(i, i ), lddat, dAT(i, i+1), lddat); magma_cgemm( MagmaNoTrans, MagmaNoTrans, n-(i+1)*nb, m-(i+1)*nb, nb, c_neg_one, dAT(i, i+1), lddat, dAT(i+1, i ), lddat, c_one, dAT(i+1, i+1), lddat ); } } magma_int_t nb0 = min(m - s*nb, n - s*nb); rows = m - s*nb; cols = maxm - s*nb; magmablas_ctranspose2( dAP, maxm, dAT(s,s), lddat, nb0, rows); magma_cgetmatrix( rows, nb0, dAP, maxm, work, lddwork ); // do the cpu part lapackf77_cgetrf( &rows, &nb0, work, &lddwork, ipiv+s*nb, &iinfo); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + s*nb; magmablas_cpermute_long2( n, dAT, lddat, ipiv, nb0, s*nb ); // upload i-th panel magma_csetmatrix( rows, nb0, work, lddwork, dAP, maxm ); magmablas_ctranspose2( dAT(s,s), lddat, dAP, maxm, rows, nb0); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb-nb0, nb0, c_one, dAT(s,s), lddat, dAT(s,s)+nb0, lddat); if ( m == n ) { magmablas_ctranspose_inplace( m, dAT, lddat ); } else { magmablas_ctranspose2( dA, ldda, dAT, lddat, n, m ); magma_free( dAT ); } magma_free( dAP ); magma_free_pinned( work ); magma_queue_destroy( stream[0] ); if (current_stream == NULL) { magma_queue_destroy( stream[1] ); magmablasSetKernelStream(NULL); } } return *info; } /* End of MAGMA_CGETRF_GPU */
extern "C" magma_int_t magma_cgetrf(magma_int_t m, magma_int_t n, cuFloatComplex *a, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ #define inAT(i,j) (dAT + (i)*nb*ldda + (j)*nb) cuFloatComplex *dAT, *dA, *da, *work; cuFloatComplex c_one = MAGMA_C_ONE; cuFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t iinfo, nb; *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; nb = magma_get_cgetrf_nb(m); if ( (nb <= 1) || (nb >= min(m,n)) ) { /* Use CPU code. */ lapackf77_cgetrf(&m, &n, a, &lda, ipiv, info); } else { /* Use hybrid blocked code. */ magma_int_t maxm, maxn, ldda, maxdim; magma_int_t i, rows, cols, s = min(m, n)/nb; magma_int_t num_gpus = magma_num_gpus(); if ( num_gpus > 1 ) { /* call multi-GPU non-GPU-resident interface */ magma_int_t rval = magma_cgetrf_m(num_gpus, m, n, a, lda, ipiv, info); if( *info >= 0 ) magma_cgetrf_piv(num_gpus, m, n, a, lda, ipiv, info); return *info; } maxm = ((m + 31)/32)*32; maxn = ((n + 31)/32)*32; maxdim = max(maxm, maxn); ldda = maxn; work = a; if (maxdim*maxdim < 2*maxm*maxn) { if (MAGMA_SUCCESS != magma_cmalloc( &dA, nb*maxm + maxdim*maxdim )) { /* alloc failed so call non-GPU-resident version */ magma_int_t rval = magma_cgetrf_m(num_gpus, m, n, a, lda, ipiv, info); if( *info >= 0 ) magma_cgetrf_piv(num_gpus, m, n, a, lda, ipiv, info); return *info; } da = dA + nb*maxm; ldda = maxdim; magma_csetmatrix( m, n, a, lda, da, ldda ); dAT = da; magmablas_cinplace_transpose( dAT, ldda, ldda ); } else { if (MAGMA_SUCCESS != magma_cmalloc( &dA, (nb + maxn)*maxm )) { /* alloc failed so call non-GPU-resident version */ magma_int_t rval = magma_cgetrf_m(num_gpus, m, n, a, lda, ipiv, info); if( *info >= 0 ) magma_cgetrf_piv(num_gpus, m, n, a, lda, ipiv, info); return *info; } da = dA + nb*maxm; magma_csetmatrix( m, n, a, lda, da, maxm ); if (MAGMA_SUCCESS != magma_cmalloc( &dAT, maxm*maxn )) { /* alloc failed so call non-GPU-resident version */ magma_free( dA ); magma_int_t rval = magma_cgetrf_m(num_gpus, m, n, a, lda, ipiv, info); if( *info >= 0 ) magma_cgetrf_piv(num_gpus, m, n, a, lda, ipiv, info); return *info; } magmablas_ctranspose2( dAT, ldda, da, maxm, m, n ); } lapackf77_cgetrf( &m, &nb, work, &lda, ipiv, &iinfo); for( i = 0; i < s; i++ ) { // download i-th panel cols = maxm - i*nb; if (i>0){ magmablas_ctranspose( dA, cols, inAT(i,i), ldda, nb, cols ); magma_cgetmatrix( m-i*nb, nb, dA, cols, work, lda ); // make sure that gpu queue is empty magma_device_sync(); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n - (i+1)*nb, nb, c_one, inAT(i-1,i-1), ldda, inAT(i-1,i+1), ldda ); magma_cgemm( MagmaNoTrans, MagmaNoTrans, n-(i+1)*nb, m-i*nb, nb, c_neg_one, inAT(i-1,i+1), ldda, inAT(i, i-1), ldda, c_one, inAT(i, i+1), ldda ); // do the cpu part rows = m - i*nb; lapackf77_cgetrf( &rows, &nb, work, &lda, ipiv+i*nb, &iinfo); } if (*info == 0 && iinfo > 0) *info = iinfo + i*nb; magmablas_cpermute_long2( ldda, dAT, ldda, ipiv, nb, i*nb ); // upload i-th panel magma_csetmatrix( m-i*nb, nb, work, lda, dA, cols ); magmablas_ctranspose( inAT(i,i), ldda, dA, cols, cols, nb); // do the small non-parallel computations if (s > (i+1)){ magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb, nb, c_one, inAT(i, i ), ldda, inAT(i, i+1), ldda); magma_cgemm( MagmaNoTrans, MagmaNoTrans, nb, m-(i+1)*nb, nb, c_neg_one, inAT(i, i+1), ldda, inAT(i+1, i ), ldda, c_one, inAT(i+1, i+1), ldda ); } else{ magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb, nb, c_one, inAT(i, i ), ldda, inAT(i, i+1), ldda); magma_cgemm( MagmaNoTrans, MagmaNoTrans, n-(i+1)*nb, m-(i+1)*nb, nb, c_neg_one, inAT(i, i+1), ldda, inAT(i+1, i ), ldda, c_one, inAT(i+1, i+1), ldda ); } } magma_int_t nb0 = min(m - s*nb, n - s*nb); if ( nb0 > 0 ) { rows = m - s*nb; cols = maxm - s*nb; magmablas_ctranspose2( dA, cols, inAT(s,s), ldda, nb0, rows); magma_cgetmatrix( rows, nb0, dA, cols, work, lda ); // make sure that gpu queue is empty magma_device_sync(); // do the cpu part lapackf77_cgetrf( &rows, &nb0, work, &lda, ipiv+s*nb, &iinfo); if (*info == 0 && iinfo > 0) *info = iinfo + s*nb; magmablas_cpermute_long2( ldda, dAT, ldda, ipiv, nb0, s*nb ); magma_csetmatrix( rows, nb0, work, lda, dA, cols ); magmablas_ctranspose2( inAT(s,s), ldda, dA, cols, rows, nb0); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb-nb0, nb0, c_one, inAT(s, s), ldda, inAT(s, s)+nb0, ldda); } if (maxdim*maxdim< 2*maxm*maxn){ magmablas_cinplace_transpose( dAT, ldda, ldda ); magma_cgetmatrix( m, n, da, ldda, a, lda ); } else { magmablas_ctranspose2( da, maxm, dAT, ldda, n, m ); magma_cgetmatrix( m, n, da, maxm, a, lda ); magma_free( dAT ); } magma_free( dA ); } return *info; } /* magma_cgetrf */
/** Purpose ------- CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. If the current stream is NULL, this version replaces it with user defined stream to overlap computation with communication. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] ipiv INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_cgesv_comp ********************************************************************/ extern "C" magma_int_t magma_cgetrf(magma_int_t m, magma_int_t n, magmaFloatComplex *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info) { #define dAT(i,j) (dAT + (i)*nb*ldda + (j)*nb) magmaFloatComplex *dAT, *dA, *da, *work; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t iinfo, nb; *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; nb = magma_get_cgetrf_nb(m); if ( (nb <= 1) || (nb >= min(m,n)) ) { /* Use CPU code. */ lapackf77_cgetrf(&m, &n, A, &lda, ipiv, info); } else { /* Use hybrid blocked code. */ magma_int_t maxm, maxn, ldda, maxdim; magma_int_t i, rows, cols, s = min(m, n)/nb; maxm = ((m + 31)/32)*32; maxn = ((n + 31)/32)*32; maxdim = max(maxm, maxn); /* set number of GPUs */ magma_int_t num_gpus = magma_num_gpus(); if ( num_gpus > 1 ) { /* call multi-GPU non-GPU-resident interface */ magma_cgetrf_m(num_gpus, m, n, A, lda, ipiv, info); return *info; } /* explicitly checking the memory requirement */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(magmaFloatComplex); int h = 1+(2+num_gpus), num_gpus2 = num_gpus; int NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); const char* ngr_nb_char = getenv("MAGMA_NGR_NB"); if ( ngr_nb_char != NULL ) NB = max( nb, min( NB, atoi(ngr_nb_char) ) ); if ( num_gpus > ceil((float)NB/nb) ) { num_gpus2 = (int)ceil((float)NB/nb); h = 1+(2+num_gpus2); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); } if ( num_gpus2*NB < n ) { /* require too much memory, so call non-GPU-resident version */ magma_cgetrf_m(num_gpus, m, n, A, lda, ipiv, info); return *info; } ldda = maxn; work = A; if (maxdim*maxdim < 2*maxm*maxn) { // if close to square, allocate square matrix and transpose in-place if (MAGMA_SUCCESS != magma_cmalloc( &dA, nb*maxm + maxdim*maxdim )) { /* alloc failed so call non-GPU-resident version */ magma_cgetrf_m(num_gpus, m, n, A, lda, ipiv, info); return *info; } da = dA + nb*maxm; ldda = maxdim; magma_csetmatrix( m, n, A, lda, da, ldda ); dAT = da; magmablas_ctranspose_inplace( ldda, dAT, ldda ); } else { // if very rectangular, allocate dA and dAT and transpose out-of-place if (MAGMA_SUCCESS != magma_cmalloc( &dA, (nb + maxn)*maxm )) { /* alloc failed so call non-GPU-resident version */ magma_cgetrf_m(num_gpus, m, n, A, lda, ipiv, info); return *info; } da = dA + nb*maxm; magma_csetmatrix( m, n, A, lda, da, maxm ); if (MAGMA_SUCCESS != magma_cmalloc( &dAT, maxm*maxn )) { /* alloc failed so call non-GPU-resident version */ magma_free( dA ); magma_cgetrf_m(num_gpus, m, n, A, lda, ipiv, info); return *info; } magmablas_ctranspose( m, n, da, maxm, dAT, ldda ); } lapackf77_cgetrf( &m, &nb, work, &lda, ipiv, &iinfo); /* Define user stream if current stream is NULL */ cudaStream_t stream[2], current_stream; magmablasGetKernelStream(¤t_stream); magma_queue_create( &stream[0] ); if (current_stream == NULL) { magma_queue_create( &stream[1] ); magmablasSetKernelStream(stream[1]); } else stream[1] = current_stream; for( i = 0; i < s; i++ ) { // download i-th panel cols = maxm - i*nb; if (i > 0) { // download i-th panel magmablas_ctranspose( nb, cols, dAT(i,i), ldda, dA, cols ); // make sure that gpu queue is empty magma_device_sync(); magma_cgetmatrix_async( m-i*nb, nb, dA, cols, work, lda, stream[0]); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n - (i+1)*nb, nb, c_one, dAT(i-1,i-1), ldda, dAT(i-1,i+1), ldda ); magma_cgemm( MagmaNoTrans, MagmaNoTrans, n-(i+1)*nb, m-i*nb, nb, c_neg_one, dAT(i-1,i+1), ldda, dAT(i, i-1), ldda, c_one, dAT(i, i+1), ldda ); // do the cpu part rows = m - i*nb; magma_queue_sync( stream[0] ); lapackf77_cgetrf( &rows, &nb, work, &lda, ipiv+i*nb, &iinfo); } if (*info == 0 && iinfo > 0) *info = iinfo + i*nb; // upload i-th panel magma_csetmatrix_async( m-i*nb, nb, work, lda, dA, cols, stream[0]); magmablas_cpermute_long2( ldda, dAT, ldda, ipiv, nb, i*nb ); magma_queue_sync( stream[0] ); magmablas_ctranspose( cols, nb, dA, cols, dAT(i,i), ldda ); // do the small non-parallel computations if (s > (i+1)) { magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb, nb, c_one, dAT(i, i ), ldda, dAT(i, i+1), ldda); magma_cgemm( MagmaNoTrans, MagmaNoTrans, nb, m-(i+1)*nb, nb, c_neg_one, dAT(i, i+1), ldda, dAT(i+1, i ), ldda, c_one, dAT(i+1, i+1), ldda ); } else { magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb, nb, c_one, dAT(i, i ), ldda, dAT(i, i+1), ldda); magma_cgemm( MagmaNoTrans, MagmaNoTrans, n-(i+1)*nb, m-(i+1)*nb, nb, c_neg_one, dAT(i, i+1), ldda, dAT(i+1, i ), ldda, c_one, dAT(i+1, i+1), ldda ); } } magma_int_t nb0 = min(m - s*nb, n - s*nb); if ( nb0 > 0 ) { rows = m - s*nb; cols = maxm - s*nb; magmablas_ctranspose( nb0, rows, dAT(s,s), ldda, dA, cols ); magma_cgetmatrix( rows, nb0, dA, cols, work, lda ); // make sure that gpu queue is empty magma_device_sync(); // do the cpu part lapackf77_cgetrf( &rows, &nb0, work, &lda, ipiv+s*nb, &iinfo); if (*info == 0 && iinfo > 0) *info = iinfo + s*nb; magmablas_cpermute_long2( ldda, dAT, ldda, ipiv, nb0, s*nb ); magma_csetmatrix( rows, nb0, work, lda, dA, cols ); magmablas_ctranspose( rows, nb0, dA, cols, dAT(s,s), ldda ); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb-nb0, nb0, c_one, dAT(s, s), ldda, dAT(s, s)+nb0, ldda); } if (maxdim*maxdim < 2*maxm*maxn) { magmablas_ctranspose_inplace( ldda, dAT, ldda ); magma_cgetmatrix( m, n, da, ldda, A, lda ); } else { magmablas_ctranspose( n, m, dAT, ldda, da, maxm ); magma_cgetmatrix( m, n, da, maxm, A, lda ); magma_free( dAT ); } magma_free( dA ); magma_queue_destroy( stream[0] ); if (current_stream == NULL) { magma_queue_destroy( stream[1] ); magmablasSetKernelStream(NULL); } } return *info; } /* magma_cgetrf */
int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, t1, t2; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_trans_t trans[] = { MagmaNoTrans, MagmaConjTrans, MagmaTrans }; magma_uplo_t uplo [] = { MagmaLower, MagmaUpper }; magma_diag_t diag [] = { MagmaUnit, MagmaNonUnit }; magma_side_t side [] = { MagmaLeft, MagmaRight }; magmaFloatComplex *A, *B, *C, *C2, *LU; magmaFloatComplex_ptr dA, dB, dC1, dC2; magmaFloatComplex alpha = MAGMA_C_MAKE( 0.5, 0.1 ); magmaFloatComplex beta = MAGMA_C_MAKE( 0.7, 0.2 ); float dalpha = 0.6; float dbeta = 0.8; float work[1], error, total_error; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t m, n, k, size, maxn, ld, info; magma_int_t *piv; magma_int_t err; magma_opts opts; parse_opts( argc, argv, &opts ); printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" ); total_error = 0.; for( int itest = 0; itest < opts.ntest; ++itest ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; printf("=========================================================================\n"); printf( "m=%d, n=%d, k=%d\n", (int) m, (int) n, (int) k ); // allocate matrices // over-allocate so they can be any combination of {m,n,k} x {m,n,k}. maxn = max( max( m, n ), k ); ld = max( 1, maxn ); size = ld*maxn; err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) ); assert( err == 0 ); err = magma_cmalloc_pinned( &A, size ); assert( err == 0 ); err = magma_cmalloc_pinned( &B, size ); assert( err == 0 ); err = magma_cmalloc_pinned( &C, size ); assert( err == 0 ); err = magma_cmalloc_pinned( &C2, size ); assert( err == 0 ); err = magma_cmalloc_pinned( &LU, size ); assert( err == 0 ); err = magma_cmalloc( &dA, size ); assert( err == 0 ); err = magma_cmalloc( &dB, size ); assert( err == 0 ); err = magma_cmalloc( &dC1, size ); assert( err == 0 ); err = magma_cmalloc( &dC2, size ); assert( err == 0 ); // initialize matrices size = maxn*maxn; lapackf77_clarnv( &ione, ISEED, &size, A ); lapackf77_clarnv( &ione, ISEED, &size, B ); lapackf77_clarnv( &ione, ISEED, &size, C ); printf( "========== Level 1 BLAS ==========\n" ); // ----- test CSWAP // swap columns 2 and 3 of dA, then copy to C2 and compare with A if ( n >= 3 ) { magma_csetmatrix( m, n, A, ld, dA, ld ); magma_csetmatrix( m, n, A, ld, dB, ld ); magma_cswap( m, dA(0,1), 1, dA(0,2), 1 ); magma_cswap( m, dB(0,1), 1, dB(0,2), 1 ); // check results, storing diff between magma and cuda calls in C2 cublasCaxpy( opts.handle, ld*n, &c_neg_one, dA, 1, dB, 1 ); magma_cgetmatrix( m, n, dB, ld, C2, ld ); error = lapackf77_clange( "F", &m, &k, C2, &ld, work ); total_error += error; printf( "cswap diff %.2g\n", error ); } else { printf( "cswap skipped for n < 3\n" ); } // ----- test ICAMAX // get argmax of column of A magma_csetmatrix( m, k, A, ld, dA, ld ); error = 0; for( int j = 0; j < k; ++j ) { magma_int_t i1 = magma_icamax( m, dA(0,j), 1 ); int i2; // NOT magma_int_t, for cublas cublasIcamax( opts.handle, m, dA(0,j), 1, &i2 ); // todo need sync here? assert( i1 == i2 ); error += abs( i1 - i2 ); } total_error += error; gflops = (float)m * k / 1e9; printf( "icamax diff %.2g\n", error ); printf( "\n" ); printf( "========== Level 2 BLAS ==========\n" ); // ----- test CGEMV // c = alpha*A*b + beta*c, with A m*n; b,c m or n-vectors // try no-trans/trans for( int ia = 0; ia < 3; ++ia ) { magma_csetmatrix( m, n, A, ld, dA, ld ); magma_csetvector( maxn, B, 1, dB, 1 ); magma_csetvector( maxn, C, 1, dC1, 1 ); magma_csetvector( maxn, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_cgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCgemv( opts.handle, cublas_trans_const(trans[ia]), m, n, &alpha, dA, ld, dB, 1, &beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 size = (trans[ia] == MagmaNoTrans ? m : n); cublasCaxpy( opts.handle, size, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetvector( size, dC2, 1, C2, 1 ); error = lapackf77_clange( "F", &size, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_CGEMV( m, n ) / 1e9; printf( "cgemv( %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_trans_const(trans[ia]), error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test CHEMV // c = alpha*A*b + beta*c, with A m*m symmetric; b,c m-vectors // try upper/lower for( int iu = 0; iu < 2; ++iu ) { magma_csetmatrix( m, m, A, ld, dA, ld ); magma_csetvector( m, B, 1, dB, 1 ); magma_csetvector( m, C, 1, dC1, 1 ); magma_csetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_chemv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasChemv( opts.handle, cublas_uplo_const(uplo[iu]), m, &alpha, dA, ld, dB, 1, &beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( opts.handle, m, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_clange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_CHEMV( m ) / 1e9; printf( "chemv( %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test CTRSV // solve A*c = c, with A m*m triangular; c m-vector // try upper/lower, no-trans/trans, unit/non-unit diag // Factor A into LU to get well-conditioned triangles, else solve yields garbage. // Still can give garbage if solves aren't consistent with LU factors, // e.g., using unit diag for U, so copy lower triangle to upper triangle. // Also used for trsm later. lapackf77_clacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld ); lapackf77_cgetrf( &maxn, &maxn, LU, &ld, piv, &info ); for( int j = 0; j < maxn; ++j ) { for( int i = 0; i < j; ++i ) { *LU(i,j) = *LU(j,i); } } for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { magma_csetmatrix( m, m, LU, ld, dA, ld ); magma_csetvector( m, C, 1, dC1, 1 ); magma_csetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_ctrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCtrsv( opts.handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), cublas_diag_const(diag[id]), m, dA, ld, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( opts.handle, m, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_clange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_CTRSM( MagmaLeft, m, 1 ) / 1e9; printf( "ctrsv( %c, %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), lapacke_diag_const(diag[id]), error, gflops/t1, gflops/t2 ); }}} printf( "\n" ); printf( "========== Level 3 BLAS ==========\n" ); // ----- test CGEMM // C = alpha*A*B + beta*C, with A m*k or k*m; B k*n or n*k; C m*n // try combinations of no-trans/trans for( int ia = 0; ia < 3; ++ia ) { for( int ib = 0; ib < 3; ++ib ) { bool nta = (trans[ia] == MagmaNoTrans); bool ntb = (trans[ib] == MagmaNoTrans); magma_csetmatrix( (nta ? m : k), (nta ? m : k), A, ld, dA, ld ); magma_csetmatrix( (ntb ? k : n), (ntb ? n : k), B, ld, dB, ld ); magma_csetmatrix( m, n, C, ld, dC1, ld ); magma_csetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_cgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCgemm( opts.handle, cublas_trans_const(trans[ia]), cublas_trans_const(trans[ib]), m, n, k, &alpha, dA, ld, dB, ld, &beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( opts.handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CGEMM( m, n, k ) / 1e9; printf( "cgemm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_trans_const(trans[ia]), lapacke_trans_const(trans[ib]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test CHEMM // C = alpha*A*B + beta*C (left) with A m*m symmetric; B,C m*n; or // C = alpha*B*A + beta*C (right) with A n*n symmetric; B,C m*n // try left/right, upper/lower for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { magma_csetmatrix( m, m, A, ld, dA, ld ); magma_csetmatrix( m, n, B, ld, dB, ld ); magma_csetmatrix( m, n, C, ld, dC1, ld ); magma_csetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_chemm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasChemm( opts.handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]), m, n, &alpha, dA, ld, dB, ld, &beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( opts.handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CHEMM( side[is], m, n ) / 1e9; printf( "chemm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_side_const(side[is]), lapacke_uplo_const(uplo[iu]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test CHERK // C = alpha*A*A^H + beta*C (no-trans) with A m*k and C m*m symmetric; or // C = alpha*A^H*A + beta*C (trans) with A k*m and C m*m symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { magma_csetmatrix( n, k, A, ld, dA, ld ); magma_csetmatrix( n, n, C, ld, dC1, ld ); magma_csetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_cherk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCherk( opts.handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), n, k, &dalpha, dA, ld, &dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( opts.handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CHERK( k, n ) / 1e9; printf( "cherk( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test CHER2K // C = alpha*A*B^H + ^alpha*B*A^H + beta*C (no-trans) with A,B n*k; C n*n symmetric; or // C = alpha*A^H*B + ^alpha*B^H*A + beta*C (trans) with A,B k*n; C n*n symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { bool nt = (trans[it] == MagmaNoTrans); magma_csetmatrix( (nt ? n : k), (nt ? n : k), A, ld, dA, ld ); magma_csetmatrix( n, n, C, ld, dC1, ld ); magma_csetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_cher2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCher2k( opts.handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), n, k, &alpha, dA, ld, dB, ld, &dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( opts.handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CHER2K( k, n ) / 1e9; printf( "cher2k( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test CTRMM // C = alpha*A*C (left) with A m*m triangular; C m*n; or // C = alpha*C*A (right) with A n*n triangular; C m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == MagmaLeft); magma_csetmatrix( (left ? m : n), (left ? m : n), A, ld, dA, ld ); magma_csetmatrix( m, n, C, ld, dC1, ld ); magma_csetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_ctrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; // note cublas does trmm out-of-place (i.e., adds output matrix C), // but allows C=B to do in-place. t2 = magma_sync_wtime( 0 ); cublasCtrmm( opts.handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), cublas_diag_const(diag[id]), m, n, &alpha, dA, ld, dC2, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( opts.handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CTRMM( side[is], m, n ) / 1e9; printf( "ctrmm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // ----- test CTRSM // solve A*X = alpha*B (left) with A m*m triangular; B m*n; or // solve X*A = alpha*B (right) with A n*n triangular; B m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == MagmaLeft); magma_csetmatrix( (left ? m : n), (left ? m : n), LU, ld, dA, ld ); magma_csetmatrix( m, n, C, ld, dC1, ld ); magma_csetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_ctrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCtrsm( opts.handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), cublas_diag_const(diag[id]), m, n, &alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( opts.handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CTRSM( side[is], m, n ) / 1e9; printf( "ctrsm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // cleanup magma_free_cpu( piv ); magma_free_pinned( A ); magma_free_pinned( B ); magma_free_pinned( C ); magma_free_pinned( C2 ); magma_free_pinned( LU ); magma_free( dA ); magma_free( dB ); magma_free( dC1 ); magma_free( dC2 ); fflush( stdout ); } if ( total_error != 0. ) { printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n", total_error ); } else { printf( "all tests passed\n" ); } TESTING_FINALIZE(); int status = (total_error != 0.); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgetrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float error; magmaFloatComplex *h_A, *h_R; magmaFloatComplex *d_A; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, info, min_mn; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) ||PA-LU||/(||A||*N)\n"); printf("=========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_CGETRF( M, N ) / 1e9; if ( N > 512 ) { printf( "%5d %5d skipping because cgetf2 does not support N > 512\n", (int) M, (int) N ); continue; } TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2 ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_csetmatrix( M, N, h_R, lda, d_A, ldda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_cgetrf(&M, &N, h_A, &lda, ipiv, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_cgetf2_gpu( M, N, d_A, ldda, ipiv, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cgetf2_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000. ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time*1000. ); } if ( opts.check ) { magma_cgetmatrix( M, N, d_A, ldda, h_A, lda ); error = get_LU_error( M, N, h_R, lda, h_A, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); } else { printf(" --- \n"); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgetrf_mgpu */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float error; magmaFloatComplex *h_A; magmaFloatComplex *d_lA[ MagmaMaxGPUs ]; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, n_local, ngpu; magma_int_t info, min_mn, nb, ldn_local; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 2 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |PA-LU|/(N*|A|)\n"); } printf("=========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; nb = magma_get_cgetrf_nb( M ); gflops = FLOPS_CGETRF( M, N ) / 1e9; // ngpu must be at least the number of blocks ngpu = min( opts.ngpu, int((N+nb-1)/nb) ); if ( ngpu < opts.ngpu ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); } // Allocate host memory for the matrix TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); // Allocate device memory for( int dev=0; dev < ngpu; dev++){ n_local = ((N/nb)/ngpu)*nb; if (dev < (N/nb) % ngpu) n_local += nb; else if (dev == (N/nb) % ngpu) n_local += N % nb; ldn_local = ((n_local+31)/32)*32; // TODO why? magma_setdevice( dev ); TESTING_MALLOC_DEV( d_lA[dev], magmaFloatComplex, ldda*ldn_local ); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { init_matrix( M, N, h_A, lda ); cpu_time = magma_wtime(); lapackf77_cgetrf( &M, &N, h_A, &lda, ipiv, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( M, N, h_A, lda ); magma_csetmatrix_1D_col_bcyclic( M, N, h_A, lda, d_lA, ldda, ngpu, nb ); gpu_time = magma_wtime(); magma_cgetrf_mgpu( ngpu, M, N, d_lA, ldda, ipiv, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cgetrf_mgpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_cgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_A, lda, ngpu, nb ); /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 2 ) { error = get_residual( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else if ( opts.check ) { error = get_LU_error( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { printf( " ---\n" ); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); for( int dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); TESTING_FREE_DEV( d_lA[dev] ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }