extern "C" magma_int_t magma_zgetrf2_piv(magma_int_t m, magma_int_t n, magma_int_t start, magma_int_t end, magmaDoubleComplex *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info) { magma_int_t I, k1, k2, nb, incx, minmn; *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,m)) *info = -4; if (*info != 0) return MAGMA_ERR_ILLEGAL_VALUE; /* Quick return if possible */ if (m == 0 || n == 0) return MAGMA_SUCCESS; /* initialize nb */ nb = magma_get_zgetrf_nb(m); minmn = min( end, min(m,n) ); for( I=start; I < end-nb; I += nb ) { incx = 1; k1 = 1+I+nb; k2 = minmn; lapackf77_zlaswp(&nb, A(0,I), &lda, &k1, &k2, ipiv, &incx); } return MAGMA_SUCCESS; } /* magma_zgetrf_piv */
/** Purpose ------- ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. If the current stream is NULL, this version replaces it with a new stream to overlap computation with communication. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA COMPLEX_16 array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,M). @param[out] ipiv INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_zgesv_comp ********************************************************************/ extern "C" magma_int_t magma_zgetrf_gpu( magma_int_t m, magma_int_t n, magmaDoubleComplex_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info) { #define dAT(i_, j_) (dAT + (i_)*nb*lddat + (j_)*nb) magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t iinfo, nb; magma_int_t maxm, maxn, mindim; magma_int_t i, j, rows, cols, s, lddat, ldwork; magmaDoubleComplex *dAT, *dAP, *work; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); nb = magma_get_zgetrf_nb(m); s = mindim / nb; if (nb <= 1 || nb >= min(m,n)) { /* Use CPU code. */ magma_zmalloc_cpu( &work, m * n ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_zgetmatrix( m, n, dA, ldda, work, m ); lapackf77_zgetrf(&m, &n, work, &m, ipiv, info); magma_zsetmatrix( m, n, work, m, dA, ldda ); magma_free_cpu(work); } else { /* Use hybrid blocked code. */ maxm = ((m + 31)/32)*32; maxn = ((n + 31)/32)*32; if (MAGMA_SUCCESS != magma_zmalloc( &dAP, nb*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } // square matrices can be done in place; // rectangular requires copy to transpose if ( m == n ) { dAT = dA; lddat = ldda; magmablas_ztranspose_inplace( m, dAT, ldda ); } else { lddat = maxn; // N-by-M if (MAGMA_SUCCESS != magma_zmalloc( &dAT, lddat*maxm )) { magma_free( dAP ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magmablas_ztranspose( m, n, dA, ldda, dAT, lddat ); } ldwork = maxm; if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, ldwork*nb )) { magma_free( dAP ); if ( ! (m == n)) magma_free( dAT ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* Define user stream if current stream is NULL */ magma_queue_t stream[2]; magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); magma_queue_create( &stream[0] ); if (orig_stream == NULL) { magma_queue_create( &stream[1] ); magmablasSetKernelStream(stream[1]); } else { stream[1] = orig_stream; } for( j=0; j < s; j++ ) { // download j-th panel cols = maxm - j*nb; magmablas_ztranspose( nb, m-j*nb, dAT(j,j), lddat, dAP, cols ); // make sure that the transpose has completed magma_queue_sync( stream[1] ); magma_zgetmatrix_async( m-j*nb, nb, dAP, cols, work, ldwork, stream[0]); if ( j > 0 ) { magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n - (j+1)*nb, nb, c_one, dAT(j-1,j-1), lddat, dAT(j-1,j+1), lddat ); magma_zgemm( MagmaNoTrans, MagmaNoTrans, n-(j+1)*nb, m-j*nb, nb, c_neg_one, dAT(j-1,j+1), lddat, dAT(j, j-1), lddat, c_one, dAT(j, j+1), lddat ); } // do the cpu part rows = m - j*nb; magma_queue_sync( stream[0] ); lapackf77_zgetrf( &rows, &nb, work, &ldwork, ipiv+j*nb, &iinfo); if ( *info == 0 && iinfo > 0 ) *info = iinfo + j*nb; // upload j-th panel magma_zsetmatrix_async( m-j*nb, nb, work, ldwork, dAP, maxm, stream[0]); for( i=j*nb; i < j*nb + nb; ++i ) { ipiv[i] += j*nb; } magmablas_zlaswp( n, dAT, lddat, j*nb + 1, j*nb + nb, ipiv, 1 ); magma_queue_sync( stream[0] ); magmablas_ztranspose( m-j*nb, nb, dAP, maxm, dAT(j,j), lddat ); // do the small non-parallel computations (next panel update) if ( s > (j+1) ) { magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb, nb, c_one, dAT(j, j ), lddat, dAT(j, j+1), lddat); magma_zgemm( MagmaNoTrans, MagmaNoTrans, nb, m-(j+1)*nb, nb, c_neg_one, dAT(j, j+1), lddat, dAT(j+1, j ), lddat, c_one, dAT(j+1, j+1), lddat ); } else { magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb, nb, c_one, dAT(j, j ), lddat, dAT(j, j+1), lddat); magma_zgemm( MagmaNoTrans, MagmaNoTrans, n-(j+1)*nb, m-(j+1)*nb, nb, c_neg_one, dAT(j, j+1), lddat, dAT(j+1, j ), lddat, c_one, dAT(j+1, j+1), lddat ); } } magma_int_t nb0 = min(m - s*nb, n - s*nb); if ( nb0 > 0 ) { rows = m - s*nb; cols = maxm - s*nb; magmablas_ztranspose( nb0, rows, dAT(s,s), lddat, dAP, maxm ); magma_zgetmatrix( rows, nb0, dAP, maxm, work, ldwork ); // do the cpu part lapackf77_zgetrf( &rows, &nb0, work, &ldwork, ipiv+s*nb, &iinfo); if ( *info == 0 && iinfo > 0 ) *info = iinfo + s*nb; for( i=s*nb; i < s*nb + nb0; ++i ) { ipiv[i] += s*nb; } magmablas_zlaswp( n, dAT, lddat, s*nb + 1, s*nb + nb0, ipiv, 1 ); // upload j-th panel magma_zsetmatrix( rows, nb0, work, ldwork, dAP, maxm ); magmablas_ztranspose( rows, nb0, dAP, maxm, dAT(s,s), lddat ); magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb-nb0, nb0, c_one, dAT(s,s), lddat, dAT(s,s)+nb0, lddat); } // undo transpose if ( m == n ) { magmablas_ztranspose_inplace( m, dAT, lddat ); } else { magmablas_ztranspose( n, m, dAT, lddat, dA, ldda ); magma_free( dAT ); } magma_free( dAP ); magma_free_pinned( work ); magma_queue_destroy( stream[0] ); if (orig_stream == NULL) { magma_queue_destroy( stream[1] ); } magmablasSetKernelStream( orig_stream ); } return *info; } /* magma_zgetrf_gpu */
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgetrf_mgpu */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; double error; magmaDoubleComplex *h_A; magmaDoubleComplex_ptr d_lA[ MagmaMaxGPUs ]; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, n_local, ngpu; magma_int_t info, min_mn, nb, ldn_local; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 2 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |PA-LU|/(N*|A|)\n"); } printf("=========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; nb = magma_get_zgetrf_nb( M ); gflops = FLOPS_ZGETRF( M, N ) / 1e9; // ngpu must be at least the number of blocks ngpu = min( opts.ngpu, int((N+nb-1)/nb) ); if ( ngpu < opts.ngpu ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); } // Allocate host memory for the matrix TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); // Allocate device memory for( int dev=0; dev < ngpu; dev++ ) { n_local = ((N/nb)/ngpu)*nb; if (dev < (N/nb) % ngpu) n_local += nb; else if (dev == (N/nb) % ngpu) n_local += N % nb; ldn_local = ((n_local+31)/32)*32; // TODO why? magma_setdevice( dev ); TESTING_MALLOC_DEV( d_lA[dev], magmaDoubleComplex, ldda*ldn_local ); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { init_matrix( M, N, h_A, lda ); cpu_time = magma_wtime(); lapackf77_zgetrf( &M, &N, h_A, &lda, ipiv, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( M, N, h_A, lda ); magma_zsetmatrix_1D_col_bcyclic( M, N, h_A, lda, d_lA, ldda, ngpu, nb ); gpu_time = magma_wtime(); magma_zgetrf_mgpu( ngpu, M, N, d_lA, ldda, ipiv, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zgetrf_mgpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_zgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_A, lda, ngpu, nb ); /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 2 ) { error = get_residual( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else if ( opts.check ) { error = get_LU_error( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { printf( " ---\n" ); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); for( int dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); TESTING_FREE_DEV( d_lA[dev] ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
extern "C" magma_err_t magma_zgetrf_msub(magma_int_t trans, magma_int_t num_subs, magma_int_t num_gpus, magma_int_t m, magma_int_t n, magmaDoubleComplex_ptr *d_lA, size_t dlA_offset, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info, magma_queue_t *queues) { /* -- clMAGMA (version 1.1.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date January 2014 Purpose ======= ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments ========= NUM_GPUS (input) INTEGER The number of GPUS to be used for the factorization. M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. LDDA (input) INTEGER The leading dimension of the array A. LDDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ magma_int_t maxm, tot_subs = num_subs*num_gpus; magma_int_t i, j, d, lddat; /* submatrix info */ magma_int_t nb, n_local[ MagmaMaxSubs * MagmaMaxGPUs ]; magmaDoubleComplex_ptr d_lAT[ MagmaMaxSubs * MagmaMaxGPUs ]; /* local workspace per GPU */ magmaDoubleComplex_ptr d_panel[ MagmaMaxGPUs ]; magmaDoubleComplex_ptr d_lAP[ MagmaMaxGPUs ]; magmaDoubleComplex *work; /* Check arguments */ *info = 0; if (m < 0) *info = -2; else if (n < 0) *info = -3; else if (trans == MagmaTrans && ldda < max(1,m)) *info = -5; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ nb = magma_get_zgetrf_nb(m); if (nb <= 1 || nb >= n) { /* Use CPU code. */ magma_zmalloc_cpu( &work, m * n ); if (work == NULL) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_zgetmatrix( m, n, d_lA[0], 0, ldda, work, 0, m, queues[0] ); lapackf77_zgetrf(&m, &n, work, &m, ipiv, info); magma_zsetmatrix( m, n, work, 0, m, d_lA[0], 0, ldda, queues[0] ); magma_free_cpu(work); } else { /* Use hybrid blocked code. */ maxm = ((m + 31)/32)*32; if (tot_subs > ceil((double)n/nb)) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) tot_subs ); *info = -1; return *info; } /* allocate workspace for each GPU */ lddat = n/nb; /* number of block columns */ lddat = lddat/tot_subs; /* number of block columns per GPU */ lddat = nb*lddat; /* number of columns per GPU */ if (lddat * tot_subs < n) { /* left over */ if (n-lddat*tot_subs >= nb) { lddat += nb; } else { lddat += (n-lddat*tot_subs)%nb; } } lddat = ((lddat+31)/32)*32; /* make it a multiple of 32 */ /* allocating workspace */ for (d=0; d<num_gpus; d++) { //#define SINGLE_GPU_PER_CONTEXT #ifdef SINGLE_GPU_PER_CONTEXT if ((MAGMA_SUCCESS != magma_zmalloc_mgpu( d, &d_panel[d], (2+num_gpus)*nb*maxm )) || (MAGMA_SUCCESS != magma_zmalloc_mgpu( d, &d_lAP[d], (2+num_gpus)*nb*maxm )) ) { #else if ((MAGMA_SUCCESS != magma_zmalloc( &d_panel[d], (2+num_gpus)*nb*maxm )) || (MAGMA_SUCCESS != magma_zmalloc( &d_lAP[d], (2+num_gpus)*nb*maxm )) ) { #endif for( i=0; i<d; i++ ) { magma_free( d_panel[i] ); magma_free( d_lAP[i] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } /* transposing the local matrix */ for (i=0; i<tot_subs; i++) { /* local-n and local-ld */ n_local[i] = ((n/nb)/tot_subs)*nb; if (i < (n/nb)%tot_subs) n_local[i] += nb; else if (i == (n/nb)%tot_subs) n_local[i] += n%nb; /* local-matrix storage */ if (trans == MagmaNoTrans) { d_lAT[i] = d_lA[i]; } else { if ((m == n_local[i]) && (m%32 == 0) && (ldda%32 == 0) && (ldda == lddat)) { d_lAT[i] = d_lA[i]; magma_ztranspose_inplace( d_lA[i], 0, ldda, ldda, queues[2*(i%num_gpus)+1] ); } else { #ifdef SINGLE_GPU_PER_CONTEXT if (MAGMA_SUCCESS != magma_zmalloc_mgpu( i%num_gpus, &d_lAT[i], lddat*maxm )) { #else if (MAGMA_SUCCESS != magma_zmalloc( &d_lAT[i], lddat*maxm )) { #endif for (j=0; j<=i; j++) { magma_free( d_panel[j] ); magma_free( d_lAP[j] ); } for (j=0; j<i; j++) { if (d_lAT[j] != d_lA[j]) magma_free( d_lAT[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_ztranspose2(d_lAT[i], 0, lddat, d_lA[i], 0, ldda, m, n_local[i], queues[2*(i%num_gpus)+1]); } } } if (trans == MagmaNoTrans) { for (d=0; d<num_gpus; d++){ magma_queue_sync(queues[2*d+1]); } } /* cpu workspace */ #ifdef USE_PINNED_CLMEMORY cl_mem buffer = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(magmaDoubleComplex)*maxm*nb*(1+num_gpus), NULL, NULL); for (d=0; d<num_gpus; d++) { work = (magmaDoubleComplex*)clEnqueueMapBuffer(queues[2*d], buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(magmaDoubleComplex)*maxm*nb*(1+num_gpus), 0, NULL, NULL, NULL); } #else if (MAGMA_SUCCESS != magma_zmalloc_cpu( &work, maxm*nb*(1+num_gpus) )) { for(d=0; d<num_gpus; d++ ) magma_free( d_panel[d] ); for(d=0; d<tot_subs; d++ ) { if( d_lAT[d] != d_lA[d] ) magma_free( d_lAT[d] ); } *info = MAGMA_ERR_HOST_ALLOC; return *info; } #endif /* calling multi-gpu interface with allocated workspaces and streams */ magma_zgetrf2_msub(num_subs, num_gpus, m, n, nb, 0, d_lAT, 0, lddat, ipiv, d_lAP, d_panel, 0, work, maxm, info, queues); /* save on output */ for (d=0; d<tot_subs; d++) { if (trans == MagmaNoTrans) { //magma_zcopymatrix( n_local[d], m, d_lAT[d], 0, lddat, d_lA[d], 0, ldda, queues[2*d+1] ); } else { if (d_lAT[d] == d_lA[d]) { magma_ztranspose_inplace( d_lA[d], 0, ldda, ldda, queues[2*(d%num_gpus)+1] ); } else { magma_ztranspose2( d_lA[d], 0, ldda, d_lAT[d], 0, lddat, n_local[d], m, queues[2*(d%num_gpus)+1] ); } } } /* clean up */ for (d=0; d<num_gpus; d++) { magma_queue_sync(queues[2*d+1]); magma_free( d_panel[d] ); magma_free( d_lAP[d] ); d_panel[d] = d_lAP[d] = NULL; } for (d=0; d<tot_subs; d++) { if (d_lAT[d] != d_lA[d]) { magma_free( d_lAT[d] ); d_lAT[d] = NULL; } } #ifdef USE_PINNED_CLMEMORY for (d=0; d<num_gpus; d++) { clEnqueueUnmapMemObject(queues[2*d], buffer, work, 0, NULL, NULL); } clReleaseMemObject( buffer ); #else magma_free_cpu( work ); #endif work = NULL; } return *info; /* End of MAGMA_ZGETRF_MSUB */ }
extern "C" magma_int_t magma_zgetrf2_gpu( magma_int_t m, magma_int_t n, magmaDoubleComplex_ptr dA, size_t dA_offset, magma_int_t ldda, magma_int_t *ipiv, magma_queue_t queues[2], magma_int_t *info ) { /* -- clMAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. LDDA (input) INTEGER The leading dimension of the array A. LDDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ #define dA(i_, j_) dA, dA_offset + (i_)*nb + (j_)*nb*ldda #define dAT(i_, j_) dAT, dAT_offset + (i_)*nb*lddat + (j_)*nb #define dAP(i_, j_) dAP, (i_) + (j_)*maxm #define work(i_) (work + (i_)) magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t iinfo, nb; magma_int_t maxm, maxn, mindim; magma_int_t i, j, rows, s, lddat, ldwork; magmaDoubleComplex_ptr dAT, dAP; magmaDoubleComplex *work; size_t dAT_offset; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); nb = magma_get_zgetrf_nb(m); s = mindim / nb; if (nb <= 1 || nb >= min(m,n)) { /* Use CPU code. */ if ( MAGMA_SUCCESS != magma_zmalloc_cpu( &work, m*n )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_zgetmatrix( m, n, dA(0,0), ldda, work(0), m, queues[0] ); lapackf77_zgetrf( &m, &n, work, &m, ipiv, info ); magma_zsetmatrix( m, n, work(0), m, dA(0,0), ldda, queues[0] ); magma_free_cpu( work ); } else { /* Use hybrid blocked code. */ maxm = ((m + 31)/32)*32; maxn = ((n + 31)/32)*32; if ( MAGMA_SUCCESS != magma_zmalloc( &dAP, nb*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } // square matrices can be done in place; // rectangular requires copy to transpose if ( m == n ) { dAT = dA; dAT_offset = dA_offset; lddat = ldda; magmablas_ztranspose_inplace( m, dAT(0,0), lddat, queues[0] ); } else { lddat = maxn; // N-by-M dAT_offset = 0; if ( MAGMA_SUCCESS != magma_zmalloc( &dAT, lddat*maxm )) { magma_free( dAP ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magmablas_ztranspose( m, n, dA(0,0), ldda, dAT(0,0), lddat, queues[0] ); } ldwork = maxm; /* if ( MAGMA_SUCCESS != magma_zmalloc_cpu( &work, ldwork*nb ) ) { magma_free( dAP ); if ( dA != dAT ) magma_free( dAT ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } */ cl_mem work_mapped = clCreateBuffer( gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, ldwork*nb * sizeof(magmaDoubleComplex), NULL, NULL ); work = (magmaDoubleComplex*) clEnqueueMapBuffer( queues[0], work_mapped, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, ldwork*nb * sizeof(magmaDoubleComplex), 0, NULL, NULL, NULL ); for( j=0; j < s; j++ ) { // download j-th panel magmablas_ztranspose( nb, m-j*nb, dAT(j,j), lddat, dAP(0,0), maxm, queues[0] ); clFlush( queues[0] ); magma_queue_sync( queues[0] ); magma_zgetmatrix_async( m-j*nb, nb, dAP(0,0), maxm, work(0), ldwork, queues[1], NULL ); clFlush( queues[1] ); if ( j > 0 ) { magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n - (j+1)*nb, nb, c_one, dAT(j-1,j-1), lddat, dAT(j-1,j+1), lddat, queues[0] ); magma_zgemm( MagmaNoTrans, MagmaNoTrans, n-(j+1)*nb, m-j*nb, nb, c_neg_one, dAT(j-1,j+1), lddat, dAT(j, j-1), lddat, c_one, dAT(j, j+1), lddat, queues[0] ); } magma_queue_sync( queues[1] ); // do the cpu part rows = m - j*nb; lapackf77_zgetrf( &rows, &nb, work, &ldwork, ipiv+j*nb, &iinfo ); if ( *info == 0 && iinfo > 0 ) *info = iinfo + j*nb; for( i=j*nb; i < j*nb + nb; ++i ) { ipiv[i] += j*nb; } magmablas_zlaswp( n, dAT(0,0), lddat, j*nb + 1, j*nb + nb, ipiv, 1, queues[0] ); clFlush( queues[0] ); // upload j-th panel magma_zsetmatrix_async( m-j*nb, nb, work(0), ldwork, dAP(0,0), maxm, queues[1], NULL ); magma_queue_sync( queues[1] ); magmablas_ztranspose( m-j*nb, nb, dAP(0,0), maxm, dAT(j,j), lddat, queues[0] ); clFlush( queues[0] ); // do the small non-parallel computations (next panel update) if ( s > (j+1) ) { magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb, nb, c_one, dAT(j, j ), lddat, dAT(j, j+1), lddat, queues[0] ); magma_zgemm( MagmaNoTrans, MagmaNoTrans, nb, m-(j+1)*nb, nb, c_neg_one, dAT(j, j+1), lddat, dAT(j+1, j ), lddat, c_one, dAT(j+1, j+1), lddat, queues[0] ); } else { magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb, nb, c_one, dAT(j, j ), lddat, dAT(j, j+1), lddat, queues[0] ); magma_zgemm( MagmaNoTrans, MagmaNoTrans, n-(j+1)*nb, m-(j+1)*nb, nb, c_neg_one, dAT(j, j+1), lddat, dAT(j+1, j ), lddat, c_one, dAT(j+1, j+1), lddat, queues[0] ); } } magma_int_t nb0 = min( m - s*nb, n - s*nb ); if ( nb0 > 0 ) { rows = m - s*nb; magmablas_ztranspose( nb0, rows, dAT(s,s), lddat, dAP(0,0), maxm, queues[0] ); clFlush( queues[0] ); magma_queue_sync( queues[0] ); magma_zgetmatrix_async( rows, nb0, dAP(0,0), maxm, work(0), ldwork, queues[1], NULL ); magma_queue_sync( queues[1] ); // do the cpu part lapackf77_zgetrf( &rows, &nb0, work, &ldwork, ipiv+s*nb, &iinfo ); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + s*nb; for( i=s*nb; i < s*nb + nb0; ++i ) { ipiv[i] += s*nb; } magmablas_zlaswp( n, dAT(0,0), lddat, s*nb + 1, s*nb + nb0, ipiv, 1, queues[0] ); clFlush( queues[0] ); // upload j-th panel magma_zsetmatrix_async( rows, nb0, work(0), ldwork, dAP(0,0), maxm, queues[1], NULL ); magma_queue_sync( queues[1] ); magmablas_ztranspose( rows, nb0, dAP(0,0), maxm, dAT(s,s), lddat, queues[0] ); clFlush( queues[0] ); magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb-nb0, nb0, c_one, dAT(s,s), lddat, dAT(s,s)+nb0, lddat, queues[0] ); } // undo transpose if ( dA == dAT ) { magmablas_ztranspose_inplace( m, dAT(0,0), lddat, queues[0] ); } else { magmablas_ztranspose( n, m, dAT(0,0), lddat, dA(0,0), ldda, queues[0] ); magma_free( dAT ); } magma_queue_sync( queues[0] ); magma_queue_sync( queues[1] ); magma_free( dAP ); // magma_free_cpu( work ); clEnqueueUnmapMemObject( queues[0], work_mapped, work, 0, NULL, NULL ); clReleaseMemObject( work_mapped ); } return *info; } /* magma_zgetrf_gpu */
/** Purpose ------- ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. This version assumes the computation runs through the NULL stream and therefore is not overlapping computation with communication. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA COMPLEX_16 array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,M). @param[out] ipiv INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_zgesv_comp ********************************************************************/ extern "C" magma_int_t magma_zgetrf2_gpu(magma_int_t m, magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info) { #define dAT(i,j) (dAT + (i)*nb*lddat + (j)*nb) magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t iinfo, nb; magma_int_t maxm, maxn, mindim; magma_int_t i, rows, cols, s, lddat, lddwork; magmaDoubleComplex *dAT, *dAP, *work; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); nb = magma_get_zgetrf_nb(m); s = mindim / nb; if (nb <= 1 || nb >= min(m,n)) { /* Use CPU code. */ magma_zmalloc_cpu( &work, m * n ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_zgetmatrix( m, n, dA, ldda, work, m ); lapackf77_zgetrf(&m, &n, work, &m, ipiv, info); magma_zsetmatrix( m, n, work, m, dA, ldda ); magma_free_cpu(work); } else { /* Use hybrid blocked code. */ maxm = ((m + 31)/32)*32; maxn = ((n + 31)/32)*32; lddat = maxn; lddwork = maxm; dAT = dA; if (MAGMA_SUCCESS != magma_zmalloc( &dAP, nb*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } if ( m == n ) { lddat = ldda; magmablas_ztranspose_inplace( m, dAT, ldda ); } else { if (MAGMA_SUCCESS != magma_zmalloc( &dAT, maxm*maxn )) { magma_free( dAP ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magmablas_ztranspose( m, n, dA, ldda, dAT, lddat ); } if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, maxm*nb )) { magma_free( dAP ); if ( ! (m == n)) magma_free( dAT ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } for( i=0; i < s; i++ ) { // download i-th panel cols = maxm - i*nb; //magmablas_ztranspose( nb, cols, dAT(i,i), lddat, dAP, cols ); magmablas_ztranspose( nb, m-i*nb, dAT(i,i), lddat, dAP, cols ); magma_zgetmatrix( m-i*nb, nb, dAP, cols, work, lddwork ); // make sure that gpu queue is empty magma_device_sync(); if ( i > 0 ) { magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n - (i+1)*nb, nb, c_one, dAT(i-1,i-1), lddat, dAT(i-1,i+1), lddat ); magma_zgemm( MagmaNoTrans, MagmaNoTrans, n-(i+1)*nb, m-i*nb, nb, c_neg_one, dAT(i-1,i+1), lddat, dAT(i, i-1), lddat, c_one, dAT(i, i+1), lddat ); } // do the cpu part rows = m - i*nb; lapackf77_zgetrf( &rows, &nb, work, &lddwork, ipiv+i*nb, &iinfo); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + i*nb; magmablas_zpermute_long2( n, dAT, lddat, ipiv, nb, i*nb ); // upload i-th panel magma_zsetmatrix( m-i*nb, nb, work, lddwork, dAP, maxm ); //magmablas_ztranspose( cols, nb, dAP, maxm, dAT(i,i), lddat ); magmablas_ztranspose( m-i*nb, nb, dAP, maxm, dAT(i,i), lddat ); // do the small non-parallel computations (next panel update) if ( s > (i+1) ) { magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb, nb, c_one, dAT(i, i ), lddat, dAT(i, i+1), lddat); magma_zgemm( MagmaNoTrans, MagmaNoTrans, nb, m-(i+1)*nb, nb, c_neg_one, dAT(i, i+1), lddat, dAT(i+1, i ), lddat, c_one, dAT(i+1, i+1), lddat ); } else { magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb, nb, c_one, dAT(i, i ), lddat, dAT(i, i+1), lddat); magma_zgemm( MagmaNoTrans, MagmaNoTrans, n-(i+1)*nb, m-(i+1)*nb, nb, c_neg_one, dAT(i, i+1), lddat, dAT(i+1, i ), lddat, c_one, dAT(i+1, i+1), lddat ); } } magma_int_t nb0 = min(m - s*nb, n - s*nb); rows = m - s*nb; cols = maxm - s*nb; magmablas_ztranspose( nb0, rows, dAT(s,s), lddat, dAP, maxm ); magma_zgetmatrix( rows, nb0, dAP, maxm, work, lddwork ); // make sure that gpu queue is empty magma_device_sync(); // do the cpu part lapackf77_zgetrf( &rows, &nb0, work, &lddwork, ipiv+s*nb, &iinfo); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + s*nb; magmablas_zpermute_long2( n, dAT, lddat, ipiv, nb0, s*nb ); // upload i-th panel magma_zsetmatrix( rows, nb0, work, lddwork, dAP, maxm ); magmablas_ztranspose( rows, nb0, dAP, maxm, dAT(s,s), lddat ); magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb-nb0, nb0, c_one, dAT(s,s), lddat, dAT(s,s)+nb0, lddat); if ( m == n ) { magmablas_ztranspose_inplace( m, dAT, lddat ); } else { magmablas_ztranspose( n, m, dAT, lddat, dA, ldda ); magma_free( dAT ); } magma_free( dAP ); magma_free_pinned( work ); } return *info; } /* magma_zgetrf2_gpu */
/** Purpose ------- ZGETRF_NOPIV_GPU computes an LU factorization of a general M-by-N matrix A without any pivoting. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA COMPLEX_16 array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,M). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_zgesv_comp ********************************************************************/ extern "C" magma_int_t magma_zgetrf_nopiv_gpu(magma_int_t m, magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info) { #define dA(i,j) (dA + (i)*nb + (j)*nb*ldda) magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t iinfo, nb; magma_int_t maxm, maxn, mindim; magma_int_t i, rows, cols, s, lddwork; magmaDoubleComplex *work; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); nb = magma_get_zgetrf_nb(m); s = mindim / nb; if (nb <= 1 || nb >= min(m,n)) { /* Use CPU code. */ magma_zmalloc_cpu( &work, m * n ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_zgetmatrix( m, n, dA, ldda, work, m ); magma_zgetrf_nopiv( m, n, work, m, info); magma_zsetmatrix( m, n, work, m, dA, ldda ); magma_free_cpu(work); } else { /* Use hybrid blocked code. */ maxm = ((m + 31)/32)*32; maxn = ((n + 31)/32)*32; lddwork = maxm; if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, maxm*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* Define user stream if current stream is NULL */ magma_queue_t stream[2]; magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); magma_queue_create( &stream[0] ); if (orig_stream == NULL) { magma_queue_create( &stream[1] ); magmablasSetKernelStream(stream[1]); } else { stream[1] = orig_stream; } for( i=0; i < s; i++ ) { // download i-th panel cols = maxm - i*nb; magma_queue_sync( stream[1] ); magma_zgetmatrix_async( m-i*nb, nb, dA(i,i), ldda, work, lddwork, stream[0] ); if ( i > 0 ) { magma_ztrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, nb, n - (i+1)*nb, c_one, dA(i-1,i-1), ldda, dA(i-1,i+1), ldda ); magma_zgemm( MagmaNoTrans, MagmaNoTrans, m-i*nb, n-(i+1)*nb, nb, c_neg_one, dA(i, i-1), ldda, dA(i-1,i+1), ldda, c_one, dA(i, i+1), ldda ); } // do the cpu part rows = m - i*nb; magma_queue_sync( stream[0] ); magma_zgetrf_nopiv( rows, nb, work, lddwork, &iinfo ); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + i*nb; // upload i-th panel magma_zsetmatrix_async( m-i*nb, nb, work, lddwork, dA(i, i), ldda, stream[0] ); magma_queue_sync( stream[0] ); // do the small non-parallel computations if ( s > (i+1) ) { magma_ztrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, nb, nb, c_one, dA(i, i ), ldda, dA(i, i+1), ldda); magma_zgemm( MagmaNoTrans, MagmaNoTrans, m-(i+1)*nb, nb, nb, c_neg_one, dA(i+1, i ), ldda, dA(i, i+1), ldda, c_one, dA(i+1, i+1), ldda ); } else { magma_ztrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, nb, n-s*nb, c_one, dA(i, i ), ldda, dA(i, i+1), ldda); magma_zgemm( MagmaNoTrans, MagmaNoTrans, m-(i+1)*nb, n-(i+1)*nb, nb, c_neg_one, dA(i+1, i ), ldda, dA(i, i+1), ldda, c_one, dA(i+1, i+1), ldda ); } } magma_int_t nb0 = min(m - s*nb, n - s*nb); rows = m - s*nb; cols = maxm - s*nb; magma_zgetmatrix( rows, nb0, dA(s,s), ldda, work, lddwork ); // make sure that gpu queue is empty magma_device_sync(); // do the cpu part magma_zgetrf_nopiv( rows, nb0, work, lddwork, &iinfo ); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + s*nb; // upload i-th panel magma_zsetmatrix( rows, nb0, work, lddwork, dA(s,s), ldda ); magma_ztrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, nb0, n-s*nb-nb0, c_one, dA(s,s), ldda, dA(s,s)+nb0, ldda); magma_free_pinned( work ); magma_queue_destroy( stream[0] ); if (orig_stream == NULL) { magma_queue_destroy( stream[1] ); } magmablasSetKernelStream( orig_stream ); } return *info; } /* magma_zgetrf_nopiv_gpu */
extern "C" magma_int_t magma_zgetrf_mgpu( magma_int_t ngpu, magma_int_t m, magma_int_t n, magmaDoubleComplex_ptr *d_lA, size_t dlA_offset, magma_int_t ldda, magma_int_t *ipiv, magma_queue_t *queues, magma_int_t *info) { /* -- clMAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments ========= NUM_GPUS (input) INTEGER The number of GPUS to be used for the factorization. M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. LDDA (input) INTEGER The leading dimension of the array A. LDDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t nb, n_local[MagmaMaxGPUs]; magma_int_t maxm, mindim; magma_int_t d, d2, lddat, ldwork; magmaDoubleComplex_ptr d_lAT[MagmaMaxGPUs]; magmaDoubleComplex_ptr d_panel[MagmaMaxGPUs]; magmaDoubleComplex *work; /* Check arguments */ *info = 0; if (m < 0) *info = -2; else if (n < 0) *info = -3; else if (ldda < max(1,m)) *info = -5; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); nb = magma_get_zgetrf_nb(m); if (nb <= 1 || nb >= n) { /* Use CPU code. */ magma_zmalloc_cpu( &work, m * n ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_zgetmatrix( m, n, d_lA[0], 0, ldda, work, m, queues[0] ); lapackf77_zgetrf(&m, &n, work, &m, ipiv, info); magma_zsetmatrix( m, n, work, m, d_lA[0], 0, ldda, queues[0] ); magma_free_cpu(work); } else { /* Use hybrid blocked code. */ maxm = ((m + 31)/32)*32; if ( ngpu > ceil((double)n/nb) ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); *info = -1; return *info; } /* allocate workspace for each GPU */ lddat = (n+nb-1)/nb; /* number of block columns */ lddat = (lddat+ngpu-1)/ngpu; /* number of block columns per GPU */ lddat = nb*lddat; /* number of columns per GPU */ lddat = ((lddat+31)/32)*32; /* make it a multiple of 32 */ for( d=0; d < ngpu; d++ ) { /* local-n and local-ld */ n_local[d] = ((n/nb)/ngpu)*nb; if (d < (n/nb)%ngpu) n_local[d] += nb; else if (d == (n/nb)%ngpu) n_local[d] += n%nb; /* workspaces */ if (MAGMA_SUCCESS != magma_zmalloc( &d_panel[d], 3*nb*maxm )) { for( d2=0; d2 < d; d2++ ) { magma_free( d_panel[d2] ); magma_free( d_lAT[d2] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* local-matrix storage */ if (MAGMA_SUCCESS != magma_zmalloc( &d_lAT[d], lddat*maxm )) { for( d2=0; d2 <= d; d2++ ) { magma_free( d_panel[d2] ); } for( d2=0; d2 < d; d2++ ) { magma_free( d_lAT[d2] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magmablas_ztranspose( m, n_local[d], d_lA[d], 0, ldda, d_lAT[d], 0, lddat, queues[2*d+1] ); } for( d=0; d < ngpu; d++ ) { magma_queue_sync(queues[2*d+1]); } /* cpu workspace */ ldwork = maxm; if (MAGMA_SUCCESS != magma_zmalloc_cpu( &work, ldwork*nb*ngpu )) { for( d=0; d < ngpu; d++ ) { magma_free( d_panel[d] ); magma_free( d_lAT[d] ); } *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* calling multi-gpu interface with allocated workspaces and queues */ magma_zgetrf2_mgpu(ngpu, m, n, nb, 0, d_lAT, 0, lddat, ipiv, d_panel, 0, work, maxm, queues, info); /* clean up */ for( d=0; d < ngpu; d++ ) { /* save on output */ magmablas_ztranspose( n_local[d], m, d_lAT[d], 0, lddat, d_lA[d], 0, ldda, queues[2*d+1] ); magma_queue_sync(queues[2*d+1]); magma_free( d_lAT[d] ); magma_free( d_panel[d] ); } /* end of for d=1,..,ngpu */ magma_free_cpu( work ); } return *info; }
extern "C" magma_int_t magma_zgetrf( magma_int_t m, magma_int_t n, magmaDoubleComplex *A, magma_int_t lda, magma_int_t *ipiv, magma_queue_t queue[2], magma_int_t *info) { /* -- clMAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. If the current stream is NULL, this version replaces it with user defined stream to overlap computation with communication. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX_16 array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ #define dAT(i,j) dAT, dAT_offset + ((i)*nb*lddat + (j)*nb) magmaDoubleComplex *work; magmaDoubleComplex_ptr dAT, dA, dwork, dAP; size_t dA_offset, dAT_offset; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t iinfo, nb; *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; nb = magma_get_zgetrf_nb(m); if ( (nb <= 1) || (nb >= min(m,n)) ) { /* Use CPU code. */ lapackf77_zgetrf(&m, &n, A, &lda, ipiv, info); } else { /* Use hybrid blocked code. */ magma_int_t maxm, maxn, ldda, maxdim, lddat; magma_int_t i, j, rows, cols, s = min(m, n)/nb; maxm = ((m + 31)/32)*32; maxn = ((n + 31)/32)*32; lddat = maxn; ldda = maxm; maxdim = max(maxm, maxn); /* set number of GPUs */ magma_int_t num_gpus = magma_num_gpus(); if ( num_gpus > 1 ) { /* call multi-GPU non-GPU-resident interface */ printf("multiple-GPU verison not implemented\n"); return MAGMA_ERR_NOT_IMPLEMENTED; // magma_zgetrf_m(num_gpus, m, n, A, lda, ipiv, info); // return *info; } /* explicitly checking the memory requirement */ magma_int_t totalMem = magma_queue_meminfo( queue[0] ); totalMem /= sizeof(magmaDoubleComplex); int h = 1+(2+num_gpus), num_gpus2 = num_gpus; int NB = (magma_int_t)(0.8*totalMem/maxm-h*nb); const char* ngr_nb_char = getenv("MAGMA_NGR_NB"); if( ngr_nb_char != NULL ) NB = max( nb, min( NB, atoi(ngr_nb_char) ) ); if( num_gpus > ceil((double)NB/nb) ) { num_gpus2 = (int)ceil((double)NB/nb); h = 1+(2+num_gpus2); NB = (magma_int_t)(0.8*totalMem/maxm-h*nb); } if( num_gpus2*NB < n ) { /* require too much memory, so call non-GPU-resident version */ printf("non-GPU-resident version not implemented\n"); return MAGMA_ERR_NOT_IMPLEMENTED; //magma_zgetrf_m(num_gpus, m, n, A, lda, ipiv, info); //return *info; } work = A; if (maxdim*maxdim < 2*maxm*maxn) { // if close to square, allocate square matrix and transpose in-place if (MAGMA_SUCCESS != magma_zmalloc( &dwork, (nb*maxm + maxdim*maxdim) ) ) { /* alloc failed so call non-GPU-resident version */ printf("non-GPU-resident version not implemented\n"); return MAGMA_ERR_NOT_IMPLEMENTED; //magma_zgetrf_m(num_gpus, m, n, A, lda, ipiv, info); //return *info; } dAP = dwork; dA = dwork; dA_offset = nb*maxm; ldda = lddat = maxdim; magma_zsetmatrix( m, n, A, lda, dA, dA_offset, ldda, queue[0] ); dAT = dA; dAT_offset = dA_offset; magmablas_ztranspose_inplace( m, dAT, dAT_offset, ldda, queue[0] ); } else { // if very rectangular, allocate dA and dAT and transpose out-of-place if (MAGMA_SUCCESS != magma_zmalloc( &dwork, (nb + maxn)*maxm )) { /* alloc failed so call non-GPU-resident version */ printf("non-GPU-resident version not implemented\n"); return MAGMA_ERR_NOT_IMPLEMENTED; //magma_zgetrf_m(num_gpus, m, n, A, lda, ipiv, info); //return *info; } dAP = dwork; dA = dwork; dA_offset = nb*maxm; magma_zsetmatrix( m, n, A, lda, dA, dA_offset, ldda, queue[0] ); if (MAGMA_SUCCESS != magma_zmalloc( &dAT, maxm*maxn )) { /* alloc failed so call non-GPU-resident version */ magma_free( dwork ); printf("non-GPU-resident version not implemented\n"); return MAGMA_ERR_NOT_IMPLEMENTED; //magma_zgetrf_m(num_gpus, m, n, A, lda, ipiv, info); //return *info; } dAT_offset = 0; magmablas_ztranspose( m, n, dA, dA_offset, ldda, dAT, dAT_offset, lddat, queue[0] ); } lapackf77_zgetrf( &m, &nb, work, &lda, ipiv, &iinfo); for( j = 0; j < s; j++ ) { // download j-th panel cols = maxm - j*nb; if (j>0){ // download j-th panel magmablas_ztranspose( nb, cols, dAT(j,j), lddat, dAP, 0, cols, queue[0] ); magma_queue_sync(queue[0]); magma_zgetmatrix_async( m-j*nb, nb, dAP, 0, cols, work, lda, queue[1], NULL); magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n - (j+1)*nb, nb, c_one, dAT(j-1,j-1), lddat, dAT(j-1,j+1), lddat, queue[0] ); magma_zgemm( MagmaNoTrans, MagmaNoTrans, n-(j+1)*nb, m-j*nb, nb, c_neg_one, dAT(j-1,j+1), lddat, dAT(j, j-1), lddat, c_one, dAT(j, j+1), lddat, queue[0] ); // do the cpu part rows = m - j*nb; magma_queue_sync( queue[1] ); lapackf77_zgetrf( &rows, &nb, work, &lda, ipiv+j*nb, &iinfo); } if (*info == 0 && iinfo > 0) *info = iinfo + j*nb; for( i=j*nb; i < j*nb + nb; ++i ) { ipiv[i] += j*nb; } magmablas_zlaswp( n, dAT, dAT_offset, lddat, j*nb + 1, j*nb + nb, ipiv, 1, queue[0] ); // upload j-th panel magma_zsetmatrix_async( m-j*nb, nb, work, lda, dAP, 0, maxm, queue[1], NULL); magma_queue_sync( queue[1] ); magmablas_ztranspose( cols, nb, dAP, 0, maxm, dAT(j,j), lddat, queue[0] ); // do the small non-parallel computations if (s > (j+1)){ magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb, nb, c_one, dAT(j, j ), lddat, dAT(j, j+1), lddat, queue[0]); magma_zgemm( MagmaNoTrans, MagmaNoTrans, nb, m-(j+1)*nb, nb, c_neg_one, dAT(j, j+1), lddat, dAT(j+1, j ), lddat, c_one, dAT(j+1, j+1), lddat, queue[0] ); } else{ magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb, nb, c_one, dAT(j, j ), lddat, dAT(j, j+1), lddat, queue[0] ); magma_zgemm( MagmaNoTrans, MagmaNoTrans, n-(j+1)*nb, m-(j+1)*nb, nb, c_neg_one, dAT(j, j+1), lddat, dAT(j+1, j ), lddat, c_one, dAT(j+1, j+1), lddat, queue[0] ); } } magma_int_t nb0 = min(m - s*nb, n - s*nb); if ( nb0 > 0 ) { rows = m - s*nb; cols = maxm - s*nb; magmablas_ztranspose( nb0, rows, dAT(s,s), lddat, dAP, 0, maxm, queue[0]); magma_queue_sync(queue[0]); magma_zgetmatrix_async( rows, nb0, dAP, 0, maxm, work, lda, queue[1], NULL ); magma_queue_sync(queue[1]); // do the cpu part lapackf77_zgetrf( &rows, &nb0, work, &lda, ipiv+s*nb, &iinfo); if (*info == 0 && iinfo > 0) *info = iinfo + s*nb; for( i=s*nb; i < s*nb + nb0; ++i ) { ipiv[i] += s*nb; } magmablas_zlaswp( n, dAT, dAT_offset, lddat, s*nb + 1, s*nb + nb0, ipiv, 1, queue[0] ); magma_zsetmatrix_async( rows, nb0, work, lda, dAP, 0, maxm, queue[1], NULL ); magma_queue_sync(queue[1]); magmablas_ztranspose( rows, nb0, dAP, 0, maxm, dAT(s,s), lddat, queue[0]); magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb-nb0, nb0, c_one, dAT(s, s), lddat, dAT(s, s)+nb0, lddat, queue[0] ); } if (maxdim*maxdim < 2*maxm*maxn) { magmablas_ztranspose_inplace( m, dAT, dAT_offset, lddat, queue[0] ); magma_zgetmatrix( m, n, dA, dA_offset, ldda, A, lda, queue[0] ); } else { magmablas_ztranspose( n, m, dAT, dAT_offset, lddat, dA, dA_offset, ldda, queue[0] ); magma_zgetmatrix( m, n, dA, dA_offset, ldda, A, lda, queue[0] ); magma_queue_sync(queue[0]); magma_free( dAT ); } magma_queue_sync(queue[0]); magma_free( dwork ); } return *info; } /* magma_zgetrf */
/* //////////////////////////////////////////////////////////////////////////// -- Testing zswap, zswapblk, zlaswp, zlaswpx */ int main( int argc, char** argv) { TESTING_INIT(); // OpenCL use: cl_mem , offset (two arguments); // else use: pointer + offset (one argument). #ifdef HAVE_clBLAS #define d_A1(i_, j_) d_A1 , (i_) + (j_)*ldda #define d_A2(i_, j_) d_A2 , (i_) + (j_)*ldda #define d_ipiv(i_) d_ipiv , (i_) #else #define d_A1(i_, j_) (d_A1 + (i_) + (j_)*ldda) #define d_A2(i_, j_) (d_A2 + (i_) + (j_)*ldda) #define d_ipiv(i_) (d_ipiv + (i_)) #endif #define h_A1(i_, j_) (h_A1 + (i_) + (j_)*lda) #define h_A2(i_, j_) (h_A2 + (i_) + (j_)*lda) magmaDoubleComplex *h_A1, *h_A2; magmaDoubleComplex *h_R1, *h_R2; magmaDoubleComplex_ptr d_A1, d_A2; // row-major and column-major performance real_Double_t row_perf0 = MAGMA_D_NAN, col_perf0 = MAGMA_D_NAN; real_Double_t row_perf1 = MAGMA_D_NAN, col_perf1 = MAGMA_D_NAN; real_Double_t row_perf2 = MAGMA_D_NAN, col_perf2 = MAGMA_D_NAN; real_Double_t row_perf4 = MAGMA_D_NAN; real_Double_t row_perf5 = MAGMA_D_NAN, col_perf5 = MAGMA_D_NAN; real_Double_t row_perf6 = MAGMA_D_NAN, col_perf6 = MAGMA_D_NAN; real_Double_t row_perf7 = MAGMA_D_NAN; real_Double_t cpu_perf = MAGMA_D_NAN; real_Double_t time, gbytes; magma_int_t N, lda, ldda, nb, j; magma_int_t ione = 1; magma_int_t *ipiv, *ipiv2; magmaInt_ptr d_ipiv; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); printf("%% %8s zswap zswap zswapblk zlaswp zlaswp2 zlaswpx zcopymatrix CPU (all in )\n", g_platform_str ); printf("%% N nb row-maj/col-maj row-maj/col-maj row-maj/col-maj row-maj row-maj row-maj/col-maj row-blk/col-blk zlaswp (GByte/s)\n"); printf("%%========================================================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { // For an N x N matrix, swap nb rows or nb columns using various methods. // Each test is assigned one bit in the 'check' bitmask; bit=1 indicates failure. // The variable 'shift' keeps track of which bit is for current test magma_int_t shift = 1; magma_int_t check = 0; N = opts.nsize[itest]; lda = N; ldda = magma_roundup( N, opts.align ); // multiple of 32 by default nb = (opts.nb > 0 ? opts.nb : magma_get_zgetrf_nb( N, N )); nb = min( N, nb ); // each swap does 2N loads and 2N stores, for nb swaps gbytes = sizeof(magmaDoubleComplex) * 4.*N*nb / 1e9; TESTING_MALLOC_PIN( h_A1, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_A2, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_R1, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_R2, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, nb ); TESTING_MALLOC_CPU( ipiv2, magma_int_t, nb ); TESTING_MALLOC_DEV( d_ipiv, magma_int_t, nb ); TESTING_MALLOC_DEV( d_A1, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( d_A2, magmaDoubleComplex, ldda*N ); // getrf always makes ipiv[j] >= j+1, where ipiv is one based and j is zero based // some implementations (e.g., MacOS dlaswp) assume this for( j=0; j < nb; j++ ) { ipiv[j] = (rand() % (N-j)) + j + 1; assert( ipiv[j] >= j+1 ); assert( ipiv[j] <= N ); } /* ===================================================================== * cublas / clBLAS / Xeon Phi zswap, row-by-row (2 matrices) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1(0,0), ldda ); magma_zsetmatrix( N, N, h_A2, lda, d_A2(0,0), ldda ); magmablasSetKernelStream( opts.queue ); // opts.handle also uses opts.queue time = magma_sync_wtime( opts.queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { #ifdef HAVE_CUBLAS cublasZswap( opts.handle, N, d_A1(0,j), 1, d_A2(0,ipiv[j]-1), 1 ); #else magma_zswap( N, d_A1(0,j), 1, d_A2(0,ipiv[j]-1), 1, opts.queue ); #endif } } time = magma_sync_wtime( opts.queue ) - time; row_perf0 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1(0,j), &ione, h_A2(0,ipiv[j]-1), &ione); } } magma_zgetmatrix( N, N, d_A1(0,0), ldda, h_R1, lda ); magma_zgetmatrix( N, N, d_A2(0,0), ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* Column Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1(0,0), ldda ); magma_zsetmatrix( N, N, h_A2, lda, d_A2(0,0), ldda ); time = magma_sync_wtime( opts.queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { #ifdef HAVE_CUBLAS cublasZswap( opts.handle, N, d_A1(j,0), ldda, d_A2(ipiv[j]-1,0), ldda ); #else magma_zswap( N, d_A1(j,0), ldda, d_A2(ipiv[j]-1,0), ldda, opts.queue ); #endif } } time = magma_sync_wtime( opts.queue ) - time; col_perf0 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda); } } magma_zgetmatrix( N, N, d_A1(0,0), ldda, h_R1, lda ); magma_zgetmatrix( N, N, d_A2(0,0), ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* ===================================================================== * zswap, row-by-row (2 matrices) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1(0,0), ldda ); magma_zsetmatrix( N, N, h_A2, lda, d_A2(0,0), ldda ); time = magma_sync_wtime( opts.queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { magmablas_zswap( N, d_A1(0,j), 1, d_A2(0,ipiv[j]-1), 1); } } time = magma_sync_wtime( opts.queue ) - time; row_perf1 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1(0,j), &ione, h_A2(0,ipiv[j]-1), &ione); } } magma_zgetmatrix( N, N, d_A1(0,0), ldda, h_R1, lda ); magma_zgetmatrix( N, N, d_A2(0,0), ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* Column Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1(0,0), ldda ); magma_zsetmatrix( N, N, h_A2, lda, d_A2(0,0), ldda ); time = magma_sync_wtime( opts.queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { magmablas_zswap( N, d_A1(j,0), ldda, d_A2(ipiv[j]-1,0), ldda ); } } time = magma_sync_wtime( opts.queue ) - time; col_perf1 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda); } } magma_zgetmatrix( N, N, d_A1(0,0), ldda, h_R1, lda ); magma_zgetmatrix( N, N, d_A2(0,0), ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* ===================================================================== * zswapblk, blocked version (2 matrices) */ #ifdef HAVE_CUBLAS /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1(0,0), ldda ); magma_zsetmatrix( N, N, h_A2, lda, d_A2(0,0), ldda ); time = magma_sync_wtime( opts.queue ); magmablas_zswapblk( MagmaRowMajor, N, d_A1(0,0), ldda, d_A2(0,0), ldda, 1, nb, ipiv, 1, 0); time = magma_sync_wtime( opts.queue ) - time; row_perf2 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1(0,j), &ione, h_A2(0,ipiv[j]-1), &ione); } } magma_zgetmatrix( N, N, d_A1(0,0), ldda, h_R1, lda ); magma_zgetmatrix( N, N, d_A2(0,0), ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* Column Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1(0,0), ldda ); magma_zsetmatrix( N, N, h_A2, lda, d_A2(0,0), ldda ); time = magma_sync_wtime( opts.queue ); magmablas_zswapblk( MagmaColMajor, N, d_A1(0,0), ldda, d_A2(0,0), ldda, 1, nb, ipiv, 1, 0); time = magma_sync_wtime( opts.queue ) - time; col_perf2 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1(j,0), &lda, h_A2(ipiv[j]-1,0), &lda); } } magma_zgetmatrix( N, N, d_A1(0,0), ldda, h_R1, lda ); magma_zgetmatrix( N, N, d_A2(0,0), ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; #endif /* ===================================================================== * LAPACK-style zlaswp (1 matrix) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1(0,0), ldda ); time = magma_sync_wtime( opts.queue ); magmablas_zlaswp( N, d_A1(0,0), ldda, 1, nb, ipiv, 1); time = magma_sync_wtime( opts.queue ) - time; row_perf4 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1(0,j), &ione, h_A1(0,ipiv[j]-1), &ione); } } magma_zgetmatrix( N, N, d_A1(0,0), ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* ===================================================================== * LAPACK-style zlaswp (1 matrix) - d_ipiv on GPU */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1(0,0), ldda ); time = magma_sync_wtime( opts.queue ); magma_setvector( nb, sizeof(magma_int_t), ipiv, 1, d_ipiv(0), 1 ); magmablas_zlaswp2( N, d_A1(0,0), ldda, 1, nb, d_ipiv(0), 1 ); time = magma_sync_wtime( opts.queue ) - time; row_perf7 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1(0,j), &ione, h_A1(0,ipiv[j]-1), &ione); } } magma_zgetmatrix( N, N, d_A1(0,0), ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* ===================================================================== * LAPACK-style zlaswpx (extended for row- and col-major) (1 matrix) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1(0,0), ldda ); time = magma_sync_wtime( opts.queue ); magmablas_zlaswpx( N, d_A1(0,0), ldda, 1, 1, nb, ipiv, 1); time = magma_sync_wtime( opts.queue ) - time; row_perf5 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1(0,j), &ione, h_A1(0,ipiv[j]-1), &ione); } } magma_zgetmatrix( N, N, d_A1(0,0), ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* Col Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1(0,0), ldda ); time = magma_sync_wtime( opts.queue ); magmablas_zlaswpx( N, d_A1(0,0), 1, ldda, 1, nb, ipiv, 1); time = magma_sync_wtime( opts.queue ) - time; col_perf5 = gbytes / time; /* LAPACK swap on CPU for comparison */ time = magma_wtime(); lapackf77_zlaswp( &N, h_A1, &lda, &ione, &nb, ipiv, &ione); time = magma_wtime() - time; cpu_perf = gbytes / time; magma_zgetmatrix( N, N, d_A1(0,0), ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* ===================================================================== * Copy matrix. */ time = magma_sync_wtime( opts.queue ); magma_zcopymatrix( N, nb, d_A1(0,0), ldda, d_A2(0,0), ldda ); time = magma_sync_wtime( opts.queue ) - time; // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap col_perf6 = 0.5 * gbytes / time; time = magma_sync_wtime( opts.queue ); magma_zcopymatrix( nb, N, d_A1(0,0), ldda, d_A2(0,0), ldda ); time = magma_sync_wtime( opts.queue ) - time; // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap row_perf6 = 0.5 * gbytes / time; printf("%5d %3d %6.2f%c/ %6.2f%c %6.2f%c/ %6.2f%c %6.2f%c/ %6.2f%c %6.2f%c %6.2f%c %6.2f%c/ %6.2f%c %6.2f / %6.2f %6.2f %10s\n", (int) N, (int) nb, row_perf0, ((check & 0x001) != 0 ? '*' : ' '), col_perf0, ((check & 0x002) != 0 ? '*' : ' '), row_perf1, ((check & 0x004) != 0 ? '*' : ' '), col_perf1, ((check & 0x008) != 0 ? '*' : ' '), row_perf2, ((check & 0x010) != 0 ? '*' : ' '), col_perf2, ((check & 0x020) != 0 ? '*' : ' '), row_perf4, ((check & 0x040) != 0 ? '*' : ' '), row_perf7, ((check & 0x080) != 0 ? '*' : ' '), row_perf5, ((check & 0x100) != 0 ? '*' : ' '), col_perf5, ((check & 0x200) != 0 ? '*' : ' '), row_perf6, col_perf6, cpu_perf, (check == 0 ? "ok" : "* failed") ); status += ! (check == 0); TESTING_FREE_PIN( h_A1 ); TESTING_FREE_PIN( h_A2 ); TESTING_FREE_PIN( h_R1 ); TESTING_FREE_PIN( h_R2 ); TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( ipiv2 ); TESTING_FREE_DEV( d_ipiv ); TESTING_FREE_DEV( d_A1 ); TESTING_FREE_DEV( d_A2 ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgetrf_mgpu */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; double error; magmaDoubleComplex *h_A, *h_P; magmaDoubleComplex_ptr d_lA[ MagmaMaxSubs * MagmaMaxGPUs ]; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, info, min_mn; magma_int_t dev, j, k, ngpu, nsub, n_local, nb, nk, ldn_local, maxm; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); /* Initialize queues */ magma_queue_t queues[MagmaMaxGPUs * 2]; magma_device_t devices[MagmaMaxGPUs]; magma_int_t num = 0; magma_int_t err; err = magma_getdevices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_getdevices failed: %d\n", (int) err ); exit(-1); } for( dev=0; dev < opts.ngpu; dev++ ) { err = magma_queue_create( devices[dev], &queues[2*dev] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d (device %d)\n", (int) err, dev ); exit(-1); } err = magma_queue_create( devices[dev], &queues[2*dev+1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d (device %d)\n", (int) err, dev ); exit(-1); } } printf("trans %s, ngpu %d, nsub %d\n", lapack_trans_const(opts.transA), (int) opts.ngpu, (int) opts.nsub ); if ( opts.check == 2 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |PA-LU|/(N*|A|)\n"); } printf("=========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); maxm = 32*((M+31)/32); lda = M; n2 = lda*N; nb = magma_get_zgetrf_nb(M); gflops = FLOPS_ZGETRF( M, N ) / 1e9; // nsubs * ngpu must be at least the number of blocks ngpu = opts.ngpu; nsub = opts.nsub; if ( nsub*ngpu > N/nb ) { nsub = 1; ngpu = 1; printf( " * too many GPUs for the matrix size, using %d GPUs and %d submatrices\n", (int) ngpu, (int) nsub ); } /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_P, magmaDoubleComplex, lda*nb ); /* Allocate device memory */ if ( opts.transA == MagmaNoTrans ) { ldda = N/nb; /* number of block columns */ ldda = ldda/(ngpu*nsub); /* number of block columns per GPU */ ldda = nb*ldda; /* number of columns per GPU */ if ( ldda * ngpu*nsub < N ) { /* left over */ if ( N-ldda*ngpu*nsub >= nb ) { ldda += nb; } else { ldda += (N-ldda*ngpu*nsub)%nb; } } ldda = ((ldda+31)/32)*32; /* make it a multiple of 32 */ for( j=0; j < nsub * ngpu; j++ ) { TESTING_MALLOC_DEV( d_lA[j], magmaDoubleComplex, ldda*maxm ); } } else { ldda = ((M+31)/32)*32; for( j=0; j < nsub * ngpu; j++ ) { n_local = ((N/nb)/(nsub*ngpu))*nb; if ( j < (N/nb)%(nsub*ngpu) ) { n_local += nb; } else if ( j == (N/nb)%(nsub*ngpu) ) { n_local += N%nb; } TESTING_MALLOC_DEV( d_lA[j], magmaDoubleComplex, ldda*n_local ); } } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { init_matrix( M, N, h_A, lda ); cpu_time = magma_wtime(); lapackf77_zgetrf( &M, &N, h_A, &lda, ipiv, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if ( info != 0 ) printf("lapackf77_zgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( M, N, h_A, lda ); if ( opts.transA == MagmaNoTrans ) { for( j=0; j < N; j += nb ) { k = (j/nb)%(nsub*ngpu); nk = min(nb, N-j); /* transpose on CPU, then copy to GPU */ int ii,jj; for( ii=0; ii < M; ii++ ) { for( jj=0; jj < nk; jj++ ) { h_P[jj+ii*nk] = h_A[j*lda + ii+jj*lda]; } } magma_zsetmatrix( nk, M, h_P, nk, d_lA[k], j/(nb*nsub*ngpu)*nb, ldda, queues[2*(k%ngpu)] ); } } else { ldda = ((M+31)/32)*32; for( j=0; j < N; j += nb ) { k = (j/nb)%(nsub*ngpu); nk = min(nb, N-j); magma_zsetmatrix( M, nk, h_A + j*lda, lda, d_lA[k], j/(nb*nsub*ngpu)*nb*ldda, ldda, queues[2*(k%ngpu)] ); } } gpu_time = magma_wtime(); magma_zgetrf_msub( opts.transA, nsub, ngpu, M, N, d_lA, 0, ldda, ipiv, queues, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zgetrf_mgpu returned error %d: %s.\n", (int) info, magma_strerror( info )); /* get the matrix from GPUs */ if ( opts.transA == MagmaNoTrans ) { for (j=0; j < N; j+=nb) { k = (j/nb)%(nsub*ngpu); nk = min(nb, N-j); /* copy to CPU and then transpose */ magma_zgetmatrix( nk, M, d_lA[k], j/(nb*nsub*ngpu)*nb, ldda, h_P, nk, queues[2*(k%ngpu)] ); int ii, jj; for( ii=0; ii < M; ii++ ) { for( jj=0; jj < nk; jj++ ) { h_A[j*lda + ii+jj*lda] = h_P[jj+ii*nk]; } } } } else { for (j=0; j < N; j+=nb) { k = (j/nb)%(nsub*ngpu); nk = min(nb, N-j); magma_zgetmatrix( M, nk, d_lA[k], j/(nb*nsub*ngpu)*nb*ldda, ldda, h_A + j*lda, lda, queues[2*(k%ngpu)] ); } } /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 2 ) { error = get_residual( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else if ( opts.check ) { error = get_LU_error( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { printf(" --- \n"); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_P ); for( dev=0; dev < ngpu; dev++ ) { for( k=0; k < nsub; k++ ) { TESTING_FREE_DEV( d_lA[dev*nsub + k] ); } } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } /* Free queues */ for( dev=0; dev < opts.ngpu; dev++ ) { magma_queue_destroy( queues[2*dev] ); magma_queue_destroy( queues[2*dev+1] ); } TESTING_FINALIZE(); return status; }
/** Purpose ------- ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] d_lA COMPLEX_16 array of pointers on the GPU, dimension (ngpu). On entry, the M-by-N matrix A distributed over GPUs (d_lA[d] points to the local matrix on d-th GPU). It uses 1D block column cyclic format with the block size of nb, and each local matrix is stored by column. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. @param[in] ldda INTEGER The leading dimension of the array d_lA. LDDA >= max(1,M). @param[out] ipiv INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_zgesv_comp ********************************************************************/ extern "C" magma_int_t magma_zgetrf_mgpu( magma_int_t ngpu, magma_int_t m, magma_int_t n, magmaDoubleComplex_ptr d_lA[], magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info) { magma_int_t nb, n_local[MagmaMaxGPUs]; magma_int_t maxm; magma_int_t i, j, d, lddat, lddwork; magmaDoubleComplex *d_lAT[MagmaMaxGPUs]; magmaDoubleComplex *d_panel[MagmaMaxGPUs], *work; magma_queue_t streaml[MagmaMaxGPUs][2]; /* Check arguments */ *info = 0; if (m < 0) *info = -2; else if (n < 0) *info = -3; else if (ldda < max(1,m)) *info = -5; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ nb = magma_get_zgetrf_nb(m); if (nb <= 1 || nb >= n) { /* Use CPU code. */ magma_zmalloc_cpu( &work, m * n ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_zgetmatrix( m, n, d_lA[0], ldda, work, m ); lapackf77_zgetrf(&m, &n, work, &m, ipiv, info); magma_zsetmatrix( m, n, work, m, d_lA[0], ldda ); magma_free_cpu(work); } else { /* Use hybrid blocked code. */ magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); maxm = ((m + 31)/32)*32; if ( ngpu > ceil((double)n/nb) ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); *info = -1; return *info; } /* allocate workspace for each GPU */ lddat = ((((((n+nb-1)/nb)/ngpu)*nb)+31)/32)*32; lddat = (n+nb-1)/nb; /* number of block columns */ lddat = (lddat+ngpu-1)/ngpu; /* number of block columns per GPU */ lddat = nb*lddat; /* number of columns per GPU */ lddat = ((lddat+31)/32)*32; /* make it a multiple of 32 */ for (i=0; i < ngpu; i++) { magma_setdevice(i); /* local-n and local-ld */ n_local[i] = ((n/nb)/ngpu)*nb; if (i < (n/nb)%ngpu) n_local[i] += nb; else if (i == (n/nb)%ngpu) n_local[i] += n%nb; /* workspaces */ if (MAGMA_SUCCESS != magma_zmalloc( &d_panel[i], (3+ngpu)*nb*maxm )) { for( j=0; j <= i; j++ ) { magma_setdevice(j); } for( j=0; j < i; j++ ) { magma_setdevice(j); magma_free( d_panel[j] ); magma_free( d_lAT[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* local-matrix storage */ if (MAGMA_SUCCESS != magma_zmalloc( &d_lAT[i], lddat*maxm )) { for( j=0; j <= i; j++ ) { magma_setdevice(j); magma_free( d_panel[j] ); } for( j=0; j < i; j++ ) { magma_setdevice(j); magma_free( d_lAT[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* create the streams */ magma_queue_create( &streaml[i][0] ); magma_queue_create( &streaml[i][1] ); magmablasSetKernelStream(streaml[i][1]); magmablas_ztranspose( m, n_local[i], d_lA[i], ldda, d_lAT[i], lddat ); } for (i=0; i < ngpu; i++) { magma_setdevice(i); magma_queue_sync(streaml[i][0]); magmablasSetKernelStream(NULL); } magma_setdevice(0); /* cpu workspace */ lddwork = maxm; if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, lddwork*nb*ngpu )) { for (i=0; i < ngpu; i++ ) { magma_setdevice(i); magma_free( d_panel[i] ); magma_free( d_lAT[i] ); } *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* calling multi-gpu interface with allocated workspaces and streams */ magma_zgetrf2_mgpu(ngpu, m, n, nb, 0, d_lAT, lddat, ipiv, d_panel, work, maxm, streaml, info); /* clean up */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); /* save on output */ magmablas_ztranspose( n_local[d], m, d_lAT[d], lddat, d_lA[d], ldda ); magma_device_sync(); magma_free( d_lAT[d] ); magma_free( d_panel[d] ); magma_queue_destroy( streaml[d][0] ); magma_queue_destroy( streaml[d][1] ); } /* end of for d=1,..,ngpu */ magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); magma_free_pinned( work ); } return *info; }
extern "C" magma_int_t magma_zgetrf_gpu(magma_int_t m, magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. If the current stream is NULL, this version replaces it with user defined stream to overlap computation with communication. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. LDDA (input) INTEGER The leading dimension of the array A. LDDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ #define dAT(i,j) (dAT + (i)*nb*lddat + (j)*nb) magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t iinfo, nb; magma_int_t maxm, maxn, mindim; magma_int_t i, rows, cols, s, lddat, lddwork; magmaDoubleComplex *dAT, *dAP, *work; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); nb = magma_get_zgetrf_nb(m); s = mindim / nb; if (nb <= 1 || nb >= min(m,n)) { /* Use CPU code. */ magma_zmalloc_cpu( &work, m * n ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_zgetmatrix( m, n, dA, ldda, work, m ); lapackf77_zgetrf(&m, &n, work, &m, ipiv, info); magma_zsetmatrix( m, n, work, m, dA, ldda ); magma_free_cpu(work); } else { /* Use hybrid blocked code. */ maxm = ((m + 31)/32)*32; maxn = ((n + 31)/32)*32; lddat = maxn; lddwork = maxm; dAT = dA; if (MAGMA_SUCCESS != magma_zmalloc( &dAP, nb*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } if ( m == n ) { lddat = ldda; magmablas_ztranspose_inplace( m, dAT, ldda ); } else { if (MAGMA_SUCCESS != magma_zmalloc( &dAT, maxm*maxn )) { magma_free( dAP ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magmablas_ztranspose2( dAT, lddat, dA, ldda, m, n ); } if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, maxm*nb )) { magma_free( dAP ); if ( ! (m == n)) magma_free( dAT ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* Define user stream if current stream is NULL */ cudaStream_t stream[2], current_stream; magmablasGetKernelStream(¤t_stream); magma_queue_create( &stream[0] ); if (current_stream == NULL) { magma_queue_create( &stream[1] ); magmablasSetKernelStream(stream[1]); } else stream[1] = current_stream; for( i=0; i<s; i++ ) { // download i-th panel cols = maxm - i*nb; //magmablas_ztranspose( dAP, cols, dAT(i,i), lddat, nb, cols ); magmablas_ztranspose2( dAP, cols, dAT(i,i), lddat, nb, m-i*nb ); // make sure that that the transpose has completed magma_queue_sync( stream[1] ); magma_zgetmatrix_async( m-i*nb, nb, dAP, cols, work, lddwork, stream[0]); if ( i>0 ){ magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n - (i+1)*nb, nb, c_one, dAT(i-1,i-1), lddat, dAT(i-1,i+1), lddat ); magma_zgemm( MagmaNoTrans, MagmaNoTrans, n-(i+1)*nb, m-i*nb, nb, c_neg_one, dAT(i-1,i+1), lddat, dAT(i, i-1), lddat, c_one, dAT(i, i+1), lddat ); } // do the cpu part rows = m - i*nb; magma_queue_sync( stream[0] ); lapackf77_zgetrf( &rows, &nb, work, &lddwork, ipiv+i*nb, &iinfo); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + i*nb; // upload i-th panel magma_zsetmatrix_async( m-i*nb, nb, work, lddwork, dAP, maxm, stream[0]); magmablas_zpermute_long2( n, dAT, lddat, ipiv, nb, i*nb ); magma_queue_sync( stream[0] ); //magmablas_ztranspose(dAT(i,i), lddat, dAP, maxm, cols, nb); magmablas_ztranspose2(dAT(i,i), lddat, dAP, maxm, m-i*nb, nb); // do the small non-parallel computations (next panel update) if ( s > (i+1) ) { magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb, nb, c_one, dAT(i, i ), lddat, dAT(i, i+1), lddat); magma_zgemm( MagmaNoTrans, MagmaNoTrans, nb, m-(i+1)*nb, nb, c_neg_one, dAT(i, i+1), lddat, dAT(i+1, i ), lddat, c_one, dAT(i+1, i+1), lddat ); } else { magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb, nb, c_one, dAT(i, i ), lddat, dAT(i, i+1), lddat); magma_zgemm( MagmaNoTrans, MagmaNoTrans, n-(i+1)*nb, m-(i+1)*nb, nb, c_neg_one, dAT(i, i+1), lddat, dAT(i+1, i ), lddat, c_one, dAT(i+1, i+1), lddat ); } } magma_int_t nb0 = min(m - s*nb, n - s*nb); rows = m - s*nb; cols = maxm - s*nb; magmablas_ztranspose2( dAP, maxm, dAT(s,s), lddat, nb0, rows); magma_zgetmatrix( rows, nb0, dAP, maxm, work, lddwork ); // do the cpu part lapackf77_zgetrf( &rows, &nb0, work, &lddwork, ipiv+s*nb, &iinfo); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + s*nb; magmablas_zpermute_long2( n, dAT, lddat, ipiv, nb0, s*nb ); // upload i-th panel magma_zsetmatrix( rows, nb0, work, lddwork, dAP, maxm ); magmablas_ztranspose2( dAT(s,s), lddat, dAP, maxm, rows, nb0); magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb-nb0, nb0, c_one, dAT(s,s), lddat, dAT(s,s)+nb0, lddat); if ( m == n ) { magmablas_ztranspose_inplace( m, dAT, lddat ); } else { magmablas_ztranspose2( dA, ldda, dAT, lddat, n, m ); magma_free( dAT ); } magma_free( dAP ); magma_free_pinned( work ); magma_queue_destroy( stream[0] ); if (current_stream == NULL) { magma_queue_destroy( stream[1] ); magmablasSetKernelStream(NULL); } } return *info; } /* End of MAGMA_ZGETRF_GPU */
extern "C" magma_int_t magma_zgetrf_mgpu(magma_int_t num_gpus, magma_int_t m, magma_int_t n, cuDoubleComplex **d_lA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments ========= NUM_GPUS (input) INTEGER The number of GPUS to be used for the factorization. M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. LDDA (input) INTEGER The leading dimension of the array A. LDDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ #define inAT(id,i,j) (d_lAT[(id)] + (i)*nb*lddat + (j)*nb) cuDoubleComplex c_one = MAGMA_Z_ONE; cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t iinfo, nb, n_local[MagmaMaxGPUs]; magma_int_t maxm, mindim; magma_int_t i, j, d, rows, cols, s, lddat, lddwork; magma_int_t id, i_local, i_local2, nb0, nb1; cuDoubleComplex *d_lAT[MagmaMaxGPUs]; cuDoubleComplex *d_panel[MagmaMaxGPUs], *work; cudaStream_t streaml[4][2]; /* Check arguments */ *info = 0; if (m < 0) *info = -2; else if (n < 0) *info = -3; else if (ldda < max(1,m)) *info = -5; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); nb = magma_get_zgetrf_nb(m); if (nb <= 1 || nb >= n) { /* Use CPU code. */ magma_zmalloc_cpu( &work, m * n ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_zgetmatrix( m, n, d_lA[0], ldda, work, m ); lapackf77_zgetrf(&m, &n, work, &m, ipiv, info); magma_zsetmatrix( m, n, work, m, d_lA[0], ldda ); magma_free_cpu(work); } else { /* Use hybrid blocked code. */ maxm = ((m + 31)/32)*32; if( num_gpus > ceil((double)n/nb) ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) num_gpus ); *info = -1; return *info; } /* allocate workspace for each GPU */ lddat = ((((((n+nb-1)/nb)/num_gpus)*nb)+31)/32)*32; lddat = (n+nb-1)/nb; /* number of block columns */ lddat = (lddat+num_gpus-1)/num_gpus; /* number of block columns per GPU */ lddat = nb*lddat; /* number of columns per GPU */ lddat = ((lddat+31)/32)*32; /* make it a multiple of 32 */ for(i=0; i<num_gpus; i++){ magma_setdevice(i); /* local-n and local-ld */ n_local[i] = ((n/nb)/num_gpus)*nb; if (i < (n/nb)%num_gpus) n_local[i] += nb; else if (i == (n/nb)%num_gpus) n_local[i] += n%nb; /* workspaces */ if (MAGMA_SUCCESS != magma_zmalloc( &d_panel[i], 3*nb*maxm )) { for( j=0; j<=i; j++ ) { magma_setdevice(j); } for( j=0; j<i; j++ ) { magma_setdevice(j); magma_free( d_panel[j] ); magma_free( d_lAT[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* local-matrix storage */ if (MAGMA_SUCCESS != magma_zmalloc( &d_lAT[i], lddat*maxm )) { for( j=0; j<=i; j++ ) { magma_setdevice(j); magma_free( d_panel[j] ); } for( j=0; j<i; j++ ) { magma_setdevice(j); magma_free( d_lAT[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* create the streams */ magma_queue_create( &streaml[i][0] ); magma_queue_create( &streaml[i][1] ); magmablasSetKernelStream(streaml[i][1]); magmablas_ztranspose2( d_lAT[i], lddat, d_lA[i], ldda, m, n_local[i] ); } for(i=0; i<num_gpus; i++){ magma_setdevice(i); cudaStreamSynchronize(streaml[i][0]); magmablasSetKernelStream(NULL); } magma_setdevice(0); /* cpu workspace */ lddwork = maxm; if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, lddwork*nb*num_gpus )) { for(i=0; i<num_gpus; i++ ) { magma_setdevice(i); magma_free( d_panel[i] ); magma_free( d_lAT[i] ); } *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* calling multi-gpu interface with allocated workspaces and streams */ //magma_zgetrf1_mgpu( num_gpus, m, n, nb, 0, d_lAT, lddat, ipiv, d_panel, work, maxm, // (cudaStream_t **)streaml, info ); magma_zgetrf2_mgpu(num_gpus, m, n, nb, 0, d_lAT, lddat, ipiv, d_panel, work, maxm, streaml, info); /* clean up */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); /* save on output */ magmablas_ztranspose2( d_lA[d], ldda, d_lAT[d], lddat, n_local[d], m ); magma_device_sync(); magma_free( d_lAT[d] ); magma_free( d_panel[d] ); magma_queue_destroy( streaml[d][0] ); magma_queue_destroy( streaml[d][1] ); magmablasSetKernelStream(NULL); } /* end of for d=1,..,num_gpus */ magma_setdevice(0); magma_free_pinned( work ); } return *info; /* End of MAGMA_ZGETRF_MGPU */ }
magma_err_t magma_zgetrf_gpu(magma_int_t m, magma_int_t n, magmaDoubleComplex_ptr dA, size_t dA_offset, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info, magma_queue_t queue ) { /* -- clMAGMA (version 1.1.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date January 2014 Purpose ======= ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. LDDA (input) INTEGER The leading dimension of the array A. LDDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value if INFO = -7, internal GPU memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ #define inAT(i,j) dAT, dAT_offset + (i)*nb*lddat + (j)*nb magmaDoubleComplex c_one = MAGMA_Z_MAKE( 1.0, 0.0 ); magmaDoubleComplex c_neg_one = MAGMA_Z_MAKE( -1.0, 0.0 ); magma_int_t iinfo, nb; magma_int_t maxm, maxn, mindim; magma_int_t i, rows, cols, s, lddat, lddwork; magmaDoubleComplex_ptr dAT, dAP; magmaDoubleComplex *work; magma_err_t err; *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (m == 0 || n == 0) return MAGMA_SUCCESS; mindim = min(m, n); nb = magma_get_zgetrf_nb(m); s = mindim / nb; if (nb <= 1 || nb >= min(m,n)) { // use CPU code err = magma_zmalloc_cpu( &work, m*n ); if ( err != MAGMA_SUCCESS ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } chk( magma_zgetmatrix( m, n, dA, dA_offset, ldda, work, 0, m, queue )); lapackf77_zgetrf(&m, &n, work, &m, ipiv, info); chk( magma_zsetmatrix( m, n, work, 0, m, dA, dA_offset, ldda, queue )); magma_free_cpu(work); } else { size_t dAT_offset; // use hybrid blocked code maxm = ((m + 31)/32)*32; maxn = ((n + 31)/32)*32; lddat = maxn; lddwork = maxm; if ( MAGMA_SUCCESS != magma_zmalloc( &dAP, nb*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } if ((m == n) && (m % 32 == 0) && (ldda%32 == 0)) { dAT = dA; dAT_offset = dA_offset; magma_ztranspose_inplace( dAT, dAT_offset, ldda, lddat, queue ); } else { dAT_offset = 0; if ( MAGMA_SUCCESS != magma_zmalloc( &dAT, maxm*maxn )) { magma_free( dAP ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_ztranspose2( dAT, dAT_offset, lddat, dA, dA_offset, ldda, m, n, queue ); } if ( MAGMA_SUCCESS != magma_zmalloc_cpu( &work, maxm*nb ) ) { magma_free( dAP ); if (! ((m == n) && (m % 32 == 0) && (ldda%32 == 0)) ) magma_free( dAT ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } for( i=0; i<s; i++ ) { // download i-th panel cols = maxm - i*nb; magma_ztranspose( dAP, 0, cols, inAT(i,i), lddat, nb, cols, queue ); magma_zgetmatrix(m-i*nb, nb, dAP, 0, cols, work, 0, lddwork, queue); if ( i>0 ){ magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n - (i+1)*nb, nb, c_one, inAT(i-1,i-1), lddat, inAT(i-1,i+1), lddat, queue ); magma_zgemm( MagmaNoTrans, MagmaNoTrans, n-(i+1)*nb, m-i*nb, nb, c_neg_one, inAT(i-1,i+1), lddat, inAT(i, i-1), lddat, c_one, inAT(i, i+1), lddat, queue ); } // do the cpu part rows = m - i*nb; lapackf77_zgetrf( &rows, &nb, work, &lddwork, ipiv+i*nb, &iinfo); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + i*nb; magma_zpermute_long2(n, dAT, dAT_offset, lddat, ipiv, nb, i*nb, queue ); // upload i-th panel magma_zsetmatrix(m-i*nb, nb, work, 0, lddwork, dAP, 0, maxm, queue); magma_ztranspose(inAT(i,i), lddat, dAP, 0, maxm, cols, nb, queue ); // do the small non-parallel computations if ( s > (i+1) ) { magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb, nb, c_one, inAT(i, i ), lddat, inAT(i, i+1), lddat, queue); magma_zgemm( MagmaNoTrans, MagmaNoTrans, nb, m-(i+1)*nb, nb, c_neg_one, inAT(i, i+1), lddat, inAT(i+1, i ), lddat, c_one, inAT(i+1, i+1), lddat, queue ); } else { magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb, nb, c_one, inAT(i, i ), lddat, inAT(i, i+1), lddat, queue); magma_zgemm( MagmaNoTrans, MagmaNoTrans, n-(i+1)*nb, m-(i+1)*nb, nb, c_neg_one, inAT(i, i+1), lddat, inAT(i+1, i ), lddat, c_one, inAT(i+1, i+1), lddat, queue ); } } magma_int_t nb0 = min(m - s*nb, n - s*nb); rows = m - s*nb; cols = maxm - s*nb; magma_ztranspose2( dAP, 0, maxm, inAT(s,s), lddat, nb0, rows, queue); magma_zgetmatrix(rows, nb0, dAP, 0, maxm, work, 0, lddwork, queue); // do the cpu part lapackf77_zgetrf( &rows, &nb0, work, &lddwork, ipiv+s*nb, &iinfo); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + s*nb; magma_zpermute_long2(n, dAT, dAT_offset, lddat, ipiv, nb0, s*nb, queue ); // upload i-th panel magma_zsetmatrix(rows, nb0, work, 0, lddwork, dAP, 0, maxm, queue); magma_ztranspose2( inAT(s,s), lddat, dAP, 0, maxm, rows, nb0, queue ); magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb-nb0, nb0, c_one, inAT(s,s), lddat, inAT(s,s)+nb0, lddat, queue); if ((m == n) && (m % 32 == 0) && (ldda%32 == 0)) { magma_ztranspose_inplace( dAT, dAT_offset, lddat, ldda, queue ); } else { magma_ztranspose2( dA, dA_offset, ldda, dAT, dAT_offset, lddat, n, m, queue ); magma_free( dAT ); } magma_free( dAP ); magma_free_cpu( work ); } return *info; /* End of MAGMA_ZGETRF_GPU */ }
/** Purpose ------- ZGETRF_m computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix may exceed the GPU memory. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Note: The factorization of big panel is done calling multiple-gpu-interface. Pivots are applied on GPU within the big panel. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A COMPLEX_16 array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] ipiv INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_zgesv_comp ********************************************************************/ extern "C" magma_int_t magma_zgetrf_m( magma_int_t ngpu, magma_int_t m, magma_int_t n, magmaDoubleComplex *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info) { #define A(i,j) (A + (j)*lda + (i)) #define dAT(d,i,j) (dAT[d] + (i)*nb*ldn_local + (j)*nb) #define dPT(d,i,j) (dPT[d] + (i)*nb*nb + (j)*nb*maxm) magma_timer_t time=0, time_total=0, time_alloc=0, time_set=0, time_get=0, time_comp=0; timer_start( time_total ); real_Double_t flops; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *dAT[MagmaMaxGPUs], *dA[MagmaMaxGPUs], *dPT[MagmaMaxGPUs]; magma_int_t iinfo = 0, nb, nbi, maxm, n_local[MagmaMaxGPUs], ldn_local; magma_int_t N, M, NB, NBk, I, d, ngpu0 = ngpu; magma_int_t ii, jj, h, offset, ib, rows; magma_queue_t stream[MagmaMaxGPUs][2]; magma_event_t event[MagmaMaxGPUs][2]; *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); /* initialize nb */ nb = magma_get_zgetrf_nb(m); maxm = ((m + 31)/32)*32; /* figure out NB */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(magmaDoubleComplex); /* number of columns in the big panel */ h = 1+(2+ngpu0); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); const char* ngr_nb_char = getenv("MAGMA_NGR_NB"); if ( ngr_nb_char != NULL ) NB = max( nb, min( NB, atoi(ngr_nb_char) ) ); //NB = 5*max(nb,32); if ( ngpu0 > ceil((double)NB/nb) ) { ngpu = (int)ceil((double)NB/nb); h = 1+(2+ngpu); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); } else { ngpu = ngpu0; } if ( ngpu*NB >= n ) { #ifdef CHECK_ZGETRF_OOC printf( " * still fit in GPU memory.\n" ); #endif NB = n; } else { #ifdef CHECK_ZGETRF_OOC printf( " * don't fit in GPU memory.\n" ); #endif NB = ngpu*NB; NB = max( nb, (NB / nb) * nb); /* making sure it's devisable by nb (x64) */ } #ifdef CHECK_ZGETRF_OOC if ( NB != n ) printf( " * running in out-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n", n, NB, nb, (double)freeMem ); else printf( " * running in in-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n", n, NB, nb, (double)freeMem ); #endif if ( (nb <= 1) || (nb >= min(m,n)) ) { /* Use CPU code for scalar of one tile. */ lapackf77_zgetrf(&m, &n, A, &lda, ipiv, info); } else { /* Use hybrid blocked code. */ /* allocate memory on GPU to store the big panel */ timer_start( time_alloc ); n_local[0] = (NB/nb)/ngpu; if ( NB%(nb*ngpu) != 0 ) n_local[0]++; n_local[0] *= nb; ldn_local = ((n_local[0]+31)/32)*32; for( d=0; d < ngpu; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_zmalloc( &dA[d], (ldn_local+h*nb)*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dPT[d] = dA[d] + nb*maxm; /* for storing the previous panel from CPU */ dAT[d] = dA[d] + h*nb*maxm; /* for storing the big panel */ magma_queue_create( &stream[d][0] ); magma_queue_create( &stream[d][1] ); magma_event_create( &event[d][0] ); magma_event_create( &event[d][1] ); } //magma_setdevice(0); timer_stop( time_alloc ); for( I=0; I < n; I += NB ) { M = m; N = min( NB, n-I ); /* number of columns in this big panel */ //s = min( max(m-I,0), N )/nb; /* number of small block-columns in this big panel */ maxm = ((M + 31)/32)*32; if ( ngpu0 > ceil((double)N/nb) ) { ngpu = (int)ceil((double)N/nb); } else { ngpu = ngpu0; } for( d=0; d < ngpu; d++ ) { n_local[d] = ((N/nb)/ngpu)*nb; if (d < (N/nb)%ngpu) n_local[d] += nb; else if (d == (N/nb)%ngpu) n_local[d] += N%nb; } ldn_local = ((n_local[0]+31)/32)*32; /* upload the next big panel into GPU, transpose (A->A'), and pivot it */ timer_start( time ); magmablas_zsetmatrix_transpose_mgpu(ngpu, stream, A(0,I), lda, dAT, ldn_local, dA, maxm, M, N, nb); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } time_set += timer_stop( time ); timer_start( time ); /* == --------------------------------------------------------------- == */ /* == loop around the previous big-panels to update the new big-panel == */ for( offset = 0; offset < min(m,I); offset += NB ) { NBk = min( m-offset, NB ); /* start sending the first tile from the previous big-panels to gpus */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); nbi = min( nb, NBk ); magma_zsetmatrix_async( (M-offset), nbi, A(offset,offset), lda, dA[d], (maxm-offset), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][0] ); /* transpose */ magmablas_ztranspose( M-offset, nbi, dA[d], maxm-offset, dPT(d,0,0), nb ); } /* applying the pivot from the previous big-panel */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magmablas_zlaswp_q( ldn_local, dAT(d,0,0), ldn_local, offset+1, offset+NBk, ipiv, 1, stream[d][1] ); } /* == going through each block-column of previous big-panels == */ for( jj=0, ib=offset/nb; jj < NBk; jj += nb, ib++ ) { ii = offset+jj; rows = maxm - ii; nbi = min( nb, NBk-jj ); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); /* wait for a block-column on GPU */ magma_queue_sync( stream[d][0] ); /* start sending next column */ if ( jj+nb < NBk ) { magma_zsetmatrix_async( (M-ii-nb), min(nb,NBk-jj-nb), A(ii+nb,ii+nb), lda, dA[d], (rows-nb), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][(1+jj/nb)%2] ); /* transpose next column */ magmablas_ztranspose( M-ii-nb, nb, dA[d], rows-nb, dPT(d,0,(1+jj/nb)%2), nb ); } /* update with the block column */ magmablasSetKernelStream(stream[d][1]); magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n_local[d], nbi, c_one, dPT(d,0,(jj/nb)%2), nb, dAT(d,ib,0), ldn_local ); if ( M > ii+nb ) { magma_zgemm( MagmaNoTrans, MagmaNoTrans, n_local[d], M-(ii+nb), nbi, c_neg_one, dAT(d,ib,0), ldn_local, dPT(d,1,(jj/nb)%2), nb, c_one, dAT(d,ib+1,0), ldn_local ); } magma_event_record( event[d][(jj/nb)%2], stream[d][1] ); } /* end of for each block-columns in a big-panel */ } } /* end of for each previous big-panels */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } /* calling magma-gpu interface to panel-factorize the big panel */ if ( M > I ) { magma_zgetrf2_mgpu(ngpu, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, A(0,I), lda, stream, &iinfo); if ( iinfo < 0 ) { *info = iinfo; break; } else if ( iinfo != 0 ) { *info = iinfo + I * NB; //break; } /* adjust pivots */ for( ii=I; ii < min(I+N,m); ii++ ) ipiv[ii] += I; } time_comp += timer_stop( time ); /* download the current big panel to CPU */ timer_start( time ); magmablas_zgetmatrix_transpose_mgpu(ngpu, stream, dAT, ldn_local, A(0,I), lda, dA, maxm, M, N, nb); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } time_get += timer_stop( time ); } /* end of for */ timer_stop( time_total ); flops = FLOPS_ZGETRF( m, n ) / 1e9; timer_printf(" memory-allocation time: %e\n", time_alloc ); timer_printf(" NB=%d nb=%d\n", (int) NB, (int) nb ); timer_printf(" memcopy and transpose %e seconds\n", time_set ); timer_printf(" total time %e seconds\n", time_total ); timer_printf(" Performance %f GFlop/s, %f seconds without htod and dtoh\n", flops / (time_comp), time_comp ); timer_printf(" Performance %f GFlop/s, %f seconds with htod\n", flops / (time_comp + time_set), time_comp + time_set ); timer_printf(" Performance %f GFlop/s, %f seconds with dtoh\n", flops / (time_comp + time_get), time_comp + time_get ); timer_printf(" Performance %f GFlop/s, %f seconds without memory-allocation\n", flops / (time_total - time_alloc), time_total - time_alloc ); for( d=0; d < ngpu0; d++ ) { magma_setdevice(d); magma_free( dA[d] ); magma_event_destroy( event[d][0] ); magma_event_destroy( event[d][1] ); magma_queue_destroy( stream[d][0] ); magma_queue_destroy( stream[d][1] ); } magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); } if ( *info >= 0 ) magma_zgetrf_piv(m, n, NB, A, lda, ipiv, info); return *info; } /* magma_zgetrf_m */
/* //////////////////////////////////////////////////////////////////////////// -- Testing zswap, zswapblk, zpermute, zlaswp, zlaswpx */ int main( int argc, char** argv) { TESTING_INIT(); magmaDoubleComplex *h_A1, *h_A2; magmaDoubleComplex *d_A1, *d_A2; magmaDoubleComplex *h_R1, *h_R2; // row-major and column-major performance real_Double_t row_perf0, col_perf0; real_Double_t row_perf1, col_perf1; real_Double_t row_perf2, col_perf2; real_Double_t row_perf3; real_Double_t row_perf4; real_Double_t row_perf5, col_perf5; real_Double_t row_perf6, col_perf6; real_Double_t row_perf7; real_Double_t cpu_perf; real_Double_t time, gbytes; magma_int_t N, lda, ldda, nb, j; magma_int_t ione = 1; magma_int_t *ipiv, *ipiv2; magma_int_t *d_ipiv; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); magma_queue_t queue = 0; printf(" cublasZswap zswap zswapblk zlaswp zpermute zlaswp2 zlaswpx zcopymatrix CPU (all in )\n"); printf(" N nb row-maj/col-maj row-maj/col-maj row-maj/col-maj row-maj row-maj row-maj row-maj/col-maj row-blk/col-blk zlaswp (GByte/s)\n"); printf("==================================================================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { // For an N x N matrix, swap nb rows or nb columns using various methods. // Each test is assigned one bit in the 'check' bitmask; bit=1 indicates failure. // The variable 'shift' keeps track of which bit is for current test int shift = 1; int check = 0; N = opts.nsize[itest]; lda = N; ldda = ((N+31)/32)*32; nb = (opts.nb > 0 ? opts.nb : magma_get_zgetrf_nb( N )); nb = min( N, nb ); // each swap does 2N loads and 2N stores, for nb swaps gbytes = sizeof(magmaDoubleComplex) * 4.*N*nb / 1e9; TESTING_MALLOC_PIN( h_A1, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_A2, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_R1, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_R2, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, nb ); TESTING_MALLOC_CPU( ipiv2, magma_int_t, nb ); TESTING_MALLOC_DEV( d_ipiv, magma_int_t, nb ); TESTING_MALLOC_DEV( d_A1, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( d_A2, magmaDoubleComplex, ldda*N ); for( j=0; j < nb; j++ ) { ipiv[j] = (magma_int_t) ((rand()*1.*N) / (RAND_MAX * 1.)) + 1; } /* ===================================================================== * cublasZswap, row-by-row (2 matrices) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_zsetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { cublasZswap( N, d_A1+ldda*j, 1, d_A2+ldda*(ipiv[j]-1), 1); } } time = magma_sync_wtime( queue ) - time; row_perf0 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione); } } magma_zgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_zgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* Column Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_zsetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { cublasZswap( N, d_A1+j, ldda, d_A2+ipiv[j]-1, ldda); } } time = magma_sync_wtime( queue ) - time; col_perf0 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda); } } magma_zgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_zgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* ===================================================================== * zswap, row-by-row (2 matrices) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_zsetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { magmablas_zswap( N, d_A1+ldda*j, 1, d_A2+ldda*(ipiv[j]-1), 1); } } time = magma_sync_wtime( queue ) - time; row_perf1 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione); } } magma_zgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_zgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* Column Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_zsetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { magmablas_zswap( N, d_A1+j, ldda, d_A2+ipiv[j]-1, ldda ); } } time = magma_sync_wtime( queue ) - time; col_perf1 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda); } } magma_zgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_zgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* ===================================================================== * zswapblk, blocked version (2 matrices) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_zsetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); magmablas_zswapblk( MagmaRowMajor, N, d_A1, ldda, d_A2, ldda, 1, nb, ipiv, 1, 0); time = magma_sync_wtime( queue ) - time; row_perf2 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione); } } magma_zgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_zgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* Column Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_zsetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); magmablas_zswapblk( MagmaColMajor, N, d_A1, ldda, d_A2, ldda, 1, nb, ipiv, 1, 0); time = magma_sync_wtime( queue ) - time; col_perf2 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda); } } magma_zgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_zgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* ===================================================================== * zpermute_long (1 matrix) */ /* Row Major */ memcpy( ipiv2, ipiv, nb*sizeof(magma_int_t) ); // zpermute updates ipiv2 init_matrix( N, N, h_A1, lda, 0 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_zpermute_long2( N, d_A1, ldda, ipiv2, nb, 0 ); time = magma_sync_wtime( queue ) - time; row_perf3 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_zgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* ===================================================================== * LAPACK-style zlaswp (1 matrix) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_zlaswp( N, d_A1, ldda, 1, nb, ipiv, 1); time = magma_sync_wtime( queue ) - time; row_perf4 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_zgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* ===================================================================== * LAPACK-style zlaswp (1 matrix) - d_ipiv on GPU */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magma_setvector( nb, sizeof(magma_int_t), ipiv, 1, d_ipiv, 1 ); magmablas_zlaswp2( N, d_A1, ldda, 1, nb, d_ipiv ); time = magma_sync_wtime( queue ) - time; row_perf7 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_zgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* ===================================================================== * LAPACK-style zlaswpx (extended for row- and col-major) (1 matrix) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_zlaswpx( N, d_A1, ldda, 1, 1, nb, ipiv, 1); time = magma_sync_wtime( queue ) - time; row_perf5 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_zgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* Col Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_zlaswpx( N, d_A1, 1, ldda, 1, nb, ipiv, 1); time = magma_sync_wtime( queue ) - time; col_perf5 = gbytes / time; time = magma_wtime(); lapackf77_zlaswp( &N, h_A1, &lda, &ione, &nb, ipiv, &ione); time = magma_wtime() - time; cpu_perf = gbytes / time; magma_zgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* ===================================================================== * Copy matrix. */ time = magma_sync_wtime( queue ); magma_zcopymatrix( N, nb, d_A1, ldda, d_A2, ldda ); time = magma_sync_wtime( queue ) - time; // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap col_perf6 = 0.5 * gbytes / time; time = magma_sync_wtime( queue ); magma_zcopymatrix( nb, N, d_A1, ldda, d_A2, ldda ); time = magma_sync_wtime( queue ) - time; // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap row_perf6 = 0.5 * gbytes / time; printf("%5d %3d %6.2f%c/ %6.2f%c %6.2f%c/ %6.2f%c %6.2f%c/ %6.2f%c %6.2f%c %6.2f%c %6.2f%c %6.2f%c/ %6.2f%c %6.2f / %6.2f %6.2f %10s\n", (int) N, (int) nb, row_perf0, ((check & 0x001) != 0 ? '*' : ' '), col_perf0, ((check & 0x002) != 0 ? '*' : ' '), row_perf1, ((check & 0x004) != 0 ? '*' : ' '), col_perf1, ((check & 0x008) != 0 ? '*' : ' '), row_perf2, ((check & 0x010) != 0 ? '*' : ' '), col_perf2, ((check & 0x020) != 0 ? '*' : ' '), row_perf3, ((check & 0x040) != 0 ? '*' : ' '), row_perf4, ((check & 0x080) != 0 ? '*' : ' '), row_perf7, ((check & 0x100) != 0 ? '*' : ' '), row_perf5, ((check & 0x200) != 0 ? '*' : ' '), col_perf5, ((check & 0x400) != 0 ? '*' : ' '), row_perf6, col_perf6, cpu_perf, (check == 0 ? "ok" : "* failed") ); status += ! (check == 0); TESTING_FREE_PIN( h_A1 ); TESTING_FREE_PIN( h_A2 ); TESTING_FREE_PIN( h_R1 ); TESTING_FREE_PIN( h_R2 ); TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( ipiv2 ); TESTING_FREE_DEV( d_ipiv ); TESTING_FREE_DEV( d_A1 ); TESTING_FREE_DEV( d_A2 ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }