/** Purpose ------- DPOTRS solves a system of linear equations A*X = B with a symmetric positive definite matrix A using the Cholesky factorization A = U**T*U or A = L*L**T computed by DPOTRF. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in] dA DOUBLE_PRECISION array on the GPU, dimension (LDDA,N) The triangular factor U or L from the Cholesky factorization A = U**T*U or A = L*L**T, as computed by DPOTRF. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,N). @param[in,out] dB DOUBLE_PRECISION array on the GPU, dimension (LDDB,NRHS) On entry, the right hand side matrix B. On exit, the solution matrix X. @param[in] lddb INTEGER The leading dimension of the array B. LDDB >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_dposv_comp ********************************************************************/ extern "C" magma_int_t magma_dpotrs_gpu(magma_uplo_t uplo, magma_int_t n, magma_int_t nrhs, double *dA, magma_int_t ldda, double *dB, magma_int_t lddb, magma_int_t *info) { double c_one = MAGMA_D_ONE; *info = 0; if ( uplo != MagmaUpper && uplo != MagmaLower ) *info = -1; if ( n < 0 ) *info = -2; if ( nrhs < 0) *info = -3; if ( ldda < max(1, n) ) *info = -5; if ( lddb < max(1, n) ) *info = -7; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if ( (n == 0) || (nrhs == 0) ) { return *info; } if ( uplo == MagmaUpper ) { if ( nrhs == 1) { magma_dtrsv(MagmaUpper, MagmaTrans, MagmaNonUnit, n, dA, ldda, dB, 1 ); magma_dtrsv(MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, dA, ldda, dB, 1 ); } else { magma_dtrsm(MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb); magma_dtrsm(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb); } } else { if ( nrhs == 1) { magma_dtrsv(MagmaLower, MagmaNoTrans, MagmaNonUnit, n, dA, ldda, dB, 1 ); magma_dtrsv(MagmaLower, MagmaTrans, MagmaNonUnit, n, dA, ldda, dB, 1 ); } else { magma_dtrsm(MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb); magma_dtrsm(MagmaLeft, MagmaLower, MagmaTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb); } } return *info; }
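/* Illustrative usage (a minimal sketch, not part of the original source):
 * factor a symmetric positive definite matrix already resident on the GPU,
 * then solve A*X = B with magma_dpotrs_gpu above. magma_dpotrf_gpu is assumed
 * to come from the same MAGMA release; dA, dB and the sizes are assumed to be
 * prepared by the caller, and error handling is elided. */
static magma_int_t example_dposv_gpu( magma_int_t n, magma_int_t nrhs,
                                      double *dA, magma_int_t ldda,
                                      double *dB, magma_int_t lddb )
{
    magma_int_t info;
    magma_dpotrf_gpu( MagmaLower, n, dA, ldda, &info );   /* A = L * L**T */
    if ( info == 0 )  /* dB is overwritten with the solution X */
        magma_dpotrs_gpu( MagmaLower, n, nrhs, dA, ldda, dB, lddb, &info );
    return info;
}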
void magma_task_dev_dtrsm(Schedule* sched_obj ) { magma_int_t deviceID; magma_side_t side; magma_uplo_t uplo; magma_trans_t trans; magma_diag_t diag; magma_int_t m; magma_int_t n; double alpha; double *dA; magma_int_t lda; double *dB; magma_int_t ldb; #if (dbglevel >=1) ca_trace_start(); #endif schedule_unpack_args_12(sched_obj, deviceID, side, uplo, trans, diag, m, n, alpha, dA, lda, dB, ldb); #if (dbglevel==10) ca_dbg_printMat_transpose_gpu(m, n, dB, ldb, "A(K,K:N) before magma_dtrsm"); #endif magma_setdevice(deviceID); pthread_mutex_lock(&mutex_compute_stream); magmablasSetKernelStream(compute_stream[deviceID]); magma_dtrsm( side, uplo, trans, diag, m, n, alpha, dA, lda, dB, ldb ); //task_magma_dtrsm('R', 'U', MagmaNoTrans, 'U', gpu_ncols, nb, c_one, dAT(K,K), dAT_LD, dAT(K,K+A_N), dAT_LD); pthread_mutex_unlock(&mutex_compute_stream); #if (dbglevel==10) ca_dbg_printMat_transpose_gpu(m, n, dB, ldb, "A(K,K:N) after magma_dtrsm"); #endif #if (dbglevel >=1) ca_trace_end_gpu('U'); ca_trace_end_cpu('C'); #endif }
void magma_task_dtrsm(Schedule* sched_obj ) { magma_side_t side; magma_uplo_t uplo; magma_trans_t trans; magma_diag_t diag; magma_int_t m; magma_int_t n; double alpha; double *dA; magma_int_t lda; double *dB; magma_int_t ldb; #if (dbglevel >=1) ca_trace_start(); #endif schedule_unpack_args_11(sched_obj,side, uplo, trans, diag, m, n, alpha, dA, lda, dB, ldb); magma_dtrsm( side, uplo, trans, diag, m, n, alpha, dA, lda, dB, ldb ); //magma_task_dtrsm('R', 'U', 'N', 'U', gpu_ncols, nb, c_one, dAT(K,K), dAT_LD, dAT(K,K+A_N), dAT_LD); #if (dbglevel >=1) ca_trace_end_1gpu('U'); ca_trace_end_cpu('C'); #endif }
extern "C" magma_int_t magma_dgetrs_gpu( magma_trans_t trans, magma_int_t n, magma_int_t nrhs, magmaDouble_ptr dA, size_t dA_offset, magma_int_t ldda, magma_int_t *ipiv, magmaDouble_ptr dB, size_t dB_offset, magma_int_t lddb, magma_queue_t queue, magma_int_t *info) { /* -- clMagma (version 0.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= Solves a system of linear equations A * X = B or A' * X = B with a general N-by-N matrix A using the LU factorization computed by DGETRF_GPU. Arguments ========= TRANS (input) CHARACTER*1 Specifies the form of the system of equations: = 'N': A * X = B (No transpose) = 'T': A'* X = B (Transpose) = 'C': A'* X = B (Conjugate transpose = Transpose) N (input) INTEGER The order of the matrix A. N >= 0. NRHS (input) INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. A (input) DOUBLE_PRECISION array on the GPU, dimension (LDA,N) The factors L and U from the factorization A = P*L*U as computed by DGETRF_GPU. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). IPIV (input) INTEGER array, dimension (N) The pivot indices from DGETRF; for 1<=i<=N, row i of the matrix was interchanged with row IPIV(i). B (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDB,NRHS) On entry, the right hand side matrix B. On exit, the solution matrix X. LDB (input) INTEGER The leading dimension of the array B. LDB >= max(1,N). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value HWORK (workspace) DOUBLE_PRECISION array, dimension N*NRHS ===================================================================== */ double c_one = MAGMA_D_ONE; double *work = NULL; int notran = (trans == MagmaNoTrans); magma_int_t i1, i2, inc; *info = 0; if ( (! notran) && (trans != MagmaTrans) && (trans != MagmaConjTrans) ) { *info = -1; } else if (n < 0) { *info = -2; } else if (nrhs < 0) { *info = -3; } else if (ldda < max(1,n)) { *info = -5; } else if (lddb < max(1,n)) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (n == 0 || nrhs == 0) { return *info; } magma_dmalloc_cpu( &work, n*nrhs ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } i1 = 1; i2 = n; if (notran) { inc = 1; /* Solve A * X = B. */ magma_dgetmatrix( n, nrhs, dB, dB_offset, lddb, work, n, queue ); lapackf77_dlaswp(&nrhs, work, &n, &i1, &i2, ipiv, &inc); magma_dsetmatrix( n, nrhs, work, n, dB, dB_offset, lddb, queue ); if ( nrhs == 1) { magma_dtrsv(MagmaLower, MagmaNoTrans, MagmaUnit, n, dA, dA_offset, ldda, dB, dB_offset, 1, queue); magma_dtrsv(MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, dA, dA_offset, ldda, dB, dB_offset, 1, queue); } else { magma_dtrsm(MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, n, nrhs, c_one, dA, dA_offset, ldda, dB, dB_offset, lddb, queue); magma_dtrsm(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, dA, dA_offset, ldda, dB, dB_offset, lddb, queue); } } else { inc = -1; /* Solve A' * X = B. 
*/ if ( nrhs == 1) { magma_dtrsv(MagmaUpper, trans, MagmaNonUnit, n, dA, dA_offset, ldda, dB, dB_offset, 1, queue ); magma_dtrsv(MagmaLower, trans, MagmaUnit, n, dA, dA_offset, ldda, dB, dB_offset, 1, queue ); } else { magma_dtrsm(MagmaLeft, MagmaUpper, trans, MagmaNonUnit, n, nrhs, c_one, dA, dA_offset, ldda, dB, dB_offset, lddb, queue); magma_dtrsm(MagmaLeft, MagmaLower, trans, MagmaUnit, n, nrhs, c_one, dA, dA_offset, ldda, dB, dB_offset, lddb, queue); } magma_dgetmatrix( n, nrhs, dB, dB_offset, lddb, work, n, queue ); lapackf77_dlaswp(&nrhs, work, &n, &i1, &i2, ipiv, &inc); magma_dsetmatrix( n, nrhs, work, n, dB, dB_offset, lddb, queue ); } magma_free_cpu(work); return *info; }
/** Purpose ------- Solves a system of linear equations A * X = B, A**T * X = B, or A**H * X = B with a general N-by-N matrix A using the LU factorization computed by DGETRF_GPU. Arguments --------- @param[in] trans magma_trans_t Specifies the form of the system of equations: - = MagmaNoTrans: A * X = B (No transpose) - = MagmaTrans: A**T * X = B (Transpose) - = MagmaConjTrans: A**H * X = B (Conjugate transpose) @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in] dA DOUBLE_PRECISION array on the GPU, dimension (LDA,N) The factors L and U from the factorization A = P*L*U as computed by DGETRF_GPU. @param[in] ldda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[in] ipiv INTEGER array, dimension (N) The pivot indices from DGETRF; for 1 <= i <= N, row i of the matrix was interchanged with row IPIV(i). @param[in,out] dB DOUBLE_PRECISION array on the GPU, dimension (LDB,NRHS) On entry, the right hand side matrix B. On exit, the solution matrix X. @param[in] lddb INTEGER The leading dimension of the array B. LDB >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_dgesv_comp ********************************************************************/ extern "C" magma_int_t magma_dgetrs_gpu(magma_trans_t trans, magma_int_t n, magma_int_t nrhs, double *dA, magma_int_t ldda, magma_int_t *ipiv, double *dB, magma_int_t lddb, magma_int_t *info) { double c_one = MAGMA_D_ONE; double *work = NULL; int notran = (trans == MagmaNoTrans); magma_int_t i1, i2, inc; *info = 0; if ( (! notran) && (trans != MagmaTrans) && (trans != MagmaConjTrans) ) { *info = -1; } else if (n < 0) { *info = -2; } else if (nrhs < 0) { *info = -3; } else if (ldda < max(1,n)) { *info = -5; } else if (lddb < max(1,n)) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (n == 0 || nrhs == 0) { return *info; } magma_dmalloc_cpu( &work, n * nrhs ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } i1 = 1; i2 = n; if (notran) { inc = 1; /* Solve A * X = B. */ magma_dgetmatrix( n, nrhs, dB, lddb, work, n ); lapackf77_dlaswp(&nrhs, work, &n, &i1, &i2, ipiv, &inc); magma_dsetmatrix( n, nrhs, work, n, dB, lddb ); if ( nrhs == 1) { magma_dtrsv(MagmaLower, MagmaNoTrans, MagmaUnit, n, dA, ldda, dB, 1 ); magma_dtrsv(MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, dA, ldda, dB, 1 ); } else { magma_dtrsm(MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, n, nrhs, c_one, dA, ldda, dB, lddb ); magma_dtrsm(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb ); } } else { inc = -1; /* Solve A**T * X = B or A**H * X = B. */ if ( nrhs == 1) { magma_dtrsv(MagmaUpper, trans, MagmaNonUnit, n, dA, ldda, dB, 1 ); magma_dtrsv(MagmaLower, trans, MagmaUnit, n, dA, ldda, dB, 1 ); } else { magma_dtrsm(MagmaLeft, MagmaUpper, trans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb ); magma_dtrsm(MagmaLeft, MagmaLower, trans, MagmaUnit, n, nrhs, c_one, dA, ldda, dB, lddb ); } magma_dgetmatrix( n, nrhs, dB, lddb, work, n ); lapackf77_dlaswp(&nrhs, work, &n, &i1, &i2, ipiv, &inc); magma_dsetmatrix( n, nrhs, work, n, dB, lddb ); } magma_free_cpu(work); return *info; }
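/* Illustrative usage (a minimal sketch, not part of the original source):
 * pair magma_dgetrf_gpu (assumed from the same MAGMA release) with
 * magma_dgetrs_gpu above to solve a general system A*X = B entirely in GPU
 * memory. dA, dB and ipiv are assumed to be allocated and filled by the
 * caller. */
static magma_int_t example_dgesv_gpu( magma_int_t n, magma_int_t nrhs,
                                      double *dA, magma_int_t ldda, magma_int_t *ipiv,
                                      double *dB, magma_int_t lddb )
{
    magma_int_t info;
    magma_dgetrf_gpu( n, n, dA, ldda, ipiv, &info );      /* A = P * L * U */
    if ( info == 0 )  /* dB is overwritten with the solution X */
        magma_dgetrs_gpu( MagmaNoTrans, n, nrhs, dA, ldda, ipiv, dB, lddb, &info );
    return info;
}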
int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, t1, t2; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; const char trans[] = { 'N', 'C', 'T' }; const char uplo[] = { 'L', 'U' }; const char diag[] = { 'U', 'N' }; const char side[] = { 'L', 'R' }; double *A, *B, *C, *C2, *LU; double *dA, *dB, *dC1, *dC2; double alpha = MAGMA_D_MAKE( 0.5, 0.1 ); double beta = MAGMA_D_MAKE( 0.7, 0.2 ); double dalpha = 0.6; double dbeta = 0.8; double work[1], error, total_error; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t m, n, k, size, maxn, ld, info; magma_int_t *piv; magma_err_t err; magma_opts opts; parse_opts( argc, argv, &opts ); printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" ); total_error = 0.; for( int i = 0; i < opts.ntest; ++i ) { m = opts.msize[i]; n = opts.nsize[i]; k = opts.ksize[i]; printf("=========================================================================\n"); printf( "M %d, N %d, K %d\n", (int) m, (int) n, (int) k ); // allocate matrices // over-allocate so they can be any combination of {m,n,k} x {m,n,k}. maxn = max( max( m, n ), k ); ld = maxn; size = maxn*maxn; err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) ); assert( err == 0 ); err = magma_dmalloc_pinned( &A, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &B, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &C, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &C2, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &LU, size ); assert( err == 0 ); err = magma_dmalloc( &dA, size ); assert( err == 0 ); err = magma_dmalloc( &dB, size ); assert( err == 0 ); err = magma_dmalloc( &dC1, size ); assert( err == 0 ); err = magma_dmalloc( &dC2, size ); assert( err == 0 ); // initialize matrices size = maxn*maxn; lapackf77_dlarnv( &ione, ISEED, &size, A ); lapackf77_dlarnv( &ione, ISEED, &size, B ); lapackf77_dlarnv( &ione, ISEED, &size, C ); printf( "========== Level 1 BLAS ==========\n" ); // ----- test DSWAP // swap 2nd and 3rd columns of dA, then copy to C2 and compare with A assert( n >= 4 ); magma_dsetmatrix( m, n, A, ld, dA, ld ); magma_dsetmatrix( m, n, A, ld, dB, ld ); magma_dswap( m, dA(0,1), 1, dA(0,2), 1 ); magma_dswap( m, dB(0,1), 1, dB(0,2), 1 ); // check results, storing diff between magma and cuda calls in C2 cublasDaxpy( ld*n, c_neg_one, dA, 1, dB, 1 ); magma_dgetmatrix( m, n, dB, ld, C2, ld ); error = lapackf77_dlange( "F", &m, &k, C2, &ld, work ); total_error += error; printf( "dswap diff %.2g\n", error ); // ----- test IDAMAX // get argmax of column of A magma_dsetmatrix( m, k, A, ld, dA, ld ); error = 0; for( int j = 0; j < k; ++j ) { magma_int_t i1 = magma_idamax( m, dA(0,j), 1 ); magma_int_t i2 = cublasIdamax( m, dA(0,j), 1 ); assert( i1 == i2 ); error += abs( i1 - i2 ); } total_error += error; gflops = (double)m * k / 1e9; printf( "idamax diff %.2g\n", error ); printf( "\n" ); printf( "========== Level 2 BLAS ==========\n" ); // ----- test DGEMV // c = alpha*A*b + beta*c, with A m*n; b,c m or n-vectors // try no-trans/trans for( int ia = 0; ia < 3; ++ia ) { magma_dsetmatrix( m, n, A, ld, dA, ld ); magma_dsetvector( maxn, B, 1, dB, 1 ); magma_dsetvector( maxn, C, 1, dC1, 1 ); magma_dsetvector( maxn, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_dgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, 
storing diff between magma and cuda call in C2 size = (trans[ia] == 'N' ? m : n); cublasDaxpy( size, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetvector( size, dC2, 1, C2, 1 ); error = lapackf77_dlange( "F", &size, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_DGEMV( m, n ) / 1e9; printf( "dgemv( %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", trans[ia], error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test DSYMV // c = alpha*A*b + beta*c, with A m*m symmetric; b,c m-vectors // try upper/lower for( int iu = 0; iu < 2; ++iu ) { magma_dsetmatrix( m, m, A, ld, dA, ld ); magma_dsetvector( m, B, 1, dB, 1 ); magma_dsetvector( m, C, 1, dC1, 1 ); magma_dsetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_dsymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( m, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_dlange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYMV( m ) / 1e9; printf( "dsymv( %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test DTRSV // solve A*c = c, with A m*m triangular; c m-vector // try upper/lower, no-trans/trans, unit/non-unit diag // Factor A into LU to get well-conditioned triangles, else solve yields garbage. // Still can give garbage if solves aren't consistent with LU factors, // e.g., using unit diag for U, so copy lower triangle to upper triangle. // Also used for trsm later. lapackf77_dlacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld ); lapackf77_dgetrf( &maxn, &maxn, LU, &ld, piv, &info ); for( int j = 0; j < maxn; ++j ) { for( int i = 0; i < j; ++i ) { *LU(i,j) = *LU(j,i); } } for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { magma_dsetmatrix( m, m, LU, ld, dA, ld ); magma_dsetvector( m, C, 1, dC1, 1 ); magma_dsetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_dtrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDtrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( m, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_dlange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_DTRSM( MagmaLeft, m, 1 ) / 1e9; printf( "dtrsv( %c, %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], diag[id], error, gflops/t1, gflops/t2 ); }}} printf( "\n" ); printf( "========== Level 3 BLAS ==========\n" ); // ----- test DGEMM // C = alpha*A*B + beta*C, with A m*k or k*m; B k*n or n*k; C m*n // try combinations of no-trans/trans for( int ia = 0; ia < 3; ++ia ) { for( int ib = 0; ib < 3; ++ib ) { bool nta = (trans[ia] == 'N'); bool ntb = (trans[ib] == 'N'); magma_dsetmatrix( (nta ? m : k), (nta ? m : k), A, ld, dA, ld ); magma_dsetmatrix( (ntb ? k : n), (ntb ? 
n : k), B, ld, dB, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DGEMM( m, n, k ) / 1e9; printf( "dgemm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", trans[ia], trans[ib], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DSYMM // C = alpha*A*B + beta*C (left) with A m*m symmetric; B,C m*n; or // C = alpha*B*A + beta*C (right) with A n*n symmetric; B,C m*n // try left/right, upper/lower for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { magma_dsetmatrix( m, m, A, ld, dA, ld ); magma_dsetmatrix( m, n, B, ld, dB, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dsymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYMM( side[is], m, n ) / 1e9; printf( "dsymm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", side[is], uplo[iu], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DSYRK // C = alpha*A*A^H + beta*C (no-trans) with A m*k and C m*m symmetric; or // C = alpha*A^H*A + beta*C (trans) with A k*m and C m*m symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { magma_dsetmatrix( n, k, A, ld, dA, ld ); magma_dsetmatrix( n, n, C, ld, dC1, ld ); magma_dsetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dsyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYRK( k, n ) / 1e9; printf( "dsyrk( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DSYR2K // C = alpha*A*B^H + ^alpha*B*A^H + beta*C (no-trans) with A,B n*k; C n*n symmetric; or // C = alpha*A^H*B + ^alpha*B^H*A + beta*C (trans) with A,B k*n; C n*n symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { bool nt = (trans[it] == 'N'); magma_dsetmatrix( (nt ? n : k), (nt ? 
n : k), A, ld, dA, ld ); magma_dsetmatrix( n, n, C, ld, dC1, ld ); magma_dsetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dsyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYR2K( k, n ) / 1e9; printf( "dsyr2k( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DTRMM // C = alpha*A*C (left) with A m*m triangular; C m*n; or // C = alpha*C*A (right) with A n*n triangular; C m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == 'L'); magma_dsetmatrix( (left ? m : n), (left ? m : n), A, ld, dA, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dtrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDtrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DTRMM( side[is], m, n ) / 1e9; printf( "dtrmm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // ----- test DTRSM // solve A*X = alpha*B (left) with A m*m triangular; B m*n; or // solve X*A = alpha*B (right) with A n*n triangular; B m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == 'L'); magma_dsetmatrix( (left ? m : n), (left ? m : n), LU, ld, dA, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dtrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDtrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DTRSM( side[is], m, n ) / 1e9; printf( "dtrsm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // cleanup magma_free_cpu( piv ); magma_free_pinned( A ); magma_free_pinned( B ); magma_free_pinned( C ); magma_free_pinned( C2 ); magma_free_pinned( LU ); magma_free( dA ); magma_free( dB ); magma_free( dC1 ); magma_free( dC2 ); } if ( total_error != 0. 
) { printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n", total_error ); } else { printf( "all tests passed\n" ); } TESTING_FINALIZE(); return 0; }
extern "C" magma_int_t magma_dpotrf3_mgpu(int num_gpus, char uplo, magma_int_t m, magma_int_t n, magma_int_t off_i, magma_int_t off_j, magma_int_t nb, double **d_lA, magma_int_t ldda, double **d_lP, magma_int_t lddp, double *a, magma_int_t lda, magma_int_t h, cudaStream_t stream[][3], cudaEvent_t event[][5], magma_int_t *info ) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix dA. Auxiliary subroutine for dpotrf2_ooc. It is multiple gpu interface to compute Cholesky of a "rectangular" matrix. The factorization has the form dA = U**T * U, if UPLO = 'U', or dA = L * L**T, if UPLO = 'L', where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of dA is stored; = 'L': Lower triangle of dA is stored. N (input) INTEGER The order of the matrix dA. N >= 0. dA (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N) On entry, the symmetric matrix dA. If UPLO = 'U', the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**T * U or dA = L * L**T. LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be dividable by 16. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. ===================================================================== */ magma_int_t j, jb, nb0, nb2, d, dd, id, j_local, j_local2, buf; char uplo_[2] = {uplo, 0}; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double d_one = 1.0; double d_neg_one = -1.0; int upper = lapackf77_lsame(uplo_, "U"); double *dlpanel; magma_int_t n_local[MagmaMaxGPUs], ldpanel; //cudaEvent_t event0[MagmaMaxGPUs], /* send row to CPU */ // event1[MagmaMaxGPUs], /* send diag to GPU */ // event2[MagmaMaxGPUs], /* offdiagonal update */ // event3[MagmaMaxGPUs], /* send row to GPU */ // event4[MagmaMaxGPUs]; /* lookahead */ const magma_int_t stream1 = 0, stream2 = 1, stream3 = 2; double *d_dinvA[MagmaMaxGPUs][2], *d_x[MagmaMaxGPUs][2]; /* used by dtrsm_work */ *info = 0; if ( (! upper) && (! 
lapackf77_lsame(uplo_, "L")) ) { *info = -1; } else if (n < 0) { *info = -2; } else if (!upper && num_gpus*ldda < max(1,n)) { *info = -4; } else if (upper && ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* initialization */ for( d=0; d<num_gpus; d++ ) { /* local-n and local-ld */ if (upper) { n_local[d] = ((n/nb)/num_gpus)*nb; if (d < (n/nb)%num_gpus) n_local[d] += nb; else if (d == (n/nb)%num_gpus) n_local[d] += n%nb; } else { n_local[d] = ((m/nb)/num_gpus)*nb; if (d < (m/nb)%num_gpus) n_local[d] += nb; else if (d == (m/nb)%num_gpus) n_local[d] += m%nb; } //magma_setdevice(d); //magma_event_create( &event0[d] ); //magma_event_create( &event1[d] ); //magma_event_create( &event2[d] ); //magma_event_create( &event3[d] ); //magma_event_create( &event4[d] ); } /* == initialize the trace */ trace_init( 1, num_gpus, 3, (CUstream_st**)stream ); if (upper) { /* ---------------------------------------------- */ /* Upper-triangular case */ /* > Compute the Cholesky factorization A = U'*U. */ /* ---------------------------------------------- */ #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK) /* invert the diagonals * Allocate device memory for the inverted diagonal blocks, size=m*NB */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); for( j=0; j<2; j++ ) { cudaMalloc((void**)&d_dinvA[d][j], nb*nb*sizeof(double)); cudaMalloc((void**)&d_x[d][j], n*nb*sizeof(double)); cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(double)); cudaMemset(d_x[d][j], 0, n*nb*sizeof(double)); } } magma_setdevice(0); #endif for (j=0; j<m; j+=nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%num_gpus; buf = (j/nb)%num_gpus; /* Set the local index where the current panel is */ j_local = j/(nb*num_gpus); jb = min(nb, (m-j)); /* Update the current diagonal block on stream1 */ magma_setdevice(id); if( j > 0 ) { magmablasSetKernelStream(stream[id][stream1]); trace_gpu_start( id, stream1, "syrk", "syrk" ); magma_dsyrk(MagmaUpper, MagmaTrans, jb, j, d_neg_one, dlA(id, 0, nb*j_local), ldda, d_one, dlA(id, j, nb*j_local), ldda); trace_gpu_end( id, stream1 ); } /* send the diagonal to cpu on stream1 */ trace_gpu_start( id, stream1, "comm", "D to CPU" ); magma_dgetmatrix_async( jb, jb, dlA(id, j, nb*j_local), ldda, Aup(j,j), lda, stream[id][stream1] ); trace_gpu_end( id, stream1 ); /* update off-diagonal blocks in the panel */ if( j > 0 ) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { j_local2 = j_local+1; if( d > id ) j_local2 --; nb0 = nb*j_local2; if( n_local[d] > nb0 ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][stream2]); if( d == id ) { dlpanel = dlA(d, 0, nb*j_local); ldpanel = ldda; } else { dlpanel = dlP(d, jb, 0, buf); ldpanel = lddp; magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu } trace_gpu_start( d, stream2, "gemm", "gemm" ); magma_dgemm(MagmaTrans, MagmaNoTrans, jb, n_local[d]-nb0, j, c_neg_one, dlpanel, ldpanel, dlA(d, 0, nb0), ldda, c_one, dlA(d, j, nb0), ldda); trace_gpu_end( d, stream2 ); magma_event_record( event[d][2], stream[d][stream2] ); } d = (d+1)%num_gpus; } } /* wait for panel and factorize it on cpu */ magma_setdevice(id); magma_queue_sync( stream[id][stream1] ); trace_cpu_start( 0, "potrf", "potrf" ); lapackf77_dpotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info); trace_cpu_end( 0 ); if (*info != 0) { *info = *info + j; break; } /* send the diagonal to gpus on stream1 */ if ( (j+jb) < n) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { if( d == id
) { dlpanel = dlA(d, j, nb*j_local); ldpanel = ldda; } else { dlpanel = dlP(d, 0, 0, buf); ldpanel = lddp; } magma_setdevice(d); trace_gpu_start( d, stream1, "comm", "comm" ); magma_dsetmatrix_async( jb, jb, Aup(j,j), lda, dlpanel, ldpanel, stream[d][stream1] ); trace_gpu_end( d, stream1 ); magma_event_record( event[d][1], stream[d][stream1] ); d = (d+1)%num_gpus; } } else { magma_setdevice(id); trace_gpu_start( id, stream1, "comm", "comm" ); magma_dsetmatrix_async( jb, jb, Aup(j,j), lda, dlA(id, j, nb*j_local), ldda, stream[id][stream1] ); trace_gpu_end( id, stream1 ); } /* panel-factorize the off-diagonal */ if ( (j+jb) < n) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { /* next column */ j_local2 = j_local+1; if( d > id ) j_local2--; if( d == id ) { dlpanel = dlA(d,j,nb*j_local); ldpanel = ldda; } else { dlpanel = dlP(d, 0, 0, buf); ldpanel = lddp; } nb2 = n_local[d] - j_local2*nb; nb0 = min(nb, nb2); magma_setdevice(d); //magma_queue_sync( stream[d][stream1] ); // synch on chol for remaining update //magma_queue_sync( stream[d][stream2] ); if( j+jb < m && d == (j/nb+1)%num_gpus ) { /* owns the next column, look-ahead next block on stream1 */ magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for gemm update magmablasSetKernelStream(stream[d][stream1]); trace_gpu_start( d, stream1, "trsm", "trsm" ); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK) magmablas_dtrsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, jb, nb0, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda, d_dinvA[d][0], d_x[d][0] ); #else magma_dtrsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, jb, nb0, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda); #endif magma_event_record( event[d][4], stream[d][stream1] ); trace_gpu_end( d, stream1 ); } else if( nb2 > 0 ) { /* update all the blocks on stream2 */ magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for cholesky factor trace_gpu_start( d, stream2, "trsm", "trsm" ); magmablasSetKernelStream(stream[d][stream2]); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK) magmablas_dtrsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda, d_dinvA[d][1], d_x[d][1] ); #else magma_dtrsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda); #endif trace_gpu_end( d, stream2 ); } d = (d+1)%num_gpus; } /* end of for */ /* ======================================================================================== */ d = (j/nb+1)%num_gpus; /* next column */ j_local2 = j_local+1; if( d > id ) j_local2--; nb0 = min(nb, n_local[d]-nb*j_local2 ); /* even on 1 gpu, off-diagonals are copied to cpu (synchronize at the end). * * so we have the Cholesky factor, but only diagonal submatrix of the big panel, * * on cpu at the end. 
*/ if( j+jb < m ) { int d2, id2, j2, buf2; magma_setdevice(d); /* make sure all the previous sets are done */ if( h < num_gpus ) { /* > offdiagonal */ for( d2=0; d2<num_gpus; d2++ ) { j2 = j - (1+d2)*nb; if( j2 < 0 ) break; id2 = (j2/nb)%num_gpus; magma_queue_wait_event( stream[d][stream3], event[id2][0] ); } /* > diagonal */ for( d2=0; d2<num_gpus; d2++ ) { j2 = j - d2*nb; if( j2 < 0 ) break; id2 = (j2/nb)%num_gpus; magma_queue_wait_event( stream[d][stream3], event[id2][1] ); } } /* lookahead */ magma_queue_wait_event( stream[d][stream3], event[d][4] ); trace_gpu_start( d, stream3, "comm", "row to CPU" ); magma_dgetmatrix_async( (j+jb), nb0, dlA(d, 0, nb*j_local2), ldda, Aup(0,j+jb), lda, stream[d][stream3] ); trace_gpu_end( d, stream3 ); magma_event_record( event[d][3], stream[d][stream3] ); /* needed on pluto */ magma_queue_sync( stream[d][stream3] ); /* wait for the off-diagonal on cpu */ //magma_setdevice(id); //magma_queue_sync( stream[id][stream3] ); /* broadcast rows to gpus on stream2 */ buf2 = ((j+jb)/nb)%num_gpus; for( d2=0; d2<num_gpus; d2++ ) { if( d2 != d ) { magma_setdevice(d2); trace_gpu_start( d2, stream3, "comm", "row to GPUs" ); magma_queue_wait_event( stream[d2][stream3], event[d][3] ); // rows arrived at cpu on stream3 magma_dsetmatrix_async( j+jb, nb0, Aup(0,j+jb), lda, dlP(d2,nb0,0,buf2), lddp, stream[d2][stream3] ); trace_gpu_end( d2, stream3 ); magma_event_record( event[d2][0], stream[d2][stream3] ); } } } /* ======================================================================================== */ /* gpu owning the next column */ /* after look ahead, update the remaining blocks */ if( j+jb < m ) /* no update on the last block column */ { d = (j/nb+1)%num_gpus; /* next column */ j_local2 = j_local+1; if( d > id ) j_local2--; if( d == id ) { dlpanel = dlA(d, j, nb*j_local); ldpanel = ldda; } else { dlpanel = dlP(d, 0, 0, buf); ldpanel = lddp; } nb0 = min(nb, n_local[d]-nb*j_local2 ); nb2 = n_local[d]-nb*j_local2 - nb0; /* update the remaining blocks */ if( nb2 > 0 ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][stream2]); magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for cholesky factor trace_gpu_start( d, stream2, "trsm", "trsm" ); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK) magmablas_dtrsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2+nb0), ldda, d_dinvA[d][1], d_x[d][1] ); #else magma_dtrsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2+nb0), ldda); #endif trace_gpu_end( d, stream2 ); } } } /* end of dtrsm */ } /* end of for j=1, .., n */ } else { /* ---------------------------------------------- */ /* Lower-triangular case */ /* > Compute the Cholesky factorization A = L*L'. 
*/ /* ---------------------------------------------- */ #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK) /* * Allocate device memory for the inverted diagonal blocks, size=N*BLOCK_SIZE */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); for( j=0; j<2; j++ ) { cudaMalloc((void**)&d_dinvA[d][j], nb*nb*sizeof(double)); cudaMalloc((void**)&d_x[d][j], nb*m *sizeof(double)); cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(double)); cudaMemset(d_x[d][j], 0, nb* m*sizeof(double)); } } magma_setdevice(0); #endif for (j=0; j<n; j+=nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%num_gpus; buf = (j/nb)%num_gpus; /* Set the local index where the current panel is */ j_local = j/(nb*num_gpus); jb = min(nb, (n-j)); /* Update the current diagonal block on stream1 */ magma_setdevice(id); if( j > 0 ) { magmablasSetKernelStream(stream[id][stream1]); magma_dsyrk(MagmaLower, MagmaNoTrans, jb, j, d_neg_one, dlA(id, nb*j_local, 0), ldda, d_one, dlA(id, nb*j_local, j), ldda); } /* send the diagonal to cpu on stream1 */ magma_dgetmatrix_async( jb, jb, dlA(id, nb*j_local, j), ldda, Alo(j,j), lda, stream[id][stream1] ); /* update off-diagonal blocks of the panel */ if( j > 0 ) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { j_local2 = j_local+1; if( d > id ) j_local2 --; nb0 = nb*j_local2; if( nb0 < n_local[d] ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][stream2]); if( d == id ) { dlpanel = dlA(d, nb*j_local, 0); ldpanel = ldda; } else { dlpanel = dlPT(d,0,jb,buf); ldpanel = nb; magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu } magma_dgemm( MagmaNoTrans, MagmaTrans, n_local[d]-nb0, jb, j, c_neg_one, dlA(d, nb0, 0), ldda, dlpanel, ldpanel, c_one, dlA(d, nb0, j), ldda); magma_event_record( event[d][2], stream[d][stream2] ); } d = (d+1)%num_gpus; } } /* wait for the panel and factorize it on cpu */ magma_setdevice(id); magma_queue_sync( stream[id][stream1] ); lapackf77_dpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info); if (*info != 0) { *info = *info + j; break; } /* send the diagonal to gpus on stream1 */ if ( (j+jb) < m) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { if( d == id ) { dlpanel = dlA(d, nb*j_local, j); ldpanel = ldda; } else { dlpanel = dlPT(d, 0, 0, buf); ldpanel = nb; } magma_setdevice(d); magma_dsetmatrix_async( jb, jb, Alo(j,j), lda, dlpanel, ldpanel, stream[d][stream1] ); magma_event_record( event[d][1], stream[d][stream1] ); d = (d+1)%num_gpus; } } else { magma_setdevice(id); magma_dsetmatrix_async( jb, jb, Alo(j,j), lda, dlA(id, nb*j_local, j), ldda, stream[id][stream1] ); } /* panel factorize the off-diagonal */ if ( (j+jb) < m) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { /* next column */ j_local2 = j_local+1; if( d > id ) j_local2--; if( d == id ) { dlpanel = dlA(d, nb*j_local, j); ldpanel = ldda; } else { dlpanel = dlPT(d, 0, 0, buf); ldpanel = nb; } nb2 = n_local[d] - j_local2*nb; nb0 = min(nb, nb2 ); magma_setdevice(d); if( j+nb < n && d == (j/nb+1)%num_gpus ) { /* owns next column, look-ahead next block on stream1 */ magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for gemm update magmablasSetKernelStream(stream[d][stream1]); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK) magmablas_dtrsm_work( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, nb0, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda, d_dinvA[d][0], d_x[d][0] ); #else magma_dtrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, nb0, jb, c_one,
dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda); #endif magma_event_record( event[d][4], stream[d][stream1] ); } else if( nb2 > 0 ) { /* other gpus updating all the blocks on stream2 */ /* update the entire column */ magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for the cholesky factor magmablasSetKernelStream(stream[d][stream2]); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK) magmablas_dtrsm_work( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda, d_dinvA[d][1], d_x[d][1] ); #else magma_dtrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda); #endif } d = (d+1)%num_gpus; } /* end for d */ /* ======================================================================================== */ d = (j/nb+1)%num_gpus; /* next column */ j_local2 = j_local+1; if( d > id ) j_local2--; nb0 = min(nb, n_local[d]-nb*j_local2 ); /* even on 1 gpu, we copy off-diagonal to cpu (but don't synchronize). */ /* so we have the Cholesky factor on cpu at the end. */ if( j+jb < n ) { int d2, id2, j2, buf2; magma_setdevice(d); /* make sure all the previous sets are done */ if( h < num_gpus ) { /* > offdiagonal */ for( d2=0; d2<num_gpus; d2++ ) { j2 = j - (1+d2)*nb; if( j2 < 0 ) break; id2 = (j2/nb)%num_gpus; magma_queue_wait_event( stream[d][stream3], event[id2][0] ); } /* > diagonal */ for( d2=0; d2<num_gpus; d2++ ) { j2 = j - d2*nb; if( j2 < 0 ) break; id2 = (j2/nb)%num_gpus; magma_queue_wait_event( stream[d][stream3], event[id2][1] ); } } // lookahead done magma_queue_wait_event( stream[d][stream3], event[d][4] ); magma_dgetmatrix_async( nb0, j+jb, dlA(d, nb*j_local2, 0), ldda, Alo(j+jb,0), lda, stream[d][stream3] ); magma_event_record( event[d][3], stream[d][stream3] ); /* sync on rows on CPU, seems to be needed on Pluto */ magma_queue_sync( stream[d][stream3] ); /* broadcast the rows to gpus */ buf2 = ((j+jb)/nb)%num_gpus; for( d2=0; d2<num_gpus; d2++ ) { if( d2 != d ) { magma_setdevice(d2); magma_queue_wait_event( stream[d2][stream3], event[d][3] ); // getmatrix done magma_dsetmatrix_async( nb0, j+jb, Alo(j+jb,0), lda, dlPT(d2,0,nb0,buf2), nb, stream[d2][stream3] ); magma_event_record( event[d2][0], stream[d2][stream3] ); } } } /* ======================================================================================== */ /* gpu owning the next column updates remaining blocks on stream2 */ if( j+nb < n ) { // no lookahead on the last block column d = (j/nb+1)%num_gpus; /* next column */ j_local2 = j_local+1; if( d > id ) j_local2--; if( d == id ) { dlpanel = dlA(d, nb*j_local, j); ldpanel = ldda; } else { dlpanel = dlPT(d,0,0,buf); ldpanel = nb; } nb0 = min(nb, n_local[d]-nb*j_local2 ); nb2 = n_local[d] - j_local2*nb - nb0; if( nb2 > 0 ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][stream2]); /* update the remaining blocks in the column */ magma_queue_wait_event( stream[d][stream2], event[d][1] ); // panel received #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK) magmablas_dtrsm_work( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2+nb0, j), ldda, d_dinvA[d][1], d_x[d][1] ); #else magma_dtrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2+nb0, j), ldda); #endif } } } } } /* end of else not upper */ /* == finalize the trace == */ trace_finalize( "dpotrf.svg","trace.css" ); for( d=0; d<num_gpus; d++ ) {
magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magma_queue_sync( stream[d][2] ); magmablasSetKernelStream(NULL); //magma_event_destroy( event0[d] ); //magma_event_destroy( event1[d] ); //magma_event_destroy( event2[d] ); //magma_event_destroy( event3[d] ); //magma_event_destroy( event4[d] ); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK) for( j=0; j<2; j++ ) { magma_free( d_dinvA[d][j] ); magma_free( d_x[d][j] ); } #endif } magma_setdevice(0); return *info; } /* magma_dpotrf_mgpu */
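/* Reference sketch of the algorithm magma_dpotrf3_mgpu distributes: the
 * blocked right-looking Cholesky factorization (lower-triangular case) on a
 * single CPU. Each step updates the diagonal block with SYRK, factors it,
 * updates the panel below with GEMM, and triangular-solves the panel with
 * TRSM -- the same four operations the multi-GPU code above overlaps across
 * devices and streams. The A_REF macro and the lda >= n assumption are
 * illustrative, not taken from the file above. */
#define A_REF(i_,j_) (A + (i_) + (j_)*lda)
static void cholesky_blocked_ref( magma_int_t n, magma_int_t nb,
                                  double *A, magma_int_t lda, magma_int_t *info )
{
    double c_one = 1.0, c_neg_one = -1.0;
    *info = 0;
    for (magma_int_t j = 0; j < n; j += nb) {
        magma_int_t jb = min( nb, n-j );
        /* A(j,j) -= A(j,0:j) * A(j,0:j)**T */
        blasf77_dsyrk( "Lower", "No transpose", &jb, &j,
                       &c_neg_one, A_REF(j,0), &lda, &c_one, A_REF(j,j), &lda );
        lapackf77_dpotrf( "Lower", &jb, A_REF(j,j), &lda, info );
        if (*info != 0) { *info += j; return; }
        magma_int_t rows = n-j-jb;
        if (rows > 0) {
            /* A(j+jb:n,j) -= A(j+jb:n,0:j) * A(j,0:j)**T */
            blasf77_dgemm( "No transpose", "Transpose", &rows, &jb, &j,
                           &c_neg_one, A_REF(j+jb,0), &lda, A_REF(j,0), &lda,
                           &c_one, A_REF(j+jb,j), &lda );
            /* solve X * L(j,j)**T = A(j+jb:n,j) for the panel below */
            blasf77_dtrsm( "Right", "Lower", "Transpose", "Non-unit",
                           &rows, &jb, &c_one, A_REF(j,j), &lda, A_REF(j+jb,j), &lda );
        }
    }
}
#undef A_REF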
/** Purpose ------- DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. It uses 2 queues to overlap communication and computation. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A DOUBLE PRECISION array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] ipiv INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_dgesv_comp ********************************************************************/ extern "C" magma_int_t magma_dgetrf( magma_int_t m, magma_int_t n, double *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info) { #ifdef HAVE_clBLAS #define dA(i_, j_) dA, ((i_)*nb + (j_)*nb*ldda + dA_offset) #define dAT(i_, j_) dAT, ((i_)*nb*lddat + (j_)*nb + dAT_offset) #define dwork(i_) dwork, (i_) #else #define dA(i_, j_) ( dA + (i_)*nb + (j_)*nb*ldda) #define dAT(i_, j_) ( dAT + (i_)*nb*lddat + (j_)*nb) #define dwork(i_) (dwork + (i_)) #endif // Constants const double c_one = MAGMA_D_ONE; const double c_neg_one = MAGMA_D_NEG_ONE; // Local variables double *work; magmaDouble_ptr dA, dAT, dwork; magma_int_t iinfo, nb; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ nb = magma_get_dgetrf_nb( m, n ); if ( (nb <= 1) || (nb >= min(m,n)) ) { /* Use CPU code. */ lapackf77_dgetrf( &m, &n, A, &lda, ipiv, info ); } else { /* Use hybrid blocked code.
*/ magma_int_t maxm, maxn, ldda, lddat, maxdim; magma_int_t i, j, rows, cols, s = min(m, n)/nb; maxm = magma_roundup( m, 32 ); maxn = magma_roundup( n, 32 ); maxdim = max( maxm, maxn ); lddat = maxn; ldda = maxm; /* set number of GPUs */ magma_int_t ngpu = magma_num_gpus(); if ( ngpu > 1 ) { /* call multi-GPU non-GPU-resident interface */ magma_dgetrf_m( ngpu, m, n, A, lda, ipiv, info ); return *info; } magma_queue_t queues[2] = { NULL, NULL }; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[0] ); magma_queue_create( cdev, &queues[1] ); /* check the memory requirement */ size_t mem_size = magma_queue_mem_size( queues[0] ); mem_size /= sizeof(double); magma_int_t h = 1+(2+ngpu); magma_int_t ngpu2 = ngpu; magma_int_t NB = (magma_int_t)(0.8*mem_size/maxm - h*nb); const char* ngr_nb_char = getenv("MAGMA_NGR_NB"); if ( ngr_nb_char != NULL ) NB = max( nb, min( NB, atoi(ngr_nb_char) ) ); if ( ngpu > ceil((double)NB/nb) ) { ngpu2 = (magma_int_t)ceil((double)NB/nb); h = 1+(2+ngpu2); NB = (magma_int_t)(0.8*mem_size/maxm - h*nb); } if ( ngpu2*NB < n ) { /* require too much memory, so call non-GPU-resident version */ magma_dgetrf_m( ngpu, m, n, A, lda, ipiv, info ); return *info; } work = A; if (maxdim*maxdim < 2*maxm*maxn) { // if close to square, allocate square matrix and transpose in-place // dwork is nb*maxm for panel, and maxdim*maxdim for A if (MAGMA_SUCCESS != magma_dmalloc( &dwork, nb*maxm + maxdim*maxdim )) { /* alloc failed so call non-GPU-resident version */ magma_dgetrf_m( ngpu, m, n, A, lda, ipiv, info ); return *info; } dA = dwork + nb*maxm; ldda = lddat = maxdim; magma_dsetmatrix( m, n, A, lda, dA(0,0), ldda, queues[0] ); dAT = dA; magmablas_dtranspose_inplace( maxdim, dAT(0,0), lddat, queues[0] ); } else { // if very rectangular, allocate dA and dAT and transpose out-of-place // dwork is nb*maxm for panel, and maxm*maxn for A if (MAGMA_SUCCESS != magma_dmalloc( &dwork, (nb + maxn)*maxm )) { /* alloc failed so call non-GPU-resident version */ magma_dgetrf_m( ngpu, m, n, A, lda, ipiv, info ); return *info; } dA = dwork + nb*maxm; magma_dsetmatrix( m, n, A, lda, dA(0,0), ldda, queues[0] ); if (MAGMA_SUCCESS != magma_dmalloc( &dAT, maxm*maxn )) { /* alloc failed so call non-GPU-resident version */ magma_free( dwork ); magma_dgetrf_m( ngpu, m, n, A, lda, ipiv, info ); return *info; } magmablas_dtranspose( m, n, dA(0,0), ldda, dAT(0,0), lddat, queues[0] ); } lapackf77_dgetrf( &m, &nb, work, &lda, ipiv, &iinfo ); for( j = 0; j < s; j++ ) { // get j-th panel from device cols = maxm - j*nb; if (j > 0) { magmablas_dtranspose( nb, cols, dAT(j,j), lddat, dwork(0), cols, queues[0] ); magma_queue_sync( queues[0] ); magma_dgetmatrix_async( m-j*nb, nb, dwork(0), cols, work, lda, queues[1] ); magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n - (j+1)*nb, nb, c_one, dAT(j-1,j-1), lddat, dAT(j-1,j+1), lddat, queues[0] ); magma_dgemm( MagmaNoTrans, MagmaNoTrans, n-(j+1)*nb, m-j*nb, nb, c_neg_one, dAT(j-1,j+1), lddat, dAT(j, j-1), lddat, c_one, dAT(j, j+1), lddat, queues[0] ); // do the cpu part rows = m - j*nb; magma_queue_sync( queues[1] ); lapackf77_dgetrf( &rows, &nb, work, &lda, ipiv+j*nb, &iinfo ); } if (*info == 0 && iinfo > 0) *info = iinfo + j*nb; // put j-th panel onto device magma_dsetmatrix_async( m-j*nb, nb, work, lda, dwork(0), cols, queues[1] ); for( i=j*nb; i < j*nb + nb; ++i ) { ipiv[i] += j*nb; } magmablas_dlaswp( n, dAT(0,0), lddat, j*nb + 1, j*nb + nb, ipiv, 1, queues[0] ); magma_queue_sync( queues[1] ); magmablas_dtranspose( cols, nb, 
dwork(0), cols, dAT(j,j), lddat, queues[0] ); // do the small non-parallel computations (next panel update) if (s > (j+1)) { magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb, nb, c_one, dAT(j, j ), lddat, dAT(j, j+1), lddat, queues[0] ); magma_dgemm( MagmaNoTrans, MagmaNoTrans, nb, m-(j+1)*nb, nb, c_neg_one, dAT(j, j+1), lddat, dAT(j+1, j ), lddat, c_one, dAT(j+1, j+1), lddat, queues[0] ); } else { magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb, nb, c_one, dAT(j, j ), lddat, dAT(j, j+1), lddat, queues[0] ); magma_dgemm( MagmaNoTrans, MagmaNoTrans, n-(j+1)*nb, m-(j+1)*nb, nb, c_neg_one, dAT(j, j+1), lddat, dAT(j+1, j ), lddat, c_one, dAT(j+1, j+1), lddat, queues[0] ); } } magma_int_t nb0 = min( m - s*nb, n - s*nb ); if ( nb0 > 0 ) { rows = m - s*nb; cols = maxm - s*nb; magmablas_dtranspose( nb0, rows, dAT(s,s), lddat, dwork(0), cols, queues[0] ); magma_dgetmatrix_async( rows, nb0, dwork(0), cols, work, lda, queues[0] ); magma_queue_sync( queues[0] ); // do the cpu part lapackf77_dgetrf( &rows, &nb0, work, &lda, ipiv+s*nb, &iinfo ); if (*info == 0 && iinfo > 0) *info = iinfo + s*nb; for( i=s*nb; i < s*nb + nb0; ++i ) { ipiv[i] += s*nb; } magmablas_dlaswp( n, dAT(0,0), lddat, s*nb + 1, s*nb + nb0, ipiv, 1, queues[0] ); // put j-th panel onto device magma_dsetmatrix_async( rows, nb0, work, lda, dwork(0), cols, queues[0] ); magmablas_dtranspose( rows, nb0, dwork(0), cols, dAT(s,s), lddat, queues[0] ); magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb-nb0, nb0, c_one, dAT(s, s), lddat, dAT(s, s)+nb0, lddat, queues[0] ); } // undo transpose if (maxdim*maxdim < 2*maxm*maxn) { magmablas_dtranspose_inplace( maxdim, dAT(0,0), lddat, queues[0] ); magma_dgetmatrix( m, n, dAT(0,0), lddat, A, lda, queues[0] ); } else { magmablas_dtranspose( n, m, dAT(0,0), lddat, dA(0,0), ldda, queues[0] ); magma_dgetmatrix( m, n, dA(0,0), ldda, A, lda, queues[0] ); magma_free( dAT ); } magma_free( dwork ); magma_queue_destroy( queues[0] ); magma_queue_destroy( queues[1] ); } return *info; } /* magma_dgetrf */
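/* Illustrative usage (a minimal sketch, not part of the original source):
 * magma_dgetrf above factors a host-resident matrix; pairing it with the CPU
 * LAPACK dgetrs solves A*X = B afterwards. As the documentation above notes,
 * pinned memory for A improves transfer speed. Buffers and sizes are assumed
 * to be set up by the caller. */
static magma_int_t example_dgetrf_solve( magma_int_t n, magma_int_t nrhs,
                                         double *A, magma_int_t lda, magma_int_t *ipiv,
                                         double *B, magma_int_t ldb )
{
    magma_int_t info;
    magma_dgetrf( n, n, A, lda, ipiv, &info );   /* hybrid CPU/GPU A = P*L*U */
    if ( info == 0 )
        lapackf77_dgetrs( "No transpose", &n, &nrhs, A, &lda, ipiv, B, &ldb, &info );
    return info;
}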
/** Purpose ------- DGETRF_NOPIV_GPU computes an LU factorization of a general M-by-N matrix A without any pivoting. The factorization has the form A = L * U where L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA DOUBLE_PRECISION array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = L*U; the unit diagonal elements of L are not stored. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,M). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_dgesv_comp ********************************************************************/ extern "C" magma_int_t magma_dgetrf_nopiv_gpu(magma_int_t m, magma_int_t n, double *dA, magma_int_t ldda, magma_int_t *info) { #define dA(i,j) (dA + (i)*nb + (j)*nb*ldda) double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t iinfo, nb; magma_int_t maxm, maxn, mindim; magma_int_t i, rows, cols, s, lddwork; double *work; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); nb = magma_get_dgetrf_nb(m); s = mindim / nb; if (nb <= 1 || nb >= min(m,n)) { /* Use CPU code. */ magma_dmalloc_cpu( &work, m * n ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_dgetmatrix( m, n, dA, ldda, work, m ); magma_dgetrf_nopiv( m, n, work, m, info); magma_dsetmatrix( m, n, work, m, dA, ldda ); magma_free_cpu(work); } else { /* Use hybrid blocked code.
*/ maxm = ((m + 31)/32)*32; maxn = ((n + 31)/32)*32; lddwork = maxm; if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, maxm*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* Define user stream if current stream is NULL */ cudaStream_t stream[2], current_stream; magmablasGetKernelStream(&current_stream); magma_queue_create( &stream[0] ); if (current_stream == NULL) { magma_queue_create( &stream[1] ); magmablasSetKernelStream(stream[1]); } else { stream[1] = current_stream; } for( i=0; i < s; i++ ) { // download i-th panel cols = maxm - i*nb; magma_queue_sync( stream[1] ); magma_dgetmatrix_async( m-i*nb, nb, dA(i,i), ldda, work, lddwork, stream[0] ); if ( i > 0 ) { magma_dtrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, nb, n - (i+1)*nb, c_one, dA(i-1,i-1), ldda, dA(i-1,i+1), ldda ); magma_dgemm( MagmaNoTrans, MagmaNoTrans, m-i*nb, n-(i+1)*nb, nb, c_neg_one, dA(i, i-1), ldda, dA(i-1,i+1), ldda, c_one, dA(i, i+1), ldda ); } // do the cpu part rows = m - i*nb; magma_queue_sync( stream[0] ); magma_dgetrf_nopiv( rows, nb, work, lddwork, &iinfo ); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + i*nb; // upload i-th panel magma_dsetmatrix_async( m-i*nb, nb, work, lddwork, dA(i, i), ldda, stream[0] ); magma_queue_sync( stream[0] ); // do the small non-parallel computations if ( s > (i+1) ) { magma_dtrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, nb, nb, c_one, dA(i, i ), ldda, dA(i, i+1), ldda); magma_dgemm( MagmaNoTrans, MagmaNoTrans, m-(i+1)*nb, nb, nb, c_neg_one, dA(i+1, i ), ldda, dA(i, i+1), ldda, c_one, dA(i+1, i+1), ldda ); } else { magma_dtrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, nb, n-s*nb, c_one, dA(i, i ), ldda, dA(i, i+1), ldda); magma_dgemm( MagmaNoTrans, MagmaNoTrans, m-(i+1)*nb, n-(i+1)*nb, nb, c_neg_one, dA(i+1, i ), ldda, dA(i, i+1), ldda, c_one, dA(i+1, i+1), ldda ); } } magma_int_t nb0 = min(m - s*nb, n - s*nb); rows = m - s*nb; cols = maxm - s*nb; magma_dgetmatrix( rows, nb0, dA(s,s), ldda, work, lddwork ); // make sure that gpu queue is empty magma_device_sync(); // do the cpu part magma_dgetrf_nopiv( rows, nb0, work, lddwork, &iinfo ); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + s*nb; // upload i-th panel magma_dsetmatrix( rows, nb0, work, lddwork, dA(s,s), ldda ); magma_dtrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, nb0, n-s*nb-nb0, c_one, dA(s,s), ldda, dA(s,s)+nb0, ldda); magma_free_pinned( work ); magma_queue_destroy( stream[0] ); if (current_stream == NULL) { magma_queue_destroy( stream[1] ); magmablasSetKernelStream(NULL); } } return *info; } /* magma_dgetrf_nopiv_gpu */
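/* Reference sketch (not the routine above): an unblocked LU factorization
 * without pivoting, i.e. what magma_dgetrf_nopiv computes on each panel. It
 * fails on an exactly zero diagonal entry, which is why LU without pivoting
 * is reliable only for special matrices (e.g. diagonally dominant ones).
 * Column-major storage with lda >= m is assumed. */
static void lu_nopiv_ref( magma_int_t m, magma_int_t n,
                          double *A, magma_int_t lda, magma_int_t *info )
{
    *info = 0;
    for (magma_int_t j = 0; j < min(m,n); ++j) {
        if ( A[j + j*lda] == 0.0 ) { *info = j+1; return; }  /* zero pivot */
        for (magma_int_t i = j+1; i < m; ++i) {
            A[i + j*lda] /= A[j + j*lda];           /* multiplier l(i,j) */
            for (magma_int_t k = j+1; k < n; ++k)   /* trailing update */
                A[i + k*lda] -= A[i + j*lda] * A[j + k*lda];
        }
    }
}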
/** Purpose ------- DGESSM applies the factors L computed by DGETRF_INCPIV to a real M-by-N tile A. Arguments --------- @param[in] order magma_order_t Storage order of the tile A on the GPU. If order = MagmaColMajor, dA is transposed to row-major storage internally and transposed back on exit; if order = MagmaRowMajor, dA is used as is. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in] k INTEGER The number of columns of the matrix L. K >= 0. @param[in] ib INTEGER The inner-blocking size. IB >= 0. @param[in] ipiv INTEGER array on the CPU. The pivot indices array of size K as returned by DGETRF_INCPIV. @param[in] dL1 DOUBLE_PRECISION array, dimension(LDDL1, N) The IB-by-K matrix in which is stored L^(-1) as returned by DGETRF_INCPIV. @param[in] lddl1 INTEGER The leading dimension of the array L1. LDDL1 >= max(1,2*IB). @param[in] dL DOUBLE_PRECISION array, dimension(LDDL, N) The M-by-K lower triangular tile on the GPU. @param[in] lddl INTEGER The leading dimension of the array L. LDDL >= max(1,M). @param[in,out] dA DOUBLE_PRECISION array, dimension (LDDA, N) On entry, the M-by-N tile A on the GPU. On exit, updated by the application of L on the GPU. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,M). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_dgesv_tile ********************************************************************/ extern "C" magma_int_t magma_dgessm_gpu( magma_order_t order, magma_int_t m, magma_int_t n, magma_int_t k, magma_int_t ib, magma_int_t *ipiv, magmaDouble_ptr dL1, magma_int_t lddl1, magmaDouble_ptr dL, magma_int_t lddl, magmaDouble_ptr dA, magma_int_t ldda, magma_int_t *info) { #define AT(i,j) (dAT + (i)*ldda + (j) ) #define L(i,j) (dL + (i) + (j)*lddl ) #define dL1(j) (dL1 + (j)*lddl1) double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; int i, sb; magmaDouble_ptr dAT; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; if ( order == MagmaColMajor ) { magmablas_dgetmo_in( dA, dAT, ldda, m, n ); } else { dAT = dA; } for (i = 0; i < k; i += ib) { sb = min(ib, k-i); magmablas_dlaswp( n, dAT, ldda, i+1, i+sb, ipiv, 1 ); #ifndef WITHOUTTRTRI magma_dtrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n, sb, c_one, dL1(i), lddl1, AT(i, 0), ldda); #else magma_dtrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n, sb, c_one, L( i, i), lddl, AT(i, 0), ldda); #endif if ( (i+sb) < m) { magma_dgemm( MagmaNoTrans, MagmaTrans, n, m-(i+sb), sb, c_neg_one, AT(i, 0), ldda, L( i+sb, i), lddl, c_one, AT(i+sb, 0), ldda ); } } if ( order == MagmaColMajor ) { magmablas_dgetmo_out( dA, dAT, ldda, m, n ); } return *info; } /* magma_dgessm_gpu */
/** Purpose ------- DGETRI computes the inverse of a matrix using the LU factorization computed by DGETRF. This method inverts U and then computes inv(A) by solving the system inv(A)*L = inv(U) for inv(A). Note that it is generally both faster and more accurate to use DGESV, or DGETRF and DGETRS, to solve the system AX = B, rather than inverting the matrix and multiplying to form X = inv(A)*B. Only in special instances should an explicit inverse be computed with this routine. Arguments --------- @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] dA DOUBLE_PRECISION array on the GPU, dimension (LDDA,N) On entry, the factors L and U from the factorization A = P*L*U as computed by DGETRF_GPU. On exit, if INFO = 0, the inverse of the original matrix A. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,N). @param[in] ipiv INTEGER array, dimension (N) The pivot indices from DGETRF; for 1 <= i <= N, row i of the matrix was interchanged with row IPIV(i). @param[out] dwork (workspace) DOUBLE_PRECISION array on the GPU, dimension (MAX(1,LWORK)) @param[in] lwork INTEGER The dimension of the array DWORK. LWORK >= N*NB, where NB is the optimal blocksize returned by magma_get_dgetri_nb(n). \n Unlike LAPACK, this version does not currently support a workspace query, because the workspace is on the GPU. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, U(i,i) is exactly zero; the matrix is singular and its inverse cannot be computed. @ingroup magma_dgesv_comp ********************************************************************/ extern "C" magma_int_t magma_dgetri_gpu( magma_int_t n, double *dA, magma_int_t ldda, magma_int_t *ipiv, double *dwork, magma_int_t lwork, magma_int_t *info ) { #define dA(i, j) (dA + (i) + (j)*ldda) #define dL(i, j) (dL + (i) + (j)*lddl) /* Local variables */ double c_zero = MAGMA_D_ZERO; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double *dL = dwork; magma_int_t lddl = n; magma_int_t nb = magma_get_dgetri_nb(n); magma_int_t j, jmax, jb, jp; *info = 0; if (n < 0) *info = -1; else if (ldda < max(1,n)) *info = -3; else if ( lwork < n*nb ) *info = -6; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if ( n == 0 ) return *info; /* Invert the triangular factor U */ magma_dtrtri_gpu( MagmaUpper, MagmaNonUnit, n, dA, ldda, info ); if ( *info != 0 ) return *info; jmax = ((n-1) / nb)*nb; for( j = jmax; j >= 0; j -= nb ) { jb = min( nb, n-j ); // copy current block column of A to work space dL // (only needs lower trapezoid, but we also copy upper triangle), // then zero the strictly lower trapezoid block column of A. magmablas_dlacpy( MagmaFull, n-j, jb, dA(j,j), ldda, dL(j,0), lddl ); magmablas_dlaset( MagmaLower, n-j-1, jb, c_zero, c_zero, dA(j+1,j), ldda ); // compute current block column of Ainv // Ainv(:, j:j+jb-1) // = ( U(:, j:j+jb-1) - Ainv(:, j+jb:n) L(j+jb:n, j:j+jb-1) ) // * L(j:j+jb-1, j:j+jb-1)^{-1} // where L(:, j:j+jb-1) is stored in dL. if ( j+jb < n ) { magma_dgemm( MagmaNoTrans, MagmaNoTrans, n, jb, n-j-jb, c_neg_one, dA(0,j+jb), ldda, dL(j+jb,0), lddl, c_one, dA(0,j), ldda ); } // TODO use magmablas work interface magma_dtrsm( MagmaRight, MagmaLower, MagmaNoTrans, MagmaUnit, n, jb, c_one, dL(j,0), lddl, dA(0,j), ldda ); } // Apply column interchanges for( j = n-2; j >= 0; --j ) { jp = ipiv[j] - 1; if ( jp != j ) { magmablas_dswap( n, dA(0,j), 1, dA(0,jp), 1 ); } } return *info; }
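The LWORK >= N*NB requirement above suggests the following factor-then-invert pattern. This is an editor's sketch using only the stream-less MAGMA 1.x calls that appear in this file, with magma_get_dgetri_nb sizing the GPU workspace; names are illustrative.

#include "magma.h"

// Editor's sketch: LU-factor a matrix already on the GPU, then invert it.
magma_int_t example_dgetri( magma_int_t n, double *dA, magma_int_t ldda,
                            magma_int_t *ipiv )
{
    magma_int_t info;
    magma_int_t nb    = magma_get_dgetri_nb( n );
    magma_int_t lwork = n*nb;                   // minimum documented workspace
    double *dwork;
    if (MAGMA_SUCCESS != magma_dmalloc( &dwork, lwork ))
        return MAGMA_ERR_DEVICE_ALLOC;
    magma_dgetrf_gpu( n, n, dA, ldda, ipiv, &info );   // A = P*L*U
    if (info == 0)
        magma_dgetri_gpu( n, dA, ldda, ipiv, dwork, lwork, &info );
    magma_free( dwork );
    return info;   // info > 0: U is exactly singular, no inverse exists
}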
extern "C" magma_int_t magma_dgetrf_gpu( magma_int_t m, magma_int_t n, magmaDouble_ptr dA, size_t dA_offset, magma_int_t ldda, magma_int_t *ipiv, magma_queue_t queue, magma_int_t *info ) { /* -- clMAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. LDDA (input) INTEGER The leading dimension of the array A. LDDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ #define dA(i_, j_) dA, dA_offset + (i_)*nb + (j_)*nb*ldda #define dAT(i_, j_) dAT, dAT_offset + (i_)*nb*lddat + (j_)*nb #define dAP(i_, j_) dAP, (i_) + (j_)*maxm #define work(i_) (work + (i_)) double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t iinfo, nb; magma_int_t maxm, maxn, mindim; magma_int_t i, j, rows, s, lddat, ldwork; magmaDouble_ptr dAT, dAP; double *work; size_t dAT_offset; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); nb = magma_get_dgetrf_nb(m); s = mindim / nb; if (nb <= 1 || nb >= min(m,n)) { /* Use CPU code. */ if ( MAGMA_SUCCESS != magma_dmalloc_cpu( &work, m*n )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_dgetmatrix( m, n, dA(0,0), ldda, work(0), m, queue ); lapackf77_dgetrf( &m, &n, work, &m, ipiv, info ); magma_dsetmatrix( m, n, work(0), m, dA(0,0), ldda, queue ); magma_free_cpu( work ); } else { /* Use hybrid blocked code. 
*/ maxm = ((m + 31)/32)*32; maxn = ((n + 31)/32)*32; if ( MAGMA_SUCCESS != magma_dmalloc( &dAP, nb*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } // square matrices can be done in place; // rectangular requires copy to transpose if ( m == n ) { dAT = dA; dAT_offset = dA_offset; lddat = ldda; magmablas_dtranspose_inplace( m, dAT(0,0), lddat, queue ); } else { lddat = maxn; // N-by-M dAT_offset = 0; if ( MAGMA_SUCCESS != magma_dmalloc( &dAT, lddat*maxm )) { magma_free( dAP ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magmablas_dtranspose( m, n, dA(0,0), ldda, dAT(0,0), lddat, queue ); } ldwork = maxm; if ( MAGMA_SUCCESS != magma_dmalloc_cpu( &work, ldwork*nb )) { magma_free( dAP ); if ( dA != dAT ) magma_free( dAT ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } for( j=0; j < s; j++ ) { // download j-th panel magmablas_dtranspose( nb, m-j*nb, dAT(j,j), lddat, dAP(0,0), maxm, queue ); magma_dgetmatrix( m-j*nb, nb, dAP(0,0), maxm, work(0), ldwork, queue ); if ( j > 0 ){ magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n - (j+1)*nb, nb, c_one, dAT(j-1,j-1), lddat, dAT(j-1,j+1), lddat, queue ); magma_dgemm( MagmaNoTrans, MagmaNoTrans, n-(j+1)*nb, m-j*nb, nb, c_neg_one, dAT(j-1,j+1), lddat, dAT(j, j-1), lddat, c_one, dAT(j, j+1), lddat, queue ); } // do the cpu part rows = m - j*nb; lapackf77_dgetrf( &rows, &nb, work, &ldwork, ipiv+j*nb, &iinfo ); if ( *info == 0 && iinfo > 0 ) *info = iinfo + j*nb; for( i=j*nb; i < j*nb + nb; ++i ) { ipiv[i] += j*nb; } magmablas_dlaswp( n, dAT(0,0), lddat, j*nb + 1, j*nb + nb, ipiv, 1, queue ); // upload j-th panel magma_dsetmatrix( m-j*nb, nb, work(0), ldwork, dAP(0,0), maxm, queue ); magmablas_dtranspose( m-j*nb, nb, dAP(0,0), maxm, dAT(j,j), lddat, queue ); // do the small non-parallel computations (next panel update) if ( s > (j+1) ) { magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb, nb, c_one, dAT(j, j ), lddat, dAT(j, j+1), lddat, queue ); magma_dgemm( MagmaNoTrans, MagmaNoTrans, nb, m-(j+1)*nb, nb, c_neg_one, dAT(j, j+1), lddat, dAT(j+1, j ), lddat, c_one, dAT(j+1, j+1), lddat, queue ); } else { magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb, nb, c_one, dAT(j, j ), lddat, dAT(j, j+1), lddat, queue ); magma_dgemm( MagmaNoTrans, MagmaNoTrans, n-(j+1)*nb, m-(j+1)*nb, nb, c_neg_one, dAT(j, j+1), lddat, dAT(j+1, j ), lddat, c_one, dAT(j+1, j+1), lddat, queue ); } } magma_int_t nb0 = min( m - s*nb, n - s*nb ); if ( nb0 > 0 ) { rows = m - s*nb; magmablas_dtranspose( nb0, rows, dAT(s,s), lddat, dAP(0,0), maxm, queue ); magma_dgetmatrix( rows, nb0, dAP(0,0), maxm, work(0), ldwork, queue ); // do the cpu part lapackf77_dgetrf( &rows, &nb0, work, &ldwork, ipiv+s*nb, &iinfo ); if ( *info == 0 && iinfo > 0 ) *info = iinfo + s*nb; for( i=s*nb; i < s*nb + nb0; ++i ) { ipiv[i] += s*nb; } magmablas_dlaswp( n, dAT(0,0), lddat, s*nb + 1, s*nb + nb0, ipiv, 1, queue ); // upload j-th panel magma_dsetmatrix( rows, nb0, work(0), ldwork, dAP(0,0), maxm, queue ); magmablas_dtranspose( rows, nb0, dAP(0,0), maxm, dAT(s,s), lddat, queue ); magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb-nb0, nb0, c_one, dAT(s,s), lddat, dAT(s,s)+nb0, lddat, queue ); } // undo transpose if ( dA == dAT ) { magmablas_dtranspose_inplace( m, dAT(0,0), lddat, queue ); } else { magmablas_dtranspose( n, m, dAT(0,0), lddat, dA(0,0), ldda, queue ); magma_free( dAT ); } magma_free( dAP ); magma_free_cpu( work ); } return *info; } /* magma_dgetrf_gpu */
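Because the clMAGMA interface threads a (pointer, offset) pair and an explicit queue through every call, invoking it looks slightly different from the CUDA interfaces elsewhere in this file. A brief editor's sketch, with all types and argument names taken from the signature above:

// Editor's sketch for the clMAGMA-style interface above.
magma_int_t example_clmagma_dgetrf(
    magma_int_t m, magma_int_t n,
    magmaDouble_ptr dA, size_t dA_offset, magma_int_t ldda,
    magma_int_t *ipiv, magma_queue_t queue )
{
    magma_int_t info;
    magma_dgetrf_gpu( m, n, dA, dA_offset, ldda, ipiv, queue, &info );
    // On return, ipiv already holds global row indices: the panel code
    // above adds the j*nb offset to each local pivot before the row swaps.
    return info;
}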
/** Purpose ------- DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The factorization has the form A = U**H * U, if uplo = MagmaUpper, or A = L * L**H, if uplo = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. This uses multiple queues to overlap communication and computation. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A DOUBLE PRECISION array, dimension (LDA,N) On entry, the symmetric matrix A. If uplo = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If uplo = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization A = U**H * U or A = L * L**H. \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as memory allocation failed. - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_dposv_comp ********************************************************************/ extern "C" magma_int_t magma_dpotrf( magma_uplo_t uplo, magma_int_t n, double *A, magma_int_t lda, magma_int_t *info ) { #define A(i_, j_) (A + (i_) + (j_)*lda) #ifdef HAVE_clBLAS #define dA(i_, j_) dA, ((i_) + (j_)*ldda) #else #define dA(i_, j_) (dA + (i_) + (j_)*ldda) #endif /* Constants */ const double c_one = MAGMA_D_ONE; const double c_neg_one = MAGMA_D_NEG_ONE; const double d_one = 1.0; const double d_neg_one = -1.0; /* Local variables */ const char* uplo_ = lapack_uplo_const( uplo ); bool upper = (uplo == MagmaUpper); magma_int_t j, jb, ldda, nb; magmaDouble_ptr dA = NULL; /* Check arguments */ *info = 0; if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return */ if ( n == 0 ) return *info; nb = magma_get_dpotrf_nb( n ); if (nb <= 1 || nb >= n) { lapackf77_dpotrf( uplo_, &n, A, &lda, info ); } else { /* Use hybrid blocked code. */ ldda = magma_roundup( n, 32 ); magma_int_t ngpu = magma_num_gpus(); if ( ngpu > 1 ) { /* call multi-GPU non-GPU-resident interface */ return magma_dpotrf_m( ngpu, uplo, n, A, lda, info ); } if (MAGMA_SUCCESS != magma_dmalloc( &dA, n*ldda )) { /* alloc failed so call the non-GPU-resident version */ return magma_dpotrf_m( ngpu, uplo, n, A, lda, info ); } magma_queue_t queues[2] = { NULL, NULL }; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[0] ); magma_queue_create( cdev, &queues[1] ); if (upper) { /* Compute the Cholesky factorization A = U'*U.
*/ for (j=0; j < n; j += nb) { /* Update and factorize the current diagonal block and test for non-positive-definiteness. */ jb = min( nb, n-j ); magma_dsetmatrix_async( jb, n-j, A(j, j), lda, dA(j, j), ldda, queues[1] ); magma_dsyrk( MagmaUpper, MagmaConjTrans, jb, j, d_neg_one, dA(0, j), ldda, d_one, dA(j, j), ldda, queues[1] ); magma_queue_sync( queues[1] ); magma_dgetmatrix_async( jb, jb, dA(j, j), ldda, A(j, j), lda, queues[0] ); if (j+jb < n) { magma_dgemm( MagmaConjTrans, MagmaNoTrans, jb, n-j-jb, j, c_neg_one, dA(0, j ), ldda, dA(0, j+jb), ldda, c_one, dA(j, j+jb), ldda, queues[1] ); } magma_queue_sync( queues[0] ); // this could be on any queue; it isn't needed until exit. magma_dgetmatrix_async( j, jb, dA(0, j), ldda, A(0, j), lda, queues[0] ); lapackf77_dpotrf( MagmaUpperStr, &jb, A(j, j), &lda, info ); if (*info != 0) { *info = *info + j; break; } magma_dsetmatrix_async( jb, jb, A(j, j), lda, dA(j, j), ldda, queues[0] ); magma_queue_sync( queues[0] ); if (j+jb < n) { magma_dtrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, n-j-jb, c_one, dA(j, j ), ldda, dA(j, j+jb), ldda, queues[1] ); } } } else { // used for timing the CPU and the GPU int iter = 0; float cpu_time = 0.0; float gpu_time = 0.0; double gpu_iter1_low = 2103.143311; double gpu_iter1_high = 754.506104; double cpu_iter1_low = 794.636108; double cpu_iter1_high = 600.295227; double gpu_pred_high = gpu_iter1_high; double gpu_pred_low = gpu_iter1_low; double cpu_pred_high = cpu_iter1_high; double cpu_pred_low = cpu_iter1_low; double ratio_split_freq = 0; double time_until_interrupt = 0; cudaEvent_t start_cpu, stop_cpu; cudaEvent_t start_gpu, stop_gpu; // switches for different modes bool timing = false; // for initial setting only; greatly impacts performance bool dvfs = false; // turn on DVFS energy saving bool relax = false; // turn on the relaxed scheme bool r2h = false; // turn on race-to-halt // these parameters need to be tuned in future work. double dvfs_coverage = 0.5; double prediction_offset_gpu = 0.65; double prediction_offset_cpu = 0.65; //========================================================= // Compute the Cholesky factorization A = L*L'. for (j=0; j < n; j += nb) { // Update and factorize the current diagonal block and test // for non-positive-definiteness.
jb = min( nb, n-j ); magma_dsetmatrix_async( n-j, jb, A(j, j), lda, dA(j, j), ldda, queues[1] ); magma_dsyrk( MagmaLower, MagmaNoTrans, jb, j, d_neg_one, dA(j, 0), ldda, d_one, dA(j, j), ldda, queues[1] ); magma_queue_sync( queues[1] ); magma_dgetmatrix_async( jb, jb, dA(j,j), ldda, A(j,j), lda, queues[0] ); if (timing) { // start gpu timing cudaEventCreate(&start_gpu); cudaEventCreate(&stop_gpu); cudaEventRecord(start_gpu, 0); } if (j+jb < n) { magma_dgemm( MagmaNoTrans, MagmaConjTrans, n-j-jb, jb, j, c_neg_one, dA(j+jb, 0), ldda, dA(j, 0), ldda, c_one, dA(j+jb, j), ldda, queues[1] ); } double ratio_slack_pred = 1.0 - (double)nb/(n-iter*nb); cpu_pred_high = cpu_pred_high * ratio_slack_pred; cpu_pred_low = cpu_pred_low * ratio_slack_pred; gpu_pred_high = gpu_pred_high * ratio_slack_pred * ratio_slack_pred; gpu_pred_low = gpu_pred_low * ratio_slack_pred * ratio_slack_pred; if (timing) { printf("iter:%d GPU time pred:%f\n", iter, gpu_pred_high); printf("iter:%d CPU time pred:%f\n", iter, cpu_pred_high); } if (iter < dvfs_coverage*(n/nb)) { if (cpu_pred_high > gpu_pred_high) { // slack on GPU ratio_split_freq = (cpu_pred_high - gpu_pred_high) / (gpu_pred_high * ((gpu_iter1_low / gpu_iter1_high) - 1)); time_until_interrupt = gpu_pred_low * ratio_split_freq; //printf("iter:%d time_until_interrupt:%f\n", iter, time_until_interrupt); //printf("iter:%d ratio_split_freq:%f\n", iter, ratio_split_freq); if (dvfs) { if ((!relax) || (relax && ratio_split_freq > 0.05)) { if (ratio_split_freq < 1) dvfs_adjust(time_until_interrupt*prediction_offset_gpu, 'g'); else dvfs_adjust(cpu_pred_high, 'g'); } } else if (r2h) { r2h_adjust(gpu_pred_high, cpu_pred_high - gpu_pred_high, 'g'); } } else { // slack on CPU ratio_split_freq = (gpu_pred_high - cpu_pred_high) / (cpu_pred_high * ((cpu_iter1_low / cpu_iter1_high) - 1)); time_until_interrupt = cpu_pred_low * ratio_split_freq; if (dvfs) { if ((!relax) || (relax && ratio_split_freq > 0.05)) { if (ratio_split_freq < 1) dvfs_adjust(time_until_interrupt*prediction_offset_cpu, 'c'); else dvfs_adjust(gpu_pred_high, 'c'); } } else if (r2h) { r2h_adjust(cpu_pred_high, gpu_pred_high - cpu_pred_high, 'c'); } } } if (timing) { // end gpu timing cudaEventRecord(stop_gpu, 0); cudaEventSynchronize(stop_gpu); cudaEventElapsedTime(&gpu_time, start_gpu, stop_gpu); cudaEventDestroy(start_gpu); cudaEventDestroy(stop_gpu); //printf("iter:%d GPU time:%f\n", iter, gpu_time); } magma_queue_sync( queues[0] ); // this could be on any queue; it isn't needed until exit. magma_dgetmatrix_async( jb, j, dA(j, 0), ldda, A(j, 0), lda, queues[0] ); if (timing) { // start cpu timing cudaEventCreate(&start_cpu); cudaEventCreate(&stop_cpu); cudaEventRecord(start_cpu, 0); } lapackf77_dpotrf( MagmaLowerStr, &jb, A(j, j), &lda, info ); if (timing) { // end cpu timing cudaEventRecord(stop_cpu, 0); cudaEventSynchronize(stop_cpu); cudaEventElapsedTime(&cpu_time, start_cpu, stop_cpu); cudaEventDestroy(start_cpu); cudaEventDestroy(stop_cpu); //printf("iter:%d CPU time:%f\n", iter, cpu_time); //if (gpu_time < cpu_time) { printf("slack: +\n"); } else { printf("slack: -\n"); } } if (*info != 0) { *info = *info + j; break; } magma_dsetmatrix_async( jb, jb, A(j, j), lda, dA(j, j), ldda, queues[0] ); magma_queue_sync( queues[0] ); if (j+jb < n) { magma_dtrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, n-j-jb, jb, c_one, dA(j, j), ldda, dA(j+jb, j), ldda, queues[1] ); } iter++; } } magma_queue_destroy( queues[0] ); magma_queue_destroy( queues[1] ); magma_free( dA ); } return *info; } /* magma_dpotrf */
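The slack model driving the DVFS decisions above can be read off the recurrence in the loop: panel (CPU) time shrinks roughly linearly with the remaining matrix size, and the trailing update (GPU) time roughly quadratically. A self-contained sketch of just that model follows; this is the editor's interpretation of the recurrence, not MAGMA code, with the first-iteration timings supplied as inputs.

// Editor's sketch of the per-iteration time-prediction model used above.
// cpu_iter1 / gpu_iter1 are the measured times of the first iteration.
void predict_dvfs_slack( int n, int nb, double cpu_iter1, double gpu_iter1 )
{
    double cpu_pred = cpu_iter1;
    double gpu_pred = gpu_iter1;
    for (int iter = 1; iter < n/nb; ++iter) {
        // fraction of the trailing matrix that survives this step
        double r = 1.0 - (double)nb / (n - (iter-1)*nb);
        cpu_pred *= r;      // panel factorization ~ remaining rows
        gpu_pred *= r*r;    // trailing update ~ (remaining size)^2
        // If cpu_pred > gpu_pred the GPU has slack (and vice versa), so the
        // idle side can be clocked down for roughly the difference.
    }
}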
extern "C" magma_int_t magma_dgetrf_gpu(magma_int_t m, magma_int_t n, double *dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. LDDA (input) INTEGER The leading dimension of the array A. LDDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ #define inAT(i,j) (dAT + (i)*nb*lddat + (j)*nb) double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t iinfo, nb; magma_int_t maxm, maxn, mindim; magma_int_t i, rows, cols, s, lddat, lddwork; double *dAT, *dAP, *work; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); nb = magma_get_dgetrf_nb(m); s = mindim / nb; if (nb <= 1 || nb >= min(m,n)) { /* Use CPU code. */ magma_dmalloc_cpu( &work, m * n ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_dgetmatrix( m, n, dA, ldda, work, m ); lapackf77_dgetrf(&m, &n, work, &m, ipiv, info); magma_dsetmatrix( m, n, work, m, dA, ldda ); magma_free_cpu(work); } else { /* Use hybrid blocked code. */ maxm = ((m + 31)/32)*32; maxn = ((n + 31)/32)*32; lddat = maxn; lddwork = maxm; dAT = dA; if (MAGMA_SUCCESS != magma_dmalloc( &dAP, nb*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } if ((m == n) && (m % 32 == 0) && (ldda%32 == 0)){ lddat = ldda; magmablas_dinplace_transpose( dAT, ldda, m); } else { if (MAGMA_SUCCESS != magma_dmalloc( &dAT, maxm*maxn )) { magma_free( dAP ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magmablas_dtranspose2( dAT, lddat, dA, ldda, m, n ); } if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, maxm*nb )) { magma_free( dAP ); if (! 
((m == n) && (m % 32 == 0) && (ldda%32 == 0)) ) magma_free( dAT ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } for( i=0; i<s; i++ ) { // download i-th panel cols = maxm - i*nb; magmablas_dtranspose( dAP, cols, inAT(i,i), lddat, nb, cols ); magma_dgetmatrix( m-i*nb, nb, dAP, cols, work, lddwork ); // make sure that gpu queue is empty magma_device_sync(); if ( i>0 ){ magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n - (i+1)*nb, nb, c_one, inAT(i-1,i-1), lddat, inAT(i-1,i+1), lddat ); magma_dgemm( MagmaNoTrans, MagmaNoTrans, n-(i+1)*nb, m-i*nb, nb, c_neg_one, inAT(i-1,i+1), lddat, inAT(i, i-1), lddat, c_one, inAT(i, i+1), lddat ); } // do the cpu part rows = m - i*nb; lapackf77_dgetrf( &rows, &nb, work, &lddwork, ipiv+i*nb, &iinfo); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + i*nb; magmablas_dpermute_long2( n, dAT, lddat, ipiv, nb, i*nb ); // upload i-th panel magma_dsetmatrix( m-i*nb, nb, work, lddwork, dAP, maxm ); magmablas_dtranspose(inAT(i,i), lddat, dAP, maxm, cols, nb); // do the small non-parallel computations if ( s > (i+1) ) { magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb, nb, c_one, inAT(i, i ), lddat, inAT(i, i+1), lddat); magma_dgemm( MagmaNoTrans, MagmaNoTrans, nb, m-(i+1)*nb, nb, c_neg_one, inAT(i, i+1), lddat, inAT(i+1, i ), lddat, c_one, inAT(i+1, i+1), lddat ); } else { magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb, nb, c_one, inAT(i, i ), lddat, inAT(i, i+1), lddat); magma_dgemm( MagmaNoTrans, MagmaNoTrans, n-(i+1)*nb, m-(i+1)*nb, nb, c_neg_one, inAT(i, i+1), lddat, inAT(i+1, i ), lddat, c_one, inAT(i+1, i+1), lddat ); } } magma_int_t nb0 = min(m - s*nb, n - s*nb); rows = m - s*nb; cols = maxm - s*nb; magmablas_dtranspose2( dAP, maxm, inAT(s,s), lddat, nb0, rows); magma_dgetmatrix( rows, nb0, dAP, maxm, work, lddwork ); // make sure that gpu queue is empty magma_device_sync(); // do the cpu part lapackf77_dgetrf( &rows, &nb0, work, &lddwork, ipiv+s*nb, &iinfo); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + s*nb; magmablas_dpermute_long2( n, dAT, lddat, ipiv, nb0, s*nb ); // upload i-th panel magma_dsetmatrix( rows, nb0, work, lddwork, dAP, maxm ); magmablas_dtranspose2( inAT(s,s), lddat, dAP, maxm, rows, nb0); magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb-nb0, nb0, c_one, inAT(s,s), lddat, inAT(s,s)+nb0, lddat); if ((m == n) && (m % 32 == 0) && (ldda%32 == 0)){ magmablas_dinplace_transpose( dAT, lddat, m ); } else { magmablas_dtranspose2( dA, ldda, dAT, lddat, n, m ); magma_free( dAT ); } magma_free( dAP ); magma_free_pinned( work ); } return *info; /* End of MAGMA_DGETRF_GPU */ }
/** Purpose ------- DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The factorization has the form A = U**H * U, if uplo = MagmaUpper, or A = L * L**H, if uplo = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. If the current stream is NULL, this version replaces it with a new stream to overlap computation with communication. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A DOUBLE_PRECISION array, dimension (LDA,N) On entry, the symmetric matrix A. If uplo = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If uplo = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization A = U**H * U or A = L * L**H. \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as memory allocation failed. - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_dposv_comp ********************************************************************/ extern "C" magma_int_t magma_dpotrf( magma_uplo_t uplo, magma_int_t n, double *A, magma_int_t lda, magma_int_t *info) { #define A(i_, j_) (A + (j_)*lda + (i_)) #define dA(i_, j_) (dA + (j_)*ldda + (i_)) /* Local variables */ const char* uplo_ = lapack_uplo_const( uplo ); magma_int_t ldda, nb; magma_int_t j, jb; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; magmaDouble_ptr dA; double d_one = 1.0; double d_neg_one = -1.0; int upper = (uplo == MagmaUpper); *info = 0; if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return */ if ( n == 0 ) return *info; magma_int_t ngpu = magma_num_gpus(); if ( ngpu > 1 ) { /* call multiple-GPU interface */ return magma_dpotrf_m(ngpu, uplo, n, A, lda, info); } ldda = ((n+31)/32)*32; if (MAGMA_SUCCESS != magma_dmalloc( &dA, (n)*ldda )) { /* alloc failed so call the non-GPU-resident version */ return magma_dpotrf_m(ngpu, uplo, n, A, lda, info); } /* Define user stream if current stream is NULL */ magma_queue_t stream[3]; magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); magma_queue_create( &stream[0] ); magma_queue_create( &stream[2] ); if (orig_stream == NULL) { magma_queue_create( &stream[1] ); magmablasSetKernelStream(stream[1]); } else { stream[1] = orig_stream; } nb = magma_get_dpotrf_nb(n); if (nb <= 1 || nb >= n) { lapackf77_dpotrf(uplo_, &n, A, &lda, info); } else { /* Use hybrid blocked code. */ if (upper) { /* Compute the Cholesky factorization A = U'*U.
*/ for (j=0; j < n; j += nb) { /* Update and factorize the current diagonal block and test for non-positive-definiteness. Computing MIN */ jb = min(nb, (n-j)); magma_dsetmatrix_async( jb, (n-j), A(j, j), lda, dA(j, j), ldda, stream[1]); magma_dsyrk(MagmaUpper, MagmaConjTrans, jb, j, d_neg_one, dA(0, j), ldda, d_one, dA(j, j), ldda); magma_queue_sync( stream[1] ); magma_dgetmatrix_async( jb, jb, dA(j, j), ldda, A(j, j), lda, stream[0] ); if ( (j+jb) < n) { magma_dgemm(MagmaConjTrans, MagmaNoTrans, jb, (n-j-jb), j, c_neg_one, dA(0, j ), ldda, dA(0, j+jb), ldda, c_one, dA(j, j+jb), ldda); } magma_dgetmatrix_async( j, jb, dA(0, j), ldda, A (0, j), lda, stream[2] ); magma_queue_sync( stream[0] ); lapackf77_dpotrf(MagmaUpperStr, &jb, A(j, j), &lda, info); if (*info != 0) { *info = *info + j; break; } magma_dsetmatrix_async( jb, jb, A(j, j), lda, dA(j, j), ldda, stream[0] ); magma_queue_sync( stream[0] ); if ( (j+jb) < n ) { magma_dtrsm(MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, (n-j-jb), c_one, dA(j, j ), ldda, dA(j, j+jb), ldda); } } } else { //========================================================= // Compute the Cholesky factorization A = L*L'. for (j=0; j < n; j += nb) { // Update and factorize the current diagonal block and test // for non-positive-definiteness. Computing MIN jb = min(nb, (n-j)); magma_dsetmatrix_async( (n-j), jb, A(j, j), lda, dA(j, j), ldda, stream[1]); magma_dsyrk(MagmaLower, MagmaNoTrans, jb, j, d_neg_one, dA(j, 0), ldda, d_one, dA(j, j), ldda); magma_queue_sync( stream[1] ); magma_dgetmatrix_async( jb, jb, dA(j,j), ldda, A(j,j), lda, stream[0] ); if ( (j+jb) < n) { magma_dgemm( MagmaNoTrans, MagmaConjTrans, (n-j-jb), jb, j, c_neg_one, dA(j+jb, 0), ldda, dA(j, 0), ldda, c_one, dA(j+jb, j), ldda); } magma_dgetmatrix_async( jb, j, dA(j, 0), ldda, A(j, 0), lda, stream[2] ); magma_queue_sync( stream[0] ); lapackf77_dpotrf(MagmaLowerStr, &jb, A(j, j), &lda, info); if (*info != 0) { *info = *info + j; break; } magma_dsetmatrix_async( jb, jb, A(j, j), lda, dA(j, j), ldda, stream[0] ); magma_queue_sync( stream[0] ); if ( (j+jb) < n) { magma_dtrsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, (n-j-jb), jb, c_one, dA(j, j), ldda, dA(j+jb, j), ldda); } } } } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[2] ); if (orig_stream == NULL) { magma_queue_destroy( stream[1] ); } magmablasSetKernelStream( orig_stream ); magma_free( dA ); return *info; } /* magma_dpotrf */
/** Purpose ------- DSYGVDX computes selected eigenvalues and, optionally, eigenvectors of a real generalized symmetric-definite eigenproblem, of the form A*x=(lambda)*B*x, A*Bx=(lambda)*x, or B*A*x=(lambda)*x. Here A and B are assumed to be symmetric and B is also positive definite. Eigenvalues and eigenvectors can be selected by specifying either a range of values or a range of indices for the desired eigenvalues. If eigenvectors are desired, it uses a divide and conquer algorithm. The divide and conquer algorithm makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments --------- @param[in] itype INTEGER Specifies the problem type to be solved: = 1: A*x = (lambda)*B*x = 2: A*B*x = (lambda)*x = 3: B*A*x = (lambda)*x @param[in] range magma_range_t - = MagmaRangeAll: all eigenvalues will be found. - = MagmaRangeV: all eigenvalues in the half-open interval (VL,VU] will be found. - = MagmaRangeI: the IL-th through IU-th eigenvalues will be found. @param[in] jobz magma_vec_t - = MagmaNoVec: Compute eigenvalues only; - = MagmaVec: Compute eigenvalues and eigenvectors. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangles of A and B are stored; - = MagmaLower: Lower triangles of A and B are stored. @param[in] n INTEGER The order of the matrices A and B. N >= 0. @param[in,out] A DOUBLE PRECISION array, dimension (LDA, N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. \n On exit, if JOBZ = MagmaVec, then if INFO = 0, A contains the matrix Z of eigenvectors. The eigenvectors are normalized as follows: if ITYPE = 1 or 2, Z**T * B * Z = I; if ITYPE = 3, Z**T * inv(B) * Z = I. If JOBZ = MagmaNoVec, then on exit the upper triangle (if UPLO=MagmaUpper) or the lower triangle (if UPLO=MagmaLower) of A, including the diagonal, is destroyed. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[in,out] B DOUBLE PRECISION array, dimension (LDB, N) On entry, the symmetric matrix B. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of B contains the upper triangular part of the matrix B. If UPLO = MagmaLower, the leading N-by-N lower triangular part of B contains the lower triangular part of the matrix B. \n On exit, if INFO <= N, the part of B containing the matrix is overwritten by the triangular factor U or L from the Cholesky factorization B = U**T * U or B = L * L**T. @param[in] ldb INTEGER The leading dimension of the array B. LDB >= max(1,N). @param[in] vl DOUBLE PRECISION @param[in] vu DOUBLE PRECISION If RANGE=MagmaRangeV, the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = MagmaRangeAll or MagmaRangeI. @param[in] il INTEGER @param[in] iu INTEGER If RANGE=MagmaRangeI, the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = MagmaRangeAll or MagmaRangeV. @param[out] mout INTEGER The total number of eigenvalues found. 0 <= MOUT <= N. 
If RANGE = MagmaRangeAll, MOUT = N, and if RANGE = MagmaRangeI, MOUT = IU-IL+1. @param[out] w DOUBLE PRECISION array, dimension (N) If INFO = 0, the eigenvalues in ascending order. @param[out] work (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. If N <= 1, LWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LWORK >= 2*N + N*NB. If JOBZ = MagmaVec and N > 1, LWORK >= max( 2*N + N*NB, 1 + 6*N + 2*N**2 ). NB can be obtained through magma_get_dsytrd_nb(N). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK and IWORK arrays, returns these values as the first entries of the WORK and IWORK arrays, and no error message related to LWORK or LIWORK is issued by XERBLA. @param[out] iwork (workspace) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK. @param[in] liwork INTEGER The dimension of the array IWORK. If N <= 1, LIWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LIWORK >= 1. If JOBZ = MagmaVec and N > 1, LIWORK >= 3 + 5*N. \n If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK and IWORK arrays, returns these values as the first entries of the WORK and IWORK arrays, and no error message related to LWORK or LIWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: DPOTRF or DSYEVD returned an error code: <= N: if INFO = i and JOBZ = MagmaNoVec, then the algorithm failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; if INFO = i and JOBZ = MagmaVec, then the algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1); > N: if INFO = N + i, for 1 <= i <= N, then the leading minor of order i of B is not positive definite. The factorization of B could not be completed and no eigenvalues or eigenvectors were computed. Further Details --------------- Based on contributions by Mark Fahey, Department of Mathematics, Univ. of Kentucky, USA Modified so that no backsubstitution is performed if DSYEVD fails to converge (NEIG in old code could be greater than N causing out of bounds reference to A - reported by Ralf Meyer). Also corrected the description of INFO and the test on ITYPE. Sven, 16 Feb 05.
@ingroup magma_dsygv_driver ********************************************************************/ extern "C" magma_int_t magma_dsygvdx( magma_int_t itype, magma_vec_t jobz, magma_range_t range, magma_uplo_t uplo, magma_int_t n, double *A, magma_int_t lda, double *B, magma_int_t ldb, double vl, double vu, magma_int_t il, magma_int_t iu, magma_int_t *mout, double *w, double *work, magma_int_t lwork, #ifdef COMPLEX double *rwork, magma_int_t lrwork, #endif magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { const char* uplo_ = lapack_uplo_const( uplo ); const char* jobz_ = lapack_vec_const( jobz ); double d_one = MAGMA_D_ONE; double *dA=NULL, *dB=NULL; magma_int_t ldda = magma_roundup( n, 32 ); magma_int_t lddb = ldda; magma_int_t lower; magma_trans_t trans; magma_int_t wantz, lquery; magma_int_t alleig, valeig, indeig; magma_int_t lwmin, liwmin; wantz = (jobz == MagmaVec); lower = (uplo == MagmaLower); alleig = (range == MagmaRangeAll); valeig = (range == MagmaRangeV); indeig = (range == MagmaRangeI); lquery = (lwork == -1 || liwork == -1); *info = 0; if (itype < 1 || itype > 3) { *info = -1; } else if (! (alleig || valeig || indeig)) { *info = -2; } else if (! (wantz || (jobz == MagmaNoVec))) { *info = -3; } else if (! (lower || (uplo == MagmaUpper))) { *info = -4; } else if (n < 0) { *info = -5; } else if (lda < max(1,n)) { *info = -7; } else if (ldb < max(1,n)) { *info = -9; } else { if (valeig) { if (n > 0 && vu <= vl) { *info = -11; } } else if (indeig) { if (il < 1 || il > max(1,n)) { *info = -12; } else if (iu < min(n,il) || iu > n) { *info = -13; } } } magma_int_t nb = magma_get_dsytrd_nb( n ); if ( n <= 1 ) { lwmin = 1; liwmin = 1; } else if ( wantz ) { lwmin = max( 2*n + n*nb, 1 + 6*n + 2*n*n ); liwmin = 3 + 5*n; } else { lwmin = 2*n + n*nb; liwmin = 1; } work[0] = magma_dmake_lwork( lwmin ); iwork[0] = liwmin; if (lwork < lwmin && ! lquery) { *info = -17; } else if (liwork < liwmin && ! lquery) { *info = -19; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } /* If matrix is very small, then just call LAPACK on CPU, no need for GPU */ if (n <= 128) { lapackf77_dsygvd( &itype, jobz_, uplo_, &n, A, &lda, B, &ldb, w, work, &lwork, iwork, &liwork, info ); *mout = n; return *info; } if (MAGMA_SUCCESS != magma_dmalloc( &dA, n*ldda ) || MAGMA_SUCCESS != magma_dmalloc( &dB, n*lddb )) { magma_free( dA ); magma_free( dB ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); /* Form a Cholesky factorization of B. */ magma_dsetmatrix( n, n, B, ldb, dB, lddb, queue ); magma_dsetmatrix_async( n, n, A, lda, dA, ldda, queue ); magma_timer_t time=0; timer_start( time ); magma_dpotrf_gpu( uplo, n, dB, lddb, info ); if (*info != 0) { *info = n + *info; return *info; } timer_stop( time ); timer_printf( "time dpotrf_gpu = %6.2f\n", time ); magma_queue_sync( queue ); magma_dgetmatrix_async( n, n, dB, lddb, B, ldb, queue ); timer_start( time ); /* Transform problem to standard eigenvalue problem and solve. */ magma_dsygst_gpu( itype, uplo, n, dA, ldda, dB, lddb, info ); timer_stop( time ); timer_printf( "time dsygst_gpu = %6.2f\n", time ); /* simple fix to be able to run bigger size. * set dB=NULL so we know to re-allocate below * TODO: have dwork here that will be used as dB and then passed to dsyevd. 
*/ if (n > 5000) { magma_queue_sync( queue ); magma_free( dB ); dB=NULL; } timer_start( time ); magma_dsyevdx_gpu( jobz, range, uplo, n, dA, ldda, vl, vu, il, iu, mout, w, A, lda, work, lwork, iwork, liwork, info ); timer_stop( time ); timer_printf( "time dsyevdx_gpu = %6.2f\n", time ); if (wantz && *info == 0) { timer_start( time ); /* allocate and copy dB back */ if (dB == NULL) { if (MAGMA_SUCCESS != magma_dmalloc( &dB, n*lddb ) ) { magma_free( dA ); dA=NULL; *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_dsetmatrix( n, n, B, ldb, dB, lddb, queue ); } /* Backtransform eigenvectors to the original problem. */ if (itype == 1 || itype == 2) { /* For A*x=(lambda)*B*x and A*B*x=(lambda)*x; backtransform eigenvectors: x = inv(L)'*y or inv(U)*y */ if (lower) { trans = MagmaTrans; } else { trans = MagmaNoTrans; } magma_dtrsm( MagmaLeft, uplo, trans, MagmaNonUnit, n, *mout, d_one, dB, lddb, dA, ldda, queue ); } else if (itype == 3) { /* For B*A*x=(lambda)*x; backtransform eigenvectors: x = L*y or U'*y */ if (lower) { trans = MagmaNoTrans; } else { trans = MagmaTrans; } magma_dtrmm( MagmaLeft, uplo, trans, MagmaNonUnit, n, *mout, d_one, dB, lddb, dA, ldda, queue ); } magma_dgetmatrix( n, *mout, dA, ldda, A, lda, queue ); timer_stop( time ); timer_printf( "time dtrsm/mm + getmatrix = %6.2f\n", time ); } magma_queue_sync( queue ); magma_queue_destroy( queue ); work[0] = magma_dmake_lwork( lwmin ); iwork[0] = liwmin; magma_free( dA ); dA=NULL; magma_free( dB ); dB=NULL; return *info; } /* magma_dsygvdx */
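The LWORK = -1 / LIWORK = -1 workspace-query protocol described above is typically driven in two phases, as in the following editor's sketch (itype 1, all eigenpairs, lower storage are arbitrary choices; names are illustrative):

#include <stdlib.h>
#include "magma.h"

// Editor's sketch of the query-then-solve call sequence for magma_dsygvdx.
magma_int_t example_dsygvdx( magma_int_t n, double *A, magma_int_t lda,
                             double *B, magma_int_t ldb, double *w )
{
    magma_int_t info, mout, iwk;
    double wk;
    // Phase 1: workspace query; optimal sizes come back in wk and iwk.
    magma_dsygvdx( 1, MagmaVec, MagmaRangeAll, MagmaLower, n,
                   A, lda, B, ldb, 0., 0., 0, 0, &mout, w,
                   &wk, -1, &iwk, -1, &info );
    magma_int_t lwork  = (magma_int_t) wk;
    magma_int_t liwork = iwk;
    double      *work  = (double*)      malloc( lwork  * sizeof(double) );
    magma_int_t *iwork = (magma_int_t*) malloc( liwork * sizeof(magma_int_t) );
    // Phase 2: solve A*x = lambda*B*x for all eigenpairs.
    magma_dsygvdx( 1, MagmaVec, MagmaRangeAll, MagmaLower, n,
                   A, lda, B, ldb, 0., 0., 0, 0, &mout, w,
                   work, lwork, iwork, liwork, &info );
    free( work );
    free( iwork );
    return info;
}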
/** Purpose ------- DSYGST reduces a real symmetric-definite generalized eigenproblem to standard form. If ITYPE = 1, the problem is A*x = lambda*B*x, and A is overwritten by inv(U^H)*A*inv(U) or inv(L)*A*inv(L^H) If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or B*A*x = lambda*x, and A is overwritten by U*A*U^H or L^H*A*L. B must have been previously factorized as U^H*U or L*L^H by DPOTRF. Arguments --------- @param[in] itype INTEGER = 1: compute inv(U^H)*A*inv(U) or inv(L)*A*inv(L^H); = 2 or 3: compute U*A*U^H or L^H*A*L. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored and B is factored as U^H*U; - = MagmaLower: Lower triangle of A is stored and B is factored as L*L^H. @param[in] n INTEGER The order of the matrices A and B. N >= 0. @param[in,out] A DOUBLE PRECISION array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, if INFO = 0, the transformed matrix, stored in the same format as A. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[in,out] B DOUBLE PRECISION array, dimension (LDB,N) The triangular factor from the Cholesky factorization of B, as returned by DPOTRF. B is modified by the routine but restored on exit (in lapack dsygst/dsygs2). @param[in] ldb INTEGER The leading dimension of the array B. LDB >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_dsyev_comp ********************************************************************/ extern "C" magma_int_t magma_dsygst( magma_int_t itype, magma_uplo_t uplo, magma_int_t n, double *A, magma_int_t lda, double *B, magma_int_t ldb, magma_int_t *info) { #define A(i_, j_) (A + (i_) + (j_)*lda) #define B(i_, j_) (B + (i_) + (j_)*ldb) #define dA(i_, j_) (dwork + (i_) + (j_)*ldda ) #define dB(i_, j_) (dwork + (i_) + (j_)*lddb + n*ldda) /* Constants */ const double c_one = MAGMA_D_ONE; const double c_neg_one = MAGMA_D_NEG_ONE; const double c_half = MAGMA_D_HALF; const double c_neg_half = MAGMA_D_NEG_HALF; const double d_one = 1.0; /* Local variables */ const char* uplo_ = lapack_uplo_const( uplo ); magma_int_t k, kb, kb2, nb; magma_int_t ldda = n; magma_int_t lddb = n; magmaDouble_ptr dwork; bool upper = (uplo == MagmaUpper); /* Test the input parameters. */ *info = 0; if (itype < 1 || itype > 3) { *info = -1; } else if (! 
upper && uplo != MagmaLower) { *info = -2; } else if (n < 0) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if (ldb < max(1,n)) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return */ if ( n == 0 ) return *info; if (MAGMA_SUCCESS != magma_dmalloc( &dwork, 2*n*n )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } nb = magma_get_dsygst_nb( n ); magma_queue_t queues[2]; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[0] ); magma_queue_create( cdev, &queues[1] ); magma_dsetmatrix( n, n, A(0, 0), lda, dA(0, 0), ldda, queues[1] ); magma_dsetmatrix( n, n, B(0, 0), ldb, dB(0, 0), lddb, queues[1] ); /* Use hybrid blocked code */ if (itype == 1) { if (upper) { /* Compute inv(U^H)*A*inv(U) */ for (k = 0; k < n; k += nb) { kb = min( n-k, nb ); kb2 = min( n-k-nb, nb ); /* Update the upper triangle of A(k:n,k:n) */ lapackf77_dsygst( &itype, uplo_, &kb, A(k,k), &lda, B(k,k), &ldb, info ); magma_dsetmatrix_async( kb, kb, A(k, k), lda, dA(k, k), ldda, queues[0] ); if (k+kb < n) { magma_dtrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, kb, n-k-kb, c_one, dB(k,k), lddb, dA(k,k+kb), ldda, queues[1] ); magma_queue_sync( queues[0] ); // finish set dA(k,k) magma_dsymm( MagmaLeft, MagmaUpper, kb, n-k-kb, c_neg_half, dA(k,k), ldda, dB(k,k+kb), lddb, c_one, dA(k,k+kb), ldda, queues[1] ); magma_dsyr2k( MagmaUpper, MagmaConjTrans, n-k-kb, kb, c_neg_one, dA(k,k+kb), ldda, dB(k,k+kb), lddb, d_one, dA(k+kb,k+kb), ldda, queues[1] ); // Start copying next A block magma_queue_sync( queues[1] ); magma_dgetmatrix_async( kb2, kb2, dA(k+kb, k+kb), ldda, A(k+kb, k+kb), lda, queues[0] ); magma_dsymm( MagmaLeft, MagmaUpper, kb, n-k-kb, c_neg_half, dA(k,k), ldda, dB(k,k+kb), lddb, c_one, dA(k,k+kb), ldda, queues[1] ); magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, kb, n-k-kb, c_one, dB(k+kb,k+kb), lddb, dA(k,k+kb), ldda, queues[1] ); magma_queue_sync( queues[0] ); // finish get A(k+kb,k+kb) } } } else { /* Compute inv(L)*A*inv(L^H) */ for (k = 0; k < n; k += nb) { kb = min( n-k, nb ); kb2 = min( n-k-nb, nb ); /* Update the lower triangle of A(k:n,k:n) */ lapackf77_dsygst( &itype, uplo_, &kb, A(k,k), &lda, B(k,k), &ldb, info ); magma_dsetmatrix_async( kb, kb, A(k, k), lda, dA(k, k), ldda, queues[0] ); if (k+kb < n) { magma_dtrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, n-k-kb, kb, c_one, dB(k,k), lddb, dA(k+kb,k), ldda, queues[1] ); magma_queue_sync( queues[0] ); // finish set dA(k,k) magma_dsymm( MagmaRight, MagmaLower, n-k-kb, kb, c_neg_half, dA(k,k), ldda, dB(k+kb,k), lddb, c_one, dA(k+kb, k), ldda, queues[1] ); magma_dsyr2k( MagmaLower, MagmaNoTrans, n-k-kb, kb, c_neg_one, dA(k+kb,k), ldda, dB(k+kb,k), lddb, d_one, dA(k+kb,k+kb), ldda, queues[1] ); // Start copying next A block magma_queue_sync( queues[1] ); magma_dgetmatrix_async( kb2, kb2, dA(k+kb, k+kb), ldda, A(k+kb, k+kb), lda, queues[0] ); magma_dsymm( MagmaRight, MagmaLower, n-k-kb, kb, c_neg_half, dA(k,k), ldda, dB(k+kb,k), lddb, c_one, dA(k+kb, k), ldda, queues[1] ); magma_dtrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNonUnit, n-k-kb, kb, c_one, dB(k+kb,k+kb), lddb, dA(k+kb,k), ldda, queues[1] ); magma_queue_sync( queues[0] ); // finish get A(k+kb,k+kb) } } } } else { // itype == 2 or 3 if (upper) { /* Compute U*A*U^H */ for (k = 0; k < n; k += nb) { kb = min( n-k, nb ); magma_dgetmatrix_async( kb, kb, dA(k, k), ldda, A(k, k), lda, queues[0] ); /* Update the upper triangle of A(1:k+kb-1,1:k+kb-1) */ if (k > 0) { magma_dtrmm( 
MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, k, kb, c_one, dB(0,0), lddb, dA(0,k), ldda, queues[1] ); magma_dsymm( MagmaRight, MagmaUpper, k, kb, c_half, dA(k,k), ldda, dB(0,k), lddb, c_one, dA(0,k), ldda, queues[1] ); magma_dsyr2k( MagmaUpper, MagmaNoTrans, k, kb, c_one, dA(0,k), ldda, dB(0,k), lddb, d_one, dA(0,0), ldda, queues[1] ); magma_dsymm( MagmaRight, MagmaUpper, k, kb, c_half, dA(k,k), ldda, dB(0,k), lddb, c_one, dA(0,k), ldda, queues[1] ); magma_dtrmm( MagmaRight, MagmaUpper, MagmaConjTrans, MagmaNonUnit, k, kb, c_one, dB(k,k), lddb, dA(0,k), ldda, queues[1] ); } magma_queue_sync( queues[0] ); // finish get A(k,k) lapackf77_dsygst( &itype, uplo_, &kb, A(k, k), &lda, B(k, k), &ldb, info ); // this could be done on a 3rd queue magma_dsetmatrix_async( kb, kb, A(k, k), lda, dA(k, k), ldda, queues[1] ); } } else { /* Compute L^H*A*L */ for (k = 0; k < n; k += nb) { kb = min( n-k, nb ); magma_dgetmatrix_async( kb, kb, dA(k, k), ldda, A(k, k), lda, queues[0] ); /* Update the lower triangle of A(1:k+kb-1,1:k+kb-1) */ if (k > 0) { magma_dtrmm( MagmaRight, MagmaLower, MagmaNoTrans, MagmaNonUnit, kb, k, c_one, dB(0,0), lddb, dA(k,0), ldda, queues[1] ); magma_dsymm( MagmaLeft, MagmaLower, kb, k, c_half, dA(k,k), ldda, dB(k,0), lddb, c_one, dA(k, 0), ldda, queues[1] ); magma_dsyr2k( MagmaLower, MagmaConjTrans, k, kb, c_one, dA(k,0), ldda, dB(k,0), lddb, d_one, dA(0,0), ldda, queues[1] ); magma_dsymm( MagmaLeft, MagmaLower, kb, k, c_half, dA(k,k), ldda, dB(k,0), lddb, c_one, dA(k, 0), ldda, queues[1] ); magma_dtrmm( MagmaLeft, MagmaLower, MagmaConjTrans, MagmaNonUnit, kb, k, c_one, dB(k,k), lddb, dA(k,0), ldda, queues[1] ); } magma_queue_sync( queues[0] ); // finish get A(k,k) lapackf77_dsygst( &itype, uplo_, &kb, A(k,k), &lda, B(k,k), &ldb, info ); // this could be done on a 3rd queue magma_dsetmatrix_async( kb, kb, A(k, k), lda, dA(k, k), ldda, queues[1] ); } } } magma_queue_sync( queues[0] ); // finish set dA(k,k) for itype 1 magma_dgetmatrix( n, n, dA(0, 0), ldda, A(0, 0), lda, queues[1] ); magma_queue_destroy( queues[0] ); magma_queue_destroy( queues[1] ); magma_free( dwork ); return *info; } /* magma_dsygst */
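magma_dsygst is the middle step of the generalized eigensolver pipeline that magma_dsygvdx implements above; for itype = 1 the sequence condenses to the following editor's sketch on the CPU interfaces (names illustrative):

// Editor's sketch: reduce A*x = lambda*B*x (itype = 1) to standard form.
magma_int_t example_dsygst( magma_int_t n, double *A, magma_int_t lda,
                            double *B, magma_int_t ldb )
{
    magma_int_t info;
    magma_dpotrf( MagmaLower, n, B, ldb, &info );   // B = L * L^T
    if (info != 0)
        return n + info;   // leading minor of B not positive definite
    magma_dsygst( 1, MagmaLower, n, A, lda, B, ldb, &info );
    // A now holds inv(L)*A*inv(L^T); solve it with a standard symmetric
    // eigensolver and backtransform eigenvectors with dtrsm, as
    // magma_dsygvdx does above.
    return info;
}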
extern "C" magma_int_t magma_dpotrf_gpu(char uplo, magma_int_t n, double *dA, magma_int_t ldda, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix dA. The factorization has the form dA = U**T * U, if UPLO = 'U', or dA = L * L**T, if UPLO = 'L', where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. If the current stream is NULL, this version replaces it with user defined stream to overlap computation with communication. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of dA is stored; = 'L': Lower triangle of dA is stored. N (input) INTEGER The order of the matrix dA. N >= 0. dA (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N) On entry, the symmetric matrix dA. If UPLO = 'U', the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**T * U or dA = L * L**T. LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be dividable by 16. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. ===================================================================== */ magma_int_t j, jb, nb; char uplo_[2] = {uplo, 0}; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double *work; double d_one = 1.0; double d_neg_one = -1.0; int upper = lapackf77_lsame(uplo_, "U"); *info = 0; if ( (! upper) && (! lapackf77_lsame(uplo_, "L")) ) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } nb = magma_get_dpotrf_nb(n); if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, nb*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* Define user stream if current stream is NULL */ cudaStream_t stream[2], current_stream; magmablasGetKernelStream(¤t_stream); magma_queue_create( &stream[0] ); if (current_stream == NULL) { magma_queue_create( &stream[1] ); magmablasSetKernelStream(stream[1]); } else stream[1] = current_stream; if ((nb <= 1) || (nb >= n)) { /* Use unblocked code. */ magma_dgetmatrix_async( n, n, dA, ldda, work, n, stream[1] ); magma_queue_sync( stream[1] ); lapackf77_dpotrf(uplo_, &n, work, &n, info); magma_dsetmatrix_async( n, n, work, n, dA, ldda, stream[1] ); } else { /* Use blocked code. */ if (upper) { /* Compute the Cholesky factorization A = U'*U. */ for (j=0; j<n; j+=nb) { /* Update and factorize the current diagonal block and test for non-positive-definiteness. Computing MIN */ jb = min(nb, (n-j)); magma_dsyrk(MagmaUpper, MagmaTrans, jb, j, d_neg_one, dA(0, j), ldda, d_one, dA(j, j), ldda); magma_queue_sync( stream[1] ); magma_dgetmatrix_async( jb, jb, dA(j, j), ldda, work, jb, stream[0] ); if ( (j+jb) < n) { /* Compute the current block row. 
*/ magma_dgemm(MagmaTrans, MagmaNoTrans, jb, (n-j-jb), j, c_neg_one, dA(0, j ), ldda, dA(0, j+jb), ldda, c_one, dA(j, j+jb), ldda); } magma_queue_sync( stream[0] ); lapackf77_dpotrf(MagmaUpperStr, &jb, work, &jb, info); magma_dsetmatrix_async( jb, jb, work, jb, dA(j, j), ldda, stream[1] ); if (*info != 0) { *info = *info + j; break; } if ( (j+jb) < n) { magma_dtrsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, jb, (n-j-jb), c_one, dA(j, j ), ldda, dA(j, j+jb), ldda); } } } else { //========================================================= // Compute the Cholesky factorization A = L*L'. for (j=0; j<n; j+=nb) { // Update and factorize the current diagonal block and test // for non-positive-definiteness. Computing MIN jb = min(nb, (n-j)); magma_dsyrk(MagmaLower, MagmaNoTrans, jb, j, d_neg_one, dA(j, 0), ldda, d_one, dA(j, j), ldda); magma_queue_sync( stream[1] ); magma_dgetmatrix_async( jb, jb, dA(j, j), ldda, work, jb, stream[0] ); if ( (j+jb) < n) { magma_dgemm( MagmaNoTrans, MagmaTrans, (n-j-jb), jb, j, c_neg_one, dA(j+jb, 0), ldda, dA(j, 0), ldda, c_one, dA(j+jb, j), ldda); } magma_queue_sync( stream[0] ); lapackf77_dpotrf(MagmaLowerStr, &jb, work, &jb, info); magma_dsetmatrix_async( jb, jb, work, jb, dA(j, j), ldda, stream[1] ); if (*info != 0) { *info = *info + j; break; } if ( (j+jb) < n) { magma_dtrsm(MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, (n-j-jb), jb, c_one, dA(j, j), ldda, dA(j+jb, j), ldda); } } } } magma_free_pinned( work ); magma_queue_destroy( stream[0] ); if (current_stream == NULL) { magma_queue_destroy( stream[1] ); magmablasSetKernelStream(NULL); } return *info; } /* magma_dpotrf_gpu */
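A call sketch for this older char-based interface follows, honoring the note that LDDA should be divisible by 16. This is an editor's example; the host array hA and its fill are illustrative.

#include "magma.h"

// Editor's sketch: GPU-interface Cholesky with a padded leading dimension.
magma_int_t example_dpotrf_gpu( magma_int_t n, double *hA, magma_int_t lda )
{
    double *dA;
    magma_int_t info;
    magma_int_t ldda = ((n + 15)/16)*16;   // divisible by 16 for coalesced access
    if (MAGMA_SUCCESS != magma_dmalloc( &dA, (size_t)ldda*n ))
        return MAGMA_ERR_DEVICE_ALLOC;
    magma_dsetmatrix( n, n, hA, lda, dA, ldda );
    magma_dpotrf_gpu( 'L', n, dA, ldda, &info );   // panels factored on the CPU
    if (info == 0)
        magma_dgetmatrix( n, n, dA, ldda, hA, lda );
    magma_free( dA );
    return info;
}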
extern "C" magma_int_t magma_dsygvd(magma_int_t itype, char jobz, char uplo, magma_int_t n, double *a, magma_int_t lda, double *b, magma_int_t ldb, double *w, double *work, magma_int_t lwork, magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= DSYGVD computes all the eigenvalues, and optionally, the eigenvectors of a real generalized symmetric-definite eigenproblem, of the form A*x=(lambda)*B*x, A*Bx=(lambda)*x, or B*A*x=(lambda)*x. Here A and B are assumed to be symmetric and B is also positive definite. If eigenvectors are desired, it uses a divide and conquer algorithm. The divide and conquer algorithm makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments ========= ITYPE (input) INTEGER Specifies the problem type to be solved: = 1: A*x = (lambda)*B*x = 2: A*B*x = (lambda)*x = 3: B*A*x = (lambda)*x JOBZ (input) CHARACTER*1 = 'N': Compute eigenvalues only; = 'V': Compute eigenvalues and eigenvectors. UPLO (input) CHARACTER*1 = 'U': Upper triangles of A and B are stored; = 'L': Lower triangles of A and B are stored. N (input) INTEGER The order of the matrices A and B. N >= 0. A (input/output) COMPLEX*16 array, dimension (LDA, N) On entry, the symmetric matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. On exit, if JOBZ = 'V', then if INFO = 0, A contains the matrix Z of eigenvectors. The eigenvectors are normalized as follows: if ITYPE = 1 or 2, Z**T * B * Z = I; if ITYPE = 3, Z**T * inv(B) * Z = I. If JOBZ = 'N', then on exit the upper triangle (if UPLO='U') or the lower triangle (if UPLO='L') of A, including the diagonal, is destroyed. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). B (input/output) COMPLEX*16 array, dimension (LDB, N) On entry, the symmetric matrix B. If UPLO = 'U', the leading N-by-N upper triangular part of B contains the upper triangular part of the matrix B. If UPLO = 'L', the leading N-by-N lower triangular part of B contains the lower triangular part of the matrix B. On exit, if INFO <= N, the part of B containing the matrix is overwritten by the triangular factor U or L from the Cholesky factorization B = U**T * U or B = L * L**T. LDB (input) INTEGER The leading dimension of the array B. LDB >= max(1,N). W (output) DOUBLE PRECISION array, dimension (N) If INFO = 0, the eigenvalues in ascending order. WORK (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. WORK (workspace/output) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. LWORK (input) INTEGER The length of the array WORK. If N <= 1, LWORK >= 1. If JOBZ = 'N' and N > 1, LWORK >= 2*N + N*NB. If JOBZ = 'V' and N > 1, LWORK >= max( 2*N + N*NB, 1 + 6*N + 2*N**2 ). NB can be obtained through magma_get_dsytrd_nb(N). 
If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK and IWORK arrays, returns these values as the first entries of the WORK and IWORK arrays, and no error message related to LWORK or LIWORK is issued by XERBLA. IWORK (workspace/output) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK. LIWORK (input) INTEGER The dimension of the array IWORK. If N <= 1, LIWORK >= 1. If JOBZ = 'N' and N > 1, LIWORK >= 1. If JOBZ = 'V' and N > 1, LIWORK >= 3 + 5*N. If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK and IWORK arrays, returns these values as the first entries of the WORK and IWORK arrays, and no error message related to LWORK or LIWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: DPOTRF or DSYEVD returned an error code: <= N: if INFO = i and JOBZ = 'N', then the algorithm failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; if INFO = i and JOBZ = 'V', then the algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1); > N: if INFO = N + i, for 1 <= i <= N, then the leading minor of order i of B is not positive definite. The factorization of B could not be completed and no eigenvalues or eigenvectors were computed. Further Details =============== Based on contributions by Mark Fahey, Department of Mathematics, Univ. of Kentucky, USA Modified so that no backsubstitution is performed if DSYEVD fails to converge (NEIG in old code could be greater than N causing out of bounds reference to A - reported by Ralf Meyer). Also corrected the description of INFO and the test on ITYPE. Sven, 16 Feb 05. ===================================================================== */ char uplo_[2] = {uplo, 0}; char jobz_[2] = {jobz, 0}; double d_one = MAGMA_D_ONE; double *da; double *db; magma_int_t ldda = n; magma_int_t lddb = n; magma_int_t lower; char trans[1]; magma_int_t wantz, lquery; magma_int_t lwmin, liwmin; magma_queue_t stream; magma_queue_create( &stream ); wantz = lapackf77_lsame(jobz_, MagmaVecStr); lower = lapackf77_lsame(uplo_, MagmaLowerStr); lquery = lwork == -1 || liwork == -1; *info = 0; if (itype < 1 || itype > 3) { *info = -1; } else if (! (wantz || lapackf77_lsame(jobz_, MagmaNoVecStr))) { *info = -2; } else if (! (lower || lapackf77_lsame(uplo_, MagmaUpperStr))) { *info = -3; } else if (n < 0) { *info = -4; } else if (lda < max(1,n)) { *info = -6; } else if (ldb < max(1,n)) { *info = -8; } magma_int_t nb = magma_get_dsytrd_nb( n ); if ( n <= 1 ) { lwmin = 1; liwmin = 1; } else if ( wantz ) { lwmin = max( 2*n + n*nb, 1 + 6*n + 2*n*n ); liwmin = 3 + 5*n; } else { lwmin = 2*n + n*nb; liwmin = 1; } // multiply by 1+eps to ensure length gets rounded up, // if it cannot be exactly represented in floating point. work[0] = lwmin * (1. + lapackf77_dlamch("Epsilon")); iwork[0] = liwmin; if (lwork < lwmin && ! lquery) { *info = -11; } else if (liwork < liwmin && ! 
lquery) { *info = -13; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return MAGMA_ERR_ILLEGAL_VALUE; } else if (lquery) { return MAGMA_SUCCESS; } /* Quick return if possible */ if (n == 0) { return 0; } /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */ if (n <= 128) { #ifdef ENABLE_DEBUG printf("--------------------------------------------------------------\n"); printf(" warning matrix too small N=%d NB=%d, calling lapack on CPU \n", (int) n, (int) nb); printf("--------------------------------------------------------------\n"); #endif lapackf77_dsygvd(&itype, jobz_, uplo_, &n, a, &lda, b, &ldb, w, work, &lwork, iwork, &liwork, info); return *info; } if (MAGMA_SUCCESS != magma_dmalloc( &da, n*ldda ) || MAGMA_SUCCESS != magma_dmalloc( &db, n*lddb )) { magma_free( da ); magma_free( db ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* Form a Cholesky factorization of B. */ magma_dsetmatrix( n, n, b, ldb, db, lddb ); magma_dsetmatrix_async( n, n, a, lda, da, ldda, stream ); #ifdef ENABLE_TIMER magma_timestr_t start, end; start = get_current_time(); #endif magma_dpotrf_gpu(uplo, n, db, lddb, info); if (*info != 0) { *info = n + *info; return *info; } #ifdef ENABLE_TIMER end = get_current_time(); printf("time dpotrf_gpu = %6.2f\n", GetTimerValue(start,end)/1000.); #endif magma_queue_sync( stream ); magma_dgetmatrix_async( n, n, db, lddb, b, ldb, stream ); #ifdef ENABLE_TIMER start = get_current_time(); #endif /* Transform problem to standard eigenvalue problem and solve. */ magma_dsygst_gpu(itype, uplo, n, da, ldda, db, lddb, info); #ifdef ENABLE_TIMER end = get_current_time(); printf("time dsygst_gpu = %6.2f\n", GetTimerValue(start,end)/1000.); #endif /* simple fix to be able to run bigger size. * need to have a dwork here that will be used * as db and then passed to dsyevd. * */ if(n > 5000){ magma_queue_sync( stream ); magma_free( db ); } #ifdef ENABLE_TIMER start = get_current_time(); #endif magma_dsyevd_gpu(jobz, uplo, n, da, ldda, w, a, lda, work, lwork, iwork, liwork, info); #ifdef ENABLE_TIMER end = get_current_time(); printf("time dsyevd_gpu = %6.2f\n", GetTimerValue(start,end)/1000.); #endif if (wantz && *info == 0) { #ifdef ENABLE_TIMER start = get_current_time(); #endif /* allocate and copy db back */ if(n > 5000){ if (MAGMA_SUCCESS != magma_dmalloc( &db, n*lddb ) ){ *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_dsetmatrix( n, n, b, ldb, db, lddb ); } /* Backtransform eigenvectors to the original problem. */ if (itype == 1 || itype == 2) { /* For A*x=(lambda)*B*x and A*B*x=(lambda)*x; backtransform eigenvectors: x = inv(L)'*y or inv(U)*y */ if (lower) { *(unsigned char *)trans = MagmaTrans; } else { *(unsigned char *)trans = MagmaNoTrans; } magma_dtrsm(MagmaLeft, uplo, *trans, MagmaNonUnit, n, n, d_one, db, lddb, da, ldda); } else if (itype == 3) { /* For B*A*x=(lambda)*x; backtransform eigenvectors: x = L*y or U'*y */ if (lower) { *(unsigned char *)trans = MagmaNoTrans; } else { *(unsigned char *)trans = MagmaTrans; } magma_dtrmm(MagmaLeft, uplo, *trans, MagmaNonUnit, n, n, d_one, db, lddb, da, ldda); } magma_dgetmatrix( n, n, da, ldda, a, lda ); #ifdef ENABLE_TIMER end = get_current_time(); printf("time dtrsm/mm + getmatrix = %6.2f\n", GetTimerValue(start,end)/1000.); #endif /* free db */ if(n > 5000){ magma_free( db ); } } magma_queue_sync( stream ); magma_queue_destroy( stream ); work[0] = lwmin * (1. + lapackf77_dlamch("Epsilon")); // round up iwork[0] = liwmin; magma_free( da ); if(n <= 5000){ magma_free( db ); } return MAGMA_SUCCESS; } /* magma_dsygvd */
extern "C" magma_int_t magma_dpotrs_gpu( magma_uplo_t uplo, magma_int_t n, magma_int_t nrhs, magmaDouble_ptr dA, size_t dA_offset, magma_int_t ldda, magmaDouble_ptr dB, size_t dB_offset, magma_int_t lddb, magma_queue_t queue, magma_int_t *info ) { /* -- clMagma (version 0.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= DPOTRS solves a system of linear equations A*X = B with a symmetric positive definite matrix A using the Cholesky factorization A = U**H*U or A = L*L**H computed by DPOTRF. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. NRHS (input) INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. dA (input) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N) The triangular factor U or L from the Cholesky factorization A = U**H*U or A = L*L**H, as computed by DPOTRF. LDDA (input) INTEGER The leading dimension of the array A. LDDA >= max(1,N). dB (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDB,NRHS) On entry, the right hand side matrix B. On exit, the solution matrix X. LDDB (input) INTEGER The leading dimension of the array B. LDDB >= max(1,N). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ double z_one = MAGMA_D_MAKE( 1.0, 0.0 ); *info = 0 ; if( (uplo != MagmaUpper) && (uplo != MagmaLower) ) *info = -1; if( n < 0 ) *info = -2; if( nrhs < 0) *info = -3; if ( ldda < max(1, n) ) *info = -5; if ( lddb < max(1, n) ) *info = -7; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if ( (n == 0) || (nrhs == 0) ) { return *info; } if( uplo== MagmaUpper){ if ( nrhs == 1) { magma_dtrsv(MagmaUpper, MagmaConjTrans, MagmaNonUnit, n, dA, dA_offset, ldda, dB, dB_offset, 1, queue); magma_dtrsv(MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, dA, dA_offset, ldda, dB, dB_offset, 1, queue); } else { magma_dtrsm(MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, n, nrhs, z_one, dA, dA_offset, ldda, dB, dB_offset, lddb, queue); magma_dtrsm(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, nrhs, z_one, dA, dA_offset, ldda, dB, dB_offset, lddb, queue); } } else{ if ( nrhs == 1) { magma_dtrsv(MagmaLower, MagmaNoTrans, MagmaNonUnit, n, dA, dA_offset, ldda, dB, dB_offset, 1, queue ); magma_dtrsv(MagmaLower, MagmaConjTrans, MagmaNonUnit, n, dA, dA_offset, ldda, dB, dB_offset, 1, queue ); } else { magma_dtrsm(MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNonUnit, n, nrhs, z_one, dA, dA_offset, ldda, dB, dB_offset, lddb, queue); magma_dtrsm(MagmaLeft, MagmaLower, MagmaConjTrans, MagmaNonUnit, n, nrhs, z_one, dA, dA_offset, ldda, dB, dB_offset, lddb, queue); } } magma_queue_sync( queue ); return *info; }
extern "C" magma_int_t magma_dgetri_gpu( magma_int_t n, double *dA, magma_int_t lda, magma_int_t *ipiv, double *dwork, magma_int_t lwork, magma_int_t *info ) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= DGETRI computes the inverse of a matrix using the LU factorization computed by DGETRF. This method inverts U and then computes inv(A) by solving the system inv(A)*L = inv(U) for inv(A). Note that it is generally both faster and more accurate to use DGESV, or DGETRF and DGETRS, to solve the system AX = B, rather than inverting the matrix and multiplying to form X = inv(A)*B. Only in special instances should an explicit inverse be computed with this routine. Arguments ========= N (input) INTEGER The order of the matrix A. N >= 0. dA (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDA,N) On entry, the factors L and U from the factorization A = P*L*U as computed by DGETRF_GPU. On exit, if INFO = 0, the inverse of the original matrix A. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). IPIV (input) INTEGER array, dimension (N) The pivot indices from DGETRF; for 1<=i<=N, row i of the matrix was interchanged with row IPIV(i). DWORK (workspace/output) COMPLEX*16 array on the GPU, dimension (MAX(1,LWORK)) LWORK (input) INTEGER The dimension of the array DWORK. LWORK >= N*NB, where NB is the optimal blocksize returned by magma_get_dgetri_nb(n). Unlike LAPACK, this version does not currently support a workspace query, because the workspace is on the GPU. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, U(i,i) is exactly zero; the matrix is singular and its cannot be computed. ===================================================================== */ /* Local variables */ double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double *dL = dwork; magma_int_t ldl = n; magma_int_t nb = magma_get_dgetri_nb(n); magma_int_t j, jmax, jb, jp; *info = 0; if (n < 0) *info = -1; else if (lda < max(1,n)) *info = -3; else if ( lwork < n*nb ) *info = -6; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if ( n == 0 ) return *info; /* Invert the triangular factor U */ magma_dtrtri_gpu( MagmaUpper, MagmaNonUnit, n, dA, lda, info ); if ( *info != 0 ) return *info; jmax = ((n-1) / nb)*nb; for( j = jmax; j >= 0; j -= nb ) { jb = min( nb, n-j ); // copy current block column of L to work space, // then replace with zeros in A. magmablas_dlacpy( MagmaUpperLower, n-j, jb, &dA[j + j*lda], lda, &dL[j ], ldl ); magmablas_dlaset( MagmaLower, n-j, jb, &dA[j + j*lda], lda ); // compute current block column of Ainv // Ainv(:, j:j+jb-1) // = ( U(:, j:j+jb-1) - Ainv(:, j+jb:n) L(j+jb:n, j:j+jb-1) ) // * L(j:j+jb-1, j:j+jb-1)^{-1} // where L(:, j:j+jb-1) is stored in dL. if ( j+jb < n ) { magma_dgemm( MagmaNoTrans, MagmaNoTrans, n, jb, n-j-jb, c_neg_one, &dA[(j+jb)*lda], lda, &dL[ j+jb ], ldl, c_one, &dA[ j*lda], lda ); } magma_dtrsm( MagmaRight, MagmaLower, MagmaNoTrans, MagmaUnit, n, jb, c_one, &dL[j ], ldl, &dA[j*lda], lda ); } // Apply column interchanges for( j = n-2; j >= 0; --j ) { jp = ipiv[j] - 1; if ( jp != j ) { magmablas_dswap( n, &dA[ j*lda ], 1, &dA[ jp*lda ], 1 ); } } return *info; }
/** Purpose ------- DGETRF_m computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix may not fit entirely in the GPU memory. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Note: The factorization of each big panel is done by calling the multi-GPU interface. Pivots are applied on the GPU within the big panel. Arguments --------- @param[in] num_gpus INTEGER The number of GPUs. num_gpus > 0. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A DOUBLE_PRECISION array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] ipiv INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a failed memory allocation. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.
@ingroup magma_dgesv_comp ********************************************************************/ extern "C" magma_int_t magma_dgetrf_m(magma_int_t num_gpus, magma_int_t m, magma_int_t n, double *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info) { #define A(i,j) (A + (j)*lda + (i)) #define dAT(d,i,j) (dAT[d] + (i)*nb*ldn_local + (j)*nb) #define dPT(d,i,j) (dPT[d] + (i)*nb*nb + (j)*nb*maxm) magma_timer_t time=0, time_total=0, time_alloc=0, time_set=0, time_get=0, time_comp=0; timer_start( time_total ); real_Double_t flops; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double *dAT[MagmaMaxGPUs], *dA[MagmaMaxGPUs], *dPT[MagmaMaxGPUs]; magma_int_t iinfo = 0, nb, nbi, maxm, n_local[MagmaMaxGPUs], ldn_local; magma_int_t N, M, NB, NBk, I, d, num_gpus0 = num_gpus; magma_int_t ii, jj, h, offset, ib, rows, s; magma_queue_t stream[MagmaMaxGPUs][2]; magma_event_t event[MagmaMaxGPUs][2]; *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* initialize nb */ nb = magma_get_dgetrf_nb(m); maxm = ((m + 31)/32)*32; /* figure out NB */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(double); /* number of columns in the big panel */ h = 1+(2+num_gpus0); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); const char* ngr_nb_char = getenv("MAGMA_NGR_NB"); if ( ngr_nb_char != NULL ) NB = max( nb, min( NB, atoi(ngr_nb_char) ) ); //NB = 5*max(nb,32); if ( num_gpus0 > ceil((double)NB/nb) ) { num_gpus = (int)ceil((double)NB/nb); h = 1+(2+num_gpus); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); } else { num_gpus = num_gpus0; } if ( num_gpus*NB >= n ) { #ifdef CHECK_DGETRF_OOC printf( " * still fit in GPU memory.\n" ); #endif NB = n; } else { #ifdef CHECK_DGETRF_OOC printf( " * don't fit in GPU memory.\n" ); #endif NB = num_gpus*NB; NB = max( nb, (NB / nb) * nb); /* making sure it's divisible by nb (x64) */ } #ifdef CHECK_DGETRF_OOC if ( NB != n ) printf( " * running in out-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n", n, NB, nb, (double)freeMem ); else printf( " * running in in-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n", n, NB, nb, (double)freeMem ); #endif if ( (nb <= 1) || (nb >= min(m,n)) ) { /* Use CPU code for scalar of one tile. */ lapackf77_dgetrf(&m, &n, A, &lda, ipiv, info); } else { /* Use hybrid blocked code.
*/ /* allocate memory on GPU to store the big panel */ timer_start( time_alloc ); n_local[0] = (NB/nb)/num_gpus; if ( NB%(nb*num_gpus) != 0 ) n_local[0] ++; n_local[0] *= nb; ldn_local = ((n_local[0]+31)/32)*32; for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_dmalloc( &dA[d], (ldn_local+h*nb)*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dPT[d] = dA[d] + nb*maxm; /* for storing the previous panel from CPU */ dAT[d] = dA[d] + h*nb*maxm; /* for storing the big panel */ magma_queue_create( &stream[d][0] ); magma_queue_create( &stream[d][1] ); magma_event_create( &event[d][0] ); magma_event_create( &event[d][1] ); } //magma_setdevice(0); timer_stop( time_alloc ); for( I=0; I < n; I += NB ) { M = m; N = min( NB, n-I ); /* number of columns in this big panel */ s = min( max(m-I,0), N )/nb; /* number of small block-columns in this big panel */ maxm = ((M + 31)/32)*32; if ( num_gpus0 > ceil((double)N/nb) ) { num_gpus = (int)ceil((double)N/nb); } else { num_gpus = num_gpus0; } for( d=0; d < num_gpus; d++ ) { n_local[d] = ((N/nb)/num_gpus)*nb; if (d < (N/nb)%num_gpus) n_local[d] += nb; else if (d == (N/nb)%num_gpus) n_local[d] += N%nb; } ldn_local = ((n_local[0]+31)/32)*32; /* upload the next big panel into GPU, transpose (A->A'), and pivot it */ timer_start( time ); magmablas_dsetmatrix_transpose_mgpu(num_gpus, stream, A(0,I), lda, dAT, ldn_local, dA, maxm, M, N, nb); for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } time_set += timer_stop( time ); timer_start( time ); /* == --------------------------------------------------------------- == */ /* == loop around the previous big-panels to update the new big-panel == */ for( offset = 0; offset < min(m,I); offset += NB ) { NBk = min( m-offset, NB ); /* start sending the first tile from the previous big-panels to gpus */ for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); nbi = min( nb, NBk ); magma_dsetmatrix_async( (M-offset), nbi, A(offset,offset), lda, dA[d], (maxm-offset), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][0] ); /* transpose */ magmablas_dtranspose( M-offset, nbi, dA[d], maxm-offset, dPT(d,0,0), nb ); } /* applying the pivot from the previous big-panel */ for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][1]); magmablas_dpermute_long3( dAT(d,0,0), ldn_local, ipiv, NBk, offset ); } /* == going through each block-column of previous big-panels == */ for( jj=0, ib=offset/nb; jj < NBk; jj += nb, ib++ ) { ii = offset+jj; rows = maxm - ii; nbi = min( nb, NBk-jj ); for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); /* wait for a block-column on GPU */ magma_queue_sync( stream[d][0] ); /* start sending next column */ if ( jj+nb < NBk ) { magma_dsetmatrix_async( (M-ii-nb), min(nb,NBk-jj-nb), A(ii+nb,ii+nb), lda, dA[d], (rows-nb), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][(1+jj/nb)%2] ); /* transpose next column */ magmablas_dtranspose( M-ii-nb, nb, dA[d], rows-nb, dPT(d,0,(1+jj/nb)%2), nb ); } /* update with the block column */ magmablasSetKernelStream(stream[d][1]); magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n_local[d], nbi, c_one, dPT(d,0,(jj/nb)%2), nb, dAT(d,ib,0), 
ldn_local ); if ( M > ii+nb ) { magma_dgemm( MagmaNoTrans, MagmaNoTrans, n_local[d], M-(ii+nb), nbi, c_neg_one, dAT(d,ib,0), ldn_local, dPT(d,1,(jj/nb)%2), nb, c_one, dAT(d,ib+1,0), ldn_local ); } magma_event_record( event[d][(jj/nb)%2], stream[d][1] ); } /* end of for each block-columns in a big-panel */ } } /* end of for each previous big-panels */ for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } /* calling magma-gpu interface to panel-factorize the big panel */ if ( M > I ) { //magma_dgetrf1_mgpu(num_gpus, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, A(0,I), lda, // (magma_queue_t **)stream, &iinfo); magma_dgetrf2_mgpu(num_gpus, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, A(0,I), lda, stream, &iinfo); if ( iinfo < 0 ) { *info = iinfo; break; } else if ( iinfo != 0 ) { *info = iinfo + I * NB; //break; } /* adjust pivots */ for( ii=I; ii < min(I+N,m); ii++ ) ipiv[ii] += I; } time_comp += timer_stop( time ); /* download the current big panel to CPU */ timer_start( time ); magmablas_dgetmatrix_transpose_mgpu(num_gpus, stream, dAT, ldn_local, A(0,I), lda, dA, maxm, M, N, nb); for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } time_get += timer_stop( time ); } /* end of for */ timer_stop( time_total ); flops = FLOPS_DGETRF( m, n ) / 1e9; timer_printf(" memory-allocation time: %e\n", time_alloc ); timer_printf(" NB=%d nb=%d\n", (int) NB, (int) nb ); timer_printf(" memcopy and transpose %e seconds\n", time_set ); timer_printf(" total time %e seconds\n", time_total ); timer_printf(" Performance %f GFlop/s, %f seconds without htod and dtoh\n", flops / (time_comp), time_comp ); timer_printf(" Performance %f GFlop/s, %f seconds with htod\n", flops / (time_comp + time_set), time_comp + time_set ); timer_printf(" Performance %f GFlop/s, %f seconds with dtoh\n", flops / (time_comp + time_get), time_comp + time_get ); timer_printf(" Performance %f GFlop/s, %f seconds without memory-allocation\n", flops / (time_total - time_alloc), time_total - time_alloc ); for( d=0; d < num_gpus0; d++ ) { magma_setdevice(d); magma_free( dA[d] ); magma_event_destroy( event[d][0] ); magma_event_destroy( event[d][1] ); magma_queue_destroy( stream[d][0] ); magma_queue_destroy( stream[d][1] ); magmablasSetKernelStream(NULL); } magma_setdevice(0); } if ( *info >= 0 ) magma_dgetrf_piv(m, n, NB, A, lda, ipiv, info); return *info; } /* magma_dgetrf_m */
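/*
 * Tuning note (added): the width NB of the out-of-core "big panel" above is
 * derived from the free GPU memory reported by cudaMemGetInfo, but it can
 * be capped at runtime through the MAGMA_NGR_NB environment variable that
 * the routine reads with getenv, e.g.
 *
 *     export MAGMA_NGR_NB=4096
 *
 * before launching the application. As the header notes, keeping A in
 * pinned host memory (magma_malloc_pinned) speeds up the host<->device
 * transfers that dominate the out-of-core path.
 */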
/** Purpose ------- DSYGVD computes all the eigenvalues, and optionally, the eigenvectors of a real generalized symmetric-definite eigenproblem, of the form A*x=(lambda)*B*x, A*Bx=(lambda)*x, or B*A*x=(lambda)*x. Here A and B are assumed to be symmetric and B is also positive definite. If eigenvectors are desired, it uses a divide and conquer algorithm. The divide and conquer algorithm makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments --------- @param[in] itype INTEGER Specifies the problem type to be solved: = 1: A*x = (lambda)*B*x = 2: A*B*x = (lambda)*x = 3: B*A*x = (lambda)*x @param[in] jobz magma_vec_t - = MagmaNoVec: Compute eigenvalues only; - = MagmaVec: Compute eigenvalues and eigenvectors. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangles of A and B are stored; - = MagmaLower: Lower triangles of A and B are stored. @param[in] n INTEGER The order of the matrices A and B. N >= 0. @param[in,out] A DOUBLE PRECISION array, dimension (LDA, N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. \n On exit, if JOBZ = MagmaVec, then if INFO = 0, A contains the matrix Z of eigenvectors. The eigenvectors are normalized as follows: if ITYPE = 1 or 2, Z**T * B * Z = I; if ITYPE = 3, Z**T * inv(B) * Z = I. If JOBZ = MagmaNoVec, then on exit the upper triangle (if UPLO=MagmaUpper) or the lower triangle (if UPLO=MagmaLower) of A, including the diagonal, is destroyed. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[in,out] B DOUBLE PRECISION array, dimension (LDB, N) On entry, the symmetric matrix B. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of B contains the upper triangular part of the matrix B. If UPLO = MagmaLower, the leading N-by-N lower triangular part of B contains the lower triangular part of the matrix B. \n On exit, if INFO <= N, the part of B containing the matrix is overwritten by the triangular factor U or L from the Cholesky factorization B = U**T * U or B = L * L**T. @param[in] ldb INTEGER The leading dimension of the array B. LDB >= max(1,N). @param[out] w DOUBLE PRECISION array, dimension (N) If INFO = 0, the eigenvalues in ascending order. @param[out] work (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. If N <= 1, LWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LWORK >= 2*N + N*NB. If JOBZ = MagmaVec and N > 1, LWORK >= max( 2*N + N*NB, 1 + 6*N + 2*N**2 ). NB can be obtained through magma_get_dsytrd_nb(N). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK and IWORK arrays, returns these values as the first entries of the WORK and IWORK arrays, and no error message related to LWORK or LIWORK is issued by XERBLA. @param[out] iwork (workspace) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK. @param[in] liwork INTEGER The dimension of the array IWORK. 
If N <= 1, LIWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LIWORK >= 1. If JOBZ = MagmaVec and N > 1, LIWORK >= 3 + 5*N. \n If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK and IWORK arrays, returns these values as the first entries of the WORK and IWORK arrays, and no error message related to LWORK or LIWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: DPOTRF or DSYEVD returned an error code: <= N: if INFO = i and JOBZ = MagmaNoVec, then the algorithm failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; if INFO = i and JOBZ = MagmaVec, then the algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1); > N: if INFO = N + i, for 1 <= i <= N, then the leading minor of order i of B is not positive definite. The factorization of B could not be completed and no eigenvalues or eigenvectors were computed. Further Details --------------- Based on contributions by Mark Fahey, Department of Mathematics, Univ. of Kentucky, USA Modified so that no backsubstitution is performed if DSYEVD fails to converge (NEIG in old code could be greater than N causing out of bounds reference to A - reported by Ralf Meyer). Also corrected the description of INFO and the test on ITYPE. Sven, 16 Feb 05. @ingroup magma_dsygv_driver ********************************************************************/ extern "C" magma_int_t magma_dsygvd( magma_int_t itype, magma_vec_t jobz, magma_uplo_t uplo, magma_int_t n, double *A, magma_int_t lda, double *B, magma_int_t ldb, double *w, double *work, magma_int_t lwork, #ifdef COMPLEX double *rwork, magma_int_t lrwork, #endif magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { const char* uplo_ = lapack_uplo_const( uplo ); const char* jobz_ = lapack_vec_const( jobz ); double d_one = MAGMA_D_ONE; double *dA=NULL, *dB=NULL; magma_int_t ldda = n; magma_int_t lddb = n; magma_int_t lower; magma_trans_t trans; magma_int_t wantz, lquery; magma_int_t lwmin, liwmin; magma_queue_t stream; magma_queue_create( &stream ); wantz = (jobz == MagmaVec); lower = (uplo == MagmaLower); lquery = (lwork == -1 || liwork == -1); *info = 0; if (itype < 1 || itype > 3) { *info = -1; } else if (! (wantz || (jobz == MagmaNoVec))) { *info = -2; } else if (! (lower || (uplo == MagmaUpper))) { *info = -3; } else if (n < 0) { *info = -4; } else if (lda < max(1,n)) { *info = -6; } else if (ldb < max(1,n)) { *info = -8; } magma_int_t nb = magma_get_dsytrd_nb( n ); if ( n <= 1 ) { lwmin = 1; liwmin = 1; } else if ( wantz ) { lwmin = max( 2*n + n*nb, 1 + 6*n + 2*n*n ); liwmin = 3 + 5*n; } else { lwmin = 2*n + n*nb; liwmin = 1; } // multiply by 1+eps (in Double!) to ensure length gets rounded up, // if it cannot be exactly represented in floating point. real_Double_t one_eps = 1. + lapackf77_dlamch("Epsilon"); work[0] = lwmin * one_eps; iwork[0] = liwmin; if (lwork < lwmin && ! lquery) { *info = -11; } else if (liwork < liwmin && ! 
lquery) { *info = -13; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */ if (n <= 128) { #ifdef ENABLE_DEBUG printf("--------------------------------------------------------------\n"); printf(" warning matrix too small N=%d NB=%d, calling lapack on CPU \n", (int) n, (int) nb); printf("--------------------------------------------------------------\n"); #endif lapackf77_dsygvd(&itype, jobz_, uplo_, &n, A, &lda, B, &ldb, w, work, &lwork, iwork, &liwork, info); return *info; } if (MAGMA_SUCCESS != magma_dmalloc( &dA, n*ldda ) || MAGMA_SUCCESS != magma_dmalloc( &dB, n*lddb )) { magma_free( dA ); magma_free( dB ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* Form a Cholesky factorization of B. */ magma_dsetmatrix( n, n, B, ldb, dB, lddb ); magma_dsetmatrix_async( n, n, A, lda, dA, ldda, stream ); magma_timer_t time=0; timer_start( time ); magma_dpotrf_gpu(uplo, n, dB, lddb, info); if (*info != 0) { *info = n + *info; return *info; } timer_stop( time ); timer_printf( "time dpotrf_gpu = %6.2f\n", time ); magma_queue_sync( stream ); magma_dgetmatrix_async( n, n, dB, lddb, B, ldb, stream ); timer_start( time ); /* Transform problem to standard eigenvalue problem and solve. */ magma_dsygst_gpu(itype, uplo, n, dA, ldda, dB, lddb, info); timer_stop( time ); timer_printf( "time dsygst_gpu = %6.2f\n", time ); /* simple fix to be able to run bigger size. * need to have a dwork here that will be used * as dB and then passed to dsyevd. * */ if (n > 5000) { magma_queue_sync( stream ); magma_free( dB ); } timer_start( time ); magma_dsyevd_gpu(jobz, uplo, n, dA, ldda, w, A, lda, work, lwork, iwork, liwork, info); timer_stop( time ); timer_printf( "time dsyevd_gpu = %6.2f\n", time ); if (wantz && *info == 0) { timer_start( time ); /* allocate and copy dB back */ if (n > 5000) { if (MAGMA_SUCCESS != magma_dmalloc( &dB, n*lddb ) ) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_dsetmatrix( n, n, B, ldb, dB, lddb ); } /* Backtransform eigenvectors to the original problem. */ if (itype == 1 || itype == 2) { /* For A*x=(lambda)*B*x and A*B*x=(lambda)*x; backtransform eigenvectors: x = inv(L)'*y or inv(U)*y */ if (lower) { trans = MagmaTrans; } else { trans = MagmaNoTrans; } magma_dtrsm(MagmaLeft, uplo, trans, MagmaNonUnit, n, n, d_one, dB, lddb, dA, ldda); } else if (itype == 3) { /* For B*A*x=(lambda)*x; backtransform eigenvectors: x = L*y or U'*y */ if (lower) { trans = MagmaNoTrans; } else { trans = MagmaTrans; } magma_dtrmm(MagmaLeft, uplo, trans, MagmaNonUnit, n, n, d_one, dB, lddb, dA, ldda); } magma_dgetmatrix( n, n, dA, ldda, A, lda ); /* free dB */ if (n > 5000) { magma_free( dB ); } timer_stop( time ); timer_printf( "time dtrsm/mm + getmatrix = %6.2f\n", time ); } magma_queue_sync( stream ); magma_queue_destroy( stream ); work[0] = lwmin * one_eps; // round up iwork[0] = liwmin; magma_free( dA ); if (n <= 5000) { magma_free( dB ); } return *info; } /* magma_dsygvd */
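/*
 * Usage sketch (added; illustrative only): the standard two-phase workspace
 * query for magma_dsygvd. A first call with lwork = liwork = -1 only writes
 * the optimal sizes into work[0] and iwork[0]; the buffers are then
 * allocated and the solver is called again. A, B, w, n, lda, ldb are
 * placeholders.
 *
 *     double      query_work;
 *     magma_int_t query_iwork, info;
 *     magma_dsygvd( 1, MagmaVec, MagmaLower, n, A, lda, B, ldb, w,
 *                   &query_work, -1, &query_iwork, -1, &info );
 *     magma_int_t  lwork  = (magma_int_t) query_work;
 *     magma_int_t  liwork = query_iwork;
 *     double      *work   = (double*)      malloc( lwork  * sizeof(double) );
 *     magma_int_t *iwork  = (magma_int_t*) malloc( liwork * sizeof(magma_int_t) );
 *     magma_dsygvd( 1, MagmaVec, MagmaLower, n, A, lda, B, ldb, w,
 *                   work, lwork, iwork, liwork, &info );
 */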
/** Purpose ------- DSYTRF_nopiv computes the LDLt factorization of a real symmetric matrix A. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The factorization has the form A = U^H * D * U, if UPLO = 'U', or A = L * D * L^H, if UPLO = 'L', where U is an upper triangular matrix, L is lower triangular, and D is a diagonal matrix. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] UPLO CHARACTER*1 - = 'U': Upper triangle of A is stored; - = 'L': Lower triangle of A is stored. @param[in] N INTEGER The order of the matrix A. N >= 0. @param[in,out] A DOUBLE_PRECISION array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, if INFO = 0, the factor U or L from the LDLt factorization A = U^H D U or A = L D L^H. \n Higher performance is achieved if A is in pinned memory. @param[in] LDA INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] INFO INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value; if INFO = -6, the GPU memory allocation failed - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_dsysv_comp ******************************************************************* */ extern "C" magma_int_t magma_dsytrf_nopiv( magma_uplo_t uplo, magma_int_t n, double *A, magma_int_t lda, magma_int_t *info) { #define A(i, j) ( A +(j)*lda + (i)) #define dA(i, j) (dA +(j)*ldda + (i)) #define dW(i, j) (dW +(j)*ldda + (i)) #define dWt(i, j) (dW +(j)*nb + (i)) double zone = MAGMA_D_ONE; double mzone = MAGMA_D_NEG_ONE; int upper = (uplo == MagmaUpper); magma_int_t j, k, jb, ldda, nb, ib, iinfo; magmaDouble_ptr dA; magmaDouble_ptr dW; *info = 0; if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return MAGMA_ERR_ILLEGAL_VALUE; } /* Quick return */ if ( n == 0 ) return MAGMA_SUCCESS; ldda = ((n+31)/32)*32; nb = magma_get_dsytrf_nopiv_nb(n); ib = min(32, nb); // inner-block for diagonal factorization if ((MAGMA_SUCCESS != magma_dmalloc(&dA, n *ldda)) || (MAGMA_SUCCESS != magma_dmalloc(&dW, nb*ldda))) { /* alloc failed */ *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_t stream[2]; magma_event_t event; magma_queue_create(&stream[0]); magma_queue_create(&stream[1]); magma_event_create( &event ); trace_init( 1, 1, 2, stream ); /* Use hybrid blocked code. */ if (upper) { //========================================================= // Compute the LDLt factorization A = U'*D*U without pivoting.
// copy matrix to GPU for (j=0; j<n; j+=nb) { jb = min(nb, (n-j)); trace_gpu_start( 0, 0, "set", "set" ); magma_dsetmatrix_async(j+jb, jb, A(0, j), lda, dA(0, j), ldda, stream[0]); trace_gpu_end( 0, 0 ); } // main loop for (j=0; j<n; j += nb) { jb = min(nb, (n-j)); // copy A(j,j) back to CPU trace_gpu_start( 0, 0, "get", "get" ); if ( j!=0) { //magma_event_sync(event); magma_dgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, stream[1]); } trace_gpu_end( 0, 0 ); // factorize the diagonal block magma_queue_sync(stream[1]); trace_cpu_start( 0, "potrf", "potrf" ); dsytrf_nopiv_cpu(MagmaUpper, jb, ib, A(j, j), lda, info); trace_cpu_end( 0 ); if (*info != 0){ *info = *info + j; break; } // copy A(j,j) back to GPU trace_gpu_start( 0, 0, "set", "set" ); magma_dsetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, stream[0]); trace_gpu_end( 0, 0 ); // copy j-th column of U back to CPU trace_gpu_start( 0, 1, "get", "get" ); magma_dgetmatrix_async(j, jb, dA(0, j), ldda, A(0, j), lda, stream[1]); trace_gpu_end( 0, 1 ); if ( (j+jb) < n) { // compute the off-diagonal blocks of current block column magmablasSetKernelStream( stream[0] ); trace_gpu_start( 0, 0, "trsm", "trsm" ); magma_dtrsm(MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaUnit, jb, (n-j-jb), zone, dA(j, j), ldda, dA(j, j+jb), ldda); magma_dcopymatrix( jb, n-j-jb, dA( j, j+jb ), ldda, dWt( 0, j+jb ), nb ); // update the trailing submatrix with D magmablas_dlascl_diag(MagmaUpper, jb, n-j-jb, dA(j, j), ldda, dA(j, j+jb), ldda, &iinfo); trace_gpu_end( 0, 0 ); // update the trailing submatrix with U and W trace_gpu_start( 0, 0, "gemm", "gemm" ); for (k=j+jb; k<n; k+=nb) { magma_int_t kb = min(nb,n-k); magma_dgemm(MagmaConjTrans, MagmaNoTrans, kb, n-k, jb, mzone, dWt(0, k), nb, dA(j, k), ldda, zone, dA(k, k), ldda); if (k==j+jb) { // magma_event_record( event, stream[0] ); magma_queue_sync( stream[0] ); } } trace_gpu_end( 0, 0 ); } } } else { //========================================================= // Compute the LDLt factorization A = L*D*L' without pivoting. 
// copy the matrix to GPU for (j=0; j<n; j+=nb) { jb = min(nb, (n-j)); trace_gpu_start( 0, 0, "set", "set" ); magma_dsetmatrix_async((n-j), jb, A(j, j), lda, dA(j, j), ldda, stream[0]); trace_gpu_end( 0, 0 ); } // main loop for (j=0; j<n; j+=nb) { jb = min(nb, (n-j)); // copy A(j,j) back to CPU trace_gpu_start( 0, 0, "get", "get" ); if (j!=0) { //magma_event_sync(event); magma_dgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, stream[1]); } trace_gpu_end( 0, 0 ); // factorize the diagonal block magma_queue_sync(stream[1]); trace_cpu_start( 0, "potrf", "potrf" ); dsytrf_nopiv_cpu(MagmaLower, jb, ib, A(j, j), lda, info); trace_cpu_end( 0 ); if (*info != 0){ *info = *info + j; break; } // copy A(j,j) back to GPU trace_gpu_start( 0, 0, "set", "set" ); magma_dsetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, stream[0]); trace_gpu_end( 0, 0 ); // copy j-th row of L back to CPU trace_gpu_start( 0, 1, "get", "get" ); magma_dgetmatrix_async(jb, j, dA(j, 0), ldda, A(j, 0), lda, stream[1]); trace_gpu_end( 0, 1 ); if ( (j+jb) < n) { // compute the off-diagonal blocks of current block column magmablasSetKernelStream( stream[0] ); trace_gpu_start( 0, 0, "trsm", "trsm" ); magma_dtrsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaUnit, (n-j-jb), jb, zone, dA(j, j), ldda, dA(j+jb, j), ldda); magma_dcopymatrix( n-j-jb,jb, dA( j+jb, j ), ldda, dW( j+jb, 0 ), ldda ); // update the trailing submatrix with D magmablas_dlascl_diag(MagmaLower, n-j-jb, jb, dA(j, j), ldda, dA(j+jb, j), ldda, &iinfo); trace_gpu_end( 0, 0 ); // update the trailing submatrix with L and W trace_gpu_start( 0, 0, "gemm", "gemm" ); for (k=j+jb; k<n; k+=nb) { magma_int_t kb = min(nb,n-k); magma_dgemm(MagmaNoTrans, MagmaConjTrans, n-k, kb, jb, mzone, dA(k, j), ldda, dW(k, 0), ldda, zone, dA(k, k), ldda); if (k==j+jb) { //magma_event_record( event, stream[0] ); magma_queue_sync(stream[0]); } } trace_gpu_end( 0, 0 ); } } } trace_finalize( "dsytrf.svg","trace.css" ); magma_queue_destroy(stream[0]); magma_queue_destroy(stream[1]); magma_event_destroy( event ); magma_free(dW); magma_free(dA); return MAGMA_SUCCESS; } /* magma_dsytrf_nopiv */
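/*
 * Note (added for clarity): in the lower-triangular branch above, each block
 * column is processed in three steps. First, dtrsm solves W * L11^H = A21 in
 * place, so the panel then holds W = L21*D11. Second, W is saved to dW and
 * magmablas_dlascl_diag divides the in-place copy by the diagonal block D11,
 * leaving L21 in dA. Third, the trailing submatrix is updated one block
 * column at a time as A22 <- A22 - L21 * W^H = A22 - L21*D11*L21^H, which is
 * the blocked form of A = L*D*L^H. The upper-triangular branch is the mirror
 * image, staging W^H in dWt.
 */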
/** Purpose ------- DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = MagmaUpper, or dA = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. If the current stream is NULL, this version replaces it with a new stream to overlap computation with communication. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of dA is stored; - = MagmaLower: Lower triangle of dA is stored. @param[in] n INTEGER The order of the matrix dA. N >= 0. @param[in,out] dA DOUBLE_PRECISION array on the GPU, dimension (LDDA,N) On entry, the symmetric matrix dA. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_dposv_comp ********************************************************************/ extern "C" magma_int_t magma_dpotrf_gpu(magma_uplo_t uplo, magma_int_t n, double *dA, magma_int_t ldda, magma_int_t *info) { #define dA(i, j) (dA + (j)*ldda + (i)) magma_int_t j, jb, nb; const char* uplo_ = lapack_uplo_const( uplo ); double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double *work; double d_one = 1.0; double d_neg_one = -1.0; int upper = (uplo == MagmaUpper); *info = 0; if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } nb = magma_get_dpotrf_nb(n); if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, nb*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* Define user stream if current stream is NULL */ magma_queue_t stream[2]; magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); magma_queue_create( &stream[0] ); if (orig_stream == NULL) { magma_queue_create( &stream[1] ); magmablasSetKernelStream(stream[1]); } else { stream[1] = orig_stream; } if ((nb <= 1) || (nb >= n)) { /* Use unblocked code. */ magma_dgetmatrix_async( n, n, dA, ldda, work, n, stream[1] ); magma_queue_sync( stream[1] ); lapackf77_dpotrf(uplo_, &n, work, &n, info); magma_dsetmatrix_async( n, n, work, n, dA, ldda, stream[1] ); } else { /* Use blocked code. */ if (upper) { /* Compute the Cholesky factorization A = U'*U. */ for (j=0; j < n; j += nb) { /* Update and factorize the current diagonal block and test for non-positive-definiteness. Computing MIN */ jb = min(nb, (n-j)); magma_dsyrk(MagmaUpper, MagmaConjTrans, jb, j, d_neg_one, dA(0, j), ldda, d_one, dA(j, j), ldda); magma_queue_sync( stream[1] ); magma_dgetmatrix_async( jb, jb, dA(j, j), ldda, work, jb, stream[0] ); if ( (j+jb) < n) { /* Compute the current block row. 
*/ magma_dgemm(MagmaConjTrans, MagmaNoTrans, jb, (n-j-jb), j, c_neg_one, dA(0, j ), ldda, dA(0, j+jb), ldda, c_one, dA(j, j+jb), ldda); } magma_queue_sync( stream[0] ); lapackf77_dpotrf(MagmaUpperStr, &jb, work, &jb, info); magma_dsetmatrix_async( jb, jb, work, jb, dA(j, j), ldda, stream[1] ); if (*info != 0) { *info = *info + j; break; } if ( (j+jb) < n) { magma_dtrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, (n-j-jb), c_one, dA(j, j ), ldda, dA(j, j+jb), ldda); } } } else { //========================================================= // Compute the Cholesky factorization A = L*L'. for (j=0; j < n; j += nb) { // Update and factorize the current diagonal block and test // for non-positive-definiteness. Computing MIN jb = min(nb, (n-j)); magma_dsyrk(MagmaLower, MagmaNoTrans, jb, j, d_neg_one, dA(j, 0), ldda, d_one, dA(j, j), ldda); magma_queue_sync( stream[1] ); magma_dgetmatrix_async( jb, jb, dA(j, j), ldda, work, jb, stream[0] ); if ( (j+jb) < n) { magma_dgemm( MagmaNoTrans, MagmaConjTrans, (n-j-jb), jb, j, c_neg_one, dA(j+jb, 0), ldda, dA(j, 0), ldda, c_one, dA(j+jb, j), ldda); } magma_queue_sync( stream[0] ); lapackf77_dpotrf(MagmaLowerStr, &jb, work, &jb, info); magma_dsetmatrix_async( jb, jb, work, jb, dA(j, j), ldda, stream[1] ); if (*info != 0) { *info = *info + j; break; } if ( (j+jb) < n) { magma_dtrsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, (n-j-jb), jb, c_one, dA(j, j), ldda, dA(j+jb, j), ldda); } } } } magma_free_pinned( work ); magma_queue_destroy( stream[0] ); if (orig_stream == NULL) { magma_queue_destroy( stream[1] ); } magmablasSetKernelStream( orig_stream ); return *info; } /* magma_dpotrf_gpu */
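/*
 * Note (added for clarity): the two queues created above implement a simple
 * look-ahead. The dsyrk/dgemm updates of the trailing matrix run on
 * stream[1] (the compute stream), while stream[0] carries the jb-by-jb
 * diagonal block to the host. After syncing only stream[0], the CPU factors
 * that small block with lapackf77_dpotrf while the GPU is still updating the
 * current block row, and the factored block is returned asynchronously on
 * stream[1].
 */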
extern "C" magma_err_t magma_dgetrf2_mgpu(magma_int_t num_gpus, magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t offset, magmaDouble_ptr *d_lAT, size_t dlAT_offset, magma_int_t lddat, magma_int_t *ipiv, magmaDouble_ptr *d_lAP, size_t dlAP_offset, double *w, magma_int_t ldw, magma_int_t *info, magma_queue_t *queues) { /* -- clMAGMA (version 1.1.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date January 2014 Purpose ======= DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Use two buffer to send panels.. Arguments ========= NUM_GPUS (input) INTEGER The number of GPUS to be used for the factorization. M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. LDDA (input) INTEGER The leading dimension of the array A. LDDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value if INFO = -7, internal GPU memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ #define inAT(id,i,j) d_lAT[(id)], (((offset)+(i)*nb)*lddat + (j)*nb) #define inAT_offset(i, j) (((offset)+(i)*nb)*lddat + (j)*nb) #define W(j) (w+((j)%num_gpus)*nb*ldw) double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t block_size = 32; magma_int_t iinfo, n_local[4]; magma_int_t maxm, mindim; magma_int_t i, ii, d, dd, rows, cols, s, ldpan[4]; magma_int_t id, i_local, i_local2, nb0, nb1; magmaDouble_ptr d_panel[4], panel_local[4]; size_t d_panel_offset[4]; size_t panel_local_offset[4]; //cudaStream_t streaml[4][2]; /* Check arguments */ *info = 0; if (m < 0) *info = -2; else if (n < 0) *info = -3; else if (num_gpus*lddat < max(1,n)) *info = -5; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); //nb = magma_get_dgetrf_nb(m); if( num_gpus > ceil((double)n/nb) ) { *info = -1; return *info; } else{ printf("dgetrf2_mgpu num_gpu: %d\n", num_gpus); /* Use hybrid blocked code. 
*/ maxm = ((m + block_size-1)/block_size)*block_size; /* some initializations */ for(i=0; i<num_gpus; i++){ n_local[i] = ((n/nb)/num_gpus)*nb; if (i < (n/nb)%num_gpus) n_local[i] += nb; else if (i == (n/nb)%num_gpus) n_local[i] += n%nb; /* workspaces */ //d_panel[i] = &(d_lAP[i][nb*maxm]); /* temporary panel storage */ d_panel[i] = d_lAP[i]; d_panel_offset[i] = nb*maxm; } /* start sending the panel to cpu */ nb0 = min(mindim, nb); if( nb0 == nb ) magma_dtranspose( d_lAP[0], 0, maxm, inAT(0,0,0), lddat, nb0, maxm, queues[2*0+1] ); else magma_dtranspose2( d_lAP[0], 0, maxm, inAT(0,0,0), lddat, nb0, maxm, queues[2*0+1] ); magma_dgetmatrix_async( m, nb0, d_lAP[0], 0, maxm, W(0), 0, ldw, queues[2*0+1], NULL ); clFlush(queues[2*0+1]); /* ------------------------------------------------------------------------------------- */ s = mindim / nb; for( i=0; i<s; i++ ) { /* Set the GPU number that holds the current panel */ id = i%num_gpus; /* Set the local index where the current panel is */ i_local = i/num_gpus; // cols for gpu panel cols = maxm - i*nb; // rows for cpu panel rows = m - i*nb; /* synchronize i-th panel from id-th gpu into work */ magma_queue_sync( queues[2*id+1] ); /* i-th panel factorization */ lapackf77_dgetrf( &rows, &nb, W(i), &ldw, ipiv+i*nb, &iinfo); if ( (*info == 0) && (iinfo > 0) ) { *info = iinfo + i*nb; //break; } /* start sending the panel to all the gpus */ d = (i+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { magma_dsetmatrix_async( rows, nb, W(i), 0, ldw, d_lAP[d], dlAP_offset, cols, queues[2*d+1], NULL ); d = (d+1)%num_gpus; } /* apply the pivoting */ d = (i+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { if(dd==0){ // row offset will be added to ipiv in long2 magma_dpermute_long2( lddat, inAT(d,0,0), lddat, ipiv, nb, i*nb, queues[2*d] ); }else{ // ipiv is already added by row offset, calling long3 //magma_dpermute_long2( lddat, inAT(d,0,0), lddat, ipiv, nb, i*nb, queues[2*d] ); magma_dpermute_long3( lddat, inAT(d,0,0), lddat, ipiv, nb, i*nb, queues[2*d] ); } d = (d+1)%num_gpus; } /* update the trailing-matrix/look-ahead */ d = (i+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { /* storage for panel */ if( d == id ) { /* the panel belongs to this gpu */ //panel_local[d] = inAT(d,i,i_local); panel_local[d] = d_lAT[d]; panel_local_offset[d] = inAT_offset(i, i_local); ldpan[d] = lddat; /* next column */ i_local2 = i_local+1; } else { /* the panel belongs to another gpu */ //panel_local[d] = &d_panel[d][(i%2)*nb*maxm]; panel_local[d] = d_panel[d]; panel_local_offset[d] = d_panel_offset[d] + (i%2)*nb*maxm; //panel_local[d] = d_panel[d]; ldpan[d] = nb; /* next column */ i_local2 = i_local; if( d < id ) i_local2 ++; } /* the size of the next column */ if ( s > (i+1) ) { nb0 = nb; } else { nb0 = n_local[d]-nb*(s/num_gpus); if( d < s%num_gpus ) nb0 -= nb; } if( d == (i+1)%num_gpus) { /* owns the next column, look-ahead the column */ nb1 = nb0; /* make sure all the pivoting has been applied */ //magma_queue_sync(queues[2*d]); } else { /* update the entire trailing matrix */ nb1 = n_local[d] - i_local2*nb; /* synchronization to make sure panel arrived on gpu */ //magma_queue_sync(queues[2*d+1]); } /* magma_queue_sync(queues[2*d]); magma_queue_sync(queues[2*d+1]); */ //magma_dtranspose(panel_local[d], panel_local_offset[d], ldpan[d], d_lAP[d], 0, cols, cols, nb, queues[2*d]); /* gpu updating the trailing matrix */ if(d == (i+1)%num_gpus){ magma_queue_sync(queues[2*d]); magma_dtranspose(panel_local[d], panel_local_offset[d], ldpan[d], d_lAP[d], 0, cols, cols, nb, queues[2*d+1]);
magma_queue_sync(queues[2*d+1]); magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb1, nb, c_one, panel_local[d], panel_local_offset[d], ldpan[d], inAT(d, i, i_local2), lddat, queues[2*d+1]); magma_dgemm( MagmaNoTrans, MagmaNoTrans, nb1, m-(i+1)*nb, nb, c_neg_one, inAT(d, i, i_local2), lddat, panel_local[d], panel_local_offset[d]+nb*ldpan[d], ldpan[d], c_one, inAT(d, i+1, i_local2), lddat, queues[2*d+1]); }else{ magma_queue_sync(queues[2*d+1]); magma_dtranspose(panel_local[d], panel_local_offset[d], ldpan[d], d_lAP[d], 0, cols, cols, nb, queues[2*d]); magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb1, nb, c_one, panel_local[d], panel_local_offset[d], ldpan[d], inAT(d, i, i_local2), lddat, queues[2*d]); magma_dgemm( MagmaNoTrans, MagmaNoTrans, nb1, m-(i+1)*nb, nb, c_neg_one, inAT(d, i, i_local2), lddat, panel_local[d], panel_local_offset[d]+nb*ldpan[d], ldpan[d], c_one, inAT(d, i+1, i_local2), lddat, queues[2*d]); } if( d == (i+1)%num_gpus ) { /* Set the local index where the current panel is */ int loff = i+1; int i_local = (i+1)/num_gpus; int ldda = maxm - (i+1)*nb; int cols = m - (i+1)*nb; nb0 = min(nb, mindim - (i+1)*nb); /* size of the diagonal block */ if( nb0 > 0 ) { /* transpose the panel for sending it to cpu */ if( i+1 < s ) magma_dtranspose( d_lAP[d], 0, ldda, inAT(d,loff,i_local), lddat, nb0, ldda, queues[2*d+1] ); else magma_dtranspose2( d_lAP[d], 0, ldda, inAT(d,loff,i_local), lddat, nb0, ldda, queues[2*d+1] ); //clFinish(queues[2*d+1]); /* send the panel to cpu */ magma_dgetmatrix_async( cols, nb0, d_lAP[d], 0, ldda, W(i+1), 0, ldw, queues[2*d+1], NULL ); } } else { //trace_gpu_end( d, 0 ); } d = (d+1)%num_gpus; } /* update the remaining matrix by gpu owning the next panel */ if( (i+1) < s ) { int i_local = (i+1)/num_gpus; int rows = m - (i+1)*nb; d = (i+1)%num_gpus; magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n_local[d] - (i_local+1)*nb, nb, c_one, panel_local[d], panel_local_offset[d], ldpan[d], inAT(d,i,i_local+1), lddat, queues[2*d] ); magma_dgemm( MagmaNoTrans, MagmaNoTrans, n_local[d]-(i_local+1)*nb, rows, nb, c_neg_one, inAT(d,i,i_local+1), lddat, panel_local[d], panel_local_offset[d]+nb*ldpan[d], ldpan[d], c_one, inAT(d,i+1, i_local+1), lddat, queues[2*d] ); } } /* end of for i=1..s */ /* ------------------------------------------------------------------------------ */ /* Set the GPU number that holds the last panel */ id = s%num_gpus; /* Set the local index where the last panel is */ i_local = s/num_gpus; /* size of the last diagonal-block */ nb0 = min(m - s*nb, n - s*nb); rows = m - s*nb; cols = maxm - s*nb; if( nb0 > 0 ) { /* wait for the last panel on cpu */ magma_queue_sync( queues[2*id+1]); /* factor on cpu */ lapackf77_dgetrf( &rows, &nb0, W(s), &ldw, ipiv+s*nb, &iinfo); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + s*nb; /* send the factor to gpus */ for( d=0; d<num_gpus; d++ ) { i_local2 = i_local; if( d < id ) i_local2 ++; if( d == id || n_local[d] > i_local2*nb ) { magma_dsetmatrix_async( rows, nb0, W(s), 0, ldw, d_lAP[d], 0, cols, queues[2*d+1], NULL ); } } for( d=0; d<num_gpus; d++ ) { if(d==0){ magma_dpermute_long2( lddat, inAT(d,0,0), lddat, ipiv, nb0, s*nb, queues[2*d] ); }else{ //magma_dpermute_long2( lddat, inAT(d,0,0), lddat, ipiv, nb0, s*nb, queues[2*d] ); magma_dpermute_long3( lddat, inAT(d,0,0), lddat, ipiv, nb0, s*nb, queues[2*d] ); } } for( d=0; d<num_gpus; d++ ) { //magma_queue_sync( queues[2*d+1] ); /* wait for the pivoting to be done */ magma_queue_sync( queues[2*d] ); i_local2 = i_local; 
if( d < id ) i_local2++; if( d == id ) { /* the panel belond to this gpu */ //panel_local[d] = inAT(d,s,i_local); panel_local[d] = d_lAT[d]; panel_local_offset[d] = inAT_offset(s, i_local); /* next column */ nb1 = n_local[d] - i_local*nb-nb0; magma_dtranspose2( panel_local[d], panel_local_offset[d], lddat, d_lAP[d], 0, cols, rows, nb0, queues[2*d+1]); if( nb1 > 0 ){ magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb1, nb0, c_one, panel_local[d], panel_local_offset[d], lddat, d_lAT[d], inAT_offset(s,i_local)+nb0, lddat, queues[2*d+1]); } } else if( n_local[d] > i_local2*nb ) { /* the panel belong to another gpu */ //panel_local[d] = &d_panel[d][(s%2)*nb*maxm]; panel_local[d] = d_panel[d]; panel_local_offset[d] = d_panel_offset[d] + (s%2)*nb*maxm; //panel_local[d] = d_panel[d]; /* next column */ nb1 = n_local[d] - i_local2*nb; magma_dtranspose2( panel_local[d], panel_local_offset[d], nb, d_lAP[d], 0, cols, rows, nb0, queues[2*d+1]); //cublasDtrsm magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb1, nb0, c_one, panel_local[d], panel_local_offset[d], nb, inAT(d,s,i_local2), lddat, queues[2*d+1]); } } } /* if( nb0 > 0 ) */ /* clean up */ for( d=0; d<num_gpus; d++ ) { magma_queue_sync( queues[2*d] ); magma_queue_sync( queues[2*d+1] ); //magma_queue_destroy(streaml[d][0]); //magma_queue_destroy(streaml[d][1]); } } return *info; /* End of MAGMA_DGETRF2_MGPU */ }
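The n_local computation at the top of this routine is the heart of the 1-D block-cyclic layout: each GPU owns every num_gpus-th block column of width nb, and the trailing (possibly partial) block goes to the GPU holding block n/nb. A standalone sketch of the same arithmetic (plain C; the helper name local_width is illustrative, not a MAGMA routine):

    #include <stdio.h>

    /* width of the column slice owned by device d; widths sum to n */
    static int local_width(int n, int nb, int num_gpus, int d)
    {
        int n_local = ((n / nb) / num_gpus) * nb;   /* full rounds of blocks */
        if (d < (n / nb) % num_gpus)
            n_local += nb;                          /* one extra full block */
        else if (d == (n / nb) % num_gpus)
            n_local += n % nb;                      /* trailing partial block */
        return n_local;
    }

    int main(void)
    {
        int n = 1000, nb = 128, num_gpus = 3;
        for (int d = 0; d < num_gpus; d++)
            printf("GPU %d owns %d columns\n", d, local_width(n, nb, num_gpus, d));
        return 0;   /* prints 384, 360, 256 -- summing to n */
    }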
extern "C" magma_int_t magma_dlobpcg( magma_d_sparse_matrix A, magma_d_solver_par *solver_par ){ #define residualNorms(i,iter) ( residualNorms + (i) + (iter)*n ) #define magmablas_swap(x, y) { pointer = x; x = y; y = pointer; } #define hresidualNorms(i,iter) (hresidualNorms + (i) + (iter)*n ) #define gramA( m, n) (gramA + (m) + (n)*ldgram) #define gramB( m, n) (gramB + (m) + (n)*ldgram) #define gevectors(m, n) (gevectors + (m) + (n)*ldgram) #define h_gramB( m, n) (h_gramB + (m) + (n)*ldgram) #define magma_d_bspmv_tuned(m, n, alpha, A, X, beta, AX) { \ magmablas_dtranspose( m, n, X, m, blockW, n ); \ magma_d_vector x, ax; \ x.memory_location = Magma_DEV; x.num_rows = m*n; x.nnz = m*n; x.val = blockW; \ ax.memory_location= Magma_DEV; ax.num_rows = m*n; ax.nnz = m*n; ax.val = AX; \ magma_d_spmv(alpha, A, x, beta, ax ); \ magmablas_dtranspose( n, m, blockW, n, X, m ); \ } //************************************************************** // Memory allocation for the eigenvectors, eigenvalues, and workspace solver_par->solver = Magma_LOBPCG; magma_int_t m = A.num_rows; magma_int_t n =(solver_par->num_eigenvalues); double *blockX = solver_par->eigenvectors; double *evalues = solver_par->eigenvalues; double *dwork, *hwork; double *blockP, *blockAP, *blockR, *blockAR, *blockAX, *blockW; double *gramA, *gramB, *gramM; double *gevectors, *h_gramB; double *pointer, *origX = blockX; double *eval_gpu; magma_int_t lwork = max( 2*n+n*magma_get_dsytrd_nb(n), 1 + 6*3*n + 2* 3*n* 3*n); magma_dmalloc_pinned( &hwork , lwork ); magma_dmalloc( &blockAX , m*n ); magma_dmalloc( &blockAR , m*n ); magma_dmalloc( &blockAP , m*n ); magma_dmalloc( &blockR , m*n ); magma_dmalloc( &blockP , m*n ); magma_dmalloc( &blockW , m*n ); magma_dmalloc( &dwork , m*n ); magma_dmalloc( &eval_gpu , 3*n ); //**********************************************************+ magma_int_t verbosity = 1; magma_int_t *iwork, liwork = 15*n+9; // === Set solver parameters === double residualTolerance = solver_par->epsilon; magma_int_t maxIterations = solver_par->maxiter; // === Set some constants & defaults === double c_one = MAGMA_D_ONE, c_zero = MAGMA_D_ZERO; double *residualNorms, *condestGhistory, condestG; double *gevalues; magma_int_t *activeMask; // === Check some parameters for possible quick exit === solver_par->info = 0; if (m < 2) solver_par->info = -1; else if (n > m) solver_par->info = -2; if (solver_par->info != 0) { magma_xerbla( __func__, -(solver_par->info) ); return solver_par->info; } magma_int_t *info = &(solver_par->info); // local info variable; // === Allocate GPU memory for the residual norms' history === magma_dmalloc(&residualNorms, (maxIterations+1) * n); magma_malloc( (void **)&activeMask, (n+1) * sizeof(magma_int_t) ); // === Allocate CPU work space === magma_dmalloc_cpu(&condestGhistory, maxIterations+1); magma_dmalloc_cpu(&gevalues, 3 * n); magma_malloc_cpu((void **)&iwork, liwork * sizeof(magma_int_t)); double *hW; magma_dmalloc_pinned(&hW, n*n); magma_dmalloc_pinned(&gevectors, 9*n*n); magma_dmalloc_pinned(&h_gramB , 9*n*n); // === Allocate GPU workspace === magma_dmalloc(&gramM, n * n); magma_dmalloc(&gramA, 9 * n * n); magma_dmalloc(&gramB, 9 * n * n); #if defined(PRECISION_z) || defined(PRECISION_c) double *rwork; magma_int_t lrwork = 1 + 5*(3*n) + 2*(3*n)*(3*n); magma_dmalloc_cpu(&rwork, lrwork); #endif // === Set activemask to one === for(int k =0; k<n; k++) iwork[k]=1; magma_setmatrix(n, 1, sizeof(magma_int_t), iwork, n ,activeMask, n); magma_int_t gramDim, ldgram = 3*n, ikind = 4; // === Make the initial 
vectors orthonormal ===
    magma_dgegqr_gpu(ikind, m, n, blockX, m, dwork, hwork, info );
    //magma_dorthomgs( m, n, blockX );

    magma_d_bspmv_tuned(m, n, c_one, A, blockX, c_zero, blockAX );

    // === Compute the Gram matrix = (X, AX) & its eigenstates ===
    magma_dgemm(MagmaConjTrans, MagmaNoTrans, n, n, m,
                c_one, blockX, m, blockAX, m, c_zero, gramM, n);

    magma_dsyevd_gpu( MagmaVec, MagmaUpper, n, gramM, n, evalues,
                      hW, n, hwork, lwork,
#if defined(PRECISION_z) || defined(PRECISION_c)
                      rwork, lrwork,
#endif
                      iwork, liwork, info );

    // === Update  X =  X * evectors ===
    magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n,
                c_one, blockX, m, gramM, n, c_zero, blockW, m);
    magmablas_swap(blockW, blockX);

    // === Update AX = AX * evectors ===
    magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n,
                c_one, blockAX, m, gramM, n, c_zero, blockW, m);
    magmablas_swap(blockW, blockAX);

    condestGhistory[1] = 7.82;
    magma_int_t iterationNumber, cBlockSize, restart = 1, iter;

    //Chronometry
    real_Double_t tempo1, tempo2;
    magma_device_sync();
    tempo1 = magma_wtime();

    // === Main LOBPCG loop ============================================================
    for(iterationNumber = 1; iterationNumber < maxIterations; iterationNumber++)
    {
        // === compute the residuals (R = Ax - x evalues )
        magmablas_dlacpy( MagmaUpperLower, m, n, blockAX, m, blockR, m);

        /*
        for(int i=0; i<n; i++){
            magma_daxpy(m, MAGMA_D_MAKE(-evalues[i],0), blockX+i*m, 1, blockR+i*m, 1);
        }
        */
#if defined(PRECISION_z) || defined(PRECISION_d)
        magma_dsetmatrix( 3*n, 1, evalues, 3*n, eval_gpu, 3*n );
#else
        magma_ssetmatrix( 3*n, 1, evalues, 3*n, eval_gpu, 3*n );
#endif
        magma_dlobpcg_res( m, n, eval_gpu, blockX, blockR, eval_gpu);

        magmablas_dnrm2_cols(m, n, blockR, m, residualNorms(0, iterationNumber));

        // === remove the residuals corresponding to already converged evectors
        magma_dcompact(m, n, blockR, m,
                       residualNorms(0, iterationNumber), residualTolerance,
                       activeMask, &cBlockSize);

        if (cBlockSize == 0)
            break;

        // === apply a preconditioner P to the active residuals: R_new = P R_old
        // === for now set P to be identity (no preconditioner => nothing to be done )
        // magmablas_dlacpy( MagmaUpperLower, m, cBlockSize, blockR, m, blockW, m);

        /*
        // === make the preconditioned residuals orthogonal to X
        magma_dgemm(MagmaConjTrans, MagmaNoTrans, n, cBlockSize, m,
                    c_one, blockX, m, blockR, m, c_zero, gramB(0,0), ldgram);
        magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, cBlockSize, n,
                    c_mone, blockX, m, gramB(0,0), ldgram, c_one, blockR, m);
        */

        // === make the active preconditioned residuals orthonormal
        magma_dgegqr_gpu(ikind, m, cBlockSize, blockR, m, dwork, hwork, info );
        //magma_dorthomgs( m, cBlockSize, blockR );

        // === compute AR
        magma_d_bspmv_tuned(m, cBlockSize, c_one, A, blockR, c_zero, blockAR );

        if (!restart) {
            // === compact P & AP as well
            magma_dcompactActive(m, n, blockP,  m, activeMask);
            magma_dcompactActive(m, n, blockAP, m, activeMask);

            /*
            // === make P orthogonal to X ?
            magma_dgemm(MagmaConjTrans, MagmaNoTrans, n, cBlockSize, m,
                        c_one, blockX, m, blockP, m, c_zero, gramB(0,0), ldgram);
            magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, cBlockSize, n,
                        c_mone, blockX, m, gramB(0,0), ldgram, c_one, blockP, m);

            // === make P orthogonal to R ?
            magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m,
                        c_one, blockR, m, blockP, m, c_zero, gramB(0,0), ldgram);
            magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, cBlockSize, cBlockSize,
                        c_mone, blockR, m, gramB(0,0), ldgram, c_one, blockP, m);
            */

            // === Make P orthonormal & properly change AP (without multiplication by A)
            magma_dgegqr_gpu(ikind, m, cBlockSize, blockP, m, dwork, hwork, info );
            //magma_dorthomgs( m, cBlockSize, blockP );

            //magma_d_bspmv_tuned(m, cBlockSize, c_one, A, blockP, c_zero, blockAP );
            magma_dsetmatrix( cBlockSize, cBlockSize, hwork, cBlockSize, dwork, cBlockSize);

            // magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit,
            //              m, cBlockSize, c_one, dwork, cBlockSize, blockAP, m);

            // replacement according to Stan
#if defined(PRECISION_s) || defined(PRECISION_d)
            magmablas_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit,
                             m, cBlockSize, c_one, dwork, cBlockSize, blockAP, m);
#else
            magma_dtrsm(     MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit,
                             m, cBlockSize, c_one, dwork, cBlockSize, blockAP, m);
#endif
        }

        iter = max( 1, iterationNumber - 10 - (int)(log(1.*cBlockSize)) );
        double condestGmean = 0.;
        for(int i = 0; i<iterationNumber-iter+1; i++)
            condestGmean += condestGhistory[i];
        condestGmean = condestGmean / (iterationNumber-iter+1);

        if (restart)
            gramDim = n+cBlockSize;
        else
            gramDim = n+2*cBlockSize;

        /* --- The Rayleigh-Ritz method for [X R P] ------------------------
           [ X R P ]' [AX AR AP] y = evalues [ X R P ]' [ X R P ], i.e.,

                    GramA                               GramB
           / X'AX  X'AR  X'AP \                 / X'X  X'R  X'P \
           | R'AX  R'AR  R'AP |  y  =  evalues  | R'X  R'R  R'P |
           \ P'AX  P'AR  P'AP /                 \ P'X  P'R  P'P /
           ----------------------------------------------------------------- */

        // === assemble GramB; first, set it to I
        magmablas_dlaset(MagmaFull, ldgram, ldgram, c_zero, c_one, gramB, ldgram);  // identity

        if (!restart) {
            magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, n, m,
                        c_one, blockP, m, blockX, m, c_zero, gramB(n+cBlockSize,0), ldgram);
            magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m,
                        c_one, blockP, m, blockR, m, c_zero, gramB(n+cBlockSize,n), ldgram);
        }
        magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, n, m,
                    c_one, blockR, m, blockX, m, c_zero, gramB(n,0), ldgram);

        // === get GramB from the GPU to the CPU and compute its eigenvalues only
        magma_dgetmatrix(gramDim, gramDim, gramB, ldgram, h_gramB, ldgram);
        lapackf77_dsyev("N", "L", &gramDim, h_gramB, &ldgram, gevalues,
                        hwork, &lwork,
#if defined(PRECISION_z) || defined(PRECISION_c)
                        rwork,
#endif
                        info);

        // === check stability criteria if we need to restart
        condestG = log10( gevalues[gramDim-1]/gevalues[0] ) + 1.;
        if ((condestG/condestGmean>2 && condestG>2) || condestG>8) {
            // Steepest descent restart for stability
            restart=1;
            printf("restart at step #%d\n", (int) iterationNumber);
        }

        // === assemble GramA; first, set it to I
        magmablas_dlaset(MagmaFull, ldgram, ldgram, c_zero, c_one, gramA, ldgram);  // identity

        magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, n, m,
                    c_one, blockR, m, blockAX, m, c_zero, gramA(n,0), ldgram);
        magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m,
                    c_one, blockR, m, blockAR, m, c_zero, gramA(n,n), ldgram);

        if (!restart) {
            magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, n, m,
                        c_one, blockP, m, blockAX, m, c_zero, gramA(n+cBlockSize,0), ldgram);
            magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m,
                        c_one, blockP, m, blockAR, m, c_zero, gramA(n+cBlockSize,n), ldgram);
            magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m,
                        c_one, blockP, m, blockAP,
m, c_zero, gramA(n+cBlockSize,n+cBlockSize), ldgram); } /* // === Compute X' AX or just use the eigenvalues below ? magma_dgemm(MagmaConjTrans, MagmaNoTrans, n, n, m, c_one, blockX, m, blockAX, m, c_zero, gramA(0,0), ldgram); */ if (restart==0) { magma_dgetmatrix(gramDim, gramDim, gramA, ldgram, gevectors, ldgram); } else { gramDim = n+cBlockSize; magma_dgetmatrix(gramDim, gramDim, gramA, ldgram, gevectors, ldgram); } for(int k=0; k<n; k++) *gevectors(k,k) = MAGMA_D_MAKE(evalues[k], 0); // === the previous eigensolver destroyed what is in h_gramB => must copy it again magma_dgetmatrix(gramDim, gramDim, gramB, ldgram, h_gramB, ldgram); magma_int_t itype = 1; lapackf77_dsygvd(&itype, "V", "L", &gramDim, gevectors, &ldgram, h_gramB, &ldgram, gevalues, hwork, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, &lrwork, #endif iwork, &liwork, info); for(int k =0; k<n; k++) evalues[k] = gevalues[k]; // === copy back the result to gramA on the GPU and use it for the updates magma_dsetmatrix(gramDim, gramDim, gevectors, ldgram, gramA, ldgram); if (restart == 0) { // === contribution from P to the new X (in new search direction P) magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockP, m, gramA(n+cBlockSize,0), ldgram, c_zero, dwork, m); magmablas_swap(dwork, blockP); // === contribution from R to the new X (in new search direction P) magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockR, m, gramA(n,0), ldgram, c_one, blockP, m); // === corresponding contribution from AP to the new AX (in AP) magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockAP, m, gramA(n+cBlockSize,0), ldgram, c_zero, dwork, m); magmablas_swap(dwork, blockAP); // === corresponding contribution from AR to the new AX (in AP) magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockAR, m, gramA(n,0), ldgram, c_one, blockAP, m); } else { // === contribution from R (only) to the new X magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockR, m, gramA(n,0), ldgram, c_zero, blockP, m); // === corresponding contribution from AR (only) to the new AX magma_dgemm(MagmaNoTrans, MagmaNoTrans,m, n, cBlockSize, c_one, blockAR, m, gramA(n,0), ldgram, c_zero, blockAP, m); } // === contribution from old X to the new X + the new search direction P magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, blockX, m, gramA, ldgram, c_zero, dwork, m); magmablas_swap(dwork, blockX); //magma_daxpy(m*n, c_one, blockP, 1, blockX, 1); magma_dlobpcg_maxpy( m, n, blockP, blockX ); // === corresponding contribution from old AX to new AX + AP magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, blockAX, m, gramA, ldgram, c_zero, dwork, m); magmablas_swap(dwork, blockAX); //magma_daxpy(m*n, c_one, blockAP, 1, blockAX, 1); magma_dlobpcg_maxpy( m, n, blockAP, blockAX ); condestGhistory[iterationNumber+1]=condestG; if (verbosity==1) { // double res; // magma_dgetmatrix(1, 1, // (double*)residualNorms(0, iterationNumber), 1, // (double*)&res, 1); // // printf("Iteration %4d, CBS %4d, Residual: %10.7f\n", // iterationNumber, cBlockSize, res); printf("%4d-%2d ", (int) iterationNumber, (int) cBlockSize); magma_dprint_gpu(1, n, residualNorms(0, iterationNumber), 1); } restart = 0; } // === end for iterationNumber = 1,maxIterations ======================= // fill solver info magma_device_sync(); tempo2=magma_wtime(); solver_par->runtime = (real_Double_t) tempo2-tempo1; solver_par->numiter = iterationNumber; if( solver_par->numiter < solver_par->maxiter){ solver_par->info = 0; 
}else if( solver_par->init_res > solver_par->final_res ) solver_par->info = -2; else solver_par->info = -1; // ============================================================================= // === postprocessing; // ============================================================================= // === compute the real AX and corresponding eigenvalues magma_d_bspmv_tuned(m, n, c_one, A, blockX, c_zero, blockAX ); magma_dgemm(MagmaConjTrans, MagmaNoTrans, n, n, m, c_one, blockX, m, blockAX, m, c_zero, gramM, n); magma_dsyevd_gpu( MagmaVec, MagmaUpper, n, gramM, n, gevalues, dwork, n, hwork, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, info ); for(int k =0; k<n; k++) evalues[k] = gevalues[k]; // === update X = X * evectors magmablas_swap(blockX, dwork); magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, dwork, m, gramM, n, c_zero, blockX, m); // === update AX = AX * evectors to compute the final residual magmablas_swap(blockAX, dwork); magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, dwork, m, gramM, n, c_zero, blockAX, m); // === compute R = AX - evalues X magmablas_dlacpy( MagmaUpperLower, m, n, blockAX, m, blockR, m); for(int i=0; i<n; i++) magma_daxpy(m, MAGMA_D_MAKE(-evalues[i], 0), blockX+i*m, 1, blockR+i*m, 1); // === residualNorms[iterationNumber] = || R || magmablas_dnrm2_cols(m, n, blockR, m, residualNorms(0, iterationNumber)); // === restore blockX if needed if (blockX != origX) magmablas_dlacpy( MagmaUpperLower, m, n, blockX, m, origX, m); printf("Eigenvalues:\n"); for(int i =0; i<n; i++) printf("%e ", evalues[i]); printf("\n\n"); printf("Final residuals:\n"); magma_dprint_gpu(1, n, residualNorms(0, iterationNumber), 1); printf("\n\n"); //=== Print residual history in a file for plotting ==== double *hresidualNorms; magma_dmalloc_cpu(&hresidualNorms, (iterationNumber+1) * n); magma_dgetmatrix(n, iterationNumber, (double*)residualNorms, n, (double*)hresidualNorms, n); printf("Residuals are stored in file residualNorms\n"); printf("Plot the residuals using: myplot \n"); FILE *residuals_file; residuals_file = fopen("residualNorms", "w"); for(int i =1; i<iterationNumber; i++) { for(int j = 0; j<n; j++) fprintf(residuals_file, "%f ", *hresidualNorms(j,i)); fprintf(residuals_file, "\n"); } fclose(residuals_file); magma_free_cpu(hresidualNorms); // === free work space magma_free( residualNorms ); magma_free_cpu( condestGhistory ); magma_free_cpu( gevalues ); magma_free_cpu( iwork ); magma_free_pinned( hW ); magma_free_pinned( gevectors ); magma_free_pinned( h_gramB ); magma_free( gramM ); magma_free( gramA ); magma_free( gramB ); magma_free( activeMask ); magma_free( blockAX ); magma_free( blockAR ); magma_free( blockAP ); magma_free( blockR ); magma_free( blockP ); magma_free( blockW ); magma_free( dwork ); magma_free( eval_gpu ); magma_free_pinned( hwork ); #if defined(PRECISION_z) || defined(PRECISION_c) magma_free_cpu( rwork ); #endif return MAGMA_SUCCESS; }
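For orientation, a minimal call sketch for the solver above. It assumes A has already been assembled as a magma_d_sparse_matrix on the device; the solver_par fields and allocation helpers are the ones used inside the routine itself, but the surrounding setup (matrix I/O, random initial guess) is omitted and may vary between MAGMA-sparse versions:

    /* hedged usage sketch -- setup of A and the initial block is omitted */
    magma_d_solver_par solver_par;
    solver_par.num_eigenvalues = 32;      /* block size n = #wanted eigenpairs */
    solver_par.epsilon         = 1e-8;    /* residual tolerance                */
    solver_par.maxiter         = 500;

    magma_int_t m = A.num_rows;
    magma_int_t n = solver_par.num_eigenvalues;
    magma_dmalloc(     &solver_par.eigenvectors, m*n );   /* device block X   */
    magma_dmalloc_cpu( &solver_par.eigenvalues,  3*n );   /* host eigenvalues */
    /* ... fill solver_par.eigenvectors with an initial guess ... */

    magma_dlobpcg( A, &solver_par );
    printf( "smallest eigenvalue %e, %d iterations, info = %d\n",
            solver_par.eigenvalues[0], (int) solver_par.numiter,
            (int) solver_par.info );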
/**
    Purpose
    -------
    DGEQRS solves the least squares problem
           min || A*X - C ||
    using the QR factorization A = Q*R computed by DGEQRF3_GPU.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A. M >= N >= 0.

    @param[in]
    nrhs    INTEGER
            The number of columns of the matrix C. NRHS >= 0.

    @param[in]
    dA      DOUBLE PRECISION array on the GPU, dimension (LDDA,N)
            The i-th column must contain the vector which defines the
            elementary reflector H(i), for i = 1,2,...,n, as returned by
            DGEQRF3_GPU in the first n columns of its array argument A.
            dA is modified by the routine but restored on exit.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array A, LDDA >= M.

    @param[in]
    tau     DOUBLE PRECISION array, dimension (N)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by MAGMA_DGEQRF_GPU.

    @param[in,out]
    dT      DOUBLE PRECISION array that is the output (the 6th argument)
            of magma_dgeqrf_gpu of size
            2*MIN(M, N)*NB + (ceil(N/32)*32)*MAX(NB, NRHS).
            The array starts with a block of size MIN(M,N)*NB that stores
            the triangular T matrices used in the QR factorization,
            followed by a MIN(M,N)*NB block storing the diagonal block
            matrices for the R matrix, followed by work space of size
            (ceil(N/32)*32)*MAX(NB, NRHS).

    @param[in,out]
    dB      DOUBLE PRECISION array on the GPU, dimension (LDDB,NRHS)
            On entry, the M-by-NRHS matrix C.
            On exit, the N-by-NRHS solution matrix X.

    @param[in]
    lddb    INTEGER
            The leading dimension of the array dB. LDDB >= M.

    @param[out]
    hwork   (workspace) DOUBLE PRECISION array, dimension (LWORK)
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK,
            LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB,
            where NB is the blocksize given by magma_get_dgeqrf_nb( M, N ).
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the HWORK array and returns
            this value as the first entry of the WORK array.

    @param[out]
    info    INTEGER
            - = 0:  successful exit
            - < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_dgels_comp
    ********************************************************************/
extern "C" magma_int_t
magma_dgeqrs3_gpu(
    magma_int_t m, magma_int_t n, magma_int_t nrhs,
    magmaDouble_ptr dA,    magma_int_t ldda,
    double const *tau,
    magmaDouble_ptr dT,
    magmaDouble_ptr dB,    magma_int_t lddb,
    double *hwork, magma_int_t lwork,
    magma_int_t *info)
{
    #define dA(i_,j_) (dA + (i_) + (j_)*ldda)
    #define dT(i_)    (dT + (lddwork + (i_))*nb)

    double c_one = MAGMA_D_ONE;
    magma_int_t min_mn, lddwork;

    magma_int_t nb = magma_get_dgeqrf_nb( m, n );
    magma_int_t lwkopt = (m - n + nb)*(nrhs + nb) + nrhs*nb;
    bool lquery = (lwork == -1);

    hwork[0] = magma_dmake_lwork( lwkopt );

    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0 || m < n)
        *info = -2;
    else if (nrhs < 0)
        *info = -3;
    else if (ldda < max(1,m))
        *info = -5;
    else if (lddb < max(1,m))
        *info = -8;
    else if (lwork < lwkopt && ! lquery)
        *info = -10;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;

    min_mn = min(m,n);
    if (min_mn == 0) {
        hwork[0] = c_one;
        return *info;
    }
    lddwork = min_mn;

    magma_queue_t queue;
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );

    /* B := Q**T * B */
    magma_dormqr_gpu( MagmaLeft, MagmaTrans, m, nrhs, n,
                      dA(0,0), ldda, tau, dB, lddb, hwork, lwork, dT, nb, info );
    if ( *info != 0 ) {
        magma_queue_destroy( queue );
        return *info;
    }

    /* Solve R*X = B(1:n,:):
       1. Move the (min_mn - 1)/nb block diagonal submatrices from dT to R.
       2. Solve.
       3. Restore the data format, moving data from R back to dT. */
    magmablas_dswapdblk( min_mn-1, nb, dA(0,0), ldda, 1, dT(0), nb, 0, queue );
    if ( nrhs == 1 ) {
        magma_dtrsv( MagmaUpper, MagmaNoTrans, MagmaNonUnit, n,
                     dA(0,0), ldda, dB, 1, queue );
    } else {
        magma_dtrsm( MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, nrhs,
                     c_one, dA(0,0), ldda, dB, lddb, queue );
    }
    magmablas_dswapdblk( min_mn-1, nb, dT(0), nb, 0, dA(0,0), ldda, 1, queue );

    magma_queue_destroy( queue );
    return *info;
}
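A typical least-squares drive pairs this routine with the factorization it expects. The sketch below follows the workspace formulas from the documentation above (the dT size uses the corrected formula); declarations of dA, dB, and the data transfers are assumed to be set up by the caller, and error handling is omitted:

    /* least-squares sketch: X = argmin || A*X - B ||, A is m-by-n on the GPU */
    magma_int_t nb    = magma_get_dgeqrf_nb( m, n );
    magma_int_t lwork = (m - n + nb)*(nrhs + nb) + nrhs*nb;
    magma_int_t ldt   = 2*min(m,n)*nb + ((n+31)/32)*32 * max(nb, nrhs);

    double *tau, *hwork;
    magmaDouble_ptr dT;
    magma_dmalloc_cpu(    &tau,   min(m,n) );
    magma_dmalloc_pinned( &hwork, lwork );
    magma_dmalloc(        &dT,    ldt );

    magma_dgeqrf3_gpu( m, n, dA, ldda, tau, dT, &info );        /* A = Q*R      */
    magma_dgeqrs3_gpu( m, n, nrhs, dA, ldda, tau, dT,
                       dB, lddb, hwork, lwork, &info );         /* X in dB(1:n) */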
extern "C" magma_int_t magma_dsytrf_nopiv(magma_uplo_t uplo, magma_int_t n, double *A, magma_int_t lda, magma_int_t *info) { /* -- MAGMA (version 1.6.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2011 Purpose ======= DSYTRF_nopiv computes the LDLt factorization of a real symmetric matrix A. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The factorization has the form A = U\*\*H * D * U, if UPLO = 'U', or A = L * D * L\*\*H, if UPLO = 'L', where U is an upper triangular matrix, L is lower triangular, and D is a diagonal matrix. This is the block version of the algorithm, calling Level 3 BLAS. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) DOUBLE_PRECISION array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization A = U\*\*H*U or A = L*L\*\*H. Higher performance is achieved if A is in pinned memory, e.g. allocated using cudaMallocHost. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value if INFO = -6, the GPU memory allocation failed > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. ===================================================================== */ /* Local variables */ double zone = MAGMA_D_ONE; double mzone = MAGMA_D_NEG_ONE; int upper = (uplo == MagmaUpper); magma_int_t j, k, jb, ldda, nb, ib, iinfo; magmaDouble_ptr dA; magmaDouble_ptr dW; *info = 0; if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return MAGMA_ERR_ILLEGAL_VALUE; } /* Quick return */ if ( n == 0 ) return MAGMA_SUCCESS; ldda = ((n+31)/32)*32; nb = magma_get_dsytrf_nopiv_nb(n); ib = min(32, nb); // inner-block for diagonal factorization if ((MAGMA_SUCCESS != magma_dmalloc(&dA, n *ldda)) || (MAGMA_SUCCESS != magma_dmalloc(&dW, nb*ldda))) { /* alloc failed so call the non-GPU-resident version */ *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_t stream[2]; magma_event_t event; magma_queue_create(&stream[0]); magma_queue_create(&stream[1]); magma_event_create( &event ); trace_init( 1, 1, 2, (CUstream_st**)stream ); //if (nb <= 1 || nb >= n) //{ // lapackf77_dpotrf(uplo_, &n, a, &lda, info); //} else { /* Use hybrid blocked code. */ if (upper) { //========================================================= // Compute the LDLt factorization A = U'*D*U without pivoting. 
// copy matrix to GPU for (j=0; j<n; j+=nb) { jb = min(nb, (n-j)); trace_gpu_start( 0, 0, "set", "set" ); magma_dsetmatrix_async(j+jb, jb, A(0, j), lda, dA(0, j), ldda, stream[0]); trace_gpu_end( 0, 0 ); } // main loop for (j=0; j<n; j += nb) { jb = min(nb, (n-j)); // copy A(j,j) back to CPU trace_gpu_start( 0, 0, "get", "get" ); magma_dgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, stream[0]); trace_gpu_end( 0, 0 ); // copy j-th column of U back to CPU magma_queue_wait_event( stream[1], event ); trace_gpu_start( 0, 1, "get", "get" ); magma_dgetmatrix_async(j, jb, dA(0, j), ldda, A(0, j), lda, stream[1]); trace_gpu_end( 0, 1 ); // factorize the diagonal block magma_queue_sync(stream[0]); trace_cpu_start( 0, "potrf", "potrf" ); dsytrf_nopiv_cpu(MagmaUpper, jb, ib, A(j, j), lda, info); trace_cpu_end( 0 ); if (*info != 0){ *info = *info + j; break; } // copy A(j,j) back to GPU trace_gpu_start( 0, 0, "set", "set" ); magma_dsetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, stream[0]); trace_gpu_end( 0, 0 ); if ( (j+jb) < n) { // compute the off-diagonal blocks of current block column magmablasSetKernelStream( stream[0] ); trace_gpu_start( 0, 0, "trsm", "trsm" ); magma_dtrsm(MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaUnit, jb, (n-j-jb), zone, dA(j, j), ldda, dA(j, j+jb), ldda); magma_dcopymatrix( jb, n-j-jb, dA( j, j+jb ), ldda, dWt( 0, j+jb ), nb ); // update the trailing submatrix with D magmablas_dlascl_diag(MagmaUpper, jb, n-j-jb, dA(j, j), ldda, dA(j, j+jb), ldda, &iinfo); magma_event_record( event, stream[0] ); trace_gpu_end( 0, 0 ); // update the trailing submatrix with U and W trace_gpu_start( 0, 0, "gemm", "gemm" ); for (k=j+jb; k<n; k+=nb) { magma_int_t kb = min(nb,n-k); magma_dgemm(MagmaConjTrans, MagmaNoTrans, kb, n-k, jb, mzone, dWt(0, k), nb, dA(j, k), ldda, zone, dA(k, k), ldda); } trace_gpu_end( 0, 0 ); } } } else { //========================================================= // Compute the LDLt factorization A = L*D*L' without pivoting. 
// copy the matrix to GPU for (j=0; j<n; j+=nb) { jb = min(nb, (n-j)); trace_gpu_start( 0, 0, "set", "set" ); magma_dsetmatrix_async((n-j), jb, A(j, j), lda, dA(j, j), ldda, stream[0]); trace_gpu_end( 0, 0 ); } // main loop for (j=0; j<n; j+=nb) { jb = min(nb, (n-j)); // copy A(j,j) back to CPU trace_gpu_start( 0, 0, "get", "get" ); magma_dgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, stream[0]); trace_gpu_end( 0, 0 ); // copy j-th row of L back to CPU magma_queue_wait_event( stream[1], event ); trace_gpu_start( 0, 1, "get", "get" ); magma_dgetmatrix_async(jb, j, dA(j, 0), ldda, A(j, 0), lda, stream[1]); trace_gpu_end( 0, 1 ); // factorize the diagonal block magma_queue_sync(stream[0]); trace_cpu_start( 0, "potrf", "potrf" ); dsytrf_nopiv_cpu(MagmaLower, jb, ib, A(j, j), lda, info); trace_cpu_end( 0 ); if (*info != 0){ *info = *info + j; break; } // copy A(j,j) back to GPU trace_gpu_start( 0, 0, "set", "set" ); magma_dsetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, stream[0]); trace_gpu_end( 0, 0 ); if ( (j+jb) < n) { // compute the off-diagonal blocks of current block column magmablasSetKernelStream( stream[0] ); trace_gpu_start( 0, 0, "trsm", "trsm" ); magma_dtrsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaUnit, (n-j-jb), jb, zone, dA(j, j), ldda, dA(j+jb, j), ldda); magma_dcopymatrix( n-j-jb,jb, dA( j+jb, j ), ldda, dW( j+jb, 0 ), ldda ); // update the trailing submatrix with D magmablas_dlascl_diag(MagmaLower, n-j-jb, jb, dA(j, j), ldda, dA(j+jb, j), ldda, &iinfo); magma_event_record( event, stream[0] ); trace_gpu_end( 0, 0 ); // update the trailing submatrix with L and W trace_gpu_start( 0, 0, "gemm", "gemm" ); for (k=j+jb; k<n; k+=nb) { magma_int_t kb = min(nb,n-k); magma_dgemm(MagmaNoTrans, MagmaConjTrans, n-k, kb, jb, mzone, dA(k, j), ldda, dW(k, 0), ldda, zone, dA(k, k), ldda); } trace_gpu_end( 0, 0 ); } } } } trace_finalize( "dsytrf.svg","trace.css" ); magma_queue_destroy(stream[0]); magma_queue_destroy(stream[1]); magma_event_destroy( event ); magma_free(dW); magma_free(dA); return MAGMA_SUCCESS; } /* magma_dsytrf_nopiv */
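To make the factorization explicit, here is an unblocked CPU reference for the lower-triangular case; it is a plain-C illustration of A = L*D*L**T without pivoting, not the blocked hybrid path above (which produces the same factors via dtrsm, dlascl_diag, and dgemm):

    /* After the loop, the unit lower triangular L sits in the strictly lower
       part of A and D on the diagonal, with A = L*D*L**T (column-major). */
    static int ldlt_nopiv_lower(int n, double *A, int lda)
    {
        for (int j = 0; j < n; j++) {
            double djj = A[j + j*lda];
            if (djj == 0.0) return j + 1;     /* zero pivot: cannot continue */
            for (int i = j+1; i < n; i++)
                A[i + j*lda] /= djj;          /* column j of L               */
            for (int k = j+1; k < n; k++)     /* trailing update:            */
                for (int i = k; i < n; i++)   /* A(i,k) -= l(i,j)*d(j)*l(k,j) */
                    A[i + k*lda] -= A[i + j*lda] * djj * A[k + j*lda];
        }
        return 0;
    }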
extern "C" magma_int_t magma_dpotrf2_mgpu( magma_int_t num_gpus, magma_uplo_t uplo, magma_int_t m, magma_int_t n, magma_int_t off_i, magma_int_t off_j, magma_int_t nb, magmaDouble_ptr *d_lA, size_t d_lA_offset, magma_int_t ldda, magmaDouble_ptr *d_lP, magma_int_t lddp, double *a, magma_int_t lda, magma_int_t h, magma_queue_t *queues, magma_int_t *info ) { /* -- clMAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = 'U', or dA = L * L**H, if UPLO = 'L', where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of dA is stored; = 'L': Lower triangle of dA is stored. N (input) INTEGER The order of the matrix dA. N >= 0. dA (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N) On entry, the symmetric matrix dA. If UPLO = 'U', the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be divisible by 16. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. ===================================================================== */ magma_int_t j, jb, nb0, nb2, dd, d, id, j_local, j_local2, buf; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double d_one = 1.0; double d_neg_one = -1.0; magmaDouble_ptr dlpanel; size_t dlpanel_offset; magma_int_t n_local[MagmaMaxGPUs], ldpanel; magma_event_t events[MagmaMaxGPUs]; *info = 0; if ( (uplo != MagmaUpper) && (uplo != MagmaLower) ) { *info = -1; } else if (n < 0) { *info = -2; } else if ((uplo != MagmaUpper) && num_gpus*ldda < max(1,n)) { *info = -4; } else if ((uplo == MagmaUpper) && ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } { for( d=0; d<num_gpus; d++ ) { /* local-n and local-ld */ if (uplo == MagmaUpper) { n_local[d] = ((n/nb)/num_gpus)*nb; if (d < (n/nb)%num_gpus) n_local[d] += nb; else if (d == (n/nb)%num_gpus) n_local[d] += n%nb; } else { n_local[d] = ((m/nb)/num_gpus)*nb; if (d < (m/nb)%num_gpus) n_local[d] += nb; else if (d == (m/nb)%num_gpus) n_local[d] += m%nb; } } /* Use blocked code. */ if (uplo == MagmaUpper) { /* ---------------------------------------------- */ /* Upper-triangular case */ /* > Compute the Cholesky factorization A = U'*U. 
*/ /* ---------------------------------------------- */ for(j=0;j<m;j+=nb){ /* Set the GPU number that holds the current panel */ id = (j/nb)%num_gpus; buf = (j/nb)%num_gpus; /* Set the local index where the current panel is */ j_local = j/(nb*num_gpus); jb = min(nb, (m-j)); if(j>0){ magma_queue_sync(queues[id*2]); // wait for the column on CPU /* broadcast off-diagonal column to all gpus */ d = (j/nb+1)%num_gpus; for(dd=0;dd<num_gpus;dd++){ if(d != id){ //magma_queue_sync(queues[2*d]); magma_dsetmatrix_async( j, jb, Aup(0,j), lda, dlP(d,jb,0,buf), lddp, queues[d*2], NULL ); } d = (d+1)%num_gpus; } } /* Update the current diagonal block */ if( j > 0 ) { magma_dsyrk(MagmaUpper, MagmaConjTrans, jb, j, d_neg_one, dlA(id, 0, nb*j_local), ldda, d_one, dlA(id, j, nb*j_local), ldda, queues[2*id+1]); } /* send the diagonal to cpu */ magma_queue_sync(queues[2*id+1]);// wait for syrk magma_dgetmatrix_async( jb, jb, dlA(id, j, nb*j_local), ldda, Aup(j,j), lda, queues[2*id], NULL); if(j>0){ /* Compute the local block column of the panel. */ d = (j/nb+1)%num_gpus; for(dd=0;dd<num_gpus;dd++){ j_local2 = j_local+1; if(d>id) j_local2 --; nb0 = nb*j_local2; if(n_local[d]>nb0){ /* wait for the off-diagonal */ if(d!=id){ dlpanel = d_lP[d]; dlpanel_offset = dlP_offset(jb, 0, buf); ldpanel = lddp; /* wait for the offdiagonal column */ magma_queue_sync(queues[d*2]); }else{ dlpanel = d_lA[d]; dlpanel_offset = dlA_offset(0, nb*j_local); ldpanel = ldda; } /* update the panel */ magma_dgemm(MagmaConjTrans, MagmaNoTrans, jb, n_local[d]-nb0, j, c_neg_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, 0, nb0), ldda, c_one, dlA(d, j, nb0), ldda, queues[2*d+1]); } d = (d+1)%num_gpus; } } /* factor the diagonal */ magma_queue_sync( queues[id*2] ); // wait for the diagonal lapackf77_dpotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info); if (*info != 0) { *info = *info + j; break; } /* send the diagonal to gpus */ if ( (j+jb) < n) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { if( d == id ) { dlpanel = d_lA[d]; dlpanel_offset = dlA_offset(j, nb*j_local); ldpanel = ldda; } else { dlpanel = d_lP[d]; dlpanel_offset = dlP_offset(0, 0, buf); ldpanel = lddp; } magma_dsetmatrix_async( jb, jb, Aup(j,j), lda, dlpanel, dlpanel_offset, ldpanel, queues[d*2], NULL); d = (d+1)%num_gpus; } } else { magma_dsetmatrix_async( jb, jb, Aup(j,j), lda, dlA(id, j, nb*j_local), ldda, queues[id*2], NULL ); } /* panel-factorize the off-diagonal */ if((j+jb)<n){ d = (j/nb+1)%num_gpus; for(dd=0;dd<num_gpus;dd++){ /* next column */ j_local2 = j_local+1; if(d>id) j_local2--; if( d == id ) { dlpanel = d_lA[d]; dlpanel_offset = dlA_offset(j, nb*j_local); ldpanel = ldda; } else { dlpanel = d_lP[d]; dlpanel_offset = dlP_offset(0, 0, buf); ldpanel = lddp; } nb2 = n_local[d]-nb*j_local2; nb0 = min(nb, nb2 ); magma_queue_sync( queues[2*d]); // wait for the diagonal if(j+jb < m && d == (j/nb+1)%num_gpus){ /* owns the next column, look-ahead the column */ magma_dtrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb0, c_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, j, nb*j_local2), ldda, queues[2*d+1]); /* send the column to cpu */ if(j+jb < m){ magma_queue_sync(queues[2*d+1]); // wait for lookahead magma_dgetmatrix_async( (j+jb), nb0, dlA(d, 0, nb*j_local2), ldda, Aup(0,j+jb), lda, queues[2*d], NULL); } /* update the remaining blocks */ nb2 = nb2 - nb0; magma_dtrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, j, nb*j_local2+nb0), ldda, queues[2*d+1]); }else if(nb2 > 0){ /* update the 
entire trailing matrix */ magma_dtrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, j, nb*j_local2), ldda, queues[d*2+1]); } d = (d+1)%num_gpus; } } } } else { /* -------------------------------------------- */ /* Lower-triangular case */ /* Compute the Cholesky factorization A = L*L'. */ /* -------------------------------------------- */ for (j=0; j<n; j+=nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%num_gpus; buf = (j/nb)%num_gpus; /* Set the local index where the current panel is */ j_local = j/(nb*num_gpus); jb = min(nb, (n-j)); if( j > 0 ) { /* needed on pluto... */ //magma_setdevice(id); magma_queue_sync( queues[id*2] ); // wait for the column on CPU /* broadcast offdiagonal row to all gpus */ d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { if( d != id ) { /* wait for it on CPU */ //magma_queue_sync( queues[d*2] ); /* send it to GPU */ magma_dsetmatrix_async( jb, j, Alo(j,0), lda, dlPT(d,0,jb,buf), nb, queues[d*2], NULL ); clFlush(queues[d*2]); } d = (d+1)%num_gpus; } } /* Update the current diagonal block */ if( j > 0 ) { magma_dsyrk(MagmaLower, MagmaNoTrans, jb, j, d_neg_one, dlA(id, nb*j_local, 0), ldda, d_one, dlA(id, nb*j_local, j), ldda, queues[id*2+1]); magma_queue_sync( queues[id*2+1] ); // wait for syrk } /* update the offdiagonal blocks */ if ( j > 0 ) { /* compute the block-rows of the panel */ d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { j_local2 = j_local+1; if( d > id ) j_local2 --; nb0 = nb*j_local2; if( nb0 < n_local[d] ) { if( d != id ) { //dlpanel = dlPT(d); dlpanel = d_lP[d]; dlpanel_offset = dlPT_offset(0, jb, buf); ldpanel = nb; /* wait for offdiagonal row */ magma_queue_sync(queues[d*2]); } else { dlpanel = d_lA[d]; dlpanel_offset = dlA_offset(nb*j_local, 0); ldpanel = ldda; } magma_dgemm( MagmaNoTrans, MagmaConjTrans, n_local[d]-nb0, jb, j, c_neg_one, dlA(d, nb0, 0), ldda, dlpanel, dlpanel_offset, ldpanel, c_one, dlA(d, nb0, j), ldda, queues[d*2+1]); } d = (d+1)%num_gpus; } } /* send the diagonal to cpu */ magma_dgetmatrix_async( jb, jb, dlA(id, nb*j_local, j), ldda, Alo(j,j), lda, queues[id*2], &events[id] ); clFlush(queues[id*2]); /* factor the diagonal */ magma_queue_sync( queues[id*2] ); lapackf77_dpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info); if (*info != 0) { printf("row number: %d\n", (int) j); *info = *info + j; break; } /* send the diagonal to gpus */ if ( (j+jb) < m ) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { if( d == id ) { dlpanel = d_lA[d]; dlpanel_offset = dlA_offset(nb*j_local, j); ldpanel = ldda; } else { //dlpanel = dlPT(d); dlpanel = d_lP[d]; dlpanel_offset = dlPT_offset(0, 0, buf); ldpanel = nb; } magma_dsetmatrix_async( jb, jb, Alo(j,j), lda, dlpanel, dlpanel_offset, ldpanel, queues[d*2], NULL ); clFlush(queues[d*2]); d = (d+1)%num_gpus; } } else { magma_dsetmatrix_async( jb, jb, Alo(j,j), lda, dlA(id, nb*j_local, j), ldda, queues[id*2], NULL ); clFlush(queues[id*2]); } /* factorize off-diagonal blocks */ if ( (j+jb) < m ) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { /* next column */ j_local2 = j_local+1; if( d > id ) j_local2--; if( d == id ) { dlpanel = d_lA[d]; dlpanel_offset = dlA_offset(nb*j_local, j); ldpanel = ldda; } else { //dlpanel = dlPT(d); dlpanel = d_lP[d]; dlpanel_offset = dlPT_offset(0, 0, buf); ldpanel = nb; } nb2 = n_local[d] - j_local2*nb; nb0 = min(nb, nb2 ); magma_queue_sync(queues[d*2]); // wait for the diagonal if( j+jb < n && d == (j/nb+1)%num_gpus ) { /* owns the next column, 
look-ahead the column */ magma_dtrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb0, jb, c_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, nb*j_local2, j), ldda, queues[d*2+1]); /* send the column to cpu */ if( j+jb < n ) { magma_queue_sync( queues[d*2+1] ); // wait for lookahead magma_dgetmatrix_async( nb0, j+jb, dlA(d, nb*j_local2, 0), ldda, Alo(j+jb,0), lda, queues[d*2], NULL); clFlush(queues[d*2]); } /* update the remaining blocks */ nb2 = nb2 - nb0; magma_dtrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, nb*j_local2+nb0, j), ldda, queues[d*2+1]); } else if( nb2 > 0 ) { /* update the entire trailing matrix */ magma_dtrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, nb*j_local2, j), ldda, queues[d*2+1]); } d = (d+1)%num_gpus; } } } } /* end of else not upper */ /* clean up */ for( d=0; d<num_gpus; d++ ) { magma_queue_sync( queues[d*2] ); magma_queue_sync( queues[d*2+1] ); } } /* end of not lapack */ return *info; } /* magma_dpotrf_mgpu */
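One scheduling detail worth calling out: every broadcast and update pass above starts its device loop at (j/nb+1) % num_gpus, the owner of the next panel, so the look-ahead column is always served first. Schematically (illustrative loop only, not a MAGMA routine):

    int first = (j/nb + 1) % num_gpus;    /* owner of the next panel         */
    int d = first;
    for (int dd = 0; dd < num_gpus; dd++) {
        /* on device d: send panel / apply update; when d == first this is  */
        /* the look-ahead work that unblocks the next panel factorization   */
        d = (d + 1) % num_gpus;
    }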