/**
    Purpose
    -------
    ZPOTRF computes the Cholesky factorization of a complex Hermitian
    positive definite matrix dA.

    The factorization has the form
        dA = U**H * U,   if UPLO = MagmaUpper, or
        dA = L  * L**H,  if UPLO = MagmaLower,
    where U is an upper triangular matrix and L is lower triangular.

    This is the block version of the algorithm, calling Level 3 BLAS.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of dA is stored;
      -     = MagmaLower:  Lower triangle of dA is stored.

    @param[in]
    n       INTEGER
            The order of the matrix dA.  N >= 0.

    @param[in,out]
    dA      COMPLEX_16 array on the GPU, dimension (LDDA,N)
            On entry, the Hermitian matrix dA.  If UPLO = MagmaUpper, the leading
            N-by-N upper triangular part of dA contains the upper
            triangular part of the matrix dA, and the strictly lower
            triangular part of dA is not referenced.  If UPLO = MagmaLower, the
            leading N-by-N lower triangular part of dA contains the lower
            triangular part of the matrix dA, and the strictly upper
            triangular part of dA is not referenced.
    \n
            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization dA = U**H * U or dA = L * L**H.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalescent memory accesses LDDA must be
            divisible by 16.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.

    @ingroup magma_zposv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zpotrf_gpu(
    magma_uplo_t uplo, magma_int_t n,
    magmaDoubleComplex_ptr dA, magma_int_t ldda,
    magma_int_t *info )
{
    #ifdef HAVE_clBLAS
    #define dA(i_, j_)  dA, ((i_) + (j_)*ldda + dA_offset)
    #else
    #define dA(i_, j_) (dA + (i_) + (j_)*ldda)
    #endif

    /* Constants */
    const magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    const magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    const double d_one     =  1.0;
    const double d_neg_one = -1.0;

    /* Local variables */
    const char* uplo_ = lapack_uplo_const( uplo );
    bool upper = (uplo == MagmaUpper);
    magma_int_t j, jb, nb;
    magmaDoubleComplex *work;

    *info = 0;
    if (! upper && uplo != MagmaLower) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,n)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    nb = magma_get_zpotrf_nb( n );

    if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, nb*nb )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }

    magma_queue_t queues[2];
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queues[0] );
    magma_queue_create( cdev, &queues[1] );

    if (nb <= 1 || nb >= n) {
        /* Use unblocked code. */
        magma_zgetmatrix( n, n, dA(0,0), ldda, work, n, queues[0] );
        lapackf77_zpotrf( uplo_, &n, work, &n, info );
        magma_zsetmatrix( n, n, work, n, dA(0,0), ldda, queues[0] );
    }
    else {
        /* Use blocked code. */
        if (upper) {
            //=========================================================
            /* Compute the Cholesky factorization A = U'*U. */
            for (j=0; j < n; j += nb) {
                // apply all previous updates to diagonal block,
                // then transfer it to CPU
                jb = min( nb, n-j );
                magma_zherk( MagmaUpper, MagmaConjTrans, jb, j,
                             d_neg_one, dA(0, j), ldda,
                             d_one,     dA(j, j), ldda, queues[1] );
                magma_queue_sync( queues[1] );
                magma_zgetmatrix_async( jb, jb,
                                        dA(j, j), ldda,
                                        work,     jb, queues[0] );

                // apply all previous updates to block row right of diagonal block
                if (j+jb < n) {
                    magma_zgemm( MagmaConjTrans, MagmaNoTrans,
                                 jb, n-j-jb, j,
                                 c_neg_one, dA(0, j   ), ldda,
                                            dA(0, j+jb), ldda,
                                 c_one,     dA(j, j+jb), ldda, queues[1] );
                }

                // simultaneous with above zgemm, transfer diagonal block,
                // factor it on CPU, and test for positive definiteness
                magma_queue_sync( queues[0] );
                lapackf77_zpotrf( MagmaUpperStr, &jb, work, &jb, info );
                magma_zsetmatrix_async( jb, jb,
                                        work,     jb,
                                        dA(j, j), ldda, queues[1] );
                if (*info != 0) {
                    *info = *info + j;
                    break;
                }

                // apply diagonal block to block row right of diagonal block
                if (j+jb < n) {
                    magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                 jb, n-j-jb,
                                 c_one, dA(j, j),    ldda,
                                        dA(j, j+jb), ldda, queues[1] );
                }
            }
        }
        else {
            //=========================================================
            // Compute the Cholesky factorization A = L*L'.
            for (j=0; j < n; j += nb) {
                // apply all previous updates to diagonal block,
                // then transfer it to CPU
                jb = min( nb, n-j );
                magma_zherk( MagmaLower, MagmaNoTrans, jb, j,
                             d_neg_one, dA(j, 0), ldda,
                             d_one,     dA(j, j), ldda, queues[1] );
                magma_queue_sync( queues[1] );
                magma_zgetmatrix_async( jb, jb,
                                        dA(j, j), ldda,
                                        work,     jb, queues[0] );

                // apply all previous updates to block column below diagonal block
                if (j+jb < n) {
                    magma_zgemm( MagmaNoTrans, MagmaConjTrans,
                                 n-j-jb, jb, j,
                                 c_neg_one, dA(j+jb, 0), ldda,
                                            dA(j,    0), ldda,
                                 c_one,     dA(j+jb, j), ldda, queues[1] );
                }

                // simultaneous with above zgemm, transfer diagonal block,
                // factor it on CPU, and test for positive definiteness
                magma_queue_sync( queues[0] );
                lapackf77_zpotrf( MagmaLowerStr, &jb, work, &jb, info );
                magma_zsetmatrix_async( jb, jb,
                                        work,     jb,
                                        dA(j, j), ldda, queues[1] );
                if (*info != 0) {
                    *info = *info + j;
                    break;
                }

                // apply diagonal block to block column below diagonal
                if (j+jb < n) {
                    magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                 n-j-jb, jb,
                                 c_one, dA(j,    j), ldda,
                                        dA(j+jb, j), ldda, queues[1] );
                }
            }
        }
    }

    magma_queue_destroy( queues[0] );
    magma_queue_destroy( queues[1] );

    magma_free_pinned( work );

    return *info;
} /* magma_zpotrf_gpu */
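
/*
 * Editor's usage sketch (not part of the MAGMA sources): factor a Hermitian
 * positive definite matrix that already resides on the GPU.  Assumes MAGMA
 * has been initialized with magma_init() and that d_A/ldda were allocated by
 * the caller; the helper name and error handling here are hypothetical.
 */
static magma_int_t example_factor_hpd_gpu(
    magma_int_t n, magmaDoubleComplex_ptr d_A, magma_int_t ldda )
{
    magma_int_t info = 0;
    // Lower-triangular Cholesky: on exit the lower triangle of d_A holds L.
    magma_zpotrf_gpu( MagmaLower, n, d_A, ldda, &info );
    if (info < 0) {
        // argument number -info was illegal
    } else if (info > 0) {
        // leading minor of order info is not positive definite
    }
    return info;
}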
extern "C" magma_int_t magma_zpotrf3_mgpu(magma_int_t num_gpus, char uplo, magma_int_t m, magma_int_t n, magma_int_t off_i, magma_int_t off_j, magma_int_t nb, magmaDoubleComplex *d_lA[], magma_int_t ldda, magmaDoubleComplex *d_lP[], magma_int_t lddp, magmaDoubleComplex *a, magma_int_t lda, magma_int_t h, magma_queue_t stream[][3], magma_event_t event[][5], magma_int_t *info ) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= ZPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix dA. Auxiliary subroutine for zpotrf2_ooc. It is multiple gpu interface to compute Cholesky of a "rectangular" matrix. The factorization has the form dA = U**H * U, if UPLO = 'U', or dA = L * L**H, if UPLO = 'L', where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of dA is stored; = 'L': Lower triangle of dA is stored. N (input) INTEGER The order of the matrix dA. N >= 0. dA (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the Hermitian matrix dA. If UPLO = 'U', the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be dividable by 16. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. ===================================================================== */ magma_int_t j, jb, nb0, nb2, d, dd, id, j_local, j_local2, buf; char uplo_[2] = {uplo, 0}; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; double d_one = 1.0; double d_neg_one = -1.0; int upper = lapackf77_lsame(uplo_, "U"); magmaDoubleComplex *dlpanel; magma_int_t n_local[MagmaMaxGPUs], ldpanel; const magma_int_t stream1 = 0, stream2 = 1, stream3 = 2; #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK) /* used by ztrsm_work */ int trsm_nb = 128; int trsm_n = trsm_nb*((nb+trsm_nb-1)/trsm_nb); magmaDoubleComplex *d_dinvA[MagmaMaxGPUs]; magmaDoubleComplex *d_x[MagmaMaxGPUs]; #define dinvA(d,j) &(d_dinvA[(d)][(j)*trsm_nb*trsm_n]) #define dx(d,j) &(d_x[(d)][(j)*nb*m]) /* * Allocate device memory for the inversed diagonal blocks, size=N*BLOCK_SIZE */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); if ( (MAGMA_SUCCESS != magma_zmalloc( &d_dinvA[d], 2*trsm_nb*trsm_n )) || (MAGMA_SUCCESS != magma_zmalloc( &d_x[d], 2*nb*(upper ? n : m) )) ) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } magma_setdevice(0); #endif *info = 0; if ( (! upper) && (! 
lapackf77_lsame(uplo_, "L")) ) { *info = -1; } else if (n < 0) { *info = -2; } else if (!upper && num_gpus*ldda < max(1,n)) { *info = -4; } else if (upper && ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* initialization */ for( d=0; d<num_gpus; d++ ) { /* local-n and local-ld */ if (upper) { n_local[d] = (n/(nb*num_gpus))*nb; if (d < (n/nb)%num_gpus) n_local[d] += nb; else if (d == (n/nb)%num_gpus) n_local[d] += n%nb; } else { n_local[d] = (m/(nb*num_gpus))*nb; if (d < (m/nb)%num_gpus) n_local[d] += nb; else if (d == (m/nb)%num_gpus) n_local[d] += m%nb; } } /* == initialize the trace */ trace_init( 1, num_gpus, 3, (CUstream_st**)stream ); if (upper) { /* ---------------------------------------------- */ /* Upper-triangular case */ /* > Compute the Cholesky factorization A = U'*U. */ /* ---------------------------------------------- */ for (j=0; j<m; j+=nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%num_gpus; buf = (j/nb)%num_gpus; // right now, we have num_gpu buffers, so id and buf are the same.. /* Set the local index where the current panel is */ j_local = j/(nb*num_gpus); jb = min(nb, (m-j)); /* Update the current diagonal block on stream1 */ magma_setdevice(id); if( j > 0 ) { magmablasSetKernelStream(stream[id][stream1]); trace_gpu_start( id, stream1, "syrk", "syrk" ); magma_zherk(MagmaUpper, MagmaConjTrans, jb, j, d_neg_one, dlA(id, 0, nb*j_local), ldda, d_one, dlA(id, j, nb*j_local), ldda); trace_gpu_end( id, stream1 ); } /* send the diagonal to cpu on stream1 */ trace_gpu_start( id, stream1, "comm", "D to CPU" ); magma_zgetmatrix_async( jb, jb, dlA(id, j, nb*j_local), ldda, Aup(j,j), lda, stream[id][stream1] ); trace_gpu_end( id, stream1 ); /* update off-diagonal blocks in the panel */ if( j > 0 ) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { j_local2 = j_local+1; if( d > id ) j_local2 --; nb0 = nb*j_local2; // number of local columns in the panel, while jb is panel-size (number of rows) if( n_local[d] > nb0 ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][stream2]); if( d == id ) { dlpanel = dlA(d,0,nb*j_local); ldpanel = ldda; // the GPU owns the row from start, and no need of synch. 
//magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu } else { dlpanel = dlP(d,nb,0,buf); ldpanel = lddp; magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu } trace_gpu_start( d, stream2, "gemm", "gemm" ); magma_zgemm(MagmaConjTrans, MagmaNoTrans, jb, n_local[d]-nb0, j, c_neg_one, dlpanel, ldpanel, dlA(d, 0, nb0), ldda, c_one, dlA(d, j, nb0), ldda); trace_gpu_end( d, stream2 ); magma_event_record( event[d][2], stream[d][stream2] ); } d = (d+1)%num_gpus; } } /* wait for panel and factorize it on cpu */ magma_setdevice(id); magma_queue_sync( stream[id][stream1] ); trace_cpu_start( 0, "getrf", "getrf" ); lapackf77_zpotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info); trace_cpu_end( 0 ); if (*info != 0) { *info = *info + j; break; } /* send the diagonal to gpus on stream1 */ if ( (j+jb) < n) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { if( d == id ) { dlpanel = dlA(d, j, nb*j_local); ldpanel = ldda; } else { dlpanel = dlP(d,0,0,buf); ldpanel = lddp; } magma_setdevice(d); trace_gpu_start( d, stream1, "comm", "comm" ); magma_zsetmatrix_async( jb, jb, Aup(j,j), lda, dlpanel, ldpanel, stream[d][stream1] ); trace_gpu_end( d, stream1 ); magma_event_record( event[d][1], stream[d][stream1] ); d = (d+1)%num_gpus; } } else { magma_setdevice(id); trace_gpu_start( id, stream1, "comm", "comm" ); magma_zsetmatrix_async( jb, jb, Aup(j,j), lda, dlA(id, j, nb*j_local), ldda, stream[id][stream1] ); trace_gpu_end( id, stream1 ); } /* panel-factorize the off-diagonal */ if ( (j+jb) < n) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { /* next column */ j_local2 = j_local+1; if( d > id ) j_local2--; if( d == id ) { dlpanel = dlA(d,j,nb*j_local); ldpanel = ldda; } else { dlpanel = dlP(d,0,0,buf); ldpanel = lddp; } nb2 = n_local[d] - j_local2*nb; magma_setdevice(d); if( j+jb < m && d == (j/nb+1)%num_gpus ) { /* owns the next column, look-ahead next block on stream1 */ nb0 = min(nb, nb2); magmablasSetKernelStream(stream[d][stream1]); magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for gemm update trace_gpu_start( d, stream1, "trsm", "trsm" ); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK) magmablas_zlaset( MagmaUpperLower, trsm_nb, trsm_n, dinvA(d,0),trsm_nb ); magmablas_zlaset( MagmaUpperLower, nb0,jb, dx(d,0),nb0 ); magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb0, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda, 1, dinvA(d,0), dx(d,0) ); #else magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb0, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda); #endif magma_event_record( event[d][4], stream[d][stream1] ); trace_gpu_end( d, stream1 ); } else if( nb2 > 0 ) { /* update all the blocks on stream2 */ magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for cholesky factor trace_gpu_start( d, stream2, "trsm", "trsm" ); magmablasSetKernelStream(stream[d][stream2]); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK) magmablas_zlaset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,0),trsm_nb ); magmablas_zlaset( MagmaUpperLower, nb2,jb, dx(d,0),nb2 ); magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda, 1, dinvA(d,0), dx(d,0) ); #else magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda); #endif trace_gpu_end( d, stream2 ); } d = (d+1)%num_gpus; 
} /* end of for */ /* ========================================================== */ if( j+jb < m ) { d = (j/nb+1)%num_gpus; /* next column */ j_local2 = j_local+1; if( d > id ) j_local2--; nb0 = min(nb, n_local[d]-nb*j_local2 ); /* even on 1 gpu, off-diagonals are copied to cpu (synchronize at the end). * * so we have the Cholesky factor, but only diagonal submatrix of the big panel, * * on cpu at the end. */ int d2, buf2; magma_setdevice(d); /* lookahead done */ magma_queue_wait_event( stream[d][stream3], event[d][4] ); trace_gpu_start( d, stream3, "comm", "row to CPU" ); magma_zgetmatrix_async( (j+jb), nb0, dlA(d, 0, nb*j_local2), ldda, Aup(0,j+jb), lda, stream[d][stream3] ); trace_gpu_end( d, stream3 ); magma_event_record( event[d][3], stream[d][stream3] ); /* needed on pluto */ //magma_queue_sync( stream[d][stream3] ); /* broadcast rows to gpus on stream2 */ buf2 = ((j+jb)/nb)%num_gpus; for( d2=0; d2<num_gpus; d2++ ) { if( d2 != d ) { magma_setdevice(d2); trace_gpu_start( d2, stream3, "comm", "row to GPUs" ); magma_queue_wait_event( stream[d2][stream3], event[d][3] ); // rows arrived at cpu on stream3 magma_zsetmatrix_async( j+jb, nb0, Aup(0,j+jb), lda, dlP(d2,nb,0,buf2), lddp, stream[d2][stream3] ); trace_gpu_end( d2, stream3 ); magma_event_record( event[d2][0], stream[d2][stream3] ); } } /* =========================== */ /* update the remaining blocks */ nb2 = n_local[d]-(nb*j_local2 + nb0); if( nb2 > 0 ) { if( d == id ) { dlpanel = dlA(d, j, nb*j_local); ldpanel = ldda; } else { dlpanel = dlP(d,0,0,buf); ldpanel = lddp; } magma_setdevice(d); magmablasSetKernelStream(stream[d][stream2]); trace_gpu_start( d, stream2, "trsm", "trsm" ); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK) int flag = 0; if (flag == 0) { magma_queue_wait_event( stream[d][stream2], event[d][4] ); // lookahead -> diagonal inversion } else { magmablas_zlaset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,flag),trsm_nb ); magma_queue_wait_event( stream[d][stream2], event[d][1] ); // panel received } magmablas_zlaset( MagmaUpperLower, nb2,jb, dx(d,1),nb2 ); magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2+nb0), ldda, flag, dinvA(d,flag), dx(d,1) ); #else magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for cholesky factor magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2+nb0), ldda); #endif trace_gpu_end( d, stream2 ); } } } /* end of ztrsm */ } /* end of for j=1, .., n */ } else { /* ---------------------------------------------- */ /* Lower-triangular case */ /* > Compute the Cholesky factorization A = L*L'. 
*/ /* ---------------------------------------------- */ for (j=0; j<n; j+=nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%num_gpus; buf = (j/nb)%num_gpus; /* Set the local index where the current panel is */ j_local = j/(nb*num_gpus); jb = min(nb, (n-j)); /* Update the current diagonal block on stream1 */ magma_setdevice(id); if( j > 0 ) { magmablasSetKernelStream(stream[id][stream1]); magma_zherk(MagmaLower, MagmaNoTrans, jb, j, d_neg_one, dlA(id, nb*j_local, 0), ldda, d_one, dlA(id, nb*j_local, j), ldda); } /* send the diagonal to cpu on stream1 */ magma_zgetmatrix_async( jb, jb, dlA(id, nb*j_local, j), ldda, Alo(j,j), lda, stream[id][stream1] ); /* update off-diagonal blocks of the panel */ if( j > 0 ) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { j_local2 = j_local+1; if( d > id ) j_local2 --; nb0 = nb*j_local2; if( nb0 < n_local[d] ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][stream2]); if( d == id ) { dlpanel = dlA(d, nb*j_local, 0); ldpanel = ldda; } else { dlpanel = dlPT(d,0,nb,buf); ldpanel = nb; magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu } magma_zgemm( MagmaNoTrans, MagmaConjTrans, n_local[d]-nb0, jb, j, c_neg_one, dlA(d, nb0, 0), ldda, dlpanel, ldpanel, c_one, dlA(d, nb0, j), ldda); magma_event_record( event[d][2], stream[d][stream2] ); } d = (d+1)%num_gpus; } } /* wait for the panel and factorized it on cpu */ magma_setdevice(id); magma_queue_sync( stream[id][stream1] ); lapackf77_zpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info); if (*info != 0) { *info = *info + j; break; } /* send the diagonal to gpus on stream1 */ if ( (j+jb) < m) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { if( d == id ) { dlpanel = dlA(d, nb*j_local, j); ldpanel = ldda; } else { dlpanel = dlPT(d, 0, 0, buf); ldpanel = nb; } magma_setdevice(d); magma_zsetmatrix_async( jb, jb, Alo(j,j), lda, dlpanel, ldpanel, stream[d][stream1] ); magma_event_record( event[d][1], stream[d][stream1] ); d = (d+1)%num_gpus; } } else { magma_setdevice(id); magma_zsetmatrix_async( jb, jb, Alo(j,j), lda, dlA(id, nb*j_local, j), ldda, stream[id][stream1] ); } /* panel factorize the off-diagonal */ if ( (j+jb) < m) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { /* next column */ j_local2 = j_local+1; if( d > id ) j_local2--; if( d == id ) { dlpanel = dlA(d, nb*j_local, j); ldpanel = ldda; } else { dlpanel = dlPT(d, 0, 0, buf); ldpanel = nb; } nb2 = n_local[d] - j_local2*nb; nb0 = min(nb, nb2); magma_setdevice(d); if( j+nb < n && d == (j/nb+1)%num_gpus ) { /* owns next column, look-ahead next block on stream1 */ if ( j > 0 ) magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for gemm update magmablasSetKernelStream(stream[d][stream1]); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK) magmablas_zlaset( MagmaUpperLower, trsm_nb, trsm_n, dinvA(d,0),trsm_nb ); magmablas_zlaset( MagmaUpperLower, nb0,jb, dx(d,0),nb0 ); magmablas_ztrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb0, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda, 1, dinvA(d,0), dx(d,0) ); #else magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb0, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda); #endif magma_event_record( event[d][4], stream[d][stream1] ); } else if( nb2 > 0 ) { /* other gpus updating all the blocks on stream2 */ /* update the entire column */ magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for the cholesky factor 
magmablasSetKernelStream(stream[d][stream2]); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK) magmablas_zlaset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,0),trsm_nb ); magmablas_zlaset( MagmaUpperLower, nb2,jb, dx(d,0),nb2 ); magmablas_ztrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda, 1, dinvA(d,0), dx(d,0) ); #else magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda); #endif } d = (d+1)%num_gpus; } /* end for d */ /* ========================================================== */ if( j+jb < n ) { d = (j/nb+1)%num_gpus; /* next column */ j_local2 = j_local+1; if( d > id ) j_local2--; nb0 = min(nb, n_local[d]-nb*j_local2 ); /* even on 1 gpu, we copy off-diagonal to cpu (but don't synchronize). */ /* so we have the Cholesky factor on cpu at the end. */ int d2, buf2; //#define ZPOTRF_DEVICE_TO_DEVICE #ifdef ZPOTRF_DEVICE_TO_DEVICE // lookahead done /* broadcast the rows to gpus */ buf2 = ((j+jb)/nb)%num_gpus; for( d2=0; d2<num_gpus; d2++ ) { magma_setdevice(d2); magma_queue_wait_event( stream[d2][stream3], event[d][4] ); if( d2 != d ) { magma_zcopymatrix_async( nb0, j+jb, dlPT(d2,0,nb,buf2), nb, // first nbxnb reserved for diagonal block dlA(d, nb*j_local2, 0), ldda, stream[d2][stream3] ); magma_event_record( event[d2][0], stream[d2][stream3] ); } else { magma_zgetmatrix_async( nb0, j+jb, dlA(d, nb*j_local2, 0), ldda, Alo(j+jb,0), lda, stream[d][stream3] ); } } #else // lookahead done magma_setdevice(d); magma_queue_wait_event( stream[d][stream3], event[d][4] ); magma_zgetmatrix_async( nb0, j+jb, dlA(d, nb*j_local2, 0), ldda, Alo(j+jb,0), lda, stream[d][stream3] ); magma_event_record( event[d][3], stream[d][stream3] ); /* syn on rows on CPU, seem to be needed on Pluto */ //magma_queue_sync( stream[d][stream3] ); /* broadcast the rows to gpus */ buf2 = ((j+jb)/nb)%num_gpus; for( d2=0; d2<num_gpus; d2++ ) { if( d2 != d ) { magma_setdevice(d2); magma_queue_wait_event( stream[d2][stream3], event[d][3] ); // getmatrix done magma_zsetmatrix_async( nb0, j+jb, Alo(j+jb,0), lda, dlPT(d2,0,nb,buf2), nb, // first nbxnb reserved for diagonal block stream[d2][stream3] ); magma_event_record( event[d2][0], stream[d2][stream3] ); } } #endif /* =================================== */ /* updates remaining blocks on stream2 */ nb2 = n_local[d] - (j_local2*nb + nb0); if( nb2 > 0 ) { if( d == id ) { dlpanel = dlA(d, nb*j_local, j); ldpanel = ldda; } else { dlpanel = dlPT(d,0,0,buf); ldpanel = nb; } magma_setdevice(d); magmablasSetKernelStream(stream[d][stream2]); /* update the remaining blocks in the column */ #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK) int flag = 0; if (flag == 0) { magma_queue_wait_event( stream[d][stream2], event[d][4] ); // lookahead -> diagonal inversion } else { magmablas_zlaset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,flag),trsm_nb ); magma_queue_wait_event( stream[d][stream2], event[d][1] ); // panel received } magmablas_zlaset( MagmaUpperLower, nb2,jb, dx(d,1),nb2 ); magmablas_ztrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2+nb0, j), ldda, flag, dinvA(d,flag), dx(d,1) ); #else magma_queue_wait_event( stream[d][stream2], event[d][1] ); // panel received magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2+nb0, j), ldda); #endif } } } } } /* end of 
else not upper */ /* == finalize the trace == */ trace_finalize( "zpotrf.svg","trace.css" ); for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); for( j=0; j<3; j++ ) { magma_queue_sync( stream[d][j] ); } #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK) magma_free( d_dinvA[d] ); magma_free( d_x[d] ); #endif magmablasSetKernelStream(NULL); } magma_setdevice(0); return *info; } /* magma_zpotrf_mgpu */
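
/*
 * Editor's sketch (not part of MAGMA): the multi-GPU routine above lays the
 * panels out in a 1-D block-cyclic fashion.  Panel j/nb lives on GPU
 * (j/nb) % num_gpus, and within that GPU it is the (j/(nb*num_gpus))-th local
 * block -- the same arithmetic used for id, j_local, and n_local[] above.
 * The helper below merely restates that mapping; its name is hypothetical.
 */
static void example_block_cyclic_owner(
    int j, int nb, int num_gpus,
    int *owner, int *j_local )
{
    *owner   = (j/nb) % num_gpus;   /* GPU that holds the panel starting at column j */
    *j_local = j / (nb*num_gpus);   /* index of that panel among the owner's local blocks */
}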
int main( int argc, char** argv )
{
    TESTING_INIT();

    real_Double_t   gflops, t1, t2;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magma_int_t ione = 1;
    magma_trans_t trans[] = { MagmaNoTrans, MagmaConjTrans, MagmaTrans };
    magma_uplo_t  uplo [] = { MagmaLower, MagmaUpper };
    magma_diag_t  diag [] = { MagmaUnit, MagmaNonUnit };
    magma_side_t  side [] = { MagmaLeft, MagmaRight };

    magmaDoubleComplex  *A,  *B,  *C,  *C2, *LU;
    magmaDoubleComplex *dA, *dB, *dC1, *dC2;
    magmaDoubleComplex alpha = MAGMA_Z_MAKE( 0.5, 0.1 );
    magmaDoubleComplex beta  = MAGMA_Z_MAKE( 0.7, 0.2 );
    double dalpha = 0.6;
    double dbeta  = 0.8;
    double work[1], error, total_error;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t m, n, k, size, maxn, ld, info;
    magma_int_t *piv;
    magma_int_t err;

    magma_opts opts;
    parse_opts( argc, argv, &opts );

    printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" );

    total_error = 0.;
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        m = opts.msize[itest];
        n = opts.nsize[itest];
        k = opts.ksize[itest];
        printf("=========================================================================\n");
        printf( "m=%d, n=%d, k=%d\n", (int) m, (int) n, (int) k );

        // allocate matrices
        // over-allocate so they can be any combination of {m,n,k} x {m,n,k}.
        maxn = max( max( m, n ), k );
        ld = max( 1, maxn );
        size = ld*maxn;
        err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) );  assert( err == 0 );
        err = magma_zmalloc_pinned( &A,  size );  assert( err == 0 );
        err = magma_zmalloc_pinned( &B,  size );  assert( err == 0 );
        err = magma_zmalloc_pinned( &C,  size );  assert( err == 0 );
        err = magma_zmalloc_pinned( &C2, size );  assert( err == 0 );
        err = magma_zmalloc_pinned( &LU, size );  assert( err == 0 );
        err = magma_zmalloc( &dA,  size );  assert( err == 0 );
        err = magma_zmalloc( &dB,  size );  assert( err == 0 );
        err = magma_zmalloc( &dC1, size );  assert( err == 0 );
        err = magma_zmalloc( &dC2, size );  assert( err == 0 );

        // initialize matrices
        size = maxn*maxn;
        lapackf77_zlarnv( &ione, ISEED, &size, A );
        lapackf77_zlarnv( &ione, ISEED, &size, B );
        lapackf77_zlarnv( &ione, ISEED, &size, C );

        printf( "========== Level 1 BLAS ==========\n" );

        // ----- test ZSWAP
        // swap columns 2 and 3 of dA, then copy to C2 and compare with A
        if ( n >= 3 ) {
            magma_zsetmatrix( m, n, A, ld, dA, ld );
            magma_zsetmatrix( m, n, A, ld, dB, ld );
            magma_zswap( m, dA(0,1), 1, dA(0,2), 1 );
            magma_zswap( m, dB(0,1), 1, dB(0,2), 1 );

            // check results, storing diff between magma and cuda calls in C2
            cublasZaxpy( handle, ld*n, &c_neg_one, dA, 1, dB, 1 );
            magma_zgetmatrix( m, n, dB, ld, C2, ld );
            error = lapackf77_zlange( "F", &m, &n, C2, &ld, work );
            total_error += error;
            printf( "zswap diff %.2g\n", error );
        } else {
            printf( "zswap skipped for n < 3\n" );
        }

        // ----- test IZAMAX
        // get argmax of column of A
        magma_zsetmatrix( m, k, A, ld, dA, ld );
        error = 0;
        for( int j = 0; j < k; ++j ) {
            magma_int_t i1 = magma_izamax( m, dA(0,j), 1 );
            int i2;  // NOT magma_int_t, for cublas
            cublasIzamax( handle, m, dA(0,j), 1, &i2 );
            // todo need sync here?
            assert( i1 == i2 );
            error += abs( i1 - i2 );
        }
        total_error += error;
        gflops = (double)m * k / 1e9;
        printf( "izamax diff %.2g\n", error );
        printf( "\n" );

        printf( "========== Level 2 BLAS ==========\n" );

        // ----- test ZGEMV
        // c = alpha*A*b + beta*c,  with A m*n; b,c m or n-vectors
        // try no-trans/trans
        for( int ia = 0; ia < 3; ++ia ) {
            magma_zsetmatrix( m, n, A, ld, dA, ld );
            magma_zsetvector( maxn, B, 1, dB,  1 );
            magma_zsetvector( maxn, C, 1, dC1, 1 );
            magma_zsetvector( maxn, C, 1, dC2, 1 );

            t1 = magma_sync_wtime( 0 );
            magma_zgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 );
            t1 = magma_sync_wtime( 0 ) - t1;

            t2 = magma_sync_wtime( 0 );
            cublasZgemv( handle, cublas_trans_const(trans[ia]),
                         m, n, &alpha, dA, ld, dB, 1, &beta, dC2, 1 );
            t2 = magma_sync_wtime( 0 ) - t2;

            // check results, storing diff between magma and cuda call in C2
            size = (trans[ia] == MagmaNoTrans ? m : n);
            cublasZaxpy( handle, size, &c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetvector( size, dC2, 1, C2, 1 );
            error = lapackf77_zlange( "F", &size, &ione, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZGEMV( m, n ) / 1e9;
            printf( "zgemv( %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n",
                    lapacke_trans_const(trans[ia]), error, gflops/t1, gflops/t2 );
        }
        printf( "\n" );

        // ----- test ZHEMV
        // c = alpha*A*b + beta*c,  with A m*m Hermitian; b,c m-vectors
        // try upper/lower
        for( int iu = 0; iu < 2; ++iu ) {
            magma_zsetmatrix( m, m, A, ld, dA, ld );
            magma_zsetvector( m, B, 1, dB,  1 );
            magma_zsetvector( m, C, 1, dC1, 1 );
            magma_zsetvector( m, C, 1, dC2, 1 );

            t1 = magma_sync_wtime( 0 );
            magma_zhemv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 );
            t1 = magma_sync_wtime( 0 ) - t1;

            t2 = magma_sync_wtime( 0 );
            cublasZhemv( handle, cublas_uplo_const(uplo[iu]),
                         m, &alpha, dA, ld, dB, 1, &beta, dC2, 1 );
            t2 = magma_sync_wtime( 0 ) - t2;

            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( handle, m, &c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetvector( m, dC2, 1, C2, 1 );
            error = lapackf77_zlange( "F", &m, &ione, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZHEMV( m ) / 1e9;
            printf( "zhemv( %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n",
                    lapacke_uplo_const(uplo[iu]), error, gflops/t1, gflops/t2 );
        }
        printf( "\n" );

        // ----- test ZTRSV
        // solve A*c = c,  with A m*m triangular; c m-vector
        // try upper/lower, no-trans/trans, unit/non-unit diag
        // Factor A into LU to get well-conditioned triangles, else solve yields garbage.
        // Still can give garbage if solves aren't consistent with LU factors,
        // e.g., using unit diag for U, so copy lower triangle to upper triangle.
        // Also used for trsm later.
        lapackf77_zlacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld );
        lapackf77_zgetrf( &maxn, &maxn, LU, &ld, piv, &info );
        for( int j = 0; j < maxn; ++j ) {
            for( int i = 0; i < j; ++i ) {
                *LU(i,j) = *LU(j,i);
            }
        }
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
        for( int id = 0; id < 2; ++id ) {
            magma_zsetmatrix( m, m, LU, ld, dA, ld );
            magma_zsetvector( m, C, 1, dC1, 1 );
            magma_zsetvector( m, C, 1, dC2, 1 );

            t1 = magma_sync_wtime( 0 );
            magma_ztrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 );
            t1 = magma_sync_wtime( 0 ) - t1;

            t2 = magma_sync_wtime( 0 );
            cublasZtrsv( handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]),
                         cublas_diag_const(diag[id]), m, dA, ld, dC2, 1 );
            t2 = magma_sync_wtime( 0 ) - t2;

            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( handle, m, &c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetvector( m, dC2, 1, C2, 1 );
            error = lapackf77_zlange( "F", &m, &ione, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZTRSM( MagmaLeft, m, 1 ) / 1e9;
            printf( "ztrsv( %c, %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n",
                    lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]),
                    lapacke_diag_const(diag[id]), error, gflops/t1, gflops/t2 );
        }}}
        printf( "\n" );

        printf( "========== Level 3 BLAS ==========\n" );

        // ----- test ZGEMM
        // C = alpha*A*B + beta*C,  with A m*k or k*m; B k*n or n*k; C m*n
        // try combinations of no-trans/trans
        for( int ia = 0; ia < 3; ++ia ) {
        for( int ib = 0; ib < 3; ++ib ) {
            bool nta = (trans[ia] == MagmaNoTrans);
            bool ntb = (trans[ib] == MagmaNoTrans);
            magma_zsetmatrix( (nta ? m : k), (nta ? m : k), A, ld, dA, ld );
            magma_zsetmatrix( (ntb ? k : n), (ntb ? n : k), B, ld, dB, ld );
            magma_zsetmatrix( m, n, C, ld, dC1, ld );
            magma_zsetmatrix( m, n, C, ld, dC2, ld );

            t1 = magma_sync_wtime( 0 );
            magma_zgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;

            t2 = magma_sync_wtime( 0 );
            cublasZgemm( handle, cublas_trans_const(trans[ia]), cublas_trans_const(trans[ib]),
                         m, n, k, &alpha, dA, ld, dB, ld, &beta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;

            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_zlange( "F", &m, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZGEMM( m, n, k ) / 1e9;
            printf( "zgemm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n",
                    lapacke_trans_const(trans[ia]), lapacke_trans_const(trans[ib]),
                    error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );

        // ----- test ZHEMM
        // C = alpha*A*B + beta*C  (left)  with A m*m Hermitian; B,C m*n; or
        // C = alpha*B*A + beta*C  (right) with A n*n Hermitian; B,C m*n
        // try left/right, upper/lower
        for( int is = 0; is < 2; ++is ) {
        for( int iu = 0; iu < 2; ++iu ) {
            magma_zsetmatrix( m, m, A, ld, dA,  ld );
            magma_zsetmatrix( m, n, B, ld, dB,  ld );
            magma_zsetmatrix( m, n, C, ld, dC1, ld );
            magma_zsetmatrix( m, n, C, ld, dC2, ld );

            t1 = magma_sync_wtime( 0 );
            magma_zhemm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;

            t2 = magma_sync_wtime( 0 );
            cublasZhemm( handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]),
                         m, n, &alpha, dA, ld, dB, ld, &beta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;

            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_zlange( "F", &m, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZHEMM( side[is], m, n ) / 1e9;
            printf( "zhemm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n",
                    lapacke_side_const(side[is]), lapacke_uplo_const(uplo[iu]),
                    error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );

        // ----- test ZHERK
        // C = alpha*A*A^H + beta*C  (no-trans) with A n*k and C n*n Hermitian; or
        // C = alpha*A^H*A + beta*C  (trans)    with A k*n and C n*n Hermitian
        // try upper/lower, no-trans/trans
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
            magma_zsetmatrix( n, k, A, ld, dA,  ld );
            magma_zsetmatrix( n, n, C, ld, dC1, ld );
            magma_zsetmatrix( n, n, C, ld, dC2, ld );

            t1 = magma_sync_wtime( 0 );
            magma_zherk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;

            t2 = magma_sync_wtime( 0 );
            cublasZherk( handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]),
                         n, k, &dalpha, dA, ld, &dbeta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;

            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetmatrix( n, n, dC2, ld, C2, ld );
            error = lapackf77_zlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZHERK( k, n ) / 1e9;
            printf( "zherk( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n",
                    lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]),
                    error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );

        // ----- test ZHER2K
        // C = alpha*A*B^H + conj(alpha)*B*A^H + beta*C  (no-trans) with A,B n*k; C n*n Hermitian; or
        // C = alpha*A^H*B + conj(alpha)*B^H*A + beta*C  (trans)    with A,B k*n; C n*n Hermitian
        // try upper/lower, no-trans/trans
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
            bool nt = (trans[it] == MagmaNoTrans);
            magma_zsetmatrix( (nt ? n : k), (nt ? n : k), A, ld, dA, ld );
            magma_zsetmatrix( n, n, C, ld, dC1, ld );
            magma_zsetmatrix( n, n, C, ld, dC2, ld );

            t1 = magma_sync_wtime( 0 );
            magma_zher2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;

            t2 = magma_sync_wtime( 0 );
            cublasZher2k( handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]),
                          n, k, &alpha, dA, ld, dB, ld, &dbeta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;

            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetmatrix( n, n, dC2, ld, C2, ld );
            error = lapackf77_zlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZHER2K( k, n ) / 1e9;
            printf( "zher2k( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n",
                    lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]),
                    error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );

        // ----- test ZTRMM
        // C = alpha*A*C  (left)  with A m*m triangular; C m*n; or
        // C = alpha*C*A  (right) with A n*n triangular; C m*n
        // try left/right, upper/lower, no-trans/trans, unit/non-unit
        for( int is = 0; is < 2; ++is ) {
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
        for( int id = 0; id < 2; ++id ) {
            bool left = (side[is] == MagmaLeft);
            magma_zsetmatrix( (left ? m : n), (left ? m : n), A, ld, dA, ld );
            magma_zsetmatrix( m, n, C, ld, dC1, ld );
            magma_zsetmatrix( m, n, C, ld, dC2, ld );

            t1 = magma_sync_wtime( 0 );
            magma_ztrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;

            // note cublas does trmm out-of-place (i.e., adds output matrix C),
            // but allows C=B to do in-place.
            t2 = magma_sync_wtime( 0 );
            cublasZtrmm( handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]),
                         cublas_trans_const(trans[it]), cublas_diag_const(diag[id]),
                         m, n, &alpha, dA, ld, dC2, ld, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;

            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_zlange( "F", &m, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZTRMM( side[is], m, n ) / 1e9;
            printf( "ztrmm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n",
                    lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]),
                    error, gflops/t1, gflops/t2 );
        }}}}
        printf( "\n" );

        // ----- test ZTRSM
        // solve A*X = alpha*B  (left)  with A m*m triangular; B m*n; or
        // solve X*A = alpha*B  (right) with A n*n triangular; B m*n
        // try left/right, upper/lower, no-trans/trans, unit/non-unit
        for( int is = 0; is < 2; ++is ) {
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
        for( int id = 0; id < 2; ++id ) {
            bool left = (side[is] == MagmaLeft);
            magma_zsetmatrix( (left ? m : n), (left ? m : n), LU, ld, dA, ld );
            magma_zsetmatrix( m, n, C, ld, dC1, ld );
            magma_zsetmatrix( m, n, C, ld, dC2, ld );

            t1 = magma_sync_wtime( 0 );
            magma_ztrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;

            t2 = magma_sync_wtime( 0 );
            cublasZtrsm( handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]),
                         cublas_trans_const(trans[it]), cublas_diag_const(diag[id]),
                         m, n, &alpha, dA, ld, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;

            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_zlange( "F", &m, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZTRSM( side[is], m, n ) / 1e9;
            printf( "ztrsm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n",
                    lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]),
                    error, gflops/t1, gflops/t2 );
        }}}}
        printf( "\n" );

        // cleanup
        magma_free_cpu( piv );
        magma_free_pinned( A  );
        magma_free_pinned( B  );
        magma_free_pinned( C  );
        magma_free_pinned( C2 );
        magma_free_pinned( LU );
        magma_free( dA  );
        magma_free( dB  );
        magma_free( dC1 );
        magma_free( dC2 );
        fflush( stdout );
    }

    if ( total_error != 0. ) {
        printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n", total_error );
    } else {
        printf( "all tests passed\n" );
    }

    TESTING_FINALIZE();

    int status = (total_error != 0.);
    return status;
}
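
/*
 * Editor's sketch (not part of MAGMA): every test above uses the same check --
 * subtract the two results on the GPU with axpy, copy the difference to the
 * host, and take its Frobenius norm with zlange; an exact-zero norm means the
 * MAGMA wrapper and the cuBLAS call produced bit-identical output.  The helper
 * name and its packaging as a function are hypothetical.
 */
static double example_diff_norm(
    cublasHandle_t handle,
    magma_int_t m, magma_int_t n, magma_int_t ld,
    magmaDoubleComplex *dC1, magmaDoubleComplex *dC2,
    magmaDoubleComplex *C2, double *work )
{
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    // dC2 -= dC1 on the device, then bring the difference back and measure it
    cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 );
    magma_zgetmatrix( m, n, dC2, ld, C2, ld );
    return lapackf77_zlange( "F", &m, &n, C2, &ld, work );
}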
/**
    Purpose
    -------
    ZPOTRF computes the Cholesky factorization of a complex Hermitian
    positive definite matrix dA.

    The factorization has the form
        dA = U**H * U,   if UPLO = MagmaUpper, or
        dA = L  * L**H,  if UPLO = MagmaLower,
    where U is an upper triangular matrix and L is lower triangular.

    This is the block version of the algorithm, calling Level 3 BLAS.
    This version assumes the computation runs through the NULL stream
    and therefore does not overlap computation with communication.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of dA is stored;
      -     = MagmaLower:  Lower triangle of dA is stored.

    @param[in]
    n       INTEGER
            The order of the matrix dA.  N >= 0.

    @param[in,out]
    dA      COMPLEX_16 array on the GPU, dimension (LDDA,N)
            On entry, the Hermitian matrix dA.  If UPLO = MagmaUpper, the leading
            N-by-N upper triangular part of dA contains the upper
            triangular part of the matrix dA, and the strictly lower
            triangular part of dA is not referenced.  If UPLO = MagmaLower, the
            leading N-by-N lower triangular part of dA contains the lower
            triangular part of the matrix dA, and the strictly upper
            triangular part of dA is not referenced.
    \n
            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization dA = U**H * U or dA = L * L**H.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalescent memory accesses LDDA must be
            divisible by 16.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.

    @ingroup magma_zposv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zpotrf_gpu(magma_uplo_t uplo, magma_int_t n,
                 magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info)
{
    #define dA(i, j) (dA + (j)*ldda + (i))

    magma_int_t j, jb, nb;
    const char* uplo_ = lapack_uplo_const( uplo );
    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex *work;
    double d_one     =  1.0;
    double d_neg_one = -1.0;
    int upper = (uplo == MagmaUpper);

    *info = 0;
    if (! upper && uplo != MagmaLower) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,n)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    nb = magma_get_zpotrf_nb(n);

    if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, nb*nb )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }

    magma_queue_t stream[2];
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );

    if ((nb <= 1) || (nb >= n)) {
        /* Use unblocked code. */
        magma_zgetmatrix( n, n, dA, ldda, work, n );
        lapackf77_zpotrf(uplo_, &n, work, &n, info);
        magma_zsetmatrix( n, n, work, n, dA, ldda );
    }
    else {
        /* Use blocked code. */
        if (upper) {
            /* Compute the Cholesky factorization A = U'*U. */
            for (j=0; j < n; j += nb) {
                /* Update and factorize the current diagonal block and test
                   for non-positive-definiteness. */
                jb = min(nb, (n-j));
                magma_zherk(MagmaUpper, MagmaConjTrans, jb, j,
                            d_neg_one, dA(0, j), ldda,
                            d_one,     dA(j, j), ldda);
                magma_zgetmatrix_async( jb, jb,
                                        dA(j, j), ldda,
                                        work,     jb, stream[1] );

                if ( (j+jb) < n) {
                    /* Compute the current block row. */
                    magma_zgemm(MagmaConjTrans, MagmaNoTrans,
                                jb, (n-j-jb), j,
                                c_neg_one, dA(0, j   ), ldda,
                                           dA(0, j+jb), ldda,
                                c_one,     dA(j, j+jb), ldda);
                }

                magma_queue_sync( stream[1] );
                lapackf77_zpotrf(MagmaUpperStr, &jb, work, &jb, info);
                magma_zsetmatrix_async( jb, jb,
                                        work,     jb,
                                        dA(j, j), ldda, stream[0] );
                if (*info != 0) {
                    *info = *info + j;
                    break;
                }

                if ( (j+jb) < n) {
                    magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                 jb, (n-j-jb),
                                 c_one, dA(j, j   ), ldda,
                                        dA(j, j+jb), ldda);
                }
            }
        }
        else {
            //=========================================================
            // Compute the Cholesky factorization A = L*L'.
            for (j=0; j < n; j += nb) {
                // Update and factorize the current diagonal block and test
                // for non-positive-definiteness.
                jb = min(nb, (n-j));
                magma_zherk(MagmaLower, MagmaNoTrans, jb, j,
                            d_neg_one, dA(j, 0), ldda,
                            d_one,     dA(j, j), ldda);
                magma_zgetmatrix_async( jb, jb,
                                        dA(j, j), ldda,
                                        work,     jb, stream[1] );

                if ( (j+jb) < n) {
                    magma_zgemm( MagmaNoTrans, MagmaConjTrans,
                                 (n-j-jb), jb, j,
                                 c_neg_one, dA(j+jb, 0), ldda,
                                            dA(j,    0), ldda,
                                 c_one,     dA(j+jb, j), ldda);
                }

                magma_queue_sync( stream[1] );
                lapackf77_zpotrf(MagmaLowerStr, &jb, work, &jb, info);
                magma_zsetmatrix_async( jb, jb,
                                        work,     jb,
                                        dA(j, j), ldda, stream[0] );
                if (*info != 0) {
                    *info = *info + j;
                    break;
                }

                if ( (j+jb) < n) {
                    magma_ztrsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                (n-j-jb), jb,
                                c_one, dA(j,    j), ldda,
                                       dA(j+jb, j), ldda);
                }
            }
        }
    }

    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[1] );

    magma_free_pinned( work );

    return *info;
} /* magma_zpotrf_gpu */
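
/*
 * Editor's sketch (not part of MAGMA): what lapackf77_zpotrf computes for each
 * nb-by-nb diagonal block above, written out as textbook unblocked Cholesky.
 * For clarity this is the real double-precision analogue of the complex
 * Hermitian case; the function name is hypothetical.  Returns 0 on success,
 * or j+1 if the leading minor of order j+1 is not positive definite
 * (mirroring INFO > 0).
 */
#include <math.h>
static int example_unblocked_cholesky_lower( int n, double *A, int lda )
{
    for (int j = 0; j < n; ++j) {
        double ajj = A[j + j*lda];
        for (int k = 0; k < j; ++k)
            ajj -= A[j + k*lda] * A[j + k*lda];   /* a_jj - sum_k l_jk^2 */
        if (ajj <= 0.0)
            return j+1;                           /* not positive definite */
        ajj = sqrt(ajj);
        A[j + j*lda] = ajj;
        for (int i = j+1; i < n; ++i) {           /* column below the diagonal */
            double aij = A[i + j*lda];
            for (int k = 0; k < j; ++k)
                aij -= A[i + k*lda] * A[j + k*lda];
            A[i + j*lda] = aij / ajj;             /* l_ij = (a_ij - sum) / l_jj */
        }
    }
    return 0;
}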
extern "C" magma_int_t magma_zpotrf2_mgpu(int num_gpus, char uplo, magma_int_t m, magma_int_t n, magma_int_t off_i, magma_int_t off_j, magma_int_t nb, magmaDoubleComplex **d_lA, magma_int_t ldda, magmaDoubleComplex **d_lP, magma_int_t lddp, magmaDoubleComplex *a, magma_int_t lda, magma_int_t h, magma_queue_t stream[][3], magma_event_t event[][5], magma_int_t *info ) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= ZPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = 'U', or dA = L * L**H, if UPLO = 'L', where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of dA is stored; = 'L': Lower triangle of dA is stored. N (input) INTEGER The order of the matrix dA. N >= 0. dA (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the Hermitian matrix dA. If UPLO = 'U', the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be dividable by 16. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. ===================================================================== */ magma_int_t j, jb, nb0, nb2, dd, d, id, j_local, j_local2, buf; char uplo_[2] = {uplo, 0}; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; double d_one = 1.0; double d_neg_one = -1.0; int upper = lapackf77_lsame(uplo_, "U"); magmaDoubleComplex *dlpanel; //magma_event_t event0[MagmaMaxGPUs], // syrk // event1[MagmaMaxGPUs], // send off-diagonal // event2[MagmaMaxGPUs], // send diagonal // event3[MagmaMaxGPUs]; // trsm magma_int_t n_local[MagmaMaxGPUs], ldpanel; int stream0 = 0, stream1 = 1; #ifdef ZTRSM_WORK magmaDoubleComplex *d_dinvA[MagmaMaxGPUs][2], *d_x[MagmaMaxGPUs][2]; /* used by ztrsm_work */ #endif *info = 0; if ( (! upper) && (! 
lapackf77_lsame(uplo_, "L")) ) { *info = -1; } else if (n < 0) { *info = -2; } else if (!upper && num_gpus*ldda < max(1,n)) { *info = -4; } else if (upper && ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } for( d=0; d<num_gpus; d++ ) { /* local-n and local-ld */ if (upper) { n_local[d] = ((n/nb)/num_gpus)*nb; if (d < (n/nb)%num_gpus) n_local[d] += nb; else if (d == (n/nb)%num_gpus) n_local[d] += n%nb; } else { n_local[d] = ((m/nb)/num_gpus)*nb; if (d < (m/nb)%num_gpus) n_local[d] += nb; else if (d == (m/nb)%num_gpus) n_local[d] += m%nb; } //magma_setdevice(d); //magma_event_create( &event0[d] ); //magma_event_create( &event1[d] ); //magma_event_create( &event2[d] ); //magma_event_create( &event3[d] ); } magma_setdevice(0); /* == initialize the trace */ trace_init( 1, num_gpus, 3, stream ); /* Use blocked code. */ if (upper) { /* ---------------------------------------------- */ /* Upper-triangular case */ /* > Compute the Cholesky factorization A = U'*U. */ /* ---------------------------------------------- */ #if defined(PRECISION_d) && defined(ZTRSM_WORK) /* invert the diagonals * Allocate device memory for the inversed diagonal blocks, size=m*NB */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); for( j=0; j<2; j++ ) { magma_zmalloc( &d_dinvA[d][j], nb*nb ); magma_zmalloc( &d_x[d][j], n*nb ); cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(magmaDoubleComplex)); cudaMemset(d_x[d][j], 0, n*nb*sizeof(magmaDoubleComplex)); } } magma_setdevice(0); #endif for (j=0; j<m; j+=nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%num_gpus; buf = (j/nb)%num_gpus; /* Set the local index where the current panel is */ j_local = j/(nb*num_gpus); jb = min(nb, (m-j)); if( j > 0 ) { /* needed on pluto... */ magma_setdevice(id); magma_queue_sync( stream[id][stream0] ); // wait for the column on CPU /* broadcast off-diagonal column to all gpus */ d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { if( d != id ) { magma_setdevice(d); /* wait for it on CPU */ magma_queue_wait_event( stream[d][stream0], event[id][1] ); /* send it to GPU */ trace_gpu_start( d, stream0, "comm", "rows to GPUs" ); magma_zsetmatrix_async( j, jb, Aup(0,j), lda, dlP(d,jb,0,buf), lddp, stream[d][stream0] ); trace_gpu_end( d, stream0 ); magma_event_record( event[d][1], stream[d][stream0] ); } d = (d+1)%num_gpus; } } /* Update the current diagonal block */ magma_setdevice(id); if( j > 0 ) { magmablasSetKernelStream(stream[id][stream1]); trace_gpu_start( id, stream1, "syrk", "syrk" ); magma_zherk(MagmaUpper, MagmaConjTrans, jb, j, d_neg_one, dlA(id, 0, nb*j_local), ldda, d_one, dlA(id, j, nb*j_local), ldda); trace_gpu_end( id, stream1 ); magma_event_record( event[id][0], stream[id][stream1] ); } /* send the diagonal to cpu */ magma_queue_wait_event( stream[id][stream0], event[id][0] ); // wait for syrk trace_gpu_start( id, stream0, "comm", "D to CPU" ); magma_zgetmatrix_async( jb, jb, dlA(id, j, nb*j_local), ldda, Aup(j,j), lda, stream[id][stream0] ); trace_gpu_end( id, stream0 ); if ( j > 0 ) { /* Compute the local block column of the panel. 
*/ d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { j_local2 = j_local+1; if( d > id ) j_local2 --; nb0 = nb*j_local2; if( n_local[d] > nb0 ) { /* wait for the off-diagonal */ if( d != id ) { //magma_queue_sync( stream[id][3] ); dlpanel = dlP(d, jb, 0, buf); ldpanel = lddp; /* wait for the offdiagonal column */ magma_queue_wait_event( stream[d][stream1], event[d][1] ); } else { dlpanel = dlA(d, 0, nb*j_local); ldpanel = ldda; } /* update the panel */ magma_setdevice(d); magmablasSetKernelStream(stream[d][stream1]); trace_gpu_start( d, stream1, "gemm", "gemm" ); magma_zgemm(MagmaConjTrans, MagmaNoTrans, jb, n_local[d]-nb0, j, c_neg_one, dlpanel, ldpanel, dlA(d, 0, nb0), ldda, c_one, dlA(d, j, nb0), ldda); trace_gpu_end( d, stream1 ); } d = (d+1)%num_gpus; } } /* factor the diagonal */ magma_setdevice(id); magma_queue_sync( stream[id][stream0] ); // wait for the diagonal trace_cpu_start( 0, "getrf", "getrf" ); lapackf77_zpotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info); trace_cpu_end( 0 ); if (*info != 0) { *info = *info + j; break; } /* send the diagonal to gpus */ if ( (j+jb) < n) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { magma_setdevice(d); if( d == id ) { dlpanel = dlA(d, j, nb*j_local); ldpanel = ldda; } else { dlpanel = dlP(d, 0, 0, buf); ldpanel = lddp; } trace_gpu_start( d, stream0, "comm", "D to GPUs" ); magma_zsetmatrix_async( jb, jb, Aup(j,j), lda, dlpanel, ldpanel, stream[d][stream0] ); trace_gpu_end( d, stream0 ); magma_event_record( event[d][2], stream[d][stream0] ); d = (d+1)%num_gpus; } } else { magma_setdevice(id); trace_gpu_start( id, stream0, "comm", "D to GPUs" ); magma_zsetmatrix_async( jb, jb, Aup(j,j), lda, dlA(id, j, nb*j_local), ldda, stream[id][stream0] ); trace_gpu_end( id, stream0 ); } /* panel-factorize the off-diagonal */ if ( (j+jb) < n) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { /* next column */ j_local2 = j_local+1; if( d > id ) j_local2--; if( d == id ) { dlpanel = dlA(d, j, nb*j_local); ldpanel = ldda; } else { dlpanel = dlP(d, 0, 0, buf); ldpanel = lddp; } nb2 = n_local[d]-nb*j_local2; nb0 = min(nb, nb2 ); magma_setdevice(d); magmablasSetKernelStream(stream[d][stream1]); magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for the diagonal if( j+jb < m && d == (j/nb+1)%num_gpus ) { /* owns the next column, look-ahead the column */ trace_gpu_start( d, stream1, "trsm", "trsm" ); #if defined(PRECISION_d) && defined(ZTRSM_WORK) magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb0, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda, d_dinvA[d][0], d_x[d][0] ); /*nb2 = n_local[d] - j_local2*nb; magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda, d_dinvA[d], d_x[d] );*/ #else /*nb2 = n_local[d] - j_local2*nb; magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldda, dlA(d, j, nb*j_local2), ldda); */ magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb0, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda); #endif trace_gpu_end( d, stream1 ); magma_event_record( event[d][3], stream[d][stream1] ); /* send the column to cpu */ if( j+jb < m ) { trace_gpu_start( d, stream0, "comm", "rows to CPU" ); magma_queue_wait_event( stream[d][stream0], event[d][3] ); // wait for lookahead magma_zgetmatrix_async( (j+jb), nb0, dlA(d, 0, nb*j_local2), ldda, Aup(0,j+jb), lda, stream[d][stream0] ); trace_gpu_end( d, stream0 ); magma_event_record( 
event[d][1], stream[d][stream0] ); } /* update the remaining blocks */ nb2 = nb2 - nb0; #if defined(PRECISION_d) && defined(ZTRSM_WORK) magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2+nb0), ldda, d_dinvA[d][1], d_x[d][1] ); #else magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2+nb0), ldda); #endif } else if( nb2 > 0 ) { /* update the entire trailing matrix */ trace_gpu_start( d, stream1, "trsm", "trsm" ); #if defined(PRECISION_d) && defined(ZTRSM_WORK) magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda, d_dinvA[d][1], d_x[d][1] ); #else magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda); #endif trace_gpu_end( d, stream1 ); } d = (d+1)%num_gpus; } } /* end of ztrsm */ } /* end of for j=1, .., n */ } else { /* -------------------------------------------- */ /* Lower-triangular case */ /* Compute the Cholesky factorization A = L*L'. */ /* -------------------------------------------- */ #if defined(PRECISION_d) && defined(ZTRSM_WORK) /* * Allocate device memory for the inversed diagonal blocks, size=N*BLOCK_SIZE */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); for( j=0; j<2; j++ ) { magma_zmalloc( &d_dinvA[d][j], nb*nb ); magma_zmalloc( &d_x[d][j], nb*m ); cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(magmaDoubleComplex)); cudaMemset(d_x[d][j], 0, nb* m*sizeof(magmaDoubleComplex)); } } magma_setdevice(0); #endif for (j=0; j<n; j+=nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%num_gpus; buf = (j/nb)%num_gpus; /* Set the local index where the current panel is */ j_local = j/(nb*num_gpus); jb = min(nb, (n-j)); if( j > 0 ) { /* needed on pluto... 
*/ magma_setdevice(id); magma_queue_sync( stream[id][stream0] ); // wait for the column on CPU /* broadcast offdiagonal row to all gpus */ d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { if( d != id ) { magma_setdevice(d); /* wait for it on CPU */ magma_queue_wait_event( stream[d][stream0], event[id][1] ); /* send it to GPU */ magma_zsetmatrix_async( jb, j, Alo(j,0), lda, dlPT(d,0,jb,buf), nb, stream[d][stream0] ); magma_event_record( event[d][1], stream[d][stream0] ); } d = (d+1)%num_gpus; } } /* Update the current diagonal block */ magma_setdevice(id); if( j > 0 ) { magmablasSetKernelStream(stream[id][stream1]); magma_zherk(MagmaLower, MagmaNoTrans, jb, j, d_neg_one, dlA(id, nb*j_local, 0), ldda, d_one, dlA(id, nb*j_local, j), ldda); magma_event_record( event[id][0], stream[id][stream1] ); } /* send the diagonal to cpu */ magma_queue_wait_event( stream[id][stream0], event[id][0] ); // wait for syrk magma_zgetmatrix_async( jb, jb, dlA(id, nb*j_local, j), ldda, Alo(j,j), lda, stream[id][stream0] ); /* update the offdiagonal blocks */ if ( j > 0 ) { /* compute the block-rows of the panel */ d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { j_local2 = j_local+1; if( d > id ) j_local2 --; nb0 = nb*j_local2; if( nb0 < n_local[d] ) { if( d != id ) { dlpanel = dlPT(d, 0, jb, buf); ldpanel = nb; /* wait for offdiagonal row */ magma_queue_wait_event( stream[d][stream1], event[d][1] ); } else { dlpanel = dlA(d, nb*j_local, 0); ldpanel = ldda; } magma_setdevice(d); magmablasSetKernelStream(stream[d][stream1]); magma_zgemm( MagmaNoTrans, MagmaConjTrans, n_local[d]-nb0, jb, j, c_neg_one, dlA(d, nb0, 0), ldda, dlpanel, ldpanel, c_one, dlA(d, nb0, j), ldda); } d = (d+1)%num_gpus; } } /* factor the diagonal */ magma_setdevice(id); magma_queue_sync( stream[id][stream0] ); lapackf77_zpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info); if (*info != 0) { *info = *info + j; break; } /* send the diagonal to gpus */ if ( (j+jb) < m ) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { magma_setdevice(d); if( d == id ) { dlpanel = dlA(d, nb*j_local, j); ldpanel = ldda; } else { dlpanel = dlPT(d, 0, 0, buf); ldpanel = nb; } magma_zsetmatrix_async( jb, jb, Alo(j,j), lda, dlpanel, ldpanel, stream[d][stream0] ); magma_event_record( event[d][2], stream[d][stream0] ); d = (d+1)%num_gpus; } } else { magma_setdevice(id); magma_zsetmatrix_async( jb, jb, Alo(j,j), lda, dlA(id, nb*j_local, j), ldda, stream[id][stream0] ); } /* factorize off-diagonal blocks */ if ( (j+jb) < m ) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { /* next column */ j_local2 = j_local+1; if( d > id ) j_local2--; if( d == id ) { dlpanel = dlA(d, nb*j_local, j); ldpanel = ldda; } else { dlpanel = dlPT(d, 0, 0, buf); ldpanel = nb; } nb2 = n_local[d] - j_local2*nb; nb0 = min(nb, nb2 ); magma_setdevice(d); magmablasSetKernelStream(stream[d][stream1]); magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for the diagonal if( j+jb < n && d == (j/nb+1)%num_gpus ) { /* owns the next column, look-ahead the column */ #if defined(PRECISION_d) && defined(ZTRSM_WORK) magmablas_ztrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb0, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda, d_dinvA[d][0], d_x[d][0]); #else magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb0, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda); #endif magma_event_record( event[d][3], stream[d][stream1] ); /* send the column to cpu */ if( j+jb < n ) { magma_queue_wait_event( stream[d][stream0], 
event[d][3] ); // wait for lookahead magma_zgetmatrix_async( nb0, j+jb, dlA(d, nb*j_local2, 0), ldda, Alo(j+jb,0), lda, stream[d][stream0] ); magma_event_record( event[d][1], stream[d][stream0] ); } /* update the remaining blocks */ nb2 = nb2 - nb0; #if defined(PRECISION_d) && defined(ZTRSM_WORK) magmablas_ztrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2+nb0, j), ldda, d_dinvA[d][1], d_x[d][1] ); #else magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2+nb0, j), ldda); #endif } else if( nb2 > 0 ) { /* update the entire trailing matrix */ #if defined(PRECISION_d) && defined(ZTRSM_WORK) magmablas_ztrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda, d_dinvA[d][1], d_x[d][1] ); #else magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda); #endif } d = (d+1)%num_gpus; } } } } /* end of else not upper */ /* == finalize the trace == */ trace_finalize( "zpotrf.svg","trace.css" ); /* clean up */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); //magma_event_destroy( event0[d] ); //magma_event_destroy( event1[d] ); //magma_event_destroy( event2[d] ); //magma_event_destroy( event3[d] ); } magma_setdevice(0); return *info; } /* magma_zpotrf_mgpu */
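/*
   Illustration (not part of the library): magma_zpotrf_mgpu above stores the
   matrix 1-D block-cyclically over the GPUs, and each loop re-derives the
   owner and local position of a panel with the same arithmetic. A minimal
   sketch of that mapping; the helper name is ours, everything else mirrors
   the id / j_local computations in the code:
*/
static inline void
zpotrf_mgpu_owner_sketch( magma_int_t j, magma_int_t nb, magma_int_t num_gpus,
                          magma_int_t *id, magma_int_t *j_local )
{
    *id      = (j/nb) % num_gpus;    // GPU that holds panel j
    *j_local = j / (nb*num_gpus);    // local block index on that GPU
    // For MagmaLower the panel is the block row starting at local row
    // nb*(*j_local) of d_lA[*id]; for MagmaUpper it is the corresponding
    // block column, i.e. dlA(*id, nb*(*j_local), j) / dlA(*id, j, nb*(*j_local)).
}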
extern "C" magma_err_t magma_zpotrf2_msub(int num_subs, int num_gpus, magma_uplo_t uplo, magma_int_t m, magma_int_t n, magma_int_t off_i, magma_int_t off_j, magma_int_t nb, magmaDoubleComplex_ptr *d_lA, size_t d_lA_offset, magma_int_t ldda, magmaDoubleComplex_ptr *d_lP, magma_int_t lddp, magmaDoubleComplex *a, magma_int_t lda, magma_int_t h, magma_int_t *info, magma_queue_t *queues ) { /* -- clMAGMA (version 1.1.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date January 2014 Purpose ======= ZPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = 'U', or dA = L * L**H, if UPLO = 'L', where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of dA is stored; = 'L': Lower triangle of dA is stored. N (input) INTEGER The order of the matrix dA. N >= 0. dA (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the Hermitian matrix dA. If UPLO = 'U', the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be dividable by 16. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. ===================================================================== */ int tot_subs = num_subs*num_gpus; magma_int_t j, jb, nb0, nb2, dd, d, id, j_local, j_local2; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; double d_one = 1.0; double d_neg_one = -1.0; magmaDoubleComplex_ptr dlpanel; size_t dlpanel_offset; magma_int_t n_local[MagmaMaxSubs * MagmaMaxGPUs], ldpanel; // initialize trace trace_init(1, num_gpus, 2, queues); *info = 0; if ( (uplo != MagmaUpper) && (uplo != MagmaLower) ) { *info = -1; } else if (n < 0) { *info = -2; } else if ((uplo != MagmaUpper) && tot_subs*ldda < max(1,n)) { *info = -4; } else if ((uplo == MagmaUpper) && ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } for (d=0; d<tot_subs; d++) { /* local-n and local-ld */ if (uplo == MagmaUpper) { n_local[d] = ((n/nb)/tot_subs)*nb; if (d < (n/nb)%tot_subs) n_local[d] += nb; else if (d == (n/nb)%tot_subs) n_local[d] += n%nb; } else { n_local[d] = ((m/nb)/tot_subs)*nb; if (d < (m/nb)%tot_subs) n_local[d] += nb; else if (d == (m/nb)%tot_subs) n_local[d] += m%nb; } } /* Use blocked code. */ if (uplo == MagmaUpper) { /* ---------------------------------------------- */ /* Upper-triangular case */ /* > Compute the Cholesky factorization A = U'*U. 
*/ /* ---------------------------------------------- */ for (j=0; j<m; j+=nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%tot_subs; /* Set the local index where the current panel is */ j_local = j/(nb*tot_subs); jb = min(nb, (m-j)); if (j > 0) { // Wait for the column on CPU magma_queue_sync(queues[2*(id%num_gpus)]); /* broadcast off-diagonal column to all gpus */ d = (j/nb+1)%num_gpus; for (dd=0; dd<num_gpus; dd++) { if (d != id%num_gpus) { magma_zsetmatrix_async( j, jb, Aup_off(0,j), lda, dlP(d,jb,0,id%num_gpus), lddp, queues[2*d], trace_gpu_event(d, 0, "set", "set-col") ); } d = (d+1)%num_gpus; } /* Update the current diagonal block */ trace_gpu_start(id%num_gpus, 1, "herk", "herk"); magma_zherk(MagmaUpper, MagmaConjTrans, jb, j, d_neg_one, dlA(id, 0, nb*j_local), ldda, d_one, dlA(id, j, nb*j_local), ldda, queues[2*(id%num_gpus)+1]); magma_queue_sync(queues[2*(id%num_gpus)+1]); // Wait for syrk } /* Send the diagonal to cpu */ magma_zgetmatrix_async( jb, jb, dlA(id, j, nb*j_local), ldda, Aup_off(j,j), lda, queues[2*(id%num_gpus)], trace_gpu_event(id%num_gpus, 0, "get", "get-diag") ); if (j > 0) { /* Compute the local block column of the panel. */ d = (j/nb+1)%tot_subs; for (dd=0; dd<tot_subs; dd++) { j_local2 = j_local+1; if (d > id) j_local2 --; nb0 = nb*j_local2; if (n_local[d] > nb0) { if (d%num_gpus != id%num_gpus) { dlpanel = d_lP[d%num_gpus]; dlpanel_offset = dlP_offset(jb, 0, id%num_gpus); ldpanel = lddp; /* Wait for the offdiagonal column */ if (dd < num_gpus) magma_queue_sync(queues[2*(d%num_gpus)]); } else { dlpanel = d_lA[id]; dlpanel_offset = dlA_offset(0, nb*j_local); ldpanel = ldda; } /* update the panel */ trace_gpu_start(d%num_gpus, 1, "gemm", "gemm"); magma_zgemm(MagmaConjTrans, MagmaNoTrans, jb, n_local[d]-nb0, j, c_neg_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, 0, nb0), ldda, c_one, dlA(d, j, nb0), ldda, queues[2*(d%num_gpus)+1]); } d = (d+1)%tot_subs; } } /* factor the diagonal */ magma_queue_sync( queues[2*(id%num_gpus)] ); // wait for the diagonal trace_cpu_start(0, "potrf", "potrf"); lapackf77_zpotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info); trace_cpu_end(0); if (*info != 0) { *info = *info + j; break; } /* send the diagonal to gpus */ if ((j+jb) < n) { d = (j/nb+1)%num_gpus; for (dd=0; dd<num_gpus; dd++) { if (d == id%num_gpus) { dlpanel = d_lA[id]; dlpanel_offset = dlA_offset(j, nb*j_local); ldpanel = ldda; } else { dlpanel = d_lP[d]; dlpanel_offset = dlP_offset(0, 0, id%num_gpus); ldpanel = lddp; } magma_zsetmatrix_async( jb, jb, Aup_off(j,j), lda, dlpanel, dlpanel_offset, ldpanel, queues[2*d], trace_gpu_event(d, 0, "set", "set-diag")); d = (d+1)%num_gpus; } } else { magma_zsetmatrix_async( jb, jb, Aup_off(j,j), lda, dlA(id, j, nb*j_local), ldda, queues[2*(id%num_gpus)], trace_gpu_event(id%num_gpus, 0, "set", "set-diag") ); } /* panel-factorize the off-diagonal */ if ((j+jb) < n) { d = (j/nb+1)%tot_subs; for (dd=0; dd<tot_subs; dd++) { /* next column */ j_local2 = j_local+1; if (d > id) j_local2--; if (d%num_gpus == id%num_gpus) { dlpanel = d_lA[id]; dlpanel_offset = dlA_offset(j, nb*j_local); ldpanel = ldda; } else { dlpanel = d_lP[d%num_gpus]; dlpanel_offset = dlP_offset(0, 0, id%num_gpus); ldpanel = lddp; } nb2 = n_local[d]-nb*j_local2; nb0 = min(nb, nb2); if (dd < num_gpus) magma_queue_sync( queues[2*(d%num_gpus)] ); // wait for the diagonal if (j+jb < m && d == (j/nb+1)%tot_subs) { /* owns the next column, look-ahead the column */ trace_gpu_start(d%num_gpus, 1, "trsm", "trsm"); magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, 
MagmaNonUnit, jb, nb0, c_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, j, nb*j_local2), ldda, queues[2*(d%num_gpus)+1] ); /* send the column to cpu */ magma_queue_sync(queues[2*(d%num_gpus)+1]); // wait for lookahead magma_zgetmatrix_async( (j+jb), nb0, dlA(d, 0, nb*j_local2), ldda, Aup_off(0,j+jb), lda, queues[2*(d%num_gpus)], trace_gpu_event(d%num_gpus, 0, "get", "get-col") ); /* update the remaining blocks */ nb2 = nb2 - nb0; trace_gpu_start(d%num_gpus, 1, "trsm", "trsm"); magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, j, nb*j_local2+nb0), ldda, queues[2*(d%num_gpus)+1] ); } else if (nb2 > 0) { /* update the entire trailing matrix */ trace_gpu_start(d%num_gpus, 1, "trsm", "trsm"); magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, j, nb*j_local2), ldda, queues[2*(d%num_gpus)+1] ); } d = (d+1)%tot_subs; } } } } else { /* -------------------------------------------- */ /* Lower-triangular case */ /* Compute the Cholesky factorization A = L*L'. */ /* -------------------------------------------- */ for (j=0; j<n; j+=nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%tot_subs; /* Set the local index where the current panel is */ j_local = j/(nb*tot_subs); jb = min(nb, (n-j)); if (j > 0) { if (num_gpus > 1) { // Wait for the row on CPU to broadcast magma_queue_sync(queues[2*(id%num_gpus)]); } /* broadcast off-diagonal row to all the GPUs */ d = (j/nb+1)%num_gpus; for (dd=0; dd<num_gpus; dd++) { if (d != id%num_gpus) { /* send it to GPU-d */ magma_zsetmatrix_async( jb, j, Alo_off(j,0), lda, dlPT(d,0,jb,id%num_gpus), nb, queues[2*d], trace_gpu_event(d, 0, "set", "set-row") ); } d = (d+1)%num_gpus; } /* Update the current diagonal block */ trace_gpu_start(id%num_gpus, 1, "herk", "herk"); magma_zherk(MagmaLower, MagmaNoTrans, jb, j, d_neg_one, dlA(id, nb*j_local, 0), ldda, d_one, dlA(id, nb*j_local, j), ldda, queues[2*(id%num_gpus)+1]); magma_queue_sync(queues[2*(id%num_gpus)+1]); // wait for syrk } /* send the diagonal to cpu */ magma_zgetmatrix_async( jb, jb, dlA(id, nb*j_local, j), ldda, Alo_off(j,j), lda, queues[2*(id%num_gpus)], trace_gpu_event(id%num_gpus, 0, "get", "get") ); /* update the offdiagonal blocks */ if (j > 0) { /* compute the block-rows of the panel */ d = (j/nb+1)%tot_subs; for (dd=0; dd<tot_subs; dd++) { j_local2 = j_local+1; if (d > id) j_local2 --; nb0 = nb*j_local2; if (nb0 < n_local[d]) { if (d%num_gpus != id%num_gpus) { dlpanel = d_lP[d%num_gpus]; dlpanel_offset = dlPT_offset(0, jb, id%num_gpus); ldpanel = nb; /* Wait for offdiagonal row */ if (dd < num_gpus) magma_queue_sync(queues[2*(d%num_gpus)]); } else { dlpanel = d_lA[id]; dlpanel_offset = dlA_offset(nb*j_local, 0); ldpanel = ldda; } /* Update the panel */ trace_gpu_start(d%num_gpus, 1, "gemm", "gemm"); magma_zgemm( MagmaNoTrans, MagmaConjTrans, n_local[d]-nb0, jb, j, c_neg_one, dlA(d, nb0, 0), ldda, dlpanel, dlpanel_offset, ldpanel, c_one, dlA(d, nb0, j), ldda, queues[2*(d%num_gpus)+1]); } d = (d+1)%tot_subs; } } /* factor the diagonal */ magma_queue_sync( queues[2*(id%num_gpus)] ); trace_cpu_start(0, "potrf", "potrf"); lapackf77_zpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info); trace_cpu_end(0); if (*info != 0) { printf( " zpotrf returned %d (id=%d,j=%d,j_local=%d,jb=%d)\n",*info,id,j,j_local,jb ); *info = *info + j; break; } /* send the diagonal to gpus */ if ((j+jb) < m) { d = (j/nb+1)%num_gpus; for (dd=0; dd<num_gpus; dd++) { if (d == 
id%num_gpus) { dlpanel = d_lA[id]; dlpanel_offset = dlA_offset(nb*j_local, j); ldpanel = ldda; } else { dlpanel = d_lP[d]; dlpanel_offset = dlPT_offset(0, 0, id%num_gpus); ldpanel = nb; } magma_zsetmatrix_async( jb, jb, Alo_off(j,j), lda, dlpanel, dlpanel_offset, ldpanel, queues[2*d], trace_gpu_event(d, 0, "set", "set-diag") ); d = (d+1)%num_gpus; } } else { magma_zsetmatrix_async( jb, jb, Alo_off(j,j), lda, dlA(id, nb*j_local, j), ldda, queues[2*(id%num_gpus)], trace_gpu_event(id%num_gpus, 0, "set", "set-diag") ); } /* factorize off-diagonal blocks */ if ((j+jb) < m) { d = (j/nb+1)%tot_subs; for (dd=0; dd<tot_subs; dd++) { /* next column */ j_local2 = j_local+1; if (d > id) j_local2--; if (d%num_gpus == id%num_gpus) { dlpanel = d_lA[id]; dlpanel_offset = dlA_offset(nb*j_local, j); ldpanel = ldda; } else { dlpanel = d_lP[d%num_gpus]; dlpanel_offset = dlPT_offset(0, 0, id%num_gpus); ldpanel = nb; } nb2 = n_local[d] - j_local2*nb; nb0 = min(nb, nb2 ); // wait for the diagonal if (dd < num_gpus) magma_queue_sync(queues[2*(d%num_gpus)]); if (j+jb < n && d == (j/nb+1)%tot_subs) { /* owns the next column, look-ahead the column */ trace_gpu_start(d%num_gpus, 1, "trsm", "trsm"); magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb0, jb, c_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, nb*j_local2, j), ldda, queues[2*(d%num_gpus)+1]); /* send the column to cpu */ magma_queue_sync( queues[2*(d%num_gpus)+1] ); // wait for lookahead magma_zgetmatrix_async( nb0, j+jb, dlA(d, nb*j_local2, 0), ldda, Alo_off(j+jb,0), lda, queues[2*(d%num_gpus)], trace_gpu_event(d%num_gpus, 0, "get", "get") ); /* update the remaining blocks */ nb2 = nb2 - nb0; trace_gpu_start(d%num_gpus, 1, "trsm", "trsm"); magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, nb*j_local2+nb0, j), ldda, queues[2*(d%num_gpus)+1]); } else if (nb2 > 0) { /* update the entire trailing matrix */ trace_gpu_start(d%num_gpus, 1, "trsm", "trsm"); magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, nb*j_local2, j), ldda, queues[2*(d%num_gpus)+1]); } d = (d+1)%tot_subs; } } } } /* end of else not upper */ /* clean up */ for( d=0; d<num_gpus; d++ ) { magma_queue_sync( queues[2*d] ); magma_queue_sync( queues[2*d+1] ); } trace_finalize("zpotrf_msub.svg", "trace.css"); return *info; } /* magma_zpotrf2_msub */
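/*
   Worked example (illustrative) of the n_local[] computation above: each of
   the tot_subs sub-matrices receives floor((n/nb)/tot_subs) full block
   columns, the first (n/nb)%tot_subs of them get one extra full block, and
   the sub-matrix right after those gets the ragged n%nb tail. For
   n = 10*nb + r and tot_subs = 4 this yields local widths
   {3*nb, 3*nb, 2*nb + r, 2*nb}.
*/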
extern "C" void magma_zherk_mgpu( magma_int_t ngpu, magma_uplo_t uplo, magma_trans_t trans, magma_int_t nb, magma_int_t n, magma_int_t k, double alpha, magmaDoubleComplex_ptr dB[], magma_int_t lddb, magma_int_t b_offset, double beta, magmaDoubleComplex_ptr dC[], magma_int_t lddc, magma_int_t c_offset, magma_int_t nqueue, magma_queue_t queues[][10]) { #define dB(id, i, j) (dB[(id)]+(j)*lddb + (i)+b_offset) #define dC(id, i, j) (dC[(id)]+(j)*lddc + (i)) #define STREAM_ID(i) (nqueue > 1 ? 1+((i)/nb)%(nqueue-1) : 0) magma_int_t i, id, ib, ii, kk, n1; magmaDoubleComplex z_alpha = MAGMA_Z_MAKE(alpha,0.0); magmaDoubleComplex z_beta = MAGMA_Z_MAKE(beta, 0.0); magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); /* diagonal update */ for( i=0; i < n; i += nb ) { id = ((i+c_offset)/nb)%ngpu; kk = STREAM_ID( i+c_offset ); ib = min(nb, n-i); ii = nb*((i+c_offset)/(nb*ngpu)); /* zher2k on diagonal block */ magma_setdevice(id); magmablasSetKernelStream( queues[id][kk] ); trace_gpu_start( id, kk, "syr2k", "syr2k" ); magma_zherk(uplo, trans, ib, k, alpha, dB(id, i, 0 ), lddb, beta, dC(id, i+c_offset, ii), lddc); trace_gpu_end( id, kk ); } /* off-diagonal update */ if (uplo == MagmaUpper) { for( i=nb; i < n; i += nb ) { id = ((i+c_offset)/nb)%ngpu; kk = STREAM_ID( i+c_offset ); ib = min(nb, n-i); ii = nb*((i+c_offset)/(nb*ngpu)); magma_setdevice(id); magmablasSetKernelStream( queues[id][kk] ); magma_zgemm(MagmaNoTrans, MagmaConjTrans, i, ib, k, z_alpha, dB(id, 0, 0 ), lddb, dB(id, i, 0 ), lddb, z_beta, dC(id, 0, ii), lddc); } } else { for( i=0; i < n-nb; i += nb ) { id = ((i+c_offset)/nb)%ngpu; kk = STREAM_ID( i+c_offset ); ib = min(nb, n-i); ii = nb*((i+c_offset)/(nb*ngpu)); n1 = n-i-ib; /* zgemm on off-diagonal blocks */ magma_setdevice(id); magmablasSetKernelStream( queues[id][kk] ); trace_gpu_start( id, kk, "gemm_up", "gemm_up" ); magma_zgemm(MagmaNoTrans, MagmaConjTrans, n1, ib, k, z_alpha, dB(id, i+ib, 0 ), lddb, dB(id, i, 0 ), lddb, z_beta, dC(id, i+c_offset+ib, ii), lddc); trace_gpu_end( id, kk ); } } // TODO why not sync? //for( id=0; id < ngpu; id++ ) { // magma_setdevice(id); // //for( kk=0; kk < nqueue; kk++ ) // // magma_queue_sync( queues[id][kk] ); //} magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); }
/** Purpose ------- ZPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = MagmaUpper, or dA = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. If the current stream is NULL, this version replaces it with a new stream to overlap computation with communication. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of dA is stored; - = MagmaLower: Lower triangle of dA is stored. @param[in] n INTEGER The order of the matrix dA. N >= 0. @param[in,out] dA COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the Hermitian matrix dA. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_zposv_comp ********************************************************************/ extern "C" magma_int_t magma_zpotrf_batched( magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex **dA_array, magma_int_t ldda, magma_int_t *info_array, magma_int_t batchCount, magma_queue_t queue) { #define A(i_, j_) (A + (i_) + (j_)*ldda) double d_alpha = -1.0; double d_beta = 1.0; cudaMemset(info_array, 0, batchCount*sizeof(magma_int_t)); magma_int_t arginfo = 0; if ( uplo != MagmaUpper && uplo != MagmaLower) { arginfo = -1; } else if (n < 0) { arginfo = -2; } else if (ldda < max(1,n)) { arginfo = -4; } if (arginfo != 0) { magma_xerbla( __func__, -(arginfo) ); return arginfo; } // Quick return if possible if (n == 0) { return arginfo; } if( n > 2048 ){ printf("=========================================================================================\n"); printf(" WARNING batched routines are designed for small sizes it might be better to use the\n Native/Hybrid classical routines if you want performance\n"); printf("=========================================================================================\n"); } magma_int_t j, k, ib; magma_int_t nb = POTRF_NB; magma_int_t gemm_crossover = 127;//nb > 32 ? 
127 : 160; #if defined(USE_CUOPT) cublasHandle_t myhandle; cublasCreate_v2(&myhandle); #else cublasHandle_t myhandle=NULL; #endif magmaDoubleComplex **dA_displ = NULL; magmaDoubleComplex **dW0_displ = NULL; magmaDoubleComplex **dW1_displ = NULL; magmaDoubleComplex **dW2_displ = NULL; magmaDoubleComplex **dW3_displ = NULL; magmaDoubleComplex **dW4_displ = NULL; magmaDoubleComplex **dinvA_array = NULL; magmaDoubleComplex **dwork_array = NULL; magma_malloc((void**)&dA_displ, batchCount * sizeof(*dA_displ)); magma_malloc((void**)&dW0_displ, batchCount * sizeof(*dW0_displ)); magma_malloc((void**)&dW1_displ, batchCount * sizeof(*dW1_displ)); magma_malloc((void**)&dW2_displ, batchCount * sizeof(*dW2_displ)); magma_malloc((void**)&dW3_displ, batchCount * sizeof(*dW3_displ)); magma_malloc((void**)&dW4_displ, batchCount * sizeof(*dW4_displ)); magma_malloc((void**)&dinvA_array, batchCount * sizeof(*dinvA_array)); magma_malloc((void**)&dwork_array, batchCount * sizeof(*dwork_array)); magma_int_t invA_msize = ((n+TRI_NB-1)/TRI_NB)*TRI_NB*TRI_NB; magma_int_t dwork_msize = n*nb; magmaDoubleComplex* dinvA = NULL; magmaDoubleComplex* dwork = NULL;// dinvA and dwork are workspace in ztrsm magmaDoubleComplex **cpuAarray = NULL; magma_zmalloc( &dinvA, invA_msize * batchCount); magma_zmalloc( &dwork, dwork_msize * batchCount ); magma_malloc_cpu((void**) &cpuAarray, batchCount*sizeof(magmaDoubleComplex*)); /* check allocation */ if ( dA_displ == NULL || dW0_displ == NULL || dW1_displ == NULL || dW2_displ == NULL || dW3_displ == NULL || dW4_displ == NULL || dinvA_array == NULL || dwork_array == NULL || dinvA == NULL || dwork == NULL || cpuAarray == NULL ) { magma_free(dA_displ); magma_free(dW0_displ); magma_free(dW1_displ); magma_free(dW2_displ); magma_free(dW3_displ); magma_free(dW4_displ); magma_free(dinvA_array); magma_free(dwork_array); magma_free( dinvA ); magma_free( dwork ); free(cpuAarray); magma_int_t info = MAGMA_ERR_DEVICE_ALLOC; magma_xerbla( __func__, -(info) ); return info; } magmablas_zlaset_q(MagmaFull, invA_msize, batchCount, MAGMA_Z_ZERO, MAGMA_Z_ZERO, dinvA, invA_msize, queue); magmablas_zlaset_q(MagmaFull, dwork_msize, batchCount, MAGMA_Z_ZERO, MAGMA_Z_ZERO, dwork, dwork_msize, queue); zset_pointer(dwork_array, dwork, 1, 0, 0, dwork_msize, batchCount, queue); zset_pointer(dinvA_array, dinvA, TRI_NB, 0, 0, invA_msize, batchCount, queue); magma_queue_t cstream; magmablasGetKernelStream(&cstream); magma_int_t streamid; const magma_int_t nbstreams=32; magma_queue_t stream[nbstreams]; for(k=0; k<nbstreams; k++){ magma_queue_create( &stream[k] ); } magma_getvector( batchCount, sizeof(magmaDoubleComplex*), dA_array, 1, cpuAarray, 1); magmablasSetKernelStream(NULL); if (uplo == MagmaUpper) { printf("Upper side is unavailable \n"); goto fin; } else { for(j = 0; j < n; j+=nb) { ib = min(nb, n-j); #if 1 //=============================================== // panel factorization //=============================================== magma_zdisplace_pointers(dA_displ, dA_array, ldda, j, j, batchCount, queue); zset_pointer(dwork_array, dwork, 1, 0, 0, dwork_msize, batchCount, queue); zset_pointer(dinvA_array, dinvA, TRI_NB, 0, 0, invA_msize, batchCount, queue); #if 0 arginfo = magma_zpotrf_panel_batched( uplo, n-j, ib, dA_displ, ldda, dwork_array, dwork_msize, dinvA_array, invA_msize, dW0_displ, dW1_displ, dW2_displ, dW3_displ, dW4_displ, info_array, j, batchCount, myhandle); #else //arginfo = magma_zpotrf_rectile_batched( arginfo = magma_zpotrf_recpanel_batched( uplo, n-j, ib, 32, dA_displ, ldda, dwork_array, 
dwork_msize, dinvA_array, invA_msize, dW0_displ, dW1_displ, dW2_displ, dW3_displ, dW4_displ, info_array, j, batchCount, myhandle, queue); #endif if(arginfo != 0 ) goto fin; //=============================================== // end of panel //=============================================== #endif #if 1 //real_Double_t gpu_time; //gpu_time = magma_sync_wtime(NULL); if( (n-j-ib) > 0){ if( (n-j-ib) > gemm_crossover) { //------------------------------------------- // USE STREAM HERK //------------------------------------------- // since it use different stream I need to wait the panel. // But since the code use the NULL stream everywhere, // so I don't need it, because the NULL stream do the sync by itself //magma_queue_sync(NULL); /* you must know the matrix layout inorder to do it */ for(k=0; k<batchCount; k++) { streamid = k%nbstreams; magmablasSetKernelStream(stream[streamid]); // call herk, class zherk must call cpu pointer magma_zherk(MagmaLower, MagmaNoTrans, n-j-ib, ib, d_alpha, (const magmaDoubleComplex*) cpuAarray[k] + j+ib+j*ldda, ldda, d_beta, cpuAarray[k] + j+ib+(j+ib)*ldda, ldda); } // need to synchronise to be sure that panel do not start before // finishing the update at least of the next panel // BUT no need for it as soon as the other portion of the code // use the NULL stream which do the sync by itself //magma_device_sync(); magmablasSetKernelStream(NULL); } else { //------------------------------------------- // USE BATCHED GEMM(which is a HERK in fact, since it only access the lower part) //------------------------------------------- magma_zdisplace_pointers(dA_displ, dA_array, ldda, j+ib, j, batchCount, queue); magma_zdisplace_pointers(dW1_displ, dA_array, ldda, j+ib, j+ib, batchCount, queue); magmablas_zherk_batched(uplo, MagmaNoTrans, n-j-ib, ib, d_alpha, dA_displ, ldda, d_beta, dW1_displ, ldda, batchCount, queue); } } //gpu_time = magma_sync_wtime(NULL) - gpu_time; //real_Double_t flops = (n-j-ib) * (n-j-ib) * ib / 1e9 * batchCount; //real_Double_t gpu_perf = flops / gpu_time; //printf("Rows= %d, Colum=%d, herk time = %7.2fms, Gflops= %7.2f\n", n-j-ib, ib, gpu_time*1000, gpu_perf); #endif } } fin: magma_queue_sync(NULL); for(k=0; k<nbstreams; k++){ magma_queue_destroy( stream[k] ); } magmablasSetKernelStream(cstream); #if defined(USE_CUOPT) cublasDestroy_v2(myhandle); #endif magma_free(dA_displ); magma_free(dW0_displ); magma_free(dW1_displ); magma_free(dW2_displ); magma_free(dW3_displ); magma_free(dW4_displ); magma_free(dinvA_array); magma_free(dwork_array); magma_free( dinvA ); magma_free( dwork ); free(cpuAarray); return arginfo; }
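/*
   Minimal call sketch for the batched interface above (illustrative; error
   handling omitted). dA_array and info_array live on the device, as the
   magma_getvector on dA_array and the cudaMemset on info_array in the
   routine imply; only MagmaLower is currently supported.

       magmaDoubleComplex **dA_array;   // batchCount device pointers
       magma_int_t *dinfo_array;        // batchCount device infos
       magma_malloc((void**)&dA_array,    batchCount*sizeof(magmaDoubleComplex*));
       magma_malloc((void**)&dinfo_array, batchCount*sizeof(magma_int_t));
       // ... fill dA_array[i] with the i-th n-by-n matrix on the GPU ...
       magma_zpotrf_batched(MagmaLower, n, dA_array, ldda,
                            dinfo_array, batchCount, queue);
*/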
/**
    Purpose
    -------
    ZLAUUM computes the product U * U' or L' * L, where the triangular
    factor U or L is stored in the upper or lower triangular part of
    the array A.

    If UPLO = MagmaUpper then the upper triangle of the result is stored,
    overwriting the factor U in A.
    If UPLO = MagmaLower then the lower triangle of the result is stored,
    overwriting the factor L in A.

    This is the blocked form of the algorithm, calling Level 3 BLAS.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
            Specifies whether the triangular factor stored in the array A
            is upper or lower triangular:
      -     = MagmaUpper: Upper triangular
      -     = MagmaLower: Lower triangular

    @param[in]
    n       INTEGER
            The order of the triangular factor U or L.  N >= 0.

    @param[in,out]
    A       COMPLEX_16 array, dimension (LDA,N)
            On entry, the triangular factor U or L.
            On exit, if UPLO = MagmaUpper, the upper triangle of A is
            overwritten with the upper triangle of the product U * U';
            if UPLO = MagmaLower, the lower triangle of A is overwritten
            with the lower triangle of the product L' * L.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    info    INTEGER
      -     = 0: successful exit
      -     < 0: if INFO = -k, the k-th argument had an illegal value

    @ingroup magma_zposv_aux
    ***************************************************************************/
extern "C" magma_int_t
magma_zlauum(
    magma_uplo_t uplo, magma_int_t n,
    magmaDoubleComplex *A, magma_int_t lda,
    magma_int_t *info)
{
#define A(i, j)  (A  + (j)*lda  + (i))
#define dA(i, j) (dA + (j)*ldda + (i))

    /* Local variables */
    const char* uplo_ = lapack_uplo_const( uplo );
    magma_int_t ldda, nb;
    magma_int_t i, ib;
    magmaDoubleComplex c_one = MAGMA_Z_ONE;
    double d_one = MAGMA_D_ONE;
    magmaDoubleComplex *dA;
    int upper = (uplo == MagmaUpper);

    *info = 0;
    if (! upper && uplo != MagmaLower)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (lda < max(1,n))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return */
    if ( n == 0 )
        return *info;

    ldda = ((n+31)/32)*32;

    if (MAGMA_SUCCESS != magma_zmalloc( &dA, (n)*ldda )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    magma_queue_t stream[2];
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );

    nb = magma_get_zpotrf_nb(n);

    if (nb <= 1 || nb >= n)
        lapackf77_zlauum(uplo_, &n, A, &lda, info);
    else {
        if (upper) {
            /* Compute the product U * U'. */
            for (i=0; i < n; i += nb) {
                ib = min(nb,n-i);
                magma_zsetmatrix_async( ib, ib,
                                        A(i,i), lda,
                                        dA(i, i), ldda, stream[1] );
                magma_zsetmatrix_async( ib, (n-i-ib),
                                        A(i,i+ib), lda,
                                        dA(i,i+ib), ldda, stream[0] );
                magma_queue_sync( stream[1] );
                magma_ztrmm( MagmaRight, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                             i, ib, c_one, dA(i,i), ldda, dA(0, i), ldda);
                lapackf77_zlauum(MagmaUpperStr, &ib, A(i,i), &lda, info);
                magma_zsetmatrix_async( ib, ib,
                                        A(i, i), lda,
                                        dA(i, i), ldda, stream[0] );
                if (i+ib < n) {
                    magma_zgemm( MagmaNoTrans, MagmaConjTrans,
                                 i, ib, (n-i-ib), c_one, dA(0,i+ib), ldda,
                                 dA(i, i+ib), ldda, c_one, dA(0,i), ldda);
                    magma_queue_sync( stream[0] );
                    magma_zherk( MagmaUpper, MagmaNoTrans, ib, (n-i-ib),
                                 d_one, dA(i, i+ib), ldda,
                                 d_one, dA(i, i), ldda);
                }
                magma_zgetmatrix( i+ib, ib,
                                  dA(0, i), ldda,
                                  A(0, i), lda );
            }
        }
        else {
            /* Compute the product L' * L.
*/ for (i=0; i < n; i += nb) { ib=min(nb,n-i); magma_zsetmatrix_async( ib, ib, A(i,i), lda, dA(i, i), ldda, stream[1] ); magma_zsetmatrix_async( (n-i-ib), ib, A(i+ib, i), lda, dA(i+ib, i), ldda, stream[0] ); magma_queue_sync( stream[1] ); magma_ztrmm( MagmaLeft, MagmaLower, MagmaConjTrans, MagmaNonUnit, ib, i, c_one, dA(i,i), ldda, dA(i, 0),ldda); lapackf77_zlauum(MagmaLowerStr, &ib, A(i,i), &lda, info); magma_zsetmatrix_async( ib, ib, A(i, i), lda, dA(i, i), ldda, stream[0] ); if (i+ib < n) { magma_zgemm(MagmaConjTrans, MagmaNoTrans, ib, i, (n-i-ib), c_one, dA( i+ib,i), ldda, dA(i+ib, 0),ldda, c_one, dA(i,0), ldda); magma_queue_sync( stream[0] ); magma_zherk(MagmaLower, MagmaConjTrans, ib, (n-i-ib), d_one, dA(i+ib, i), ldda, d_one, dA(i, i), ldda); } magma_zgetmatrix( ib, i+ib, dA(i, 0), ldda, A(i, 0), lda ); } } } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free( dA ); return *info; }
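/*
   Context sketch (illustrative, not part of this file): ZLAUUM is the last
   step of Hermitian positive definite matrix inversion (cf. LAPACK's
   ZPOTRI). After the Cholesky factor has been computed and inverted in
   place, lauum forms the product that yields inv(A):

       lapackf77_zpotrf( uplo_, &n, A, &lda, info );                  // A = L*L^H
       lapackf77_ztrtri( uplo_, MagmaNonUnitStr, &n, A, &lda, info ); // A = inv(L)
       magma_zlauum( uplo, n, A, lda, info );       // A = inv(L)^H*inv(L) = inv(A)
*/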
/** Purpose ------- ZPOTRF_OOC computes the Cholesky factorization of a complex Hermitian positive definite matrix A. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix A may exceed the GPU memory. The factorization has the form A = U**H * U, if UPLO = MagmaUpper, or A = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A COMPLEX_16 array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization A = U**H * U or A = L * L**H. \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_zposv_comp ********************************************************************/ extern "C" magma_int_t magma_zpotrf_m( magma_int_t ngpu, magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex *A, magma_int_t lda, magma_int_t *info) { #define A(i, j) ( A + (j)*lda + (i)) #define dA(d, i, j) (dwork[(d)] + (j)*lddla + (i)) #define dT(d, i, j) ( dt[(d)] + (j)*ldda + (i)) #define dAup(d, i, j) (dwork[(d)] + (j)*NB + (i)) #define dTup(d, i, j) ( dt[(d)] + (j)*nb + (i)) /* Local variables */ double d_one = 1.0; double d_neg_one = -1.0; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; const char* uplo_ = lapack_uplo_const( uplo ); int upper = (uplo == MagmaUpper); magmaDoubleComplex *dwork[MagmaMaxGPUs], *dt[MagmaMaxGPUs]; magma_int_t ldda, lddla, nb, iinfo, n_local[MagmaMaxGPUs], J2, d, ngpu0 = ngpu; magma_int_t j, jj, jb, J, JB, NB, h; magma_queue_t stream[MagmaMaxGPUs][3]; magma_event_t event[MagmaMaxGPUs][5]; magma_timer_t time_total=0, time_sum=0, time=0; *info = 0; if (! 
upper && uplo != MagmaLower) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,n)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return */
    if ( n == 0 )
        return *info;

    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );

    nb = magma_get_dpotrf_nb(n);
    if ( ngpu0 > n/nb ) {
        ngpu = n/nb;
        if ( n%nb != 0 ) ngpu ++;
    } else {
        ngpu = ngpu0;
    }
    //ldda  = ((n+31)/32)*32;
    ldda  = ((n+nb-1)/nb)*nb;
    lddla = ((nb*((n+nb*ngpu-1)/(nb*ngpu))+31)/32)*32;

    /* figure out NB */
    size_t freeMem, totalMem;
    cudaMemGetInfo( &freeMem, &totalMem );
    freeMem /= sizeof(magmaDoubleComplex);

    //MB = n;  /* number of rows in the big panel */
    NB = (magma_int_t)((0.8*freeMem - max(2,ngpu)*nb*ldda - (n+nb)*nb)/lddla); /* number of columns in the big panel */
    //NB = min(5*nb,n);
    if ( NB >= n ) {
#ifdef CHECK_ZPOTRF_OOC
        printf( " * still fits in GPU memory.\n" );
#endif
        NB = n;
    } else {
#ifdef CHECK_ZPOTRF_OOC
        printf( " * doesn't fit in GPU memory.\n" );
#endif
        NB = (NB/nb) * nb;   /* making sure it's divisible by nb */
    }
#ifdef CHECK_ZPOTRF_OOC
    if ( NB != n )
        printf( " * running in out-of-core mode (n=%d, NB=%d, nb=%d, lddla=%d, freeMem=%.2e).\n", n, NB, nb, lddla, (double)freeMem );
    else
        printf( " * running in in-core mode (n=%d, NB=%d, nb=%d, lddla=%d, freeMem=%.2e).\n", n, NB, nb, lddla, (double)freeMem );
    fflush(stdout);
#endif
    for (d=0; d < ngpu; d++ ) {
        magma_setdevice(d);
        if (MAGMA_SUCCESS != magma_zmalloc( &dt[d], NB*lddla + max(2,ngpu)*nb*ldda )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
        dwork[d] = &dt[d][max(2,ngpu)*nb*ldda];
        for( j=0; j < 3; j++ )
            magma_queue_create( &stream[d][j] );
        for( j=0; j < 5; j++ )
            magma_event_create( &event[d][j] );
        magma_device_sync();   // sync the device
    }
    magma_setdevice(0);

    timer_start( time_total );

    if (nb <= 1 || nb >= n) {
        lapackf77_zpotrf(uplo_, &n, A, &lda, info);
    } else {
        /* Use hybrid blocked code. */
        if (upper) {
            /* =========================================================== *
             * Compute the Cholesky factorization A = U'*U.
* * big panel is divided by block-row and distributed in block * * column cyclic format */ /* for each big-panel */ for( J=0; J < n; J += NB ) { JB = min(NB,n-J); if ( ngpu0 > (n-J)/nb ) { ngpu = (n-J)/nb; if ( (n-J)%nb != 0 ) ngpu ++; } else { ngpu = ngpu0; } /* load the new big-panel by block-rows */ magma_zhtodpo( ngpu, uplo, JB, n, J, J, nb, A, lda, dwork, NB, stream, &iinfo); /* update with the previous big-panels */ timer_start( time ); for( j=0; j < J; j += nb ) { /* upload the diagonal of the block column (broadcast to all GPUs) */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_zsetmatrix_async( nb, JB, A(j, J), lda, dTup(d, 0, J), nb, stream[d][0] ); n_local[d] = 0; } /* distribute off-diagonal blocks to GPUs */ for( jj=J+JB; jj < n; jj += nb ) { d = ((jj-J)/nb)%ngpu; magma_setdevice(d); jb = min(nb, n-jj); magma_zsetmatrix_async( nb, jb, A(j, jj), lda, dTup(d, 0, J+JB+n_local[d]), nb, stream[d][0] ); n_local[d] += jb; } /* wait for the communication */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); } /* update the current big-panel using the previous block-row */ /* -- process the big diagonal block of the big panel */ for( jj=0; jj < JB; jj += nb ) { // jj is 'local' column index within the big panel d = (jj/nb)%ngpu; J2 = jj/(nb*ngpu); magma_setdevice(d); magmablasSetKernelStream(stream[d][J2%2]); // the last stream (2) used to process off-diagonal J2 = nb*J2; jb = min(nb,JB-jj); // number of columns in this current block-row magma_zgemm( MagmaConjTrans, MagmaNoTrans, jj, jb, nb, c_neg_one, dTup(d, 0, J ), nb, dTup(d, 0, J+jj), nb, c_one, dAup(d, 0, J2), NB); magma_zherk(MagmaUpper, MagmaConjTrans, jb, nb, d_neg_one, dTup(d, 0, J+jj), nb, d_one, dAup(d, jj, J2), NB); } /* -- process the remaining big off-diagonal block of the big panel */ if ( n > J+JB ) { for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][2]); /* local number of columns in the big panel */ n_local[d] = ((n-J)/(nb*ngpu))*nb; if (d < ((n-J)/nb)%ngpu) n_local[d] += nb; else if (d == ((n-J)/nb)%ngpu) n_local[d] += (n-J)%nb; /* subtracting the local number of columns in the diagonal */ J2 = nb*(JB/(nb*ngpu)); if ( d < (JB/nb)%ngpu ) J2 += nb; n_local[d] -= J2; magma_zgemm( MagmaConjTrans, MagmaNoTrans, JB, n_local[d], nb, c_neg_one, dTup(d, 0, J ), nb, dTup(d, 0, J+JB), nb, c_one, dAup(d, 0, J2), NB); } } /* wait for the previous updates */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); for( jj=0; jj < 3; jj++ ) magma_queue_sync( stream[d][jj] ); magmablasSetKernelStream(NULL); } magma_setdevice(0); } /* end of updates with previous rows */ /* factor the big panel */ h = (JB+nb-1)/nb; // big diagonal of big panel will be on CPU // using three streams magma_zpotrf3_mgpu(ngpu, uplo, JB, n-J, J, J, nb, dwork, NB, dt, ldda, A, lda, h, stream, event, &iinfo); if ( iinfo != 0 ) { *info = J+iinfo; break; } time_sum += timer_stop( time ); /* upload the off-diagonal (and diagonal!!!) big panel */ magma_zdtohpo(ngpu, uplo, JB, n, J, J, nb, NB, A, lda, dwork, NB, stream, &iinfo); //magma_zdtohpo(ngpu, uplo, JB, n, J, J, nb, 0, A, lda, dwork, NB, stream, &iinfo); } } else { /* ========================================================= * * Compute the Cholesky factorization A = L*L'. 
*/ /* for each big-panel */ for( J=0; J < n; J += NB ) { JB = min(NB,n-J); if ( ngpu0 > (n-J)/nb ) { ngpu = (n-J)/nb; if ( (n-J)%nb != 0 ) ngpu ++; } else { ngpu = ngpu0; } /* load the new big-panel by block-columns */ magma_zhtodpo( ngpu, uplo, n, JB, J, J, nb, A, lda, dwork, lddla, stream, &iinfo); /* update with the previous big-panels */ timer_start( time ); for( j=0; j < J; j += nb ) { /* upload the diagonal of big panel */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_zsetmatrix_async( JB, nb, A(J, j), lda, dT(d, J, 0), ldda, stream[d][0] ); n_local[d] = 0; } /* upload off-diagonals */ for( jj=J+JB; jj < n; jj += nb ) { d = ((jj-J)/nb)%ngpu; magma_setdevice(d); jb = min(nb, n-jj); magma_zsetmatrix_async( jb, nb, A(jj, j), lda, dT(d, J+JB+n_local[d], 0), ldda, stream[d][0] ); n_local[d] += jb; } /* wait for the communication */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); } /* update the current big-panel using the previous block-row */ for( jj=0; jj < JB; jj += nb ) { /* diagonal */ d = (jj/nb)%ngpu; J2 = jj/(nb*ngpu); magma_setdevice(d); magmablasSetKernelStream(stream[d][J2%2]); J2 = nb*J2; jb = min(nb,JB-jj); magma_zgemm( MagmaNoTrans, MagmaConjTrans, jb, jj, nb, c_neg_one, dT(d, J+jj, 0), ldda, dT(d, J, 0), ldda, c_one, dA(d, J2, 0), lddla); magma_zherk(MagmaLower, MagmaNoTrans, jb, nb, d_neg_one, dT(d, J+jj, 0), ldda, d_one, dA(d, J2, jj), lddla); } if ( n > J+JB ) { /* off-diagonal */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][2]); /* local number of columns in the big panel */ n_local[d] = (((n-J)/nb)/ngpu)*nb; if (d < ((n-J)/nb)%ngpu) n_local[d] += nb; else if (d == ((n-J)/nb)%ngpu) n_local[d] += (n-J)%nb; /* subtracting local number of columns in diagonal */ J2 = nb*(JB/(nb*ngpu)); if ( d < (JB/nb)%ngpu ) J2 += nb; n_local[d] -= J2; magma_zgemm( MagmaNoTrans, MagmaConjTrans, n_local[d], JB, nb, c_neg_one, dT(d, J+JB, 0), ldda, dT(d, J, 0), ldda, c_one, dA(d, J2, 0), lddla); } } /* wait for the previous updates */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); for( jj=0; jj < 3; jj++ ) magma_queue_sync( stream[d][jj] ); magmablasSetKernelStream(NULL); } magma_setdevice(0); } /* factor the big panel */ h = (JB+nb-1)/nb; // big diagonal of big panel will be on CPU // using three streams magma_zpotrf3_mgpu(ngpu, uplo, n-J, JB, J, J, nb, dwork, lddla, dt, ldda, A, lda, h, stream, event, &iinfo); if ( iinfo != 0 ) { *info = J+iinfo; break; } time_sum += timer_stop( time ); /* upload the off-diagonal big panel */ magma_zdtohpo( ngpu, uplo, n, JB, J, J, nb, JB, A, lda, dwork, lddla, stream, &iinfo); } /* end of for J */ } /* if upper */ } /* if nb */ timer_stop( time_total ); if ( ngpu0 > n/nb ) { ngpu = n/nb; if ( n%nb != 0 ) ngpu ++; } else { ngpu = ngpu0; } for (d=0; d < ngpu; d++ ) { magma_setdevice(d); for( j=0; j < 3; j++ ) { magma_queue_destroy( stream[d][j] ); } magma_free( dt[d] ); for( j=0; j < 5; j++ ) { magma_event_destroy( event[d][j] ); } } magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); timer_printf( "\n n=%d NB=%d nb=%d\n", (int) n, (int) NB, (int) nb ); timer_printf( " Without memory allocation: %f / %f = %f GFlop/s\n", FLOPS_ZPOTRF(n) / 1e9, time_total, FLOPS_ZPOTRF(n) / 1e9 / time_total ); timer_printf( " Performance %f / %f = %f GFlop/s\n", FLOPS_ZPOTRF(n) / 1e9, time_sum, FLOPS_ZPOTRF(n) / 1e9 / time_sum ); return *info; } /* magma_zpotrf_ooc */
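/*
   Worked example (illustrative numbers) of the big-panel sizing above: for
   n = 32768, nb = 512, ngpu = 2 the formulas give ldda = 32768 and
   lddla = 16384; with freeMem = 6.0e8 complex elements,

       NB = (0.8*6.0e8 - 2*512*32768 - (32768+512)*512) / 16384 = 26208,

   rounded down to a multiple of nb: NB = 26112. Since NB < n, the routine
   runs out-of-core, factoring the matrix in big panels of 26112 columns.
*/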
extern "C" magma_int_t magma_zlauum(char uplo, magma_int_t n, cuDoubleComplex *a, magma_int_t lda, magma_int_t *info) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= ZLAUUM computes the product U * U' or L' * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. If UPLO = 'U' or 'u' then the upper triangle of the result is stored, overwriting the factor U in A. If UPLO = 'L' or 'l' then the lower triangle of the result is stored, overwriting the factor L in A. This is the blocked form of the algorithm, calling Level 3 BLAS. Arguments ========= UPLO (input) CHARACTER*1 Specifies whether the triangular factor stored in the array A is upper or lower triangular: = 'U': Upper triangular = 'L': Lower triangular N (input) INTEGER The order of the triangular factor U or L. N >= 0. A (input/output) COPLEX_16 array, dimension (LDA,N) On entry, the triangular factor U or L. On exit, if UPLO = 'U', the upper triangle of A is overwritten with the upper triangle of the product U * U'; if UPLO = 'L', the lower triangle of A is overwritten with the lower triangle of the product L' * L. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -k, the k-th argument had an illegal value ===================================================================== */ /* Local variables */ char uplo_[2] = {uplo, 0}; magma_int_t ldda, nb; magma_int_t i, ib; cuDoubleComplex c_one = MAGMA_Z_ONE; double d_one = MAGMA_D_ONE; cuDoubleComplex *work; int upper = lapackf77_lsame(uplo_, "U"); *info = 0; if ((! upper) && (! lapackf77_lsame(uplo_, "L"))) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,n)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return */ if ( n == 0 ) return *info; ldda = ((n+31)/32)*32; if (MAGMA_SUCCESS != magma_zmalloc( &work, (n)*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } cudaStream_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); nb = magma_get_zpotrf_nb(n); if (nb <= 1 || nb >= n) lapackf77_zlauum(uplo_, &n, a, &lda, info); else { if (upper) { /* Compute the product U * U'. */ for (i=0; i<n; i=i+nb) { ib=min(nb,n-i); //cublasSetMatrix(ib, (n-i), sizeof(cuDoubleComplex), A(i, i), lda, dA(i, i), ldda); magma_zsetmatrix_async( ib, ib, A(i,i), lda, dA(i, i), ldda, stream[1] ); magma_zsetmatrix_async( ib, (n-i-ib), A(i,i+ib), lda, dA(i,i+ib), ldda, stream[0] ); magma_queue_sync( stream[1] ); magma_ztrmm( MagmaRight, MagmaUpper, MagmaConjTrans, MagmaNonUnit, i, ib, c_one, dA(i,i), ldda, dA(0, i),ldda); lapackf77_zlauum(MagmaUpperStr, &ib, A(i,i), &lda, info); magma_zsetmatrix_async( ib, ib, A(i, i), lda, dA(i, i), ldda, stream[0] ); if (i+ib < n) { magma_zgemm( MagmaNoTrans, MagmaConjTrans, i, ib, (n-i-ib), c_one, dA(0,i+ib), ldda, dA(i, i+ib),ldda, c_one, dA(0,i), ldda); magma_queue_sync( stream[0] ); magma_zherk( MagmaUpper, MagmaNoTrans, ib,(n-i-ib), d_one, dA(i, i+ib), ldda, d_one, dA(i, i), ldda); } magma_zgetmatrix( i+ib, ib, dA(0, i), ldda, A(0, i), lda ); } } else { /* Compute the product L' * L. 
*/ for(i=0; i<n; i=i+nb) { ib=min(nb,n-i); //cublasSetMatrix((n-i), ib, sizeof(cuDoubleComplex), // A(i, i), lda, dA(i, i), ldda); magma_zsetmatrix_async( ib, ib, A(i,i), lda, dA(i, i), ldda, stream[1] ); magma_zsetmatrix_async( (n-i-ib), ib, A(i+ib, i), lda, dA(i+ib, i), ldda, stream[0] ); magma_queue_sync( stream[1] ); magma_ztrmm( MagmaLeft, MagmaLower, MagmaConjTrans, MagmaNonUnit, ib, i, c_one, dA(i,i), ldda, dA(i, 0),ldda); lapackf77_zlauum(MagmaLowerStr, &ib, A(i,i), &lda, info); //cublasSetMatrix(ib, ib, sizeof(cuDoubleComplex), // A(i, i), lda, dA(i, i), ldda); magma_zsetmatrix_async( ib, ib, A(i, i), lda, dA(i, i), ldda, stream[0] ); if (i+ib < n) { magma_zgemm(MagmaConjTrans, MagmaNoTrans, ib, i, (n-i-ib), c_one, dA( i+ib,i), ldda, dA(i+ib, 0),ldda, c_one, dA(i,0), ldda); magma_queue_sync( stream[0] ); magma_zherk(MagmaLower, MagmaConjTrans, ib, (n-i-ib), d_one, dA(i+ib, i), ldda, d_one, dA(i, i), ldda); } magma_zgetmatrix( ib, i+ib, dA(i, 0), ldda, A(i, 0), lda ); } } } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free( work ); return *info; }
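/*
   Note on the device leading dimension above (illustrative):
   ldda = ((n+31)/32)*32 rounds n up to the next multiple of 32 so that
   every column of the device workspace starts on an aligned boundary,
   e.g. n = 100 gives ldda = 128. This is consistent with the "LDDA must
   be divisible by 16" guidance in the other routines' documentation.
*/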
/** Purpose ------- ZPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = MagmaUpper, or dA = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of dA is stored; - = MagmaLower: Lower triangle of dA is stored. @param[in] n INTEGER The order of the matrix dA. N >= 0. @param[in,out] d_lA COMPLEX_16 array of pointers on the GPU, dimension (num_gpus) On entry, the Hermitian matrix dA distributed over GPUs (dl_A[d] points to the local matrix on the d-th GPU). It is distributed in 1D block column or row cyclic (with the block size of nb) if UPLO = MagmaUpper or MagmaLower, respectively. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_zposv_comp ********************************************************************/ extern "C" magma_int_t magma_zpotrf_mgpu_right(magma_int_t num_gpus, magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex **d_lA, magma_int_t ldda, magma_int_t *info ) { #define dlA(id, i, j) (d_lA[(id)] + (j) * ldda + (i)) #define dlP(id, i, j) (d_lP[(id)] + (j) * ldda + (i)) #define panel(j) (panel + (j)) #define tmppanel(j) (tmppanel + (j)) #define tmpprevpanel(j) (tmpprevpanel + (j)) #define STREAM_ID(i) (num_streams > 1 ? 1+((i)/nb)%(num_streams-1) : 0) magmaDoubleComplex z_one = MAGMA_Z_MAKE( 1.0, 0.0 ); magmaDoubleComplex mz_one = MAGMA_Z_MAKE( -1.0, 0.0 ); double one = 1.0; double m_one = -1.0; const char* uplo_ = lapack_uplo_const( uplo ); magma_int_t j, nb, d, id, j_local, blkid, crosspoint, prevj, prevtrsmrows, num_streams = 5; magmaDoubleComplex *panel, *tmppanel0, *tmppanel1, *tmppanel, *tmpprevpanel; magmaDoubleComplex *d_lP[MagmaMaxGPUs], *dlpanel, *dlpanels[MagmaMaxGPUs]; magma_int_t rows, trsmrows, ngpu, n_local[MagmaMaxGPUs], ldpanel; magma_queue_t stream[MagmaMaxGPUs][10]; *info = 0; if ( uplo != MagmaUpper && uplo != MagmaLower ) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); nb = magma_get_zpotrf_nb(n); ldpanel = ldda; magma_setdevice(0); if (MAGMA_SUCCESS != magma_zmalloc_pinned( &panel, 2 * nb * ldpanel )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } tmppanel0 = panel; tmppanel1 = tmppanel0 + nb * ldpanel; if ((nb <= 1) || (nb >= n)) { // Use unblocked code. 
magma_zgetmatrix( n, n, dlA(0, 0, 0), ldda, panel, ldpanel); lapackf77_zpotrf( uplo_, &n, panel, &ldpanel, info); magma_zsetmatrix( n, n, panel, ldpanel, dlA(0, 0, 0), ldda ); } else { for( d = 0; d < num_gpus; d++ ) { // local-n and local-ld n_local[d] = ((n / nb) / num_gpus) * nb; if (d < (n / nb) % num_gpus) n_local[d] += nb; else if (d == (n / nb) % num_gpus) n_local[d] += n % nb; magma_setdevice(d); magma_device_sync(); if (MAGMA_SUCCESS != magma_zmalloc( &d_lP[d], nb * ldda )) { for( j = 0; j < d; j++ ) { magma_setdevice(j); magma_free( d_lP[d] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } for( j=0; j < num_streams; j++ ) { magma_queue_create( &stream[d][j] ); } } //#define ENABLE_TIMER #if defined (ENABLE_TIMER) real_Double_t therk[4], tmtc, tcchol, tctrsm, tctm, tmnp, tcnp; real_Double_t ttot_herk[4] = {0,0,0,0}, ttot_mtc = 0, ttot_cchol = 0, ttot_ctrsm = 0, ttot_ctm = 0, ttot_mnp = 0, ttot_cnp = 0; printf("\n\n %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n", "j", "nb", "row", "mtc", "CPU_np", "panel", "ctrsm", "CH+TRSM", "CPU", "dsyrk[0]", "dsyrk[1]", "dsyrk[2]", "dsyrk[3]", "ctm P", "gpu_np"); printf(" ====================================================================================================\n"); #endif // Use blocked code. if (uplo == MagmaUpper) { printf( " === not supported, yet ===\n" ); } else { blkid = -1; if (num_gpus == 4) crosspoint = n; else if (num_gpus == 3) crosspoint = n; else if (num_gpus == 2) crosspoint = 20160; else crosspoint = 0; crosspoint = 0; //n; //n -- > gpu always does next panel, 0 --> cpu always does next panel crosspoint = n; #if defined (ENABLE_TIMER) real_Double_t tget = get_time(), tset = 0.0, ttot = 0.0; #endif if ( n > nb ) { // send first panel to cpu magma_setdevice(0); tmppanel = tmppanel0; magma_zgetmatrix_async(n, nb, dlA(0, 0, 0), ldda, tmppanel(0), ldpanel, stream[0][0] ); } #if defined (ENABLE_TIMER) for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magma_device_sync(); } tget = get_time()-tget; #endif // Compute the Cholesky factorization A = L*L' for (j = 0; (j + nb) < n; j += nb) { #if defined (ENABLE_TIMER) therk[0] = therk[1] = therk[2] = therk[3] = tmtc = tcchol = tctrsm = tctm = tmnp = tcnp = 0.0; #endif blkid += 1; tmppanel = (blkid % 2 == 0) ? tmppanel0 : tmppanel1; // Set the gpu number that holds the current panel id = (j / nb) % num_gpus; magma_setdevice(id); // Set the local index where the current panel is j_local = j / (nb * num_gpus) * nb; rows = n - j; // Wait for the panel on cpu magma_queue_sync( stream[id][0] ); if (j > 0 && prevtrsmrows > crosspoint) { #if defined (ENABLE_TIMER) tcnp = get_time(); #endif tmpprevpanel = ((blkid - 1) % 2) == 0 ? 
tmppanel0 : tmppanel1; blasf77_zgemm( MagmaNoTransStr, MagmaConjTransStr, &rows, &nb, &nb, &mz_one, tmpprevpanel(j), &ldpanel, tmpprevpanel(j), &ldpanel, &z_one, tmppanel(j), &ldpanel ); #if defined (ENABLE_TIMER) tcnp = get_time() - tcnp; ttot_cnp += tcnp; #endif } #if defined (ENABLE_TIMER) tcchol = get_time(); #endif lapackf77_zpotrf(MagmaLowerStr, &nb, tmppanel(j), &ldpanel, info); if (*info != 0) { *info = *info + j; break; } #if defined (ENABLE_TIMER) tcchol = get_time() - tcchol; ttot_cchol += tcchol; tctrsm = get_time(); #endif trsmrows = rows - nb; if (trsmrows > 0) { blasf77_ztrsm(MagmaRightStr, MagmaLowerStr, MagmaConjTransStr, MagmaNonUnitStr, &trsmrows, &nb, &z_one, tmppanel(j), &ldpanel, tmppanel(j + nb), &ldpanel); } #if defined (ENABLE_TIMER) tctrsm = get_time() - tctrsm; ttot_ctrsm += tctrsm; tctm = get_time(); #endif d = (id + 1) % num_gpus; // send current panel to gpus for (ngpu = 0; ngpu < num_gpus; ngpu++, d = (d + 1) % num_gpus ) { magma_int_t myrows = 0; magma_int_t row_offset = 0; if ( d == id ) { dlpanel = dlA(d, j, j_local); myrows = rows; row_offset = 0; } else { dlpanel = dlP(d, 0, 0); myrows = trsmrows; row_offset = nb; } if (myrows > 0) { magma_setdevice(d); magma_zsetmatrix_async(myrows, nb, tmppanel(j + row_offset), ldpanel, dlpanel, ldda, stream[d][0] ); } } /* make sure panel is on GPUs */ d = (id + 1) % num_gpus; for (ngpu = 0; ngpu < num_gpus; ngpu++, d = (d + 1) % num_gpus ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); } #if defined (ENABLE_TIMER) tctm = get_time() - tctm; ttot_ctm += tctm; #endif if ( (j + nb) < n) { magma_int_t offset = 0; magma_int_t row_offset = 0; if (j + nb + nb < n) { d = (id + 1) % num_gpus; magma_setdevice(d); magma_int_t j_local2 = (j + nb) / (nb * num_gpus) * nb; if (trsmrows <= crosspoint) { #if defined (ENABLE_TIMER) tmnp = get_time(); #endif // do gemm on look ahead panel if ( d == id ) { dlpanel = dlA(d, j + nb, j_local); } else { dlpanel = dlP(d, 0, 0); } magmablasSetKernelStream(stream[d][STREAM_ID(j_local2)]); #define ZHERK_ON_DIAG #ifdef ZHERK_ON_DIAG magma_zherk( MagmaLower, MagmaNoTrans, nb, nb, m_one, dlpanel, ldda, one, dlA(d, j + nb, j_local2), ldda); magma_zgemm( MagmaNoTrans, MagmaConjTrans, trsmrows-nb, nb, nb, mz_one, dlpanel+nb, ldda, dlpanel, ldda, z_one, dlA(d, j + nb +nb, j_local2), ldda); #else magma_zgemm( MagmaNoTrans, MagmaConjTrans, trsmrows, nb, nb, mz_one, dlpanel, ldda, dlpanel, ldda, z_one, dlA(d, j + nb, j_local2), ldda); #endif #if defined (ENABLE_TIMER) magma_device_sync(); tmnp = get_time() - tmnp; ttot_mnp += tmnp; #endif } // send next panel to cpu magma_queue_sync( stream[d][STREAM_ID(j_local2)] ); // make sure lookahead is done tmppanel = ((blkid+1) % 2 == 0) ? tmppanel0 : tmppanel1; magma_zgetmatrix_async(rows-nb, nb, dlA(d, j+nb, j_local2), ldda, tmppanel(j+nb), ldpanel, stream[d][0] ); tmppanel = (blkid % 2 == 0) ? 
                                  tmppanel0 : tmppanel1;
                offset     = j + nb + nb;
                row_offset = nb;
            } else {
                offset     = j + nb;
                row_offset = 0;
            }

            if (n - offset > 0) {
                // syrk on multiple gpus
                for (d = 0; d < num_gpus; d++) {
                    if (d == id) {
                        dlpanels[d] = dlA(d, j + nb + row_offset, j_local);
                    } else {
                        dlpanels[d] = dlP(d, row_offset, 0);
                    }
                }

                #if defined (ENABLE_TIMER)
                for( d=0; d < num_gpus; d++ ) therk[d] = get_time();
                #endif

                //magmablasSetKernelStream(stream[d]);
                //magma_zherk(MagmaLower, MagmaNoTrans, n - offset, nb,
                //            m_one, dlpanel, ldda,
                //            one, &d_lA[d][offset + offset*ldda], ldda );
                #ifdef ZHERK_ON_DIAG
                magma_zherk_mgpu
                #else
                magma_zherk_mgpu2
                #endif
                    (num_gpus, MagmaLower, MagmaNoTrans,
                     nb, n - offset, nb,
                     m_one, dlpanels, ldda, 0,
                     one,   d_lA,     ldda, offset,
                     num_streams, stream );

                #if defined (ENABLE_TIMER)
                for( d=0; d < num_gpus; d++ ) {
                    magma_setdevice(d);
                    magma_device_sync();
                    therk[d] = get_time() - therk[d];
                    ttot_herk[d] += therk[d];
                }
                #endif
            }

            prevtrsmrows = trsmrows;
            prevj        = j;

            #if defined (ENABLE_TIMER)
            ttot += (tcnp+tcchol+tctrsm+therk[0]+therk[1]+therk[2]+therk[3]+tctm+tmnp);
            printf("%10d %10d %10d %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf(%d) %10.3lf\n",
                   j, nb, rows, tmtc,
                   tcnp,                                    // gemm
                   tcchol,                                  // potrf
                   tctrsm,                                  // trsm
                   (tcchol + tctrsm),
                   (tmtc+tcnp+tcchol+tctrsm),
                   therk[0], therk[1], therk[2], therk[3],  // syrk
                   tctm,                                    // copy panel to GPU
                   tmnp,                                    // lookahead on GPU
                   (id + 1) % num_gpus,
                   (tcnp+tcchol+tctrsm+therk[0]+therk[1]+therk[2]+therk[3]+tctm+tmnp));
            fflush(0);
            #endif
        }
    }

    for( d = 0; d < num_gpus; d++ ) {
        magma_setdevice(d);
        for( id=0; id < num_streams; id++ ) {
            magma_queue_sync( stream[d][id] );
        }
    }

    #if defined (ENABLE_TIMER)
    printf("\n%10d %10d %10d %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf(-) %10.3lf\n",
           n, n, 0, ttot_mtc,
           ttot_cnp,                                                // gemm
           ttot_cchol,                                              // potrf
           ttot_ctrsm,                                              // trsm
           (ttot_cchol + ttot_ctrsm),
           (ttot_mtc+ttot_cnp+ttot_cchol+ttot_ctrsm),
           ttot_herk[0], ttot_herk[1], ttot_herk[2], ttot_herk[3],  // syrk
           ttot_ctm,                                                // copy panel to GPU
           ttot_mnp,                                                // lookahead on GPU
           (ttot_cnp+ttot_cchol+ttot_ctrsm+ttot_herk[0]+ttot_herk[1]+ttot_herk[2]+ttot_herk[3]+ttot_ctm+ttot_mnp));
    printf("%10d %10d %10d %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf(-) %10.3lf (ratio)\n",
           n, n, 0, ttot_mtc/ttot,
           ttot_cnp/ttot,                                           // gemm
           ttot_cchol/ttot,                                         // potrf
           ttot_ctrsm/ttot,                                         // trsm
           (ttot_cchol + ttot_ctrsm)/ttot,
           (ttot_mtc+ttot_cnp+ttot_cchol+ttot_ctrsm)/ttot,
           ttot_herk[0]/ttot, ttot_herk[1]/ttot, ttot_herk[2]/ttot, ttot_herk[3]/ttot,  // syrk
           ttot_ctm/ttot,                                           // copy panel to GPU
           ttot_mnp/ttot,                                           // lookahead on GPU
           (ttot_cnp+ttot_cchol+ttot_ctrsm+ttot_herk[0]+ttot_herk[1]+ttot_herk[2]+ttot_herk[3]+ttot_ctm+ttot_mnp)/ttot);
    #endif

    // Cholesky factorization of the last (possibly partial) block on the CPU
    if (j < n && *info == 0) {
        rows = n - j;
        id   = (j / nb) % num_gpus;

        // set the local index where the current panel is
        j_local = j / (nb * num_gpus) * nb;
        magma_setdevice(id);

        #if defined (ENABLE_TIMER)
        tset = get_time();
        #endif
        magma_zgetmatrix(rows, rows, dlA(id, j, j_local), ldda, panel(j), ldpanel);
        lapackf77_zpotrf(MagmaLowerStr, &rows, panel(j), &ldpanel, info);
        if (*info != 0)
            *info = *info + j;  // report the global order of the failing minor
        magma_zsetmatrix(rows, rows, panel(j), ldpanel, dlA(id, j, j_local), ldda);
        #if defined (ENABLE_TIMER)
        tset = get_time() - tset;
        #endif
    }

    #if defined (ENABLE_TIMER)
    printf( " matrix_get,set: %10.3lf %10.3lf -> %10.3lf\n", tget, tset, ttot+tget+tset );
    #endif
    } // end of else not upper

    // clean up
    for( d = 0; d < num_gpus; d++ ) {
        magma_setdevice(d);
        for( j=0; j < num_streams; j++ ) {
            magma_queue_destroy( stream[d][j] );
        }
        magma_free( d_lP[d] );
    }
    } // end of not lapack

    // free workspace
    magma_free_pinned( panel );
    magma_setdevice( orig_dev );
    magmablasSetKernelStream( orig_stream );

    return *info;
} /* magma_zpotrf_mgpu_right */
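/*
   Illustration (not part of MAGMA): the multi-GPU routines above assume a
   1-D block-cyclic distribution of block columns, which is why the
   last-block cleanup computes id = (j/nb) % num_gpus and
   j_local = (j/(nb*num_gpus))*nb. The standalone sketch below, with
   hypothetical helper names, prints that mapping for a few block columns.
*/
#if 0
#include <stdio.h>

/* owner_gpu / local_column are illustrative helpers, not MAGMA API */
static int owner_gpu(int j, int nb, int num_gpus)
{
    return (j / nb) % num_gpus;          /* GPU owning block column j/nb      */
}

static int local_column(int j, int nb, int num_gpus)
{
    return (j / (nb * num_gpus)) * nb;   /* first local column on that GPU    */
}

int main(void)
{
    int nb = 64, num_gpus = 3, j;
    for (j = 0; j < 8*nb; j += nb)
        printf("global col %4d -> GPU %d, local col %4d\n",
               j, owner_gpu(j, nb, num_gpus), local_column(j, nb, num_gpus));
    return 0;
}
#endif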
extern "C" void magma_zherk_mgpu( magma_int_t num_gpus, magma_uplo_t uplo, magma_trans_t trans, magma_int_t nb, magma_int_t n, magma_int_t k, double alpha, magmaDoubleComplex **db, magma_int_t lddb, magma_int_t offset_b, double beta, magmaDoubleComplex **dc, magma_int_t lddc, magma_int_t offset, magma_int_t num_streams, magma_queue_t stream[][10]) { #define dB(id, i, j) (db[(id)]+(j)*lddb + (i)+offset_b) #define dC(id, i, j) (dc[(id)]+(j)*lddc + (i)) const char* uplo_ = lapack_uplo_const( uplo ); magma_int_t i, id, ib, ii, kk, n1; magmaDoubleComplex z_alpha = MAGMA_Z_MAKE(alpha,0.0); magmaDoubleComplex z_beta = MAGMA_Z_MAKE(beta, 0.0); magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); /* diagonal update */ for( i=0; i < n; i += nb ) { id = ((i+offset)/nb)%num_gpus; kk = STREAM_ID( i+offset ); ib = min(nb, n-i); ii = nb*((i+offset)/(nb*num_gpus)); /* zher2k on diagonal block */ magma_setdevice(id); magmablasSetKernelStream(stream[id][kk]); trace_gpu_start( id, kk, "syr2k", "syr2k" ); magma_zherk(uplo, trans, ib, k, alpha, dB(id, i, 0 ), lddb, beta, dC(id, i+offset, ii), lddc); trace_gpu_end( id, kk ); } /* off-diagonal update */ if (uplo == MagmaUpper) { for( i=nb; i < n; i += nb ) { id = ((i+offset)/nb)%num_gpus; kk = STREAM_ID( i+offset ); ib = min(nb, n-i); ii = nb*((i+offset)/(nb*num_gpus)); magma_setdevice(id); magmablasSetKernelStream(stream[id][kk]); magma_zgemm(MagmaNoTrans, MagmaConjTrans, i, ib, k, z_alpha, dB(id, 0, 0 ), lddb, dB(id, i, 0 ), lddb, z_beta, dC(id, 0, ii), lddc); } } else { for( i=0; i < n-nb; i += nb ) { id = ((i+offset)/nb)%num_gpus; kk = STREAM_ID( i+offset ); ib = min(nb, n-i); ii = nb*((i+offset)/(nb*num_gpus)); n1 = n-i-ib; /* zgemm on off-diagonal blocks */ magma_setdevice(id); magmablasSetKernelStream(stream[id][kk]); trace_gpu_start( id, kk, "gemm_up", "gemm_up" ); magma_zgemm(MagmaNoTrans, MagmaConjTrans, n1, ib, k, z_alpha, dB(id, i+ib, 0 ), lddb, dB(id, i, 0 ), lddb, z_beta, dC(id, i+offset+ib, ii), lddc); trace_gpu_end( id, kk ); } } // TODO why not sync? //for( id=0; id < num_gpus; id++ ) { // magma_setdevice(id); // //for( kk=0; kk < num_streams; kk++ ) // // magma_queue_sync(stream[id][kk]); //} magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); }
magma_int_t
magma_zsytrf_stapiv_gpu(char uplo, magma_int_t n,
                        cuDoubleComplex *dA, magma_int_t ldda,
                        double criteria, PASTIX_INT *npiv, PASTIX_FLOAT *tmp4,
                        magma_int_t *info)
{
/*  -- MAGMA (version 1.2.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       May 2012

    Purpose
    =======
    ZSYTRF_STAPIV computes a factorization of a complex symmetric matrix
    dA, using PaStiX's static-pivoting kernel for the diagonal blocks.
    The factorization has the form
       dA = U**T * U,  if UPLO = 'U', or
       dA = L * L**T,  if UPLO = 'L',
    where U is an upper triangular matrix and L is lower triangular.
    Only UPLO = 'L' is supported by the PaStiX kernel.
    This is the block version of the algorithm, calling Level 3 BLAS.

    Arguments
    =========
    UPLO    (input) CHARACTER*1
            = 'U': Upper triangle of dA is stored;
            = 'L': Lower triangle of dA is stored.

    N       (input) INTEGER
            The order of the matrix dA. N >= 0.

    dA      (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N)
            On entry, the symmetric matrix dA. If UPLO = 'U', the leading
            N-by-N upper triangular part of dA contains the upper
            triangular part of the matrix dA, and the strictly lower
            triangular part of dA is not referenced. If UPLO = 'L', the
            leading N-by-N lower triangular part of dA contains the lower
            triangular part of the matrix dA, and the strictly upper
            triangular part of dA is not referenced.

            On exit, if INFO = 0, the factor U or L from the
            factorization dA = U**T * U or dA = L * L**T.

    LDDA    (input) INTEGER
            The leading dimension of the array dA. LDDA >= max(1,N).
            To benefit from coalescent memory accesses LDDA must be
            divisible by 16.

    INFO    (output) INTEGER
            = 0: successful exit
            < 0: if INFO = -i, the i-th argument had an illegal value
            > 0: if INFO = i, the factorization of the leading minor of
                 order i could not be completed.
    =====================================================================   */

#define dA(i_, j_) (dA + (i_) + (j_)*ldda)   /* column-major indexing into dA */

    magma_int_t     j, jb, nb;
    char            uplo_[2] = {uplo, 0};
    cuDoubleComplex c_one     = MAGMA_Z_ONE;
    cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    cuDoubleComplex *work;
    double          d_one     =  1.0;
    double          d_neg_one = -1.0;
    long int        upper = lapackf77_lsame(uplo_, "U");

    *info = 0;
    if ( (! upper) && (! lapackf77_lsame(uplo_, "L")) ) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,n)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    nb = magma_get_zsytrf_nb(n);

    if (MAGMA_SUCCESS != magma_zmalloc_host( &work, nb*nb )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }

    static cudaStream_t stream[2];
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );

    if ((nb <= 1) || (nb >= n)) {
        /* Use unblocked code. */
        magma_zgetmatrix( n, n, dA, ldda, work, n );
        assert(!upper); /* PaStiX only works with lower */
        PASTIX_sytrf_block((PASTIX_FLOAT*)work, n, n, npiv, criteria, tmp4);
        magma_zsetmatrix( n, n, work, n, dA, ldda );
    }
    else {
        /* Use blocked code. */
        if (upper) {
            assert(0); /* PaStiX only works with lower */
            /* Compute the factorization A = U'*U. */
            for (j=0; j < n; j += nb) {
                /* Update and factorize the current diagonal block and
                   test for failure. */
                jb = min(nb, (n-j));
                magma_zherk(MagmaUpper, MagmaConjTrans, jb, j,
                            d_neg_one, dA(0, j), ldda,
                            d_one,     dA(j, j), ldda);
                magma_zgetmatrix_async( jb, jb, dA(j, j), ldda, work, jb, stream[1] );

                if ( (j+jb) < n) {
                    /* Compute the current block row. */
                    magma_zgemm(MagmaConjTrans, MagmaNoTrans,
                                jb, (n-j-jb), j,
                                c_neg_one, dA(0, j   ), ldda,
                                           dA(0, j+jb), ldda,
                                c_one,     dA(j, j+jb), ldda);
                }

                magma_queue_sync( stream[1] );
                /* lapackf77_zsytrf(MagmaUpperStr, &jb, work, &jb, info); */
                magma_zsetmatrix_async( jb, jb, work, jb, dA(j, j), ldda, stream[0] );
                if (*info != 0) {
                    *info = *info + j;
                    break;
                }

                if ( (j+jb) < n) {
                    magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                 jb, (n-j-jb),
                                 c_one, dA(j, j   ), ldda,
                                        dA(j, j+jb), ldda);
                }
            }
        }
        else {
            //=========================================================
            // Compute the factorization A = L*L'.
            for (j=0; j < n; j += nb) {
                // Update and factorize the current diagonal block and
                // test for failure.
                jb = min(nb, (n-j));
                magma_zsyrk(MagmaLower, MagmaNoTrans, jb, j,
                            c_neg_one, dA(j, 0), ldda,
                            c_one,     dA(j, j), ldda);
                magma_zgetmatrix_async( jb, jb, dA(j, j), ldda, work, jb, stream[1] );

                if ( (j+jb) < n) {
                    // symmetric (not Hermitian) case: transpose, not conjugate-transpose
                    magma_zgemm( MagmaNoTrans, MagmaTrans,
                                 (n-j-jb), jb, j,
                                 c_neg_one, dA(j+jb, 0), ldda,
                                            dA(j,    0), ldda,
                                 c_one,     dA(j+jb, j), ldda);
                }

                magma_queue_sync( stream[1] );
                PASTIX_sytrf_block((PASTIX_FLOAT*)work, jb, jb, npiv, criteria, tmp4);
                magma_zsetmatrix_async( jb, jb, work, jb, dA(j, j), ldda, stream[0] );
                if (*info != 0) {
                    *info = *info + j;
                    break;
                }

                if ( (j+jb) < n) {
                    magma_ztrsm(MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit,
                                (n-j-jb), jb,
                                c_one, dA(j,    j), ldda,
                                       dA(j+jb, j), ldda);
                }
            }
        }
    }

    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[1] );
    magma_free_host( work );

    return *info;
} /* magma_zsytrf_stapiv_gpu */
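/*
   Usage sketch (not part of the library): a minimal call sequence for
   magma_zsytrf_stapiv_gpu, assuming MAGMA is initialized and the PaStiX
   types are available. The size of the tmp4 scratch array and the
   pivoting threshold are assumptions for illustration; consult the
   PaStiX kernel for the actual requirements.
*/
#if 0
magma_int_t      n = 1024, info;
magma_int_t      ldda = ((n + 15) / 16) * 16;  /* keep LDDA divisible by 16    */
PASTIX_INT       npiv = 0;                     /* count of static pivots       */
PASTIX_FLOAT     tmp4[4];                      /* PaStiX scratch (size assumed) */
cuDoubleComplex *dA;

magma_zmalloc( &dA, ldda*n );                  /* device allocation */
/* ... upload a lower-stored complex symmetric matrix into dA ... */
magma_zsytrf_stapiv_gpu( 'L', n, dA, ldda, 1e-12 /* criteria, assumed */,
                         &npiv, tmp4, &info );
if (info != 0) {
    /* factorization stopped in the leading block of order `info` */
}
magma_free( dA );
#endif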