/**
    Purpose
    -------
    SORGQR generates an M-by-N REAL matrix Q with orthonormal columns,
    which is defined as the first N columns of a product of K elementary
    reflectors of order M

        Q  =  H(1) H(2) . . . H(k)

    as returned by SGEQRF.

    This multi-GPU version distributes the columns of Q over the devices
    reported by magma_num_gpus() in a 1D block-cyclic layout with block
    size nb. The trailing block is formed on the CPU; the leading blocks are
    applied on the GPUs with magma_slarfb_gpu, and the result is gathered
    back into A.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix Q. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix Q. M >= N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines the
            matrix Q. N >= K >= 0.

    @param[in,out]
    A       REAL array A, dimension (LDA,N).
            On entry, the i-th column must contain the vector
            which defines the elementary reflector H(i), for
            i = 1,2,...,k, as returned by SGEQRF_GPU in the
            first k columns of its array argument A.
            On exit, the M-by-N matrix Q.

    @param[in]
    lda     INTEGER
            The first dimension of the array A. LDA >= max(1,M).

    @param[in]
    tau     REAL array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by SGEQRF_GPU.

    @param[in]
    T       REAL array, dimension (NB, min(M,N)).
            T contains the T matrices used in blocking the elementary
            reflectors H(i), e.g., this can be the 6th argument of
            magma_sgeqrf_gpu (except stored on the CPU, not the GPU).

    @param[in]
    nb      INTEGER
            This is the block size used in SGEQRF_GPU, and correspondingly
            the size of the T matrices, used in the factorization, and
            stored in T. NB >= 1.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value;
                  MAGMA_ERR_DEVICE_ALLOC or MAGMA_ERR_HOST_ALLOC if a
                  workspace allocation failed.

    @ingroup magma_sgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_sorgqr_m(
    magma_int_t m, magma_int_t n, magma_int_t k,
    float *A, magma_int_t lda,
    float *tau,
    float *T, magma_int_t nb,
    magma_int_t *info)
{
#define  A(i,j)   ( A    + (i) + (j)*lda )
#define dA(d,i,j) (dA[d] + (i) + (j)*ldda)
#define dT(d,i,j) (dT[d] + (i) + (j)*nb)

    float c_zero = MAGMA_S_ZERO;
    float c_one  = MAGMA_S_ONE;

    magma_int_t m_kk, n_kk, k_kk, mi;
    // nbw is declared here (uninitialized) rather than at first use so that
    // the "goto cleanup" jumps below do not cross an initialization.
    magma_int_t lwork, ldwork, nbw;
    magma_int_t d, i, ib, j, jb, ki, kk;
    float *work=NULL;

    // Check arguments. nb is validated because it is used as a divisor below.
    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if ((n < 0) || (n > m)) {
        *info = -2;
    } else if ((k < 0) || (k > n)) {
        *info = -3;
    } else if (lda < max(1,m)) {
        *info = -5;
    } else if (nb < 1) {
        *info = -8;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    // Quick return: nothing to generate.
    if (n <= 0) {
        return *info;
    }

    magma_int_t di, dn;
    magma_int_t dpanel;

    magma_int_t ngpu = magma_num_gpus();

    // Remember the caller's device so it can be restored on exit.
    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );

    // Allocate memory on GPUs for A and workspaces
    magma_int_t ldda    = magma_roundup( m, 32 );
    magma_int_t lddwork = magma_roundup( n, 32 );
    magma_int_t min_lblocks = (n / nb) / ngpu;  // min. full blocks per gpu
    magma_int_t last_dev    = (n / nb) % ngpu;  // device with last (partial) block

    magma_int_t  nlocal[ MagmaMaxGPUs ] = { 0 };
    float *dA[ MagmaMaxGPUs ] = { NULL };
    float *dT[ MagmaMaxGPUs ] = { NULL };
    float *dV[ MagmaMaxGPUs ] = { NULL };
    float *dW[ MagmaMaxGPUs ] = { NULL };
    magma_queue_t queues[ MagmaMaxGPUs ] = { NULL };

    for( d = 0; d < ngpu; ++d ) {
        // example with n = 75, nb = 10, ngpu = 3
        // min_lblocks = 2
        // last_dev    = 1
        // gpu 0: nlocal = 30, cols:  0- 9, 30-39, 60-69
        // gpu 1: nlocal = 25, cols: 10-19, 40-49, 70-74 (partial)
        // gpu 2: nlocal = 20, cols: 20-29, 50-59
        magma_setdevice( d );
        nlocal[d] = min_lblocks*nb;
        if ( d < last_dev ) {
            nlocal[d] += nb;
        }
        else if ( d == last_dev ) {
            nlocal[d] += (n % nb);
        }

        // One device allocation per GPU, carved into dA | dT | dV | dW.
        ldwork = nlocal[d]*ldda  // dA
               + nb*m            // dT
               + nb*ldda         // dV
               + nb*lddwork;     // dW
        if ( MAGMA_SUCCESS != magma_smalloc( &dA[d], ldwork )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            goto cleanup;
        }
        dT[d] = dA[d] + nlocal[d]*ldda;
        dV[d] = dT[d] + nb*m;
        dW[d] = dV[d] + nb*ldda;

        magma_queue_create( d, &queues[d] );
    }

    trace_init( 1, ngpu, 1, queues );

    // first kk columns are handled by blocked method.
    // ki is start of 2nd-to-last block
    if ((nb > 1) && (nb < k)) {
        ki = (k - nb - 1) / nb * nb;
        kk = min(k, ki + nb);
    } else {
        ki = 0;
        kk = 0;
    }

    // Allocate CPU work space
    // n*nbw   for larfb work
    // nbw*nbw for T
    // m*nbw   for V
    // The unblocked tail below applies k - kk reflectors in one shot. That
    // count never exceeds nb when nb > 1, but it equals k when nb == 1
    // (kk == 0), so the buffers are sized with the larger of the two. For
    // nb >= 2 this is exactly nb and the layout is unchanged.
    nbw   = max( nb, k - kk );
    lwork = (n + m + nbw) * nbw;
    magma_smalloc_cpu( &work, lwork );
    if (work == NULL) {
        *info = MAGMA_ERR_HOST_ALLOC;
        goto cleanup;
    }
    float *work_T, *work_V;
    work_T = work + n*nbw;
    work_V = work + n*nbw + nbw*nbw;

    // Use unblocked code for the last or only block.
    if (kk < n) {
        trace_cpu_start( 0, "ungqr", "ungqr last block" );
        m_kk = m - kk;
        n_kk = n - kk;
        k_kk = k - kk;

        // sorgqr requires less workspace (n*nb), but is slow if k < sorgqr's block size.
        // replacing it with the 4 routines below is much faster (e.g., 60x).
        // 1) save the reflectors V, 2) overwrite A(kk:m,kk:n) with identity,
        // 3) build T from V and tau, 4) apply the block reflector to the identity.
        lapackf77_slacpy( MagmaFullStr, &m_kk, &k_kk, A(kk,kk), &lda, work_V, &m_kk);
        lapackf77_slaset( MagmaFullStr, &m_kk, &n_kk, &c_zero, &c_one, A(kk, kk), &lda );

        lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr,
                          &m_kk, &k_kk,
                          work_V, &m_kk, &tau[kk], work_T, &k_kk);
        lapackf77_slarfb( MagmaLeftStr, MagmaNoTransStr, MagmaForwardStr, MagmaColumnwiseStr,
                          &m_kk, &n_kk, &k_kk,
                          work_V, &m_kk, work_T, &k_kk, A(kk, kk), &lda, work, &n_kk );

        if (kk > 0) {
            // Upload the finished trailing columns to their owning GPUs.
            for( j=kk; j < n; j += nb ) {
                jb = min( n-j, nb );
                d  =  (j / nb) % ngpu;          // owning device
                di = ((j / nb) / ngpu) * nb;    // local column offset on that device
                magma_setdevice( d );
                magma_ssetmatrix( m_kk, jb,
                                  A(kk, j),      lda,
                                  dA(d, kk, di), ldda, queues[d] );

                // Set A(1:kk,kk+1:n) to zero.
                magmablas_slaset( MagmaFull, kk, jb, c_zero, c_zero, dA(d, 0, di), ldda, queues[d] );
            }
        }
        trace_cpu_end( 0 );
    }

    if (kk > 0) {
        // Use blocked code
        // send T to all GPUs
        for( d = 0; d < ngpu; ++d ) {
            magma_setdevice( d );
            trace_gpu_start( d, 0, "set", "set T" );
            magma_ssetmatrix_async( nb, min(m,n), T, nb, dT[d], nb, queues[d] );
            trace_gpu_end( d, 0 );
        }

        // queue: set Aii (V) --> laset --> laset --> larfb --> [next]
        // CPU has no computation
        for( i = ki; i >= 0; i -= nb ) {
            ib = min(nb, k - i);
            mi = m - i;
            dpanel =  (i / nb) % ngpu;          // device owning this panel
            di     = ((i / nb) / ngpu) * nb;    // local column offset of the panel

            // Send current panel to dV on the GPUs
            lapackf77_slaset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda );
            for( d = 0; d < ngpu; ++d ) {
                magma_setdevice( d );
                trace_gpu_start( d, 0, "set", "set V" );
                magma_ssetmatrix_async( mi, ib,
                                        A(i, i), lda,
                                        dV[d],   ldda, queues[d] );
                trace_gpu_end( d, 0 );
            }

            // set panel to identity
            magma_setdevice( dpanel );
            trace_gpu_start( dpanel, 0, "laset", "laset" );
            magmablas_slaset( MagmaFull, i,  ib, c_zero, c_zero, dA(dpanel, 0, di), ldda, queues[dpanel] );
            magmablas_slaset( MagmaFull, mi, ib, c_zero, c_one,  dA(dpanel, i, di), ldda, queues[dpanel] );
            trace_gpu_end( dpanel, 0 );

            if (i < n) {
                // Apply H to A(i:m,i:n) from the left
                for( d = 0; d < ngpu; ++d ) {
                    magma_setdevice( d );
                    magma_indices_1D_bcyclic( nb, ngpu, d, i, n, &di, &dn );
                    trace_gpu_start( d, 0, "larfb", "larfb" );
                    magma_slarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise,
                                      mi, dn-di, ib,
                                      dV[d],        ldda, dT(d,0,i), nb,
                                      dA(d, i, di), ldda, dW[d], lddwork, queues[d] );
                    trace_gpu_end( d, 0 );
                }
            }
        }

        // copy result back to CPU
        trace_cpu_start( 0, "get", "get A" );
        magma_sgetmatrix_1D_col_bcyclic( m, n, dA, ldda, A, lda, ngpu, nb, queues );
        trace_cpu_end( 0 );
    }

    #ifdef TRACING
    char name[80];
    // magma_int_t may be wider than int; cast so the arguments match "%d".
    snprintf( name, sizeof(name), "sorgqr-n%d-ngpu%d.svg", (int) m, (int) ngpu );
    trace_finalize( name, "trace.css" );
    #endif

cleanup:
    // Entries for devices that were never reached are still NULL
    // from the initializers above.
    for( d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magma_free( dA[d] );
        magma_queue_destroy( queues[d] );
    }
    magma_free_cpu( work );
    magma_setdevice( orig_dev );

    return *info;
} /* magma_sorgqr */
extern "C" magma_int_t magma_zpotrf2_mgpu(int num_gpus, char uplo, magma_int_t m, magma_int_t n, magma_int_t off_i, magma_int_t off_j, magma_int_t nb, magmaDoubleComplex **d_lA, magma_int_t ldda, magmaDoubleComplex **d_lP, magma_int_t lddp, magmaDoubleComplex *a, magma_int_t lda, magma_int_t h, magma_queue_t stream[][3], magma_event_t event[][5], magma_int_t *info ) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= ZPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = 'U', or dA = L * L**H, if UPLO = 'L', where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of dA is stored; = 'L': Lower triangle of dA is stored. N (input) INTEGER The order of the matrix dA. N >= 0. dA (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the Hermitian matrix dA. If UPLO = 'U', the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be dividable by 16. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. 
===================================================================== */ magma_int_t j, jb, nb0, nb2, dd, d, id, j_local, j_local2, buf; char uplo_[2] = {uplo, 0}; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; double d_one = 1.0; double d_neg_one = -1.0; int upper = lapackf77_lsame(uplo_, "U"); magmaDoubleComplex *dlpanel; //magma_event_t event0[MagmaMaxGPUs], // syrk // event1[MagmaMaxGPUs], // send off-diagonal // event2[MagmaMaxGPUs], // send diagonal // event3[MagmaMaxGPUs]; // trsm magma_int_t n_local[MagmaMaxGPUs], ldpanel; int stream0 = 0, stream1 = 1; #ifdef ZTRSM_WORK magmaDoubleComplex *d_dinvA[MagmaMaxGPUs][2], *d_x[MagmaMaxGPUs][2]; /* used by ztrsm_work */ #endif *info = 0; if ( (! upper) && (! lapackf77_lsame(uplo_, "L")) ) { *info = -1; } else if (n < 0) { *info = -2; } else if (!upper && num_gpus*ldda < max(1,n)) { *info = -4; } else if (upper && ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } for( d=0; d<num_gpus; d++ ) { /* local-n and local-ld */ if (upper) { n_local[d] = ((n/nb)/num_gpus)*nb; if (d < (n/nb)%num_gpus) n_local[d] += nb; else if (d == (n/nb)%num_gpus) n_local[d] += n%nb; } else { n_local[d] = ((m/nb)/num_gpus)*nb; if (d < (m/nb)%num_gpus) n_local[d] += nb; else if (d == (m/nb)%num_gpus) n_local[d] += m%nb; } //magma_setdevice(d); //magma_event_create( &event0[d] ); //magma_event_create( &event1[d] ); //magma_event_create( &event2[d] ); //magma_event_create( &event3[d] ); } magma_setdevice(0); /* == initialize the trace */ trace_init( 1, num_gpus, 3, stream ); /* Use blocked code. */ if (upper) { /* ---------------------------------------------- */ /* Upper-triangular case */ /* > Compute the Cholesky factorization A = U'*U. 
*/ /* ---------------------------------------------- */ #if defined(PRECISION_d) && defined(ZTRSM_WORK) /* invert the diagonals * Allocate device memory for the inversed diagonal blocks, size=m*NB */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); for( j=0; j<2; j++ ) { magma_zmalloc( &d_dinvA[d][j], nb*nb ); magma_zmalloc( &d_x[d][j], n*nb ); cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(magmaDoubleComplex)); cudaMemset(d_x[d][j], 0, n*nb*sizeof(magmaDoubleComplex)); } } magma_setdevice(0); #endif for (j=0; j<m; j+=nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%num_gpus; buf = (j/nb)%num_gpus; /* Set the local index where the current panel is */ j_local = j/(nb*num_gpus); jb = min(nb, (m-j)); if( j > 0 ) { /* needed on pluto... */ magma_setdevice(id); magma_queue_sync( stream[id][stream0] ); // wait for the column on CPU /* broadcast off-diagonal column to all gpus */ d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { if( d != id ) { magma_setdevice(d); /* wait for it on CPU */ magma_queue_wait_event( stream[d][stream0], event[id][1] ); /* send it to GPU */ trace_gpu_start( d, stream0, "comm", "rows to GPUs" ); magma_zsetmatrix_async( j, jb, Aup(0,j), lda, dlP(d,jb,0,buf), lddp, stream[d][stream0] ); trace_gpu_end( d, stream0 ); magma_event_record( event[d][1], stream[d][stream0] ); } d = (d+1)%num_gpus; } } /* Update the current diagonal block */ magma_setdevice(id); if( j > 0 ) { magmablasSetKernelStream(stream[id][stream1]); trace_gpu_start( id, stream1, "syrk", "syrk" ); magma_zherk(MagmaUpper, MagmaConjTrans, jb, j, d_neg_one, dlA(id, 0, nb*j_local), ldda, d_one, dlA(id, j, nb*j_local), ldda); trace_gpu_end( id, stream1 ); magma_event_record( event[id][0], stream[id][stream1] ); } /* send the diagonal to cpu */ magma_queue_wait_event( stream[id][stream0], event[id][0] ); // wait for syrk trace_gpu_start( id, stream0, "comm", "D to CPU" ); magma_zgetmatrix_async( jb, jb, dlA(id, j, nb*j_local), ldda, Aup(j,j), lda, 
stream[id][stream0] ); trace_gpu_end( id, stream0 ); if ( j > 0 ) { /* Compute the local block column of the panel. */ d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { j_local2 = j_local+1; if( d > id ) j_local2 --; nb0 = nb*j_local2; if( n_local[d] > nb0 ) { /* wait for the off-diagonal */ if( d != id ) { //magma_queue_sync( stream[id][3] ); dlpanel = dlP(d, jb, 0, buf); ldpanel = lddp; /* wait for the offdiagonal column */ magma_queue_wait_event( stream[d][stream1], event[d][1] ); } else { dlpanel = dlA(d, 0, nb*j_local); ldpanel = ldda; } /* update the panel */ magma_setdevice(d); magmablasSetKernelStream(stream[d][stream1]); trace_gpu_start( d, stream1, "gemm", "gemm" ); magma_zgemm(MagmaConjTrans, MagmaNoTrans, jb, n_local[d]-nb0, j, c_neg_one, dlpanel, ldpanel, dlA(d, 0, nb0), ldda, c_one, dlA(d, j, nb0), ldda); trace_gpu_end( d, stream1 ); } d = (d+1)%num_gpus; } } /* factor the diagonal */ magma_setdevice(id); magma_queue_sync( stream[id][stream0] ); // wait for the diagonal trace_cpu_start( 0, "getrf", "getrf" ); lapackf77_zpotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info); trace_cpu_end( 0 ); if (*info != 0) { *info = *info + j; break; } /* send the diagonal to gpus */ if ( (j+jb) < n) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { magma_setdevice(d); if( d == id ) { dlpanel = dlA(d, j, nb*j_local); ldpanel = ldda; } else { dlpanel = dlP(d, 0, 0, buf); ldpanel = lddp; } trace_gpu_start( d, stream0, "comm", "D to GPUs" ); magma_zsetmatrix_async( jb, jb, Aup(j,j), lda, dlpanel, ldpanel, stream[d][stream0] ); trace_gpu_end( d, stream0 ); magma_event_record( event[d][2], stream[d][stream0] ); d = (d+1)%num_gpus; } } else { magma_setdevice(id); trace_gpu_start( id, stream0, "comm", "D to GPUs" ); magma_zsetmatrix_async( jb, jb, Aup(j,j), lda, dlA(id, j, nb*j_local), ldda, stream[id][stream0] ); trace_gpu_end( id, stream0 ); } /* panel-factorize the off-diagonal */ if ( (j+jb) < n) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { /* 
next column */ j_local2 = j_local+1; if( d > id ) j_local2--; if( d == id ) { dlpanel = dlA(d, j, nb*j_local); ldpanel = ldda; } else { dlpanel = dlP(d, 0, 0, buf); ldpanel = lddp; } nb2 = n_local[d]-nb*j_local2; nb0 = min(nb, nb2 ); magma_setdevice(d); magmablasSetKernelStream(stream[d][stream1]); magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for the diagonal if( j+jb < m && d == (j/nb+1)%num_gpus ) { /* owns the next column, look-ahead the column */ trace_gpu_start( d, stream1, "trsm", "trsm" ); #if defined(PRECISION_d) && defined(ZTRSM_WORK) magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb0, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda, d_dinvA[d][0], d_x[d][0] ); /*nb2 = n_local[d] - j_local2*nb; magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda, d_dinvA[d], d_x[d] );*/ #else /*nb2 = n_local[d] - j_local2*nb; magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldda, dlA(d, j, nb*j_local2), ldda); */ magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb0, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda); #endif trace_gpu_end( d, stream1 ); magma_event_record( event[d][3], stream[d][stream1] ); /* send the column to cpu */ if( j+jb < m ) { trace_gpu_start( d, stream0, "comm", "rows to CPU" ); magma_queue_wait_event( stream[d][stream0], event[d][3] ); // wait for lookahead magma_zgetmatrix_async( (j+jb), nb0, dlA(d, 0, nb*j_local2), ldda, Aup(0,j+jb), lda, stream[d][stream0] ); trace_gpu_end( d, stream0 ); magma_event_record( event[d][1], stream[d][stream0] ); } /* update the remaining blocks */ nb2 = nb2 - nb0; #if defined(PRECISION_d) && defined(ZTRSM_WORK) magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2+nb0), ldda, d_dinvA[d][1], d_x[d][1] ); #else magma_ztrsm( 
MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2+nb0), ldda); #endif } else if( nb2 > 0 ) { /* update the entire trailing matrix */ trace_gpu_start( d, stream1, "trsm", "trsm" ); #if defined(PRECISION_d) && defined(ZTRSM_WORK) magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda, d_dinvA[d][1], d_x[d][1] ); #else magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda); #endif trace_gpu_end( d, stream1 ); } d = (d+1)%num_gpus; } } /* end of ztrsm */ } /* end of for j=1, .., n */ } else { /* -------------------------------------------- */ /* Lower-triangular case */ /* Compute the Cholesky factorization A = L*L'. */ /* -------------------------------------------- */ #if defined(PRECISION_d) && defined(ZTRSM_WORK) /* * Allocate device memory for the inversed diagonal blocks, size=N*BLOCK_SIZE */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); for( j=0; j<2; j++ ) { magma_zmalloc( &d_dinvA[d][j], nb*nb ); magma_zmalloc( &d_x[d][j], nb*m ); cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(magmaDoubleComplex)); cudaMemset(d_x[d][j], 0, nb* m*sizeof(magmaDoubleComplex)); } } magma_setdevice(0); #endif for (j=0; j<n; j+=nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%num_gpus; buf = (j/nb)%num_gpus; /* Set the local index where the current panel is */ j_local = j/(nb*num_gpus); jb = min(nb, (n-j)); if( j > 0 ) { /* needed on pluto... 
*/ magma_setdevice(id); magma_queue_sync( stream[id][stream0] ); // wait for the column on CPU /* broadcast offdiagonal row to all gpus */ d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { if( d != id ) { magma_setdevice(d); /* wait for it on CPU */ magma_queue_wait_event( stream[d][stream0], event[id][1] ); /* send it to GPU */ magma_zsetmatrix_async( jb, j, Alo(j,0), lda, dlPT(d,0,jb,buf), nb, stream[d][stream0] ); magma_event_record( event[d][1], stream[d][stream0] ); } d = (d+1)%num_gpus; } } /* Update the current diagonal block */ magma_setdevice(id); if( j > 0 ) { magmablasSetKernelStream(stream[id][stream1]); magma_zherk(MagmaLower, MagmaNoTrans, jb, j, d_neg_one, dlA(id, nb*j_local, 0), ldda, d_one, dlA(id, nb*j_local, j), ldda); magma_event_record( event[id][0], stream[id][stream1] ); } /* send the diagonal to cpu */ magma_queue_wait_event( stream[id][stream0], event[id][0] ); // wait for syrk magma_zgetmatrix_async( jb, jb, dlA(id, nb*j_local, j), ldda, Alo(j,j), lda, stream[id][stream0] ); /* update the offdiagonal blocks */ if ( j > 0 ) { /* compute the block-rows of the panel */ d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { j_local2 = j_local+1; if( d > id ) j_local2 --; nb0 = nb*j_local2; if( nb0 < n_local[d] ) { if( d != id ) { dlpanel = dlPT(d, 0, jb, buf); ldpanel = nb; /* wait for offdiagonal row */ magma_queue_wait_event( stream[d][stream1], event[d][1] ); } else { dlpanel = dlA(d, nb*j_local, 0); ldpanel = ldda; } magma_setdevice(d); magmablasSetKernelStream(stream[d][stream1]); magma_zgemm( MagmaNoTrans, MagmaConjTrans, n_local[d]-nb0, jb, j, c_neg_one, dlA(d, nb0, 0), ldda, dlpanel, ldpanel, c_one, dlA(d, nb0, j), ldda); } d = (d+1)%num_gpus; } } /* factor the diagonal */ magma_setdevice(id); magma_queue_sync( stream[id][stream0] ); lapackf77_zpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info); if (*info != 0) { *info = *info + j; break; } /* send the diagonal to gpus */ if ( (j+jb) < m ) { d = (j/nb+1)%num_gpus; for( dd=0; 
dd<num_gpus; dd++ ) { magma_setdevice(d); if( d == id ) { dlpanel = dlA(d, nb*j_local, j); ldpanel = ldda; } else { dlpanel = dlPT(d, 0, 0, buf); ldpanel = nb; } magma_zsetmatrix_async( jb, jb, Alo(j,j), lda, dlpanel, ldpanel, stream[d][stream0] ); magma_event_record( event[d][2], stream[d][stream0] ); d = (d+1)%num_gpus; } } else { magma_setdevice(id); magma_zsetmatrix_async( jb, jb, Alo(j,j), lda, dlA(id, nb*j_local, j), ldda, stream[id][stream0] ); } /* factorize off-diagonal blocks */ if ( (j+jb) < m ) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { /* next column */ j_local2 = j_local+1; if( d > id ) j_local2--; if( d == id ) { dlpanel = dlA(d, nb*j_local, j); ldpanel = ldda; } else { dlpanel = dlPT(d, 0, 0, buf); ldpanel = nb; } nb2 = n_local[d] - j_local2*nb; nb0 = min(nb, nb2 ); magma_setdevice(d); magmablasSetKernelStream(stream[d][stream1]); magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for the diagonal if( j+jb < n && d == (j/nb+1)%num_gpus ) { /* owns the next column, look-ahead the column */ #if defined(PRECISION_d) && defined(ZTRSM_WORK) magmablas_ztrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb0, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda, d_dinvA[d][0], d_x[d][0]); #else magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb0, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda); #endif magma_event_record( event[d][3], stream[d][stream1] ); /* send the column to cpu */ if( j+jb < n ) { magma_queue_wait_event( stream[d][stream0], event[d][3] ); // wait for lookahead magma_zgetmatrix_async( nb0, j+jb, dlA(d, nb*j_local2, 0), ldda, Alo(j+jb,0), lda, stream[d][stream0] ); magma_event_record( event[d][1], stream[d][stream0] ); } /* update the remaining blocks */ nb2 = nb2 - nb0; #if defined(PRECISION_d) && defined(ZTRSM_WORK) magmablas_ztrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2+nb0, j), ldda, 
d_dinvA[d][1], d_x[d][1] ); #else magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2+nb0, j), ldda); #endif } else if( nb2 > 0 ) { /* update the entire trailing matrix */ #if defined(PRECISION_d) && defined(ZTRSM_WORK) magmablas_ztrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda, d_dinvA[d][1], d_x[d][1] ); #else magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda); #endif } d = (d+1)%num_gpus; } } } } /* end of else not upper */ /* == finalize the trace == */ trace_finalize( "zpotrf.svg","trace.css" ); /* clean up */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); //magma_event_destroy( event0[d] ); //magma_event_destroy( event1[d] ); //magma_event_destroy( event2[d] ); //magma_event_destroy( event3[d] ); } magma_setdevice(0); return *info; } /* magma_zpotrf_mgpu */
extern "C" magma_int_t magma_zpotrf3_mgpu(magma_int_t num_gpus, char uplo, magma_int_t m, magma_int_t n, magma_int_t off_i, magma_int_t off_j, magma_int_t nb, magmaDoubleComplex *d_lA[], magma_int_t ldda, magmaDoubleComplex *d_lP[], magma_int_t lddp, magmaDoubleComplex *a, magma_int_t lda, magma_int_t h, magma_queue_t stream[][3], magma_event_t event[][5], magma_int_t *info ) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= ZPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix dA. Auxiliary subroutine for zpotrf2_ooc. It is multiple gpu interface to compute Cholesky of a "rectangular" matrix. The factorization has the form dA = U**H * U, if UPLO = 'U', or dA = L * L**H, if UPLO = 'L', where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of dA is stored; = 'L': Lower triangle of dA is stored. N (input) INTEGER The order of the matrix dA. N >= 0. dA (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the Hermitian matrix dA. If UPLO = 'U', the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be dividable by 16. 
INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. ===================================================================== */ magma_int_t j, jb, nb0, nb2, d, dd, id, j_local, j_local2, buf; char uplo_[2] = {uplo, 0}; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; double d_one = 1.0; double d_neg_one = -1.0; int upper = lapackf77_lsame(uplo_, "U"); magmaDoubleComplex *dlpanel; magma_int_t n_local[MagmaMaxGPUs], ldpanel; const magma_int_t stream1 = 0, stream2 = 1, stream3 = 2; #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK) /* used by ztrsm_work */ int trsm_nb = 128; int trsm_n = trsm_nb*((nb+trsm_nb-1)/trsm_nb); magmaDoubleComplex *d_dinvA[MagmaMaxGPUs]; magmaDoubleComplex *d_x[MagmaMaxGPUs]; #define dinvA(d,j) &(d_dinvA[(d)][(j)*trsm_nb*trsm_n]) #define dx(d,j) &(d_x[(d)][(j)*nb*m]) /* * Allocate device memory for the inversed diagonal blocks, size=N*BLOCK_SIZE */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); if ( (MAGMA_SUCCESS != magma_zmalloc( &d_dinvA[d], 2*trsm_nb*trsm_n )) || (MAGMA_SUCCESS != magma_zmalloc( &d_x[d], 2*nb*(upper ? n : m) )) ) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } magma_setdevice(0); #endif *info = 0; if ( (! upper) && (! 
lapackf77_lsame(uplo_, "L")) ) { *info = -1; } else if (n < 0) { *info = -2; } else if (!upper && num_gpus*ldda < max(1,n)) { *info = -4; } else if (upper && ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* initialization */ for( d=0; d<num_gpus; d++ ) { /* local-n and local-ld */ if (upper) { n_local[d] = (n/(nb*num_gpus))*nb; if (d < (n/nb)%num_gpus) n_local[d] += nb; else if (d == (n/nb)%num_gpus) n_local[d] += n%nb; } else { n_local[d] = (m/(nb*num_gpus))*nb; if (d < (m/nb)%num_gpus) n_local[d] += nb; else if (d == (m/nb)%num_gpus) n_local[d] += m%nb; } } /* == initialize the trace */ trace_init( 1, num_gpus, 3, (CUstream_st**)stream ); if (upper) { /* ---------------------------------------------- */ /* Upper-triangular case */ /* > Compute the Cholesky factorization A = U'*U. */ /* ---------------------------------------------- */ for (j=0; j<m; j+=nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%num_gpus; buf = (j/nb)%num_gpus; // right now, we have num_gpu buffers, so id and buf are the same.. 
/* Set the local index where the current panel is */ j_local = j/(nb*num_gpus); jb = min(nb, (m-j)); /* Update the current diagonal block on stream1 */ magma_setdevice(id); if( j > 0 ) { magmablasSetKernelStream(stream[id][stream1]); trace_gpu_start( id, stream1, "syrk", "syrk" ); magma_zherk(MagmaUpper, MagmaConjTrans, jb, j, d_neg_one, dlA(id, 0, nb*j_local), ldda, d_one, dlA(id, j, nb*j_local), ldda); trace_gpu_end( id, stream1 ); } /* send the diagonal to cpu on stream1 */ trace_gpu_start( id, stream1, "comm", "D to CPU" ); magma_zgetmatrix_async( jb, jb, dlA(id, j, nb*j_local), ldda, Aup(j,j), lda, stream[id][stream1] ); trace_gpu_end( id, stream1 ); /* update off-diagonal blocks in the panel */ if( j > 0 ) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { j_local2 = j_local+1; if( d > id ) j_local2 --; nb0 = nb*j_local2; // number of local columns in the panel, while jb is panel-size (number of rows) if( n_local[d] > nb0 ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][stream2]); if( d == id ) { dlpanel = dlA(d,0,nb*j_local); ldpanel = ldda; // the GPU owns the row from start, and no need of synch. 
//magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu } else { dlpanel = dlP(d,nb,0,buf); ldpanel = lddp; magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu } trace_gpu_start( d, stream2, "gemm", "gemm" ); magma_zgemm(MagmaConjTrans, MagmaNoTrans, jb, n_local[d]-nb0, j, c_neg_one, dlpanel, ldpanel, dlA(d, 0, nb0), ldda, c_one, dlA(d, j, nb0), ldda); trace_gpu_end( d, stream2 ); magma_event_record( event[d][2], stream[d][stream2] ); } d = (d+1)%num_gpus; } } /* wait for panel and factorize it on cpu */ magma_setdevice(id); magma_queue_sync( stream[id][stream1] ); trace_cpu_start( 0, "getrf", "getrf" ); lapackf77_zpotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info); trace_cpu_end( 0 ); if (*info != 0) { *info = *info + j; break; } /* send the diagonal to gpus on stream1 */ if ( (j+jb) < n) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { if( d == id ) { dlpanel = dlA(d, j, nb*j_local); ldpanel = ldda; } else { dlpanel = dlP(d,0,0,buf); ldpanel = lddp; } magma_setdevice(d); trace_gpu_start( d, stream1, "comm", "comm" ); magma_zsetmatrix_async( jb, jb, Aup(j,j), lda, dlpanel, ldpanel, stream[d][stream1] ); trace_gpu_end( d, stream1 ); magma_event_record( event[d][1], stream[d][stream1] ); d = (d+1)%num_gpus; } } else { magma_setdevice(id); trace_gpu_start( id, stream1, "comm", "comm" ); magma_zsetmatrix_async( jb, jb, Aup(j,j), lda, dlA(id, j, nb*j_local), ldda, stream[id][stream1] ); trace_gpu_end( id, stream1 ); } /* panel-factorize the off-diagonal */ if ( (j+jb) < n) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { /* next column */ j_local2 = j_local+1; if( d > id ) j_local2--; if( d == id ) { dlpanel = dlA(d,j,nb*j_local); ldpanel = ldda; } else { dlpanel = dlP(d,0,0,buf); ldpanel = lddp; } nb2 = n_local[d] - j_local2*nb; magma_setdevice(d); if( j+jb < m && d == (j/nb+1)%num_gpus ) { /* owns the next column, look-ahead next block on stream1 */ nb0 = min(nb, nb2); 
magmablasSetKernelStream(stream[d][stream1]); magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for gemm update trace_gpu_start( d, stream1, "trsm", "trsm" ); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK) magmablas_zlaset( MagmaUpperLower, trsm_nb, trsm_n, dinvA(d,0),trsm_nb ); magmablas_zlaset( MagmaUpperLower, nb0,jb, dx(d,0),nb0 ); magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb0, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda, 1, dinvA(d,0), dx(d,0) ); #else magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb0, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda); #endif magma_event_record( event[d][4], stream[d][stream1] ); trace_gpu_end( d, stream1 ); } else if( nb2 > 0 ) { /* update all the blocks on stream2 */ magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for cholesky factor trace_gpu_start( d, stream2, "trsm", "trsm" ); magmablasSetKernelStream(stream[d][stream2]); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK) magmablas_zlaset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,0),trsm_nb ); magmablas_zlaset( MagmaUpperLower, nb2,jb, dx(d,0),nb2 ); magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda, 1, dinvA(d,0), dx(d,0) ); #else magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda); #endif trace_gpu_end( d, stream2 ); } d = (d+1)%num_gpus; } /* end of for */ /* ========================================================== */ if( j+jb < m ) { d = (j/nb+1)%num_gpus; /* next column */ j_local2 = j_local+1; if( d > id ) j_local2--; nb0 = min(nb, n_local[d]-nb*j_local2 ); /* even on 1 gpu, off-diagonals are copied to cpu (synchronize at the end). * * so we have the Cholesky factor, but only diagonal submatrix of the big panel, * * on cpu at the end. 
*/ int d2, buf2; magma_setdevice(d); /* lookahead done */ magma_queue_wait_event( stream[d][stream3], event[d][4] ); trace_gpu_start( d, stream3, "comm", "row to CPU" ); magma_zgetmatrix_async( (j+jb), nb0, dlA(d, 0, nb*j_local2), ldda, Aup(0,j+jb), lda, stream[d][stream3] ); trace_gpu_end( d, stream3 ); magma_event_record( event[d][3], stream[d][stream3] ); /* needed on pluto */ //magma_queue_sync( stream[d][stream3] ); /* broadcast rows to gpus on stream2 */ buf2 = ((j+jb)/nb)%num_gpus; for( d2=0; d2<num_gpus; d2++ ) { if( d2 != d ) { magma_setdevice(d2); trace_gpu_start( d2, stream3, "comm", "row to GPUs" ); magma_queue_wait_event( stream[d2][stream3], event[d][3] ); // rows arrived at cpu on stream3 magma_zsetmatrix_async( j+jb, nb0, Aup(0,j+jb), lda, dlP(d2,nb,0,buf2), lddp, stream[d2][stream3] ); trace_gpu_end( d2, stream3 ); magma_event_record( event[d2][0], stream[d2][stream3] ); } } /* =========================== */ /* update the remaining blocks */ nb2 = n_local[d]-(nb*j_local2 + nb0); if( nb2 > 0 ) { if( d == id ) { dlpanel = dlA(d, j, nb*j_local); ldpanel = ldda; } else { dlpanel = dlP(d,0,0,buf); ldpanel = lddp; } magma_setdevice(d); magmablasSetKernelStream(stream[d][stream2]); trace_gpu_start( d, stream2, "trsm", "trsm" ); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK) int flag = 0; if (flag == 0) { magma_queue_wait_event( stream[d][stream2], event[d][4] ); // lookahead -> diagonal inversion } else { magmablas_zlaset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,flag),trsm_nb ); magma_queue_wait_event( stream[d][stream2], event[d][1] ); // panel received } magmablas_zlaset( MagmaUpperLower, nb2,jb, dx(d,1),nb2 ); magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2+nb0), ldda, flag, dinvA(d,flag), dx(d,1) ); #else magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for cholesky factor magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, 
MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2+nb0), ldda); #endif trace_gpu_end( d, stream2 ); } } } /* end of ztrsm */ } /* end of for j=1, .., n */ } else { /* ---------------------------------------------- */ /* Lower-triangular case */ /* > Compute the Cholesky factorization A = L*L'. */ /* ---------------------------------------------- */ for (j=0; j<n; j+=nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%num_gpus; buf = (j/nb)%num_gpus; /* Set the local index where the current panel is */ j_local = j/(nb*num_gpus); jb = min(nb, (n-j)); /* Update the current diagonal block on stream1 */ magma_setdevice(id); if( j > 0 ) { magmablasSetKernelStream(stream[id][stream1]); magma_zherk(MagmaLower, MagmaNoTrans, jb, j, d_neg_one, dlA(id, nb*j_local, 0), ldda, d_one, dlA(id, nb*j_local, j), ldda); } /* send the diagonal to cpu on stream1 */ magma_zgetmatrix_async( jb, jb, dlA(id, nb*j_local, j), ldda, Alo(j,j), lda, stream[id][stream1] ); /* update off-diagonal blocks of the panel */ if( j > 0 ) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { j_local2 = j_local+1; if( d > id ) j_local2 --; nb0 = nb*j_local2; if( nb0 < n_local[d] ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][stream2]); if( d == id ) { dlpanel = dlA(d, nb*j_local, 0); ldpanel = ldda; } else { dlpanel = dlPT(d,0,nb,buf); ldpanel = nb; magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu } magma_zgemm( MagmaNoTrans, MagmaConjTrans, n_local[d]-nb0, jb, j, c_neg_one, dlA(d, nb0, 0), ldda, dlpanel, ldpanel, c_one, dlA(d, nb0, j), ldda); magma_event_record( event[d][2], stream[d][stream2] ); } d = (d+1)%num_gpus; } } /* wait for the panel and factorized it on cpu */ magma_setdevice(id); magma_queue_sync( stream[id][stream1] ); lapackf77_zpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info); if (*info != 0) { *info = *info + j; break; } /* send the diagonal to gpus on stream1 */ if ( (j+jb) < m) { d = (j/nb+1)%num_gpus; 
for( dd=0; dd<num_gpus; dd++ ) { if( d == id ) { dlpanel = dlA(d, nb*j_local, j); ldpanel = ldda; } else { dlpanel = dlPT(d, 0, 0, buf); ldpanel = nb; } magma_setdevice(d); magma_zsetmatrix_async( jb, jb, Alo(j,j), lda, dlpanel, ldpanel, stream[d][stream1] ); magma_event_record( event[d][1], stream[d][stream1] ); d = (d+1)%num_gpus; } } else { magma_setdevice(id); magma_zsetmatrix_async( jb, jb, Alo(j,j), lda, dlA(id, nb*j_local, j), ldda, stream[id][stream1] ); } /* panel factorize the off-diagonal */ if ( (j+jb) < m) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { /* next column */ j_local2 = j_local+1; if( d > id ) j_local2--; if( d == id ) { dlpanel = dlA(d, nb*j_local, j); ldpanel = ldda; } else { dlpanel = dlPT(d, 0, 0, buf); ldpanel = nb; } nb2 = n_local[d] - j_local2*nb; nb0 = min(nb, nb2); magma_setdevice(d); if( j+nb < n && d == (j/nb+1)%num_gpus ) { /* owns next column, look-ahead next block on stream1 */ if ( j > 0 ) magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for gemm update magmablasSetKernelStream(stream[d][stream1]); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK) magmablas_zlaset( MagmaUpperLower, trsm_nb, trsm_n, dinvA(d,0),trsm_nb ); magmablas_zlaset( MagmaUpperLower, nb0,jb, dx(d,0),nb0 ); magmablas_ztrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb0, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda, 1, dinvA(d,0), dx(d,0) ); #else magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb0, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda); #endif magma_event_record( event[d][4], stream[d][stream1] ); } else if( nb2 > 0 ) { /* other gpus updating all the blocks on stream2 */ /* update the entire column */ magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for the cholesky factor magmablasSetKernelStream(stream[d][stream2]); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK) magmablas_zlaset( 
MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,0),trsm_nb ); magmablas_zlaset( MagmaUpperLower, nb2,jb, dx(d,0),nb2 ); magmablas_ztrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda, 1, dinvA(d,0), dx(d,0) ); #else magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda); #endif } d = (d+1)%num_gpus; } /* end for d */ /* ========================================================== */ if( j+jb < n ) { d = (j/nb+1)%num_gpus; /* next column */ j_local2 = j_local+1; if( d > id ) j_local2--; nb0 = min(nb, n_local[d]-nb*j_local2 ); /* even on 1 gpu, we copy off-diagonal to cpu (but don't synchronize). */ /* so we have the Cholesky factor on cpu at the end. */ int d2, buf2; //#define ZPOTRF_DEVICE_TO_DEVICE #ifdef ZPOTRF_DEVICE_TO_DEVICE // lookahead done /* broadcast the rows to gpus */ buf2 = ((j+jb)/nb)%num_gpus; for( d2=0; d2<num_gpus; d2++ ) { magma_setdevice(d2); magma_queue_wait_event( stream[d2][stream3], event[d][4] ); if( d2 != d ) { magma_zcopymatrix_async( nb0, j+jb, dlPT(d2,0,nb,buf2), nb, // first nbxnb reserved for diagonal block dlA(d, nb*j_local2, 0), ldda, stream[d2][stream3] ); magma_event_record( event[d2][0], stream[d2][stream3] ); } else { magma_zgetmatrix_async( nb0, j+jb, dlA(d, nb*j_local2, 0), ldda, Alo(j+jb,0), lda, stream[d][stream3] ); } } #else // lookahead done magma_setdevice(d); magma_queue_wait_event( stream[d][stream3], event[d][4] ); magma_zgetmatrix_async( nb0, j+jb, dlA(d, nb*j_local2, 0), ldda, Alo(j+jb,0), lda, stream[d][stream3] ); magma_event_record( event[d][3], stream[d][stream3] ); /* syn on rows on CPU, seem to be needed on Pluto */ //magma_queue_sync( stream[d][stream3] ); /* broadcast the rows to gpus */ buf2 = ((j+jb)/nb)%num_gpus; for( d2=0; d2<num_gpus; d2++ ) { if( d2 != d ) { magma_setdevice(d2); magma_queue_wait_event( stream[d2][stream3], event[d][3] ); // getmatrix done 
magma_zsetmatrix_async( nb0, j+jb, Alo(j+jb,0), lda, dlPT(d2,0,nb,buf2), nb, // first nbxnb reserved for diagonal block stream[d2][stream3] ); magma_event_record( event[d2][0], stream[d2][stream3] ); } } #endif /* =================================== */ /* updates remaining blocks on stream2 */ nb2 = n_local[d] - (j_local2*nb + nb0); if( nb2 > 0 ) { if( d == id ) { dlpanel = dlA(d, nb*j_local, j); ldpanel = ldda; } else { dlpanel = dlPT(d,0,0,buf); ldpanel = nb; } magma_setdevice(d); magmablasSetKernelStream(stream[d][stream2]); /* update the remaining blocks in the column */ #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK) int flag = 0; if (flag == 0) { magma_queue_wait_event( stream[d][stream2], event[d][4] ); // lookahead -> diagonal inversion } else { magmablas_zlaset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,flag),trsm_nb ); magma_queue_wait_event( stream[d][stream2], event[d][1] ); // panel received } magmablas_zlaset( MagmaUpperLower, nb2,jb, dx(d,1),nb2 ); magmablas_ztrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2+nb0, j), ldda, flag, dinvA(d,flag), dx(d,1) ); #else magma_queue_wait_event( stream[d][stream2], event[d][1] ); // panel received magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2+nb0, j), ldda); #endif } } } } } /* end of else not upper */ /* == finalize the trace == */ trace_finalize( "zpotrf.svg","trace.css" ); for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); for( j=0; j<3; j++ ) { magma_queue_sync( stream[d][j] ); } #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK) magma_free( d_dinvA[d] ); magma_free( d_x[d] ); #endif magmablasSetKernelStream(NULL); } magma_setdevice(0); return *info; } /* magma_zpotrf_mgpu */
/**
    Purpose
    -------
    SSYTRF_nopiv_gpu computes the LDLt factorization of a real symmetric
    matrix A, stored on the GPU, without pivoting.

    The factorization has the form
        A = U^H * D * U,    if UPLO = MagmaUpper, or
        A = L   * D * L^H,  if UPLO = MagmaLower,
    where U is an upper triangular matrix, L is lower triangular, and
    D is a diagonal matrix.

    This is the block version of the algorithm, calling Level 3 BLAS.
    Each nb-by-nb diagonal block is copied to the host, factorized there by
    ssytrf_nopiv_cpu, and copied back; the off-diagonal solve and the
    trailing-matrix update are issued on the GPU.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of A is stored;
      -     = MagmaLower:  Lower triangle of A is stored.

    @param[in]
    n       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in,out]
    dA      REAL array on the GPU, dimension (LDDA,N)
            On entry, the symmetric matrix A.  If UPLO = MagmaUpper, the
            leading N-by-N upper triangular part of dA contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of dA is not referenced.  If UPLO = MagmaLower,
            the leading N-by-N lower triangular part of dA contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of dA is not referenced.
    \n
            On exit, if INFO = 0, the factors of the factorization
            A = U^H D U or A = L D L^H.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i (i = 1, 2 or 4), the i-th argument had an
                  illegal value; INFO = MAGMA_ERR_HOST_ALLOC or
                  MAGMA_ERR_DEVICE_ALLOC if a workspace allocation failed.
      -     > 0:  ssytrf_nopiv_cpu reported a non-zero status for a diagonal
                  block; INFO is that status plus the starting column of the
                  block, and the factorization was not completed.
                  (Presumably a zero pivot -- confirm against
                  ssytrf_nopiv_cpu.)

    @return MAGMA_ERR_ILLEGAL_VALUE for an illegal argument, the allocation
            error code if a workspace could not be allocated, and
            MAGMA_SUCCESS otherwise.  NOTE: MAGMA_SUCCESS is also returned
            when INFO > 0, so callers must inspect INFO.

    @ingroup magma_ssytrf_comp
    ******************************************************************* */
extern "C" magma_int_t
magma_ssytrf_nopiv_gpu(
    magma_uplo_t uplo, magma_int_t n,
    magmaFloat_ptr dA, magma_int_t ldda,
    magma_int_t *info)
{
    #define A(i, j) (A)
    #define dA(i, j) (dA +(j)*ldda + (i))
    #define dW(i, j) (dW +(j)*ldda + (i))
    #define dWt(i, j) (dW +(j)*nb + (i))

    /* Local variables */
    float zone  = MAGMA_S_ONE;
    float mzone = MAGMA_S_NEG_ONE;
    int   upper = (uplo == MagmaUpper);
    magma_int_t j, k, jb, nb, ib, iinfo;

    *info = 0;
    if (! upper && uplo != MagmaLower) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,n)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return MAGMA_ERR_ILLEGAL_VALUE;
    }

    /* Quick return */
    if ( n == 0 )
        return MAGMA_SUCCESS;

    nb = magma_get_ssytrf_nopiv_nb(n);
    ib = min(32, nb); // inner-block for diagonal factorization

    /* Allocate both workspaces before any queue, event or trace state is
       created, so that a failed allocation has nothing else to unwind. */

    // CPU workspace: one nb-by-nb diagonal block in pinned memory
    float *A;
    if (MAGMA_SUCCESS != magma_smalloc_pinned( &A, nb*nb )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }

    // GPU workspace: copy of the current off-diagonal panel
    magmaFloat_ptr dW;
    if (MAGMA_SUCCESS != magma_smalloc( &dW, (1+nb)*ldda )) {
        magma_free_pinned( A );   // do not leak the host workspace
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );

    magma_queue_t stream[2];
    magma_event_t event;
    magma_queue_create(&stream[0]);
    magma_queue_create(&stream[1]);
    magma_event_create( &event );
    trace_init( 1, 1, 2, stream );

    /* Use hybrid blocked code. */
    if (upper) {
        //=========================================================
        // Compute the LDLt factorization A = U'*D*U without pivoting.
        // main loop
        for (j = 0; j < n; j += nb) {
            jb = min(nb, (n-j));

            // copy A(j,j) back to CPU; first wait for the event recorded on
            // stream[0] after the previous panel's update of this block column
            trace_gpu_start( 0, 0, "get", "get" );
            magma_event_sync(event);
            magma_sgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), nb, stream[1]);
            trace_gpu_end( 0, 0 );

            // factorize the diagonal block
            magma_queue_sync(stream[1]);
            trace_cpu_start( 0, "potrf", "potrf" );
            ssytrf_nopiv_cpu(MagmaUpper, jb, ib, A(j, j), nb, info);
            trace_cpu_end( 0 );
            if (*info != 0) {
                // make the block-local status global
                *info = *info + j;
                break;
            }

            // copy A(j,j) back to GPU
            trace_gpu_start( 0, 0, "set", "set" );
            magma_ssetmatrix_async(jb, jb, A(j, j), nb, dA(j, j), ldda, stream[0]);
            trace_gpu_end( 0, 0 );

            if ( (j+jb) < n) {
                // compute the off-diagonal blocks of current block column
                magmablasSetKernelStream( stream[0] );
                trace_gpu_start( 0, 0, "trsm", "trsm" );
                magma_strsm(MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaUnit,
                            jb, (n-j-jb),
                            zone, dA(j, j),    ldda,
                                  dA(j, j+jb), ldda);
                magma_scopymatrix( jb, n-j-jb, dA( j, j+jb ), ldda, dWt( 0, j+jb ), nb );

                // update the trailing submatrix with D
                magmablas_slascl_diag(MagmaUpper, jb, n-j-jb,
                                      dA(j, j),    ldda,
                                      dA(j, j+jb), ldda,
                                      &iinfo);
                trace_gpu_end( 0, 0 );

                // update the trailing submatrix with U and W
                trace_gpu_start( 0, 0, "gemm", "gemm" );
                for (k = j+jb; k < n; k += nb) {
                    magma_int_t kb = min(nb,n-k);
                    magma_sgemm(MagmaConjTrans, MagmaNoTrans, kb, n-k, jb,
                                mzone, dWt(0, k), nb,
                                       dA(j, k),  ldda,
                                zone,  dA(k, k),  ldda);
                    // the next diagonal block is final after the first update
                    if (k == j+jb)
                        magma_event_record( event, stream[0] );
                }
                trace_gpu_end( 0, 0 );
            }
        }
    } else {
        //=========================================================
        // Compute the LDLt factorization A = L*D*L' without pivoting.
        // main loop
        for (j = 0; j < n; j += nb) {
            jb = min(nb, (n-j));

            // copy A(j,j) back to CPU; first wait for the event recorded on
            // stream[0] after the previous panel's update of this block column
            trace_gpu_start( 0, 0, "get", "get" );
            magma_event_sync(event);
            magma_sgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), nb, stream[1]);
            trace_gpu_end( 0, 0 );

            // factorize the diagonal block
            magma_queue_sync(stream[1]);
            trace_cpu_start( 0, "potrf", "potrf" );
            ssytrf_nopiv_cpu(MagmaLower, jb, ib, A(j, j), nb, info);
            trace_cpu_end( 0 );
            if (*info != 0) {
                // make the block-local status global
                *info = *info + j;
                break;
            }

            // copy A(j,j) back to GPU
            trace_gpu_start( 0, 0, "set", "set" );
            magma_ssetmatrix_async(jb, jb, A(j, j), nb, dA(j, j), ldda, stream[0]);
            trace_gpu_end( 0, 0 );

            if ( (j+jb) < n) {
                // compute the off-diagonal blocks of current block column
                magmablasSetKernelStream( stream[0] );
                trace_gpu_start( 0, 0, "trsm", "trsm" );
                magma_strsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaUnit,
                            (n-j-jb), jb,
                            zone, dA(j,    j), ldda,
                                  dA(j+jb, j), ldda);
                magma_scopymatrix( n-j-jb,jb, dA( j+jb, j ), ldda, dW( j+jb, 0 ), ldda );

                // update the trailing submatrix with D
                magmablas_slascl_diag(MagmaLower, n-j-jb, jb,
                                      dA(j,    j), ldda,
                                      dA(j+jb, j), ldda,
                                      &iinfo);
                trace_gpu_end( 0, 0 );

                // update the trailing submatrix with L and W
                trace_gpu_start( 0, 0, "gemm", "gemm" );
                for (k = j+jb; k < n; k += nb) {
                    magma_int_t kb = min(nb,n-k);
                    magma_sgemm(MagmaNoTrans, MagmaConjTrans, n-k, kb, jb,
                                mzone, dA(k, j), ldda,
                                       dW(k, 0), ldda,
                                zone,  dA(k, k), ldda);
                    // the next diagonal block is final after the first update
                    if (k == j+jb)
                        magma_event_record( event, stream[0] );
                }
                trace_gpu_end( 0, 0 );
            }
        }
    }

    // Drain both queues before tearing down: the pinned buffer A is the
    // source of the last asynchronous set-matrix issued on stream[0], and
    // dW is read by the last trailing update.
    magma_queue_sync( stream[0] );
    magma_queue_sync( stream[1] );

    trace_finalize( "ssytrf.svg","trace.css" );
    magma_queue_destroy(stream[0]);
    magma_queue_destroy(stream[1]);
    magma_event_destroy( event );
    magma_free( dW );
    magma_free_pinned( A );

    magmablasSetKernelStream( orig_stream );

    // NOTE: MAGMA_SUCCESS is returned even when *info > 0; see header comment.
    return MAGMA_SUCCESS;
} /* magma_ssytrf_nopiv_gpu */
/**
    Purpose
    -------
    SPOTRF computes the Cholesky factorization of a real symmetric
    positive definite matrix dA.
    Auxiliary subroutine for spotrf2_ooc. It is a multi-GPU interface to
    compute the Cholesky factorization of a "rectangular" matrix.

    The factorization has the form
        dA = U**H * U,   if UPLO = MagmaUpper, or
        dA = L  * L**H,  if UPLO = MagmaLower,
    where U is an upper triangular matrix and L is lower triangular.

    This is the block version of the algorithm, calling Level 3 BLAS.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of dA is stored;
      -     = MagmaLower:  Lower triangle of dA is stored.

    @param[in]
    n       INTEGER
            The order of the matrix dA.  N >= 0.

    @param[in,out]
    dA      REAL array on the GPU, dimension (LDDA,N)
            On entry, the symmetric matrix dA.  If UPLO = MagmaUpper, the leading
            N-by-N upper triangular part of dA contains the upper
            triangular part of the matrix dA, and the strictly lower
            triangular part of dA is not referenced.  If UPLO = MagmaLower, the
            leading N-by-N lower triangular part of dA contains the lower
            triangular part of the matrix dA, and the strictly upper
            triangular part of dA is not referenced.
    \n
            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization dA = U**H * U or dA = L * L**H.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalesced memory accesses, LDDA should be
            divisible by 16.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.

@ingroup magma_sposv_comp ********************************************************************/ extern "C" magma_int_t magma_spotrf3_mgpu( magma_int_t ngpu, magma_uplo_t uplo, magma_int_t m, magma_int_t n, magma_int_t off_i, magma_int_t off_j, magma_int_t nb, magmaFloat_ptr d_lA[], magma_int_t ldda, magmaFloat_ptr d_lP[], magma_int_t lddp, float *A, magma_int_t lda, magma_int_t h, magma_queue_t queues[][3], magma_event_t events[][5], magma_int_t *info ) { #define Alo(i, j) (A + ((j)+off_j)*lda + (nb*(((i)/nb)%h)+off_i)) #define Aup(i, j) (A + (nb*(((j)/nb)%h)+off_j)*lda + (i+off_i)) #define dlA(id, i, j) (d_lA[(id)] + (j)*ldda + (i)) #define dlP(id, i, j, k) (d_lP[(id)] + (k)*nb*lddp + (j)*lddp + (i)) #define dlPT(id, i, j, k) (d_lP[(id)] + (k)*nb*lddp + (j)*nb + (i)) magma_int_t j, jb, nb0, nb2, d, dd, id, j_local, j_local2, buf; float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; float d_one = 1.0; float d_neg_one = -1.0; int upper = (uplo == MagmaUpper); float *dlpanel; magma_int_t n_local[MagmaMaxGPUs], ldpanel; const magma_int_t stream1 = 0, stream2 = 1, stream3 = 2; *info = 0; if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (!upper && ngpu*ldda < max(1,n)) { *info = -4; } else if (upper && ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK) /* used by strsm_work */ float c_zero = MAGMA_S_ZERO; int trsm_nb = 128; int trsm_n = trsm_nb*((nb+trsm_nb-1)/trsm_nb); float *d_dinvA[MagmaMaxGPUs]; float *d_x[MagmaMaxGPUs]; #define dinvA(d,j) &(d_dinvA[(d)][(j)*trsm_nb*trsm_n]) #define dx(d,j) &(d_x[(d)][(j)*nb*m]) /* * Allocate device memory for the inversed diagonal blocks, size=N*BLOCK_SIZE */ // TODO free memory on failure. 
for( d=0; d < ngpu; d++ ) { magma_setdevice(d); if ( (MAGMA_SUCCESS != magma_smalloc( &d_dinvA[d], 2*trsm_nb*trsm_n )) || (MAGMA_SUCCESS != magma_smalloc( &d_x[d], 2*nb*(upper ? n : m) )) ) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } magma_setdevice(0); #endif /* initialization */ for( d=0; d < ngpu; d++ ) { /* local-n and local-ld */ if (upper) { n_local[d] = (n/(nb*ngpu))*nb; if (d < (n/nb)%ngpu) n_local[d] += nb; else if (d == (n/nb)%ngpu) n_local[d] += n%nb; } else { n_local[d] = (m/(nb*ngpu))*nb; if (d < (m/nb)%ngpu) n_local[d] += nb; else if (d == (m/nb)%ngpu) n_local[d] += m%nb; } } /* == initialize the trace */ trace_init( 1, ngpu, 3, (CUstream_st**)queues ); if (upper) { /* ---------------------------------------------- */ /* Upper-triangular case */ /* > Compute the Cholesky factorization A = U'*U. */ /* ---------------------------------------------- */ for (j=0; j < m; j += nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%ngpu; buf = (j/nb)%ngpu; // right now, we have ngpu buffers, so id and buf are the same.. 
/* Set the local index where the current panel is */ j_local = j/(nb*ngpu); jb = min(nb, (m-j)); /* Update the current diagonal block on stream1 */ magma_setdevice(id); if ( j > 0 ) { magmablasSetKernelStream( queues[id][stream1] ); trace_gpu_start( id, stream1, "syrk", "syrk" ); magma_ssyrk(MagmaUpper, MagmaConjTrans, jb, j, d_neg_one, dlA(id, 0, nb*j_local), ldda, d_one, dlA(id, j, nb*j_local), ldda); trace_gpu_end( id, stream1 ); } /* send the diagonal to cpu on stream1 */ trace_gpu_start( id, stream1, "comm", "D to CPU" ); magma_sgetmatrix_async( jb, jb, dlA(id, j, nb*j_local), ldda, Aup(j,j), lda, queues[id][stream1] ); trace_gpu_end( id, stream1 ); /* update off-diagonal blocks in the panel */ if ( j > 0 ) { d = (j/nb+1)%ngpu; for( dd=0; dd < ngpu; dd++ ) { j_local2 = j_local+1; if ( d > id ) j_local2 --; nb0 = nb*j_local2; // number of local columns in the panel, while jb is panel-size (number of rows) if ( n_local[d] > nb0 ) { magma_setdevice(d); magmablasSetKernelStream( queues[d][stream2] ); if ( d == id ) { dlpanel = dlA(d,0,nb*j_local); ldpanel = ldda; // the GPU owns the row from start, and no need of synch. 
//magma_queue_wait_event( queues[d][stream2], events[d][0] ); // rows arrived at gpu magma_queue_wait_event( queues[d][stream2], events[d][4] ); // wait for look-ahead trsm to finish } else { dlpanel = dlP(d,nb,0,buf); ldpanel = lddp; magma_queue_wait_event( queues[d][stream2], events[d][0] ); // rows arrived at gpu } trace_gpu_start( d, stream2, "gemm", "gemm" ); magma_sgemm(MagmaConjTrans, MagmaNoTrans, jb, n_local[d]-nb0, j, c_neg_one, dlpanel, ldpanel, dlA(d, 0, nb0), ldda, c_one, dlA(d, j, nb0), ldda); trace_gpu_end( d, stream2 ); magma_event_record( events[d][2], queues[d][stream2] ); } d = (d+1)%ngpu; } } /* wait for panel and factorize it on cpu */ magma_setdevice(id); magma_queue_sync( queues[id][stream1] ); trace_cpu_start( 0, "getrf", "getrf" ); lapackf77_spotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info); trace_cpu_end( 0 ); if (*info != 0) { *info = *info + j; break; } /* send the diagonal to gpus on stream1 */ if ( (j+jb) < n) { d = (j/nb+1)%ngpu; for( dd=0; dd < ngpu; dd++ ) { if ( d == id ) { dlpanel = dlA(d, j, nb*j_local); ldpanel = ldda; } else { dlpanel = dlP(d,0,0,buf); ldpanel = lddp; } magma_setdevice(d); trace_gpu_start( d, stream1, "comm", "comm" ); magma_ssetmatrix_async( jb, jb, Aup(j,j), lda, dlpanel, ldpanel, queues[d][stream1] ); trace_gpu_end( d, stream1 ); magma_event_record( events[d][1], queues[d][stream1] ); d = (d+1)%ngpu; } } else { magma_setdevice(id); trace_gpu_start( id, stream1, "comm", "comm" ); magma_ssetmatrix_async( jb, jb, Aup(j,j), lda, dlA(id, j, nb*j_local), ldda, queues[id][stream1] ); trace_gpu_end( id, stream1 ); } /* panel-factorize the off-diagonal */ if ( (j+jb) < n) { d = (j/nb+1)%ngpu; for( dd=0; dd < ngpu; dd++ ) { /* next column */ j_local2 = j_local+1; if ( d > id ) j_local2--; if ( d == id ) { dlpanel = dlA(d,j,nb*j_local); ldpanel = ldda; } else { dlpanel = dlP(d,0,0,buf); ldpanel = lddp; } nb2 = n_local[d] - j_local2*nb; magma_setdevice(d); if ( j+jb < m && d == (j/nb+1)%ngpu ) { /* owns the next column, 
look-ahead next block on stream1 */ nb0 = min(nb, nb2); magmablasSetKernelStream( queues[d][stream1] ); magma_queue_wait_event( queues[d][stream1], events[d][2] ); // wait for gemm update trace_gpu_start( d, stream1, "trsm", "trsm" ); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK) magmablas_slaset( MagmaFull, trsm_nb, trsm_n, c_zero, c_zero, dinvA(d,0), trsm_nb ); magmablas_slaset( MagmaFull, nb0, jb, c_zero, c_zero, dx(d,0), nb0 ); magmablas_strsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb0, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda, 1, dinvA(d,0), dx(d,0) ); #else magma_strsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb0, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda); #endif magma_event_record( events[d][4], queues[d][stream1] ); trace_gpu_end( d, stream1 ); } else if ( nb2 > 0 ) { /* update all the blocks on stream2 */ magma_queue_wait_event( queues[d][stream2], events[d][1] ); // wait for cholesky factor trace_gpu_start( d, stream2, "trsm", "trsm" ); magmablasSetKernelStream( queues[d][stream2] ); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK) magmablas_slaset( MagmaFull, trsm_nb, trsm_n, c_zero, c_zero, dinvA(d,0), trsm_nb ); magmablas_slaset( MagmaFull, nb2, jb, c_zero, c_zero, dx(d,0), nb2 ); magmablas_strsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda, 1, dinvA(d,0), dx(d,0) ); #else magma_strsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda); #endif trace_gpu_end( d, stream2 ); } d = (d+1)%ngpu; } /* end of for */ /* ========================================================== */ if ( j+jb < m ) { d = (j/nb+1)%ngpu; /* next column */ j_local2 = j_local+1; if ( d > id ) j_local2--; nb0 = min(nb, n_local[d]-nb*j_local2 ); /* even on 1 gpu, off-diagonals are copied to cpu (synchronize at the end). 
* * so we have the Cholesky factor, but only diagonal submatrix of the big panel, * * on cpu at the end. */ int d2, buf2; magma_setdevice(d); /* lookahead done */ magma_queue_wait_event( queues[d][stream3], events[d][4] ); trace_gpu_start( d, stream3, "comm", "row to CPU" ); magma_sgetmatrix_async( (j+jb), nb0, dlA(d, 0, nb*j_local2), ldda, Aup(0,j+jb), lda, queues[d][stream3] ); trace_gpu_end( d, stream3 ); magma_event_record( events[d][3], queues[d][stream3] ); /* needed on pluto */ //magma_queue_sync( queues[d][stream3] ); /* broadcast rows to gpus on stream2 */ buf2 = ((j+jb)/nb)%ngpu; for( d2=0; d2 < ngpu; d2++ ) { if ( d2 != d ) { magma_setdevice(d2); trace_gpu_start( d2, stream3, "comm", "row to GPUs" ); magma_queue_wait_event( queues[d2][stream3], events[d][3] ); // rows arrived at cpu on stream3 magma_ssetmatrix_async( j+jb, nb0, Aup(0,j+jb), lda, dlP(d2,nb,0,buf2), lddp, queues[d2][stream3] ); trace_gpu_end( d2, stream3 ); magma_event_record( events[d2][0], queues[d2][stream3] ); } } /* =========================== */ /* update the remaining blocks */ nb2 = n_local[d]-(nb*j_local2 + nb0); if ( nb2 > 0 ) { if ( d == id ) { dlpanel = dlA(d, j, nb*j_local); ldpanel = ldda; } else { dlpanel = dlP(d,0,0,buf); ldpanel = lddp; } magma_setdevice(d); magmablasSetKernelStream( queues[d][stream2] ); trace_gpu_start( d, stream2, "trsm", "trsm" ); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK) int flag = 0; if (flag == 0) { magma_queue_wait_event( queues[d][stream2], events[d][4] ); // lookahead -> diagonal inversion } else { magmablas_slaset( MagmaFull, trsm_nb, trsm_n, c_zero, c_zero, dinvA(d,flag), trsm_nb ); magma_queue_wait_event( queues[d][stream2], events[d][1] ); // panel received } magmablas_slaset( MagmaFull, nb2, jb, c_zero, c_zero, dx(d,1), nb2 ); magmablas_strsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2+nb0), ldda, flag, dinvA(d,flag), dx(d,1) ); #else 
magma_queue_wait_event( queues[d][stream2], events[d][1] ); // wait for cholesky factor magma_strsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2+nb0), ldda); #endif trace_gpu_end( d, stream2 ); } } } /* end of strsm */ } /* end of for j=1, .., n */ } else { /* ---------------------------------------------- */ /* Lower-triangular case */ /* > Compute the Cholesky factorization A = L*L'. */ /* ---------------------------------------------- */ for (j=0; j < n; j += nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%ngpu; buf = (j/nb)%ngpu; /* Set the local index where the current panel is */ j_local = j/(nb*ngpu); jb = min(nb, (n-j)); /* Update the current diagonal block on stream1 */ magma_setdevice(id); if ( j > 0 ) { magmablasSetKernelStream( queues[id][stream1] ); magma_ssyrk(MagmaLower, MagmaNoTrans, jb, j, d_neg_one, dlA(id, nb*j_local, 0), ldda, d_one, dlA(id, nb*j_local, j), ldda); } /* send the diagonal to cpu on stream1 */ magma_sgetmatrix_async( jb, jb, dlA(id, nb*j_local, j), ldda, Alo(j,j), lda, queues[id][stream1] ); /* update off-diagonal blocks of the panel */ if ( j > 0 ) { d = (j/nb+1)%ngpu; for( dd=0; dd < ngpu; dd++ ) { j_local2 = j_local+1; if ( d > id ) j_local2 --; nb0 = nb*j_local2; if ( nb0 < n_local[d] ) { magma_setdevice(d); magmablasSetKernelStream( queues[d][stream2] ); if ( d == id ) { dlpanel = dlA(d, nb*j_local, 0); ldpanel = ldda; magma_queue_wait_event( queues[d][stream2], events[d][4] ); // wait for look-ahead trsm to finish } else { dlpanel = dlPT(d,0,nb,buf); ldpanel = nb; magma_queue_wait_event( queues[d][stream2], events[d][0] ); // rows arrived at gpu } magma_sgemm( MagmaNoTrans, MagmaConjTrans, n_local[d]-nb0, jb, j, c_neg_one, dlA(d, nb0, 0), ldda, dlpanel, ldpanel, c_one, dlA(d, nb0, j), ldda); magma_event_record( events[d][2], queues[d][stream2] ); } d = (d+1)%ngpu; } } /* wait for the panel and factorized it on cpu */ magma_setdevice(id); 
magma_queue_sync( queues[id][stream1] ); lapackf77_spotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info); if (*info != 0) { *info = *info + j; break; } /* send the diagonal to gpus on stream1 */ if ( (j+jb) < m) { d = (j/nb+1)%ngpu; for( dd=0; dd < ngpu; dd++ ) { if ( d == id ) { dlpanel = dlA(d, nb*j_local, j); ldpanel = ldda; } else { dlpanel = dlPT(d, 0, 0, buf); ldpanel = nb; } magma_setdevice(d); magma_ssetmatrix_async( jb, jb, Alo(j,j), lda, dlpanel, ldpanel, queues[d][stream1] ); magma_event_record( events[d][1], queues[d][stream1] ); d = (d+1)%ngpu; } } else { magma_setdevice(id); magma_ssetmatrix_async( jb, jb, Alo(j,j), lda, dlA(id, nb*j_local, j), ldda, queues[id][stream1] ); } /* panel factorize the off-diagonal */ if ( (j+jb) < m) { d = (j/nb+1)%ngpu; for( dd=0; dd < ngpu; dd++ ) { /* next column */ j_local2 = j_local+1; if ( d > id ) j_local2--; if ( d == id ) { dlpanel = dlA(d, nb*j_local, j); ldpanel = ldda; } else { dlpanel = dlPT(d, 0, 0, buf); ldpanel = nb; } nb2 = n_local[d] - j_local2*nb; nb0 = min(nb, nb2); magma_setdevice(d); if ( j+nb < n && d == (j/nb+1)%ngpu ) { /* owns next column, look-ahead next block on stream1 */ if ( j > 0 ) magma_queue_wait_event( queues[d][stream1], events[d][2] ); // wait for gemm update magmablasSetKernelStream( queues[d][stream1] ); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK) magmablas_slaset( MagmaFull, trsm_nb, trsm_n, c_zero, c_zero, dinvA(d,0), trsm_nb ); magmablas_slaset( MagmaFull, nb0, jb, c_zero, c_zero, dx(d,0), nb0 ); magmablas_strsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb0, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda, 1, dinvA(d,0), dx(d,0) ); #else magma_strsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb0, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda); #endif magma_event_record( events[d][4], queues[d][stream1] ); } else if ( nb2 > 0 ) { /* other gpus updating all the blocks on stream2 */ /* update the entire 
column */ magma_queue_wait_event( queues[d][stream2], events[d][1] ); // wait for the cholesky factor magmablasSetKernelStream( queues[d][stream2] ); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK) magmablas_slaset( MagmaFull, trsm_nb, trsm_n, c_zero, c_zero, dinvA(d,0), trsm_nb ); magmablas_slaset( MagmaFull, nb2, jb, c_zero, c_zero, dx(d,0), nb2 ); magmablas_strsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda, 1, dinvA(d,0), dx(d,0) ); #else magma_strsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda); #endif } d = (d+1)%ngpu; } /* end for d */ /* ========================================================== */ if ( j+jb < n ) { d = (j/nb+1)%ngpu; /* next column */ j_local2 = j_local+1; if ( d > id ) j_local2--; nb0 = min(nb, n_local[d]-nb*j_local2 ); /* even on 1 gpu, we copy off-diagonal to cpu (but don't synchronize). */ /* so we have the Cholesky factor on cpu at the end. 
*/ int d2, buf2; //#define SPOTRF_DEVICE_TO_DEVICE #ifdef SPOTRF_DEVICE_TO_DEVICE // lookahead done /* broadcast the rows to gpus */ buf2 = ((j+jb)/nb)%ngpu; for( d2=0; d2 < ngpu; d2++ ) { magma_setdevice(d2); magma_queue_wait_event( queues[d2][stream3], events[d][4] ); if ( d2 != d ) { magma_scopymatrix_async( nb0, j+jb, dlPT(d2,0,nb,buf2), nb, // first nbxnb reserved for diagonal block dlA(d, nb*j_local2, 0), ldda, queues[d2][stream3] ); magma_event_record( events[d2][0], queues[d2][stream3] ); } else { magma_sgetmatrix_async( nb0, j+jb, dlA(d, nb*j_local2, 0), ldda, Alo(j+jb,0), lda, queues[d][stream3] ); } } #else // lookahead done magma_setdevice(d); magma_queue_wait_event( queues[d][stream3], events[d][4] ); magma_sgetmatrix_async( nb0, j+jb, dlA(d, nb*j_local2, 0), ldda, Alo(j+jb,0), lda, queues[d][stream3] ); magma_event_record( events[d][3], queues[d][stream3] ); /* syn on rows on CPU, seem to be needed on Pluto */ //magma_queue_sync( queues[d][stream3] ); /* broadcast the rows to gpus */ buf2 = ((j+jb)/nb)%ngpu; for( d2=0; d2 < ngpu; d2++ ) { if ( d2 != d ) { magma_setdevice(d2); magma_queue_wait_event( queues[d2][stream3], events[d][3] ); // getmatrix done magma_ssetmatrix_async( nb0, j+jb, Alo(j+jb,0), lda, dlPT(d2,0,nb,buf2), nb, // first nbxnb reserved for diagonal block queues[d2][stream3] ); magma_event_record( events[d2][0], queues[d2][stream3] ); } } #endif /* =================================== */ /* updates remaining blocks on stream2 */ nb2 = n_local[d] - (j_local2*nb + nb0); if ( nb2 > 0 ) { if ( d == id ) { dlpanel = dlA(d, nb*j_local, j); ldpanel = ldda; } else { dlpanel = dlPT(d,0,0,buf); ldpanel = nb; } magma_setdevice(d); magmablasSetKernelStream( queues[d][stream2] ); /* update the remaining blocks in the column */ #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK) int flag = 0; if (flag == 0) { magma_queue_wait_event( queues[d][stream2], events[d][4] ); // lookahead -> diagonal inversion } else { magmablas_slaset( 
MagmaFull, trsm_nb, trsm_n, c_zero, c_zero, dinvA(d,flag), trsm_nb ); magma_queue_wait_event( queues[d][stream2], events[d][1] ); // panel received } magmablas_slaset( MagmaFull, nb2, jb, c_zero, c_zero, dx(d,1), nb2 ); magmablas_strsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2+nb0, j), ldda, flag, dinvA(d,flag), dx(d,1) ); #else magma_queue_wait_event( queues[d][stream2], events[d][1] ); // panel received magma_strsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2+nb0, j), ldda); #endif } } } } } /* end of else not upper */ /* == finalize the trace == */ trace_finalize( "spotrf.svg", "trace.css" ); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); for( j=0; j < 3; j++ ) { magma_queue_sync( queues[d][j] ); } #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK) magma_free( d_dinvA[d] ); magma_free( d_x[d] ); #endif } magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_spotrf_mgpu */
/* //////////////////////////////////////////////////////////////////////////// -- Testing magma_zhemm_mgpu */ int main( int argc, char** argv) { TESTING_INIT(); magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex calpha = MAGMA_Z_MAKE( 3.456, 5.678 ); magmaDoubleComplex cbeta = MAGMA_Z_MAKE( 1.234, 2.456 ); real_Double_t gflops, gpu_perf=0., cpu_perf=0., gpu_time=0., cpu_time=0.; real_Double_t gpu_perf2=0., gpu_time2=0.; double error=0., errorbis=0., work[1]; magmaDoubleComplex *hA, *hX, *hB, *hR; magmaDoubleComplex_ptr dA[MagmaMaxGPUs], dX[MagmaMaxGPUs], dB[MagmaMaxGPUs], dwork[MagmaMaxGPUs], hwork[MagmaMaxGPUs+1]; magmaDoubleComplex_ptr dA2; magma_int_t M, N, size, lda, ldda, msize, nb, nstream; magma_int_t ione = 1; magma_int_t iseed[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); // default values nb = (opts.nb > 0 ? opts.nb : 64); nstream = (opts.nstream > 0 ? opts.nstream : 2); magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2]; magma_int_t nbcmplx = 0; magma_buildconnection_mgpu(gnode, &nbcmplx, opts.ngpu); printf("Initializing communication pattern... 
GPU-ncmplx %d\n\n", (int) nbcmplx); for (int i=0; i < nbcmplx; ++i) { int myngpu = gnode[i][MagmaMaxGPUs]; printf("cmplx %d has %d gpu ", i, myngpu); for(int j=0; j < myngpu; ++j) printf(" %d", (int) gnode[i][j]); printf("\n"); } magma_int_t nbevents = 2; magma_queue_t streams[MagmaMaxGPUs][20]; magma_event_t redevents[MagmaMaxGPUs][20]; magma_event_t redevents2[MagmaMaxGPUs][MagmaMaxGPUs*MagmaMaxGPUs+10]; for( int d = 0; d < opts.ngpu; ++d ) { for( magma_int_t i = 0; i < nstream; ++i ) { magma_queue_create( &streams[d][i] ); } for( magma_int_t i = 0; i < nbevents; ++i ) { cudaEventCreateWithFlags(&redevents[d][i], cudaEventDisableTiming); cudaEventCreateWithFlags(&redevents2[d][i], cudaEventDisableTiming); } } printf( "nb %d, ngpu %d, nstream %d version %d\n", (int) nb, (int) opts.ngpu, (int) nstream, (int) opts.version ); printf(" M N nb offset CPU GFlop/s (sec) GPU GFlop/s (sec) CUBLAS hemm (sec) ||R|| / ||A||*||X||\n"); printf("=========================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { M = opts.msize[itest]; N = opts.nsize[itest]; for( int offset = 0; offset < N; offset += min(N,nb) ) { for( int iter = 0; iter < opts.niter; ++iter ) { msize = M - offset; lda = M; ldda = ((M + 31)/32)*32; size = lda*M; gflops = FLOPS_ZHEMM( MagmaLeft, (double)msize, (double)N ) / 1e9; magma_int_t dworksiz = ldda*N*3; magma_int_t hworksiz = lda*N; TESTING_MALLOC_CPU( hA, magmaDoubleComplex, lda*M ); TESTING_MALLOC_CPU( hX, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( hB, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( hR, magmaDoubleComplex, lda*N ); for( int d = 0; d < opts.ngpu; ++d ) { magma_int_t mlocal = ((M / nb) / opts.ngpu + 1) * nb; magma_setdevice( d ); TESTING_MALLOC_DEV( dA[d], magmaDoubleComplex, ldda*mlocal ); TESTING_MALLOC_DEV( dX[d], magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dB[d], magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dwork[d], 
magmaDoubleComplex, dworksiz ); TESTING_MALLOC_PIN( hwork[d], magmaDoubleComplex, hworksiz ); } TESTING_MALLOC_PIN( hwork[opts.ngpu], magmaDoubleComplex, lda*N ); if ( opts.check ) { magma_setdevice( 0 ); TESTING_MALLOC_DEV( dA2, magmaDoubleComplex, ldda*M ); } lapackf77_zlarnv( &ione, iseed, &size, hA ); magma_zmake_hermitian( M, hA, lda ); size = lda*N; lapackf77_zlarnv( &ione, iseed, &size, hX ); lapackf77_zlarnv( &ione, iseed, &size, hB ); lapackf77_zlacpy( "Full", &M, &N, hB, &lda, hR, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zsetmatrix_1D_col_bcyclic( M, M, hA, lda, dA, ldda, opts.ngpu, nb ); for( int d = 0; d < opts.ngpu; ++d ) { magma_setdevice( d ); //magmablasSetKernelStream( streams[ d ][ 0 ] ); magma_zsetmatrix( M, N, hX, lda, dX[d], ldda ); //if (d == 0) magma_zsetmatrix( M, N, hB, lda, dB[d], ldda ); // this is wrong coz when offset != 0 the gpu who do the beta*C may be not 0 so this should be related to stdev(starting device who own i=0 first col) magma_zsetmatrix( M, N, hB, lda, dB[d], ldda ); } //memset(hR, 0, lda*N*sizeof(magmaDoubleComplex)); trace_init( 1, opts.ngpu, nstream, (magma_queue_t*) streams ); //magma_int_t offset = 0; //nb; gpu_time = magma_sync_wtime(0); magmablas_zhemm_mgpu_com( MagmaLeft, MagmaLower, msize, N, calpha, dA, ldda, offset, dX, ldda, cbeta, dB, ldda, dwork, dworksiz, hR, lda, hwork, hworksiz, opts.ngpu, nb, streams, nstream, redevents2, nbevents, gnode, nbcmplx); gpu_time = magma_sync_wtime(0) - gpu_time; gpu_perf = gflops / gpu_time; #ifdef TRACING char buf[80]; snprintf( buf, sizeof(buf), "zhemm-m%d-n%d-nb%d-stream%d-ngpu%d-run%d.svg", (int) M, (int) N, (int) nb, (int) nstream, (int) opts.ngpu, (int) iter ); trace_finalize( buf, "trace.css" ); #endif /* ==================================================================== Performs operation using CUBLAS 
=================================================================== */ if ( opts.check && iter == 0 ) { magma_setdevice( 0 ); magmablasSetKernelStream( 0 ); magma_zsetmatrix( M, M, hA, lda, dA2, ldda ); magma_zsetmatrix( M, N, hX, lda, dX[0], ldda ); magma_zsetmatrix( M, N, hB, lda, dwork[0], ldda ); gpu_time2 = magma_sync_wtime(0); magma_zhemm( MagmaLeft, MagmaLower, msize, N, calpha, dA2+offset*ldda+offset, ldda, dX[0], ldda, cbeta, dwork[0], ldda ); gpu_time2 = magma_sync_wtime(0) - gpu_time2; gpu_perf2 = gflops / gpu_time2; } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.check ) { // store ||A||*||X|| errorbis = lapackf77_zlange("fro", &msize, &msize, hA+offset*lda+offset, &lda, work ); errorbis *= lapackf77_zlange("fro", &msize, &N, hX, &lda, work ); //printf( "A =" ); magma_zprint( M, M, hA, lda ); //printf( "X =" ); magma_zprint( M, N, hX, lda ); //printf( "B =" ); magma_zprint( M, N, hB, lda ); cpu_time = magma_wtime(); blasf77_zhemm( "Left", "Lower", &msize, &N, &calpha, hA+offset*lda+offset, &lda, hX, &lda, &cbeta, hB, &lda ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* trace_file = fopen("AJETE/C", "w"); for (int j = 0; j < N; j++) for (int i = 0; i < siz; i++) fprintf(trace_file, "%10d%10d%40.30e\n", i+1, j+1, hB[j*lda+i]); fclose(trace_file); */ magma_int_t firstprint=0; for(magma_int_t dev=0; dev < opts.ngpu; ++dev) { magma_setdevice( dev ); magma_zgetmatrix( M, N, dB[dev], ldda, hR, lda ); // compute relative error ||R||/||A||*||X||, where R := B_magma - B_lapack = R - B size = lda*N; blasf77_zaxpy( &size, &c_neg_one, hB, &ione, hR, &ione ); error = lapackf77_zlange("fro", &msize, &N, hR, &lda, work) / errorbis; //printf( "R =" ); magma_zprint( M, N, hR, lda ); if (firstprint == 0) { printf( "%5d %5d %5d %5d %7.1f (%7.4f) %7.1f (%7.4f) %7.1f (%7.4f) %8.2e %s\n", (int) M, (int) N, (int) 
nb, (int) offset, cpu_perf, cpu_time, gpu_perf, gpu_time, gpu_perf2, gpu_time2, error, (error < tol ? "ok" : "failed") ); } else { printf( "%89s %8.2e %s\n", " ", error, (error < tol ? "ok" : "failed") ); } status += ! (error < tol); firstprint =1; } } else { printf( "%5d %5d %5d %5d --- ( --- ) %7.1f (%7.4f) --- ( --- ) ---\n", (int) M, (int) N, (int) nb, (int) offset, gpu_perf, gpu_time ); } TESTING_FREE_CPU( hA ); TESTING_FREE_CPU( hX ); TESTING_FREE_CPU( hB ); TESTING_FREE_PIN( hR ); for( int d = 0; d < opts.ngpu; ++d ) { magma_setdevice( d ); TESTING_FREE_DEV( dA[d] ); TESTING_FREE_DEV( dX[d] ); TESTING_FREE_DEV( dB[d] ); TESTING_FREE_DEV( dwork[d] ); TESTING_FREE_PIN( hwork[d] ); } TESTING_FREE_PIN( hwork[opts.ngpu] ); if ( opts.check ) { magma_setdevice( 0 ); TESTING_FREE_DEV( dA2 ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } // offset printf( "\n" ); } for( int d = 0; d < opts.ngpu; ++d ) { magma_setdevice( d ); for( magma_int_t i = 0; i < nstream; ++i ) { magma_queue_destroy( streams[d][i] ); } for( magma_int_t i = 0; i < nbevents; ++i ) { magma_event_destroy( redevents[d][i] ); magma_event_destroy( redevents2[d][i] ); } } TESTING_FINALIZE(); return status; }
/** Purpose ------- SSYTRD reduces a real symmetric matrix A to real symmetric tridiagonal form T by an orthogonal similarity transformation: Q**H * A * Q = T. Arguments --------- @param[in] num_gpus INTEGER The number of GPUs. num_gpus > 0. @param[in] num_streams INTEGER The number of GPU streams used for update. 10 >= num_streams > 0. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if UPLO = MagmaUpper, the diagonal and first superdiagonal of A are overwritten by the corresponding elements of the tridiagonal matrix T, and the elements above the first superdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; if UPLO = MagmaLower, the diagonal and first subdiagonal of A are over- written by the corresponding elements of the tridiagonal matrix T, and the elements below the first subdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] d REAL array, dimension (N) The diagonal elements of the tridiagonal matrix T: D(i) = A(i,i). @param[out] e REAL array, dimension (N-1) The off-diagonal elements of the tridiagonal matrix T: E(i) = A(i,i+1) if UPLO = MagmaUpper, E(i) = A(i+1,i) if UPLO = MagmaLower. 
@param[out] tau REAL array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= 1. For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n-1) . . . H(2) H(1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in A(1:i-1,i+1), and tau in TAU(i). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(n-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i), and tau in TAU(i). The contents of A on exit are illustrated by the following examples with n = 5: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( d e v2 v3 v4 ) ( d ) ( d e v3 v4 ) ( e d ) ( d e v4 ) ( v1 e d ) ( d e ) ( v1 v2 e d ) ( d ) ( v1 v2 v3 e d ) where d and e denote diagonal and off-diagonal elements of T, and vi denotes an element of the vector defining H(i). 
@ingroup magma_ssyev_comp ********************************************************************/ extern "C" magma_int_t magma_ssytrd_mgpu( magma_int_t num_gpus, magma_int_t num_streams, magma_uplo_t uplo, magma_int_t n, float *A, magma_int_t lda, float *d, float *e, float *tau, float *work, magma_int_t lwork, magma_int_t *info) { #define A(i, j) (A + (j)*lda + (i)) #define dA(id, i, j) (dA[(id)] + (j)*ldda + (i)) #define dW(id, i, j) (dwork[(id)] + (j)*ldda + (i)) const char* uplo_ = lapack_uplo_const( uplo ); magma_int_t ln, ldda; magma_int_t nb = magma_get_ssytrd_nb(n), ib; float c_neg_one = MAGMA_S_NEG_ONE; float c_one = MAGMA_S_ONE; float d_one = MAGMA_D_ONE; //float mv_time = 0.0; #ifdef PROFILE_SY2RK float up_time = 0.0; #endif magma_int_t kk, nx; magma_int_t i = 0, ii, iii, j, did, i_n; magma_int_t iinfo; magma_int_t ldwork, lddwork, lwkopt, ldwork2; magma_int_t lquery; magma_queue_t stream[MagmaMaxGPUs][10]; float *dx[MagmaMaxGPUs], *dy[MagmaMaxGPUs], *hwork; float *dwork2[MagmaMaxGPUs]; *info = 0; int upper = (uplo == MagmaUpper); lquery = (lwork == -1); if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } else if (lwork < nb*n && ! lquery) { *info = -9; } else if ( num_streams > 2 ) { *info = 2; // TODO fix } /* Determine the block size. 
*/ ldwork = lddwork = n; lwkopt = n * nb; if (*info == 0) { work[0] = MAGMA_S_MAKE( lwkopt, 0 ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; /* Quick return if possible */ if (n == 0) { work[0] = c_one; return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); float *dA[MagmaMaxGPUs]; float *dwork[MagmaMaxGPUs]; float times[11]; for( did=0; did < 11; did++ ) times[did] = 0; //#define PROFILE_SY2RK #ifdef PROFILE_SY2RK magma_event_t start, stop; float etime; magma_setdevice(0); magma_event_create( &start ); magma_event_create( &stop ); #endif ldda = lda; ln = ((nb*(1+n/(nb*num_gpus))+31)/32)*32; ldwork2 = (1+ n / nb + (n % nb != 0)) * ldda; for( did=0; did < num_gpus; did++ ) { magma_setdevice(did); // TODO fix memory leak if ( MAGMA_SUCCESS != magma_smalloc(&dA[did], ln*ldda+3*lddwork*nb) || MAGMA_SUCCESS != magma_smalloc(&dx[did], num_streams*n) || MAGMA_SUCCESS != magma_smalloc(&dy[did], num_streams*n) || MAGMA_SUCCESS != magma_smalloc(&dwork2[did], ldwork2 ) ) { for( i=0; i < did; i++ ) { magma_setdevice(i); magma_free(dA[i]); magma_free(dx[i]); magma_free(dy[i]); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dwork[did] = dA[did] + ln*ldda; for( kk=0; kk < num_streams; kk++ ) magma_queue_create(&stream[did][kk]); } magma_setdevice(0); // TODO fix memory leak dwork2 if ( MAGMA_SUCCESS != magma_smalloc_pinned( &hwork, num_streams*num_gpus*n ) ) { for( i=0; i < num_gpus; i++ ) { magma_setdevice(i); magma_free(dA[i]); magma_free(dx[i]); magma_free(dy[i]); } *info = MAGMA_ERR_HOST_ALLOC; return *info; } if (n < 2048) nx = n; else nx = 512; if (upper) { /* Copy the matrix to the GPU */ if (1 <= n-nx) { magma_shtodhe(num_gpus, uplo, n, nb, A, lda, dA, ldda, stream, &iinfo ); } /* Reduce the upper triangle of A. Columns 1:kk are handled by the unblocked method. 
*/ for (i = nb*((n-1)/nb); i >= nx; i -= nb) { ib = min(nb, n-i); ii = nb*(i/(nb*num_gpus)); did = (i/nb)%num_gpus; /* wait for the next panel */ if (i != nb*((n-1)/nb)) { magma_setdevice(did); magma_queue_sync(stream[did][0]); } magma_slatrd_mgpu(num_gpus, uplo, n, i+ib, ib, nb, A(0, 0), lda, e, tau, work, ldwork, dA, ldda, 0, dwork, i+ib, dwork2, ldwork2, 1, dx, dy, hwork, stream, times); magma_ssyr2k_mgpu(num_gpus, MagmaUpper, MagmaNoTrans, nb, i, ib, c_neg_one, dwork, i+ib, 0, d_one, dA, ldda, 0, num_streams, stream); /* get the next panel */ if (i-nb >= nx ) { ib = min(nb, n-(i-nb)); ii = nb*((i-nb)/(nb*num_gpus)); did = ((i-nb)/nb)%num_gpus; magma_setdevice(did); magma_sgetmatrix_async( (i-nb)+ib, ib, dA(did, 0, ii), ldda, A(0, i-nb), lda, stream[did][0] ); } /* Copy superdiagonal elements back into A, and diagonal elements into D */ for (j = i; j < i+ib; ++j) { if ( j > 0 ) { *A(j-1,j) = MAGMA_S_MAKE( e[j - 1], 0 ); } d[j] = MAGMA_S_REAL( *A(j, j) ); } } /* end of for i=... */ if ( nx > 0 ) { if (1 <= n-nx) { /* else A is already on CPU */ for (i=0; i < nx; i += nb) { ib = min(nb, n-i); ii = nb*(i/(nb*num_gpus)); did = (i/nb)%num_gpus; magma_setdevice(did); magma_sgetmatrix_async( nx, ib, dA(did, 0, ii), ldda, A(0, i), lda, stream[did][0] ); } } for( did=0; did < num_gpus; did++ ) { magma_setdevice(did); magma_queue_sync(stream[did][0]); } /* Use unblocked code to reduce the last or only block */ lapackf77_ssytd2(uplo_, &nx, A(0, 0), &lda, d, e, tau, &iinfo); } } else { trace_init( 1, num_gpus, num_streams, (CUstream_st**)stream ); /* Copy the matrix to the GPU */ if (1 <= n-nx) { magma_shtodhe(num_gpus, uplo, n, nb, A, lda, dA, ldda, stream, &iinfo ); } /* Reduce the lower triangle of A */ for (i = 0; i < n-nx; i += nb) { ib = min(nb, n-i); ii = nb*(i/(nb*num_gpus)); did = (i/nb)%num_gpus; /* Reduce columns i:i+ib-1 to tridiagonal form and form the matrix W which is needed to update the unreduced part of the matrix */ /* Get the current panel (no need for 
the 1st iteration) */ if (i != 0) { magma_setdevice(did); trace_gpu_start( did, 0, "comm", "get" ); magma_sgetmatrix_async( n-i, ib, dA(did, i, ii), ldda, A(i,i), lda, stream[did][0] ); trace_gpu_end( did, 0 ); magma_queue_sync(stream[did][0]); magma_setdevice(0); } magma_slatrd_mgpu(num_gpus, uplo, n, n-i, ib, nb, A(i, i), lda, &e[i], &tau[i], work, ldwork, dA, ldda, i, dwork, (n-i), dwork2, ldwork2, 1, dx, dy, hwork, stream, times ); #ifdef PROFILE_SY2RK magma_setdevice(0); if ( i > 0 ) { cudaEventElapsedTime(&etime, start, stop); up_time += (etime/1000.0); } magma_event_record(start, 0); #endif magma_ssyr2k_mgpu(num_gpus, MagmaLower, MagmaNoTrans, nb, n-i-ib, ib, c_neg_one, dwork, n-i, ib, d_one, dA, ldda, i+ib, num_streams, stream); #ifdef PROFILE_SY2RK magma_setdevice(0); magma_event_record(stop, 0); #endif /* Copy subdiagonal elements back into A, and diagonal elements into D */ for (j = i; j < i+ib; ++j) { if ( j+1 < n ) { *A(j+1,j) = MAGMA_S_MAKE( e[j], 0 ); } d[j] = MAGMA_S_REAL( *A(j, j) ); } } /* for i=... 
*/ /* Use unblocked code to reduce the last or only block */ if ( i < n ) { iii = i; i_n = n-i; if ( i > 0 ) { for (; i < n; i += nb) { ib = min(nb, n-i); ii = nb*(i/(nb*num_gpus)); did = (i/nb)%num_gpus; magma_setdevice(did); magma_sgetmatrix_async( i_n, ib, dA(did, iii, ii), ldda, A(iii, i), lda, stream[did][0] ); } for( did=0; did < num_gpus; did++ ) { magma_setdevice(did); magma_queue_sync(stream[did][0]); } } lapackf77_ssytrd(uplo_, &i_n, A(iii, iii), &lda, &d[iii], &e[iii], &tau[iii], work, &lwork, &iinfo); } } #ifdef PROFILE_SY2RK magma_setdevice(0); if ( n > nx ) { cudaEventElapsedTime(&etime, start, stop); up_time += (etime/1000.0); } magma_event_destroy( start ); magma_event_destroy( stop ); #endif trace_finalize( "ssytrd.svg", "trace.css" ); for( did=0; did < num_gpus; did++ ) { magma_setdevice(did); for( kk=0; kk < num_streams; kk++ ) magma_queue_sync(stream[did][kk]); for( kk=0; kk < num_streams; kk++ ) magma_queue_destroy(stream[did][kk]); magma_free(dA[did]); magma_free(dx[did]); magma_free(dy[did]); magma_free(dwork2[did]); } magma_free_pinned(hwork); magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); work[0] = MAGMA_S_MAKE( lwkopt, 0 ); #ifdef PROFILE_SY2RK printf( " n=%d nb=%d\n", n, nb ); printf( " Time in SLARFG: %.2e seconds\n", times[0] ); //printf( " Time in SSYMV : %.2e seconds\n", mv_time ); printf( " Time in SSYR2K: %.2e seconds\n", up_time ); #endif return *info; } /* magma_ssytrd */
/**
    Purpose
    =======

    DSYTRF computes the factorization of a real symmetric matrix A
    using the Bunch-Kaufman diagonal pivoting method.  The form of the
    factorization is

        A = U*D*U^H  or  A = L*D*L^H

    where U (or L) is a product of permutation and unit upper (lower)
    triangular matrices, and D is symmetric and block diagonal with
    1-by-1 and 2-by-2 diagonal blocks.

    This is the blocked version of the algorithm, calling Level 3 BLAS.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of A is stored;
      -     = MagmaLower:  Lower triangle of A is stored.

    @param[in]
    n       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in,out]
    A       DOUBLE PRECISION array, dimension (LDA,N)
            On entry, the symmetric matrix A.  If UPLO = MagmaUpper, the
            leading N-by-N upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = MagmaLower,
            the leading N-by-N lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
    \n
            On exit, the block diagonal matrix D and the multipliers used
            to obtain the factor U or L (see below for further details).

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    ipiv    INTEGER array, dimension (N)
            Details of the interchanges and the block structure of D.
            If IPIV(k) > 0, then rows and columns k and IPIV(k) were
            interchanged and D(k,k) is a 1-by-1 diagonal block.
            If UPLO = MagmaUpper and IPIV(k) = IPIV(k-1) < 0, then rows and
            columns k-1 and -IPIV(k) were interchanged and D(k-1:k,k-1:k)
            is a 2-by-2 diagonal block.
            If UPLO = MagmaLower and IPIV(k) = IPIV(k+1) < 0, then rows and
            columns k+1 and -IPIV(k) were interchanged and D(k:k+1,k:k+1)
            is a 2-by-2 diagonal block.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  if INFO = i, D(i,i) is exactly zero.  The factorization
                  has been completed, but the block diagonal matrix D is
                  exactly singular, and division by zero will occur if it
                  is used to solve a system of equations.
      -     = MAGMA_ERR_DEVICE_ALLOC: a device workspace (dA or dW) could
                  not be allocated; nothing is left allocated in that case.

    Further Details
    ===============
    If UPLO = MagmaUpper, then A = U*D*U', where
        U = P(n)*U(n)* ... *P(k)U(k)* ...,
    i.e., U is a product of terms P(k)*U(k), where k decreases from n to
    1 in steps of 1 or 2, and D is a block diagonal matrix with 1-by-1
    and 2-by-2 diagonal blocks D(k).  P(k) is a permutation matrix as
    defined by IPIV(k), and U(k) is a unit upper triangular matrix, such
    that if the diagonal block D(k) is of order s (s = 1 or 2), then

               (   I    v    0   )   k-s
       U(k) =  (   0    I    0   )   s
               (   0    0    I   )   n-k
                  k-s   s   n-k

    If s = 1, D(k) overwrites A(k,k), and v overwrites A(1:k-1,k).
    If s = 2, the upper triangle of D(k) overwrites A(k-1,k-1), A(k-1,k),
    and A(k,k), and v overwrites A(1:k-2,k-1:k).

    If UPLO = MagmaLower, then A = L*D*L', where
        L = P(1)*L(1)* ... *P(k)*L(k)* ...,
    i.e., L is a product of terms P(k)*L(k), where k increases from 1 to
    n in steps of 1 or 2, and D is a block diagonal matrix with 1-by-1
    and 2-by-2 diagonal blocks D(k).  P(k) is a permutation matrix as
    defined by IPIV(k), and L(k) is a unit lower triangular matrix, such
    that if the diagonal block D(k) is of order s (s = 1 or 2), then

               (   I    0     0   )  k-1
       L(k) =  (   0    I     0   )  s
               (   0    v     I   )  n-k-s+1
                  k-1   s  n-k-s+1

    If s = 1, D(k) overwrites A(k,k), and v overwrites A(k+1:n,k).
    If s = 2, the lower triangle of D(k) overwrites A(k,k), A(k+1,k),
    and A(k+1,k+1), and v overwrites A(k+2:n,k:k+1).

    @ingroup magma_dsysv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_dsytrf(
    magma_uplo_t uplo, magma_int_t n,
    double *A, magma_int_t lda,
    magma_int_t *ipiv, magma_int_t *info)
{
    #define  A(i_, j_) ( A + (i_) + (j_)*lda )
    #define dA(i_, j_) (dA + (i_) + (j_)*ldda)

    /* .. Local Scalars .. */
    magma_int_t nb = magma_get_dsytrf_nb(n);
    magma_int_t iinfo = 0, nk, kb, j, k;

    /* Test the input parameters. */
    *info = 0;
    bool upper = (uplo == MagmaUpper);
    if ( ! upper && uplo != MagmaLower ) {
        *info = -1;
    } else if ( n < 0 ) {
        *info = -2;
    } else if ( lda < max( 1, n ) ) {
        *info = -4;
    }
    if ( *info != 0 ) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Device workspaces: dA holds the matrix, dW the (1+nb)-column panel
       workspace handed to magma_dlasyf_gpu.  They are allocated one at a
       time so that a failure of the second allocation can release the
       first instead of leaking it. */
    magma_int_t ldda = magma_roundup( n, 32 );
    magmaDouble_ptr dA, dW;
    if (MAGMA_SUCCESS != magma_dmalloc( &dA, n*ldda )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
    if (MAGMA_SUCCESS != magma_dmalloc( &dW, (1+nb)*ldda )) {
        magma_free( dA );
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    magma_device_t cdev;
    magma_getdevice( &cdev );

    magma_queue_t queues[2];
    magma_event_t event[2];
    magma_queue_create( cdev, &queues[0] );
    magma_queue_create( cdev, &queues[1] );
    magma_event_create( &event[0] );
    magma_event_create( &event[1] );
    trace_init( 1, 1, 2, queues );

    /* copy matrix to GPU, one block column at a time, sending only the
       stored triangle of each block column */
    trace_gpu_start( 0, 0, "set", "setA" );
    //magma_dsetmatrix_async( n, n, A(0,0), lda, dA(0,0), ldda, queues[0] );
    if ( upper ) {
        for (k = 0; k < n; k += nb ) {
            kb = min(nb, n-k);
            magma_dsetmatrix_async( k+kb, kb, A(0,k), lda, dA(0,k), ldda, queues[0] );
        }
    } else {
        for (k = 0; k < n; k += nb ) {
            kb = min(nb, n-k);
            magma_dsetmatrix_async( n-k, kb, A(k,k), lda, dA(k,k), ldda, queues[0] );
        }
    }
    trace_gpu_end( 0, 0 );

    if ( upper ) {
        /* Factorize A as U*D*U' using the upper triangle of A

           K is the main loop index, decreasing from N to 1 in steps of
           KB, where KB is the number of columns factorized by DLASYF;
           KB is either NB or NB-1, or K for the last block */

        kb = min(n,nb);
        for (k = n-1; k >= 0; k -= kb ) {
            nk = k+1;
            kb = min(nb, nk);

            if ( k+1 > nb ) {
                /* Factorize columns k-kb+1:k of A and use blocked code to
                   update columns 1:k-kb */
                magma_dlasyf_gpu( MagmaUpper, nk, kb, &kb, A( 0, 0 ), lda, dA( 0, 0 ), ldda,
                                  &ipiv[0], dW, ldda, queues, event, &iinfo );
            } else {
                /* Use unblocked code to factorize columns 1:k of A */
                magma_queue_sync( queues[0] );
                magma_dgetmatrix( nk, nk, dA( 0, 0 ), ldda, A( 0, 0 ), lda, queues[0] );
                lapackf77_dsytf2( MagmaUpperStr, &nk, A( 0, 0 ), &lda, &ipiv[0], &iinfo );
                /* the whole remaining leading block was consumed, so this
                   step size terminates the loop */
                kb = k+1;
            }

            /* Set INFO on the first occurrence of a zero pivot */
            if ( *info == 0 && iinfo > 0 ) {
                *info = iinfo;
            }
        }
    } else {
        /* Factorize A as L*D*L' using the lower triangle of A

           K is the main loop index, increasing from 1 to N in steps of
           KB, where KB is the number of columns factorized by DLASYF;
           KB is either NB or NB-1, or N-K+1 for the last block */

        for (k = 0; k < n; k += kb ) {
            nk = n-k;
            kb = min(nb, n - k);
            if ( k < n-nb ) {
                /* Factorize columns k:k+kb-1 of A and use blocked code to
                   update columns k+kb:n */
                magma_dlasyf_gpu( MagmaLower, nk, nb, &kb, A( k, k ), lda, dA( k, k ), ldda,
                                  &ipiv[k], dW, ldda, queues, event, &iinfo );
            } else {
                /* Use unblocked code to factorize columns k:n of A */
                magma_queue_sync( queues[0] );
                magma_dgetmatrix( nk, nk, dA(k,k), ldda, A(k,k), lda, queues[0] );
                lapackf77_dsytf2( MagmaLowerStr, &nk, A( k, k ), &lda, &ipiv[k], &iinfo );
            }

            /* Set INFO on the first occurrence of a zero pivot; iinfo is
               relative to the trailing submatrix, hence the + k */
            if ( *info == 0 && iinfo > 0 ) {
                *info = iinfo + k;
            }

            /* Adjust IPIV: shift the submatrix-relative pivots by k while
               keeping their sign */
            for (j = k; j < k + kb; j ++) {
                if ( ipiv[j] > 0 ) {
                    ipiv[j] = ipiv[j] + k;
                } else {
                    ipiv[j] = ipiv[j] - k;
                }
            }
        }
    }

    trace_finalize( "dsytrf.svg", "trace.css" );
    magma_queue_sync( queues[0] );
    magma_queue_sync( queues[1] );
    magma_event_destroy( event[0] );
    magma_event_destroy( event[1] );
    magma_queue_destroy( queues[0] );
    magma_queue_destroy( queues[1] );
    magma_free( dA );
    magma_free( dW );

    return *info;
} /* End of DSYTRF */
/** Purpose ------- DSYTRD_HE2HB reduces a real symmetric matrix A to real symmetric band-diagonal form T by an orthogonal similarity transformation: Q**H * A * Q = T. This version stores the triangular matrices T used in the accumulated Householder transformations (I - V T V'). Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A DOUBLE_PRECISION array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if UPLO = MagmaUpper, the Upper band-diagonal of A is overwritten by the corresponding elements of the band-diagonal matrix T, and the elements above the band diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; if UPLO = MagmaLower, the the Lower band-diagonal of A is overwritten by the corresponding elements of the band-diagonal matrix T, and the elements below the band-diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] tau DOUBLE_PRECISION array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= 1. For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. 
\n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] dT DOUBLE_PRECISION array on the GPU, dimension N*NB, where NB is the optimal blocksize. On exit dT holds the upper triangular matrices T from the accumulated Householder transformations (I - V T V') used in the factorization. The nb x nb matrices T are ordered consecutively in memory one after another. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n-1) . . . H(2) H(1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in A(1:i-1,i+1), and tau in TAU(i). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(n-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i), and tau in TAU(i). The contents of A on exit are illustrated by the following examples with n = 5: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( d e v2 v3 v4 ) ( d ) ( d e v3 v4 ) ( e d ) ( d e v4 ) ( v1 e d ) ( d e ) ( v1 v2 e d ) ( d ) ( v1 v2 v3 e d ) where d and e denote diagonal and off-diagonal elements of T, and vi denotes an element of the vector defining H(i). 
@ingroup magma_dsyev_2stage ********************************************************************/ extern "C" magma_int_t magma_dsytrd_sy2sb( magma_uplo_t uplo, magma_int_t n, magma_int_t nb, double *A, magma_int_t lda, double *tau, double *work, magma_int_t lwork, double *dT, magma_int_t *info) { #define A(a_1,a_2) ( A + ((a_2)-1)*( lda) + (a_1)-1) #define dA(a_1,a_2) (dA + ((a_2)-1)*(ldda) + (a_1)-1) #define tau_ref(a_1) (tau + (a_1)-1) #define dT(a_1) (dT + ((a_1)-1)*(lddt)) int ldda = ((n+31)/32)*32; int lddt = nb; double c_neg_one = MAGMA_D_NEG_ONE; double c_neg_half = MAGMA_D_NEG_HALF; double c_one = MAGMA_D_ONE; double c_zero = MAGMA_D_ZERO; double d_one = MAGMA_D_ONE; magma_int_t pm, pn, indi, indj, pk; magma_int_t pm_old=0, pn_old=0, indi_old=0, indj_old=0; int i; int lwkopt; int lquery; *info = 0; int upper = (uplo == MagmaUpper); lquery = (lwork == -1); if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } else if (lwork < 1 && ! lquery) { *info = -9; } /* Determine the block size. 
*/ lwkopt = n * nb; if (*info == 0) { work[0] = MAGMA_D_MAKE( lwkopt, 0 ); } if (*info != 0) return *info; else if (lquery) return *info; /* Quick return if possible */ if (n == 0) { work[0] = c_one; return *info; } magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); double *dA; if (MAGMA_SUCCESS != magma_dmalloc( &dA, (n + 2*nb)*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } // limit to 16 threads magma_int_t orig_threads = magma_get_lapack_numthreads(); magma_set_lapack_numthreads( min(orig_threads,16) ); /* Use the first panel of dA as work space */ double *dwork = dA + n*ldda; double *dW = dwork + nb*ldda; #ifdef TRACING char buf[80]; #endif magma_queue_t stream[3]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); stream[2] = 0; // default stream trace_init( 1, 1, 3, stream ); double *hT = work + lwork - nb*nb; lwork -= nb*nb; memset( hT, 0, nb*nb*sizeof(double)); magmablasSetKernelStream( stream[0] ); magma_event_t Pupdate_event; cudaEventCreateWithFlags(&Pupdate_event,cudaEventDisableTiming); //magma_event_create(&Pupdate_event); if (upper) { printf("DSYTRD_HE2HB is not yet implemented for upper matrix storage. Exit.\n"); exit(1); } else { /* Copy the matrix to the GPU */ if (1 <= n-nb) { trace_gpu_start( 0, 0, "set", "set A" ); magma_dsetmatrix_async( (n-nb), (n-nb), A(nb+1, nb+1), lda, dA(nb+1, nb+1), ldda, stream[0] ); trace_gpu_end( 0, 0 ); } /* Reduce the lower triangle of A */ for (i = 1; i <= n-nb; i += nb) { indi = i+nb; indj = i; pm = n - i - nb + 1; //pn = min(i+nb-1, n-nb) -i + 1; pn = nb; /* Get the current panel (no need for the 1st iteration) */ if (i > 1 ) { // dpanel_to_q copy the upper oof diagonal part of // the matrix to work to be restored later. acctually // the zero's and one's putted are not used this is only // because we don't have a function that copy only the // upper part of A to be restored after copying the // lookahead panel that has been computted from GPU to CPU. 
dpanel_to_q(MagmaUpper, pn-1, A(i, i+1), lda, work); trace_gpu_start( 0, 1, "get", "get panel" ); //magma_queue_sync( stream[0] ); magma_queue_wait_event(stream[1], Pupdate_event); //, 0); magma_dgetmatrix_async( (pm+pn), pn, dA( i, i), ldda, A ( i, i), lda, stream[1] ); trace_gpu_end( 0, 1 ); trace_gpu_start( 0, 2, "her2k", "her2k" ); magma_dsyr2k(MagmaLower, MagmaNoTrans, pm_old-pn_old, pn_old, c_neg_one, dA(indi_old+pn_old, indj_old), ldda, dW + pn_old, pm_old, d_one, dA(indi_old+pn_old, indi_old+pn_old), ldda); trace_gpu_end( 0, 2 ); trace_cpu_start( 0, "sync", "sync on 1" ); magma_queue_sync( stream[1] ); trace_cpu_end( 0 ); dq_to_panel(MagmaUpper, pn-1, A(i, i+1), lda, work); } /* ========================================================== QR factorization on a panel starting nb off of the diagonal. Prepare the V and T matrices. ========================================================== */ #ifdef TRACING snprintf( buf, sizeof(buf), "panel %d", i ); #endif trace_cpu_start( 0, "geqrf", buf ); lapackf77_dgeqrf(&pm, &pn, A(indi, indj), &lda, tau_ref(i), work, &lwork, info); /* Form the matrix T */ pk=min(pm,pn); lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &pm, &pk, A(indi, indj), &lda, tau_ref(i), hT, &nb); /* Prepare V - put 0s in the upper triangular part of the panel (and 1s on the diagonal), temporaly storing the original in work */ dpanel_to_q(MagmaUpper, pk, A(indi, indj), lda, work); trace_cpu_end( 0 ); /* Send V from the CPU to the GPU */ trace_gpu_start( 0, 0, "set", "set V and T" ); magma_dsetmatrix_async( pm, pk, A(indi, indj), lda, dA(indi, indj), ldda, stream[0] ); /* Send the triangular factor T to the GPU */ magma_dsetmatrix_async( pk, pk, hT, nb, dT(i), lddt, stream[0] ); trace_gpu_end( 0, 0 ); /* ========================================================== Compute W: 1. X = A (V T) 2. 
W = X - 0.5* V * (T' * (V' * X)) ========================================================== */ /* dwork = V T */ trace_cpu_start( 0, "sync", "sync on 0" ); // this sync is done here to be sure that the copy has been finished // because below we made a restore dq_to_panel and this restore need // to ensure that the copy has been finished. we did it here to allow // overlapp of restore with next gemm and symm. magma_queue_sync( stream[0] ); trace_cpu_end( 0 ); trace_gpu_start( 0, 2, "gemm", "work = V*T" ); magma_dgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk, c_one, dA(indi, indj), ldda, dT(i), lddt, c_zero, dwork, pm); trace_gpu_end( 0, 2 ); /* dW = X = A*V*T. dW = A*dwork */ trace_gpu_start( 0, 2, "hemm", "X = A*work" ); magma_dsymm(MagmaLeft, uplo, pm, pk, c_one, dA(indi, indi), ldda, dwork, pm, c_zero, dW, pm); trace_gpu_end( 0, 2 ); /* restore the panel */ dq_to_panel(MagmaUpper, pk, A(indi, indj), lda, work); /* dwork = V*T already ==> dwork' = T'*V' * compute T'*V'*X ==> dwork'*W ==> * dwork + pm*nb = ((T' * V') * X) = dwork' * X = dwork' * W */ trace_gpu_start( 0, 2, "gemm", "work = T'*V'*X" ); magma_dgemm(MagmaConjTrans, MagmaNoTrans, pk, pk, pm, c_one, dwork, pm, dW, pm, c_zero, dwork + pm*nb, nb); trace_gpu_end( 0, 2 ); /* W = X - 0.5 * V * T'*V'*X * = X - 0.5 * V * (dwork + pm*nb) = W - 0.5 * V * (dwork + pm*nb) */ trace_gpu_start( 0, 2, "gemm", "W = X - 0.5*V*(T'*V'*X)" ); magma_dgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk, c_neg_half, dA(indi, indj), ldda, dwork + pm*nb, nb, c_one, dW, pm); trace_gpu_end( 0, 2 ); /* ========================================================== Update the unreduced submatrix A(i+ib:n,i+ib:n), using an update of the form: A := A - V*W' - W*V' ========================================================== */ if (i + nb <= n-nb) { /* There would be next iteration; do lookahead - update the next panel */ trace_gpu_start( 0, 2, "gemm", "gemm 4 next panel left" ); magma_dgemm(MagmaNoTrans, MagmaConjTrans, pm, pn, pn, c_neg_one, 
dA(indi, indj), ldda, dW, pm, c_one, dA(indi, indi), ldda); trace_gpu_end( 0, 2 ); trace_gpu_start( 0, 2, "gemm", "gemm 5 next panel right" ); magma_dgemm(MagmaNoTrans, MagmaConjTrans, pm, pn, pn, c_neg_one, dW, pm, dA(indi, indj), ldda, c_one, dA(indi, indi), ldda); trace_gpu_end( 0, 2 ); magma_event_record(Pupdate_event, stream[0]); } else { /* no look-ahead as this is last iteration */ trace_gpu_start( 0, 2, "her2k", "her2k last iteration" ); magma_dsyr2k(MagmaLower, MagmaNoTrans, pk, pk, c_neg_one, dA(indi, indj), ldda, dW, pm, d_one, dA(indi, indi), ldda); trace_gpu_end( 0, 2 ); } indi_old = indi; indj_old = indj; pm_old = pm; pn_old = pn; } // end loop for (i) /* Send the last block to the CPU */ pk = min(pm,pn); if (1 <= n-nb) { dpanel_to_q(MagmaUpper, pk-1, A(n-pk+1, n-pk+2), lda, work); trace_gpu_start( 0, 2, "get", "get last block" ); magma_dgetmatrix( pk, pk, dA(n-pk+1, n-pk+1), ldda, A(n-pk+1, n-pk+1), lda ); trace_gpu_end( 0, 2 ); dq_to_panel(MagmaUpper, pk-1, A(n-pk+1, n-pk+2), lda, work); } }// end of LOWER trace_finalize( "dsytrd_sy2sb.svg", "trace.css" ); magma_event_destroy( Pupdate_event ); magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free( dA ); work[0] = MAGMA_D_MAKE( lwkopt, 0 ); magmablasSetKernelStream( orig_stream ); magma_set_lapack_numthreads( orig_threads ); return *info; } /* magma_dsytrd_sy2sb */
/**
    Purpose
    -------
    ZGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
        A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.
    Use two buffers to send panels.

    Arguments
    ---------
    @param[in]
    ngpu    INTEGER
            Number of GPUs to use. ngpu > 0.
            Must not exceed the number of block columns, ceil(N/NB).

    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in]
    nb      INTEGER
            The block size: the width of each panel and of the blocks in
            the 1D block-cyclic distribution of d_lAT.

    @param[in]
    offset  INTEGER
            Offset added to the block-row index when addressing d_lAT,
            i.e. block (i,j) of GPU id starts at
            d_lAT[id] + (offset + i*nb)*lddat + j*nb.

    @param[in,out]
    d_lAT   COMPLEX_16 array of pointers on the GPU, dimension (ngpu).
            On entry, the M-by-N matrix A distributed over GPUs
            (d_lAT[d] points to the local matrix on d-th GPU).
            It uses a 1D block column cyclic format (with the block size nb),
            and each local matrix is stored by row.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    @param[in]
    lddat   INTEGER
            The leading dimension of the array d_lAT[d]. LDDA >= max(1,M).

    @param[out]
    ipiv    INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    @param
    d_lAP   (workspace, on device)
            COMPLEX_16 array of pointers on the GPU, dimension (ngpu).
            d_lAP[d] is the workspace on d-th GPU.
            Each local workspace must be of size (3+ngpu)*nb*maxm,
            where maxm is m rounded up to a multiple of 32 and nb is the block size.

    @param
    W       (workspace)
            COMPLEX_16 array, dimension (ngpu*nb*maxm).
            It is used to store panel on CPU.

    @param[in]
    ldw     INTEGER
            The leading dimension of the workspace w.

    @param[in]
    queues  magma_queue_t
            queues[d] points to the streams for the d-th GPU to execute in.
            Each GPU requires two streams.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as memory allocation failed.
      -     > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    @ingroup magma_zgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zgetrf2_mgpu(
    magma_int_t ngpu,
    magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t offset,
    magmaDoubleComplex_ptr d_lAT[], magma_int_t lddat, magma_int_t *ipiv,
    magmaDoubleComplex_ptr d_lAP[],
    magmaDoubleComplex *W, magma_int_t ldw,
    magma_queue_t queues[][2],
    magma_int_t *info)
{
    #define dAT(id,i,j)  (d_lAT[(id)] + ((offset)+(i)*nb)*lddat + (j)*nb)
    /* host panel buffers are used round-robin, one per GPU */
    #define W(j)         (W + ((j)%ngpu)*nb*ldw)

    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;

    magma_int_t block_size = 32;
    magma_int_t iinfo, n_local[MagmaMaxGPUs];
    magma_int_t maxm, mindim;
    magma_int_t i, j, d, dd, rows, cols, s, ldpan[MagmaMaxGPUs];
    magma_int_t id, j_local, j_local2, nb0, nb1, h = 2+ngpu;
    magmaDoubleComplex *d_panel[MagmaMaxGPUs], *panel_local[MagmaMaxGPUs];

    /* Check arguments */
    *info = 0;
    if (m < 0) {
        *info = -2;
    } else if (n < 0) {
        *info = -3;
    } else if (ngpu*lddat < max(1,n)) {
        *info = -5;
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0) {
        return *info;
    }

    /* Function Body */
    mindim = min(m, n);
    /* more GPUs than block columns is rejected */
    if ( ngpu > ceil((double)n/nb) ) {
        *info = -1;
        return *info;
    }

    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );

    /* Use hybrid blocked code. */
    maxm = ((m + block_size-1)/block_size)*block_size;

    /* some initializations */
    for (d=0; d < ngpu; d++) {
        magma_setdevice(d);

        /* number of columns owned by GPU d under the block-cyclic layout */
        n_local[d] = ((n/nb)/ngpu)*nb;
        if (d < (n/nb) % ngpu) {
            n_local[d] += nb;
        } else if (d == (n/nb) % ngpu) {
            n_local[d] += n % nb;
        }

        /* workspaces */
        d_panel[d] = &(d_lAP[d][h*nb*maxm]);   /* temporary panel storage */
    }
    trace_init( 1, ngpu, 2, (CUstream_st**)queues );

    /* start sending the panel to cpu */
    nb0 = min(mindim, nb);
    magma_setdevice(0);
    magmablasSetKernelStream(queues[0][1]);
    trace_gpu_start( 0, 1, "comm", "get" );
    magmablas_ztranspose( nb0, m, dAT(0,0,0), lddat, d_lAP[0], maxm );
    magma_zgetmatrix_async( m, nb0,
                            d_lAP[0], maxm,
                            W(0),     ldw, queues[0][1] );
    trace_gpu_end( 0, 1 );

    /* ------------------------------------------------------------------------------------- */
    magma_timer_t time=0;
    timer_start( time );

    s = mindim / nb;
    for( j=0; j < s; j++ ) {
        /* Set the GPU number that holds the current panel */
        id = j % ngpu;
        magma_setdevice(id);

        /* Set the local index where the current panel is */
        j_local = j/ngpu;
        cols  = maxm - j*nb;
        rows  = m    - j*nb;

        /* synchronize j-th panel from id-th gpu into work */
        magma_queue_sync( queues[id][1] );

        /* j-th panel factorization */
        trace_cpu_start( 0, "getrf", "getrf" );
        lapackf77_zgetrf( &rows, &nb, W(j), &ldw, ipiv+j*nb, &iinfo);
        if ( (*info == 0) && (iinfo > 0) ) {
            *info = iinfo + j*nb;
        }
        trace_cpu_end( 0 );

        /* start sending the panel to all the gpus */
        d = (j+1) % ngpu;
        for( dd=0; dd < ngpu; dd++ ) {
            magma_setdevice(d);
            trace_gpu_start( 0, 1, "comm", "set" );
            magma_zsetmatrix_async( rows, nb,
                                    W(j),                        ldw,
                                    &d_lAP[d][(j%h)*nb*maxm],    cols, queues[d][1] );
            trace_gpu_end( 0, 1 );
            d = (d+1) % ngpu;
        }

        /* apply the pivoting */
        d = (j+1) % ngpu;
        for( dd=0; dd < ngpu; dd++ ) {
            magma_setdevice(d);
            trace_gpu_start( d, 1, "pivot", "pivot" );
            /* convert the panel-relative pivots to global indices, once */
            if ( dd == 0 ) {
                for( i=j*nb; i < j*nb + nb; ++i ) {
                    ipiv[i] += j*nb;
                }
            }
            magmablas_zlaswp_q( lddat, dAT(d,0,0), lddat, j*nb + 1, j*nb + nb, ipiv, 1, queues[d][0] );
            trace_gpu_end( d, 1 );
            d = (d+1) % ngpu;
        }

        /* update the trailing-matrix/look-ahead */
        d = (j+1) % ngpu;
        for( dd=0; dd < ngpu; dd++ ) {
            magma_setdevice(d);

            /* storage for panel */
            if ( d == id ) {
                /* the panel belongs to this gpu */
                panel_local[d] = dAT(d,j,j_local);
                ldpan[d] = lddat;
                /* next column */
                j_local2 = j_local+1;
            } else {
                /* the panel belongs to another gpu */
                panel_local[d] = d_panel[d];
                ldpan[d] = nb;
                /* next column */
                j_local2 = j_local;
                if ( d < id ) {
                    j_local2 ++;
                }
            }
            /* the size of the next column */
            if ( s > (j+1) ) {
                nb0 = nb;
            } else {
                nb0 = n_local[d]-nb*(s/ngpu);
                if ( d < s % ngpu ) {
                    nb0 -= nb;
                }
            }
            if ( d == (j+1) % ngpu) {
                /* owns the next column, look-ahead the column */
                nb1 = nb0;
                magmablasSetKernelStream(queues[d][1]);

                /* make sure all the pivoting has been applied */
                magma_queue_sync(queues[d][0]);
                trace_gpu_start( d, 1, "gemm", "gemm" );

                /* transpose panel on GPU */
                magmablas_ztranspose( rows, nb, &d_lAP[d][(j%h)*nb*maxm], cols, panel_local[d], ldpan[d] );

                /* synch for remaining update */
                magma_queue_sync(queues[d][1]);
            } else {
                /* update the entire trailing matrix */
                nb1 = n_local[d] - j_local2*nb;
                magmablasSetKernelStream(queues[d][0]);

                /* synchronization to make sure panel arrived on gpu */
                magma_queue_sync(queues[d][1]);
                trace_gpu_start( d, 0, "gemm", "gemm" );

                /* transpose panel on GPU */
                magmablas_ztranspose( rows, nb, &d_lAP[d][(j%h)*nb*maxm], cols, panel_local[d], ldpan[d] );
            }

            /* gpu updating the trailing matrix */
            magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                         nb1, nb, c_one,
                         panel_local[d],       ldpan[d],
                         dAT(d, j, j_local2),  lddat);

            magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                         nb1, m-(j+1)*nb, nb,
                         c_neg_one, dAT(d, j,   j_local2),         lddat,
                                    &(panel_local[d][nb*ldpan[d]]), ldpan[d],
                         c_one,     dAT(d, j+1, j_local2),         lddat );

            if ( d == (j+1) % ngpu ) {
                /* Set the local index where the current panel is
                   (these locals intentionally shadow the outer ones) */
                int loff    = j+1;
                int j_local = (j+1)/ngpu;
                int ldda    = maxm - (j+1)*nb;
                int cols    = m - (j+1)*nb;
                nb0 = min(nb, mindim - (j+1)*nb);  /* size of the diagonal block */
                trace_gpu_end( d, 1 );

                if ( nb0 > 0 ) {
                    /* transpose the panel for sending it to cpu */
                    trace_gpu_start( d, 1, "comm", "get" );
                    magmablas_ztranspose( nb0, m-(j+1)*nb, dAT(d,loff,j_local), lddat, &d_lAP[d][((j+1)%h)*nb*maxm], ldda );

                    /* send the panel to cpu */
                    magma_zgetmatrix_async( cols, nb0,
                                            &d_lAP[d][((j+1)%h)*nb*maxm], ldda,
                                            W(j+1),                       ldw, queues[d][1] );
                    trace_gpu_end( d, 1 );
                }
            } else {
                trace_gpu_end( d, 0 );
            }

            d = (d+1) % ngpu;
        }

        /* update the remaining matrix by gpu owning the next panel */
        if ( (j+1) < s ) {
            int j_local = (j+1)/ngpu;
            int rows  = m - (j+1)*nb;

            d = (j+1) % ngpu;
            magma_setdevice(d);
            magmablasSetKernelStream(queues[d][0]);
            trace_gpu_start( d, 0, "gemm", "gemm" );

            magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                         n_local[d] - (j_local+1)*nb, nb,
                         c_one, panel_local[d],          ldpan[d],
                                dAT(d,j,j_local+1),      lddat );

            magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                         n_local[d]-(j_local+1)*nb, rows, nb,
                         c_neg_one, dAT(d,j,j_local+1),             lddat,
                                    &(panel_local[d][nb*ldpan[d]]), ldpan[d],
                         c_one,     dAT(d,j+1,  j_local+1),         lddat );
            trace_gpu_end( d, 0 );
        }
    } /* end of for j=1..s */
    /* ------------------------------------------------------------------------------ */

    /* Set the GPU number that holds the last panel */
    id = s % ngpu;

    /* Set the local index where the last panel is */
    j_local = s/ngpu;

    /* size of the last diagonal-block */
    nb0 = min(m - s*nb, n - s*nb);
    rows = m    - s*nb;
    cols = maxm - s*nb;

    if ( nb0 > 0 ) {
        magma_setdevice(id);

        /* wait for the last panel on cpu */
        magma_queue_sync( queues[id][1] );

        /* factor on cpu */
        lapackf77_zgetrf( &rows, &nb0, W(s), &ldw, ipiv+s*nb, &iinfo);
        if ( (*info == 0) && (iinfo > 0) ) {
            *info = iinfo + s*nb;
        }

        /* send the factor to gpus */
        for( d=0; d < ngpu; d++ ) {
            magma_setdevice(d);
            j_local2 = j_local;
            if ( d < id ) {
                j_local2 ++;
            }

            if ( d == id || n_local[d] > j_local2*nb ) {
                magma_zsetmatrix_async( rows, nb0,
                                        W(s),                      ldw,
                                        &d_lAP[d][(s%h)*nb*maxm],  cols, queues[d][1] );
            }
        }

        for( d=0; d < ngpu; d++ ) {
            magma_setdevice(d);
            /* convert the panel-relative pivots to global indices, once */
            if ( d == 0 ) {
                for( i=s*nb; i < s*nb + nb0; ++i ) {
                    ipiv[i] += s*nb;
                }
            }
            magmablas_zlaswp_q( lddat, dAT(d,0,0), lddat, s*nb + 1, s*nb + nb0, ipiv, 1, queues[d][0] );
        }

        for( d=0; d < ngpu; d++ ) {
            magma_setdevice(d);
            magmablasSetKernelStream(queues[d][1]);

            /* wait for the pivoting to be done */
            magma_queue_sync( queues[d][0] );

            j_local2 = j_local;
            if ( d < id ) {
                j_local2++;
            }
            if ( d == id ) {
                /* the panel belongs to this gpu */
                panel_local[d] = dAT(d,s,j_local);

                /* next column */
                nb1 = n_local[d] - j_local*nb-nb0;

                magmablas_ztranspose( rows, nb0, &d_lAP[d][(s%h)*nb*maxm], cols, panel_local[d], lddat );

                if ( nb1 > 0 ) {
                    magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                                 nb1, nb0, c_one,
                                 panel_local[d],        lddat,
                                 dAT(d,s,j_local)+nb0,  lddat);
                }
            } else if ( n_local[d] > j_local2*nb ) {
                /* the panel belongs to another gpu */
                panel_local[d] = d_panel[d];

                /* next column */
                nb1 = n_local[d] - j_local2*nb;

                magmablas_ztranspose( rows, nb0, &d_lAP[d][(s%h)*nb*maxm], cols, panel_local[d], nb );
                magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             nb1, nb0, c_one,
                             panel_local[d],     nb,
                             dAT(d,s,j_local2),  lddat);
            }
        }
    } /* if ( nb0 > 0 ) */

    /* clean up */
    trace_finalize( "zgetrf_mgpu.svg","trace.css" );
    for( d=0; d < ngpu; d++ ) {
        magma_setdevice(d);
        magma_queue_sync( queues[d][0] );
        magma_queue_sync( queues[d][1] );
    }
    magma_setdevice( orig_dev );
    magmablasSetKernelStream( orig_stream );

    /* Close the interval opened by timer_start before the main loop; a
       second timer_start here would make the reported rate meaningless. */
    timer_stop( time );
    timer_printf("\n Performance %f GFlop/s\n", FLOPS_ZGETRF(m,n) / 1e9 / time );

    return *info;
} /* magma_zgetrf2_mgpu */
/**
    Purpose
    -------
    ZHETRF computes the factorization of a complex Hermitian matrix A
    using the Bunch-Kaufman diagonal pivoting method.  The form of the
    factorization is

        A = U*D*U**H  or  A = L*D*L**H

    where U (or L) is a product of permutation and unit upper (lower)
    triangular matrices, and D is Hermitian and block diagonal with
    1-by-1 and 2-by-2 diagonal blocks.

    This is the blocked version of the algorithm, calling Level 3 BLAS.
    The matrix is copied to the GPU, blocked panels are factorized by
    magma_zlahef_gpu, and the final (at most nb wide) block is copied back
    and factorized on the CPU by LAPACK's zhetf2.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of A is stored;
      -     = MagmaLower:  Lower triangle of A is stored.

    @param[in]
    n       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in,out]
    A       COMPLEX*16 array, dimension (LDA,N)
            On entry, the Hermitian matrix A.  If uplo = MagmaUpper, the
            leading N-by-N upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If uplo = MagmaLower,
            the leading N-by-N lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
    \n
            On exit, the block diagonal matrix D and the multipliers used
            to obtain the factor U or L (see below for further details).

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    ipiv    INTEGER array, dimension (N)
            Details of the interchanges and the block structure of D.
            If IPIV(k) > 0, then rows and columns k and IPIV(k) were
            interchanged and D(k,k) is a 1-by-1 diagonal block.
            If uplo = MagmaUpper and IPIV(k) = IPIV(k-1) < 0, then rows and
            columns k-1 and -IPIV(k) were interchanged and D(k-1:k,k-1:k)
            is a 2-by-2 diagonal block.  If uplo = MagmaLower and
            IPIV(k) = IPIV(k+1) < 0, then rows and columns k+1 and -IPIV(k)
            were interchanged and D(k:k+1,k:k+1) is a 2-by-2 diagonal block.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value;
                  MAGMA_ERR_DEVICE_ALLOC if a GPU workspace could not be
                  allocated (no device memory is left allocated in that case)
      -     > 0:  if INFO = i, D(i,i) is exactly zero.  The factorization
                  has been completed, but the block diagonal matrix D is
                  exactly singular, and division by zero will occur if it
                  is used to solve a system of equations.

    Further Details
    ---------------
    If uplo = MagmaUpper, then A = U*D*U', where
       U = P(n)*U(n)* ... *P(k)U(k)* ...,
    i.e., U is a product of terms P(k)*U(k), where k decreases from n to
    1 in steps of 1 or 2, and D is a block diagonal matrix with 1-by-1
    and 2-by-2 diagonal blocks D(k).  P(k) is a permutation matrix as
    defined by IPIV(k), and U(k) is a unit upper triangular matrix, such
    that if the diagonal block D(k) is of order s (s = 1 or 2), then

               (   I    v    0   )   k-s
       U(k) =  (   0    I    0   )   s
               (   0    0    I   )   n-k
                  k-s   s   n-k

    If s = 1, D(k) overwrites A(k,k), and v overwrites A(1:k-1,k).
    If s = 2, the upper triangle of D(k) overwrites A(k-1,k-1), A(k-1,k),
    and A(k,k), and v overwrites A(1:k-2,k-1:k).

    If uplo = MagmaLower, then A = L*D*L', where
       L = P(1)*L(1)* ... *P(k)*L(k)* ...,
    i.e., L is a product of terms P(k)*L(k), where k increases from 1 to
    n in steps of 1 or 2, and D is a block diagonal matrix with 1-by-1
    and 2-by-2 diagonal blocks D(k).  P(k) is a permutation matrix as
    defined by IPIV(k), and L(k) is a unit lower triangular matrix, such
    that if the diagonal block D(k) is of order s (s = 1 or 2), then

               (   I    0     0   )  k-1
       L(k) =  (   0    I     0   )  s
               (   0    v     I   )  n-k-s+1
                  k-1   s  n-k-s+1

    If s = 1, D(k) overwrites A(k,k), and v overwrites A(k+1:n,k).
    If s = 2, the lower triangle of D(k) overwrites A(k,k), A(k+1,k),
    and A(k+1,k+1), and v overwrites A(k+2:n,k:k+1).

    @ingroup magma_zhetrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zhetrf(
    magma_uplo_t uplo, magma_int_t n,
    magmaDoubleComplex *A, magma_int_t lda,
    magma_int_t *ipiv, magma_int_t *info)
{
    #define  A(i, j) ( A + (j)*lda  + (i))
    #define dA(i, j) (dA + (j)*ldda + (i))

    /* .. Local Scalars .. */
    magma_int_t upper;
    magma_int_t nb = magma_get_zhetrf_nb(n);
    magma_int_t iinfo = 0, nk, kb;

    /* .. Executable Statements .. */

    /* Test the input parameters. */
    *info = 0;
    upper = (uplo == MagmaUpper);
    if ( !upper && uplo != MagmaLower ) {
        *info = -1;
    } else if ( n < 0 ) {
        *info = -2;
    } else if ( lda < max( 1, n ) ) {
        *info = -4;
    }
    if ( *info != 0 ) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Device copy of A (leading dimension padded to a multiple of 32)
       and the (1+nb)-column workspace handed to magma_zlahef_gpu. */
    magma_int_t ldda = 32*((n+31)/32);
    magmaDoubleComplex *dA = NULL, *dW = NULL;
    if ( MAGMA_SUCCESS != magma_zmalloc( &dA, n*ldda ) ) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
    if ( MAGMA_SUCCESS != magma_zmalloc( &dW, (1+nb)*ldda ) ) {
        /* dA was already allocated: release it so the error path does not leak */
        magma_free( dA );
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    magma_queue_t stream[2];
    magma_event_t event[2];
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );
    magma_event_create( &event[0] );
    magma_event_create( &event[1] );
    trace_init( 1, 1, 2, (CUstream_st**)stream );

    /* copy matrix to GPU: only the stored triangle, one block column at a time */
    trace_gpu_start( 0, 0, "set", "setA" );
    if ( upper ) {
        for (int k = 0; k < n; k+=nb ) {
            kb = min(nb, n-k);
            magma_zsetmatrix_async( k+kb, kb, A(0,k), lda, dA(0,k), ldda, stream[0] );
        }
    } else {
        for (int k = 0; k < n; k+=nb ) {
            kb = min(nb, n-k);
            magma_zsetmatrix_async( n-k, kb, A(k,k), lda, dA(k,k), ldda, stream[0] );
        }
    }
    trace_gpu_end( 0, 0 );

    if ( upper ) {
        /* Factorize A as U*D*U' using the upper triangle of A

           K is the main loop index, decreasing from N to 1 in steps of
           KB, where KB is the number of columns factorized by ZLAHEF;
           KB is either NB or NB-1, or K for the last block */
        kb = min(n,nb);
        for (int k = n-1; k >= 0; k-=kb ) {
            nk = k+1;
            kb = min(nb, nk);

            if ( k+1 > nb ) {
                /* Factorize columns k-kb+1:k of A and use blocked code to
                   update columns 1:k-kb; kb is overwritten with the number
                   of columns actually factorized */
                magma_zlahef_gpu( MagmaUpper, nk, kb, &kb, A( 0, 0 ), lda, dA( 0, 0 ), ldda,
                                  &ipiv[0], dW, ldda, stream, event, &iinfo );
            } else {
                /* Use unblocked code to factorize columns 1:k of A */
                magma_queue_sync( stream[0] );
                magma_zgetmatrix( nk, nk, dA( 0, 0 ),ldda, A( 0, 0 ),lda );
                lapackf77_zhetf2( MagmaUpperStr, &nk, A( 0, 0 ), &lda, &ipiv[0], &iinfo );
                kb = k+1;
            }

            /* Set INFO on the first occurrence of a zero pivot */
            if ( *info == 0 && iinfo > 0 ) *info = iinfo;
        }
    } else {
        /* Factorize A as L*D*L' using the lower triangle of A

           K is the main loop index, increasing from 1 to N in steps of
           KB, where KB is the number of columns factorized by ZLAHEF;
           KB is either NB or NB-1, or N-K+1 for the last block */
        for (int k = 0; k < n; k += kb ) {
            nk = n-k;
            kb = min(nb, n - k);

            if ( k < n-nb ) {
                /* Factorize columns k:k+kb-1 of A and use blocked code to
                   update columns k+kb:n; kb is overwritten with the number
                   of columns actually factorized */
                magma_zlahef_gpu( MagmaLower, nk, nb, &kb, A( k, k ), lda, dA( k, k ), ldda,
                                  &ipiv[k], dW, ldda, stream, event, &iinfo );
            } else {
                /* Use unblocked code to factorize columns k:n of A */
                magma_queue_sync( stream[0] );
                magma_zgetmatrix( nk,nk, dA(k,k),ldda, A(k,k),lda );
                lapackf77_zhetf2( MagmaLowerStr, &nk, A( k, k ), &lda, &ipiv[k], &iinfo );
            }

            /* Set INFO on the first occurrence of a zero pivot;
               iinfo is relative to the trailing submatrix starting at k */
            if ( *info == 0 && iinfo > 0 ) *info = iinfo + k;

            /* Adjust IPIV: pivots were computed relative to row/column k,
               so shift them (keeping the sign that marks 2-by-2 blocks) */
            for (int j = k; j < k + kb; j ++) {
                if ( ipiv[j] > 0 ) {
                    ipiv[j] = ipiv[j] + k;
                } else {
                    ipiv[j] = ipiv[j] - k;
                }
            }
        }
    }

    trace_finalize( "zhetrf.svg","trace.css" );
    magma_queue_sync( stream[0] );
    magma_queue_sync( stream[1] );
    magmablasSetKernelStream( NULL );
    magma_event_destroy( event[0] );
    magma_event_destroy( event[1] );
    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[1] );
    magma_free( dA );
    magma_free( dW );
    return *info;
    /* End of ZHETRF */
}
/* Multi-GPU blocked Cholesky factorization of a "rectangular" panel
 * (auxiliary routine for the out-of-core driver).  Diagonal blocks are
 * factorized on the CPU with LAPACK dpotrf; syrk/gemm/trsm updates run on
 * the GPUs, with panels assigned to devices round-robin in blocks of nb. */
extern "C" magma_int_t
magma_dpotrf3_mgpu(int num_gpus, char uplo, magma_int_t m, magma_int_t n,
                   magma_int_t off_i, magma_int_t off_j, magma_int_t nb,
                   double **d_lA, magma_int_t ldda,
                   double **d_lP, magma_int_t lddp,
                   double *a,     magma_int_t lda, magma_int_t h,
                   cudaStream_t stream[][3], cudaEvent_t event[][5],
                   magma_int_t *info )
{
/*  -- MAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2012

    Purpose
    =======
    DPOTRF computes the Cholesky factorization of a real symmetric
    positive definite matrix dA.
    Auxiliary subroutine for dpotrf2_ooc. It is multiple gpu interface to compute
    Cholesky of a "rectangular" matrix.

    The factorization has the form
       dA = U**T * U,  if UPLO = 'U', or
       dA = L  * L**T,  if UPLO = 'L',
    where U is an upper triangular matrix and L is lower triangular.

    This is the block version of the algorithm, calling Level 3 BLAS.

    Arguments
    =========
    NUM_GPUS (input) INTEGER
            Number of devices used; devices 0..NUM_GPUS-1 are selected
            with magma_setdevice.

    UPLO    (input) CHARACTER*1
            = 'U':  Upper triangle of dA is stored;
            = 'L':  Lower triangle of dA is stored.

    M, N    (input) INTEGER
            Dimensions of the "rectangular" matrix.  In the upper case the
            panel loop runs over M and the per-device widths are derived
            from N; in the lower case the panel loop runs over N and the
            per-device widths are derived from M.  N >= 0.

    OFF_I, OFF_J (input) INTEGER
            Not referenced directly in this body.
            NOTE(review): presumably consumed by the dlA/Aup/Alo macros
            defined outside this block -- confirm.

    NB      (input) INTEGER
            Block size.

    dA      (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N)
            (passed as the per-device pointer array D_LA and accessed only
            through the dlA macro)
            On entry, the symmetric matrix dA.  If UPLO = 'U', the leading
            N-by-N upper triangular part of dA contains the upper
            triangular part of the matrix dA, and the strictly lower
            triangular part of dA is not referenced.  If UPLO = 'L', the
            leading N-by-N lower triangular part of dA contains the lower
            triangular part of the matrix dA, and the strictly upper
            triangular part of dA is not referenced.

            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization dA = U**T * U or dA = L * L**T.

    LDDA    (input) INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalescent memory accesses LDDA must be
            divisible by 16.

    D_LP, LDDP (input) per-device panel buffers and their leading dimension
            (accessed only through the dlP / dlPT macros; the lower case
            uses NB, not LDDP, as the leading dimension).

    A, LDA  (input/output) host buffer and its leading dimension; diagonal
            blocks and off-diagonal rows/columns are staged through it
            (accessed through the Aup / Alo macros).

    H       (input) INTEGER
            When H < NUM_GPUS the routine additionally waits for the
            previous host-to-device sets before overwriting the host panel.
            NOTE(review): presumably the number of panel buffers -- confirm.

    STREAM  (input) three streams per device (indices 0..2).

    EVENT   (input) five events per device, used as:
            [0] rows/columns set on the GPU, [1] diagonal block set on the
            GPU, [2] off-diagonal gemm update, [3] row/column copied to the
            CPU, [4] look-ahead trsm.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
            > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.
    =====================================================================   */

    magma_int_t     j, jb, nb0, nb2, d, dd, id, j_local, j_local2, buf;
    char            uplo_[2] = {uplo, 0};
    double c_one     = MAGMA_D_ONE;
    double c_neg_one = MAGMA_D_NEG_ONE;
    double          d_one     =  1.0;
    double          d_neg_one = -1.0;
    int upper = lapackf77_lsame(uplo_, "U");
    double *dlpanel;
    magma_int_t n_local[MagmaMaxGPUs], ldpanel;
    //cudaEvent_t event0[MagmaMaxGPUs], /* send row to CPU    */
    //            event1[MagmaMaxGPUs], /* send diag to GPU   */
    //            event2[MagmaMaxGPUs], /* offdiagonal update */
    //            event3[MagmaMaxGPUs], /* send row to GPU    */
    //            event4[MagmaMaxGPUs]; /* lookahead          */
    const magma_int_t stream1 = 0, stream2 = 1, stream3 = 2;
    double *d_dinvA[MagmaMaxGPUs][2], *d_x[MagmaMaxGPUs][2]; /* used by dtrsm_work */

    /* Argument checks (error codes follow the original numbering) */
    *info = 0;
    if ( (! upper) && (! lapackf77_lsame(uplo_, "L")) ) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (!upper && num_gpus*ldda < max(1,n)) {
        *info = -4;
    } else if (upper && ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* initialization */
    for( d=0; d<num_gpus; d++ ) {
        /* local-n and local-ld: full blocks are dealt round-robin, the
           device following the last full block receives the remainder */
        if (upper) {
            n_local[d] = ((n/nb)/num_gpus)*nb;
            if (d < (n/nb)%num_gpus)
                n_local[d] += nb;
            else if (d == (n/nb)%num_gpus)
                n_local[d] += n%nb;
        } else {
            n_local[d] = ((m/nb)/num_gpus)*nb;
            if (d < (m/nb)%num_gpus)
                n_local[d] += nb;
            else if (d == (m/nb)%num_gpus)
                n_local[d] += m%nb;
        }
        //magma_setdevice(d);
        //magma_event_create( &event0[d] );
        //magma_event_create( &event1[d] );
        //magma_event_create( &event2[d] );
        //magma_event_create( &event3[d] );
        //magma_event_create( &event4[d] );
    }

    /* == initialize the trace */
    trace_init( 1, num_gpus, 3, (CUstream_st**)stream );

    if (upper)
    {
        /* ---------------------------------------------- */
        /* Upper-triangular case                          */
        /* > Compute the Cholesky factorization A = U'*U. */
        /* ---------------------------------------------- */

#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
        /* invert the diagonals
         * Allocate device memory for the inversed diagonal blocks, size=m*NB
         */
        for( d=0; d<num_gpus; d++ ) {
            magma_setdevice(d);
            for( j=0; j<2; j++ ) {
                cudaMalloc((void**)&d_dinvA[d][j], nb*nb*sizeof(double));
                cudaMalloc((void**)&d_x[d][j],      n*nb*sizeof(double));
                cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(double));
                cudaMemset(d_x[d][j],     0,  n*nb*sizeof(double));
            }
        }
        magma_setdevice(0);
#endif

        for (j=0; j<m; j+=nb) {

            /* Set the GPU number that holds the current panel */
            id  = (j/nb)%num_gpus;
            buf = (j/nb)%num_gpus;

            /* Set the local index where the current panel is */
            j_local = j/(nb*num_gpus);
            jb = min(nb, (m-j));

            /* Update the current diagonal block on stream1 */
            magma_setdevice(id);
            if( j > 0 ) {
                magmablasSetKernelStream(stream[id][stream1]);
                trace_gpu_start( id, stream1, "syrk", "syrk" );
                magma_dsyrk(MagmaUpper, MagmaTrans, jb, j,
                            d_neg_one, dlA(id, 0, nb*j_local), ldda,
                            d_one,     dlA(id, j, nb*j_local), ldda);
                trace_gpu_end( id, stream1 );
            }

            /* send the diagonal to cpu on stream1 */
            trace_gpu_start( id, stream1, "comm", "D to CPU" );
            magma_dgetmatrix_async( jb, jb,
                                    dlA(id, j, nb*j_local), ldda,
                                    Aup(j,j),               lda,
                                    stream[id][stream1] );
            trace_gpu_end( id, stream1 );

            /* update off-diagonal blocks in the panel */
            if( j > 0 ) {
                /* visit devices starting with the owner of the next panel */
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    j_local2 = j_local+1;
                    if( d > id ) j_local2 --;
                    nb0 = nb*j_local2;

                    if( n_local[d] > nb0 ) {
                        magma_setdevice(d);
                        magmablasSetKernelStream(stream[d][stream2]);
                        if( d == id ) {
                            dlpanel = dlA(d, 0, nb*j_local);
                            ldpanel = ldda;
                        } else {
                            dlpanel = dlP(d, jb, 0, buf);
                            ldpanel = lddp;
                            magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu
                        }
                        trace_gpu_start( d, stream2, "gemm", "gemm" );
                        magma_dgemm(MagmaTrans, MagmaNoTrans,
                                    jb, n_local[d]-nb0, j,
                                    c_neg_one, dlpanel,        ldpanel,
                                               dlA(d, 0, nb0), ldda,
                                    c_one,     dlA(d, j, nb0), ldda);
                        trace_gpu_end( d, stream2 );
                        magma_event_record( event[d][2], stream[d][stream2] );
                    }
                    d = (d+1)%num_gpus;
                }
            }

            /* wait for panel and factorize it on cpu */
            magma_setdevice(id);
            magma_queue_sync( stream[id][stream1] );
            /* NOTE(review): the trace label says "getrf" although the call below is dpotrf */
            trace_cpu_start( 0, "getrf", "getrf" );
            lapackf77_dpotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info);
            trace_cpu_end( 0 );
            if (*info != 0) {
                /* make the failing minor index global before leaving the loop */
                *info = *info + j;
                break;
            }

            /* send the diagonal to gpus on stream1 */
            if ( (j+jb) < n) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    if( d == id ) {
                        dlpanel = dlA(d, j, nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlP(d, 0, 0, buf);
                        ldpanel = lddp;
                    }
                    magma_setdevice(d);
                    trace_gpu_start( d, stream1, "comm", "comm" );
                    magma_dsetmatrix_async( jb, jb,
                                            Aup(j,j), lda,
                                            dlpanel,  ldpanel,
                                            stream[d][stream1] );
                    trace_gpu_end( d, stream1 );
                    magma_event_record( event[d][1], stream[d][stream1] );
                    d = (d+1)%num_gpus;
                }
            } else {
                /* last block column: only the owner needs the factor back */
                magma_setdevice(id);
                trace_gpu_start( id, stream1, "comm", "comm" );
                magma_dsetmatrix_async( jb, jb,
                                        Aup(j,j),               lda,
                                        dlA(id, j, nb*j_local), ldda,
                                        stream[id][stream1] );
                trace_gpu_end( id, stream1 );
            }

            /* panel-factorize the off-diagonal */
            if ( (j+jb) < n) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    /* next column */
                    j_local2 = j_local+1;
                    if( d > id ) j_local2--;
                    if( d == id ) {
                        dlpanel = dlA(d,j,nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlP(d, 0, 0, buf);
                        ldpanel = lddp;
                    }
                    nb2 = n_local[d] - j_local2*nb;
                    nb0 = min(nb, nb2);

                    magma_setdevice(d);
                    //magma_queue_sync( stream[d][stream1] );  // synch on chol for remaining update
                    //magma_queue_sync( stream[d][stream2] );
                    if( j+jb < m && d == (j/nb+1)%num_gpus ) {
                        /* owns the next column, look-ahead next block on stream1 */
                        magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for gemm update
                        magmablasSetKernelStream(stream[d][stream1]);
                        trace_gpu_start( d, stream1, "trsm", "trsm" );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
                        magmablas_dtrsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                                              jb, nb0, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, j, nb*j_local2), ldda,
                                              d_dinvA[d][0], d_x[d][0] );
#else
                        magma_dtrsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                                     jb, nb0, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, j, nb*j_local2), ldda);
#endif
                        magma_event_record( event[d][4], stream[d][stream1] );
                        trace_gpu_end( d, stream1 );
                    } else if( nb2 > 0 ) {
                        /* update all the blocks on stream2 */
                        magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for cholesky factor
                        trace_gpu_start( d, stream2, "trsm", "trsm" );
                        magmablasSetKernelStream(stream[d][stream2]);
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
                        magmablas_dtrsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                                              jb, nb2, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, j, nb*j_local2), ldda,
                                              d_dinvA[d][1], d_x[d][1] );
#else
                        magma_dtrsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, j, nb*j_local2), ldda);
#endif
                        trace_gpu_end( d, stream2 );
                    }
                    d = (d+1)%num_gpus;
                } /* end of for */

                /* ======================================================================================== */

                d = (j/nb+1)%num_gpus;
                /* next column */
                j_local2 = j_local+1;
                if( d > id ) j_local2--;
                nb0 = min(nb, n_local[d]-nb*j_local2 );

                /* even on 1 gpu, off-diagonals are copied to cpu (synchronize at the end).      *
                 * so we have the Cholesky factor, but only diagonal submatrix of the big panel, *
                 * on cpu at the end.                                                            */
                if( j+jb < m ) {
                    int d2, id2, j2, buf2;
                    magma_setdevice(d);
                    /* make sure all the previous sets are done */
                    if( h < num_gpus ) {
                        /* > offdiagonal */
                        for( d2=0; d2<num_gpus; d2++ ) {
                            j2 = j - (1+d2)*nb;
                            if( j2 < 0 ) break;
                            id2 = (j2/nb)%num_gpus;
                            magma_queue_wait_event( stream[d][stream3], event[id2][0] );
                        }

                        /* > diagonal */
                        for( d2=0; d2<num_gpus; d2++ ) {
                            j2 = j - d2*nb;
                            if( j2 < 0 ) break;
                            id2 = (j2/nb)%num_gpus;
                            magma_queue_wait_event( stream[d][stream3], event[id2][1] );
                        }
                    }
                    /* lookahead */
                    magma_queue_wait_event( stream[d][stream3], event[d][4] );

                    trace_gpu_start( d, stream3, "comm", "row to CPU" );
                    magma_dgetmatrix_async( (j+jb), nb0,
                                            dlA(d, 0, nb*j_local2), ldda,
                                            Aup(0,j+jb),            lda,
                                            stream[d][stream3] );
                    trace_gpu_end( d, stream3 );
                    magma_event_record( event[d][3], stream[d][stream3] );
                    /* needed on pluto */
                    magma_queue_sync( stream[d][stream3] );

                    /* wait for the off-diagonal on cpu */
                    //magma_setdevice(id);
                    //magma_queue_sync( stream[id][stream3] );

                    /* broadcast rows to gpus on stream2 */
                    buf2 = ((j+jb)/nb)%num_gpus;
                    for( d2=0; d2<num_gpus; d2++ ) {
                        if( d2 != d )
                        {
                            magma_setdevice(d2);
                            trace_gpu_start( d2, stream3, "comm", "row to GPUs" );
                            magma_queue_wait_event( stream[d2][stream3], event[d][3] ); // rows arrived at cpu on stream3
                            magma_dsetmatrix_async( j+jb, nb0,
                                                    Aup(0,j+jb),         lda,
                                                    dlP(d2,nb0,0,buf2),  lddp,
                                                    stream[d2][stream3] );
                            trace_gpu_end( d2, stream3 );
                            magma_event_record( event[d2][0], stream[d2][stream3] );
                        }
                    }
                }

                /* ======================================================================================== */
                /* gpu owning the next column                    */
                /* after look ahead, update the remaining blocks */
                if( j+jb < m ) /* no update on the last block column */
                {
                    d = (j/nb+1)%num_gpus;
                    /* next column */
                    j_local2 = j_local+1;
                    if( d > id ) j_local2--;
                    if( d == id ) {
                        dlpanel = dlA(d, j, nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlP(d, 0, 0, buf);
                        ldpanel = lddp;
                    }
                    nb0 = min(nb, n_local[d]-nb*j_local2 );
                    nb2 = n_local[d]-nb*j_local2 - nb0;

                    /* update the remaining blocks */
                    if( nb2 > 0 ) {
                        magma_setdevice(d);
                        magmablasSetKernelStream(stream[d][stream2]);
                        magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for cholesky factor
                        trace_gpu_start( d, stream2, "trsm", "trsm" );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
                        magmablas_dtrsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                                              jb, nb2, c_one,
                                              dlpanel,                    ldpanel,
                                              dlA(d, j, nb*j_local2+nb0), ldda,
                                              d_dinvA[d][1], d_x[d][1] );
#else
                        magma_dtrsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel,                    ldpanel,
                                     dlA(d, j, nb*j_local2+nb0), ldda);
#endif
                        trace_gpu_end( d, stream2 );
                    }
                }
            } /* end of dtrsm */
        } /* end of for j=1, .., n */
    } else {
        /* ---------------------------------------------- */
        /* Lower-triangular case                          */
        /* > Compute the Cholesky factorization A = L*L'. */
        /* ---------------------------------------------- */

#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
        /*
         * Allocate device memory for the inversed diagonal blocks, size=N*BLOCK_SIZE
         */
        for( d=0; d<num_gpus; d++ ) {
            magma_setdevice(d);
            for( j=0; j<2; j++ ) {
                cudaMalloc((void**)&d_dinvA[d][j], nb*nb*sizeof(double));
                cudaMalloc((void**)&d_x[d][j],     nb*m *sizeof(double));
                cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(double));
                cudaMemset(d_x[d][j],     0, nb* m*sizeof(double));
            }
        }
        magma_setdevice(0);
#endif

        for (j=0; j<n; j+=nb) {

            /* Set the GPU number that holds the current panel */
            id  = (j/nb)%num_gpus;
            buf = (j/nb)%num_gpus;

            /* Set the local index where the current panel is */
            j_local = j/(nb*num_gpus);
            jb = min(nb, (n-j));

            /* Update the current diagonal block on stream1 */
            magma_setdevice(id);
            if( j > 0 ) {
                magmablasSetKernelStream(stream[id][stream1]);
                magma_dsyrk(MagmaLower, MagmaNoTrans, jb, j,
                            d_neg_one, dlA(id, nb*j_local, 0), ldda,
                            d_one,     dlA(id, nb*j_local, j), ldda);
            }

            /* send the diagonal to cpu on stream1 */
            magma_dgetmatrix_async( jb, jb,
                                    dlA(id, nb*j_local, j), ldda,
                                    Alo(j,j),               lda,
                                    stream[id][stream1] );

            /* update off-diagonal blocks of the panel */
            if( j > 0 ) {
                /* visit devices starting with the owner of the next panel */
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    j_local2 = j_local+1;
                    if( d > id ) j_local2 --;
                    nb0 = nb*j_local2;

                    if( nb0 < n_local[d] ) {
                        magma_setdevice(d);
                        magmablasSetKernelStream(stream[d][stream2]);
                        if( d == id ) {
                            dlpanel = dlA(d, nb*j_local, 0);
                            ldpanel = ldda;
                        } else {
                            dlpanel = dlPT(d,0,jb,buf);
                            ldpanel = nb;
                            magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu
                        }
                        magma_dgemm( MagmaNoTrans, MagmaTrans,
                                     n_local[d]-nb0, jb, j,
                                     c_neg_one, dlA(d, nb0, 0), ldda,
                                                dlpanel,        ldpanel,
                                     c_one,     dlA(d, nb0, j), ldda);
                        magma_event_record( event[d][2], stream[d][stream2] );
                    }
                    d = (d+1)%num_gpus;
                }
            }

            /* wait for the panel and factorized it on cpu */
            magma_setdevice(id);
            magma_queue_sync( stream[id][stream1] );
            lapackf77_dpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info);
            if (*info != 0) {
                /* make the failing minor index global before leaving the loop */
                *info = *info + j;
                break;
            }

            /* send the diagonal to gpus on stream1 */
            if ( (j+jb) < m) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    if( d == id ) {
                        dlpanel = dlA(d, nb*j_local, j);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlPT(d, 0, 0, buf);
                        ldpanel = nb;
                    }
                    magma_setdevice(d);
                    magma_dsetmatrix_async( jb, jb,
                                            Alo(j,j), lda,
                                            dlpanel,  ldpanel,
                                            stream[d][stream1] );
                    magma_event_record( event[d][1], stream[d][stream1] );
                    d = (d+1)%num_gpus;
                }
            } else {
                /* last block row: only the owner needs the factor back */
                magma_setdevice(id);
                magma_dsetmatrix_async( jb, jb,
                                        Alo(j,j),               lda,
                                        dlA(id, nb*j_local, j), ldda,
                                        stream[id][stream1] );
            }

            /* panel factorize the off-diagonal */
            if ( (j+jb) < m) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    /* next column */
                    j_local2 = j_local+1;
                    if( d > id ) j_local2--;
                    if( d == id ) {
                        dlpanel = dlA(d, nb*j_local, j);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlPT(d, 0, 0, buf);
                        ldpanel = nb;
                    }
                    nb2 = n_local[d] - j_local2*nb;
                    nb0 = min(nb, nb2 );

                    magma_setdevice(d);
                    if( j+nb < n && d == (j/nb+1)%num_gpus ) {
                        /* owns next column, look-ahead next block on stream1 */
                        magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for gemm update
                        magmablasSetKernelStream(stream[d][stream1]);
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
                        magmablas_dtrsm_work( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit,
                                              nb0, jb, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, nb*j_local2, j), ldda,
                                              d_dinvA[d][0], d_x[d][0] );
#else
                        magma_dtrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit,
                                     nb0, jb, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, nb*j_local2, j), ldda);
#endif
                        magma_event_record( event[d][4], stream[d][stream1] );
                    } else if( nb2 > 0 ) {
                        /* other gpus updating all the blocks on stream2 */
                        /* update the entire column */
                        magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for the cholesky factor
                        magmablasSetKernelStream(stream[d][stream2]);
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
                        magmablas_dtrsm_work( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit,
                                              nb2, jb, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, nb*j_local2, j), ldda,
                                              d_dinvA[d][1], d_x[d][1] );
#else
                        magma_dtrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit,
                                     nb2, jb, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, nb*j_local2, j), ldda);
#endif
                    }
                    d = (d+1)%num_gpus;
                } /* end for d */

                /* ======================================================================================== */

                d = (j/nb+1)%num_gpus;
                /* next column */
                j_local2 = j_local+1;
                if( d > id ) j_local2--;
                nb0 = min(nb, n_local[d]-nb*j_local2 );

                /* even on 1 gpu, we copy off-diagonal to cpu (but don't synchronize).  */
                /* so we have the Cholesky factor on cpu at the end.                    */
                if( j+jb < n ) {
                    int d2, id2, j2, buf2;
                    magma_setdevice(d);
                    /* make sure all the previous sets are done */
                    if( h < num_gpus ) {
                        /* > offdiagonal */
                        for( d2=0; d2<num_gpus; d2++ ) {
                            j2 = j - (1+d2)*nb;
                            if( j2 < 0 ) break;
                            id2 = (j2/nb)%num_gpus;
                            magma_queue_wait_event( stream[d][stream3], event[id2][0] );
                        }

                        /* > diagonal */
                        for( d2=0; d2<num_gpus; d2++ ) {
                            j2 = j - d2*nb;
                            if( j2 < 0 ) break;
                            id2 = (j2/nb)%num_gpus;
                            magma_queue_wait_event( stream[d][stream3], event[id2][1] );
                        }
                    }
                    // lookahead done
                    magma_queue_wait_event( stream[d][stream3], event[d][4] );

                    magma_dgetmatrix_async( nb0, j+jb,
                                            dlA(d, nb*j_local2, 0), ldda,
                                            Alo(j+jb,0),            lda,
                                            stream[d][stream3] );
                    magma_event_record( event[d][3], stream[d][stream3] );
                    /* syn on rows on CPU, seem to be needed on Pluto */
                    magma_queue_sync( stream[d][stream3] );

                    /* broadcast the rows to gpus */
                    buf2 = ((j+jb)/nb)%num_gpus;
                    for( d2=0; d2<num_gpus; d2++ ) {
                        if( d2 != d )
                        {
                            magma_setdevice(d2);
                            magma_queue_wait_event( stream[d2][stream3], event[d][3] ); // getmatrix done
                            magma_dsetmatrix_async( nb0, j+jb,
                                                    Alo(j+jb,0),         lda,
                                                    dlPT(d2,0,nb0,buf2), nb,
                                                    stream[d2][stream3] );
                            magma_event_record( event[d2][0], stream[d2][stream3] );
                        }
                    }
                }

                /* ======================================================================================== */
                /* gpu owing the next column updates remaining blocks on stream2 */
                if( j+nb < n ) { // no lookahead on the last block column
                    d = (j/nb+1)%num_gpus;

                    /* next column */
                    j_local2 = j_local+1;
                    if( d > id ) j_local2--;
                    if( d == id ) {
                        dlpanel = dlA(d, nb*j_local, j);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlPT(d,0,0,buf);
                        ldpanel = nb;
                    }
                    nb0 = min(nb, n_local[d]-nb*j_local2 );
                    nb2 = n_local[d] - j_local2*nb - nb0;

                    if( nb2 > 0 ) {
                        magma_setdevice(d);
                        magmablasSetKernelStream(stream[d][stream2]);
                        /* update the remaining blocks in the column */
                        magma_queue_wait_event( stream[d][stream2], event[d][1] ); // panel received
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
                        magmablas_dtrsm_work( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit,
                                              nb2, jb, c_one,
                                              dlpanel,                    ldpanel,
                                              dlA(d, nb*j_local2+nb0, j), ldda,
                                              d_dinvA[d][1], d_x[d][1] );
#else
                        magma_dtrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit,
                                     nb2, jb, c_one,
                                     dlpanel,                    ldpanel,
                                     dlA(d, nb*j_local2+nb0, j), ldda);
#endif
                    }
                }
            }
        }
    } /* end of else not upper */

    /* == finalize the trace == */
    trace_finalize( "dpotrf.svg","trace.css" );

    /* drain all three streams on every device before returning */
    for( d=0; d<num_gpus; d++ ) {
        magma_setdevice(d);
        magma_queue_sync( stream[d][0] );
        magma_queue_sync( stream[d][1] );
        magma_queue_sync( stream[d][2] );
        magmablasSetKernelStream(NULL);

        //magma_event_destroy( event0[d] );
        //magma_event_destroy( event1[d] );
        //magma_event_destroy( event2[d] );
        //magma_event_destroy( event3[d] );
        //magma_event_destroy( event4[d] );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
        for( j=0; j<2; j++ ) {
            magma_free( d_dinvA[d][j] );
            magma_free( d_x[d][j] );
        }
#endif
    }
    /* NOTE(review): device 0 is selected on exit, not the caller's original device */
    magma_setdevice(0);

    return *info;
} /* magma_dpotrf_mgpu */
extern "C" magma_int_t
magma_zpotrf2_mgpu(
    int num_gpus, magma_uplo_t uplo, magma_int_t m, magma_int_t n,
    magma_int_t off_i, magma_int_t off_j, magma_int_t nb,
    magmaDoubleComplex_ptr *d_lA, size_t d_lA_offset, magma_int_t ldda,
    magmaDoubleComplex_ptr *d_lP, magma_int_t lddp,
    magmaDoubleComplex *a, magma_int_t lda, magma_int_t h,
    magma_queue_t *queues,
    magma_int_t *info )
{
/*  -- clMAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date November 2014

    Purpose
    =======
    ZPOTRF computes the Cholesky factorization of a complex Hermitian
    positive definite matrix dA distributed over several devices.

    The factorization has the form
       dA = U**H * U,  if UPLO = MagmaUpper, or
       dA = L  * L**H, if UPLO = MagmaLower,
    where U is an upper triangular matrix and L is lower triangular.

    This is the block version of the algorithm, calling Level 3 BLAS.
    Only the lower-triangular case is implemented; see INFO.

    Arguments
    =========
    NUM_GPUS (input) INTEGER
            Number of devices.  Device d uses queues[2*d] for transfers
            and queues[2*d+1] for compute kernels.

    UPLO    (input) magma_uplo_t
            = MagmaUpper:  Upper triangle of dA is stored (NOT SUPPORTED);
            = MagmaLower:  Lower triangle of dA is stored.

    M, N    (input) INTEGER
            Dimensions of the matrix; the panel loop runs over N and the
            per-device row counts are derived from M.  N >= 0.

    NB      (input) INTEGER
            Block size.

    dA      (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N)
            (passed as the per-device buffer array D_LA)
            On entry, the Hermitian matrix dA.  If UPLO = MagmaLower, the
            leading N-by-N lower triangular part of dA contains the lower
            triangular part of the matrix dA, and the strictly upper
            triangular part of dA is not referenced.

            On exit, if INFO = 0, the factor L from the Cholesky
            factorization dA = L * L**H.

    LDDA    (input) INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalescent memory accesses LDDA must be
            divisible by 16.

    D_LP    (input) per-device panel buffers, used with leading dimension NB.

    A, LDA  (input/output) host staging buffer and its leading dimension.

    OFF_I, OFF_J, D_LA_OFFSET, LDDP, H
            Not referenced directly in this body.
            NOTE(review): presumably consumed by the dlA / dlPT / Alo
            macros defined outside this block -- confirm.

    QUEUES  (input) array of 2*NUM_GPUS command queues.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value.
                  INFO = -1 is also returned for UPLO = MagmaUpper, which
                  is not implemented; nothing is factorized in that case.
            > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.
    =====================================================================   */

    magma_int_t     j, jb, nb0, nb2, dd, d, id, j_local, j_local2, buf;
    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    double          d_one     =  1.0;
    double          d_neg_one = -1.0;
    magmaDoubleComplex_ptr dlpanel;
    size_t dlpanel_offset;
    magma_int_t n_local[MagmaMaxGPUs], ldpanel;
    magma_event_t* events[MagmaMaxGPUs][2];

    *info = 0;
    if ( (uplo != MagmaUpper) && (uplo != MagmaLower) ) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if ((uplo != MagmaUpper) && num_gpus*ldda < max(1,n)) {
        *info = -4;
    } else if ((uplo == MagmaUpper) && ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    if (uplo == MagmaUpper) {
        /* The upper case is not implemented.  Report it as an illegal uplo
           instead of returning success with an unfactored matrix; nothing
           has been enqueued or traced yet, so return immediately. */
        printf("support Lower case only!\n");
        *info = -1;
        return *info;
    }

    /* local-n and local-ld: full blocks are dealt round-robin, the device
       following the last full block receives the remainder */
    for( d=0; d<num_gpus; d++ ) {
        n_local[d] = ((m/nb)/num_gpus)*nb;
        if (d < (m/nb)%num_gpus)
            n_local[d] += nb;
        else if (d == (m/nb)%num_gpus)
            n_local[d] += m%nb;
    }

    trace_init(1, num_gpus, 2, queues);

    /* -------------------------------------------- */
    /* Lower-triangular case                        */
    /* Compute the Cholesky factorization A = L*L'. */
    /* -------------------------------------------- */
    for (j=0; j<n; j+=nb) {

        /* Set the GPU number that holds the current panel */
        id  = (j/nb)%num_gpus;
        buf = (j/nb)%num_gpus;

        /* Set the local index where the current panel is */
        j_local = j/(nb*num_gpus);
        jb = min(nb, (n-j));

        if( j > 0 ) {
            /* wait for the column on CPU */
            magma_queue_sync( queues[id*2] );

            /* broadcast offdiagonal row to all gpus */
            d = (j/nb+1)%num_gpus;
            for( dd=0; dd<num_gpus; dd++ ) {
                if( d != id ) {
                    /* send it to GPU */
                    trace_gpu_start(d, 0, "comm_set", "set");
                    events[d][0] = trace_gpu_end(d, 0);
                    magma_zsetmatrix_async( jb, j,
                                            Alo(j,0), 0,       lda,
                                            dlPT(d,0,jb,buf),  nb,
                                            queues[d*2], events[d][0] );
                    clFlush(queues[d*2]);
                }
                d = (d+1)%num_gpus;
            }
        }

        /* Update the current diagonal block */
        if( j > 0 ) {
            trace_gpu_start(id, 1, "herk", "herk");
            events[id][1] = trace_gpu_end(id, 1);
            magma_zherk_trace(MagmaLower, MagmaNoTrans, jb, j,
                              d_neg_one, dlA(id, nb*j_local, 0), ldda,
                              d_one,     dlA(id, nb*j_local, j), ldda,
                              queues[id*2+1], events[id][1]);
            /* the herk must finish before the diagonal block is read back */
            clWaitForEvents(1, events[id][1]);
        }

        /* send the diagonal to cpu */
        trace_gpu_start(id, 0, "comm_get", "get");
        events[id][0] = trace_gpu_end(id, 0);
        magma_zgetmatrix_async( jb, jb,
                                dlA(id, nb*j_local, j), ldda,
                                Alo(j,j), 0,            lda,
                                queues[id*2], events[id][0] );
        clFlush(queues[id*2]);

        /* update the offdiagonal blocks */
        if ( j > 0 ) {
            /* compute the block-rows of the panel */
            d = (j/nb+1)%num_gpus;
            for( dd=0; dd<num_gpus; dd++ ) {
                j_local2 = j_local+1;
                if( d > id ) j_local2 --;
                nb0 = nb*j_local2;

                if( nb0 < n_local[d] ) {
                    if( d != id ) {
                        dlpanel = d_lP[d];
                        dlpanel_offset = dlPT_offset(0, jb, buf);
                        ldpanel = nb;

                        /* wait for offdiagonal row */
                        magma_queue_sync(queues[d*2]);
                    } else {
                        dlpanel = d_lA[d];
                        dlpanel_offset = dlA_offset(nb*j_local, 0);
                        ldpanel = ldda;
                    }

                    trace_gpu_start(d, 1, "gemm", "gemm");
                    events[d][1] = trace_gpu_end(d, 1);
                    magma_zgemm_trace( MagmaNoTrans, MagmaConjTrans,
                                       n_local[d]-nb0, jb, j,
                                       c_neg_one, dlA(d, nb0, 0), ldda,
                                                  dlpanel, dlpanel_offset, ldpanel,
                                       c_one,     dlA(d, nb0, j), ldda,
                                       queues[d*2+1], events[d][1]);
                }
                d = (d+1)%num_gpus;
            }
        }

        /* factor the diagonal */
        magma_queue_sync( queues[id*2] );
        trace_cpu_start(0, "potrf", "potrf");
        lapackf77_zpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info);
        trace_cpu_end(0);
        if (*info != 0) {
            /* magma_int_t need not be int; cast to match the %d conversion */
            printf("row number: %d\n", (int) j);
            *info = *info + j;
            break;
        }

        /* send the diagonal to gpus */
        if ( (j+jb) < m ) {
            d = (j/nb+1)%num_gpus;
            for( dd=0; dd<num_gpus; dd++ ) {
                if( d == id ) {
                    dlpanel = d_lA[d];
                    dlpanel_offset = dlA_offset(nb*j_local, j);
                    ldpanel = ldda;
                } else {
                    dlpanel = d_lP[d];
                    dlpanel_offset = dlPT_offset(0, 0, buf);
                    ldpanel = nb;
                }
                trace_gpu_start(d, 0, "comm_set", "set");
                events[d][0] = trace_gpu_end(d, 0);
                magma_zsetmatrix_async( jb, jb,
                                        Alo(j,j), 0, lda,
                                        dlpanel, dlpanel_offset, ldpanel,
                                        queues[d*2], events[d][0] );
                clFlush(queues[d*2]);
                d = (d+1)%num_gpus;
            }
        } else {
            /* last block row: only the owner needs the factor back */
            trace_gpu_start(id, 0, "comm_set", "set");
            events[id][0] = trace_gpu_end(id, 0);
            magma_zsetmatrix_async( jb, jb,
                                    Alo(j,j), 0,            lda,
                                    dlA(id, nb*j_local, j), ldda,
                                    queues[id*2], events[id][0] );
            clFlush(queues[id*2]);
        }

        /* factorize off-diagonal blocks */
        if ( (j+jb) < m ) {
            d = (j/nb+1)%num_gpus;
            for( dd=0; dd<num_gpus; dd++ ) {
                /* next column */
                j_local2 = j_local+1;
                if( d > id ) j_local2--;
                if( d == id ) {
                    dlpanel = d_lA[d];
                    dlpanel_offset = dlA_offset(nb*j_local, j);
                    ldpanel = ldda;
                } else {
                    dlpanel = d_lP[d];
                    dlpanel_offset = dlPT_offset(0, 0, buf);
                    ldpanel = nb;
                }
                nb2 = n_local[d] - j_local2*nb;
                nb0 = min(nb, nb2 );

                magma_queue_sync(queues[d*2]);  // wait for the diagonal
                if( j+jb < n && d == (j/nb+1)%num_gpus ) {
                    /* owns the next column, look-ahead the column */
                    trace_gpu_start(d, 1, "trsm", "trsm");
                    events[d][1] = trace_gpu_end(d, 1);
                    magma_ztrsm_trace( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                       nb0, jb, c_one,
                                       dlpanel, dlpanel_offset, ldpanel,
                                       dlA(d, nb*j_local2, j), ldda,
                                       queues[d*2+1], events[d][1]);

                    /* send the column to cpu */
                    if( j+jb < n ) {
                        magma_queue_sync( queues[d*2+1] );  // wait for lookahead
                        trace_gpu_start(d, 0, "comm_get", "get");
                        events[d][0] = trace_gpu_end(d, 0);
                        magma_zgetmatrix_async( nb0, j+jb,
                                                dlA(d, nb*j_local2, 0), ldda,
                                                Alo(j+jb,0), 0,         lda,
                                                queues[d*2], events[d][0]);
                        clFlush(queues[d*2]);
                    }

                    /* update the remaining blocks; skip the call when the
                       look-ahead block was the device's last one */
                    nb2 = nb2 - nb0;
                    if( nb2 > 0 ) {
                        trace_gpu_start(d, 1, "trsm", "trsm");
                        events[d][1] = trace_gpu_end(d, 1);
                        magma_ztrsm_trace( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                           nb2, jb, c_one,
                                           dlpanel, dlpanel_offset, ldpanel,
                                           dlA(d, nb*j_local2+nb0, j), ldda,
                                           queues[d*2+1], events[d][1]);
                    }
                } else if( nb2 > 0 ) {
                    /* update the entire trailing matrix */
                    trace_gpu_start(d, 1, "trsm", "trsm");
                    events[d][1] = trace_gpu_end(d, 1);
                    magma_ztrsm_trace( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                       nb2, jb, c_one,
                                       dlpanel, dlpanel_offset, ldpanel,
                                       dlA(d, nb*j_local2, j), ldda,
                                       queues[d*2+1], events[d][1]);
                }
                d = (d+1)%num_gpus;
            }
        }
    }

    /* clean up: drain both queues of every device */
    for( d=0; d<num_gpus; d++ ) {
        magma_queue_sync( queues[d*2] );
        magma_queue_sync( queues[d*2+1] );
    }
    trace_finalize("dpotrf2_mgpu.svg", "trace.css");

    return *info;
} /* magma_zpotrf_mgpu */
extern "C" magma_int_t magma_ssytrd_sy2sb( char uplo, magma_int_t n, magma_int_t nb, float *a, magma_int_t lda, float *tau, float *work, magma_int_t lwork, float *dT, magma_int_t threads, magma_int_t *info) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= SSYTRD_HE2HB reduces a real symmetric matrix A to real symmetric band-diagonal form T by an orthogonal similarity transformation: Q**T * A * Q = T. This version stores the triangular matrices T used in the accumulated Householder transformations (I - V T V'). Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) REAL array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if UPLO = 'U', the Upper band-diagonal of A is overwritten by the corresponding elements of the band-diagonal matrix T, and the elements above the band diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; if UPLO = 'L', the the Lower band-diagonal of A is overwritten by the corresponding elements of the band-diagonal matrix T, and the elements below the band-diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). TAU (output) REAL array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). 
WORK (workspace/output) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= 1. For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. dT (output) REAL array on the GPU, dimension N*NB, where NB is the optimal blocksize. On exit dT holds the upper triangular matrices T from the accumulated Householder transformations (I - V T V') used in the factorization. The nb x nb matrices T are ordered consecutively in memory one after another. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value Further Details =============== If UPLO = 'U', the matrix Q is represented as a product of elementary reflectors Q = H(n-1) . . . H(2) H(1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in A(1:i-1,i+1), and tau in TAU(i). If UPLO = 'L', the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(n-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i), and tau in TAU(i). The contents of A on exit are illustrated by the following examples with n = 5: if UPLO = 'U': if UPLO = 'L': ( d e v2 v3 v4 ) ( d ) ( d e v3 v4 ) ( e d ) ( d e v4 ) ( v1 e d ) ( d e ) ( v1 v2 e d ) ( d ) ( v1 v2 v3 e d ) where d and e denote diagonal and off-diagonal elements of T, and vi denotes an element of the vector defining H(i). 
===================================================================== */ #define a_ref(a_1,a_2) ( a + ((a_2)-1)*( lda) + (a_1)-1) #define da_ref(a_1,a_2) (da + ((a_2)-1)*(ldda) + (a_1)-1) #define tau_ref(a_1) (tau + (a_1)-1) #define t_ref(a_1) (dT + ((a_1)-1)*(lddt)) char uplo_[2] = {uplo, 0}; int ldda = ((n+31)/32)*32; int lddt = nb; float c_neg_one = MAGMA_S_NEG_ONE; float c_neg_half = MAGMA_S_NEG_HALF; float c_one = MAGMA_S_ONE ; float c_zero = MAGMA_S_ZERO; float d_one = MAGMA_D_ONE; magma_int_t pm, pn, indi, indj, pk; magma_int_t pm_old=0, pn_old=0, indi_old=0, indj_old=0; int i; int lwkopt; int lquery; *info = 0; int upper = lapackf77_lsame(uplo_, "U"); lquery = lwork == -1; if (! upper && ! lapackf77_lsame(uplo_, "L")) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } else if (lwork < 1 && ! lquery) { *info = -9; } if (*info == 0) { /* Determine the block size. */ lwkopt = n * nb; MAGMA_S_SET2REAL( work[0], lwkopt ); } if (*info != 0) return *info; else if (lquery) return *info; /* Quick return if possible */ if (n == 0) { work[0] = c_one; return *info; } float *da; if (MAGMA_SUCCESS != magma_smalloc( &da, (n + 2*nb)*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_int_t mklth = min(threads,12); #if defined(USEMKL) mkl_set_num_threads(mklth); #endif #if defined(USEACML) omp_set_num_threads(mklth); #endif /* Use the first panel of da as work space */ float *dwork = da+n*ldda; float *dW = dwork + nb*ldda; #ifdef TRACING char buf[80]; #endif cudaStream_t stream[3]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); stream[2] = 0; // default stream trace_init( 1, 1, 3, stream ); float *hT = work + lwork - nb*nb; lwork -= nb*nb; memset( hT, 0, nb*nb*sizeof(float)); magmablasSetKernelStream( stream[0] ); cudaEvent_t Pupdate_event; cudaEventCreateWithFlags(&Pupdate_event,cudaEventDisableTiming); //cudaEventCreate(&Pupdate_event); if (upper) { printf("SSYTRD_HE2HB is not yet implemented for upper 
matrix storage. Exit.\n"); exit(1); }else { /* Copy the matrix to the GPU */ if (1 <= n-nb){ trace_gpu_start( 0, 0, "set", "set A" ); magma_ssetmatrix_async( (n-nb), (n-nb), a_ref(nb+1, nb+1), lda, da_ref(nb+1, nb+1), ldda, stream[0] ); trace_gpu_end( 0, 0 ); } /* Reduce the lower triangle of A */ for (i = 1; i <= n-nb; i += nb) { indi = i+nb; indj = i; pm = n - i - nb + 1; //pn = min(i+nb-1, n-nb) -i + 1; pn = nb; /* Get the current panel (no need for the 1st iteration) */ if (i > 1 ){ // spanel_to_q copy the upper oof diagonal part of // the matrix to work to be restored later. acctually // the zero's and one's putted are not used this is only // because we don't have a function that copy only the // upper part of A to be restored after copying the // lookahead panel that has been computted from GPU to CPU. spanel_to_q(MagmaUpper, pn-1, a_ref(i, i+1), lda, work); trace_gpu_start( 0, 1, "get", "get panel" ); //magma_queue_sync( stream[0] ); cudaStreamWaitEvent(stream[1], Pupdate_event, 0); magma_sgetmatrix_async( (pm+pn), pn, da_ref( i, i), ldda, a_ref ( i, i), lda, stream[1] ); trace_gpu_end( 0, 1 ); trace_gpu_start( 0, 2, "syr2k", "syr2k" ); magma_ssyr2k(MagmaLower, MagmaNoTrans, pm_old-pn_old, pn_old, c_neg_one, da_ref(indi_old+pn_old, indj_old), ldda, dW + pn_old , pm_old, d_one, da_ref(indi_old+pn_old, indi_old+pn_old), ldda); trace_gpu_end( 0, 2 ); trace_cpu_start( 0, "sync", "sync on 1" ); magma_queue_sync( stream[1] ); trace_cpu_end( 0 ); sq_to_panel(MagmaUpper, pn-1, a_ref(i, i+1), lda, work); } /* ========================================================== QR factorization on a panel starting nb off of the diagonal. Prepare the V and T matrices. 
========================================================== */ #ifdef TRACING snprintf( buf, sizeof(buf), "panel %d", i ); #endif trace_cpu_start( 0, "geqrf", buf ); lapackf77_sgeqrf(&pm, &pn, a_ref(indi, indj), &lda, tau_ref(i), work, &lwork, info); /* Form the matrix T */ pk=min(pm,pn); lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr, &pm, &pk, a_ref(indi, indj), &lda, tau_ref(i), hT, &nb); /* Prepare V - put 0s in the upper triangular part of the panel (and 1s on the diagonal), temporaly storing the original in work */ spanel_to_q(MagmaUpper, pk, a_ref(indi, indj), lda, work); trace_cpu_end( 0 ); /* Send V from the CPU to the GPU */ trace_gpu_start( 0, 0, "set", "set V and T" ); magma_ssetmatrix_async( pm, pk, a_ref(indi, indj), lda, da_ref(indi, indj), ldda, stream[0] ); /* Send the triangular factor T to the GPU */ magma_ssetmatrix_async( pk, pk, hT, nb, t_ref(i), lddt, stream[0] ); trace_gpu_end( 0, 0 ); /* ========================================================== Compute W: 1. X = A (V T) 2. W = X - 0.5* V * (T' * (V' * X)) ========================================================== */ /* dwork = V T */ trace_cpu_start( 0, "sync", "sync on 0" ); // this sync is done here to be sure that the copy has been finished // because below we made a restore sq_to_panel and this restore need // to ensure that the copy has been finished. we did it here to allow // overlapp of restore with next gemm and symm. magma_queue_sync( stream[0] ); trace_cpu_end( 0 ); trace_gpu_start( 0, 2, "gemm", "work = V*T" ); magma_sgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk, c_one, da_ref(indi, indj), ldda, t_ref(i), lddt, c_zero, dwork, pm); trace_gpu_end( 0, 2 ); /* dW = X = A*V*T. 
dW = A*dwork */ trace_gpu_start( 0, 2, "symm", "X = A*work" ); magma_ssymm(MagmaLeft, uplo, pm, pk, c_one, da_ref(indi, indi), ldda, dwork, pm, c_zero, dW, pm); trace_gpu_end( 0, 2 ); /* restore the panel */ sq_to_panel(MagmaUpper, pk, a_ref(indi, indj), lda, work); /* dwork = V*T already ==> dwork' = T'*V' * compute T'*V'*X ==> dwork'*W ==> * dwork + pm*nb = ((T' * V') * X) = dwork' * X = dwork' * W */ trace_gpu_start( 0, 2, "gemm", "work = T'*V'*X" ); magma_sgemm(MagmaTrans, MagmaNoTrans, pk, pk, pm, c_one, dwork, pm, dW, pm, c_zero, dwork + pm*nb, nb); trace_gpu_end( 0, 2 ); /* W = X - 0.5 * V * T'*V'*X * = X - 0.5 * V * (dwork + pm*nb) = W - 0.5 * V * (dwork + pm*nb) */ trace_gpu_start( 0, 2, "gemm", "W = X - 0.5*V*(T'*V'*X)" ); magma_sgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk, c_neg_half, da_ref(indi, indj), ldda, dwork + pm*nb, nb, c_one, dW, pm); trace_gpu_end( 0, 2 ); /* ========================================================== Update the unreduced submatrix A(i+ib:n,i+ib:n), using an update of the form: A := A - V*W' - W*V' ========================================================== */ if (i + nb <= n-nb){ /* There would be next iteration; do lookahead - update the next panel */ trace_gpu_start( 0, 2, "gemm", "gemm 4 next panel left" ); magma_sgemm(MagmaNoTrans, MagmaTrans, pm, pn, pn, c_neg_one, da_ref(indi, indj), ldda, dW , pm, c_one, da_ref(indi, indi), ldda); trace_gpu_end( 0, 2 ); trace_gpu_start( 0, 2, "gemm", "gemm 5 next panel right" ); magma_sgemm(MagmaNoTrans, MagmaTrans, pm, pn, pn, c_neg_one, dW , pm, da_ref(indi, indj), ldda, c_one, da_ref(indi, indi), ldda); trace_gpu_end( 0, 2 ); cudaEventRecord(Pupdate_event, stream[0]); } else { /* no look-ahead as this is last iteration */ trace_gpu_start( 0, 2, "syr2k", "syr2k last iteration" ); magma_ssyr2k(MagmaLower, MagmaNoTrans, pk, pk, c_neg_one, da_ref(indi, indj), ldda, dW , pm, d_one, da_ref(indi, indi), ldda); trace_gpu_end( 0, 2 ); } indi_old = indi; indj_old = indj; pm_old = pm; pn_old = 
pn; } // end loop for(i) /* Send the last block to the CPU */ pk = min(pm,pn); if (1 <= n-nb){ spanel_to_q(MagmaUpper, pk-1, a_ref(n-pk+1, n-pk+2), lda, work); trace_gpu_start( 0, 2, "get", "get last block" ); magma_sgetmatrix( pk, pk, da_ref(n-pk+1, n-pk+1), ldda, a_ref(n-pk+1, n-pk+1), lda ); trace_gpu_end( 0, 2 ); sq_to_panel(MagmaUpper, pk-1, a_ref(n-pk+1, n-pk+2), lda, work); } }// end of LOWER trace_finalize( "ssytrd_sy2sb.svg", "trace.css" ); cudaEventDestroy(Pupdate_event); magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free( da ); MAGMA_S_SET2REAL( work[0], lwkopt ); magmablasSetKernelStream( 0 ); #if defined(USEMKL) mkl_set_num_threads(1); #endif #if defined(USEACML) omp_set_num_threads(1); #endif return *info; } /* ssytrd_sy2sb_ */
/**
    Purpose
    -------
    ZUNGQR_M generates an M-by-N COMPLEX_16 matrix Q with orthonormal columns,
    which is defined as the first N columns of a product of K elementary
    reflectors of order M

        Q  =  H(1) H(2) . . . H(k)

    as returned by ZGEQRF. This is the multi-GPU version: it uses all
    devices reported by magma_num_gpus(), with the columns of Q spread over
    the devices in blocks of NB columns.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix Q. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix Q. M >= N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines the
            matrix Q. N >= K >= 0.

    @param[in,out]
    A       COMPLEX_16 array A, dimension (LDA,N), on the CPU.
            On entry, the i-th column must contain the vector
            which defines the elementary reflector H(i), for
            i = 1,2,...,k, as returned by ZGEQRF_GPU in the
            first k columns of its array argument A.
            On exit, the M-by-N matrix Q.

    @param[in]
    lda     INTEGER
            The first dimension of the array A. LDA >= max(1,M).

    @param[in]
    tau     COMPLEX_16 array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by ZGEQRF_GPU.

    @param[in]
    T       COMPLEX_16 array, dimension (NB, min(M,N)).
            T contains the T matrices used in blocking the elementary
            reflectors H(i), e.g., this can be the 6th argument of
            magma_zgeqrf_gpu (except stored on the CPU, not the GPU).

    @param[in]
    nb      INTEGER
            This is the block size used in ZGEQRF_GPU, and correspondingly
            the size of the T matrices, used in the factorization, and
            stored in T.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument has an illegal value.
                  MAGMA_ERR_DEVICE_ALLOC or MAGMA_ERR_HOST_ALLOC is returned
                  when a workspace allocation fails.

    @ingroup magma_zgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zungqr_m(
    magma_int_t m, magma_int_t n, magma_int_t k,
    magmaDoubleComplex *A, magma_int_t lda,
    magmaDoubleComplex *tau,
    magmaDoubleComplex *T, magma_int_t nb,
    magma_int_t *info)
{
#define  A(i,j)   ( A    + (i) + (j)*lda )
#define dA(d,i,j) (dA[d] + (i) + (j)*ldda)
#define dT(d,i,j) (dT[d] + (i) + (j)*nb)

    magmaDoubleComplex c_zero = MAGMA_Z_ZERO;
    magmaDoubleComplex c_one  = MAGMA_Z_ONE;

    magma_int_t m_kk, n_kk, k_kk, mi;
    magma_int_t lwork, ldwork;
    magma_int_t i, ib, ki, kk, iinfo;
    /* Must start as NULL: CLEANUP frees it, and CLEANUP can be reached
       (device allocation failure) before the host buffer is allocated. */
    magmaDoubleComplex *work = NULL;

    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if ((n < 0) || (n > m)) {
        *info = -2;
    } else if ((k < 0) || (k > n)) {
        *info = -3;
    } else if (lda < max(1,m)) {
        *info = -5;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    if (n <= 0) {
        return *info;
    }

    magma_int_t di, dn;
    magma_int_t dpanel;

    magma_int_t ngpu = magma_num_gpus();

    /* remember the caller's device and kernel stream; both are restored
       before returning */
    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );

    // Allocate memory on GPUs for A and workspaces
    magma_int_t ldda    = ((m + 31) / 32) * 32;
    magma_int_t lddwork = ((n + 31) / 32) * 32;
    magma_int_t min_lblocks = (n / nb) / ngpu;  // min. blocks per gpu
    magma_int_t last_dev    = (n / nb) % ngpu;  // device with last block

    magma_int_t  nlocal[ MagmaMaxGPUs ] = { 0 };
    magmaDoubleComplex *dA[ MagmaMaxGPUs ] = { NULL };
    magmaDoubleComplex *dT[ MagmaMaxGPUs ] = { NULL };
    magmaDoubleComplex *dV[ MagmaMaxGPUs ] = { NULL };
    magmaDoubleComplex *dW[ MagmaMaxGPUs ] = { NULL };
    magma_queue_t stream[ MagmaMaxGPUs ] = { NULL };

    for( int d = 0; d < ngpu; ++d ) {
        // example with n = 75, nb = 10, ngpu = 3
        // min_lblocks = 2
        // last_dev    = 1
        // gpu 0: 3  blocks, cols:  0- 9, 30-39, 60-69
        // gpu 1: 2+ blocks, cols: 10-19, 40-49, 70-74 (partial)
        // gpu 2: 2  blocks, cols: 20-29, 50-59
        magma_setdevice( d );
        nlocal[d] = min_lblocks*nb;
        if ( d < last_dev ) {
            nlocal[d] += nb;
        }
        else if ( d == last_dev ) {
            nlocal[d] += (n % nb);
        }

        /* one allocation per device, carved into dA | dT | dV | dW */
        ldwork = nlocal[d]*ldda  // dA
               + nb*m            // dT
               + nb*ldda         // dV
               + nb*lddwork;     // dW
        if ( MAGMA_SUCCESS != magma_zmalloc( &dA[d], ldwork )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            goto CLEANUP;
        }
        dT[d] = dA[d] + nlocal[d]*ldda;
        dV[d] = dT[d] + nb*m;
        dW[d] = dV[d] + nb*ldda;

        magma_queue_create( &stream[d] );
    }

    trace_init( 1, ngpu, 1, stream );

    // first kk columns are handled by blocked method.
    // ki is start of 2nd-to-last block
    if ((nb > 1) && (nb < k)) {
        ki = (k - nb - 1) / nb * nb;
        kk = min(k, ki + nb);
    } else {
        ki = 0;
        kk = 0;
    }

    // Allocate CPU work space
    // n*nb for zungqr workspace
    lwork = n * nb;
    if ( MAGMA_SUCCESS != magma_zmalloc_cpu( &work, lwork ) || work == NULL ) {
        work = NULL;  // keep the free in CLEANUP well-defined
        *info = MAGMA_ERR_HOST_ALLOC;
        goto CLEANUP;
    }

    // Use unblocked code for the last or only block.
    if (kk < n) {
        trace_cpu_start( 0, "ungqr", "ungqr last block" );
        m_kk = m - kk;
        n_kk = n - kk;
        k_kk = k - kk;
        dpanel =  (kk / nb) % ngpu;
        di     = ((kk / nb) / ngpu) * nb;
        magma_setdevice( dpanel );

        lapackf77_zungqr( &m_kk, &n_kk, &k_kk,
                          A(kk, kk), &lda,
                          &tau[kk], work, &lwork, &iinfo );

        // NOTE(review): all n-kk trailing columns are placed on device
        // dpanel starting at local column di; confirm this matches the
        // block-cyclic layout when n-kk > nb and ngpu > 1.
        magma_zsetmatrix( m_kk, n_kk,
                          A(kk, kk),  lda,
                          dA(dpanel, kk, di), ldda );

        // Set A(1:kk,kk+1:n) to zero.
        magmablas_zlaset( MagmaFull, kk, n - kk, c_zero, c_zero, dA(dpanel, 0, di), ldda );
        trace_cpu_end( 0 );
    }

    if (kk > 0) {
        // Use blocked code
        // send T to all GPUs
        for( int d = 0; d < ngpu; ++d ) {
            magma_setdevice( d );
            trace_gpu_start( d, 0, "set", "set T" );
            magma_zsetmatrix_async( nb, min(m,n), T, nb, dT[d], nb, stream[d] );
            trace_gpu_end( d, 0 );
        }

        // stream: set Aii (V) --> laset --> laset --> larfb --> [next]
        // CPU has no computation
        for( i = ki; i >= 0; i -= nb ) {
            ib = min(nb, k - i);
            mi = m - i;
            dpanel =  (i / nb) % ngpu;
            di     = ((i / nb) / ngpu) * nb;

            // Send current panel to the GPUs
            // (upper triangle of the host panel is overwritten with
            //  zeros and a unit diagonal so that it holds V explicitly)
            lapackf77_zlaset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda );
            for( int d = 0; d < ngpu; ++d ) {
                magma_setdevice( d );
                trace_gpu_start( d, 0, "set", "set V" );
                magma_zsetmatrix_async( mi, ib,
                                        A(i, i), lda,
                                        dV[d],   ldda, stream[d] );
                trace_gpu_end( d, 0 );
            }

            // set panel to identity
            magma_setdevice( dpanel );
            magmablasSetKernelStream( stream[dpanel] );
            trace_gpu_start( dpanel, 0, "laset", "laset" );
            magmablas_zlaset( MagmaFull, i,  ib, c_zero, c_zero, dA(dpanel, 0, di), ldda );
            magmablas_zlaset( MagmaFull, mi, ib, c_zero, c_one,  dA(dpanel, i, di), ldda );
            trace_gpu_end( dpanel, 0 );

            if (i < n) {
                // Apply H to A(i:m,i:n) from the left
                for( int d = 0; d < ngpu; ++d ) {
                    magma_setdevice( d );
                    magmablasSetKernelStream( stream[d] );
                    // di..dn is the local column range on device d
                    magma_indices_1D_bcyclic( nb, ngpu, d, i, n, &di, &dn );
                    trace_gpu_start( d, 0, "larfb", "larfb" );
                    magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise,
                                      mi, dn-di, ib,
                                      dV[d],        ldda, dT(d,0,i), nb,
                                      dA(d, i, di), ldda, dW[d], lddwork );
                    trace_gpu_end( d, 0 );
                }
            }
        }
    }

    // copy result back to CPU
    trace_cpu_start( 0, "get", "get A" );
    magma_zgetmatrix_1D_col_bcyclic( m, n, dA, ldda, A, lda, ngpu, nb );
    trace_cpu_end( 0 );

    #ifdef TRACING
    char name[80];
    snprintf( name, sizeof(name), "zungqr-n%d-ngpu%d.svg", m, ngpu );
    trace_finalize( name, "trace.css" );
    #endif

CLEANUP:
    for( int d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magma_free( dA[d] );
        magma_queue_destroy( stream[d] );
    }
    magma_free_cpu( work );
    magma_setdevice( orig_dev );
    magmablasSetKernelStream( orig_stream );

    return *info;
} /* magma_zungqr_m */
/**
    Purpose
    -------
    SGETRF2_MGPU computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges, with the matrix
    distributed over several GPUs.

    The factorization has the form

        A = P * L * U

    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.
    Panels are factored on the CPU with SGETRF and broadcast to the GPUs,
    which apply the pivoting and update the trailing matrix. The GPU that
    owns the next panel updates that panel first (look-ahead) and sends it
    to the CPU before finishing its trailing update.

    Arguments
    ---------
    @param[in]
    num_gpus INTEGER
             The number of GPUs to be used for the factorization.
             Must not exceed ceil(N/NB).

    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in]
    nb      INTEGER
            The panel width (block size) of the factorization.

    @param[in]
    offset  INTEGER
            Offset, counted in units of LDDAT elements, that is added to
            every access of d_lAT[d].

    @param[in,out]
    d_lAT   Array of num_gpus pointers to REAL arrays on the GPUs, with
            leading dimension LDDAT.
            On entry, the local part of the matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.
            NOTE(review): the panels are transposed when moved between
            d_lAT and the panel buffers, and panel i is read from GPU
            i % num_gpus at local block i / num_gpus, so this looks like a
            transposed, 1-D block-cyclic (by NB columns) layout -- confirm
            against the caller.

    @param[in]
    lddat   INTEGER
            The leading dimension of each d_lAT[d].
            num_gpus*LDDAT >= max(1,N).

    @param[out]
    ipiv    INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    @param
    d_lAP   Array of num_gpus pointers to REAL workspace on the GPUs.
            Each holds (2+num_gpus) panel buffers of NB*maxm elements,
            where maxm is M rounded up to a multiple of 32, followed by a
            temporary panel used when the current panel belongs to
            another GPU.
            NOTE(review): the required size of that trailing temporary
            panel is not visible here; confirm with the allocating caller.

    @param
    w       REAL workspace on the CPU holding num_gpus panels of
            NB*LDW elements each; panel j uses slot j % num_gpus.

    @param[in]
    ldw     INTEGER
            The leading dimension of each panel stored in w.

    @param[in]
    streaml Array of two queues per GPU. Queue [d][1] carries the panel
            transfers and the look-ahead update, queue [d][0] the pivoting
            and the remaining trailing-matrix update.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  an argument had an illegal value.
                  The codes produced are -1 (num_gpus > ceil(N/NB)),
                  -2 (M), -3 (N) and -5 (the LDDAT check).
      -     > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    @ingroup magma_sgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_sgetrf2_mgpu(magma_int_t num_gpus,
                   magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t offset,
                   float *d_lAT[], magma_int_t lddat, magma_int_t *ipiv,
                   float *d_lAP[], float *w, magma_int_t ldw,
                   magma_queue_t streaml[][2], magma_int_t *info)
{
#define dAT(id,i,j)  (d_lAT[(id)] + ((offset)+(i)*nb)*lddat + (j)*nb)
#define W(j) (w+((j)%num_gpus)*nb*ldw)

    float c_one     = MAGMA_S_ONE;
    float c_neg_one = MAGMA_S_NEG_ONE;

    magma_int_t block_size = 32;
    magma_int_t iinfo, n_local[MagmaMaxGPUs];
    magma_int_t maxm, mindim;
    magma_int_t i, d, dd, rows, cols, s, ldpan[MagmaMaxGPUs];
    magma_int_t id, i_local, i_local2, nb0, nb1, h = 2+num_gpus;
    float *d_panel[MagmaMaxGPUs], *panel_local[MagmaMaxGPUs];

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -2;
    else if (n < 0)
        *info = -3;
    else if (num_gpus*lddat < max(1,n))
        *info = -5;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    if ( num_gpus > ceil((float)n/nb) ) {
        *info = -1;
        return *info;
    }

    /* Use hybrid blocked code. */
    maxm = ((m + block_size-1)/block_size)*block_size;

    /* some initializations */
    for (i=0; i < num_gpus; i++) {
        magma_setdevice(i);

        /* number of local columns on GPU i */
        n_local[i] = ((n/nb)/num_gpus)*nb;
        if (i < (n/nb)%num_gpus)
            n_local[i] += nb;
        else if (i == (n/nb)%num_gpus)
            n_local[i] += n%nb;

        /* workspaces */
        d_panel[i] = &(d_lAP[i][h*nb*maxm]);   /* temporary panel storage */
    }
    trace_init( 1, num_gpus, 2, (CUstream_st**)streaml );

    /* start sending the panel to cpu */
    nb0 = min(mindim, nb);
    magma_setdevice(0);
    magmablasSetKernelStream(streaml[0][1]);
    trace_gpu_start( 0, 1, "comm", "get" );
    magmablas_stranspose( nb0, m, dAT(0,0,0), lddat, d_lAP[0], maxm );
    magma_sgetmatrix_async( m, nb0,
                            d_lAP[0], maxm,
                            W(0),     ldw, streaml[0][1] );
    trace_gpu_end( 0, 1 );

    /* ------------------------------------------------------------------------------------- */
    magma_timer_t time=0;
    timer_start( time );

    s = mindim / nb;
    for( i=0; i < s; i++ ) {
        /* Set the GPU number that holds the current panel */
        id = i%num_gpus;
        magma_setdevice(id);

        /* Set the local index where the current panel is */
        i_local = i/num_gpus;
        cols = maxm - i*nb;
        rows = m - i*nb;

        /* synchronize i-th panel from id-th gpu into work */
        magma_queue_sync( streaml[id][1] );

        /* i-th panel factorization */
        trace_cpu_start( 0, "getrf", "getrf" );
        lapackf77_sgetrf( &rows, &nb, W(i), &ldw, ipiv+i*nb, &iinfo);
        if ( (*info == 0) && (iinfo > 0) ) {
            /* keep only the first singular pivot, in global numbering */
            *info = iinfo + i*nb;
        }
        trace_cpu_end( 0 );

        /* start sending the panel to all the gpus,
           beginning with the owner of the next panel */
        d = (i+1)%num_gpus;
        for( dd=0; dd < num_gpus; dd++ ) {
            magma_setdevice(d);
            trace_gpu_start( 0, 1, "comm", "set" );
            magma_ssetmatrix_async( rows, nb,
                                    W(i), ldw,
                                    &d_lAP[d][(i%h)*nb*maxm], cols, streaml[d][1] );
            trace_gpu_end( 0, 1 );
            d = (d+1)%num_gpus;
        }

        /* apply the pivoting */
        d = (i+1)%num_gpus;
        for( dd=0; dd < num_gpus; dd++ ) {
            magma_setdevice(d);
            magmablasSetKernelStream(streaml[d][0]);

            trace_gpu_start( d, 1, "pivot", "pivot" );
            if ( dd == 0 )
                magmablas_spermute_long2( lddat, dAT(d,0,0), lddat, ipiv, nb, i*nb );
            else
                magmablas_spermute_long3(        dAT(d,0,0), lddat, ipiv, nb, i*nb );
            trace_gpu_end( d, 1 );
            d = (d+1)%num_gpus;
        }

        /* update the trailing-matrix/look-ahead */
        d = (i+1)%num_gpus;
        for( dd=0; dd < num_gpus; dd++ ) {
            magma_setdevice(d);

            /* storage for panel */
            if ( d == id ) {
                /* the panel belongs to this gpu */
                panel_local[d] = dAT(d,i,i_local);
                ldpan[d] = lddat;
                /* next column */
                i_local2 = i_local+1;
            } else {
                /* the panel belongs to another gpu */
                panel_local[d] = d_panel[d];
                ldpan[d] = nb;
                /* next column */
                i_local2 = i_local;
                if ( d < id ) i_local2 ++;
            }

            /* the size of the next column */
            if ( s > (i+1) ) {
                nb0 = nb;
            } else {
                nb0 = n_local[d]-nb*(s/num_gpus);
                if ( d < s%num_gpus ) nb0 -= nb;
            }

            if ( d == (i+1)%num_gpus) {
                /* owns the next column, look-ahead the column */
                nb1 = nb0;
                magmablasSetKernelStream(streaml[d][1]);

                /* make sure all the pivoting has been applied */
                magma_queue_sync(streaml[d][0]);
                trace_gpu_start( d, 1, "gemm", "gemm" );

                /* transpose panel on GPU */
                magmablas_stranspose( rows, nb, &d_lAP[d][(i%h)*nb*maxm], cols, panel_local[d], ldpan[d] );

                /* synch for remaining update */
                magma_queue_sync(streaml[d][1]);
            } else {
                /* update the entire trailing matrix */
                nb1 = n_local[d] - i_local2*nb;
                magmablasSetKernelStream(streaml[d][0]);

                /* synchronization to make sure panel arrived on gpu */
                magma_queue_sync(streaml[d][1]);
                trace_gpu_start( d, 0, "gemm", "gemm" );

                /* transpose panel on GPU */
                magmablas_stranspose( rows, nb, &d_lAP[d][(i%h)*nb*maxm], cols, panel_local[d], ldpan[d] );
            }

            /* gpu updating the trailing matrix */
            magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                         nb1, nb, c_one,
                         panel_local[d],       ldpan[d],
                         dAT(d, i, i_local2),  lddat);

            magma_sgemm( MagmaNoTrans, MagmaNoTrans,
                         nb1, m-(i+1)*nb, nb,
                         c_neg_one, dAT(d, i,   i_local2),         lddat,
                                    &(panel_local[d][nb*ldpan[d]]), ldpan[d],
                         c_one,     dAT(d, i+1, i_local2),         lddat );

            if ( d == (i+1)%num_gpus ) {
                /* Location and shape of the next panel on this gpu.
                   Distinct names are used so that the current panel's
                   i_local / rows / cols stay visible and untouched. */
                magma_int_t next_panel   = i+1;
                magma_int_t next_i_local = (i+1)/num_gpus;
                magma_int_t next_ld      = maxm - (i+1)*nb;
                magma_int_t next_rows    = m - (i+1)*nb;
                nb0 = min(nb, mindim - (i+1)*nb);   /* size of the diagonal block */
                trace_gpu_end( d, 1 );

                if ( nb0 > 0 ) {
                    /* transpose the panel for sending it to cpu */
                    trace_gpu_start( d, 1, "comm", "get" );
                    magmablas_stranspose( nb0, m-(i+1)*nb, dAT(d,next_panel,next_i_local), lddat, &d_lAP[d][((i+1)%h)*nb*maxm], next_ld );

                    /* send the panel to cpu */
                    magma_sgetmatrix_async( next_rows, nb0,
                                            &d_lAP[d][((i+1)%h)*nb*maxm], next_ld,
                                            W(i+1), ldw, streaml[d][1] );
                    trace_gpu_end( d, 1 );
                }
            } else {
                trace_gpu_end( d, 0 );
            }

            d = (d+1)%num_gpus;
        }

        /* update the remaining matrix by gpu owning the next panel */
        if ( (i+1) < s ) {
            magma_int_t next_i_local = (i+1)/num_gpus;
            magma_int_t next_rows    = m - (i+1)*nb;

            d = (i+1)%num_gpus;
            magma_setdevice(d);
            magmablasSetKernelStream(streaml[d][0]);
            trace_gpu_start( d, 0, "gemm", "gemm" );

            magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                         n_local[d] - (next_i_local+1)*nb, nb,
                         c_one, panel_local[d],           ldpan[d],
                                dAT(d,i,next_i_local+1),  lddat );

            magma_sgemm( MagmaNoTrans, MagmaNoTrans,
                         n_local[d]-(next_i_local+1)*nb, next_rows, nb,
                         c_neg_one, dAT(d,i,next_i_local+1),        lddat,
                                    &(panel_local[d][nb*ldpan[d]]),  ldpan[d],
                         c_one,     dAT(d,i+1, next_i_local+1),     lddat );
            trace_gpu_end( d, 0 );
        }
    } /* end of for i=1..s */
    /* ------------------------------------------------------------------------------ */

    /* Set the GPU number that holds the last panel */
    id = s%num_gpus;

    /* Set the local index where the last panel is */
    i_local = s/num_gpus;

    /* size of the last diagonal-block */
    nb0 = min(m - s*nb, n - s*nb);
    rows = m    - s*nb;
    cols = maxm - s*nb;

    if ( nb0 > 0 ) {
        magma_setdevice(id);

        /* wait for the last panel on cpu */
        magma_queue_sync( streaml[id][1] );

        /* factor on cpu */
        lapackf77_sgetrf( &rows, &nb0, W(s), &ldw, ipiv+s*nb, &iinfo);
        if ( (*info == 0) && (iinfo > 0) )
            *info = iinfo + s*nb;

        /* send the factor to gpus */
        for( d=0; d < num_gpus; d++ ) {
            magma_setdevice(d);
            i_local2 = i_local;
            if ( d < id ) i_local2 ++;

            /* only gpus that own the panel or still have columns to its right need it */
            if ( d == id || n_local[d] > i_local2*nb ) {
                magma_ssetmatrix_async( rows, nb0,
                                        W(s), ldw,
                                        &d_lAP[d][(s%h)*nb*maxm], cols, streaml[d][1] );
            }
        }

        for( d=0; d < num_gpus; d++ ) {
            magma_setdevice(d);
            magmablasSetKernelStream(streaml[d][0]);
            if ( d == 0 )
                magmablas_spermute_long2( lddat, dAT(d,0,0), lddat, ipiv, nb0, s*nb );
            else
                magmablas_spermute_long3(        dAT(d,0,0), lddat, ipiv, nb0, s*nb );
        }

        for( d=0; d < num_gpus; d++ ) {
            magma_setdevice(d);
            magmablasSetKernelStream(streaml[d][1]);

            /* wait for the pivoting to be done */
            magma_queue_sync( streaml[d][0] );

            i_local2 = i_local;
            if ( d < id ) i_local2++;
            if ( d == id ) {
                /* the panel belongs to this gpu */
                panel_local[d] = dAT(d,s,i_local);

                /* next column */
                nb1 = n_local[d] - i_local*nb-nb0;

                magmablas_stranspose( rows, nb0, &d_lAP[d][(s%h)*nb*maxm], cols, panel_local[d], lddat );

                if ( nb1 > 0 ) {
                    magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                                 nb1, nb0, c_one,
                                 panel_local[d],        lddat,
                                 dAT(d,s,i_local)+nb0,  lddat);
                }
            } else if ( n_local[d] > i_local2*nb ) {
                /* the panel belongs to another gpu */
                panel_local[d] = d_panel[d];

                /* next column */
                nb1 = n_local[d] - i_local2*nb;

                magmablas_stranspose( rows, nb0, &d_lAP[d][(s%h)*nb*maxm], cols, panel_local[d], nb );
                magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             nb1, nb0, c_one,
                             panel_local[d],     nb,
                             dAT(d,s,i_local2),  lddat);
            }
        }
    } /* if ( nb0 > 0 ) */

    /* clean up */
    trace_finalize( "sgetrf_mgpu.svg","trace.css" );
    for( d=0; d < num_gpus; d++ ) {
        magma_setdevice(d);
        magma_queue_sync( streaml[d][0] );
        magma_queue_sync( streaml[d][1] );
        magmablasSetKernelStream(NULL);
    }
    magma_setdevice(0);

    /* Stop (not restart) the timer so that `time` is the elapsed time of
       the factorization when the rate is printed. */
    timer_stop( time );
    timer_printf("\n Performance %f GFlop/s\n", FLOPS_SGETRF(m,n) / 1e9 / time );

    return *info;
} /* magma_sgetrf2_mgpu */
/**
    Purpose
    -------
    SSYTRF_nopiv computes the LDLt factorization of a real symmetric
    matrix A, without pivoting. This version does not require work space
    on the GPU passed as input; GPU memory is allocated and released inside
    the routine.

    The factorization has the form
        A = U^H * D * U,    if UPLO = MagmaUpper, or
        A = L   * D * L^H,  if UPLO = MagmaLower,
    where U is an upper triangular matrix, L is lower triangular, and
    D is a diagonal matrix.

    This is the block version of the algorithm, calling Level 3 BLAS.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of A is stored;
      -     = MagmaLower:  Lower triangle of A is stored.

    @param[in]
    n       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in,out]
    A       REAL array, dimension (LDA,N)
            On entry, the symmetric matrix A.  If UPLO = MagmaUpper, the
            leading N-by-N upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = MagmaLower,
            the leading N-by-N lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
    \n
            On exit, if INFO = 0, the result of the factorization
            A = U^H D U or A = L D L^H.
    \n
            Higher performance is achieved if A is in pinned memory.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value;
                  if INFO = MAGMA_ERR_DEVICE_ALLOC, the GPU memory
                  allocation failed
      -     > 0:  the CPU factorization of a diagonal block reported a
                  failure; INFO is the value it returned plus the starting
                  index of that block, and the factorization was not
                  completed.

    @return MAGMA_ERR_ILLEGAL_VALUE for an invalid argument,
            MAGMA_ERR_DEVICE_ALLOC if GPU memory could not be allocated,
            MAGMA_SUCCESS otherwise (INFO must still be checked for a
            positive value).

    @ingroup magma_ssysv_comp
    ******************************************************************* */
extern "C" magma_int_t
magma_ssytrf_nopiv(
    magma_uplo_t uplo, magma_int_t n,
    float *A, magma_int_t lda,
    magma_int_t *info)
{
    #define  A(i, j)  ( A +(j)*lda  + (i))
    #define dA(i, j)  (dA +(j)*ldda + (i))
    #define dW(i, j)  (dW +(j)*ldda + (i))
    #define dWt(i, j) (dW +(j)*nb   + (i))

    /* Constants */
    const float c_one     = MAGMA_S_ONE;
    const float c_neg_one = MAGMA_S_NEG_ONE;

    /* Local variables */
    bool upper = (uplo == MagmaUpper);
    magma_int_t j, k, jb, ldda, nb, ib, iinfo;
    magmaFloat_ptr dA = NULL;
    magmaFloat_ptr dW = NULL;

    *info = 0;
    if (! upper && uplo != MagmaLower) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,n)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return MAGMA_ERR_ILLEGAL_VALUE;
    }

    /* Quick return */
    if ( n == 0 )
        return MAGMA_SUCCESS;

    ldda = magma_roundup( n, 32 );
    nb = magma_get_ssytrf_nopiv_nb(n);
    ib = min(32, nb); // inner-block for diagonal factorization

    /* Allocate the device copy of A and the nb-wide workspace separately,
       so that a failure of the second allocation does not leak the first. */
    if (MAGMA_SUCCESS != magma_smalloc(&dA, n *ldda)) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
    if (MAGMA_SUCCESS != magma_smalloc(&dW, nb*ldda)) {
        magma_free(dA);
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    magma_device_t cdev;
    magma_queue_t queues[2];
    magma_event_t event;

    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queues[0] );
    magma_queue_create( cdev, &queues[1] );
    magma_event_create( &event );
    trace_init( 1, 1, 2, queues );

    /* Use hybrid blocked code.
       queues[0] carries uploads and the GPU updates; queues[1] carries the
       downloads back into A. */
    if (upper) {
        //=========================================================
        // Compute the LDLt factorization A = U'*D*U without pivoting.
        // copy matrix to GPU
        for (j=0; j < n; j += nb) {
            jb = min(nb, (n-j));
            trace_gpu_start( 0, 0, "set", "set" );
            magma_ssetmatrix_async(j+jb, jb, A(0, j), lda, dA(0, j), ldda, queues[0]);
            trace_gpu_end( 0, 0 );
        }

        // main loop
        for (j=0; j < n; j += nb) {
            jb = min(nb, (n-j));

            // copy A(j,j) back to CPU
            // (skipped for j == 0: nothing has been updated on the GPU yet)
            trace_gpu_start( 0, 0, "get", "get" );
            if ( j != 0) {
                magma_sgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, queues[1]);
            }
            trace_gpu_end( 0, 0 );

            // factorize the diagonal block
            magma_queue_sync(queues[1]);
            trace_cpu_start( 0, "potrf", "potrf" );
            magma_ssytrf_nopiv_cpu( MagmaUpper, jb, ib, A(j, j), lda, info );
            trace_cpu_end( 0 );
            if (*info != 0) {
                // shift the block-local index to a global one
                *info = *info + j;
                break;
            }

            // copy A(j,j) back to GPU
            trace_gpu_start( 0, 0, "set", "set" );
            magma_ssetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, queues[0]);
            trace_gpu_end( 0, 0 );

            // copy j-th column of U back to CPU
            trace_gpu_start( 0, 1, "get", "get" );
            magma_sgetmatrix_async(j, jb, dA(0, j), ldda, A(0, j), lda, queues[1]);
            trace_gpu_end( 0, 1 );

            if ( (j+jb) < n) {
                // compute the off-diagonal blocks of current block column
                trace_gpu_start( 0, 0, "trsm", "trsm" );
                magma_strsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaUnit,
                             jb, (n-j-jb),
                             c_one, dA(j, j), ldda,
                             dA(j, j+jb), ldda,
                             queues[0] );
                // keep an unscaled copy of the block row in the workspace
                magma_scopymatrix( jb, n-j-jb, dA( j, j+jb ), ldda, dWt( 0, j+jb ), nb, queues[0] );

                // update the trailing submatrix with D
                magmablas_slascl_diag( MagmaUpper, jb, n-j-jb,
                                       dA(j, j), ldda,
                                       dA(j, j+jb), ldda,
                                       queues[0], &iinfo);
                trace_gpu_end( 0, 0 );

                // update the trailing submatrix with U and W
                trace_gpu_start( 0, 0, "gemm", "gemm" );
                for (k=j+jb; k < n; k += nb) {
                    magma_int_t kb = min(nb,n-k);
                    magma_sgemm( MagmaConjTrans, MagmaNoTrans, kb, n-k, jb,
                                 c_neg_one, dWt(0, k), nb,
                                 dA(j, k), ldda,
                                 c_one, dA(k, k), ldda,
                                 queues[0]);
                    if (k == j+jb) {
                        // the first update produces the next diagonal block,
                        // which the next iteration downloads on queues[1]
                        magma_queue_sync( queues[0] );
                    }
                }
                trace_gpu_end( 0, 0 );
            }
        }
    } else {
        //=========================================================
        // Compute the LDLt factorization A = L*D*L' without pivoting.
        // copy the matrix to GPU
        for (j=0; j < n; j += nb) {
            jb = min(nb, (n-j));
            trace_gpu_start( 0, 0, "set", "set" );
            magma_ssetmatrix_async((n-j), jb, A(j, j), lda, dA(j, j), ldda, queues[0]);
            trace_gpu_end( 0, 0 );
        }

        // main loop
        for (j=0; j < n; j += nb) {
            jb = min(nb, (n-j));

            // copy A(j,j) back to CPU
            // (skipped for j == 0: nothing has been updated on the GPU yet)
            trace_gpu_start( 0, 0, "get", "get" );
            if (j != 0) {
                magma_sgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, queues[1]);
            }
            trace_gpu_end( 0, 0 );

            // factorize the diagonal block
            magma_queue_sync(queues[1]);
            trace_cpu_start( 0, "potrf", "potrf" );
            magma_ssytrf_nopiv_cpu( MagmaLower, jb, ib, A(j, j), lda, info );
            trace_cpu_end( 0 );
            if (*info != 0) {
                // shift the block-local index to a global one
                *info = *info + j;
                break;
            }

            // copy A(j,j) back to GPU
            trace_gpu_start( 0, 0, "set", "set" );
            magma_ssetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, queues[0]);
            trace_gpu_end( 0, 0 );

            // copy j-th row of L back to CPU
            trace_gpu_start( 0, 1, "get", "get" );
            magma_sgetmatrix_async(jb, j, dA(j, 0), ldda, A(j, 0), lda, queues[1]);
            trace_gpu_end( 0, 1 );

            if ( (j+jb) < n) {
                // compute the off-diagonal blocks of current block column
                trace_gpu_start( 0, 0, "trsm", "trsm" );
                magma_strsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaUnit,
                             (n-j-jb), jb,
                             c_one, dA(j, j), ldda,
                             dA(j+jb, j), ldda,
                             queues[0] );
                // keep an unscaled copy of the block column in the workspace
                magma_scopymatrix( n-j-jb,jb, dA( j+jb, j ), ldda, dW( j+jb, 0 ), ldda, queues[0] );

                // update the trailing submatrix with D
                magmablas_slascl_diag( MagmaLower, n-j-jb, jb,
                                       dA(j, j), ldda,
                                       dA(j+jb, j), ldda,
                                       queues[0], &iinfo );
                trace_gpu_end( 0, 0 );

                // update the trailing submatrix with L and W
                trace_gpu_start( 0, 0, "gemm", "gemm" );
                for (k=j+jb; k < n; k += nb) {
                    magma_int_t kb = min(nb,n-k);
                    magma_sgemm( MagmaNoTrans, MagmaConjTrans, n-k, kb, jb,
                                 c_neg_one, dA(k, j), ldda,
                                 dW(k, 0), ldda,
                                 c_one, dA(k, k), ldda,
                                 queues[0] );
                    if (k == j+jb) {
                        // the first update produces the next diagonal block,
                        // which the next iteration downloads on queues[1]
                        magma_queue_sync(queues[0]);
                    }
                }
                trace_gpu_end( 0, 0 );
            }
        }
    }

    trace_finalize( "ssytrf.svg","trace.css" );

    /* The last block-column download is asynchronous; wait for all pending
       work explicitly so that A is complete when we return, instead of
       relying on side effects of the clean-up calls below. */
    magma_queue_sync(queues[0]);
    magma_queue_sync(queues[1]);

    magma_queue_destroy(queues[0]);
    magma_queue_destroy(queues[1]);
    magma_event_destroy( event );
    magma_free(dW);
    magma_free(dA);

    return MAGMA_SUCCESS;
} /* magma_ssytrf_nopiv */
/** Purpose ------- ZHETRD reduces a complex Hermitian matrix A to real symmetric tridiagonal form T by an orthogonal similarity transformation: Q**H * A * Q = T. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] nqueue INTEGER The number of GPU streams used for update. 10 >= nqueue > 0. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A COMPLEX_16 array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if UPLO = MagmaUpper, the diagonal and first superdiagonal of A are overwritten by the corresponding elements of the tridiagonal matrix T, and the elements above the first superdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; if UPLO = MagmaLower, the diagonal and first subdiagonal of A are over- written by the corresponding elements of the tridiagonal matrix T, and the elements below the first subdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] d COMPLEX_16 array, dimension (N) The diagonal elements of the tridiagonal matrix T: D(i) = A(i,i). @param[out] e COMPLEX_16 array, dimension (N-1) The off-diagonal elements of the tridiagonal matrix T: E(i) = A(i,i+1) if UPLO = MagmaUpper, E(i) = A(i+1,i) if UPLO = MagmaLower. 
@param[out] tau COMPLEX_16 array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= N*NB, where NB is the optimal blocksize given by magma_get_zhetrd_nb(). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n-1) . . . H(2) H(1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in A(1:i-1,i+1), and tau in TAU(i). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(n-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i), and tau in TAU(i). The contents of A on exit are illustrated by the following examples with n = 5: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( d e v2 v3 v4 ) ( d ) ( d e v3 v4 ) ( e d ) ( d e v4 ) ( v1 e d ) ( d e ) ( v1 v2 e d ) ( d ) ( v1 v2 v3 e d ) where d and e denote diagonal and off-diagonal elements of T, and vi denotes an element of the vector defining H(i). 
@ingroup magma_zheev_comp ********************************************************************/ extern "C" magma_int_t magma_zhetrd_mgpu( magma_int_t ngpu, magma_int_t nqueue, magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex *A, magma_int_t lda, double *d, double *e, magmaDoubleComplex *tau, magmaDoubleComplex *work, magma_int_t lwork, magma_int_t *info) { #define A(i, j) (A + (j)*lda + (i)) #define dA(id, i, j) (dA[(id)] + (j)*ldda + (i)) #define dW(id, i, j) (dW[(id)] + (j)*ldda + (i)) const char* uplo_ = lapack_uplo_const( uplo ); magma_int_t nlocal, ldda; magma_int_t nb = magma_get_zhetrd_nb(n), ib, ib2; const magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; const magmaDoubleComplex c_one = MAGMA_Z_ONE; const double d_one = MAGMA_D_ONE; #ifdef PROFILE_SY2RK double mv_time = 0.0; double up_time = 0.0; #endif magma_int_t kk, nx; magma_int_t i, ii, iii, j, dev, i_n; magma_int_t iinfo; magma_int_t ldwork, lddw, lwkopt, ldwork2, lhwork; magma_int_t lquery; // set pointers to NULL so it is safe to goto CLEANUP if any malloc fails. magma_queue_t queues[MagmaMaxGPUs][10] = { { NULL, NULL } }; magma_queue_t queues0[MagmaMaxGPUs] = { NULL }; magmaDoubleComplex *hwork = NULL; magmaDoubleComplex_ptr dwork2[MagmaMaxGPUs] = { NULL }; magmaDoubleComplex_ptr dA[MagmaMaxGPUs] = { NULL }; magmaDoubleComplex_ptr dW[MagmaMaxGPUs] = { NULL }; *info = 0; int upper = (uplo == MagmaUpper); lquery = (lwork == -1); if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } else if (lwork < nb*n && ! lquery) { *info = -9; } else if ( nqueue > 2 ) { *info = 2; // TODO fix } /* Determine the block size. 
*/ ldwork = n; lwkopt = n * nb; if (*info == 0) { work[0] = MAGMA_Z_MAKE( lwkopt, 0 ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { work[0] = c_one; return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); //#define PROFILE_SY2RK #ifdef PROFILE_SY2RK double times[11] = { 0 }; magma_event_t start, stop; float etime; magma_setdevice( 0 ); magma_event_create( &start ); magma_event_create( &stop ); #endif ldda = roundup( lda, 32 ); lddw = ldda; nlocal = nb*(1 + n/(nb*ngpu)); ldwork2 = ldda*( ((n - 1)/nb + 1) + 1); // i.e., ldda*(blocks + 1) for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); // TODO fix memory leak if ( MAGMA_SUCCESS != magma_zmalloc( &dA[dev], nlocal*ldda + 3*lddw*nb ) || MAGMA_SUCCESS != magma_zmalloc( &dwork2[dev], ldwork2 ) ) { *info = MAGMA_ERR_DEVICE_ALLOC; goto CLEANUP; } dW[dev] = dA[dev] + nlocal*ldda; for( kk=0; kk < nqueue; kk++ ) { magma_queue_create( &queues[dev][kk] ); } queues0[dev] = queues[dev][0]; } lhwork = nqueue*ngpu*n; if ( MAGMA_SUCCESS != magma_zmalloc_pinned( &hwork, lhwork ) ) { *info = MAGMA_ERR_HOST_ALLOC; goto CLEANUP; } // crossover point: use CPU code for last nx columns //if (n < 2048) // nx = n; //else // nx = 512; nx = min( 128, n ); // nx <= n is required if (upper) { /* Copy the matrix to the GPU */ if (1 <= n-nx) { magma_zhtodhe( ngpu, uplo, n, nb, A, lda, dA, ldda, queues, &iinfo ); } /* Reduce the upper triangle of A. Columns 1:kk are handled by the unblocked method. 
*/ for (i = nb*((n-1)/nb); i >= nx; i -= nb) { ib = min(nb, n-i); ii = nb*(i/(nb*ngpu)); dev = (i/nb)%ngpu; /* wait for the next panel */ if (i != nb*((n-1)/nb)) { magma_setdevice( dev ); magma_queue_sync( queues[dev][0] ); } magma_zlatrd_mgpu( ngpu, uplo, i+ib, ib, nb, A(0, 0), lda, e, tau, work, ldwork, dA, ldda, 0, dW, i+ib, hwork, lhwork, dwork2, ldwork2, queues0 ); magma_zher2k_mgpu( ngpu, MagmaUpper, MagmaNoTrans, nb, i, ib, c_neg_one, dW, i+ib, 0, d_one, dA, ldda, 0, nqueue, queues); /* get the next panel */ if (i-nb >= nx ) { ib2 = min(nb, n-(i-nb)); ii = nb*((i-nb)/(nb*ngpu)); dev = ((i-nb)/nb)%ngpu; magma_setdevice( dev ); magma_zgetmatrix_async( (i-nb)+ib2, ib2, dA(dev, 0, ii), ldda, A(0, i-nb), lda, queues[dev][0] ); } /* Copy superdiagonal elements back into A, and diagonal elements into D */ for (j = i; j < i+ib; ++j) { if ( j > 0 ) { *A(j-1,j) = MAGMA_Z_MAKE( e[j - 1], 0 ); } d[j] = MAGMA_Z_REAL( *A(j, j) ); } } /* end of for i=... */ if ( nx > 0 ) { if (1 <= n-nx) { /* else A is already on CPU */ for (i=0; i < nx; i += nb) { ib = min(nb, n-i); ii = nb*(i/(nb*ngpu)); dev = (i/nb)%ngpu; magma_setdevice( dev ); magma_zgetmatrix_async( nx, ib, dA(dev, 0, ii), ldda, A(0, i), lda, queues[dev][0] ); } } for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); magma_queue_sync( queues[dev][0] ); } /* Use CPU code to reduce the last or only block */ lapackf77_zhetrd( uplo_, &nx, A(0, 0), &lda, d, e, tau, work, &lwork, &iinfo ); } } else { trace_init( 1, ngpu, nqueue, (CUstream_st**)queues ); /* Copy the matrix to the GPU */ if (1 <= n-nx) { magma_zhtodhe( ngpu, uplo, n, nb, A, lda, dA, ldda, queues, &iinfo ); } /* Reduce the lower triangle of A */ for (i = 0; i < n-nx; i += nb) { ib = min(nb, n-i); ii = nb*(i/(nb*ngpu)); dev = (i/nb)%ngpu; /* Reduce columns i:i+ib-1 to tridiagonal form and form the matrix W which is needed to update the unreduced part of the matrix */ /* Get the current panel (no need for the 1st iteration) */ if (i != 0) { magma_setdevice( 
dev ); trace_gpu_start( dev, 0, "comm", "get" ); magma_zgetmatrix_async( n-i, ib, dA(dev, i, ii), ldda, A(i,i), lda, queues[dev][0] ); trace_gpu_end( dev, 0 ); magma_queue_sync( queues[dev][0] ); magma_setdevice( 0 ); } magma_zlatrd_mgpu( ngpu, uplo, n-i, ib, nb, A(i, i), lda, &e[i], &tau[i], work, ldwork, dA, ldda, i, dW, n-i, hwork, lhwork, dwork2, ldwork2, queues0 ); #ifdef PROFILE_SY2RK magma_setdevice( 0 ); if ( i > 0 ) { cudaEventElapsedTime( &etime, start, stop ); up_time += (etime/1000.0); } magma_event_record( start, 0 ); #endif magma_zher2k_mgpu( ngpu, MagmaLower, MagmaNoTrans, nb, n-i-ib, ib, c_neg_one, dW, n-i, ib, d_one, dA, ldda, i+ib, nqueue, queues); #ifdef PROFILE_SY2RK magma_setdevice( 0 ); magma_event_record( stop, 0 ); #endif /* Copy subdiagonal elements back into A, and diagonal elements into D */ for (j = i; j < i+ib; ++j) { if ( j+1 < n ) { *A(j+1,j) = MAGMA_Z_MAKE( e[j], 0 ); } d[j] = MAGMA_Z_REAL( *A(j, j) ); } } /* for i=... */ /* Use CPU code to reduce the last or only block */ if ( i < n ) { iii = i; i_n = n-i; if ( i > 0 ) { for (; i < n; i += nb) { ib = min(nb, n-i); ii = nb*(i/(nb*ngpu)); dev = (i/nb)%ngpu; magma_setdevice( dev ); magma_zgetmatrix_async( i_n, ib, dA(dev, iii, ii), ldda, A(iii, i), lda, queues[dev][0] ); } for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); magma_queue_sync( queues[dev][0] ); } } lapackf77_zhetrd( uplo_, &i_n, A(iii, iii), &lda, &d[iii], &e[iii], &tau[iii], work, &lwork, &iinfo ); } } for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); for( kk=0; kk < nqueue; kk++ ) { magma_queue_sync( queues[dev][kk] ); } } #ifdef PROFILE_SY2RK magma_setdevice( 0 ); if ( n > nx ) { cudaEventElapsedTime( &etime, start, stop ); up_time += (etime/1000.0); } magma_event_destroy( start ); magma_event_destroy( stop ); #endif trace_finalize( "zhetrd.svg", "trace.css" ); #ifdef PROFILE_SY2RK printf( " n=%d nb=%d\n", n, nb ); printf( " Time in ZLARFG: %.2e seconds\n", times[0] ); //printf( " Time in ZHEMV : %.2e 
seconds\n", mv_time ); printf( " Time in ZHER2K: %.2e seconds\n", up_time ); #endif CLEANUP: for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); for( kk=0; kk < nqueue; kk++ ) { magma_queue_destroy( queues[dev][kk] ); } magma_free( dA[dev] ); magma_free( dwork2[dev] ); } magma_free_pinned( hwork ); magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); work[0] = MAGMA_Z_MAKE( lwkopt, 0 ); return *info; } /* magma_zhetrd */
extern "C" magma_int_t
magma_dsytrf_nopiv(magma_uplo_t uplo, magma_int_t n,
                   double *A, magma_int_t lda,
                   magma_int_t *info)
{
/*  -- MAGMA (version 1.6.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2011

    Purpose
    =======
    DSYTRF_nopiv computes the LDLt factorization of a real symmetric
    matrix A, without pivoting. This version does not require work space
    on the GPU passed as input; GPU memory is allocated and released inside
    the routine.

    The factorization has the form
       A = U\*\*H * D * U,  if UPLO = 'U', or
       A = L  * D * L\*\*H, if UPLO = 'L',
    where U is an upper triangular matrix, L is lower triangular, and
    D is a diagonal matrix.

    This is the block version of the algorithm, calling Level 3 BLAS.

    Arguments
    =========
    UPLO    (input) CHARACTER*1
            = 'U':  Upper triangle of A is stored;
            = 'L':  Lower triangle of A is stored.

    N       (input) INTEGER
            The order of the matrix A.  N >= 0.

    A       (input/output) DOUBLE_PRECISION array, dimension (LDA,N)
            On entry, the symmetric matrix A.  If UPLO = 'U', the leading
            N-by-N upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = 'L', the
            leading N-by-N lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.

            On exit, if INFO = 0, the result of the factorization
            A = U\*\*H*D*U or A = L*D*L\*\*H.

            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using cudaMallocHost.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value;
                  if INFO = MAGMA_ERR_DEVICE_ALLOC, the GPU memory
                  allocation failed
            > 0:  the CPU factorization of a diagonal block reported a
                  failure; INFO is the value it returned plus the starting
                  index of that block, and the factorization was not
                  completed.

    Returns MAGMA_ERR_ILLEGAL_VALUE for an invalid argument,
    MAGMA_ERR_DEVICE_ALLOC if GPU memory could not be allocated, and
    MAGMA_SUCCESS otherwise (INFO must still be checked for a positive value).
    =====================================================================    */

    /* Local variables */
    double zone  = MAGMA_D_ONE;
    double mzone = MAGMA_D_NEG_ONE;
    int upper = (uplo == MagmaUpper);
    magma_int_t j, k, jb, ldda, nb, ib, iinfo;
    magmaDouble_ptr dA = NULL;
    magmaDouble_ptr dW = NULL;

    *info = 0;
    if (! upper && uplo != MagmaLower) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,n)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return MAGMA_ERR_ILLEGAL_VALUE;
    }

    /* Quick return */
    if ( n == 0 )
        return MAGMA_SUCCESS;

    ldda = ((n+31)/32)*32;
    nb = magma_get_dsytrf_nopiv_nb(n);
    ib = min(32, nb); // inner-block for diagonal factorization

    /* Allocate the device copy of A and the nb-wide workspace separately,
       so that a failure of the second allocation does not leak the first. */
    if (MAGMA_SUCCESS != magma_dmalloc(&dA, n *ldda)) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
    if (MAGMA_SUCCESS != magma_dmalloc(&dW, nb*ldda)) {
        magma_free(dA);
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    /* The GPU BLAS calls below run on the global kernel stream, which this
       routine redirects to stream[0]; remember the caller's stream so it can
       be restored before stream[0] is destroyed. */
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );

    magma_queue_t stream[2];
    magma_event_t event;
    magma_queue_create(&stream[0]);
    magma_queue_create(&stream[1]);
    magma_event_create( &event );
    trace_init( 1, 1, 2, (CUstream_st**)stream );

    {
        /* Use hybrid blocked code. */
        if (upper) {
            //=========================================================
            // Compute the LDLt factorization A = U'*D*U without pivoting.
            // copy matrix to GPU
            for (j=0; j<n; j+=nb) {
                jb = min(nb, (n-j));
                trace_gpu_start( 0, 0, "set", "set" );
                magma_dsetmatrix_async(j+jb, jb, A(0, j), lda, dA(0, j), ldda, stream[0]);
                trace_gpu_end( 0, 0 );
            }

            // main loop
            for (j=0; j<n; j += nb) {
                jb = min(nb, (n-j));

                // copy A(j,j) back to CPU
                trace_gpu_start( 0, 0, "get", "get" );
                magma_dgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, stream[0]);
                trace_gpu_end( 0, 0 );

                // copy j-th column of U back to CPU
                // (stream[1] first waits for the event recorded after the
                //  previous iteration scaled its block row)
                magma_queue_wait_event( stream[1], event );
                trace_gpu_start( 0, 1, "get", "get" );
                magma_dgetmatrix_async(j, jb, dA(0, j), ldda, A(0, j), lda, stream[1]);
                trace_gpu_end( 0, 1 );

                // factorize the diagonal block
                magma_queue_sync(stream[0]);
                trace_cpu_start( 0, "potrf", "potrf" );
                dsytrf_nopiv_cpu(MagmaUpper, jb, ib, A(j, j), lda, info);
                trace_cpu_end( 0 );
                if (*info != 0){
                    // shift the block-local index to a global one
                    *info = *info + j;
                    break;
                }

                // copy A(j,j) back to GPU
                trace_gpu_start( 0, 0, "set", "set" );
                magma_dsetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, stream[0]);
                trace_gpu_end( 0, 0 );

                if ( (j+jb) < n) {
                    // compute the off-diagonal blocks of current block column
                    magmablasSetKernelStream( stream[0] );
                    trace_gpu_start( 0, 0, "trsm", "trsm" );
                    magma_dtrsm(MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaUnit,
                                jb, (n-j-jb),
                                zone, dA(j, j), ldda,
                                dA(j, j+jb), ldda);
                    // keep an unscaled copy of the block row in the workspace
                    magma_dcopymatrix( jb, n-j-jb, dA( j, j+jb ), ldda, dWt( 0, j+jb ), nb );

                    // update the trailing submatrix with D
                    magmablas_dlascl_diag(MagmaUpper, jb, n-j-jb,
                                          dA(j, j), ldda,
                                          dA(j, j+jb), ldda,
                                          &iinfo);
                    magma_event_record( event, stream[0] );
                    trace_gpu_end( 0, 0 );

                    // update the trailing submatrix with U and W
                    trace_gpu_start( 0, 0, "gemm", "gemm" );
                    for (k=j+jb; k<n; k+=nb) {
                        magma_int_t kb = min(nb,n-k);
                        magma_dgemm(MagmaConjTrans, MagmaNoTrans, kb, n-k, jb,
                                    mzone, dWt(0, k), nb,
                                    dA(j, k), ldda,
                                    zone, dA(k, k), ldda);
                    }
                    trace_gpu_end( 0, 0 );
                }
            }
        } else {
            //=========================================================
            // Compute the LDLt factorization A = L*D*L' without pivoting.
            // copy the matrix to GPU
            for (j=0; j<n; j+=nb) {
                jb = min(nb, (n-j));
                trace_gpu_start( 0, 0, "set", "set" );
                magma_dsetmatrix_async((n-j), jb, A(j, j), lda, dA(j, j), ldda, stream[0]);
                trace_gpu_end( 0, 0 );
            }

            // main loop
            for (j=0; j<n; j+=nb) {
                jb = min(nb, (n-j));

                // copy A(j,j) back to CPU
                trace_gpu_start( 0, 0, "get", "get" );
                magma_dgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, stream[0]);
                trace_gpu_end( 0, 0 );

                // copy j-th row of L back to CPU
                // (stream[1] first waits for the event recorded after the
                //  previous iteration scaled its block column)
                magma_queue_wait_event( stream[1], event );
                trace_gpu_start( 0, 1, "get", "get" );
                magma_dgetmatrix_async(jb, j, dA(j, 0), ldda, A(j, 0), lda, stream[1]);
                trace_gpu_end( 0, 1 );

                // factorize the diagonal block
                magma_queue_sync(stream[0]);
                trace_cpu_start( 0, "potrf", "potrf" );
                dsytrf_nopiv_cpu(MagmaLower, jb, ib, A(j, j), lda, info);
                trace_cpu_end( 0 );
                if (*info != 0){
                    // shift the block-local index to a global one
                    *info = *info + j;
                    break;
                }

                // copy A(j,j) back to GPU
                trace_gpu_start( 0, 0, "set", "set" );
                magma_dsetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, stream[0]);
                trace_gpu_end( 0, 0 );

                if ( (j+jb) < n) {
                    // compute the off-diagonal blocks of current block column
                    magmablasSetKernelStream( stream[0] );
                    trace_gpu_start( 0, 0, "trsm", "trsm" );
                    magma_dtrsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaUnit,
                                (n-j-jb), jb,
                                zone, dA(j, j), ldda,
                                dA(j+jb, j), ldda);
                    // keep an unscaled copy of the block column in the workspace
                    magma_dcopymatrix( n-j-jb,jb, dA( j+jb, j ), ldda, dW( j+jb, 0 ), ldda );

                    // update the trailing submatrix with D
                    magmablas_dlascl_diag(MagmaLower, n-j-jb, jb,
                                          dA(j, j), ldda,
                                          dA(j+jb, j), ldda,
                                          &iinfo);
                    magma_event_record( event, stream[0] );
                    trace_gpu_end( 0, 0 );

                    // update the trailing submatrix with L and W
                    trace_gpu_start( 0, 0, "gemm", "gemm" );
                    for (k=j+jb; k<n; k+=nb) {
                        magma_int_t kb = min(nb,n-k);
                        magma_dgemm(MagmaNoTrans, MagmaConjTrans, n-k, kb, jb,
                                    mzone, dA(k, j), ldda,
                                    dW(k, 0), ldda,
                                    zone, dA(k, k), ldda);
                    }
                    trace_gpu_end( 0, 0 );
                }
            }
        }
    }

    trace_finalize( "dsytrf.svg","trace.css" );

    /* The last download on stream[1] is asynchronous; wait for all pending
       work explicitly so that A is complete when we return. */
    magma_queue_sync(stream[0]);
    magma_queue_sync(stream[1]);

    /* Do not leave the global kernel stream pointing at a destroyed stream. */
    magmablasSetKernelStream( orig_stream );

    magma_queue_destroy(stream[0]);
    magma_queue_destroy(stream[1]);
    magma_event_destroy( event );
    magma_free(dW);
    magma_free(dA);

    return MAGMA_SUCCESS;
} /* magma_dsytrf_nopiv */
extern "C" magma_int_t
magma_cungqr_m(
    magma_int_t m, magma_int_t n, magma_int_t k,
    magmaFloatComplex *A, magma_int_t lda,
    magmaFloatComplex *tau,
    magmaFloatComplex *T, magma_int_t nb,
    magma_int_t *info)
{
/*  -- MAGMA (version 1.4.1) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       December 2013

    Purpose
    =======
    CUNGQR generates an M-by-N COMPLEX matrix Q with orthonormal columns,
    which is defined as the first N columns of a product of K elementary
    reflectors of order M

          Q  =  H(1) H(2) . . . H(k)

    as returned by CGEQRF. The work is distributed over all GPUs reported
    by magma_num_gpus().

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix Q. M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix Q. M >= N >= 0.

    K       (input) INTEGER
            The number of elementary reflectors whose product defines the
            matrix Q. N >= K >= 0.

    A       (input/output) COMPLEX array A, dimension (LDA,N).
            On entry, the i-th column must contain the vector
            which defines the elementary reflector H(i), for
            i = 1,2,...,k, as returned by CGEQRF_GPU in the
            first k columns of its array argument A.
            On exit, the M-by-N matrix Q.

    LDA     (input) INTEGER
            The first dimension of the array A. LDA >= max(1,M).

    TAU     (input) COMPLEX array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by CGEQRF_GPU.

    T       (input) COMPLEX array, dimension (NB, min(M,N)).
            T contains the T matrices used in blocking the elementary
            reflectors H(i), e.g., this can be the 6th argument of
            magma_cgeqrf_gpu (except stored on the CPU, not the GPU).

    NB      (input) INTEGER
            This is the block size used in CGEQRF_GPU, and correspondingly
            the size of the T matrices, used in the factorization, and
            stored in T.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value;
                  MAGMA_ERR_DEVICE_ALLOC or MAGMA_ERR_HOST_ALLOC if a
                  workspace allocation failed.
    =====================================================================    */

    #define  A(i,j)   ( A    + (i) + (j)*lda )
    #define dA(d,i,j) (dA[d] + (i) + (j)*ldda)
    #define dT(d,i,j) (dT[d] + (i) + (j)*nb)

    magmaFloatComplex c_zero = MAGMA_C_ZERO;
    magmaFloatComplex c_one  = MAGMA_C_ONE;

    magma_int_t m_kk, n_kk, k_kk, mi;
    magma_int_t lwork, ldwork;
    magma_int_t i, ib, ki, kk, iinfo;
    // Must start out NULL: the CLEANUP path frees it unconditionally, and is
    // reached via goto before the host workspace has been allocated.
    magmaFloatComplex *work = NULL;

    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if ((n < 0) || (n > m)) {
        *info = -2;
    } else if ((k < 0) || (k > n)) {
        *info = -3;
    } else if (lda < max(1,m)) {
        *info = -5;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    if (n <= 0) {
        return *info;
    }

    magma_int_t di, dn;
    int dpanel;

    int ngpu = magma_num_gpus();
    int doriginal;
    magma_getdevice( &doriginal );

    // Allocate memory on GPUs for A and workspaces
    magma_int_t ldda    = ((m + 31) / 32) * 32;
    magma_int_t lddwork = ((n + 31) / 32) * 32;
    magma_int_t min_lblocks = (n / nb) / ngpu;  // min. blocks per gpu
    magma_int_t last_dev    = (n / nb) % ngpu;  // device with last block

    magma_int_t  nlocal[ MagmaMaxGPUs ] = { 0 };
    magmaFloatComplex *dA[ MagmaMaxGPUs ] = { NULL };
    magmaFloatComplex *dT[ MagmaMaxGPUs ] = { NULL };
    magmaFloatComplex *dV[ MagmaMaxGPUs ] = { NULL };
    magmaFloatComplex *dW[ MagmaMaxGPUs ] = { NULL };
    magma_queue_t stream[ MagmaMaxGPUs ] = { NULL };

    for( int d = 0; d < ngpu; ++d ) {
        // example with n = 75, nb = 10, ngpu = 3
        // min_lblocks = 2
        // last_dev    = 1
        // gpu 0: 2  blocks, cols:  0- 9, 30-39, 60-69
        // gpu 1: 1+ blocks, cols: 10-19, 40-49, 70-74 (partial)
        // gpu 2: 1  block , cols: 20-29, 50-59
        magma_setdevice( d );
        nlocal[d] = min_lblocks*nb;
        if ( d < last_dev ) {
            nlocal[d] += nb;
        }
        else if ( d == last_dev ) {
            nlocal[d] += (n % nb);
        }

        // one allocation per device, carved into dA | dT | dV | dW below
        ldwork = nlocal[d]*ldda  // dA
               + nb*m            // dT
               + nb*ldda         // dV
               + nb*lddwork;     // dW
        if ( MAGMA_SUCCESS != magma_cmalloc( &dA[d], ldwork )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            goto CLEANUP;
        }
        dT[d] = dA[d] + nlocal[d]*ldda;
        dV[d] = dT[d] + nb*m;
        dW[d] = dV[d] + nb*ldda;

        magma_queue_create( &stream[d] );
    }

    trace_init( 1, ngpu, 1, stream );

    // first kk columns are handled by blocked method.
    // ki is start of 2nd-to-last block
    if ((nb > 1) && (nb < k)) {
        ki = (k - nb - 1) / nb * nb;
        kk = min(k, ki + nb);
    } else {
        ki = 0;
        kk = 0;
    }

    // Allocate CPU work space
    // n*nb for cungqr workspace
    lwork = n * nb;
    magma_cmalloc_cpu( &work, lwork );
    if (work == NULL) {
        *info = MAGMA_ERR_HOST_ALLOC;
        goto CLEANUP;
    }

    // Use unblocked code for the last or only block.
    if (kk < n) {
        trace_cpu_start( 0, "ungqr", "ungqr last block" );
        m_kk = m - kk;
        n_kk = n - kk;
        k_kk = k - kk;
        dpanel =  (kk / nb) % ngpu;
        di     = ((kk / nb) / ngpu) * nb;
        magma_setdevice( dpanel );

        lapackf77_cungqr( &m_kk, &n_kk, &k_kk,
                          A(kk, kk), &lda,
                          &tau[kk], work, &lwork, &iinfo );

        // NOTE(review): all n_kk trailing columns are uploaded to the single
        // device dpanel starting at local column di -- confirm this is valid
        // when n_kk > nb and ngpu > 1 (columns are otherwise block-cyclic).
        magma_csetmatrix( m_kk, n_kk,
                          A(kk, kk), lda,
                          dA(dpanel, kk, di), ldda );

        // Set A(1:kk,kk+1:n) to zero.
        magmablas_claset( MagmaUpperLower, kk, n - kk, dA(dpanel, 0, di), ldda );
        trace_cpu_end( 0 );
    }

    if (kk > 0) {
        // Use blocked code
        // send T to all GPUs
        for( int d = 0; d < ngpu; ++d ) {
            magma_setdevice( d );
            trace_gpu_start( d, 0, "set", "set T" );
            magma_csetmatrix_async( nb, min(m,n), T, nb, dT[d], nb, stream[d] );
            trace_gpu_end( d, 0 );
        }

        // stream: set Aii (V) --> laset --> laset --> larfb --> [next]
        // CPU has no computation
        for( i = ki; i >= 0; i -= nb ) {
            ib = min(nb, k - i);
            mi = m - i;
            dpanel =  (i / nb) % ngpu;
            di     = ((i / nb) / ngpu) * nb;

            // Send current panel to the GPUs
            lapackf77_claset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda );
            for( int d = 0; d < ngpu; ++d ) {
                magma_setdevice( d );
                trace_gpu_start( d, 0, "set", "set V" );
                magma_csetmatrix_async( mi, ib,
                                        A(i, i), lda,
                                        dV[d],   ldda, stream[d] );
                trace_gpu_end( d, 0 );
            }

            // set panel to identity
            magma_setdevice( dpanel );
            magmablasSetKernelStream( stream[dpanel] );
            trace_gpu_start( dpanel, 0, "laset", "laset" );
            magmablas_claset( MagmaUpperLower, i, ib, dA(dpanel, 0, di), ldda );
            magmablas_claset_identity( mi, ib, dA(dpanel, i, di), ldda );
            trace_gpu_end( dpanel, 0 );

            if (i < n) {
                // Apply H to A(i:m,i:n) from the left
                for( int d = 0; d < ngpu; ++d ) {
                    magma_setdevice( d );
                    magmablasSetKernelStream( stream[d] );
                    magma_indices_1D_bcyclic( nb, ngpu, d, i, n, &di, &dn );
                    trace_gpu_start( d, 0, "larfb", "larfb" );
                    magma_clarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise,
                                      mi, dn-di, ib,
                                      dV[d],        ldda, dT(d,0,i), nb,
                                      dA(d, i, di), ldda, dW[d], lddwork );
                    trace_gpu_end( d, 0 );
                }
            }
        }
    }

    // copy result back to CPU
    trace_cpu_start( 0, "get", "get A" );
    magma_cgetmatrix_1D_col_bcyclic( m, n, dA, ldda, A, lda, ngpu, nb );
    trace_cpu_end( 0 );

    #ifdef TRACING
    char name[80];
    // magma_int_t may be wider than int; cast to match the %d conversion
    snprintf( name, sizeof(name), "cungqr-n%d-ngpu%d.svg", (int) m, ngpu );
    trace_finalize( name, "trace.css" );
    #endif

CLEANUP:
    // Reached both on success and after an allocation failure; every handle
    // released here was initialized to NULL at declaration.
    for( int d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magmablasSetKernelStream( NULL );
        magma_free( dA[d] );
        dA[d] = NULL;
        if ( stream[d] != NULL ) {
            magma_queue_destroy( stream[d] );
        }
    }
    magma_free_cpu( work );
    magma_setdevice( doriginal );

    return *info;
} /* magma_cungqr */
extern "C" magma_int_t magma_zgetrf2_mgpu(magma_int_t num_gpus, magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t offset, cuDoubleComplex **d_lAT, magma_int_t lddat, magma_int_t *ipiv, cuDoubleComplex **d_lAP, cuDoubleComplex *w, magma_int_t ldw, cudaStream_t streaml[][2], magma_int_t *info) #endif { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2010 Purpose ======= ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Use two buffer to send panels.. Arguments ========= NUM_GPUS (input) INTEGER The number of GPUS to be used for the factorization. M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. LDDA (input) INTEGER The leading dimension of the array A. LDDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value if INFO = -7, internal GPU memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. 
===================================================================== */ #define inAT(id,i,j) (d_lAT[(id)] + ((offset)+(i)*nb)*lddat + (j)*nb) #define W(j) (w+((j)%num_gpus)*nb*ldw) cuDoubleComplex c_one = MAGMA_Z_ONE; cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t block_size = 32; magma_int_t iinfo, n_local[4]; magma_int_t maxm, mindim; magma_int_t i, ii, d, dd, rows, cols, s, ldpan[4]; magma_int_t id, i_local, i_local2, nb0, nb1; cuDoubleComplex *d_panel[4], *panel_local[4]; //cudaStream_t streaml[4][2]; /* Check arguments */ *info = 0; if (m < 0) *info = -2; else if (n < 0) *info = -3; else if (num_gpus*lddat < max(1,n)) *info = -5; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); //nb = magma_get_zgetrf_nb(m); if( num_gpus > ceil((double)n/nb) ) { *info = -1; return *info; } { /* Use hybrid blocked code. */ maxm = ((m + block_size-1)/block_size)*block_size; /* some initializations */ for(i=0; i<num_gpus; i++){ magmaSetDevice(i); n_local[i] = ((n/nb)/num_gpus)*nb; if (i < (n/nb)%num_gpus) n_local[i] += nb; else if (i == (n/nb)%num_gpus) n_local[i] += n%nb; /* workspaces */ d_panel[i] = &(d_lAP[i][nb*maxm]); /* temporary panel storage */ /* create local streams */ //magma_queue_create(&streaml[i][0]); //magma_queue_create(&streaml[i][1]); } trace_init( 1, num_gpus, 2, (CUstream_st**)streaml ); /* start sending the panel to cpu */ nb0 = min(mindim, nb); magmaSetDevice(0); magmablasSetKernelStream(streaml[0][1]); trace_gpu_start( 0, 1, "comm", "get" ); if( nb0 == nb ) magmablas_ztranspose( d_lAP[0], maxm, inAT(0,0,0), lddat, nb0, maxm ); else magmablas_ztranspose2( d_lAP[0], maxm, inAT(0,0,0), lddat, nb0, maxm ); magma_zgetmatrix_async( m, nb0, d_lAP[0], maxm, W(0), ldw, streaml[0][1] ); trace_gpu_end( 0, 1 ); /* ------------------------------------------------------------------------------------- */ #ifdef PROFILE 
magma_timestr_t start_timer, end_timer; start_timer = get_current_time(); #endif s = mindim / nb; for( i=0; i<s; i++ ) { /* Set the GPU number that holds the current panel */ id = i%num_gpus; magmaSetDevice(id); /* Set the local index where the current panel is */ i_local = i/num_gpus; cols = maxm - i*nb; rows = m - i*nb; /* synchrnoize i-th panel from id-th gpu into work */ magma_queue_sync( streaml[id][1] ); /* i-th panel factorization */ trace_cpu_start( 0, "getrf", "getrf" ); #ifdef PANEL_FACT_MC cntxt->nb = 12; magma_zgetrf_mc(cntxt, &rows, &nb, W(i), &ldw, ipiv+i*nb, &iinfo); #else lapackf77_zgetrf( &rows, &nb, W(i), &ldw, ipiv+i*nb, &iinfo); #endif if ( (*info == 0) && (iinfo > 0) ) { *info = iinfo + i*nb; //break; } trace_cpu_end( 0 ); /* start sending the panel to all the gpus */ d = (i+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { magmaSetDevice(d); trace_gpu_start( 0, 1, "comm", "set" ); magma_zsetmatrix_async( rows, nb, W(i), ldw, d_lAP[d], cols, streaml[d][1] ); trace_gpu_end( 0, 1 ); d = (d+1)%num_gpus; } /* apply the pivoting */ d = (i+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { magmaSetDevice(d); magmablasSetKernelStream(streaml[d][0]); trace_gpu_start( d, 1, "pivot", "pivot" ); if( dd == 0 ) magmablas_zpermute_long2( lddat, inAT(d,0,0), lddat, ipiv, nb, i*nb ); else magmablas_zpermute_long3( inAT(d,0,0), lddat, ipiv, nb, i*nb ); trace_gpu_end( d, 1 ); d = (d+1)%num_gpus; } /* update the trailing-matrix/look-ahead */ d = (i+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { magmaSetDevice(d); /* storage for panel */ if( d == id ) { /* the panel belond to this gpu */ panel_local[d] = inAT(d,i,i_local); ldpan[d] = lddat; /* next column */ i_local2 = i_local+1; } else { /* the panel belong to another gpu */ panel_local[d] = &d_panel[d][(i%2)*nb*maxm]; //panel_local[d] = d_panel[d]; ldpan[d] = nb; /* next column */ i_local2 = i_local; if( d < id ) i_local2 ++; } /* the size of the next column */ if ( s > (i+1) ) { nb0 = nb; } else { nb0 = 
n_local[d]-nb*(s/num_gpus); if( d < s%num_gpus ) nb0 -= nb; } if( d == (i+1)%num_gpus) { /* owns the next column, look-ahead the column */ nb1 = nb0; magmablasSetKernelStream(streaml[d][1]); /* make sure all the pivoting has been applied */ magma_queue_sync(streaml[d][0]); trace_gpu_start( d, 1, "gemm", "gemm" ); } else { /* update the entire trailing matrix */ nb1 = n_local[d] - i_local2*nb; magmablasSetKernelStream(streaml[d][0]); /* synchronization to make sure panel arrived on gpu */ magma_queue_sync(streaml[d][1]); trace_gpu_start( d, 0, "gemm", "gemm" ); } magmablas_ztranspose(panel_local[d], ldpan[d], d_lAP[d], cols, cols, nb); /* gpu updating the trailing matrix */ //magmablas_ztrsm( magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb1, nb, c_one, panel_local[d], ldpan[d], inAT(d, i, i_local2), lddat); //cublasZgemm magma_zgemm( MagmaNoTrans, MagmaNoTrans, nb1, m-(i+1)*nb, nb, c_neg_one, inAT(d, i, i_local2), lddat, &(panel_local[d][nb*ldpan[d]]), ldpan[d], c_one, inAT(d, i+1, i_local2), lddat ); if( d == (i+1)%num_gpus ) { /* Set the local index where the current panel is */ int loff = i+1; int i_local = (i+1)/num_gpus; int ldda = maxm - (i+1)*nb; int cols = m - (i+1)*nb; nb0 = min(nb, mindim - (i+1)*nb); /* size of the diagonal block */ trace_gpu_end( d, 1 ); if( nb0 > 0 ) { /* transpose the panel for sending it to cpu */ trace_gpu_start( d, 1, "comm", "get" ); if( i+1 < s ) magmablas_ztranspose( d_lAP[d], ldda, inAT(d,loff,i_local), lddat, nb0, ldda ); else magmablas_ztranspose2( d_lAP[d], ldda, inAT(d,loff,i_local), lddat, nb0, ldda ); /* send the panel to cpu */ magma_zgetmatrix_async( cols, nb0, d_lAP[d], ldda, W(i+1), ldw, streaml[d][1] ); trace_gpu_end( d, 1 ); } } else { trace_gpu_end( d, 0 ); } d = (d+1)%num_gpus; } /* update the remaining matrix by gpu owning the next panel */ if( (i+1) < s ) { int i_local = (i+1)/num_gpus; int rows = m - (i+1)*nb; d = (i+1)%num_gpus; magmaSetDevice(d); magmablasSetKernelStream(streaml[d][0]); 
trace_gpu_start( d, 0, "gemm", "gemm" ); //magmablas_ztrsm magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n_local[d] - (i_local+1)*nb, nb, c_one, panel_local[d], ldpan[d], inAT(d,i,i_local+1), lddat ); //cublasZgemm magma_zgemm( MagmaNoTrans, MagmaNoTrans, n_local[d]-(i_local+1)*nb, rows, nb, c_neg_one, inAT(d,i,i_local+1), lddat, &(panel_local[d][nb*ldpan[d]]), ldpan[d], c_one, inAT(d,i+1, i_local+1), lddat ); trace_gpu_end( d, 0 ); } } /* end of for i=1..s */ /* ------------------------------------------------------------------------------ */ /* Set the GPU number that holds the last panel */ id = s%num_gpus; /* Set the local index where the last panel is */ i_local = s/num_gpus; /* size of the last diagonal-block */ nb0 = min(m - s*nb, n - s*nb); rows = m - s*nb; cols = maxm - s*nb; if( nb0 > 0 ) { magmaSetDevice(id); /* wait for the last panel on cpu */ magma_queue_sync( streaml[id][1] ); /* factor on cpu */ lapackf77_zgetrf( &rows, &nb0, W(s), &ldw, ipiv+s*nb, &iinfo); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + s*nb; /* send the factor to gpus */ for( d=0; d<num_gpus; d++ ) { magmaSetDevice(d); i_local2 = i_local; if( d < id ) i_local2 ++; if( d == id || n_local[d] > i_local2*nb ) { magma_zsetmatrix_async( rows, nb0, W(s), ldw, d_lAP[d], cols, streaml[d][1] ); } } for( d=0; d<num_gpus; d++ ) { magmaSetDevice(d); magmablasSetKernelStream(streaml[d][0]); if( d == 0 ) magmablas_zpermute_long2( lddat, inAT(d,0,0), lddat, ipiv, nb0, s*nb ); else magmablas_zpermute_long3( inAT(d,0,0), lddat, ipiv, nb0, s*nb ); } for( d=0; d<num_gpus; d++ ) { magmaSetDevice(d); magmablasSetKernelStream(streaml[d][1]); /* wait for the pivoting to be done */ magma_queue_sync( streaml[d][0] ); i_local2 = i_local; if( d < id ) i_local2++; if( d == id ) { /* the panel belond to this gpu */ panel_local[d] = inAT(d,s,i_local); /* next column */ nb1 = n_local[d] - i_local*nb-nb0; magmablas_ztranspose2( panel_local[d], lddat, d_lAP[d], cols, rows, nb0); if( nb1 > 0 ) 
//cublasZtrsm magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb1, nb0, c_one, panel_local[d], lddat, inAT(d,s,i_local)+nb0, lddat); } else if( n_local[d] > i_local2*nb ) { /* the panel belong to another gpu */ panel_local[d] = &d_panel[d][(s%2)*nb*maxm]; //panel_local[d] = d_panel[d]; /* next column */ nb1 = n_local[d] - i_local2*nb; magmablas_ztranspose2( panel_local[d], nb, d_lAP[d], cols, rows, nb0); //cublasZtrsm magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb1, nb0, c_one, panel_local[d], nb, inAT(d,s,i_local2), lddat); } } } /* if( nb0 > 0 ) */ /* clean up */ trace_finalize( "zgetrf_mgpu.svg","trace.css" ); for( d=0; d<num_gpus; d++ ) { magmaSetDevice(d); magma_queue_sync( streaml[d][0] ); magma_queue_sync( streaml[d][1] ); //magma_queue_destroy(streaml[d][0]); //magma_queue_destroy(streaml[d][1]); magmablasSetKernelStream(NULL); } magmaSetDevice(0); #ifdef PROFILE end_timer = get_current_time(); printf("\n Performance %f GFlop/s\n", (2./3.*n*n*n /1000000.) / GetTimerValue(start_timer, end_timer)); #endif } return *info; /* End of MAGMA_ZGETRF2_MGPU */ }
extern "C" magma_int_t
magma_cpotrf2_msub(
    magma_int_t num_subs, magma_int_t num_gpus,
    magma_uplo_t uplo, magma_int_t m, magma_int_t n,
    magma_int_t off_i, magma_int_t off_j, magma_int_t nb,
    magmaFloatComplex_ptr *d_lA, size_t d_lA_offset, magma_int_t ldda,
    magmaFloatComplex_ptr *d_lP, magma_int_t lddp,
    magmaFloatComplex *a, magma_int_t lda, magma_int_t h,
    magma_queue_t *queues,
    magma_int_t *info )
{
/*  -- clMAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date November 2014

    Purpose
    =======
    CPOTRF computes the Cholesky factorization of a complex Hermitian
    positive definite matrix dA.

    The factorization has the form
       dA = U**H * U,  if UPLO = 'U', or
       dA = L  * L**H, if UPLO = 'L',
    where U is an upper triangular matrix and L is lower triangular.

    This is the block version of the algorithm, calling Level 3 BLAS.
    The matrix is split into NUM_SUBS*NUM_GPUS sub-matrices; block j/NB is
    owned by sub-matrix (j/NB) % (NUM_SUBS*NUM_GPUS), and sub-matrix d runs
    on GPU d % NUM_GPUS.  Each diagonal block is updated on its GPU
    (magma_cherk), copied to the host, factored with lapackf77_cpotrf and
    sent back; the off-diagonal blocks are updated on the GPUs with
    magma_cgemm / magma_ctrsm.

    Arguments
    =========
    NUM_SUBS (input) INTEGER
            Number of sub-matrices per GPU.

    NUM_GPUS (input) INTEGER
            Number of GPUs.  NUM_SUBS*NUM_GPUS must be positive and must not
            exceed MagmaMaxSubs*MagmaMaxGPUs (size of the local table);
            this is not validated here.

    UPLO    (input) magma_uplo_t
            = MagmaUpper:  Upper triangle of dA is stored;
            = MagmaLower:  Lower triangle of dA is stored.

    M, N    (input) INTEGER
            Dimensions of the part being factored.  N >= 0.  In the upper
            case the panel loop runs over M and the local widths are derived
            from N; in the lower case the roles are swapped.

    OFF_I, OFF_J, H, D_LA_OFFSET
            Not referenced directly in this function body; presumably
            consumed by the Aup/Alo/dlA helper macros defined elsewhere in
            the file -- TODO confirm.

    NB      (input) INTEGER
            Block size.  Used as a divisor, so it must be positive; this is
            not validated here.

    D_LA    (input/output) array of NUM_SUBS*NUM_GPUS device buffers
            On entry, the Hermitian matrix dA, distributed as described
            above.  If UPLO = 'U', the upper triangular part is referenced;
            if UPLO = 'L', the lower triangular part is referenced.
            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization dA = U**H * U or dA = L * L**H.

    LDDA    (input) INTEGER
            The leading dimension of each D_LA buffer.
            UPLO = MagmaUpper: LDDA >= max(1,M);
            otherwise: NUM_SUBS*NUM_GPUS*LDDA >= max(1,N).
            To benefit from coalesced memory accesses LDDA must be
            divisible by 16.

    D_LP    (workspace) array of NUM_GPUS device buffers
            Receives the panels broadcast from the host.

    LDDP    (input) INTEGER
            Leading dimension of D_LP used in the upper case (the lower case
            uses NB).

    A       (workspace) COMPLEX array on the host, leading dimension LDA
            Host staging area for the diagonal blocks and panels
            (accessed through Aup/Alo).

    QUEUES  (input) array of 2*NUM_GPUS queues
            QUEUES[2*g] carries the host<->device transfers of GPU g,
            QUEUES[2*g+1] its BLAS kernels.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  an argument had an illegal value:
                  -1: UPLO, -2: N, -4: LDDA.
            > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.
    =====================================================================   */

    int tot_subs = num_subs*num_gpus;
    magma_int_t j, jb, nb0, nb2, dd, d, id, j_local, j_local2;
    magmaFloatComplex c_one     = MAGMA_C_ONE;
    magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;
    float d_one     =  1.0;
    float d_neg_one = -1.0;
    magmaFloatComplex_ptr dlpanel;
    size_t dlpanel_offset;
    magma_int_t n_local[MagmaMaxSubs * MagmaMaxGPUs], ldpanel;

    *info = 0;
    if ( (uplo != MagmaUpper) && (uplo != MagmaLower) ) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if ((uplo != MagmaUpper) && tot_subs*ldda < max(1,n)) {
        *info = -4;
    } else if ((uplo == MagmaUpper) && ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* initialize trace only once the arguments are known to be good, so that
       every path from here on reaches trace_finalize */
    trace_init(1, num_gpus, 2, queues);

    for (d=0; d<tot_subs; d++) {
        /* local-n and local-ld */
        if (uplo == MagmaUpper) {
            n_local[d] = ((n/nb)/tot_subs)*nb;
            if (d < (n/nb)%tot_subs)
                n_local[d] += nb;
            else if (d == (n/nb)%tot_subs)
                n_local[d] += n%nb;
        } else {
            n_local[d] = ((m/nb)/tot_subs)*nb;
            if (d < (m/nb)%tot_subs)
                n_local[d] += nb;
            else if (d == (m/nb)%tot_subs)
                n_local[d] += m%nb;
        }
    }

    /* Use blocked code. */
    if (uplo == MagmaUpper) {
        /* ---------------------------------------------- */
        /* Upper-triangular case                          */
        /* > Compute the Cholesky factorization A = U'*U. */
        /* ---------------------------------------------- */
        for (j=0; j<m; j+=nb) {
            /* Set the GPU number that holds the current panel */
            id = (j/nb)%tot_subs;

            /* Set the local index where the current panel is */
            j_local = j/(nb*tot_subs);
            jb = min(nb, (m-j));

            if (j > 0) {
                // Wait for the column on CPU
                magma_queue_sync(queues[2*(id%num_gpus)]);

                /* broadcast off-diagonal column to all gpus */
                d = (j/nb+1)%num_gpus;
                for (dd=0; dd<num_gpus; dd++) {
                    if (d != id%num_gpus) {
                        magma_csetmatrix_async( j, jb,
                                                Aup(0,j), lda,
                                                dlP(d,jb,0,id%num_gpus), lddp,
                                                queues[2*d], trace_gpu_event(d, 0, "set", "set-col") );
                    }
                    d = (d+1)%num_gpus;
                }

                /* Update the current diagonal block */
                trace_gpu_start(id%num_gpus, 1, "herk", "herk");
                magma_cherk(MagmaUpper, MagmaConjTrans, jb, j,
                            d_neg_one, dlA(id, 0, nb*j_local), ldda,
                            d_one,     dlA(id, j, nb*j_local), ldda,
                            queues[2*(id%num_gpus)+1]);
                magma_queue_sync(queues[2*(id%num_gpus)+1]); // Wait for herk
            }

            /* Send the diagonal to cpu */
            magma_cgetmatrix_async( jb, jb,
                                    dlA(id, j, nb*j_local), ldda,
                                    Aup(j,j), lda,
                                    queues[2*(id%num_gpus)], trace_gpu_event(id%num_gpus, 0, "get", "get-diag") );

            if (j > 0) {
                /* Compute the local block column of the panel. */
                d = (j/nb+1)%tot_subs;
                for (dd=0; dd<tot_subs; dd++) {
                    j_local2 = j_local+1;
                    if (d > id) j_local2 --;
                    nb0 = nb*j_local2;

                    if (n_local[d] > nb0) {
                        if (d%num_gpus != id%num_gpus) {
                            dlpanel = d_lP[d%num_gpus];
                            dlpanel_offset = dlP_offset(jb, 0, id%num_gpus);
                            ldpanel = lddp;

                            /* Wait for the offdiagonal column
                               (once per GPU is enough, hence dd < num_gpus) */
                            if (dd < num_gpus)
                                magma_queue_sync(queues[2*(d%num_gpus)]);
                        } else {
                            dlpanel = d_lA[id];
                            dlpanel_offset = dlA_offset(0, nb*j_local);
                            ldpanel = ldda;
                        }

                        /* update the panel */
                        trace_gpu_start(d%num_gpus, 1, "gemm", "gemm");
                        magma_cgemm(MagmaConjTrans, MagmaNoTrans,
                                    jb, n_local[d]-nb0, j,
                                    c_neg_one, dlpanel, dlpanel_offset, ldpanel,
                                               dlA(d, 0, nb0), ldda,
                                    c_one,     dlA(d, j, nb0), ldda,
                                    queues[2*(d%num_gpus)+1]);
                    }
                    d = (d+1)%tot_subs;
                }
            }

            /* factor the diagonal */
            magma_queue_sync( queues[2*(id%num_gpus)] ); // wait for the diagonal
            trace_cpu_start(0, "potrf", "potrf");
            lapackf77_cpotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info);
            trace_cpu_end(0);
            if (*info != 0) {
                /* convert the block-local index into a global one */
                *info = *info + j;
                break;
            }

            /* send the diagonal to gpus */
            if ((j+jb) < n) {
                d = (j/nb+1)%num_gpus;
                for (dd=0; dd<num_gpus; dd++) {
                    if (d == id%num_gpus) {
                        dlpanel = d_lA[id];
                        dlpanel_offset = dlA_offset(j, nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = d_lP[d];
                        dlpanel_offset = dlP_offset(0, 0, id%num_gpus);
                        ldpanel = lddp;
                    }
                    magma_csetmatrix_async( jb, jb,
                                            Aup(j,j), lda,
                                            dlpanel, dlpanel_offset, ldpanel,
                                            queues[2*d], trace_gpu_event(d, 0, "set", "set-diag"));
                    d = (d+1)%num_gpus;
                }
            } else {
                magma_csetmatrix_async( jb, jb,
                                        Aup(j,j), lda,
                                        dlA(id, j, nb*j_local), ldda,
                                        queues[2*(id%num_gpus)], trace_gpu_event(id%num_gpus, 0, "set", "set-diag") );
            }

            /* panel-factorize the off-diagonal */
            if ((j+jb) < n) {
                d = (j/nb+1)%tot_subs;
                for (dd=0; dd<tot_subs; dd++) {
                    /* next column */
                    j_local2 = j_local+1;
                    if (d > id) j_local2--;
                    if (d%num_gpus == id%num_gpus) {
                        dlpanel = d_lA[id];
                        dlpanel_offset = dlA_offset(j, nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = d_lP[d%num_gpus];
                        dlpanel_offset = dlP_offset(0, 0, id%num_gpus);
                        ldpanel = lddp;
                    }
                    nb2 = n_local[d]-nb*j_local2;
                    nb0 = min(nb, nb2);

                    if (dd < num_gpus)
                        magma_queue_sync( queues[2*(d%num_gpus)] ); // wait for the diagonal
                    if (j+jb < m && d == (j/nb+1)%tot_subs) {
                        /* owns the next column, look-ahead the column */
                        trace_gpu_start(d%num_gpus, 1, "trsm", "trsm");
                        magma_ctrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                     jb, nb0, c_one,
                                     dlpanel, dlpanel_offset, ldpanel,
                                     dlA(d, j, nb*j_local2), ldda,
                                     queues[2*(d%num_gpus)+1] );

                        /* send the column to cpu */
                        magma_queue_sync(queues[2*(d%num_gpus)+1]); // wait for lookahead
                        magma_cgetmatrix_async( (j+jb), nb0,
                                                dlA(d, 0, nb*j_local2), ldda,
                                                Aup(0,j+jb), lda,
                                                queues[2*(d%num_gpus)], trace_gpu_event(d%num_gpus, 0, "get", "get-col") );

                        /* update the remaining blocks */
                        nb2 = nb2 - nb0;
                        trace_gpu_start(d%num_gpus, 1, "trsm", "trsm");
                        magma_ctrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel, dlpanel_offset, ldpanel,
                                     dlA(d, j, nb*j_local2+nb0), ldda,
                                     queues[2*(d%num_gpus)+1] );
                    } else if (nb2 > 0) {
                        /* update the entire trailing matrix */
                        trace_gpu_start(d%num_gpus, 1, "trsm", "trsm");
                        magma_ctrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel, dlpanel_offset, ldpanel,
                                     dlA(d, j, nb*j_local2), ldda,
                                     queues[2*(d%num_gpus)+1] );
                    }
                    d = (d+1)%tot_subs;
                }
            }
        }
    } else {
        /* -------------------------------------------- */
        /* Lower-triangular case                        */
        /* Compute the Cholesky factorization A = L*L'. */
        /* -------------------------------------------- */
        for (j=0; j<n; j+=nb) {
            /* Set the GPU number that holds the current panel */
            id = (j/nb)%tot_subs;

            /* Set the local index where the current panel is */
            j_local = j/(nb*tot_subs);
            jb = min(nb, (n-j));

            if (j > 0) {
                if (num_gpus > 1) {
                    // Wait for the row on CPU to broadcast
                    magma_queue_sync(queues[2*(id%num_gpus)]);
                }

                /* broadcast off-diagonal row to all the GPUs */
                d = (j/nb+1)%num_gpus;
                for (dd=0; dd<num_gpus; dd++) {
                    if (d != id%num_gpus) {
                        /* send it to GPU-d */
                        magma_csetmatrix_async( jb, j,
                                                Alo(j,0), lda,
                                                dlPT(d,0,jb,id%num_gpus), nb,
                                                queues[2*d], trace_gpu_event(d, 0, "set", "set-row") );
                    }
                    d = (d+1)%num_gpus;
                }

                /* Update the current diagonal block */
                trace_gpu_start(id%num_gpus, 1, "herk", "herk");
                magma_cherk(MagmaLower, MagmaNoTrans, jb, j,
                            d_neg_one, dlA(id, nb*j_local, 0), ldda,
                            d_one,     dlA(id, nb*j_local, j), ldda,
                            queues[2*(id%num_gpus)+1]);
                magma_queue_sync(queues[2*(id%num_gpus)+1]); // wait for herk
            }

            /* send the diagonal to cpu */
            magma_cgetmatrix_async( jb, jb,
                                    dlA(id, nb*j_local, j), ldda,
                                    Alo(j,j), lda,
                                    queues[2*(id%num_gpus)], trace_gpu_event(id%num_gpus, 0, "get", "get") );

            /* update the offdiagonal blocks */
            if (j > 0) {
                /* compute the block-rows of the panel */
                d = (j/nb+1)%tot_subs;
                for (dd=0; dd<tot_subs; dd++) {
                    j_local2 = j_local+1;
                    if (d > id) j_local2 --;
                    nb0 = nb*j_local2;

                    if (nb0 < n_local[d]) {
                        if (d%num_gpus != id%num_gpus) {
                            dlpanel = d_lP[d%num_gpus];
                            dlpanel_offset = dlPT_offset(0, jb, id%num_gpus);
                            ldpanel = nb;

                            /* Wait for offdiagonal row
                               (once per GPU is enough, hence dd < num_gpus) */
                            if (dd < num_gpus)
                                magma_queue_sync(queues[2*(d%num_gpus)]);
                        } else {
                            dlpanel = d_lA[id];
                            dlpanel_offset = dlA_offset(nb*j_local, 0);
                            ldpanel = ldda;
                        }

                        /* Update the panel */
                        trace_gpu_start(d%num_gpus, 1, "gemm", "gemm");
                        magma_cgemm( MagmaNoTrans, MagmaConjTrans,
                                     n_local[d]-nb0, jb, j,
                                     c_neg_one, dlA(d, nb0, 0), ldda,
                                                dlpanel, dlpanel_offset, ldpanel,
                                     c_one,     dlA(d, nb0, j), ldda,
                                     queues[2*(d%num_gpus)+1]);
                    }
                    d = (d+1)%tot_subs;
                }
            }

            /* factor the diagonal */
            magma_queue_sync( queues[2*(id%num_gpus)] );
            trace_cpu_start(0, "potrf", "potrf");
            lapackf77_cpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info);
            trace_cpu_end(0);
            if (*info != 0) {
                /* magma_int_t may be wider than int; cast so the %d conversions
                   match their arguments */
                printf( " cpotrf returned %d (id=%d,j=%d,j_local=%d,jb=%d)\n",
                        (int)*info, (int)id, (int)j, (int)j_local, (int)jb );
                /* convert the block-local index into a global one */
                *info = *info + j;
                break;
            }

            /* send the diagonal to gpus */
            if ((j+jb) < m) {
                d = (j/nb+1)%num_gpus;
                for (dd=0; dd<num_gpus; dd++) {
                    if (d == id%num_gpus) {
                        dlpanel = d_lA[id];
                        dlpanel_offset = dlA_offset(nb*j_local, j);
                        ldpanel = ldda;
                    } else {
                        dlpanel = d_lP[d];
                        dlpanel_offset = dlPT_offset(0, 0, id%num_gpus);
                        ldpanel = nb;
                    }
                    magma_csetmatrix_async( jb, jb,
                                            Alo(j,j), lda,
                                            dlpanel, dlpanel_offset, ldpanel,
                                            queues[2*d], trace_gpu_event(d, 0, "set", "set-diag") );
                    d = (d+1)%num_gpus;
                }
            } else {
                magma_csetmatrix_async( jb, jb,
                                        Alo(j,j), lda,
                                        dlA(id, nb*j_local, j), ldda,
                                        queues[2*(id%num_gpus)], trace_gpu_event(id%num_gpus, 0, "set", "set-diag") );
            }

            /* factorize off-diagonal blocks */
            if ((j+jb) < m) {
                d = (j/nb+1)%tot_subs;
                for (dd=0; dd<tot_subs; dd++) {
                    /* next column */
                    j_local2 = j_local+1;
                    if (d > id) j_local2--;
                    if (d%num_gpus == id%num_gpus) {
                        dlpanel = d_lA[id];
                        dlpanel_offset = dlA_offset(nb*j_local, j);
                        ldpanel = ldda;
                    } else {
                        dlpanel = d_lP[d%num_gpus];
                        dlpanel_offset = dlPT_offset(0, 0, id%num_gpus);
                        ldpanel = nb;
                    }
                    nb2 = n_local[d] - j_local2*nb;
                    nb0 = min(nb, nb2 );

                    // wait for the diagonal
                    if (dd < num_gpus)
                        magma_queue_sync(queues[2*(d%num_gpus)]);
                    if (j+jb < n && d == (j/nb+1)%tot_subs) {
                        /* owns the next column, look-ahead the column */
                        trace_gpu_start(d%num_gpus, 1, "trsm", "trsm");
                        magma_ctrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                     nb0, jb, c_one,
                                     dlpanel, dlpanel_offset, ldpanel,
                                     dlA(d, nb*j_local2, j), ldda,
                                     queues[2*(d%num_gpus)+1]);

                        /* send the column to cpu */
                        magma_queue_sync( queues[2*(d%num_gpus)+1] ); // wait for lookahead
                        magma_cgetmatrix_async( nb0, j+jb,
                                                dlA(d, nb*j_local2, 0), ldda,
                                                Alo(j+jb,0), lda,
                                                queues[2*(d%num_gpus)], trace_gpu_event(d%num_gpus, 0, "get", "get") );

                        /* update the remaining blocks */
                        nb2 = nb2 - nb0;
                        trace_gpu_start(d%num_gpus, 1, "trsm", "trsm");
                        magma_ctrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                     nb2, jb, c_one,
                                     dlpanel, dlpanel_offset, ldpanel,
                                     dlA(d, nb*j_local2+nb0, j), ldda,
                                     queues[2*(d%num_gpus)+1]);
                    } else if (nb2 > 0) {
                        /* update the entire trailing matrix */
                        trace_gpu_start(d%num_gpus, 1, "trsm", "trsm");
                        magma_ctrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                     nb2, jb, c_one,
                                     dlpanel, dlpanel_offset, ldpanel,
                                     dlA(d, nb*j_local2, j), ldda,
                                     queues[2*(d%num_gpus)+1]);
                    }
                    d = (d+1)%tot_subs;
                }
            }
        }
    } /* end of else not upper */

    /* clean up: drain both queues of every GPU (also reached after a failed potrf) */
    for( d=0; d<num_gpus; d++ ) {
        magma_queue_sync( queues[2*d] );
        magma_queue_sync( queues[2*d+1] );
    }
    trace_finalize("cpotrf_msub.svg", "trace.css");

    return *info;
} /* magma_cpotrf2_msub */
/**
    Purpose
    -------
    SPOTRF computes the Cholesky factorization of a real symmetric
    positive definite matrix dA.

    The factorization has the form
        dA = U**T * U,   if UPLO = MagmaUpper, or
        dA = L  * L**T,  if UPLO = MagmaLower,
    where U is an upper triangular matrix and L is lower triangular.

    This is the block version of the algorithm, calling Level 3 BLAS.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of dA is stored;
      -     = MagmaLower:  Lower triangle of dA is stored.

    @param[in]
    n       INTEGER
            The order of the matrix dA.  N >= 0.

    @param[in,out]
    dA      REAL array on the GPU, dimension (LDDA,N)
            On entry, the symmetric matrix dA.  If UPLO = MagmaUpper, the leading
            N-by-N upper triangular part of dA contains the upper
            triangular part of the matrix dA, and the strictly lower
            triangular part of dA is not referenced.  If UPLO = MagmaLower, the
            leading N-by-N lower triangular part of dA contains the lower
            triangular part of the matrix dA, and the strictly upper
            triangular part of dA is not referenced.
    \n
            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization dA = U**T * U or dA = L * L**T.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalesced memory accesses LDDA must be
            divisible by 16.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.

@ingroup magma_sposv_comp ********************************************************************/ extern "C" magma_int_t magma_spotrf2_mgpu(int num_gpus, magma_uplo_t uplo, magma_int_t m, magma_int_t n, magma_int_t off_i, magma_int_t off_j, magma_int_t nb, float **d_lA, magma_int_t ldda, float **d_lP, magma_int_t lddp, float *A, magma_int_t lda, magma_int_t h, magma_queue_t stream[][3], magma_event_t event[][5], magma_int_t *info ) { #define Alo(i, j) (A + ((j)+off_j)*lda + (nb*(((i)/nb)%h)+off_i)) #define Aup(i, j) (A + (nb*(((j)/nb)%h)+off_j)*lda + (i+off_i)) #define dlA(id, i, j) (d_lA[(id)] + (j)*ldda + (i)) #define dlP(id, i, j, k) (d_lP[(id)] + (k)*nb*lddp + (j)*lddp + (i)) #define dlPT(id, i, j, k) (d_lP[(id)] + (k)*nb*lddp + (j)*nb + (i)) magma_int_t j, jb, nb0, nb2, dd, d, id, j_local, j_local2, buf; float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; float d_one = 1.0; float d_neg_one = -1.0; int upper = (uplo == MagmaUpper); float *dlpanel; //magma_event_t event0[MagmaMaxGPUs], // syrk // event1[MagmaMaxGPUs], // send off-diagonal // event2[MagmaMaxGPUs], // send diagonal // event3[MagmaMaxGPUs]; // trsm magma_int_t n_local[MagmaMaxGPUs], ldpanel; int stream0 = 0, stream1 = 1; #ifdef STRSM_WORK float *d_dinvA[MagmaMaxGPUs][2], *d_x[MagmaMaxGPUs][2]; /* used by strsm_work */ #endif *info = 0; if (! 
upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (!upper && num_gpus*ldda < max(1,n)) { *info = -4; } else if (upper && ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } for( d=0; d < num_gpus; d++ ) { /* local-n and local-ld */ if (upper) { n_local[d] = ((n/nb)/num_gpus)*nb; if (d < (n/nb)%num_gpus) n_local[d] += nb; else if (d == (n/nb)%num_gpus) n_local[d] += n%nb; } else { n_local[d] = ((m/nb)/num_gpus)*nb; if (d < (m/nb)%num_gpus) n_local[d] += nb; else if (d == (m/nb)%num_gpus) n_local[d] += m%nb; } //magma_setdevice(d); //magma_event_create( &event0[d] ); //magma_event_create( &event1[d] ); //magma_event_create( &event2[d] ); //magma_event_create( &event3[d] ); } magma_setdevice(0); /* == initialize the trace */ trace_init( 1, num_gpus, 3, (magma_queue_t*)stream ); /* Use blocked code. */ if (upper) { /* ---------------------------------------------- */ /* Upper-triangular case */ /* > Compute the Cholesky factorization A = U'*U. */ /* ---------------------------------------------- */ #if defined(PRECISION_d) && defined(STRSM_WORK) /* invert the diagonals * Allocate device memory for the inversed diagonal blocks, size=m*NB */ for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); for( j=0; j < 2; j++ ) { magma_smalloc( &d_dinvA[d][j], nb*nb ); magma_smalloc( &d_x[d][j], n*nb ); cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(float)); cudaMemset(d_x[d][j], 0, n*nb*sizeof(float)); } } magma_setdevice(0); #endif for (j=0; j < m; j += nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%num_gpus; buf = (j/nb)%num_gpus; /* Set the local index where the current panel is */ j_local = j/(nb*num_gpus); jb = min(nb, (m-j)); if ( j > 0 ) { /* needed on pluto... 
*/ magma_setdevice(id); magma_queue_sync( stream[id][stream0] ); // wait for the column on CPU /* broadcast off-diagonal column to all gpus */ d = (j/nb+1)%num_gpus; for( dd=0; dd < num_gpus; dd++ ) { if ( d != id ) { magma_setdevice(d); /* wait for it on CPU */ magma_queue_wait_event( stream[d][stream0], event[id][1] ); /* send it to GPU */ trace_gpu_start( d, stream0, "comm", "rows to GPUs" ); magma_ssetmatrix_async( j, jb, Aup(0,j), lda, dlP(d,jb,0,buf), lddp, stream[d][stream0] ); trace_gpu_end( d, stream0 ); magma_event_record( event[d][1], stream[d][stream0] ); } d = (d+1)%num_gpus; } } /* Update the current diagonal block */ magma_setdevice(id); if ( j > 0 ) { magmablasSetKernelStream(stream[id][stream1]); trace_gpu_start( id, stream1, "syrk", "syrk" ); magma_ssyrk(MagmaUpper, MagmaTrans, jb, j, d_neg_one, dlA(id, 0, nb*j_local), ldda, d_one, dlA(id, j, nb*j_local), ldda); trace_gpu_end( id, stream1 ); magma_event_record( event[id][0], stream[id][stream1] ); } /* send the diagonal to cpu */ magma_queue_wait_event( stream[id][stream0], event[id][0] ); // wait for syrk trace_gpu_start( id, stream0, "comm", "D to CPU" ); magma_sgetmatrix_async( jb, jb, dlA(id, j, nb*j_local), ldda, Aup(j,j), lda, stream[id][stream0] ); trace_gpu_end( id, stream0 ); if ( j > 0 ) { /* Compute the local block column of the panel. 
*/ d = (j/nb+1)%num_gpus; for( dd=0; dd < num_gpus; dd++ ) { j_local2 = j_local+1; if ( d > id ) j_local2 --; nb0 = nb*j_local2; if ( n_local[d] > nb0 ) { /* wait for the off-diagonal */ if ( d != id ) { //magma_queue_sync( stream[id][3] ); dlpanel = dlP(d, jb, 0, buf); ldpanel = lddp; /* wait for the offdiagonal column */ magma_queue_wait_event( stream[d][stream1], event[d][1] ); } else { dlpanel = dlA(d, 0, nb*j_local); ldpanel = ldda; } /* update the panel */ magma_setdevice(d); magmablasSetKernelStream(stream[d][stream1]); trace_gpu_start( d, stream1, "gemm", "gemm" ); magma_sgemm(MagmaTrans, MagmaNoTrans, jb, n_local[d]-nb0, j, c_neg_one, dlpanel, ldpanel, dlA(d, 0, nb0), ldda, c_one, dlA(d, j, nb0), ldda); trace_gpu_end( d, stream1 ); } d = (d+1)%num_gpus; } } /* factor the diagonal */ magma_setdevice(id); magma_queue_sync( stream[id][stream0] ); // wait for the diagonal trace_cpu_start( 0, "getrf", "getrf" ); lapackf77_spotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info); trace_cpu_end( 0 ); if (*info != 0) { *info = *info + j; break; } /* send the diagonal to gpus */ if ( (j+jb) < n) { d = (j/nb+1)%num_gpus; for( dd=0; dd < num_gpus; dd++ ) { magma_setdevice(d); if ( d == id ) { dlpanel = dlA(d, j, nb*j_local); ldpanel = ldda; } else { dlpanel = dlP(d, 0, 0, buf); ldpanel = lddp; } trace_gpu_start( d, stream0, "comm", "D to GPUs" ); magma_ssetmatrix_async( jb, jb, Aup(j,j), lda, dlpanel, ldpanel, stream[d][stream0] ); trace_gpu_end( d, stream0 ); magma_event_record( event[d][2], stream[d][stream0] ); d = (d+1)%num_gpus; } } else { magma_setdevice(id); trace_gpu_start( id, stream0, "comm", "D to GPUs" ); magma_ssetmatrix_async( jb, jb, Aup(j,j), lda, dlA(id, j, nb*j_local), ldda, stream[id][stream0] ); trace_gpu_end( id, stream0 ); } /* panel-factorize the off-diagonal */ if ( (j+jb) < n) { d = (j/nb+1)%num_gpus; for( dd=0; dd < num_gpus; dd++ ) { /* next column */ j_local2 = j_local+1; if ( d > id ) j_local2--; if ( d == id ) { dlpanel = dlA(d, j, nb*j_local); 
ldpanel = ldda; } else { dlpanel = dlP(d, 0, 0, buf); ldpanel = lddp; } nb2 = n_local[d]-nb*j_local2; nb0 = min(nb, nb2 ); magma_setdevice(d); magmablasSetKernelStream(stream[d][stream1]); magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for the diagonal if ( j+jb < m && d == (j/nb+1)%num_gpus ) { /* owns the next column, look-ahead the column */ trace_gpu_start( d, stream1, "trsm", "trsm" ); #if defined(PRECISION_d) && defined(STRSM_WORK) magmablas_strsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, jb, nb0, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda, d_dinvA[d][0], d_x[d][0] ); /*nb2 = n_local[d] - j_local2*nb; magmablas_strsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda, d_dinvA[d], d_x[d] ); */ #else /*nb2 = n_local[d] - j_local2*nb; magma_strsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldda, dlA(d, j, nb*j_local2), ldda); */ magma_strsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, jb, nb0, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda); #endif trace_gpu_end( d, stream1 ); magma_event_record( event[d][3], stream[d][stream1] ); /* send the column to cpu */ if ( j+jb < m ) { trace_gpu_start( d, stream0, "comm", "rows to CPU" ); magma_queue_wait_event( stream[d][stream0], event[d][3] ); // wait for lookahead magma_sgetmatrix_async( (j+jb), nb0, dlA(d, 0, nb*j_local2), ldda, Aup(0,j+jb), lda, stream[d][stream0] ); trace_gpu_end( d, stream0 ); magma_event_record( event[d][1], stream[d][stream0] ); } /* update the remaining blocks */ nb2 = nb2 - nb0; #if defined(PRECISION_d) && defined(STRSM_WORK) magmablas_strsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2+nb0), ldda, d_dinvA[d][1], d_x[d][1] ); #else magma_strsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2+nb0), ldda); #endif } else if 
( nb2 > 0 ) { /* update the entire trailing matrix */ trace_gpu_start( d, stream1, "trsm", "trsm" ); #if defined(PRECISION_d) && defined(STRSM_WORK) magmablas_strsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda, d_dinvA[d][1], d_x[d][1] ); #else magma_strsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda); #endif trace_gpu_end( d, stream1 ); } d = (d+1)%num_gpus; } } /* end of strsm */ } /* end of for j=1, .., n */ } else { /* -------------------------------------------- */ /* Lower-triangular case */ /* Compute the Cholesky factorization A = L*L'. */ /* -------------------------------------------- */ #if defined(PRECISION_d) && defined(STRSM_WORK) /* * Allocate device memory for the inversed diagonal blocks, size=N*BLOCK_SIZE */ for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); for( j=0; j < 2; j++ ) { magma_smalloc( &d_dinvA[d][j], nb*nb ); magma_smalloc( &d_x[d][j], nb*m ); cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(float)); cudaMemset(d_x[d][j], 0, nb* m*sizeof(float)); } } magma_setdevice(0); #endif for (j=0; j < n; j += nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%num_gpus; buf = (j/nb)%num_gpus; /* Set the local index where the current panel is */ j_local = j/(nb*num_gpus); jb = min(nb, (n-j)); if ( j > 0 ) { /* needed on pluto... 
*/ magma_setdevice(id); magma_queue_sync( stream[id][stream0] ); // wait for the column on CPU /* broadcast offdiagonal row to all gpus */ d = (j/nb+1)%num_gpus; for( dd=0; dd < num_gpus; dd++ ) { if ( d != id ) { magma_setdevice(d); /* wait for it on CPU */ magma_queue_wait_event( stream[d][stream0], event[id][1] ); /* send it to GPU */ magma_ssetmatrix_async( jb, j, Alo(j,0), lda, dlPT(d,0,jb,buf), nb, stream[d][stream0] ); magma_event_record( event[d][1], stream[d][stream0] ); } d = (d+1)%num_gpus; } } /* Update the current diagonal block */ magma_setdevice(id); if ( j > 0 ) { magmablasSetKernelStream(stream[id][stream1]); magma_ssyrk(MagmaLower, MagmaNoTrans, jb, j, d_neg_one, dlA(id, nb*j_local, 0), ldda, d_one, dlA(id, nb*j_local, j), ldda); magma_event_record( event[id][0], stream[id][stream1] ); } /* send the diagonal to cpu */ magma_queue_wait_event( stream[id][stream0], event[id][0] ); // wait for syrk magma_sgetmatrix_async( jb, jb, dlA(id, nb*j_local, j), ldda, Alo(j,j), lda, stream[id][stream0] ); /* update the offdiagonal blocks */ if ( j > 0 ) { /* compute the block-rows of the panel */ d = (j/nb+1)%num_gpus; for( dd=0; dd < num_gpus; dd++ ) { j_local2 = j_local+1; if ( d > id ) j_local2 --; nb0 = nb*j_local2; if ( nb0 < n_local[d] ) { if ( d != id ) { dlpanel = dlPT(d, 0, jb, buf); ldpanel = nb; /* wait for offdiagonal row */ magma_queue_wait_event( stream[d][stream1], event[d][1] ); } else { dlpanel = dlA(d, nb*j_local, 0); ldpanel = ldda; } magma_setdevice(d); magmablasSetKernelStream(stream[d][stream1]); magma_sgemm( MagmaNoTrans, MagmaTrans, n_local[d]-nb0, jb, j, c_neg_one, dlA(d, nb0, 0), ldda, dlpanel, ldpanel, c_one, dlA(d, nb0, j), ldda); } d = (d+1)%num_gpus; } } /* factor the diagonal */ magma_setdevice(id); magma_queue_sync( stream[id][stream0] ); lapackf77_spotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info); if (*info != 0) { *info = *info + j; break; } /* send the diagonal to gpus */ if ( (j+jb) < m ) { d = (j/nb+1)%num_gpus; for( dd=0; 
dd < num_gpus; dd++ ) { magma_setdevice(d); if ( d == id ) { dlpanel = dlA(d, nb*j_local, j); ldpanel = ldda; } else { dlpanel = dlPT(d, 0, 0, buf); ldpanel = nb; } magma_ssetmatrix_async( jb, jb, Alo(j,j), lda, dlpanel, ldpanel, stream[d][stream0] ); magma_event_record( event[d][2], stream[d][stream0] ); d = (d+1)%num_gpus; } } else { magma_setdevice(id); magma_ssetmatrix_async( jb, jb, Alo(j,j), lda, dlA(id, nb*j_local, j), ldda, stream[id][stream0] ); } /* factorize off-diagonal blocks */ if ( (j+jb) < m ) { d = (j/nb+1)%num_gpus; for( dd=0; dd < num_gpus; dd++ ) { /* next column */ j_local2 = j_local+1; if ( d > id ) j_local2--; if ( d == id ) { dlpanel = dlA(d, nb*j_local, j); ldpanel = ldda; } else { dlpanel = dlPT(d, 0, 0, buf); ldpanel = nb; } nb2 = n_local[d] - j_local2*nb; nb0 = min(nb, nb2 ); magma_setdevice(d); magmablasSetKernelStream(stream[d][stream1]); magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for the diagonal if ( j+jb < n && d == (j/nb+1)%num_gpus ) { /* owns the next column, look-ahead the column */ #if defined(PRECISION_d) && defined(STRSM_WORK) magmablas_strsm_work( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, nb0, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda, d_dinvA[d][0], d_x[d][0]); #else magma_strsm( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, nb0, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda); #endif magma_event_record( event[d][3], stream[d][stream1] ); /* send the column to cpu */ if ( j+jb < n ) { magma_queue_wait_event( stream[d][stream0], event[d][3] ); // wait for lookahead magma_sgetmatrix_async( nb0, j+jb, dlA(d, nb*j_local2, 0), ldda, Alo(j+jb,0), lda, stream[d][stream0] ); magma_event_record( event[d][1], stream[d][stream0] ); } /* update the remaining blocks */ nb2 = nb2 - nb0; #if defined(PRECISION_d) && defined(STRSM_WORK) magmablas_strsm_work( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2+nb0, j), ldda, 
d_dinvA[d][1], d_x[d][1] ); #else magma_strsm( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2+nb0, j), ldda); #endif } else if ( nb2 > 0 ) { /* update the entire trailing matrix */ #if defined(PRECISION_d) && defined(STRSM_WORK) magmablas_strsm_work( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda, d_dinvA[d][1], d_x[d][1] ); #else magma_strsm( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda); #endif } d = (d+1)%num_gpus; } } } } /* end of else not upper */ /* == finalize the trace == */ trace_finalize( "spotrf.svg", "trace.css" ); /* clean up */ for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); //magma_event_destroy( event0[d] ); //magma_event_destroy( event1[d] ); //magma_event_destroy( event2[d] ); //magma_event_destroy( event3[d] ); } magma_setdevice(0); return *info; } /* magma_spotrf_mgpu */
/* //////////////////////////////////////////////////////////////////////////// -- Testing magma_dsymm_mgpu */ int main( int argc, char** argv) { TESTING_INIT(); double c_neg_one = MAGMA_D_NEG_ONE; double alpha = MAGMA_D_MAKE( 3.456, 5.678 ); double beta = MAGMA_D_MAKE( 1.234, 2.456 ); real_Double_t gflops, gpu_perf=0., cpu_perf=0., gpu_time=0., cpu_time=0.; real_Double_t gpu_perf2=0., gpu_time2=0.; double Anorm, error, work[1]; double *hA, *hB, *hC, *hR; magmaDouble_ptr dA[MagmaMaxGPUs], dB[MagmaMaxGPUs], dC[MagmaMaxGPUs], dwork[MagmaMaxGPUs]; magmaDouble_ptr dA2; magma_int_t i, j, dev, M, N, size, lda, ldb, ldc, ldda, lddb, lddc, msize, nb; magma_int_t ione = 1; magma_int_t iseed[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); opts.ngpu = abs( opts.ngpu ); // always uses multi-GPU code double tol = opts.tolerance * lapackf77_dlamch("E"); // default values nb = (opts.nb > 0 ? opts.nb : 64); magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2]; magma_int_t ncmplx = 0; magma_buildconnection_mgpu( gnode, &ncmplx, opts.ngpu ); printf("%% Initializing communication pattern... GPU-ncmplx %d\n", (int) ncmplx); for (i=0; i < ncmplx; ++i) { magma_int_t myngpu = gnode[i][MagmaMaxGPUs]; printf("%% cmplx %d has %d GPUs:", i, myngpu); for (j=0; j < myngpu; ++j) { printf(" %d", (int) gnode[i][j]); if (j < myngpu-1) { printf(","); } } printf("\n"); } // number of queues per GPU. Requires ngpu. magma_int_t nqueue = opts.ngpu; // number of events per GPU. Require ngpu*ngpu. 
magma_int_t nevents = opts.ngpu*opts.ngpu; magma_queue_t queues[MagmaMaxGPUs][20], queues0[MagmaMaxGPUs]; magma_event_t events[MagmaMaxGPUs][MagmaMaxGPUs*MagmaMaxGPUs + 10]; for( dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); for( i = 0; i < nqueue; ++i ) { magma_queue_create( dev, &queues[dev][i] ); } queues0[dev] = queues[dev][0]; for( i = 0; i < nevents; ++i ) { cudaEventCreateWithFlags( &events[dev][i], cudaEventDisableTiming ); } } printf("%% nb %d, ngpu %d, version %d\n", (int) nb, (int) opts.ngpu, (int) opts.version ); printf("%% M N nb offset CPU Gflop/s (sec) GPU Gflop/s (sec) CUBLAS hemm (sec) ||R|| / ||A||*||B||\n"); printf("%%========================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { M = opts.msize[itest]; N = opts.nsize[itest]; for( int offset = 0; offset < N; offset += min(N,nb) ) { for( int iter = 0; iter < opts.niter; ++iter ) { msize = M - offset; lda = M; // TODO depends on side ldb = M; ldc = M; ldda = magma_roundup( lda, opts.align ); // multiple of 32 by default lddb = magma_roundup( ldb, opts.align ); // multiple of 32 by default lddc = magma_roundup( ldc, opts.align ); // multiple of 32 by default gflops = FLOPS_DSYMM( MagmaLeft, (double)msize, (double)N ) / 1e9; magma_int_t dworksiz = lddc*N + (M*N)*opts.ngpu; TESTING_MALLOC_CPU( hA, double, lda*M ); TESTING_MALLOC_CPU( hB, double, ldb*N ); TESTING_MALLOC_CPU( hC, double, ldc*N ); TESTING_MALLOC_PIN( hR, double, ldc*N ); for( dev = 0; dev < opts.ngpu; ++dev ) { magma_int_t mlocal = ((M / nb) / opts.ngpu + 1) * nb; magma_setdevice( dev ); TESTING_MALLOC_DEV( dA[dev], double, ldda*mlocal ); TESTING_MALLOC_DEV( dB[dev], double, lddb*N ); TESTING_MALLOC_DEV( dC[dev], double, lddc*N ); TESTING_MALLOC_DEV( dwork[dev], double, dworksiz ); } if ( opts.check ) { magma_setdevice( 0 ); TESTING_MALLOC_DEV( dA2, double, ldda*M ); } size = lda*M; lapackf77_dlarnv( &ione, iseed, &size, hA ); 
magma_dmake_symmetric( M, hA, lda ); size = ldb*N; lapackf77_dlarnv( &ione, iseed, &size, hB ); size = ldc*N; lapackf77_dlarnv( &ione, iseed, &size, hC ); lapackf77_dlacpy( "Full", &M, &N, hC, &ldc, hR, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_dsetmatrix_1D_col_bcyclic( M, M, hA, lda, dA, ldda, opts.ngpu, nb, queues0 ); for( dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); magma_dsetmatrix( M, N, hB, lda, dB[dev], ldda, opts.queue ); // since when offset != 0, the GPU that does beta*C may not be 0, // send initial hC to all GPUs. magma_dsetmatrix( M, N, hC, lda, dC[dev], ldda, opts.queue ); } trace_init( 1, opts.ngpu, nqueue, (magma_queue_t*) queues ); gpu_time = magma_sync_wtime(0); magmablas_dsymm_mgpu( MagmaLeft, MagmaLower, msize, N, alpha, dA, ldda, offset, dB, ldda, beta, dC, ldda, dwork, dworksiz, opts.ngpu, nb, queues, nqueue, events, nevents, gnode, ncmplx); gpu_time = magma_sync_wtime(0) - gpu_time; gpu_perf = gflops / gpu_time; #ifdef TRACING char buf[80]; snprintf( buf, sizeof(buf), "dsymm-m%d-n%d-nb%d-ngpu%d-run%d.svg", (int) M, (int) N, (int) nb, (int) opts.ngpu, (int) iter ); trace_finalize( buf, "trace.css" ); #endif /* ==================================================================== Performs operation using CUBLAS =================================================================== */ if ( opts.check && iter == 0 ) { magma_setdevice( 0 ); magma_dsetmatrix( M, M, hA, lda, dA2, ldda, opts.queue ); magma_dsetmatrix( M, N, hB, lda, dB[0], ldda, opts.queue ); magma_dsetmatrix( M, N, hC, lda, dwork[0], ldda, opts.queue ); gpu_time2 = magma_sync_wtime(0); magma_dsymm( MagmaLeft, MagmaLower, msize, N, alpha, dA2 + offset + offset*ldda, ldda, dB[0], ldda, beta, dwork[0], ldda, opts.queue ); gpu_time2 = magma_sync_wtime(0) - gpu_time2; gpu_perf2 = gflops / gpu_time2; } /* 
===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.check ) { // store ||A||*||B|| Anorm = lapackf77_dlange("fro", &msize, &msize, hA + offset + offset*lda, &lda, work ); Anorm *= lapackf77_dlange("fro", &msize, &N, hB, &lda, work ); //printf( "A =" ); magma_dprint( M, M, hA, lda ); //printf( "B =" ); magma_dprint( M, N, hB, lda ); //printf( "C =" ); magma_dprint( M, N, hC, lda ); cpu_time = magma_wtime(); blasf77_dsymm( "Left", "Lower", &msize, &N, &alpha, hA + offset + offset*lda, &lda, hB, &lda, &beta, hC, &lda ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; for (dev=0; dev < opts.ngpu; ++dev) { magma_setdevice( dev ); magma_dgetmatrix( M, N, dC[dev], ldda, hR, lda, opts.queue ); // compute relative error ||R||/||A||*||B||, where R := C_magma - C_lapack = R - C size = ldc*N; blasf77_daxpy( &size, &c_neg_one, hC, &ione, hR, &ione ); error = lapackf77_dlange("fro", &msize, &N, hR, &lda, work) / Anorm; //printf( "R =" ); magma_dprint( M, N, hR, lda ); bool okay = (error < tol); status += ! okay; if (dev == 0) { printf( "%5d %5d %5d %5d %7.1f (%7.4f) %7.1f (%7.4f) %7.1f (%7.4f) %8.2e %s\n", (int) M, (int) N, (int) nb, (int) offset, cpu_perf, cpu_time, gpu_perf, gpu_time, gpu_perf2, gpu_time2, error, (okay ? "ok" : "failed") ); } else { printf( " dev %d %74s %8.2e %s\n", dev, "", error, (okay ? 
"ok" : "failed") ); } } } else { printf( "%5d %5d %5d %5d --- ( --- ) %7.1f (%7.4f) --- ( --- ) ---\n", (int) M, (int) N, (int) nb, (int) offset, gpu_perf, gpu_time ); } TESTING_FREE_CPU( hA ); TESTING_FREE_CPU( hB ); TESTING_FREE_CPU( hC ); TESTING_FREE_PIN( hR ); for( dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); TESTING_FREE_DEV( dA[dev] ); TESTING_FREE_DEV( dB[dev] ); TESTING_FREE_DEV( dC[dev] ); TESTING_FREE_DEV( dwork[dev] ); } if ( opts.check ) { magma_setdevice( 0 ); TESTING_FREE_DEV( dA2 ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } // offset printf( "\n" ); } for( dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); for( i = 0; i < nqueue; ++i ) { magma_queue_destroy( queues[dev][i] ); } for( i = 0; i < nevents; ++i ) { magma_event_destroy( events[dev][i] ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }