/** Purpose ------- ZUNMQR overwrites the general complex M-by-N matrix C with @verbatim SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q * C C * Q TRANS = Magma_ConjTrans: Q**H * C C * Q**H @endverbatim where Q is a complex unitary matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by ZGEQRF. Q is of order M if SIDE = MagmaLeft and of order N if SIDE = MagmaRight. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] side magma_side_t - = MagmaLeft: apply Q or Q**H from the Left; - = MagmaRight: apply Q or Q**H from the Right. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q; - = Magma_ConjTrans: Conjugate transpose, apply Q**H. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = MagmaLeft, M >= K >= 0; if SIDE = MagmaRight, N >= K >= 0. @param[in] A COMPLEX_16 array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by ZGEQRF in the first k columns of its array argument A. @param[in] lda INTEGER The leading dimension of the array A. If SIDE = MagmaLeft, LDA >= max(1,M); if SIDE = MagmaRight, LDA >= max(1,N). @param[in] tau COMPLEX_16 array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by ZGEQRF. @param[in,out] C COMPLEX_16 array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q. @param[in] ldc INTEGER The leading dimension of the array C. LDC >= max(1,M). @param[out] work (workspace) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. If SIDE = MagmaLeft, LWORK >= max(1,N); if SIDE = MagmaRight, LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = MagmaLeft, and LWORK >= M*NB if SIDE = MagmaRight, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. 
@param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_zgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_zunmqr_m( magma_int_t ngpu, magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *tau, magmaDoubleComplex *C, magma_int_t ldc, magmaDoubleComplex *work, magma_int_t lwork, magma_int_t *info) { #define A(i, j) (A + (j)*lda + (i)) #define C(i, j) (C + (j)*ldc + (i)) #define dC(gpui, i, j) (dw[gpui] + (j)*lddc + (i)) #define dA_c(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddac) #define dA_r(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddar) #define dT(gpui, ind) (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + (ind)*((nb+1)*nb)) #define dwork(gpui, ind) (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + 2*((nb+1)*nb) + (ind)*(lddwork*nb)) /* Constants */ magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; /* Local variables */ const char* side_ = lapack_side_const( side ); const char* trans_ = lapack_trans_const( trans ); magma_int_t nb = 128; magmaDoubleComplex *T = NULL; magmaDoubleComplex_ptr dw[MagmaMaxGPUs] = { NULL }; magma_queue_t queues[MagmaMaxGPUs][2] = {{ NULL }}; magma_event_t events[MagmaMaxGPUs][2] = {{ NULL }}; magma_int_t ind_c; magma_device_t dev; magma_device_t orig_dev; magma_getdevice( &orig_dev ); *info = 0; magma_int_t left = (side == MagmaLeft); magma_int_t notran = (trans == MagmaNoTrans); magma_int_t lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ magma_int_t nq, nw; if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if (! left && side != MagmaRight) { *info = -1; } else if (! notran && trans != Magma_ConjTrans) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } magma_int_t lwkopt = max(1,nw) * nb; if (*info == 0) { work[0] = magma_zmake_lwork( lwkopt ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = c_one; return *info; } if (nb >= k) { /* Use CPU code */ lapackf77_zunmqr(side_, trans_, &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, info); return *info; } magma_int_t lddc = magma_roundup( m, 64 ); // TODO why 64 instead of 32 ? magma_int_t lddac = nq; magma_int_t lddar = nb; magma_int_t lddwork = nw; magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 }; magma_int_t nb_l=256; magma_int_t nbl = magma_ceildiv( n, nb_l ); // number of blocks magma_int_t maxnlocal = magma_ceildiv( nbl, ngpu )*nb_l; ngpu = min( ngpu, magma_ceildiv( n, nb_l )); // Don't use GPU that will not have data. 
magma_int_t ldw = maxnlocal*lddc // dC + 2*lddac*lddar // 2*dA + 2*(nb + 1 + lddwork)*nb; // 2*(dT and dwork) if (MAGMA_SUCCESS != magma_zmalloc_pinned( &T, nb*nb )) { *info = MAGMA_ERR_HOST_ALLOC; goto cleanup; } for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); if (MAGMA_SUCCESS != magma_zmalloc( &dw[dev], ldw )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto cleanup; } magma_queue_create( dev, &queues[dev][0] ); magma_queue_create( dev, &queues[dev][1] ); magma_event_create( &events[dev][0] ); magma_event_create( &events[dev][1] ); } /* Use hybrid CPU-MGPU code */ if (left) { //copy C to mgpus for (magma_int_t i = 0; i < nbl; ++i) { dev = i % ngpu; magma_setdevice( dev ); magma_int_t kb = min(nb_l, n-i*nb_l); magma_zsetmatrix_async( m, kb, C(0, i*nb_l), ldc, dC(dev, 0, i/ngpu*nb_l), lddc, queues[dev][0] ); nlocal[dev] += kb; } magma_int_t i1, i2, i3; if ( !notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } ind_c = 0; for (magma_int_t i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { // start the copy of A panel magma_int_t kb = min(nb, k - i); for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); magma_event_sync( events[dev][ind_c] ); // check if the new data can be copied magma_zsetmatrix_async(nq-i, kb, A(i, i), lda, dA_c(dev, ind_c, i, 0), lddac, queues[dev][0] ); // set upper triangular part of dA to identity magmablas_zlaset_band( MagmaUpper, kb, kb, kb, c_zero, c_one, dA_c(dev, ind_c, i, 0), lddac, queues[dev][0] ); } /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ magma_int_t nqi = nq - i; lapackf77_zlarft("F", "C", &nqi, &kb, A(i, i), &lda, &tau[i], T, &kb); /* H or H' is applied to C(1:m,i:n) */ /* Apply H or H'; First copy T to the GPU */ for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); magma_zsetmatrix_async(kb, kb, T, kb, dT(dev, ind_c), kb, queues[dev][0] ); } for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); magma_queue_sync( queues[dev][0] ); // check if the data was copied magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, m-i, nlocal[dev], kb, dA_c(dev, ind_c, i, 0), lddac, dT(dev, ind_c), kb, dC(dev, i, 0), lddc, dwork(dev, ind_c), lddwork, queues[dev][1] ); magma_event_record(events[dev][ind_c], queues[dev][1] ); } ind_c = (ind_c+1)%2; } for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); magma_queue_sync( queues[dev][1] ); } //copy C from mgpus for (magma_int_t i = 0; i < nbl; ++i) { dev = i % ngpu; magma_setdevice( dev ); magma_int_t kb = min(nb_l, n-i*nb_l); magma_zgetmatrix( m, kb, dC(dev, 0, i/ngpu*nb_l), lddc, C(0, i*nb_l), ldc, queues[dev][1] ); // magma_zgetmatrix_async( m, kb, // dC(dev, 0, i/ngpu*nb_l), lddc, // C(0, i*nb_l), ldc, queues[dev][0] ); } } else { *info = MAGMA_ERR_NOT_IMPLEMENTED; magma_xerbla( __func__, -(*info) ); goto cleanup; /* if ( notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } mi = m; ic = 0; for (i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { ib = min(nb, k - i); // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . 
H(i+ib-1) i__4 = nq - i; lapackf77_zlarft("F", "C", &i__4, &ib, A(i, i), &lda, &tau[i], T, &ib); // 1) copy the panel from A to the GPU, and // 2) set upper triangular part of dA to identity magma_zsetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda, queues[dev][1] ); magmablas_zlaset_band( MagmaUpper, ib, ib, ib, c_zero, c_one, dA(i, 0), ldda, queues[dev][1] ); // H or H' is applied to C(1:m,i:n) ni = n - i; jc = i; // Apply H or H'; First copy T to the GPU magma_zsetmatrix( ib, ib, T, ib, dT, ib, queues[dev][1] ); magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dA(i, 0), ldda, dT, ib, dC(ic, jc), lddc, dwork, lddwork, queues[dev][1] ); } */ } cleanup: work[0] = magma_zmake_lwork( lwkopt ); for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); magma_event_destroy( events[dev][0] ); magma_event_destroy( events[dev][1] ); magma_queue_destroy( queues[dev][0] ); magma_queue_destroy( queues[dev][1] ); magma_free( dw[dev] ); } magma_setdevice( orig_dev ); magma_free_pinned( T ); return *info; } /* magma_zunmqr */
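/*
 * Illustrative usage sketch (not part of MAGMA): apply Q**H from the left to an
 * M-by-N matrix C, where A and tau hold the output of ZGEQRF, following the
 * LWORK = -1 workspace-query convention documented above.  The helper name is
 * hypothetical; error handling is reduced to returning info.
 */
static magma_int_t
example_zunmqr_m_left( magma_int_t ngpu,
                       magma_int_t m, magma_int_t n, magma_int_t k,
                       magmaDoubleComplex *A,   magma_int_t lda,
                       magmaDoubleComplex *tau,
                       magmaDoubleComplex *C,   magma_int_t ldc )
{
    magma_int_t info;
    magmaDoubleComplex query;

    /* 1) workspace query: the optimal LWORK is returned in query */
    magma_zunmqr_m( ngpu, MagmaLeft, Magma_ConjTrans, m, n, k,
                    A, lda, tau, C, ldc, &query, -1, &info );
    if (info != 0)
        return info;

    magma_int_t lwork = (magma_int_t) MAGMA_Z_REAL( query );
    magmaDoubleComplex *work;
    if (MAGMA_SUCCESS != magma_zmalloc_cpu( &work, lwork ))
        return MAGMA_ERR_HOST_ALLOC;

    /* 2) C := Q**H * C, distributed over ngpu GPUs */
    magma_zunmqr_m( ngpu, MagmaLeft, Magma_ConjTrans, m, n, k,
                    A, lda, tau, C, ldc, work, lwork, &info );

    magma_free_cpu( work );
    return info;
}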
/** Purpose ------- ZPOTRF_OOC computes the Cholesky factorization of a complex Hermitian positive definite matrix A. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix A may exceed the GPU memory.
The factorization has the form A = U**H * U, if UPLO = MagmaUpper, or A = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS.
Arguments ---------
@param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0.
@param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored.
@param[in] n INTEGER The order of the matrix A. N >= 0.
@param[in,out] A COMPLEX_16 array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization A = U**H * U or A = L * L**H. \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned.
@param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N).
@param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a failed memory allocation. - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed.
@ingroup magma_zposv_comp ********************************************************************/ extern "C" magma_int_t magma_zpotrf_m( magma_int_t ngpu, magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex *A, magma_int_t lda, magma_int_t *info) { #define A(i, j) ( A + (j)*lda + (i)) #define dA(d, i, j) (dwork[(d)] + (j)*lddla + (i)) #define dT(d, i, j) ( dt[(d)] + (j)*ldda + (i)) #define dAup(d, i, j) (dwork[(d)] + (j)*NB + (i)) #define dTup(d, i, j) ( dt[(d)] + (j)*nb + (i)) /* Local variables */ double d_one = 1.0; double d_neg_one = -1.0; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; const char* uplo_ = lapack_uplo_const( uplo ); int upper = (uplo == MagmaUpper); magmaDoubleComplex *dwork[MagmaMaxGPUs], *dt[MagmaMaxGPUs]; magma_int_t ldda, lddla, nb, iinfo, n_local[MagmaMaxGPUs], J2, d, ngpu0 = ngpu; magma_int_t j, jj, jb, J, JB, NB, h; magma_queue_t stream[MagmaMaxGPUs][3]; magma_event_t event[MagmaMaxGPUs][5]; magma_timer_t time_total=0, time_sum=0, time=0; *info = 0; if (!
upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return */ if ( n == 0 ) return *info; magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); nb = magma_get_dpotrf_nb(n); if ( ngpu0 > n/nb ) { ngpu = n/nb; if ( n%nb != 0 ) ngpu ++; } else { ngpu = ngpu0; } //ldda = ((n+31)/32)*32; ldda = ((n+nb-1)/nb)*nb; lddla = ((nb*((n+nb*ngpu-1)/(nb*ngpu))+31)/32)*32; /* figure out NB */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(magmaDoubleComplex); //MB = n; /* number of rows in the big panel */ NB = (magma_int_t)((0.8*freeMem - max(2,ngpu)*nb*ldda - (n+nb)*nb)/lddla); /* number of columns in the big panel */ //NB = min(5*nb,n); if ( NB >= n ) { #ifdef CHECK_ZPOTRF_OOC printf( " * still fits in GPU memory.\n" ); #endif NB = n; } else { #ifdef CHECK_ZPOTRF_OOC printf( " * doesn't fit in GPU memory.\n" ); #endif NB = (NB/nb) * nb; /* making sure it's devisable by nb */ } #ifdef CHECK_ZPOTRF_OOC if ( NB != n ) printf( " * running in out-core mode (n=%d, NB=%d, nb=%d, lddla=%d, freeMem=%.2e).\n", n, NB, nb, lddla, (double)freeMem ); else printf( " * running in in-core mode (n=%d, NB=%d, nb=%d, lddla=%d, freeMem=%.2e).\n", n, NB, nb, lddla, (double)freeMem ); fflush(stdout); #endif for (d=0; d < ngpu; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_zmalloc( &dt[d], NB*lddla + max(2,ngpu)*nb*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dwork[d] = &dt[d][max(2,ngpu)*nb*ldda]; for( j=0; j < 3; j++ ) magma_queue_create( &stream[d][j] ); for( j=0; j < 5; j++ ) magma_event_create( &event[d][j] ); magma_device_sync(); // synch the device } magma_setdevice(0); timer_start( time_total ); if (nb <= 1 || nb >= n) { lapackf77_zpotrf(uplo_, &n, A, &lda, info); } else { /* Use hybrid blocked code. */ if (upper) { /* =========================================================== * * Compute the Cholesky factorization A = U'*U. 
* * big panel is divided by block-row and distributed in block * * column cyclic format */ /* for each big-panel */ for( J=0; J < n; J += NB ) { JB = min(NB,n-J); if ( ngpu0 > (n-J)/nb ) { ngpu = (n-J)/nb; if ( (n-J)%nb != 0 ) ngpu ++; } else { ngpu = ngpu0; } /* load the new big-panel by block-rows */ magma_zhtodpo( ngpu, uplo, JB, n, J, J, nb, A, lda, dwork, NB, stream, &iinfo); /* update with the previous big-panels */ timer_start( time ); for( j=0; j < J; j += nb ) { /* upload the diagonal of the block column (broadcast to all GPUs) */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_zsetmatrix_async( nb, JB, A(j, J), lda, dTup(d, 0, J), nb, stream[d][0] ); n_local[d] = 0; } /* distribute off-diagonal blocks to GPUs */ for( jj=J+JB; jj < n; jj += nb ) { d = ((jj-J)/nb)%ngpu; magma_setdevice(d); jb = min(nb, n-jj); magma_zsetmatrix_async( nb, jb, A(j, jj), lda, dTup(d, 0, J+JB+n_local[d]), nb, stream[d][0] ); n_local[d] += jb; } /* wait for the communication */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); } /* update the current big-panel using the previous block-row */ /* -- process the big diagonal block of the big panel */ for( jj=0; jj < JB; jj += nb ) { // jj is 'local' column index within the big panel d = (jj/nb)%ngpu; J2 = jj/(nb*ngpu); magma_setdevice(d); magmablasSetKernelStream(stream[d][J2%2]); // the last stream (2) used to process off-diagonal J2 = nb*J2; jb = min(nb,JB-jj); // number of columns in this current block-row magma_zgemm( MagmaConjTrans, MagmaNoTrans, jj, jb, nb, c_neg_one, dTup(d, 0, J ), nb, dTup(d, 0, J+jj), nb, c_one, dAup(d, 0, J2), NB); magma_zherk(MagmaUpper, MagmaConjTrans, jb, nb, d_neg_one, dTup(d, 0, J+jj), nb, d_one, dAup(d, jj, J2), NB); } /* -- process the remaining big off-diagonal block of the big panel */ if ( n > J+JB ) { for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][2]); /* local number of columns in the big panel */ n_local[d] = ((n-J)/(nb*ngpu))*nb; if (d < ((n-J)/nb)%ngpu) n_local[d] += nb; else if (d == ((n-J)/nb)%ngpu) n_local[d] += (n-J)%nb; /* subtracting the local number of columns in the diagonal */ J2 = nb*(JB/(nb*ngpu)); if ( d < (JB/nb)%ngpu ) J2 += nb; n_local[d] -= J2; magma_zgemm( MagmaConjTrans, MagmaNoTrans, JB, n_local[d], nb, c_neg_one, dTup(d, 0, J ), nb, dTup(d, 0, J+JB), nb, c_one, dAup(d, 0, J2), NB); } } /* wait for the previous updates */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); for( jj=0; jj < 3; jj++ ) magma_queue_sync( stream[d][jj] ); magmablasSetKernelStream(NULL); } magma_setdevice(0); } /* end of updates with previous rows */ /* factor the big panel */ h = (JB+nb-1)/nb; // big diagonal of big panel will be on CPU // using three streams magma_zpotrf3_mgpu(ngpu, uplo, JB, n-J, J, J, nb, dwork, NB, dt, ldda, A, lda, h, stream, event, &iinfo); if ( iinfo != 0 ) { *info = J+iinfo; break; } time_sum += timer_stop( time ); /* upload the off-diagonal (and diagonal!!!) big panel */ magma_zdtohpo(ngpu, uplo, JB, n, J, J, nb, NB, A, lda, dwork, NB, stream, &iinfo); //magma_zdtohpo(ngpu, uplo, JB, n, J, J, nb, 0, A, lda, dwork, NB, stream, &iinfo); } } else { /* ========================================================= * * Compute the Cholesky factorization A = L*L'. 
*/ /* for each big-panel */ for( J=0; J < n; J += NB ) { JB = min(NB,n-J); if ( ngpu0 > (n-J)/nb ) { ngpu = (n-J)/nb; if ( (n-J)%nb != 0 ) ngpu ++; } else { ngpu = ngpu0; } /* load the new big-panel by block-columns */ magma_zhtodpo( ngpu, uplo, n, JB, J, J, nb, A, lda, dwork, lddla, stream, &iinfo); /* update with the previous big-panels */ timer_start( time ); for( j=0; j < J; j += nb ) { /* upload the diagonal of big panel */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_zsetmatrix_async( JB, nb, A(J, j), lda, dT(d, J, 0), ldda, stream[d][0] ); n_local[d] = 0; } /* upload off-diagonals */ for( jj=J+JB; jj < n; jj += nb ) { d = ((jj-J)/nb)%ngpu; magma_setdevice(d); jb = min(nb, n-jj); magma_zsetmatrix_async( jb, nb, A(jj, j), lda, dT(d, J+JB+n_local[d], 0), ldda, stream[d][0] ); n_local[d] += jb; } /* wait for the communication */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); } /* update the current big-panel using the previous block-row */ for( jj=0; jj < JB; jj += nb ) { /* diagonal */ d = (jj/nb)%ngpu; J2 = jj/(nb*ngpu); magma_setdevice(d); magmablasSetKernelStream(stream[d][J2%2]); J2 = nb*J2; jb = min(nb,JB-jj); magma_zgemm( MagmaNoTrans, MagmaConjTrans, jb, jj, nb, c_neg_one, dT(d, J+jj, 0), ldda, dT(d, J, 0), ldda, c_one, dA(d, J2, 0), lddla); magma_zherk(MagmaLower, MagmaNoTrans, jb, nb, d_neg_one, dT(d, J+jj, 0), ldda, d_one, dA(d, J2, jj), lddla); } if ( n > J+JB ) { /* off-diagonal */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][2]); /* local number of columns in the big panel */ n_local[d] = (((n-J)/nb)/ngpu)*nb; if (d < ((n-J)/nb)%ngpu) n_local[d] += nb; else if (d == ((n-J)/nb)%ngpu) n_local[d] += (n-J)%nb; /* subtracting local number of columns in diagonal */ J2 = nb*(JB/(nb*ngpu)); if ( d < (JB/nb)%ngpu ) J2 += nb; n_local[d] -= J2; magma_zgemm( MagmaNoTrans, MagmaConjTrans, n_local[d], JB, nb, c_neg_one, dT(d, J+JB, 0), ldda, dT(d, J, 0), ldda, c_one, dA(d, J2, 0), lddla); } } /* wait for the previous updates */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); for( jj=0; jj < 3; jj++ ) magma_queue_sync( stream[d][jj] ); magmablasSetKernelStream(NULL); } magma_setdevice(0); } /* factor the big panel */ h = (JB+nb-1)/nb; // big diagonal of big panel will be on CPU // using three streams magma_zpotrf3_mgpu(ngpu, uplo, n-J, JB, J, J, nb, dwork, lddla, dt, ldda, A, lda, h, stream, event, &iinfo); if ( iinfo != 0 ) { *info = J+iinfo; break; } time_sum += timer_stop( time ); /* upload the off-diagonal big panel */ magma_zdtohpo( ngpu, uplo, n, JB, J, J, nb, JB, A, lda, dwork, lddla, stream, &iinfo); } /* end of for J */ } /* if upper */ } /* if nb */ timer_stop( time_total ); if ( ngpu0 > n/nb ) { ngpu = n/nb; if ( n%nb != 0 ) ngpu ++; } else { ngpu = ngpu0; } for (d=0; d < ngpu; d++ ) { magma_setdevice(d); for( j=0; j < 3; j++ ) { magma_queue_destroy( stream[d][j] ); } magma_free( dt[d] ); for( j=0; j < 5; j++ ) { magma_event_destroy( event[d][j] ); } } magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); timer_printf( "\n n=%d NB=%d nb=%d\n", (int) n, (int) NB, (int) nb ); timer_printf( " Without memory allocation: %f / %f = %f GFlop/s\n", FLOPS_ZPOTRF(n) / 1e9, time_total, FLOPS_ZPOTRF(n) / 1e9 / time_total ); timer_printf( " Performance %f / %f = %f GFlop/s\n", FLOPS_ZPOTRF(n) / 1e9, time_sum, FLOPS_ZPOTRF(n) / 1e9 / time_sum ); return *info; } /* magma_zpotrf_ooc */
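/*
 * Illustrative sketch (hypothetical helper, not part of MAGMA) of the
 * big-panel sizing used by magma_zpotrf_m above: take 80% of the free device
 * memory (counted in elements), subtract the per-GPU lookahead buffers,
 * divide by the padded local leading dimension, and round the result down to
 * a multiple of the blocking size nb.
 */
static magma_int_t
example_potrf_big_panel_width( size_t freeMem_elems, magma_int_t n,
                               magma_int_t nb, magma_int_t ngpu,
                               magma_int_t ldda, magma_int_t lddla )
{
    magma_int_t NB = (magma_int_t)
        ( (0.8*freeMem_elems - max(2,ngpu)*nb*ldda - (n+nb)*nb) / lddla );
    if ( NB >= n )
        NB = n;              /* whole matrix fits: in-core factorization */
    else
        NB = (NB/nb) * nb;   /* out-of-core: make NB divisible by nb     */
    return NB;
}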
/** Purpose ------- DGETRF_m computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix may not fit entirely in the GPU memory.
The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm.
Note: The factorization of each big panel is done by calling the multi-GPU interface. Pivots are applied on the GPU within the big panel.
Arguments ---------
@param[in] num_gpus INTEGER The number of GPUs. num_gpus > 0.
@param[in] m INTEGER The number of rows of the matrix A. M >= 0.
@param[in] n INTEGER The number of columns of the matrix A. N >= 0.
@param[in,out] A DOUBLE_PRECISION array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned.
@param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M).
@param[out] ipiv INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
@param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a failed memory allocation. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.
@ingroup magma_dgesv_comp ********************************************************************/ extern "C" magma_int_t magma_dgetrf_m(magma_int_t num_gpus, magma_int_t m, magma_int_t n, double *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info) { #define A(i,j) (A + (j)*lda + (i)) #define dAT(d,i,j) (dAT[d] + (i)*nb*ldn_local + (j)*nb) #define dPT(d,i,j) (dPT[d] + (i)*nb*nb + (j)*nb*maxm) magma_timer_t time=0, time_total=0, time_alloc=0, time_set=0, time_get=0, time_comp=0; timer_start( time_total ); real_Double_t flops; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double *dAT[MagmaMaxGPUs], *dA[MagmaMaxGPUs], *dPT[MagmaMaxGPUs]; magma_int_t iinfo = 0, nb, nbi, maxm, n_local[MagmaMaxGPUs], ldn_local; magma_int_t N, M, NB, NBk, I, d, num_gpus0 = num_gpus; magma_int_t ii, jj, h, offset, ib, rows, s; magma_queue_t stream[MagmaMaxGPUs][2]; magma_event_t event[MagmaMaxGPUs][2]; *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* initialize nb */ nb = magma_get_dgetrf_nb(m); maxm = ((m + 31)/32)*32; /* figure out NB */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(double); /* number of columns in the big panel */ h = 1+(2+num_gpus0); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); const char* ngr_nb_char = getenv("MAGMA_NGR_NB"); if ( ngr_nb_char != NULL ) NB = max( nb, min( NB, atoi(ngr_nb_char) ) ); //NB = 5*max(nb,32); if ( num_gpus0 > ceil((double)NB/nb) ) { num_gpus = (int)ceil((double)NB/nb); h = 1+(2+num_gpus); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); } else { num_gpus = num_gpus0; } if ( num_gpus*NB >= n ) { #ifdef CHECK_DGETRF_OOC printf( " * still fit in GPU memory.\n" ); #endif NB = n; } else { #ifdef CHECK_DGETRF_OOC printf( " * don't fit in GPU memory.\n" ); #endif NB = num_gpus*NB; NB = max( nb, (NB / nb) * nb); /* making sure it's devisable by nb (x64) */ } #ifdef CHECK_DGETRF_OOC if ( NB != n ) printf( " * running in out-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n", n, NB, nb, (double)freeMem ); else printf( " * running in in-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n", n, NB, nb, (double)freeMem ); #endif if ( (nb <= 1) || (nb >= min(m,n)) ) { /* Use CPU code for scalar of one tile. */ lapackf77_dgetrf(&m, &n, A, &lda, ipiv, info); } else { /* Use hybrid blocked code. 
*/ /* allocate memory on GPU to store the big panel */ timer_start( time_alloc ); n_local[0] = (NB/nb)/num_gpus; if ( NB%(nb*num_gpus) != 0 ) n_local[0] ++; n_local[0] *= nb; ldn_local = ((n_local[0]+31)/32)*32; for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_dmalloc( &dA[d], (ldn_local+h*nb)*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dPT[d] = dA[d] + nb*maxm; /* for storing the previous panel from CPU */ dAT[d] = dA[d] + h*nb*maxm; /* for storing the big panel */ magma_queue_create( &stream[d][0] ); magma_queue_create( &stream[d][1] ); magma_event_create( &event[d][0] ); magma_event_create( &event[d][1] ); } //magma_setdevice(0); timer_stop( time_alloc ); for( I=0; I < n; I += NB ) { M = m; N = min( NB, n-I ); /* number of columns in this big panel */ s = min( max(m-I,0), N )/nb; /* number of small block-columns in this big panel */ maxm = ((M + 31)/32)*32; if ( num_gpus0 > ceil((double)N/nb) ) { num_gpus = (int)ceil((double)N/nb); } else { num_gpus = num_gpus0; } for( d=0; d < num_gpus; d++ ) { n_local[d] = ((N/nb)/num_gpus)*nb; if (d < (N/nb)%num_gpus) n_local[d] += nb; else if (d == (N/nb)%num_gpus) n_local[d] += N%nb; } ldn_local = ((n_local[0]+31)/32)*32; /* upload the next big panel into GPU, transpose (A->A'), and pivot it */ timer_start( time ); magmablas_dsetmatrix_transpose_mgpu(num_gpus, stream, A(0,I), lda, dAT, ldn_local, dA, maxm, M, N, nb); for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } time_set += timer_stop( time ); timer_start( time ); /* == --------------------------------------------------------------- == */ /* == loop around the previous big-panels to update the new big-panel == */ for( offset = 0; offset < min(m,I); offset += NB ) { NBk = min( m-offset, NB ); /* start sending the first tile from the previous big-panels to gpus */ for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); nbi = min( nb, NBk ); magma_dsetmatrix_async( (M-offset), nbi, A(offset,offset), lda, dA[d], (maxm-offset), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][0] ); /* transpose */ magmablas_dtranspose( M-offset, nbi, dA[d], maxm-offset, dPT(d,0,0), nb ); } /* applying the pivot from the previous big-panel */ for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][1]); magmablas_dpermute_long3( dAT(d,0,0), ldn_local, ipiv, NBk, offset ); } /* == going through each block-column of previous big-panels == */ for( jj=0, ib=offset/nb; jj < NBk; jj += nb, ib++ ) { ii = offset+jj; rows = maxm - ii; nbi = min( nb, NBk-jj ); for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); /* wait for a block-column on GPU */ magma_queue_sync( stream[d][0] ); /* start sending next column */ if ( jj+nb < NBk ) { magma_dsetmatrix_async( (M-ii-nb), min(nb,NBk-jj-nb), A(ii+nb,ii+nb), lda, dA[d], (rows-nb), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][(1+jj/nb)%2] ); /* transpose next column */ magmablas_dtranspose( M-ii-nb, nb, dA[d], rows-nb, dPT(d,0,(1+jj/nb)%2), nb ); } /* update with the block column */ magmablasSetKernelStream(stream[d][1]); magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n_local[d], nbi, c_one, dPT(d,0,(jj/nb)%2), nb, dAT(d,ib,0), 
ldn_local ); if ( M > ii+nb ) { magma_dgemm( MagmaNoTrans, MagmaNoTrans, n_local[d], M-(ii+nb), nbi, c_neg_one, dAT(d,ib,0), ldn_local, dPT(d,1,(jj/nb)%2), nb, c_one, dAT(d,ib+1,0), ldn_local ); } magma_event_record( event[d][(jj/nb)%2], stream[d][1] ); } /* end of for each block-columns in a big-panel */ } } /* end of for each previous big-panels */ for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } /* calling magma-gpu interface to panel-factorize the big panel */ if ( M > I ) { //magma_dgetrf1_mgpu(num_gpus, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, A(0,I), lda, // (magma_queue_t **)stream, &iinfo); magma_dgetrf2_mgpu(num_gpus, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, A(0,I), lda, stream, &iinfo); if ( iinfo < 0 ) { *info = iinfo; break; } else if ( iinfo != 0 ) { *info = iinfo + I * NB; //break; } /* adjust pivots */ for( ii=I; ii < min(I+N,m); ii++ ) ipiv[ii] += I; } time_comp += timer_stop( time ); /* download the current big panel to CPU */ timer_start( time ); magmablas_dgetmatrix_transpose_mgpu(num_gpus, stream, dAT, ldn_local, A(0,I), lda, dA, maxm, M, N, nb); for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } time_get += timer_stop( time ); } /* end of for */ timer_stop( time_total ); flops = FLOPS_DGETRF( m, n ) / 1e9; timer_printf(" memory-allocation time: %e\n", time_alloc ); timer_printf(" NB=%d nb=%d\n", (int) NB, (int) nb ); timer_printf(" memcopy and transpose %e seconds\n", time_set ); timer_printf(" total time %e seconds\n", time_total ); timer_printf(" Performance %f GFlop/s, %f seconds without htod and dtoh\n", flops / (time_comp), time_comp ); timer_printf(" Performance %f GFlop/s, %f seconds with htod\n", flops / (time_comp + time_set), time_comp + time_set ); timer_printf(" Performance %f GFlop/s, %f seconds with dtoh\n", flops / (time_comp + time_get), time_comp + time_get ); timer_printf(" Performance %f GFlop/s, %f seconds without memory-allocation\n", flops / (time_total - time_alloc), time_total - time_alloc ); for( d=0; d < num_gpus0; d++ ) { magma_setdevice(d); magma_free( dA[d] ); magma_event_destroy( event[d][0] ); magma_event_destroy( event[d][1] ); magma_queue_destroy( stream[d][0] ); magma_queue_destroy( stream[d][1] ); magmablasSetKernelStream(NULL); } magma_setdevice(0); } if ( *info >= 0 ) magma_dgetrf_piv(m, n, NB, A, lda, ipiv, info); return *info; } /* magma_dgetrf_m */
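/*
 * Illustrative helper (hypothetical, not part of MAGMA): the number of
 * columns owned by GPU d when N columns are distributed 1-D block-cyclically
 * in blocks of nb over num_gpus devices -- the same arithmetic used to fill
 * n_local[] in magma_dgetrf_m above.
 */
static magma_int_t
example_local_cols( magma_int_t N, magma_int_t nb,
                    magma_int_t num_gpus, magma_int_t d )
{
    magma_int_t nblocks = N / nb;                         /* full nb-wide blocks    */
    magma_int_t n_local = (nblocks / num_gpus) * nb;      /* evenly shared blocks   */
    if      (d <  nblocks % num_gpus) n_local += nb;      /* one extra full block   */
    else if (d == nblocks % num_gpus) n_local += N % nb;  /* the ragged last block  */
    return n_local;
}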
/** Purpose ------- ZHETRF_nopiv_gpu computes the LDLt factorization of a complex Hermitian matrix A. The factorization has the form A = U^H * D * U, if UPLO = MagmaUpper, or A = L * D * L^H, if UPLO = MagmaLower, where U is an upper triangular matrix, L is lower triangular, and D is a diagonal matrix. This is the block version of the algorithm, calling Level 3 BLAS.
Arguments ---------
@param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored.
@param[in] n INTEGER The order of the matrix A. N >= 0.
@param[in,out] dA COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, if INFO = 0, the factor U or L from the factorization A = U^H*D*U or A = L*D*L^H.
@param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,N).
@param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a failed memory allocation. - > 0: if INFO = i, D(i,i) is exactly zero, and the factorization could not be completed.
@ingroup magma_zhetrf_comp ********************************************************************/ extern "C" magma_int_t magma_zhetrf_nopiv_gpu( magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex_ptr dA, magma_int_t ldda, magma_int_t *info) { #define A(i, j) (A) #define dA(i, j) (dA +(j)*ldda + (i)) #define dW(i, j) (dW +(j)*ldda + (i)) #define dWt(i, j) (dW +(j)*nb + (i)) /* Local variables */ magmaDoubleComplex zone = MAGMA_Z_ONE; magmaDoubleComplex mzone = MAGMA_Z_NEG_ONE; int upper = (uplo == MagmaUpper); magma_int_t j, k, jb, nb, ib, iinfo; *info = 0; if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return MAGMA_ERR_ILLEGAL_VALUE; } /* Quick return */ if ( n == 0 ) return MAGMA_SUCCESS; nb = magma_get_zhetrf_nopiv_nb(n); ib = min(32, nb); // inner-block for diagonal factorization magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); magma_queue_t stream[2]; magma_event_t event; magma_queue_create(&stream[0]); magma_queue_create(&stream[1]); magma_event_create( &event ); trace_init( 1, 1, 2, stream ); // CPU workspace magmaDoubleComplex *A; if (MAGMA_SUCCESS != magma_zmalloc_pinned( &A, nb*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } // GPU workspace magmaDoubleComplex_ptr dW; if (MAGMA_SUCCESS != magma_zmalloc( &dW, (1+nb)*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* Use hybrid blocked code. */ if (upper) { //========================================================= // Compute the LDLt factorization A = U'*D*U without pivoting.
// main loop for (j=0; j<n; j += nb) { jb = min(nb, (n-j)); // copy A(j,j) back to CPU trace_gpu_start( 0, 0, "get", "get" ); //magma_queue_wait_event( stream[1], event ); magma_event_sync(event); magma_zgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), nb, stream[1]); trace_gpu_end( 0, 0 ); // factorize the diagonal block magma_queue_sync(stream[1]); trace_cpu_start( 0, "potrf", "potrf" ); zhetrf_nopiv_cpu(MagmaUpper, jb, ib, A(j, j), nb, info); trace_cpu_end( 0 ); if (*info != 0){ *info = *info + j; break; } // copy A(j,j) back to GPU trace_gpu_start( 0, 0, "set", "set" ); magma_zsetmatrix_async(jb, jb, A(j, j), nb, dA(j, j), ldda, stream[0]); trace_gpu_end( 0, 0 ); if ( (j+jb) < n) { // compute the off-diagonal blocks of current block column magmablasSetKernelStream( stream[0] ); trace_gpu_start( 0, 0, "trsm", "trsm" ); magma_ztrsm(MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaUnit, jb, (n-j-jb), zone, dA(j, j), ldda, dA(j, j+jb), ldda); magma_zcopymatrix( jb, n-j-jb, dA( j, j+jb ), ldda, dWt( 0, j+jb ), nb ); // update the trailing submatrix with D magmablas_zlascl_diag(MagmaUpper, jb, n-j-jb, dA(j, j), ldda, dA(j, j+jb), ldda, &iinfo); trace_gpu_end( 0, 0 ); // update the trailing submatrix with U and W trace_gpu_start( 0, 0, "gemm", "gemm" ); for (k=j+jb; k<n; k+=nb) { magma_int_t kb = min(nb,n-k); magma_zgemm(MagmaConjTrans, MagmaNoTrans, kb, n-k, jb, mzone, dWt(0, k), nb, dA(j, k), ldda, zone, dA(k, k), ldda); if (k==j+jb) magma_event_record( event, stream[0] ); } trace_gpu_end( 0, 0 ); } } } else { //========================================================= // Compute the LDLt factorization A = L*D*L' without pivoting. // main loop for (j=0; j<n; j+=nb) { jb = min(nb, (n-j)); // copy A(j,j) back to CPU trace_gpu_start( 0, 0, "get", "get" ); //magma_queue_wait_event( stream[0], event ); magma_event_sync(event); magma_zgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), nb, stream[1]); trace_gpu_end( 0, 0 ); // factorize the diagonal block magma_queue_sync(stream[1]); trace_cpu_start( 0, "potrf", "potrf" ); zhetrf_nopiv_cpu(MagmaLower, jb, ib, A(j, j), nb, info); trace_cpu_end( 0 ); if (*info != 0){ *info = *info + j; break; } // copy A(j,j) back to GPU trace_gpu_start( 0, 0, "set", "set" ); magma_zsetmatrix_async(jb, jb, A(j, j), nb, dA(j, j), ldda, stream[0]); trace_gpu_end( 0, 0 ); if ( (j+jb) < n) { // compute the off-diagonal blocks of current block column magmablasSetKernelStream( stream[0] ); trace_gpu_start( 0, 0, "trsm", "trsm" ); magma_ztrsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaUnit, (n-j-jb), jb, zone, dA(j, j), ldda, dA(j+jb, j), ldda); magma_zcopymatrix( n-j-jb,jb, dA( j+jb, j ), ldda, dW( j+jb, 0 ), ldda ); // update the trailing submatrix with D magmablas_zlascl_diag(MagmaLower, n-j-jb, jb, dA(j, j), ldda, dA(j+jb, j), ldda, &iinfo); trace_gpu_end( 0, 0 ); // update the trailing submatrix with L and W trace_gpu_start( 0, 0, "gemm", "gemm" ); for (k=j+jb; k<n; k+=nb) { magma_int_t kb = min(nb,n-k); magma_zgemm(MagmaNoTrans, MagmaConjTrans, n-k, kb, jb, mzone, dA(k, j), ldda, dW(k, 0), ldda, zone, dA(k, k), ldda); if (k==j+jb) magma_event_record( event, stream[0] ); } trace_gpu_end( 0, 0 ); } } } trace_finalize( "zhetrf.svg","trace.css" ); magma_queue_destroy(stream[0]); magma_queue_destroy(stream[1]); magma_event_destroy( event ); magma_free( dW ); magma_free_pinned( A ); magmablasSetKernelStream( orig_stream ); return MAGMA_SUCCESS; } /* magma_zhetrf_nopiv */
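/*
 * Reference sketch (illustrative only, not the MAGMA kernel): an unblocked
 * LDL^T factorization of the upper triangle without pivoting, to make
 * explicit the recurrence that the blocked code above implements with ZTRSM,
 * ZLASCL_DIAG and ZGEMM.  The real symmetric case is shown for brevity; the
 * Hermitian routine additionally conjugates U.  A is n-by-n, column-major;
 * on exit D is stored on the diagonal and the strictly upper triangle holds
 * U (unit diagonal implied).
 */
static void
example_ldlt_upper_nopiv( magma_int_t n, double *A, magma_int_t lda )
{
    for (magma_int_t j = 0; j < n; ++j) {
        /* d_j = a_jj - sum_{k<j} u_kj * d_k * u_kj */
        for (magma_int_t k = 0; k < j; ++k)
            A[j + j*lda] -= A[k + j*lda] * A[k + k*lda] * A[k + j*lda];
        /* u_ji = ( a_ji - sum_{k<j} u_kj * d_k * u_ki ) / d_j,  for i > j */
        for (magma_int_t i = j+1; i < n; ++i) {
            for (magma_int_t k = 0; k < j; ++k)
                A[j + i*lda] -= A[k + j*lda] * A[k + k*lda] * A[k + i*lda];
            A[j + i*lda] /= A[j + j*lda];
        }
    }
}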
/* //////////////////////////////////////////////////////////////////////////// -- Testing magma_dsymm_mgpu */ int main( int argc, char** argv) { TESTING_INIT(); double c_neg_one = MAGMA_D_NEG_ONE; double alpha = MAGMA_D_MAKE( 3.456, 5.678 ); double beta = MAGMA_D_MAKE( 1.234, 2.456 ); real_Double_t gflops, gpu_perf=0., cpu_perf=0., gpu_time=0., cpu_time=0.; real_Double_t gpu_perf2=0., gpu_time2=0.; double Anorm, error, work[1]; double *hA, *hB, *hC, *hR; magmaDouble_ptr dA[MagmaMaxGPUs], dB[MagmaMaxGPUs], dC[MagmaMaxGPUs], dwork[MagmaMaxGPUs]; magmaDouble_ptr dA2; magma_int_t i, j, dev, M, N, size, lda, ldb, ldc, ldda, lddb, lddc, msize, nb; magma_int_t ione = 1; magma_int_t iseed[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); opts.ngpu = abs( opts.ngpu ); // always uses multi-GPU code double tol = opts.tolerance * lapackf77_dlamch("E"); // default values nb = (opts.nb > 0 ? opts.nb : 64); magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2]; magma_int_t ncmplx = 0; magma_buildconnection_mgpu( gnode, &ncmplx, opts.ngpu ); printf("%% Initializing communication pattern... GPU-ncmplx %d\n", (int) ncmplx); for (i=0; i < ncmplx; ++i) { magma_int_t myngpu = gnode[i][MagmaMaxGPUs]; printf("%% cmplx %d has %d GPUs:", i, myngpu); for (j=0; j < myngpu; ++j) { printf(" %d", (int) gnode[i][j]); if (j < myngpu-1) { printf(","); } } printf("\n"); } // number of queues per GPU. Requires ngpu. magma_int_t nqueue = opts.ngpu; // number of events per GPU. Require ngpu*ngpu. magma_int_t nevents = opts.ngpu*opts.ngpu; magma_queue_t queues[MagmaMaxGPUs][20], queues0[MagmaMaxGPUs]; magma_event_t events[MagmaMaxGPUs][MagmaMaxGPUs*MagmaMaxGPUs + 10]; for( dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); for( i = 0; i < nqueue; ++i ) { magma_queue_create( dev, &queues[dev][i] ); } queues0[dev] = queues[dev][0]; for( i = 0; i < nevents; ++i ) { cudaEventCreateWithFlags( &events[dev][i], cudaEventDisableTiming ); } } printf("%% nb %d, ngpu %d, version %d\n", (int) nb, (int) opts.ngpu, (int) opts.version ); printf("%% M N nb offset CPU Gflop/s (sec) GPU Gflop/s (sec) CUBLAS hemm (sec) ||R|| / ||A||*||B||\n"); printf("%%========================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { M = opts.msize[itest]; N = opts.nsize[itest]; for( int offset = 0; offset < N; offset += min(N,nb) ) { for( int iter = 0; iter < opts.niter; ++iter ) { msize = M - offset; lda = M; // TODO depends on side ldb = M; ldc = M; ldda = magma_roundup( lda, opts.align ); // multiple of 32 by default lddb = magma_roundup( ldb, opts.align ); // multiple of 32 by default lddc = magma_roundup( ldc, opts.align ); // multiple of 32 by default gflops = FLOPS_DSYMM( MagmaLeft, (double)msize, (double)N ) / 1e9; magma_int_t dworksiz = lddc*N + (M*N)*opts.ngpu; TESTING_MALLOC_CPU( hA, double, lda*M ); TESTING_MALLOC_CPU( hB, double, ldb*N ); TESTING_MALLOC_CPU( hC, double, ldc*N ); TESTING_MALLOC_PIN( hR, double, ldc*N ); for( dev = 0; dev < opts.ngpu; ++dev ) { magma_int_t mlocal = ((M / nb) / opts.ngpu + 1) * nb; magma_setdevice( dev ); TESTING_MALLOC_DEV( dA[dev], double, ldda*mlocal ); TESTING_MALLOC_DEV( dB[dev], double, lddb*N ); TESTING_MALLOC_DEV( dC[dev], double, lddc*N ); TESTING_MALLOC_DEV( dwork[dev], double, dworksiz ); } if ( opts.check ) { magma_setdevice( 0 ); TESTING_MALLOC_DEV( dA2, double, ldda*M ); } size = lda*M; lapackf77_dlarnv( &ione, iseed, &size, hA ); magma_dmake_symmetric( M, hA, lda ); 
size = ldb*N; lapackf77_dlarnv( &ione, iseed, &size, hB ); size = ldc*N; lapackf77_dlarnv( &ione, iseed, &size, hC ); lapackf77_dlacpy( "Full", &M, &N, hC, &ldc, hR, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_dsetmatrix_1D_col_bcyclic( M, M, hA, lda, dA, ldda, opts.ngpu, nb, queues0 ); for( dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); magma_dsetmatrix( M, N, hB, lda, dB[dev], ldda, opts.queue ); // since when offset != 0, the GPU that does beta*C may not be 0, // send initial hC to all GPUs. magma_dsetmatrix( M, N, hC, lda, dC[dev], ldda, opts.queue ); } trace_init( 1, opts.ngpu, nqueue, (magma_queue_t*) queues ); gpu_time = magma_sync_wtime(0); magmablas_dsymm_mgpu( MagmaLeft, MagmaLower, msize, N, alpha, dA, ldda, offset, dB, ldda, beta, dC, ldda, dwork, dworksiz, opts.ngpu, nb, queues, nqueue, events, nevents, gnode, ncmplx); gpu_time = magma_sync_wtime(0) - gpu_time; gpu_perf = gflops / gpu_time; #ifdef TRACING char buf[80]; snprintf( buf, sizeof(buf), "dsymm-m%d-n%d-nb%d-ngpu%d-run%d.svg", (int) M, (int) N, (int) nb, (int) opts.ngpu, (int) iter ); trace_finalize( buf, "trace.css" ); #endif /* ==================================================================== Performs operation using CUBLAS =================================================================== */ if ( opts.check && iter == 0 ) { magma_setdevice( 0 ); magma_dsetmatrix( M, M, hA, lda, dA2, ldda, opts.queue ); magma_dsetmatrix( M, N, hB, lda, dB[0], ldda, opts.queue ); magma_dsetmatrix( M, N, hC, lda, dwork[0], ldda, opts.queue ); gpu_time2 = magma_sync_wtime(0); magma_dsymm( MagmaLeft, MagmaLower, msize, N, alpha, dA2 + offset + offset*ldda, ldda, dB[0], ldda, beta, dwork[0], ldda, opts.queue ); gpu_time2 = magma_sync_wtime(0) - gpu_time2; gpu_perf2 = gflops / gpu_time2; } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.check ) { // store ||A||*||B|| Anorm = lapackf77_dlange("fro", &msize, &msize, hA + offset + offset*lda, &lda, work ); Anorm *= lapackf77_dlange("fro", &msize, &N, hB, &lda, work ); //printf( "A =" ); magma_dprint( M, M, hA, lda ); //printf( "B =" ); magma_dprint( M, N, hB, lda ); //printf( "C =" ); magma_dprint( M, N, hC, lda ); cpu_time = magma_wtime(); blasf77_dsymm( "Left", "Lower", &msize, &N, &alpha, hA + offset + offset*lda, &lda, hB, &lda, &beta, hC, &lda ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; for (dev=0; dev < opts.ngpu; ++dev) { magma_setdevice( dev ); magma_dgetmatrix( M, N, dC[dev], ldda, hR, lda, opts.queue ); // compute relative error ||R||/||A||*||B||, where R := C_magma - C_lapack = R - C size = ldc*N; blasf77_daxpy( &size, &c_neg_one, hC, &ione, hR, &ione ); error = lapackf77_dlange("fro", &msize, &N, hR, &lda, work) / Anorm; //printf( "R =" ); magma_dprint( M, N, hR, lda ); bool okay = (error < tol); status += ! okay; if (dev == 0) { printf( "%5d %5d %5d %5d %7.1f (%7.4f) %7.1f (%7.4f) %7.1f (%7.4f) %8.2e %s\n", (int) M, (int) N, (int) nb, (int) offset, cpu_perf, cpu_time, gpu_perf, gpu_time, gpu_perf2, gpu_time2, error, (okay ? "ok" : "failed") ); } else { printf( " dev %d %74s %8.2e %s\n", dev, "", error, (okay ? 
"ok" : "failed") ); } } } else { printf( "%5d %5d %5d %5d --- ( --- ) %7.1f (%7.4f) --- ( --- ) ---\n", (int) M, (int) N, (int) nb, (int) offset, gpu_perf, gpu_time ); } TESTING_FREE_CPU( hA ); TESTING_FREE_CPU( hB ); TESTING_FREE_CPU( hC ); TESTING_FREE_PIN( hR ); for( dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); TESTING_FREE_DEV( dA[dev] ); TESTING_FREE_DEV( dB[dev] ); TESTING_FREE_DEV( dC[dev] ); TESTING_FREE_DEV( dwork[dev] ); } if ( opts.check ) { magma_setdevice( 0 ); TESTING_FREE_DEV( dA2 ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } // offset printf( "\n" ); } for( dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); for( i = 0; i < nqueue; ++i ) { magma_queue_destroy( queues[dev][i] ); } for( i = 0; i < nevents; ++i ) { magma_event_destroy( events[dev][i] ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/** Purpose ------- SORMQR overwrites the general real M-by-N matrix C with @verbatim SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q * C C * Q TRANS = MagmaTrans: Q**T * C C * Q**T @endverbatim where Q is a real orthogonal matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by SGEQRF. Q is of order M if SIDE = MagmaLeft and of order N if SIDE = MagmaRight.
Arguments ---------
@param[in] nrgpu INTEGER Number of GPUs to use. nrgpu > 0.
@param[in] side magma_side_t - = MagmaLeft: apply Q or Q**T from the Left; - = MagmaRight: apply Q or Q**T from the Right.
@param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q; - = MagmaTrans: Transpose, apply Q**T.
@param[in] m INTEGER The number of rows of the matrix C. M >= 0.
@param[in] n INTEGER The number of columns of the matrix C. N >= 0.
@param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = MagmaLeft, M >= K >= 0; if SIDE = MagmaRight, N >= K >= 0.
@param[in] A REAL array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by SGEQRF in the first k columns of its array argument A.
@param[in] lda INTEGER The leading dimension of the array A. If SIDE = MagmaLeft, LDA >= max(1,M); if SIDE = MagmaRight, LDA >= max(1,N).
@param[in] tau REAL array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by SGEQRF.
@param[in,out] C REAL array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**T*C or C*Q**T or C*Q.
@param[in] ldc INTEGER The leading dimension of the array C. LDC >= max(1,M).
@param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK.
@param[in] lwork INTEGER The dimension of the array WORK. If SIDE = MagmaLeft, LWORK >= max(1,N); if SIDE = MagmaRight, LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = MagmaLeft, and LWORK >= M*NB if SIDE = MagmaRight, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA.
@param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_sgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_sormqr_m(magma_int_t nrgpu, magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, float *A, magma_int_t lda, float *tau, float *C, magma_int_t ldc, float *work, magma_int_t lwork, magma_int_t *info) { #define A(i, j) (A + (j)*lda + (i)) #define C(i, j) (C + (j)*ldc + (i)) #define dC(gpui, i, j) (dw[gpui] + (j)*lddc + (i)) #define dA_c(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddac) #define dA_r(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddar) #define dT(gpui, ind) (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + (ind)*((nb+1)*nb)) #define dwork(gpui, ind) (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + 2*((nb+1)*nb) + (ind)*(lddwork*nb)) float c_zero = MAGMA_S_ZERO; float c_one = MAGMA_S_ONE; const char* side_ = lapack_side_const( side ); const char* trans_ = lapack_trans_const( trans ); // TODO fix memory leak (alloc after argument checks) magma_int_t nb = 128; float *T; magma_smalloc_pinned(&T, nb*nb); //printf("calling sormqr_m with nb=%d\n", (int) nb); float* dw[MagmaMaxGPUs]; magma_queue_t stream [MagmaMaxGPUs][2]; magma_event_t event [MagmaMaxGPUs][2]; magma_int_t ind_c; magma_device_t igpu; magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); *info = 0; magma_int_t left = (side == MagmaLeft); magma_int_t notran = (trans == MagmaNoTrans); magma_int_t lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ magma_int_t nq, nw; if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if (! left && side != MagmaRight) { *info = -1; } else if (! notran && trans != MagmaTrans) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } magma_int_t lwkopt = max(1,nw) * nb; if (*info == 0) { work[0] = MAGMA_S_MAKE( lwkopt, 0 ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = c_one; return *info; } if (nb >= k) { /* Use CPU code */ lapackf77_sormqr(side_, trans_, &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, info); return *info; } magma_int_t lddc = (m+63)/64*64; magma_int_t lddac = nq; magma_int_t lddar = nb; magma_int_t lddwork = nw; magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 }; magma_int_t nb_l=256; magma_int_t nbl = (n-1)/nb_l+1; // number of blocks magma_int_t maxnlocal = (nbl+nrgpu-1)/nrgpu*nb_l; nrgpu = min(nrgpu, (n+nb_l-1)/nb_l); // Don't use GPU that will not have data. 
magma_int_t ldw = maxnlocal*lddc // dC + 2*lddac*lddar // 2*dA + 2*(nb + 1 + lddwork)*nb; // 2*(dT and dwork) for (igpu = 0; igpu < nrgpu; ++igpu) { magma_setdevice(igpu); if (MAGMA_SUCCESS != magma_smalloc( &dw[igpu], ldw )) { *info = MAGMA_ERR_DEVICE_ALLOC; magma_xerbla( __func__, -(*info) ); return *info; } magma_queue_create( &stream[igpu][0] ); magma_queue_create( &stream[igpu][1] ); magma_event_create( &event[igpu][0] ); magma_event_create( &event[igpu][1] ); } /* Use hybrid CPU-MGPU code */ if (left) { //copy C to mgpus for (magma_int_t i = 0; i < nbl; ++i) { magma_int_t igpu = i%nrgpu; magma_setdevice(igpu); magma_int_t kb = min(nb_l, n-i*nb_l); magma_ssetmatrix_async( m, kb, C(0, i*nb_l), ldc, dC(igpu, 0, i/nrgpu*nb_l), lddc, stream[igpu][0] ); nlocal[igpu] += kb; } magma_int_t i1, i2, i3; if ( !notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } ind_c = 0; for (magma_int_t i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { // start the copy of A panel magma_int_t kb = min(nb, k - i); for (igpu = 0; igpu < nrgpu; ++igpu) { magma_setdevice(igpu); magma_event_sync(event[igpu][ind_c]); // check if the new data can be copied magma_ssetmatrix_async(nq-i, kb, A(i, i), lda, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0] ); // set upper triangular part of dA to identity magmablas_slaset_band_q( MagmaUpper, kb, kb, kb, c_zero, c_one, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0] ); } /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ magma_int_t nqi = nq - i; lapackf77_slarft("F", "C", &nqi, &kb, A(i, i), &lda, &tau[i], T, &kb); /* H or H' is applied to C(1:m,i:n) */ /* Apply H or H'; First copy T to the GPU */ for (igpu = 0; igpu < nrgpu; ++igpu) { magma_setdevice(igpu); magma_ssetmatrix_async(kb, kb, T, kb, dT(igpu, ind_c), kb, stream[igpu][0] ); } for (igpu = 0; igpu < nrgpu; ++igpu) { magma_setdevice(igpu); magma_queue_sync( stream[igpu][0] ); // check if the data was copied magmablasSetKernelStream(stream[igpu][1]); magma_slarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, m-i, nlocal[igpu], kb, dA_c(igpu, ind_c, i, 0), lddac, dT(igpu, ind_c), kb, dC(igpu, i, 0), lddc, dwork(igpu, ind_c), lddwork); magma_event_record(event[igpu][ind_c], stream[igpu][1] ); } ind_c = (ind_c+1)%2; } for (igpu = 0; igpu < nrgpu; ++igpu) { magma_setdevice(igpu); magma_queue_sync( stream[igpu][1] ); } //copy C from mgpus for (magma_int_t i = 0; i < nbl; ++i) { magma_int_t igpu = i%nrgpu; magma_setdevice(igpu); magma_int_t kb = min(nb_l, n-i*nb_l); magma_sgetmatrix( m, kb, dC(igpu, 0, i/nrgpu*nb_l), lddc, C(0, i*nb_l), ldc ); // magma_sgetmatrix_async( m, kb, // dC(igpu, 0, i/nrgpu*nb_l), lddc, // C(0, i*nb_l), ldc, stream[igpu][0] ); } } else { // TODO fix memory leak T, dw, event, stream fprintf(stderr, "The case (side == right) is not implemented\n"); *info = MAGMA_ERR_NOT_IMPLEMENTED; magma_xerbla( __func__, -(*info) ); return *info; /* if ( notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } mi = m; ic = 0; for (i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { ib = min(nb, k - i); // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . 
H(i+ib-1) i__4 = nq - i; lapackf77_slarft("F", "C", &i__4, &ib, A(i, i), &lda, &tau[i], T, &ib); // 1) copy the panel from A to the GPU, and // 2) set upper triangular part of dA to identity magma_ssetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda ); magmablas_slaset_band( MagmaUpper, ib, ib, ib, c_zero, c_one, dA(i, 0), ldda ); // H or H' is applied to C(1:m,i:n) ni = n - i; jc = i; // Apply H or H'; First copy T to the GPU magma_ssetmatrix( ib, ib, T, ib, dT, ib ); magma_slarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dA(i, 0), ldda, dT, ib, dC(ic, jc), lddc, dwork, lddwork); } */ } work[0] = MAGMA_S_MAKE( lwkopt, 0 ); for (igpu = 0; igpu < nrgpu; ++igpu) { magma_setdevice(igpu); magma_event_destroy( event[igpu][0] ); magma_event_destroy( event[igpu][1] ); magma_queue_destroy( stream[igpu][0] ); magma_queue_destroy( stream[igpu][1] ); magma_free( dw[igpu] ); } magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_sormqr */
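/*
 * Sketch (hypothetical helper, not part of MAGMA) of the per-GPU device
 * workspace allocated by magma_sormqr_m above; the terms correspond to the
 * dC / dA_c / dA_r / dT / dwork offset macros defined at the top of the
 * routine: the local slice of C, a double-buffered A panel, two T factors,
 * and two larfb workspaces.
 */
static magma_int_t
example_sormqr_m_devwork_size( magma_int_t maxnlocal, magma_int_t lddc,
                               magma_int_t lddac, magma_int_t lddar,
                               magma_int_t lddwork, magma_int_t nb )
{
    return maxnlocal*lddc      /* dC:    local columns of C             */
         + 2*lddac*lddar       /* dA_c / dA_r: double-buffered A panel  */
         + 2*(nb + 1)*nb       /* dT:    two (nb+1) x nb T buffers      */
         + 2*lddwork*nb;       /* dwork: two larfb workspaces           */
}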
extern "C" magma_int_t magma_zbulge_applyQ_v2( magma_side_t side, magma_int_t NE, magma_int_t N, magma_int_t NB, magma_int_t Vblksiz, magmaDoubleComplex_ptr dE, magma_int_t ldde, magmaDoubleComplex *V, magma_int_t ldv, magmaDoubleComplex *T, magma_int_t ldt, magma_int_t *info) { //%=========================== //% local variables //%=========================== magma_int_t Vm, Vn, mt, nt; magma_int_t myrow, mycol, blkj, blki; magma_int_t blkid,vpos,tpos; magma_int_t firstrow, nbcolinvolvd; magma_int_t versionL = 113; magma_int_t versionR = 92; magma_int_t Vchunksiz = 10; *info=0; /* Quick return */ if ( NE == 0 ) { return *info; } if ( N == 0 ) { return *info; } if ( NB == 0 ) { return *info; } /* ========================================== * some infos for developer * Initialisation and checking nb of cores * ==========================================*/ /* we have 2 algo for left (113 114) and 2 algo for right (91 92) * which correspond to versionL versionR. * They are very similar (detail explained in tech report and matlab code) * however version 114 and 92 improve locality. * while version 113 is used in case WNATZ=1 (construct Q2) which allow * the construction to be done in an optimized way taking into * consideration that the matrix is Identity so making less flops. * */ // Initialize streaming and events magma_device_sync(); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); magma_event_t myevent[2]; cudaEventCreateWithFlags(&myevent[0],cudaEventDisableTiming); cudaEventCreateWithFlags(&myevent[1],cudaEventDisableTiming); // Azzam 21/11/2012 // NOTE THAT dwork was of size 2*NE*Vblksiz+... // but I am thinking why not modifing it to NE*Vblksiz+... // BUT NO because the 2* is used because of making 2 streams working and so // they might be using dwork in parallel magmaDoubleComplex *dwork, *dwork0, *dwork1, *dwvt0, *dwvt1; magmaDoubleComplex *dT0, *dV0, *dT1, *dV1; magma_int_t lddv = ldv; magma_int_t lddt = ldt; magma_int_t lddw = 0; magma_int_t lddwork = ((NE+31)/32)*32; magma_int_t dwVTsiz = lddv*Vblksiz; // lddv*lddv + lddv*lddwork; (v2) // lddv*Vblksiz; (v1,v3) magma_int_t dworksiz = lddwork*Vblksiz; // lddv*Vblksiz; (v2) // NE*Vblksiz=lddwork*Vblksiz; (v1,v3) if (MAGMA_SUCCESS != magma_zmalloc( &dwork, 2*dworksiz + 2*dwVTsiz + 2*Vchunksiz* (Vblksiz* (lddv+lddt)) )) { printf ("!!!! magma_zbulge_applyQ magma_alloc failed for: dwork\n" ); exit(-1); } dwork0 = dwork; // size = dworksiz; dwork1 = dwork0 + dworksiz; // size = dworksiz; dwvt0 = dwork + 2*dworksiz; // size = dwVTsiz; dwvt1 = dwvt0 + dwVTsiz; // size = dwVTsiz; dV0 = dwork + 2*dworksiz + 2*dwVTsiz; dT0 = dV0 + Vchunksiz*Vblksiz*lddv; dV1 = dT0 + Vchunksiz*Vblksiz*lddt; dT1 = dV1 + Vchunksiz*Vblksiz*lddv; // make overlapped copy magma_int_t ncpy = 0; magma_int_t copyed=0, copyst=0; magma_int_t blkcnt,nothing, mysiz, flip, vld,tld, locpos; findVTsiz(N, NB, Vblksiz, &blkcnt, ¬hing); flip = 0; // performance loss if the reflector are applied to a big number of eigenvectors (~10000) // => apply the reflectors to blocks of eigenvectors. 
    //magma_int_t nr_bl = magma_ceildiv(NE,10000);          // nr of blocks
    magma_int_t sz_bl = NE; //magma_ceildiv(NE,nr_bl*64)*64; // maximum size of blocks (blocks of about the same size, multiple of 64)
    magma_int_t ib;                                          // size of current block

    /* SIDE LEFT  meaning apply E = Q*E = (q_1*q_2*.....*q_n) * E ==> so traverse the Vs in reverse order (forward) from q_n to q_1.
     * Also, E is split by rows, so each apply operates on a block of rows (horizontal block). */
    /* SIDE RIGHT meaning apply E = E*Q = E * (q_1*q_2*.....*q_n) ==> so traverse the Vs in normal order (forward) from q_1 to q_n.
     * Also, E is split by columns, so each apply operates on a block of columns (vertical block). */

    #ifdef ENABLE_DEBUG
    printf(" APPLY Q_v22 GPU with N %d, NE %d, NB %d, Vblksiz %d, versionL %d versionR %d SIDE %c \n",
           N, NE, NB, Vblksiz, versionL, versionR, side);
    #endif

    /*
     * MagmaLeft
     */
    if (side == MagmaLeft) {
        /*
         * Version 113:
         * loop over the block_col (nt) and for each find the
         * number of tiles (mt) in this block_col. Then loop over mt, find
         * the size of the V's (Vm,Vn) and apply it to the corresponding
         * portion of E.
         */
        if ( versionL == 113 ) {
            nt = magma_ceildiv((N-1),Vblksiz);
            for (blkj=nt-1; blkj >= 0; blkj--) {
                /* the index of the first row on the top of block (blkj) */
                firstrow = blkj * Vblksiz + 1;
                /* find the number of tiles for this block */
                if ( blkj == nt-1 )
                    mt = magma_ceildiv( N - firstrow, NB);
                else
                    mt = magma_ceildiv( N - (firstrow+1), NB);
                /* loop over the tiles, find the size of the Vs and apply it */
                for (blki=mt; blki > 0; blki--) {
                    /* calculate the size of each lozenge of Vs = (Vm,Vn) */
                    myrow = firstrow + (mt-blki)*NB;
                    mycol = blkj*Vblksiz;
                    Vm = min( NB+Vblksiz-1, N-myrow);
                    if ( ( blkj == nt-1 ) && ( blki == mt ) ) {
                        Vn = min(Vblksiz, Vm);
                    } else {
                        Vn = min(Vblksiz, Vm-1);
                    }
                    /* calculate the pointer to the Vs and the Ts.
                     * Note that Vs and Ts have special storage done
                     * by the bulge-chasing function */
                    //printf("here blkj %d blki %d Vm %d Vn %d mycol %d vpos %d \n",blkj,blki,Vm, Vn,mycol,vpos);
                    magma_bulge_findpos113(N, NB, Vblksiz, mycol, myrow, &blkid);

                    // COPY Vchunksiz Vs and Vchunksiz Ts to the GPU and store them in dV0/dV1 and dT0/dT1
                    if (ncpy == 0) {
                        // flip = 1 for this.
copyst = 0; // meaning that copy will start copying from blkid =copyst copyed = min(copyst+Vchunksiz, blkcnt); // meaning that copy will end copying at blkid =copyed-1==> next copy had to start at copyed mysiz = copyed-copyst; // the size of the chunk to be copied if (mysiz > 0) { ncpy = 1; flip = 1; vpos = copyst*Vblksiz*ldv; tpos = copyst*Vblksiz*ldt; vld = mysiz * ldv; tld = mysiz * ldt; magmablasSetKernelStream(stream[1]); magma_zsetmatrix_async(vld, Vblksiz, V(vpos), vld, dV1, vld, stream[1]); magma_zsetmatrix_async(tld, Vblksiz, T(tpos), tld, dT1, tld, stream[1]); //printf("doing the first copy of mysiz %2d copyst %2d copyed %2d vpos %8d tpos %8d into dV1 dT1\n",mysiz,copyst,copyed,vpos,tpos); } } if (blkid == copyst) { flip = ncpy % 2; copyst = copyed; // meaning that copy will start copying from blkid =copyst copyed = min(copyst+Vchunksiz, blkcnt); // meaning that copy will end copying at blkid =copyed-1==> next copy had to start at copyed mysiz = copyed-copyst; // the size of the chunk to be copied //printf(" get to copy blkid %d blkid+(2*Vchunksiz) %d copyst %d copyed %d\n",blkid,blkid+(Vchunksiz),copyst,copyed); if (mysiz > 0) { ncpy = ncpy + 1; vpos = copyst*Vblksiz*ldv; tpos = copyst*Vblksiz*ldt; vld = mysiz * ldv; tld = mysiz * ldt; if (flip == 0) { // now I am working on dV0 so copy the next and put it on dV1 //printf("doing overlapping copy of mysiz %2d copyst %2d copyed %2d vpos %8d tpos %8d into dV1 dT1\n",mysiz,copyst,copyed,vpos,tpos); magmablasSetKernelStream(stream[1]); magma_zsetmatrix_async(vld, Vblksiz, V(vpos), vld, dV1, vld, stream[1]); magma_zsetmatrix_async(tld, Vblksiz, T(tpos), tld, dT1, tld, stream[1]); } else { // now I am working on dV1 so copy the next and put it on dV0 //printf("doing overlapping copy of mysiz %2d copyst %2d copyed %2d vpos %8d tpos %8d into dV0 dT0\n",mysiz,copyst,copyed,vpos,tpos); magmablasSetKernelStream(stream[0]); magma_zsetmatrix_async(vld, Vblksiz, V(vpos), vld, dV0, vld, stream[0]); magma_zsetmatrix_async(tld, Vblksiz, T(tpos), tld, dT0, tld, stream[0]); } } } if ((Vm > 0) && (Vn > 0)) { locpos = blkid%Vchunksiz; magma_int_t lcvpos = locpos*Vblksiz*lddv; magma_int_t lctpos = locpos*Vblksiz*lddt; //printf("voici blkj %d blki %d Vm %d Vn %d mycol %d locvpos %5d loctpos %5d blkid %2d using data in dV%1d dT%1d \n",blkj,blki,Vm, Vn,mycol,lcvpos,lctpos, blkid,flip,flip); if (flip == 0) { magmablasSetKernelStream(stream[0]); magma_queue_wait_event( stream[0], myevent[1] ); for (magma_int_t i=0; i < NE; i += sz_bl) { ib = min(sz_bl, NE-i); lddw = min(lddwork,sz_bl); //magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV0+lcvpos, lddv, dT0+lctpos, lddt, dE(myrow,i), ldde, dwork0, lddw); magma_zlarfb_gpu_gemm( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV0+lcvpos, lddv, dT0+lctpos, lddt, dE(myrow,i), ldde, dwork0, lddw, dwvt0, lddv); } magma_event_record( myevent[0], stream[0] ); } else { magmablasSetKernelStream(stream[1]); magma_queue_wait_event( stream[1], myevent[0] ); for (magma_int_t i=0; i < NE; i += sz_bl) { ib = min(sz_bl, NE-i); lddw = min(lddwork,sz_bl); //magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV1+lcvpos, lddv, dT1+lctpos, lddt, dE(myrow,i), ldde, dwork1, lddw); magma_zlarfb_gpu_gemm( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV1+lcvpos, lddv, dT1+lctpos, lddt, dE(myrow,i), ldde, dwork1, lddw, dwvt1, lddv); } magma_event_record( myevent[1], stream[1] ); } } // end for (Vm &Vn) > 0 } // end for 
blki } // end for blkj } // end if version=113 /* * Version 114: * loop over the block_row (mt) and for each find diagonally the * number of tiles (nt) in this block_row. then loop over nt, find * the size of the V's(Vm,Vn) and apply it to the corresponding * portion of E. */ else { mt = magma_ceildiv((N-1),NB); for (blki = mt; blki > 0; blki--) { /* nbcolinvolvd = number of column corresponding to this block_row (blki) */ nbcolinvolvd = min(N-1, blki*NB); /*find the number of tile for this block (diagonal row of tiles) */ nt = magma_ceildiv(nbcolinvolvd,Vblksiz); /*loop over the tiles find the size of the Vs and apply it */ for (blkj = nt-1; blkj >= 0; blkj--) { /* the index of the first row of the first col meaning * the block on the top left (blki) */ firstrow = (mt-blki)*NB+1; /*calculate the size of each losange of Vs= (Vm,Vn)*/ myrow = firstrow + blkj*Vblksiz; mycol = blkj*Vblksiz; Vm = min( NB+Vblksiz-1, N-myrow); if ( ( blkj == nt-1 ) && ( blki == mt ) ) { Vn = min (Vblksiz, Vm); } else { Vn = min (Vblksiz, Vm-1); } if ((Vm > 0) && (Vn > 0)) { /*calculate the pointer to the Vs and the Ts. * Note that Vs and Ts have special storage done * by the bulgechasing function*/ magma_bulge_findVTpos(N, NB, Vblksiz, mycol, myrow, ldv, ldt, &vpos, &tpos); magma_zsetmatrix_async(Vm, Vn, V(vpos), ldv, dV0, lddv, NULL); magma_zsetmatrix_async(Vn, Vn, T(tpos), ldt, dT0, lddt, NULL); //printf("voici blki %d rownbm %d mycol %d coled %d blkid %d vpos %d tpos %d\n", blki, rownbm, mycol, coled, blkid, vpos, tpos); for (magma_int_t i=0; i < NE; i += sz_bl) { ib = min(sz_bl, NE-i); magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV0, lddv, dT0, lddt, dE(myrow,i), ldde, dwork, NE); } } // end for (Vm &Vn) > 0 } // end for blkj } // end for blki } // end version 114 } // end LEFT /* * MagmaRight */ else { /* * Version 91: */ if ( versionR == 91 ) { nt = magma_ceildiv((N-1),Vblksiz); for (blkj=0; blkj < nt; blkj++) { /* the index of the first myrow on the top of block (blkj) */ firstrow = blkj * Vblksiz + 1; /*find the number of tile for this block */ if ( blkj == nt-1 ) mt = magma_ceildiv( N - firstrow, NB); else mt = magma_ceildiv( N - (firstrow+1), NB); /*loop over the tiles find the size of the Vs and apply it */ for (blki=1; blki <= mt; blki++) { /*calculate the size of each losange of Vs= (Vm,Vn)*/ myrow = firstrow + (mt-blki)*NB; Vm = min( NB+Vblksiz-1, N-myrow); if ( (blkj == nt-1) && (blki == mt) ) { Vn = min( Vblksiz, Vm ); } else { Vn = min( Vblksiz, Vm-1 ); } mycol = blkj*Vblksiz; if ((Vm > 0) && (Vn > 0)) { /*calculate the pointer to the Vs and the Ts. 
* Note that Vs and Ts have special storage done * by the bulgechasing function*/ magma_bulge_findVTpos(N, NB, Vblksiz, mycol, myrow, ldv, ldt, &vpos, &tpos); magma_zsetmatrix_async(Vm, Vn, V(vpos), ldv, dV0, lddv, NULL); magma_zsetmatrix_async(Vn, Vn, T(tpos), ldt, dT0, lddt, NULL); magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, Vm, Vn, dV0, lddv, dT0, lddt, dE(0, myrow), ldde, dwork, NE); } // end for (Vm &Vn) > 0 } // end for blki } // end fo blkj } // end of version 91 /* * Version 92: */ else { mt = magma_ceildiv((N-1),NB); for (blki = 1; blki <= mt; blki++) { /* nbcolinvolvd = number of column corresponding to this block_row (blki) */ nbcolinvolvd = min(N-1, blki*NB); /*find the number of tile for this block (diagonal row of tiles) */ nt = magma_ceildiv(nbcolinvolvd,Vblksiz); /*loop over the tiles find the size of the Vs and apply it */ for (blkj = 0; blkj < nt; blkj++) { /* the index of the first row of the first col meaning * the block on the top left (blki) */ firstrow = (mt-blki)*NB+1; /*calculate the size of each losange of Vs= (Vm,Vn)*/ myrow = firstrow + blkj*Vblksiz; mycol = blkj*Vblksiz; Vm = min( NB+Vblksiz-1, N-myrow); if ( ( blkj == nt-1 ) && ( blki == mt ) ) { Vn = min (Vblksiz, Vm); } else { Vn = min (Vblksiz, Vm-1); } if ((Vm > 0) && (Vn > 0)) { /*calculate the pointer to the Vs and the Ts. * Note that Vs and Ts have special storage done * by the bulgechasing function*/ magma_bulge_findVTpos(N, NB, Vblksiz, mycol, myrow, ldv, ldt, &vpos, &tpos); magma_zsetmatrix_async(Vm, Vn, V(vpos), ldv, dV0, lddv, NULL); magma_zsetmatrix_async(Vn, Vn, T(tpos), ldt, dT0, lddt, NULL); magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, Vm, Vn, dV0, lddv, dT0, lddt, dE(0, myrow), ldde, dwork, NE); } // end for (Vm &Vn) > 0 } //end for blkj } // end for blki } //end of version 92 } // end RIGHT magma_device_sync(); magmablasSetKernelStream( orig_stream ); magma_event_destroy( myevent[0] ); magma_event_destroy( myevent[1] ); magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free(dwork); return *info; }
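/*
 * Illustrative sketch, not library code: the double-buffered chunking of the V/T
 * tiles used by magma_zbulge_applyQ_v2 above.  Blocks [0, blkcnt) are brought to
 * the GPU in chunks of Vchunksiz; while one chunk is consumed out of one buffer
 * pair (dV0/dT0 or dV1/dT1), the next chunk is prefetched into the other, and the
 * roles alternate via flip = ncpy % 2 exactly as in the routine.  This host-only
 * walk (hypothetical name chunk_schedule) assumes blocks are visited in increasing
 * blkid order, which is what the chunk-boundary test blkid == copyst relies on.
 */
#include <stdio.h>

static void chunk_schedule( int blkcnt, int Vchunksiz )
{
    int ncpy = 0, copyst = 0, copyed = 0, flip = 0, mysiz;

    for (int blkid = 0; blkid < blkcnt; ++blkid) {
        if (ncpy == 0) {                       /* very first chunk -> dV1/dT1 */
            copyst = 0;
            copyed = (copyst + Vchunksiz < blkcnt) ? copyst + Vchunksiz : blkcnt;
            mysiz  = copyed - copyst;
            if (mysiz > 0) {
                ncpy = 1;  flip = 1;
                printf("prefetch chunk [%2d,%2d) into dV1/dT1\n", copyst, copyed);
            }
        }
        if (blkid == copyst) {                 /* entering a new chunk: prefetch the next one */
            flip   = ncpy % 2;                 /* 0 -> consuming dV0, so prefetch into dV1, and vice versa */
            copyst = copyed;
            copyed = (copyst + Vchunksiz < blkcnt) ? copyst + Vchunksiz : blkcnt;
            mysiz  = copyed - copyst;
            if (mysiz > 0) {
                ncpy++;
                printf("prefetch chunk [%2d,%2d) into %s\n", copyst, copyed,
                       flip == 0 ? "dV1/dT1" : "dV0/dT0");
            }
        }
        printf("block %2d applied from %s\n", blkid,
               flip == 0 ? "dV0/dT0" : "dV1/dT1");
    }
}
/* e.g. chunk_schedule(25, 10): blocks 0-9 are applied from dV1 while [10,20) is
 * prefetched into dV0, then 10-19 from dV0 while [20,25) goes back into dV1. */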
extern "C" magma_int_t magma_cpotrf_m(magma_int_t num_gpus0, char uplo, magma_int_t n, magmaFloatComplex *a, magma_int_t lda, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CPOTRF_OOC computes the Cholesky factorization of a complex Hermitian positive definite matrix A. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix A may not fit entirely in the GPU memory. The factorization has the form A = U**H * U, if UPLO = 'U', or A = L * L**H, if UPLO = 'L', where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) COMPLEX array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization A = U**H * U or A = L * L**H. Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. ===================================================================== */ /* Local variables */ float d_one = 1.0; float d_neg_one = -1.0; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; char uplo_[2] = {uplo, 0}; int upper = lapackf77_lsame(uplo_, "U"); magmaFloatComplex *dwork[MagmaMaxGPUs], *dt[MagmaMaxGPUs]; magma_int_t ldda, lddla, nb, iinfo, n_local[MagmaMaxGPUs], J2, d, num_gpus; magma_int_t j, jj, jb, J, JB, NB, MB, h; magma_queue_t stream[MagmaMaxGPUs][3]; magma_event_t event[MagmaMaxGPUs][5]; #ifdef ROW_MAJOR_PROFILE magma_timestr_t start, end, start0, end0; float chol_time = 1.0; #endif *info = 0; if ((! upper) && (! 
lapackf77_lsame(uplo_, "L"))) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return */ if ( n == 0 ) return *info; nb = magma_get_dpotrf_nb(n); if( num_gpus0 > n/nb ) { num_gpus = n/nb; if( n%nb != 0 ) num_gpus ++; } else { num_gpus = num_gpus0; } //ldda = ((n+31)/32)*32; ldda = ((n+nb-1)/nb)*nb; lddla = ((nb*((n+nb*num_gpus-1)/(nb*num_gpus))+31)/32)*32; /* figure out NB */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(magmaFloatComplex); MB = n; /* number of rows in the big panel */ NB = (magma_int_t)((0.8*freeMem-max(2,num_gpus)*nb*ldda-(n+nb)*nb)/lddla); /* number of columns in the big panel */ //NB = min(5*nb,n); if( NB >= n ) { #ifdef CHECK_CPOTRF_OOC printf( " * still fit in GPU memory.\n" ); #endif NB = n; } else { #ifdef CHECK_CPOTRF_OOC printf( " * don't fit in GPU memory.\n" ); #endif NB = (NB/nb) * nb; /* making sure it's devisable by nb */ } #ifdef CHECK_CPOTRF_OOC if( NB != n ) printf( " * running in out-core mode (n=%d, NB=%d, nb=%d, lddla=%d, freeMem=%.2e).\n",n,NB,nb,lddla,(float)freeMem ); else printf( " * running in in-core mode (n=%d, NB=%d, nb=%d, lddla=%d, freeMem=%.2e).\n",n,NB,nb,lddla,(float)freeMem ); fflush(stdout); #endif for (d=0; d<num_gpus; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_cmalloc( &dt[d], NB*lddla + max(2,num_gpus)*nb*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dwork[d] = &dt[d][max(2,num_gpus)*nb*ldda]; for( j=0; j<3; j++ ) magma_queue_create( &stream[d][j] ); for( j=0; j<5; j++ ) magma_event_create( &event[d][j] ); magma_device_sync(); // synch the device } magma_setdevice(0); #ifdef ROW_MAJOR_PROFILE start0 = get_current_time(); #endif if (nb <= 1 || nb >= n) { lapackf77_cpotrf(uplo_, &n, a, &lda, info); } else { /* Use hybrid blocked code. */ if (upper) { /* =========================================================== * * Compute the Cholesky factorization A = U'*U. 
* * big panel is divided by block-row and distributed in block * * column cyclic format */ /* for each big-panel */ for( J=0; J<n; J+=NB ) { JB = min(NB,n-J); if( num_gpus0 > (n-J)/nb ) { num_gpus = (n-J)/nb; if( (n-J)%nb != 0 ) num_gpus ++; } else { num_gpus = num_gpus0; } /* load the new big-panel by block-rows */ magma_chtodpo( num_gpus, &uplo, JB, n, J, J, nb, a, lda, dwork, NB, stream, &iinfo); #ifdef ROW_MAJOR_PROFILE start = get_current_time(); #endif /* update with the previous big-panels */ for( j=0; j<J; j+=nb ) { /* upload the diagonal of the block column (broadcast to all GPUs) */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_csetmatrix_async( nb, JB, A(j, J), lda, dTup(d, 0, J), nb, stream[d][0] ); n_local[d] = 0; } /* distribute off-diagonal blocks to GPUs */ for( jj=J+JB; jj<n; jj+=nb ) { d = ((jj-J)/nb)%num_gpus; magma_setdevice(d); jb = min(nb, n-jj); magma_csetmatrix_async( nb, jb, A(j, jj), lda, dTup(d, 0, J+JB+n_local[d]), nb, stream[d][0] ); n_local[d] += jb; } /* wait for the communication */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); } /* update the current big-panel using the previous block-row */ /* -- process the big diagonal block of the big panel */ for( jj=0; jj<JB; jj+=nb ) { // jj is 'local' column index within the big panel d = (jj/nb)%num_gpus; J2 = jj/(nb*num_gpus); magma_setdevice(d); magmablasSetKernelStream(stream[d][J2%2]); // the last stream (2) used to process off-diagonal J2 = nb*J2; jb = min(nb,JB-jj); // number of columns in this current block-row magma_cgemm( MagmaConjTrans, MagmaNoTrans, jj, jb, nb, c_neg_one, dTup(d, 0, J ), nb, dTup(d, 0, J+jj), nb, c_one, dAup(d, 0, J2), NB); magma_cherk(MagmaUpper, MagmaConjTrans, jb, nb, d_neg_one, dTup(d, 0, J+jj), nb, d_one, dAup(d, jj, J2), NB); } /* -- process the remaining big off-diagonal block of the big panel */ if( n > J+JB ) { for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][2]); /* local number of columns in the big panel */ n_local[d] = ((n-J)/(nb*num_gpus))*nb; if (d < ((n-J)/nb)%num_gpus) n_local[d] += nb; else if (d == ((n-J)/nb)%num_gpus) n_local[d] += (n-J)%nb; /* subtracting the local number of columns in the diagonal */ J2 = nb*(JB/(nb*num_gpus)); if( d < (JB/nb)%num_gpus ) J2+=nb; n_local[d] -= J2; magma_cgemm( MagmaConjTrans, MagmaNoTrans, JB, n_local[d], nb, c_neg_one, dTup(d, 0, J ), nb, dTup(d, 0, J+JB), nb, c_one, dAup(d, 0, J2), NB); } } /* wait for the previous updates */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); for( jj=0; jj<3; jj++ ) magma_queue_sync( stream[d][jj] ); magmablasSetKernelStream(NULL); } magma_setdevice(0); } /* end of updates with previous rows */ /* factor the big panel */ h = (JB+nb-1)/nb; // big diagonal of big panel will be on CPU // using two streams //magma_cpotrf2_mgpu(num_gpus, uplo, JB, n-J, J, J, nb, // dwork, NB, dt, ldda, a, lda, h, stream, event, &iinfo); // using three streams magma_cpotrf3_mgpu(num_gpus, uplo, JB, n-J, J, J, nb, dwork, NB, dt, ldda, a, lda, h, stream, event, &iinfo); if( iinfo != 0 ) { *info = J+iinfo; break; } #ifdef ROW_MAJOR_PROFILE end = get_current_time(); chol_time += GetTimerValue(start, end); #endif /* upload the off-diagonal (and diagonal!!!) 
big panel */ magma_cdtohpo(num_gpus, &uplo, JB, n, J, J, nb, NB, a, lda, dwork, NB, stream, &iinfo); //magma_cdtohpo(num_gpus, &uplo, JB, n, J, J, nb, 0, a, lda, dwork, NB, stream, &iinfo); } } else { /* ========================================================= * * Compute the Cholesky factorization A = L*L'. */ /* for each big-panel */ for( J=0; J<n; J+=NB ) { JB = min(NB,n-J); if( num_gpus0 > (n-J)/nb ) { num_gpus = (n-J)/nb; if( (n-J)%nb != 0 ) num_gpus ++; } else { num_gpus = num_gpus0; } /* load the new big-panel by block-columns */ magma_chtodpo( num_gpus, &uplo, n, JB, J, J, nb, a, lda, dwork, lddla, stream, &iinfo); /* update with the previous big-panels */ #ifdef ROW_MAJOR_PROFILE start = get_current_time(); #endif for( j=0; j<J; j+=nb ) { /* upload the diagonal of big panel */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_csetmatrix_async( JB, nb, A(J, j), lda, dT(d, J, 0), ldda, stream[d][0] ); n_local[d] = 0; } /* upload off-diagonals */ for( jj=J+JB; jj<n; jj+=nb ) { d = ((jj-J)/nb)%num_gpus; magma_setdevice(d); jb = min(nb, n-jj); magma_csetmatrix_async( jb, nb, A(jj, j), lda, dT(d, J+JB+n_local[d], 0), ldda, stream[d][0] ); n_local[d] += jb; } /* wait for the communication */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); } /* update the current big-panel using the previous block-row */ for( jj=0; jj<JB; jj+=nb ) { /* diagonal */ d = (jj/nb)%num_gpus; J2 = jj/(nb*num_gpus); magma_setdevice(d); magmablasSetKernelStream(stream[d][J2%2]); J2 = nb*J2; jb = min(nb,JB-jj); magma_cgemm( MagmaNoTrans, MagmaConjTrans, jb, jj, nb, c_neg_one, dT(d, J+jj, 0), ldda, dT(d, J, 0), ldda, c_one, dA(d, J2, 0), lddla); magma_cherk(MagmaLower, MagmaNoTrans, jb, nb, d_neg_one, dT(d, J+jj, 0), ldda, d_one, dA(d, J2, jj), lddla); } if( n > J+JB ) { /* off-diagonal */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][2]); /* local number of columns in the big panel */ n_local[d] = (((n-J)/nb)/num_gpus)*nb; if (d < ((n-J)/nb)%num_gpus) n_local[d] += nb; else if (d == ((n-J)/nb)%num_gpus) n_local[d] += (n-J)%nb; /* subtracting local number of columns in diagonal */ J2 = nb*(JB/(nb*num_gpus)); if( d < (JB/nb)%num_gpus ) J2+=nb; n_local[d] -= J2; magma_cgemm( MagmaNoTrans, MagmaConjTrans, n_local[d], JB, nb, c_neg_one, dT(d, J+JB, 0), ldda, dT(d, J, 0), ldda, c_one, dA(d, J2, 0), lddla); } } /* wait for the previous updates */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); for( jj=0; jj<3; jj++ ) magma_queue_sync( stream[d][jj] ); magmablasSetKernelStream(NULL); } magma_setdevice(0); } /* factor the big panel */ h = (JB+nb-1)/nb; // big diagonal of big panel will be on CPU // using two streams //magma_cpotrf2_mgpu(num_gpus, uplo, n-J, JB, J, J, nb, // dwork, lddla, dt, ldda, a, lda, h, stream, event, &iinfo); // using three streams magma_cpotrf3_mgpu(num_gpus, uplo, n-J, JB, J, J, nb, dwork, lddla, dt, ldda, a, lda, h, stream, event, &iinfo); if( iinfo != 0 ) { *info = J+iinfo; break; } #ifdef ROW_MAJOR_PROFILE end = get_current_time(); chol_time += GetTimerValue(start, end); #endif /* upload the off-diagonal big panel */ magma_cdtohpo( num_gpus, &uplo, n, JB, J, J, nb, JB, a, lda, dwork, lddla, stream, &iinfo); } /* end of for J */ } /* if upper */ } /* if nb */ #ifdef ROW_MAJOR_PROFILE end0 = get_current_time(); #endif if( num_gpus0 > n/nb ) { num_gpus = n/nb; if( n%nb != 0 ) num_gpus ++; } else { num_gpus = num_gpus0; } for (d=0; d<num_gpus; d++ ) { magma_setdevice(d); for( j=0; j<3; j++ ) { if( stream[d][j] != NULL 
) magma_queue_destroy( stream[d][j] ); } magma_free( dt[d] ); for( j=0; j<5; j++ ) { magma_event_destroy( event[d][j] ); } } magma_setdevice(0); #ifdef ROW_MAJOR_PROFILE printf("\n n=%d NB=%d nb=%d\n",n,NB,nb); printf(" Without memory allocation: %f / %f = %f GFlop/s\n", FLOPS_CPOTRF(n)/1000000, GetTimerValue(start0, end0), FLOPS_CPOTRF(n)/(1000000*GetTimerValue(start0, end0))); printf(" Performance %f / %f = %f GFlop/s\n", FLOPS_CPOTRF(n)/1000000, chol_time, FLOPS_CPOTRF(n)/(1000000*chol_time)); #endif return *info; } /* magma_cpotrf_ooc */
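/*
 * Illustrative sketch, not library code: how the out-of-core Cholesky above
 * chooses the big-panel width NB from the free GPU memory.  The 0.8 safety
 * factor and the nb*ldda / (n+nb)*nb / lddla terms mirror the formula in
 * magma_cpotrf_m; cudaMemGetInfo is the CUDA runtime call the routine itself
 * uses.  The helper name choose_big_panel is hypothetical.
 */
#include <cuda_runtime.h>
#include <stdio.h>

static int choose_big_panel( int n, int nb, int num_gpus, int ldda, int lddla )
{
    size_t freeMem, totalMem;
    cudaMemGetInfo( &freeMem, &totalMem );
    freeMem /= 2*sizeof(float);             /* count in complex single-precision elements */

    int  h  = (num_gpus > 2 ? num_gpus : 2);
    long NB = (long)((0.8*freeMem - (long)h*nb*ldda - (long)(n+nb)*nb) / lddla);

    if (NB >= n) {
        NB = n;                             /* whole matrix fits: in-core mode      */
    } else {
        NB = (NB/nb) * nb;                  /* out-of-core: round down to nb        */
    }
    printf("n=%d  nb=%d  -> big-panel NB=%ld (%s)\n",
           n, nb, NB, NB == n ? "in-core" : "out-of-core");
    return (int)NB;
}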
magma_int_t magma_cgmres( magma_c_sparse_matrix A, magma_c_vector b, magma_c_vector *x, magma_c_solver_par *solver_par ){ // prepare solver feedback solver_par->solver = Magma_GMRES; solver_par->numiter = 0; solver_par->info = 0; // local variables magmaFloatComplex c_zero = MAGMA_C_ZERO, c_one = MAGMA_C_ONE, c_mone = MAGMA_C_NEG_ONE; magma_int_t dofs = A.num_rows; magma_int_t i, j, k, m = 0; magma_int_t restart = min( dofs-1, solver_par->restart ); magma_int_t ldh = restart+1; float nom, rNorm, RNorm, nom0, betanom, r0 = 0.; // CPU workspace magma_setdevice(0); magmaFloatComplex *H, *HH, *y, *h1; magma_cmalloc_pinned( &H, (ldh+1)*ldh ); magma_cmalloc_pinned( &y, ldh ); magma_cmalloc_pinned( &HH, ldh*ldh ); magma_cmalloc_pinned( &h1, ldh ); // GPU workspace magma_c_vector r, q, q_t; magma_c_vinit( &r, Magma_DEV, dofs, c_zero ); magma_c_vinit( &q, Magma_DEV, dofs*(ldh+1), c_zero ); q_t.memory_location = Magma_DEV; q_t.val = NULL; q_t.num_rows = q_t.nnz = dofs; magmaFloatComplex *dy, *dH = NULL; if (MAGMA_SUCCESS != magma_cmalloc( &dy, ldh )) return MAGMA_ERR_DEVICE_ALLOC; if (MAGMA_SUCCESS != magma_cmalloc( &dH, (ldh+1)*ldh )) return MAGMA_ERR_DEVICE_ALLOC; // GPU stream magma_queue_t stream[2]; magma_event_t event[1]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); magma_event_create( &event[0] ); magmablasSetKernelStream(stream[0]); magma_cscal( dofs, c_zero, x->val, 1 ); // x = 0 magma_ccopy( dofs, b.val, 1, r.val, 1 ); // r = b nom0 = betanom = magma_scnrm2( dofs, r.val, 1 ); // nom0= || r|| nom = nom0 * nom0; solver_par->init_res = nom0; H(1,0) = MAGMA_C_MAKE( nom0, 0. ); magma_csetvector(1, &H(1,0), 1, &dH(1,0), 1); if ( (r0 = nom * solver_par->epsilon) < ATOLERANCE ) r0 = ATOLERANCE; if ( nom < r0 ) return MAGMA_SUCCESS; //Chronometry real_Double_t tempo1, tempo2; magma_device_sync(); tempo1=magma_wtime(); if( solver_par->verbose > 0 ){ solver_par->res_vec[0] = nom0; solver_par->timing[0] = 0.0; } // start iteration for( solver_par->numiter= 1; solver_par->numiter<solver_par->maxiter; solver_par->numiter++ ){ magma_ccopy(dofs, r.val, 1, q(0), 1); // q[0] = 1.0/||r|| magma_cscal(dofs, 1./H(1,0), q(0), 1); // (to be fused) for(k=1; k<=restart; k++) { q_t.val = q(k-1); magmablasSetKernelStream(stream[0]); magma_c_spmv( c_one, A, q_t, c_zero, r ); // r = A q[k] if (solver_par->ortho == Magma_MGS ) { // modified Gram-Schmidt magmablasSetKernelStream(stream[0]); for (i=1; i<=k; i++) { H(i,k) =magma_cdotc(dofs, q(i-1), 1, r.val, 1); // H(i,k) = q[i] . r magma_caxpy(dofs,-H(i,k), q(i-1), 1, r.val, 1); // r = r - H(i,k) q[i] } H(k+1,k) = MAGMA_C_MAKE( magma_scnrm2(dofs, r.val, 1), 0. ); // H(k+1,k) = sqrt(r . r) if (k < restart) { magma_ccopy(dofs, r.val, 1, q(k), 1); // q[k] = 1.0/H[k][k-1] r magma_cscal(dofs, 1./H(k+1,k), q(k), 1); // (to be fused) } } else if (solver_par->ortho == Magma_FUSED_CGS ) { // fusing cgemv with scnrm2 in classical Gram-Schmidt magmablasSetKernelStream(stream[0]); magma_ccopy(dofs, r.val, 1, q(k), 1); // dH(1:k+1,k) = q[0:k] . 
r magmablas_cgemv(MagmaTrans, dofs, k+1, c_one, q(0), dofs, r.val, 1, c_zero, &dH(1,k), 1); // r = r - q[0:k-1] dH(1:k,k) magmablas_cgemv(MagmaNoTrans, dofs, k, c_mone, q(0), dofs, &dH(1,k), 1, c_one, r.val, 1); // 1) dH(k+1,k) = sqrt( dH(k+1,k) - dH(1:k,k) ) magma_ccopyscale( dofs, k, r.val, q(k), &dH(1,k) ); // 2) q[k] = q[k] / dH(k+1,k) magma_event_record( event[0], stream[0] ); magma_queue_wait_event( stream[1], event[0] ); magma_cgetvector_async(k+1, &dH(1,k), 1, &H(1,k), 1, stream[1]); // asynch copy dH(1:(k+1),k) to H(1:(k+1),k) } else { // classical Gram-Schmidt (default) // > explicitly calling magmabls magmablasSetKernelStream(stream[0]); magmablas_cgemv(MagmaTrans, dofs, k, c_one, q(0), dofs, r.val, 1, c_zero, &dH(1,k), 1); // dH(1:k,k) = q[0:k-1] . r #ifndef SCNRM2SCALE // start copying dH(1:k,k) to H(1:k,k) magma_event_record( event[0], stream[0] ); magma_queue_wait_event( stream[1], event[0] ); magma_cgetvector_async(k, &dH(1,k), 1, &H(1,k), 1, stream[1]); #endif // r = r - q[0:k-1] dH(1:k,k) magmablas_cgemv(MagmaNoTrans, dofs, k, c_mone, q(0), dofs, &dH(1,k), 1, c_one, r.val, 1); #ifdef SCNRM2SCALE magma_ccopy(dofs, r.val, 1, q(k), 1); // q[k] = r / H(k,k-1) magma_scnrm2scale(dofs, q(k), dofs, &dH(k+1,k) ); // dH(k+1,k) = sqrt(r . r) and r = r / dH(k+1,k) magma_event_record( event[0], stream[0] ); // start sending dH(1:k,k) to H(1:k,k) magma_queue_wait_event( stream[1], event[0] ); // can we keep H(k+1,k) on GPU and combine? magma_cgetvector_async(k+1, &dH(1,k), 1, &H(1,k), 1, stream[1]); #else H(k+1,k) = MAGMA_C_MAKE( magma_scnrm2(dofs, r.val, 1), 0. ); // H(k+1,k) = sqrt(r . r) if( k<solver_par->restart ){ magmablasSetKernelStream(stream[0]); magma_ccopy(dofs, r.val, 1, q(k), 1); // q[k] = 1.0/H[k][k-1] r magma_cscal(dofs, 1./H(k+1,k), q(k), 1); // (to be fused) } #endif } } magma_queue_sync( stream[1] ); for( k=1; k<=restart; k++ ){ /* Minimization of || b-Ax || in H_k */ for (i=1; i<=k; i++) { #if defined(PRECISION_z) || defined(PRECISION_c) cblas_cdotc_sub( i+1, &H(1,k), 1, &H(1,i), 1, &HH(k,i) ); #else HH(k,i) = cblas_cdotc(i+1, &H(1,k), 1, &H(1,i), 1); #endif } h1[k] = H(1,k)*H(1,0); if (k != 1) for (i=1; i<k; i++) { for (m=i+1; m<k; m++){ HH(k,m) -= HH(k,i) * HH(m,i); } HH(k,k) -= HH(k,i) * HH(k,i) / HH(i,i); HH(k,i) = HH(k,i)/HH(i,i); h1[k] -= h1[i] * HH(k,i); } y[k] = h1[k]/HH(k,k); if (k != 1) for (i=k-1; i>=1; i--) { y[i] = h1[i]/HH(i,i); for (j=i+1; j<=k; j++) y[i] -= y[j] * HH(j,i); } m = k; rNorm = fabs(MAGMA_C_REAL(H(k+1,k))); } magma_csetmatrix_async(m, 1, y+1, m, dy, m, stream[0]); magmablasSetKernelStream(stream[0]); magma_cgemv(MagmaNoTrans, dofs, m, c_one, q(0), dofs, dy, 1, c_one, x->val, 1); magma_c_spmv( c_mone, A, *x, c_zero, r ); // r = - A * x magma_caxpy(dofs, c_one, b.val, 1, r.val, 1); // r = r + b H(1,0) = MAGMA_C_MAKE( magma_scnrm2(dofs, r.val, 1), 0. 
); // RNorm = H[1][0] = || r || RNorm = MAGMA_C_REAL( H(1,0) ); betanom = fabs(RNorm); if( solver_par->verbose > 0 ){ magma_device_sync(); tempo2=magma_wtime(); if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( betanom < r0 ) { break; } } magma_device_sync(); tempo2=magma_wtime(); solver_par->runtime = (real_Double_t) tempo2-tempo1; float residual; magma_cresidual( A, b, *x, &residual ); solver_par->iter_res = betanom; solver_par->final_res = residual; if( solver_par->numiter < solver_par->maxiter){ solver_par->info = 0; }else if( solver_par->init_res > solver_par->final_res ){ if( solver_par->verbose > 0 ){ if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = -2; } else{ if( solver_par->verbose > 0 ){ if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = -1; } // free pinned memory magma_free_pinned( H ); magma_free_pinned( y ); magma_free_pinned( HH ); magma_free_pinned( h1 ); // free GPU memory magma_free(dy); if (dH != NULL ) magma_free(dH); magma_c_vfree(&r); magma_c_vfree(&q); // free GPU streams and events //magma_queue_destroy( stream[0] ); //magma_queue_destroy( stream[1] ); magma_event_destroy( event[0] ); magmablasSetKernelStream(NULL); return MAGMA_SUCCESS; } /* magma_cgmres */
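/*
 * Illustrative sketch, not library code: the modified Gram-Schmidt branch of the
 * GMRES loop above, written for host vectors so the orthogonalization step is
 * easy to read.  q[0..k-1] are the current basis vectors, r holds A*q[k-1], and
 * the coefficients go into column k of the Hessenberg matrix (1-based in the
 * solver, 0-based here).  magma_cdotc / magma_caxpy / magma_scnrm2 on the device
 * play the roles of cdot / the axpy loop / the norm below.  The helper names
 * cdot and mgs_step are hypothetical.
 */
#include <complex.h>
#include <math.h>

static float _Complex cdot( int n, const float _Complex *x, const float _Complex *y )
{
    float _Complex s = 0;
    for (int i = 0; i < n; ++i)
        s += conjf(x[i]) * y[i];                    /* x^H y, like cdotc           */
    return s;
}

/* one MGS step: orthogonalize r against q[0..k-1], fill Hk[0..k], produce q[k] */
static void mgs_step( int n, int k, float _Complex **q, float _Complex *r,
                      float _Complex *Hk /* length k+1 */ )
{
    for (int i = 0; i < k; ++i) {
        Hk[i] = cdot( n, q[i], r );                 /* H(i,k)   = q_i . r          */
        for (int j = 0; j < n; ++j)
            r[j] -= Hk[i] * q[i][j];                /* r        = r - H(i,k) q_i   */
    }
    float nrm = sqrtf( crealf( cdot( n, r, r ) ) ); /* H(k+1,k) = ||r|| (breakdown if 0) */
    Hk[k] = nrm;
    for (int j = 0; j < n; ++j)
        q[k][j] = r[j] / nrm;                       /* q_k      = r / H(k+1,k)     */
}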
extern "C" magma_int_t magma_sgeqrf2_mgpu( magma_int_t num_gpus, magma_int_t m, magma_int_t n, float **dlA, magma_int_t ldda, float *tau, magma_int_t *info ) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= SGEQRF2_MGPU computes a QR factorization of a real M-by-N matrix A: A = Q * R. This is a GPU interface of the routine. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. dA (input/output) REAL array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix dA. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be dividable by 16. TAU (output) REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define dlA(dev, i, j) (dlA[dev] + (i) + (j)*(ldda)) #define hpanel(i) (hpanel + (i)) // set to NULL to make cleanup easy: free(NULL) does nothing. float *dwork[MagmaMaxGPUs]={NULL}, *dpanel[MagmaMaxGPUs]={NULL}; float *hwork=NULL, *hpanel=NULL; magma_queue_t stream[MagmaMaxGPUs][2]={{NULL}}; magma_event_t panel_event[MagmaMaxGPUs]={NULL}; magma_int_t i, j, min_mn, dev, ldhpanel, lddwork, rows; magma_int_t ib, nb; magma_int_t lhwork, lwork; magma_int_t panel_dev, i_local, i_nb_local, n_local[MagmaMaxGPUs], la_dev, dpanel_offset; magma_queue_t cqueue; magmablasGetKernelStream( &cqueue ); magma_device_t cdevice; magma_getdevice( &cdevice ); *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } min_mn = min(m,n); if (min_mn == 0) return *info; nb = magma_get_sgeqrf_nb( m ); /* dwork is (n*nb) --- for T (nb*nb) and slarfb work ((n-nb)*nb) --- * + dpanel (ldda*nb), on each GPU. * I think slarfb work could be smaller, max(n_local[:]). * Oddly, T and slarfb work get stacked on top of each other, both with lddwork=n. * on GPU that owns panel, set dpanel = dlA(dev,i,i_local). * on other GPUs, set dpanel = dwork[dev] + dpanel_offset. */ lddwork = n; dpanel_offset = lddwork*nb; for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); if ( MAGMA_SUCCESS != magma_smalloc( &(dwork[dev]), (lddwork + ldda)*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto CLEANUP; } } /* hwork is MAX( workspace for sgeqrf (n*nb), two copies of T (2*nb*nb) ) * + hpanel (m*nb). * for last block, need 2*n*nb total. 
*/ ldhpanel = m; lhwork = max( n*nb, 2*nb*nb ); lwork = max( lhwork + ldhpanel*nb, 2*n*nb ); if ( MAGMA_SUCCESS != magma_smalloc_pinned( &hwork, lwork )) { *info = MAGMA_ERR_HOST_ALLOC; goto CLEANUP; } hpanel = hwork + lhwork; /* Set the number of local n for each GPU */ for( dev=0; dev < num_gpus; dev++ ) { n_local[dev] = ((n/nb)/num_gpus)*nb; if (dev < (n/nb) % num_gpus) n_local[dev] += nb; else if (dev == (n/nb) % num_gpus) n_local[dev] += n % nb; } for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magma_queue_create( &stream[dev][0] ); magma_queue_create( &stream[dev][1] ); magma_event_create( &panel_event[dev] ); } if ( nb < min_mn ) { /* Use blocked code initially */ // Note: as written, ib cannot be < nb. for( i = 0; i < min_mn-nb; i += nb ) { /* Set the GPU number that holds the current panel */ panel_dev = (i/nb) % num_gpus; /* Set the local index where the current panel is (j==i) */ i_local = i/(nb*num_gpus)*nb; ib = min(min_mn-i, nb); rows = m-i; /* Send current panel to the CPU, after panel_event indicates it has been updated */ magma_setdevice( panel_dev ); magma_queue_wait_event( stream[panel_dev][1], panel_event[panel_dev] ); magma_sgetmatrix_async( rows, ib, dlA(panel_dev, i, i_local), ldda, hpanel(i), ldhpanel, stream[panel_dev][1] ); magma_queue_sync( stream[panel_dev][1] ); // Factor panel lapackf77_sgeqrf( &rows, &ib, hpanel(i), &ldhpanel, tau+i, hwork, &lhwork, info ); if ( *info != 0 ) { fprintf( stderr, "error %d\n", (int) *info ); } // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . H(i+ib-1) lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, hpanel(i), &ldhpanel, tau+i, hwork, &ib ); spanel_to_q( MagmaUpper, ib, hpanel(i), ldhpanel, hwork + ib*ib ); // Send the current panel back to the GPUs for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); if (dev == panel_dev) dpanel[dev] = dlA(dev, i, i_local); else dpanel[dev] = dwork[dev] + dpanel_offset; magma_ssetmatrix_async( rows, ib, hpanel(i), ldhpanel, dpanel[dev], ldda, stream[dev][0] ); } for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magma_queue_sync( stream[dev][0] ); } // TODO: if spanel_to_q copied whole block, wouldn't need to restore // -- just send the copy to the GPUs. // TODO: also, could zero out the lower triangle and use Azzam's larfb w/ gemm. /* Restore the panel */ sq_to_panel( MagmaUpper, ib, hpanel(i), ldhpanel, hwork + ib*ib ); if (i + ib < n) { /* Send the T matrix to the GPU. 
*/ for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magma_ssetmatrix_async( ib, ib, hwork, ib, dwork[dev], lddwork, stream[dev][0] ); } la_dev = (panel_dev+1) % num_gpus; for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magmablasSetKernelStream( stream[dev][0] ); if (dev == la_dev && i+nb < min_mn-nb) { // If not last panel, // for look-ahead panel, apply H' to A(i:m,i+ib:i+2*ib) i_nb_local = (i+nb)/(nb*num_gpus)*nb; magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local), ldda, // C dwork[dev]+ib, lddwork ); // work magma_event_record( panel_event[dev], stream[dev][0] ); // for trailing matrix, apply H' to A(i:m,i+2*ib:n) magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, n_local[dev]-(i_nb_local+ib), ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local+ib), ldda, // C dwork[dev]+ib, lddwork ); // work } else { // for trailing matrix, apply H' to A(i:m,i+ib:n) i_nb_local = i_local; if (dev <= panel_dev) { i_nb_local += ib; } magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, n_local[dev]-i_nb_local, ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local), ldda, // C dwork[dev]+ib, lddwork ); // work } } // Restore top of panel (after larfb is done) magma_setdevice( panel_dev ); magma_ssetmatrix_async( ib, ib, hpanel(i), ldhpanel, dlA(panel_dev, i, i_local), ldda, stream[panel_dev][0] ); } } } else { i = 0; } /* Use unblocked code to factor the last or only block row. */ if (i < min_mn) { rows = m-i; for( j=i; j < n; j += nb ) { panel_dev = (j/nb) % num_gpus; i_local = j/(nb*num_gpus)*nb; ib = min( n-j, nb ); magma_setdevice( panel_dev ); magma_sgetmatrix( rows, ib, dlA(panel_dev, i, i_local), ldda, hwork + (j-i)*rows, rows ); } // needs lwork >= 2*n*nb: // needs (m-i)*(n-i) for last block row, bounded by nb*n. // needs (n-i)*nb for sgeqrf work, bounded by n*nb. ib = n-i; // total columns in block row lhwork = lwork - ib*rows; lapackf77_sgeqrf( &rows, &ib, hwork, &rows, tau+i, hwork + ib*rows, &lhwork, info ); if ( *info != 0 ) { fprintf( stderr, "error %d\n", (int) *info ); } for( j=i; j < n; j += nb ) { panel_dev = (j/nb) % num_gpus; i_local = j/(nb*num_gpus)*nb; ib = min( n-j, nb ); magma_setdevice( panel_dev ); magma_ssetmatrix( rows, ib, hwork + (j-i)*rows, rows, dlA(panel_dev, i, i_local), ldda ); } } CLEANUP: // free(NULL) does nothing. // check that queues and events are non-zero before destroying them, though. for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); if ( stream[dev][0] ) { magma_queue_destroy( stream[dev][0] ); } if ( stream[dev][1] ) { magma_queue_destroy( stream[dev][1] ); } if ( panel_event[dev] ) { magma_event_destroy( panel_event[dev] ); } magma_free( dwork[dev] ); } magma_free_pinned( hwork ); magma_setdevice( cdevice ); magmablasSetKernelStream( cqueue ); return *info; } /* magma_sgeqrf2_mgpu */
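/*
 * Illustrative sketch, not library code: the 1-D block-cyclic column layout used
 * by the multi-GPU QR above.  A global column block of width nb starting at
 * column i lives on device (i/nb) % num_gpus at local offset i/(nb*num_gpus)*nb;
 * the same panel_dev / i_local / n_local arithmetic appears in magma_sgeqrf2_mgpu.
 * The helper name block_cyclic_map is hypothetical, and the fixed array size
 * assumes at most 8 devices (the usual MagmaMaxGPUs).
 */
#include <stdio.h>

static void block_cyclic_map( int n, int nb, int num_gpus )
{
    int n_local[8] = {0};                           /* per-device column counts    */

    for (int dev = 0; dev < num_gpus; ++dev) {
        n_local[dev] = ((n/nb) / num_gpus) * nb;
        if      (dev <  (n/nb) % num_gpus) n_local[dev] += nb;
        else if (dev == (n/nb) % num_gpus) n_local[dev] += n % nb;
    }

    for (int i = 0; i < n; i += nb) {
        int panel_dev = (i/nb) % num_gpus;          /* device owning this block    */
        int i_local   = i/(nb*num_gpus) * nb;       /* its first local column      */
        printf("cols [%3d,%3d) -> GPU %d, local cols [%3d,%3d) of %d\n",
               i, (i+nb < n ? i+nb : n), panel_dev,
               i_local, i_local + (i+nb < n ? nb : n-i), n_local[panel_dev]);
    }
}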
extern "C" magma_int_t magma_cgetrf_m(magma_int_t num_gpus0, magma_int_t m, magma_int_t n, magmaFloatComplex *a, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CGETRF_m computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix may not fit entirely in the GPU memory. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Note: The factorization of big panel is done calling multiple-gpu-interface. Pivots are applied on GPU within the big panel. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. 
===================================================================== */ #define A(i,j) (a + (j)*lda + (i)) #define inAT(d,i,j) (dAT[d] + (i)*nb*ldn_local + (j)*nb) #define inPT(d,i,j) (dPT[d] + (i)*nb*nb + (j)*nb*maxm) //#define PROFILE #ifdef PROFILE float flops, time_rmajor = 0, time_rmajor2 = 0, time_rmajor3 = 0, time_mem = 0; magma_timestr_t start, start1, start2, end1, end, start0 = get_current_time(); #endif magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *dAT[MagmaMaxGPUs], *dA[MagmaMaxGPUs], *dPT[MagmaMaxGPUs]; magma_int_t iinfo = 0, nb, nbi, maxm, n_local[MagmaMaxGPUs], ldn_local; magma_int_t N, M, NB, NBk, I, d, num_gpus; magma_int_t ii, jj, h, offset, ib, rows, s; magma_queue_t stream[MagmaMaxGPUs][2]; magma_event_t event[MagmaMaxGPUs][2]; *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* initialize nb */ nb = magma_get_cgetrf_nb(m); maxm = ((m + 31)/32)*32; /* figure out NB */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(magmaFloatComplex); /* number of columns in the big panel */ h = 1+(2+num_gpus0); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); char * ngr_nb_char = getenv("MAGMA_NGR_NB"); if( ngr_nb_char != NULL ) NB = max( nb, min( NB, atoi(ngr_nb_char) ) ); //NB = 5*max(nb,32); if( num_gpus0 > ceil((float)NB/nb) ) { num_gpus = (int)ceil((float)NB/nb); h = 1+(2+num_gpus); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); } else { num_gpus = num_gpus0; } if( num_gpus*NB >= n ) { #ifdef CHECK_CGETRF_OOC printf( " * still fit in GPU memory.\n" ); #endif NB = n; } else { #ifdef CHECK_CGETRF_OOC printf( " * don't fit in GPU memory.\n" ); #endif NB = num_gpus*NB; NB = max(nb,(NB / nb) * nb); /* making sure it's devisable by nb (x64) */ } #ifdef CHECK_CGETRF_OOC if( NB != n ) printf( " * running in out-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n",n,NB,nb,(float)freeMem ); else printf( " * running in in-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n",n,NB,nb,(float)freeMem ); #endif if ( (nb <= 1) || (nb >= min(m,n)) ) { /* Use CPU code for scalar of one tile. */ lapackf77_cgetrf(&m, &n, a, &lda, ipiv, info); } else { /* Use hybrid blocked code. 
*/ /* allocate memory on GPU to store the big panel */ #ifdef PROFILE start = get_current_time(); #endif n_local[0] = (NB/nb)/num_gpus; if( NB%(nb*num_gpus) != 0 ) n_local[0] ++; n_local[0] *= nb; ldn_local = ((n_local[0]+31)/32)*32; for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_cmalloc( &dA[d], (ldn_local+h*nb)*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dPT[d] = dA[d] + nb*maxm; /* for storing the previous panel from CPU */ dAT[d] = dA[d] + h*nb*maxm; /* for storing the big panel */ magma_queue_create( &stream[d][0] ); magma_queue_create( &stream[d][1] ); magma_event_create( &event[d][0] ); magma_event_create( &event[d][1] ); } //magma_setdevice(0); #ifdef PROFILE end = get_current_time(); printf( " memory-allocation time: %e\n",GetTimerValue(start, end)/1000.0 ); start = get_current_time(); #endif for( I=0; I<n; I+=NB ) { M = m; N = min( NB, n-I ); /* number of columns in this big panel */ s = min(max(m-I,0),N)/nb; /* number of small block-columns in this big panel */ maxm = ((M + 31)/32)*32; if( num_gpus0 > ceil((float)N/nb) ) { num_gpus = (int)ceil((float)N/nb); } else { num_gpus = num_gpus0; } for( d=0; d<num_gpus; d++ ) { n_local[d] = ((N/nb)/num_gpus)*nb; if (d < (N/nb)%num_gpus) n_local[d] += nb; else if (d == (N/nb)%num_gpus) n_local[d] += N%nb; } ldn_local = ((n_local[0]+31)/32)*32; #ifdef PROFILE start2 = get_current_time(); #endif /* upload the next big panel into GPU, transpose (A->A'), and pivot it */ magmablas_csetmatrix_transpose_mgpu(num_gpus, stream, A(0,I), lda, dAT, ldn_local, dA, maxm, M, N, nb); for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } #ifdef PROFILE start1 = get_current_time(); #endif /* == --------------------------------------------------------------- == */ /* == loop around the previous big-panels to update the new big-panel == */ for( offset = 0; offset<min(m,I); offset+=NB ) { NBk = min( m-offset, NB ); /* start sending the first tile from the previous big-panels to gpus */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); nbi = min( nb, NBk ); magma_csetmatrix_async( (M-offset), nbi, A(offset,offset), lda, dA[d], (maxm-offset), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][0] ); /* transpose */ magmablas_ctranspose2( inPT(d,0,0), nb, dA[d], maxm-offset, M-offset, nbi); } /* applying the pivot from the previous big-panel */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][1]); magmablas_cpermute_long3( inAT(d,0,0), ldn_local, ipiv, NBk, offset ); } /* == going through each block-column of previous big-panels == */ for( jj=0, ib=offset/nb; jj<NBk; jj+=nb, ib++ ) { ii = offset+jj; rows = maxm - ii; nbi = min( nb, NBk-jj ); for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); /* wait for a block-column on GPU */ magma_queue_sync( stream[d][0] ); /* start sending next column */ if( jj+nb < NBk ) { magma_csetmatrix_async( (M-ii-nb), min(nb,NBk-jj-nb), A(ii+nb,ii+nb), lda, dA[d], (rows-nb), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][(1+jj/nb)%2] ); /* transpose next column */ magmablas_ctranspose2( inPT(d,0,(1+jj/nb)%2), nb, dA[d], rows-nb, M-ii-nb, nb); } /* update with the block column */ 
magmablasSetKernelStream(stream[d][1]); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n_local[d], nbi, c_one, inPT(d,0,(jj/nb)%2), nb, inAT(d,ib,0), ldn_local ); if( M > ii+nb ) { magma_cgemm( MagmaNoTrans, MagmaNoTrans, n_local[d], M-(ii+nb), nbi, c_neg_one, inAT(d,ib,0), ldn_local, inPT(d,1,(jj/nb)%2), nb, c_one, inAT(d,ib+1,0), ldn_local ); } magma_event_record( event[d][(jj/nb)%2], stream[d][1] ); } /* end of for each block-columns in a big-panel */ } } /* end of for each previous big-panels */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } /* calling magma-gpu interface to panel-factorize the big panel */ if( M > I ) { //magma_cgetrf1_mgpu(num_gpus, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, &a[I*lda], lda, // (magma_queue_t **)stream, &iinfo); magma_cgetrf2_mgpu(num_gpus, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, A(0,I), lda, stream, &iinfo); if( iinfo < 0 ) { *info = iinfo; break; } else if( iinfo != 0 ) { *info = iinfo + I * NB; //break; } /* adjust pivots */ for( ii=I; ii<min(I+N,m); ii++ ) ipiv[ii] += I; } #ifdef PROFILE end1 = get_current_time(); time_rmajor += GetTimerValue(start1, end1); time_rmajor3 += GetTimerValue(start2, end1); time_mem += (GetTimerValue(start2, end1)-GetTimerValue(start1, end1))/1000.0; #endif /* download the current big panel to CPU */ magmablas_cgetmatrix_transpose_mgpu(num_gpus, stream, dAT, ldn_local, A(0,I), lda, dA, maxm, M, N, nb); for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } #ifdef PROFILE end1 = get_current_time(); time_rmajor2 += GetTimerValue(start1, end1); #endif } /* end of for */ #ifdef PROFILE end = get_current_time(); flops = FLOPS_CGETRF( m, n ) / 1000000; printf(" NB=%d nb=%d\n",NB,nb); printf(" memcopy and transpose %e seconds\n",time_mem ); printf(" total time %e seconds\n",GetTimerValue(start0,end)/1000.0); printf(" Performance %f GFlop/s, %f seconds without htod and dtoh\n", flops / time_rmajor, time_rmajor /1000.0); printf(" Performance %f GFlop/s, %f seconds with htod\n", flops / time_rmajor3, time_rmajor3/1000.0); printf(" Performance %f GFlop/s, %f seconds with dtoh\n", flops / time_rmajor2, time_rmajor2/1000.0); printf(" Performance %f GFlop/s, %f seconds without memory-allocation\n", flops / GetTimerValue(start, end), GetTimerValue(start,end)/1000.0); #endif for( d=0; d<num_gpus0; d++ ) { magma_setdevice(d); magma_free( dA[d] ); magma_event_destroy( event[d][0] ); magma_event_destroy( event[d][1] ); magma_queue_destroy( stream[d][0] ); magma_queue_destroy( stream[d][1] ); magmablasSetKernelStream(NULL); } magma_setdevice(0); } if( *info >= 0 ) magma_cgetrf_piv(m, n, NB, a, lda, ipiv, info); return *info; } /* magma_cgetrf_m */
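/*
 * Illustrative sketch, not library code: how the out-of-core LU above sizes the
 * big panel NB from free GPU memory and the MAGMA_NGR_NB environment override,
 * following the formulas in magma_cgetrf_m (the 0.8 factor, the h = 1+(2+num_gpus)
 * staging buffers, clamping to [nb, user value], rounding to a multiple of nb).
 * Simplification: the step that reduces num_gpus when NB is small is omitted.
 * The helper name choose_lu_panel is hypothetical.
 */
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

static int choose_lu_panel( int m, int n, int nb, int num_gpus )
{
    size_t freeMem, totalMem;
    cudaMemGetInfo( &freeMem, &totalMem );
    freeMem /= 2*sizeof(float);                     /* complex single-precision elements */

    int maxm = ((m + 31)/32)*32;
    int h    = 1 + (2 + num_gpus);                  /* staging columns per device        */
    int NB   = (int)(0.8*freeMem/maxm - h*nb);

    const char *ngr_nb = getenv("MAGMA_NGR_NB");    /* optional user override            */
    if (ngr_nb != NULL) {
        int user = atoi(ngr_nb);
        NB = (NB < user ? NB : user);
        NB = (NB > nb   ? NB : nb);
    }

    if (num_gpus*NB >= n) {
        NB = n;                                     /* everything fits: in-core mode     */
    } else {
        NB = num_gpus*NB;
        NB = (NB > nb ? (NB/nb)*nb : nb);           /* out-of-core: multiple of nb       */
    }
    printf("m=%d n=%d nb=%d gpus=%d -> NB=%d (%s)\n",
           m, n, nb, num_gpus, NB, NB == n ? "in-core" : "out-of-core");
    return NB;
}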
/** Purpose ------- DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = MagmaUpper, or dA = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] num_gpus INTEGER The number of GPUs to be used for the factorization. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of dA is stored; - = MagmaLower: Lower triangle of dA is stored. @param[in] n INTEGER The order of the matrix dA. N >= 0. @param[in,out] d_lA DOUBLE_PRECISION array of pointers on the GPU, dimension (num_gpus) On entry, the symmetric matrix dA distributed over GPUs (d_lA[d] points to the local matrix on the d-th GPU). It is distributed in 1D block column or row cyclic (with the block size of nb) if UPLO = MagmaUpper or MagmaLower, respectively. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. @param[in] ldda INTEGER The leading dimension of the array d_lA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_dposv_comp ********************************************************************/ extern "C" magma_int_t magma_dpotrf_mgpu(magma_int_t num_gpus, magma_uplo_t uplo, magma_int_t n, double **d_lA, magma_int_t ldda, magma_int_t *info) { magma_int_t j, nb, d, lddp, h; const char* uplo_ = lapack_uplo_const( uplo ); double *work; int upper = (uplo == MagmaUpper); double *dwork[MagmaMaxGPUs]; magma_queue_t stream[MagmaMaxGPUs][3]; magma_event_t event[MagmaMaxGPUs][5]; *info = 0; nb = magma_get_dpotrf_nb(n); if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (!upper) { lddp = nb*(n/(nb*num_gpus)); if ( n%(nb*num_gpus) != 0 ) lddp += min(nb,n-num_gpus*lddp); if ( ldda < lddp ) *info = -4; } else if ( ldda < n ) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); if (num_gpus == 1 && ((nb <= 1) || (nb >= n)) ) { /* Use unblocked code. 
*/ magma_setdevice(0); if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, n*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_dgetmatrix( n, n, d_lA[0], ldda, work, n ); lapackf77_dpotrf(uplo_, &n, work, &n, info); magma_dsetmatrix( n, n, work, n, d_lA[0], ldda ); magma_free_pinned( work ); } else { lddp = nb*((n+nb-1)/nb); for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_dmalloc( &dwork[d], num_gpus*nb*lddp )) { for( j=0; j < d; j++ ) { magma_setdevice(j); magma_free( dwork[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } for( j=0; j < 3; j++ ) magma_queue_create( &stream[d][j] ); for( j=0; j < 5; j++ ) magma_event_create( &event[d][j] ); } magma_setdevice(0); h = 1; //num_gpus; //(n+nb-1)/nb; if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, n*nb*h )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } if (upper) { /* with two streams */ //magma_dpotrf2_mgpu(num_gpus, uplo, n, n, 0, 0, nb, d_lA, ldda, dwork, lddp, work, n, // h, stream, event, info); /* with three streams */ magma_dpotrf3_mgpu(num_gpus, uplo, n, n, 0, 0, nb, d_lA, ldda, dwork, lddp, work, n, h, stream, event, info); } else { /* with two streams */ //magma_dpotrf2_mgpu(num_gpus, uplo, n, n, 0, 0, nb, d_lA, ldda, dwork, lddp, work, nb*h, // h, stream, event, info); /* with three streams */ magma_dpotrf3_mgpu(num_gpus, uplo, n, n, 0, 0, nb, d_lA, ldda, dwork, lddp, work, nb*h, h, stream, event, info); } /* clean up */ for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); for( j=0; j < 3; j++ ) { magma_queue_sync( stream[d][j] ); magma_queue_destroy( stream[d][j] ); } for( j=0; j < 5; j++ ) magma_event_destroy( event[d][j] ); magma_free( dwork[d] ); } magma_free_pinned( work ); } /* end of not lapack */ magma_setdevice( orig_dev ); return *info; } /* magma_dpotrf_mgpu */
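/*
 * Illustrative sketch, not library code: the minimum local leading dimension that
 * magma_dpotrf_mgpu checks for the MagmaLower case, where d_lA[d] holds a 1-D
 * block-row-cyclic slice of the n-by-n matrix with block size nb.  The largest
 * slice belongs to the first device(s), and ldda must cover that many rows.  The
 * helper name min_local_ld is hypothetical.
 */
static int min_local_ld( int n, int nb, int num_gpus )
{
    int lddp = nb * (n / (nb*num_gpus));            /* full block-row cycles per device  */
    if (n % (nb*num_gpus) != 0) {
        int rem = n - num_gpus*lddp;                /* rows left after the full cycles   */
        lddp += (rem < nb ? rem : nb);              /* first device gets at most nb more */
    }
    return lddp;                                    /* required: ldda >= lddp            */
}
/* e.g. n=1000, nb=64, 3 GPUs: every device owns 5 full block-rows (320 rows) and
 * the leftover 40 rows land on the first device, so min_local_ld returns 360. */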
/** Purpose ------- ZHETRD_HE2HB reduces a complex Hermitian matrix A to real symmetric band-diagonal form T by an orthogonal similarity transformation: Q**H * A * Q = T. This version stores the triangular matrices T used in the accumulated Householder transformations (I - V T V'). Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A COMPLEX_16 array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if UPLO = MagmaUpper, the Upper band-diagonal of A is overwritten by the corresponding elements of the band-diagonal matrix T, and the elements above the band diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; if UPLO = MagmaLower, the the Lower band-diagonal of A is overwritten by the corresponding elements of the band-diagonal matrix T, and the elements below the band-diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] tau COMPLEX_16 array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= 1. For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] dT COMPLEX_16 array on the GPU, dimension N*NB, where NB is the optimal blocksize. On exit dT holds the upper triangular matrices T from the accumulated Householder transformations (I - V T V') used in the factorization. The nb x nb matrices T are ordered consecutively in memory one after another. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n-1) . . . H(2) H(1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in A(1:i-1,i+1), and tau in TAU(i). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(n-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i), and tau in TAU(i). 
The contents of A on exit are illustrated by the following examples with n = 5: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( d e v2 v3 v4 ) ( d ) ( d e v3 v4 ) ( e d ) ( d e v4 ) ( v1 e d ) ( d e ) ( v1 v2 e d ) ( d ) ( v1 v2 v3 e d ) where d and e denote diagonal and off-diagonal elements of T, and vi denotes an element of the vector defining H(i). @ingroup magma_zheev_2stage ********************************************************************/ extern "C" magma_int_t magma_zhetrd_he2hb_mgpu( magma_uplo_t uplo, magma_int_t n, magma_int_t nb, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *tau, magmaDoubleComplex *work, magma_int_t lwork, magmaDoubleComplex_ptr dAmgpu[], magma_int_t ldda, magmaDoubleComplex_ptr dTmgpu[], magma_int_t lddt, magma_int_t ngpu, magma_int_t distblk, magma_queue_t queues[][20], magma_int_t nqueue, magma_int_t *info) { #define A(a_1,a_2) ( A + ((a_2)-1)*( lda) + (a_1)-1) #define tau_ref(a_1) (tau + (a_1)-1) #define dT(a_0, a_1, a_2) (dTmgpu[a_0] + ((a_2)-1)*(lddt) + (a_1)-1) #define dA(a_0, a_1, a_2) (dAmgpu[a_0] + ((a_2)-1)*(ldda) + (a_1)-1) magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex c_neg_half = MAGMA_Z_NEG_HALF; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_zero = MAGMA_Z_ZERO; double d_one = MAGMA_D_ONE; magma_int_t pm, pn, indi, indj, pk; magma_int_t pm_old=0, pn_old=0, indi_old=0, flipV=-1; magma_int_t iblock, idev, di; int i; int lwkopt; int lquery; assert (nqueue >= 3); assert (nqueue >= (ngpu+1)); *info = 0; int upper = (uplo == MagmaUpper); lquery = (lwork == -1); if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } else if (lwork < 1 && ! lquery) { *info = -9; } /* Determine the block size. */ lwkopt = n * nb; if (*info == 0) { work[0] = MAGMA_Z_MAKE( lwkopt, 0 ); } if (*info != 0) return *info; else if (lquery) return *info; /* Quick return if possible */ if (n == 0) { work[0] = c_one; return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); // limit to 16 threads magma_int_t orig_threads = magma_get_lapack_numthreads(); magma_set_lapack_numthreads( min(orig_threads,16) ); magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2]; magma_int_t nbcmplx=0; magma_buildconnection_mgpu(gnode, &nbcmplx, ngpu); #ifdef ENABLE_DEBUG printf(" Initializing communication pattern.... 
GPU-ncmplx %d\n\n", nbcmplx); #endif magmaDoubleComplex *dspace[MagmaMaxGPUs]; magmaDoubleComplex *dwork[MagmaMaxGPUs], *dworkbis[MagmaMaxGPUs]; magmaDoubleComplex *dvall[MagmaMaxGPUs], *dv[MagmaMaxGPUs], *dw[MagmaMaxGPUs]; magmaDoubleComplex *workngpu[MagmaMaxGPUs+1]; magma_event_t redevents[MagmaMaxGPUs][MagmaMaxGPUs*MagmaMaxGPUs+10]; magma_int_t nbevents = MagmaMaxGPUs*MagmaMaxGPUs; magma_int_t lddv = ldda; magma_int_t lddw = lddv; magma_int_t dwrk2siz = ldda*nb*(ngpu+1); magma_int_t worksiz = n*nb; magma_int_t devworksiz = 2*nb*lddv + nb*lddw + nb*ldda + dwrk2siz; // 2*dv(dv0+dv1) + dw + dwork +dworkbis // local allocation and stream creation // TODO check malloc for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_zmalloc( &dspace[dev], devworksiz ); magma_zmalloc_pinned ( &workngpu[dev], worksiz); dvall[dev] = dspace[dev]; dw[dev] = dvall[dev] + 2*nb*lddv; dwork[dev] = dw[dev] + nb*lddw; dworkbis[dev] = dwork[dev] + nb*ldda; magmablasSetKernelStream( queues[ dev ][ 0 ] ); for( magma_int_t i = 0; i < nbevents; ++i ) { cudaEventCreateWithFlags(&redevents[dev][i],cudaEventDisableTiming); } } magma_zmalloc_pinned ( &workngpu[ngpu], worksiz); magmaDoubleComplex *worktest = NULL; //magma_zmalloc_cpu( &worktest, n*nb ); // not used // ====================== magmaDoubleComplex *hT = work + lwork - nb*nb; lwork -= nb*nb; memset( hT, 0, nb*nb*sizeof(magmaDoubleComplex)); if (upper) { printf("ZHETRD_HE2HB is not yet implemented for upper matrix storage. Exit.\n"); exit(1); } else { /* Reduce the lower triangle of A */ for (i = 1; i <= n-nb; i += nb) { indi = i+nb; indj = i; pm = n - i - nb + 1; //pn = min(i+nb-1, n-nb) -i + 1; pn = nb; /* Get the current panel (no need for the 1st iteration) */ if (i > 1 ) { // zpanel_to_q copy the upper oof diagonal part of // the matrix to work to be restored later. acctually // the zero's and one's putted are not used this is only // because we don't have a function that copy only the // upper part of A to be restored after copying the // lookahead panel that has been computted from GPU to CPU. zpanel_to_q(MagmaUpper, pn-1, A(i, i+1), lda, work); // find the device who own the panel then send it to the CPU. // below a -1 was added and then a -1 was done on di because of the fortran indexing iblock = ((i-1) / distblk) / ngpu; // local block id di = iblock*distblk + (i-1)%distblk; // local index in parent matrix idev = ((i-1) / distblk) % ngpu; // device with this block //printf("Receiving panel ofsize %d %d from idev %d A(%d,%d) \n",(pm+pn), pn,idev,i-1,di); magma_setdevice( idev ); //magma_device_sync(); magma_zgetmatrix_async( (pm+pn), pn, dA(idev, i, di+1), ldda, A( i, i), lda, queues[ idev ][ nqueue-1 ] ); //magma_setdevice( 0 ); //printf("updating zher2k on A(%d,%d) of size %d %d \n",indi_old+pn_old-1,indi_old+pn_old-1,pm_old-pn_old,pn_old); // compute ZHER2K_MGPU magmablas_zher2k_mgpu2( MagmaLower, MagmaNoTrans, pm_old-pn_old, pn_old, c_neg_one, dv, pm_old, pn_old, dw, pm_old, pn_old, d_one, dAmgpu, ldda, indi_old+pn_old-1, ngpu, distblk, queues, 2 ); //magma_setdevice( 0 ); magma_setdevice( idev ); magma_queue_sync( queues[idev][ nqueue-1 ] ); //magma_setdevice( 0 ); zq_to_panel(MagmaUpper, pn-1, A(i, i+1), lda, work); } /* ========================================================== QR factorization on a panel starting nb off of the diagonal. Prepare the V and T matrices. 
========================================================== */ lapackf77_zgeqrf(&pm, &pn, A(indi, indj), &lda, tau_ref(i), work, &lwork, info); /* Form the matrix T */ pk=min(pm,pn); lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr, &pm, &pk, A(indi, indj), &lda, tau_ref(i), hT, &nb); /* Prepare V - put 0s in the upper triangular part of the panel (and 1s on the diagonal), temporaly storing the original in work */ zpanel_to_q(MagmaUpper, pk, A(indi, indj), lda, work); /* Send V and T from the CPU to the GPU */ // To be able to overlap the GET with the ZHER2K // it should be done on last stream. // TO Avoid a BUG that is overwriting the old_V // used atthis moment by zher2k with the new_V // send it now, we decide to have a flipflop // vector of Vs. if step%2=0 use V[0] else use V[nb*n] flipV = ((i-1)/nb)%2; for( magma_int_t dev = 0; dev < ngpu; ++dev ) { dv[dev] = dvall[dev] + flipV*nb*lddv; } for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); // send V magma_zsetmatrix_async( pm, pk, A(indi, indj), lda, dv[dev], pm, queues[dev][nqueue-1] ); // Send the triangular factor T to the GPU magma_zsetmatrix_async( pk, pk, hT, nb, dT(dev, 1, i), lddt, queues[dev][nqueue-1] ); } /* ========================================================== Compute W: 1. X = A (V T) 2. W = X - 0.5* V * (T' * (V' * X)) ========================================================== */ for( magma_int_t dev = 0; dev < ngpu; ++dev ) { // dwork = V T magma_setdevice( dev ); magmablasSetKernelStream( queues[ dev ][ nqueue-1 ] ); magma_queue_sync( queues[dev][nqueue-1] ); magma_zgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk, c_one, dv[dev], pm, dT(dev, 1, i), lddt, c_zero, dwork[dev], pm); } // =============================================== // SYNC TO BE SURE THAT BOTH V AND T WERE // RECEIVED AND VT IS COMPUTED and SYR2K is done // =============================================== for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); for( magma_int_t s = 0; s < nqueue; ++s ) magma_queue_sync( queues[dev][s] ); } // compute ZHEMM_MGPU // The broadcast of the result done inside this function // should be done in stream [0] because i am assuming this // for the GEMMs below otherwise I have to SYNC over the // Broadcasting stream. if (ngpu == 1) { magmablasSetKernelStream( queues[ 0 ][ 0 ] ); magma_zhemm(MagmaLeft, uplo, pm, pk, c_one, dAmgpu[0]+(indi-1)*ldda+(indi-1), ldda, dwork[0], pm, c_zero, dw[0], pm); } else { magmablas_zhemm_mgpu_com( MagmaLeft, uplo, pm, pk, c_one, dAmgpu, ldda, indi-1, dwork, pm, c_zero, dw, pm, dworkbis, dwrk2siz, worktest, pm, workngpu, worksiz, ngpu, distblk, queues, nqueue-1, redevents, nbevents, gnode, nbcmplx); } /* dwork = V*T already ==> dwork' = T'*V' * compute T'*V'*X ==> dwork'*W ==> * dwork + pm*nb = ((T' * V') * X) = dwork' * X = dwork' * W */ for( magma_int_t dev = 0; dev < ngpu; ++dev ) { // Here we have to wait until the broadcast of ZHEMM has been done. 
// Note that the broadcast should be done on stream[0] so in a way // we can continue here on the same stream and avoid a sync magma_setdevice( dev ); magmablasSetKernelStream( queues[ dev ][ 0 ] ); // magma_queue_sync( queues[dev][0] ); magma_zgemm(MagmaConjTrans, MagmaNoTrans, pk, pk, pm, c_one, dwork[dev], pm, dw[dev], pm, c_zero, dworkbis[dev], nb); /* W = X - 0.5 * V * T'*V'*X * = X - 0.5 * V * (dwork + pm*nb) = W - 0.5 * V * (dwork + pm*nb) */ magma_zgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk, c_neg_half, dv[dev], pm, dworkbis[dev], nb, c_one, dw[dev], pm); } /* restore the panel it is put here to overlap with the previous GEMM*/ zq_to_panel(MagmaUpper, pk, A(indi, indj), lda, work); // =============================================== // SYNC TO BE SURE THAT BOTH V AND W ARE DONE // =============================================== // Synchronise to be sure that W has been computed // because next ZHER2K use streaming and may happen // that lunch a gemm on stream 2 while stream 0 // which compute those 2 GEMM above has not been // computed and also used for the same reason in // the panel update below and also for the last HER2K for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_queue_sync( queues[dev][0] ); } /* ========================================================== Update the unreduced submatrix A(i+ib:n,i+ib:n), using an update of the form: A := A - V*W' - W*V' ========================================================== */ if (i + nb <= n-nb) { /* There would be next iteration; do lookahead - update the next panel */ // below a -1 was added and then a -1 was done on di because of the fortran indexing iblock = ((indi-1) / distblk) / ngpu; // local block id di = iblock*distblk + (indi-1)%distblk; // local index in parent matrix idev = ((indi-1) / distblk) % ngpu; // device with this block magma_setdevice( idev ); magmablasSetKernelStream( queues[ idev ][ nqueue-1 ] ); //magma_queue_sync( queues[idev][0] ); removed because the sync has been done in the loop above magma_zgemm(MagmaNoTrans, MagmaConjTrans, pm, pn, pn, c_neg_one, dv[idev], pm, dw[idev], pm, c_one, dA(idev, indi, di+1), ldda); magma_zgemm(MagmaNoTrans, MagmaConjTrans, pm, pn, pn, c_neg_one, dw[idev], pm, dv[idev], pm, c_one, dA(idev, indi, di+1), ldda); //printf("updating next panel distblk %d idev %d on A(%d,%d) of size %d %d %d \n",distblk,idev,indi-1,di,pm,pn,pn); } else { /* no look-ahead as this is last iteration */ // below a -1 was added and then a -1 was done on di because of the fortran indexing iblock = ((indi-1) / distblk) / ngpu; // local block id di = iblock*distblk + (indi-1)%distblk; // local index in parent matrix idev = ((indi-1) / distblk) % ngpu; // device with this block magma_setdevice( idev ); magmablasSetKernelStream( queues[ idev ][ 0 ] ); //printf("LAST ZHER2K idev %d on A(%d,%d) of size %d \n",idev, indi-1,di,pk); magma_zher2k(MagmaLower, MagmaNoTrans, pk, pk, c_neg_one, dv[idev], pm, dw[idev], pm, d_one, dA(idev, indi, di+1), ldda); /* Send the last block to the CPU */ zpanel_to_q(MagmaUpper, pk-1, A(n-pk+1, n-pk+2), lda, work); magma_zgetmatrix( pk, pk, dA(idev, indi, di+1), ldda, A(n-pk+1, n-pk+1), lda ); zq_to_panel(MagmaUpper, pk-1, A(n-pk+1, n-pk+2), lda, work); } indi_old = indi; //indj_old = indj; pm_old = pm; pn_old = pn; } // end loop for (i) }// end of LOWER //magma_setdevice( 0 ); for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_free( dspace[dev]); magma_free_pinned(workngpu[dev]); for( magma_int_t e = 0; e < nbevents; ++e ) { 
magma_event_destroy( redevents[dev][e] ); } } magma_free_pinned(workngpu[ngpu]); magma_free_cpu(worktest); magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); magma_set_lapack_numthreads( orig_threads ); work[0] = MAGMA_Z_MAKE( lwkopt, 0 ); return *info; } /* magma_zhetrd_he2hb_mgpu */
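/*
 * Illustrative sketch (not part of the original MAGMA source): the 1-D
 * block-cyclic ownership arithmetic used repeatedly above.  For a 0-based
 * global column j, block size distblk and ngpu devices it returns the owning
 * device and the local column index on that device; these are the same
 * formulas the routine applies to j = i-1 (Fortran-style indexing).  The
 * type and function names are hypothetical.
 */
typedef struct { magma_int_t dev, local; } block_cyclic_owner_t;

static block_cyclic_owner_t block_cyclic_owner( magma_int_t j, magma_int_t distblk, magma_int_t ngpu )
{
    block_cyclic_owner_t o;
    magma_int_t block = j / distblk;              /* global block index       */
    o.dev   = block % ngpu;                       /* device owning this block */
    o.local = (block / ngpu) * distblk            /* full local blocks before */
            + j % distblk;                        /* offset inside the block  */
    return o;
}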
/* //////////////////////////////////////////////////////////////////////////// -- Testing magma_ssymm_mgpu */ int main( int argc, char** argv) { TESTING_INIT(); float c_neg_one = MAGMA_S_NEG_ONE; float calpha = MAGMA_S_MAKE( 3.456, 5.678 ); float cbeta = MAGMA_S_MAKE( 1.234, 2.456 ); real_Double_t gflops, gpu_perf=0., cpu_perf=0., gpu_time=0., cpu_time=0.; real_Double_t gpu_perf2=0., gpu_time2=0.; float error=0., errorbis=0., work[1]; float *hA, *hX, *hB, *hR; float *dA[MagmaMaxGPUs], *dX[MagmaMaxGPUs], *dB[MagmaMaxGPUs], *dwork[MagmaMaxGPUs], *hwork[MagmaMaxGPUs+1]; float *dA2; magma_int_t M, N, size, lda, ldda, msize, nb, nstream; magma_int_t ione = 1; magma_int_t iseed[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); // default values nb = (opts.nb > 0 ? opts.nb : 64); nstream = (opts.nstream > 0 ? opts.nstream : 2); magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2]; magma_int_t nbcmplx = 0; magma_buildconnection_mgpu(gnode, &nbcmplx, opts.ngpu); printf("Initializing communication pattern... GPU-ncmplx %d\n\n", (int) nbcmplx); for (int i=0; i < nbcmplx; ++i) { int myngpu = gnode[i][MagmaMaxGPUs]; printf("cmplx %d has %d gpu ", i, myngpu); for(int j=0; j < myngpu; ++j) printf(" %d", (int) gnode[i][j]); printf("\n"); } magma_int_t nbevents = 2; magma_queue_t streams[MagmaMaxGPUs][20]; magma_event_t redevents[MagmaMaxGPUs][20]; magma_event_t redevents2[MagmaMaxGPUs][MagmaMaxGPUs*MagmaMaxGPUs+10]; for( int d = 0; d < opts.ngpu; ++d ) { for( magma_int_t i = 0; i < nstream; ++i ) { magma_queue_create( &streams[d][i] ); } for( magma_int_t i = 0; i < nbevents; ++i ) { cudaEventCreateWithFlags(&redevents[d][i], cudaEventDisableTiming); cudaEventCreateWithFlags(&redevents2[d][i], cudaEventDisableTiming); } } printf( "nb %d, ngpu %d, nstream %d version %d\n", (int) nb, (int) opts.ngpu, (int) nstream, (int) opts.version ); printf(" M N nb offset CPU GFlop/s (sec) GPU GFlop/s (sec) CUBLAS hemm (sec) ||R|| / ||A||*||X||\n"); printf("=========================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { M = opts.msize[itest]; N = opts.nsize[itest]; for( int offset = 0; offset < N; offset += min(N,nb) ) { for( int iter = 0; iter < opts.niter; ++iter ) { msize = M - offset; lda = M; ldda = ((M + 31)/32)*32; size = lda*M; gflops = FLOPS_SSYMM( MagmaLeft, (float)msize, (float)N ) / 1e9; magma_int_t dworksiz = ldda*N*3; magma_int_t hworksiz = lda*N; TESTING_MALLOC_CPU( hA, float, lda*M ); TESTING_MALLOC_CPU( hX, float, lda*N ); TESTING_MALLOC_CPU( hB, float, lda*N ); TESTING_MALLOC_PIN( hR, float, lda*N ); for( int d = 0; d < opts.ngpu; ++d ) { magma_int_t mlocal = ((M / nb) / opts.ngpu + 1) * nb; magma_setdevice( d ); TESTING_MALLOC_DEV( dA[d], float, ldda*mlocal ); TESTING_MALLOC_DEV( dX[d], float, ldda*N ); TESTING_MALLOC_DEV( dB[d], float, ldda*N ); TESTING_MALLOC_DEV( dwork[d], float, dworksiz ); TESTING_MALLOC_PIN( hwork[d], float, hworksiz ); } TESTING_MALLOC_PIN( hwork[opts.ngpu], float, lda*N ); if ( opts.check ) { magma_setdevice( 0 ); TESTING_MALLOC_DEV( dA2, float, ldda*M ); } lapackf77_slarnv( &ione, iseed, &size, hA ); magma_smake_symmetric( M, hA, lda ); size = lda*N; lapackf77_slarnv( &ione, iseed, &size, hX ); lapackf77_slarnv( &ione, iseed, &size, hB ); lapackf77_slacpy( "Full", &M, &N, hB, &lda, hR, &lda ); /* ==================================================================== Performs operation using 
MAGMA =================================================================== */ magma_ssetmatrix_1D_col_bcyclic( M, M, hA, lda, dA, ldda, opts.ngpu, nb ); for( int d = 0; d < opts.ngpu; ++d ) { magma_setdevice( d ); //magmablasSetKernelStream( streams[ d ][ 0 ] ); magma_ssetmatrix( M, N, hX, lda, dX[d], ldda ); //if (d == 0) magma_ssetmatrix( M, N, hB, lda, dB[d], ldda ); // this is wrong coz when offset != 0 the gpu who do the beta*C may be not 0 so this should be related to stdev(starting device who own i=0 first col) magma_ssetmatrix( M, N, hB, lda, dB[d], ldda ); } //memset(hR, 0, lda*N*sizeof(float)); //trace_init( 1, opts.ngpu, nstream, (magma_queue_t*) streams ); //magma_int_t offset = 0; //nb; gpu_time = magma_sync_wtime(0); magmablas_ssymm_mgpu_com( MagmaLeft, MagmaLower, msize, N, calpha, dA, ldda, offset, dX, ldda, cbeta, dB, ldda, dwork, dworksiz, hR, lda, hwork, hworksiz, opts.ngpu, nb, streams, nstream, redevents2, nbevents, gnode, nbcmplx); gpu_time = magma_sync_wtime(0) - gpu_time; gpu_perf = gflops / gpu_time; #ifdef TRACING char buf[80]; snprintf( buf, sizeof(buf), "ssymm-m%d-n%d-nb%d-stream%d-ngpu%d-run%d.svg", (int) M, (int) N, (int) nb, (int) nstream, (int) opts.ngpu, (int) j ); trace_finalize( buf, "trace.css" ); #endif /* ==================================================================== Performs operation using CUBLAS =================================================================== */ if ( opts.check && iter == 0 ) { magma_setdevice( 0 ); magmablasSetKernelStream( 0 ); magma_ssetmatrix( M, M, hA, lda, dA2, ldda ); magma_ssetmatrix( M, N, hX, lda, dX[0], ldda ); magma_ssetmatrix( M, N, hB, lda, dwork[0], ldda ); gpu_time2 = magma_sync_wtime(0); magma_ssymm( MagmaLeft, MagmaLower, msize, N, calpha, dA2+offset*ldda+offset, ldda, dX[0], ldda, cbeta, dwork[0], ldda ); gpu_time2 = magma_sync_wtime(0) - gpu_time2; gpu_perf2 = gflops / gpu_time2; } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.check ) { // store ||A||*||X|| errorbis = lapackf77_slange("fro", &msize, &msize, hA+offset*lda+offset, &lda, work ); errorbis *= lapackf77_slange("fro", &msize, &N, hX, &lda, work ); //printf( "A =" ); magma_sprint( M, M, hA, lda ); //printf( "X =" ); magma_sprint( M, N, hX, lda ); //printf( "B =" ); magma_sprint( M, N, hB, lda ); cpu_time = magma_wtime(); blasf77_ssymm( "Left", "Lower", &msize, &N, &calpha, hA+offset*lda+offset, &lda, hX, &lda, &cbeta, hB, &lda ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* trace_file = fopen("AJETE/C", "w"); for (int j = 0; j < N; j++) for (int i = 0; i < siz; i++) fprintf(trace_file, "%10d%10d%40.30e\n", i+1, j+1, hB[j*lda+i]); fclose(trace_file); */ magma_int_t firstprint=0; for(magma_int_t dev=0; dev < opts.ngpu; ++dev) { magma_setdevice( dev ); magma_sgetmatrix( M, N, dB[dev], ldda, hR, lda ); // compute relative error ||R||/||A||*||X||, where R := B_magma - B_lapack = R - B size = lda*N; blasf77_saxpy( &size, &c_neg_one, hB, &ione, hR, &ione ); error = lapackf77_slange("fro", &msize, &N, hR, &lda, work) / errorbis; //printf( "R =" ); magma_sprint( M, N, hR, lda ); if (firstprint == 0) { printf( "%5d %5d %5d %5d %7.1f (%7.4f) %7.1f (%7.4f) %7.1f (%7.4f) %8.2e %s\n", (int) M, (int) N, (int) nb, (int) offset, cpu_perf, cpu_time, gpu_perf, gpu_time, gpu_perf2, gpu_time2, error, (error < tol ? "ok" : "failed") ); } else { printf( "%89s %8.2e %s\n", " ", error, (error < tol ? 
"ok" : "failed") ); } status += ! (error < tol); firstprint =1; } } else { printf( "%5d %5d %5d %5d --- ( --- ) %7.1f (%7.4f) --- ( --- ) ---\n", (int) M, (int) N, (int) nb, (int) offset, gpu_perf, gpu_time ); } TESTING_FREE_CPU( hA ); TESTING_FREE_CPU( hX ); TESTING_FREE_CPU( hB ); TESTING_FREE_PIN( hR ); for( int d = 0; d < opts.ngpu; ++d ) { magma_setdevice( d ); TESTING_FREE_DEV( dA[d] ); TESTING_FREE_DEV( dX[d] ); TESTING_FREE_DEV( dB[d] ); TESTING_FREE_DEV( dwork[d] ); TESTING_FREE_PIN( hwork[d] ); } TESTING_FREE_PIN( hwork[opts.ngpu] ); if ( opts.check ) { magma_setdevice( 0 ); TESTING_FREE_DEV( dA2 ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } // offset printf( "\n" ); } for( int d = 0; d < opts.ngpu; ++d ) { magma_setdevice( d ); for( magma_int_t i = 0; i < nstream; ++i ) { magma_queue_destroy( streams[d][i] ); } for( magma_int_t i = 0; i < nbevents; ++i ) { magma_event_destroy( redevents[d][i] ); magma_event_destroy( redevents2[d][i] ); } } TESTING_FINALIZE(); return status; }
/** Purpose ------- CHETRD reduces a complex Hermitian matrix A to real symmetric tridiagonal form T by a unitary similarity transformation: Q**H * A * Q = T. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] nqueue INTEGER The number of GPU queues used for update. 10 >= nqueue > 0. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if UPLO = MagmaUpper, the diagonal and first superdiagonal of A are overwritten by the corresponding elements of the tridiagonal matrix T, and the elements above the first superdiagonal, with the array TAU, represent the unitary matrix Q as a product of elementary reflectors; if UPLO = MagmaLower, the diagonal and first subdiagonal of A are overwritten by the corresponding elements of the tridiagonal matrix T, and the elements below the first subdiagonal, with the array TAU, represent the unitary matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] d REAL array, dimension (N) The diagonal elements of the tridiagonal matrix T: D(i) = A(i,i). @param[out] e REAL array, dimension (N-1) The off-diagonal elements of the tridiagonal matrix T: E(i) = A(i,i+1) if UPLO = MagmaUpper, E(i) = A(i+1,i) if UPLO = MagmaLower. @param[out] tau COMPLEX array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) COMPLEX array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= N*NB, where NB is the optimal blocksize given by magma_get_chetrd_nb(). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n-1) . . . H(2) H(1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in A(1:i-1,i+1), and tau in TAU(i). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(n-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i), and tau in TAU(i).
The contents of A on exit are illustrated by the following examples with n = 5: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( d e v2 v3 v4 ) ( d ) ( d e v3 v4 ) ( e d ) ( d e v4 ) ( v1 e d ) ( d e ) ( v1 v2 e d ) ( d ) ( v1 v2 v3 e d ) where d and e denote diagonal and off-diagonal elements of T, and vi denotes an element of the vector defining H(i). @ingroup magma_cheev_comp ********************************************************************/ extern "C" magma_int_t magma_chetrd_mgpu( magma_int_t ngpu, magma_int_t nqueue, magma_uplo_t uplo, magma_int_t n, magmaFloatComplex *A, magma_int_t lda, float *d, float *e, magmaFloatComplex *tau, magmaFloatComplex *work, magma_int_t lwork, magma_int_t *info) { #define A(i, j) (A + (j)*lda + (i)) #define dA(id, i, j) (dA[(id)] + (j)*ldda + (i)) #define dW(id, i, j) (dW[(id)] + (j)*ldda + (i)) /* Constants */ const magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; const magmaFloatComplex c_one = MAGMA_C_ONE; const float d_one = MAGMA_D_ONE; /* Local variables */ const char* uplo_ = lapack_uplo_const( uplo ); magma_int_t nlocal, ldda; magma_int_t nb = magma_get_chetrd_nb(n), ib, ib2; #ifdef PROFILE_SY2RK float mv_time = 0.0; float up_time = 0.0; #endif magma_int_t kk, nx; magma_int_t i, ii, iii, j, dev, i_n; magma_int_t iinfo; magma_int_t ldwork, lddw, lwkopt, ldwork2, lhwork; // set pointers to NULL so it is safe to goto CLEANUP if any malloc fails. magma_queue_t queues[MagmaMaxGPUs][10] = { { NULL, NULL } }; magma_queue_t queues0[MagmaMaxGPUs] = { NULL }; magmaFloatComplex *hwork = NULL; magmaFloatComplex_ptr dwork2[MagmaMaxGPUs] = { NULL }; magmaFloatComplex_ptr dA[MagmaMaxGPUs] = { NULL }; magmaFloatComplex_ptr dW[MagmaMaxGPUs] = { NULL }; *info = 0; bool upper = (uplo == MagmaUpper); bool lquery = (lwork == -1); if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } else if (lwork < nb*n && ! lquery) { *info = -9; } else if ( nqueue > 2 ) { *info = 2; // TODO fix } /* Determine the block size. 
*/ ldwork = n; lwkopt = n * nb; if (*info == 0) { work[0] = magma_cmake_lwork( lwkopt ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { work[0] = c_one; return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); //#define PROFILE_SY2RK #ifdef PROFILE_SY2RK float times[11] = { 0 }; magma_event_t start, stop; float etime; magma_setdevice( 0 ); magma_event_create( &start ); magma_event_create( &stop ); #endif ldda = magma_roundup( lda, 32 ); lddw = ldda; nlocal = nb*(1 + n/(nb*ngpu)); ldwork2 = ldda*( magma_ceildiv( n, nb ) + 1); // i.e., ldda*(blocks + 1) for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); // TODO fix memory leak if ( MAGMA_SUCCESS != magma_cmalloc( &dA[dev], nlocal*ldda + 3*lddw*nb ) || MAGMA_SUCCESS != magma_cmalloc( &dwork2[dev], ldwork2 ) ) { *info = MAGMA_ERR_DEVICE_ALLOC; goto CLEANUP; } dW[dev] = dA[dev] + nlocal*ldda; for( kk=0; kk < nqueue; kk++ ) { magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[dev][kk] ); } queues0[dev] = queues[dev][0]; } lhwork = nqueue*ngpu*n; if ( MAGMA_SUCCESS != magma_cmalloc_pinned( &hwork, lhwork ) ) { *info = MAGMA_ERR_HOST_ALLOC; goto CLEANUP; } // nx <= n is required // use LAPACK for n < 3000, otherwise switch at 512 if (n < 3000) nx = n; else nx = 512; if (upper) { /* Copy the matrix to the GPU */ if (1 <= n-nx) { magma_chtodhe( ngpu, uplo, n, nb, A, lda, dA, ldda, queues, &iinfo ); } /* Reduce the upper triangle of A. Columns 1:kk are handled by the unblocked method. */ for (i = nb*((n-1)/nb); i >= nx; i -= nb) { ib = min(nb, n-i); ii = nb*(i/(nb*ngpu)); dev = (i/nb)%ngpu; /* wait for the next panel */ if (i != nb*((n-1)/nb)) { magma_setdevice( dev ); magma_queue_sync( queues[dev][0] ); } magma_clatrd_mgpu( ngpu, uplo, i+ib, ib, nb, A(0, 0), lda, e, tau, work, ldwork, dA, ldda, 0, dW, i+ib, hwork, lhwork, dwork2, ldwork2, queues0 ); magma_cher2k_mgpu( ngpu, MagmaUpper, MagmaNoTrans, nb, i, ib, c_neg_one, dW, i+ib, 0, d_one, dA, ldda, 0, nqueue, queues ); /* get the next panel */ if (i-nb >= nx ) { ib2 = min(nb, n-(i-nb)); ii = nb*((i-nb)/(nb*ngpu)); dev = ((i-nb)/nb)%ngpu; magma_setdevice( dev ); magma_cgetmatrix_async( (i-nb)+ib2, ib2, dA(dev, 0, ii), ldda, A(0, i-nb), lda, queues[dev][0] ); } /* Copy superdiagonal elements back into A, and diagonal elements into D */ for (j = i; j < i+ib; ++j) { if ( j > 0 ) { *A(j-1,j) = MAGMA_C_MAKE( e[j - 1], 0 ); } d[j] = MAGMA_C_REAL( *A(j, j) ); } } /* end of for i=... 
*/ if ( nx > 0 ) { if (1 <= n-nx) { /* else A is already on CPU */ for (i=0; i < nx; i += nb) { ib = min(nb, n-i); ii = nb*(i/(nb*ngpu)); dev = (i/nb)%ngpu; magma_setdevice( dev ); magma_cgetmatrix_async( nx, ib, dA(dev, 0, ii), ldda, A(0, i), lda, queues[dev][0] ); } } for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); magma_queue_sync( queues[dev][0] ); } /* Use CPU code to reduce the last or only block */ lapackf77_chetrd( uplo_, &nx, A(0, 0), &lda, d, e, tau, work, &lwork, &iinfo ); } } else { trace_init( 1, ngpu, nqueue, queues ); /* Copy the matrix to the GPU */ if (1 <= n-nx) { magma_chtodhe( ngpu, uplo, n, nb, A, lda, dA, ldda, queues, &iinfo ); } /* Reduce the lower triangle of A */ for (i = 0; i < n-nx; i += nb) { ib = min(nb, n-i); ii = nb*(i/(nb*ngpu)); dev = (i/nb)%ngpu; /* Reduce columns i:i+ib-1 to tridiagonal form and form the matrix W which is needed to update the unreduced part of the matrix */ /* Get the current panel (no need for the 1st iteration) */ if (i != 0) { magma_setdevice( dev ); trace_gpu_start( dev, 0, "comm", "get" ); magma_cgetmatrix_async( n-i, ib, dA(dev, i, ii), ldda, A(i,i), lda, queues[dev][0] ); trace_gpu_end( dev, 0 ); magma_queue_sync( queues[dev][0] ); magma_setdevice( 0 ); } magma_clatrd_mgpu( ngpu, uplo, n-i, ib, nb, A(i, i), lda, &e[i], &tau[i], work, ldwork, dA, ldda, i, dW, n-i, hwork, lhwork, dwork2, ldwork2, queues0 ); #ifdef PROFILE_SY2RK magma_setdevice( 0 ); if ( i > 0 ) { cudaEventElapsedTime( &etime, start, stop ); up_time += (etime/1000.0); } magma_event_record( start, 0 ); #endif magma_cher2k_mgpu( ngpu, MagmaLower, MagmaNoTrans, nb, n-i-ib, ib, c_neg_one, dW, n-i, ib, d_one, dA, ldda, i+ib, nqueue, queues ); #ifdef PROFILE_SY2RK magma_setdevice( 0 ); magma_event_record( stop, 0 ); #endif /* Copy subdiagonal elements back into A, and diagonal elements into D */ for (j = i; j < i+ib; ++j) { if ( j+1 < n ) { *A(j+1,j) = MAGMA_C_MAKE( e[j], 0 ); } d[j] = MAGMA_C_REAL( *A(j, j) ); } } /* for i=... */ /* Use CPU code to reduce the last or only block */ if ( i < n ) { iii = i; i_n = n-i; if ( i > 0 ) { for (; i < n; i += nb) { ib = min(nb, n-i); ii = nb*(i/(nb*ngpu)); dev = (i/nb)%ngpu; magma_setdevice( dev ); magma_cgetmatrix_async( i_n, ib, dA(dev, iii, ii), ldda, A(iii, i), lda, queues[dev][0] ); } for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); magma_queue_sync( queues[dev][0] ); } } lapackf77_chetrd( uplo_, &i_n, A(iii, iii), &lda, &d[iii], &e[iii], &tau[iii], work, &lwork, &iinfo ); } } for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); for( kk=0; kk < nqueue; kk++ ) { magma_queue_sync( queues[dev][kk] ); } } #ifdef PROFILE_SY2RK magma_setdevice( 0 ); if ( n > nx ) { cudaEventElapsedTime( &etime, start, stop ); up_time += (etime/1000.0); } magma_event_destroy( start ); magma_event_destroy( stop ); #endif trace_finalize( "chetrd.svg", "trace.css" ); #ifdef PROFILE_SY2RK printf( " n=%d nb=%d\n", n, nb ); printf( " Time in CLARFG: %.2e seconds\n", times[0] ); //printf( " Time in CHEMV : %.2e seconds\n", mv_time ); printf( " Time in CHER2K: %.2e seconds\n", up_time ); #endif CLEANUP: for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); for( kk=0; kk < nqueue; kk++ ) { magma_queue_destroy( queues[dev][kk] ); } magma_free( dA[dev] ); magma_free( dwork2[dev] ); } magma_free_pinned( hwork ); magma_setdevice( orig_dev ); work[0] = magma_cmake_lwork( lwkopt ); return *info; } /* magma_chetrd */
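/*
 * Illustrative usage sketch (not part of the original MAGMA source): the
 * LWORK = -1 workspace query documented above, followed by the actual call.
 * The argument order follows the magma_chetrd_mgpu definition in this file;
 * magma_cmalloc_pinned / magma_free_pinned are the allocators already used
 * here.  Error handling is abbreviated, and nqueue is assumed to satisfy
 * the routine's nqueue <= 2 check.
 */
static magma_int_t example_run_chetrd_mgpu( magma_int_t ngpu, magma_int_t nqueue,
                                            magma_uplo_t uplo, magma_int_t n,
                                            magmaFloatComplex *A, magma_int_t lda,
                                            float *d, float *e, magmaFloatComplex *tau )
{
    magma_int_t info;
    magmaFloatComplex query;

    /* 1) workspace query: lwork = -1 returns the optimal size in work[0] */
    magma_chetrd_mgpu( ngpu, nqueue, uplo, n, A, lda, d, e, tau, &query, -1, &info );
    if ( info != 0 )
        return info;

    magma_int_t lwork = (magma_int_t) MAGMA_C_REAL( query );

    /* 2) allocate pinned workspace and run the reduction */
    magmaFloatComplex *work;
    if ( MAGMA_SUCCESS != magma_cmalloc_pinned( &work, lwork ) )
        return MAGMA_ERR_HOST_ALLOC;

    magma_chetrd_mgpu( ngpu, nqueue, uplo, n, A, lda, d, e, tau, work, lwork, &info );
    magma_free_pinned( work );
    return info;
}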
/** Purpose ======= ZHETRF computes the factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method. The form of the factorization is A = U*D*U**H or A = L*D*L**H where U (or L) is a product of permutation and unit upper (lower) triangular matrices, and D is Hermitian and block diagonal with 1-by-1 and 2-by-2 diagonal blocks. This is the blocked version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] UPLO CHARACTER*1 - = 'U': Upper triangle of A is stored; - = 'L': Lower triangle of A is stored. @param[in] N INTEGER The order of the matrix A. N >= 0. @param[in,out] A COMPLEX*16 array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, the block diagonal matrix D and the multipliers used to obtain the factor U or L (see below for further details). @param[in] LDA INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] IPIV INTEGER array, dimension (N) Details of the interchanges and the block structure of D. If IPIV(k) > 0, then rows and columns k and IPIV(k) were interchanged and D(k,k) is a 1-by-1 diagonal block. If UPLO = 'U' and IPIV(k) = IPIV(k-1) < 0, then rows and columns k-1 and -IPIV(k) were interchanged and D(k-1:k,k-1:k) is a 2-by-2 diagonal block. If UPLO = 'L' and IPIV(k) = IPIV(k+1) < 0, then rows and columns k+1 and -IPIV(k) were interchanged and D(k:k+1,k:k+1) is a 2-by-2 diagonal block. @param[out] INFO INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, D(i,i) is exactly zero. The factorization has been completed, but the block diagonal matrix D is exactly singular, and division by zero will occur if it is used to solve a system of equations. Further Details =============== If UPLO = 'U', then A = U*D*U', where U = P(n)*U(n)* ... *P(k)U(k)* ..., i.e., U is a product of terms P(k)*U(k), where k decreases from n to 1 in steps of 1 or 2, and D is a block diagonal matrix with 1-by-1 and 2-by-2 diagonal blocks D(k). P(k) is a permutation matrix as defined by IPIV(k), and U(k) is a unit upper triangular matrix, such that if the diagonal block D(k) is of order s (s = 1 or 2), then ( I v 0 ) k-s U(k) = ( 0 I 0 ) s ( 0 0 I ) n-k k-s s n-k If s = 1, D(k) overwrites A(k,k), and v overwrites A(1:k-1,k). If s = 2, the upper triangle of D(k) overwrites A(k-1,k-1), A(k-1,k), and A(k,k), and v overwrites A(1:k-2,k-1:k). If UPLO = 'L', then A = L*D*L', where L = P(1)*L(1)* ... *P(k)*L(k)* ..., i.e., L is a product of terms P(k)*L(k), where k increases from 1 to n in steps of 1 or 2, and D is a block diagonal matrix with 1-by-1 and 2-by-2 diagonal blocks D(k). P(k) is a permutation matrix as defined by IPIV(k), and L(k) is a unit lower triangular matrix, such that if the diagonal block D(k) is of order s (s = 1 or 2), then ( I 0 0 ) k-1 L(k) = ( 0 I 0 ) s ( 0 v I ) n-k-s+1 k-1 s n-k-s+1 If s = 1, D(k) overwrites A(k,k), and v overwrites A(k+1:n,k). If s = 2, the lower triangle of D(k) overwrites A(k,k), A(k+1,k), and A(k+1,k+1), and v overwrites A(k+2:n,k:k+1). 
@ingroup magma_zhetrf_comp ********************************************************************/ extern "C" magma_int_t magma_zhetrf( magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info) { #define A(i, j) ( A + (j)*lda + (i)) #define dA(i, j) (dA + (j)*ldda + (i)) /* .. Local Scalars .. */ magma_int_t upper; magma_int_t nb = magma_get_zhetrf_nb(n); magma_int_t iinfo = 0, nk, kb; /* .. Executable Statements .. */ /* Test the input parameters. */ *info = 0; upper = (uplo == MagmaUpper); if ( !upper && uplo != MagmaLower ) { *info = -1; } else if ( n < 0 ) { *info = -2; } else if ( lda < max( 1, n ) ) { *info = -4; } if ( *info != 0 ) { magma_xerbla( __func__, -(*info) ); return *info; } magma_int_t ldda = 32*((n+31)/32); magmaDoubleComplex *dA, *dW; if ((MAGMA_SUCCESS != magma_zmalloc( &dA, n*ldda )) || (MAGMA_SUCCESS != magma_zmalloc( &dW, (1+nb)*ldda ))) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_t stream[2]; magma_event_t event[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); magma_event_create( &event[0] ); magma_event_create( &event[1] ); trace_init( 1, 1, 2, (CUstream_st**)stream ); /* copy matrix to GPU */ trace_gpu_start( 0, 0, "set", "setA" ); //magma_zsetmatrix_async( n, n, A(0,0), lda, dA(0,0), ldda, stream[0] ); if ( upper ) { for (int k = 0; k < n; k+=nb ) { kb = min(nb, n-k); magma_zsetmatrix_async( k+kb, kb, A(0,k), lda, dA(0,k), ldda, stream[0] ); } } else { for (int k = 0; k < n; k+=nb ) { kb = min(nb, n-k); magma_zsetmatrix_async( n-k, kb, A(k,k), lda, dA(k,k), ldda, stream[0] ); } } trace_gpu_end( 0, 0 ); if ( upper ) { /* Factorize A as U*D*U' using the upper triangle of A K is the main loop index, decreasing from N to 1 in steps of KB, where KB is the number of columns factorized by ZLAHEF; KB is either NB or NB-1, or K for the last block */ kb = min(n,nb); for (int k = n-1; k >= 0; k-=kb ) { nk = k+1; kb = min(nb, nk); if ( k+1 > nb ) { /* Factorize columns k-kb+1:k of A and use blocked code to update columns 1:k-kb */ magma_zlahef_gpu( MagmaUpper, nk, kb, &kb, A( 0, 0 ), lda, dA( 0, 0 ), ldda, &ipiv[0], dW, ldda, stream, event, &iinfo ); } else { /* Use unblocked code to factorize columns 1:k of A */ magma_queue_sync( stream[0] ); magma_zgetmatrix( nk, nk, dA( 0, 0 ),ldda, A( 0, 0 ),lda ); lapackf77_zhetf2( MagmaUpperStr, &nk, A( 0, 0 ), &lda, &ipiv[0], &iinfo ); kb = k+1; } /* Set INFO on the first occurrence of a zero pivot */ if ( *info == 0 && iinfo > 0 ) *info = iinfo; } } else { /* Factorize A as L*D*L' using the lower triangle of A K is the main loop index, increasing from 1 to N in steps of KB, where KB is the number of columns factorized by ZLAHEF; KB is either NB or NB-1, or N-K+1 for the last block */ for (int k = 0; k < n; k += kb ) { nk = n-k; kb = min(nb, n - k); if ( k < n-nb ) { /* Factorize columns k:k+kb-1 of A and use blocked code to update columns k+kb:n */ magma_zlahef_gpu( MagmaLower, nk, nb, &kb, A( k, k ), lda, dA( k, k ), ldda, &ipiv[k], dW, ldda, stream, event, &iinfo ); } else { /* Use unblocked code to factorize columns k:n of A */ magma_queue_sync( stream[0] ); magma_zgetmatrix( nk,nk, dA(k,k),ldda, A(k,k),lda ); lapackf77_zhetf2( MagmaLowerStr, &nk, A( k, k ), &lda, &ipiv[k], &iinfo ); } /* Set INFO on the first occurrence of a zero pivot */ if ( *info == 0 && iinfo > 0 ) *info = iinfo + k; /* Adjust IPIV */ for (int j = k; j < k + kb; j ++) { if ( ipiv[j] > 0 ) { ipiv[j] = ipiv[j] + k; } else { ipiv[j] = ipiv[j] - k; } } } } 
trace_finalize( "zhetrf.svg","trace.css" ); magma_queue_sync( stream[0] ); magma_queue_sync( stream[1] ); magmablasSetKernelStream( NULL ); magma_event_destroy( event[0] ); magma_event_destroy( event[1] ); magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free( dA ); magma_free( dW ); return *info; /* End of ZHETRF */ }
extern "C" magma_int_t magma_sgmres( magma_s_sparse_matrix A, magma_s_vector b, magma_s_vector *x, magma_s_solver_par *solver_par, magma_queue_t queue ) { magma_int_t stat = 0; // set queue for old dense routines magma_queue_t orig_queue; magmablasGetKernelStream( &orig_queue ); magma_int_t stat_cpu = 0, stat_dev = 0; // prepare solver feedback solver_par->solver = Magma_GMRES; solver_par->numiter = 0; solver_par->info = MAGMA_SUCCESS; // local variables float c_zero = MAGMA_S_ZERO, c_one = MAGMA_S_ONE, c_mone = MAGMA_S_NEG_ONE; magma_int_t dofs = A.num_rows; magma_int_t i, j, k, m = 0; magma_int_t restart = min( dofs-1, solver_par->restart ); magma_int_t ldh = restart+1; float nom, rNorm, RNorm, nom0, betanom, r0 = 0.; // CPU workspace //magma_setdevice(0); float *H, *HH, *y, *h1; stat_cpu += magma_smalloc_pinned( &H, (ldh+1)*ldh ); stat_cpu += magma_smalloc_pinned( &y, ldh ); stat_cpu += magma_smalloc_pinned( &HH, ldh*ldh ); stat_cpu += magma_smalloc_pinned( &h1, ldh ); if( stat_cpu != 0){ magma_free_pinned( H ); magma_free_pinned( y ); magma_free_pinned( HH ); magma_free_pinned( h1 ); magmablasSetKernelStream( orig_queue ); return MAGMA_ERR_HOST_ALLOC; } // GPU workspace magma_s_vector r, q, q_t; magma_s_vinit( &r, Magma_DEV, dofs, c_zero, queue ); magma_s_vinit( &q, Magma_DEV, dofs*(ldh+1), c_zero, queue ); q_t.memory_location = Magma_DEV; q_t.dval = NULL; q_t.num_rows = q_t.nnz = dofs; q_t.num_cols = 1; float *dy = NULL, *dH = NULL; stat_dev += magma_smalloc( &dy, ldh ); stat_dev += magma_smalloc( &dH, (ldh+1)*ldh ); if( stat_dev != 0){ magma_free_pinned( H ); magma_free_pinned( y ); magma_free_pinned( HH ); magma_free_pinned( h1 ); magma_free( dH ); magma_free( dy ); magma_free( dH ); magma_free( dy ); magmablasSetKernelStream( orig_queue ); return MAGMA_ERR_DEVICE_ALLOC; } // GPU stream magma_queue_t stream[2]; magma_event_t event[1]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); magma_event_create( &event[0] ); //magmablasSetKernelStream(stream[0]); magma_sscal( dofs, c_zero, x->dval, 1 ); // x = 0 magma_scopy( dofs, b.dval, 1, r.dval, 1 ); // r = b nom0 = betanom = magma_snrm2( dofs, r.dval, 1 ); // nom0= || r|| nom = nom0 * nom0; solver_par->init_res = nom0; H(1,0) = MAGMA_S_MAKE( nom0, 0. ); magma_ssetvector(1, &H(1,0), 1, &dH(1,0), 1); if ( (r0 = nom0 * solver_par->epsilon ) < ATOLERANCE ){ r0 = solver_par->epsilon; } if ( nom < r0 ) { magmablasSetKernelStream( orig_queue ); return MAGMA_SUCCESS; } //Chronometry real_Double_t tempo1, tempo2; tempo1 = magma_sync_wtime( queue ); if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = nom0; solver_par->timing[0] = 0.0; } // start iteration for( solver_par->numiter= 1; solver_par->numiter<solver_par->maxiter; solver_par->numiter++ ) { for(k=1; k<=restart; k++) { magma_scopy(dofs, r.dval, 1, q(k-1), 1); // q[0] = 1.0/||r|| magma_sscal(dofs, 1./H(k,k-1), q(k-1), 1); // (to be fused) q_t.dval = q(k-1); //magmablasSetKernelStream(stream[0]); magma_s_spmv( c_one, A, q_t, c_zero, r, queue ); // r = A q[k] // if (solver_par->ortho == Magma_MGS ) { // modified Gram-Schmidt for (i=1; i<=k; i++) { H(i,k) =magma_sdot(dofs, q(i-1), 1, r.dval, 1); // H(i,k) = q[i] . r magma_saxpy(dofs,-H(i,k), q(i-1), 1, r.dval, 1); // r = r - H(i,k) q[i] } H(k+1,k) = MAGMA_S_MAKE( magma_snrm2(dofs, r.dval, 1), 0. 
); // H(k+1,k) = ||r|| /*} else if (solver_par->ortho == Magma_FUSED_CGS ) { // fusing sgemv with snrm2 in classical Gram-Schmidt magmablasSetKernelStream(stream[0]); magma_scopy(dofs, r.dval, 1, q(k), 1); // dH(1:k+1,k) = q[0:k] . r magmablas_sgemv(MagmaTrans, dofs, k+1, c_one, q(0), dofs, r.dval, 1, c_zero, &dH(1,k), 1); // r = r - q[0:k-1] dH(1:k,k) magmablas_sgemv(MagmaNoTrans, dofs, k, c_mone, q(0), dofs, &dH(1,k), 1, c_one, r.dval, 1); // 1) dH(k+1,k) = sqrt( dH(k+1,k) - dH(1:k,k) ) magma_scopyscale( dofs, k, r.dval, q(k), &dH(1,k) ); // 2) q[k] = q[k] / dH(k+1,k) magma_event_record( event[0], stream[0] ); magma_queue_wait_event( stream[1], event[0] ); magma_sgetvector_async(k+1, &dH(1,k), 1, &H(1,k), 1, stream[1]); // asynch copy dH(1:(k+1),k) to H(1:(k+1),k) } else { // classical Gram-Schmidt (default) // > explicitly calling magmabls magmablasSetKernelStream(stream[0]); magmablas_sgemv(MagmaTrans, dofs, k, c_one, q(0), dofs, r.dval, 1, c_zero, &dH(1,k), 1, queue ); // dH(1:k,k) = q[0:k-1] . r #ifndef SNRM2SCALE // start copying dH(1:k,k) to H(1:k,k) magma_event_record( event[0], stream[0] ); magma_queue_wait_event( stream[1], event[0] ); magma_sgetvector_async(k, &dH(1,k), 1, &H(1,k), 1, stream[1]); #endif // r = r - q[0:k-1] dH(1:k,k) magmablas_sgemv(MagmaNoTrans, dofs, k, c_mone, q(0), dofs, &dH(1,k), 1, c_one, r.dval, 1); #ifdef SNRM2SCALE magma_scopy(dofs, r.dval, 1, q(k), 1); // q[k] = r / H(k,k-1) magma_snrm2scale(dofs, q(k), dofs, &dH(k+1,k) ); // dH(k+1,k) = sqrt(r . r) and r = r / dH(k+1,k) magma_event_record( event[0], stream[0] ); // start sending dH(1:k,k) to H(1:k,k) magma_queue_wait_event( stream[1], event[0] ); // can we keep H(k+1,k) on GPU and combine? magma_sgetvector_async(k+1, &dH(1,k), 1, &H(1,k), 1, stream[1]); #else H(k+1,k) = MAGMA_S_MAKE( magma_snrm2(dofs, r.dval, 1), 0. ); // H(k+1,k) = sqrt(r . r) if ( k<solver_par->restart ) { magmablasSetKernelStream(stream[0]); magma_scopy(dofs, r.dval, 1, q(k), 1); // q[k] = 1.0/H[k][k-1] r magma_sscal(dofs, 1./H(k+1,k), q(k), 1); // (to be fused) } #endif }*/ /* Minimization of || b-Ax || in H_k */ for (i=1; i<=k; i++) { HH(k,i) = magma_cblas_sdot( i+1, &H(1,k), 1, &H(1,i), 1 ); } h1[k] = H(1,k)*H(1,0); if (k != 1) { for (i=1; i<k; i++) { HH(k,i) = HH(k,i)/HH(i,i);// for (m=i+1; m<=k; m++) { HH(k,m) -= HH(k,i) * HH(m,i) * HH(i,i); } h1[k] -= h1[i] * HH(k,i); } } y[k] = h1[k]/HH(k,k); if (k != 1) for (i=k-1; i>=1; i--) { y[i] = h1[i]/HH(i,i); for (j=i+1; j<=k; j++) y[i] -= y[j] * HH(j,i); } m = k; rNorm = fabs(MAGMA_S_REAL(H(k+1,k))); }/* Minimization done */ // compute solution approximation magma_ssetmatrix(m, 1, y+1, m, dy, m ); magma_sgemv(MagmaNoTrans, dofs, m, c_one, q(0), dofs, dy, 1, c_one, x->dval, 1); // compute residual magma_s_spmv( c_mone, A, *x, c_zero, r, queue ); // r = - A * x magma_saxpy(dofs, c_one, b.dval, 1, r.dval, 1); // r = r + b H(1,0) = MAGMA_S_MAKE( magma_snrm2(dofs, r.dval, 1), 0. 
); // RNorm = H[1][0] = || r || RNorm = MAGMA_S_REAL( H(1,0) ); betanom = fabs(RNorm); if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( betanom < r0 ) { break; } } tempo2 = magma_sync_wtime( queue ); solver_par->runtime = (real_Double_t) tempo2-tempo1; float residual; magma_sresidual( A, b, *x, &residual, queue ); solver_par->iter_res = betanom; solver_par->final_res = residual; if ( solver_par->numiter < solver_par->maxiter) { solver_par->info = MAGMA_SUCCESS; } else if ( solver_par->init_res > solver_par->final_res ) { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = MAGMA_SLOW_CONVERGENCE; } else { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = MAGMA_DIVERGENCE; } // free pinned memory magma_free_pinned( H ); magma_free_pinned( y ); magma_free_pinned( HH ); magma_free_pinned( h1 ); // free GPU memory magma_free(dy); if (dH != NULL ) magma_free(dH); magma_s_vfree(&r, queue ); magma_s_vfree(&q, queue ); // free GPU streams and events magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_event_destroy( event[0] ); //magmablasSetKernelStream(NULL); magmablasSetKernelStream( orig_queue ); return MAGMA_SUCCESS; } /* magma_sgmres */
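/*
 * Illustrative sketch (not part of the original MAGMA source): the modified
 * Gram-Schmidt step that the restart loop above performs with magma_sdot /
 * magma_saxpy / magma_snrm2, written as plain CPU loops.  r (length dofs,
 * on entry A*q_{k-1}) is orthogonalized against the k basis vectors stored
 * contiguously in q; hcol receives the k+1 Hessenberg coefficients of the
 * new column, with hcol[k] = ||r|| as the subdiagonal entry.  The function
 * name is hypothetical.
 */
#include <math.h>

static void example_mgs_step( magma_int_t dofs, magma_int_t k,
                              const float *q,   /* dofs-by-k basis, column-major */
                              float *r,         /* in: A*q_{k-1}; out: residual  */
                              float *hcol )     /* out: k+1 coefficients         */
{
    for ( magma_int_t i = 0; i < k; ++i ) {
        const float *qi = q + i*dofs;
        float h = 0.f;
        for ( magma_int_t j = 0; j < dofs; ++j )   /* h = q_i . r  */
            h += qi[j] * r[j];
        for ( magma_int_t j = 0; j < dofs; ++j )   /* r -= h * q_i */
            r[j] -= h * qi[j];
        hcol[i] = h;
    }
    float nrm2 = 0.f;
    for ( magma_int_t j = 0; j < dofs; ++j )
        nrm2 += r[j] * r[j];
    hcol[k] = sqrtf( nrm2 );                       /* subdiagonal ||r|| */
}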
/***************************************************************************//** Purpose ------- CGEQRF computes a QR factorization of a complex M-by-N matrix A: A = Q * R. This is a GPU interface of the routine. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dlA COMPLEX array of pointers on the GPU, dimension (ngpu). On entry, the M-by-N matrix A distributed over GPUs (d_lA[d] points to the local matrix on d-th GPU). It uses 1D block column cyclic format with the block size of nb, and each local matrix is stored by column. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] tau COMPLEX array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_geqrf *******************************************************************************/ extern "C" magma_int_t magma_cgeqrf2_mgpu( magma_int_t ngpu, magma_int_t m, magma_int_t n, magmaFloatComplex_ptr dlA[], magma_int_t ldda, magmaFloatComplex *tau, magma_int_t *info ) { #define dlA(dev, i, j) (dlA[dev] + (i) + (j)*(ldda)) #define hpanel(i) (hpanel + (i)) // set to NULL to make cleanup easy: free(NULL) does nothing. magmaFloatComplex *dwork[MagmaMaxGPUs]={NULL}, *dpanel[MagmaMaxGPUs]={NULL}; magmaFloatComplex *hwork=NULL, *hpanel=NULL; magma_queue_t queues[MagmaMaxGPUs][2]={{NULL}}; magma_event_t panel_event[MagmaMaxGPUs]={NULL}; magma_int_t i, j, min_mn, dev, ldhpanel, lddwork, rows; magma_int_t ib, nb; magma_int_t lhwork, lwork; magma_int_t panel_dev, i_local, i_nb_local, n_local[MagmaMaxGPUs], la_dev, dpanel_offset; *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } min_mn = min(m,n); if (min_mn == 0) return *info; magma_device_t orig_dev; magma_getdevice( &orig_dev ); nb = magma_get_cgeqrf_nb( m, n ); /* dwork is (n*nb) --- for T (nb*nb) and clarfb work ((n-nb)*nb) --- * + dpanel (ldda*nb), on each GPU. * I think clarfb work could be smaller, max(n_local[:]). * Oddly, T and clarfb work get stacked on top of each other, both with lddwork=n. * on GPU that owns panel, set dpanel = dlA(dev,i,i_local). * on other GPUs, set dpanel = dwork[dev] + dpanel_offset. 
*/ lddwork = n; dpanel_offset = lddwork*nb; for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); if ( MAGMA_SUCCESS != magma_cmalloc( &(dwork[dev]), (lddwork + ldda)*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto CLEANUP; } } /* hwork is MAX( workspace for cgeqrf (n*nb), two copies of T (2*nb*nb) ) * + hpanel (m*nb). * for last block, need 2*n*nb total. */ ldhpanel = m; lhwork = max( n*nb, 2*nb*nb ); lwork = max( lhwork + ldhpanel*nb, 2*n*nb ); if ( MAGMA_SUCCESS != magma_cmalloc_pinned( &hwork, lwork )) { *info = MAGMA_ERR_HOST_ALLOC; goto CLEANUP; } hpanel = hwork + lhwork; /* Set the number of local n for each GPU */ for( dev=0; dev < ngpu; dev++ ) { n_local[dev] = ((n/nb)/ngpu)*nb; if (dev < (n/nb) % ngpu) n_local[dev] += nb; else if (dev == (n/nb) % ngpu) n_local[dev] += n % nb; } for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); magma_queue_create( dev, &queues[dev][0] ); magma_queue_create( dev, &queues[dev][1] ); magma_event_create( &panel_event[dev] ); } if ( nb < min_mn ) { /* Use blocked code initially */ // Note: as written, ib cannot be < nb. for( i = 0; i < min_mn-nb; i += nb ) { /* Set the GPU number that holds the current panel */ panel_dev = (i/nb) % ngpu; /* Set the local index where the current panel is (j == i) */ i_local = i/(nb*ngpu)*nb; ib = min(min_mn-i, nb); rows = m-i; /* Send current panel to the CPU, after panel_event indicates it has been updated */ magma_setdevice( panel_dev ); magma_queue_wait_event( queues[panel_dev][1], panel_event[panel_dev] ); magma_cgetmatrix_async( rows, ib, dlA(panel_dev, i, i_local), ldda, hpanel(i), ldhpanel, queues[panel_dev][1] ); magma_queue_sync( queues[panel_dev][1] ); // Factor panel lapackf77_cgeqrf( &rows, &ib, hpanel(i), &ldhpanel, tau+i, hwork, &lhwork, info ); if ( *info != 0 ) { fprintf( stderr, "error %lld\n", (long long) *info ); } // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . H(i+ib-1) lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, hpanel(i), &ldhpanel, tau+i, hwork, &ib ); magma_cpanel_to_q( MagmaUpper, ib, hpanel(i), ldhpanel, hwork + ib*ib ); // Send the current panel back to the GPUs for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); if (dev == panel_dev) dpanel[dev] = dlA(dev, i, i_local); else dpanel[dev] = dwork[dev] + dpanel_offset; magma_csetmatrix_async( rows, ib, hpanel(i), ldhpanel, dpanel[dev], ldda, queues[dev][0] ); } for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); magma_queue_sync( queues[dev][0] ); } // TODO: if magma_cpanel_to_q copied whole block, wouldn't need to restore // -- just send the copy to the GPUs. // TODO: also, could zero out the lower triangle and use Azzam's larfb w/ gemm. /* Restore the panel */ magma_cq_to_panel( MagmaUpper, ib, hpanel(i), ldhpanel, hwork + ib*ib ); if (i + ib < n) { /* Send the T matrix to the GPU. 
*/ for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); magma_csetmatrix_async( ib, ib, hwork, ib, dwork[dev], lddwork, queues[dev][0] ); } la_dev = (panel_dev+1) % ngpu; for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); if (dev == la_dev && i+nb < min_mn-nb) { // If not last panel, // for look-ahead panel, apply H' to A(i:m,i+ib:i+2*ib) i_nb_local = (i+nb)/(nb*ngpu)*nb; magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local), ldda, // C dwork[dev]+ib, lddwork, // work queues[dev][0] ); magma_event_record( panel_event[dev], queues[dev][0] ); // for trailing matrix, apply H' to A(i:m,i+2*ib:n) magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n_local[dev]-(i_nb_local+ib), ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local+ib), ldda, // C dwork[dev]+ib, lddwork, // work queues[dev][0] ); } else { // for trailing matrix, apply H' to A(i:m,i+ib:n) i_nb_local = i_local; if (dev <= panel_dev) { i_nb_local += ib; } magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n_local[dev]-i_nb_local, ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local), ldda, // C dwork[dev]+ib, lddwork, // work queues[dev][0] ); } } // Restore top of panel (after larfb is done) magma_setdevice( panel_dev ); magma_csetmatrix_async( ib, ib, hpanel(i), ldhpanel, dlA(panel_dev, i, i_local), ldda, queues[panel_dev][0] ); } } } else { i = 0; } /* Use unblocked code to factor the last or only block row. */ if (i < min_mn) { rows = m-i; for( j=i; j < n; j += nb ) { panel_dev = (j/nb) % ngpu; i_local = j/(nb*ngpu)*nb; ib = min( n-j, nb ); magma_setdevice( panel_dev ); magma_cgetmatrix( rows, ib, dlA(panel_dev, i, i_local), ldda, hwork + (j-i)*rows, rows, queues[panel_dev][0] ); } // needs lwork >= 2*n*nb: // needs (m-i)*(n-i) for last block row, bounded by nb*n. // needs (n-i)*nb for cgeqrf work, bounded by n*nb. ib = n-i; // total columns in block row lhwork = lwork - ib*rows; lapackf77_cgeqrf( &rows, &ib, hwork, &rows, tau+i, hwork + ib*rows, &lhwork, info ); if ( *info != 0 ) { fprintf( stderr, "error %lld\n", (long long) *info ); } for( j=i; j < n; j += nb ) { panel_dev = (j/nb) % ngpu; i_local = j/(nb*ngpu)*nb; ib = min( n-j, nb ); magma_setdevice( panel_dev ); magma_csetmatrix( rows, ib, hwork + (j-i)*rows, rows, dlA(panel_dev, i, i_local), ldda, queues[panel_dev][0] ); } } CLEANUP: // free(NULL) does nothing. for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); magma_queue_destroy( queues[dev][0] ); magma_queue_destroy( queues[dev][1] ); magma_event_destroy( panel_event[dev] ); magma_free( dwork[dev] ); } magma_free_pinned( hwork ); magma_setdevice( orig_dev ); return *info; } /* magma_cgeqrf2_mgpu */
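/*
 * Illustrative sketch (not part of the original MAGMA source): the
 * "number of local columns" computation used above for the 1-D block
 * column cyclic layout with block size nb over ngpu GPUs, plus a check
 * that the local counts add up to n.  The function name is hypothetical;
 * the formulas are exactly those in magma_cgeqrf2_mgpu.
 */
#include <assert.h>

static void example_compute_n_local( magma_int_t n, magma_int_t nb, magma_int_t ngpu,
                                     magma_int_t n_local[] )
{
    magma_int_t full_blocks = n / nb;                 /* complete nb-wide blocks */
    for ( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        n_local[dev] = (full_blocks / ngpu) * nb;     /* evenly shared part      */
        if      ( dev <  full_blocks % ngpu ) n_local[dev] += nb;      /* one extra full block  */
        else if ( dev == full_blocks % ngpu ) n_local[dev] += n % nb;  /* the ragged last block */
    }
    magma_int_t total = 0;
    for ( magma_int_t dev = 0; dev < ngpu; ++dev )
        total += n_local[dev];
    assert( total == n );                             /* every column is owned exactly once */
}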
/** Purpose ======= SSYTRF_nopiv computes the LDLt factorization of a real symmetric matrix A. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The factorization has the form A = U^H * D * U, if UPLO = MagmaUpper, or A = L * D * L^H, if UPLO = MagmaLower, where U is a unit upper triangular matrix, L is unit lower triangular, and D is a diagonal matrix. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, if INFO = 0, the factor U or L from the factorization A = U^H D U or A = L D L^H. \n Higher performance is achieved if A is in pinned memory. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value if INFO = -6, the GPU memory allocation failed - > 0: if INFO = i, the factorization could not be completed because a zero pivot was encountered in D(i,i). @ingroup magma_ssysv_comp ******************************************************************* */ extern "C" magma_int_t magma_ssytrf_nopiv( magma_uplo_t uplo, magma_int_t n, float *A, magma_int_t lda, magma_int_t *info) { #define A(i, j) ( A +(j)*lda + (i)) #define dA(i, j) (dA +(j)*ldda + (i)) #define dW(i, j) (dW +(j)*ldda + (i)) #define dWt(i, j) (dW +(j)*nb + (i)) /* Constants */ const float c_one = MAGMA_S_ONE; const float c_neg_one = MAGMA_S_NEG_ONE; /* Local variables */ bool upper = (uplo == MagmaUpper); magma_int_t j, k, jb, ldda, nb, ib, iinfo; magmaFloat_ptr dA; magmaFloat_ptr dW; *info = 0; if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return MAGMA_ERR_ILLEGAL_VALUE; } /* Quick return */ if ( n == 0 ) return MAGMA_SUCCESS; ldda = magma_roundup( n, 32 ); nb = magma_get_ssytrf_nopiv_nb(n); ib = min(32, nb); // inner-block for diagonal factorization if ((MAGMA_SUCCESS != magma_smalloc(&dA, n *ldda)) || (MAGMA_SUCCESS != magma_smalloc(&dW, nb*ldda))) { /* allocation failed; return error (no CPU fallback implemented) */ *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_device_t cdev; magma_queue_t queues[2]; magma_event_t event; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[0] ); magma_queue_create( cdev, &queues[1] ); magma_event_create( &event ); trace_init( 1, 1, 2, queues ); /* Use hybrid blocked code. */ if (upper) { //========================================================= // Compute the LDLt factorization A = U'*D*U without pivoting.
// copy matrix to GPU for (j=0; j < n; j += nb) { jb = min(nb, (n-j)); trace_gpu_start( 0, 0, "set", "set" ); magma_ssetmatrix_async(j+jb, jb, A(0, j), lda, dA(0, j), ldda, queues[0]); trace_gpu_end( 0, 0 ); } // main loop for (j=0; j < n; j += nb) { jb = min(nb, (n-j)); // copy A(j,j) back to CPU trace_gpu_start( 0, 0, "get", "get" ); if ( j != 0) { //magma_event_sync(event); magma_sgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, queues[1]); } trace_gpu_end( 0, 0 ); // factorize the diagonal block magma_queue_sync(queues[1]); trace_cpu_start( 0, "potrf", "potrf" ); magma_ssytrf_nopiv_cpu( MagmaUpper, jb, ib, A(j, j), lda, info ); trace_cpu_end( 0 ); if (*info != 0) { *info = *info + j; break; } // copy A(j,j) back to GPU trace_gpu_start( 0, 0, "set", "set" ); magma_ssetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, queues[0]); trace_gpu_end( 0, 0 ); // copy j-th column of U back to CPU trace_gpu_start( 0, 1, "get", "get" ); magma_sgetmatrix_async(j, jb, dA(0, j), ldda, A(0, j), lda, queues[1]); trace_gpu_end( 0, 1 ); if ( (j+jb) < n) { // compute the off-diagonal blocks of current block column trace_gpu_start( 0, 0, "trsm", "trsm" ); magma_strsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaUnit, jb, (n-j-jb), c_one, dA(j, j), ldda, dA(j, j+jb), ldda, queues[0] ); magma_scopymatrix( jb, n-j-jb, dA( j, j+jb ), ldda, dWt( 0, j+jb ), nb, queues[0] ); // update the trailing submatrix with D magmablas_slascl_diag( MagmaUpper, jb, n-j-jb, dA(j, j), ldda, dA(j, j+jb), ldda, queues[0], &iinfo); trace_gpu_end( 0, 0 ); // update the trailing submatrix with U and W trace_gpu_start( 0, 0, "gemm", "gemm" ); for (k=j+jb; k < n; k += nb) { magma_int_t kb = min(nb,n-k); magma_sgemm( MagmaConjTrans, MagmaNoTrans, kb, n-k, jb, c_neg_one, dWt(0, k), nb, dA(j, k), ldda, c_one, dA(k, k), ldda, queues[0]); if (k == j+jb) { // magma_event_record( event, queues[0] ); magma_queue_sync( queues[0] ); } } trace_gpu_end( 0, 0 ); } } } else { //========================================================= // Compute the LDLt factorization A = L*D*L' without pivoting. 
// copy the matrix to GPU for (j=0; j < n; j += nb) { jb = min(nb, (n-j)); trace_gpu_start( 0, 0, "set", "set" ); magma_ssetmatrix_async((n-j), jb, A(j, j), lda, dA(j, j), ldda, queues[0]); trace_gpu_end( 0, 0 ); } // main loop for (j=0; j < n; j += nb) { jb = min(nb, (n-j)); // copy A(j,j) back to CPU trace_gpu_start( 0, 0, "get", "get" ); if (j != 0) { //magma_event_sync(event); magma_sgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, queues[1]); } trace_gpu_end( 0, 0 ); // factorize the diagonal block magma_queue_sync(queues[1]); trace_cpu_start( 0, "potrf", "potrf" ); magma_ssytrf_nopiv_cpu( MagmaLower, jb, ib, A(j, j), lda, info ); trace_cpu_end( 0 ); if (*info != 0) { *info = *info + j; break; } // copy A(j,j) back to GPU trace_gpu_start( 0, 0, "set", "set" ); magma_ssetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, queues[0]); trace_gpu_end( 0, 0 ); // copy j-th row of L back to CPU trace_gpu_start( 0, 1, "get", "get" ); magma_sgetmatrix_async(jb, j, dA(j, 0), ldda, A(j, 0), lda, queues[1]); trace_gpu_end( 0, 1 ); if ( (j+jb) < n) { // compute the off-diagonal blocks of current block column trace_gpu_start( 0, 0, "trsm", "trsm" ); magma_strsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaUnit, (n-j-jb), jb, c_one, dA(j, j), ldda, dA(j+jb, j), ldda, queues[0] ); magma_scopymatrix( n-j-jb,jb, dA( j+jb, j ), ldda, dW( j+jb, 0 ), ldda, queues[0] ); // update the trailing submatrix with D magmablas_slascl_diag( MagmaLower, n-j-jb, jb, dA(j, j), ldda, dA(j+jb, j), ldda, queues[0], &iinfo ); trace_gpu_end( 0, 0 ); // update the trailing submatrix with L and W trace_gpu_start( 0, 0, "gemm", "gemm" ); for (k=j+jb; k < n; k += nb) { magma_int_t kb = min(nb,n-k); magma_sgemm( MagmaNoTrans, MagmaConjTrans, n-k, kb, jb, c_neg_one, dA(k, j), ldda, dW(k, 0), ldda, c_one, dA(k, k), ldda, queues[0] ); if (k == j+jb) { //magma_event_record( event, queues[0] ); magma_queue_sync(queues[0]); } } trace_gpu_end( 0, 0 ); } } } trace_finalize( "ssytrf.svg","trace.css" ); magma_queue_destroy(queues[0]); magma_queue_destroy(queues[1]); magma_event_destroy( event ); magma_free(dW); magma_free(dA); return MAGMA_SUCCESS; } /* magma_ssytrf_nopiv */
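/* Illustrative usage sketch for magma_ssytrf_nopiv above; this is not library code.
   It factors a small diagonally dominant symmetric matrix held on the host; pinned
   memory is used only because the documentation notes it gives higher performance.
   The helper name and the test matrix are placeholders. */
#include <stdio.h>
#include "magma_v2.h"

void example_ssytrf_nopiv( magma_int_t n )
{
    float *A;
    magma_int_t lda = n, info = 0;

    magma_init();
    magma_smalloc_pinned( &A, lda*n );

    // simple diagonally dominant symmetric test matrix
    for (magma_int_t j = 0; j < n; ++j)
        for (magma_int_t i = 0; i < n; ++i)
            A[i + j*lda] = (i == j) ? (float)(n + 1) : 1.0f;

    magma_ssytrf_nopiv( MagmaLower, n, A, lda, &info );
    if (info != 0)
        printf( "magma_ssytrf_nopiv returned info = %lld\n", (long long) info );

    magma_free_pinned( A );
    magma_finalize();
}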
/***************************************************************************//** Purpose ------- DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = MagmaUpper, or dA = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of dA is stored; - = MagmaLower: Lower triangle of dA is stored. @param[in] n INTEGER The order of the matrix dA. N >= 0. @param[in,out] d_lA DOUBLE PRECISION array of pointers on the GPU, dimension (ngpu) On entry, the symmetric matrix dA distributed over GPUs (d_lA[d] points to the local matrix on the d-th GPU). It is distributed in 1D block column or row cyclic (with the block size of nb) if UPLO = MagmaUpper or MagmaLower, respectively. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. @param[in] ldda INTEGER The leading dimension of the array d_lA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_potrf *******************************************************************************/ extern "C" magma_int_t magma_dpotrf_mgpu( magma_int_t ngpu, magma_uplo_t uplo, magma_int_t n, magmaDouble_ptr d_lA[], magma_int_t ldda, magma_int_t *info) { magma_int_t j, nb, d, lddp, h; const char* uplo_ = lapack_uplo_const( uplo ); double *work; bool upper = (uplo == MagmaUpper); double *dwork[MagmaMaxGPUs]; magma_queue_t queues[MagmaMaxGPUs][3]; magma_event_t event[MagmaMaxGPUs][5]; *info = 0; nb = magma_get_dpotrf_nb(n); if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (!upper) { lddp = nb*(n/(nb*ngpu)); if ( n%(nb*ngpu) != 0 ) lddp += min(nb, n-ngpu*lddp); if ( ldda < lddp ) *info = -4; } else if ( ldda < n ) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); if (ngpu == 1 && ((nb <= 1) || (nb >= n)) ) { /* Use unblocked code. 
*/ magma_setdevice(0); magma_queue_create( 0, &queues[0][0] ); if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, n*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_dgetmatrix( n, n, d_lA[0], ldda, work, n, queues[0][0] ); lapackf77_dpotrf(uplo_, &n, work, &n, info); magma_dsetmatrix( n, n, work, n, d_lA[0], ldda, queues[0][0] ); magma_free_pinned( work ); magma_queue_destroy( queues[0][0] ); } else { lddp = magma_roundup( n, nb ); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_dmalloc( &dwork[d], ngpu*nb*lddp )) { for( j=0; j < d; j++ ) { magma_setdevice(j); magma_free( dwork[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } for( j=0; j < 3; j++ ) { magma_queue_create( d, &queues[d][j] ); } for( j=0; j < 5; j++ ) { magma_event_create( &event[d][j] ); } } magma_setdevice(0); h = 1; //ngpu; //magma_ceildiv( n, nb ); if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, n*nb*h )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } if (upper) { /* with three queues */ magma_dpotrf3_mgpu(ngpu, uplo, n, n, 0, 0, nb, d_lA, ldda, dwork, lddp, work, n, h, queues, event, info); } else { /* with three queues */ magma_dpotrf3_mgpu(ngpu, uplo, n, n, 0, 0, nb, d_lA, ldda, dwork, lddp, work, nb*h, h, queues, event, info); } /* clean up */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); for( j=0; j < 3; j++ ) { magma_queue_sync( queues[d][j] ); magma_queue_destroy( queues[d][j] ); } for( j=0; j < 5; j++ ) magma_event_destroy( event[d][j] ); magma_free( dwork[d] ); } magma_free_pinned( work ); } /* end of not lapack */ magma_setdevice( orig_dev ); return *info; } /* magma_dpotrf_mgpu */
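/* Illustrative sketch, not library code: distribute a host matrix over ngpu GPUs in the
   1D block-column-cyclic layout that magma_dpotrf_mgpu above expects for UPLO = MagmaUpper,
   then factor it. The mapping assumed here -- block column J of width nb on GPU J % ngpu at
   local block offset J / ngpu -- matches the (j/nb) % ngpu indexing used elsewhere in this
   file, but treat it as an assumption and compare with the MAGMA testers. MAGMA is assumed
   to be initialized already. */
#include <stdio.h>
#include "magma_v2.h"

void example_dpotrf_mgpu( magma_int_t ngpu, magma_int_t n, const double *A, magma_int_t lda )
{
    magma_int_t nb      = magma_get_dpotrf_nb( n );
    magma_int_t ldda    = magma_roundup( n, 32 );              // >= n, as required for MagmaUpper
    magma_int_t nblocks = magma_ceildiv( n, nb );
    magma_int_t nloc    = magma_ceildiv( nblocks, ngpu ) * nb; // upper bound on local columns
    magma_int_t info    = 0;

    magmaDouble_ptr d_lA[ MagmaMaxGPUs ];
    magma_queue_t   queues[ MagmaMaxGPUs ];

    for (magma_int_t d = 0; d < ngpu; ++d) {
        magma_setdevice( d );
        magma_queue_create( d, &queues[d] );
        magma_dmalloc( &d_lA[d], ldda*nloc );
    }

    // send each block column of width nb to its owning GPU
    for (magma_int_t j = 0; j < n; j += nb) {
        magma_int_t jb   = min( nb, n - j );
        magma_int_t dev  = (j/nb) % ngpu;
        magma_int_t jloc = (j/(nb*ngpu)) * nb;
        magma_setdevice( dev );
        magma_dsetmatrix( n, jb, A + j*lda, lda,
                          d_lA[dev] + jloc*ldda, ldda, queues[dev] );
    }

    magma_dpotrf_mgpu( ngpu, MagmaUpper, n, d_lA, ldda, &info );
    if (info != 0)
        printf( "magma_dpotrf_mgpu returned info = %lld\n", (long long) info );

    for (magma_int_t d = 0; d < ngpu; ++d) {
        magma_setdevice( d );
        magma_queue_destroy( queues[d] );
        magma_free( d_lA[d] );
    }
}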
extern "C" magma_int_t magma_cpotrf_mgpu(magma_int_t num_gpus, char uplo, magma_int_t n, magmaFloatComplex **d_lA, magma_int_t ldda, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= CPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = 'U', or dA = L * L**H, if UPLO = 'L', where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of dA is stored; = 'L': Lower triangle of dA is stored. N (input) INTEGER The order of the matrix dA. N >= 0. dA (input/output) COMPLEX array on the GPU, dimension (LDDA,N) On entry, the Hermitian matrix dA. If UPLO = 'U', the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be dividable by 16. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. ===================================================================== */ magma_int_t j, nb, d, lddp, h; char uplo_[2] = {uplo, 0}; magmaFloatComplex *work; int upper = lapackf77_lsame(uplo_, "U"); magmaFloatComplex *dwork[MagmaMaxGPUs]; magma_queue_t stream[MagmaMaxGPUs][3]; magma_event_t event[MagmaMaxGPUs][5]; *info = 0; nb = magma_get_cpotrf_nb(n); if ( (! upper) && (! lapackf77_lsame(uplo_, "L")) ) { *info = -1; } else if (n < 0) { *info = -2; } else if (!upper) { lddp = nb*(n/(nb*num_gpus)); if( n%(nb*num_gpus) != 0 ) lddp+=min(nb,n-num_gpus*lddp); if( ldda < lddp ) *info = -4; } else if( ldda < n ) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (num_gpus == 1 && ((nb <= 1) || (nb >= n)) ) { /* Use unblocked code. 
*/ magma_setdevice(0); if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, n*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_cgetmatrix( n, n, d_lA[0], ldda, work, n ); lapackf77_cpotrf(uplo_, &n, work, &n, info); magma_csetmatrix( n, n, work, n, d_lA[0], ldda ); magma_free_pinned( work ); } else { lddp = nb*((n+nb-1)/nb); for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_cmalloc( &dwork[d], num_gpus*nb*lddp )) { for( j=0; j<d; j++ ) { magma_setdevice(j); magma_free( dwork[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } for( j=0; j<3; j++ ) magma_queue_create( &stream[d][j] ); for( j=0; j<5; j++ ) magma_event_create( &event[d][j] ); } magma_setdevice(0); h = 1; //num_gpus; //(n+nb-1)/nb; if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, n*nb*h )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } if (upper) { /* with two streams */ //magma_cpotrf2_mgpu(num_gpus, uplo, n, n, 0, 0, nb, d_lA, ldda, dwork, lddp, work, n, // h, stream, event, info); /* with three streams */ magma_cpotrf3_mgpu(num_gpus, uplo, n, n, 0, 0, nb, d_lA, ldda, dwork, lddp, work, n, h, stream, event, info); } else { /* with two streams */ //magma_cpotrf2_mgpu(num_gpus, uplo, n, n, 0, 0, nb, d_lA, ldda, dwork, lddp, work, nb*h, // h, stream, event, info); /* with three streams */ magma_cpotrf3_mgpu(num_gpus, uplo, n, n, 0, 0, nb, d_lA, ldda, dwork, lddp, work, nb*h, h, stream, event, info); } /* clean up */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); for( j=0; j<3; j++ ) { magma_queue_sync( stream[d][j] ); magma_queue_destroy( stream[d][j] ); } magmablasSetKernelStream(NULL); for( j=0; j<5; j++ ) magma_event_destroy( event[d][j] ); magma_free( dwork[d] ); } magma_setdevice(0); magma_free_pinned( work ); } /* end of not lapack */ return *info; } /* magma_cpotrf_mgpu */
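/* Illustrative helper, not library code: both magma_dpotrf_mgpu and magma_cpotrf_mgpu above
   validate ldda against the following quantity when UPLO = MagmaLower, i.e. when the matrix
   is distributed 1D block-row cyclically. It is the largest number of rows stored locally on
   any GPU, and hence the smallest legal local leading dimension for that layout. The helper
   name is hypothetical. */
static magma_int_t min_ldda_lower( magma_int_t n, magma_int_t nb, magma_int_t ngpu )
{
    magma_int_t lddp = nb * (n / (nb*ngpu));   // rows from complete block-row cycles
    if ( n % (nb*ngpu) != 0 )                  // leftover rows land on the first GPU(s)
        lddp += min( nb, n - ngpu*lddp );
    return lddp;
}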
/** Purpose ======= DSYTRF computes the factorization of a real symmetric matrix A using the Bunch-Kaufman diagonal pivoting method. The form of the factorization is A = U*D*U^H or A = L*D*L^H where U (or L) is a product of permutation and unit upper (lower) triangular matrices, and D is symmetric and block diagonal with 1-by-1 and 2-by-2 diagonal blocks. This is the blocked version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A DOUBLE PRECISION array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, the block diagonal matrix D and the multipliers used to obtain the factor U or L (see below for further details). @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] ipiv INTEGER array, dimension (N) Details of the interchanges and the block structure of D. If IPIV(k) > 0, then rows and columns k and IPIV(k) were interchanged and D(k,k) is a 1-by-1 diagonal block. If UPLO = MagmaUpper and IPIV(k) = IPIV(k-1) < 0, then rows and columns k-1 and -IPIV(k) were interchanged and D(k-1:k,k-1:k) is a 2-by-2 diagonal block. If UPLO = MagmaLower and IPIV(k) = IPIV(k+1) < 0, then rows and columns k+1 and -IPIV(k) were interchanged and D(k:k+1,k:k+1) is a 2-by-2 diagonal block. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, D(i,i) is exactly zero. The factorization has been completed, but the block diagonal matrix D is exactly singular, and division by zero will occur if it is used to solve a system of equations. Further Details =============== If UPLO = MagmaUpper, then A = U*D*U', where U = P(n)*U(n)* ... *P(k)U(k)* ..., i.e., U is a product of terms P(k)*U(k), where k decreases from n to 1 in steps of 1 or 2, and D is a block diagonal matrix with 1-by-1 and 2-by-2 diagonal blocks D(k). P(k) is a permutation matrix as defined by IPIV(k), and U(k) is a unit upper triangular matrix, such that if the diagonal block D(k) is of order s (s = 1 or 2), then ( I v 0 ) k-s U(k) = ( 0 I 0 ) s ( 0 0 I ) n-k k-s s n-k If s = 1, D(k) overwrites A(k,k), and v overwrites A(1:k-1,k). If s = 2, the upper triangle of D(k) overwrites A(k-1,k-1), A(k-1,k), and A(k,k), and v overwrites A(1:k-2,k-1:k). If UPLO = MagmaLower, then A = L*D*L', where L = P(1)*L(1)* ... *P(k)*L(k)* ..., i.e., L is a product of terms P(k)*L(k), where k increases from 1 to n in steps of 1 or 2, and D is a block diagonal matrix with 1-by-1 and 2-by-2 diagonal blocks D(k). P(k) is a permutation matrix as defined by IPIV(k), and L(k) is a unit lower triangular matrix, such that if the diagonal block D(k) is of order s (s = 1 or 2), then ( I 0 0 ) k-1 L(k) = ( 0 I 0 ) s ( 0 v I ) n-k-s+1 k-1 s n-k-s+1 If s = 1, D(k) overwrites A(k,k), and v overwrites A(k+1:n,k). If s = 2, the lower triangle of D(k) overwrites A(k,k), A(k+1,k), and A(k+1,k+1), and v overwrites A(k+2:n,k:k+1). 
@ingroup magma_dsysv_comp ********************************************************************/ extern "C" magma_int_t magma_dsytrf( magma_uplo_t uplo, magma_int_t n, double *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info) { #define A(i_, j_) ( A + (i_) + (j_)*lda ) #define dA(i_, j_) (dA + (i_) + (j_)*ldda) /* .. Local Scalars .. */ magma_int_t nb = magma_get_dsytrf_nb(n); magma_int_t iinfo = 0, nk, kb, j, k; /* Test the input parameters. */ *info = 0; bool upper = (uplo == MagmaUpper); if ( ! upper && uplo != MagmaLower ) { *info = -1; } else if ( n < 0 ) { *info = -2; } else if ( lda < max( 1, n ) ) { *info = -4; } if ( *info != 0 ) { magma_xerbla( __func__, -(*info) ); return *info; } // TODO fix memory leak of dA if dW fails magma_int_t ldda = magma_roundup( n, 32 ); magmaDouble_ptr dA, dW; if ((MAGMA_SUCCESS != magma_dmalloc( &dA, n*ldda )) || (MAGMA_SUCCESS != magma_dmalloc( &dW, (1+nb)*ldda ))) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_t queues[2]; magma_event_t event[2]; magma_queue_create( cdev, &queues[0] ); magma_queue_create( cdev, &queues[1] ); magma_event_create( &event[0] ); magma_event_create( &event[1] ); trace_init( 1, 1, 2, queues ); /* copy matrix to GPU */ trace_gpu_start( 0, 0, "set", "setA" ); //magma_dsetmatrix_async( n, n, A(0,0), lda, dA(0,0), ldda, queues[0] ); if ( upper ) { for (k = 0; k < n; k += nb ) { kb = min(nb, n-k); magma_dsetmatrix_async( k+kb, kb, A(0,k), lda, dA(0,k), ldda, queues[0] ); } } else { for (k = 0; k < n; k += nb ) { kb = min(nb, n-k); magma_dsetmatrix_async( n-k, kb, A(k,k), lda, dA(k,k), ldda, queues[0] ); } } trace_gpu_end( 0, 0 ); if ( upper ) { /* Factorize A as U*D*U' using the upper triangle of A K is the main loop index, decreasing from N to 1 in steps of KB, where KB is the number of columns factorized by DLASYF; KB is either NB or NB-1, or K for the last block */ kb = min(n,nb); for (k = n-1; k >= 0; k -= kb ) { nk = k+1; kb = min(nb, nk); if ( k+1 > nb ) { /* Factorize columns k-kb+1:k of A and use blocked code to update columns 1:k-kb */ magma_dlasyf_gpu( MagmaUpper, nk, kb, &kb, A( 0, 0 ), lda, dA( 0, 0 ), ldda, &ipiv[0], dW, ldda, queues, event, &iinfo ); } else { /* Use unblocked code to factorize columns 1:k of A */ magma_queue_sync( queues[0] ); magma_dgetmatrix( nk, nk, dA( 0, 0 ), ldda, A( 0, 0 ), lda, queues[0] ); lapackf77_dsytf2( MagmaUpperStr, &nk, A( 0, 0 ), &lda, &ipiv[0], &iinfo ); kb = k+1; } /* Set INFO on the first occurrence of a zero pivot */ if ( *info == 0 && iinfo > 0 ) *info = iinfo; } } else { /* Factorize A as L*D*L' using the lower triangle of A K is the main loop index, increasing from 1 to N in steps of KB, where KB is the number of columns factorized by DLASYF; KB is either NB or NB-1, or N-K+1 for the last block */ for (k = 0; k < n; k += kb ) { nk = n-k; kb = min(nb, n - k); if ( k < n-nb ) { /* Factorize columns k:k+kb-1 of A and use blocked code to update columns k+kb:n */ magma_dlasyf_gpu( MagmaLower, nk, nb, &kb, A( k, k ), lda, dA( k, k ), ldda, &ipiv[k], dW, ldda, queues, event, &iinfo ); } else { /* Use unblocked code to factorize columns k:n of A */ magma_queue_sync( queues[0] ); magma_dgetmatrix( nk, nk, dA(k,k), ldda, A(k,k), lda, queues[0] ); lapackf77_dsytf2( MagmaLowerStr, &nk, A( k, k ), &lda, &ipiv[k], &iinfo ); } /* Set INFO on the first occurrence of a zero pivot */ if ( *info == 0 && iinfo > 0 ) *info = iinfo + k; /* Adjust IPIV */ for (j = k; j < k + kb; j ++) { if ( ipiv[j] > 0 ) { ipiv[j] = ipiv[j] + 
k; } else { ipiv[j] = ipiv[j] - k; } } } } trace_finalize( "dsytrf.svg", "trace.css" ); magma_queue_sync( queues[0] ); magma_queue_sync( queues[1] ); magma_event_destroy( event[0] ); magma_event_destroy( event[1] ); magma_queue_destroy( queues[0] ); magma_queue_destroy( queues[1] ); magma_free( dA ); magma_free( dW ); return *info; } /* End of DSYTRF */
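/* Illustrative sketch, not library code: factor a symmetric matrix with magma_dsytrf above,
   then walk the returned IPIV array following the "Further Details" description to count the
   1-by-1 and 2-by-2 diagonal blocks of D. The test matrix and helper name are placeholders;
   magma_imalloc_cpu / magma_free_cpu are assumed to be the usual MAGMA host allocators. */
#include <stdio.h>
#include "magma_v2.h"

void example_dsytrf( magma_int_t n )
{
    double *A;
    magma_int_t *ipiv;
    magma_int_t lda = n, info = 0;

    magma_init();
    magma_dmalloc_pinned( &A, lda*n );
    magma_imalloc_cpu( &ipiv, n );

    // simple symmetric (indefinite) test matrix
    for (magma_int_t j = 0; j < n; ++j)
        for (magma_int_t i = 0; i < n; ++i)
            A[i + j*lda] = 1.0 / (1.0 + i + j);

    magma_dsytrf( MagmaLower, n, A, lda, ipiv, &info );

    if (info == 0) {
        magma_int_t n1 = 0, n2 = 0;
        for (magma_int_t k = 0; k < n; ) {
            if (ipiv[k] > 0) { n1 += 1; k += 1; }   // 1-by-1 block: rows k and ipiv[k] were swapped
            else             { n2 += 1; k += 2; }   // 2-by-2 block: ipiv[k] == ipiv[k+1] < 0
        }
        printf( "%lld 1x1 and %lld 2x2 diagonal blocks of D\n",
                (long long) n1, (long long) n2 );
    }

    magma_free_pinned( A );
    magma_free_cpu( ipiv );
    magma_finalize();
}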
extern "C" magma_int_t magma_ssytrf_nopiv(magma_uplo_t uplo, magma_int_t n, float *A, magma_int_t lda, magma_int_t *info) { /* -- MAGMA (version 1.6.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2011 Purpose ======= SSYTRF_nopiv computes the LDLt factorization of a real symmetric matrix A. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The factorization has the form A = U\*\*H * D * U, if UPLO = 'U', or A = L * D * L\*\*H, if UPLO = 'L', where U is an upper triangular matrix, L is lower triangular, and D is a diagonal matrix. This is the block version of the algorithm, calling Level 3 BLAS. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) REAL array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization A = U\*\*H*U or A = L*L\*\*H. Higher performance is achieved if A is in pinned memory, e.g. allocated using cudaMallocHost. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value if INFO = -6, the GPU memory allocation failed > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. ===================================================================== */ /* Local variables */ float zone = MAGMA_S_ONE; float mzone = MAGMA_S_NEG_ONE; int upper = (uplo == MagmaUpper); magma_int_t j, k, jb, ldda, nb, ib, iinfo; magmaFloat_ptr dA; magmaFloat_ptr dW; *info = 0; if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return MAGMA_ERR_ILLEGAL_VALUE; } /* Quick return */ if ( n == 0 ) return MAGMA_SUCCESS; ldda = ((n+31)/32)*32; nb = magma_get_ssytrf_nopiv_nb(n); ib = min(32, nb); // inner-block for diagonal factorization if ((MAGMA_SUCCESS != magma_smalloc(&dA, n *ldda)) || (MAGMA_SUCCESS != magma_smalloc(&dW, nb*ldda))) { /* alloc failed so call the non-GPU-resident version */ *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_t stream[2]; magma_event_t event; magma_queue_create(&stream[0]); magma_queue_create(&stream[1]); magma_event_create( &event ); trace_init( 1, 1, 2, (CUstream_st**)stream ); //if (nb <= 1 || nb >= n) //{ // lapackf77_spotrf(uplo_, &n, a, &lda, info); //} else { /* Use hybrid blocked code. */ if (upper) { //========================================================= // Compute the LDLt factorization A = U'*D*U without pivoting. 
// copy matrix to GPU for (j=0; j<n; j+=nb) { jb = min(nb, (n-j)); trace_gpu_start( 0, 0, "set", "set" ); magma_ssetmatrix_async(j+jb, jb, A(0, j), lda, dA(0, j), ldda, stream[0]); trace_gpu_end( 0, 0 ); } // main loop for (j=0; j<n; j += nb) { jb = min(nb, (n-j)); // copy A(j,j) back to CPU trace_gpu_start( 0, 0, "get", "get" ); magma_sgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, stream[0]); trace_gpu_end( 0, 0 ); // copy j-th column of U back to CPU magma_queue_wait_event( stream[1], event ); trace_gpu_start( 0, 1, "get", "get" ); magma_sgetmatrix_async(j, jb, dA(0, j), ldda, A(0, j), lda, stream[1]); trace_gpu_end( 0, 1 ); // factorize the diagonal block magma_queue_sync(stream[0]); trace_cpu_start( 0, "potrf", "potrf" ); ssytrf_nopiv_cpu(MagmaUpper, jb, ib, A(j, j), lda, info); trace_cpu_end( 0 ); if (*info != 0){ *info = *info + j; break; } // copy A(j,j) back to GPU trace_gpu_start( 0, 0, "set", "set" ); magma_ssetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, stream[0]); trace_gpu_end( 0, 0 ); if ( (j+jb) < n) { // compute the off-diagonal blocks of current block column magmablasSetKernelStream( stream[0] ); trace_gpu_start( 0, 0, "trsm", "trsm" ); magma_strsm(MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaUnit, jb, (n-j-jb), zone, dA(j, j), ldda, dA(j, j+jb), ldda); magma_scopymatrix( jb, n-j-jb, dA( j, j+jb ), ldda, dWt( 0, j+jb ), nb ); // update the trailing submatrix with D magmablas_slascl_diag(MagmaUpper, jb, n-j-jb, dA(j, j), ldda, dA(j, j+jb), ldda, &iinfo); magma_event_record( event, stream[0] ); trace_gpu_end( 0, 0 ); // update the trailing submatrix with U and W trace_gpu_start( 0, 0, "gemm", "gemm" ); for (k=j+jb; k<n; k+=nb) { magma_int_t kb = min(nb,n-k); magma_sgemm(MagmaConjTrans, MagmaNoTrans, kb, n-k, jb, mzone, dWt(0, k), nb, dA(j, k), ldda, zone, dA(k, k), ldda); } trace_gpu_end( 0, 0 ); } } } else { //========================================================= // Compute the LDLt factorization A = L*D*L' without pivoting. 
// copy the matrix to GPU for (j=0; j<n; j+=nb) { jb = min(nb, (n-j)); trace_gpu_start( 0, 0, "set", "set" ); magma_ssetmatrix_async((n-j), jb, A(j, j), lda, dA(j, j), ldda, stream[0]); trace_gpu_end( 0, 0 ); } // main loop for (j=0; j<n; j+=nb) { jb = min(nb, (n-j)); // copy A(j,j) back to CPU trace_gpu_start( 0, 0, "get", "get" ); magma_sgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, stream[0]); trace_gpu_end( 0, 0 ); // copy j-th row of L back to CPU magma_queue_wait_event( stream[1], event ); trace_gpu_start( 0, 1, "get", "get" ); magma_sgetmatrix_async(jb, j, dA(j, 0), ldda, A(j, 0), lda, stream[1]); trace_gpu_end( 0, 1 ); // factorize the diagonal block magma_queue_sync(stream[0]); trace_cpu_start( 0, "potrf", "potrf" ); ssytrf_nopiv_cpu(MagmaLower, jb, ib, A(j, j), lda, info); trace_cpu_end( 0 ); if (*info != 0){ *info = *info + j; break; } // copy A(j,j) back to GPU trace_gpu_start( 0, 0, "set", "set" ); magma_ssetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, stream[0]); trace_gpu_end( 0, 0 ); if ( (j+jb) < n) { // compute the off-diagonal blocks of current block column magmablasSetKernelStream( stream[0] ); trace_gpu_start( 0, 0, "trsm", "trsm" ); magma_strsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaUnit, (n-j-jb), jb, zone, dA(j, j), ldda, dA(j+jb, j), ldda); magma_scopymatrix( n-j-jb,jb, dA( j+jb, j ), ldda, dW( j+jb, 0 ), ldda ); // update the trailing submatrix with D magmablas_slascl_diag(MagmaLower, n-j-jb, jb, dA(j, j), ldda, dA(j+jb, j), ldda, &iinfo); magma_event_record( event, stream[0] ); trace_gpu_end( 0, 0 ); // update the trailing submatrix with L and W trace_gpu_start( 0, 0, "gemm", "gemm" ); for (k=j+jb; k<n; k+=nb) { magma_int_t kb = min(nb,n-k); magma_sgemm(MagmaNoTrans, MagmaConjTrans, n-k, kb, jb, mzone, dA(k, j), ldda, dW(k, 0), ldda, zone, dA(k, k), ldda); } trace_gpu_end( 0, 0 ); } } } } trace_finalize( "ssytrf.svg","trace.css" ); magma_queue_destroy(stream[0]); magma_queue_destroy(stream[1]); magma_event_destroy( event ); magma_free(dW); magma_free(dA); return MAGMA_SUCCESS; } /* magma_ssytrf_nopiv */
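/* Reference sketch (hypothetical, host-only, no MAGMA calls): one trailing update of the
   LDLt factorization as performed above for UPLO = MagmaLower. After the diagonal block D
   (jb entries) and the block column L21 (m21 x jb) are available, the code keeps
   W = L21*D (the column as it is before magmablas_slascl_diag rescales it) and updates
   A22 := A22 - L21*W^T, i.e. A22 - L21*D*L21^T. The loop below spells that update out
   entry by entry. */
static void ldlt_trailing_update_ref(
    int m21, int jb,
    const float *L21, int ldl,    // m21 x jb block column of L
    const float *D,               // jb diagonal entries of D
    float *A22, int lda22 )       // m21 x m21 trailing block, lower triangle updated
{
    for (int c = 0; c < m21; ++c) {
        for (int r = c; r < m21; ++r) {          // only the lower triangle is referenced
            float s = 0.0f;
            for (int k = 0; k < jb; ++k)
                s += L21[r + k*ldl] * D[k] * L21[c + k*ldl];   // (L21 * D * L21^T)(r,c)
            A22[r + c*lda22] -= s;
        }
    }
}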
/** Purpose ------- ZHETRD_HE2HB reduces a complex Hermitian matrix A to real symmetric band-diagonal form T by an orthogonal similarity transformation: Q**H * A * Q = T. This version stores the triangular matrices T used in the accumulated Householder transformations (I - V T V'). Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. n >= 0. @param[in] nb INTEGER The inner blocking. nb >= 0. @param[in,out] A COMPLEX_16 array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if UPLO = MagmaUpper, the Upper band-diagonal of A is overwritten by the corresponding elements of the band-diagonal matrix T, and the elements above the band diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; if UPLO = MagmaLower, the the Lower band-diagonal of A is overwritten by the corresponding elements of the band-diagonal matrix T, and the elements below the band-diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] tau COMPLEX_16 array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= 1. For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] dT COMPLEX_16 array on the GPU, dimension N*NB, where NB is the optimal blocksize. On exit dT holds the upper triangular matrices T from the accumulated Householder transformations (I - V T V') used in the factorization. The nb x nb matrices T are ordered consecutively in memory one after another. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n-1) . . . H(2) H(1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in A(1:i-1,i+1), and tau in TAU(i). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(n-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i), and tau in TAU(i). 
The contents of A on exit are illustrated by the following examples with n = 5: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( d e v2 v3 v4 ) ( d ) ( d e v3 v4 ) ( e d ) ( d e v4 ) ( v1 e d ) ( d e ) ( v1 v2 e d ) ( d ) ( v1 v2 v3 e d ) where d and e denote diagonal and off-diagonal elements of T, and vi denotes an element of the vector defining H(i). @ingroup magma_zheev_2stage ********************************************************************/ extern "C" magma_int_t magma_zhetrd_he2hb( magma_uplo_t uplo, magma_int_t n, magma_int_t nb, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *tau, magmaDoubleComplex *work, magma_int_t lwork, magmaDoubleComplex_ptr dT, magma_int_t *info) { #ifdef HAVE_clBLAS #define dA(a_1,a_2) (dA, (dA_offset + ((a_2)-1)*(ldda) + (a_1)-1)) #define dT(a_1) (dT, (dT_offset + ((a_1)-1)*(lddt))) #else #define dA(a_1,a_2) (dA + ((a_2)-1)*(ldda) + (a_1)-1) #define dT(a_1) (dT + ((a_1)-1)*(lddt)) #endif #define A(a_1,a_2) ( A + ((a_2)-1)*( lda) + (a_1)-1) #define tau_ref(a_1) (tau + (a_1)-1) magma_int_t ldda = magma_roundup( n, 32 ); magma_int_t lddt = nb; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex c_neg_half = MAGMA_Z_NEG_HALF; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_zero = MAGMA_Z_ZERO; double d_one = MAGMA_D_ONE; magma_int_t pm, pn, indi, indj, pk; magma_int_t pm_old=0, pn_old=0, indi_old=0, indj_old=0; magma_int_t i; magma_int_t lwkopt; *info = 0; bool upper = (uplo == MagmaUpper); bool lquery = (lwork == -1); if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } else if (lwork < 1 && ! lquery) { *info = -9; } /* Determine the block size. */ lwkopt = n * nb; if (*info == 0) { work[0] = magma_zmake_lwork( lwkopt ); } if (*info != 0) return *info; else if (lquery) return *info; /* Quick return if possible */ if (n == 0) { work[0] = c_one; return *info; } magmaDoubleComplex *dA; if (MAGMA_SUCCESS != magma_zmalloc( &dA, (n + 2*nb)*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } // limit to 16 threads magma_int_t orig_threads = magma_get_lapack_numthreads(); magma_set_lapack_numthreads( min(orig_threads,16) ); /* Use the first panel of dA as work space */ magmaDoubleComplex *dwork = dA + n*ldda; magmaDoubleComplex *dW = dwork + nb*ldda; #ifdef TRACING char buf[80]; #endif magma_queue_t queues[2]; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[0] ); magma_queue_create( cdev, &queues[1] ); trace_init( 1, 1, 3, queues ); lwork -= nb*nb; magmaDoubleComplex *hT = work + lwork; memset( hT, 0, nb*nb*sizeof(magmaDoubleComplex)); magma_event_t Pupdate_event; cudaEventCreateWithFlags(&Pupdate_event,cudaEventDisableTiming); //magma_event_create(&Pupdate_event); if (upper) { printf("ZHETRD_HE2HB is not yet implemented for upper matrix storage. Exit.\n"); exit(1); } else { /* Copy the matrix to the GPU */ if (1 <= n-nb) { trace_gpu_start( 0, 0, "set", "set A" ); magma_zsetmatrix_async( (n-nb), (n-nb), A(nb+1, nb+1), lda, dA(nb+1, nb+1), ldda, queues[0] ); trace_gpu_end( 0, 0 ); } /* Reduce the lower triangle of A */ for (i = 1; i <= n-nb; i += nb) { indi = i+nb; indj = i; pm = n - i - nb + 1; //pn = min(i+nb-1, n-nb) -i + 1; pn = nb; /* Get the current panel (no need for the 1st iteration) */ if (i > 1 ) { // magma_zpanel_to_q copy the upper oof diagonal part of // the matrix to work to be restored later. 
Actually // the zeros and ones put there are not used; this is only // because we don't have a function that copies only the // upper part of A, to be restored after copying the // lookahead panel that has been computed from GPU to CPU. magma_zpanel_to_q(MagmaUpper, pn-1, A(i, i+1), lda, work); trace_gpu_start( 0, 1, "get", "get panel" ); //magma_queue_sync( queues[0] ); magma_queue_wait_event(queues[1], Pupdate_event); //, 0); magma_zgetmatrix_async( (pm+pn), pn, dA( i, i), ldda, A ( i, i), lda, queues[1] ); trace_gpu_end( 0, 1 ); trace_gpu_start( 0, 2, "her2k", "her2k" ); magma_zher2k( MagmaLower, MagmaNoTrans, pm_old-pn_old, pn_old, c_neg_one, dA(indi_old+pn_old, indj_old), ldda, dW + pn_old, pm_old, d_one, dA(indi_old+pn_old, indi_old+pn_old), ldda, queues[0] ); trace_gpu_end( 0, 2 ); trace_cpu_start( 0, "sync", "sync on 1" ); magma_queue_sync( queues[1] ); trace_cpu_end( 0 ); magma_zq_to_panel(MagmaUpper, pn-1, A(i, i+1), lda, work); } /* ========================================================== QR factorization on a panel starting nb off of the diagonal. Prepare the V and T matrices. ========================================================== */ #ifdef TRACING snprintf( buf, sizeof(buf), "panel %d", i ); #endif trace_cpu_start( 0, "geqrf", buf ); lapackf77_zgeqrf(&pm, &pn, A(indi, indj), &lda, tau_ref(i), work, &lwork, info); /* Form the matrix T */ pk=min(pm,pn); lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr, &pm, &pk, A(indi, indj), &lda, tau_ref(i), hT, &nb); /* Prepare V - put 0s in the upper triangular part of the panel (and 1s on the diagonal), temporarily storing the original in work */ magma_zpanel_to_q(MagmaUpper, pk, A(indi, indj), lda, work); trace_cpu_end( 0 ); /* Send V from the CPU to the GPU */ trace_gpu_start( 0, 0, "set", "set V and T" ); magma_zsetmatrix_async( pm, pk, A(indi, indj), lda, dA(indi, indj), ldda, queues[0] ); /* Send the triangular factor T to the GPU */ magma_zsetmatrix_async( pk, pk, hT, nb, dT(i), lddt, queues[0] ); trace_gpu_end( 0, 0 ); /* ========================================================== Compute W: 1. X = A (V T) 2. W = X - 0.5* V * (T' * (V' * X)) ========================================================== */ /* dwork = V T */ trace_cpu_start( 0, "sync", "sync on 0" ); // this sync is done here to be sure that the copy has finished, // because below we restore the panel with magma_zq_to_panel and that // restore needs the copy to be complete. We do it here to allow // overlap of the restore with the next gemm and hemm. magma_queue_sync( queues[0] ); trace_cpu_end( 0 ); trace_gpu_start( 0, 2, "gemm", "work = V*T" ); magma_zgemm( MagmaNoTrans, MagmaNoTrans, pm, pk, pk, c_one, dA(indi, indj), ldda, dT(i), lddt, c_zero, dwork, pm, queues[0] ); trace_gpu_end( 0, 2 ); /* dW = X = A*V*T.
dW = A*dwork */ trace_gpu_start( 0, 2, "hemm", "X = A*work" ); magma_zhemm( MagmaLeft, uplo, pm, pk, c_one, dA(indi, indi), ldda, dwork, pm, c_zero, dW, pm, queues[0] ); trace_gpu_end( 0, 2 ); /* restore the panel */ magma_zq_to_panel(MagmaUpper, pk, A(indi, indj), lda, work); /* dwork = V*T already ==> dwork' = T'*V' * compute T'*V'*X ==> dwork'*W ==> * dwork + pm*nb = ((T' * V') * X) = dwork' * X = dwork' * W */ trace_gpu_start( 0, 2, "gemm", "work = T'*V'*X" ); magma_zgemm( MagmaConjTrans, MagmaNoTrans, pk, pk, pm, c_one, dwork, pm, dW, pm, c_zero, dwork + pm*nb, nb, queues[0] ); trace_gpu_end( 0, 2 ); /* W = X - 0.5 * V * T'*V'*X * = X - 0.5 * V * (dwork + pm*nb) = W - 0.5 * V * (dwork + pm*nb) */ trace_gpu_start( 0, 2, "gemm", "W = X - 0.5*V*(T'*V'*X)" ); magma_zgemm( MagmaNoTrans, MagmaNoTrans, pm, pk, pk, c_neg_half, dA(indi, indj), ldda, dwork + pm*nb, nb, c_one, dW, pm, queues[0] ); trace_gpu_end( 0, 2 ); /* ========================================================== Update the unreduced submatrix A(i+ib:n,i+ib:n), using an update of the form: A := A - V*W' - W*V' ========================================================== */ if (i + nb <= n-nb) { /* There would be next iteration; do lookahead - update the next panel */ trace_gpu_start( 0, 2, "gemm", "gemm 4 next panel left" ); magma_zgemm( MagmaNoTrans, MagmaConjTrans, pm, pn, pn, c_neg_one, dA(indi, indj), ldda, dW, pm, c_one, dA(indi, indi), ldda, queues[0] ); trace_gpu_end( 0, 2 ); trace_gpu_start( 0, 2, "gemm", "gemm 5 next panel right" ); magma_zgemm( MagmaNoTrans, MagmaConjTrans, pm, pn, pn, c_neg_one, dW, pm, dA(indi, indj), ldda, c_one, dA(indi, indi), ldda, queues[0] ); trace_gpu_end( 0, 2 ); magma_event_record(Pupdate_event, queues[0]); } else { /* no look-ahead as this is last iteration */ trace_gpu_start( 0, 2, "her2k", "her2k last iteration" ); magma_zher2k( MagmaLower, MagmaNoTrans, pk, pk, c_neg_one, dA(indi, indj), ldda, dW, pm, d_one, dA(indi, indi), ldda, queues[0] ); trace_gpu_end( 0, 2 ); } indi_old = indi; indj_old = indj; pm_old = pm; pn_old = pn; } // end loop for (i) /* Send the last block to the CPU */ pk = min(pm,pn); if (1 <= n-nb) { magma_zpanel_to_q(MagmaUpper, pk-1, A(n-pk+1, n-pk+2), lda, work); trace_gpu_start( 0, 2, "get", "get last block" ); magma_zgetmatrix( pk, pk, dA(n-pk+1, n-pk+1), ldda, A(n-pk+1, n-pk+1), lda, queues[0] ); trace_gpu_end( 0, 2 ); magma_zq_to_panel(MagmaUpper, pk-1, A(n-pk+1, n-pk+2), lda, work); } }// end of LOWER trace_finalize( "zhetrd_he2hb.svg", "trace.css" ); magma_queue_sync( queues[0] ); magma_queue_sync( queues[1] ); magma_event_destroy( Pupdate_event ); magma_queue_destroy( queues[0] ); magma_queue_destroy( queues[1] ); magma_free( dA ); magma_set_lapack_numthreads( orig_threads ); return *info; } /* magma_zhetrd_he2hb */
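/* Illustrative sketch, not library code: calling magma_zhetrd_he2hb above with the
   LWORK = -1 workspace query described in its documentation. The block size nb is chosen
   by the caller (64 is just a placeholder), dT must hold n*nb elements on the GPU, and
   magma_zmalloc_cpu / magma_zmalloc_pinned / MAGMA_Z_REAL are assumed to be the usual MAGMA
   allocators and accessors. The matrix A (Hermitian, lower storage) is supplied by the
   caller, and MAGMA is assumed to be initialized already. */
#include <stdio.h>
#include "magma_v2.h"

void example_zhetrd_he2hb( magma_int_t n, magmaDoubleComplex *A, magma_int_t lda )
{
    magma_int_t nb = 64;                     // placeholder inner block size
    magma_int_t info = 0;
    magmaDoubleComplex *tau, *work, query;
    magmaDoubleComplex_ptr dT;

    magma_zmalloc_cpu( &tau, n-1 );
    magma_zmalloc( &dT, n*nb );

    // workspace query: lwork = -1 only computes the optimal size in query
    magma_zhetrd_he2hb( MagmaLower, n, nb, A, lda, tau, &query, -1, dT, &info );
    magma_int_t lwork = (magma_int_t) MAGMA_Z_REAL( query );
    magma_zmalloc_pinned( &work, lwork );

    magma_zhetrd_he2hb( MagmaLower, n, nb, A, lda, tau, work, lwork, dT, &info );
    if (info != 0)
        printf( "magma_zhetrd_he2hb returned info = %lld\n", (long long) info );

    magma_free_pinned( work );
    magma_free( dT );
    magma_free_cpu( tau );
}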