extern "C" magma_int_t magma_zlahr2_m( magma_int_t n, magma_int_t k, magma_int_t nb, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *tau, magmaDoubleComplex *T, magma_int_t ldt, magmaDoubleComplex *Y, magma_int_t ldy, struct zgehrd_data* data ) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= ZLAHR2 reduces the first NB columns of a complex general n-BY-(n-k+1) matrix A so that elements below the k-th subdiagonal are zero. The reduction is performed by an orthogonal similarity transformation Q' * A * Q. The routine returns the matrices V and T which determine Q as a block reflector I - V*T*V', and also the matrix Y = A * V. (Note this is different than LAPACK, which computes Y = A * V * T.) This is an auxiliary routine called by ZGEHRD. Arguments ========= N (input) INTEGER The order of the matrix A. K (input) INTEGER The offset for the reduction. Elements below the k-th subdiagonal in the first NB columns are reduced to zero. K < N. NB (input) INTEGER The number of columns to be reduced. A (input/output) COMPLEX_16 array, dimension (LDA,N-K+1) On entry, the n-by-(n-k+1) general matrix A. On exit, the elements on and above the k-th subdiagonal in the first NB columns are overwritten with the corresponding elements of the reduced matrix; the elements below the k-th subdiagonal, with the array TAU, represent the matrix Q as a product of elementary reflectors. The other columns of A are unchanged. See Further Details. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). TAU (output) COMPLEX_16 array, dimension (NB) The scalar factors of the elementary reflectors. See Further Details. T (output) COMPLEX_16 array, dimension (LDT,NB) The upper triangular matrix T. LDT (input) INTEGER The leading dimension of the array T. LDT >= NB. Y (output) COMPLEX_16 array, dimension (LDY,NB) The n-by-nb matrix Y. LDY (input) INTEGER The leading dimension of the array Y. LDY >= N. dA (input/output) COMPLEX_16 array on the GPU, dimension (LDA,N-K+1) On entry, the n-by-(n-k+1) general matrix A. On exit, the elements in rows K:N of the first NB columns are overwritten with the matrix Y. DV (output) COMPLEX_16 array on the GPU, dimension (N, NB) On exit this contains the Householder vectors of the transformation. Further Details =============== The matrix Q is represented as a product of nb elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in A(i+k+1:n,i), and tau in TAU(i). The elements of the vectors v together form the (n-k+1)-by-nb matrix V which is needed, with T and Y, to apply the transformation to the unreduced part of the matrix, using an update of the form: A := (I - V*T*V') * (A - Y*T*V'). The contents of A on exit are illustrated by the following example with n = 7, k = 3 and nb = 2: ( a a a a a ) ( a a a a a ) ( a a a a a ) ( h h a a a ) ( v1 h a a a ) ( v1 v2 a a a ) ( v1 v2 a a a ) where "a" denotes an element of the original matrix A, h denotes a modified element of the upper Hessenberg matrix H, and vi denotes an element of the vector defining H(i). This implementation follows the hybrid algorithm and notations described in S. Tomov and J. 
Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. ===================================================================== */ #define A( i, j ) ( A + (i) + (j)*lda) #define Y( i, j ) ( Y + (i) + (j)*ldy) #define T( i, j ) ( T + (i) + (j)*ldt) #define dA( d, i, j ) (data->A [d] + (i) + (j)*ldda) #define dTi( d ) (data->Ti[d]) #define dV( d, i, j ) (data->V [d] + (i) + (j)*ldv ) #define dVd( d, i, j ) (data->Vd[d] + (i) + (j)*ldvd) #define dY( d, i, j ) (data->Y [d] + (i) + (j)*ldda) magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex tmp; magma_int_t ngpu = data->ngpu; magma_int_t ldda = data->ldda; magma_int_t ldv = data->ldv; magma_int_t ldvd = data->ldvd; magma_int_t ione = 1; magma_int_t d, dki1, dn, nblocks, gblock, lblock, lgid; magma_int_t n_k_i_1, n_k; magmaDoubleComplex scale; magma_int_t i; magmaDoubleComplex ei = MAGMA_Z_ZERO; magma_int_t info_data = 0; magma_int_t *info = &info_data; if (n < 0) { *info = -1; } else if (k < 0 || k >= n) { *info = -2; } else if (nb < 1 || nb > n) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if (ldt < nb) { *info = -8; } else if (ldy < max(1,n)) { *info = -10; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } // adjust from 1-based indexing k -= 1; // Function Body if (n <= 1) return 0; // zero out current top block of V on all GPUs for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magmablasSetKernelStream( data->streams[d] ); magmablas_zlaset( MagmaUpperLower, nb, nb, dV(d,k,0), ldv ); } // set all Y=0 lapackf77_zlaset( "Full", &n, &nb, &c_zero, &c_zero, Y, &ldy ); for (i = 0; i < nb; ++i) { n_k_i_1 = n - k - i - 1; n_k = n - k; if (i > 0) { // Finish applying I - V * T * V' on right tmp = MAGMA_Z_NEGATE( tau[i-1] ); blasf77_zaxpy( &n_k, &tmp, Y(k,i-1), &ione, A(k,i), &ione ); // Apply I - V * T' * V' to this column (call it b) from the // left, using the last column of T as workspace, w. 
// // Let V = ( V1 ) and b = ( b1 ) (first i-1 rows) // ( V2 ) ( b2 ) // where V1 is unit lower triangular // w := b1 = A(k+1:k+i, i) blasf77_zcopy( &i, A(k+1,i), &ione, T(0,nb-1), &ione ); // w := V1' * b1 = VA(k+1:k+i, 0:i-1)' * w blasf77_ztrmv( "Lower", "Conj", "Unit", &i, A(k+1,0), &lda, T(0,nb-1), &ione ); // w := w + V2'*b2 = w + VA(k+i+1:n-1, 0:i-1)' * A(k+i+1:n-1, i) blasf77_zgemv( "Conj", &n_k_i_1, &i, &c_one, A(k+i+1,0), &lda, A(k+i+1,i), &ione, &c_one, T(0,nb-1), &ione ); // w := T'*w = T(0:i-1, 0:i-1)' * w blasf77_ztrmv( "Upper", "Conj", "Non-unit", &i, T(0,0), &ldt, T(0,nb-1), &ione ); // b2 := b2 - V2*w = A(k+i+1:n-1, i) - VA(k+i+1:n-1, 0:i-1) * w blasf77_zgemv( "No trans", &n_k_i_1, &i, &c_neg_one, A(k+i+1,0), &lda, T(0,nb-1), &ione, &c_one, A(k+i+1,i), &ione ); // w := V1*w = VA(k+1:k+i, 0:i-1) * w blasf77_ztrmv( "Lower", "No trans", "Unit", &i, A(k+1,0), &lda, T(0,nb-1), &ione ); // b1 := b1 - w = A(k+1:k+i-1, i) - w blasf77_zaxpy( &i, &c_neg_one, T(0,nb-1), &ione, A(k+1,i), &ione ); // Restore diagonal element, saved below during previous iteration *A(k+i,i-1) = ei; } // Generate the elementary reflector H(i) to annihilate A(k+i+1:n-1,i) lapackf77_zlarfg( &n_k_i_1, A(k+i+1,i), A(k+i+2,i), &ione, &tau[i] ); // Save diagonal element and set to one, to simplify multiplying by V ei = *A(k+i+1,i); *A(k+i+1,i) = c_one; // compute yi = A vi = sum_g A{d} vi{d} nblocks = (n-1) / nb / ngpu + 1; for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magmablasSetKernelStream( data->streams[d] ); // dV(k+i+1:n-1, i) = VA(k+i:n, i) magma_zsetvector_async( n_k_i_1, A(k+i+1,i), 1, dV(d, k+i+1, i), 1, data->streams[d] ); // copy column of dV -> dVd, using block cyclic distribution. // This assumes V and Vd have been padded so that // a 2D matrix copy doesn't access them out-of-bounds gblock = k / nb; lblock = gblock / ngpu; lgid = gblock % ngpu; if ( d < lgid ) { lblock += 1; } // treat V as (nb*ngpu) x nblock matrix, and Vd as nb x nblock matrix magmablas_zlacpy( 'F', nb, nblocks-lblock, dV (d, d*nb + lblock*nb*ngpu, i), nb*ngpu, dVd(d, 0 + lblock*nb , i), nb ); // convert global indices (k) to local indices (dk) magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn ); // dY(k:n, i) = dA(k:n, k+i+1:n) * dV(k+i+1:n, i) // skip if matrix is empty // each GPU copies to different temporary vector in Y, // which are summed in separate loop below if ( dn-dki1 > 0 ) { magma_zgemv( 'N', n-k, dn-dki1, c_one, dA (d, k , dki1), ldda, dVd(d, dki1, i), 1, c_zero, dY (d, k , i), 1 ); // copy vector to host, storing in column nb+d of Y // as temporary space (Y has >= nb+ngpu columns) magma_zgetvector_async( n-k, dY(d, k, i), 1, Y(k, nb+d), 1, data->streams[d] ); } } // while GPU is doing above Ag*v... // Compute T(0:i,i) = [ -tau T V' vi ] // [ tau ] // T(0:i-1, i) = -tau VA(k+i+1:n-1, 0:i-1)' VA(k+i+1:n-1, i) scale = MAGMA_Z_NEGATE( tau[i] ); blasf77_zgemv( "Conj", &n_k_i_1, &i, &scale, A(k+i+1,0), &lda, A(k+i+1,i), &ione, &c_zero, T(0,i), &ione ); // T(0:i-1, i) = T(0:i-1, 0:i-1) * T(0:i-1, i) blasf77_ztrmv( "Upper", "No trans", "Non-unit", &i, T(0,0), &ldt, T(0,i), &ione ); *T(i,i) = tau[i]; // apply reflectors to next column, A(i+1), on right only. // one axpy will be required to finish this, in the next iteration above if ( i > 0 && i+1 < nb ) { // Update next column, A(k:n,i+1), applying Q on right. // One axpy will be required to finish this, in the next iteration // above, after yi is computed. 
// This updates one more row than LAPACK does (row k), // making block above panel an even multiple of nb. // Use last column of T as workspace, w. magma_int_t i1 = i+1; // If complex, conjugate row of V, and undo afterwards #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv( &i1, A(k+i1,0), &lda ); #endif // w = T(0:i, 0:i+1) * VA(k+i+1, 0:i+1)' // T is now rectangular, so we use gemv instead of trmv as in lapack. blasf77_zgemv( "No trans", &i, &i1, &c_one, T(0,0), &ldt, A(k+i1,0), &lda, &c_zero, T(0,nb-1), &ione ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv( &i1, A(k+i1,0), &lda ); #endif // A(k:n, i+1) -= Y(k:n, 0:i) * w blasf77_zgemv( "No trans", &n_k, &i, &c_neg_one, Y(k,0), &ldy, T(0,nb-1), &ione, &c_one, A(k,i1), &ione ); } // yi = sum_g yi{d} for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_queue_sync( data->streams[d] ); magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn ); if ( dn-dki1 > 0 ) { // yi = yi + yi{d} blasf77_zaxpy( &n_k, &c_one, Y(k,nb+d), &ione, Y(k,i), &ione ); } } } // Restore diagonal element *A(k+nb,nb-1) = ei; // compute Y = Am V = sum_g Am{d} V{d} --- top part, Y(0:k-1,:) for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magmablasSetKernelStream( data->streams[d] ); // convert global indices (k) to local indices (dk) magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn ); // dY(0:k, :) = dA(0:k, k+i+1:n-1) * dV(k+i+1:n-1, :) // skip if matrix is empty // each GPU copies to different temporary block in Y, // which are summed in separate loop below if ( dn-dki1 > 0 ) { magma_zgemm( 'N', 'N', k, nb, dn-dki1, c_one, dA (d, 0 , dki1), ldda, dVd(d, dki1, 0), ldvd, c_zero, dY (d, 0 , 0), ldda ); // copy result to host, storing in columns [nb + nb*d : nb + nb*(d+1)] of Y // as temporary space (Y has nb + nb*ngpu columns) magma_zgetmatrix_async( k, nb, dY(d, 0, 0), ldda, Y(0,nb+nb*d), ldy, data->streams[d] ); } } // Y = sum_g Y{d} for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_queue_sync( 0 ); magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn ); if ( dn-dki1 > 0 ) { // Y = Y + Am V for( i = 0; i < nb; ++i ) { blasf77_zaxpy( &k, &c_one, Y(0,nb+nb*d+i), &ione, Y(0,i), &ione ); } } } // copy Y and T matrices to GPUs for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_zsetmatrix_async( n, nb, Y, ldy, dY(d, 0, 0), ldda, data->streams[d] ); magma_zsetmatrix_async( nb, nb, T, nb, dTi(d), nb, data->streams[d] ); } return 0; } // magma_zlahr2
/** Purpose ------- ZLATRD reduces NB rows and columns of a complex Hermitian matrix A to Hermitian tridiagonal form by an orthogonal similarity transformation Q' * A * Q, and returns the matrices V and W which are needed to apply the transformation to the unreduced part of A. If UPLO = MagmaUpper, ZLATRD reduces the last NB rows and columns of a matrix, of which the upper triangle is supplied; if UPLO = MagmaLower, ZLATRD reduces the first NB rows and columns of a matrix, of which the lower triangle is supplied. This is an auxiliary routine called by ZHETRD. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] uplo magma_uplo_t Specifies whether the upper or lower triangular part of the Hermitian matrix A is stored: - = MagmaUpper: Upper triangular - = MagmaLower: Lower triangular @param[in] n INTEGER The order of the matrix A. @param[in] nb INTEGER The number of rows and columns to be reduced. @param[in] nb0 INTEGER The block size used for the matrix distribution. nb and nb0 can be different for the final step of zhetrd. @param[in,out] A COMPLEX_16 array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading n-by-n upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading n-by-n lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit: - if UPLO = MagmaUpper, the last NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements above the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; - if UPLO = MagmaLower, the first NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements below the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= (1,N). @param[out] e COMPLEX_16 array, dimension (N-1) If UPLO = MagmaUpper, E(n-nb:n-1) contains the superdiagonal elements of the last NB columns of the reduced matrix; if UPLO = MagmaLower, E(1:nb) contains the subdiagonal elements of the first NB columns of the reduced matrix. @param[out] tau COMPLEX_16 array, dimension (N-1) The scalar factors of the elementary reflectors, stored in TAU(n-nb:n-1) if UPLO = MagmaUpper, and in TAU(1:nb) if UPLO = MagmaLower. See Further Details. @param[out] W COMPLEX_16 array, dimension (LDW,NB) The n-by-nb matrix W required to update the unreduced part of A. @param[in] ldw INTEGER The leading dimension of the array W. LDW >= max(1,N). @param dA @param[in] ldda @param[in] offset @param dW @param[in] lddw @param hwork @param[in] lhwork @param dwork @param[in] ldwork @param[in] queues magma_queue_t array of dimension (ngpu). queues[dev] is an execution queue on GPU dev. Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n) H(n-1) . . . H(n-nb+1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i), and tau in TAU(i-1). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . 
H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i), and tau in TAU(i). The elements of the vectors v together form the n-by-nb matrix V which is needed, with W, to apply the transformation to the unreduced part of the matrix, using a Hermitian rank-2k update of the form: A := A - V*W' - W*V'. The contents of A on exit are illustrated by the following examples with n = 5 and nb = 2: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( a a a v4 v5 ) ( d ) ( a a v4 v5 ) ( 1 d ) ( a 1 v5 ) ( v1 1 a ) ( d 1 ) ( v1 v2 a a ) ( d ) ( v1 v2 a a a ) where d denotes a diagonal element of the reduced matrix, a denotes an element of the original matrix that is unchanged, and vi denotes an element of the vector defining H(i). @ingroup magma_zheev_aux ********************************************************************/ extern "C" magma_int_t magma_zlatrd_mgpu( magma_int_t ngpu, magma_uplo_t uplo, magma_int_t n, magma_int_t nb, magma_int_t nb0, magmaDoubleComplex *A, magma_int_t lda, double *e, magmaDoubleComplex *tau, magmaDoubleComplex *W, magma_int_t ldw, magmaDoubleComplex_ptr dA[], magma_int_t ldda, magma_int_t offset, magmaDoubleComplex_ptr dW[], magma_int_t lddw, magmaDoubleComplex *hwork, magma_int_t lhwork, magmaDoubleComplex_ptr dwork[], magma_int_t ldwork, magma_queue_t queues[] ) { #define A(i, j) (A + (j)*lda + (i)) #define W(i, j) (W + (j)*ldw + (i)) #define dA(dev, i, j) (dA[(dev)] + ((j)+loffset)*ldda + (i) + offset) #define dW(dev, i, j) (dW[(dev)] + (j) *lddw + (i)) #define dW1(dev, i, j) (dW[(dev)] + ((j)+nb) *lddw + (i)) /* Constants */ const magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; const magmaDoubleComplex c_one = MAGMA_Z_ONE; const magmaDoubleComplex c_zero = MAGMA_Z_ZERO; const magma_int_t ione = 1; /* Local variables */ magmaDoubleComplex alpha, value; magma_int_t dev; magma_int_t i, n_i, n_i_1, ip1, iw; // TODO check arguments magma_int_t info = 0; if (n <= 0) { return info; } // TODO allocate f in zhetrd and pass into zlatrd. (e.g., expand hwork a bit) magmaDoubleComplex *f; magma_zmalloc_cpu( &f, n ); if ( f == NULL ) { info = MAGMA_ERR_HOST_ALLOC; return info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); if (uplo == MagmaUpper) { /* Reduce last NB columns of upper triangle */ for (i = n-1; i >= n - nb; --i) { ip1 = i + 1; n_i_1 = n - i - 1; iw = i - n + nb; if (i < n-1) { /* Update A(1:i,i) */ magmaDoubleComplex wii = -conj( *W(i, iw+1) ); blasf77_zaxpy( &ip1, &wii, A(0, i+1), &ione, A(0, i), &ione ); wii = -conj( *A(i, i+1) ); blasf77_zaxpy( &ip1, &wii, W(0, iw+1), &ione, A(0, i), &ione ); } if (i > 0) { /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */ alpha = *A(i-1, i); lapackf77_zlarfg( &i, &alpha, A(0, i), &ione, &tau[i - 1] ); e[i-1] = MAGMA_Z_REAL( alpha ); *A(i-1,i) = MAGMA_Z_ONE; // TODO Previously, this set dx2[dev] = dW1(dev, 0, iw); and used dx2 in zhemv. // TODO Now zhemv handles broadcasting x to the GPUs, but data in dW1 is // TODO apparently still used in zhetrd_mgpu / zher2k_mgpu. 
for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); magma_zsetvector_async( n, A(0,i), 1, dW1(dev, 0, iw), 1, queues[dev] ); } magmablas_zhemv_mgpu( MagmaUpper, i, c_one, dA, ldda, 0, A(0,i), 1, c_zero, W(0, iw), 1, hwork, lhwork, dwork, ldwork, ngpu, nb0, queues ); if (i < n-1) { blasf77_zgemv( MagmaConjTransStr, &i, &n_i_1, &c_one, W(0, iw+1), &ldw, A(0, i), &ione, &c_zero, W(i+1, iw), &ione ); } /* overlap update */ if ( i < n-1 && i-1 >= n - nb ) { /* Update A(1:i,i) */ #ifdef COMPLEX lapackf77_zlacgv( &n_i_1, W(i-1, iw+1), &ldw ); #endif blasf77_zgemv( "No transpose", &i, &n_i_1, &c_neg_one, A(0, i+1), &lda, W(i-1, iw+1), &ldw, &c_one, A(0, i-1), &ione ); #ifdef COMPLEX lapackf77_zlacgv( &n_i_1, W(i-1, iw+1), &ldw ); lapackf77_zlacgv( &n_i_1, A(i-1, i +1), &lda ); #endif blasf77_zgemv( "No transpose", &i, &n_i_1, &c_neg_one, W(0, iw+1), &ldw, A(i-1, i+1), &lda, &c_one, A(0, i-1), &ione ); #ifdef COMPLEX lapackf77_zlacgv( &n_i_1, A(i-1, i+1), &lda ); #endif } // synchronize to get zhemv result W(0, iw) magmablas_zhemv_mgpu_sync( MagmaUpper, i, c_one, dA, ldda, 0, A(0,i), 1, c_zero, W(0, iw), 1, hwork, lhwork, dwork, ldwork, ngpu, nb0, queues ); if (i < n-1) { blasf77_zgemv( "No transpose", &i, &n_i_1, &c_neg_one, A(0, i+1), &lda, W(i+1, iw), &ione, &c_one, W(0, iw), &ione ); blasf77_zgemv( MagmaConjTransStr, &i, &n_i_1, &c_one, A(0, i+1), &lda, A(0, i), &ione, &c_zero, W(i+1, iw), &ione ); blasf77_zgemv( "No transpose", &i, &n_i_1, &c_neg_one, W(0, iw+1), &ldw, W(i+1, iw), &ione, &c_one, W(0, iw), &ione ); } blasf77_zscal( &i, &tau[i - 1], W(0, iw), &ione ); value = magma_cblas_zdotc( i, W(0,iw), ione, A(0,i), ione ); alpha = tau[i - 1] * -0.5f * value; blasf77_zaxpy( &i, &alpha, A(0, i), &ione, W(0, iw), &ione ); for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); magma_zsetvector_async( n, W(0,iw), 1, dW(dev, 0, iw), 1, queues[dev] ); } } } } else { /* Reduce first NB columns of lower triangle */ for (i = 0; i < nb; ++i) { /* Update A(i:n,i) */ n_i = n - i; //idw = ((offset+i)/nb)%ngpu; if ( i > 0 ) { trace_cpu_start( 0, "gemv", "gemv" ); magmaDoubleComplex wii = -conj( *W(i, i-1) ); blasf77_zaxpy( &n_i, &wii, A(i, i-1), &ione, A(i, i), &ione ); wii = -conj( *A(i, i-1) ); blasf77_zaxpy( &n_i, &wii, W(i, i-1), &ione, A(i, i), &ione ); } if (i < n-1) { /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */ n_i_1 = n - i - 1; trace_cpu_start( 0, "larfg", "larfg" ); alpha = *A(i+1, i); lapackf77_zlarfg( &n_i_1, &alpha, A(min(i+2,n-1), i), &ione, &tau[i] ); e[i] = MAGMA_Z_REAL( alpha ); *A(i+1,i) = MAGMA_Z_ONE; trace_cpu_end( 0 ); /* Compute W(i+1:n,i) */ // TODO Previously, this set dx2[id] = dW1(id, 0, i)-offset; and used dx2 in zhemv. // TODO Now zhemv handles broadcasting x to the GPUs, but data in dW1 is // TODO apparently still used in zhetrd_mgpu / zher2k_mgpu. 
for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); magma_zsetvector_async( n, A(0,i), 1, dW1(dev, 0, i), 1, queues[dev] ); } magmablas_zhemv_mgpu( MagmaLower, n_i_1, c_one, dA, ldda, offset+i+1, A(i+1, i), 1, c_zero, W(i+1, i), 1, hwork, lhwork, dwork, ldwork, ngpu, nb0, queues ); trace_cpu_start( 0, "gemv", "gemv" ); blasf77_zgemv( MagmaConjTransStr, &n_i_1, &i, &c_one, W(i+1, 0), &ldw, A(i+1, i), &ione, &c_zero, W(0, i), &ione ); blasf77_zgemv( "No transpose", &n_i_1, &i, &c_neg_one, A(i+1, 0), &lda, W(0, i), &ione, &c_zero, f, &ione ); blasf77_zgemv( MagmaConjTransStr, &n_i_1, &i, &c_one, A(i+1, 0), &lda, A(i+1, i), &ione, &c_zero, W(0, i), &ione ); trace_cpu_end( 0 ); /* overlap update */ if ( i > 0 && i+1 < n ) { trace_cpu_start( 0, "gemv", "gemv" ); #ifdef COMPLEX lapackf77_zlacgv( &i, W(i+1, 0), &ldw ); #endif blasf77_zgemv( "No transpose", &n_i_1, &i, &c_neg_one, A(i+1, 0), &lda, W(i+1, 0), &ldw, &c_one, A(i+1, i+1), &ione ); #ifdef COMPLEX lapackf77_zlacgv( &i, W(i+1, 0), &ldw ); lapackf77_zlacgv( &i, A(i+1, 0), &lda ); #endif blasf77_zgemv( "No transpose", &n_i_1, &i, &c_neg_one, W(i+1, 0), &ldw, A(i+1, 0), &lda, &c_one, A(i+1, i+1), &ione ); #ifdef COMPLEX lapackf77_zlacgv( &i, A(i+1, 0), &lda ); #endif trace_cpu_end( 0 ); } // synchronize to get zhemv result W(i+1, i) magmablas_zhemv_mgpu_sync( MagmaLower, n_i_1, c_one, dA, ldda, offset+i+1, A(i+1, i), 1, c_zero, W(i+1, i), 1, hwork, lhwork, dwork, ldwork, ngpu, nb0, queues ); trace_cpu_start( 0, "axpy", "axpy" ); if (i != 0) { blasf77_zaxpy( &n_i_1, &c_one, f, &ione, W(i+1, i), &ione ); } blasf77_zgemv( "No transpose", &n_i_1, &i, &c_neg_one, W(i+1, 0), &ldw, W(0, i), &ione, &c_one, W(i+1, i), &ione ); blasf77_zscal( &n_i_1, &tau[i], W(i+1,i), &ione ); value = magma_cblas_zdotc( n_i_1, W(i+1,i), ione, A(i+1,i), ione ); alpha = tau[i] * -0.5f * value; blasf77_zaxpy( &n_i_1, &alpha, A(i+1, i), &ione, W(i+1,i), &ione ); trace_cpu_end( 0 ); for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); magma_zsetvector_async( n, W(0,i), 1, dW(dev, 0, i), 1, queues[dev] ); } } } } magma_free_cpu( f ); magma_setdevice( orig_dev ); return info; } /* magma_zlatrd_mgpu */
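/*
   Illustrative sketch (not part of MAGMA): the V panel (stored in A) and the W panel
   produced by magma_zlatrd_mgpu are consumed by the Hermitian rank-2k update
   A := A - V*W' - W*V' described in Further Details above; in ZHETRD this is done by
   the optimized zher2k / zher2k_mgpu. The naive loops below spell that update out for
   a lower-stored, column-major matrix; the function and parameter names are
   illustrative only.
*/
#include <complex>

// Naive rank-2k update of the lower triangle of an n x n Hermitian matrix A
// (column-major, leading dimension lda):  A := A - V*W^H - W*V^H,
// where V and W are n x nb panels with leading dimensions ldv and ldw.
static void her2k_lower_naive( int n, int nb,
                               std::complex<double>* A, int lda,
                               const std::complex<double>* V, int ldv,
                               const std::complex<double>* W, int ldw )
{
    for ( int j = 0; j < n; ++j ) {
        for ( int i = j; i < n; ++i ) {          // lower triangle only
            std::complex<double> s( 0.0, 0.0 );
            for ( int k = 0; k < nb; ++k ) {
                s += V[i + k*ldv] * std::conj( W[j + k*ldw] )
                   + W[i + k*ldw] * std::conj( V[j + k*ldv] );
            }
            A[i + j*lda] -= s;
        }
    }
}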
extern "C" magma_int_t magma_zpidr_strms( magma_z_matrix A, magma_z_matrix b, magma_z_matrix *x, magma_z_solver_par *solver_par, magma_z_preconditioner *precond_par, magma_queue_t queue ) { magma_int_t info = MAGMA_NOTCONVERGED; // prepare solver feedback solver_par->solver = Magma_PIDRMERGE; solver_par->numiter = 0; solver_par->spmv_count = 0; solver_par->init_res = 0.0; solver_par->final_res = 0.0; solver_par->iter_res = 0.0; solver_par->runtime = 0.0; // constants const magmaDoubleComplex c_zero = MAGMA_Z_ZERO; const magmaDoubleComplex c_one = MAGMA_Z_ONE; const magmaDoubleComplex c_n_one = MAGMA_Z_NEG_ONE; // internal user options const magma_int_t smoothing = 1; // 0 = disable, 1 = enable const double angle = 0.7; // [0-1] // local variables magma_int_t iseed[4] = {0, 0, 0, 1}; magma_int_t dof; magma_int_t s; magma_int_t distr; magma_int_t k, i, sk; magma_int_t innerflag; magma_int_t ldd; magma_int_t q; double residual; double nrm; double nrmb; double nrmr; double nrmt; double rho; magmaDoubleComplex om; magmaDoubleComplex gamma; // matrices and vectors magma_z_matrix dxs = {Magma_CSR}; magma_z_matrix dr = {Magma_CSR}, drs = {Magma_CSR}; magma_z_matrix dP = {Magma_CSR}, dP1 = {Magma_CSR}; magma_z_matrix dG = {Magma_CSR}, dGcol = {Magma_CSR}; magma_z_matrix dU = {Magma_CSR}; magma_z_matrix dM = {Magma_CSR}; magma_z_matrix df = {Magma_CSR}; magma_z_matrix dt = {Magma_CSR}, dtt = {Magma_CSR}; magma_z_matrix dc = {Magma_CSR}; magma_z_matrix dv = {Magma_CSR}; magma_z_matrix dlu = {Magma_CSR}; magma_z_matrix dskp = {Magma_CSR}; magma_z_matrix dalpha = {Magma_CSR}; magma_z_matrix dbeta = {Magma_CSR}; magmaDoubleComplex *hMdiag = NULL; magmaDoubleComplex *hskp = NULL; magmaDoubleComplex *halpha = NULL; magmaDoubleComplex *hbeta = NULL; magmaDoubleComplex *d1 = NULL, *d2 = NULL; // queue variables const magma_int_t nqueues = 3; // number of queues magma_queue_t queues[nqueues]; // chronometry real_Double_t tempo1, tempo2; // create additional queues queues[0] = queue; for ( q = 1; q < nqueues; q++ ) { magma_queue_create( queue->device(), &(queues[q]) ); } // initial s space // TODO: add option for 's' (shadow space number) // Hack: uses '--restart' option as the shadow space number. // This is not a good idea because the default value of restart option is used to detect // if the user provided a custom restart. This means that if the default restart value // is changed then the code will think it was the user (unless the default value is // also updated in the 'if' statement below. 
s = 1; if ( solver_par->restart != 50 ) { if ( solver_par->restart > A.num_cols ) { s = A.num_cols; } else { s = solver_par->restart; } } solver_par->restart = s; // set max iterations solver_par->maxiter = min( 2 * A.num_cols, solver_par->maxiter ); // check if matrix A is square if ( A.num_rows != A.num_cols ) { //printf("Matrix A is not square.\n"); info = MAGMA_ERR_NOT_SUPPORTED; goto cleanup; } // |b| nrmb = magma_dznrm2( b.num_rows, b.dval, 1, queue ); if ( nrmb == 0.0 ) { magma_zscal( x->num_rows, MAGMA_Z_ZERO, x->dval, 1, queue ); info = MAGMA_SUCCESS; goto cleanup; } // t = 0 // make t twice as large to contain both, dt and dr ldd = magma_roundup( b.num_rows, 32 ); CHECK( magma_zvinit( &dt, Magma_DEV, ldd, 2, c_zero, queue )); dt.num_rows = b.num_rows; dt.num_cols = 1; dt.nnz = dt.num_rows; // redirect the dr.dval to the second part of dt CHECK( magma_zvinit( &dr, Magma_DEV, b.num_rows, 1, c_zero, queue )); magma_free( dr.dval ); dr.dval = dt.dval + ldd; // r = b - A x CHECK( magma_zresidualvec( A, b, *x, &dr, &nrmr, queue )); // |r| solver_par->init_res = nrmr; solver_par->final_res = solver_par->init_res; solver_par->iter_res = solver_par->init_res; if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = (real_Double_t)nrmr; } // check if initial is guess good enough if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { info = MAGMA_SUCCESS; goto cleanup; } // P = randn(n, s) // P = ortho(P) //--------------------------------------- // P = 0.0 CHECK( magma_zvinit( &dP, Magma_CPU, A.num_cols, s, c_zero, queue )); // P = randn(n, s) distr = 3; // 1 = unif (0,1), 2 = unif (-1,1), 3 = normal (0,1) dof = dP.num_rows * dP.num_cols; lapackf77_zlarnv( &distr, iseed, &dof, dP.val ); // transfer P to device CHECK( magma_zmtransfer( dP, &dP1, Magma_CPU, Magma_DEV, queue )); magma_zmfree( &dP, queue ); // P = ortho(P1) if ( dP1.num_cols > 1 ) { // P = magma_zqr(P1), QR factorization CHECK( magma_zqr( dP1.num_rows, dP1.num_cols, dP1, dP1.ld, &dP, NULL, queue )); } else { // P = P1 / |P1| nrm = magma_dznrm2( dof, dP1.dval, 1, queue ); nrm = 1.0 / nrm; magma_zdscal( dof, nrm, dP1.dval, 1, queue ); CHECK( magma_zmtransfer( dP1, &dP, Magma_DEV, Magma_DEV, queue )); } magma_zmfree( &dP1, queue ); //--------------------------------------- // allocate memory for the scalar products CHECK( magma_zmalloc_pinned( &hskp, 5 )); CHECK( magma_zvinit( &dskp, Magma_DEV, 4, 1, c_zero, queue )); CHECK( magma_zmalloc_pinned( &halpha, s )); CHECK( magma_zvinit( &dalpha, Magma_DEV, s, 1, c_zero, queue )); CHECK( magma_zmalloc_pinned( &hbeta, s )); CHECK( magma_zvinit( &dbeta, Magma_DEV, s, 1, c_zero, queue )); // workspace for merged dot product CHECK( magma_zmalloc( &d1, max(2, s) * b.num_rows )); CHECK( magma_zmalloc( &d2, max(2, s) * b.num_rows )); // smoothing enabled if ( smoothing > 0 ) { // set smoothing solution vector CHECK( magma_zmtransfer( *x, &dxs, Magma_DEV, Magma_DEV, queue )); // tt = 0 // make tt twice as large to contain both, dtt and drs ldd = magma_roundup( b.num_rows, 32 ); CHECK( magma_zvinit( &dtt, Magma_DEV, ldd, 2, c_zero, queue )); dtt.num_rows = dr.num_rows; dtt.num_cols = 1; dtt.nnz = dtt.num_rows; // redirect the drs.dval to the second part of dtt CHECK( magma_zvinit( &drs, Magma_DEV, dr.num_rows, 1, c_zero, queue )); magma_free( drs.dval ); drs.dval = dtt.dval + ldd; // set smoothing residual vector magma_zcopyvector( dr.num_rows, dr.dval, 1, drs.dval, 1, queue ); } // G(n,s) = 0 if ( s > 1 ) { ldd = magma_roundup( A.num_rows, 32 ); CHECK( magma_zvinit( &dG, 
Magma_DEV, ldd, s, c_zero, queue )); dG.num_rows = A.num_rows; } else { CHECK( magma_zvinit( &dG, Magma_DEV, A.num_rows, s, c_zero, queue )); } // dGcol represents a single column of dG, array pointer is set inside loop CHECK( magma_zvinit( &dGcol, Magma_DEV, dG.num_rows, 1, c_zero, queue )); magma_free( dGcol.dval ); // U(n,s) = 0 if ( s > 1 ) { ldd = magma_roundup( A.num_cols, 32 ); CHECK( magma_zvinit( &dU, Magma_DEV, ldd, s, c_zero, queue )); dU.num_rows = A.num_cols; } else { CHECK( magma_zvinit( &dU, Magma_DEV, A.num_cols, s, c_zero, queue )); } // M(s,s) = I CHECK( magma_zvinit( &dM, Magma_DEV, s, s, c_zero, queue )); CHECK( magma_zmalloc_pinned( &hMdiag, s )); magmablas_zlaset( MagmaFull, dM.num_rows, dM.num_cols, c_zero, c_one, dM.dval, dM.ld, queue ); // f = 0 CHECK( magma_zvinit( &df, Magma_DEV, dP.num_cols, 1, c_zero, queue )); // c = 0 CHECK( magma_zvinit( &dc, Magma_DEV, dM.num_cols, 1, c_zero, queue )); // v = r CHECK( magma_zmtransfer( dr, &dv, Magma_DEV, Magma_DEV, queue )); // lu = 0 CHECK( magma_zvinit( &dlu, Magma_DEV, dr.num_rows, 1, c_zero, queue )); //--------------START TIME--------------- // chronometry tempo1 = magma_sync_wtime( queue ); if ( solver_par->verbose > 0 ) { solver_par->timing[0] = 0.0; } om = MAGMA_Z_ONE; gamma = MAGMA_Z_ZERO; innerflag = 0; // start iteration do { solver_par->numiter++; // new RHS for small systems // f = P' r // Q1 magma_zgemvmdot_shfl( dP.num_rows, dP.num_cols, dP.dval, dr.dval, d1, d2, df.dval, queues[1] ); // skp[4] = f(k) // Q1 magma_zgetvector_async( 1, df.dval, 1, &hskp[4], 1, queues[1] ); // c(k:s) = f(k:s) // Q1 magma_zcopyvector_async( s, df.dval, 1, dc.dval, 1, queues[1] ); // c(k:s) = M(k:s,k:s) \ f(k:s) // Q1 magma_ztrsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, s, dM.dval, dM.ld, dc.dval, 1, queues[1] ); // shadow space loop for ( k = 0; k < s; ++k ) { sk = s - k; dGcol.dval = dG.dval + k * dG.ld; // v = r - G(:,k:s) c(k:s) // Q1 magmablas_zgemv( MagmaNoTrans, dG.num_rows, sk, c_n_one, dGcol.dval, dG.ld, &dc.dval[k], 1, c_one, dv.dval, 1, queues[1] ); // preconditioning operation // v = L \ v; // v = U \ v; // Q1 CHECK( magma_z_applyprecond_left( MagmaNoTrans, A, dv, &dlu, precond_par, queues[1] )); CHECK( magma_z_applyprecond_right( MagmaNoTrans, A, dlu, &dv, precond_par, queues[1] )); // sync Q0 --> U(:,k) = U(:,k) - U(:,1:k) * alpha(1:k) magma_queue_sync( queues[0] ); // U(:,k) = om * v + U(:,k:s) c(k:s) // Q1 magmablas_zgemv( MagmaNoTrans, dU.num_rows, sk, c_one, &dU.dval[k*dU.ld], dU.ld, &dc.dval[k], 1, om, dv.dval, 1, queues[1] ); // G(:,k) = A U(:,k) // Q1 CHECK( magma_z_spmv( c_one, A, dv, c_zero, dGcol, queues[1] )); solver_par->spmv_count++; // bi-orthogonalize the new basis vectors for ( i = 0; i < k; ++i ) { // alpha = P(:,i)' G(:,k) // Q1 halpha[i] = magma_zdotc( dP.num_rows, &dP.dval[i*dP.ld], 1, dGcol.dval, 1, queues[1] ); // implicit sync Q1 --> alpha = P(:,i)' G(:,k) // alpha = alpha / M(i,i) halpha[i] = halpha[i] / hMdiag[i]; // G(:,k) = G(:,k) - alpha * G(:,i) // Q1 magma_zaxpy( dG.num_rows, -halpha[i], &dG.dval[i*dG.ld], 1, dGcol.dval, 1, queues[1] ); } // sync Q1 --> compute new G, skp[4] = f(k magma_queue_sync( queues[1] ); // new column of M = P'G, first k-1 entries are zero // M(k:s,k) = P(:,k:s)' G(:,k) // Q2 magma_zgemvmdot_shfl( dP.num_rows, sk, &dP.dval[k*dP.ld], dGcol.dval, d1, d2, &dM.dval[k*dM.ld+k], queues[2] ); // U(:,k) = v // Q0 magma_zcopyvector_async( dU.num_rows, dv.dval, 1, &dU.dval[k*dU.ld], 1, queues[0] ); // non-first s iteration if ( k > 0 ) { // alpha = dalpha // Q0 
magma_zsetvector_async( k, halpha, 1, dalpha.dval, 1, queues[0] ); // U update outside of loop using GEMV // U(:,k) = U(:,k) - U(:,1:k) * alpha(1:k) // Q0 magmablas_zgemv( MagmaNoTrans, dU.num_rows, k, c_n_one, dU.dval, dU.ld, dalpha.dval, 1, c_one, &dU.dval[k*dU.ld], 1, queues[0] ); } // Mdiag(k) = M(k,k) // Q2 magma_zgetvector( 1, &dM.dval[k*dM.ld+k], 1, &hMdiag[k], 1, queues[2] ); // implicit sync Q2 --> Mdiag(k) = M(k,k) // check M(k,k) == 0 if ( MAGMA_Z_EQUAL(hMdiag[k], MAGMA_Z_ZERO) ) { innerflag = 1; info = MAGMA_DIVERGENCE; break; } // beta = f(k) / M(k,k) hbeta[k] = hskp[4] / hMdiag[k]; // check for nan if ( magma_z_isnan( hbeta[k] ) || magma_z_isinf( hbeta[k] )) { innerflag = 1; info = MAGMA_DIVERGENCE; break; } // non-last s iteration if ( (k + 1) < s ) { // f(k+1:s) = f(k+1:s) - beta * M(k+1:s,k) // Q1 magma_zaxpy( sk-1, -hbeta[k], &dM.dval[k*dM.ld+(k+1)], 1, &df.dval[k+1], 1, queues[1] ); // c(k+1:s) = f(k+1:s) // Q1 magma_zcopyvector_async( sk-1, &df.dval[k+1], 1, &dc.dval[k+1], 1, queues[1] ); // c(k+1:s) = M(k+1:s,k+1:s) \ f(k+1:s) // Q1 magma_ztrsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, sk-1, &dM.dval[(k+1)*dM.ld+(k+1)], dM.ld, &dc.dval[k+1], 1, queues[1] ); // skp[4] = f(k+1) // Q1 magma_zgetvector_async( 1, &df.dval[k+1], 1, &hskp[4], 1, queues[1] ); } // r = r - beta * G(:,k) // Q2 magma_zaxpy( dr.num_rows, -hbeta[k], dGcol.dval, 1, dr.dval, 1, queues[2] ); // smoothing disabled if ( smoothing <= 0 ) { // |r| // Q2 nrmr = magma_dznrm2( dr.num_rows, dr.dval, 1, queues[2] ); // implicit sync Q2 --> |r| // v = r // Q1 magma_zcopyvector_async( dr.num_rows, dr.dval, 1, dv.dval, 1, queues[1] ); // smoothing enabled } else { // x = x + beta * U(:,k) // Q0 magma_zaxpy( x->num_rows, hbeta[k], &dU.dval[k*dU.ld], 1, x->dval, 1, queues[0] ); // smoothing operation //--------------------------------------- // t = rs - r // Q2 magma_zidr_smoothing_1( drs.num_rows, drs.num_cols, drs.dval, dr.dval, dtt.dval, queues[2] ); // t't // t'rs // Q2 CHECK( magma_zgemvmdot_shfl( dt.ld, 2, dtt.dval, dtt.dval, d1, d2, &dskp.dval[2], queues[2] )); // skp[2-3] = dskp[2-3] // Q2 magma_zgetvector( 2, &dskp.dval[2], 1, &hskp[2], 1, queues[2] ); // implicit sync Q2 --> skp = dskp // gamma = (t' * rs) / (t' * t) gamma = hskp[3] / hskp[2]; // xs = xs - gamma * (xs - x) // Q0 magma_zidr_smoothing_2( dxs.num_rows, dxs.num_cols, -gamma, x->dval, dxs.dval, queues[0] ); // v = r // Q1 magma_zcopyvector_async( dr.num_rows, dr.dval, 1, dv.dval, 1, queues[1] ); // rs = rs - gamma * t // Q2 magma_zaxpy( drs.num_rows, -gamma, dtt.dval, 1, drs.dval, 1, queues[2] ); // |rs| // Q2 nrmr = magma_dznrm2( drs.num_rows, drs.dval, 1, queues[2] ); // implicit sync Q2 --> |r| //--------------------------------------- } // store current timing and residual if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter) % solver_par->verbose == 0 ) { solver_par->res_vec[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)nrmr; solver_par->timing[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)tempo2 - tempo1; } } // check convergence or iteration limit if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { s = k + 1; // for the x-update outside the loop innerflag = 2; info = MAGMA_SUCCESS; break; } } // smoothing disabled if ( smoothing <= 0 && innerflag != 1 ) { // dbeta(1:s) = beta(1:s) // Q0 magma_zsetvector_async( s, hbeta, 1, dbeta.dval, 1, queues[0] ); // x = x + U(:,1:s) * beta(1:s) // Q0 magmablas_zgemv( MagmaNoTrans, dU.num_rows, s, c_one, dU.dval, 
dU.ld, dbeta.dval, 1, c_one, x->dval, 1, queues[0] ); } // check convergence or iteration limit or invalid result of inner loop if ( innerflag > 0 ) { break; } // preconditioning operation // v = L \ v; // v = U \ v; // Q2 CHECK( magma_z_applyprecond_left( MagmaNoTrans, A, dv, &dlu, precond_par, queues[2] )); CHECK( magma_z_applyprecond_right( MagmaNoTrans, A, dlu, &dv, precond_par, queues[2] )); // t = A v // Q2 CHECK( magma_z_spmv( c_one, A, dv, c_zero, dt, queues[2] )); solver_par->spmv_count++; // computation of a new omega //--------------------------------------- // t't // t'r // Q2 CHECK( magma_zgemvmdot_shfl( dt.ld, 2, dt.dval, dt.dval, d1, d2, dskp.dval, queues[2] )); // skp[0-2] = dskp[0-2] // Q2 magma_zgetvector( 2, dskp.dval, 1, hskp, 1, queues[2] ); // implicit sync Q2 --> skp = dskp // |t| nrmt = magma_dsqrt( MAGMA_Z_REAL(hskp[0]) ); // rho = abs((t' * r) / (|t| * |r|)) rho = MAGMA_D_ABS( MAGMA_Z_REAL(hskp[1]) / (nrmt * nrmr) ); // om = (t' * r) / (|t| * |t|) om = hskp[1] / hskp[0]; if ( rho < angle ) { om = (om * angle) / rho; } //--------------------------------------- if ( MAGMA_Z_EQUAL(om, MAGMA_Z_ZERO) ) { info = MAGMA_DIVERGENCE; break; } // sync Q1 --> v = r magma_queue_sync( queues[1] ); // r = r - om * t // Q2 magma_zaxpy( dr.num_rows, -om, dt.dval, 1, dr.dval, 1, queues[2] ); // x = x + om * v // Q0 magma_zaxpy( x->num_rows, om, dv.dval, 1, x->dval, 1, queues[0] ); // smoothing disabled if ( smoothing <= 0 ) { // |r| // Q2 nrmr = magma_dznrm2( dr.num_rows, dr.dval, 1, queues[2] ); // implicit sync Q2 --> |r| // v = r // Q1 magma_zcopyvector_async( dr.num_rows, dr.dval, 1, dv.dval, 1, queues[1] ); // smoothing enabled } else { // smoothing operation //--------------------------------------- // t = rs - r // Q2 magma_zidr_smoothing_1( drs.num_rows, drs.num_cols, drs.dval, dr.dval, dtt.dval, queues[2] ); // t't // t'rs // Q2 CHECK( magma_zgemvmdot_shfl( dt.ld, 2, dtt.dval, dtt.dval, d1, d2, &dskp.dval[2], queues[2] )); // skp[2-3] = dskp[2-3] // Q2 magma_zgetvector( 2, &dskp.dval[2], 1, &hskp[2], 1, queues[2] ); // implicit sync Q2 --> skp = dskp // gamma = (t' * rs) / (t' * t) gamma = hskp[3] / hskp[2]; // xs = xs - gamma * (xs - x) // Q0 magma_zidr_smoothing_2( dxs.num_rows, dxs.num_cols, -gamma, x->dval, dxs.dval, queues[0] ); // v = r // Q1 magma_zcopyvector_async( dr.num_rows, dr.dval, 1, dv.dval, 1, queues[1] ); // rs = rs - gamma * (rs - r) // Q2 magma_zaxpy( drs.num_rows, -gamma, dtt.dval, 1, drs.dval, 1, queues[2] ); // |rs| // Q2 nrmr = magma_dznrm2( drs.num_rows, drs.dval, 1, queues[2] ); // implicit sync Q2 --> |r| //--------------------------------------- } // store current timing and residual if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); magma_queue_sync( queue ); if ( (solver_par->numiter) % solver_par->verbose == 0 ) { solver_par->res_vec[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)nrmr; solver_par->timing[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)tempo2 - tempo1; } } // check convergence or iteration limit if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { info = MAGMA_SUCCESS; break; } } while ( solver_par->numiter + 1 <= solver_par->maxiter ); // sync all queues for ( q = 0; q < nqueues; q++ ) { magma_queue_sync( queues[q] ); } // smoothing enabled if ( smoothing > 0 ) { // x = xs magma_zcopyvector_async( x->num_rows, dxs.dval, 1, x->dval, 1, queue ); // r = rs magma_zcopyvector_async( dr.num_rows, drs.dval, 1, dr.dval, 1, queue ); } // get last iteration timing tempo2 = 
    magma_sync_wtime( queue );
    magma_queue_sync( queue );
    solver_par->runtime = (real_Double_t)tempo2 - tempo1;
//--------------STOP TIME----------------

    // get final stats
    solver_par->iter_res = nrmr;
    CHECK( magma_zresidualvec( A, b, *x, &dr, &residual, queue ));
    solver_par->final_res = residual;

    // set solver conclusion
    if ( info != MAGMA_SUCCESS && info != MAGMA_DIVERGENCE ) {
        if ( solver_par->init_res > solver_par->final_res ) {
            info = MAGMA_SLOW_CONVERGENCE;
        }
    }

cleanup:
    // free resources
    // sync all queues, destroy additional queues
    magma_queue_sync( queues[0] );
    for ( q = 1; q < nqueues; q++ ) {
        magma_queue_sync( queues[q] );
        magma_queue_destroy( queues[q] );
    }

    // smoothing enabled
    if ( smoothing > 0 ) {
        drs.dval = NULL;    // needed because its pointer is redirected to dtt
        magma_zmfree( &dxs, queue );
        magma_zmfree( &drs, queue );
        magma_zmfree( &dtt, queue );
    }
    dr.dval = NULL;         // needed because its pointer is redirected to dt
    dGcol.dval = NULL;      // needed because its pointer is redirected to dG
    magma_zmfree( &dr, queue );
    magma_zmfree( &dP, queue );
    magma_zmfree( &dP1, queue );
    magma_zmfree( &dG, queue );
    magma_zmfree( &dGcol, queue );
    magma_zmfree( &dU, queue );
    magma_zmfree( &dM, queue );
    magma_zmfree( &df, queue );
    magma_zmfree( &dt, queue );
    magma_zmfree( &dc, queue );
    magma_zmfree( &dv, queue );
    magma_zmfree( &dlu, queue );
    magma_zmfree( &dskp, queue );
    magma_zmfree( &dalpha, queue );
    magma_zmfree( &dbeta, queue );
    magma_free_pinned( hMdiag );
    magma_free_pinned( hskp );
    magma_free_pinned( halpha );
    magma_free_pinned( hbeta );
    magma_free( d1 );
    magma_free( d2 );

    solver_par->info = info;
    return info;
    /* magma_zpidr_strms */
}
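/*
   Illustrative sketch (not part of MAGMA): the "computation of a new omega" block in
   magma_zpidr_strms uses the usual IDR(s)/BiCGSTAB angle safeguard: om = (t'r)/(t't)
   is accepted as-is only if the angle between t and r is large enough (rho >= angle);
   otherwise om is enlarged to om*angle/rho. The helper below mirrors that logic in
   real arithmetic; the name and the suggested cutoff value are illustrative only.
*/
#include <cmath>

// Compute the IDR(s)/BiCGSTAB relaxation parameter omega = (t'r)/(t't),
// enlarged when the angle between t and r is too small.
static double omega_with_safeguard( const double* t, const double* r, int n,
                                    double angle /* e.g. 0.7 */ )
{
    double tt = 0.0, tr = 0.0, rr = 0.0;
    for ( int i = 0; i < n; ++i ) {
        tt += t[i] * t[i];
        tr += t[i] * r[i];
        rr += r[i] * r[i];
    }
    double om  = tr / tt;                                   // least-squares omega
    double rho = std::fabs( tr ) / ( std::sqrt( tt ) * std::sqrt( rr ) );
    if ( rho < angle ) {
        om = om * angle / rho;                              // enlarge omega
    }
    return om;
}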
extern "C" double magma_zlatrd_mgpu(magma_int_t num_gpus, char uplo, magma_int_t n0, magma_int_t n, magma_int_t nb, magma_int_t nb0, magmaDoubleComplex *a, magma_int_t lda, double *e, magmaDoubleComplex *tau, magmaDoubleComplex *w, magma_int_t ldw, magmaDoubleComplex **da, magma_int_t ldda, magma_int_t offset, magmaDoubleComplex **dw, magma_int_t lddw, magmaDoubleComplex *dwork[MagmaMaxGPUs], magma_int_t ldwork, magma_int_t k, magmaDoubleComplex *dx[MagmaMaxGPUs], magmaDoubleComplex *dy[MagmaMaxGPUs], magmaDoubleComplex *work, magma_queue_t stream[][10], double *times) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= ZLATRD reduces NB rows and columns of a complex Hermitian matrix A to Hermitian tridiagonal form by an orthogonal similarity transformation Q' * A * Q, and returns the matrices V and W which are needed to apply the transformation to the unreduced part of A. If UPLO = 'U', ZLATRD reduces the last NB rows and columns of a matrix, of which the upper triangle is supplied; if UPLO = 'L', ZLATRD reduces the first NB rows and columns of a matrix, of which the lower triangle is supplied. This is an auxiliary routine called by ZHETRD. Arguments ========= UPLO (input) CHARACTER*1 Specifies whether the upper or lower triangular part of the Hermitian matrix A is stored: = 'U': Upper triangular = 'L': Lower triangular N (input) INTEGER The order of the matrix A. NB (input) INTEGER The number of rows and columns to be reduced. A (input/output) COMPLEX_16 array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading n-by-n upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading n-by-n lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit: if UPLO = 'U', the last NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements above the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; if UPLO = 'L', the first NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements below the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. LDA (input) INTEGER The leading dimension of the array A. LDA >= (1,N). E (output) COMPLEX_16 array, dimension (N-1) If UPLO = 'U', E(n-nb:n-1) contains the superdiagonal elements of the last NB columns of the reduced matrix; if UPLO = 'L', E(1:nb) contains the subdiagonal elements of the first NB columns of the reduced matrix. TAU (output) COMPLEX_16 array, dimension (N-1) The scalar factors of the elementary reflectors, stored in TAU(n-nb:n-1) if UPLO = 'U', and in TAU(1:nb) if UPLO = 'L'. See Further Details. W (output) COMPLEX_16 array, dimension (LDW,NB) The n-by-nb matrix W required to update the unreduced part of A. LDW (input) INTEGER The leading dimension of the array W. LDW >= max(1,N). Further Details =============== If UPLO = 'U', the matrix Q is represented as a product of elementary reflectors Q = H(n) H(n-1) . . . H(n-nb+1). 
Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i), and tau in TAU(i-1). If UPLO = 'L', the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i), and tau in TAU(i). The elements of the vectors v together form the n-by-nb matrix V which is needed, with W, to apply the transformation to the unreduced part of the matrix, using a Hermitian rank-2k update of the form: A := A - V*W' - W*V'. The contents of A on exit are illustrated by the following examples with n = 5 and nb = 2: if UPLO = 'U': if UPLO = 'L': ( a a a v4 v5 ) ( d ) ( a a v4 v5 ) ( 1 d ) ( a 1 v5 ) ( v1 1 a ) ( d 1 ) ( v1 v2 a a ) ( d ) ( v1 v2 a a a ) where d denotes a diagonal element of the reduced matrix, a denotes an element of the original matrix that is unchanged, and vi denotes an element of the vector defining H(i). ===================================================================== */ char uplo_[2] = {uplo, 0}; double mv_time = 0.0; magma_int_t i; #ifndef MAGMABLAS_ZHEMV_MGPU magma_int_t loffset = nb0*((offset/nb0)/num_gpus); #endif magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex value = MAGMA_Z_ZERO; magma_int_t id, idw, i_one = 1; //magma_int_t kk; magma_int_t ione = 1; magma_int_t i_n, i_1, iw; magmaDoubleComplex alpha; magmaDoubleComplex *dx2[MagmaMaxGPUs]; magmaDoubleComplex *f = (magmaDoubleComplex *)malloc(n*sizeof(magmaDoubleComplex )); if (n <= 0) { return 0; } //#define PROFILE_SYMV #ifdef PROFILE_SYMV magma_event_t start, stop; float etime; magma_timestr_t cpu_start, cpu_end; magma_setdevice(0); magma_event_create( &start ); magma_event_create( &stop ); #endif if (lapackf77_lsame(uplo_, "U")) { /* Reduce last NB columns of upper triangle */ for (i = n-1; i >= n - nb ; --i) { i_1 = i + 1; i_n = n - i - 1; iw = i - n + nb; if (i < n-1) { /* Update A(1:i,i) */ magmaDoubleComplex wii = *W(i, iw+1); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i_one, &wii, &ldw); #endif wii = -wii; blasf77_zaxpy(&i_1, &wii, A(0, i+1), &i_one, A(0, i), &ione); wii = *A(i, i+1); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i_one, &wii, &ldw); #endif wii = -wii; blasf77_zaxpy(&i_1, &wii, W(0, iw+1), &i_one, A(0, i), &ione); } if (i > 0) { /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */ alpha = *A(i-1, i); lapackf77_zlarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]); e[i-1] = MAGMA_Z_REAL( alpha ); *A(i-1,i) = MAGMA_Z_MAKE( 1, 0 ); for( id=0; id<num_gpus; id++ ) { magma_setdevice(id); dx2[id] = dW1(id, 0, iw); magma_zsetvector_async( n, A(0,i), 1, dW1(id, 0, iw), 1, stream[id][0]); #ifndef MAGMABLAS_ZHEMV_MGPU magma_zsetvector_async( i, A(0,i), 1, dx[id], 1, stream[id][0] ); #endif } magmablas_zhemv_mgpu(num_gpus, k, 'U', i, nb0, c_one, da, ldda, 0, dx2, ione, c_zero, dy, ione, dwork, ldwork, work, W(0, iw), stream ); if (i < n-1) { blasf77_zgemv(MagmaConjTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); } /* overlap update */ if( i < n-1 && i-1 >= n - nb ) { magma_int_t im1_1 = i_1 - 1; magma_int_t im1 = i-1; /* Update A(1:i,i) */ #if defined(PRECISION_z) || defined(PRECISION_c) magma_int_t im1_n = i_n 
+ 1; lapackf77_zlacgv(&im1_n, W(im1, iw+1), &ldw); #endif blasf77_zgemv("No transpose", &im1_1, &i_n, &c_neg_one, A(0, i+1), &lda, W(im1, iw+1), &ldw, &c_one, A(0, i-1), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&im1_n, W(im1, iw+1), &ldw); lapackf77_zlacgv(&im1_n, A(im1, i +1), &lda); #endif blasf77_zgemv("No transpose", &im1_1, &i_n, &c_neg_one, W(0, iw+1), &ldw, A(im1, i+1), &lda, &c_one, A(0, i-1), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&im1_n, A(im1, i+1), &lda); #endif } // 3. Here is where we need it // TODO find the right place magmablas_zhemv_sync(num_gpus, k, i, work, W(0, iw), stream ); if (i < n-1) { blasf77_zgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); blasf77_zgemv(MagmaConjTransStr, &i, &i_n, &c_one, A(0, i+1), &lda, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); blasf77_zgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); } blasf77_zscal(&i, &tau[i - 1], W(0, iw), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_zdotc_sub( i, W(0,iw), ione, A(0,i), ione, &value ); #else value = cblas_zdotc( i, W(0,iw), ione, A(0,i), ione ); #endif alpha = tau[i - 1] * -.5f * value; blasf77_zaxpy(&i, &alpha, A(0, i), &ione, W(0, iw), &ione); for( id=0; id<num_gpus; id++ ) { magma_setdevice(id); if( k > 1 ) { magma_zsetvector_async( n, W(0,iw), 1, dW(id, 0, iw), 1, stream[id][1] ); } else { magma_zsetvector_async( n, W(0,iw), 1, dW(id, 0, iw), 1, stream[id][0] ); } } } } } else { /* Reduce first NB columns of lower triangle */ for (i = 0; i < nb; ++i) { /* Update A(i:n,i) */ i_n = n - i; idw = ((offset+i)/nb)%num_gpus; if( i > 0 ) { trace_cpu_start( 0, "gemv", "gemv" ); magmaDoubleComplex wii = *W(i, i-1); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i_one, &wii, &ldw); #endif wii = -wii; blasf77_zaxpy( &i_n, &wii, A(i, i-1), &ione, A(i, i), &ione); wii = *A(i, i-1); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i_one, &wii, &lda); #endif wii = -wii; blasf77_zaxpy( &i_n, &wii, W(i, i-1), &ione, A(i, i), &ione); } if (i < n-1) { /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */ i_n = n - i - 1; trace_cpu_start( 0, "larfg", "larfg" ); alpha = *A(i+1, i); #ifdef PROFILE_SYMV cpu_start = get_current_time(); #endif lapackf77_zlarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]); #ifdef PROFILE_SYMV cpu_end = get_current_time(); times[0] += GetTimerValue(cpu_start,cpu_end)/1000.0; #endif e[i] = MAGMA_Z_REAL( alpha ); *A(i+1,i) = MAGMA_Z_MAKE( 1, 0 ); trace_cpu_end( 0 ); /* Compute W(i+1:n,i) */ // 1. 
Send the block reflector A(i+1:n,i) to the GPU //trace_gpu_start( idw, 0, "comm", "comm1" ); #ifndef MAGMABLAS_ZHEMV_MGPU magma_setdevice(idw); magma_zsetvector( i_n, A(i+1,i), 1, dA(idw, i+1, i), 1 ); #endif for( id=0; id<num_gpus; id++ ) { magma_setdevice(id); trace_gpu_start( id, 0, "comm", "comm" ); #ifdef MAGMABLAS_ZHEMV_MGPU dx2[id] = dW1(id, 0, i)-offset; #else dx2[id] = dx[id]; magma_zsetvector( i_n, A(i+1,i), 1, dx[id], 1 ); #endif magma_zsetvector_async( n, A(0,i), 1, dW1(id, 0, i), 1, stream[id][0] ); trace_gpu_end( id, 0 ); } /* mat-vec on multiple GPUs */ #ifdef PROFILE_SYMV magma_setdevice(0); magma_event_record(start, stream[0][0]); #endif magmablas_zhemv_mgpu(num_gpus, k, 'L', i_n, nb0, c_one, da, ldda, offset+i+1, dx2, ione, c_zero, dy, ione, dwork, ldwork, work, W(i+1,i), stream ); #ifdef PROFILE_SYMV magma_setdevice(0); magma_event_record(stop, stream[0][0]); #endif trace_cpu_start( 0, "gemv", "gemv" ); blasf77_zgemv(MagmaConjTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw, A(i+1, i), &ione, &c_zero, W(0, i), &ione); blasf77_zgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda, W(0, i), &ione, &c_zero, f, &ione); blasf77_zgemv(MagmaConjTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda, A(i+1, i), &ione, &c_zero, W(0, i), &ione); trace_cpu_end( 0 ); /* overlap update */ if( i > 0 && i+1 < n ) { magma_int_t ip1 = i+1; trace_cpu_start( 0, "gemv", "gemv" ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i, W(ip1, 0), &ldw); #endif blasf77_zgemv("No transpose", &i_n, &i, &c_neg_one, A(ip1, 0), &lda, W(ip1, 0), &ldw, &c_one, A(ip1, ip1), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i, W(ip1, 0), &ldw); lapackf77_zlacgv(&i, A(ip1 ,0), &lda); #endif blasf77_zgemv("No transpose", &i_n, &i, &c_neg_one, W(ip1, 0), &ldw, A(ip1, 0), &lda, &c_one, A(ip1, ip1), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i, A(ip1, 0), &lda); #endif trace_cpu_end( 0 ); } /* synchronize */ magmablas_zhemv_sync(num_gpus, k, i_n, work, W(i+1,i), stream ); #ifdef PROFILE_SYMV cudaEventElapsedTime(&etime, start, stop); mv_time += (etime/1000.0); times[1+(i_n/(n0/10))] += (etime/1000.0); #endif trace_cpu_start( 0, "axpy", "axpy" ); if (i!=0) blasf77_zaxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione); blasf77_zgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw, W(0, i), &ione, &c_one, W(i+1, i), &ione); blasf77_zscal(&i_n, &tau[i], W(i+1,i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_zdotc_sub( i_n, W(i+1,i), ione, A(i+1,i), ione, &value ); #else value = cblas_zdotc( i_n, W(i+1,i), ione, A(i+1,i), ione ); #endif alpha = tau[i]* -.5f * value; blasf77_zaxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione); trace_cpu_end( 0 ); for( id=0; id<num_gpus; id++ ) { magma_setdevice(id); if( k > 1 ) { magma_zsetvector_async( n, W(0,i), 1, dW(id, 0, i), 1, stream[id][1] ); } else { magma_zsetvector_async( n, W(0,i), 1, dW(id, 0, i), 1, stream[id][0] ); } } } } } #ifdef PROFILE_SYMV magma_setdevice(0); magma_event_destory( start ); magma_event_destory( stop ); #endif for( id=0; id<num_gpus; id++ ) { magma_setdevice(id); if( k > 1) magma_queue_sync(stream[id][1]); } free(f); return mv_time; } /* zlatrd_ */
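/*
   Illustrative sketch (not part of MAGMA): both zlahr2 and zlatrd build their
   transformations from elementary Householder reflectors H(i) = I - tau*v*v'
   generated by (z)larfg, as described in the Further Details sections above.
   The real-arithmetic helpers below generate such a reflector (so that H*x is a
   multiple of e1) and apply it to a vector. LAPACK's larfg additionally handles
   complex tau and careful scaling, so this is only a reading aid; the function
   names are illustrative.
*/
#include <cmath>
#include <vector>

// Generate a real Householder reflector H = I - tau * v * v^T such that
// H * x = beta * e1, using the usual LAPACK-style convention. On return
// v[0] == 1; the input vector x is not modified.
static void householder_generate( const std::vector<double>& x,
                                  std::vector<double>& v,
                                  double& tau, double& beta )
{
    const int n = (int) x.size();
    v.assign( n, 0.0 );
    v[0] = 1.0;
    double xnorm2 = 0.0;
    for ( int i = 1; i < n; ++i ) xnorm2 += x[i] * x[i];
    double alpha = x[0];
    if ( xnorm2 == 0.0 ) {            // nothing to annihilate: H = I
        tau  = 0.0;
        beta = alpha;
        return;
    }
    beta = -std::copysign( std::sqrt( alpha*alpha + xnorm2 ), alpha );
    tau  = ( beta - alpha ) / beta;
    for ( int i = 1; i < n; ++i ) v[i] = x[i] / ( alpha - beta );
}

// Apply H = I - tau * v * v^T to a vector y in place.
static void householder_apply( const std::vector<double>& v, double tau,
                               std::vector<double>& y )
{
    double dot = 0.0;
    for ( size_t i = 0; i < v.size(); ++i ) dot += v[i] * y[i];
    for ( size_t i = 0; i < v.size(); ++i ) y[i] -= tau * dot * v[i];
}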