/** Purpose ------- ZLATRD2 reduces NB rows and columns of a complex Hermitian matrix A to Hermitian tridiagonal form by an orthogonal similarity transformation Q' * A * Q, and returns the matrices V and W which are needed to apply the transformation to the unreduced part of A. If UPLO = MagmaUpper, ZLATRD reduces the last NB rows and columns of a matrix, of which the upper triangle is supplied; if UPLO = MagmaLower, ZLATRD reduces the first NB rows and columns of a matrix, of which the lower triangle is supplied. This is an auxiliary routine called by ZHETRD2_GPU. It uses an accelerated HEMV that needs extra memory. Arguments --------- @param[in] uplo magma_uplo_t Specifies whether the upper or lower triangular part of the Hermitian matrix A is stored: - = MagmaUpper: Upper triangular - = MagmaLower: Lower triangular @param[in] n INTEGER The order of the matrix A. @param[in] nb INTEGER The number of rows and columns to be reduced. @param[in,out] A COMPLEX_16 array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading n-by-n upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading n-by-n lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit: - if UPLO = MagmaUpper, the last NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements above the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; - if UPLO = MagmaLower, the first NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements below the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= (1,N). @param[out] e COMPLEX_16 array, dimension (N-1) If UPLO = MagmaUpper, E(n-nb:n-1) contains the superdiagonal elements of the last NB columns of the reduced matrix; if UPLO = MagmaLower, E(1:nb) contains the subdiagonal elements of the first NB columns of the reduced matrix. @param[out] tau COMPLEX_16 array, dimension (N-1) The scalar factors of the elementary reflectors, stored in TAU(n-nb:n-1) if UPLO = MagmaUpper, and in TAU(1:nb) if UPLO = MagmaLower. See Further Details. @param[out] W COMPLEX_16 array, dimension (LDW,NB) The n-by-nb matrix W required to update the unreduced part of A. @param[in] ldw INTEGER The leading dimension of the array W. LDW >= max(1,N). Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n) H(n-1) . . . H(n-nb+1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i), and tau in TAU(i-1). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i), and tau in TAU(i). The elements of the vectors v together form the n-by-nb matrix V which is needed, with W, to apply the transformation to the unreduced part of the matrix, using a Hermitian rank-2k update of the form: A := A - V*W' - W*V'. The contents of A on exit are illustrated by the following examples with n = 5 and nb = 2: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( a a a v4 v5 ) ( d ) ( a a v4 v5 ) ( 1 d ) ( a 1 v5 ) ( v1 1 a ) ( d 1 ) ( v1 v2 a a ) ( d ) ( v1 v2 a a a ) where d denotes a diagonal element of the reduced matrix, a denotes an element of the original matrix that is unchanged, and vi denotes an element of the vector defining H(i). @ingroup magma_zheev_aux ********************************************************************/ extern "C" magma_int_t magma_zlatrd2(magma_uplo_t uplo, magma_int_t n, magma_int_t nb, magmaDoubleComplex *A, magma_int_t lda, double *e, magmaDoubleComplex *tau, magmaDoubleComplex *W, magma_int_t ldw, magmaDoubleComplex *dA, magma_int_t ldda, magmaDoubleComplex *dW, magma_int_t lddw, magmaDoubleComplex *dwork, magma_int_t ldwork) { #define A(i, j) (A + (j)*lda + (i)) #define W(i, j) (W + (j)*ldw + (i)) #define dA(i, j) (dA + (j)*ldda + (i)) #define dW(i, j) (dW + (j)*lddw + (i)) magma_int_t i; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex value = MAGMA_Z_ZERO; magma_int_t ione = 1; magma_int_t i_n, i_1, iw; magmaDoubleComplex alpha; magmaDoubleComplex *f; if (n <= 0) { return 0; } magma_queue_t stream; magma_queue_create( &stream ); magma_zmalloc_cpu( &f, n ); assert( f != NULL ); // TODO return error, or allocate outside zlatrd if (uplo == MagmaUpper) { /* Reduce last NB columns of upper triangle */ for (i = n-1; i >= n - nb; --i) { i_1 = i + 1; i_n = n - i - 1; iw = i - n + nb; if (i < n-1) { /* Update A(1:i,i) */ #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i_n, W(i, iw+1), &ldw); #endif blasf77_zgemv("No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda, W(i, iw+1), &ldw, &c_one, A(0, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i_n, W(i, iw+1), &ldw); lapackf77_zlacgv(&i_n, A(i, i+1), &ldw); #endif blasf77_zgemv("No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw, A(i, i+1), &lda, &c_one, A(0, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i_n, A(i, i+1), &ldw); #endif } if (i > 0) { /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */ alpha = *A(i-1, i); lapackf77_zlarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]); e[i-1] = MAGMA_Z_REAL( alpha ); *A(i-1,i) = MAGMA_Z_MAKE( 1, 0 ); /* Compute W(1:i-1,i) */ // 1. Send the block reflector A(0:n-i-1,i) to the GPU magma_zsetvector( i, A(0, i), 1, dA(0, i), 1 ); //#if (GPUSHMEM < 200) //magma_zhemv(MagmaUpper, i, c_one, dA(0, 0), ldda, // dA(0, i), ione, c_zero, dW(0, iw), ione); //#else magmablas_zhemv_work(MagmaUpper, i, c_one, dA(0, 0), ldda, dA(0, i), ione, c_zero, dW(0, iw), ione, dwork, ldwork); //#endif // 2. Start putting the result back (asynchronously) magma_zgetmatrix_async( i, 1, dW(0, iw), lddw, W(0, iw) /*test*/, ldw, stream ); if (i < n-1) { blasf77_zgemv(MagmaConjTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); } // 3. Here is where we need it // TODO find the right place magma_queue_sync( stream ); if (i < n-1) { blasf77_zgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); blasf77_zgemv(MagmaConjTransStr, &i, &i_n, &c_one, A(0, i+1), &lda, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); blasf77_zgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); } blasf77_zscal(&i, &tau[i - 1], W(0, iw), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_zdotc_sub( i, W(0,iw), ione, A(0,i), ione, &value ); #else value = cblas_zdotc( i, W(0,iw), ione, A(0,i), ione ); #endif alpha = tau[i - 1] * -0.5f * value; blasf77_zaxpy(&i, &alpha, A(0, i), &ione, W(0, iw), &ione); } } } else { /* Reduce first NB columns of lower triangle */ for (i = 0; i < nb; ++i) { /* Update A(i:n,i) */ i_n = n - i; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i, W(i, 0), &ldw); #endif blasf77_zgemv("No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda, W(i, 0), &ldw, &c_one, A(i, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i, W(i, 0), &ldw); lapackf77_zlacgv(&i, A(i, 0), &lda); #endif blasf77_zgemv("No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw, A(i, 0), &lda, &c_one, A(i, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i, A(i, 0), &lda); #endif if (i < n-1) { /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */ i_n = n - i - 1; alpha = *A(i+1, i); lapackf77_zlarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]); e[i] = MAGMA_Z_REAL( alpha ); *A(i+1,i) = MAGMA_Z_MAKE( 1, 0 ); /* Compute W(i+1:n,i) */ // 1. Send the block reflector A(i+1:n,i) to the GPU magma_zsetvector( i_n, A(i+1, i), 1, dA(i+1, i), 1 ); //#if (GPUSHMEM < 200) //magma_zhemv(MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero, // dW(i+1, i), ione); //#else magmablas_zhemv_work(MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero, dW(i+1, i), ione, dwork, ldwork); //#endif // 2. Start putting the result back (asynchronously) magma_zgetmatrix_async( i_n, 1, dW(i+1, i), lddw, W(i+1, i), ldw, stream ); blasf77_zgemv(MagmaConjTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw, A(i+1, i), &ione, &c_zero, W(0, i), &ione); blasf77_zgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda, W(0, i), &ione, &c_zero, f, &ione); blasf77_zgemv(MagmaConjTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda, A(i+1, i), &ione, &c_zero, W(0, i), &ione); // 3. Here is where we need it magma_queue_sync( stream ); if (i != 0) blasf77_zaxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione); blasf77_zgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw, W(0, i), &ione, &c_one, W(i+1, i), &ione); blasf77_zscal(&i_n, &tau[i], W(i+1,i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_zdotc_sub( i_n, W(i+1,i), ione, A(i+1,i), ione, &value ); #else value = cblas_zdotc( i_n, W(i+1,i), ione, A(i+1,i), ione ); #endif alpha = tau[i] * -0.5f * value; blasf77_zaxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione); } } } magma_free_cpu(f); magma_queue_destroy( stream ); return 0; } /* magma_zlatrd */
/** Purpose ------- ZLATRD reduces NB rows and columns of a complex Hermitian matrix A to Hermitian tridiagonal form by an orthogonal similarity transformation Q' * A * Q, and returns the matrices V and W which are needed to apply the transformation to the unreduced part of A. If UPLO = MagmaUpper, ZLATRD reduces the last NB rows and columns of a matrix, of which the upper triangle is supplied; if UPLO = MagmaLower, ZLATRD reduces the first NB rows and columns of a matrix, of which the lower triangle is supplied. This is an auxiliary routine called by ZHETRD. Arguments --------- @param[in] uplo magma_uplo_t Specifies whether the upper or lower triangular part of the Hermitian matrix A is stored: - = MagmaUpper: Upper triangular - = MagmaLower: Lower triangular @param[in] n INTEGER The order of the matrix A. @param[in] nb INTEGER The number of rows and columns to be reduced. @param[in,out] A COMPLEX_16 array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading n-by-n upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading n-by-n lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit: - if UPLO = MagmaUpper, the last NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements above the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; - if UPLO = MagmaLower, the first NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements below the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= (1,N). @param[out] e COMPLEX_16 array, dimension (N-1) If UPLO = MagmaUpper, E(n-nb:n-1) contains the superdiagonal elements of the last NB columns of the reduced matrix; if UPLO = MagmaLower, E(1:nb) contains the subdiagonal elements of the first NB columns of the reduced matrix. @param[out] tau COMPLEX_16 array, dimension (N-1) The scalar factors of the elementary reflectors, stored in TAU(n-nb:n-1) if UPLO = MagmaUpper, and in TAU(1:nb) if UPLO = MagmaLower. See Further Details. @param[out] W COMPLEX_16 array, dimension (LDW,NB) The n-by-nb matrix W required to update the unreduced part of A. @param[in] ldw INTEGER The leading dimension of the array W. LDW >= max(1,N). @param dA TODO: dimension (ldda, n)? @param ldda TODO: ldda >= n? @param dW TODO: dimension (lddw, ??) @param lddw TODO: lddw >= n ?? @param[in] queue magma_queue_t Queue to execute in. Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n) H(n-1) . . . H(n-nb+1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i), and tau in TAU(i-1). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i), and tau in TAU(i). The elements of the vectors v together form the n-by-nb matrix V which is needed, with W, to apply the transformation to the unreduced part of the matrix, using a Hermitian rank-2k update of the form: A := A - V*W' - W*V'. The contents of A on exit are illustrated by the following examples with n = 5 and nb = 2: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( a a a v4 v5 ) ( d ) ( a a v4 v5 ) ( 1 d ) ( a 1 v5 ) ( v1 1 a ) ( d 1 ) ( v1 v2 a a ) ( d ) ( v1 v2 a a a ) where d denotes a diagonal element of the reduced matrix, a denotes an element of the original matrix that is unchanged, and vi denotes an element of the vector defining H(i). @ingroup magma_zheev_aux ********************************************************************/ extern "C" magma_int_t magma_zlatrd( magma_uplo_t uplo, magma_int_t n, magma_int_t nb, magmaDoubleComplex *A, magma_int_t lda, double *e, magmaDoubleComplex *tau, magmaDoubleComplex *W, magma_int_t ldw, magmaDoubleComplex *work, magma_int_t lwork, magmaDoubleComplex_ptr dA, magma_int_t ldda, magmaDoubleComplex_ptr dW, magma_int_t lddw, magma_queue_t queue ) { #define A(i_, j_) (A + (i_) + (j_)*lda) #define W(i_, j_) (W + (i_) + (j_)*ldw) #define dA(i_, j_) (dA + (i_) + (j_)*ldda) #define dW(i_, j_) (dW + (i_) + (j_)*lddw) /* Constants */ const magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; const magmaDoubleComplex c_one = MAGMA_Z_ONE; const magmaDoubleComplex c_zero = MAGMA_Z_ZERO; const magma_int_t ione = 1; /* Local variables */ magmaDoubleComplex alpha, value; magma_int_t i, i_n, i_1, iw; /* Check arguments */ magma_int_t info = 0; if ( uplo != MagmaLower && uplo != MagmaUpper ) { info = -1; } else if ( n < 0 ) { info = -2; } else if ( nb < 1 ) { info = -3; } else if ( lda < max(1,n) ) { info = -5; } else if ( ldw < max(1,n) ) { info = -9; } else if ( ldda < max(1,n) ) { info = -11; } else if ( lddw < max(1,n) ) { info = -13; } if (info != 0) { magma_xerbla( __func__, -(info) ); return info; } /* Quick return if possible */ if (n == 0) { return info; } if (uplo == MagmaUpper) { /* Reduce last NB columns of upper triangle */ for (i = n-1; i >= n - nb; --i) { i_1 = i + 1; i_n = n - i - 1; iw = i - n + nb; if (i < n-1) { /* Update A(1:i,i) */ #ifdef COMPLEX lapackf77_zlacgv( &i_n, W(i, iw+1), &ldw ); #endif blasf77_zgemv( "No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda, W(i, iw+1), &ldw, &c_one, A(0, i), &ione ); #ifdef COMPLEX lapackf77_zlacgv( &i_n, W(i, iw+1), &ldw ); lapackf77_zlacgv( &i_n, A(i, i+1), &lda ); #endif blasf77_zgemv( "No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw, A(i, i+1), &lda, &c_one, A(0, i), &ione ); #ifdef COMPLEX lapackf77_zlacgv( &i_n, A(i, i+1), &lda ); #endif } if (i > 0) { /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */ alpha = *A(i-1, i); lapackf77_zlarfg( &i, &alpha, A(0, i), &ione, &tau[i - 1] ); e[i-1] = MAGMA_Z_REAL( alpha ); *A(i-1,i) = MAGMA_Z_ONE; /* Compute W(1:i-1,i) */ // 1. Send the block reflector A(0:n-i-1,i) to the GPU magma_zsetvector( i, A(0, i), 1, dA(0, i), 1, queue ); magma_zhemv( MagmaUpper, i, c_one, dA(0, 0), ldda, dA(0, i), ione, c_zero, dW(0, iw), ione, queue ); // 2. Start putting the result back (asynchronously) magma_zgetmatrix_async( i, 1, dW(0, iw), lddw, W(0, iw), ldw, queue ); if (i < n-1) { blasf77_zgemv( MagmaConjTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw, A(0, i), &ione, &c_zero, W(i+1, iw), &ione ); } // 3. Here is where we need it // TODO find the right place magma_queue_sync( queue ); if (i < n-1) { blasf77_zgemv( "No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda, W(i+1, iw), &ione, &c_one, W(0, iw), &ione ); blasf77_zgemv( MagmaConjTransStr, &i, &i_n, &c_one, A(0, i+1), &lda, A(0, i), &ione, &c_zero, W(i+1, iw), &ione ); blasf77_zgemv( "No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw, W(i+1, iw), &ione, &c_one, W(0, iw), &ione ); } blasf77_zscal( &i, &tau[i - 1], W(0, iw), &ione ); value = magma_cblas_zdotc( i, W(0,iw), ione, A(0,i), ione ); alpha = tau[i - 1] * -0.5f * value; blasf77_zaxpy( &i, &alpha, A(0, i), &ione, W(0, iw), &ione ); } } } else { /* Reduce first NB columns of lower triangle */ for (i = 0; i < nb; ++i) { /* Update A(i:n,i) */ i_n = n - i; #ifdef COMPLEX lapackf77_zlacgv( &i, W(i, 0), &ldw ); #endif blasf77_zgemv( "No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda, W(i, 0), &ldw, &c_one, A(i, i), &ione ); #ifdef COMPLEX lapackf77_zlacgv( &i, W(i, 0), &ldw ); lapackf77_zlacgv( &i, A(i, 0), &lda ); #endif blasf77_zgemv( "No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw, A(i, 0), &lda, &c_one, A(i, i), &ione ); #ifdef COMPLEX lapackf77_zlacgv( &i, A(i, 0), &lda ); #endif if (i < n-1) { /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */ i_n = n - i - 1; alpha = *A(i+1, i); lapackf77_zlarfg( &i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i] ); e[i] = MAGMA_Z_REAL( alpha ); *A(i+1,i) = MAGMA_Z_ONE; /* Compute W(i+1:n,i) */ // 1. Send the block reflector A(i+1:n,i) to the GPU magma_zsetvector( i_n, A(i+1, i), 1, dA(i+1, i), 1, queue ); magma_zhemv( MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero, dW(i+1, i), ione, queue ); // 2. Start putting the result back (asynchronously) magma_zgetmatrix_async( i_n, 1, dW(i+1, i), lddw, W(i+1, i), ldw, queue ); blasf77_zgemv( MagmaConjTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw, A(i+1, i), &ione, &c_zero, W(0, i), &ione ); blasf77_zgemv( "No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda, W(0, i), &ione, &c_zero, work, &ione ); blasf77_zgemv( MagmaConjTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda, A(i+1, i), &ione, &c_zero, W(0, i), &ione ); // 3. Here is where we need it magma_queue_sync( queue ); if (i != 0) blasf77_zaxpy( &i_n, &c_one, work, &ione, W(i+1, i), &ione ); blasf77_zgemv( "No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw, W(0, i), &ione, &c_one, W(i+1, i), &ione ); blasf77_zscal( &i_n, &tau[i], W(i+1,i), &ione ); value = magma_cblas_zdotc( i_n, W(i+1,i), ione, A(i+1,i), ione ); alpha = tau[i] * -0.5f * value; blasf77_zaxpy( &i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione ); } } } return info; } /* magma_zlatrd */
int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, t1, t2; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_trans_t trans[] = { MagmaNoTrans, MagmaConjTrans, MagmaTrans }; magma_uplo_t uplo [] = { MagmaLower, MagmaUpper }; magma_diag_t diag [] = { MagmaUnit, MagmaNonUnit }; magma_side_t side [] = { MagmaLeft, MagmaRight }; magmaDoubleComplex *A, *B, *C, *C2, *LU; magmaDoubleComplex *dA, *dB, *dC1, *dC2; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 0.5, 0.1 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( 0.7, 0.2 ); double dalpha = 0.6; double dbeta = 0.8; double work[1], error, total_error; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t m, n, k, size, maxn, ld, info; magma_int_t *piv; magma_int_t err; magma_opts opts; parse_opts( argc, argv, &opts ); printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" ); total_error = 0.; for( int itest = 0; itest < opts.ntest; ++itest ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; printf("=========================================================================\n"); printf( "m=%d, n=%d, k=%d\n", (int) m, (int) n, (int) k ); // allocate matrices // over-allocate so they can be any combination of {m,n,k} x {m,n,k}. maxn = max( max( m, n ), k ); ld = max( 1, maxn ); size = ld*maxn; err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) ); assert( err == 0 ); err = magma_zmalloc_pinned( &A, size ); assert( err == 0 ); err = magma_zmalloc_pinned( &B, size ); assert( err == 0 ); err = magma_zmalloc_pinned( &C, size ); assert( err == 0 ); err = magma_zmalloc_pinned( &C2, size ); assert( err == 0 ); err = magma_zmalloc_pinned( &LU, size ); assert( err == 0 ); err = magma_zmalloc( &dA, size ); assert( err == 0 ); err = magma_zmalloc( &dB, size ); assert( err == 0 ); err = magma_zmalloc( &dC1, size ); assert( err == 0 ); err = magma_zmalloc( &dC2, size ); assert( err == 0 ); // initialize matrices size = maxn*maxn; lapackf77_zlarnv( &ione, ISEED, &size, A ); lapackf77_zlarnv( &ione, ISEED, &size, B ); lapackf77_zlarnv( &ione, ISEED, &size, C ); printf( "========== Level 1 BLAS ==========\n" ); // ----- test ZSWAP // swap columns 2 and 3 of dA, then copy to C2 and compare with A if ( n >= 3 ) { magma_zsetmatrix( m, n, A, ld, dA, ld ); magma_zsetmatrix( m, n, A, ld, dB, ld ); magma_zswap( m, dA(0,1), 1, dA(0,2), 1 ); magma_zswap( m, dB(0,1), 1, dB(0,2), 1 ); // check results, storing diff between magma and cuda calls in C2 cublasZaxpy( handle, ld*n, &c_neg_one, dA, 1, dB, 1 ); magma_zgetmatrix( m, n, dB, ld, C2, ld ); error = lapackf77_zlange( "F", &m, &k, C2, &ld, work ); total_error += error; printf( "zswap diff %.2g\n", error ); } else { printf( "zswap skipped for n < 3\n" ); } // ----- test IZAMAX // get argmax of column of A magma_zsetmatrix( m, k, A, ld, dA, ld ); error = 0; for( int j = 0; j < k; ++j ) { magma_int_t i1 = magma_izamax( m, dA(0,j), 1 ); int i2; // NOT magma_int_t, for cublas cublasIzamax( handle, m, dA(0,j), 1, &i2 ); // todo need sync here? assert( i1 == i2 ); error += abs( i1 - i2 ); } total_error += error; gflops = (double)m * k / 1e9; printf( "izamax diff %.2g\n", error ); printf( "\n" ); printf( "========== Level 2 BLAS ==========\n" ); // ----- test ZGEMV // c = alpha*A*b + beta*c, with A m*n; b,c m or n-vectors // try no-trans/trans for( int ia = 0; ia < 3; ++ia ) { magma_zsetmatrix( m, n, A, ld, dA, ld ); magma_zsetvector( maxn, B, 1, dB, 1 ); magma_zsetvector( maxn, C, 1, dC1, 1 ); magma_zsetvector( maxn, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_zgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZgemv( handle, cublas_trans_const(trans[ia]), m, n, &alpha, dA, ld, dB, 1, &beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 size = (trans[ia] == MagmaNoTrans ? m : n); cublasZaxpy( handle, size, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetvector( size, dC2, 1, C2, 1 ); error = lapackf77_zlange( "F", &size, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_ZGEMV( m, n ) / 1e9; printf( "zgemv( %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_trans_const(trans[ia]), error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test ZHEMV // c = alpha*A*b + beta*c, with A m*m symmetric; b,c m-vectors // try upper/lower for( int iu = 0; iu < 2; ++iu ) { magma_zsetmatrix( m, m, A, ld, dA, ld ); magma_zsetvector( m, B, 1, dB, 1 ); magma_zsetvector( m, C, 1, dC1, 1 ); magma_zsetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_zhemv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZhemv( handle, cublas_uplo_const(uplo[iu]), m, &alpha, dA, ld, dB, 1, &beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( handle, m, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_zlange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_ZHEMV( m ) / 1e9; printf( "zhemv( %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test ZTRSV // solve A*c = c, with A m*m triangular; c m-vector // try upper/lower, no-trans/trans, unit/non-unit diag // Factor A into LU to get well-conditioned triangles, else solve yields garbage. // Still can give garbage if solves aren't consistent with LU factors, // e.g., using unit diag for U, so copy lower triangle to upper triangle. // Also used for trsm later. lapackf77_zlacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld ); lapackf77_zgetrf( &maxn, &maxn, LU, &ld, piv, &info ); for( int j = 0; j < maxn; ++j ) { for( int i = 0; i < j; ++i ) { *LU(i,j) = *LU(j,i); } } for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { magma_zsetmatrix( m, m, LU, ld, dA, ld ); magma_zsetvector( m, C, 1, dC1, 1 ); magma_zsetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_ztrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZtrsv( handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), cublas_diag_const(diag[id]), m, dA, ld, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( handle, m, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_zlange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_ZTRSM( MagmaLeft, m, 1 ) / 1e9; printf( "ztrsv( %c, %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), lapacke_diag_const(diag[id]), error, gflops/t1, gflops/t2 ); }}} printf( "\n" ); printf( "========== Level 3 BLAS ==========\n" ); // ----- test ZGEMM // C = alpha*A*B + beta*C, with A m*k or k*m; B k*n or n*k; C m*n // try combinations of no-trans/trans for( int ia = 0; ia < 3; ++ia ) { for( int ib = 0; ib < 3; ++ib ) { bool nta = (trans[ia] == MagmaNoTrans); bool ntb = (trans[ib] == MagmaNoTrans); magma_zsetmatrix( (nta ? m : k), (nta ? m : k), A, ld, dA, ld ); magma_zsetmatrix( (ntb ? k : n), (ntb ? n : k), B, ld, dB, ld ); magma_zsetmatrix( m, n, C, ld, dC1, ld ); magma_zsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_zgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZgemm( handle, cublas_trans_const(trans[ia]), cublas_trans_const(trans[ib]), m, n, k, &alpha, dA, ld, dB, ld, &beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_zlange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_ZGEMM( m, n, k ) / 1e9; printf( "zgemm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_trans_const(trans[ia]), lapacke_trans_const(trans[ib]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test ZHEMM // C = alpha*A*B + beta*C (left) with A m*m symmetric; B,C m*n; or // C = alpha*B*A + beta*C (right) with A n*n symmetric; B,C m*n // try left/right, upper/lower for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { magma_zsetmatrix( m, m, A, ld, dA, ld ); magma_zsetmatrix( m, n, B, ld, dB, ld ); magma_zsetmatrix( m, n, C, ld, dC1, ld ); magma_zsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_zhemm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZhemm( handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]), m, n, &alpha, dA, ld, dB, ld, &beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_zlange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_ZHEMM( side[is], m, n ) / 1e9; printf( "zhemm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_side_const(side[is]), lapacke_uplo_const(uplo[iu]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test ZHERK // C = alpha*A*A^H + beta*C (no-trans) with A m*k and C m*m symmetric; or // C = alpha*A^H*A + beta*C (trans) with A k*m and C m*m symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { magma_zsetmatrix( n, k, A, ld, dA, ld ); magma_zsetmatrix( n, n, C, ld, dC1, ld ); magma_zsetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_zherk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZherk( handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), n, k, &dalpha, dA, ld, &dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_zlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_ZHERK( k, n ) / 1e9; printf( "zherk( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test ZHER2K // C = alpha*A*B^H + ^alpha*B*A^H + beta*C (no-trans) with A,B n*k; C n*n symmetric; or // C = alpha*A^H*B + ^alpha*B^H*A + beta*C (trans) with A,B k*n; C n*n symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { bool nt = (trans[it] == MagmaNoTrans); magma_zsetmatrix( (nt ? n : k), (nt ? n : k), A, ld, dA, ld ); magma_zsetmatrix( n, n, C, ld, dC1, ld ); magma_zsetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_zher2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZher2k( handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), n, k, &alpha, dA, ld, dB, ld, &dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_zlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_ZHER2K( k, n ) / 1e9; printf( "zher2k( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test ZTRMM // C = alpha*A*C (left) with A m*m triangular; C m*n; or // C = alpha*C*A (right) with A n*n triangular; C m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == MagmaLeft); magma_zsetmatrix( (left ? m : n), (left ? m : n), A, ld, dA, ld ); magma_zsetmatrix( m, n, C, ld, dC1, ld ); magma_zsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_ztrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; // note cublas does trmm out-of-place (i.e., adds output matrix C), // but allows C=B to do in-place. t2 = magma_sync_wtime( 0 ); cublasZtrmm( handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), cublas_diag_const(diag[id]), m, n, &alpha, dA, ld, dC2, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_zlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_ZTRMM( side[is], m, n ) / 1e9; printf( "ztrmm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // ----- test ZTRSM // solve A*X = alpha*B (left) with A m*m triangular; B m*n; or // solve X*A = alpha*B (right) with A n*n triangular; B m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == MagmaLeft); magma_zsetmatrix( (left ? m : n), (left ? m : n), LU, ld, dA, ld ); magma_zsetmatrix( m, n, C, ld, dC1, ld ); magma_zsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_ztrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZtrsm( handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), cublas_diag_const(diag[id]), m, n, &alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_zlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_ZTRSM( side[is], m, n ) / 1e9; printf( "ztrsm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // cleanup magma_free_cpu( piv ); magma_free_pinned( A ); magma_free_pinned( B ); magma_free_pinned( C ); magma_free_pinned( C2 ); magma_free_pinned( LU ); magma_free( dA ); magma_free( dB ); magma_free( dC1 ); magma_free( dC2 ); fflush( stdout ); } if ( total_error != 0. ) { printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n", total_error ); } else { printf( "all tests passed\n" ); } TESTING_FINALIZE(); int status = (total_error != 0.); return status; }
extern "C" magma_int_t magma_zpcg_merge( magma_z_matrix A, magma_z_matrix b, magma_z_matrix *x, magma_z_solver_par *solver_par, magma_z_preconditioner *precond_par, magma_queue_t queue ) { magma_int_t info = MAGMA_NOTCONVERGED; // prepare solver feedback solver_par->solver = Magma_PCGMERGE; solver_par->numiter = 0; solver_par->spmv_count = 0; // solver variables magmaDoubleComplex alpha, beta, gamma, rho, tmp1, *skp_h={0}; double nom, nom0, r0, res, nomb; magmaDoubleComplex den; // some useful variables magmaDoubleComplex c_zero = MAGMA_Z_ZERO, c_one = MAGMA_Z_ONE; magma_int_t dofs = A.num_rows*b.num_cols; magma_z_matrix r={Magma_CSR}, d={Magma_CSR}, z={Magma_CSR}, h={Magma_CSR}, rt={Magma_CSR}; magmaDoubleComplex *d1=NULL, *d2=NULL, *skp=NULL; // GPU workspace CHECK( magma_zvinit( &r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &d, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &z, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &rt, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &h, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zmalloc( &d1, dofs*(2) )); CHECK( magma_zmalloc( &d2, dofs*(2) )); // array for the parameters CHECK( magma_zmalloc( &skp, 7 )); // skp = [alpha|beta|gamma|rho|tmp1|tmp2|res] // solver setup CHECK( magma_zresidualvec( A, b, *x, &r, &nom0, queue)); // preconditioner CHECK( magma_z_applyprecond_left( MagmaNoTrans, A, r, &rt, precond_par, queue )); CHECK( magma_z_applyprecond_right( MagmaNoTrans, A, rt, &h, precond_par, queue )); magma_zcopy( dofs, h.dval, 1, d.dval, 1, queue ); nom = MAGMA_Z_ABS( magma_zdotc( dofs, r.dval, 1, h.dval, 1, queue )); CHECK( magma_z_spmv( c_one, A, d, c_zero, z, queue )); // z = A d den = magma_zdotc( dofs, d.dval, 1, z.dval, 1, queue ); // den = d'* z solver_par->init_res = nom0; nomb = magma_dznrm2( dofs, b.dval, 1, queue ); if ( nomb == 0.0 ){ nomb=1.0; } if ( (r0 = nomb * solver_par->rtol) < ATOLERANCE ){ r0 = ATOLERANCE; } solver_par->final_res = solver_par->init_res; solver_par->iter_res = solver_par->init_res; if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = (real_Double_t)nom0; solver_par->timing[0] = 0.0; } if ( nom < r0 ) { info = MAGMA_SUCCESS; goto cleanup; } // check positive definite if ( MAGMA_Z_ABS(den) <= 0.0 ) { info = MAGMA_NONSPD; goto cleanup; } // array on host for the parameters CHECK( magma_zmalloc_cpu( &skp_h, 7 )); alpha = rho = gamma = tmp1 = c_one; beta = magma_zdotc( dofs, h.dval, 1, r.dval, 1, queue ); skp_h[0]=alpha; skp_h[1]=beta; skp_h[2]=gamma; skp_h[3]=rho; skp_h[4]=tmp1; skp_h[5]=MAGMA_Z_MAKE(nom, 0.0); skp_h[6]=MAGMA_Z_MAKE(nom, 0.0); magma_zsetvector( 7, skp_h, 1, skp, 1, queue ); //Chronometry real_Double_t tempo1, tempo2, tempop1, tempop2; tempo1 = magma_sync_wtime( queue ); solver_par->numiter = 0; solver_par->spmv_count = 0; // start iteration do { solver_par->numiter++; // computes SpMV and dot product CHECK( magma_zcgmerge_spmv1( A, d1, d2, d.dval, z.dval, skp, queue )); solver_par->spmv_count++; if( precond_par->solver == Magma_JACOBI ){ CHECK( magma_zjcgmerge_xrbeta( dofs, d1, d2, precond_par->d.dval, x->dval, r.dval, d.dval, z.dval, h.dval, skp, queue )); } else if( precond_par->solver == Magma_NONE ){ // updates x, r CHECK( magma_zpcgmerge_xrbeta1( dofs, x->dval, r.dval, d.dval, z.dval, skp, queue )); // computes scalars and updates d CHECK( magma_zpcgmerge_xrbeta2( dofs, d1, d2, r.dval, r.dval, d.dval, skp, queue )); } else { // updates x, r CHECK( magma_zpcgmerge_xrbeta1( dofs, x->dval, r.dval, d.dval, z.dval, skp, queue )); // preconditioner in between tempop1 = magma_sync_wtime( queue ); CHECK( magma_z_applyprecond_left( MagmaNoTrans, A, r, &rt, precond_par, queue )); CHECK( magma_z_applyprecond_right( MagmaNoTrans, A, rt, &h, precond_par, queue )); // magma_zcopy( dofs, r.dval, 1, h.dval, 1 ); tempop2 = magma_sync_wtime( queue ); precond_par->runtime += tempop2-tempop1; // computes scalars and updates d CHECK( magma_zpcgmerge_xrbeta2( dofs, d1, d2, h.dval, r.dval, d.dval, skp, queue )); } //if( solver_par->numiter==1){ // magma_zcopy( dofs, h.dval, 1, d.dval, 1 ); //} // updates x, r, computes scalars and updates d //CHECK( magma_zcgmerge_xrbeta( dofs, d1, d2, x->dval, r.dval, d.dval, z.dval, skp, queue )); // check stopping criterion (asynchronous copy) magma_zgetvector( 1 , skp+6, 1, skp_h+6, 1, queue ); res = sqrt(MAGMA_Z_ABS(skp_h[6])); if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) res; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( res/nomb <= solver_par->rtol || res <= solver_par->atol ){ break; } } while ( solver_par->numiter+1 <= solver_par->maxiter ); tempo2 = magma_sync_wtime( queue ); solver_par->runtime = (real_Double_t) tempo2-tempo1; double residual; CHECK( magma_zresidualvec( A, b, *x, &r, &residual, queue)); solver_par->iter_res = res; solver_par->final_res = residual; if ( solver_par->numiter < solver_par->maxiter ) { info = MAGMA_SUCCESS; } else if ( solver_par->init_res > solver_par->final_res ) { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) res; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } info = MAGMA_SLOW_CONVERGENCE; if( solver_par->iter_res < solver_par->atol || solver_par->iter_res/solver_par->init_res < solver_par->rtol ){ info = MAGMA_SUCCESS; } } else { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) res; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = MAGMA_DIVERGENCE; } cleanup: magma_zmfree(&r, queue ); magma_zmfree(&z, queue ); magma_zmfree(&d, queue ); magma_zmfree(&rt, queue ); magma_zmfree(&h, queue ); magma_free( d1 ); magma_free( d2 ); magma_free( skp ); magma_free_cpu( skp_h ); solver_par->info = info; return info; } /* magma_zpcg_merge */
extern "C" magma_int_t magma_zpidr_merge( magma_z_matrix A, magma_z_matrix b, magma_z_matrix *x, magma_z_solver_par *solver_par, magma_z_preconditioner *precond_par, magma_queue_t queue ) { magma_int_t info = MAGMA_NOTCONVERGED; // prepare solver feedback solver_par->solver = Magma_PIDRMERGE; solver_par->numiter = 0; solver_par->spmv_count = 0; solver_par->init_res = 0.0; solver_par->final_res = 0.0; solver_par->iter_res = 0.0; solver_par->runtime = 0.0; // constants const magmaDoubleComplex c_zero = MAGMA_Z_ZERO; const magmaDoubleComplex c_one = MAGMA_Z_ONE; const magmaDoubleComplex c_n_one = MAGMA_Z_NEG_ONE; // internal user parameters const magma_int_t smoothing = 1; // 0 = disable, 1 = enable const double angle = 0.7; // [0-1] // local variables magma_int_t iseed[4] = {0, 0, 0, 1}; magma_int_t dof; magma_int_t s; magma_int_t distr; magma_int_t k, i, sk; magma_int_t innerflag; magma_int_t ldd; double residual; double nrm; double nrmb; double nrmr; double nrmt; double rho; magmaDoubleComplex om; magmaDoubleComplex gamma; magmaDoubleComplex fk; // matrices and vectors magma_z_matrix dxs = {Magma_CSR}; magma_z_matrix dr = {Magma_CSR}, drs = {Magma_CSR}; magma_z_matrix dP = {Magma_CSR}, dP1 = {Magma_CSR}; magma_z_matrix dG = {Magma_CSR}, dGcol = {Magma_CSR}; magma_z_matrix dU = {Magma_CSR}; magma_z_matrix dM = {Magma_CSR}, hMdiag = {Magma_CSR}; magma_z_matrix df = {Magma_CSR}; magma_z_matrix dt = {Magma_CSR}, dtt = {Magma_CSR}; magma_z_matrix dc = {Magma_CSR}; magma_z_matrix dv = {Magma_CSR}; magma_z_matrix dlu = {Magma_CSR}; magma_z_matrix dskp = {Magma_CSR}, hskp = {Magma_CSR}; magma_z_matrix dalpha = {Magma_CSR}, halpha = {Magma_CSR}; magma_z_matrix dbeta = {Magma_CSR}, hbeta = {Magma_CSR}; magmaDoubleComplex *d1 = NULL, *d2 = NULL; // chronometry real_Double_t tempo1, tempo2; // initial s space // TODO: add option for 's' (shadow space number) // Hack: uses '--restart' option as the shadow space number. // This is not a good idea because the default value of restart option is used to detect // if the user provided a custom restart. This means that if the default restart value // is changed then the code will think it was the user (unless the default value is // also updated in the 'if' statement below. s = 1; if ( solver_par->restart != 50 ) { if ( solver_par->restart > A.num_cols ) { s = A.num_cols; } else { s = solver_par->restart; } } solver_par->restart = s; // set max iterations solver_par->maxiter = min( 2 * A.num_cols, solver_par->maxiter ); // check if matrix A is square if ( A.num_rows != A.num_cols ) { //printf("Matrix A is not square.\n"); info = MAGMA_ERR_NOT_SUPPORTED; goto cleanup; } // |b| nrmb = magma_dznrm2( b.num_rows, b.dval, 1, queue ); if ( nrmb == 0.0 ) { magma_zscal( x->num_rows, MAGMA_Z_ZERO, x->dval, 1, queue ); info = MAGMA_SUCCESS; goto cleanup; } // t = 0 // make t twice as large to contain both, dt and dr ldd = magma_roundup( b.num_rows, 32 ); CHECK( magma_zvinit( &dt, Magma_DEV, ldd, 2, c_zero, queue )); dt.num_rows = b.num_rows; dt.num_cols = 1; dt.nnz = dt.num_rows; // redirect the dr.dval to the second part of dt CHECK( magma_zvinit( &dr, Magma_DEV, b.num_rows, 1, c_zero, queue )); magma_free( dr.dval ); dr.dval = dt.dval + ldd; // r = b - A x CHECK( magma_zresidualvec( A, b, *x, &dr, &nrmr, queue )); // |r| solver_par->init_res = nrmr; solver_par->final_res = solver_par->init_res; solver_par->iter_res = solver_par->init_res; if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = (real_Double_t)nrmr; } // check if initial is guess good enough if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { info = MAGMA_SUCCESS; goto cleanup; } // P = randn(n, s) // P = ortho(P) //--------------------------------------- // P = 0.0 CHECK( magma_zvinit( &dP, Magma_CPU, A.num_cols, s, c_zero, queue )); // P = randn(n, s) distr = 3; // 1 = unif (0,1), 2 = unif (-1,1), 3 = normal (0,1) dof = dP.num_rows * dP.num_cols; lapackf77_zlarnv( &distr, iseed, &dof, dP.val ); // transfer P to device CHECK( magma_zmtransfer( dP, &dP1, Magma_CPU, Magma_DEV, queue )); magma_zmfree( &dP, queue ); // P = ortho(P1) if ( dP1.num_cols > 1 ) { // P = magma_zqr(P1), QR factorization CHECK( magma_zqr( dP1.num_rows, dP1.num_cols, dP1, dP1.ld, &dP, NULL, queue )); } else { // P = P1 / |P1| nrm = magma_dznrm2( dof, dP1.dval, 1, queue ); nrm = 1.0 / nrm; magma_zdscal( dof, nrm, dP1.dval, 1, queue ); CHECK( magma_zmtransfer( dP1, &dP, Magma_DEV, Magma_DEV, queue )); } magma_zmfree( &dP1, queue ); //--------------------------------------- // allocate memory for the scalar products CHECK( magma_zvinit( &hskp, Magma_CPU, 4, 1, c_zero, queue )); CHECK( magma_zvinit( &dskp, Magma_DEV, 4, 1, c_zero, queue )); CHECK( magma_zvinit( &halpha, Magma_CPU, s, 1, c_zero, queue )); CHECK( magma_zvinit( &dalpha, Magma_DEV, s, 1, c_zero, queue )); CHECK( magma_zvinit( &hbeta, Magma_CPU, s, 1, c_zero, queue )); CHECK( magma_zvinit( &dbeta, Magma_DEV, s, 1, c_zero, queue )); // workspace for merged dot product CHECK( magma_zmalloc( &d1, max(2, s) * b.num_rows )); CHECK( magma_zmalloc( &d2, max(2, s) * b.num_rows )); // smoothing enabled if ( smoothing > 0 ) { // set smoothing solution vector CHECK( magma_zmtransfer( *x, &dxs, Magma_DEV, Magma_DEV, queue )); // tt = 0 // make tt twice as large to contain both, dtt and drs ldd = magma_roundup( b.num_rows, 32 ); CHECK( magma_zvinit( &dtt, Magma_DEV, ldd, 2, c_zero, queue )); dtt.num_rows = dr.num_rows; dtt.num_cols = 1; dtt.nnz = dtt.num_rows; // redirect the drs.dval to the second part of dtt CHECK( magma_zvinit( &drs, Magma_DEV, dr.num_rows, 1, c_zero, queue )); magma_free( drs.dval ); drs.dval = dtt.dval + ldd; // set smoothing residual vector magma_zcopyvector( dr.num_rows, dr.dval, 1, drs.dval, 1, queue ); } // G(n,s) = 0 if ( s > 1 ) { ldd = magma_roundup( A.num_rows, 32 ); CHECK( magma_zvinit( &dG, Magma_DEV, ldd, s, c_zero, queue )); dG.num_rows = A.num_rows; } else { CHECK( magma_zvinit( &dG, Magma_DEV, A.num_rows, s, c_zero, queue )); } // dGcol represents a single column of dG, array pointer is set inside loop CHECK( magma_zvinit( &dGcol, Magma_DEV, dG.num_rows, 1, c_zero, queue )); magma_free( dGcol.dval ); // U(n,s) = 0 if ( s > 1 ) { ldd = magma_roundup( A.num_cols, 32 ); CHECK( magma_zvinit( &dU, Magma_DEV, ldd, s, c_zero, queue )); dU.num_rows = A.num_cols; } else { CHECK( magma_zvinit( &dU, Magma_DEV, A.num_cols, s, c_zero, queue )); } // M(s,s) = I CHECK( magma_zvinit( &dM, Magma_DEV, s, s, c_zero, queue )); CHECK( magma_zvinit( &hMdiag, Magma_CPU, s, 1, c_zero, queue )); magmablas_zlaset( MagmaFull, dM.num_rows, dM.num_cols, c_zero, c_one, dM.dval, dM.ld, queue ); // f = 0 CHECK( magma_zvinit( &df, Magma_DEV, dP.num_cols, 1, c_zero, queue )); // c = 0 CHECK( magma_zvinit( &dc, Magma_DEV, dM.num_cols, 1, c_zero, queue )); // v = 0 CHECK( magma_zvinit( &dv, Magma_DEV, dr.num_rows, 1, c_zero, queue )); // lu = 0 CHECK( magma_zvinit( &dlu, Magma_DEV, dr.num_rows, 1, c_zero, queue )); //--------------START TIME--------------- // chronometry tempo1 = magma_sync_wtime( queue ); if ( solver_par->verbose > 0 ) { solver_par->timing[0] = 0.0; } om = MAGMA_Z_ONE; innerflag = 0; // start iteration do { solver_par->numiter++; // new RHS for small systems // f = P' r magma_zgemvmdot_shfl( dP.num_rows, dP.num_cols, dP.dval, dr.dval, d1, d2, df.dval, queue ); // shadow space loop for ( k = 0; k < s; ++k ) { sk = s - k; // c(k:s) = M(k:s,k:s) \ f(k:s) magma_zcopyvector( sk, &df.dval[k], 1, &dc.dval[k], 1, queue ); magma_ztrsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, sk, &dM.dval[k*dM.ld+k], dM.ld, &dc.dval[k], 1, queue ); // v = r - G(:,k:s) c(k:s) magma_zcopyvector( dr.num_rows, dr.dval, 1, dv.dval, 1, queue ); magmablas_zgemv( MagmaNoTrans, dG.num_rows, sk, c_n_one, &dG.dval[k*dG.ld], dG.ld, &dc.dval[k], 1, c_one, dv.dval, 1, queue ); // preconditioning operation // v = L \ v; // v = U \ v; CHECK( magma_z_applyprecond_left( MagmaNoTrans, A, dv, &dlu, precond_par, queue )); CHECK( magma_z_applyprecond_right( MagmaNoTrans, A, dlu, &dv, precond_par, queue )); // U(:,k) = om * v + U(:,k:s) c(k:s) magmablas_zgemv( MagmaNoTrans, dU.num_rows, sk, c_one, &dU.dval[k*dU.ld], dU.ld, &dc.dval[k], 1, om, dv.dval, 1, queue ); magma_zcopyvector( dU.num_rows, dv.dval, 1, &dU.dval[k*dU.ld], 1, queue ); // G(:,k) = A U(:,k) dGcol.dval = dG.dval + k * dG.ld; CHECK( magma_z_spmv( c_one, A, dv, c_zero, dGcol, queue )); solver_par->spmv_count++; // bi-orthogonalize the new basis vectors for ( i = 0; i < k; ++i ) { // alpha = P(:,i)' G(:,k) halpha.val[i] = magma_zdotc( dP.num_rows, &dP.dval[i*dP.ld], 1, &dG.dval[k*dG.ld], 1, queue ); // alpha = alpha / M(i,i) halpha.val[i] = halpha.val[i] / hMdiag.val[i]; // G(:,k) = G(:,k) - alpha * G(:,i) magma_zaxpy( dG.num_rows, -halpha.val[i], &dG.dval[i*dG.ld], 1, &dG.dval[k*dG.ld], 1, queue ); } // non-first s iteration if ( k > 0 ) { // U update outside of loop using GEMV // U(:,k) = U(:,k) - U(:,1:k) * alpha(1:k) magma_zsetvector( k, halpha.val, 1, dalpha.dval, 1, queue ); magmablas_zgemv( MagmaNoTrans, dU.num_rows, k, c_n_one, dU.dval, dU.ld, dalpha.dval, 1, c_one, &dU.dval[k*dU.ld], 1, queue ); } // new column of M = P'G, first k-1 entries are zero // M(k:s,k) = P(:,k:s)' G(:,k) magma_zgemvmdot_shfl( dP.num_rows, sk, &dP.dval[k*dP.ld], &dG.dval[k*dG.ld], d1, d2, &dM.dval[k*dM.ld+k], queue ); magma_zgetvector( 1, &dM.dval[k*dM.ld+k], 1, &hMdiag.val[k], 1, queue ); // check M(k,k) == 0 if ( MAGMA_Z_EQUAL(hMdiag.val[k], MAGMA_Z_ZERO) ) { innerflag = 1; info = MAGMA_DIVERGENCE; break; } // beta = f(k) / M(k,k) magma_zgetvector( 1, &df.dval[k], 1, &fk, 1, queue ); hbeta.val[k] = fk / hMdiag.val[k]; // check for nan if ( magma_z_isnan( hbeta.val[k] ) || magma_z_isinf( hbeta.val[k] )) { innerflag = 1; info = MAGMA_DIVERGENCE; break; } // r = r - beta * G(:,k) magma_zaxpy( dr.num_rows, -hbeta.val[k], &dG.dval[k*dG.ld], 1, dr.dval, 1, queue ); // smoothing disabled if ( smoothing <= 0 ) { // |r| nrmr = magma_dznrm2( dr.num_rows, dr.dval, 1, queue ); // smoothing enabled } else { // x = x + beta * U(:,k) magma_zaxpy( x->num_rows, hbeta.val[k], &dU.dval[k*dU.ld], 1, x->dval, 1, queue ); // smoothing operation //--------------------------------------- // t = rs - r magma_zidr_smoothing_1( drs.num_rows, drs.num_cols, drs.dval, dr.dval, dtt.dval, queue ); // t't // t'rs CHECK( magma_zgemvmdot_shfl( dt.ld, 2, dtt.dval, dtt.dval, d1, d2, &dskp.dval[2], queue )); magma_zgetvector( 2, &dskp.dval[2], 1, &hskp.val[2], 1, queue ); // gamma = (t' * rs) / (t' * t) gamma = hskp.val[3] / hskp.val[2]; // rs = rs - gamma * (rs - r) magma_zaxpy( drs.num_rows, -gamma, dtt.dval, 1, drs.dval, 1, queue ); // xs = xs - gamma * (xs - x) magma_zidr_smoothing_2( dxs.num_rows, dxs.num_cols, -gamma, x->dval, dxs.dval, queue ); // |rs| nrmr = magma_dznrm2( drs.num_rows, drs.dval, 1, queue ); //--------------------------------------- } // store current timing and residual if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter) % solver_par->verbose == 0 ) { solver_par->res_vec[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)nrmr; solver_par->timing[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)tempo2 - tempo1; } } // check convergence or iteration limit if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { s = k + 1; // for the x-update outside the loop innerflag = 2; info = MAGMA_SUCCESS; break; } // non-last s iteration if ( (k + 1) < s ) { // f(k+1:s) = f(k+1:s) - beta * M(k+1:s,k) magma_zaxpy( sk-1, -hbeta.val[k], &dM.dval[k*dM.ld+(k+1)], 1, &df.dval[k+1], 1, queue ); } } // smoothing disabled if ( smoothing <= 0 && innerflag != 1 ) { // update solution approximation x // x = x + U(:,1:s) * beta(1:s) magma_zsetvector( s, hbeta.val, 1, dbeta.dval, 1, queue ); magmablas_zgemv( MagmaNoTrans, dU.num_rows, s, c_one, dU.dval, dU.ld, dbeta.dval, 1, c_one, x->dval, 1, queue ); } // check convergence or iteration limit or invalid result of inner loop if ( innerflag > 0 ) { break; } // v = r magma_zcopy( dr.num_rows, dr.dval, 1, dv.dval, 1, queue ); // preconditioning operation // v = L \ v; // v = U \ v; CHECK( magma_z_applyprecond_left( MagmaNoTrans, A, dv, &dlu, precond_par, queue )); CHECK( magma_z_applyprecond_right( MagmaNoTrans, A, dlu, &dv, precond_par, queue )); // t = A v CHECK( magma_z_spmv( c_one, A, dv, c_zero, dt, queue )); solver_par->spmv_count++; // computation of a new omega //--------------------------------------- // t't // t'r CHECK( magma_zgemvmdot_shfl( dt.ld, 2, dt.dval, dt.dval, d1, d2, dskp.dval, queue )); magma_zgetvector( 2, dskp.dval, 1, hskp.val, 1, queue ); // |t| nrmt = magma_dsqrt( MAGMA_Z_REAL(hskp.val[0]) ); // rho = abs((t' * r) / (|t| * |r|)) rho = MAGMA_D_ABS( MAGMA_Z_REAL(hskp.val[1]) / (nrmt * nrmr) ); // om = (t' * r) / (|t| * |t|) om = hskp.val[1] / hskp.val[0]; if ( rho < angle ) { om = (om * angle) / rho; } //--------------------------------------- if ( MAGMA_Z_EQUAL(om, MAGMA_Z_ZERO) ) { info = MAGMA_DIVERGENCE; break; } // update approximation vector // x = x + om * v magma_zaxpy( x->num_rows, om, dv.dval, 1, x->dval, 1, queue ); // update residual vector // r = r - om * t magma_zaxpy( dr.num_rows, -om, dt.dval, 1, dr.dval, 1, queue ); // smoothing disabled if ( smoothing <= 0 ) { // residual norm nrmr = magma_dznrm2( dr.num_rows, dr.dval, 1, queue ); // smoothing enabled } else { // smoothing operation //--------------------------------------- // t = rs - r magma_zidr_smoothing_1( drs.num_rows, drs.num_cols, drs.dval, dr.dval, dtt.dval, queue ); // t't // t'rs CHECK( magma_zgemvmdot_shfl( dt.ld, 2, dtt.dval, dtt.dval, d1, d2, &dskp.dval[2], queue )); magma_zgetvector( 2, &dskp.dval[2], 1, &hskp.val[2], 1, queue ); // gamma = (t' * rs) / (t' * t) gamma = hskp.val[3] / hskp.val[2]; // rs = rs - gamma * (rs - r) magma_zaxpy( drs.num_rows, -gamma, dtt.dval, 1, drs.dval, 1, queue ); // xs = xs - gamma * (xs - x) magma_zidr_smoothing_2( dxs.num_rows, dxs.num_cols, -gamma, x->dval, dxs.dval, queue ); // |rs| nrmr = magma_dznrm2( drs.num_rows, drs.dval, 1, queue ); //--------------------------------------- } // store current timing and residual if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter) % solver_par->verbose == 0 ) { solver_par->res_vec[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)nrmr; solver_par->timing[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)tempo2 - tempo1; } } // check convergence if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { info = MAGMA_SUCCESS; break; } } while ( solver_par->numiter + 1 <= solver_par->maxiter ); // smoothing enabled if ( smoothing > 0 ) { // x = xs magma_zcopyvector( x->num_rows, dxs.dval, 1, x->dval, 1, queue ); // r = rs magma_zcopyvector( dr.num_rows, drs.dval, 1, dr.dval, 1, queue ); } // get last iteration timing tempo2 = magma_sync_wtime( queue ); solver_par->runtime = (real_Double_t)tempo2 - tempo1; //--------------STOP TIME---------------- // get final stats solver_par->iter_res = nrmr; CHECK( magma_zresidualvec( A, b, *x, &dr, &residual, queue )); solver_par->final_res = residual; // set solver conclusion if ( info != MAGMA_SUCCESS && info != MAGMA_DIVERGENCE ) { if ( solver_par->init_res > solver_par->final_res ) { info = MAGMA_SLOW_CONVERGENCE; } } cleanup: // free resources // smoothing enabled if ( smoothing > 0 ) { drs.dval = NULL; // needed because its pointer is redirected to dtt magma_zmfree( &dxs, queue ); magma_zmfree( &drs, queue ); magma_zmfree( &dtt, queue ); } dr.dval = NULL; // needed because its pointer is redirected to dt dGcol.dval = NULL; // needed because its pointer is redirected to dG magma_zmfree( &dr, queue ); magma_zmfree( &dP, queue ); magma_zmfree( &dP1, queue ); magma_zmfree( &dG, queue ); magma_zmfree( &dGcol, queue ); magma_zmfree( &dU, queue ); magma_zmfree( &dM, queue ); magma_zmfree( &hMdiag, queue ); magma_zmfree( &df, queue ); magma_zmfree( &dt, queue ); magma_zmfree( &dc, queue ); magma_zmfree( &dv, queue ); magma_zmfree( &dlu, queue ); magma_zmfree( &dskp, queue ); magma_zmfree( &dalpha, queue ); magma_zmfree( &dbeta, queue ); magma_zmfree( &hskp, queue ); magma_zmfree( &halpha, queue ); magma_zmfree( &hbeta, queue ); magma_free( d1 ); magma_free( d2 ); solver_par->info = info; return info; /* magma_zpidr_merge */ }
/* //////////////////////////////////////////////////////////////////////////// -- Testing ztrmv */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cublas_perf, cublas_time, cpu_perf, cpu_time; double cublas_error, Cnorm, work[1]; magma_int_t N; magma_int_t Ak; magma_int_t sizeA; magma_int_t lda, ldda; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaDoubleComplex *h_A, *h_x, *h_xcublas; magmaDoubleComplex *d_A, *d_x; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_opts opts; parse_opts( argc, argv, &opts ); printf("If running lapack (option --lapack), MAGMA and CUBLAS error are both computed\n" "relative to CPU BLAS result. Else, MAGMA error is computed relative to CUBLAS result.\n\n" "uplo = %c, transA = %c, diag = %c \n", opts.uplo, opts.transA, opts.diag ); printf(" N CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n"); printf("==================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; gflops = FLOPS_ZTRMM(opts.side, N, 1) / 1e9; lda = N; Ak = N; ldda = ((lda+31)/32)*32; sizeA = lda*Ak; TESTING_MALLOC( h_A, magmaDoubleComplex, lda*Ak ); TESTING_MALLOC( h_x, magmaDoubleComplex, N ); TESTING_MALLOC( h_xcublas, magmaDoubleComplex, N ); TESTING_DEVALLOC( d_A, magmaDoubleComplex, ldda*Ak ); TESTING_DEVALLOC( d_x, magmaDoubleComplex, N ); /* Initialize the matrices */ lapackf77_zlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_zlarnv( &ione, ISEED, &N, h_x ); /* ===================================================================== Performs operation using CUDA-BLAS =================================================================== */ magma_zsetmatrix( Ak, Ak, h_A, lda, d_A, ldda ); magma_zsetvector( N, h_x, 1, d_x, 1 ); cublas_time = magma_sync_wtime( NULL ); cublasZtrmv( opts.uplo, opts.transA, opts.diag, N, d_A, ldda, d_x, 1 ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_zgetvector( N, d_x, 1, h_xcublas, 1 ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_ztrmv( &opts.uplo, &opts.transA, &opts.diag, &N, h_A, &lda, h_x, &ione ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma & cublas, relative to lapack, // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_zlange( "M", &N, &ione, h_x, &N, work ); blasf77_zaxpy( &N, &c_neg_one, h_x, &ione, h_xcublas, &ione ); cublas_error = lapackf77_zlange( "M", &N, &ione, h_xcublas, &N, work ) / Cnorm; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e\n", (int) N, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, cublas_error ); } else { printf("%5d %7.2f (%7.2f) --- ( --- ) --- ---\n", (int) N, cublas_perf, 1000.*cublas_time); } TESTING_FREE( h_A ); TESTING_FREE( h_x ); TESTING_FREE( h_xcublas ); TESTING_DEVFREE( d_A ); TESTING_DEVFREE( d_x ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
/** @deprecated Purpose ------- ZLAQPS computes a step of QR factorization with column pivoting of a complex M-by-N matrix A by using Blas-3. It tries to factorize NB columns from A starting from the row OFFSET+1, and updates all of the matrix with Blas-3 xGEMM. In some cases, due to catastrophic cancellations, it cannot factorize NB columns. Hence, the actual number of factorized columns is returned in KB. Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0 @param[in] offset INTEGER The number of rows of A that have been factorized in previous steps. @param[in] nb INTEGER The number of columns to factorize. @param[out] kb INTEGER The number of columns actually factorized. @param[in,out] dA COMPLEX_16 array, dimension (LDDA,N), on the GPU. On entry, the M-by-N matrix A. On exit, block A(OFFSET+1:M,1:KB) is the triangular factor obtained and block A(1:OFFSET,1:N) has been accordingly pivoted, but no factorized. The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has been updated. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,M). @param[in,out] jpvt INTEGER array, dimension (N) JPVT(I) = K <==> Column K of the full matrix A has been permuted into position I in AP. @param[out] tau COMPLEX_16 array, dimension (KB) The scalar factors of the elementary reflectors. @param[in,out] vn1 DOUBLE PRECISION array, dimension (N) The vector with the partial column norms. @param[in,out] vn2 DOUBLE PRECISION array, dimension (N) The vector with the exact column norms. @param[in,out] dauxv COMPLEX_16 array, dimension (NB), on the GPU Auxiliary vector. @param[in,out] dF COMPLEX_16 array, dimension (LDDF,NB), on the GPU Matrix F' = L*Y'*A. @param[in] lddf INTEGER The leading dimension of the array F. LDDF >= max(1,N). @ingroup magma_zgeqp3_aux ********************************************************************/ extern "C" magma_int_t magma_zlaqps_gpu( magma_int_t m, magma_int_t n, magma_int_t offset, magma_int_t nb, magma_int_t *kb, magmaDoubleComplex_ptr dA, magma_int_t ldda, magma_int_t *jpvt, magmaDoubleComplex *tau, double *vn1, double *vn2, magmaDoubleComplex_ptr dauxv, magmaDoubleComplex_ptr dF, magma_int_t lddf) { #define dA(i, j) (dA + (i) + (j)*(ldda)) #define dF(i, j) (dF + (i) + (j)*(lddf)) magmaDoubleComplex c_zero = MAGMA_Z_MAKE( 0.,0.); magmaDoubleComplex c_one = MAGMA_Z_MAKE( 1.,0.); magmaDoubleComplex c_neg_one = MAGMA_Z_MAKE(-1.,0.); magma_int_t ione = 1; magma_int_t i__1, i__2; magmaDoubleComplex z__1; magma_int_t k, rk; magmaDoubleComplex_ptr dAks; magmaDoubleComplex tauk = MAGMA_Z_ZERO; magma_int_t pvt; double tol3z; magma_int_t itemp; double lsticc; magmaDouble_ptr dlsticcs; magma_dmalloc( &dlsticcs, 1+256*(n+255)/256 ); tol3z = magma_dsqrt( lapackf77_dlamch("Epsilon")); lsticc = 0; k = 0; magma_zmalloc( &dAks, nb ); magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); while( k < nb && lsticc == 0 ) { rk = offset + k; /* Determine ith pivot column and swap if necessary */ // subtract 1 from Fortran/CUBLAS idamax; pvt, k are 0-based. pvt = k + magma_idamax( n-k, &vn1[k], ione, queue ) - 1; if (pvt != k) { /* F gets swapped so F must be sent at the end to GPU */ i__1 = k; magmablas_zswap( m, dA(0, pvt), ione, dA(0, k), ione, queue ); magmablas_zswap( i__1, dF(pvt, 0), lddf, dF(k, 0), lddf, queue ); itemp = jpvt[pvt]; jpvt[pvt] = jpvt[k]; jpvt[k] = itemp; magma_dswap( 2, &vn1[pvt], n+offset, &vn1[k], n+offset, queue ); } /* Apply previous Householder reflectors to column K: A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'. Optimization: multiply with beta=0; wait for vector and subtract */ if (k > 0) { //#define RIGHT_UPDATE #ifdef RIGHT_UPDATE i__1 = m - offset - nb; i__2 = k; magma_zgemv( MagmaNoTrans, i__1, i__2, c_neg_one, A(offset+nb, 0), lda, F(k, 0), ldf, c_one, A(offset+nb, k), ione, queue ); #else i__1 = m - rk; i__2 = k; magma_zgemv( MagmaNoTrans, i__1, i__2, c_neg_one, dA(rk, 0), ldda, dF(k, 0), lddf, c_one, dA(rk, k), ione, queue ); #endif } /* Generate elementary reflector H(k). */ magma_zlarfg_gpu( m-rk, dA(rk, k), dA(rk + 1, k), &tau[k], &vn1[k], &dAks[k], queue ); /* needed to avoid the race condition */ if (k == 0) magma_zsetvector( 1, &c_one, 1, dA(rk, k), 1, queue ); else magma_zcopymatrix( 1, 1, dA(offset, 0), 1, dA(rk, k), 1, queue ); /* Compute Kth column of F: Compute F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */ if (k < n-1 || k > 0) magma_zgetvector( 1, &tau[k], 1, &tauk, 1, queue ); if (k < n-1) { i__1 = m - rk; i__2 = n - k - 1; /* Multiply on GPU */ magma_zgemv( MagmaConjTrans, m-rk, n-k-1, tauk, dA( rk, k+1 ), ldda, dA( rk, k ), 1, c_zero, dF( k+1, k ), 1, queue ); } /* Incremental updating of F: F(1:N,K) := F(1:N,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K). F(1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K) := tau(K)(A(RK:M,K+1:N)' - F(1:N,1:K-1)*A(RK:M,1:K-1)') A(RK:M,K) so, F is (updated A)*V */ if (k > 0) { z__1 = MAGMA_Z_NEGATE( tauk ); #ifdef RIGHT_UPDATE i__1 = m - offset - nb; i__2 = k; magma_zgemv( MagmaConjTrans, i__1, i__2, z__1, dA(offset+nb, 0), lda, dA(offset+nb, k), ione, c_zero, dauxv, ione, queue ); i__1 = k; magma_zgemv( MagmaNoTrans, n-k-1, i__1, c_one, F(k+1,0), ldf, dauxv, ione, c_one, F(k+1,k), ione, queue ); #else i__1 = m - rk; i__2 = k; magma_zgemv( MagmaConjTrans, i__1, i__2, z__1, dA(rk, 0), ldda, dA(rk, k), ione, c_zero, dauxv, ione, queue ); /* I think we only need stricly lower-triangular part :) */ magma_zgemv( MagmaNoTrans, n-k-1, i__2, c_one, dF(k+1,0), lddf, dauxv, ione, c_one, dF(k+1,k), ione, queue ); #endif } /* Optimization: On the last iteration start sending F back to the GPU */ /* Update the current row of A: A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'. */ if (k < n-1) { i__1 = n - k - 1; i__2 = k + 1; #ifdef RIGHT_UPDATE /* right-looking update of rows, */ magma_zgemm( MagmaNoTrans, MagmaConjTrans, nb-k, i__1, ione, c_neg_one, dA(rk, k ), ldda, dF(k+1, k ), lddf, c_one, dA(rk, k+1), ldda, queue ); #else /* left-looking update of rows, * * since F=A'v with original A, so no right-looking */ magma_zgemm( MagmaNoTrans, MagmaConjTrans, ione, i__1, i__2, c_neg_one, dA(rk, 0 ), ldda, dF(k+1,0 ), lddf, c_one, dA(rk, k+1), ldda, queue ); #endif } /* Update partial column norms. */ if (rk < min(m, n+offset)-1 ) { magmablas_dznrm2_row_check_adjust( n-k-1, tol3z, &vn1[k+1], &vn2[k+1], dA(rk,k+1), ldda, dlsticcs, queue ); //magma_device_sync(); magma_dgetvector( 1, &dlsticcs[0], 1, &lsticc, 1, queue ); } ++k; } magma_zcopymatrix( 1, k, dAks, 1, dA(offset, 0), ldda+1, queue ); // leave k as the last column done --k; *kb = k + 1; rk = offset + *kb - 1; /* Apply the block reflector to the rest of the matrix: A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) - A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)' */ if (*kb < min(n, m - offset)) { i__1 = m - rk - 1; i__2 = n - *kb; magma_zgemm( MagmaNoTrans, MagmaConjTrans, i__1, i__2, *kb, c_neg_one, dA(rk+1, 0 ), ldda, dF(*kb, 0 ), lddf, c_one, dA(rk+1, *kb), ldda, queue ); } /* Recomputation of difficult columns. */ if ( lsticc > 0 ) { // printf( " -- recompute dnorms --\n" ); magmablas_dznrm2_check( m-rk-1, n-*kb, dA(rk+1,*kb), ldda, &vn1[*kb], dlsticcs, queue ); magma_dcopymatrix( n-*kb, 1, &vn1[*kb], *kb, &vn2[*kb], *kb, queue ); } magma_free( dAks ); magma_free( dlsticcs ); magma_queue_destroy( queue ); return MAGMA_SUCCESS; } /* magma_zlaqps */
extern "C" magma_int_t magma_zlabrd_gpu( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *a, magma_int_t lda, magmaDoubleComplex_ptr da, size_t da_offset, magma_int_t ldda, double *d, double *e, magmaDoubleComplex *tauq, magmaDoubleComplex *taup, magmaDoubleComplex *x, magma_int_t ldx, magmaDoubleComplex_ptr dx, size_t dx_offset, magma_int_t lddx, magmaDoubleComplex *y, magma_int_t ldy, magmaDoubleComplex_ptr dy, size_t dy_offset, magma_int_t lddy, magma_queue_t queue ) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= ZLABRD reduces the first NB rows and columns of a complex general m by n matrix A to upper or lower bidiagonal form by an orthogonal transformation Q' * A * P, and returns the matrices X and Y which are needed to apply the transformation to the unreduced part of A. If m >= n, A is reduced to upper bidiagonal form; if m < n, to lower bidiagonal form. This is an auxiliary routine called by SGEBRD Arguments ========= M (input) INTEGER The number of rows in the matrix A. N (input) INTEGER The number of columns in the matrix A. NB (input) INTEGER The number of leading rows and columns of A to be reduced. A (input/output) COMPLEX_16 array, dimension (LDA,N) On entry, the m by n general matrix to be reduced. On exit, the first NB rows and columns of the matrix are overwritten; the rest of the array is unchanged. If m >= n, elements on and below the diagonal in the first NB columns, with the array TAUQ, represent the orthogonal matrix Q as a product of elementary reflectors; and elements above the diagonal in the first NB rows, with the array TAUP, represent the orthogonal matrix P as a product of elementary reflectors. If m < n, elements below the diagonal in the first NB columns, with the array TAUQ, represent the orthogonal matrix Q as a product of elementary reflectors, and elements on and above the diagonal in the first NB rows, with the array TAUP, represent the orthogonal matrix P as a product of elementary reflectors. See Further Details. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). D (output) COMPLEX_16 array, dimension (NB) The diagonal elements of the first NB rows and columns of the reduced matrix. D(i) = A(i,i). E (output) COMPLEX_16 array, dimension (NB) The off-diagonal elements of the first NB rows and columns of the reduced matrix. TAUQ (output) COMPLEX_16 array dimension (NB) The scalar factors of the elementary reflectors which represent the orthogonal matrix Q. See Further Details. TAUP (output) COMPLEX_16 array, dimension (NB) The scalar factors of the elementary reflectors which represent the orthogonal matrix P. See Further Details. X (output) COMPLEX_16 array, dimension (LDX,NB) The m-by-nb matrix X required to update the unreduced part of A. LDX (input) INTEGER The leading dimension of the array X. LDX >= M. Y (output) COMPLEX_16 array, dimension (LDY,NB) The n-by-nb matrix Y required to update the unreduced part of A. LDY (input) INTEGER The leading dimension of the array Y. LDY >= N. Further Details =============== The matrices Q and P are represented as products of elementary reflectors: Q = H(1) H(2) . . . H(nb) and P = G(1) G(2) . . . G(nb) Each H(i) and G(i) has the form: H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' where tauq and taup are complex scalars, and v and u are complex vectors. If m >= n, v(1:i-1) = 0, v(i) = 1, and v(i:m) is stored on exit in A(i:m,i); u(1:i) = 0, u(i+1) = 1, and u(i+1:n) is stored on exit in A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i). If m < n, v(1:i) = 0, v(i+1) = 1, and v(i+1:m) is stored on exit in A(i+2:m,i); u(1:i-1) = 0, u(i) = 1, and u(i:n) is stored on exit in A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i). The elements of the vectors v and u together form the m-by-nb matrix V and the nb-by-n matrix U' which are needed, with X and Y, to apply the transformation to the unreduced part of the matrix, using a block update of the form: A := A - V*Y' - X*U'. The contents of A on exit are illustrated by the following examples with nb = 2: m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n): ( 1 1 u1 u1 u1 ) ( 1 u1 u1 u1 u1 u1 ) ( v1 1 1 u2 u2 ) ( 1 1 u2 u2 u2 u2 ) ( v1 v2 a a a ) ( v1 1 a a a a ) ( v1 v2 a a a ) ( v1 v2 a a a a ) ( v1 v2 a a a ) ( v1 v2 a a a a ) ( v1 v2 a a a ) where a denotes an element of the original matrix which is unchanged, vi denotes an element of the vector defining H(i), and ui an element of the vector defining G(i). ===================================================================== */ magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magma_int_t c__1 = 1; magma_int_t a_dim1, a_offset, x_dim1, x_offset, y_dim1, y_offset, i__2, i__3; magma_int_t i__; magmaDoubleComplex alpha; a_dim1 = lda; a_offset = 1 + a_dim1; a -= a_offset; --d; --e; --tauq; --taup; x_dim1 = ldx; x_offset = 1 + x_dim1; x -= x_offset; dx_offset -= 1 + lddx; y_dim1 = ldy; y_offset = 1 + y_dim1; y -= y_offset; dy_offset -= 1 + lddy; /* Quick return if possible */ if (m <= 0 || n <= 0) { return 0; } magmaDoubleComplex *f; magma_zmalloc_cpu( &f, max(n,m) ); assert( f != NULL ); // TODO return error, or allocate outside zlatrd magma_event_t event = NULL; if (m >= n) { /* Reduce to upper bidiagonal form */ for (i__ = 1; i__ <= nb; ++i__) { /* Update A(i:m,i) */ i__2 = m - i__ + 1; i__3 = i__ - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv( &i__3, &y[i__+y_dim1], &ldy ); #endif blasf77_zgemv("No transpose", &i__2, &i__3, &c_neg_one, &a[i__ + a_dim1], &lda, &y[i__+y_dim1], &ldy, &c_one, &a[i__ + i__ * a_dim1], &c__1); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv( &i__3, &y[i__+y_dim1], &ldy ); #endif blasf77_zgemv("No transpose", &i__2, &i__3, &c_neg_one, &x[i__ + x_dim1], &ldx, &a[i__*a_dim1+1], &c__1, &c_one, &a[i__+i__*a_dim1], &c__1); /* Generate reflection Q(i) to annihilate A(i+1:m,i) */ alpha = a[i__ + i__ * a_dim1]; i__2 = m - i__ + 1; i__3 = i__ + 1; lapackf77_zlarfg(&i__2, &alpha, &a[min(i__3,m) + i__ * a_dim1], &c__1, &tauq[i__]); d[i__] = MAGMA_Z_REAL( alpha ); if (i__ < n) { a[i__ + i__ * a_dim1] = c_one; /* Compute Y(i+1:n,i) */ i__2 = m - i__ + 1; i__3 = n - i__; // 1. Send the block reflector A(i+1:m,i) to the GPU ------ magma_zsetvector( i__2, a + i__ + i__ * a_dim1, 1, da, da_offset + (i__-1)+(i__-1)* (ldda), 1, queue ); // 2. Multiply --------------------------------------------- magma_zgemv(MagmaConjTrans, i__2, i__3, c_one, da, da_offset + (i__-1) + ((i__-1) + 1) * (ldda), ldda, da, da_offset + (i__-1) + (i__-1) * (ldda), c__1, c_zero, dy, dy_offset + i__ + 1 + i__ * y_dim1, c__1, queue ); // 3. Put the result back ---------------------------------- magma_zgetmatrix_async( i__3, 1, dy, dy_offset + i__+1+i__*y_dim1, y_dim1, y+i__+1+i__*y_dim1, y_dim1, queue, &event ); i__2 = m - i__ + 1; i__3 = i__ - 1; blasf77_zgemv(MagmaConjTransStr, &i__2, &i__3, &c_one, &a[i__ + a_dim1], &lda, &a[i__ + i__ * a_dim1], &c__1, &c_zero, &y[i__ * y_dim1 + 1], &c__1); i__2 = n - i__; i__3 = i__ - 1; blasf77_zgemv("N", &i__2, &i__3, &c_neg_one, &y[i__ + 1 +y_dim1], &ldy, &y[i__ * y_dim1 + 1], &c__1, &c_zero, f, &c__1); i__2 = m - i__ + 1; i__3 = i__ - 1; blasf77_zgemv(MagmaConjTransStr, &i__2, &i__3, &c_one, &x[i__ + x_dim1], &ldx, &a[i__ + i__ * a_dim1], &c__1, &c_zero, &y[i__ * y_dim1 + 1], &c__1); // 4. Synch to make sure the result is back ---------------- magma_event_sync( event ); if (i__3 != 0){ i__2 = n - i__; blasf77_zaxpy(&i__2, &c_one, f,&c__1, &y[i__+1+i__*y_dim1],&c__1); } i__2 = i__ - 1; i__3 = n - i__; blasf77_zgemv(MagmaConjTransStr, &i__2, &i__3, &c_neg_one, &a[(i__ + 1) * a_dim1 + 1], &lda, &y[i__ * y_dim1 + 1], &c__1, &c_one, &y[i__ + 1 + i__ * y_dim1], &c__1); i__2 = n - i__; blasf77_zscal(&i__2, &tauq[i__], &y[i__ + 1 + i__ * y_dim1], &c__1); /* Update A(i,i+1:n) */ i__2 = n - i__; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv( &i__2, &a[i__+(i__+1)*a_dim1], &lda ); lapackf77_zlacgv( &i__, &a[i__+a_dim1], &lda ); #endif blasf77_zgemv("No transpose", &i__2, &i__, &c_neg_one, &y[i__ + 1 + y_dim1], &ldy, &a[i__ + a_dim1], &lda, &c_one, &a[i__ + (i__ + 1) * a_dim1], &lda); i__2 = i__ - 1; i__3 = n - i__; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv( &i__, &a[i__+a_dim1], &lda ); lapackf77_zlacgv( &i__2, &x[i__+x_dim1], &ldx ); #endif blasf77_zgemv(MagmaConjTransStr, &i__2, &i__3, &c_neg_one, &a[(i__ + 1) * a_dim1 + 1], &lda, &x[i__ + x_dim1], &ldx, &c_one, &a[ i__ + (i__ + 1) * a_dim1], &lda); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv( &i__2, &x[i__+x_dim1], &ldx ); #endif /* Generate reflection P(i) to annihilate A(i,i+2:n) */ i__2 = n - i__; /* Computing MIN */ i__3 = i__ + 2; alpha = a[i__ + (i__ + 1) * a_dim1]; lapackf77_zlarfg(&i__2, &alpha, &a[i__ + min( i__3,n) * a_dim1], &lda, &taup[i__]); e[i__] = MAGMA_Z_REAL( alpha ); a[i__ + (i__ + 1) * a_dim1] = c_one; /* Compute X(i+1:m,i) */ i__2 = m - i__; i__3 = n - i__; // 1. Send the block reflector A(i+1:m,i) to the GPU ------ magma_zsetvector( i__3, a + i__ + (i__ +1)* a_dim1, lda, da, da_offset + (i__-1)+((i__-1)+1)*(ldda), ldda, queue ); // 2. Multiply --------------------------------------------- //magma_zcopy(i__3, da+(i__-1)+((i__-1)+1)*(ldda), ldda, // dy + 1 + lddy, 1); magma_zgemv(MagmaNoTrans, i__2, i__3, c_one, da, da_offset + (i__-1)+1+ ((i__-1)+1) * (ldda), ldda, da, da_offset + (i__-1) + ((i__-1)+1) * (ldda), ldda, //dy + 1 + lddy, 1, c_zero, dx, dx_offset + i__ + 1 + i__ * x_dim1, c__1, queue ); // 3. Put the result back ---------------------------------- magma_zgetmatrix_async( i__2, 1, dx, dx_offset + i__+1+i__*x_dim1, x_dim1, x+i__+1+i__*x_dim1, x_dim1, queue, &event ); i__2 = n - i__; blasf77_zgemv(MagmaConjTransStr, &i__2, &i__, &c_one, &y[i__ + 1 + y_dim1], &ldy, &a[i__ + (i__ + 1) * a_dim1], &lda, &c_zero, &x[ i__ * x_dim1 + 1], &c__1); i__2 = m - i__; blasf77_zgemv("N", &i__2, &i__, &c_neg_one, &a[i__ + 1 + a_dim1], &lda, &x[i__ * x_dim1 + 1], &c__1, &c_zero, f, &c__1); i__2 = i__ - 1; i__3 = n - i__; blasf77_zgemv("N", &i__2, &i__3, &c_one, &a[(i__ + 1) * a_dim1 + 1], &lda, &a[i__ + (i__ + 1) * a_dim1], &lda, &c_zero, &x[i__ * x_dim1 + 1], &c__1); // 4. Synch to make sure the result is back ---------------- magma_event_sync( event ); if (i__!=0){ i__2 = m - i__; blasf77_zaxpy(&i__2, &c_one, f,&c__1, &x[i__+1+i__*x_dim1],&c__1); } i__2 = m - i__; i__3 = i__ - 1; blasf77_zgemv("No transpose", &i__2, &i__3, &c_neg_one, &x[i__ + 1 + x_dim1], &ldx, &x[i__ * x_dim1 + 1], &c__1, &c_one, &x[ i__ + 1 + i__ * x_dim1], &c__1); i__2 = m - i__; blasf77_zscal(&i__2, &taup[i__], &x[i__ + 1 + i__ * x_dim1], &c__1); #if defined(PRECISION_z) || defined(PRECISION_c) i__2 = n - i__; lapackf77_zlacgv( &i__2, &a[i__+(i__+1)*a_dim1], &lda ); // 4. Send the block reflector A(i+1:m,i) to the GPU after ZLACGV() magma_zsetvector( i__2, a + i__ + (i__ +1)* a_dim1, lda, da, da_offset + (i__-1)+((i__-1)+1)*(ldda), ldda, queue ); #endif } } } else { /* Reduce to lower bidiagonal form */ for (i__ = 1; i__ <= nb; ++i__) { /* Update A(i,i:n) */ i__2 = n - i__ + 1; i__3 = i__ - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i__2, &a[i__ + i__ * a_dim1], &lda); lapackf77_zlacgv(&i__3, &a[i__ + a_dim1], &lda); #endif blasf77_zgemv("No transpose", &i__2, &i__3, &c_neg_one, &y[i__ + y_dim1], &ldy, &a[i__ + a_dim1], &lda, &c_one, &a[i__ + i__ * a_dim1], &lda); i__2 = i__ - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i__3, &a[i__ + a_dim1], &lda); lapackf77_zlacgv(&i__3, &x[i__ + x_dim1], &ldx); #endif i__3 = n - i__ + 1; blasf77_zgemv(MagmaConjTransStr, &i__2, &i__3, &c_neg_one, &a[i__ * a_dim1 + 1], &lda, &x[i__ + x_dim1], &ldx, &c_one, &a[i__ + i__ * a_dim1], &lda); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i__2, &x[i__ + x_dim1], &ldx); #endif /* Generate reflection P(i) to annihilate A(i,i+1:n) */ i__2 = n - i__ + 1; /* Computing MIN */ i__3 = i__ + 1; alpha = a[i__ + i__ * a_dim1]; lapackf77_zlarfg(&i__2, &alpha, &a[i__ + min(i__3,n) * a_dim1], &lda, &taup[i__]); d[i__] = MAGMA_Z_REAL( alpha ); if (i__ < m) { a[i__ + i__ * a_dim1] = c_one; /* Compute X(i+1:m,i) */ i__2 = m - i__; i__3 = n - i__ + 1; // 1. Send the block reflector A(i,i+1:n) to the GPU ------ magma_zsetvector( i__3, a + i__ + i__ * a_dim1, lda, da, da_offset + (i__-1)+(i__-1)* (ldda), ldda, queue ); // 2. Multiply --------------------------------------------- //magma_zcopy(i__3, da+(i__-1)+(i__-1)*(ldda), ldda, // dy + 1 + lddy, 1); magma_zgemv(MagmaNoTrans, i__2, i__3, c_one, da, da_offset + (i__-1)+1 + (i__-1) * ldda, ldda, da, da_offset + (i__-1) + (i__-1) * ldda, ldda, // dy + 1 + lddy, 1, c_zero, dx, dx_offset + i__ + 1 + i__ * x_dim1, c__1, queue ); // 3. Put the result back ---------------------------------- magma_zgetmatrix_async( i__2, 1, dx, dx_offset + i__+1+i__*x_dim1, x_dim1, x+i__+1+i__*x_dim1, x_dim1, queue, &event ); i__2 = n - i__ + 1; i__3 = i__ - 1; blasf77_zgemv(MagmaConjTransStr, &i__2, &i__3, &c_one, &y[i__ + y_dim1], &ldy, &a[i__ + i__ * a_dim1], &lda, &c_zero, &x[i__ * x_dim1 + 1], &c__1); i__2 = m - i__; i__3 = i__ - 1; blasf77_zgemv("No transpose", &i__2, &i__3, &c_neg_one, &a[i__ + 1 + a_dim1], &lda, &x[i__ * x_dim1 + 1], &c__1, &c_zero, f, &c__1); i__2 = i__ - 1; i__3 = n - i__ + 1; blasf77_zgemv("No transpose", &i__2, &i__3, &c_one, &a[i__ * a_dim1 + 1], &lda, &a[i__ + i__ * a_dim1], &lda, &c_zero, &x[i__ * x_dim1 + 1], &c__1); // 4. Synch to make sure the result is back ---------------- magma_event_sync( event ); if (i__2 != 0){ i__3 = m - i__; blasf77_zaxpy(&i__3, &c_one, f,&c__1, &x[i__+1+i__*x_dim1],&c__1); } i__2 = m - i__; i__3 = i__ - 1; blasf77_zgemv("No transpose", &i__2, &i__3, &c_neg_one, &x[i__ + 1 + x_dim1], &ldx, &x[i__ * x_dim1 + 1], &c__1, &c_one, &x[i__ + 1 + i__ * x_dim1], &c__1); i__2 = m - i__; blasf77_zscal(&i__2, &taup[i__], &x[i__ + 1 + i__ * x_dim1], &c__1); i__2 = n - i__ + 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i__2, &a[i__ + i__ * a_dim1], &lda); magma_zsetvector( i__2, a + i__ + (i__ )* a_dim1, lda, da, da_offset + (i__-1)+ (i__-1)*(ldda), ldda, queue ); #endif /* Update A(i+1:m,i) */ i__2 = m - i__; i__3 = i__ - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i__3, &y[i__ + y_dim1], &ldy); #endif blasf77_zgemv("No transpose", &i__2, &i__3, &c_neg_one, &a[i__ + 1 + a_dim1], &lda, &y[i__ + y_dim1], &ldy, &c_one, &a[i__ + 1 + i__ * a_dim1], &c__1); i__2 = m - i__; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i__3, &y[i__ + y_dim1], &ldy); #endif blasf77_zgemv("No transpose", &i__2, &i__, &c_neg_one, &x[i__ + 1 + x_dim1], &ldx, &a[i__ * a_dim1 + 1], &c__1, &c_one, &a[i__ + 1 + i__ * a_dim1], &c__1); /* Generate reflection Q(i) to annihilate A(i+2:m,i) */ i__2 = m - i__; i__3 = i__ + 2; alpha = a[i__ + 1 + i__ * a_dim1]; lapackf77_zlarfg(&i__2, &alpha, &a[min(i__3,m) + i__ * a_dim1], &c__1, &tauq[i__]); e[i__] = MAGMA_Z_REAL( alpha ); a[i__ + 1 + i__ * a_dim1] = c_one; /* Compute Y(i+1:n,i) */ i__2 = m - i__; i__3 = n - i__; // 1. Send the block reflector A(i+1:m,i) to the GPU ------ magma_zsetvector( i__2, a + i__ +1+ i__ * a_dim1, 1, da, da_offset + (i__-1)+1+ (i__-1)*(ldda), 1, queue ); // 2. Multiply --------------------------------------------- magma_zgemv(MagmaConjTrans, i__2, i__3, c_one, da, da_offset + (i__-1)+1+ ((i__-1)+1) * ldda, ldda, da, da_offset + (i__-1)+1+ (i__-1) * ldda, c__1, c_zero, dy, dy_offset + i__ + 1 + i__ * y_dim1, c__1, queue ); // 3. Put the result back ---------------------------------- magma_zgetmatrix_async( i__3, 1, dy, dy_offset + i__+1+i__*y_dim1, y_dim1, y+i__+1+i__*y_dim1, y_dim1, queue, &event ); i__2 = m - i__; i__3 = i__ - 1; blasf77_zgemv(MagmaConjTransStr, &i__2, &i__3, &c_one, &a[i__ + 1 + a_dim1], &lda, &a[i__ + 1 + i__ * a_dim1], &c__1, &c_zero, &y[ i__ * y_dim1 + 1], &c__1); i__2 = n - i__; i__3 = i__ - 1; blasf77_zgemv("No transpose", &i__2, &i__3, &c_neg_one, &y[i__ + 1 + y_dim1], &ldy, &y[i__ * y_dim1 + 1], &c__1, &c_zero, f, &c__1); i__2 = m - i__; blasf77_zgemv(MagmaConjTransStr, &i__2, &i__, &c_one, &x[i__ + 1 + x_dim1], &ldx, &a[i__ + 1 + i__ * a_dim1], &c__1, &c_zero, &y[i__ * y_dim1 + 1], &c__1); // 4. Synch to make sure the result is back ---------------- magma_event_sync( event ); if (i__3 != 0){ i__2 = n - i__; blasf77_zaxpy(&i__2, &c_one, f,&c__1, &y[i__+1+i__*y_dim1],&c__1); } i__2 = n - i__; blasf77_zgemv(MagmaConjTransStr, &i__, &i__2, &c_neg_one, &a[(i__ + 1) * a_dim1 + 1], &lda, &y[i__ * y_dim1 + 1], &c__1, &c_one, &y[i__ + 1 + i__ * y_dim1], &c__1); i__2 = n - i__; blasf77_zscal(&i__2, &tauq[i__], &y[i__ + 1 + i__ * y_dim1], &c__1); } #if defined(PRECISION_z) || defined(PRECISION_c) else { i__2 = n - i__ + 1; lapackf77_zlacgv(&i__2, &a[i__ + i__ * a_dim1], &lda); magma_zsetvector( i__2, a + i__ + (i__ )* a_dim1, lda, da, da_offset + (i__-1)+ (i__-1)*(ldda), ldda, queue ); } #endif } } magma_queue_sync( queue ); magma_free_cpu(f); return MAGMA_SUCCESS; } /* magma_zlabrd */
int main(int argc, char **argv) { TESTING_INIT(); const magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; const magma_int_t ione = 1; real_Double_t atomics_perf=0, atomics_time=0; real_Double_t gflops, magma_perf=0, magma_time=0, cublas_perf, cublas_time, cpu_perf, cpu_time; double magma_error=0, atomics_error=0, cublas_error, work[1]; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t N, lda, ldda, sizeA, sizeX, sizeY, blocks, ldwork; magma_int_t incx = 1; magma_int_t incy = 1; magma_int_t nb = 64; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 1.5, -2.3 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( -0.6, 0.8 ); magmaDoubleComplex *A, *X, *Y, *Yatomics, *Ycublas, *Ymagma; magmaDoubleComplex_ptr dA, dX, dY, dwork; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("%% uplo = %s\n", lapack_uplo_const(opts.uplo) ); printf("%% N MAGMA Gflop/s (ms) Atomics Gflop/s CUBLAS Gflop/s CPU Gflop/s MAGMA error Atomics CUBLAS\n"); printf("%%=====================================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldda = magma_roundup( N, opts.align ); // multiple of 32 by default sizeA = N*lda; sizeX = N*incx; sizeY = N*incy; gflops = FLOPS_ZHEMV( N ) / 1e9; TESTING_MALLOC_CPU( A, magmaDoubleComplex, sizeA ); TESTING_MALLOC_CPU( X, magmaDoubleComplex, sizeX ); TESTING_MALLOC_CPU( Y, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Yatomics, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Ycublas, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Ymagma, magmaDoubleComplex, sizeY ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dX, magmaDoubleComplex, sizeX ); TESTING_MALLOC_DEV( dY, magmaDoubleComplex, sizeY ); blocks = magma_ceildiv( N, nb ); ldwork = ldda*blocks; TESTING_MALLOC_DEV( dwork, magmaDoubleComplex, ldwork ); magmablas_zlaset( MagmaFull, ldwork, 1, MAGMA_Z_NAN, MAGMA_Z_NAN, dwork, ldwork ); magmablas_zlaset( MagmaFull, ldda, N, MAGMA_Z_NAN, MAGMA_Z_NAN, dA, ldda ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &sizeA, A ); magma_zmake_hermitian( N, A, lda ); // should not use data from the opposite triangle -- fill with NAN to check magma_int_t N1 = N-1; if ( opts.uplo == MagmaUpper ) { lapackf77_zlaset( "Lower", &N1, &N1, &MAGMA_Z_NAN, &MAGMA_Z_NAN, &A[1], &lda ); } else { lapackf77_zlaset( "Upper", &N1, &N1, &MAGMA_Z_NAN, &MAGMA_Z_NAN, &A[lda], &lda ); } lapackf77_zlarnv( &ione, ISEED, &sizeX, X ); lapackf77_zlarnv( &ione, ISEED, &sizeY, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_zsetmatrix( N, N, A, lda, dA, ldda ); magma_zsetvector( N, X, incx, dX, incx ); magma_zsetvector( N, Y, incy, dY, incy ); magmablasSetKernelStream( opts.queue ); // opts.handle also uses opts.queue cublas_time = magma_sync_wtime( opts.queue ); #ifdef HAVE_CUBLAS cublasZhemv( opts.handle, cublas_uplo_const(opts.uplo), N, &alpha, dA, ldda, dX, incx, &beta, dY, incy ); #else magma_zhemv( opts.uplo, N, alpha, dA, 0, ldda, dX, 0, incx, beta, dY, 0, incy, opts.queue ); #endif cublas_time = magma_sync_wtime( opts.queue ) - cublas_time; cublas_perf = gflops / cublas_time; magma_zgetvector( N, dY, incy, Ycublas, incy ); /* ===================================================================== Performs operation using CUBLAS - using atomics =================================================================== */ #ifdef HAVE_CUBLAS cublasSetAtomicsMode( opts.handle, CUBLAS_ATOMICS_ALLOWED ); magma_zsetvector( N, Y, incy, dY, incy ); // sync on queue doesn't work -- need device sync or use NULL stream -- bug in CUBLAS? atomics_time = magma_sync_wtime( NULL /*opts.queue*/ ); cublasZhemv( opts.handle, cublas_uplo_const(opts.uplo), N, &alpha, dA, ldda, dX, incx, &beta, dY, incy ); atomics_time = magma_sync_wtime( NULL /*opts.queue*/ ) - atomics_time; atomics_perf = gflops / atomics_time; magma_zgetvector( N, dY, incy, Yatomics, incy ); cublasSetAtomicsMode( opts.handle, CUBLAS_ATOMICS_NOT_ALLOWED ); #endif /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ #ifdef HAVE_CUBLAS magma_zsetvector( N, Y, incy, dY, incy ); magma_time = magma_sync_wtime( opts.queue ); if ( opts.version == 1 ) { magmablas_zhemv_work( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy, dwork, ldwork, opts.queue ); } else { // non-work interface (has added overhead) magmablas_zhemv( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy ); } magma_time = magma_sync_wtime( opts.queue ) - magma_time; magma_perf = gflops / magma_time; magma_zgetvector( N, dY, incy, Ymagma, incy ); #endif /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_zhemv( lapack_uplo_const(opts.uplo), &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ blasf77_zaxpy( &N, &c_neg_one, Y, &incy, Ycublas, &incy ); cublas_error = lapackf77_zlange( "M", &N, &ione, Ycublas, &N, work ) / N; #ifdef HAVE_CUBLAS blasf77_zaxpy( &N, &c_neg_one, Y, &incy, Yatomics, &incy ); atomics_error = lapackf77_zlange( "M", &N, &ione, Yatomics, &N, work ) / N; blasf77_zaxpy( &N, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_zlange( "M", &N, &ione, Ymagma, &N, work ) / N; #endif bool okay = (magma_error < tol && cublas_error < tol && atomics_error < tol); status += ! okay; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %8.2e %s\n", (int) N, magma_perf, 1000.*magma_time, atomics_perf, 1000.*atomics_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error, atomics_error, (okay ? "ok" : "failed")); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ycublas ); TESTING_FREE_CPU( Yatomics ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_zlahr2(magma_int_t n, magma_int_t k, magma_int_t nb, cuDoubleComplex *da, cuDoubleComplex *dv, cuDoubleComplex *a, magma_int_t lda, cuDoubleComplex *tau, cuDoubleComplex *t, magma_int_t ldt, cuDoubleComplex *y, magma_int_t ldy) { /* -- MAGMA auxiliary routine (version 1.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= ZLAHR2 reduces the first NB columns of a complex general n-BY-(n-k+1) matrix A so that elements below the k-th subdiagonal are zero. The reduction is performed by an orthogonal similarity transformation Q' * A * Q. The routine returns the matrices V and T which determine Q as a block reflector I - V*T*V', and also the matrix Y = A * V. This is an auxiliary routine called by ZGEHRD. Arguments ========= N (input) INTEGER The order of the matrix A. K (input) INTEGER The offset for the reduction. Elements below the k-th subdiagonal in the first NB columns are reduced to zero. K < N. NB (input) INTEGER The number of columns to be reduced. DA (input/output) COMPLEX_16 array on the GPU, dimension (LDA,N-K+1) On entry, the n-by-(n-k+1) general matrix A. On exit, the elements on and above the k-th subdiagonal in the first NB columns are overwritten with the corresponding elements of the reduced matrix; the elements below the k-th subdiagonal, with the array TAU, represent the matrix Q as a product of elementary reflectors. The other columns of A are unchanged. See Further Details. DV (output) COMPLEX_16 array on the GPU, dimension (N, NB) On exit this contains the Householder vectors of the transformation. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). TAU (output) COMPLEX_16 array, dimension (NB) The scalar factors of the elementary reflectors. See Further Details. T (output) COMPLEX_16 array, dimension (LDT,NB) The upper triangular matrix T. LDT (input) INTEGER The leading dimension of the array T. LDT >= NB. Y (output) COMPLEX_16 array, dimension (LDY,NB) The n-by-nb matrix Y. LDY (input) INTEGER The leading dimension of the array Y. LDY >= N. Further Details =============== The matrix Q is represented as a product of nb elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in A(i+k+1:n,i), and tau in TAU(i). The elements of the vectors v together form the (n-k+1)-by-nb matrix V which is needed, with T and Y, to apply the transformation to the unreduced part of the matrix, using an update of the form: A := (I - V*T*V') * (A - Y*T*V'). The contents of A on exit are illustrated by the following example with n = 7, k = 3 and nb = 2: ( a a a a a ) ( a a a a a ) ( a a a a a ) ( h h a a a ) ( v1 h a a a ) ( v1 v2 a a a ) ( v1 v2 a a a ) where a denotes an element of the original matrix A, h denotes a modified element of the upper Hessenberg matrix H, and vi denotes an element of the vector defining H(i). This implementation follows the hybrid algorithm and notations described in S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. ===================================================================== */ cuDoubleComplex c_zero = MAGMA_Z_ZERO; cuDoubleComplex c_one = MAGMA_Z_ONE; cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t ldda = lda; magma_int_t c__1 = 1; magma_int_t a_dim1, a_offset, t_dim1, t_offset, y_dim1, y_offset, i__2, i__3; cuDoubleComplex d__1; magma_int_t i__; cuDoubleComplex ei; --tau; a_dim1 = lda; a_offset = 1 + a_dim1; a -= a_offset; t_dim1 = ldt; t_offset = 1 + t_dim1; t -= t_offset; y_dim1 = ldy; y_offset = 1 + y_dim1; y -= y_offset; /* Function Body */ if (n <= 1) return 0; for (i__ = 1; i__ <= nb; ++i__) { if (i__ > 1) { /* Update A(K+1:N,I); Update I-th column of A - Y * V' */ i__2 = n - k + 1; i__3 = i__ - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i__3, &a[k+i__-1+a_dim1], &lda); #endif blasf77_zcopy(&i__3, &a[k+i__-1+a_dim1], &lda, &t[nb*t_dim1+1], &c__1); blasf77_ztrmv("u","n","n",&i__3,&t[t_offset], &ldt, &t[nb*t_dim1+1], &c__1); blasf77_zgemv("NO TRANSPOSE", &i__2, &i__3, &c_neg_one, &y[k + y_dim1], &ldy, &t[nb*t_dim1+1], &c__1, &c_one, &a[k+i__*a_dim1],&c__1); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i__3, &a[k+i__-1+a_dim1], &lda); #endif /* Apply I - V * T' * V' to this column (call it b) from the left, using the last column of T as workspace Let V = ( V1 ) and b = ( b1 ) (first I-1 rows) ( V2 ) ( b2 ) where V1 is unit lower triangular w := V1' * b1 */ i__2 = i__ - 1; blasf77_zcopy(&i__2, &a[k+1+i__*a_dim1], &c__1, &t[nb*t_dim1+1], &c__1); blasf77_ztrmv("Lower", MagmaConjTransStr, "UNIT", &i__2, &a[k + 1 + a_dim1], &lda, &t[nb * t_dim1 + 1], &c__1); /* w := w + V2'*b2 */ i__2 = n - k - i__ + 1; i__3 = i__ - 1; blasf77_zgemv(MagmaConjTransStr, &i__2, &i__3, &c_one, &a[k + i__ + a_dim1], &lda, &a[k+i__+i__*a_dim1], &c__1, &c_one, &t[nb*t_dim1+1], &c__1); /* w := T'*w */ i__2 = i__ - 1; blasf77_ztrmv("U", MagmaConjTransStr, "N", &i__2, &t[t_offset], &ldt, &t[nb*t_dim1+1], &c__1); /* b2 := b2 - V2*w */ i__2 = n - k - i__ + 1; i__3 = i__ - 1; blasf77_zgemv("N", &i__2, &i__3, &c_neg_one, &a[k + i__ + a_dim1], &lda, &t[nb*t_dim1+1], &c__1, &c_one, &a[k+i__+i__*a_dim1], &c__1); /* b1 := b1 - V1*w */ i__2 = i__ - 1; blasf77_ztrmv("L","N","U",&i__2,&a[k+1+a_dim1],&lda,&t[nb*t_dim1+1],&c__1); blasf77_zaxpy(&i__2, &c_neg_one, &t[nb * t_dim1 + 1], &c__1, &a[k + 1 + i__ * a_dim1], &c__1); a[k + i__ - 1 + (i__ - 1) * a_dim1] = ei; } /* Generate the elementary reflector H(I) to annihilate A(K+I+1:N,I) */ i__2 = n - k - i__ + 1; i__3 = k + i__ + 1; lapackf77_zlarfg(&i__2, &a[k + i__ + i__ * a_dim1], &a[min(i__3,n) + i__ * a_dim1], &c__1, &tau[i__]); ei = a[k + i__ + i__ * a_dim1]; a[k + i__ + i__ * a_dim1] = c_one; /* Compute Y(K+1:N,I) */ i__2 = n - k; i__3 = n - k - i__ + 1; magma_zsetvector( i__3, &a[k + i__ + i__*a_dim1], 1, dv+(i__-1)*(ldda+1), 1 ); magma_zgemv(MagmaNoTrans, i__2+1, i__3, c_one, da -1 + k + i__ * ldda, ldda, dv+(i__-1)*(ldda+1), c__1, c_zero, da-1 + k + (i__-1)*ldda, c__1); i__2 = n - k - i__ + 1; i__3 = i__ - 1; blasf77_zgemv(MagmaConjTransStr, &i__2, &i__3, &c_one, &a[k + i__ + a_dim1], &lda, &a[k+i__+i__*a_dim1], &c__1, &c_zero, &t[i__*t_dim1+1], &c__1); /* Compute T(1:I,I) */ i__2 = i__ - 1; d__1 = MAGMA_Z_NEGATE( tau[i__] ); blasf77_zscal(&i__2, &d__1, &t[i__ * t_dim1 + 1], &c__1); blasf77_ztrmv("U","N","N", &i__2, &t[t_offset], &ldt, &t[i__*t_dim1+1], &c__1); t[i__ + i__ * t_dim1] = tau[i__]; magma_zgetvector( n - k + 1, da-1+ k+(i__-1)*ldda, 1, y+ k + i__*y_dim1, 1 ); } a[k + nb + nb * a_dim1] = ei; return 0; } /* magma_zlahr2 */
/* //////////////////////////////////////////////////////////////////////////// -- Testing ztrmv */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cublas_perf, cublas_time, cpu_perf, cpu_time; double cublas_error, Cnorm, work[1]; magma_int_t N; magma_int_t Ak; magma_int_t sizeA; magma_int_t lda, ldda; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaDoubleComplex *h_A, *h_x, *h_xcublas; magmaDoubleComplex_ptr d_A, d_x; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) double tol = opts.tolerance * lapackf77_dlamch("E"); printf("If running lapack (option --lapack), CUBLAS error is computed\n" "relative to CPU BLAS result.\n\n"); printf("uplo = %s, transA = %s, diag = %s \n", lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag) ); printf(" N CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n"); printf("==================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; gflops = FLOPS_ZTRMM(opts.side, N, 1) / 1e9; lda = N; Ak = N; ldda = ((lda+31)/32)*32; sizeA = lda*Ak; TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, lda*Ak ); TESTING_MALLOC_CPU( h_x, magmaDoubleComplex, N ); TESTING_MALLOC_CPU( h_xcublas, magmaDoubleComplex, N ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*Ak ); TESTING_MALLOC_DEV( d_x, magmaDoubleComplex, N ); /* Initialize the matrices */ lapackf77_zlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_zlarnv( &ione, ISEED, &N, h_x ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_zsetmatrix( Ak, Ak, h_A, lda, d_A, ldda ); magma_zsetvector( N, h_x, 1, d_x, 1 ); cublas_time = magma_sync_wtime( NULL ); cublasZtrmv( opts.handle, cublas_uplo_const(opts.uplo), cublas_trans_const(opts.transA), cublas_diag_const(opts.diag), N, d_A, ldda, d_x, 1 ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_zgetvector( N, d_x, 1, h_xcublas, 1 ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_ztrmv( lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &N, h_A, &lda, h_x, &ione ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma & cublas, relative to lapack, // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_zlange( "M", &N, &ione, h_x, &N, work ); blasf77_zaxpy( &N, &c_neg_one, h_x, &ione, h_xcublas, &ione ); cublas_error = lapackf77_zlange( "M", &N, &ione, h_xcublas, &N, work ) / Cnorm; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, cublas_error, (cublas_error < tol ? "ok" : "failed")); status += ! (cublas_error < tol); } else { printf("%5d %7.2f (%7.2f) --- ( --- ) --- ---\n", (int) N, cublas_perf, 1000.*cublas_time); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_x ); TESTING_FREE_CPU( h_xcublas ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_x ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf, cpu_time; double magma_error, cublas_error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t M, N, Xm, Ym, lda, sizeA, sizeX, sizeY; magma_int_t incx = 1; magma_int_t incy = 1; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 1.5, -2.3 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( -0.6, 0.8 ); magmaDoubleComplex *A, *X, *Y, *Ycublas, *Ymagma; magmaDoubleComplex *dA, *dX, *dY; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("trans = %s\n", lapack_trans_const(opts.transA) ); printf(" M N MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA error CUBLAS error\n"); printf("===================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = ((M+31)/32)*32; gflops = FLOPS_ZGEMV( M, N ) / 1e9; if ( opts.transA == MagmaNoTrans ) { Xm = N; Ym = M; } else { Xm = M; Ym = N; } sizeA = lda*N; sizeX = incx*Xm; sizeY = incy*Ym; TESTING_MALLOC_CPU( A, magmaDoubleComplex, sizeA ); TESTING_MALLOC_CPU( X, magmaDoubleComplex, sizeX ); TESTING_MALLOC_CPU( Y, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Ycublas, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Ymagma, magmaDoubleComplex, sizeY ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, sizeA ); TESTING_MALLOC_DEV( dX, magmaDoubleComplex, sizeX ); TESTING_MALLOC_DEV( dY, magmaDoubleComplex, sizeY ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &sizeA, A ); lapackf77_zlarnv( &ione, ISEED, &sizeX, X ); lapackf77_zlarnv( &ione, ISEED, &sizeY, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_zsetmatrix( M, N, A, lda, dA, lda ); magma_zsetvector( Xm, X, incx, dX, incx ); magma_zsetvector( Ym, Y, incy, dY, incy ); cublas_time = magma_sync_wtime( 0 ); cublasZgemv( handle, cublas_trans_const(opts.transA), M, N, &alpha, dA, lda, dX, incx, &beta, dY, incy ); cublas_time = magma_sync_wtime( 0 ) - cublas_time; cublas_perf = gflops / cublas_time; magma_zgetvector( Ym, dY, incy, Ycublas, incy ); /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ magma_zsetvector( Ym, Y, incy, dY, incy ); magma_time = magma_sync_wtime( 0 ); magmablas_zgemv( opts.transA, M, N, alpha, dA, lda, dX, incx, beta, dY, incy ); magma_time = magma_sync_wtime( 0 ) - magma_time; magma_perf = gflops / magma_time; magma_zgetvector( Ym, dY, incy, Ymagma, incy ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_zgemv( lapack_trans_const(opts.transA), &M, &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ blasf77_zaxpy( &Ym, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_zlange( "M", &Ym, &ione, Ymagma, &Ym, work ) / Ym; blasf77_zaxpy( &Ym, &c_neg_one, Y, &incy, Ycublas, &incy ); cublas_error = lapackf77_zlange( "M", &Ym, &ione, Ycublas, &Ym, work ) / Ym; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error, (magma_error < tol && cublas_error < tol ? "ok" : "failed")); status += ! (magma_error < tol && cublas_error < tol); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ycublas ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgemm_batched */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cpu_perf, cpu_time; double magma_error, magma_err, Ynorm, work[1]; magma_int_t M, N, Xm, Ym, lda, ldda; magma_int_t sizeA, sizeX, sizeY; magma_int_t incx = 1; magma_int_t incy = 1; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_int_t batchCount; magmaDoubleComplex *h_A, *h_X, *h_Y, *h_Ymagma; magmaDoubleComplex *d_A, *d_X, *d_Y; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 0.29, -0.86 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( -0.48, 0.38 ); magmaDoubleComplex **A_array = NULL; magmaDoubleComplex **X_array = NULL; magmaDoubleComplex **Y_array = NULL; magma_opts opts; parse_opts( argc, argv, &opts ); batchCount = opts.batchcount; opts.lapack |= opts.check; //double tol = opts.tolerance * lapackf77_dlamch("E"); printf("trans = %s\n", lapack_trans_const(opts.transA) ); printf("BatchCount M N MAGMA Gflop/s (ms) CPU Gflop/s (ms) MAGMA error\n"); printf("===================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = ((M+31)/32)*32; gflops = FLOPS_ZGEMV( M, N ) / 1e9 * batchCount; if ( opts.transA == MagmaNoTrans ) { Xm = N; Ym = M; } else { Xm = M; Ym = N; } sizeA = lda*N*batchCount; sizeX = incx*Xm*batchCount; sizeY = incy*Ym*batchCount; ldda = ((lda+31)/32)*32; TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, sizeA ); TESTING_MALLOC_CPU( h_X, magmaDoubleComplex, sizeX ); TESTING_MALLOC_CPU( h_Y, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( h_Ymagma, magmaDoubleComplex, sizeY ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N*batchCount ); TESTING_MALLOC_DEV( d_X, magmaDoubleComplex, sizeX ); TESTING_MALLOC_DEV( d_Y, magmaDoubleComplex, sizeY ); magma_malloc((void**)&A_array, batchCount * sizeof(*A_array)); magma_malloc((void**)&X_array, batchCount * sizeof(*X_array)); magma_malloc((void**)&Y_array, batchCount * sizeof(*Y_array)); /* Initialize the matrices */ lapackf77_zlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_zlarnv( &ione, ISEED, &sizeX, h_X ); lapackf77_zlarnv( &ione, ISEED, &sizeY, h_Y ); /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ magma_zsetmatrix( M, N*batchCount, h_A, lda, d_A, ldda ); magma_zsetvector( Xm*batchCount, h_X, incx, d_X, incx ); magma_zsetvector( Ym*batchCount, h_Y, incy, d_Y, incy ); zset_pointer(A_array, d_A, ldda, 0, 0, ldda*N, batchCount, magma_stream); zset_pointer(X_array, d_X, 1, 0, 0, incx*Xm, batchCount, magma_stream); zset_pointer(Y_array, d_Y, 1, 0, 0, incy*Ym, batchCount, magma_stream); magma_time = magma_sync_wtime( NULL ); magmablas_zgemv_batched(opts.transA, M, N, alpha, A_array, ldda, X_array, incx, beta, Y_array, incy, batchCount, magma_stream); magma_time = magma_sync_wtime( NULL ) - magma_time; magma_perf = gflops / magma_time; magma_zgetvector( Ym*batchCount, d_Y, incy, h_Ymagma, incy ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); for(int i=0; i<batchCount; i++) { blasf77_zgemv( lapack_trans_const(opts.transA), &M, &N, &alpha, h_A + i*lda*N, &lda, h_X + i*Xm, &incx, &beta, h_Y + i*Ym, &incy ); } cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma relative to lapack, // |C_magma - C_lapack| / |C_lapack| magma_error = 0.0; for(int s=0; s<batchCount; s++) { Ynorm = lapackf77_zlange( "M", &M, &ione, h_Y + s*Ym, &incy, work ); blasf77_zaxpy( &Ym, &c_neg_one, h_Y + s*Ym, &ione, h_Ymagma + s*Ym, &ione ); magma_err = lapackf77_zlange( "M", &M, &ione, h_Ymagma + s*Ym, &incy, work ) / Ynorm; if ( isnan(magma_err) || isinf(magma_err) ) { magma_error = magma_err; break; } magma_error = max(fabs(magma_err), magma_error); } printf("%10d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e \n", (int) batchCount, (int) M, (int) N, magma_perf, 1000.*magma_time, cpu_perf, 1000.*cpu_time, magma_error); } else { printf("%10d %5d %5d %7.2f (%7.2f) --- ( --- ) ---\n", (int) batchCount, (int) M, (int) N, magma_perf, 1000.*magma_time); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( h_Y ); TESTING_FREE_CPU( h_Ymagma ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_X ); TESTING_FREE_DEV( d_Y ); TESTING_FREE_DEV( A_array ); TESTING_FREE_DEV( X_array ); TESTING_FREE_DEV( Y_array ); fflush( stdout); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
int main(int argc, char **argv) { TESTING_CUDA_INIT(); magma_timestr_t start, end; double flops, magma_perf, cuda_perf, error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; FILE *fp ; magma_int_t i, lda, Xm, Ym; magma_int_t M, M0 = 0; magma_int_t N, N0 = 0; magma_int_t szeA, szeX, szeY; magma_int_t istart = 64; magma_int_t iend = 10240; magma_int_t incx = 1; magma_int_t incy = 1; char trans = MagmaNoTrans; cuDoubleComplex alpha = MAGMA_Z_MAKE(1., 0.); // MAGMA_Z_MAKE( 1.5, -2.3 ); cuDoubleComplex beta = MAGMA_Z_MAKE(0., 0.); // MAGMA_Z_MAKE( -0.6, 0.8 ); cuDoubleComplex *A, *X, *Y, *Ycublas, *Ymagma; cuDoubleComplex *dA, *dX, *dY; if (argc != 1){ for(i=1; i<argc; i++){ if ( strcmp("-n", argv[i]) == 0 ){ N0 = atoi(argv[++i]); } else if ( strcmp("-m", argv[i]) == 0 ){ M0 = atoi(argv[++i]); } else if (strcmp("-N", argv[i])==0){ trans = MagmaNoTrans; } else if (strcmp("-T", argv[i])==0){ trans = MagmaTrans; } #if defined(PRECISION_z) || defined(PRECISION_c) else if (strcmp("-C", argv[i])==0){ trans = MagmaConjTrans; } #endif } } if ( (M0 != 0) && (N0 != 0) ) iend = istart + 1; M = N = iend; if ( M0 != 0 ) M = M0; if ( N0 != 0 ) N = N0; if( trans == MagmaNoTrans ) { Xm = N; Ym = M; } else { Xm = M; Ym = N; } lda = ((M+31)/32)*32; szeA = lda*N; szeX = incx*Xm; szeY = incy*Ym; TESTING_MALLOC( A, cuDoubleComplex, szeA ); TESTING_MALLOC( X, cuDoubleComplex, szeX ); TESTING_MALLOC( Y, cuDoubleComplex, szeY ); TESTING_MALLOC( Ycublas, cuDoubleComplex, szeY ); TESTING_MALLOC( Ymagma, cuDoubleComplex, szeY ); TESTING_DEVALLOC( dA, cuDoubleComplex, szeA ); TESTING_DEVALLOC( dX, cuDoubleComplex, szeX ); TESTING_DEVALLOC( dY, cuDoubleComplex, szeY ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &szeA, A ); lapackf77_zlarnv( &ione, ISEED, &szeX, X ); lapackf77_zlarnv( &ione, ISEED, &szeY, Y ); fp = fopen ("results_zgemv.txt", "w") ; if( fp == NULL ){ printf("Couldn't open output file\n"); exit(1);} printf("\nUsage: \n"); printf(" testing_zgemv [-N|T|C] [-m %d] [-n %d]\n\n", 1024, 1024); printf( " m n CUBLAS,Gflop/s MAGMABLAS Gflop/s \"error\"\n" "==============================================================\n"); fprintf(fp, " m n CUBLAS,Gflop/s MAGMABLAS Gflop/s \"error\"\n" "==============================================================\n"); for( i=istart; i < iend; i = (int)((i+1)*1.1) ) { M = N = i; if ( M0 != 0 ) M = M0; if ( N0 != 0 ) N = N0; if( trans == MagmaNoTrans ) { Xm = N; Ym = M; } else { Xm = M; Ym = N; } lda = ((M+31)/32)*32; flops = FLOPS( (double)M, (double)N ) / 1000000; printf( "%5d %5d ", (int) M, (int) N ); fprintf( fp, "%5d %5d ", (int) M, (int) N ); /* ===================================================================== Performs operation using CUDA-BLAS =================================================================== */ magma_zsetmatrix( M, N, A, lda, dA, lda ); magma_zsetvector( Xm, X, incx, dX, incx ); magma_zsetvector( Ym, Y, incy, dY, incy ); /* * Cublas Version */ start = get_current_time(); cublasZgemv( trans, M, N, alpha, dA, lda, dX, incx, beta, dY, incy ); end = get_current_time(); magma_zgetvector( Ym, dY, incy, Ycublas, incy ); cuda_perf = flops / GetTimerValue(start, end); printf( "%11.2f", cuda_perf ); fprintf(fp, "%11.2f", cuda_perf ); /* * Magma Version */ magma_zsetvector( Ym, Y, incy, dY, incy ); start = get_current_time(); magmablas_zgemv( trans, M, N, alpha, dA, lda, dX, incx, beta, dY, incy ); end = get_current_time(); magma_zgetvector( Ym, dY, incx, Ymagma, incx ); magma_perf = flops / GetTimerValue(start, end); printf( "%11.2f", magma_perf ); fprintf(fp, "%11.2f", magma_perf ); /* ===================================================================== Computing the Difference Cublas VS Magma =================================================================== */ blasf77_zaxpy( &Ym, &c_neg_one, Ymagma, &incy, Ycublas, &incy); error = lapackf77_zlange( "M", &Ym, &ione, Ycublas, &Ym, work ); #if 0 printf( "\t\t %8.6e", error / (double)Ym ); fprintf( fp, "\t\t %8.6e", error / (double)Ym ); /* * Blas comparaison */ { char *blastrans = MagmaNoTransStr; if ( trans == MagmaConjTrans ) blastrans = MagmaConjTransStr; else if ( trans == MagmaTrans ) blastrans = MagmaTransStr; blasf77_zcopy( &Ym, Y, &incy, Ycublas, &incy ); blasf77_zgemv( blastrans, &M, &N, &alpha, A, &lda, X, &incx, &beta, Ycublas, &incy ); blasf77_zaxpy( &Ym, &c_neg_one, Ymagma, &incy, Ycublas, &incy); error = lapackf77_zlange( "M", &Ym, &ione, Ycublas, &Ym, work ); } #endif printf( "\t\t %8.6e\n", error / (double)Ym ); fprintf( fp, "\t\t %8.6e\n", error / (double)Ym ); } /* Free Memory */ TESTING_FREE( A ); TESTING_FREE( X ); TESTING_FREE( Y ); TESTING_FREE( Ycublas ); TESTING_FREE( Ymagma ); TESTING_DEVFREE( dA ); TESTING_DEVFREE( dX ); TESTING_DEVFREE( dY ); /* Free device */ TESTING_CUDA_FINALIZE(); return EXIT_SUCCESS; }
int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cpu_perf, cpu_time; double magma_error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t N, lda, ldda, sizeA, sizeX, sizeY, blocks, ldwork; magma_int_t incx = 1; magma_int_t incy = 1; magma_int_t nb = 64; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 1.5, -2.3 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( -0.6, 0.8 ); magmaDoubleComplex *A, *X, *Y, *Ymagma; magmaDoubleComplex *dA, *dX, *dY, *dwork; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("uplo = %s\n", lapack_uplo_const(opts.uplo) ); printf(" N MAGMA Gflop/s (ms) CPU Gflop/s (ms) MAGMA error\n"); printf("=========================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldda = ((N + 31)/32)*32; sizeA = N*lda; sizeX = N*incx; sizeY = N*incy; gflops = FLOPS_ZSYMV( N ) / 1e9; TESTING_MALLOC_CPU( A, magmaDoubleComplex, sizeA ); TESTING_MALLOC_CPU( X, magmaDoubleComplex, sizeX ); TESTING_MALLOC_CPU( Y, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Ymagma, magmaDoubleComplex, sizeY ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dX, magmaDoubleComplex, sizeX ); TESTING_MALLOC_DEV( dY, magmaDoubleComplex, sizeY ); blocks = (N + nb - 1) / nb; ldwork = ldda*blocks; TESTING_MALLOC_DEV( dwork, magmaDoubleComplex, ldwork ); magmablas_zlaset( MagmaFull, ldwork, 1, MAGMA_Z_NAN, MAGMA_Z_NAN, dwork, ldwork ); magmablas_zlaset( MagmaFull, ldda, N, MAGMA_Z_NAN, MAGMA_Z_NAN, dA, ldda ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &sizeA, A ); magma_zmake_hermitian( N, A, lda ); lapackf77_zlarnv( &ione, ISEED, &sizeX, X ); lapackf77_zlarnv( &ione, ISEED, &sizeY, Y ); /* Note: CUBLAS does not implement zsymv */ /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ magma_zsetmatrix( N, N, A, lda, dA, ldda ); magma_zsetvector( N, X, incx, dX, incx ); magma_zsetvector( N, Y, incy, dY, incy ); //magma_zprint_gpu( ldda, blocks, dwork, ldda ); magma_time = magma_sync_wtime( 0 ); magmablas_zsymv_work( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy, dwork, ldwork ); // TODO provide option to test non-work interface //magmablas_zsymv( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy ); magma_time = magma_sync_wtime( 0 ) - magma_time; magma_perf = gflops / magma_time; magma_zgetvector( N, dY, incy, Ymagma, incy ); //magma_zprint_gpu( ldda, blocks, dwork, ldda ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); lapackf77_zsymv( lapack_uplo_const(opts.uplo), &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ blasf77_zaxpy( &N, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_zlange( "M", &N, &ione, Ymagma, &N, work ) / N; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, magma_perf, 1000.*magma_time, cpu_perf, 1000.*cpu_time, magma_error, (magma_error < tol ? "ok" : "failed")); status += ! (magma_error < tol); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_zqr( magma_int_t m, magma_int_t n, magma_z_matrix A, magma_int_t lda, magma_z_matrix *Q, magma_z_matrix *R, magma_queue_t queue ) { magma_int_t info = 0; // local constants const magmaDoubleComplex c_zero = MAGMA_Z_ZERO; // local variables magma_int_t inc = 1; magma_int_t k = min(m,n); magma_int_t ldt; magma_int_t nb; magmaDoubleComplex *tau = NULL; magmaDoubleComplex *dT = NULL; magmaDoubleComplex *dA = NULL; magma_z_matrix dR1 = {Magma_CSR}; // allocate CPU resources CHECK( magma_zmalloc_pinned( &tau, k ) ); // query number of blocks required for QR factorization nb = magma_get_zgeqrf_nb( m, n ); ldt = (2 * k + magma_roundup(n, 32)) * nb; CHECK( magma_zmalloc( &dT, ldt ) ); // get copy of matrix array if ( A.memory_location == Magma_DEV ) { dA = A.dval; } else { CHECK( magma_zmalloc( &dA, lda * n ) ); magma_zsetvector( lda * n, A.val, inc, dA, inc, queue ); } // QR factorization magma_zgeqrf_gpu( m, n, dA, lda, tau, dT, &info ); // construct R matrix if ( R != NULL ) { if ( A.memory_location == Magma_DEV ) { CHECK( magma_zvinit( R, Magma_DEV, lda, n, c_zero, queue ) ); magmablas_zlacpy( MagmaUpper, k, n, dA, lda, R->dval, lda, queue ); } else { CHECK( magma_zvinit( &dR1, Magma_DEV, lda, n, c_zero, queue ) ); magmablas_zlacpy( MagmaUpper, k, n, dA, lda, dR1.dval, lda, queue ); CHECK( magma_zvinit( R, Magma_CPU, lda, n, c_zero, queue ) ); magma_zgetvector( lda * n, dR1.dval, inc, R->val, inc, queue ); } } // construct Q matrix if ( Q != NULL ) { magma_zungqr_gpu( m, n, k, dA, lda, tau, dT, nb, &info ); if ( A.memory_location == Magma_DEV ) { CHECK( magma_zvinit( Q, Magma_DEV, lda, n, c_zero, queue ) ); magma_zcopyvector( lda * n, dA, inc, Q->dval, inc, queue ); } else { CHECK( magma_zvinit( Q, Magma_CPU, lda, n, c_zero, queue ) ); magma_zgetvector( lda * n, dA, inc, Q->val, inc, queue ); } } cleanup: if( info != 0 ){ magma_zmfree( Q, queue ); magma_zmfree( R, queue ); magma_zmfree( &dR1, queue ); } // free resources magma_free_pinned( tau ); magma_free( dT ); if ( A.memory_location == Magma_CPU ) { magma_free( dA ); } return info; }
/** @deprecated Purpose ------- ZLAQPS computes a step of QR factorization with column pivoting of a complex M-by-N matrix A by using Blas-3. It tries to factorize NB columns from A starting from the row OFFSET+1, and updates all of the matrix with Blas-3 xGEMM. In some cases, due to catastrophic cancellations, it cannot factorize NB columns. Hence, the actual number of factorized columns is returned in KB. Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0 @param[in] offset INTEGER The number of rows of A that have been factorized in previous steps. @param[in] nb INTEGER The number of columns to factorize. @param[out] kb INTEGER The number of columns actually factorized. @param[in,out] dA COMPLEX_16 array, dimension (LDDA,N), on the GPU. On entry, the M-by-N matrix A. On exit, block A(OFFSET+1:M,1:KB) is the triangular factor obtained and block A(1:OFFSET,1:N) has been accordingly pivoted, but no factorized. The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has been updated. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,M). @param[in,out] jpvt INTEGER array, dimension (N) JPVT(I) = K <==> Column K of the full matrix A has been permuted into position I in AP. @param[out] tau COMPLEX_16 array, dimension (KB) The scalar factors of the elementary reflectors. @param[in,out] vn1 DOUBLE PRECISION array, dimension (N) The vector with the partial column norms. @param[in,out] vn2 DOUBLE PRECISION array, dimension (N) The vector with the exact column norms. @param[in,out] dauxv COMPLEX_16 array, dimension (NB), on the GPU Auxiliary vector. @param[in,out] dF COMPLEX_16 array, dimension (LDDF,NB), on the GPU Matrix F' = L*Y'*A. @param[in] lddf INTEGER The leading dimension of the array F. LDDF >= max(1,N). @ingroup magma_zgeqp3_aux ********************************************************************/ extern "C" magma_int_t magma_zlaqps_gpu( magma_int_t m, magma_int_t n, magma_int_t offset, magma_int_t nb, magma_int_t *kb, magmaDoubleComplex_ptr dA, magma_int_t ldda, magma_int_t *jpvt, magmaDoubleComplex *tau, double *vn1, double *vn2, magmaDoubleComplex_ptr dauxv, magmaDoubleComplex_ptr dF, magma_int_t lddf) { #define dA(i, j) (dA + (i) + (j)*(ldda)) #define dF(i, j) (dF + (i) + (j)*(lddf)) magmaDoubleComplex c_zero = MAGMA_Z_MAKE( 0.,0.); magmaDoubleComplex c_one = MAGMA_Z_MAKE( 1.,0.); magmaDoubleComplex c_neg_one = MAGMA_Z_MAKE(-1.,0.); magma_int_t ione = 1; magma_int_t i__1, i__2; //double d__1; magmaDoubleComplex z__1; //magma_int_t j; magma_int_t k, rk; //magmaDoubleComplex Akk; magmaDoubleComplex_ptr dAks; magmaDoubleComplex tauk = MAGMA_Z_ZERO; magma_int_t pvt; //double temp, temp2; double tol3z; magma_int_t itemp; double lsticc; magmaDouble_ptr dlsticcs; magma_dmalloc( &dlsticcs, 1+256*(n+255)/256 ); //lastrk = min( m, n + offset ); tol3z = magma_dsqrt( lapackf77_dlamch("Epsilon")); lsticc = 0; k = 0; magma_zmalloc( &dAks, nb ); while( k < nb && lsticc == 0 ) { rk = offset + k; /* Determine ith pivot column and swap if necessary */ // subtract 1 from Fortran/CUBLAS idamax; pvt, k are 0-based. pvt = k + magma_idamax( n-k, &vn1[k], ione ) - 1; if (pvt != k) { /*if (pvt >= nb) { // 1. Start copy from GPU magma_zgetmatrix_async( m - offset - nb, 1, dA(offset + nb, pvt), ldda, A (offset + nb, pvt), lda, stream ); }*/ /* F gets swapped so F must be sent at the end to GPU */ i__1 = k; /*if (pvt < nb) { // no need of transfer if pivot is within the panel blasf77_zswap( &m, A(0, pvt), &ione, A(0, k), &ione ); } else { // 1. Finish copy from GPU magma_queue_sync( stream ); // 2. Swap as usual on CPU blasf77_zswap(&m, A(0, pvt), &ione, A(0, k), &ione); // 3. Restore the GPU magma_zsetmatrix_async( m - offset - nb, 1, A (offset + nb, pvt), lda, dA(offset + nb, pvt), ldda, stream); }*/ magmablas_zswap( m, dA(0, pvt), ione, dA(0, k), ione ); //blasf77_zswap( &i__1, F(pvt,0), &ldf, F(k,0), &ldf ); magmablas_zswap( i__1, dF(pvt, 0), lddf, dF(k, 0), lddf); itemp = jpvt[pvt]; jpvt[pvt] = jpvt[k]; jpvt[k] = itemp; //vn1[pvt] = vn1[k]; //vn2[pvt] = vn2[k]; #if defined(PRECISION_d) || defined(PRECISION_z) //magma_dswap( 1, &vn1[pvt], 1, &vn1[k], 1 ); //magma_dswap( 1, &vn2[pvt], 1, &vn2[k], 1 ); magma_dswap( 2, &vn1[pvt], n+offset, &vn1[k], n+offset ); #else //magma_sswap( 1, &vn1[pvt], 1, &vn1[k], 1 ); //magma_sswap( 1, &vn2[pvt], 1, &vn2[k], 1 ); magma_sswap(2, &vn1[pvt], n+offset, &vn1[k], n+offset); #endif } /* Apply previous Householder reflectors to column K: A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'. Optimization: multiply with beta=0; wait for vector and subtract */ if (k > 0) { /*#if (defined(PRECISION_c) || defined(PRECISION_z)) for (j = 0; j < k; ++j) { *F(k,j) = MAGMA_Z_CNJG( *F(k,j) ); } #endif*/ //#define RIGHT_UPDATE #ifdef RIGHT_UPDATE i__1 = m - offset - nb; i__2 = k; magma_zgemv( MagmaNoTrans, i__1, i__2, c_neg_one, A(offset+nb, 0), lda, F(k, 0), ldf, c_one, A(offset+nb, k), ione ); #else i__1 = m - rk; i__2 = k; /*blasf77_zgemv( MagmaNoTransStr, &i__1, &i__2, &c_neg_one, A(rk, 0), &lda, F(k, 0), &ldf, &c_one, A(rk, k), &ione ); */ magma_zgemv( MagmaNoTrans, i__1, i__2, c_neg_one, dA(rk, 0), ldda, dF(k, 0), lddf, c_one, dA(rk, k), ione ); #endif /*#if (defined(PRECISION_c) || defined(PRECISION_z)) for (j = 0; j < k; ++j) { *F(k,j) = MAGMA_Z_CNJG( *F(k,j) ); } #endif*/ } /* Generate elementary reflector H(k). */ magma_zlarfg_gpu( m-rk, dA(rk, k), dA(rk + 1, k), &tau[k], &vn1[k], &dAks[k]); //Akk = *A(rk, k); //*A(rk, k) = c_one; //magma_zgetvector( 1, &dAks[k], 1, &Akk, 1 ); /* needed to avoid the race condition */ if (k == 0) magma_zsetvector( 1, &c_one, 1, dA(rk, k), 1 ); else magma_zcopymatrix( 1, 1, dA(offset, 0), 1, dA(rk, k), 1 ); /* Compute Kth column of F: Compute F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */ if (k < n-1 || k > 0) magma_zgetvector( 1, &tau[k], 1, &tauk, 1 ); if (k < n-1) { i__1 = m - rk; i__2 = n - k - 1; /* Send the vector to the GPU */ //magma_zsetmatrix( i__1, 1, A(rk, k), lda, dA(rk,k), ldda ); /* Multiply on GPU */ // was CALL ZGEMV( 'Conjugate transpose', M-RK+1, N-K, // TAU( K ), A( RK, K+1 ), LDA, // A( RK, K ), 1, // CZERO, F( K+1, K ), 1 ) //magma_zgetvector( 1, &tau[k], 1, &tauk, 1 ); magma_zgemv( MagmaConjTrans, m-rk, n-k-1, tauk, dA( rk, k+1 ), ldda, dA( rk, k ), 1, c_zero, dF( k+1, k ), 1 ); //magma_zscal( m-rk, tau[k], F( k+1, k), 1 ); //magma_int_t i__3 = nb-k-1; //magma_int_t i__4 = i__2 - i__3; //magma_int_t i__5 = nb-k; //magma_zgemv( MagmaConjTrans, i__1 - i__5, i__2 - i__3, // tau[k], dA(rk +i__5, k+1+i__3), ldda, // dA(rk +i__5, k ), ione, // c_zero, dF(k+1+i__3, k ), ione ); //magma_zgetmatrix_async( i__2-i__3, 1, // dF(k + 1 +i__3, k), i__2, // F (k + 1 +i__3, k), i__2, stream ); //blasf77_zgemv( MagmaConjTransStr, &i__1, &i__3, // &tau[k], A(rk, k+1), &lda, // A(rk, k ), &ione, // &c_zero, F(k+1, k ), &ione ); //magma_queue_sync( stream ); //blasf77_zgemv( MagmaConjTransStr, &i__5, &i__4, // &tau[k], A(rk, k+1+i__3), &lda, // A(rk, k ), &ione, // &c_one, F(k+1+i__3, k ), &ione ); } /* Padding F(1:K,K) with zeros. for (j = 0; j <= k; ++j) { magma_zsetvector( 1, &c_zero, 1, F(j, k), 1 ); }*/ /* Incremental updating of F: F(1:N,K) := F(1:N,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K). F(1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K) := tau(K)(A(RK:M,K+1:N)' - F(1:N,1:K-1)*A(RK:M,1:K-1)') A(RK:M,K) so, F is (updated A)*V */ //if (k > 0 && k < n-1) { if (k > 0) { //magma_zgetvector( 1, &tau[k], 1, &tauk, 1 ); z__1 = MAGMA_Z_NEGATE( tauk ); #ifdef RIGHT_UPDATE i__1 = m - offset - nb; i__2 = k; magma_zgemv( MagmaConjTrans, i__1, i__2, z__1, dA(offset+nb, 0), lda, dA(offset+nb, k), ione, c_zero, dauxv, ione ); i__1 = k; magma_zgemv( MagmaNoTrans, n-k-1, i__1, c_one, F(k+1,0), ldf, dauxv, ione, c_one, F(k+1,k), ione ); #else i__1 = m - rk; i__2 = k; //blasf77_zgemv( MagmaConjTransStr, &i__1, &i__2, // &z__1, A(rk, 0), &lda, // A(rk, k), &ione, // &c_zero, auxv, &ione ); magma_zgemv( MagmaConjTrans, i__1, i__2, z__1, dA(rk, 0), ldda, dA(rk, k), ione, c_zero, dauxv, ione ); //i__1 = k; //blasf77_zgemv( MagmaNoTransStr, &n, &i__1, // &c_one, F(0,0), &ldf, // auxv, &ione, // &c_one, F(0,k), &ione ); /*magma_zgemv( MagmaNoTrans, n, i__1, c_one, F(0,0), ldf, auxv, ione, c_one, F(0,k), ione ); */ /* I think we only need stricly lower-triangular part :) */ magma_zgemv( MagmaNoTrans, n-k-1, i__2, c_one, dF(k+1,0), lddf, dauxv, ione, c_one, dF(k+1,k), ione ); #endif } /* Optimization: On the last iteration start sending F back to the GPU */ /* Update the current row of A: A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'. */ if (k < n-1) { i__1 = n - k - 1; i__2 = k + 1; //blasf77_zgemm( MagmaNoTransStr, MagmaConjTransStr, &ione, &i__1, &i__2, // &c_neg_one, A(rk, 0 ), &lda, // F(k+1,0 ), &ldf, // &c_one, A(rk, k+1), &lda ); #ifdef RIGHT_UPDATE /* right-looking update of rows, */ magma_zgemm( MagmaNoTrans, MagmaConjTrans, nb-k, i__1, ione, c_neg_one, dA(rk, k ), ldda, dF(k+1, k ), lddf, c_one, dA(rk, k+1), ldda ); #else /* left-looking update of rows, * * since F=A'v with original A, so no right-looking */ magma_zgemm( MagmaNoTrans, MagmaConjTrans, ione, i__1, i__2, c_neg_one, dA(rk, 0 ), ldda, dF(k+1,0 ), lddf, c_one, dA(rk, k+1), ldda ); #endif } /* Update partial column norms. */ if (rk < min(m, n+offset)-1 ) { magmablas_dznrm2_row_check_adjust(n-k-1, tol3z, &vn1[k+1], &vn2[k+1], dA(rk,k+1), ldda, dlsticcs); magma_device_sync(); #if defined(PRECISION_d) || defined(PRECISION_z) magma_dgetvector( 1, &dlsticcs[0], 1, &lsticc, 1 ); #else magma_sgetvector( 1, &dlsticcs[0], 1, &lsticc, 1 ); #endif } /*if (rk < lastrk) { for (j = k + 1; j < n; ++j) { if (vn1[j] != 0.) { // NOTE: The following 4 lines follow from the analysis in // Lapack Working Note 176. temp = MAGMA_Z_ABS( *A(rk,j) ) / vn1[j]; temp = max( 0., ((1. + temp) * (1. - temp)) ); d__1 = vn1[j] / vn2[j]; temp2 = temp * (d__1 * d__1); if (temp2 <= tol3z) { vn2[j] = (double) lsticc; lsticc = j; } else { vn1[j] *= magma_dsqrt(temp); } } } }*/ //*A(rk, k) = Akk; //magma_zsetvector( 1, &Akk, 1, A(rk, k), 1 ); //magma_zswap( 1, &dAks[k], 1, A(rk, k), 1 ); ++k; } magma_zcopymatrix( 1, k, dAks, 1, dA(offset, 0), ldda+1 ); // leave k as the last column done --k; *kb = k + 1; rk = offset + *kb - 1; /* Apply the block reflector to the rest of the matrix: A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) - A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)' */ if (*kb < min(n, m - offset)) { i__1 = m - rk - 1; i__2 = n - *kb; /* Send F to the GPU magma_zsetmatrix( i__2, *kb, F (*kb, 0), ldf, dF(*kb, 0), i__2 ); */ magma_zgemm( MagmaNoTrans, MagmaConjTrans, i__1, i__2, *kb, c_neg_one, dA(rk+1, 0 ), ldda, dF(*kb, 0 ), lddf, c_one, dA(rk+1, *kb), ldda ); } /* Recomputation of difficult columns. */ if ( lsticc > 0 ) { // printf( " -- recompute dnorms --\n" ); magmablas_dznrm2_check( m-rk-1, n-*kb, dA(rk+1,*kb), ldda, &vn1[*kb], dlsticcs ); magma_dcopymatrix( n-*kb, 1, &vn1[*kb], *kb, &vn2[*kb], *kb ); /*while( lsticc > 0 ) { itemp = (magma_int_t)(vn2[lsticc] >= 0. ? floor(vn2[lsticc] + .5) : -floor(.5 - vn2[lsticc])); i__1 = m - rk - 1; if (lsticc <= nb) vn1[lsticc] = magma_cblas_dznrm2( i__1, A(rk+1,lsticc), ione ); else { // Where is the data, CPU or GPU ? double r1, r2; r1 = magma_cblas_dznrm2( nb-k, A(rk+1,lsticc), ione ); r2 = magma_dznrm2(m-offset-nb, dA(offset + nb + 1, lsticc), ione); vn1[lsticc] = magma_dsqrt(r1*r1+r2*r2); } // NOTE: The computation of VN1( LSTICC ) relies on the fact that // SNRM2 does not fail on vectors with norm below the value of SQRT(DLAMCH('S')) vn2[lsticc] = vn1[lsticc]; lsticc = itemp; */ } magma_free(dAks); magma_free(dlsticcs); return MAGMA_SUCCESS; } /* magma_zlaqps */
extern "C" magma_int_t magma_zlaqps_gpu(magma_int_t m, magma_int_t n, magma_int_t offset, magma_int_t nb, magma_int_t *kb, magmaDoubleComplex *A, magma_int_t lda, magma_int_t *jpvt, magmaDoubleComplex *tau, double *vn1, double *vn2, magmaDoubleComplex *auxv, magmaDoubleComplex *F, magma_int_t ldf) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= ZLAQPS computes a step of QR factorization with column pivoting of a complex M-by-N matrix A by using Blas-3. It tries to factorize NB columns from A starting from the row OFFSET+1, and updates all of the matrix with Blas-3 xGEMM. In some cases, due to catastrophic cancellations, it cannot factorize NB columns. Hence, the actual number of factorized columns is returned in KB. Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0 OFFSET (input) INTEGER The number of rows of A that have been factorized in previous steps. NB (input) INTEGER The number of columns to factorize. KB (output) INTEGER The number of columns actually factorized. A (input/output) COMPLEX*16 array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, block A(OFFSET+1:M,1:KB) is the triangular factor obtained and block A(1:OFFSET,1:N) has been accordingly pivoted, but no factorized. The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has been updated. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). JPVT (input/output) INTEGER array, dimension (N) JPVT(I) = K <==> Column K of the full matrix A has been permuted into position I in AP. TAU (output) COMPLEX*16 array, dimension (KB) The scalar factors of the elementary reflectors. VN1 (input/output) DOUBLE PRECISION array, dimension (N) The vector with the partial column norms. VN2 (input/output) DOUBLE PRECISION array, dimension (N) The vector with the exact column norms. AUXV (input/output) COMPLEX*16 array, dimension (NB) Auxiliar vector. F (input/output) COMPLEX*16 array, dimension (LDF,NB) Matrix F' = L*Y'*A. LDF (input) INTEGER The leading dimension of the array F. LDF >= max(1,N). ===================================================================== */ #define A(i, j) (A + (i) + (j)*(lda )) #define F(i, j) (F + (i) + (j)*(ldf )) magmaDoubleComplex c_zero = MAGMA_Z_MAKE( 0.,0.); magmaDoubleComplex c_one = MAGMA_Z_MAKE( 1.,0.); magmaDoubleComplex c_neg_one = MAGMA_Z_MAKE(-1.,0.); magma_int_t ione = 1; magma_int_t i__1, i__2; //double d__1; magmaDoubleComplex z__1; //magma_int_t j; magma_int_t k, rk; //magmaDoubleComplex Akk; magmaDoubleComplex *Aks; magmaDoubleComplex tauk; magma_int_t pvt; //double temp, temp2; double tol3z; magma_int_t itemp; double lsticc, *lsticcs; magma_int_t lastrk; magma_dmalloc( &lsticcs, 1+256*(n+255)/256 ); lastrk = min( m, n + offset ); tol3z = magma_dsqrt( lapackf77_dlamch("Epsilon")); lsticc = 0; k = 0; magma_zmalloc( &Aks, nb ); while( k < nb && lsticc == 0 ) { rk = offset + k; /* Determine ith pivot column and swap if necessary */ // Fortran: pvt, k, idamax are all 1-based; subtract 1 from k. // C: pvt, k, idamax are all 0-based; don't subtract 1. pvt = k - 1 + magma_idamax( n-k, &vn1[k], ione ); if (pvt != k) { /*if (pvt >= nb) { // 1. Start copy from GPU magma_zgetmatrix_async( m - offset - nb, 1, dA(offset + nb, pvt), ldda, A (offset + nb, pvt), lda, stream ); }*/ /* F gets swapped so F must be sent at the end to GPU */ i__1 = k; /*if (pvt < nb){ // no need of transfer if pivot is within the panel blasf77_zswap( &m, A(0, pvt), &ione, A(0, k), &ione ); } else { // 1. Finish copy from GPU magma_queue_sync( stream ); // 2. Swap as usual on CPU blasf77_zswap(&m, A(0, pvt), &ione, A(0, k), &ione); // 3. Restore the GPU magma_zsetmatrix_async( m - offset - nb, 1, A (offset + nb, pvt), lda, dA(offset + nb, pvt), ldda, stream); }*/ magmablas_zswap( m, A(0, pvt), ione, A(0, k), ione ); //blasf77_zswap( &i__1, F(pvt,0), &ldf, F(k,0), &ldf ); magmablas_zswap( i__1, F(pvt, 0), ldf, F(k, 0), ldf); itemp = jpvt[pvt]; jpvt[pvt] = jpvt[k]; jpvt[k] = itemp; //vn1[pvt] = vn1[k]; //vn2[pvt] = vn2[k]; #if defined(PRECISION_d) || defined(PRECISION_z) //magma_dswap( 1, &vn1[pvt], 1, &vn1[k], 1 ); //magma_dswap( 1, &vn2[pvt], 1, &vn2[k], 1 ); magma_dswap( 2, &vn1[pvt], n+offset, &vn1[k], n+offset ); #else //magma_sswap( 1, &vn1[pvt], 1, &vn1[k], 1 ); //magma_sswap( 1, &vn2[pvt], 1, &vn2[k], 1 ); magma_sswap(2, &vn1[pvt], n+offset, &vn1[k], n+offset); #endif } /* Apply previous Householder reflectors to column K: A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'. Optimization: multiply with beta=0; wait for vector and subtract */ if (k > 0) { /*#if (defined(PRECISION_c) || defined(PRECISION_z)) for (j = 0; j < k; ++j){ *F(k,j) = MAGMA_Z_CNJG( *F(k,j) ); } #endif*/ //#define RIGHT_UPDATE #ifdef RIGHT_UPDATE i__1 = m - offset - nb; i__2 = k; magma_zgemv( MagmaNoTrans, i__1, i__2, c_neg_one, A(offset+nb, 0), lda, F(k, 0), ldf, c_one, A(offset+nb, k), ione ); #else i__1 = m - rk; i__2 = k; /*blasf77_zgemv( MagmaNoTransStr, &i__1, &i__2, &c_neg_one, A(rk, 0), &lda, F(k, 0), &ldf, &c_one, A(rk, k), &ione );*/ magma_zgemv( MagmaNoTrans, i__1, i__2, c_neg_one, A(rk, 0), lda, F(k, 0), ldf, c_one, A(rk, k), ione ); #endif /*#if (defined(PRECISION_c) || defined(PRECISION_z)) for (j = 0; j < k; ++j) { *F(k,j) = MAGMA_Z_CNJG( *F(k,j) ); } #endif*/ } /* Generate elementary reflector H(k). */ magma_zlarfg_gpu(m-rk, A(rk, k), A(rk + 1, k), &tau[k], &vn1[k], &Aks[k]); //Akk = *A(rk, k); //*A(rk, k) = c_one; //magma_zgetvector( 1, &Aks[k], 1, &Akk, 1 ); /* needed to avoid the race condition */ if (k == 0) magma_zsetvector( 1, &c_one, 1, A(rk, k), 1 ); else magma_zcopymatrix( 1, 1, A(offset, 0), 1, A(rk, k), 1 ); /* Compute Kth column of F: Compute F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */ if (k < n-1 || k > 0) magma_zgetvector( 1, &tau[k], 1, &tauk, 1 ); if (k < n-1) { i__1 = m - rk; i__2 = n - k - 1; /* Send the vector to the GPU */ //magma_zsetmatrix( i__1, 1, A(rk, k), lda, dA(rk,k), ldda ); /* Multiply on GPU */ // was CALL ZGEMV( 'Conjugate transpose', M-RK+1, N-K, // TAU( K ), A( RK, K+1 ), LDA, // A( RK, K ), 1, // CZERO, F( K+1, K ), 1 ) //magma_zgetvector( 1, &tau[k], 1, &tauk, 1 ); magma_zgemv( MagmaConjTrans, m-rk, n-k-1, tauk, A( rk, k+1 ), lda, A( rk, k ), 1, c_zero, F( k+1, k ), 1 ); //magma_zscal( m-rk, tau[k], F( k+1, k), 1 ); //magma_int_t i__3 = nb-k-1; //magma_int_t i__4 = i__2 - i__3; //magma_int_t i__5 = nb-k; //magma_zgemv( MagmaConjTrans, i__1 - i__5, i__2 - i__3, // tau[k], dA(rk +i__5, k+1+i__3), ldda, // dA(rk +i__5, k ), ione, // c_zero, dF(k+1+i__3, k ), ione ); //magma_zgetmatrix_async( i__2-i__3, 1, // dF(k + 1 +i__3, k), i__2, // F (k + 1 +i__3, k), i__2, stream ); //blasf77_zgemv( MagmaConjTransStr, &i__1, &i__3, // &tau[k], A(rk, k+1), &lda, // A(rk, k ), &ione, // &c_zero, F(k+1, k ), &ione ); //magma_queue_sync( stream ); //blasf77_zgemv( MagmaConjTransStr, &i__5, &i__4, // &tau[k], A(rk, k+1+i__3), &lda, // A(rk, k ), &ione, // &c_one, F(k+1+i__3, k ), &ione ); } /* Padding F(1:K,K) with zeros. for (j = 0; j <= k; ++j) { magma_zsetvector( 1, &c_zero, 1, F(j, k), 1 ); }*/ /* Incremental updating of F: F(1:N,K) := F(1:N,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K). F(1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K) := tau(K)(A(RK:M,K+1:N)' - F(1:N,1:K-1)*A(RK:M,1:K-1)') A(RK:M,K) so, F is (updated A)*V */ //if (k > 0 && k<n-1) { if (k > 0) { //magma_zgetvector( 1, &tau[k], 1, &tauk, 1 ); z__1 = MAGMA_Z_NEGATE( tauk ); #ifdef RIGHT_UPDATE i__1 = m - offset - nb; i__2 = k; magma_zgemv( MagmaConjTrans, i__1, i__2, z__1, A(offset+nb, 0), lda, A(offset+nb, k), ione, c_zero, auxv, ione ); i__1 = k; magma_zgemv( MagmaNoTrans, n-k-1, i__1, c_one, F(k+1,0), ldf, auxv, ione, c_one, F(k+1,k), ione ); #else i__1 = m - rk; i__2 = k; //blasf77_zgemv( MagmaConjTransStr, &i__1, &i__2, // &z__1, A(rk, 0), &lda, // A(rk, k), &ione, // &c_zero, auxv, &ione ); magma_zgemv( MagmaConjTrans, i__1, i__2, z__1, A(rk, 0), lda, A(rk, k), ione, c_zero, auxv, ione ); //i__1 = k; //blasf77_zgemv( MagmaNoTransStr, &n, &i__1, // &c_one, F(0,0), &ldf, // auxv, &ione, // &c_one, F(0,k), &ione ); /*magma_zgemv( MagmaNoTrans, n, i__1, c_one, F(0,0), ldf, auxv, ione, c_one, F(0,k), ione );*/ /* I think we only need stricly lower-triangular part :) */ magma_zgemv( MagmaNoTrans, n-k-1, i__2, c_one, F(k+1,0), ldf, auxv, ione, c_one, F(k+1,k), ione ); #endif } /* Optimization: On the last iteration start sending F back to the GPU */ /* Update the current row of A: A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'. */ if (k < n-1) { i__1 = n - k - 1; i__2 = k + 1; //blasf77_zgemm( MagmaNoTransStr, MagmaConjTransStr, &ione, &i__1, &i__2, // &c_neg_one, A(rk, 0 ), &lda, // F(k+1,0 ), &ldf, // &c_one, A(rk, k+1), &lda ); #ifdef RIGHT_UPDATE /* right-looking update of rows, */ magma_zgemm( MagmaNoTrans, MagmaConjTrans, nb-k, i__1, ione, c_neg_one, A(rk, k ), lda, F(k+1, k ), ldf, c_one, A(rk, k+1), lda ); #else /* left-looking update of rows, * * since F=A'v with original A, so no right-looking */ magma_zgemm( MagmaNoTrans, MagmaConjTrans, ione, i__1, i__2, c_neg_one, A(rk, 0 ), lda, F(k+1,0 ), ldf, c_one, A(rk, k+1), lda ); #endif } /* Update partial column norms. */ if (rk < min(m, n+offset)-1 ) { magmablas_dznrm2_row_check_adjust(n-k-1, tol3z, &vn1[k+1], &vn2[k+1], A(rk,k+1), lda, lsticcs); magma_device_sync(); #if defined(PRECISION_d) || defined(PRECISION_z) magma_dgetvector( 1, &lsticcs[0], 1, &lsticc, 1 ); #else magma_sgetvector( 1, &lsticcs[0], 1, &lsticc, 1 ); #endif } /*if (rk < lastrk) { for (j = k + 1; j < n; ++j) { if (vn1[j] != 0.) { // NOTE: The following 4 lines follow from the analysis in // Lapack Working Note 176. temp = MAGMA_Z_ABS( *A(rk,j) ) / vn1[j]; temp = max( 0., ((1. + temp) * (1. - temp)) ); d__1 = vn1[j] / vn2[j]; temp2 = temp * (d__1 * d__1); if (temp2 <= tol3z) { vn2[j] = (double) lsticc; lsticc = j; } else { vn1[j] *= magma_dsqrt(temp); } } } }*/ //*A(rk, k) = Akk; //magma_zsetvector( 1, &Akk, 1, A(rk, k), 1 ); //magma_zswap( 1, &Aks[k], 1, A(rk, k), 1 ); ++k; } magma_zcopymatrix( 1, k, Aks, 1, A(offset, 0), lda+1 ); // leave k as the last column done --k; *kb = k + 1; rk = offset + *kb - 1; /* Apply the block reflector to the rest of the matrix: A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) - A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)' */ if (*kb < min(n, m - offset)) { i__1 = m - rk - 1; i__2 = n - *kb; /* Send F to the GPU magma_zsetmatrix( i__2, *kb, F (*kb, 0), ldf, dF(*kb, 0), i__2 );*/ magma_zgemm( MagmaNoTrans, MagmaConjTrans, i__1, i__2, *kb, c_neg_one, A(rk+1, 0 ), lda, F(*kb, 0 ), ldf, c_one, A(rk+1, *kb), lda ); } /* Recomputation of difficult columns. */ if( lsticc > 0 ) { printf( " -- recompute dnorms --\n" ); magmablas_dznrm2_check(m-rk-1, n-*kb, A(rk+1,*kb), lda, &vn1[*kb], lsticcs); #if defined(PRECISION_d) || defined(PRECISION_z) magma_dcopymatrix( n-*kb, 1, &vn1[*kb], *kb, &vn2[*kb], *kb); #else magma_scopymatrix( n-*kb, 1, &vn1[*kb], *kb, &vn2[*kb], *kb); #endif /*while( lsticc > 0 ) { itemp = (magma_int_t)(vn2[lsticc] >= 0. ? floor(vn2[lsticc] + .5) : -floor(.5 - vn2[lsticc])); i__1 = m - rk - 1; if (lsticc <= nb) vn1[lsticc] = cblas_dznrm2(i__1, A(rk + 1, lsticc), ione); else { // Where is the data, CPU or GPU ? double r1, r2; r1 = cblas_dznrm2(nb-k, A(rk + 1, lsticc), ione); r2 = magma_dznrm2(m-offset-nb, dA(offset + nb + 1, lsticc), ione); vn1[lsticc] = magma_dsqrt(r1*r1+r2*r2); } // NOTE: The computation of VN1( LSTICC ) relies on the fact that // SNRM2 does not fail on vectors with norm below the value of SQRT(DLAMCH('S')) vn2[lsticc] = vn1[lsticc]; lsticc = itemp;*/ } magma_free(Aks); magma_free(lsticcs); return MAGMA_SUCCESS; } /* magma_zlaqps */
/* //////////////////////////////////////////////////////////////////////////// -- Testing ztrsm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cublas_perf, cublas_time, cpu_perf=0, cpu_time=0; double cublas_error, normA, normx, normr, work[1]; magma_int_t N, info; magma_int_t sizeA; magma_int_t lda, ldda; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t *ipiv; magmaDoubleComplex *h_A, *h_b, *h_x, *h_xcublas; magmaDoubleComplex_ptr d_A, d_x; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("uplo = %s, transA = %s, diag = %s\n", lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag) ); printf(" N CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n"); printf("============================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; gflops = FLOPS_ZTRSM(opts.side, N, 1) / 1e9; lda = N; ldda = ((lda+31)/32)*32; sizeA = lda*N; TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( h_b, magmaDoubleComplex, N ); TESTING_MALLOC_CPU( h_x, magmaDoubleComplex, N ); TESTING_MALLOC_CPU( h_xcublas, magmaDoubleComplex, N ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( d_x, magmaDoubleComplex, N ); /* Initialize the matrices */ /* Factor A into LU to get well-conditioned triangular matrix. * Copy L to U, since L seems okay when used with non-unit diagonal * (i.e., from U), while U fails when used with unit diagonal. */ lapackf77_zlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_zgetrf( &N, &N, h_A, &lda, ipiv, &info ); for( int j = 0; j < N; ++j ) { for( int i = 0; i < j; ++i ) { *h_A(i,j) = *h_A(j,i); } } lapackf77_zlarnv( &ione, ISEED, &N, h_b ); blasf77_zcopy( &N, h_b, &ione, h_x, &ione ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_zsetmatrix( N, N, h_A, lda, d_A, ldda ); magma_zsetvector( N, h_x, 1, d_x, 1 ); cublas_time = magma_sync_wtime( NULL ); cublasZtrsv( opts.handle, cublas_uplo_const(opts.uplo), cublas_trans_const(opts.transA), cublas_diag_const(opts.diag), N, d_A, ldda, d_x, 1 ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_zgetvector( N, d_x, 1, h_xcublas, 1 ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_ztrsv( lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &N, h_A, &lda, h_x, &ione ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ // ||b - Ax|| / (||A||*||x||) // error for CUBLAS normA = lapackf77_zlange( "F", &N, &N, h_A, &lda, work ); normx = lapackf77_zlange( "F", &N, &ione, h_xcublas, &ione, work ); blasf77_ztrmv( lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &N, h_A, &lda, h_xcublas, &ione ); blasf77_zaxpy( &N, &c_neg_one, h_b, &ione, h_xcublas, &ione ); normr = lapackf77_zlange( "F", &N, &ione, h_xcublas, &N, work ); cublas_error = normr / (normA*normx); if ( opts.lapack ) { printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, cublas_error, (cublas_error < tol ? "ok" : "failed")); status += ! (cublas_error < tol); } else { printf("%5d %7.2f (%7.2f) --- ( --- ) %8.2e %s\n", (int) N, cublas_perf, 1000.*cublas_time, cublas_error, (cublas_error < tol ? "ok" : "failed")); status += ! (cublas_error < tol); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_b ); TESTING_FREE_CPU( h_x ); TESTING_FREE_CPU( h_xcublas ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_x ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf, cpu_time; double magma_error, cublas_error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t N, lda, sizeA, sizeX, sizeY, blocks, ldwork; magma_int_t incx = 1; magma_int_t incy = 1; magma_int_t nb = 64; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 1.5, -2.3 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( -0.6, 0.8 ); magmaDoubleComplex *A, *X, *Y, *Ycublas, *Ymagma; magmaDoubleComplex *dA, *dX, *dY, *dC_work; magma_opts opts; parse_opts( argc, argv, &opts ); printf(" N MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA error CUBLAS error\n"); printf("=============================================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; lda = ((N + 31)/32)*32; sizeA = N*lda; sizeX = N*incx; sizeY = N*incy; gflops = FLOPS_ZHEMV( N ) / 1e9; TESTING_MALLOC( A, magmaDoubleComplex, sizeA ); TESTING_MALLOC( X, magmaDoubleComplex, sizeX ); TESTING_MALLOC( Y, magmaDoubleComplex, sizeY ); TESTING_MALLOC( Ycublas, magmaDoubleComplex, sizeY ); TESTING_MALLOC( Ymagma, magmaDoubleComplex, sizeY ); TESTING_DEVALLOC( dA, magmaDoubleComplex, sizeA ); TESTING_DEVALLOC( dX, magmaDoubleComplex, sizeX ); TESTING_DEVALLOC( dY, magmaDoubleComplex, sizeY ); blocks = (N + nb - 1) / nb; ldwork = lda * (blocks + 1); TESTING_DEVALLOC( dC_work, magmaDoubleComplex, ldwork ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &sizeA, A ); magma_zmake_hermitian( N, A, lda ); lapackf77_zlarnv( &ione, ISEED, &sizeX, X ); lapackf77_zlarnv( &ione, ISEED, &sizeY, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_zsetmatrix( N, N, A, lda, dA, lda ); magma_zsetvector( N, X, incx, dX, incx ); magma_zsetvector( N, Y, incy, dY, incy ); cublas_time = magma_sync_wtime( 0 ); cublasZhemv( opts.uplo, N, alpha, dA, lda, dX, incx, beta, dY, incy ); cublas_time = magma_sync_wtime( 0 ) - cublas_time; cublas_perf = gflops / cublas_time; magma_zgetvector( N, dY, incy, Ycublas, incy ); /* ===================================================================== Performs operation using MAGMA BLAS =================================================================== */ magma_zsetvector( N, Y, incy, dY, incy ); magma_time = magma_sync_wtime( 0 ); #if (GPUSHMEM >= 200) magmablas_zhemv2( opts.uplo, N, alpha, dA, lda, dX, incx, beta, dY, incy, dC_work, ldwork ); #else magmablas_zhemv( opts.uplo, N, alpha, dA, lda, dX, incx, beta, dY, incy ); #endif magma_time = magma_sync_wtime( 0 ) - magma_time; magma_perf = gflops / magma_time; magma_zgetvector( N, dY, incy, Ymagma, incy ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_zhemv( &opts.uplo, &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ blasf77_zaxpy( &N, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_zlange( "M", &N, &ione, Ymagma, &N, work ) / N; blasf77_zaxpy( &N, &c_neg_one, Y, &incy, Ycublas, &incy ); cublas_error = lapackf77_zlange( "M", &N, &ione, Ycublas, &N, work ) / N; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e\n", (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error ); TESTING_FREE( A ); TESTING_FREE( X ); TESTING_FREE( Y ); TESTING_FREE( Ycublas ); TESTING_FREE( Ymagma ); TESTING_DEVFREE( dA ); TESTING_DEVFREE( dX ); TESTING_DEVFREE( dY ); TESTING_DEVFREE( dC_work ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
extern "C" magma_int_t magma_zbicgstab_merge3( magma_z_matrix A, magma_z_matrix b, magma_z_matrix *x, magma_z_solver_par *solver_par, magma_queue_t queue ) { magma_int_t info = MAGMA_NOTCONVERGED; // prepare solver feedback solver_par->solver = Magma_BICGSTABMERGE; solver_par->numiter = 0; solver_par->spmv_count = 0; // solver variables magmaDoubleComplex alpha, beta, omega, rho_old, rho_new, *skp_h={0}; double nom, nom0, betanom, nomb; // some useful variables magmaDoubleComplex c_zero = MAGMA_Z_ZERO, c_one = MAGMA_Z_ONE; magma_int_t dofs = A.num_rows; // workspace magma_z_matrix q={Magma_CSR}, r={Magma_CSR}, rr={Magma_CSR}, p={Magma_CSR}, v={Magma_CSR}, s={Magma_CSR}, t={Magma_CSR}; magmaDoubleComplex *d1=NULL, *d2=NULL, *skp=NULL; d1 = NULL; d2 = NULL; skp = NULL; CHECK( magma_zmalloc( &d1, dofs*(2) )); CHECK( magma_zmalloc( &d2, dofs*(2) )); // array for the parameters CHECK( magma_zmalloc( &skp, 8 )); // skp = [alpha|beta|omega|rho_old|rho|nom|tmp1|tmp2] CHECK( magma_zvinit( &q, Magma_DEV, dofs*6, 1, c_zero, queue )); // q = rr|r|p|v|s|t rr.memory_location = Magma_DEV; rr.dval = NULL; rr.num_rows = rr.nnz = dofs; rr.num_cols = 1; rr.storage_type = Magma_DENSE; r.memory_location = Magma_DEV; r.dval = NULL; r.num_rows = r.nnz = dofs; r.num_cols = 1; r.storage_type = Magma_DENSE; p.memory_location = Magma_DEV; p.dval = NULL; p.num_rows = p.nnz = dofs; p.num_cols = 1; p.storage_type = Magma_DENSE; v.memory_location = Magma_DEV; v.dval = NULL; v.num_rows = v.nnz = dofs; v.num_cols = 1; v.storage_type = Magma_DENSE; s.memory_location = Magma_DEV; s.dval = NULL; s.num_rows = s.nnz = dofs; s.num_cols = 1; s.storage_type = Magma_DENSE; t.memory_location = Magma_DEV; t.dval = NULL; t.num_rows = t.nnz = dofs; t.num_cols = 1; t.storage_type = Magma_DENSE; rr.dval = q(0); r.dval = q(1); p.dval = q(2); v.dval = q(3); s.dval = q(4); t.dval = q(5); // solver setup CHECK( magma_zresidualvec( A, b, *x, &r, &nom0, queue)); magma_zcopy( dofs, r.dval, 1, q(0), 1, queue ); // rr = r magma_zcopy( dofs, r.dval, 1, q(1), 1, queue ); // q = r betanom = nom0; nom = nom0*nom0; rho_new = magma_zdotc( dofs, r.dval, 1, r.dval, 1, queue ); // rho=<rr,r> rho_old = omega = alpha = MAGMA_Z_MAKE( 1.0, 0. ); beta = rho_new; solver_par->init_res = nom0; // array on host for the parameters CHECK( magma_zmalloc_cpu( &skp_h, 8 )); nomb = magma_dznrm2( dofs, b.dval, 1, queue ); if ( nomb == 0.0 ){ nomb=1.0; } solver_par->final_res = solver_par->init_res; solver_par->iter_res = solver_par->init_res; if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = nom0; solver_par->timing[0] = 0.0; } skp_h[0]=alpha; skp_h[1]=beta; skp_h[2]=omega; skp_h[3]=rho_old; skp_h[4]=rho_new; skp_h[5]=MAGMA_Z_MAKE(nom, 0.0); magma_zsetvector( 8, skp_h, 1, skp, 1, queue ); CHECK( magma_z_spmv( c_one, A, r, c_zero, v, queue )); // z = A r nomb = magma_dznrm2( dofs, b.dval, 1, queue ); if( nom0 < solver_par->atol || nom0/nomb < solver_par->rtol ){ info = MAGMA_SUCCESS; goto cleanup; } //Chronometry real_Double_t tempo1, tempo2; tempo1 = magma_sync_wtime( queue ); solver_par->numiter = 0; solver_par->spmv_count = 0; // start iteration do { solver_par->numiter++; // computes p=r+beta*(p-omega*v) CHECK( magma_zbicgmerge1( dofs, skp, v.dval, r.dval, p.dval, queue )); CHECK( magma_z_spmv( c_one, A, p, c_zero, v, queue )); // v = Ap solver_par->spmv_count++; CHECK( magma_zmdotc( dofs, 1, q.dval, v.dval, d1, d2, skp, queue )); CHECK( magma_zbicgmerge4( 1, skp, queue )); CHECK( magma_zbicgmerge2( dofs, skp, r.dval, v.dval, s.dval, queue )); // s=r-alpha*v CHECK( magma_z_spmv( c_one, A, s, c_zero, t, queue )); // t=As solver_par->spmv_count++; CHECK( magma_zmdotc( dofs, 2, q.dval+4*dofs, t.dval, d1, d2, skp+6, queue )); CHECK( magma_zbicgmerge4( 2, skp, queue )); CHECK( magma_zbicgmerge_xrbeta( dofs, d1, d2, q.dval, r.dval, p.dval, s.dval, t.dval, x->dval, skp, queue )); // check stopping criterion magma_zgetvector_async( 1 , skp+5, 1, skp_h+5, 1, queue ); betanom = sqrt(MAGMA_Z_REAL(skp_h[5])); if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( betanom < solver_par->atol || betanom/nomb < solver_par->rtol ) { break; } } while ( solver_par->numiter+1 <= solver_par->maxiter ); tempo2 = magma_sync_wtime( queue ); solver_par->runtime = (real_Double_t) tempo2-tempo1; double residual; CHECK( magma_zresidualvec( A, b, *x, &r, &residual, queue)); solver_par->iter_res = betanom; solver_par->final_res = residual; if ( solver_par->numiter < solver_par->maxiter ) { info = MAGMA_SUCCESS; } else if ( solver_par->init_res > solver_par->final_res ) { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } info = MAGMA_SLOW_CONVERGENCE; if( solver_par->iter_res < solver_par->atol || solver_par->iter_res/solver_par->init_res < solver_par->rtol ){ info = MAGMA_SUCCESS; } } else { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } info = MAGMA_DIVERGENCE; } cleanup: magma_zmfree(&q, queue ); // frees all vectors magma_free(d1); magma_free(d2); magma_free( skp ); magma_free_cpu( skp_h ); solver_par->info = info; return info; } /* zbicgstab_merge */
// -------------------- int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, cpu_time=0, cpu_perf=0, gpu_time, gpu_perf, mgpu_time, mgpu_perf, cuda_time, cuda_perf; double Ynorm, error=0, error2=0, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t n_local[MagmaMaxGPUs]; magma_int_t N, Noffset, lda, ldda, blocks, lhwork, ldwork, matsize, vecsize; magma_int_t incx = 1; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 1.5, -2.3 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( -0.6, 0.8 ); magmaDoubleComplex *A, *X, *Y, *Ylapack, *Ycublas, *Ymagma, *Ymagma1, *hwork; magmaDoubleComplex_ptr dA, dX, dY; magmaDoubleComplex_ptr d_lA[MagmaMaxGPUs], dwork[MagmaMaxGPUs]; magma_device_t dev; magma_queue_t queues[MagmaMaxGPUs]; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); opts.ngpu = abs( opts.ngpu ); // always uses multi-GPU code double tol = opts.tolerance * lapackf77_dlamch("E"); magma_int_t nb = 64; // required by magmablas_zhemv_mgpu implementation for( dev=0; dev < opts.ngpu; ++dev ) { magma_queue_create( dev, &queues[dev] ); } // currently, tests all offsets in the offsets array; // comment out loop below to test a specific offset. magma_int_t offset = opts.offset; magma_int_t offsets[] = { 0, 1, 31, 32, 33, 63, 64, 65, 100, 200 }; magma_int_t noffsets = sizeof(offsets) / sizeof(*offsets); printf("%% uplo = %s, ngpu %d, block size = %d, offset %d\n", lapack_uplo_const(opts.uplo), (int) opts.ngpu, (int) nb, (int) offset ); printf( "%% BLAS CUBLAS MAGMA 1 GPU MAGMA MGPU Error rel Error rel\n" "%% N offset Gflop/s (msec) Gflop/s (msec) Gflop/s (msec) Gflop/s (msec) to CUBLAS to LAPACK\n" "%%==================================================================================================================\n" ); for( int itest = 0; itest < opts.ntest; ++itest ) { // comment out these two lines & end of loop to test a specific offset for( int ioffset=0; ioffset < noffsets; ioffset += 1 ) { offset = offsets[ioffset]; for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; Noffset = N + offset; lda = Noffset; ldda = magma_roundup( Noffset, opts.align ); // multiple of 32 by default matsize = Noffset*ldda; vecsize = (Noffset-1)*incx + 1; gflops = FLOPS_ZHEMV( N ) / 1e9; blocks = magma_ceildiv( N + (offset % nb), nb ); lhwork = N*opts.ngpu; ldwork = ldda*(blocks + 1); TESTING_MALLOC_CPU( A, magmaDoubleComplex, matsize ); TESTING_MALLOC_CPU( Y, magmaDoubleComplex, vecsize ); TESTING_MALLOC_CPU( Ycublas, magmaDoubleComplex, vecsize ); TESTING_MALLOC_CPU( Ymagma, magmaDoubleComplex, vecsize ); TESTING_MALLOC_CPU( Ymagma1, magmaDoubleComplex, vecsize ); TESTING_MALLOC_CPU( Ylapack, magmaDoubleComplex, vecsize ); TESTING_MALLOC_PIN( X, magmaDoubleComplex, vecsize ); TESTING_MALLOC_PIN( hwork, magmaDoubleComplex, lhwork ); magma_setdevice( opts.device ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, matsize ); TESTING_MALLOC_DEV( dX, magmaDoubleComplex, vecsize ); TESTING_MALLOC_DEV( dY, magmaDoubleComplex, vecsize ); // TODO make magma_zmalloc_bcyclic helper function? for( dev=0; dev < opts.ngpu; dev++ ) { n_local[dev] = ((Noffset/nb)/opts.ngpu)*nb; if (dev < (Noffset/nb) % opts.ngpu) n_local[dev] += nb; else if (dev == (Noffset/nb) % opts.ngpu) n_local[dev] += Noffset % nb; magma_setdevice( dev ); TESTING_MALLOC_DEV( d_lA[dev], magmaDoubleComplex, ldda*n_local[dev] ); TESTING_MALLOC_DEV( dwork[dev], magmaDoubleComplex, ldwork ); } ////////////////////////////////////////////////////////////////////////// /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &matsize, A ); magma_zmake_hermitian( Noffset, A, lda ); lapackf77_zlarnv( &ione, ISEED, &vecsize, X ); lapackf77_zlarnv( &ione, ISEED, &vecsize, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_setdevice( opts.device ); magma_zsetmatrix( Noffset, Noffset, A, lda, dA, ldda, opts.queue ); magma_zsetvector( Noffset, X, incx, dX, incx, opts.queue ); magma_zsetvector( Noffset, Y, incx, dY, incx, opts.queue ); cuda_time = magma_sync_wtime(0); cublasZhemv( opts.handle, cublas_uplo_const(opts.uplo), N, &alpha, dA + offset + offset*ldda, ldda, dX + offset, incx, &beta, dY + offset, incx ); cuda_time = magma_sync_wtime(0) - cuda_time; cuda_perf = gflops / cuda_time; magma_zgetvector( Noffset, dY, incx, Ycublas, incx, opts.queue ); /* ===================================================================== Performs operation using MAGMABLAS (1 GPU) =================================================================== */ magma_setdevice( opts.device ); magma_zsetvector( Noffset, Y, incx, dY, incx, opts.queue ); gpu_time = magma_sync_wtime( opts.queue ); magmablas_zhemv_work( opts.uplo, N, alpha, dA + offset + offset*ldda, ldda, dX + offset, incx, beta, dY + offset, incx, dwork[ opts.device ], ldwork, opts.queue ); gpu_time = magma_sync_wtime( opts.queue ) - gpu_time; gpu_perf = gflops / gpu_time; magma_zgetvector( Noffset, dY, incx, Ymagma1, incx, opts.queue ); /* ===================================================================== Performs operation using MAGMABLAS (multi-GPU) =================================================================== */ magma_zsetmatrix_1D_col_bcyclic( Noffset, Noffset, A, lda, d_lA, ldda, opts.ngpu, nb, queues ); blasf77_zcopy( &Noffset, Y, &incx, Ymagma, &incx ); // workspaces do NOT need to be zero -- set to NAN to prove for( dev=0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); magmablas_zlaset( MagmaFull, ldwork, 1, MAGMA_Z_NAN, MAGMA_Z_NAN, dwork[dev], ldwork, opts.queue ); } lapackf77_zlaset( "Full", &lhwork, &ione, &MAGMA_Z_NAN, &MAGMA_Z_NAN, hwork, &lhwork ); mgpu_time = magma_sync_wtime(0); magma_int_t info; info = magmablas_zhemv_mgpu( opts.uplo, N, alpha, d_lA, ldda, offset, X + offset, incx, beta, Ymagma + offset, incx, hwork, lhwork, dwork, ldwork, opts.ngpu, nb, queues ); if (info != 0) { printf("magmablas_zhemv_mgpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } info = magmablas_zhemv_mgpu_sync( opts.uplo, N, alpha, d_lA, ldda, offset, X + offset, incx, beta, Ymagma + offset, incx, hwork, lhwork, dwork, ldwork, opts.ngpu, nb, queues ); if (info != 0) { printf("magmablas_zhemv_sync returned error %d: %s.\n", (int) info, magma_strerror( info )); } mgpu_time = magma_sync_wtime(0) - mgpu_time; mgpu_perf = gflops / mgpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { blasf77_zcopy( &Noffset, Y, &incx, Ylapack, &incx ); cpu_time = magma_wtime(); blasf77_zhemv( lapack_uplo_const(opts.uplo), &N, &alpha, A + offset + offset*lda, &lda, X + offset, &incx, &beta, Ylapack + offset, &incx ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Compute the Difference LAPACK vs. Magma =================================================================== */ Ynorm = lapackf77_zlange( "F", &Noffset, &ione, Ylapack, &Noffset, work ); blasf77_zaxpy( &Noffset, &c_neg_one, Ymagma, &incx, Ylapack, &incx ); error2 = lapackf77_zlange( "F", &Noffset, &ione, Ylapack, &Noffset, work ) / Ynorm; } /* ===================================================================== Compute the Difference Cublas vs. Magma =================================================================== */ Ynorm = lapackf77_zlange( "F", &Noffset, &ione, Ycublas, &Noffset, work ); blasf77_zaxpy( &Noffset, &c_neg_one, Ymagma, &incx, Ycublas, &incx ); error = lapackf77_zlange( "F", &Noffset, &ione, Ycublas, &Noffset, work ) / Ynorm; bool okay = (error < tol && error2 < tol); status += ! okay; if ( opts.lapack ) { printf( "%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) N, (int) offset, cpu_perf, cpu_time*1000., cuda_perf, cuda_time*1000., gpu_perf, gpu_time*1000., mgpu_perf, mgpu_time*1000., error, error2, (okay ? "ok" : "failed") ); } else { printf( "%5d %5d --- ( --- ) %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e --- %s\n", (int) N, (int) offset, cuda_perf, cuda_time*1000., gpu_perf, gpu_time*1000., mgpu_perf, mgpu_time*1000., error, (okay ? "ok" : "failed") ); } /* Free Memory */ TESTING_FREE_CPU( A ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ycublas ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_CPU( Ymagma1 ); TESTING_FREE_CPU( Ylapack ); TESTING_FREE_PIN( X ); TESTING_FREE_PIN( hwork ); magma_setdevice( opts.device ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); for( dev=0; dev < opts.ngpu; dev++ ) { magma_setdevice( dev ); TESTING_FREE_DEV( d_lA[dev] ); TESTING_FREE_DEV( dwork[dev] ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } // comment out these two lines line & top of loop test a specific offset } // end for ioffset printf( "\n" ); } for( dev=0; dev < opts.ngpu; ++dev ) { magma_queue_destroy( queues[dev] ); } opts.cleanup(); TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_zgesv_rbt( magma_bool_t ref, magma_int_t n, magma_int_t nrhs, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *B, magma_int_t ldb, magma_int_t *info) { /* Function Body */ *info = 0; if ( ! (ref == MagmaTrue) && ! (ref == MagmaFalse) ) { *info = -1; } else if (n < 0) { *info = -2; } else if (nrhs < 0) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if (ldb < max(1,n)) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (nrhs == 0 || n == 0) return *info; magma_int_t nn = n + ((4-(n % 4))%4); magmaDoubleComplex *dA, *hu, *hv, *db, *dAo, *dBo, *dwork; magma_int_t n2; magma_int_t iter; n2 = nn*nn; if (MAGMA_SUCCESS != magma_zmalloc( &dA, n2 )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } if (MAGMA_SUCCESS != magma_zmalloc( &db, nn*nrhs )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } if (ref == MagmaTrue) { if (MAGMA_SUCCESS != magma_zmalloc( &dAo, n2 )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } if (MAGMA_SUCCESS != magma_zmalloc( &dwork, nn*nrhs )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } if (MAGMA_SUCCESS != magma_zmalloc( &dBo, nn*nrhs )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } if (MAGMA_SUCCESS != magma_zmalloc_cpu( &hu, 2*nn )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } if (MAGMA_SUCCESS != magma_zmalloc_cpu( &hv, 2*nn )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magmablas_zlaset(MagmaFull, nn, nn, MAGMA_Z_ZERO, MAGMA_Z_ONE, dA, nn); /* Send matrix on the GPU*/ magma_zsetmatrix(n, n, A, lda, dA, nn); /* Send b on the GPU*/ magma_zsetmatrix(n, nrhs, B, ldb, db, nn); *info = magma_zgerbt_gpu(MagmaTrue, nn, nrhs, dA, nn, db, nn, hu, hv, info); if (*info != MAGMA_SUCCESS) { return *info; } if (ref == MagmaTrue) { magma_zcopymatrix(nn, nn, dA, nn, dAo, nn); magma_zcopymatrix(nn, nrhs, db, nn, dBo, nn); } /* Solve the system U^TAV.y = U^T.b on the GPU*/ magma_zgesv_nopiv_gpu( nn, nrhs, dA, nn, db, nn, info); /* Iterative refinement */ if (ref == MagmaTrue) { magma_zgerfs_nopiv_gpu(MagmaNoTrans, nn, nrhs, dAo, nn, dBo, nn, db, nn, dwork, dA, &iter, info); } //printf("iter = %d\n", iter); /* The solution of A.x = b is Vy computed on the GPU */ magmaDoubleComplex *dv; if (MAGMA_SUCCESS != magma_zmalloc( &dv, 2*nn )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_zsetvector(2*nn, hv, 1, dv, 1); for(int i = 0; i < nrhs; i++) { magmablas_zprbt_mv(nn, dv, db+(i*nn)); } magma_zgetmatrix(n, nrhs, db, nn, B, ldb); magma_free_cpu( hu); magma_free_cpu( hv); magma_free( dA ); magma_free( dv ); magma_free( db ); if (ref == MagmaTrue) { magma_free( dAo ); magma_free( dBo ); magma_free( dwork ); } return *info; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zlanhe */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaDoubleComplex *h_A; double *h_work; magmaDoubleComplex_ptr d_A; magmaDouble_ptr d_work; magma_int_t i, j, N, n2, lda, ldda; magma_int_t idist = 3; // normal distribution (otherwise max norm is always ~ 1) magma_int_t ISEED[4] = {0,0,0,1}; double error, norm_magma, norm_lapack; magma_int_t status = 0; magma_int_t lapack_nan_fail = 0; magma_int_t lapack_inf_fail = 0; bool mkl_warning = false; magma_opts opts; opts.parse_opts( argc, argv ); double tol = opts.tolerance * lapackf77_dlamch("E"); double tol2; magma_uplo_t uplo[] = { MagmaLower, MagmaUpper }; magma_norm_t norm[] = { MagmaInfNorm, MagmaOneNorm, MagmaMaxNorm, MagmaFrobeniusNorm }; // Double-Complex inf-norm not supported on Tesla (CUDA arch 1.x) #if defined(PRECISION_z) magma_int_t arch = magma_getdevice_arch(); if ( arch < 200 ) { printf("!!!! NOTE: Double-Complex %s and %s norm are not supported\n" "!!!! on CUDA architecture %d; requires arch >= 200.\n" "!!!! It should report \"parameter number 1 had an illegal value\" below.\n\n", MagmaInfNormStr, MagmaOneNormStr, (int) arch ); for( int inorm = 0; inorm < 2; ++inorm ) { for( int iuplo = 0; iuplo < 2; ++iuplo ) { printf( "Testing that magmablas_zlanhe( %s, %s, ... ) returns -1 error...\n", lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] )); norm_magma = magmablas_zlanhe( norm[inorm], uplo[iuplo], 1, NULL, 1, NULL, 1 ); if ( norm_magma != -1 ) { printf( "expected magmablas_zlanhe to return -1 error, but got %f\n", norm_magma ); status = 1; } }} printf( "...return values %s\n\n", (status == 0 ? "ok" : "failed") ); } #endif #ifdef MAGMA_WITH_MKL // MKL 11.1 has bug in multi-threaded zlanhe; use single thread to work around. // MKL 11.2 corrects it for inf, one, max norm. // MKL 11.2 still segfaults for Frobenius norm, which is not tested here // because MAGMA doesn't implement Frobenius norm yet. MKLVersion mkl_version; mkl_get_version( &mkl_version ); magma_int_t la_threads = magma_get_lapack_numthreads(); bool mkl_single_thread = (mkl_version.MajorVersion <= 11 && mkl_version.MinorVersion < 2); if ( mkl_single_thread ) { printf( "\nNote: using single thread to work around MKL zlanhe bug.\n\n" ); } #endif printf("%% N norm uplo CPU GByte/s (ms) GPU GByte/s (ms) error nan inf\n"); printf("%%=================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int inorm = 0; inorm < 3; ++inorm ) { /* < 4 for Frobenius */ for( int iuplo = 0; iuplo < 2; ++iuplo ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; ldda = magma_roundup( N, opts.align ); // read upper or lower triangle gbytes = 0.5*(N+1)*N*sizeof(magmaDoubleComplex) / 1e9; TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_work, double, N ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( d_work, double, N ); /* Initialize the matrix */ lapackf77_zlarnv( &idist, ISEED, &n2, h_A ); magma_zsetmatrix( N, N, h_A, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); norm_magma = magmablas_zlanhe( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work, N ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gbytes / gpu_time; if (norm_magma == -1) { printf( "%5d %4c skipped because %s norm isn't supported\n", (int) N, lapacke_norm_const( norm[inorm] ), lapack_norm_const( norm[inorm] )); goto cleanup; } else if (norm_magma < 0) { printf("magmablas_zlanhe returned error %f: %s.\n", norm_magma, magma_strerror( (int) norm_magma )); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ #ifdef MAGMA_WITH_MKL if ( mkl_single_thread ) { // work around MKL bug in multi-threaded zlanhe magma_set_lapack_numthreads( 1 ); } #endif cpu_time = magma_wtime(); norm_lapack = lapackf77_zlanhe( lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] ), &N, h_A, &lda, h_work ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; if (norm_lapack < 0) { printf("lapackf77_zlanhe returned error %f: %s.\n", norm_lapack, magma_strerror( (int) norm_lapack )); } /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = fabs( norm_magma - norm_lapack ) / norm_lapack; tol2 = tol; if ( norm[inorm] == MagmaMaxNorm ) { // max-norm depends on only one element, so for Real precisions, // MAGMA and LAPACK should exactly agree (tol2 = 0), // while Complex precisions incur roundoff in cuCabs. #ifdef REAL tol2 = 0; #endif } bool okay; okay = (error <= tol2); status += ! okay; mkl_warning |= ! okay; /* ==================================================================== Check for NAN and INF propagation =================================================================== */ #define h_A(i_, j_) (h_A + (i_) + (j_)*lda) #define d_A(i_, j_) (d_A + (i_) + (j_)*ldda) i = rand() % N; j = rand() % N; magma_int_t tmp; if ( uplo[iuplo] == MagmaLower && i < j ) { tmp = i; i = j; j = tmp; } else if ( uplo[iuplo] == MagmaUpper && i > j ) { tmp = i; i = j; j = tmp; } *h_A(i,j) = MAGMA_Z_NAN; magma_zsetvector( 1, h_A(i,j), 1, d_A(i,j), 1 ); norm_magma = magmablas_zlanhe( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work, N ); norm_lapack = lapackf77_zlanhe( lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] ), &N, h_A, &lda, h_work ); bool nan_okay; nan_okay = isnan(norm_magma); bool la_nan_okay; la_nan_okay = isnan(norm_lapack); lapack_nan_fail += ! la_nan_okay; status += ! nan_okay; *h_A(i,j) = MAGMA_Z_INF; magma_zsetvector( 1, h_A(i,j), 1, d_A(i,j), 1 ); norm_magma = magmablas_zlanhe( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work, N ); norm_lapack = lapackf77_zlanhe( lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] ), &N, h_A, &lda, h_work ); bool inf_okay; inf_okay = isinf(norm_magma); bool la_inf_okay; la_inf_okay = isinf(norm_lapack); lapack_inf_fail += ! la_inf_okay; status += ! inf_okay; #ifdef MAGMA_WITH_MKL if ( mkl_single_thread ) { // end single thread to work around MKL bug magma_set_lapack_numthreads( la_threads ); } #endif printf("%5d %4c %4c %7.2f (%7.2f) %7.2f (%7.2f) %#9.3g %-6s %6s%1s %6s%1s\n", (int) N, lapacke_norm_const( norm[inorm] ), lapacke_uplo_const( uplo[iuplo] ), cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., error, (okay ? "ok" : "failed"), (nan_okay ? "ok" : "failed"), (la_nan_okay ? " " : "*"), (inf_okay ? "ok" : "failed"), (la_inf_okay ? " " : "*")); cleanup: TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_work ); fflush( stdout ); } // end iter if ( opts.niter > 1 ) { printf( "\n" ); } }} // end iuplo, inorm printf( "\n" ); } // don't print "failed" here because then run_tests.py thinks MAGMA failed if ( lapack_nan_fail ) { printf( "* Warning: LAPACK did not pass NAN propagation test; upgrade to LAPACK version >= 3.4.2 (Sep. 2012)\n" ); } if ( lapack_inf_fail ) { printf( "* Warning: LAPACK did not pass INF propagation test\n" ); } if ( mkl_warning ) { printf("* MKL (e.g., 11.1) has a bug in zlanhe with multiple threads;\n" " corrected in 11.2 for one, inf, max norms, but still in Frobenius norm.\n" " Try again with MKL_NUM_THREADS=1.\n" ); } opts.cleanup(); TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_zlatrd2(char uplo, magma_int_t n, magma_int_t nb, magmaDoubleComplex *a, magma_int_t lda, double *e, magmaDoubleComplex *tau, magmaDoubleComplex *w, magma_int_t ldw, magmaDoubleComplex *da, magma_int_t ldda, magmaDoubleComplex *dw, magma_int_t lddw, magmaDoubleComplex *dwork, magma_int_t ldwork) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= ZLATRD2 reduces NB rows and columns of a complex Hermitian matrix A to Hermitian tridiagonal form by an orthogonal similarity transformation Q' * A * Q, and returns the matrices V and W which are needed to apply the transformation to the unreduced part of A. If UPLO = 'U', ZLATRD reduces the last NB rows and columns of a matrix, of which the upper triangle is supplied; if UPLO = 'L', ZLATRD reduces the first NB rows and columns of a matrix, of which the lower triangle is supplied. This is an auxiliary routine called by ZHETRD2_GPU. It uses an accelerated HEMV that needs extra memory. Arguments ========= UPLO (input) CHARACTER*1 Specifies whether the upper or lower triangular part of the Hermitian matrix A is stored: = 'U': Upper triangular = 'L': Lower triangular N (input) INTEGER The order of the matrix A. NB (input) INTEGER The number of rows and columns to be reduced. A (input/output) COMPLEX_16 array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading n-by-n upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading n-by-n lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit: if UPLO = 'U', the last NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements above the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; if UPLO = 'L', the first NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements below the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. LDA (input) INTEGER The leading dimension of the array A. LDA >= (1,N). E (output) COMPLEX_16 array, dimension (N-1) If UPLO = 'U', E(n-nb:n-1) contains the superdiagonal elements of the last NB columns of the reduced matrix; if UPLO = 'L', E(1:nb) contains the subdiagonal elements of the first NB columns of the reduced matrix. TAU (output) COMPLEX_16 array, dimension (N-1) The scalar factors of the elementary reflectors, stored in TAU(n-nb:n-1) if UPLO = 'U', and in TAU(1:nb) if UPLO = 'L'. See Further Details. W (output) COMPLEX_16 array, dimension (LDW,NB) The n-by-nb matrix W required to update the unreduced part of A. LDW (input) INTEGER The leading dimension of the array W. LDW >= max(1,N). Further Details =============== If UPLO = 'U', the matrix Q is represented as a product of elementary reflectors Q = H(n) H(n-1) . . . H(n-nb+1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i), and tau in TAU(i-1). If UPLO = 'L', the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i), and tau in TAU(i). The elements of the vectors v together form the n-by-nb matrix V which is needed, with W, to apply the transformation to the unreduced part of the matrix, using a Hermitian rank-2k update of the form: A := A - V*W' - W*V'. The contents of A on exit are illustrated by the following examples with n = 5 and nb = 2: if UPLO = 'U': if UPLO = 'L': ( a a a v4 v5 ) ( d ) ( a a v4 v5 ) ( 1 d ) ( a 1 v5 ) ( v1 1 a ) ( d 1 ) ( v1 v2 a a ) ( d ) ( v1 v2 a a a ) where d denotes a diagonal element of the reduced matrix, a denotes an element of the original matrix that is unchanged, and vi denotes an element of the vector defining H(i). ===================================================================== */ char uplo_[2] = {uplo, 0}; magma_int_t i; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex value = MAGMA_Z_ZERO; magma_int_t ione = 1; magma_int_t i_n, i_1, iw; magmaDoubleComplex alpha; magmaDoubleComplex *f; if (n <= 0) { return 0; } magma_queue_t stream; magma_queue_create( &stream ); magma_zmalloc_cpu( &f, n ); assert( f != NULL ); // TODO return error, or allocate outside zlatrd if (lapackf77_lsame(uplo_, "U")) { /* Reduce last NB columns of upper triangle */ for (i = n-1; i >= n - nb ; --i) { i_1 = i + 1; i_n = n - i - 1; iw = i - n + nb; if (i < n-1) { /* Update A(1:i,i) */ #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i_n, W(i, iw+1), &ldw); #endif blasf77_zgemv("No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda, W(i, iw+1), &ldw, &c_one, A(0, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i_n, W(i, iw+1), &ldw); lapackf77_zlacgv(&i_n, A(i, i+1), &ldw); #endif blasf77_zgemv("No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw, A(i, i+1), &lda, &c_one, A(0, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i_n, A(i, i+1), &ldw); #endif } if (i > 0) { /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */ alpha = *A(i-1, i); lapackf77_zlarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]); e[i-1] = MAGMA_Z_REAL( alpha ); MAGMA_Z_SET2REAL(*A(i-1, i), 1.); /* Compute W(1:i-1,i) */ // 1. Send the block reflector A(0:n-i-1,i) to the GPU magma_zsetvector( i, A(0, i), 1, dA(0, i), 1 ); #if (GPUSHMEM < 200) magma_zhemv(MagmaUpper, i, c_one, dA(0, 0), ldda, dA(0, i), ione, c_zero, dW(0, iw), ione); #else magmablas_zhemv2(MagmaUpper, i, c_one, dA(0, 0), ldda, dA(0, i), ione, c_zero, dW(0, iw), ione, dwork, ldwork); #endif // 2. Start putting the result back (asynchronously) magma_zgetmatrix_async( i, 1, dW(0, iw), lddw, W(0, iw) /*test*/, ldw, stream ); if (i < n-1) { blasf77_zgemv(MagmaConjTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); } // 3. Here is where we need it // TODO find the right place magma_queue_sync( stream ); if (i < n-1) { blasf77_zgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); blasf77_zgemv(MagmaConjTransStr, &i, &i_n, &c_one, A(0, i+1), &lda, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); blasf77_zgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); } blasf77_zscal(&i, &tau[i - 1], W(0, iw), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_zdotc_sub( i, W(0,iw), ione, A(0,i), ione, &value ); #else value = cblas_zdotc( i, W(0,iw), ione, A(0,i), ione ); #endif alpha = tau[i - 1] * -0.5f * value; blasf77_zaxpy(&i, &alpha, A(0, i), &ione, W(0, iw), &ione); } } } else { /* Reduce first NB columns of lower triangle */ for (i = 0; i < nb; ++i) { /* Update A(i:n,i) */ i_n = n - i; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i, W(i, 0), &ldw); #endif blasf77_zgemv("No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda, W(i, 0), &ldw, &c_one, A(i, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i, W(i, 0), &ldw); lapackf77_zlacgv(&i, A(i ,0), &lda); #endif blasf77_zgemv("No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw, A(i, 0), &lda, &c_one, A(i, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i, A(i, 0), &lda); #endif if (i < n-1) { /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */ i_n = n - i - 1; alpha = *A(i+1, i); lapackf77_zlarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]); e[i] = MAGMA_Z_REAL( alpha ); MAGMA_Z_SET2REAL(*A(i+1, i), 1.); /* Compute W(i+1:n,i) */ // 1. Send the block reflector A(i+1:n,i) to the GPU magma_zsetvector( i_n, A(i+1, i), 1, dA(i+1, i), 1 ); #if (GPUSHMEM < 200) magma_zhemv(MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero, dW(i+1, i), ione); #else magmablas_zhemv2('L', i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero, dW(i+1, i), ione, dwork, ldwork); #endif // 2. Start putting the result back (asynchronously) magma_zgetmatrix_async( i_n, 1, dW(i+1, i), lddw, W(i+1, i), ldw, stream ); blasf77_zgemv(MagmaConjTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw, A(i+1, i), &ione, &c_zero, W(0, i), &ione); blasf77_zgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda, W(0, i), &ione, &c_zero, f, &ione); blasf77_zgemv(MagmaConjTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda, A(i+1, i), &ione, &c_zero, W(0, i), &ione); // 3. Here is where we need it magma_queue_sync( stream ); if (i!=0) blasf77_zaxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione); blasf77_zgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw, W(0, i), &ione, &c_one, W(i+1, i), &ione); blasf77_zscal(&i_n, &tau[i], W(i+1,i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_zdotc_sub( i_n, W(i+1,i), ione, A(i+1,i), ione, &value ); #else value = cblas_zdotc( i_n, W(i+1,i), ione, A(i+1,i), ione ); #endif alpha = tau[i] * -0.5f * value; blasf77_zaxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione); } } } magma_free_cpu(f); magma_queue_destroy( stream ); return 0; } /* zlatrd */
int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, dev_perf, dev_time, cpu_perf, cpu_time; double magma_error, dev_error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t M, N, Xm, Ym, lda, ldda, sizeA, sizeX, sizeY; magma_int_t incx = 1; magma_int_t incy = 1; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 1.5, -2.3 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( -0.6, 0.8 ); magmaDoubleComplex *A, *X, *Y, *Ydev, *Ymagma; magmaDoubleComplex_ptr dA, dX, dY; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("%% trans = %s\n", lapack_trans_const(opts.transA) ); #ifdef HAVE_CUBLAS printf("%% M N MAGMA Gflop/s (ms) %s Gflop/s (ms) CPU Gflop/s (ms) MAGMA error %s error\n", g_platform_str, g_platform_str ); #else printf("%% M N %s Gflop/s (ms) CPU Gflop/s (ms) %s error\n", g_platform_str, g_platform_str ); #endif printf("%%==================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = M; ldda = magma_roundup( M, opts.align ); // multiple of 32 by default gflops = FLOPS_ZGEMV( M, N ) / 1e9; if ( opts.transA == MagmaNoTrans ) { Xm = N; Ym = M; } else { Xm = M; Ym = N; } sizeA = lda*N; sizeX = incx*Xm; sizeY = incy*Ym; TESTING_MALLOC_CPU( A, magmaDoubleComplex, sizeA ); TESTING_MALLOC_CPU( X, magmaDoubleComplex, sizeX ); TESTING_MALLOC_CPU( Y, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Ydev, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Ymagma, magmaDoubleComplex, sizeY ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dX, magmaDoubleComplex, sizeX ); TESTING_MALLOC_DEV( dY, magmaDoubleComplex, sizeY ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &sizeA, A ); lapackf77_zlarnv( &ione, ISEED, &sizeX, X ); lapackf77_zlarnv( &ione, ISEED, &sizeY, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_zsetmatrix( M, N, A, lda, dA, ldda, opts.queue ); magma_zsetvector( Xm, X, incx, dX, incx, opts.queue ); magma_zsetvector( Ym, Y, incy, dY, incy, opts.queue ); dev_time = magma_sync_wtime( opts.queue ); #ifdef HAVE_CUBLAS cublasZgemv( opts.handle, cublas_trans_const(opts.transA), M, N, &alpha, dA, ldda, dX, incx, &beta, dY, incy ); #else magma_zgemv( opts.transA, M, N, alpha, dA, ldda, dX, incx, beta, dY, incy ); #endif dev_time = magma_sync_wtime( opts.queue ) - dev_time; dev_perf = gflops / dev_time; magma_zgetvector( Ym, dY, incy, Ydev, incy, opts.queue ); /* ===================================================================== Performs operation using MAGMABLAS (currently only with CUDA) =================================================================== */ #ifdef HAVE_CUBLAS magma_zsetvector( Ym, Y, incy, dY, incy, opts.queue ); magma_time = magma_sync_wtime( opts.queue ); magmablas_zgemv( opts.transA, M, N, alpha, dA, ldda, dX, incx, beta, dY, incy, opts.queue ); magma_time = magma_sync_wtime( opts.queue ) - magma_time; magma_perf = gflops / magma_time; magma_zgetvector( Ym, dY, incy, Ymagma, incy, opts.queue ); #endif /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_zgemv( lapack_trans_const(opts.transA), &M, &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ double Anorm = lapackf77_zlange( "F", &M, &N, A, &lda, work ); double Xnorm = lapackf77_zlange( "F", &Xm, &ione, X, &Xm, work ); blasf77_zaxpy( &Ym, &c_neg_one, Y, &incy, Ydev, &incy ); dev_error = lapackf77_zlange( "F", &Ym, &ione, Ydev, &Ym, work ) / (Anorm * Xnorm); #ifdef HAVE_CUBLAS blasf77_zaxpy( &Ym, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_zlange( "F", &Ym, &ione, Ymagma, &Ym, work ) / (Anorm * Xnorm); bool okay = (magma_error < tol) && (dev_error < tol); status += ! okay; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, magma_perf, 1000.*magma_time, dev_perf, 1000.*dev_time, cpu_perf, 1000.*cpu_time, magma_error, dev_error, (okay ? "ok" : "failed")); #else bool okay = (dev_error < tol); status += ! okay; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, dev_perf, 1000.*dev_time, cpu_perf, 1000.*cpu_time, dev_error, (okay ? "ok" : "failed")); #endif TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ydev ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
extern "C" double magma_zlatrd_mgpu(magma_int_t num_gpus, char uplo, magma_int_t n0, magma_int_t n, magma_int_t nb, magma_int_t nb0, magmaDoubleComplex *a, magma_int_t lda, double *e, magmaDoubleComplex *tau, magmaDoubleComplex *w, magma_int_t ldw, magmaDoubleComplex **da, magma_int_t ldda, magma_int_t offset, magmaDoubleComplex **dw, magma_int_t lddw, magmaDoubleComplex *dwork[MagmaMaxGPUs], magma_int_t ldwork, magma_int_t k, magmaDoubleComplex *dx[MagmaMaxGPUs], magmaDoubleComplex *dy[MagmaMaxGPUs], magmaDoubleComplex *work, magma_queue_t stream[][10], double *times) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= ZLATRD reduces NB rows and columns of a complex Hermitian matrix A to Hermitian tridiagonal form by an orthogonal similarity transformation Q' * A * Q, and returns the matrices V and W which are needed to apply the transformation to the unreduced part of A. If UPLO = 'U', ZLATRD reduces the last NB rows and columns of a matrix, of which the upper triangle is supplied; if UPLO = 'L', ZLATRD reduces the first NB rows and columns of a matrix, of which the lower triangle is supplied. This is an auxiliary routine called by ZHETRD. Arguments ========= UPLO (input) CHARACTER*1 Specifies whether the upper or lower triangular part of the Hermitian matrix A is stored: = 'U': Upper triangular = 'L': Lower triangular N (input) INTEGER The order of the matrix A. NB (input) INTEGER The number of rows and columns to be reduced. A (input/output) COMPLEX_16 array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading n-by-n upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading n-by-n lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit: if UPLO = 'U', the last NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements above the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; if UPLO = 'L', the first NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements below the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. LDA (input) INTEGER The leading dimension of the array A. LDA >= (1,N). E (output) COMPLEX_16 array, dimension (N-1) If UPLO = 'U', E(n-nb:n-1) contains the superdiagonal elements of the last NB columns of the reduced matrix; if UPLO = 'L', E(1:nb) contains the subdiagonal elements of the first NB columns of the reduced matrix. TAU (output) COMPLEX_16 array, dimension (N-1) The scalar factors of the elementary reflectors, stored in TAU(n-nb:n-1) if UPLO = 'U', and in TAU(1:nb) if UPLO = 'L'. See Further Details. W (output) COMPLEX_16 array, dimension (LDW,NB) The n-by-nb matrix W required to update the unreduced part of A. LDW (input) INTEGER The leading dimension of the array W. LDW >= max(1,N). Further Details =============== If UPLO = 'U', the matrix Q is represented as a product of elementary reflectors Q = H(n) H(n-1) . . . H(n-nb+1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i), and tau in TAU(i-1). If UPLO = 'L', the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i), and tau in TAU(i). The elements of the vectors v together form the n-by-nb matrix V which is needed, with W, to apply the transformation to the unreduced part of the matrix, using a Hermitian rank-2k update of the form: A := A - V*W' - W*V'. The contents of A on exit are illustrated by the following examples with n = 5 and nb = 2: if UPLO = 'U': if UPLO = 'L': ( a a a v4 v5 ) ( d ) ( a a v4 v5 ) ( 1 d ) ( a 1 v5 ) ( v1 1 a ) ( d 1 ) ( v1 v2 a a ) ( d ) ( v1 v2 a a a ) where d denotes a diagonal element of the reduced matrix, a denotes an element of the original matrix that is unchanged, and vi denotes an element of the vector defining H(i). ===================================================================== */ char uplo_[2] = {uplo, 0}; double mv_time = 0.0; magma_int_t i; #ifndef MAGMABLAS_ZHEMV_MGPU magma_int_t loffset = nb0*((offset/nb0)/num_gpus); #endif magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex value = MAGMA_Z_ZERO; magma_int_t id, idw, i_one = 1; //magma_int_t kk; magma_int_t ione = 1; magma_int_t i_n, i_1, iw; magmaDoubleComplex alpha; magmaDoubleComplex *dx2[MagmaMaxGPUs]; magmaDoubleComplex *f = (magmaDoubleComplex *)malloc(n*sizeof(magmaDoubleComplex )); if (n <= 0) { return 0; } //#define PROFILE_SYMV #ifdef PROFILE_SYMV magma_event_t start, stop; float etime; magma_timestr_t cpu_start, cpu_end; magma_setdevice(0); magma_event_create( &start ); magma_event_create( &stop ); #endif if (lapackf77_lsame(uplo_, "U")) { /* Reduce last NB columns of upper triangle */ for (i = n-1; i >= n - nb ; --i) { i_1 = i + 1; i_n = n - i - 1; iw = i - n + nb; if (i < n-1) { /* Update A(1:i,i) */ magmaDoubleComplex wii = *W(i, iw+1); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i_one, &wii, &ldw); #endif wii = -wii; blasf77_zaxpy(&i_1, &wii, A(0, i+1), &i_one, A(0, i), &ione); wii = *A(i, i+1); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i_one, &wii, &ldw); #endif wii = -wii; blasf77_zaxpy(&i_1, &wii, W(0, iw+1), &i_one, A(0, i), &ione); } if (i > 0) { /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */ alpha = *A(i-1, i); lapackf77_zlarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]); e[i-1] = MAGMA_Z_REAL( alpha ); *A(i-1,i) = MAGMA_Z_MAKE( 1, 0 ); for( id=0; id<num_gpus; id++ ) { magma_setdevice(id); dx2[id] = dW1(id, 0, iw); magma_zsetvector_async( n, A(0,i), 1, dW1(id, 0, iw), 1, stream[id][0]); #ifndef MAGMABLAS_ZHEMV_MGPU magma_zsetvector_async( i, A(0,i), 1, dx[id], 1, stream[id][0] ); #endif } magmablas_zhemv_mgpu(num_gpus, k, 'U', i, nb0, c_one, da, ldda, 0, dx2, ione, c_zero, dy, ione, dwork, ldwork, work, W(0, iw), stream ); if (i < n-1) { blasf77_zgemv(MagmaConjTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); } /* overlap update */ if( i < n-1 && i-1 >= n - nb ) { magma_int_t im1_1 = i_1 - 1; magma_int_t im1 = i-1; /* Update A(1:i,i) */ #if defined(PRECISION_z) || defined(PRECISION_c) magma_int_t im1_n = i_n + 1; lapackf77_zlacgv(&im1_n, W(im1, iw+1), &ldw); #endif blasf77_zgemv("No transpose", &im1_1, &i_n, &c_neg_one, A(0, i+1), &lda, W(im1, iw+1), &ldw, &c_one, A(0, i-1), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&im1_n, W(im1, iw+1), &ldw); lapackf77_zlacgv(&im1_n, A(im1, i +1), &lda); #endif blasf77_zgemv("No transpose", &im1_1, &i_n, &c_neg_one, W(0, iw+1), &ldw, A(im1, i+1), &lda, &c_one, A(0, i-1), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&im1_n, A(im1, i+1), &lda); #endif } // 3. Here is where we need it // TODO find the right place magmablas_zhemv_sync(num_gpus, k, i, work, W(0, iw), stream ); if (i < n-1) { blasf77_zgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); blasf77_zgemv(MagmaConjTransStr, &i, &i_n, &c_one, A(0, i+1), &lda, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); blasf77_zgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); } blasf77_zscal(&i, &tau[i - 1], W(0, iw), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_zdotc_sub( i, W(0,iw), ione, A(0,i), ione, &value ); #else value = cblas_zdotc( i, W(0,iw), ione, A(0,i), ione ); #endif alpha = tau[i - 1] * -.5f * value; blasf77_zaxpy(&i, &alpha, A(0, i), &ione, W(0, iw), &ione); for( id=0; id<num_gpus; id++ ) { magma_setdevice(id); if( k > 1 ) { magma_zsetvector_async( n, W(0,iw), 1, dW(id, 0, iw), 1, stream[id][1] ); } else { magma_zsetvector_async( n, W(0,iw), 1, dW(id, 0, iw), 1, stream[id][0] ); } } } } } else { /* Reduce first NB columns of lower triangle */ for (i = 0; i < nb; ++i) { /* Update A(i:n,i) */ i_n = n - i; idw = ((offset+i)/nb)%num_gpus; if( i > 0 ) { trace_cpu_start( 0, "gemv", "gemv" ); magmaDoubleComplex wii = *W(i, i-1); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i_one, &wii, &ldw); #endif wii = -wii; blasf77_zaxpy( &i_n, &wii, A(i, i-1), &ione, A(i, i), &ione); wii = *A(i, i-1); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i_one, &wii, &lda); #endif wii = -wii; blasf77_zaxpy( &i_n, &wii, W(i, i-1), &ione, A(i, i), &ione); } if (i < n-1) { /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */ i_n = n - i - 1; trace_cpu_start( 0, "larfg", "larfg" ); alpha = *A(i+1, i); #ifdef PROFILE_SYMV cpu_start = get_current_time(); #endif lapackf77_zlarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]); #ifdef PROFILE_SYMV cpu_end = get_current_time(); times[0] += GetTimerValue(cpu_start,cpu_end)/1000.0; #endif e[i] = MAGMA_Z_REAL( alpha ); *A(i+1,i) = MAGMA_Z_MAKE( 1, 0 ); trace_cpu_end( 0 ); /* Compute W(i+1:n,i) */ // 1. Send the block reflector A(i+1:n,i) to the GPU //trace_gpu_start( idw, 0, "comm", "comm1" ); #ifndef MAGMABLAS_ZHEMV_MGPU magma_setdevice(idw); magma_zsetvector( i_n, A(i+1,i), 1, dA(idw, i+1, i), 1 ); #endif for( id=0; id<num_gpus; id++ ) { magma_setdevice(id); trace_gpu_start( id, 0, "comm", "comm" ); #ifdef MAGMABLAS_ZHEMV_MGPU dx2[id] = dW1(id, 0, i)-offset; #else dx2[id] = dx[id]; magma_zsetvector( i_n, A(i+1,i), 1, dx[id], 1 ); #endif magma_zsetvector_async( n, A(0,i), 1, dW1(id, 0, i), 1, stream[id][0] ); trace_gpu_end( id, 0 ); } /* mat-vec on multiple GPUs */ #ifdef PROFILE_SYMV magma_setdevice(0); magma_event_record(start, stream[0][0]); #endif magmablas_zhemv_mgpu(num_gpus, k, 'L', i_n, nb0, c_one, da, ldda, offset+i+1, dx2, ione, c_zero, dy, ione, dwork, ldwork, work, W(i+1,i), stream ); #ifdef PROFILE_SYMV magma_setdevice(0); magma_event_record(stop, stream[0][0]); #endif trace_cpu_start( 0, "gemv", "gemv" ); blasf77_zgemv(MagmaConjTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw, A(i+1, i), &ione, &c_zero, W(0, i), &ione); blasf77_zgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda, W(0, i), &ione, &c_zero, f, &ione); blasf77_zgemv(MagmaConjTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda, A(i+1, i), &ione, &c_zero, W(0, i), &ione); trace_cpu_end( 0 ); /* overlap update */ if( i > 0 && i+1 < n ) { magma_int_t ip1 = i+1; trace_cpu_start( 0, "gemv", "gemv" ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i, W(ip1, 0), &ldw); #endif blasf77_zgemv("No transpose", &i_n, &i, &c_neg_one, A(ip1, 0), &lda, W(ip1, 0), &ldw, &c_one, A(ip1, ip1), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i, W(ip1, 0), &ldw); lapackf77_zlacgv(&i, A(ip1 ,0), &lda); #endif blasf77_zgemv("No transpose", &i_n, &i, &c_neg_one, W(ip1, 0), &ldw, A(ip1, 0), &lda, &c_one, A(ip1, ip1), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv(&i, A(ip1, 0), &lda); #endif trace_cpu_end( 0 ); } /* synchronize */ magmablas_zhemv_sync(num_gpus, k, i_n, work, W(i+1,i), stream ); #ifdef PROFILE_SYMV cudaEventElapsedTime(&etime, start, stop); mv_time += (etime/1000.0); times[1+(i_n/(n0/10))] += (etime/1000.0); #endif trace_cpu_start( 0, "axpy", "axpy" ); if (i!=0) blasf77_zaxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione); blasf77_zgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw, W(0, i), &ione, &c_one, W(i+1, i), &ione); blasf77_zscal(&i_n, &tau[i], W(i+1,i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_zdotc_sub( i_n, W(i+1,i), ione, A(i+1,i), ione, &value ); #else value = cblas_zdotc( i_n, W(i+1,i), ione, A(i+1,i), ione ); #endif alpha = tau[i]* -.5f * value; blasf77_zaxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione); trace_cpu_end( 0 ); for( id=0; id<num_gpus; id++ ) { magma_setdevice(id); if( k > 1 ) { magma_zsetvector_async( n, W(0,i), 1, dW(id, 0, i), 1, stream[id][1] ); } else { magma_zsetvector_async( n, W(0,i), 1, dW(id, 0, i), 1, stream[id][0] ); } } } } } #ifdef PROFILE_SYMV magma_setdevice(0); magma_event_destory( start ); magma_event_destory( stop ); #endif for( id=0; id<num_gpus; id++ ) { magma_setdevice(id); if( k > 1) magma_queue_sync(stream[id][1]); } free(f); return mv_time; } /* zlatrd_ */