SEXP magMultmv(SEXP a, SEXP transa, SEXP x, SEXP right)
{
   SEXP gpu = magGetGPU(a, x),
        y = PROTECT(NEW_OBJECT(MAKE_CLASS("magma")));
   int RHS = LOGICAL_VALUE(right),
       TA = (LOGICAL_VALUE(transa) ^ !RHS),
       *DIMA = INTEGER(GET_DIM(a)),
       M = DIMA[0], N = DIMA[1], LENX = LENGTH(x), LENY = DIMA[TA],
       LDA = M;
   char TRANSA = (TA ? 'T' : 'N');
   double *A = REAL(PROTECT(AS_NUMERIC(a))),
          *X = REAL(PROTECT(AS_NUMERIC(x))),
          *dA, *dX, *dY;

   if(DIMA[!TA] != LENX) error("non-conformable matrices");

   /* result is a column vector for A %*% x, a row vector for x %*% A */
   y = SET_SLOT(y, install(".Data"),
                allocMatrix(REALSXP, (RHS ? LENY : 1), (RHS ? 1 : LENY)));
   SET_SLOT(y, install("gpu"), duplicate(gpu));

   /* device workspace: copy A and x to the GPU */
   magma_malloc((void**)&dA, (M*N)*sizeof(double));
   magma_malloc((void**)&dX, LENX*sizeof(double));
   magma_malloc((void**)&dY, LENY*sizeof(double));
   magma_dsetmatrix(M, N, A, LDA, dA, LDA);
   magma_dsetvector(LENX, X, 1, dX, 1);

   /* y = op(A) * x on the MAGMA BLAS or CUBLAS path */
   if(LOGICAL_VALUE(gpu)) {
      magmablas_dgemv(TRANSA, M, N, 1.0, dA, LDA, dX, 1, 0.0, dY, 1);
   } else {
      cublasDgemv(TRANSA, M, N, 1.0, dA, LDA, dX, 1, 0.0, dY, 1);
   }

   magma_dgetvector(LENY, dY, 1, REAL(y), 1);

   magma_free(dA);
   magma_free(dX);
   magma_free(dY);
   UNPROTECT(3);

   return y;
}
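The flag arithmetic above is dense enough to deserve a note (this is a reading of the code, not new behavior). The wrapper evaluates

$$
y = \operatorname{op}(A)\,x, \qquad
\operatorname{op}(A) = \begin{cases} A & \text{if } \mathtt{TA} = 0,\\ A^{\top} & \text{if } \mathtt{TA} = 1, \end{cases}
\qquad \mathtt{TA} = \mathtt{transa} \oplus \lnot\mathtt{right}.
$$

When `right` is true the request is $\operatorname{op}(A)\,x$ directly, so `TA` equals `transa`; when the vector multiplies from the left, $x^{\top}\operatorname{op}(A) = (\operatorname{op}(A)^{\top} x)^{\top}$, so the transpose flag flips and the result is allocated as a 1-by-`LENY` row vector. In both cases `LENY = DIMA[TA]` is the row count of $\operatorname{op}(A)$, and the conformability check requires `DIMA[!TA] == LENX`.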
/**
    Purpose
    -------
    DLABRD reduces the first NB rows and columns of a real general
    m by n matrix A to upper or lower bidiagonal form by an orthogonal
    transformation Q' * A * P, and returns the matrices X and Y which
    are needed to apply the transformation to the unreduced part of A.

    If m >= n, A is reduced to upper bidiagonal form; if m < n, to lower
    bidiagonal form.

    This is an auxiliary routine called by DGEBRD.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows in the matrix A.

    @param[in]
    n       INTEGER
            The number of columns in the matrix A.

    @param[in]
    nb      INTEGER
            The number of leading rows and columns of A to be reduced.

    @param[in,out]
    A       DOUBLE_PRECISION array, dimension (LDA,N)
            On entry, the m by n general matrix to be reduced.
            On exit, the first NB rows and columns of the matrix are
            overwritten; the rest of the array is unchanged.
            If m >= n, elements on and below the diagonal in the first NB
            columns, with the array TAUQ, represent the orthogonal
            matrix Q as a product of elementary reflectors; and
            elements above the diagonal in the first NB rows, with the
            array TAUP, represent the orthogonal matrix P as a product
            of elementary reflectors.
    \n
            If m < n, elements below the diagonal in the first NB
            columns, with the array TAUQ, represent the orthogonal
            matrix Q as a product of elementary reflectors, and
            elements on and above the diagonal in the first NB rows,
            with the array TAUP, represent the orthogonal matrix P as
            a product of elementary reflectors.
            See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A. LDA >= max(1,M).

    @param[in,out]
    dA      DOUBLE_PRECISION array, dimension (LDDA,N)
            Copy of A on GPU.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA. LDDA >= max(1,M).

    @param[out]
    d       DOUBLE_PRECISION array, dimension (NB)
            The diagonal elements of the first NB rows and columns of
            the reduced matrix. D(i) = A(i,i).

    @param[out]
    e       DOUBLE_PRECISION array, dimension (NB)
            The off-diagonal elements of the first NB rows and columns of
            the reduced matrix.

    @param[out]
    tauq    DOUBLE_PRECISION array, dimension (NB)
            The scalar factors of the elementary reflectors which
            represent the orthogonal matrix Q. See Further Details.

    @param[out]
    taup    DOUBLE_PRECISION array, dimension (NB)
            The scalar factors of the elementary reflectors which
            represent the orthogonal matrix P. See Further Details.

    @param[out]
    X       DOUBLE_PRECISION array, dimension (LDX,NB)
            The m-by-nb matrix X required to update the unreduced part
            of A.

    @param[in]
    ldx     INTEGER
            The leading dimension of the array X. LDX >= M.

    @param[out]
    dX      DOUBLE_PRECISION array, dimension (LDDX,NB)
            Copy of X on GPU.

    @param[in]
    lddx    INTEGER
            The leading dimension of the array dX. LDDX >= M.

    @param[out]
    Y       DOUBLE_PRECISION array, dimension (LDY,NB)
            The n-by-nb matrix Y required to update the unreduced part
            of A.

    @param[in]
    ldy     INTEGER
            The leading dimension of the array Y. LDY >= N.

    @param[out]
    dY      DOUBLE_PRECISION array, dimension (LDDY,NB)
            Copy of Y on GPU.

    @param[in]
    lddy    INTEGER
            The leading dimension of the array dY. LDDY >= N.

    Further Details
    ---------------
    The matrices Q and P are represented as products of elementary
    reflectors:

       Q = H(1) H(2) . . . H(nb)  and  P = G(1) G(2) . . . G(nb)

    Each H(i) and G(i) has the form:

       H(i) = I - tauq * v * v'  and  G(i) = I - taup * u * u'

    where tauq and taup are real scalars, and v and u are real vectors.

    If m >= n, v(1:i-1) = 0, v(i) = 1, and v(i:m) is stored on exit in
    A(i:m,i); u(1:i) = 0, u(i+1) = 1, and u(i+1:n) is stored on exit in
    A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i).

    If m < n, v(1:i) = 0, v(i+1) = 1, and v(i+1:m) is stored on exit in
    A(i+2:m,i); u(1:i-1) = 0, u(i) = 1, and u(i:n) is stored on exit in
    A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i).

    The elements of the vectors v and u together form the m-by-nb matrix
    V and the nb-by-n matrix U' which are needed, with X and Y, to apply
    the transformation to the unreduced part of the matrix, using a block
    update of the form:  A := A - V*Y' - X*U'.

    The contents of A on exit are illustrated by the following examples
    with nb = 2:

    @verbatim
    m = 6 and n = 5 (m > n):          m = 5 and n = 6 (m < n):

      (  1   1   u1  u1  u1 )           (  1   u1  u1  u1  u1  u1 )
      (  v1  1   1   u2  u2 )           (  1   1   u2  u2  u2  u2 )
      (  v1  v2  a   a   a  )           (  v1  1   a   a   a   a  )
      (  v1  v2  a   a   a  )           (  v1  v2  a   a   a   a  )
      (  v1  v2  a   a   a  )           (  v1  v2  a   a   a   a  )
      (  v1  v2  a   a   a  )
    @endverbatim

    where a denotes an element of the original matrix which is unchanged,
    vi denotes an element of the vector defining H(i), and ui an element
    of the vector defining G(i).

    @ingroup magma_dgesvd_aux
    ********************************************************************/
extern "C" magma_int_t
magma_dlabrd_gpu(
    magma_int_t m, magma_int_t n, magma_int_t nb,
    double *A,  magma_int_t lda,
    double *dA, magma_int_t ldda,
    double *d, double *e, double *tauq, double *taup,
    double *X,  magma_int_t ldx,
    double *dX, magma_int_t lddx,
    double *Y,  magma_int_t ldy,
    double *dY, magma_int_t lddy)
{
    #define  A(i_,j_) ( A + (i_) + (j_)*lda)
    #define  X(i_,j_) ( X + (i_) + (j_)*ldx)
    #define  Y(i_,j_) ( Y + (i_) + (j_)*ldy)
    #define dA(i_,j_) (dA + (i_) + (j_)*ldda)
    #define dY(i_,j_) (dY + (i_) + (j_)*lddy)
    #define dX(i_,j_) (dX + (i_) + (j_)*lddx)

    double c_neg_one = MAGMA_D_NEG_ONE;
    double c_one     = MAGMA_D_ONE;
    double c_zero    = MAGMA_D_ZERO;
    magma_int_t ione = 1;

    magma_int_t i__2, i__3;
    magma_int_t i;
    double alpha;

    A  -= 1 + lda;
    X  -= 1 + ldx;
    dX -= 1 + lddx;
    Y  -= 1 + ldy;
    dY -= 1 + lddy;
    --d;
    --e;
    --tauq;
    --taup;

    /* Quick return if possible */
    magma_int_t info = 0;
    if (m <= 0 || n <= 0) {
        return info;
    }

    double *f;
    magma_queue_t stream;
    magma_queue_create( &stream );
    magma_dmalloc_cpu( &f, max(n,m) );
    if ( f == NULL ) {
        info = MAGMA_ERR_HOST_ALLOC;
        return info;
    }

    if (m >= n) {
        /* Reduce to upper bidiagonal form */
        for (i = 1; i <= nb; ++i) {
            /* Update A(i:m,i) */
            i__2 = m - i + 1;
            i__3 = i - 1;
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv( &i__3, Y(i,1), &ldy );
            #endif
            blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                           A(i,1), &lda,
                           Y(i,1), &ldy, &c_one,
                           A(i,i), &ione );
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv( &i__3, Y(i,1), &ldy );
            #endif
            blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                           X(i,1), &ldx,
                           A(1,i), &ione, &c_one,
                           A(i,i), &ione );

            /* Generate reflection Q(i) to annihilate A(i+1:m,i) */
            alpha = *A(i,i);
            i__2 = m - i + 1;
            i__3 = i + 1;
            lapackf77_dlarfg( &i__2, &alpha, A(min(i__3,m),i), &ione, &tauq[i] );
            d[i] = MAGMA_D_REAL( alpha );
            if (i < n) {
                *A(i,i) = c_one;

                /* Compute Y(i+1:n,i) */
                i__2 = m - i + 1;
                i__3 = n - i;

                // 1. Send the block reflector  A(i+1:m,i) to the GPU ------
                magma_dsetvector( i__2,
                                  A(i,i), 1,
                                  dA(i-1,i-1), 1 );
                // 2. Multiply ---------------------------------------------
                magma_dgemv( MagmaConjTrans, i__2, i__3, c_one,
                             dA(i-1,i), ldda,
                             dA(i-1,i-1), ione, c_zero,
                             dY(i+1,i), ione );
                // 3. Put the result back ----------------------------------
                magma_dgetmatrix_async( i__3, 1,
                                        dY(i+1,i), lddy,
                                        Y(i+1,i), ldy, stream );

                i__2 = m - i + 1;
                i__3 = i - 1;
                blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_one,
                               A(i,1), &lda,
                               A(i,i), &ione, &c_zero,
                               Y(1,i), &ione );

                i__2 = n - i;
                i__3 = i - 1;
                blasf77_dgemv( "N", &i__2, &i__3, &c_neg_one,
                               Y(i+1,1), &ldy,
                               Y(1,i), &ione, &c_zero,
                               f, &ione );
                i__2 = m - i + 1;
                i__3 = i - 1;
                blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_one,
                               X(i,1), &ldx,
                               A(i,i), &ione, &c_zero,
                               Y(1,i), &ione );

                // 4. Sync to make sure the result is back ----------------
                magma_queue_sync( stream );

                if (i__3 != 0) {
                    i__2 = n - i;
                    blasf77_daxpy( &i__2, &c_one, f, &ione, Y(i+1,i), &ione );
                }

                i__2 = i - 1;
                i__3 = n - i;
                blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_neg_one,
                               A(1,i+1), &lda,
                               Y(1,i), &ione, &c_one,
                               Y(i+1,i), &ione );
                i__2 = n - i;
                blasf77_dscal( &i__2, &tauq[i], Y(i+1,i), &ione );

                /* Update A(i,i+1:n) */
                i__2 = n - i;
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv( &i__2, A(i,i+1), &lda );
                lapackf77_dlacgv( &i,    A(i,1),   &lda );
                #endif
                blasf77_dgemv( "No transpose", &i__2, &i, &c_neg_one,
                               Y(i+1,1), &ldy,
                               A(i,1), &lda, &c_one,
                               A(i,i+1), &lda );
                i__2 = i - 1;
                i__3 = n - i;
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv( &i,    A(i,1), &lda );
                lapackf77_dlacgv( &i__2, X(i,1), &ldx );
                #endif
                blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_neg_one,
                               A(1,i+1), &lda,
                               X(i,1), &ldx, &c_one,
                               A(i,i+1), &lda );
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv( &i__2, X(i,1), &ldx );
                #endif

                /* Generate reflection P(i) to annihilate A(i,i+2:n) */
                i__2 = n - i;
                i__3 = i + 2;
                alpha = *A(i,i+1);
                lapackf77_dlarfg( &i__2, &alpha, A(i,min(i__3,n)), &lda, &taup[i] );
                e[i] = MAGMA_D_REAL( alpha );
                *A(i,i+1) = c_one;

                /* Compute X(i+1:m,i) */
                i__2 = m - i;
                i__3 = n - i;
                // 1. Send the block reflector  A(i+1:m,i) to the GPU ------
                magma_dsetvector( i__3,
                                  A(i,i+1), lda,
                                  dA(i-1,i), ldda );
                // 2. Multiply ---------------------------------------------
                //magma_dcopy( i__3, dA(i-1,i), ldda, dY(1,1), 1 );
                magma_dgemv( MagmaNoTrans, i__2, i__3, c_one,
                             dA(i,i), ldda,
                             dA(i-1,i), ldda,
                             //dY(1,1), 1,
                             c_zero,
                             dX(i+1,i), ione );
                // 3. Put the result back ----------------------------------
                magma_dgetmatrix_async( i__2, 1,
                                        dX(i+1,i), lddx,
                                        X(i+1,i), ldx, stream );

                i__2 = n - i;
                blasf77_dgemv( MagmaConjTransStr, &i__2, &i, &c_one,
                               Y(i+1,1), &ldy,
                               A(i,i+1), &lda, &c_zero,
                               X(1,i), &ione );

                i__2 = m - i;
                blasf77_dgemv( "N", &i__2, &i, &c_neg_one,
                               A(i+1,1), &lda,
                               X(1,i), &ione, &c_zero,
                               f, &ione );
                i__2 = i - 1;
                i__3 = n - i;
                blasf77_dgemv( "N", &i__2, &i__3, &c_one,
                               A(1,i+1), &lda,
                               A(i,i+1), &lda, &c_zero,
                               X(1,i), &ione );

                // 4. Sync to make sure the result is back ----------------
                magma_queue_sync( stream );

                if (i != 0) {
                    i__2 = m - i;
                    blasf77_daxpy( &i__2, &c_one, f, &ione, X(i+1,i), &ione );
                }

                i__2 = m - i;
                i__3 = i - 1;
                blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                               X(i+1,1), &ldx,
                               X(1,i), &ione, &c_one,
                               X(i+1,i), &ione );
                i__2 = m - i;
                blasf77_dscal( &i__2, &taup[i], X(i+1,i), &ione );

                #if defined(PRECISION_z) || defined(PRECISION_c)
                i__2 = n - i;
                lapackf77_dlacgv( &i__2, A(i,i+1), &lda );
                // 4. Send the block reflector  A(i+1:m,i) to the GPU after DLACGV()
                magma_dsetvector( i__2,
                                  A(i,i+1), lda,
                                  dA(i-1,i), ldda );
                #endif
            }
        }
    }
    else {
        /* Reduce to lower bidiagonal form */
        for (i = 1; i <= nb; ++i) {
            /* Update A(i,i:n) */
            i__2 = n - i + 1;
            i__3 = i - 1;
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv( &i__2, A(i,i), &lda );
            lapackf77_dlacgv( &i__3, A(i,1), &lda );
            #endif
            blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                           Y(i,1), &ldy,
                           A(i,1), &lda, &c_one,
                           A(i,i), &lda );
            i__2 = i - 1;
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv( &i__3, A(i,1), &lda );
            lapackf77_dlacgv( &i__3, X(i,1), &ldx );
            #endif
            i__3 = n - i + 1;
            blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_neg_one,
                           A(1,i), &lda,
                           X(i,1), &ldx, &c_one,
                           A(i,i), &lda );
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv( &i__2, X(i,1), &ldx );
            #endif

            /* Generate reflection P(i) to annihilate A(i,i+1:n) */
            i__2 = n - i + 1;
            i__3 = i + 1;
            alpha = *A(i,i);
            lapackf77_dlarfg( &i__2, &alpha, A(i,min(i__3,n)), &lda, &taup[i] );
            d[i] = MAGMA_D_REAL( alpha );
            if (i < m) {
                *A(i,i) = c_one;

                /* Compute X(i+1:m,i) */
                i__2 = m - i;
                i__3 = n - i + 1;
                // 1. Send the block reflector  A(i,i+1:n) to the GPU ------
                magma_dsetvector( i__3,
                                  A(i,i), lda,
                                  dA(i-1,i-1), ldda );
                // 2. Multiply ---------------------------------------------
                //magma_dcopy( i__3, dA(i-1,i-1), ldda, dY(1,1), 1 );
                magma_dgemv( MagmaNoTrans, i__2, i__3, c_one,
                             dA(i,i-1), ldda,
                             dA(i-1,i-1), ldda,
                             //dY(1,1), 1,
                             c_zero,
                             dX(i+1,i), ione );
                // 3. Put the result back ----------------------------------
                magma_dgetmatrix_async( i__2, 1,
                                        dX(i+1,i), lddx,
                                        X(i+1,i), ldx, stream );

                i__2 = n - i + 1;
                i__3 = i - 1;
                blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_one,
                               Y(i,1), &ldy,
                               A(i,i), &lda, &c_zero,
                               X(1,i), &ione );

                i__2 = m - i;
                i__3 = i - 1;
                blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                               A(i+1,1), &lda,
                               X(1,i), &ione, &c_zero,
                               f, &ione );

                i__2 = i - 1;
                i__3 = n - i + 1;
                blasf77_dgemv( "No transpose", &i__2, &i__3, &c_one,
                               A(1,i), &lda,
                               A(i,i), &lda, &c_zero,
                               X(1,i), &ione );

                // 4. Sync to make sure the result is back ----------------
                magma_queue_sync( stream );

                if (i__2 != 0) {
                    i__3 = m - i;
                    blasf77_daxpy( &i__3, &c_one, f, &ione, X(i+1,i), &ione );
                }

                i__2 = m - i;
                i__3 = i - 1;
                blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                               X(i+1,1), &ldx,
                               X(1,i), &ione, &c_one,
                               X(i+1,i), &ione );
                i__2 = m - i;
                blasf77_dscal( &i__2, &taup[i], X(i+1,i), &ione );

                i__2 = n - i + 1;
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv( &i__2, A(i,i), &lda );
                magma_dsetvector( i__2,
                                  A(i,i), lda,
                                  dA(i-1,i-1), ldda );
                #endif

                /* Update A(i+1:m,i) */
                i__2 = m - i;
                i__3 = i - 1;
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv( &i__3, Y(i,1), &ldy );
                #endif
                blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                               A(i+1,1), &lda,
                               Y(i,1), &ldy, &c_one,
                               A(i+1,i), &ione );
                i__2 = m - i;
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv( &i__3, Y(i,1), &ldy );
                #endif
                blasf77_dgemv( "No transpose", &i__2, &i, &c_neg_one,
                               X(i+1,1), &ldx,
                               A(1,i), &ione, &c_one,
                               A(i+1,i), &ione );

                /* Generate reflection Q(i) to annihilate A(i+2:m,i) */
                i__2 = m - i;
                i__3 = i + 2;
                alpha = *A(i+1,i);
                lapackf77_dlarfg( &i__2, &alpha, A(min(i__3,m),i), &ione, &tauq[i] );
                e[i] = MAGMA_D_REAL( alpha );
                *A(i+1,i) = c_one;

                /* Compute Y(i+1:n,i) */
                i__2 = m - i;
                i__3 = n - i;
                // 1. Send the block reflector  A(i+1:m,i) to the GPU ------
                magma_dsetvector( i__2,
                                  A(i+1,i), 1,
                                  dA(i,i-1), 1 );
                // 2. Multiply ---------------------------------------------
                magma_dgemv( MagmaConjTrans, i__2, i__3, c_one,
                             dA(i,i), ldda,
                             dA(i,i-1), ione, c_zero,
                             dY(i+1,i), ione );
                // 3. Put the result back ----------------------------------
                magma_dgetmatrix_async( i__3, 1,
                                        dY(i+1,i), lddy,
                                        Y(i+1,i), ldy, stream );

                i__2 = m - i;
                i__3 = i - 1;
                blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_one,
                               A(i+1,1), &lda,
                               A(i+1,i), &ione, &c_zero,
                               Y(1,i), &ione );
                i__2 = n - i;
                i__3 = i - 1;
                blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                               Y(i+1,1), &ldy,
                               Y(1,i), &ione, &c_zero,
                               f, &ione );

                i__2 = m - i;
                blasf77_dgemv( MagmaConjTransStr, &i__2, &i, &c_one,
                               X(i+1,1), &ldx,
                               A(i+1,i), &ione, &c_zero,
                               Y(1,i), &ione );

                // 4. Sync to make sure the result is back ----------------
                magma_queue_sync( stream );

                if (i__3 != 0) {
                    i__2 = n - i;
                    blasf77_daxpy( &i__2, &c_one, f, &ione, Y(i+1,i), &ione );
                }

                i__2 = n - i;
                blasf77_dgemv( MagmaConjTransStr, &i, &i__2, &c_neg_one,
                               A(1,i+1), &lda,
                               Y(1,i), &ione, &c_one,
                               Y(i+1,i), &ione );
                i__2 = n - i;
                blasf77_dscal( &i__2, &tauq[i], Y(i+1,i), &ione );
            }
            #if defined(PRECISION_z) || defined(PRECISION_c)
            else {
                i__2 = n - i + 1;
                lapackf77_dlacgv( &i__2, A(i,i), &lda );
                magma_dsetvector( i__2,
                                  A(i,i), lda,
                                  dA(i-1,i-1), ldda );
            }
            #endif
        }
    }

    magma_queue_destroy( stream );
    magma_free_cpu( f );

    return info;
} /* magma_dlabrd_gpu */
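For orientation, here is a minimal sketch of how a blocked DGEBRD-style driver would consume this routine's outputs. This is an assumed driver, not MAGMA's exact dgebrd: the panel/trailing split follows the block update A := A - V*Y' - X*U' quoted in the documentation above, but all names, offsets, and leading dimensions here are illustrative.

static void
dgebrd_blocked_sketch(
    magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t minmn,
    double *hA, magma_int_t lda,  double *dA_, magma_int_t ldda,
    double *hX, magma_int_t ldx,  double *dX_, magma_int_t lddx,
    double *hY, magma_int_t ldy,  double *dY_, magma_int_t lddy,
    double *d, double *e, double *tauq, double *taup )
{
    /* hypothetical host/device indexing macros for this sketch only */
    #define hA(i_,j_) (hA  + (i_) + (j_)*lda)
    #define gA(i_,j_) (dA_ + (i_) + (j_)*ldda)
    #define gX(i_,j_) (dX_ + (i_) + (j_)*lddx)
    #define gY(i_,j_) (dY_ + (i_) + (j_)*lddy)
    double c_neg_one = MAGMA_D_NEG_ONE, c_one = MAGMA_D_ONE;

    for (magma_int_t i = 0; i + nb < minmn; i += nb) {
        /* reduce the nb-wide panel starting at (i,i);
           X and Y come back on both host and device */
        magma_dlabrd_gpu( m-i, n-i, nb,
                          hA(i,i), lda,  gA(i,i), ldda,
                          d+i, e+i, tauq+i, taup+i,
                          hX, ldx, dX_, lddx,
                          hY, ldy, dY_, lddy );
        /* trailing update  A := A - V*Y'  (V lives in the reduced panel columns) */
        magma_dgemm( MagmaNoTrans, MagmaConjTrans, m-i-nb, n-i-nb, nb,
                     c_neg_one, gA(i+nb,i), ldda, gY(nb,0), lddy,
                     c_one,     gA(i+nb,i+nb), ldda );
        /* trailing update  A := A - X*U'  (U' lives in the reduced panel rows) */
        magma_dgemm( MagmaNoTrans, MagmaNoTrans, m-i-nb, n-i-nb, nb,
                     c_neg_one, gX(nb,0), lddx, gA(i,i+nb), ldda,
                     c_one,     gA(i+nb,i+nb), ldda );
    }
    #undef hA
    #undef gA
    #undef gX
    #undef gY
}

The point of the hybrid split is visible here: the O(n^2) panel work stays partly on the CPU inside magma_dlabrd_gpu, while the O(n^3) trailing update is two GEMMs on the GPU.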
// tolerance floor and workspace indexing macros; the H/dH panels are
// (ldh+1) x ldh, HH is ldh x ldh, and q/z hold ldh+1 basis vectors of
// length dofs each (these definitions come from the surrounding file)
#define ATOLERANCE     lapackf77_dlamch( "E" )

#define  q(i)    (q.val + (i)*dofs)
#define  z(i)    (z.val + (i)*dofs)
#define  H(i,j)  H[(i)  + (j)*(1+ldh)]
#define HH(i,j) HH[(i)  + (j)*ldh]
#define dH(i,j) dH[(i)  + (j)*(1+ldh)]

magma_int_t
magma_dpgmres( magma_d_sparse_matrix A, magma_d_vector b, magma_d_vector *x,
               magma_d_solver_par *solver_par,
               magma_d_preconditioner *precond_par )
{
    // prepare solver feedback
    solver_par->solver = Magma_PGMRES;
    solver_par->numiter = 0;
    solver_par->info = 0;

    // local variables
    double c_zero = MAGMA_D_ZERO, c_one = MAGMA_D_ONE, c_mone = MAGMA_D_NEG_ONE;
    magma_int_t dofs = A.num_rows;
    magma_int_t i, j, k, m = 0;
    magma_int_t restart = min( dofs-1, solver_par->restart );
    magma_int_t ldh = restart+1;
    double nom, rNorm, RNorm, nom0, betanom, r0 = 0.;

    // CPU workspace
    magma_setdevice(0);
    double *H, *HH, *y, *h1;
    magma_dmalloc_pinned( &H,  (ldh+1)*ldh );
    magma_dmalloc_pinned( &y,  ldh );
    magma_dmalloc_pinned( &HH, ldh*ldh );
    magma_dmalloc_pinned( &h1, ldh );

    // GPU workspace
    magma_d_vector r, q, q_t, z, z_t, t;
    magma_d_vinit( &t,   Magma_DEV, dofs,         c_zero );
    magma_d_vinit( &r,   Magma_DEV, dofs,         c_zero );
    magma_d_vinit( &q,   Magma_DEV, dofs*(ldh+1), c_zero );
    magma_d_vinit( &z,   Magma_DEV, dofs*(ldh+1), c_zero );
    magma_d_vinit( &z_t, Magma_DEV, dofs,         c_zero );
    q_t.memory_location = Magma_DEV;
    q_t.val = NULL;
    q_t.num_rows = q_t.nnz = dofs;

    double *dy, *dH = NULL;
    if (MAGMA_SUCCESS != magma_dmalloc( &dy, ldh ))
        return MAGMA_ERR_DEVICE_ALLOC;
    if (MAGMA_SUCCESS != magma_dmalloc( &dH, (ldh+1)*ldh ))
        return MAGMA_ERR_DEVICE_ALLOC;

    // GPU stream
    magma_queue_t stream[2];
    magma_event_t event[1];
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );
    magma_event_create( &event[0] );
    magmablasSetKernelStream(stream[0]);

    magma_dscal( dofs, c_zero, x->val, 1 );             // x = 0
    magma_dcopy( dofs, b.val, 1, r.val, 1 );            // r = b
    nom0 = betanom = magma_dnrm2( dofs, r.val, 1 );     // nom0 = || r ||
    nom = nom0 * nom0;
    solver_par->init_res = nom0;
    H(1,0) = MAGMA_D_MAKE( nom0, 0. );
    magma_dsetvector(1, &H(1,0), 1, &dH(1,0), 1);

    if ( (r0 = nom * solver_par->epsilon) < ATOLERANCE )
        r0 = ATOLERANCE;
    if ( nom < r0 )
        return MAGMA_SUCCESS;

    // chronometry
    real_Double_t tempo1, tempo2;
    magma_device_sync();
    tempo1 = magma_wtime();
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = nom0;
        solver_par->timing[0] = 0.0;
    }

    // start iteration
    for( solver_par->numiter = 1; solver_par->numiter < solver_par->maxiter;
         solver_par->numiter++ ) {
        magma_dcopy(dofs, r.val, 1, q(0), 1);       // q[0] = 1.0/H(1,0) r
        magma_dscal(dofs, 1./H(1,0), q(0), 1);      // (to be fused)

        for(k=1; k <= restart; k++) {
            q_t.val = q(k-1);
            magmablasSetKernelStream(stream[0]);
            // preconditioner
            // z[k] = M^(-1) q(k)
            magma_d_applyprecond_left(  A, q_t, &t,   precond_par );
            magma_d_applyprecond_right( A, t,   &z_t, precond_par );
            magma_dcopy(dofs, z_t.val, 1, z(k-1), 1);

            // r = A q[k]
            magma_d_spmv( c_one, A, z_t, c_zero, r );

            if (solver_par->ortho == Magma_MGS ) {
                // modified Gram-Schmidt
                magmablasSetKernelStream(stream[0]);
                for (i=1; i <= k; i++) {
                    H(i,k) = magma_ddot(dofs, q(i-1), 1, r.val, 1);
                                            // H(i,k) = q[i] . r
                    magma_daxpy(dofs, -H(i,k), q(i-1), 1, r.val, 1);
                                            // r = r - H(i,k) q[i]
                }
                H(k+1,k) = MAGMA_D_MAKE( magma_dnrm2(dofs, r.val, 1), 0. );
                                            // H(k+1,k) = sqrt(r . r)
                if (k < restart) {
                    magma_dcopy(dofs, r.val, 1, q(k), 1);     // q[k] = 1.0/H[k][k-1] r
                    magma_dscal(dofs, 1./H(k+1,k), q(k), 1);  // (to be fused)
                }
            }
            else if (solver_par->ortho == Magma_FUSED_CGS ) {
                // fusing dgemv with dnrm2 in classical Gram-Schmidt
                magmablasSetKernelStream(stream[0]);
                magma_dcopy(dofs, r.val, 1, q(k), 1);
                // dH(1:k+1,k) = q[0:k] . r
                magmablas_dgemv(MagmaTrans, dofs, k+1, c_one, q(0),
                                dofs, r.val, 1, c_zero, &dH(1,k), 1);
                // r = r - q[0:k-1] dH(1:k,k)
                magmablas_dgemv(MagmaNoTrans, dofs, k, c_mone, q(0),
                                dofs, &dH(1,k), 1, c_one, r.val, 1);
                // 1) dH(k+1,k) = sqrt( dH(k+1,k) - dH(1:k,k) )
                // 2) q[k] = q[k] / dH(k+1,k)
                magma_dcopyscale( dofs, k, r.val, q(k), &dH(1,k) );
                magma_event_record( event[0], stream[0] );
                magma_queue_wait_event( stream[1], event[0] );
                // async copy dH(1:(k+1),k) to H(1:(k+1),k)
                magma_dgetvector_async(k+1, &dH(1,k), 1, &H(1,k), 1, stream[1]);
            }
            else {
                // classical Gram-Schmidt (default)
                // > explicitly calling magmablas
                magmablasSetKernelStream(stream[0]);
                // dH(1:k,k) = q[0:k-1] . r
                magmablas_dgemv(MagmaTrans, dofs, k, c_one, q(0),
                                dofs, r.val, 1, c_zero, &dH(1,k), 1);
                #ifndef DNRM2SCALE
                // start copying dH(1:k,k) to H(1:k,k)
                magma_event_record( event[0], stream[0] );
                magma_queue_wait_event( stream[1], event[0] );
                magma_dgetvector_async(k, &dH(1,k), 1, &H(1,k), 1, stream[1]);
                #endif
                // r = r - q[0:k-1] dH(1:k,k)
                magmablas_dgemv(MagmaNoTrans, dofs, k, c_mone, q(0),
                                dofs, &dH(1,k), 1, c_one, r.val, 1);
                #ifdef DNRM2SCALE
                magma_dcopy(dofs, r.val, 1, q(k), 1);       // q[k] = r / H(k,k-1)
                magma_dnrm2scale(dofs, q(k), dofs, &dH(k+1,k) );
                            // dH(k+1,k) = sqrt(r . r) and r = r / dH(k+1,k)
                magma_event_record( event[0], stream[0] );
                            // start sending dH(1:k,k) to H(1:k,k)
                magma_queue_wait_event( stream[1], event[0] );
                            // can we keep H(k+1,k) on GPU and combine?
                magma_dgetvector_async(k+1, &dH(1,k), 1, &H(1,k), 1, stream[1]);
                #else
                H(k+1,k) = MAGMA_D_MAKE( magma_dnrm2(dofs, r.val, 1), 0. );
                            // H(k+1,k) = sqrt(r . r)
                if( k < solver_par->restart ) {
                    magmablasSetKernelStream(stream[0]);
                    magma_dcopy(dofs, r.val, 1, q(k), 1);     // q[k] = 1.0/H[k][k-1] r
                    magma_dscal(dofs, 1./H(k+1,k), q(k), 1);  // (to be fused)
                }
                #endif
            }
        }
        magma_queue_sync( stream[1] );

        for( k=1; k <= restart; k++ ) {
            /* Minimization of || b-Ax || in H_k */
            for (i=1; i <= k; i++) {
                #if defined(PRECISION_z) || defined(PRECISION_c)
                cblas_ddot_sub( i+1, &H(1,k), 1, &H(1,i), 1, &HH(k,i) );
                #else
                HH(k,i) = cblas_ddot(i+1, &H(1,k), 1, &H(1,i), 1);
                #endif
            }
            h1[k] = H(1,k)*H(1,0);
            if (k != 1) {
                for (i=1; i < k; i++) {
                    for (m=i+1; m < k; m++) {
                        HH(k,m) -= HH(k,i) * HH(m,i);
                    }
                    HH(k,k) -= HH(k,i) * HH(k,i) / HH(i,i);
                    HH(k,i)  = HH(k,i) / HH(i,i);
                    h1[k]   -= h1[i] * HH(k,i);
                }
            }
            y[k] = h1[k] / HH(k,k);
            if (k != 1) {
                for (i=k-1; i >= 1; i--) {
                    y[i] = h1[i] / HH(i,i);
                    for (j=i+1; j <= k; j++)
                        y[i] -= y[j] * HH(j,i);
                }
            }
            m = k;
            rNorm = fabs(MAGMA_D_REAL(H(k+1,k)));
        }

        // update the solution: x = x + Z(:,1:m) y
        magma_dsetmatrix_async(m, 1, y+1, m, dy, m, stream[0]);
        magmablasSetKernelStream(stream[0]);
        magma_dgemv(MagmaNoTrans, dofs, m, c_one, z(0), dofs, dy, 1,
                    c_one, x->val, 1);

        magma_d_spmv( c_mone, A, *x, c_zero, r );       // r = - A * x
        magma_daxpy(dofs, c_one, b.val, 1, r.val, 1);   // r = r + b
        H(1,0) = MAGMA_D_MAKE( magma_dnrm2(dofs, r.val, 1), 0. );
                                                        // RNorm = H[1][0] = || r ||
        RNorm = MAGMA_D_REAL( H(1,0) );
        betanom = fabs(RNorm);

        if( solver_par->verbose > 0 ) {
            magma_device_sync();
            tempo2 = magma_wtime();
            if( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                    = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                    = (real_Double_t) tempo2-tempo1;
            }
        }

        if ( betanom < r0 ) {
            break;
        }
    }

    magma_device_sync();
    tempo2 = magma_wtime();
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    double residual;
    magma_dresidual( A, b, *x, &residual );
    solver_par->iter_res = betanom;
    solver_par->final_res = residual;

    if( solver_par->numiter < solver_par->maxiter) {
        solver_par->info = 0;
    }
    else if( solver_par->init_res > solver_par->final_res ) {
        if( solver_par->verbose > 0 ) {
            if( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                    = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                    = (real_Double_t) tempo2-tempo1;
            }
        }
        solver_par->info = -2;
    }
    else {
        if( solver_par->verbose > 0 ) {
            if( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                    = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                    = (real_Double_t) tempo2-tempo1;
            }
        }
        solver_par->info = -1;
    }

    // free pinned memory
    magma_free_pinned( H );
    magma_free_pinned( y );
    magma_free_pinned( HH );
    magma_free_pinned( h1 );
    // free GPU memory
    magma_free(dy);
    if (dH != NULL )
        magma_free(dH);
    magma_d_vfree(&t);
    magma_d_vfree(&r);
    magma_d_vfree(&q);
    magma_d_vfree(&z);
    magma_d_vfree(&z_t);
    // free GPU streams and events
    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[1] );
    magma_event_destroy( event[0] );
    magmablasSetKernelStream(NULL);

    return MAGMA_SUCCESS;
}   /* magma_dpgmres */
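One step of the driver above that is easy to misread is the minimization loop: instead of the usual Givens-rotation recurrence on the Hessenberg matrix, this implementation solves the small least-squares problem through its normal equations. With $\bar H_k$ the $(k{+}1)\times k$ Hessenberg matrix accumulated in `H` and $\beta = \lVert r_0 \rVert$ stored in `H(1,0)`,

$$
\min_{y}\, \lVert \beta e_1 - \bar H_k\,y \rVert_2
\quad\Longleftrightarrow\quad
\big(\bar H_k^{\top} \bar H_k\big)\, y \;=\; \beta\, \bar H_k^{\top} e_1,
$$

where `HH(k,i)` accumulates the Gram entries (each Hessenberg column $i$ has nonzeros only in rows $1{:}i{+}1$, hence the dot products of length `i+1`) and `h1[k] = H(1,k)*H(1,0)` is the right-hand side. The nested loops then perform an in-place elimination on `HH` followed by back substitution for `y`, and the update $x \gets x + Z\,y$ is applied through the preconditioned basis stored in `z`.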
extern "C" magma_int_t magma_dlabrd_gpu( magma_int_t m, magma_int_t n, magma_int_t nb, double *a, magma_int_t lda, double *da, magma_int_t ldda, double *d, double *e, double *tauq, double *taup, double *x, magma_int_t ldx, double *dx, magma_int_t lddx, double *y, magma_int_t ldy, double *dy, magma_int_t lddy) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= DLABRD reduces the first NB rows and columns of a real general m by n matrix A to upper or lower bidiagonal form by an orthogonal transformation Q' * A * P, and returns the matrices X and Y which are needed to apply the transformation to the unreduced part of A. If m >= n, A is reduced to upper bidiagonal form; if m < n, to lower bidiagonal form. This is an auxiliary routine called by SGEBRD Arguments ========= M (input) INTEGER The number of rows in the matrix A. N (input) INTEGER The number of columns in the matrix A. NB (input) INTEGER The number of leading rows and columns of A to be reduced. A (input/output) DOUBLE_PRECISION array, dimension (LDA,N) On entry, the m by n general matrix to be reduced. On exit, the first NB rows and columns of the matrix are overwritten; the rest of the array is unchanged. If m >= n, elements on and below the diagonal in the first NB columns, with the array TAUQ, represent the orthogonal matrix Q as a product of elementary reflectors; and elements above the diagonal in the first NB rows, with the array TAUP, represent the orthogonal matrix P as a product of elementary reflectors. If m < n, elements below the diagonal in the first NB columns, with the array TAUQ, represent the orthogonal matrix Q as a product of elementary reflectors, and elements on and above the diagonal in the first NB rows, with the array TAUP, represent the orthogonal matrix P as a product of elementary reflectors. See Further Details. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). D (output) DOUBLE_PRECISION array, dimension (NB) The diagonal elements of the first NB rows and columns of the reduced matrix. D(i) = A(i,i). E (output) DOUBLE_PRECISION array, dimension (NB) The off-diagonal elements of the first NB rows and columns of the reduced matrix. TAUQ (output) DOUBLE_PRECISION array dimension (NB) The scalar factors of the elementary reflectors which represent the orthogonal matrix Q. See Further Details. TAUP (output) DOUBLE_PRECISION array, dimension (NB) The scalar factors of the elementary reflectors which represent the orthogonal matrix P. See Further Details. X (output) DOUBLE_PRECISION array, dimension (LDX,NB) The m-by-nb matrix X required to update the unreduced part of A. LDX (input) INTEGER The leading dimension of the array X. LDX >= M. Y (output) DOUBLE_PRECISION array, dimension (LDY,NB) The n-by-nb matrix Y required to update the unreduced part of A. LDY (input) INTEGER The leading dimension of the array Y. LDY >= N. Further Details =============== The matrices Q and P are represented as products of elementary reflectors: Q = H(1) H(2) . . . H(nb) and P = G(1) G(2) . . . G(nb) Each H(i) and G(i) has the form: H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' where tauq and taup are real scalars, and v and u are real vectors. If m >= n, v(1:i-1) = 0, v(i) = 1, and v(i:m) is stored on exit in A(i:m,i); u(1:i) = 0, u(i+1) = 1, and u(i+1:n) is stored on exit in A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i). 
If m < n, v(1:i) = 0, v(i+1) = 1, and v(i+1:m) is stored on exit in A(i+2:m,i); u(1:i-1) = 0, u(i) = 1, and u(i:n) is stored on exit in A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i). The elements of the vectors v and u together form the m-by-nb matrix V and the nb-by-n matrix U' which are needed, with X and Y, to apply the transformation to the unreduced part of the matrix, using a block update of the form: A := A - V*Y' - X*U'. The contents of A on exit are illustrated by the following examples with nb = 2: m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n): ( 1 1 u1 u1 u1 ) ( 1 u1 u1 u1 u1 u1 ) ( v1 1 1 u2 u2 ) ( 1 1 u2 u2 u2 u2 ) ( v1 v2 a a a ) ( v1 1 a a a a ) ( v1 v2 a a a ) ( v1 v2 a a a a ) ( v1 v2 a a a ) ( v1 v2 a a a a ) ( v1 v2 a a a ) where a denotes an element of the original matrix which is unchanged, vi denotes an element of the vector defining H(i), and ui an element of the vector defining G(i). ===================================================================== */ /* Table of constant values */ double c_neg_one = MAGMA_D_NEG_ONE; double c_one = MAGMA_D_ONE; double c_zero = MAGMA_D_ZERO; magma_int_t c__1 = 1; /* System generated locals */ magma_int_t a_dim1, a_offset, x_dim1, x_offset, y_dim1, y_offset, i__2, i__3; /* Local variables */ magma_int_t i__; double alpha; a_dim1 = lda; a_offset = 1 + a_dim1; a -= a_offset; --d; --e; --tauq; --taup; x_dim1 = ldx; x_offset = 1 + x_dim1; x -= x_offset; dx-= 1 + lddx; y_dim1 = ldy; y_offset = 1 + y_dim1; y -= y_offset; dy-= 1 + lddy; /* Function Body */ if (m <= 0 || n <= 0) { return 0; } double *f; magma_queue_t stream; magma_queue_create( &stream ); magma_dmalloc_cpu( &f, max(n,m) ); assert( f != NULL ); // TODO return error, or allocate outside dlatrd if (m >= n) { /* Reduce to upper bidiagonal form */ for (i__ = 1; i__ <= nb; ++i__) { /* Update A(i:m,i) */ i__2 = m - i__ + 1; i__3 = i__ - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__3, &y[i__+y_dim1], &ldy ); #endif blasf77_dgemv("No transpose", &i__2, &i__3, &c_neg_one, &a[i__ + a_dim1], &lda, &y[i__+y_dim1], &ldy, &c_one, &a[i__ + i__ * a_dim1], &c__1); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__3, &y[i__+y_dim1], &ldy ); #endif blasf77_dgemv("No transpose", &i__2, &i__3, &c_neg_one, &x[i__ + x_dim1], &ldx, &a[i__*a_dim1+1], &c__1, &c_one, &a[i__+i__*a_dim1], &c__1); /* Generate reflection Q(i) to annihilate A(i+1:m,i) */ alpha = a[i__ + i__ * a_dim1]; i__2 = m - i__ + 1; i__3 = i__ + 1; lapackf77_dlarfg(&i__2, &alpha, &a[min(i__3,m) + i__ * a_dim1], &c__1, &tauq[i__]); d[i__] = MAGMA_D_REAL( alpha ); if (i__ < n) { a[i__ + i__ * a_dim1] = c_one; /* Compute Y(i+1:n,i) */ i__2 = m - i__ + 1; i__3 = n - i__; // 1. Send the block reflector A(i+1:m,i) to the GPU ------ magma_dsetvector( i__2, a + i__ + i__ * a_dim1, 1, da+(i__-1)+(i__-1)* (ldda), 1 ); // 2. Multiply --------------------------------------------- magma_dgemv(MagmaTrans, i__2, i__3, c_one, da + (i__-1) + ((i__-1) + 1) * (ldda), ldda, da + (i__-1) + (i__-1) * (ldda), c__1, c_zero, dy + i__ + 1 + i__ * y_dim1, c__1); // 3. 
Put the result back ---------------------------------- magma_dgetmatrix_async( i__3, 1, dy+i__+1+i__*y_dim1, y_dim1, y+i__+1+i__*y_dim1, y_dim1, stream ); i__2 = m - i__ + 1; i__3 = i__ - 1; blasf77_dgemv(MagmaTransStr, &i__2, &i__3, &c_one, &a[i__ + a_dim1], &lda, &a[i__ + i__ * a_dim1], &c__1, &c_zero, &y[i__ * y_dim1 + 1], &c__1); i__2 = n - i__; i__3 = i__ - 1; blasf77_dgemv("N", &i__2, &i__3, &c_neg_one, &y[i__ + 1 +y_dim1], &ldy, &y[i__ * y_dim1 + 1], &c__1, &c_zero, f, &c__1); i__2 = m - i__ + 1; i__3 = i__ - 1; blasf77_dgemv(MagmaTransStr, &i__2, &i__3, &c_one, &x[i__ + x_dim1], &ldx, &a[i__ + i__ * a_dim1], &c__1, &c_zero, &y[i__ * y_dim1 + 1], &c__1); // 4. Synch to make sure the result is back ---------------- magma_queue_sync( stream ); if (i__3!=0){ i__2 = n - i__; blasf77_daxpy(&i__2, &c_one, f,&c__1, &y[i__+1+i__*y_dim1],&c__1); } i__2 = i__ - 1; i__3 = n - i__; blasf77_dgemv(MagmaTransStr, &i__2, &i__3, &c_neg_one, &a[(i__ + 1) * a_dim1 + 1], &lda, &y[i__ * y_dim1 + 1], &c__1, &c_one, &y[i__ + 1 + i__ * y_dim1], &c__1); i__2 = n - i__; blasf77_dscal(&i__2, &tauq[i__], &y[i__ + 1 + i__ * y_dim1], &c__1); /* Update A(i,i+1:n) */ i__2 = n - i__; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__2, &a[i__+(i__+1)*a_dim1], &lda ); lapackf77_dlacgv( &i__, &a[i__+a_dim1], &lda ); #endif blasf77_dgemv("No transpose", &i__2, &i__, &c_neg_one, &y[i__ + 1 + y_dim1], &ldy, &a[i__ + a_dim1], &lda, &c_one, &a[i__ + ( i__ + 1) * a_dim1], &lda); i__2 = i__ - 1; i__3 = n - i__; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__, &a[i__+a_dim1], &lda ); lapackf77_dlacgv( &i__2, &x[i__+x_dim1], &ldx ); #endif blasf77_dgemv(MagmaTransStr, &i__2, &i__3, &c_neg_one, &a[(i__ + 1) * a_dim1 + 1], &lda, &x[i__ + x_dim1], &ldx, &c_one, &a[ i__ + (i__ + 1) * a_dim1], &lda); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__2, &x[i__+x_dim1], &ldx ); #endif /* Generate reflection P(i) to annihilate A(i,i+2:n) */ i__2 = n - i__; /* Computing MIN */ i__3 = i__ + 2; alpha = a[i__ + (i__ + 1) * a_dim1]; lapackf77_dlarfg(&i__2, &alpha, &a[i__ + min( i__3,n) * a_dim1], &lda, &taup[i__]); e[i__] = MAGMA_D_REAL( alpha ); a[i__ + (i__ + 1) * a_dim1] = c_one; /* Compute X(i+1:m,i) */ i__2 = m - i__; i__3 = n - i__; // 1. Send the block reflector A(i+1:m,i) to the GPU ------ magma_dsetvector( i__3, a + i__ + (i__ +1)* a_dim1, lda, da+(i__-1)+((i__-1)+1)*(ldda), ldda ); // 2. Multiply --------------------------------------------- //magma_dcopy(i__3, da+(i__-1)+((i__-1)+1)*(ldda), ldda, // dy + 1 + lddy, 1); magma_dgemv(MagmaNoTrans, i__2, i__3, c_one, da + (i__-1)+1+ ((i__-1)+1) * (ldda), ldda, da + (i__-1) + ((i__-1)+1) * (ldda), ldda, //dy + 1 + lddy, 1, c_zero, dx + i__ + 1 + i__ * x_dim1, c__1); // 3. Put the result back ---------------------------------- magma_dgetmatrix_async( i__2, 1, dx+i__+1+i__*x_dim1, x_dim1, x+i__+1+i__*x_dim1, x_dim1, stream ); i__2 = n - i__; blasf77_dgemv(MagmaTransStr, &i__2, &i__, &c_one, &y[i__ + 1 + y_dim1], &ldy, &a[i__ + (i__ + 1) * a_dim1], &lda, &c_zero, &x[ i__ * x_dim1 + 1], &c__1); i__2 = m - i__; blasf77_dgemv("N", &i__2, &i__, &c_neg_one, &a[i__ + 1 + a_dim1], &lda, &x[i__ * x_dim1 + 1], &c__1, &c_zero, f, &c__1); i__2 = i__ - 1; i__3 = n - i__; blasf77_dgemv("N", &i__2, &i__3, &c_one, &a[(i__ + 1) * a_dim1 + 1], &lda, &a[i__ + (i__ + 1) * a_dim1], &lda, &c_zero, &x[i__ * x_dim1 + 1], &c__1); // 4. 
Synch to make sure the result is back ---------------- magma_queue_sync( stream ); if (i__!=0){ i__2 = m - i__; blasf77_daxpy(&i__2, &c_one, f,&c__1, &x[i__+1+i__*x_dim1],&c__1); } i__2 = m - i__; i__3 = i__ - 1; blasf77_dgemv("No transpose", &i__2, &i__3, &c_neg_one, &x[i__ + 1 + x_dim1], &ldx, &x[i__ * x_dim1 + 1], &c__1, &c_one, &x[ i__ + 1 + i__ * x_dim1], &c__1); i__2 = m - i__; blasf77_dscal(&i__2, &taup[i__], &x[i__ + 1 + i__ * x_dim1], &c__1); #if defined(PRECISION_z) || defined(PRECISION_c) i__2 = n - i__; lapackf77_dlacgv( &i__2, &a[i__+(i__+1)*a_dim1], &lda ); // 4. Send the block reflector A(i+1:m,i) to the GPU after DLACGV() magma_dsetvector( i__2, a + i__ + (i__ +1)* a_dim1, lda, da+(i__-1)+((i__-1)+1)*(ldda), ldda ); #endif } } } else { /* Reduce to lower bidiagonal form */ for (i__ = 1; i__ <= nb; ++i__) { /* Update A(i,i:n) */ i__2 = n - i__ + 1; i__3 = i__ - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i__2, &a[i__ + i__ * a_dim1], &lda); lapackf77_dlacgv(&i__3, &a[i__ + a_dim1], &lda); #endif blasf77_dgemv("No transpose", &i__2, &i__3, &c_neg_one, &y[i__ + y_dim1], &ldy, &a[i__ + a_dim1], &lda, &c_one, &a[i__ + i__ * a_dim1], &lda); i__2 = i__ - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i__3, &a[i__ + a_dim1], &lda); lapackf77_dlacgv(&i__3, &x[i__ + x_dim1], &ldx); #endif i__3 = n - i__ + 1; blasf77_dgemv(MagmaTransStr, &i__2, &i__3, &c_neg_one, &a[i__ * a_dim1 + 1], &lda, &x[i__ + x_dim1], &ldx, &c_one, &a[i__ + i__ * a_dim1], &lda); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i__2, &x[i__ + x_dim1], &ldx); #endif /* Generate reflection P(i) to annihilate A(i,i+1:n) */ i__2 = n - i__ + 1; /* Computing MIN */ i__3 = i__ + 1; alpha = a[i__ + i__ * a_dim1]; lapackf77_dlarfg(&i__2, &alpha, &a[i__ + min(i__3,n) * a_dim1], &lda, &taup[i__]); d[i__] = MAGMA_D_REAL( alpha ); if (i__ < m) { a[i__ + i__ * a_dim1] = c_one; /* Compute X(i+1:m,i) */ i__2 = m - i__; i__3 = n - i__ + 1; // 1. Send the block reflector A(i,i+1:n) to the GPU ------ magma_dsetvector( i__3, a + i__ + i__ * a_dim1, lda, da+(i__-1)+(i__-1)* (ldda), ldda ); // 2. Multiply --------------------------------------------- //magma_dcopy(i__3, da+(i__-1)+(i__-1)*(ldda), ldda, // dy + 1 + lddy, 1); magma_dgemv(MagmaNoTrans, i__2, i__3, c_one, da + (i__-1)+1 + (i__-1) * ldda, ldda, da + (i__-1) + (i__-1) * ldda, ldda, // dy + 1 + lddy, 1, c_zero, dx + i__ + 1 + i__ * x_dim1, c__1); // 3. Put the result back ---------------------------------- magma_dgetmatrix_async( i__2, 1, dx+i__+1+i__*x_dim1, x_dim1, x+i__+1+i__*x_dim1, x_dim1, stream ); i__2 = n - i__ + 1; i__3 = i__ - 1; blasf77_dgemv(MagmaTransStr, &i__2, &i__3, &c_one, &y[i__ + y_dim1], &ldy, &a[i__ + i__ * a_dim1], &lda, &c_zero, &x[i__ * x_dim1 + 1], &c__1); i__2 = m - i__; i__3 = i__ - 1; blasf77_dgemv("No transpose", &i__2, &i__3, &c_neg_one, &a[i__ + 1 + a_dim1], &lda, &x[i__ * x_dim1 + 1], &c__1, &c_zero, f, &c__1); i__2 = i__ - 1; i__3 = n - i__ + 1; blasf77_dgemv("No transpose", &i__2, &i__3, &c_one, &a[i__ * a_dim1 + 1], &lda, &a[i__ + i__ * a_dim1], &lda, &c_zero, &x[i__ * x_dim1 + 1], &c__1); // 4. 
Synch to make sure the result is back ---------------- magma_queue_sync( stream ); if (i__2!=0){ i__3 = m - i__; blasf77_daxpy(&i__3, &c_one, f,&c__1, &x[i__+1+i__*x_dim1],&c__1); } i__2 = m - i__; i__3 = i__ - 1; blasf77_dgemv("No transpose", &i__2, &i__3, &c_neg_one, &x[i__ + 1 + x_dim1], &ldx, &x[i__ * x_dim1 + 1], &c__1, &c_one, &x[i__ + 1 + i__ * x_dim1], &c__1); i__2 = m - i__; blasf77_dscal(&i__2, &taup[i__], &x[i__ + 1 + i__ * x_dim1], &c__1); i__2 = n - i__ + 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i__2, &a[i__ + i__ * a_dim1], &lda); magma_dsetvector( i__2, a + i__ + (i__ )* a_dim1, lda, da+(i__-1)+ (i__-1)*(ldda), ldda ); #endif /* Update A(i+1:m,i) */ i__2 = m - i__; i__3 = i__ - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i__3, &y[i__ + y_dim1], &ldy); #endif blasf77_dgemv("No transpose", &i__2, &i__3, &c_neg_one, &a[i__ + 1 + a_dim1], &lda, &y[i__ + y_dim1], &ldy, &c_one, &a[i__ + 1 + i__ * a_dim1], &c__1); i__2 = m - i__; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i__3, &y[i__ + y_dim1], &ldy); #endif blasf77_dgemv("No transpose", &i__2, &i__, &c_neg_one, &x[i__ + 1 + x_dim1], &ldx, &a[i__ * a_dim1 + 1], &c__1, &c_one, &a[i__ + 1 + i__ * a_dim1], &c__1); /* Generate reflection Q(i) to annihilate A(i+2:m,i) */ i__2 = m - i__; i__3 = i__ + 2; alpha = a[i__ + 1 + i__ * a_dim1]; lapackf77_dlarfg(&i__2, &alpha, &a[min(i__3,m) + i__ * a_dim1], &c__1, &tauq[i__]); e[i__] = MAGMA_D_REAL( alpha ); a[i__ + 1 + i__ * a_dim1] = c_one; /* Compute Y(i+1:n,i) */ i__2 = m - i__; i__3 = n - i__; // 1. Send the block reflector A(i+1:m,i) to the GPU ------ magma_dsetvector( i__2, a + i__ +1+ i__ * a_dim1, 1, da+(i__-1)+1+ (i__-1)*(ldda), 1 ); // 2. Multiply --------------------------------------------- magma_dgemv(MagmaTrans, i__2, i__3, c_one, da + (i__-1)+1+ ((i__-1)+1) * ldda, ldda, da + (i__-1)+1+ (i__-1) * ldda, c__1, c_zero, dy + i__ + 1 + i__ * y_dim1, c__1); // 3. Put the result back ---------------------------------- magma_dgetmatrix_async( i__3, 1, dy+i__+1+i__*y_dim1, y_dim1, y+i__+1+i__*y_dim1, y_dim1, stream ); i__2 = m - i__; i__3 = i__ - 1; blasf77_dgemv(MagmaTransStr, &i__2, &i__3, &c_one, &a[i__ + 1 + a_dim1], &lda, &a[i__ + 1 + i__ * a_dim1], &c__1, &c_zero, &y[ i__ * y_dim1 + 1], &c__1); i__2 = n - i__; i__3 = i__ - 1; blasf77_dgemv("No transpose", &i__2, &i__3, &c_neg_one, &y[i__ + 1 + y_dim1], &ldy, &y[i__ * y_dim1 + 1], &c__1, &c_zero, f, &c__1); i__2 = m - i__; blasf77_dgemv(MagmaTransStr, &i__2, &i__, &c_one, &x[i__ + 1 + x_dim1], &ldx, &a[i__ + 1 + i__ * a_dim1], &c__1, &c_zero, &y[i__ * y_dim1 + 1], &c__1); // 4. Synch to make sure the result is back ---------------- magma_queue_sync( stream ); if (i__3!=0){ i__2 = n - i__; blasf77_daxpy(&i__2, &c_one, f,&c__1, &y[i__+1+i__*y_dim1],&c__1); } i__2 = n - i__; blasf77_dgemv(MagmaTransStr, &i__, &i__2, &c_neg_one, &a[(i__ + 1) * a_dim1 + 1], &lda, &y[i__ * y_dim1 + 1], &c__1, &c_one, &y[i__ + 1 + i__ * y_dim1], &c__1); i__2 = n - i__; blasf77_dscal(&i__2, &tauq[i__], &y[i__ + 1 + i__ * y_dim1], &c__1); } #if defined(PRECISION_z) || defined(PRECISION_c) else { i__2 = n - i__ + 1; lapackf77_dlacgv(&i__2, &a[i__ + i__ * a_dim1], &lda); magma_dsetvector( i__2, a + i__ + (i__ )* a_dim1, lda, da+(i__-1)+ (i__-1)*(ldda), ldda ); } #endif } } magma_queue_destroy( stream ); magma_free_cpu(f); return MAGMA_SUCCESS; } /* dlabrd */
extern "C" magma_int_t magma_dgesv_rbt_batched( magma_int_t n, magma_int_t nrhs, double **dA_array, magma_int_t ldda, double **dB_array, magma_int_t lddb, magma_int_t *dinfo_array, magma_int_t batchCount, magma_queue_t queue) { /* Local variables */ magma_int_t i, info; info = 0; if (n < 0) { info = -1; } else if (nrhs < 0) { info = -2; } else if (ldda < max(1,n)) { info = -4; } else if (lddb < max(1,n)) { info = -6; } if (info != 0) { magma_xerbla( __func__, -(info) ); return info; } /* Quick return if possible */ if (n == 0 || nrhs == 0) { return info; } double *hu, *hv; if (MAGMA_SUCCESS != magma_dmalloc_cpu( &hu, 2*n )) { info = MAGMA_ERR_HOST_ALLOC; return info; } if (MAGMA_SUCCESS != magma_dmalloc_cpu( &hv, 2*n )) { info = MAGMA_ERR_HOST_ALLOC; return info; } info = magma_dgerbt_batched(MagmaTrue, n, nrhs, dA_array, n, dB_array, n, hu, hv, &info, batchCount, queue); if (info != MAGMA_SUCCESS) { return info; } info = magma_dgetrf_nopiv_batched( n, n, dA_array, ldda, dinfo_array, batchCount, queue); if ( info != MAGMA_SUCCESS ) { return info; } #ifdef CHECK_INFO // check correctness of results throught "dinfo_magma" and correctness of argument throught "info" magma_int_t *cpu_info = NULL; magma_imalloc_cpu( &cpu_info, batchCount ); magma_getvector( batchCount, sizeof(magma_int_t), dinfo_array, 1, cpu_info, 1); for (i=0; i < batchCount; i++) { if (cpu_info[i] != 0 ) { printf("magma_dgetrf_batched matrix %d returned error %d\n",i, (int)cpu_info[i] ); info = cpu_info[i]; magma_free_cpu (cpu_info); return info; } } magma_free_cpu (cpu_info); #endif info = magma_dgetrs_nopiv_batched( MagmaNoTrans, n, nrhs, dA_array, ldda, dB_array, lddb, dinfo_array, batchCount, queue ); /* The solution of A.x = b is Vy computed on the GPU */ double *dv; if (MAGMA_SUCCESS != magma_dmalloc( &dv, 2*n )) { info = MAGMA_ERR_DEVICE_ALLOC; return info; } magma_dsetvector( 2*n, hv, 1, dv, 1, queue ); for (i = 0; i < nrhs; i++) magmablas_dprbt_mv_batched(n, dv, dB_array+(i), batchCount, queue); // magma_dgetmatrix( n, nrhs, db, nn, B, ldb, queue ); return info; }
/**
    Purpose
    -------
    DLAHR2 reduces the first NB columns of a real general n-BY-(n-k+1)
    matrix A so that elements below the k-th subdiagonal are zero. The
    reduction is performed by an orthogonal similarity transformation
    Q' * A * Q. The routine returns the matrices V and T which determine
    Q as a block reflector I - V*T*V', and also the matrix Y = A * V.
    (Note this is different than LAPACK, which computes Y = A * V * T.)

    This is an auxiliary routine called by DGEHRD.

    Arguments
    ---------
    @param[in]
    n       INTEGER
            The order of the matrix A.

    @param[in]
    k       INTEGER
            The offset for the reduction. Elements below the k-th
            subdiagonal in the first NB columns are reduced to zero.
            K < N.

    @param[in]
    nb      INTEGER
            The number of columns to be reduced.

    @param[in,out]
    dA      DOUBLE PRECISION array on the GPU, dimension (LDDA,N-K+1)
            On entry, the n-by-(n-k+1) general matrix A.
            On exit, the elements in rows K:N of the first NB columns are
            overwritten with the matrix Y.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.
            LDDA >= max(1,N).

    @param[out]
    dV      DOUBLE PRECISION array on the GPU, dimension (LDDV, NB)
            On exit this n-by-nb array contains the Householder vectors
            of the transformation.

    @param[in]
    lddv    INTEGER
            The leading dimension of the array dV.
            LDDV >= max(1,N).

    @param[in,out]
    A       DOUBLE PRECISION array, dimension (LDA,N-K+1)
            On entry, the n-by-(n-k+1) general matrix A.
            On exit, the elements on and above the k-th subdiagonal in
            the first NB columns are overwritten with the corresponding
            elements of the reduced matrix; the elements below the k-th
            subdiagonal, with the array TAU, represent the matrix Q as a
            product of elementary reflectors. The other columns of A are
            unchanged. See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A. LDA >= max(1,N).

    @param[out]
    tau     DOUBLE PRECISION array, dimension (NB)
            The scalar factors of the elementary reflectors. See Further
            Details.

    @param[out]
    T       DOUBLE PRECISION array, dimension (LDT,NB)
            The upper triangular matrix T.

    @param[in]
    ldt     INTEGER
            The leading dimension of the array T. LDT >= NB.

    @param[out]
    Y       DOUBLE PRECISION array, dimension (LDY,NB)
            The n-by-nb matrix Y.

    @param[in]
    ldy     INTEGER
            The leading dimension of the array Y. LDY >= N.

    @param[in]
    queue   magma_queue_t
            Queue to execute in.

    Further Details
    ---------------
    The matrix Q is represented as a product of nb elementary reflectors

       Q = H(1) H(2) . . . H(nb).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in
    A(i+k+1:n,i), and tau in TAU(i).

    The elements of the vectors v together form the (n-k+1)-by-nb matrix
    V which is needed, with T and Y, to apply the transformation to the
    unreduced part of the matrix, using an update of the form:
    A := (I - V*T*V') * (A - Y*T*V').

    The contents of A on exit are illustrated by the following example
    with n = 7, k = 3 and nb = 2:

    @verbatim
       ( a   a   a   a   a )
       ( a   a   a   a   a )
       ( a   a   a   a   a )
       ( h   h   a   a   a )
       ( v1  h   a   a   a )
       ( v1  v2  a   a   a )
       ( v1  v2  a   a   a )
    @endverbatim

    where "a" denotes an element of the original matrix A, h denotes a
    modified element of the upper Hessenberg matrix H, and vi denotes an
    element of the vector defining H(i).

    This implementation follows the hybrid algorithm and notations described in

    S. Tomov and J. Dongarra, "Accelerating the reduction to upper
    Hessenberg form through hybrid GPU-based computing," University of
    Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK
    Working Note 219), May 24, 2009.

    @ingroup magma_dgeev_aux
    ********************************************************************/
extern "C" magma_int_t
magma_dlahr2(
    magma_int_t n, magma_int_t k, magma_int_t nb,
    magmaDouble_ptr dA, magma_int_t ldda,
    magmaDouble_ptr dV, magma_int_t lddv,
    double *A, magma_int_t lda,
    double *tau,
    double *T, magma_int_t ldt,
    double *Y, magma_int_t ldy,
    magma_queue_t queue )
{
    #define  A(i_,j_) ( A + (i_) + (j_)*lda)
    #define  Y(i_,j_) ( Y + (i_) + (j_)*ldy)
    #define  T(i_,j_) ( T + (i_) + (j_)*ldt)
    #define dA(i_,j_) (dA + (i_) + (j_)*ldda)
    #define dV(i_,j_) (dV + (i_) + (j_)*lddv)

    double c_zero    = MAGMA_D_ZERO;
    double c_one     = MAGMA_D_ONE;
    double c_neg_one = MAGMA_D_NEG_ONE;
    magma_int_t ione = 1;

    magma_int_t n_k_i_1, n_k;
    double scale;

    magma_int_t i;
    double ei = MAGMA_D_ZERO;

    magma_int_t info = 0;
    if (n < 0) {
        info = -1;
    } else if (k < 0 || k > n) {
        info = -2;
    } else if (nb < 1 || nb > n) {
        info = -3;
    } else if (ldda < max(1,n)) {
        info = -5;
    } else if (lddv < max(1,n)) {
        info = -7;
    } else if (lda < max(1,n)) {
        info = -9;
    } else if (ldt < max(1,nb)) {
        info = -12;
    } else if (ldy < max(1,n)) {
        info = -13;
    }
    if (info != 0) {
        magma_xerbla( __func__, -(info) );
        return info;
    }

    // adjust from 1-based indexing
    k -= 1;

    if (n <= 1)
        return info;

    for (i = 0; i < nb; ++i) {
        n_k_i_1 = n - k - i - 1;
        n_k     = n - k;

        if (i > 0) {
            // Update A(k:n-1,i); Update i-th column of A - Y * T * V'
            // This updates one more row than LAPACK does (row k),
            // making the block above the panel an even multiple of nb.
            // Use last column of T as workspace, w.

            // w(0:i-1, nb-1) = VA(k+i, 0:i-1)'
            blasf77_dcopy( &i,
                           A(k+i,0),  &lda,
                           T(0,nb-1), &ione );
            #ifdef COMPLEX
            // If complex, conjugate row of V.
            lapackf77_dlacgv(&i, T(0,nb-1), &ione);
            #endif

            // w = T(0:i-1, 0:i-1) * w
            blasf77_dtrmv( "Upper", "No trans", "Non-unit", &i,
                           T(0,0),    &ldt,
                           T(0,nb-1), &ione );

            // A(k:n-1, i) -= Y(k:n-1, 0:i-1) * w
            blasf77_dgemv( "No trans", &n_k, &i,
                           &c_neg_one, Y(k,0),    &ldy,
                                       T(0,nb-1), &ione,
                           &c_one,     A(k,i),    &ione );

            // Apply I - V * T' * V' to this column (call it b) from the
            // left, using the last column of T as workspace, w.
            //
            // Let  V = ( V1 )   and   b = ( b1 )   (first i-1 rows)
            //          ( V2 )             ( b2 )
            // where V1 is unit lower triangular

            // w := b1 = A(k+1:k+i, i)
            blasf77_dcopy( &i,
                           A(k+1,i),  &ione,
                           T(0,nb-1), &ione );

            // w := V1' * b1 = VA(k+1:k+i, 0:i-1)' * w
            blasf77_dtrmv( "Lower", "Conj", "Unit", &i,
                           A(k+1,0),  &lda,
                           T(0,nb-1), &ione );

            // w := w + V2'*b2 = w + VA(k+i+1:n-1, 0:i-1)' * A(k+i+1:n-1, i)
            blasf77_dgemv( "Conj", &n_k_i_1, &i,
                           &c_one, A(k+i+1,0), &lda,
                                   A(k+i+1,i), &ione,
                           &c_one, T(0,nb-1),  &ione );

            // w := T'*w = T(0:i-1, 0:i-1)' * w
            blasf77_dtrmv( "Upper", "Conj", "Non-unit", &i,
                           T(0,0),    &ldt,
                           T(0,nb-1), &ione );

            // b2 := b2 - V2*w = A(k+i+1:n-1, i) - VA(k+i+1:n-1, 0:i-1) * w
            blasf77_dgemv( "No trans", &n_k_i_1, &i,
                           &c_neg_one, A(k+i+1,0), &lda,
                                       T(0,nb-1),  &ione,
                           &c_one,     A(k+i+1,i), &ione );

            // w := V1*w = VA(k+1:k+i, 0:i-1) * w
            blasf77_dtrmv( "Lower", "No trans", "Unit", &i,
                           A(k+1,0),  &lda,
                           T(0,nb-1), &ione );

            // b1 := b1 - w = A(k+1:k+i-1, i) - w
            blasf77_daxpy( &i,
                           &c_neg_one, T(0,nb-1), &ione,
                                       A(k+1,i),  &ione );

            // Restore diagonal element, saved below during previous iteration
            *A(k+i,i-1) = ei;
        }

        // Generate the elementary reflector H(i) to annihilate A(k+i+1:n-1,i)
        lapackf77_dlarfg( &n_k_i_1,
                          A(k+i+1,i),
                          A(k+i+2,i), &ione, &tau[i] );
        // Save diagonal element and set to one, to simplify multiplying by V
        ei = *A(k+i+1,i);
        *A(k+i+1,i) = c_one;

        // dV(i+1:n-k-1, i) = VA(k+i+1:n-1, i)
        magma_dsetvector( n_k_i_1,
                          A(k+i+1,i), 1,
                          dV(i+1,i),  1, queue );

        // Compute Y(k+1:n,i) = A vi
        // dA(k:n-1, i) = dA(k:n-1, i+1:n-k-1) * dV(i+1:n-k-1, i)
        magma_dgemv( MagmaNoTrans, n_k, n_k_i_1,
                     c_one,  dA(k,i+1), ldda,
                             dV(i+1,i), ione,
                     c_zero, dA(k,i),   ione, queue );

        // Compute T(0:i,i) = [ -tau T V' vi ]
        //                    [  tau         ]
        // T(0:i-1, i) = -tau VA(k+i+1:n-1, 0:i-1)' VA(k+i+1:n-1, i)
        scale = MAGMA_D_NEGATE( tau[i]);
        blasf77_dgemv( "Conj", &n_k_i_1, &i,
                       &scale,  A(k+i+1,0), &lda,
                                A(k+i+1,i), &ione,
                       &c_zero, T(0,i),     &ione );
        // T(0:i-1, i) = T(0:i-1, 0:i-1) * T(0:i-1, i)
        blasf77_dtrmv( "Upper", "No trans", "Non-unit", &i,
                       T(0,0), &ldt,
                       T(0,i), &ione );
        *T(i,i) = tau[i];

        // Y(k:n-1, i) = dA(k:n-1, i)
        magma_dgetvector( n-k,
                          dA(k,i), 1,
                          Y(k,i),  1, queue );
    }
    // Restore diagonal element
    *A(k+nb,nb-1) = ei;

    return info;
} /* magma_dlahr2 */
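A one-line derivation of the block update quoted in the documentation may help. With $Q = I - V\,T\,V'$ as the block reflector and $Y = A\,V$ as returned by this routine,

$$
A\,Q = A - (A V)\,T\,V' = A - Y\,T\,V',
\qquad
Q'\,A\,Q = \big(I - V\,T\,V'\big)'\,\big(A - Y\,T\,V'\big),
$$

so once $V$, $T$, and $Y$ are available the two-sided similarity update costs two GEMM-shaped products; the documentation's form, with $T$ rather than $T'$ on the left factor, reflects the transpose convention DGEHRD uses when storing $T$.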
extern "C" magma_int_t magma_didr( magma_d_matrix A, magma_d_matrix b, magma_d_matrix *x, magma_d_solver_par *solver_par, magma_queue_t queue ) { magma_int_t info = MAGMA_NOTCONVERGED; // prepare solver feedback solver_par->solver = Magma_IDR; solver_par->numiter = 0; solver_par->spmv_count = 0; solver_par->init_res = 0.0; solver_par->final_res = 0.0; solver_par->iter_res = 0.0; solver_par->runtime = 0.0; // constants const double c_zero = MAGMA_D_ZERO; const double c_one = MAGMA_D_ONE; const double c_n_one = MAGMA_D_NEG_ONE; // internal user parameters const magma_int_t smoothing = 1; // 0 = disable, 1 = enable const double angle = 0.7; // [0-1] // local variables magma_int_t iseed[4] = {0, 0, 0, 1}; magma_int_t dof; magma_int_t s; magma_int_t distr; magma_int_t k, i, sk; magma_int_t innerflag; double residual; double nrm; double nrmb; double nrmr; double nrmt; double rho; double om; double tt; double tr; double gamma; double alpha; double mkk; double fk; // matrices and vectors magma_d_matrix dxs = {Magma_CSR}; magma_d_matrix dr = {Magma_CSR}, drs = {Magma_CSR}; magma_d_matrix dP = {Magma_CSR}, dP1 = {Magma_CSR}; magma_d_matrix dG = {Magma_CSR}; magma_d_matrix dU = {Magma_CSR}; magma_d_matrix dM = {Magma_CSR}; magma_d_matrix df = {Magma_CSR}; magma_d_matrix dt = {Magma_CSR}; magma_d_matrix dc = {Magma_CSR}; magma_d_matrix dv = {Magma_CSR}; magma_d_matrix dbeta = {Magma_CSR}, hbeta = {Magma_CSR}; // chronometry real_Double_t tempo1, tempo2; // initial s space // TODO: add option for 's' (shadow space number) // Hack: uses '--restart' option as the shadow space number. // This is not a good idea because the default value of restart option is used to detect // if the user provided a custom restart. This means that if the default restart value // is changed then the code will think it was the user (unless the default value is // also updated in the 'if' statement below. 
s = 1; if ( solver_par->restart != 50 ) { if ( solver_par->restart > A.num_cols ) { s = A.num_cols; } else { s = solver_par->restart; } } solver_par->restart = s; // set max iterations solver_par->maxiter = min( 2 * A.num_cols, solver_par->maxiter ); // check if matrix A is square if ( A.num_rows != A.num_cols ) { //printf("Matrix A is not square.\n"); info = MAGMA_ERR_NOT_SUPPORTED; goto cleanup; } // |b| nrmb = magma_dnrm2( b.num_rows, b.dval, 1, queue ); if ( nrmb == 0.0 ) { magma_dscal( x->num_rows, MAGMA_D_ZERO, x->dval, 1, queue ); info = MAGMA_SUCCESS; goto cleanup; } // r = b - A x CHECK( magma_dvinit( &dr, Magma_DEV, b.num_rows, 1, c_zero, queue )); CHECK( magma_dresidualvec( A, b, *x, &dr, &nrmr, queue )); // |r| solver_par->init_res = nrmr; solver_par->final_res = solver_par->init_res; solver_par->iter_res = solver_par->init_res; if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = (real_Double_t)nrmr; } // check if initial is guess good enough if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { info = MAGMA_SUCCESS; goto cleanup; } // P = randn(n, s) // P = ortho(P) //--------------------------------------- // P = 0.0 CHECK( magma_dvinit( &dP, Magma_CPU, A.num_cols, s, c_zero, queue )); // P = randn(n, s) distr = 3; // 1 = unif (0,1), 2 = unif (-1,1), 3 = normal (0,1) dof = dP.num_rows * dP.num_cols; lapackf77_dlarnv( &distr, iseed, &dof, dP.val ); // transfer P to device CHECK( magma_dmtransfer( dP, &dP1, Magma_CPU, Magma_DEV, queue )); magma_dmfree( &dP, queue ); // P = ortho(P1) if ( dP1.num_cols > 1 ) { // P = magma_dqr(P1), QR factorization CHECK( magma_dqr( dP1.num_rows, dP1.num_cols, dP1, dP1.ld, &dP, NULL, queue )); } else { // P = P1 / |P1| nrm = magma_dnrm2( dof, dP1.dval, 1, queue ); nrm = 1.0 / nrm; magma_dscal( dof, nrm, dP1.dval, 1, queue ); CHECK( magma_dmtransfer( dP1, &dP, Magma_DEV, Magma_DEV, queue )); } magma_dmfree( &dP1, queue ); //--------------------------------------- // allocate memory for the scalar products CHECK( magma_dvinit( &hbeta, Magma_CPU, s, 1, c_zero, queue )); CHECK( magma_dvinit( &dbeta, Magma_DEV, s, 1, c_zero, queue )); // smoothing enabled if ( smoothing > 0 ) { // set smoothing solution vector CHECK( magma_dmtransfer( *x, &dxs, Magma_DEV, Magma_DEV, queue )); // set smoothing residual vector CHECK( magma_dmtransfer( dr, &drs, Magma_DEV, Magma_DEV, queue )); } // G(n,s) = 0 CHECK( magma_dvinit( &dG, Magma_DEV, A.num_cols, s, c_zero, queue )); // U(n,s) = 0 CHECK( magma_dvinit( &dU, Magma_DEV, A.num_cols, s, c_zero, queue )); // M(s,s) = I CHECK( magma_dvinit( &dM, Magma_DEV, s, s, c_zero, queue )); magmablas_dlaset( MagmaFull, s, s, c_zero, c_one, dM.dval, s, queue ); // f = 0 CHECK( magma_dvinit( &df, Magma_DEV, dP.num_cols, 1, c_zero, queue )); // t = 0 CHECK( magma_dvinit( &dt, Magma_DEV, dr.num_rows, 1, c_zero, queue )); // c = 0 CHECK( magma_dvinit( &dc, Magma_DEV, dM.num_cols, 1, c_zero, queue )); // v = 0 CHECK( magma_dvinit( &dv, Magma_DEV, dr.num_rows, 1, c_zero, queue )); //--------------START TIME--------------- // chronometry tempo1 = magma_sync_wtime( queue ); if ( solver_par->verbose > 0 ) { solver_par->timing[0] = 0.0; } om = MAGMA_D_ONE; innerflag = 0; // start iteration do { solver_par->numiter++; // new RHS for small systems // f = P' r magmablas_dgemv( MagmaConjTrans, dP.num_rows, dP.num_cols, c_one, dP.dval, dP.ld, dr.dval, 1, c_zero, df.dval, 1, queue ); // shadow space loop for ( k = 0; k < s; ++k ) { sk = s - k; // f(k:s) = M(k:s,k:s) c(k:s) magma_dcopyvector( sk, &df.dval[k], 1, &dc.dval[k], 
1, queue ); magma_dtrsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, sk, &dM.dval[k*dM.ld+k], dM.ld, &dc.dval[k], 1, queue ); // v = r - G(:,k:s) c(k:s) magma_dcopyvector( dr.num_rows, dr.dval, 1, dv.dval, 1, queue ); magmablas_dgemv( MagmaNoTrans, dG.num_rows, sk, c_n_one, &dG.dval[k*dG.ld], dG.ld, &dc.dval[k], 1, c_one, dv.dval, 1, queue ); // U(:,k) = om * v + U(:,k:s) c(k:s) magmablas_dgemv( MagmaNoTrans, dU.num_rows, sk, c_one, &dU.dval[k*dU.ld], dU.ld, &dc.dval[k], 1, om, dv.dval, 1, queue ); magma_dcopyvector( dU.num_rows, dv.dval, 1, &dU.dval[k*dU.ld], 1, queue ); // G(:,k) = A U(:,k) CHECK( magma_d_spmv( c_one, A, dv, c_zero, dv, queue )); solver_par->spmv_count++; magma_dcopyvector( dG.num_rows, dv.dval, 1, &dG.dval[k*dG.ld], 1, queue ); // bi-orthogonalize the new basis vectors for ( i = 0; i < k; ++i ) { // alpha = P(:,i)' G(:,k) alpha = magma_ddot( dP.num_rows, &dP.dval[i*dP.ld], 1, &dG.dval[k*dG.ld], 1, queue ); // alpha = alpha / M(i,i) magma_dgetvector( 1, &dM.dval[i*dM.ld+i], 1, &mkk, 1, queue ); alpha = alpha / mkk; // G(:,k) = G(:,k) - alpha * G(:,i) magma_daxpy( dG.num_rows, -alpha, &dG.dval[i*dG.ld], 1, &dG.dval[k*dG.ld], 1, queue ); // U(:,k) = U(:,k) - alpha * U(:,i) magma_daxpy( dU.num_rows, -alpha, &dU.dval[i*dU.ld], 1, &dU.dval[k*dU.ld], 1, queue ); } // new column of M = P'G, first k-1 entries are zero // M(k:s,k) = P(:,k:s)' G(:,k) magmablas_dgemv( MagmaConjTrans, dP.num_rows, sk, c_one, &dP.dval[k*dP.ld], dP.ld, &dG.dval[k*dG.ld], 1, c_zero, &dM.dval[k*dM.ld+k], 1, queue ); // check M(k,k) == 0 magma_dgetvector( 1, &dM.dval[k*dM.ld+k], 1, &mkk, 1, queue ); if ( MAGMA_D_EQUAL(mkk, MAGMA_D_ZERO) ) { innerflag = 1; info = MAGMA_DIVERGENCE; break; } // beta = f(k) / M(k,k) magma_dgetvector( 1, &df.dval[k], 1, &fk, 1, queue ); hbeta.val[k] = fk / mkk; // check for nan if ( magma_d_isnan( hbeta.val[k] ) || magma_d_isinf( hbeta.val[k] )) { innerflag = 1; info = MAGMA_DIVERGENCE; break; } // r = r - beta * G(:,k) magma_daxpy( dr.num_rows, -hbeta.val[k], &dG.dval[k*dG.ld], 1, dr.dval, 1, queue ); // smoothing disabled if ( smoothing <= 0 ) { // |r| nrmr = magma_dnrm2( dr.num_rows, dr.dval, 1, queue ); // smoothing enabled } else { // x = x + beta * U(:,k) magma_daxpy( x->num_rows, hbeta.val[k], &dU.dval[k*dU.ld], 1, x->dval, 1, queue ); // smoothing operation //--------------------------------------- // t = rs - r magma_dcopyvector( drs.num_rows, drs.dval, 1, dt.dval, 1, queue ); magma_daxpy( dt.num_rows, c_n_one, dr.dval, 1, dt.dval, 1, queue ); // t't // t'rs tt = magma_ddot( dt.num_rows, dt.dval, 1, dt.dval, 1, queue ); tr = magma_ddot( dt.num_rows, dt.dval, 1, drs.dval, 1, queue ); // gamma = (t' * rs) / (t' * t) gamma = tr / tt; // rs = rs - gamma * (rs - r) magma_daxpy( drs.num_rows, -gamma, dt.dval, 1, drs.dval, 1, queue ); // xs = xs - gamma * (xs - x) magma_dcopyvector( dxs.num_rows, dxs.dval, 1, dt.dval, 1, queue ); magma_daxpy( dt.num_rows, c_n_one, x->dval, 1, dt.dval, 1, queue ); magma_daxpy( dxs.num_rows, -gamma, dt.dval, 1, dxs.dval, 1, queue ); // |rs| nrmr = magma_dnrm2( drs.num_rows, drs.dval, 1, queue ); //--------------------------------------- } // store current timing and residual if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter) % solver_par->verbose == 0 ) { solver_par->res_vec[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)nrmr; solver_par->timing[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)tempo2 - tempo1; } } // check convergence if ( nrmr <= solver_par->atol || 
nrmr/nrmb <= solver_par->rtol ) { s = k + 1; // for the x-update outside the loop innerflag = 2; info = MAGMA_SUCCESS; break; } // non-last s iteration if ( (k + 1) < s ) { // f(k+1:s) = f(k+1:s) - beta * M(k+1:s,k) magma_daxpy( sk-1, -hbeta.val[k], &dM.dval[k*dM.ld+(k+1)], 1, &df.dval[k+1], 1, queue ); } } // smoothing disabled if ( smoothing <= 0 && innerflag != 1 ) { // update solution approximation x // x = x + U(:,1:s) * beta(1:s) magma_dsetvector( s, hbeta.val, 1, dbeta.dval, 1, queue ); magmablas_dgemv( MagmaNoTrans, dU.num_rows, s, c_one, dU.dval, dU.ld, dbeta.dval, 1, c_one, x->dval, 1, queue ); } // check convergence or iteration limit or invalid result of inner loop if ( innerflag > 0 ) { break; } // t = A v // t = A r CHECK( magma_d_spmv( c_one, A, dr, c_zero, dt, queue )); solver_par->spmv_count++; // computation of a new omega //--------------------------------------- // |t| nrmt = magma_dnrm2( dt.num_rows, dt.dval, 1, queue ); // t'r tr = magma_ddot( dt.num_rows, dt.dval, 1, dr.dval, 1, queue ); // rho = abs(t' * r) / (|t| * |r|) rho = MAGMA_D_ABS( MAGMA_D_REAL(tr) / (nrmt * nrmr) ); // om = (t' * r) / (|t| * |t|) om = tr / (nrmt * nrmt); if ( rho < angle ) { om = (om * angle) / rho; } //--------------------------------------- if ( MAGMA_D_EQUAL(om, MAGMA_D_ZERO) ) { info = MAGMA_DIVERGENCE; break; } // update approximation vector // x = x + om * v // x = x + om * r magma_daxpy( x->num_rows, om, dr.dval, 1, x->dval, 1, queue ); // update residual vector // r = r - om * t magma_daxpy( dr.num_rows, -om, dt.dval, 1, dr.dval, 1, queue ); // smoothing disabled if ( smoothing <= 0 ) { // residual norm nrmr = magma_dnrm2( b.num_rows, dr.dval, 1, queue ); // smoothing enabled } else { // smoothing operation //--------------------------------------- // t = rs - r magma_dcopyvector( drs.num_rows, drs.dval, 1, dt.dval, 1, queue ); magma_daxpy( dt.num_rows, c_n_one, dr.dval, 1, dt.dval, 1, queue ); // t't // t'rs tt = magma_ddot( dt.num_rows, dt.dval, 1, dt.dval, 1, queue ); tr = magma_ddot( dt.num_rows, dt.dval, 1, drs.dval, 1, queue ); // gamma = (t' * rs) / (t' * t) gamma = tr / tt; // rs = rs - gamma * (rs - r) magma_daxpy( drs.num_rows, -gamma, dt.dval, 1, drs.dval, 1, queue ); // xs = xs - gamma * (xs - x) magma_dcopyvector( dxs.num_rows, dxs.dval, 1, dt.dval, 1, queue ); magma_daxpy( dt.num_rows, c_n_one, x->dval, 1, dt.dval, 1, queue ); magma_daxpy( dxs.num_rows, -gamma, dt.dval, 1, dxs.dval, 1, queue ); // |rs| nrmr = magma_dnrm2( b.num_rows, drs.dval, 1, queue ); //--------------------------------------- } // store current timing and residual if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter) % solver_par->verbose == 0 ) { solver_par->res_vec[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)nrmr; solver_par->timing[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)tempo2 - tempo1; } } // check convergence if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { info = MAGMA_SUCCESS; break; } } while ( solver_par->numiter + 1 <= solver_par->maxiter ); // smoothing enabled if ( smoothing > 0 ) { // x = xs magma_dcopyvector( x->num_rows, dxs.dval, 1, x->dval, 1, queue ); // r = rs magma_dcopyvector( dr.num_rows, drs.dval, 1, dr.dval, 1, queue ); } // get last iteration timing tempo2 = magma_sync_wtime( queue ); solver_par->runtime = (real_Double_t)tempo2 - tempo1; //--------------STOP TIME---------------- // get final stats solver_par->iter_res = nrmr; CHECK( magma_dresidualvec( A, b, *x, &dr,
&residual, queue )); solver_par->final_res = residual; // set solver conclusion if ( info != MAGMA_SUCCESS && info != MAGMA_DIVERGENCE ) { if ( solver_par->init_res > solver_par->final_res ) { info = MAGMA_SLOW_CONVERGENCE; } } cleanup: // free resources // smoothing enabled if ( smoothing > 0 ) { magma_dmfree( &dxs, queue ); magma_dmfree( &drs, queue ); } magma_dmfree( &dr, queue ); magma_dmfree( &dP, queue ); magma_dmfree( &dP1, queue ); magma_dmfree( &dG, queue ); magma_dmfree( &dU, queue ); magma_dmfree( &dM, queue ); magma_dmfree( &df, queue ); magma_dmfree( &dt, queue ); magma_dmfree( &dc, queue ); magma_dmfree( &dv, queue ); magma_dmfree( &dbeta, queue ); magma_dmfree( &hbeta, queue ); solver_par->info = info; return info; /* magma_didr */ }
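/*
   For reference, a standalone plain-C sketch (not MAGMA code) of the omega
   step used above: om = (t'r)/(t't) minimizes |r - om*t|, and the
   "maintaining the convergence" correction enlarges om whenever the angle
   between t and r becomes too small (rho < angle). The names dot() and
   compute_omega() are invented for this sketch.
*/
#include <math.h>

static double dot( const double *x, const double *y, int n )
{
    double s = 0.0;
    for (int i = 0; i < n; ++i)
        s += x[i] * y[i];
    return s;
}

/* omega for the updates x = x + om*r and r = r - om*t */
static double compute_omega( const double *t, const double *r, int n, double angle )
{
    double nrmt = sqrt( dot( t, t, n ) );
    double nrmr = sqrt( dot( r, r, n ) );
    double tr   = dot( t, r, n );
    double rho  = fabs( tr / (nrmt * nrmr) );   /* |cos| of the angle between t and r */
    double om   = tr / (nrmt * nrmt);           /* least-squares minimizer of |r - om*t| */
    if ( rho < angle )
        om = om * angle / rho;
    return om;
}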
magma_int_t magma_dcg_merge( magma_d_sparse_matrix A, magma_d_vector b, magma_d_vector *x, magma_d_solver_par *solver_par ){ // prepare solver feedback solver_par->solver = Magma_CGMERGE; solver_par->numiter = 0; solver_par->info = 0; // some useful variables double c_zero = MAGMA_D_ZERO, c_one = MAGMA_D_ONE; magma_int_t dofs = A.num_rows; // GPU stream magma_queue_t stream[2]; magma_event_t event[1]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); magma_event_create( &event[0] ); // GPU workspace magma_d_vector r, d, z; magma_d_vinit( &r, Magma_DEV, dofs, c_zero ); magma_d_vinit( &d, Magma_DEV, dofs, c_zero ); magma_d_vinit( &z, Magma_DEV, dofs, c_zero ); double *d1, *d2, *skp; magma_dmalloc( &d1, dofs*(1) ); magma_dmalloc( &d2, dofs*(1) ); // array for the parameters magma_dmalloc( &skp, 6 ); // skp = [alpha|beta|gamma|rho|tmp1|tmp2] // solver variables double alpha, beta, gamma, rho, tmp1, *skp_h; double nom, nom0, r0, betanom, den; // solver setup magma_dscal( dofs, c_zero, x->val, 1) ; // x = 0 magma_dcopy( dofs, b.val, 1, r.val, 1 ); // r = b magma_dcopy( dofs, b.val, 1, d.val, 1 ); // d = b nom0 = betanom = magma_dnrm2( dofs, r.val, 1 ); nom = nom0 * nom0; // nom = r' * r magma_d_spmv( c_one, A, d, c_zero, z ); // z = A d den = MAGMA_D_REAL( magma_ddot(dofs, d.val, 1, z.val, 1) ); // den = d'* z solver_par->init_res = nom0; // array on host for the parameters magma_dmalloc_cpu( &skp_h, 6 ); alpha = rho = gamma = tmp1 = c_one; beta = magma_ddot(dofs, r.val, 1, r.val, 1); skp_h[0]=alpha; skp_h[1]=beta; skp_h[2]=gamma; skp_h[3]=rho; skp_h[4]=tmp1; skp_h[5]=MAGMA_D_MAKE(nom, 0.0); magma_dsetvector( 6, skp_h, 1, skp, 1 ); if ( (r0 = nom * solver_par->epsilon) < ATOLERANCE ) r0 = ATOLERANCE; if ( nom < r0 ) return MAGMA_SUCCESS; // check positive definite if (den <= 0.0) { printf("Operator A is not positive definite. 
(Ar,r) = %f\n", den); return -100; } //Chronometry real_Double_t tempo1, tempo2; magma_device_sync(); tempo1=magma_wtime(); if( solver_par->verbose > 0 ){ solver_par->res_vec[0] = (real_Double_t) nom0; solver_par->timing[0] = 0.0; } // start iteration for( solver_par->numiter= 1; solver_par->numiter<solver_par->maxiter; solver_par->numiter++ ){ magmablasSetKernelStream(stream[0]); // computes SpMV and dot product magma_dcgmerge_spmv1( A, d1, d2, d.val, z.val, skp ); // updates x, r, computes scalars and updates d magma_dcgmerge_xrbeta( dofs, d1, d2, x->val, r.val, d.val, z.val, skp ); // check stopping criterion (asynchronous copy) magma_dgetvector_async( 1 , skp+1, 1, skp_h+1, 1, stream[1] ); betanom = sqrt(MAGMA_D_REAL(skp_h[1])); if( solver_par->verbose > 0 ){ magma_device_sync(); tempo2=magma_wtime(); if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( betanom < r0 ) { break; } } magma_device_sync(); tempo2=magma_wtime(); solver_par->runtime = (real_Double_t) tempo2-tempo1; double residual; magma_dresidual( A, b, *x, &residual ); solver_par->iter_res = betanom; solver_par->final_res = residual; if( solver_par->numiter < solver_par->maxiter){ solver_par->info = 0; }else if( solver_par->init_res > solver_par->final_res ){ if( solver_par->verbose > 0 ){ if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = -2; } else{ if( solver_par->verbose > 0 ){ if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = -1; } magma_d_vfree(&r); magma_d_vfree(&z); magma_d_vfree(&d); magma_free( d1 ); magma_free( d2 ); magma_free( skp ); magma_free_cpu( skp_h ); return MAGMA_SUCCESS; } /* magma_dcg_merge */
extern "C" magma_int_t magma_dcg_merge( magma_d_matrix A, magma_d_matrix b, magma_d_matrix *x, magma_d_solver_par *solver_par, magma_queue_t queue ) { magma_int_t info = MAGMA_NOTCONVERGED; // prepare solver feedback solver_par->solver = Magma_CGMERGE; solver_par->numiter = 0; solver_par->spmv_count = 0; // solver variables double alpha, beta, gamma, rho, tmp1, *skp_h={0}; double nom, nom0, betanom, den, nomb; // some useful variables double c_zero = MAGMA_D_ZERO, c_one = MAGMA_D_ONE; magma_int_t dofs = A.num_rows*b.num_cols; magma_d_matrix r={Magma_CSR}, d={Magma_CSR}, z={Magma_CSR}, B={Magma_CSR}, C={Magma_CSR}; double *d1=NULL, *d2=NULL, *skp=NULL; // GPU workspace CHECK( magma_dvinit( &r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_dvinit( &d, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_dvinit( &z, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_dmalloc( &d1, dofs*(1) )); CHECK( magma_dmalloc( &d2, dofs*(1) )); // array for the parameters CHECK( magma_dmalloc( &skp, 6 )); // skp = [alpha|beta|gamma|rho|tmp1|tmp2] // solver setup magma_dscal( dofs, c_zero, x->dval, 1, queue ); // x = 0 //CHECK( magma_dresidualvec( A, b, *x, &r, nom0, queue)); magma_dcopy( dofs, b.dval, 1, r.dval, 1, queue ); // r = b magma_dcopy( dofs, r.dval, 1, d.dval, 1, queue ); // d = r nom0 = betanom = magma_dnrm2( dofs, r.dval, 1, queue ); nom = nom0 * nom0; // nom = r' * r CHECK( magma_d_spmv( c_one, A, d, c_zero, z, queue )); // z = A d den = MAGMA_D_ABS( magma_ddot( dofs, d.dval, 1, z.dval, 1, queue ) ); // den = d'* z solver_par->init_res = nom0; nomb = magma_dnrm2( dofs, b.dval, 1, queue ); if ( nomb == 0.0 ){ nomb=1.0; } // array on host for the parameters CHECK( magma_dmalloc_cpu( &skp_h, 6 )); alpha = rho = gamma = tmp1 = c_one; beta = magma_ddot( dofs, r.dval, 1, r.dval, 1, queue ); skp_h[0]=alpha; skp_h[1]=beta; skp_h[2]=gamma; skp_h[3]=rho; skp_h[4]=tmp1; skp_h[5]=MAGMA_D_MAKE(nom, 0.0); magma_dsetvector( 6, skp_h, 1, skp, 1, queue ); if( nom0 < solver_par->atol || nom0/nomb < solver_par->rtol ){ info = MAGMA_SUCCESS; goto cleanup; } solver_par->final_res = solver_par->init_res; solver_par->iter_res = solver_par->init_res; if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = (real_Double_t) nom0; solver_par->timing[0] = 0.0; } // check positive definite if (den <= 0.0) { info = MAGMA_NONSPD; goto cleanup; } //Chronometry real_Double_t tempo1, tempo2; tempo1 = magma_sync_wtime( queue ); solver_par->numiter = 0; solver_par->spmv_count = 0; // start iteration do { solver_par->numiter++; // computes SpMV and dot product CHECK( magma_dcgmerge_spmv1( A, d1, d2, d.dval, z.dval, skp, queue )); solver_par->spmv_count++; // updates x, r, computes scalars and updates d CHECK( magma_dcgmerge_xrbeta( dofs, d1, d2, x->dval, r.dval, d.dval, z.dval, skp, queue )); // check stopping criterion (asynchronous copy) magma_dgetvector( 1 , skp+1, 1, skp_h+1, 1, queue ); betanom = sqrt(MAGMA_D_ABS(skp_h[1])); if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( betanom < solver_par->atol || betanom/nomb < solver_par->rtol ) { break; } } while ( solver_par->numiter+1 <= solver_par->maxiter ); tempo2 = magma_sync_wtime( queue ); solver_par->runtime = (real_Double_t) tempo2-tempo1; double residual; 
CHECK( magma_dresidualvec( A, b, *x, &r, &residual, queue)); solver_par->iter_res = betanom; solver_par->final_res = residual; if ( solver_par->numiter < solver_par->maxiter ) { info = MAGMA_SUCCESS; } else if ( solver_par->init_res > solver_par->final_res ) { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } info = MAGMA_SLOW_CONVERGENCE; if( solver_par->iter_res < solver_par->atol || solver_par->iter_res/solver_par->init_res < solver_par->rtol ){ info = MAGMA_SUCCESS; } } else { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } info = MAGMA_DIVERGENCE; } cleanup: magma_dmfree(&r, queue ); magma_dmfree(&z, queue ); magma_dmfree(&d, queue ); magma_dmfree(&B, queue ); magma_dmfree(&C, queue ); magma_free( d1 ); magma_free( d2 ); magma_free( skp ); magma_free_cpu( skp_h ); solver_par->info = info; return info; } /* magma_dcg_merge */
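/*
   The post-loop bookkeeping above can be read as the following small
   classifier (illustrative only; the enum stands in for the MAGMA status
   codes MAGMA_SUCCESS, MAGMA_SLOW_CONVERGENCE and MAGMA_DIVERGENCE):
*/
enum solver_outcome { OUTCOME_SUCCESS, OUTCOME_SLOW_CONVERGENCE, OUTCOME_DIVERGENCE };

static enum solver_outcome classify( int numiter, int maxiter,
                                     double init_res, double iter_res,
                                     double final_res,
                                     double atol, double rtol )
{
    if ( numiter < maxiter )
        return OUTCOME_SUCCESS;                  /* tolerance met inside the loop */
    if ( init_res > final_res ) {                /* made progress, ran out of iterations */
        if ( iter_res < atol || iter_res/init_res < rtol )
            return OUTCOME_SUCCESS;
        return OUTCOME_SLOW_CONVERGENCE;
    }
    return OUTCOME_DIVERGENCE;                   /* residual did not decrease */
}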
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; double error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *d_A, *h_R, *tau, *dT, *h_work, tmp[1]; magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn, nb, size; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}, ISEED2[4]; magma_opts opts; parse_opts( argc, argv, &opts ); magma_int_t status = 0; double tol; opts.lapack |= (opts.version == 2 && opts.check == 2); // check (-c2) implies lapack (-l) if ( opts.version != 2 && opts.check == 1 ) { printf( " ===================================================================\n" " NOTE: -c check for this version will be wrong\n" " because tester ignores the special structure of MAGMA dgeqrf results.\n" " We reset it to -c2.\n" " ===================================================================\n\n"); opts.check = 2; } if ( opts.version == 2 ) { if ( opts.check == 1 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R-Q'A||_1 / (M*||A||_1*eps) ||I-Q'Q||_1 / (M*eps)\n"); printf("=========================================================================================================\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("=======================================================================\n"); } tol = 1.0; } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||Ax-b||_F/(N*||A||_F*||x||_F)\n"); printf("====================================================================================\n"); tol = opts.tolerance * lapackf77_dlamch("E"); } for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_DGEQRF( M, N ) / 1e9; // query for workspace size lwork = -1; lapackf77_dgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); TESTING_MALLOC_CPU( tau, double, min_mn ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_CPU( h_work, double, lwork ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); /* Initialize the matrix */ for ( int j=0; j<4; j++ ) ISEED2[j] = ISEED[j]; // saving seeds lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_dsetmatrix( M, N, h_R, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); if ( opts.version == 2 ) { magma_dgeqrf2_gpu( M, N, d_A, ldda, tau, &info); } else { nb = magma_get_dgeqrf_nb( M ); size = (2*min(M, N) + (N+31)/32*32 )*nb; TESTING_MALLOC_DEV( dT, double, size ); if ( opts.version == 3 ) { magma_dgeqrf3_gpu( M, N, d_A, ldda, tau, dT, &info); } else { magma_dgeqrf_gpu( M, N, d_A, ldda, tau, dT, &info); } } gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ double *tau2; TESTING_MALLOC_CPU( tau2, double, min_mn ); cpu_time = magma_wtime();
lapackf77_dgeqrf(&M, &N, h_A, &lda, tau2, h_work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE_CPU( tau2 ); } if ( opts.check == 1 ) { /* ===================================================================== Check the result =================================================================== */ magma_int_t lwork = n2+N; double *h_W1, *h_W2, *h_W3; double *h_RW, results[2]; magma_dgetmatrix( M, N, d_A, ldda, h_R, M ); TESTING_MALLOC_CPU( h_W1, double, n2 ); // Q TESTING_MALLOC_CPU( h_W2, double, n2 ); // R TESTING_MALLOC_CPU( h_W3, double, lwork ); // WORK TESTING_MALLOC_CPU( h_RW, double, M ); // RWORK lapackf77_dlarnv( &ione, ISEED2, &n2, h_A ); lapackf77_dqrt02( &M, &N, &min_mn, h_A, h_R, h_W1, h_W2, &lda, tau, h_W3, &lwork, h_RW, results ); if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, results[0], results[1] ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, gpu_perf, gpu_time, results[0], results[1] ); } // todo also check results[1] < tol? printf(" %s\n", (results[0] < tol ? "ok" : "failed")); status += ! (results[0] < tol); TESTING_FREE_CPU( h_W1 ); TESTING_FREE_CPU( h_W2 ); TESTING_FREE_CPU( h_W3 ); TESTING_FREE_CPU( h_RW ); } else if ( opts.check == 2 ) { if ( opts.version == 2 ) { /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_dgetmatrix( M, N, d_A, ldda, h_R, M ); error = lapackf77_dlange("f", &M, &N, h_A, &lda, work); blasf77_daxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_dlange("f", &M, &N, h_R, &lda, work) / error; if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e", (int) M, (int) N, gpu_perf, gpu_time, error ); } printf(" %s\n", (error < tol ? "ok" : "failed")); status += ! 
(error < tol); } else if ( M >= N ) { magma_int_t lwork; double *x, *b, *d_B, *hwork; const double c_zero = MAGMA_D_ZERO; const double c_one = MAGMA_D_ONE; const double c_neg_one = MAGMA_D_NEG_ONE; const magma_int_t ione = 1; // initialize RHS, b = A*random TESTING_MALLOC_CPU( x, double, N ); TESTING_MALLOC_CPU( b, double, M ); lapackf77_dlarnv( &ione, ISEED, &N, x ); blasf77_dgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_zero, b, &ione ); // copy to GPU TESTING_MALLOC_DEV( d_B, double, M ); magma_dsetvector( M, b, 1, d_B, 1 ); if ( opts.version == 1 ) { // allocate hwork magma_dgeqrs_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, tmp, -1, &info ); lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); TESTING_MALLOC_CPU( hwork, double, lwork ); // solve linear system magma_dgeqrs_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, hwork, lwork, &info ); if (info != 0) printf("magma_dgeqrs returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE_CPU( hwork ); } else { // allocate hwork magma_dgeqrs3_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, tmp, -1, &info ); lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); TESTING_MALLOC_CPU( hwork, double, lwork ); // solve linear system magma_dgeqrs3_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, hwork, lwork, &info ); if (info != 0) printf("magma_dgeqrs3 returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE_CPU( hwork ); } magma_dgetvector( N, d_B, 1, x, 1 ); // compute r = Ax - b, saved in b lapackf77_dlarnv( &ione, ISEED2, &n2, h_A ); blasf77_dgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_neg_one, b, &ione ); // compute residual |Ax - b| / (n*|A|*|x|) double norm_x, norm_A, norm_r, work[1]; norm_A = lapackf77_dlange( "F", &M, &N, h_A, &lda, work ); norm_r = lapackf77_dlange( "F", &M, &ione, b, &M, work ); norm_x = lapackf77_dlange( "F", &N, &ione, x, &N, work ); TESTING_FREE_CPU( x ); TESTING_FREE_CPU( b ); TESTING_FREE_DEV( d_B ); error = norm_r / (N * norm_A * norm_x); if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e", (int) M, (int) N, gpu_perf, gpu_time, error ); } printf(" %s\n", (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) --- ", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- ", (int) M, (int) N, gpu_perf, gpu_time); } printf("%s\n", (opts.check != 0 ? " (error check only for M >= N)" : "")); } } else { if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) ---\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, gpu_time); } } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); if ( opts.version != 2 ) TESTING_FREE_DEV( dT ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
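/*
   The solve-based check in the tester above boils down to the scaled residual
   |A*x - b|_F / (N * |A|_F * |x|_F). A self-contained restatement with the
   norms written out (column-major A with leading dimension lda; the helper
   names are invented for this sketch):
*/
#include <math.h>

static double frob( const double *X, int m, int n, int ld )
{
    double s = 0.0;
    for (int j = 0; j < n; ++j)
        for (int i = 0; i < m; ++i)
            s += X[i + j*ld] * X[i + j*ld];
    return sqrt( s );
}

/* overwrites b with A*x - b and returns |A*x - b| / (n * |A| * |x|) */
static double qr_solve_residual( int m, int n, const double *A, int lda,
                                 const double *x, double *b )
{
    for (int i = 0; i < m; ++i) {
        double s = 0.0;
        for (int j = 0; j < n; ++j)
            s += A[i + j*lda] * x[j];
        b[i] = s - b[i];
    }
    return frob( b, m, 1, m ) / ( n * frob( A, m, n, lda ) * frob( x, n, 1, n ) );
}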
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrf */ int main( int argc, char** argv) { TESTING_INIT(); const double d_neg_one = MAGMA_D_NEG_ONE; const double d_one = MAGMA_D_ONE; const double c_neg_one = MAGMA_D_NEG_ONE; const double c_one = MAGMA_D_ONE; const double c_zero = MAGMA_D_ZERO; const magma_int_t ione = 1; real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; double Anorm, error=0, error2=0; double *h_A, *h_R, *tau, *h_work, tmp[1]; magmaDouble_ptr d_A, dT; magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn, nb, size; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; parse_opts( argc, argv, &opts ); magma_int_t status = 0; double tol = opts.tolerance * lapackf77_dlamch("E"); printf( "version %d\n", (int) opts.version ); if ( opts.version == 2 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |R - Q^H*A| |I - Q^H*Q|\n"); printf("===============================================================================\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |b - A*x|\n"); printf("================================================================\n"); } for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_DGEQRF( M, N ) / 1e9; // query for workspace size lwork = -1; lapackf77_dgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); TESTING_MALLOC_CPU( tau, double, min_mn ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_CPU( h_work, double, lwork ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_dsetmatrix( M, N, h_R, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); if ( opts.version == 2 ) { // LAPACK compliant arguments magma_dgeqrf2_gpu( M, N, d_A, ldda, tau, &info ); } else { nb = magma_get_dgeqrf_nb( M ); size = (2*min(M, N) + (N+31)/32*32 )*nb; TESTING_MALLOC_DEV( dT, double, size ); if ( opts.version == 1 ) { // stores dT, V blocks have zeros, R blocks inverted & stored in dT magma_dgeqrf_gpu( M, N, d_A, ldda, tau, dT, &info ); } #ifdef HAVE_CUBLAS else if ( opts.version == 3 ) { // stores dT, V blocks have zeros, R blocks stored in dT magma_dgeqrf3_gpu( M, N, d_A, ldda, tau, dT, &info ); } #endif else { printf( "Unknown version %d\n", (int) opts.version ); exit(1); } } gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.check && opts.version == 2 ) { /* ===================================================================== Check the result, following zqrt01 except using the reduced Q. This works for any M,N (square, tall, wide). Only for version 2, which has LAPACK compliant output. 
=================================================================== */ magma_dgetmatrix( M, N, d_A, ldda, h_R, lda ); magma_int_t ldq = M; magma_int_t ldr = min_mn; double *Q, *R; double *work; TESTING_MALLOC_CPU( Q, double, ldq*min_mn ); // M by K TESTING_MALLOC_CPU( R, double, ldr*N ); // K by N TESTING_MALLOC_CPU( work, double, min_mn ); // generate M by K matrix Q, where K = min(M,N) lapackf77_dlacpy( "Lower", &M, &min_mn, h_R, &lda, Q, &ldq ); lapackf77_dorgqr( &M, &min_mn, &min_mn, Q, &ldq, tau, h_work, &lwork, &info ); assert( info == 0 ); // copy K by N matrix R lapackf77_dlaset( "Lower", &min_mn, &N, &c_zero, &c_zero, R, &ldr ); lapackf77_dlacpy( "Upper", &min_mn, &N, h_R, &lda, R, &ldr ); // error = || R - Q^H*A || / (N * ||A||) blasf77_dgemm( "Conj", "NoTrans", &min_mn, &N, &M, &c_neg_one, Q, &ldq, h_A, &lda, &c_one, R, &ldr ); Anorm = lapackf77_dlange( "1", &M, &N, h_A, &lda, work ); error = lapackf77_dlange( "1", &min_mn, &N, R, &ldr, work ); if ( N > 0 && Anorm > 0 ) error /= (N*Anorm); // set R = I (K by K identity), then R = I - Q^H*Q // error = || I - Q^H*Q || / N lapackf77_dlaset( "Upper", &min_mn, &min_mn, &c_zero, &c_one, R, &ldr ); blasf77_dsyrk( "Upper", "Conj", &min_mn, &M, &d_neg_one, Q, &ldq, &d_one, R, &ldr ); error2 = lapackf77_dlansy( "1", "Upper", &min_mn, R, &ldr, work ); if ( N > 0 ) error2 /= N; TESTING_FREE_CPU( Q ); Q = NULL; TESTING_FREE_CPU( R ); R = NULL; TESTING_FREE_CPU( work ); work = NULL; } else if ( opts.check && M >= N ) { /* ===================================================================== Check the result by solving consistent linear system, A*x = b. Only for versions 1 & 3 with M >= N. =================================================================== */ magma_int_t lwork; double *x, *b, *hwork; magmaDouble_ptr d_B; const double c_zero = MAGMA_D_ZERO; const double c_one = MAGMA_D_ONE; const double c_neg_one = MAGMA_D_NEG_ONE; const magma_int_t ione = 1; // initialize RHS, b = A*random TESTING_MALLOC_CPU( x, double, N ); TESTING_MALLOC_CPU( b, double, M ); lapackf77_dlarnv( &ione, ISEED, &N, x ); blasf77_dgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_zero, b, &ione ); // copy to GPU TESTING_MALLOC_DEV( d_B, double, M ); magma_dsetvector( M, b, 1, d_B, 1 ); if ( opts.version == 1 ) { // allocate hwork magma_dgeqrs_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, tmp, -1, &info ); lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); TESTING_MALLOC_CPU( hwork, double, lwork ); // solve linear system magma_dgeqrs_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, hwork, lwork, &info ); if (info != 0) printf("magma_dgeqrs returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE_CPU( hwork ); } #ifdef HAVE_CUBLAS else if ( opts.version == 3 ) { // allocate hwork magma_dgeqrs3_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, tmp, -1, &info ); lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); TESTING_MALLOC_CPU( hwork, double, lwork ); // solve linear system magma_dgeqrs3_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, hwork, lwork, &info ); if (info != 0) printf("magma_dgeqrs3 returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE_CPU( hwork ); } #endif else { printf( "Unknown version %d\n", (int) opts.version ); exit(1); } magma_dgetvector( N, d_B, 1, x, 1 ); // compute r = Ax - b, saved in b blasf77_dgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_neg_one, b, &ione ); // compute residual |Ax - b| / (n*|A|*|x|) double norm_x, norm_A, norm_r, work[1]; norm_A = lapackf77_dlange( "F", &M, &N, h_A, &lda, work ); norm_r = 
lapackf77_dlange( "F", &M, &ione, b, &M, work ); norm_x = lapackf77_dlange( "F", &N, &ione, x, &N, work ); TESTING_FREE_CPU( x ); TESTING_FREE_CPU( b ); TESTING_FREE_DEV( d_B ); error = norm_r / (N * norm_A * norm_x); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_dgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Print performance and error. =================================================================== */ printf("%5d %5d ", (int) M, (int) N ); if ( opts.lapack ) { printf( "%7.2f (%7.2f)", cpu_perf, cpu_time ); } else { printf(" --- ( --- )" ); } printf( " %7.2f (%7.2f) ", gpu_perf, gpu_time ); if ( opts.check ) { if ( opts.version == 2 ) { bool okay = (error < tol && error2 < tol); status += ! okay; printf( "%11.2e %11.2e %s\n", error, error2, (okay ? "ok" : "failed") ); } else if ( M >= N ) { bool okay = (error < tol); status += ! okay; printf( "%10.2e %s\n", error, (okay ? "ok" : "failed") ); } else { printf( "(error check only for M >= N)\n" ); } } else { printf( " ---\n" ); } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); if ( opts.version != 2 ) TESTING_FREE_DEV( dT ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/** Purpose ------- DLATRD reduces NB rows and columns of a real symmetric matrix A to symmetric tridiagonal form by an orthogonal similarity transformation Q' * A * Q, and returns the matrices V and W which are needed to apply the transformation to the unreduced part of A. If UPLO = MagmaUpper, DLATRD reduces the last NB rows and columns of a matrix, of which the upper triangle is supplied; if UPLO = MagmaLower, DLATRD reduces the first NB rows and columns of a matrix, of which the lower triangle is supplied. This is an auxiliary routine called by DSYTRD. Arguments --------- @param[in] uplo magma_uplo_t Specifies whether the upper or lower triangular part of the symmetric matrix A is stored: - = MagmaUpper: Upper triangular - = MagmaLower: Lower triangular @param[in] n INTEGER The order of the matrix A. @param[in] nb INTEGER The number of rows and columns to be reduced. @param[in,out] A DOUBLE_PRECISION array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading n-by-n upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading n-by-n lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit: - if UPLO = MagmaUpper, the last NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements above the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; - if UPLO = MagmaLower, the first NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements below the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] e DOUBLE_PRECISION array, dimension (N-1) If UPLO = MagmaUpper, E(n-nb:n-1) contains the superdiagonal elements of the last NB columns of the reduced matrix; if UPLO = MagmaLower, E(1:nb) contains the subdiagonal elements of the first NB columns of the reduced matrix. @param[out] tau DOUBLE_PRECISION array, dimension (N-1) The scalar factors of the elementary reflectors, stored in TAU(n-nb:n-1) if UPLO = MagmaUpper, and in TAU(1:nb) if UPLO = MagmaLower. See Further Details. @param[out] W DOUBLE_PRECISION array, dimension (LDW,NB) The n-by-nb matrix W required to update the unreduced part of A. @param[in] ldw INTEGER The leading dimension of the array W. LDW >= max(1,N). Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n) H(n-1) . . . H(n-nb+1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i), and tau in TAU(i-1). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i), and tau in TAU(i). 
The elements of the vectors v together form the n-by-nb matrix V which is needed, with W, to apply the transformation to the unreduced part of the matrix, using a symmetric rank-2k update of the form: A := A - V*W' - W*V'. The contents of A on exit are illustrated by the following examples with n = 5 and nb = 2: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( a a a v4 v5 ) ( d ) ( a a v4 v5 ) ( 1 d ) ( a 1 v5 ) ( v1 1 a ) ( d 1 ) ( v1 v2 a a ) ( d ) ( v1 v2 a a a ) where d denotes a diagonal element of the reduced matrix, a denotes an element of the original matrix that is unchanged, and vi denotes an element of the vector defining H(i). @ingroup magma_dsyev_aux ********************************************************************/ extern "C" double magma_dlatrd_mgpu(magma_int_t num_gpus, magma_uplo_t uplo, magma_int_t n0, magma_int_t n, magma_int_t nb, magma_int_t nb0, double *A, magma_int_t lda, double *e, double *tau, double *W, magma_int_t ldw, double **dA, magma_int_t ldda, magma_int_t offset, double **dW, magma_int_t lddw, double *dwork[MagmaMaxGPUs], magma_int_t ldwork, magma_int_t k, double *dx[MagmaMaxGPUs], double *dy[MagmaMaxGPUs], double *work, magma_queue_t stream[][10], double *times) { #define A(i, j) (A + (j)*lda + (i)) #define W(i, j) (W + (j)*ldw + (i)) #define dA(id, i, j) (dA[(id)] + ((j)+loffset)*ldda + (i) + offset) #define dW(id, i, j) (dW[(id)] + (j) *lddw + (i)) #define dW1(id, i, j) (dW[(id)] + ((j)+nb) *lddw + (i)) double mv_time = 0.0; magma_int_t i; #ifndef MAGMABLAS_DSYMV_MGPU magma_int_t loffset = nb0*((offset/nb0)/num_gpus); #endif double c_neg_one = MAGMA_D_NEG_ONE; double c_one = MAGMA_D_ONE; double c_zero = MAGMA_D_ZERO; double value = MAGMA_D_ZERO; magma_int_t id, idw, i_one = 1; //magma_int_t kk; magma_int_t ione = 1; magma_int_t i_n, i_1, iw; double alpha; double *dx2[MagmaMaxGPUs]; double *f; magma_dmalloc_cpu( &f, n ); if (n <= 0) { return 0; } //#define PROFILE_SYMV #ifdef PROFILE_SYMV magma_event_t start, stop; float etime; magma_timestr_t cpu_start, cpu_end; magma_setdevice(0); magma_event_create( &start ); magma_event_create( &stop ); #endif if (uplo == MagmaUpper) { /* Reduce last NB columns of upper triangle */ for (i = n-1; i >= n - nb ; --i) { i_1 = i + 1; i_n = n - i - 1; iw = i - n + nb; if (i < n-1) { /* Update A(1:i,i) */ double wii = *W(i, iw+1); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i_one, &wii, &ldw); #endif wii = -wii; blasf77_daxpy(&i_1, &wii, A(0, i+1), &i_one, A(0, i), &ione); wii = *A(i, i+1); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i_one, &wii, &ldw); #endif wii = -wii; blasf77_daxpy(&i_1, &wii, W(0, iw+1), &i_one, A(0, i), &ione); } if (i > 0) { /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */ alpha = *A(i-1, i); lapackf77_dlarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]); e[i-1] = MAGMA_D_REAL( alpha ); *A(i-1,i) = MAGMA_D_MAKE( 1, 0 ); for( id=0; id < num_gpus; id++ ) { magma_setdevice(id); dx2[id] = dW1(id, 0, iw); magma_dsetvector_async( n, A(0,i), 1, dW1(id, 0, iw), 1, stream[id][0]); #ifndef MAGMABLAS_DSYMV_MGPU magma_dsetvector_async( i, A(0,i), 1, dx[id], 1, stream[id][0] ); #endif } magmablas_dsymv_mgpu(num_gpus, k, MagmaUpper, i, nb0, c_one, dA, ldda, 0, dx2, ione, c_zero, dy, ione, dwork, ldwork, work, W(0, iw), stream ); if (i < n-1) { blasf77_dgemv(MagmaTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); } /* overlap update */ if ( i < n-1 && i-1 >= n - nb ) { magma_int_t im1_1 = i_1 - 1; magma_int_t im1 = i-1; 
/* Update A(1:i,i) */ #if defined(PRECISION_z) || defined(PRECISION_c) magma_int_t im1_n = i_n + 1; lapackf77_dlacgv(&im1_n, W(im1, iw+1), &ldw); #endif blasf77_dgemv("No transpose", &im1_1, &i_n, &c_neg_one, A(0, i+1), &lda, W(im1, iw+1), &ldw, &c_one, A(0, i-1), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&im1_n, W(im1, iw+1), &ldw); lapackf77_dlacgv(&im1_n, A(im1, i +1), &lda); #endif blasf77_dgemv("No transpose", &im1_1, &i_n, &c_neg_one, W(0, iw+1), &ldw, A(im1, i+1), &lda, &c_one, A(0, i-1), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&im1_n, A(im1, i+1), &lda); #endif } // 3. Here is where we need it // TODO find the right place magmablas_dsymv_sync(num_gpus, k, i, work, W(0, iw), stream ); if (i < n-1) { blasf77_dgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); blasf77_dgemv(MagmaTransStr, &i, &i_n, &c_one, A(0, i+1), &lda, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); blasf77_dgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); } blasf77_dscal(&i, &tau[i - 1], W(0, iw), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_ddot_sub( i, W(0,iw), ione, A(0,i), ione, &value ); #else value = cblas_ddot( i, W(0,iw), ione, A(0,i), ione ); #endif alpha = tau[i - 1] * -.5f * value; blasf77_daxpy(&i, &alpha, A(0, i), &ione, W(0, iw), &ione); for( id=0; id < num_gpus; id++ ) { magma_setdevice(id); if ( k > 1 ) { magma_dsetvector_async( n, W(0,iw), 1, dW(id, 0, iw), 1, stream[id][1] ); } else { magma_dsetvector_async( n, W(0,iw), 1, dW(id, 0, iw), 1, stream[id][0] ); } } } } } else { /* Reduce first NB columns of lower triangle */ for (i = 0; i < nb; ++i) { /* Update A(i:n,i) */ i_n = n - i; idw = ((offset+i)/nb)%num_gpus; if ( i > 0 ) { trace_cpu_start( 0, "gemv", "gemv" ); double wii = *W(i, i-1); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i_one, &wii, &ldw); #endif wii = -wii; blasf77_daxpy( &i_n, &wii, A(i, i-1), &ione, A(i, i), &ione); wii = *A(i, i-1); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i_one, &wii, &lda); #endif wii = -wii; blasf77_daxpy( &i_n, &wii, W(i, i-1), &ione, A(i, i), &ione); } if (i < n-1) { /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */ i_n = n - i - 1; trace_cpu_start( 0, "larfg", "larfg" ); alpha = *A(i+1, i); #ifdef PROFILE_SYMV cpu_start = get_current_time(); #endif lapackf77_dlarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]); #ifdef PROFILE_SYMV cpu_end = get_current_time(); times[0] += GetTimerValue(cpu_start,cpu_end)/1000.0; #endif e[i] = MAGMA_D_REAL( alpha ); *A(i+1,i) = MAGMA_D_MAKE( 1, 0 ); trace_cpu_end( 0 ); /* Compute W(i+1:n,i) */ // 1. 
Send the block reflector A(i+1:n,i) to the GPU //trace_gpu_start( idw, 0, "comm", "comm1" ); #ifndef MAGMABLAS_DSYMV_MGPU magma_setdevice(idw); magma_dsetvector( i_n, A(i+1,i), 1, dA(idw, i+1, i), 1 ); #endif for( id=0; id < num_gpus; id++ ) { magma_setdevice(id); trace_gpu_start( id, 0, "comm", "comm" ); #ifdef MAGMABLAS_DSYMV_MGPU dx2[id] = dW1(id, 0, i)-offset; #else dx2[id] = dx[id]; magma_dsetvector( i_n, A(i+1,i), 1, dx[id], 1 ); #endif magma_dsetvector_async( n, A(0,i), 1, dW1(id, 0, i), 1, stream[id][0] ); trace_gpu_end( id, 0 ); } /* mat-vec on multiple GPUs */ #ifdef PROFILE_SYMV magma_setdevice(0); magma_event_record(start, stream[0][0]); #endif magmablas_dsymv_mgpu(num_gpus, k, MagmaLower, i_n, nb0, c_one, dA, ldda, offset+i+1, dx2, ione, c_zero, dy, ione, dwork, ldwork, work, W(i+1,i), stream ); #ifdef PROFILE_SYMV magma_setdevice(0); magma_event_record(stop, stream[0][0]); #endif trace_cpu_start( 0, "gemv", "gemv" ); blasf77_dgemv(MagmaTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw, A(i+1, i), &ione, &c_zero, W(0, i), &ione); blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda, W(0, i), &ione, &c_zero, f, &ione); blasf77_dgemv(MagmaTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda, A(i+1, i), &ione, &c_zero, W(0, i), &ione); trace_cpu_end( 0 ); /* overlap update */ if ( i > 0 && i+1 < n ) { magma_int_t ip1 = i+1; trace_cpu_start( 0, "gemv", "gemv" ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i, W(ip1, 0), &ldw); #endif blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, A(ip1, 0), &lda, W(ip1, 0), &ldw, &c_one, A(ip1, ip1), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i, W(ip1, 0), &ldw); lapackf77_dlacgv(&i, A(ip1, 0), &lda); #endif blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, W(ip1, 0), &ldw, A(ip1, 0), &lda, &c_one, A(ip1, ip1), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i, A(ip1, 0), &lda); #endif trace_cpu_end( 0 ); } /* synchronize */ magmablas_dsymv_sync(num_gpus, k, i_n, work, W(i+1,i), stream ); #ifdef PROFILE_SYMV cudaEventElapsedTime(&etime, start, stop); mv_time += (etime/1000.0); times[1+(i_n/(n0/10))] += (etime/1000.0); #endif trace_cpu_start( 0, "axpy", "axpy" ); if (i != 0) blasf77_daxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione); blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw, W(0, i), &ione, &c_one, W(i+1, i), &ione); blasf77_dscal(&i_n, &tau[i], W(i+1,i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_ddot_sub( i_n, W(i+1,i), ione, A(i+1,i), ione, &value ); #else value = cblas_ddot( i_n, W(i+1,i), ione, A(i+1,i), ione ); #endif alpha = tau[i]* -.5f * value; blasf77_daxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione); trace_cpu_end( 0 ); for( id=0; id < num_gpus; id++ ) { magma_setdevice(id); if ( k > 1 ) { magma_dsetvector_async( n, W(0,i), 1, dW(id, 0, i), 1, stream[id][1] ); } else { magma_dsetvector_async( n, W(0,i), 1, dW(id, 0, i), 1, stream[id][0] ); } } } } } #ifdef PROFILE_SYMV magma_setdevice(0); magma_event_destroy( start ); magma_event_destroy( stop ); #endif for( id=0; id < num_gpus; id++ ) { magma_setdevice(id); if ( k > 1 ) magma_queue_sync(stream[id][1]); } magma_free_cpu(f); return mv_time; } /* magma_dlatrd_mgpu */
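/*
   Every reflector assembled above has the form H = I - tau*v*v' and is never
   formed explicitly; applying it to a vector is one dot product and one AXPY.
   A minimal standalone sketch (illustrative only):
*/
static void apply_reflector( int n, double tau, const double *v, double *x )
{
    double vtx = 0.0;
    for (int i = 0; i < n; ++i)
        vtx += v[i] * x[i];               /* v'x */
    for (int i = 0; i < n; ++i)
        x[i] -= tau * v[i] * vtx;         /* x = (I - tau*v*v') x */
}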
extern "C" magma_int_t magma_dlahr2(magma_int_t n, magma_int_t k, magma_int_t nb, double *da, double *dv, double *a, magma_int_t lda, double *tau, double *t, magma_int_t ldt, double *y, magma_int_t ldy) { /* -- MAGMA auxiliary routine (version 1.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= DLAHR2 reduces the first NB columns of a real general n-BY-(n-k+1) matrix A so that elements below the k-th subdiagonal are zero. The reduction is performed by an orthogonal similarity transformation Q' * A * Q. The routine returns the matrices V and T which determine Q as a block reflector I - V*T*V', and also the matrix Y = A * V. This is an auxiliary routine called by DGEHRD. Arguments ========= N (input) INTEGER The order of the matrix A. K (input) INTEGER The offset for the reduction. Elements below the k-th subdiagonal in the first NB columns are reduced to zero. K < N. NB (input) INTEGER The number of columns to be reduced. DA (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDA,N-K+1) On entry, the n-by-(n-k+1) general matrix A. On exit, the elements on and above the k-th subdiagonal in the first NB columns are overwritten with the corresponding elements of the reduced matrix; the elements below the k-th subdiagonal, with the array TAU, represent the matrix Q as a product of elementary reflectors. The other columns of A are unchanged. See Further Details. DV (output) DOUBLE_PRECISION array on the GPU, dimension (N, NB) On exit this contains the Householder vectors of the transformation. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). TAU (output) DOUBLE_PRECISION array, dimension (NB) The scalar factors of the elementary reflectors. See Further Details. T (output) DOUBLE_PRECISION array, dimension (LDT,NB) The upper triangular matrix T. LDT (input) INTEGER The leading dimension of the array T. LDT >= NB. Y (output) DOUBLE_PRECISION array, dimension (LDY,NB) The n-by-nb matrix Y. LDY (input) INTEGER The leading dimension of the array Y. LDY >= N. Further Details =============== The matrix Q is represented as a product of nb elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in A(i+k+1:n,i), and tau in TAU(i). The elements of the vectors v together form the (n-k+1)-by-nb matrix V which is needed, with T and Y, to apply the transformation to the unreduced part of the matrix, using an update of the form: A := (I - V*T*V') * (A - Y*T*V'). The contents of A on exit are illustrated by the following example with n = 7, k = 3 and nb = 2: ( a a a a a ) ( a a a a a ) ( a a a a a ) ( h h a a a ) ( v1 h a a a ) ( v1 v2 a a a ) ( v1 v2 a a a ) where a denotes an element of the original matrix A, h denotes a modified element of the upper Hessenberg matrix H, and vi denotes an element of the vector defining H(i). This implementation follows the hybrid algorithm and notations described in S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. 
===================================================================== */ double c_zero = MAGMA_D_ZERO; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t ldda = lda; magma_int_t c__1 = 1; magma_int_t a_dim1, a_offset, t_dim1, t_offset, y_dim1, y_offset, i__2, i__3; double d__1; magma_int_t i__; double ei; --tau; a_dim1 = lda; a_offset = 1 + a_dim1; a -= a_offset; t_dim1 = ldt; t_offset = 1 + t_dim1; t -= t_offset; y_dim1 = ldy; y_offset = 1 + y_dim1; y -= y_offset; /* Function Body */ if (n <= 1) return 0; for (i__ = 1; i__ <= nb; ++i__) { if (i__ > 1) { /* Update A(K+1:N,I); Update I-th column of A - Y * V' */ i__2 = n - k + 1; i__3 = i__ - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i__3, &a[k+i__-1+a_dim1], &lda); #endif blasf77_dcopy(&i__3, &a[k+i__-1+a_dim1], &lda, &t[nb*t_dim1+1], &c__1); blasf77_dtrmv("u","n","n",&i__3,&t[t_offset], &ldt, &t[nb*t_dim1+1], &c__1); blasf77_dgemv("NO TRANSPOSE", &i__2, &i__3, &c_neg_one, &y[k + y_dim1], &ldy, &t[nb*t_dim1+1], &c__1, &c_one, &a[k+i__*a_dim1],&c__1); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i__3, &a[k+i__-1+a_dim1], &lda); #endif /* Apply I - V * T' * V' to this column (call it b) from the left, using the last column of T as workspace Let V = ( V1 ) and b = ( b1 ) (first I-1 rows) ( V2 ) ( b2 ) where V1 is unit lower triangular w := V1' * b1 */ i__2 = i__ - 1; blasf77_dcopy(&i__2, &a[k+1+i__*a_dim1], &c__1, &t[nb*t_dim1+1], &c__1); blasf77_dtrmv("Lower", MagmaTransStr, "UNIT", &i__2, &a[k + 1 + a_dim1], &lda, &t[nb * t_dim1 + 1], &c__1); /* w := w + V2'*b2 */ i__2 = n - k - i__ + 1; i__3 = i__ - 1; blasf77_dgemv(MagmaTransStr, &i__2, &i__3, &c_one, &a[k + i__ + a_dim1], &lda, &a[k+i__+i__*a_dim1], &c__1, &c_one, &t[nb*t_dim1+1], &c__1); /* w := T'*w */ i__2 = i__ - 1; blasf77_dtrmv("U", MagmaTransStr, "N", &i__2, &t[t_offset], &ldt, &t[nb*t_dim1+1], &c__1); /* b2 := b2 - V2*w */ i__2 = n - k - i__ + 1; i__3 = i__ - 1; blasf77_dgemv("N", &i__2, &i__3, &c_neg_one, &a[k + i__ + a_dim1], &lda, &t[nb*t_dim1+1], &c__1, &c_one, &a[k+i__+i__*a_dim1], &c__1); /* b1 := b1 - V1*w */ i__2 = i__ - 1; blasf77_dtrmv("L","N","U",&i__2,&a[k+1+a_dim1],&lda,&t[nb*t_dim1+1],&c__1); blasf77_daxpy(&i__2, &c_neg_one, &t[nb * t_dim1 + 1], &c__1, &a[k + 1 + i__ * a_dim1], &c__1); a[k + i__ - 1 + (i__ - 1) * a_dim1] = ei; } /* Generate the elementary reflector H(I) to annihilate A(K+I+1:N,I) */ i__2 = n - k - i__ + 1; i__3 = k + i__ + 1; lapackf77_dlarfg(&i__2, &a[k + i__ + i__ * a_dim1], &a[min(i__3,n) + i__ * a_dim1], &c__1, &tau[i__]); ei = a[k + i__ + i__ * a_dim1]; a[k + i__ + i__ * a_dim1] = c_one; /* Compute Y(K+1:N,I) */ i__2 = n - k; i__3 = n - k - i__ + 1; magma_dsetvector( i__3, &a[k + i__ + i__*a_dim1], 1, dv+(i__-1)*(ldda+1), 1 ); magma_dgemv(MagmaNoTrans, i__2+1, i__3, c_one, da -1 + k + i__ * ldda, ldda, dv+(i__-1)*(ldda+1), c__1, c_zero, da-1 + k + (i__-1)*ldda, c__1); i__2 = n - k - i__ + 1; i__3 = i__ - 1; blasf77_dgemv(MagmaTransStr, &i__2, &i__3, &c_one, &a[k + i__ + a_dim1], &lda, &a[k+i__+i__*a_dim1], &c__1, &c_zero, &t[i__*t_dim1+1], &c__1); /* Compute T(1:I,I) */ i__2 = i__ - 1; d__1 = MAGMA_D_NEGATE( tau[i__] ); blasf77_dscal(&i__2, &d__1, &t[i__ * t_dim1 + 1], &c__1); blasf77_dtrmv("U","N","N", &i__2, &t[t_offset], &ldt, &t[i__*t_dim1+1], &c__1); t[i__ + i__ * t_dim1] = tau[i__]; magma_dgetvector( n - k + 1, da-1+ k+(i__-1)*ldda, 1, y+ k + i__*y_dim1, 1 ); } a[k + nb + nb * a_dim1] = ei; return 0; } /* magma_dlahr2 */
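/*
   DLAHR2 returns Q as the block reflector I - V*T*V', so a block of nb
   reflectors is applied with three small matrix products instead of nb rank-1
   updates. A column-major sketch for one vector (illustrative names; T is
   upper triangular nb-by-nb, V is n-by-nb, w is a length-nb workspace):
*/
static void apply_block_reflector( int n, int nb,
                                   const double *V, int ldv,
                                   const double *T, int ldt,
                                   double *x, double *w )
{
    for (int j = 0; j < nb; ++j) {        /* w = V' x */
        double s = 0.0;
        for (int i = 0; i < n; ++i)
            s += V[i + j*ldv] * x[i];
        w[j] = s;
    }
    for (int i = 0; i < nb; ++i) {        /* w = T w, using only T's upper triangle */
        double s = 0.0;
        for (int j = i; j < nb; ++j)
            s += T[i + j*ldt] * w[j];
        w[i] = s;
    }
    for (int i = 0; i < n; ++i) {         /* x = x - V w */
        double s = 0.0;
        for (int j = 0; j < nb; ++j)
            s += V[i + j*ldv] * w[j];
        x[i] -= s;
    }
}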
int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf, cpu_time; double magma_error, cublas_error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t M, N, Xm, Ym, lda, sizeA, sizeX, sizeY; magma_int_t incx = 1; magma_int_t incy = 1; double c_neg_one = MAGMA_D_NEG_ONE; double alpha = MAGMA_D_MAKE( 1.5, -2.3 ); double beta = MAGMA_D_MAKE( -0.6, 0.8 ); double *A, *X, *Y, *Ycublas, *Ymagma; double *dA, *dX, *dY; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("trans = %s\n", lapack_trans_const(opts.transA) ); printf(" M N MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA error CUBLAS error\n"); printf("===================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = ((M+31)/32)*32; gflops = FLOPS_DGEMV( M, N ) / 1e9; if ( opts.transA == MagmaNoTrans ) { Xm = N; Ym = M; } else { Xm = M; Ym = N; } sizeA = lda*N; sizeX = incx*Xm; sizeY = incy*Ym; TESTING_MALLOC_CPU( A, double, sizeA ); TESTING_MALLOC_CPU( X, double, sizeX ); TESTING_MALLOC_CPU( Y, double, sizeY ); TESTING_MALLOC_CPU( Ycublas, double, sizeY ); TESTING_MALLOC_CPU( Ymagma, double, sizeY ); TESTING_MALLOC_DEV( dA, double, sizeA ); TESTING_MALLOC_DEV( dX, double, sizeX ); TESTING_MALLOC_DEV( dY, double, sizeY ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &sizeA, A ); lapackf77_dlarnv( &ione, ISEED, &sizeX, X ); lapackf77_dlarnv( &ione, ISEED, &sizeY, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_dsetmatrix( M, N, A, lda, dA, lda ); magma_dsetvector( Xm, X, incx, dX, incx ); magma_dsetvector( Ym, Y, incy, dY, incy ); cublas_time = magma_sync_wtime( 0 ); cublasDgemv( opts.handle, cublas_trans_const(opts.transA), M, N, &alpha, dA, lda, dX, incx, &beta, dY, incy ); cublas_time = magma_sync_wtime( 0 ) - cublas_time; cublas_perf = gflops / cublas_time; magma_dgetvector( Ym, dY, incy, Ycublas, incy ); /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ magma_dsetvector( Ym, Y, incy, dY, incy ); magma_time = magma_sync_wtime( 0 ); magmablas_dgemv( opts.transA, M, N, alpha, dA, lda, dX, incx, beta, dY, incy ); magma_time = magma_sync_wtime( 0 ) - magma_time; magma_perf = gflops / magma_time; magma_dgetvector( Ym, dY, incy, Ymagma, incy ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_dgemv( lapack_trans_const(opts.transA), &M, &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ blasf77_daxpy( &Ym, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_dlange( "M", &Ym, &ione, Ymagma, &Ym, work ) / Ym; blasf77_daxpy( &Ym, &c_neg_one, Y, &incy, Ycublas, &incy ); cublas_error = 
lapackf77_dlange( "M", &Ym, &ione, Ycublas, &Ym, work ) / Ym; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error, (magma_error < tol && cublas_error < tol ? "ok" : "failed")); status += ! (magma_error < tol && cublas_error < tol); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ycublas ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_dlatrd2(char uplo, magma_int_t n, magma_int_t nb, double *a, magma_int_t lda, double *e, double *tau, double *w, magma_int_t ldw, double *da, magma_int_t ldda, double *dw, magma_int_t lddw, double *dwork, magma_int_t ldwork) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= DLATRD2 reduces NB rows and columns of a real symmetric matrix A to symmetric tridiagonal form by an orthogonal similarity transformation Q' * A * Q, and returns the matrices V and W which are needed to apply the transformation to the unreduced part of A. If UPLO = 'U', DLATRD reduces the last NB rows and columns of a matrix, of which the upper triangle is supplied; if UPLO = 'L', DLATRD reduces the first NB rows and columns of a matrix, of which the lower triangle is supplied. This is an auxiliary routine called by DSYTRD2_GPU. It uses an accelerated HEMV that needs extra memory. Arguments ========= UPLO (input) CHARACTER*1 Specifies whether the upper or lower triangular part of the symmetric matrix A is stored: = 'U': Upper triangular = 'L': Lower triangular N (input) INTEGER The order of the matrix A. NB (input) INTEGER The number of rows and columns to be reduced. A (input/output) DOUBLE_PRECISION array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = 'U', the leading n-by-n upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading n-by-n lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit: if UPLO = 'U', the last NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements above the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; if UPLO = 'L', the first NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements below the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. LDA (input) INTEGER The leading dimension of the array A. LDA >= (1,N). E (output) DOUBLE_PRECISION array, dimension (N-1) If UPLO = 'U', E(n-nb:n-1) contains the superdiagonal elements of the last NB columns of the reduced matrix; if UPLO = 'L', E(1:nb) contains the subdiagonal elements of the first NB columns of the reduced matrix. TAU (output) DOUBLE_PRECISION array, dimension (N-1) The scalar factors of the elementary reflectors, stored in TAU(n-nb:n-1) if UPLO = 'U', and in TAU(1:nb) if UPLO = 'L'. See Further Details. W (output) DOUBLE_PRECISION array, dimension (LDW,NB) The n-by-nb matrix W required to update the unreduced part of A. LDW (input) INTEGER The leading dimension of the array W. LDW >= max(1,N). Further Details =============== If UPLO = 'U', the matrix Q is represented as a product of elementary reflectors Q = H(n) H(n-1) . . . H(n-nb+1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i), and tau in TAU(i-1). If UPLO = 'L', the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(nb). 
Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i), and tau in TAU(i). The elements of the vectors v together form the n-by-nb matrix V which is needed, with W, to apply the transformation to the unreduced part of the matrix, using a symmetric rank-2k update of the form: A := A - V*W' - W*V'. The contents of A on exit are illustrated by the following examples with n = 5 and nb = 2: if UPLO = 'U': if UPLO = 'L': ( a a a v4 v5 ) ( d ) ( a a v4 v5 ) ( 1 d ) ( a 1 v5 ) ( v1 1 a ) ( d 1 ) ( v1 v2 a a ) ( d ) ( v1 v2 a a a ) where d denotes a diagonal element of the reduced matrix, a denotes an element of the original matrix that is unchanged, and vi denotes an element of the vector defining H(i). ===================================================================== */ char uplo_[2] = {uplo, 0}; magma_int_t i; double c_neg_one = MAGMA_D_NEG_ONE; double c_one = MAGMA_D_ONE; double c_zero = MAGMA_D_ZERO; double value = MAGMA_D_ZERO; magma_int_t ione = 1; magma_int_t i_n, i_1, iw; double alpha; double *f; if (n <= 0) { return 0; } magma_queue_t stream; magma_queue_create( &stream ); magma_dmalloc_cpu( &f, n ); assert( f != NULL ); // TODO return error, or allocate outside dlatrd if (lapackf77_lsame(uplo_, "U")) { /* Reduce last NB columns of upper triangle */ for (i = n-1; i >= n - nb ; --i) { i_1 = i + 1; i_n = n - i - 1; iw = i - n + nb; if (i < n-1) { /* Update A(1:i,i) */ #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i_n, W(i, iw+1), &ldw); #endif blasf77_dgemv("No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda, W(i, iw+1), &ldw, &c_one, A(0, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i_n, W(i, iw+1), &ldw); lapackf77_dlacgv(&i_n, A(i, i+1), &ldw); #endif blasf77_dgemv("No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw, A(i, i+1), &lda, &c_one, A(0, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i_n, A(i, i+1), &ldw); #endif } if (i > 0) { /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */ alpha = *A(i-1, i); lapackf77_dlarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]); e[i-1] = MAGMA_D_REAL( alpha ); *A(i-1,i) = MAGMA_D_MAKE( 1, 0 ); /* Compute W(1:i-1,i) */ // 1. Send the block reflector A(0:n-i-1,i) to the GPU magma_dsetvector( i, A(0, i), 1, dA(0, i), 1 ); //#if (GPUSHMEM < 200) //magma_dsymv(MagmaUpper, i, c_one, dA(0, 0), ldda, // dA(0, i), ione, c_zero, dW(0, iw), ione); //#else magmablas_dsymv_work(MagmaUpper, i, c_one, dA(0, 0), ldda, dA(0, i), ione, c_zero, dW(0, iw), ione, dwork, ldwork); //#endif // 2. Start putting the result back (asynchronously) magma_dgetmatrix_async( i, 1, dW(0, iw), lddw, W(0, iw) /*test*/, ldw, stream ); if (i < n-1) { blasf77_dgemv(MagmaTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); } // 3. 
Here is where we need it // TODO find the right place magma_queue_sync( stream ); if (i < n-1) { blasf77_dgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); blasf77_dgemv(MagmaTransStr, &i, &i_n, &c_one, A(0, i+1), &lda, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); blasf77_dgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); } blasf77_dscal(&i, &tau[i - 1], W(0, iw), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_ddot_sub( i, W(0,iw), ione, A(0,i), ione, &value ); #else value = cblas_ddot( i, W(0,iw), ione, A(0,i), ione ); #endif alpha = tau[i - 1] * -0.5f * value; blasf77_daxpy(&i, &alpha, A(0, i), &ione, W(0, iw), &ione); } } } else { /* Reduce first NB columns of lower triangle */ for (i = 0; i < nb; ++i) { /* Update A(i:n,i) */ i_n = n - i; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i, W(i, 0), &ldw); #endif blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda, W(i, 0), &ldw, &c_one, A(i, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i, W(i, 0), &ldw); lapackf77_dlacgv(&i, A(i ,0), &lda); #endif blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw, A(i, 0), &lda, &c_one, A(i, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i, A(i, 0), &lda); #endif if (i < n-1) { /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */ i_n = n - i - 1; alpha = *A(i+1, i); lapackf77_dlarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]); e[i] = MAGMA_D_REAL( alpha ); *A(i+1,i) = MAGMA_D_MAKE( 1, 0 ); /* Compute W(i+1:n,i) */ // 1. Send the block reflector A(i+1:n,i) to the GPU magma_dsetvector( i_n, A(i+1, i), 1, dA(i+1, i), 1 ); //#if (GPUSHMEM < 200) //magma_dsymv(MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero, // dW(i+1, i), ione); //#else magmablas_dsymv_work('L', i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero, dW(i+1, i), ione, dwork, ldwork); //#endif // 2. Start putting the result back (asynchronously) magma_dgetmatrix_async( i_n, 1, dW(i+1, i), lddw, W(i+1, i), ldw, stream ); blasf77_dgemv(MagmaTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw, A(i+1, i), &ione, &c_zero, W(0, i), &ione); blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda, W(0, i), &ione, &c_zero, f, &ione); blasf77_dgemv(MagmaTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda, A(i+1, i), &ione, &c_zero, W(0, i), &ione); // 3. Here is where we need it magma_queue_sync( stream ); if (i!=0) blasf77_daxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione); blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw, W(0, i), &ione, &c_one, W(i+1, i), &ione); blasf77_dscal(&i_n, &tau[i], W(i+1,i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_ddot_sub( i_n, W(i+1,i), ione, A(i+1,i), ione, &value ); #else value = cblas_ddot( i_n, W(i+1,i), ione, A(i+1,i), ione ); #endif alpha = tau[i] * -0.5f * value; blasf77_daxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione); } } } magma_free_cpu(f); magma_queue_destroy( stream ); return 0; } /* dlatrd */
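/* Sketch of how a caller consumes dlatrd2's output (an illustration only, not the actual DSYTRD2_GPU
   source): for uplo = 'L' and the first panel, the reflector vectors V sit in the first nb columns of the
   device copy da (below the diagonal), W sits in the first nb columns of dw, and the trailing
   (n-nb)-by-(n-nb) block is updated with one symmetric rank-2*nb update A22 := A22 - V*W' - W*V' on the
   GPU. Column-major offsets with leading dimensions ldda/lddw are assumed; magma_dsyr2k is called with the
   same char-style arguments used elsewhere in this document. */
{
    double c_one = MAGMA_D_ONE, c_neg_one = MAGMA_D_NEG_ONE;
    magma_dsyr2k( 'L', 'N', n-nb, nb,
                  c_neg_one, da + nb,           ldda,    // V   = dA(nb,0)
                             dw + nb,           lddw,    // W   = dW(nb,0)
                  c_one,     da + nb + nb*ldda, ldda );  // A22 = dA(nb,nb)
}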
/** Purpose ------- DGERBT solves a system of linear equations A * X = B where A is a general n-by-n matrix and X and B are n-by-nrhs matrices. A Random Butterfly Transformation is applied to A and B, then the LU decomposition with no pivoting is used to factor A as A = L * U, where L is unit lower triangular, and U is upper triangular. The factored form of A is then used to solve the system of equations A * X = B. This is a batched version that solves batchCount matrices in parallel. dA and dB become arrays of pointers with one entry per matrix. Arguments --------- @param[in] gen magma_bool_t - = MagmaTrue: new matrices are generated for U and V - = MagmaFalse: matrices U and V given as parameter are used @param[in] n INTEGER The order of the matrix A. n >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. nrhs >= 0. @param[in,out] dA_array Array of pointers, dimension (batchCount). Each is a DOUBLE PRECISION array on the GPU, dimension (LDDA,N). On entry, each pointer is an N-by-N matrix to be factored. On exit, the factors L and U from the factorization A = L*U; the unit diagonal elements of L are not stored. @param[in] ldda INTEGER The leading dimension of each array A. LDDA >= max(1,N). @param[in,out] dB_array Array of pointers, dimension (batchCount). Each is a DOUBLE PRECISION array on the GPU, dimension (LDDB,NRHS). On entry, each pointer is a right hand side matrix B. On exit, each pointer is the solution matrix X. @param[in] lddb INTEGER The leading dimension of each array B. LDDB >= max(1,N). @param[in,out] U DOUBLE PRECISION array, dimension (2,n) Random butterfly matrix; if gen = MagmaTrue, U is generated and returned as output, else the U given as input is used. CPU memory @param[in,out] V DOUBLE PRECISION array, dimension (2,n) Random butterfly matrix; if gen = MagmaTrue, V is generated and returned as output, else the V given as input is used. CPU memory @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a failed memory allocation. @param[in] batchCount INTEGER The number of matrices to operate on. @param[in] queue magma_queue_t Queue to execute in. @ingroup magma_dgesv_comp ********************************************************************/ extern "C" magma_int_t magma_dgerbt_batched( magma_bool_t gen, magma_int_t n, magma_int_t nrhs, double **dA_array, magma_int_t ldda, double **dB_array, magma_int_t lddb, double *U, double *V, magma_int_t *info, magma_int_t batchCount, magma_queue_t queue) { double *du, *dv; magma_int_t i; /* Function Body */ *info = 0; if ( ! (gen == MagmaTrue) && !
(gen == MagmaFalse) ) { *info = -1; } else if (n < 0) { *info = -2; } else if (nrhs < 0) { *info = -3; } else if (ldda < max(1,n)) { *info = -5; } else if (lddb < max(1,n)) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (nrhs == 0 || n == 0) return *info; /* Allocate memory for the butterfly matrices */ if (MAGMA_SUCCESS != magma_dmalloc( &du, 2*n )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } if (MAGMA_SUCCESS != magma_dmalloc( &dv, 2*n )) { magma_free( du ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* Initialize the butterfly matrices on the CPU */ if (gen == MagmaTrue) init_butterfly(2*n, U, V); /* Copy the butterfly matrices to the GPU */ magma_dsetvector( 2*n, U, 1, du, 1, queue ); magma_dsetvector( 2*n, V, 1, dv, 1, queue ); /* Perform the Partial Random Butterfly Transformation on the GPU */ magmablas_dprbt_batched(n, dA_array, ldda, du, dv, batchCount, queue); /* Compute U^T.b on the GPU */ // TODO fix for multiple RHS for (i = 0; i < nrhs; i++) magmablas_dprbt_mtv_batched(n, du, dB_array, batchCount, queue); magma_free( du ); magma_free( dv ); return *info; }
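/* What the butterfly kernels compute, as a minimal CPU sketch (an illustration only: a depth-1 butterfly,
   n assumed even, u assumed to hold the two diagonal generator vectors back to back; the library's
   recursive, batched GPU version is magmablas_dprbt_mtv_batched). The transform is
       b := B(u)^T * b,  where
       B(u) = 1/sqrt(2) * [ diag(u[0:h])   diag(u[h:n]) ;
                            diag(u[0:h])  -diag(u[h:n]) ],  h = n/2.  */
#include <math.h>
static void butterfly_mtv_sketch( int n, const double *u, double *b )
{
    const int h = n / 2;
    const double s = 1.0 / sqrt( 2.0 );
    for (int i = 0; i < h; ++i) {
        const double t0 = b[i], t1 = b[i + h];
        b[i]     = s * u[i]     * ( t0 + t1 );  // top half:    diag(u1) * (b1 + b2)
        b[i + h] = s * u[i + h] * ( t0 - t1 );  // bottom half: diag(u2) * (b1 - b2)
    }
}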
magma_int_t magma_dbicgstab_merge2( magma_d_sparse_matrix A, magma_d_vector b, magma_d_vector *x, magma_d_solver_par *solver_par ) { // prepare solver feedback solver_par->solver = Magma_BICGSTABMERGE2; solver_par->numiter = 0; solver_par->info = 0; // some useful variables double c_zero = MAGMA_D_ZERO, c_one = MAGMA_D_ONE; magma_int_t dofs = A.num_rows; // GPU stream magma_queue_t stream[2]; magma_event_t event[1]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); magma_event_create( &event[0] ); // workspace magma_d_vector q, r, rr, p, v, s, t; double *d1, *d2, *skp; magma_dmalloc( &d1, dofs*(2) ); magma_dmalloc( &d2, dofs*(2) ); // array for the parameters magma_dmalloc( &skp, 8 ); // skp = [alpha|beta|omega|rho_old|rho|nom|tmp1|tmp2] magma_d_vinit( &q, Magma_DEV, dofs*6, c_zero ); // q = rr|r|p|v|s|t rr.memory_location = Magma_DEV; rr.val = NULL; rr.num_rows = rr.nnz = dofs; r.memory_location = Magma_DEV; r.val = NULL; r.num_rows = r.nnz = dofs; p.memory_location = Magma_DEV; p.val = NULL; p.num_rows = p.nnz = dofs; v.memory_location = Magma_DEV; v.val = NULL; v.num_rows = v.nnz = dofs; s.memory_location = Magma_DEV; s.val = NULL; s.num_rows = s.nnz = dofs; t.memory_location = Magma_DEV; t.val = NULL; t.num_rows = t.nnz = dofs; rr.val = q(0); r.val = q(1); p.val = q(2); v.val = q(3); s.val = q(4); t.val = q(5); // solver variables double alpha, beta, omega, rho_old, rho_new, *skp_h; double nom, nom0, betanom, r0, den; // solver setup magma_dscal( dofs, c_zero, x->val, 1 ); // x = 0 magma_dcopy( dofs, b.val, 1, q(0), 1 ); // rr = b magma_dcopy( dofs, b.val, 1, q(1), 1 ); // r = b rho_new = magma_ddot( dofs, r.val, 1, r.val, 1 ); // rho = <rr,r> nom = MAGMA_D_REAL(magma_ddot( dofs, r.val, 1, r.val, 1 )); nom0 = betanom = sqrt(nom); // nom0 = || r || rho_old = omega = alpha = MAGMA_D_MAKE( 1.0, 0. ); beta = rho_new; solver_par->init_res = nom0; // array on host for the parameters magma_dmalloc_cpu( &skp_h, 8 ); skp_h[0]=alpha; skp_h[1]=beta; skp_h[2]=omega; skp_h[3]=rho_old; skp_h[4]=rho_new; skp_h[5]=MAGMA_D_MAKE(nom, 0.0); skp_h[6]=c_zero; skp_h[7]=c_zero; // initialize the tmp slots before copying all 8 entries magma_dsetvector( 8, skp_h, 1, skp, 1 ); magma_d_spmv( c_one, A, r, c_zero, v ); // v = A r den = MAGMA_D_REAL( magma_ddot(dofs, v.val, 1, r.val, 1) ); // den = <v,r> if ( (r0 = nom * solver_par->epsilon) < ATOLERANCE ) r0 = ATOLERANCE; if ( nom < r0 ) return MAGMA_SUCCESS; // check positive definite if (den <= 0.0) { printf("Operator A is not positive definite. 
(Ar,r) = %f\n", den); return -100; } //Chronometry real_Double_t tempo1, tempo2; magma_device_sync(); tempo1=magma_wtime(); if( solver_par->verbose > 0 ){ solver_par->res_vec[0] = nom0; solver_par->timing[0] = 0.0; } // start iteration for( solver_par->numiter= 1; solver_par->numiter<solver_par->maxiter; solver_par->numiter++ ){ magmablasSetKernelStream(stream[0]); // computes p=r+beta*(p-omega*v) magma_dbicgmerge1( dofs, skp, v.val, r.val, p.val ); magma_dbicgmerge_spmv1( A, d1, d2, q(2), q(0), q(3), skp ); magma_dbicgmerge2( dofs, skp, r.val, v.val, s.val ); // s=r-alpha*v magma_dbicgmerge_spmv2( A, d1, d2, q(4), q(5), skp); magma_dbicgmerge_xrbeta( dofs, d1, d2, q(0), q(1), q(2), q(4), q(5), x->val, skp); // check stopping criterion (asynchronous copy) magma_dgetvector_async( 1 , skp+5, 1, skp_h+5, 1, stream[1] ); betanom = sqrt(MAGMA_D_REAL(skp_h[5])); if( solver_par->verbose > 0 ){ magma_device_sync(); tempo2=magma_wtime(); if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( betanom < r0 ) { break; } } magma_device_sync(); tempo2=magma_wtime(); solver_par->runtime = (real_Double_t) tempo2-tempo1; double residual; magma_dresidual( A, b, *x, &residual ); solver_par->iter_res = betanom; solver_par->final_res = residual; if( solver_par->numiter < solver_par->maxiter){ solver_par->info = 0; }else if( solver_par->init_res > solver_par->final_res ){ if( solver_par->verbose > 0 ){ if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = -2; } else{ if( solver_par->verbose > 0 ){ if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = -1; } magma_d_vfree(&q); // frees all vectors magma_free(d1); magma_free(d2); magma_free( skp ); magma_free_cpu( skp_h ); return MAGMA_SUCCESS; } /* dbicgstab_merge2 */
int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, t1, t2; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; const char trans[] = { 'N', 'C', 'T' }; const char uplo[] = { 'L', 'U' }; const char diag[] = { 'U', 'N' }; const char side[] = { 'L', 'R' }; double *A, *B, *C, *C2, *LU; double *dA, *dB, *dC1, *dC2; double alpha = MAGMA_D_MAKE( 0.5, 0.1 ); double beta = MAGMA_D_MAKE( 0.7, 0.2 ); double dalpha = 0.6; double dbeta = 0.8; double work[1], error, total_error; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t m, n, k, size, maxn, ld, info; magma_int_t *piv; magma_err_t err; magma_opts opts; parse_opts( argc, argv, &opts ); printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" ); total_error = 0.; for( int i = 0; i < opts.ntest; ++i ) { m = opts.msize[i]; n = opts.nsize[i]; k = opts.ksize[i]; printf("=========================================================================\n"); printf( "M %d, N %d, K %d\n", (int) m, (int) n, (int) k ); // allocate matrices // over-allocate so they can be any combination of {m,n,k} x {m,n,k}. maxn = max( max( m, n ), k ); ld = maxn; size = maxn*maxn; err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) ); assert( err == 0 ); err = magma_dmalloc_pinned( &A, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &B, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &C, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &C2, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &LU, size ); assert( err == 0 ); err = magma_dmalloc( &dA, size ); assert( err == 0 ); err = magma_dmalloc( &dB, size ); assert( err == 0 ); err = magma_dmalloc( &dC1, size ); assert( err == 0 ); err = magma_dmalloc( &dC2, size ); assert( err == 0 ); // initialize matrices size = maxn*maxn; lapackf77_dlarnv( &ione, ISEED, &size, A ); lapackf77_dlarnv( &ione, ISEED, &size, B ); lapackf77_dlarnv( &ione, ISEED, &size, C ); printf( "========== Level 1 BLAS ==========\n" ); // ----- test DSWAP // swap 2nd and 3rd columns of dA, then copy to C2 and compare with A assert( n >= 4 ); magma_dsetmatrix( m, n, A, ld, dA, ld ); magma_dsetmatrix( m, n, A, ld, dB, ld ); magma_dswap( m, dA(0,1), 1, dA(0,2), 1 ); magma_dswap( m, dB(0,1), 1, dB(0,2), 1 ); // check results, storing diff between magma and cuda calls in C2 cublasDaxpy( ld*n, c_neg_one, dA, 1, dB, 1 ); magma_dgetmatrix( m, n, dB, ld, C2, ld ); error = lapackf77_dlange( "F", &m, &k, C2, &ld, work ); total_error += error; printf( "dswap diff %.2g\n", error ); // ----- test IDAMAX // get argmax of column of A magma_dsetmatrix( m, k, A, ld, dA, ld ); error = 0; for( int j = 0; j < k; ++j ) { magma_int_t i1 = magma_idamax( m, dA(0,j), 1 ); magma_int_t i2 = cublasIdamax( m, dA(0,j), 1 ); assert( i1 == i2 ); error += abs( i1 - i2 ); } total_error += error; gflops = (double)m * k / 1e9; printf( "idamax diff %.2g\n", error ); printf( "\n" ); printf( "========== Level 2 BLAS ==========\n" ); // ----- test DGEMV // c = alpha*A*b + beta*c, with A m*n; b,c m or n-vectors // try no-trans/trans for( int ia = 0; ia < 3; ++ia ) { magma_dsetmatrix( m, n, A, ld, dA, ld ); magma_dsetvector( maxn, B, 1, dB, 1 ); magma_dsetvector( maxn, C, 1, dC1, 1 ); magma_dsetvector( maxn, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_dgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, 
storing diff between magma and cuda call in C2 size = (trans[ia] == 'N' ? m : n); cublasDaxpy( size, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetvector( size, dC2, 1, C2, 1 ); error = lapackf77_dlange( "F", &size, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_DGEMV( m, n ) / 1e9; printf( "dgemv( %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", trans[ia], error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test DSYMV // c = alpha*A*b + beta*c, with A m*m symmetric; b,c m-vectors // try upper/lower for( int iu = 0; iu < 2; ++iu ) { magma_dsetmatrix( m, m, A, ld, dA, ld ); magma_dsetvector( m, B, 1, dB, 1 ); magma_dsetvector( m, C, 1, dC1, 1 ); magma_dsetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_dsymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( m, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_dlange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYMV( m ) / 1e9; printf( "dsymv( %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test DTRSV // solve A*c = c, with A m*m triangular; c m-vector // try upper/lower, no-trans/trans, unit/non-unit diag // Factor A into LU to get well-conditioned triangles, else solve yields garbage. // Still can give garbage if solves aren't consistent with LU factors, // e.g., using unit diag for U, so copy lower triangle to upper triangle. // Also used for trsm later. lapackf77_dlacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld ); lapackf77_dgetrf( &maxn, &maxn, LU, &ld, piv, &info ); for( int j = 0; j < maxn; ++j ) { for( int i = 0; i < j; ++i ) { *LU(i,j) = *LU(j,i); } } for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { magma_dsetmatrix( m, m, LU, ld, dA, ld ); magma_dsetvector( m, C, 1, dC1, 1 ); magma_dsetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_dtrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDtrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( m, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_dlange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_DTRSM( MagmaLeft, m, 1 ) / 1e9; printf( "dtrsv( %c, %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], diag[id], error, gflops/t1, gflops/t2 ); }}} printf( "\n" ); printf( "========== Level 3 BLAS ==========\n" ); // ----- test DGEMM // C = alpha*A*B + beta*C, with A m*k or k*m; B k*n or n*k; C m*n // try combinations of no-trans/trans for( int ia = 0; ia < 3; ++ia ) { for( int ib = 0; ib < 3; ++ib ) { bool nta = (trans[ia] == 'N'); bool ntb = (trans[ib] == 'N'); magma_dsetmatrix( (nta ? m : k), (nta ? m : k), A, ld, dA, ld ); magma_dsetmatrix( (ntb ? k : n), (ntb ? 
n : k), B, ld, dB, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DGEMM( m, n, k ) / 1e9; printf( "dgemm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", trans[ia], trans[ib], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DSYMM // C = alpha*A*B + beta*C (left) with A m*m symmetric; B,C m*n; or // C = alpha*B*A + beta*C (right) with A n*n symmetric; B,C m*n // try left/right, upper/lower for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { magma_dsetmatrix( m, m, A, ld, dA, ld ); magma_dsetmatrix( m, n, B, ld, dB, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dsymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYMM( side[is], m, n ) / 1e9; printf( "dsymm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", side[is], uplo[iu], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DSYRK // C = alpha*A*A^H + beta*C (no-trans) with A m*k and C m*m symmetric; or // C = alpha*A^H*A + beta*C (trans) with A k*m and C m*m symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { magma_dsetmatrix( n, k, A, ld, dA, ld ); magma_dsetmatrix( n, n, C, ld, dC1, ld ); magma_dsetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dsyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYRK( k, n ) / 1e9; printf( "dsyrk( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DSYR2K // C = alpha*A*B^H + ^alpha*B*A^H + beta*C (no-trans) with A,B n*k; C n*n symmetric; or // C = alpha*A^H*B + ^alpha*B^H*A + beta*C (trans) with A,B k*n; C n*n symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { bool nt = (trans[it] == 'N'); magma_dsetmatrix( (nt ? n : k), (nt ? 
n : k), A, ld, dA, ld ); magma_dsetmatrix( n, n, C, ld, dC1, ld ); magma_dsetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dsyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYR2K( k, n ) / 1e9; printf( "dsyr2k( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DTRMM // C = alpha*A*C (left) with A m*m triangular; C m*n; or // C = alpha*C*A (right) with A n*n triangular; C m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == 'L'); magma_dsetmatrix( (left ? m : n), (left ? m : n), A, ld, dA, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dtrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDtrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DTRMM( side[is], m, n ) / 1e9; printf( "dtrmm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // ----- test DTRSM // solve A*X = alpha*B (left) with A m*m triangular; B m*n; or // solve X*A = alpha*B (right) with A n*n triangular; B m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == 'L'); magma_dsetmatrix( (left ? m : n), (left ? m : n), LU, ld, dA, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dtrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDtrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DTRSM( side[is], m, n ) / 1e9; printf( "dtrsm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // cleanup magma_free_cpu( piv ); magma_free_pinned( A ); magma_free_pinned( B ); magma_free_pinned( C ); magma_free_pinned( C2 ); magma_free_pinned( LU ); magma_free( dA ); magma_free( dB ); magma_free( dC1 ); magma_free( dC2 ); } if ( total_error != 0. 
) { printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n", total_error ); } else { printf( "all tests passed\n" ); } TESTING_FINALIZE(); return 0; }
extern "C" magma_int_t magma_dmtransfer( magma_d_matrix A, magma_d_matrix *B, magma_location_t src, magma_location_t dst, magma_queue_t queue ) { magma_int_t info = 0; B->val = NULL; B->diag = NULL; B->row = NULL; B->rowidx = NULL; B->col = NULL; B->blockinfo = NULL; B->dval = NULL; B->ddiag = NULL; B->drow = NULL; B->drowidx = NULL; B->dcol = NULL; B->diag = NULL; B->ddiag = NULL; B->list = NULL; B->dlist = NULL; // first case: copy matrix from host to device if ( src == Magma_CPU && dst == Magma_DEV ) { //CSR-type if ( A.storage_type == Magma_CSR || A.storage_type == Magma_CUCSR || A.storage_type == Magma_CSC || A.storage_type == Magma_CSRD || A.storage_type == Magma_CSRL || A.storage_type == Magma_CSRU ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc( &B->dval, A.nnz )); CHECK( magma_index_malloc( &B->drow, A.num_rows + 1 )); CHECK( magma_index_malloc( &B->dcol, A.nnz )); // data transfer magma_dsetvector( A.nnz, A.val, 1, B->dval, 1, queue ); magma_index_setvector( A.num_rows + 1, A.row, 1, B->drow, 1, queue ); magma_index_setvector( A.nnz, A.col, 1, B->dcol, 1, queue ); } //COO-type else if ( A.storage_type == Magma_COO ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc( &B->dval, A.nnz )); CHECK( magma_index_malloc( &B->dcol, A.nnz )); CHECK( magma_index_malloc( &B->drowidx, A.nnz )); // data transfer magma_dsetvector( A.nnz, A.val, 1, B->dval, 1, queue ); magma_index_setvector( A.nnz, A.col, 1, B->dcol, 1, queue ); magma_index_setvector( A.nnz, A.rowidx, 1, B->drowidx, 1, queue ); } //CSRCOO-type else if ( A.storage_type == Magma_CSRCOO ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc( &B->dval, A.nnz )); CHECK( magma_index_malloc( &B->drow, A.num_rows + 1 )); CHECK( magma_index_malloc( &B->dcol, A.nnz )); CHECK( magma_index_malloc( &B->drowidx, A.nnz )); // data transfer magma_dsetvector( A.nnz, A.val, 1, B->dval, 1, queue ); magma_index_setvector( A.num_rows + 1, A.row, 1, B->drow, 1, queue ); magma_index_setvector( A.nnz, A.col, 1, B->dcol, 1, queue ); magma_index_setvector( A.nnz, A.rowidx, 1, B->drowidx, 1, queue ); } //ELL/ELLPACKT-type else if ( A.storage_type == Magma_ELLPACKT || A.storage_type == Magma_ELL ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc( &B->dval, A.num_rows * A.max_nnz_row )); 
CHECK( magma_index_malloc( &B->dcol, A.num_rows * A.max_nnz_row )); // data transfer magma_dsetvector( A.num_rows * A.max_nnz_row, A.val, 1, B->dval, 1, queue ); magma_index_setvector( A.num_rows * A.max_nnz_row, A.col, 1, B->dcol, 1, queue ); } //ELLD-type else if ( A.storage_type == Magma_ELLD ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc( &B->dval, A.num_rows * A.max_nnz_row )); CHECK( magma_index_malloc( &B->dcol, A.num_rows * A.max_nnz_row )); // data transfer magma_dsetvector( A.num_rows * A.max_nnz_row, A.val, 1, B->dval, 1, queue ); magma_index_setvector( A.num_rows * A.max_nnz_row, A.col, 1, B->dcol, 1, queue ); } //ELLRT-type else if ( A.storage_type == Magma_ELLRT ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; B->blocksize = A.blocksize; B->alignment = A.alignment; magma_int_t rowlength = magma_roundup( A.max_nnz_row, A.alignment ); // memory allocation CHECK( magma_dmalloc( &B->dval, A.num_rows * rowlength )); CHECK( magma_index_malloc( &B->dcol, A.num_rows * rowlength )); CHECK( magma_index_malloc( &B->drow, A.num_rows )); // data transfer magma_dsetvector( A.num_rows * rowlength, A.val, 1, B->dval, 1, queue ); magma_index_setvector( A.num_rows * rowlength, A.col, 1, B->dcol, 1, queue ); magma_index_setvector( A.num_rows, A.row, 1, B->drow, 1, queue ); } //SELLP-type else if ( A.storage_type == Magma_SELLP ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; B->blocksize = A.blocksize; B->numblocks = A.numblocks; B->alignment = A.alignment; // memory allocation CHECK( magma_dmalloc( &B->dval, A.nnz )); CHECK( magma_index_malloc( &B->dcol, A.nnz )); CHECK( magma_index_malloc( &B->drow, A.numblocks + 1 )); // data transfer magma_dsetvector( A.nnz, A.val, 1, B->dval, 1, queue ); magma_index_setvector( A.nnz, A.col, 1, B->dcol, 1, queue ); magma_index_setvector( A.numblocks + 1, A.row, 1, B->drow, 1, queue ); } //BCSR-type else if ( A.storage_type == Magma_BCSR ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; B->blocksize = A.blocksize; B->numblocks = A.numblocks; B->alignment = A.alignment; magma_int_t size_b = A.blocksize; //magma_int_t c_blocks = ceil( (float)A.num_cols / (float)size_b ); // max number of blocks per row //magma_int_t r_blocks = ceil( (float)A.num_rows / (float)size_b ); magma_int_t r_blocks = magma_ceildiv( A.num_rows, size_b ); // max number of blocks per column // memory allocation CHECK( magma_dmalloc( 
&B->dval, size_b * size_b * A.numblocks )); CHECK( magma_index_malloc( &B->drow, r_blocks + 1 )); CHECK( magma_index_malloc( &B->dcol, A.numblocks )); // data transfer magma_dsetvector( size_b * size_b * A.numblocks, A.val, 1, B->dval, 1, queue ); magma_index_setvector( r_blocks + 1, A.row, 1, B->drow, 1, queue ); magma_index_setvector( A.numblocks, A.col, 1, B->dcol, 1, queue ); } //DENSE-type else if ( A.storage_type == Magma_DENSE ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; B->major = A.major; B->ld = A.ld; // memory allocation CHECK( magma_dmalloc( &B->dval, A.num_rows * A.num_cols )); // data transfer magma_dsetvector( A.num_rows * A.num_cols, A.val, 1, B->dval, 1, queue ); } } // second case: copy matrix from host to host else if ( src == Magma_CPU && dst == Magma_CPU ) { //CSR-type if ( A.storage_type == Magma_CSR || A.storage_type == Magma_CUCSR || A.storage_type == Magma_CSC || A.storage_type == Magma_CSRD || A.storage_type == Magma_CSRL || A.storage_type == Magma_CSRU ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc_cpu( &B->val, A.nnz )); CHECK( magma_index_malloc_cpu( &B->row, A.num_rows + 1 )); CHECK( magma_index_malloc_cpu( &B->col, A.nnz )); // data transfer for( magma_int_t i=0; i<A.nnz; i++ ) { B->val[i] = A.val[i]; B->col[i] = A.col[i]; } for( magma_int_t i=0; i<A.num_rows+1; i++ ) { B->row[i] = A.row[i]; } } //COO-type else if ( A.storage_type == Magma_COO ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc_cpu( &B->val, A.nnz )); CHECK( magma_index_malloc_cpu( &B->col, A.nnz )); CHECK( magma_index_malloc_cpu( &B->rowidx, A.nnz )); // data transfer for( magma_int_t i=0; i<A.nnz; i++ ) { B->val[i] = A.val[i]; B->col[i] = A.col[i]; B->rowidx[i] = A.rowidx[i]; } } //CSRCOO-type else if ( A.storage_type == Magma_CSRCOO ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc_cpu( &B->val, A.nnz )); CHECK( magma_index_malloc_cpu( &B->row, A.num_rows + 1 )); CHECK( magma_index_malloc_cpu( &B->col, A.nnz )); CHECK( magma_index_malloc_cpu( &B->rowidx, A.nnz )); // data transfer for( magma_int_t i=0; i<A.nnz; i++ ) { B->val[i] = A.val[i]; B->col[i] = A.col[i]; B->rowidx[i] = A.rowidx[i]; } for( magma_int_t i=0; i<A.num_rows+1; i++ ) { B->row[i] = A.row[i]; } } //ELL/ELLPACKT-type else if ( A.storage_type == Magma_ELLPACKT || A.storage_type == 
Magma_ELL ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc_cpu( &B->val, A.num_rows * A.max_nnz_row )); CHECK( magma_index_malloc_cpu( &B->col, A.num_rows * A.max_nnz_row )); // data transfer for( magma_int_t i=0; i<A.num_rows*A.max_nnz_row; i++ ) { B->val[i] = A.val[i]; B->col[i] = A.col[i]; } } //ELLD-type else if ( A.storage_type == Magma_ELLD ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc_cpu( &B->val, A.num_rows * A.max_nnz_row )); CHECK( magma_index_malloc_cpu( &B->col, A.num_rows * A.max_nnz_row )); // data transfer for( magma_int_t i=0; i<A.num_rows*A.max_nnz_row; i++ ) { B->val[i] = A.val[i]; B->col[i] = A.col[i]; } } //ELLRT-type else if ( A.storage_type == Magma_ELLRT ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; B->blocksize = A.blocksize; B->alignment = A.alignment; //int threads_per_row = A.alignment; //int rowlength = magma_roundup( A.max_nnz_row, threads_per_row ); magma_int_t rowlength = magma_roundup( A.max_nnz_row, A.alignment ); // memory allocation CHECK( magma_dmalloc_cpu( &B->val, rowlength * A.num_rows )); CHECK( magma_index_malloc_cpu( &B->row, A.num_rows )); CHECK( magma_index_malloc_cpu( &B->col, rowlength * A.num_rows )); // data transfer for( magma_int_t i=0; i<A.num_rows*rowlength; i++ ) { B->val[i] = A.val[i]; B->col[i] = A.col[i]; } for( magma_int_t i=0; i<A.num_rows; i++ ) { B->row[i] = A.row[i]; } } //SELLP-type else if ( A.storage_type == Magma_SELLP ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; B->blocksize = A.blocksize; B->alignment = A.alignment; B->numblocks = A.numblocks; // memory allocation CHECK( magma_dmalloc_cpu( &B->val, A.nnz )); CHECK( magma_index_malloc_cpu( &B->col, A.nnz )); CHECK( magma_index_malloc_cpu( &B->row, A.numblocks + 1 )); // data transfer for( magma_int_t i=0; i<A.nnz; i++ ) { B->val[i] = A.val[i]; B->col[i] = A.col[i]; } for( magma_int_t i=0; i<A.numblocks+1; i++ ) { B->row[i] = A.row[i]; } } //BCSR-type else if ( A.storage_type == Magma_BCSR ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; B->blocksize = A.blocksize; B->numblocks = 
A.numblocks; B->alignment = A.alignment; magma_int_t size_b = A.blocksize; //magma_int_t c_blocks = ceil( (float)A.num_cols / (float)size_b ); // max number of blocks per row //magma_int_t r_blocks = ceil( (float)A.num_rows / (float)size_b ); magma_int_t r_blocks = magma_ceildiv( A.num_rows, size_b ); // max number of blocks per column // memory allocation CHECK( magma_dmalloc_cpu( &B->val, size_b * size_b * A.numblocks )); CHECK( magma_index_malloc_cpu( &B->row, r_blocks + 1 )); CHECK( magma_index_malloc_cpu( &B->col, A.numblocks )); // data transfer (host-to-host copies into the CPU arrays just allocated) for( magma_int_t i=0; i<size_b*size_b*A.numblocks; i++ ) { B->val[i] = A.val[i]; } for( magma_int_t i=0; i<r_blocks+1; i++ ) { B->row[i] = A.row[i]; } for( magma_int_t i=0; i<A.numblocks; i++ ) { B->col[i] = A.col[i]; } } //DENSE-type else if ( A.storage_type == Magma_DENSE ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; B->major = A.major; B->ld = A.ld; // memory allocation CHECK( magma_dmalloc_cpu( &B->val, A.num_rows * A.num_cols )); // data transfer for( magma_int_t i=0; i<A.num_rows*A.num_cols; i++ ) { B->val[i] = A.val[i]; } } } // third case: copy matrix from device to host else if ( src == Magma_DEV && dst == Magma_CPU ) { //CSR-type if ( A.storage_type == Magma_CSR || A.storage_type == Magma_CUCSR || A.storage_type == Magma_CSC || A.storage_type == Magma_CSRD || A.storage_type == Magma_CSRL || A.storage_type == Magma_CSRU ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc_cpu( &B->val, A.nnz )); CHECK( magma_index_malloc_cpu( &B->row, A.num_rows + 1 )); CHECK( magma_index_malloc_cpu( &B->col, A.nnz )); // data transfer magma_dgetvector( A.nnz, A.dval, 1, B->val, 1, queue ); magma_index_getvector( A.num_rows + 1, A.drow, 1, B->row, 1, queue ); magma_index_getvector( A.nnz, A.dcol, 1, B->col, 1, queue ); } //COO-type else if ( A.storage_type == Magma_COO ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc_cpu( &B->val, A.nnz )); CHECK( magma_index_malloc_cpu( &B->col, A.nnz )); CHECK( magma_index_malloc_cpu( &B->rowidx, A.nnz )); // data transfer magma_dgetvector( A.nnz, A.dval, 1, B->val, 1, queue ); magma_index_getvector( A.nnz, A.dcol, 1, B->col, 1, queue ); magma_index_getvector( A.nnz, A.drowidx, 1, B->rowidx, 1, queue ); } //CSRCOO-type else if ( A.storage_type == Magma_CSRCOO ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; 
B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc_cpu( &B->val, A.nnz )); CHECK( magma_index_malloc_cpu( &B->row, A.num_rows + 1 )); CHECK( magma_index_malloc_cpu( &B->col, A.nnz )); CHECK( magma_index_malloc_cpu( &B->rowidx, A.nnz )); // data transfer magma_dgetvector( A.nnz, A.dval, 1, B->val, 1, queue ); magma_index_getvector( A.num_rows + 1, A.drow, 1, B->row, 1, queue ); magma_index_getvector( A.nnz, A.dcol, 1, B->col, 1, queue ); magma_index_getvector( A.nnz, A.drowidx, 1, B->rowidx, 1, queue ); } //ELL/ELLPACKT-type else if ( A.storage_type == Magma_ELLPACKT || A.storage_type == Magma_ELL ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc_cpu( &B->val, A.num_rows * A.max_nnz_row )); CHECK( magma_index_malloc_cpu( &B->col, A.num_rows * A.max_nnz_row )); // data transfer magma_dgetvector( A.num_rows * A.max_nnz_row, A.dval, 1, B->val, 1, queue ); magma_index_getvector( A.num_rows * A.max_nnz_row, A.dcol, 1, B->col, 1, queue ); } //ELLD-type else if ( A.storage_type == Magma_ELLD ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc_cpu( &B->val, A.num_rows * A.max_nnz_row )); CHECK( magma_index_malloc_cpu( &B->col, A.num_rows * A.max_nnz_row )); // data transfer magma_dgetvector( A.num_rows * A.max_nnz_row, A.dval, 1, B->val, 1, queue ); magma_index_getvector( A.num_rows * A.max_nnz_row, A.dcol, 1, B->col, 1, queue ); } //ELLRT-type else if ( A.storage_type == Magma_ELLRT ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; B->blocksize = A.blocksize; B->alignment = A.alignment; //int threads_per_row = A.alignment; //int rowlength = magma_roundup( A.max_nnz_row, threads_per_row ); magma_int_t rowlength = magma_roundup( A.max_nnz_row, A.alignment ); // memory allocation CHECK( magma_dmalloc_cpu( &B->val, rowlength * A.num_rows )); CHECK( magma_index_malloc_cpu( &B->row, A.num_rows )); CHECK( magma_index_malloc_cpu( &B->col, rowlength * A.num_rows )); // data transfer magma_dgetvector( A.num_rows * rowlength, A.dval, 1, B->val, 1, queue ); magma_index_getvector( A.num_rows * rowlength, A.dcol, 1, B->col, 1, queue ); magma_index_getvector( A.num_rows, A.drow, 1, B->row, 1, queue ); } //SELLP-type else if ( A.storage_type == Magma_SELLP ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = 
A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; B->blocksize = A.blocksize; B->numblocks = A.numblocks; B->alignment = A.alignment; // memory allocation CHECK( magma_dmalloc_cpu( &B->val, A.nnz )); CHECK( magma_index_malloc_cpu( &B->col, A.nnz )); CHECK( magma_index_malloc_cpu( &B->row, A.numblocks + 1 )); // data transfer magma_dgetvector( A.nnz, A.dval, 1, B->val, 1, queue ); magma_index_getvector( A.nnz, A.dcol, 1, B->col, 1, queue ); magma_index_getvector( A.numblocks + 1, A.drow, 1, B->row, 1, queue ); } //BCSR-type else if ( A.storage_type == Magma_BCSR ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; B->blocksize = A.blocksize; B->numblocks = A.numblocks; B->alignment = A.alignment; magma_int_t size_b = A.blocksize; //magma_int_t c_blocks = ceil( (float)A.num_cols / (float)size_b ); // max number of blocks per row //magma_int_t r_blocks = ceil( (float)A.num_rows / (float)size_b ); magma_int_t r_blocks = magma_ceildiv( A.num_rows, size_b ); // max number of blocks per column // memory allocation CHECK( magma_dmalloc_cpu( &B->val, size_b * size_b * A.numblocks )); CHECK( magma_index_malloc_cpu( &B->row, r_blocks + 1 )); CHECK( magma_index_malloc_cpu( &B->col, A.numblocks )); // data transfer magma_dgetvector( size_b * size_b * A.numblocks, A.dval, 1, B->val, 1, queue ); magma_index_getvector( r_blocks + 1, A.drow, 1, B->row, 1, queue ); magma_index_getvector( A.numblocks, A.dcol, 1, B->col, 1, queue ); } //DENSE-type else if ( A.storage_type == Magma_DENSE ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; B->major = A.major; B->ld = A.ld; // memory allocation CHECK( magma_dmalloc_cpu( &B->val, A.num_rows * A.num_cols )); // data transfer magma_dgetvector( A.num_rows * A.num_cols, A.dval, 1, B->val, 1, queue ); } } // fourth case: copy matrix from device to device else if ( src == Magma_DEV && dst == Magma_DEV ) { //CSR-type if ( A.storage_type == Magma_CSR || A.storage_type == Magma_CUCSR || A.storage_type == Magma_CSC || A.storage_type == Magma_CSRD || A.storage_type == Magma_CSRL || A.storage_type == Magma_CSRU ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc( &B->dval, A.nnz )); CHECK( magma_index_malloc( &B->drow, A.num_rows + 1 )); CHECK( magma_index_malloc( &B->dcol, A.nnz )); // data transfer magma_dcopyvector( A.nnz, A.dval, 1, B->dval, 1, queue ); magma_index_copyvector( A.num_rows + 1, A.drow, 1, B->drow, 1, queue ); magma_index_copyvector( A.nnz, A.dcol, 1, B->dcol, 1, queue ); } //COO-type else if ( A.storage_type == Magma_COO ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; 
B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc( &B->dval, A.nnz )); CHECK( magma_index_malloc( &B->dcol, A.nnz )); CHECK( magma_index_malloc( &B->drowidx, A.nnz )); // data transfer magma_dcopyvector( A.nnz, A.dval, 1, B->dval, 1, queue ); magma_index_copyvector( A.nnz, A.dcol, 1, B->dcol, 1, queue ); magma_index_copyvector( A.nnz, A.drowidx, 1, B->drowidx, 1, queue ); } //CSRCOO-type else if ( A.storage_type == Magma_CSRCOO ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc( &B->dval, A.nnz )); CHECK( magma_index_malloc( &B->drow, A.num_rows + 1 )); CHECK( magma_index_malloc( &B->dcol, A.nnz )); CHECK( magma_index_malloc( &B->drowidx, A.nnz )); // data transfer magma_dcopyvector( A.nnz, A.dval, 1, B->dval, 1, queue ); magma_index_copyvector( A.num_rows + 1, A.drow, 1, B->drow, 1, queue ); magma_index_copyvector( A.nnz, A.dcol, 1, B->dcol, 1, queue ); magma_index_copyvector( A.nnz, A.drowidx, 1, B->drowidx, 1, queue ); } //ELL/ELLPACKT-type else if ( A.storage_type == Magma_ELLPACKT || A.storage_type == Magma_ELL ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc( &B->dval, A.num_rows * A.max_nnz_row )); CHECK( magma_index_malloc( &B->dcol, A.num_rows * A.max_nnz_row )); // data transfer magma_dcopyvector( A.num_rows * A.max_nnz_row, A.dval, 1, B->dval, 1, queue ); magma_index_copyvector( A.num_rows * A.max_nnz_row, A.dcol, 1, B->dcol, 1, queue ); } //ELLD-type else if ( A.storage_type == Magma_ELLD ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc( &B->dval, A.num_rows * A.max_nnz_row )); CHECK( magma_index_malloc( &B->dcol, A.num_rows * A.max_nnz_row )); // data transfer magma_dcopyvector( A.num_rows * A.max_nnz_row, A.dval, 1, B->dval, 1, queue ); magma_index_copyvector( A.num_rows * A.max_nnz_row, A.dcol, 1, B->dcol, 1, queue ); } //ELLRT-type else if ( A.storage_type == Magma_ELLRT ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; B->blocksize = A.blocksize; B->alignment = A.alignment; //int threads_per_row = A.alignment; //int rowlength = magma_roundup( A.max_nnz_row, threads_per_row ); magma_int_t rowlength = magma_roundup( A.max_nnz_row, 
A.alignment ); // memory allocation CHECK( magma_dmalloc( &B->dval, A.num_rows * rowlength )); CHECK( magma_index_malloc( &B->dcol, A.num_rows * rowlength )); CHECK( magma_index_malloc( &B->drow, A.num_rows )); // data transfer magma_dcopyvector( A.num_rows * rowlength, A.dval, 1, B->dval, 1, queue ); magma_index_copyvector( A.num_rows * rowlength, A.dcol, 1, B->dcol, 1, queue ); magma_index_copyvector( A.num_rows, A.drow, 1, B->drow, 1, queue ); } //SELLP-type else if ( A.storage_type == Magma_SELLP ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; B->blocksize = A.blocksize; B->numblocks = A.numblocks; B->alignment = A.alignment; // memory allocation CHECK( magma_dmalloc( &B->dval, A.nnz )); CHECK( magma_index_malloc( &B->dcol, A.nnz )); CHECK( magma_index_malloc( &B->drow, A.numblocks + 1 )); // data transfer magma_dcopyvector( A.nnz, A.dval, 1, B->dval, 1, queue ); magma_index_copyvector( A.nnz, A.dcol, 1, B->dcol, 1, queue ); magma_index_copyvector( A.numblocks + 1, A.drow, 1, B->drow, 1, queue ); } //BCSR-type else if ( A.storage_type == Magma_BCSR ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; B->blocksize = A.blocksize; B->numblocks = A.numblocks; B->alignment = A.alignment; magma_int_t size_b = A.blocksize; //magma_int_t c_blocks = ceil( (float)A.num_cols / (float)size_b ); // max number of blocks per row //magma_int_t r_blocks = ceil( (float)A.num_rows / (float)size_b ); magma_int_t r_blocks = magma_ceildiv( A.num_rows, size_b ); // max number of blocks per column // memory allocation CHECK( magma_dmalloc( &B->dval, size_b * size_b * A.numblocks )); CHECK( magma_index_malloc( &B->drow, r_blocks + 1 )); CHECK( magma_index_malloc( &B->dcol, A.numblocks )); // data transfer magma_dcopyvector( size_b * size_b * A.numblocks, A.dval, 1, B->dval, 1, queue ); magma_index_copyvector( r_blocks + 1, A.drow, 1, B->drow, 1, queue ); magma_index_copyvector( A.numblocks, A.dcol, 1, B->dcol, 1, queue ); } //DENSE-type else if ( A.storage_type == Magma_DENSE ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; B->major = A.major; B->ld = A.ld; // memory allocation CHECK( magma_dmalloc( &B->dval, A.num_rows * A.num_cols )); // data transfer magma_dcopyvector( A.num_rows * A.num_cols, A.dval, 1, B->dval, 1, queue ); } } cleanup: if( info != 0 ){ magma_dmfree( B, queue ); } return info; }
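/* Usage sketch for magma_dmtransfer (illustrative; assumes hA already holds a CSR matrix on the host and
   queue is a valid magma_queue_t; error checks abbreviated). Round-trips the matrix through the GPU: */
{
    magma_d_matrix dA = {Magma_CSR}, hB = {Magma_CSR};
    magma_int_t info;
    info = magma_dmtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue );  // CSR arrays land in dA.dval/drow/dcol
    /* ... device-side work on dA, e.g. a sparse matrix-vector product ... */
    info = magma_dmtransfer( dA, &hB, Magma_DEV, Magma_CPU, queue );  // results come back in hB.val/row/col
    magma_dmfree( &dA, queue );
    magma_dmfree( &hB, queue );
}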