extern "C" magma_int_t
magma_dormql(magma_side_t side, magma_trans_t trans,
             magma_int_t m, magma_int_t n, magma_int_t k,
             double *a, magma_int_t lda,
             double *tau,
             double *c, magma_int_t ldc,
             double *work, magma_int_t lwork,
             magma_queue_t queue,
             magma_int_t *info)
{
/*  -- MAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date November 2014

    Purpose
    =======
    DORMQL overwrites the general real M-by-N matrix C with

                    SIDE = 'L'     SIDE = 'R'
    TRANS = 'N':      Q * C          C * Q
    TRANS = 'T':      Q**T * C       C * Q**T

    where Q is a real orthogonal matrix defined as the product of k
    elementary reflectors

          Q = H(k) . . . H(2) H(1)

    as returned by DGEQLF. Q is of order M if SIDE = 'L' and of order N
    if SIDE = 'R'.

    Arguments
    =========
    SIDE    (input) CHARACTER*1
            = 'L': apply Q or Q**T from the Left;
            = 'R': apply Q or Q**T from the Right.

    TRANS   (input) CHARACTER*1
            = 'N': No transpose, apply Q;
            = 'T': Transpose, apply Q**T.

    M       (input) INTEGER
            The number of rows of the matrix C. M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix C. N >= 0.

    K       (input) INTEGER
            The number of elementary reflectors whose product defines
            the matrix Q.
            If SIDE = 'L', M >= K >= 0;
            if SIDE = 'R', N >= K >= 0.

    A       (input) DOUBLE PRECISION array, dimension (LDA,K)
            The i-th column must contain the vector which defines the
            elementary reflector H(i), for i = 1,2,...,k, as returned by
            DGEQLF in the last k columns of its array argument A.
            A is modified by the routine but restored on exit.

    LDA     (input) INTEGER
            The leading dimension of the array A.
            If SIDE = 'L', LDA >= max(1,M);
            if SIDE = 'R', LDA >= max(1,N).

    TAU     (input) DOUBLE PRECISION array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by DGEQLF.

    C       (input/output) DOUBLE PRECISION array, dimension (LDC,N)
            On entry, the M-by-N matrix C.
            On exit, C is overwritten by Q*C or Q**T*C or C*Q**T or C*Q.

    LDC     (input) INTEGER
            The leading dimension of the array C. LDC >= max(1,M).

    WORK    (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.

    LWORK   (input) INTEGER
            The dimension of the array WORK.
            If SIDE = 'L', LWORK >= max(1,N);
            if SIDE = 'R', LWORK >= max(1,M).
            For optimum performance LWORK >= N*NB if SIDE = 'L', and
            LWORK >= M*NB if SIDE = 'R', where NB is the optimal blocksize.

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.
    INFO    (output) INTEGER
            = 0: successful exit
            < 0: if INFO = -i, the i-th argument had an illegal value
    =====================================================================   */

    /* Allocate work space on the GPU */
    magmaDouble_ptr dwork, dc;
    if (MAGMA_SUCCESS != magma_dmalloc( &dc, m*n ) ||
        MAGMA_SUCCESS != magma_dmalloc( &dwork, 2*(m + 64)*64 )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    /* Copy matrix C from the CPU to the GPU */
    magma_dsetmatrix( m, n, c, ldc, dc, 0, m, queue );

    //dc -= (1 + m);
    magma_int_t dc_offset = -(1+m);

    magma_int_t a_offset, c_dim1, c_offset, i__4;
    magma_int_t i__;
    double t[2*4160]        /* was [65][64] */;
    magma_int_t i1, i2, i3, ib, nb, mi, ni, nq, nw;
    magma_int_t iinfo, ldwork, lwkopt;
    int lquery, left, notran;

    a_offset = 1 + lda;
    a -= a_offset;
    --tau;
    c_dim1 = ldc;
    c_offset = 1 + c_dim1;
    c -= c_offset;

    *info  = 0;
    left   = (side == MagmaLeft);
    notran = (trans == MagmaNoTrans);
    lquery = (lwork == -1);

    /* NQ is the order of Q and NW is the minimum dimension of WORK */
    if (left) {
        nq = m;
        nw = max(1,n);
    } else {
        nq = n;
        nw = max(1,m);
    }
    if (! left && side != MagmaRight) {
        *info = -1;
    } else if (! notran && trans != MagmaTrans) {
        *info = -2;
    } else if (m < 0) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (k < 0 || k > nq) {
        *info = -5;
    } else if (lda < max(1,nq)) {
        *info = -7;
    } else if (ldc < max(1,m)) {
        *info = -10;
    }

    if (*info == 0) {
        if (m == 0 || n == 0) {
            lwkopt = 1;
        } else {
            /* Determine the block size. NB may be at most NBMAX, where
               NBMAX is used to define the local array T. */
            nb = 64;
            lwkopt = nw * nb;
        }
        work[0] = MAGMA_D_MAKE( lwkopt, 0 );

        if (lwork < nw && ! lquery) {
            *info = -12;
        }
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0) {
        return *info;
    }

    ldwork = nw;

    if ( nb >= k ) {
        /* Use CPU code */
        lapackf77_dormql(lapack_const(side), lapack_const(trans),
                         &m, &n, &k, &a[a_offset], &lda, &tau[1],
                         &c[c_offset], &ldc, work, &lwork, &iinfo);
    }
    else {
        /* Use hybrid CPU-GPU code */
        if ((left && notran) || (! left && ! notran)) {
            i1 = 1;
            i2 = k;
            i3 = nb;
        } else {
            i1 = (k - 1) / nb * nb + 1;
            i2 = 1;
            i3 = -nb;
        }

        if (left) {
            ni = n;
        } else {
            mi = m;
        }

        for (i__ = i1; i3 < 0 ? i__ >= i2 : i__ <= i2; i__ += i3) {
            ib = min(nb, k - i__ + 1);

            /* Form the triangular factor of the block reflector
               H = H(i+ib-1) . . . H(i+1) H(i) */
            i__4 = nq - k + i__ + ib - 1;
            lapackf77_dlarft("Backward", "Columnwise", &i__4, &ib,
                             &a[i__ * lda + 1], &lda, &tau[i__], t, &ib);

            /* 1) Put 0s in the lower triangular part of A;
               2) copy the panel from A to the GPU, and
               3) restore A */
            dpanel_to_q(MagmaLower, ib, &a[i__ + i__ * lda], lda, t+ib*ib);
            magma_dsetmatrix( i__4, ib, &a[1 + i__ * lda], lda, dwork, 0, i__4, queue );
            dq_to_panel(MagmaLower, ib, &a[i__ + i__ * lda], lda, t+ib*ib);

            if (left) {
                /* H or H' is applied to C(1:m-k+i+ib-1,1:n) */
                mi = m - k + i__ + ib - 1;
            }
            else {
                /* H or H' is applied to C(1:m,1:n-k+i+ib-1) */
                ni = n - k + i__ + ib - 1;
            }

            /* Apply H or H'; First copy T to the GPU */
            magma_dsetmatrix( ib, ib, t, ib, dwork, i__4*ib, ib, queue );
            magma_dlarfb_gpu(side, trans, MagmaBackward, MagmaColumnwise,
                             mi, ni, ib,
                             dwork, 0, i__4,
                             dwork, i__4*ib, ib,
                             dc, dc_offset+(1+m), m,
                             dwork, (i__4*ib + ib*ib), ldwork, queue);
        }

        magma_dgetmatrix( m, n, dc, dc_offset+(1+m), m, &c[c_offset], ldc, queue );
    }
    work[0] = MAGMA_D_MAKE( lwkopt, 0 );

    //dc += (1 + m);
    magma_free( dc );
    magma_free( dwork );

    return *info;
} /* magma_dormql */
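/* ------------------------------------------------------------------------
   Illustrative usage sketch (not part of the library): the LWORK = -1
   workspace-query protocol documented above. A first call with lwork = -1
   only writes the optimal size into work[0]; the real call follows. The
   helper name is hypothetical, and a, tau, c, and queue are assumed to be
   already set up; the MAGMA headers included by this file supply the
   types and the MAGMA_D_REAL / magma_dmalloc_cpu helpers.
   ------------------------------------------------------------------------ */
static magma_int_t
example_apply_ql( magma_side_t side, magma_trans_t trans,
                  magma_int_t m, magma_int_t n, magma_int_t k,
                  double *a, magma_int_t lda, double *tau,
                  double *c, magma_int_t ldc, magma_queue_t queue )
{
    magma_int_t info;
    double query;
    /* workspace query: only work[0] is written */
    magma_dormql( side, trans, m, n, k, a, lda, tau, c, ldc,
                  &query, -1, queue, &info );
    if (info != 0)
        return info;
    magma_int_t lwork = (magma_int_t) MAGMA_D_REAL( query );
    double *work;
    if (MAGMA_SUCCESS != magma_dmalloc_cpu( &work, lwork ))
        return MAGMA_ERR_HOST_ALLOC;
    /* actual application of Q (or Q**T) to C */
    magma_dormql( side, trans, m, n, k, a, lda, tau, c, ldc,
                  work, lwork, queue, &info );
    magma_free_cpu( work );
    return info;
}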
extern "C" magma_int_t
magma_dormqr2_gpu(const char side, const char trans,
                  magma_int_t m, magma_int_t n, magma_int_t k,
                  double *da,  magma_int_t ldda,
                  double *tau,
                  double *dc,  magma_int_t lddc,
                  double *wa,  magma_int_t ldwa,
                  magma_int_t *info)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    DORMQR overwrites the general real M-by-N matrix C with

                    SIDE = 'L'     SIDE = 'R'
    TRANS = 'N':      Q * C          C * Q
    TRANS = 'T':      Q**T * C       C * Q**T

    where Q is a real orthogonal matrix defined as the product of k
    elementary reflectors

          Q = H(1) H(2) . . . H(k)

    as returned by DGEQRF. Q is of order M if SIDE = 'L' and of order N
    if SIDE = 'R'.

    Arguments
    =========
    SIDE    (input) CHARACTER*1
            = 'L': apply Q or Q**T from the Left;
            = 'R': apply Q or Q**T from the Right.

    TRANS   (input) CHARACTER*1
            = 'N': No transpose, apply Q;
            = 'T': Transpose, apply Q**T.

    M       (input) INTEGER
            The number of rows of the matrix C. M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix C. N >= 0.

    K       (input) INTEGER
            The number of elementary reflectors whose product defines
            the matrix Q.
            If SIDE = 'L', M >= K >= 0;
            if SIDE = 'R', N >= K >= 0.

    DA      (input) DOUBLE_PRECISION array on the GPU, dimension (LDDA,K)
            The i-th column must contain the vector which defines the
            elementary reflector H(i), for i = 1,2,...,k, as returned by
            DGEQRF in the first k columns of its array argument DA.
            The diagonal and the upper part are destroyed, the reflectors
            are not modified.

    LDDA    (input) INTEGER
            The leading dimension of the array DA.
            LDDA >= max(1,M) if SIDE = 'L'; LDDA >= max(1,N) if SIDE = 'R'.

    TAU     (input) DOUBLE_PRECISION array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by DGEQRF.

    DC      (device input/output) DOUBLE_PRECISION array, dimension (LDDC,N)
            On entry, the M-by-N matrix C.
            On exit, C is overwritten by (Q*C) or (Q**T * C) or
            (C * Q**T) or (C*Q).

    LDDC    (input) INTEGER
            The leading dimension of the array DC. LDDC >= max(1,M).

    WA      (input/workspace) DOUBLE_PRECISION array, dimension
            (LDWA,M) if SIDE = 'L'
            (LDWA,N) if SIDE = 'R'
            The vectors which define the elementary reflectors, as
            returned by DSYTRD_GPU.

    LDWA    (input) INTEGER
            The leading dimension of the array WA.
            LDWA >= max(1,M) if SIDE = 'L'; LDWA >= max(1,N) if SIDE = 'R'.

    INFO    (output) INTEGER
            = 0: successful exit
            < 0: if INFO = -i, the i-th argument had an illegal value
    =====================================================================   */

    char side_[2]  = {side, 0};
    char trans_[2] = {trans, 0};

    /* Allocate work space on the GPU */
    double *dwork;

    magma_int_t wa_offset, dc_offset, i__4, lddwork;
    magma_int_t i;
    double t[2*4160]        /* was [65][64] */;
    magma_int_t i1, i2, step, ib, ic, jc, nb, mi, ni, nq, nw;
    int left, notran;

    wa_offset = 1 + ldwa;
    wa -= wa_offset;
    --tau;
    dc_offset = 1 + lddc;
    dc -= dc_offset;

    *info  = 0;
    left   = lapackf77_lsame(side_,  "L");
    notran = lapackf77_lsame(trans_, "N");

    /* NQ is the order of Q and NW is the minimum dimension of WORK */
    if (left) {
        nq = m;
        nw = n;
        magma_dmalloc( &dwork, (n + 64)*64 );
    } else {
        nq = n;
        nw = m;
        magma_dmalloc( &dwork, (m + 64)*64 );
    }
    if (! left && ! lapackf77_lsame(side_, "R")) {
        *info = -1;
    } else if (! notran && !
lapackf77_lsame(trans_, "T")) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (ldda < max(1,nq)) { *info = -7; } else if (lddc < max(1,m)) { *info = -10; } else if (ldwa < max(1,nq)) { *info = -12; } // size of the block nb = 64; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { return *info; } /* Use hybrid CPU-GPU code */ if ( ( left && (! notran) ) || ( (! left) && notran ) ) { i1 = 1; i2 = k; step = nb; } else { i1 = ((k - 1)/nb)*nb + 1; i2 = 1; step = -nb; } // silence "uninitialized" warnings mi = 0; ni = 0; if (left) { ni = n; jc = 1; } else { mi = m; ic = 1; } magmablas_dsetdiag1subdiag0('L', k, nb, da, ldda); // for i=i1 to i2 by step for (i = i1; (step < 0 ? i >= i2 : i <= i2); i += step) { ib = min(nb, k - i + 1); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ i__4 = nq - i + 1; lapackf77_dlarft("F", "C", &i__4, &ib, &wa[i + i*ldwa], &ldwa, &tau[i], t, &ib); if (left) { /* H or H' is applied to C(i:m,1:n) */ mi = m - i + 1; ic = i; } else { /* H or H' is applied to C(1:m,i:n) */ ni = n - i + 1; jc = i; } if (left) lddwork = ni; else lddwork = mi; /* Apply H or H'; First copy T to the GPU */ magma_dsetmatrix( ib, ib, t, ib, dwork, ib ); magma_dlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, da + (i - 1) + (i - 1)*ldda , ldda, dwork, ib, &dc[ic + jc*lddc], lddc, dwork + ib*ib, lddwork); } magma_free( dwork ); return *info; } /* magma_dormqr */
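/* ------------------------------------------------------------------------
   Illustrative sketch (not part of the library) of the panel schedule used
   above: for Q = H(1)...H(k) from DGEQRF, applying Q**T*C from the left
   (or C*Q from the right) walks the nb-wide panels forward, while Q*C from
   the left (or C*Q**T from the right) walks them backward. This standalone
   helper with a hypothetical name just prints the (i, ib) pairs the loop
   in magma_dormqr2_gpu would visit.
   ------------------------------------------------------------------------ */
#include <stdio.h>
static void
example_panel_schedule( int k, int nb, int forward )
{
    int i1, i2, step;
    if (forward) { i1 = 1;  i2 = k;  step = nb; }
    else         { i1 = ((k - 1)/nb)*nb + 1;  i2 = 1;  step = -nb; }
    /* same loop bounds as the blocked code above */
    for (int i = i1; (step < 0 ? i >= i2 : i <= i2); i += step) {
        int ib = (nb < k - i + 1 ? nb : k - i + 1);   /* min(nb, k-i+1) */
        printf( "panel starts at reflector %d, width %d\n", i, ib );
    }
}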
extern "C" magma_int_t
magma_dgetrf_gpu( magma_int_t m, magma_int_t n,
                  magmaDouble_ptr dA, size_t dA_offset, magma_int_t ldda,
                  magma_int_t *ipiv,
                  magma_queue_t queue,
                  magma_int_t *info )
{
/*  -- clMAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date November 2014

    Purpose
    =======
    DGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
       A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A. M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A. N >= 0.

    A       (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    LDDA    (input) INTEGER
            The leading dimension of the array A. LDDA >= max(1,M).

    IPIV    (output) INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    INFO    (output) INTEGER
            = 0: successful exit
            < 0: if INFO = -i, the i-th argument had an illegal value
                 or another error occurred, such as memory allocation failed.
            > 0: if INFO = i, U(i,i) is exactly zero. The factorization
                 has been completed, but the factor U is exactly
                 singular, and division by zero will occur if it is used
                 to solve a system of equations.
    =====================================================================   */

    #define dA(i_, j_)  dA,  dA_offset  + (i_)*nb       + (j_)*nb*ldda
    #define dAT(i_, j_) dAT, dAT_offset + (i_)*nb*lddat + (j_)*nb
    #define dAP(i_, j_) dAP, (i_) + (j_)*maxm
    #define work(i_)    (work + (i_))

    double c_one     = MAGMA_D_ONE;
    double c_neg_one = MAGMA_D_NEG_ONE;

    magma_int_t iinfo, nb;
    magma_int_t maxm, maxn, mindim;
    magma_int_t i, j, rows, s, lddat, ldwork;
    magmaDouble_ptr dAT, dAP;
    double *work;
    size_t dAT_offset;

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (ldda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    nb     = magma_get_dgetrf_nb(m);
    s      = mindim / nb;

    if (nb <= 1 || nb >= min(m,n)) {
        /* Use CPU code. */
        if ( MAGMA_SUCCESS != magma_dmalloc_cpu( &work, m*n )) {
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        magma_dgetmatrix( m, n, dA(0,0), ldda, work(0), m, queue );
        lapackf77_dgetrf( &m, &n, work, &m, ipiv, info );
        magma_dsetmatrix( m, n, work(0), m, dA(0,0), ldda, queue );
        magma_free_cpu( work );
    }
    else {
        /* Use hybrid blocked code.
*/ maxm = ((m + 31)/32)*32; maxn = ((n + 31)/32)*32; if ( MAGMA_SUCCESS != magma_dmalloc( &dAP, nb*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } // square matrices can be done in place; // rectangular requires copy to transpose if ( m == n ) { dAT = dA; dAT_offset = dA_offset; lddat = ldda; magmablas_dtranspose_inplace( m, dAT(0,0), lddat, queue ); } else { lddat = maxn; // N-by-M dAT_offset = 0; if ( MAGMA_SUCCESS != magma_dmalloc( &dAT, lddat*maxm )) { magma_free( dAP ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magmablas_dtranspose( m, n, dA(0,0), ldda, dAT(0,0), lddat, queue ); } ldwork = maxm; if ( MAGMA_SUCCESS != magma_dmalloc_cpu( &work, ldwork*nb )) { magma_free( dAP ); if ( dA != dAT ) magma_free( dAT ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } for( j=0; j < s; j++ ) { // download j-th panel magmablas_dtranspose( nb, m-j*nb, dAT(j,j), lddat, dAP(0,0), maxm, queue ); magma_dgetmatrix( m-j*nb, nb, dAP(0,0), maxm, work(0), ldwork, queue ); if ( j > 0 ){ magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n - (j+1)*nb, nb, c_one, dAT(j-1,j-1), lddat, dAT(j-1,j+1), lddat, queue ); magma_dgemm( MagmaNoTrans, MagmaNoTrans, n-(j+1)*nb, m-j*nb, nb, c_neg_one, dAT(j-1,j+1), lddat, dAT(j, j-1), lddat, c_one, dAT(j, j+1), lddat, queue ); } // do the cpu part rows = m - j*nb; lapackf77_dgetrf( &rows, &nb, work, &ldwork, ipiv+j*nb, &iinfo ); if ( *info == 0 && iinfo > 0 ) *info = iinfo + j*nb; for( i=j*nb; i < j*nb + nb; ++i ) { ipiv[i] += j*nb; } magmablas_dlaswp( n, dAT(0,0), lddat, j*nb + 1, j*nb + nb, ipiv, 1, queue ); // upload j-th panel magma_dsetmatrix( m-j*nb, nb, work(0), ldwork, dAP(0,0), maxm, queue ); magmablas_dtranspose( m-j*nb, nb, dAP(0,0), maxm, dAT(j,j), lddat, queue ); // do the small non-parallel computations (next panel update) if ( s > (j+1) ) { magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb, nb, c_one, dAT(j, j ), lddat, dAT(j, j+1), lddat, queue ); magma_dgemm( MagmaNoTrans, MagmaNoTrans, nb, m-(j+1)*nb, nb, c_neg_one, dAT(j, j+1), lddat, dAT(j+1, j ), lddat, c_one, dAT(j+1, j+1), lddat, queue ); } else { magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb, nb, c_one, dAT(j, j ), lddat, dAT(j, j+1), lddat, queue ); magma_dgemm( MagmaNoTrans, MagmaNoTrans, n-(j+1)*nb, m-(j+1)*nb, nb, c_neg_one, dAT(j, j+1), lddat, dAT(j+1, j ), lddat, c_one, dAT(j+1, j+1), lddat, queue ); } } magma_int_t nb0 = min( m - s*nb, n - s*nb ); if ( nb0 > 0 ) { rows = m - s*nb; magmablas_dtranspose( nb0, rows, dAT(s,s), lddat, dAP(0,0), maxm, queue ); magma_dgetmatrix( rows, nb0, dAP(0,0), maxm, work(0), ldwork, queue ); // do the cpu part lapackf77_dgetrf( &rows, &nb0, work, &ldwork, ipiv+s*nb, &iinfo ); if ( *info == 0 && iinfo > 0 ) *info = iinfo + s*nb; for( i=s*nb; i < s*nb + nb0; ++i ) { ipiv[i] += s*nb; } magmablas_dlaswp( n, dAT(0,0), lddat, s*nb + 1, s*nb + nb0, ipiv, 1, queue ); // upload j-th panel magma_dsetmatrix( rows, nb0, work(0), ldwork, dAP(0,0), maxm, queue ); magmablas_dtranspose( rows, nb0, dAP(0,0), maxm, dAT(s,s), lddat, queue ); magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb-nb0, nb0, c_one, dAT(s,s), lddat, dAT(s,s)+nb0, lddat, queue ); } // undo transpose if ( dA == dAT ) { magmablas_dtranspose_inplace( m, dAT(0,0), lddat, queue ); } else { magmablas_dtranspose( n, m, dAT(0,0), lddat, dA(0,0), ldda, queue ); magma_free( dAT ); } magma_free( dAP ); magma_free_cpu( work ); } return *info; } /* magma_dgetrf_gpu */
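/* ------------------------------------------------------------------------
   Note on the pivot adjustment in the factorization above (illustrative,
   hypothetical helper): lapackf77_dgetrf returns 1-based pivot rows local
   to the panel, so the loop  ipiv[i] += j*nb  shifts them by the panel's
   global row offset before magmablas_dlaswp applies the row swaps.
   ------------------------------------------------------------------------ */
static void
example_globalize_pivots( magma_int_t *ipiv, magma_int_t panel_first_row,
                          magma_int_t panel_rows )
{
    /* same shift as "ipiv[i] += j*nb" in magma_dgetrf_gpu above */
    for (magma_int_t i = 0; i < panel_rows; ++i)
        ipiv[i] += panel_first_row;
}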
extern "C" void
magma_dbulge_applyQ(
    magma_int_t WANTZ, magma_side_t SIDE, magma_int_t NE, magma_int_t N,
    magma_int_t NB, magma_int_t Vblksiz, double *E, magma_int_t LDE,
    double *V, double *TAU, double *T, magma_int_t *INFO,
    double *dV, double *dT, double *dE, magma_int_t copytype )
{
    //%===========================
    //% local variables
    //%===========================
    double c_zero = MAGMA_D_ZERO;
    double c_one  = MAGMA_D_ONE;

    magma_int_t LDT, LDV, blklen, firstcolj;
    magma_int_t bg, nbGblk, rownbm, k, m, n;
    magma_int_t st, ed, fst, vlen, vnb, colj, len;
    magma_int_t blkid, vpos, taupos, tpos;
    //double *WORK;
    magma_int_t LWORK;
    magma_int_t cur_blksiz, avai_blksiz, ncolinvolvd;
    magma_int_t nbgr, colst, coled, versionL, versionR;
    magma_int_t blkcnt=-1;

    *INFO=0;
    versionL = 113;
    versionR = 92;
    LDT      = Vblksiz;
    LDV      = NB+Vblksiz-1;
    blklen   = LDV*Vblksiz;
    nbGblk   = plasma_ceildiv((N-1), Vblksiz);
    //magma_dmalloc_cpu( &WORK, LWORK );

    /* find the size of the matrices T and V */
    findVTsiz(N, NB, Vblksiz, &blkcnt, &LDV);

    /* Copy E & V & T to the GPU in dE and dV and dT
     * depending on copytype:
     * 1: means copy only V
     * 2: means copy V and T
     * 3: means copy V, T and E
     * */
    if (copytype > 0) magma_dsetmatrix( LDV, blkcnt*Vblksiz, V, LDV, dV, LDV );
    if (copytype > 1) magma_dsetmatrix( LDT, blkcnt*Vblksiz, T, LDT, dT, LDT );
    if (copytype > 2) magma_dsetmatrix( N, NE, E, N, dE, N );

    double *dwork;
    magma_int_t ldwork;
    ldwork = NE;
    LWORK  = 2*N*max(Vblksiz, 64);
    if (MAGMA_SUCCESS != magma_dmalloc( &dwork, LWORK )) {
        printf ("!!!!  magma_dbulge_applyQ magma_alloc failed for: dwork\n" );
        exit(-1);
    }

    /* SIDE LEFT  means apply E = Q*E = (q_1*q_2*.....*q_n) * E ==> so
     *            traverse the Vs in reverse order (forward) from q_n to q_1.
     *            Also E is split by rows, meaning each apply consists of a
     *            block of rows (horizontal block). */
    /* SIDE RIGHT means apply E = E*Q = E * (q_1*q_2*.....*q_n) ==> so
     *            traverse the Vs in normal order (forward) from q_1 to q_n.
     *            Also E is split by columns, meaning each apply consists of
     *            a block of columns (vertical block). */
    /* WANTZ = 1 means E is IDENTITY, so form Q using an optimized update:
     *           use the reverse order from small q to large one, i.e. from
     *           q_n to q_1, so Left update to Identity.
     *           Use versionL 113 because in 114 we need to update the whole
     *           matrix and not in increasing order.
     * WANTZ = 2 means E is a full matrix and needs to be updated from the
     *           Left or Right, so use the normal update.
     * */
    if (WANTZ == 1) {
        versionL=113;
        SIDE = MagmaLeft;
        //set the matrix to Identity here to avoid copying it from the CPU
        magmablas_dlaset( MagmaFull, N, N, c_zero, c_one, dE, N );
    }

    printf(" APPLY Q_v115 GPU with N %d NB %d Vblksiz %d SIDE %c versionL %d versionR %d WANTZ %d \n",
           (int) N, (int) NB, (int) Vblksiz, (SIDE == MagmaLeft ? 'L' : 'R'),
           (int) versionL, (int) versionR, (int) WANTZ);

#if defined(USESTREAM)
    magma_int_t N2=N/2;
    magma_int_t N1=N-N2;
    printf("using stream\n");
    magma_queue_t stream[2];
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );
#endif

    if (SIDE == MagmaLeft) {
        if (versionL == 113) {
            for (bg = nbGblk; bg > 0; bg--) {
                firstcolj = (bg-1)*Vblksiz + 1;
                if (bg == nbGblk)
                    rownbm = plasma_ceildiv((N-(firstcolj)), NB);    // last blk has size=1 used for real to handle A(N,N-1)
                else
                    rownbm = plasma_ceildiv((N-(firstcolj+1)), NB);
                for (m = rownbm; m > 0; m--) {
                    vlen = 0;
                    vnb  = 0;
                    colj = (bg-1)*Vblksiz; // for k=0; I compute the fst and then can remove it from the loop
                    fst  = (rownbm -m)*NB+colj +1;
                    for (k=0; k < Vblksiz; k++) {
                        colj = (bg-1)*Vblksiz + k;
                        st   = (rownbm -m)*NB+colj +1;
                        ed   = min(st+NB-1, N-1);
                        if (st > ed)
                            break;
                        if ((st == ed) && (colj != N-2))
                            break;
                        vlen=ed-fst+1;
                        vnb=k+1;
                    }
                    colst = (bg-1)*Vblksiz;
                    findVTpos(N, NB, Vblksiz, colst, fst, &vpos, &taupos, &tpos, &blkid);
                    printf("here bg %d m %d vlen %d vnb %d fcolj %d vpos %d taupos %d \n",
                           (int) bg, (int) m, (int) vlen, (int) vnb, (int) colst+1, (int) vpos+1, (int) taupos+1);
                    if ((vlen > 0) && (vnb > 0)) {
                        if (WANTZ == 1) {
                            len = N-colst;
                            magma_dlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise,
                                              vlen, len, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,colst), LDE, dwork, len);
                        } else {
                            magma_dlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise,
                                              vlen, NE, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,0), LDE, dwork, NE);
                        }
                    }
                }
            }
        }
        else if (versionL == 114) {
            rownbm = plasma_ceildiv((N-1), NB);
            for (m = rownbm; m > 0; m--) {
                ncolinvolvd = min(N-1, m*NB);
                avai_blksiz = min(Vblksiz, ncolinvolvd);
                nbgr = plasma_ceildiv(ncolinvolvd, avai_blksiz);
                for (n = nbgr; n > 0; n--) {
                    vlen = 0;
                    vnb  = 0;
                    cur_blksiz = min(ncolinvolvd-(n-1)*avai_blksiz, avai_blksiz);
                    colst = (n-1)*avai_blksiz;
                    coled = colst + cur_blksiz -1;
                    fst   = (rownbm -m)*NB+colst +1;
                    for (colj=colst; colj <= coled; colj++) {
                        st = (rownbm -m)*NB+colj +1;
                        ed = min(st+NB-1, N-1);
                        if (st > ed)
                            break;
                        if ((st == ed) && (colj != N-2))
                            break;
                        vlen=ed-fst+1;
                        vnb=vnb+1;
                    }
                    findVTpos(N, NB, Vblksiz, colst, fst, &vpos, &taupos, &tpos, &blkid);
                    //printf("here bg %d m %d vlen %d vnb %d fcolj %d vpos %d taupos %d \n", bg, m, vlen, vnb, colst+1, vpos+1, taupos+1);
                    if ((vlen > 0) && (vnb > 0)) {
#if defined(USESTREAM)
                        magmablasSetKernelStream(stream[0]);
                        magma_dlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise,
                                          vlen, N1, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,0), LDE, dwork, N1);
                        magmablasSetKernelStream(stream[1]);
                        magma_dlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise,
                                          vlen, N2, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,N1), LDE, &dwork[N1*Vblksiz], N2);
#else
                        magma_dlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise,
                                          vlen, NE, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,0), LDE, dwork, NE);
#endif
                    }
                }
            }
        }
    }
    else if (SIDE == MagmaRight) {
        if (versionR == 91) {
            for (bg =1; bg <= nbGblk; bg++) {
                firstcolj = (bg-1)*Vblksiz + 1;
                rownbm    = plasma_ceildiv((N-(firstcolj+1)), NB);
                if (bg == nbGblk)
                    rownbm =
plasma_ceildiv((N-(firstcolj)), NB); // last blk has size=1 used for real to handle A(N,N-1) for (m = 1; m <= rownbm; m++) { vlen = 0; vnb = 0; // for k=0; I compute the fst and then can remove it from the loop colj = (bg-1)*Vblksiz; fst = (rownbm -m)*NB+colj +1; for (k=0; k < Vblksiz; k++) { colj = (bg-1)*Vblksiz + k; st = (rownbm -m)*NB+colj +1; ed = min(st+NB-1, N-1); if (st > ed) break; if ((st == ed) && (colj != N-2)) break; vlen=ed-fst+1; vnb=k+1; } colj = (bg-1)*Vblksiz; findVTpos(N, NB, Vblksiz, colj, fst, &vpos, &taupos, &tpos, &blkid); //printf("voici bg %d m %d vlen %d vnb %d fcolj %d vpos %d taupos %d \n", bg, m, vlen, vnb, colj, vpos, taupos); if ((vlen > 0) && (vnb > 0)) { #if defined(USESTREAM) magmablasSetKernelStream(stream[0]); magma_dlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N1, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, N1); magmablasSetKernelStream(stream[1]); magma_dlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N2, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(N1, fst), LDE, &dwork[N1*Vblksiz], N2); #else magma_dlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, NE); #endif } } } } else if (versionR == 92) { rownbm = plasma_ceildiv((N-1), NB); for (m = 1; m <= rownbm; m++) { ncolinvolvd = min(N-1, m*NB); avai_blksiz=min(Vblksiz, ncolinvolvd); nbgr = plasma_ceildiv(ncolinvolvd, avai_blksiz); for (n = 1; n <= nbgr; n++) { vlen = 0; vnb = 0; cur_blksiz = min(ncolinvolvd-(n-1)*avai_blksiz, avai_blksiz); colst = (n-1)*avai_blksiz; coled = colst + cur_blksiz -1; fst = (rownbm -m)*NB+colst +1; for (colj=colst; colj <= coled; colj++) { st = (rownbm -m)*NB+colj +1; ed = min(st+NB-1, N-1); if (st > ed) break; if ((st == ed) && (colj != N-2)) break; vlen=ed-fst+1; vnb=vnb+1; } findVTpos(N, NB, Vblksiz, colst, fst, &vpos, &taupos, &tpos, &blkid); if ((vlen > 0) && (vnb > 0)) { #if defined(USESTREAM) magmablasSetKernelStream(stream[0]); magma_dlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N1, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, N1); magmablasSetKernelStream(stream[1]); magma_dlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N2, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(N1, fst), LDE, &dwork[N1*Vblksiz], N2); #else magma_dlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, NE); #endif } } } } } else { printf("ERROR SIDE %d\n", SIDE); } #if defined(USESTREAM) magmablasSetKernelStream(NULL); magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); #endif }
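/* ------------------------------------------------------------------------
   The block counts above come from plasma_ceildiv, i.e. integer ceiling
   division. A minimal equivalent, under the assumption that the PLASMA
   helper has exactly this semantics for positive operands:
   ------------------------------------------------------------------------ */
static magma_int_t
example_ceildiv( magma_int_t a, magma_int_t b )
{
    return (a + b - 1) / b;    /* e.g. example_ceildiv(10, 4) == 3 */
}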
/* //////////////////////////////////////////////////////////////////////////// -- Testing dpotrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double *h_A, *h_R; magmaDouble_ptr d_A; magma_int_t N, n2, lda, ldda, info; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double work[1], error; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) double tol = opts.tolerance * lapackf77_dlamch("E"); printf("uplo = %s\n", lapack_uplo_const(opts.uplo) ); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma - R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; gflops = FLOPS_DPOTRF( N ) / 1e9; TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); magma_dmake_hpd( N, h_A, lda ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_dsetmatrix( N, N, h_A, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_dpotrf_gpu( opts.uplo, N, d_A, ldda, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dpotrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_dpotrf( lapack_uplo_const(opts.uplo), &N, h_A, &lda, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_dgetmatrix( N, N, d_A, ldda, h_R, lda ); error = lapackf77_dlange("f", &N, &N, h_A, &lda, work); blasf77_daxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_dlange("f", &N, &N, h_R, &lda, work) / error; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); } else { printf("%5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) N, gpu_perf, gpu_time ); } TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
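/* ------------------------------------------------------------------------
   The tester above pads the device leading dimension with
   ldda = ((N+31)/32)*32 so that each column starts on a 32-element
   boundary, which helps keep GPU memory accesses aligned. Generic form of
   the same round-up (illustrative, hypothetical helper):
   ------------------------------------------------------------------------ */
static magma_int_t
example_roundup32( magma_int_t x )
{
    return ((x + 31) / 32) * 32;    /* example_roundup32(100) == 128 */
}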
/**
    Purpose
    -------
    DSYEVDX computes selected eigenvalues and, optionally, eigenvectors
    of a real symmetric matrix A. Eigenvalues and eigenvectors can be
    selected by specifying either a range of values or a range of
    indices for the desired eigenvalues.
    If eigenvectors are desired, it uses a divide and conquer algorithm.

    The divide and conquer algorithm makes very mild assumptions about
    floating point arithmetic. It will work on machines with a guard
    digit in add/subtract, or on those binary machines without guard
    digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
    Cray-2. It could conceivably fail on hexadecimal or decimal machines
    without guard digits, but we know of none.

    Arguments
    ---------
    @param[in]
    jobz    magma_vec_t
      -     = MagmaNoVec:  Compute eigenvalues only;
      -     = MagmaVec:    Compute eigenvalues and eigenvectors.

    @param[in]
    range   magma_range_t
      -     = MagmaRangeAll: all eigenvalues will be found.
      -     = MagmaRangeV:   all eigenvalues in the half-open interval
                             (VL,VU] will be found.
      -     = MagmaRangeI:   the IL-th through IU-th eigenvalues will
                             be found.

    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of A is stored;
      -     = MagmaLower:  Lower triangle of A is stored.

    @param[in]
    n       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in,out]
    dA      DOUBLE_PRECISION array on the GPU, dimension (LDDA, N).
            On entry, the symmetric matrix A.  If UPLO = MagmaUpper, the
            leading N-by-N upper triangular part of A contains the upper
            triangular part of the matrix A.  If UPLO = MagmaLower, the
            leading N-by-N lower triangular part of A contains the lower
            triangular part of the matrix A.
            On exit, if JOBZ = MagmaVec, then if INFO = 0, the first m
            columns of A contain the required orthonormal eigenvectors
            of the matrix A.
            If JOBZ = MagmaNoVec, then on exit the lower triangle
            (if UPLO=MagmaLower) or the upper triangle (if UPLO=MagmaUpper)
            of A, including the diagonal, is destroyed.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array DA.  LDDA >= max(1,N).

    @param[in]
    vl      DOUBLE PRECISION
    @param[in]
    vu      DOUBLE PRECISION
            If RANGE=MagmaRangeV, the lower and upper bounds of the
            interval to be searched for eigenvalues.  VL < VU.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeI.

    @param[in]
    il      INTEGER
    @param[in]
    iu      INTEGER
            If RANGE=MagmaRangeI, the indices (in ascending order) of the
            smallest and largest eigenvalues to be returned.
            1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeV.

    @param[out]
    m       INTEGER
            The total number of eigenvalues found.  0 <= M <= N.
            If RANGE = MagmaRangeAll, M = N, and if RANGE = MagmaRangeI,
            M = IU-IL+1.

    @param[out]
    w       DOUBLE PRECISION array, dimension (N)
            If INFO = 0, the required m eigenvalues in ascending order.

    @param
    wA      (workspace) DOUBLE PRECISION array, dimension (LDWA, N)

    @param[in]
    ldwa    INTEGER
            The leading dimension of the array wA.  LDWA >= max(1,N).

    @param[out]
    work    (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The length of the array WORK.
            If N <= 1,                      LWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LWORK >= 2*N + N*NB.
            If JOBZ = MagmaVec   and N > 1, LWORK >= max( 2*N + N*NB, 1 + 6*N + 2*N**2 ).
            NB can be obtained through magma_get_dsytrd_nb(N).
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal sizes of the WORK and IWORK
            arrays, returns these values as the first entries of the WORK
            and IWORK arrays, and no error message related to LWORK or
            LIWORK is issued by XERBLA.
@param[out] iwork (workspace) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK. @param[in] liwork INTEGER The dimension of the array IWORK. If N <= 1, LIWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LIWORK >= 1. If JOBZ = MagmaVec and N > 1, LIWORK >= 3 + 5*N. \n If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK and IWORK arrays, returns these values as the first entries of the WORK and IWORK arrays, and no error message related to LWORK or LIWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i and JOBZ = MagmaNoVec, then the algorithm failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; if INFO = i and JOBZ = MagmaVec, then the algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1). Further Details --------------- Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified description of INFO. Sven, 16 Feb 05. @ingroup magma_dsyev_driver ********************************************************************/ extern "C" magma_int_t magma_dsyevdx_gpu(magma_vec_t jobz, magma_range_t range, magma_uplo_t uplo, magma_int_t n, double *dA, magma_int_t ldda, double vl, double vu, magma_int_t il, magma_int_t iu, magma_int_t *m, double *w, double *wA, magma_int_t ldwa, double *work, magma_int_t lwork, magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { magma_int_t ione = 1; double d__1; double eps; magma_int_t inde; double anrm; double rmin, rmax; double sigma; magma_int_t iinfo, lwmin; magma_int_t lower; magma_int_t wantz; magma_int_t indwk2, llwrk2; magma_int_t iscale; double safmin; double bignum; magma_int_t indtau; magma_int_t indwrk, liwmin; magma_int_t llwork; double smlnum; magma_int_t lquery; magma_int_t alleig, valeig, indeig; double *dwork; magma_int_t lddc = ldda; wantz = (jobz == MagmaVec); lower = (uplo == MagmaLower); alleig = (range == MagmaRangeAll); valeig = (range == MagmaRangeV); indeig = (range == MagmaRangeI); lquery = (lwork == -1 || liwork == -1); *info = 0; if (! (wantz || (jobz == MagmaNoVec))) { *info = -1; } else if (! (alleig || valeig || indeig)) { *info = -2; } else if (! (lower || (uplo == MagmaUpper))) { *info = -3; } else if (n < 0) { *info = -4; } else if (ldda < max(1,n)) { *info = -6; } else if (ldwa < max(1,n)) { *info = -14; } else { if (valeig) { if (n > 0 && vu <= vl) { *info = -8; } } else if (indeig) { if (il < 1 || il > max(1,n)) { *info = -9; } else if (iu < min(n,il) || iu > n) { *info = -10; } } } magma_int_t nb = magma_get_dsytrd_nb( n ); if ( n <= 1 ) { lwmin = 1; liwmin = 1; } else if ( wantz ) { lwmin = max( 2*n + n*nb, 1 + 6*n + 2*n*n ); liwmin = 3 + 5*n; } else { lwmin = 2*n + n*nb; liwmin = 1; } // multiply by 1+eps (in Double!) to ensure length gets rounded up, // if it cannot be exactly represented in floating point. real_Double_t one_eps = 1. + lapackf77_dlamch("Epsilon"); work[0] = lwmin * one_eps; iwork[0] = liwmin; if ((lwork < lwmin) && !lquery) { *info = -16; } else if ((liwork < liwmin) && ! 
lquery) { *info = -18; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */ if (n <= 128) { #ifdef ENABLE_DEBUG printf("--------------------------------------------------------------\n"); printf(" warning matrix too small N=%d NB=%d, calling lapack on CPU \n", (int) n, (int) nb); printf("--------------------------------------------------------------\n"); #endif const char* jobz_ = lapack_vec_const( jobz ); const char* uplo_ = lapack_uplo_const( uplo ); double *A; magma_dmalloc_cpu( &A, n*n ); magma_dgetmatrix(n, n, dA, ldda, A, n); lapackf77_dsyevd(jobz_, uplo_, &n, A, &n, w, work, &lwork, iwork, &liwork, info); magma_dsetmatrix( n, n, A, n, dA, ldda); magma_free_cpu(A); return *info; } magma_queue_t stream; magma_queue_create( &stream ); // n*lddc for dsytrd2_gpu // n for dlansy magma_int_t ldwork = n*lddc; if ( wantz ) { // need 3n^2/2 for dstedx ldwork = max( ldwork, 3*n*(n/2 + 1)); } if (MAGMA_SUCCESS != magma_dmalloc( &dwork, ldwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* Get machine constants. */ safmin = lapackf77_dlamch("Safe minimum"); eps = lapackf77_dlamch("Precision"); smlnum = safmin / eps; bignum = 1. / smlnum; rmin = magma_dsqrt(smlnum); rmax = magma_dsqrt(bignum); /* Scale matrix to allowable range, if necessary. */ anrm = magmablas_dlansy(MagmaMaxNorm, uplo, n, dA, ldda, dwork); iscale = 0; sigma = 1; if (anrm > 0. && anrm < rmin) { iscale = 1; sigma = rmin / anrm; } else if (anrm > rmax) { iscale = 1; sigma = rmax / anrm; } if (iscale == 1) { magmablas_dlascl(uplo, 0, 0, 1., sigma, n, n, dA, ldda, info); } /* Call DSYTRD to reduce symmetric matrix to tridiagonal form. */ // dsytrd work: e (n) + tau (n) + llwork (n*nb) ==> 2n + n*nb // dstedx work: e (n) + tau (n) + z (n*n) + llwrk2 (1 + 4*n + n^2) ==> 1 + 6n + 2n^2 inde = 0; indtau = inde + n; indwrk = indtau + n; indwk2 = indwrk + n*n; llwork = lwork - indwrk; llwrk2 = lwork - indwk2; magma_timer_t time=0; timer_start( time ); #ifdef FAST_SYMV magma_dsytrd2_gpu(uplo, n, dA, ldda, w, &work[inde], &work[indtau], wA, ldwa, &work[indwrk], llwork, dwork, n*lddc, &iinfo); #else magma_dsytrd_gpu(uplo, n, dA, ldda, w, &work[inde], &work[indtau], wA, ldwa, &work[indwrk], llwork, &iinfo); #endif timer_stop( time ); timer_printf( "time dsytrd = %6.2f\n", time ); /* For eigenvalues only, call DSTERF. For eigenvectors, first call DSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the tridiagonal matrix, then call DORMTR to multiply it to the Householder transformations represented as Householder vectors in A. */ if (! wantz) { lapackf77_dsterf(&n, w, &work[inde], info); magma_dmove_eig(range, n, w, &il, &iu, vl, vu, m); } else { timer_start( time ); magma_dstedx(range, n, vl, vu, il, iu, w, &work[inde], &work[indwrk], n, &work[indwk2], llwrk2, iwork, liwork, dwork, info); timer_stop( time ); timer_printf( "time dstedx = %6.2f\n", time ); timer_start( time ); magma_dmove_eig(range, n, w, &il, &iu, vl, vu, m); magma_dsetmatrix( n, *m, &work[indwrk + n* (il-1) ], n, dwork, lddc ); magma_dormtr_gpu(MagmaLeft, uplo, MagmaNoTrans, n, *m, dA, ldda, &work[indtau], dwork, lddc, wA, ldwa, &iinfo); magma_dcopymatrix( n, *m, dwork, lddc, dA, ldda ); timer_stop( time ); timer_printf( "time dormtr + copy = %6.2f\n", time ); } /* If matrix was scaled, then rescale eigenvalues appropriately. */ if (iscale == 1) { d__1 = 1. 
/ sigma; blasf77_dscal(&n, &d__1, w, &ione); } work[0] = lwmin * one_eps; // round up iwork[0] = liwmin; magma_queue_destroy( stream ); magma_free( dwork ); return *info; } /* magma_dsyevd_gpu */
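/* ------------------------------------------------------------------------
   Illustrative helper (not part of the library, hypothetical name)
   restating the minimum workspace formulas that magma_dsyevdx_gpu
   documents and checks above; magma_get_dsytrd_nb supplies the block
   size NB.
   ------------------------------------------------------------------------ */
static void
example_dsyevdx_worksizes( magma_int_t n, int wantz,
                           magma_int_t *lwmin, magma_int_t *liwmin )
{
    magma_int_t nb = magma_get_dsytrd_nb( n );
    if (n <= 1) {
        *lwmin  = 1;
        *liwmin = 1;
    }
    else if (wantz) {
        *lwmin  = max( 2*n + n*nb, 1 + 6*n + 2*n*n );   /* dsytrd + dstedx */
        *liwmin = 3 + 5*n;
    }
    else {
        *lwmin  = 2*n + n*nb;                           /* dsytrd only */
        *liwmin = 1;
    }
}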
int main(int argc, char **argv)
{
    TESTING_INIT();

    real_Double_t   gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf, cpu_time;
    double          magma_error, cublas_error, work[1];
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t M, N, Xm, Ym, lda, sizeA, sizeX, sizeY;
    magma_int_t incx = 1;
    magma_int_t incy = 1;
    double c_neg_one = MAGMA_D_NEG_ONE;
    double alpha = MAGMA_D_MAKE(  1.5, -2.3 );
    double beta  = MAGMA_D_MAKE( -0.6,  0.8 );
    double *A, *X, *Y, *Ycublas, *Ymagma;
    double *dA, *dX, *dY;
    magma_int_t status = 0;

    magma_opts opts;
    parse_opts( argc, argv, &opts );

    double tol = opts.tolerance * lapackf77_dlamch("E");

    printf("trans = %s\n", lapack_trans_const(opts.transA) );
    printf("    M     N   MAGMA Gflop/s (ms)  CUBLAS Gflop/s (ms)   CPU Gflop/s (ms)  MAGMA error  CUBLAS error\n");
    printf("===================================================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[itest];
            N = opts.nsize[itest];
            lda    = ((M+31)/32)*32;
            gflops = FLOPS_DGEMV( M, N ) / 1e9;

            if ( opts.transA == MagmaNoTrans ) {
                Xm = N;
                Ym = M;
            } else {
                Xm = M;
                Ym = N;
            }

            sizeA = lda*N;
            sizeX = incx*Xm;
            sizeY = incy*Ym;

            TESTING_MALLOC_CPU( A,       double, sizeA );
            TESTING_MALLOC_CPU( X,       double, sizeX );
            TESTING_MALLOC_CPU( Y,       double, sizeY );
            TESTING_MALLOC_CPU( Ycublas, double, sizeY );
            TESTING_MALLOC_CPU( Ymagma,  double, sizeY );

            TESTING_MALLOC_DEV( dA, double, sizeA );
            TESTING_MALLOC_DEV( dX, double, sizeX );
            TESTING_MALLOC_DEV( dY, double, sizeY );

            /* Initialize the matrix */
            lapackf77_dlarnv( &ione, ISEED, &sizeA, A );
            lapackf77_dlarnv( &ione, ISEED, &sizeX, X );
            lapackf77_dlarnv( &ione, ISEED, &sizeY, Y );

            /* =====================================================================
               Performs operation using CUBLAS
               =================================================================== */
            magma_dsetmatrix( M, N, A, lda, dA, lda );
            magma_dsetvector( Xm, X, incx, dX, incx );
            magma_dsetvector( Ym, Y, incy, dY, incy );

            cublas_time = magma_sync_wtime( 0 );
            cublasDgemv( opts.handle, cublas_trans_const(opts.transA), M, N,
                         &alpha, dA, lda, dX, incx, &beta, dY, incy );
            cublas_time = magma_sync_wtime( 0 ) - cublas_time;
            cublas_perf = gflops / cublas_time;

            magma_dgetvector( Ym, dY, incy, Ycublas, incy );

            /* =====================================================================
               Performs operation using MAGMABLAS
               =================================================================== */
            magma_dsetvector( Ym, Y, incy, dY, incy );

            magma_time = magma_sync_wtime( 0 );
            magmablas_dgemv( opts.transA, M, N, alpha, dA, lda, dX, incx, beta, dY, incy );
            magma_time = magma_sync_wtime( 0 ) - magma_time;
            magma_perf = gflops / magma_time;

            magma_dgetvector( Ym, dY, incy, Ymagma, incy );

            /* =====================================================================
               Performs operation using CPU BLAS
               =================================================================== */
            cpu_time = magma_wtime();
            blasf77_dgemv( lapack_trans_const(opts.transA), &M, &N,
                           &alpha, A, &lda, X, &incx, &beta, Y, &incy );
            cpu_time = magma_wtime() - cpu_time;
            cpu_perf = gflops / cpu_time;

            /* =====================================================================
               Check the result
               =================================================================== */
            blasf77_daxpy( &Ym, &c_neg_one, Y, &incy, Ymagma, &incy );
            magma_error = lapackf77_dlange( "M", &Ym, &ione, Ymagma, &Ym, work ) / Ym;

            blasf77_daxpy( &Ym, &c_neg_one, Y, &incy, Ycublas, &incy );
            cublas_error =
lapackf77_dlange( "M", &Ym, &ione, Ycublas, &Ym, work ) / Ym; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error, (magma_error < tol && cublas_error < tol ? "ok" : "failed")); status += ! (magma_error < tol && cublas_error < tol); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ycublas ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
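/* ------------------------------------------------------------------------
   The GEMV check above computes a max-norm error via
   lapackf77_dlange("M", ...) on the difference vector and divides by its
   length Ym. Scalar restatement of the same metric (illustrative,
   hypothetical helper):
   ------------------------------------------------------------------------ */
#include <math.h>
static double
example_gemv_error( const double *y_ref, const double *y_test, int m )
{
    double maxdiff = 0.;
    for (int i = 0; i < m; ++i) {
        double d = fabs( y_test[i] - y_ref[i] );
        if (d > maxdiff)
            maxdiff = d;
    }
    return maxdiff / m;    /* compared against tol = opts.tolerance * eps */
}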
/**
    Purpose
    -------
    DGEQRF3 computes a QR factorization of a real M-by-N matrix A:
    A = Q * R.

    This version stores the triangular dT matrices used in the block QR
    factorization so that they can be applied directly (i.e., without
    being recomputed) later. As a result, the application of Q is much
    faster. Also, the upper triangular matrices for V have 0s in them
    and the corresponding parts of the upper triangular R are stored
    separately in dT.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    dA      DOUBLE_PRECISION array on the GPU, dimension (LDDA,N)
            On entry, the M-by-N matrix A.
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further
            Details).

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).
            To benefit from coalesced memory accesses LDDA must be
            divisible by 16.

    @param[out]
    tau     DOUBLE_PRECISION array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    @param[out]
    dT      (workspace) DOUBLE_PRECISION array on the GPU,
            dimension (2*MIN(M, N) + (N+31)/32*32 )*NB,
            where NB can be obtained through magma_get_dgeqrf_nb(M).
            It starts with a MIN(M,N)*NB block that stores the triangular
            T matrices, followed by a MIN(M,N)*NB block that stores the
            diagonal blocks of the R matrix. The rest of the array is
            used as workspace.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as memory allocation
                  failed.

    Further Details
    ---------------
    The matrix Q is represented as a product of elementary reflectors

        Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

        H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).
@ingroup magma_dgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_dgeqrf3_gpu( magma_int_t m, magma_int_t n, magmaDouble_ptr dA, magma_int_t ldda, double *tau, magmaDouble_ptr dT, magma_int_t *info ) { #define dA(a_1,a_2) (dA + (a_2)*(ldda) + (a_1)) #define dT(a_1) (dT + (a_1)*nb) #define d_ref(a_1) (dT + ( minmn+(a_1))*nb) #define dd_ref(a_1) (dT + (2*minmn+(a_1))*nb) #define work(a_1) (work + (a_1)) #define hwork (work + (nb)*(m)) magma_int_t i, k, minmn, old_i, old_ib, rows, cols; magma_int_t ib, nb; magma_int_t ldwork, lddwork, lwork, lhwork; double *work, *ut; /* check arguments */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = minmn = min(m,n); if (k == 0) return *info; nb = magma_get_dgeqrf_nb(m); lwork = (m + n + nb)*nb; lhwork = lwork - m*nb; if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, lwork )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } ut = hwork+nb*(n); memset( ut, 0, nb*nb*sizeof(double)); magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); ldwork = m; lddwork= n; if ( (nb > 1) && (nb < k) ) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; i < k-nb; i += nb) { ib = min(k-i, nb); rows = m -i; magma_dgetmatrix_async( rows, ib, dA(i,i), ldda, work(i), ldwork, stream[1] ); if (i > 0) { /* Apply H' to A(i:m,i+2*ib:n) from the left */ cols = n-old_i-2*old_ib; magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, cols, old_ib, dA(old_i, old_i ), ldda, dT(old_i), nb, dA(old_i, old_i+2*old_ib), ldda, dd_ref(0), lddwork); /* store the diagonal */ magma_dsetmatrix_async( old_ib, old_ib, ut, old_ib, d_ref(old_i), old_ib, stream[0] ); } magma_queue_sync( stream[1] ); lapackf77_dgeqrf(&rows, &ib, work(i), &ldwork, tau+i, hwork, &lhwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, work(i), &ldwork, tau+i, hwork, &ib); /* Put 0s in the upper triangular part of a panel (and 1s on the diagonal); copy the upper triangular in ut. */ magma_queue_sync( stream[0] ); dsplit_diag_block3(ib, work(i), ldwork, ut); magma_dsetmatrix( rows, ib, work(i), ldwork, dA(i,i), ldda ); if (i + ib < n) { /* Send the triangular factor T to the GPU */ magma_dsetmatrix( ib, ib, hwork, ib, dT(i), nb ); if (i+nb < k-nb) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dT(i), nb, dA(i, i+ib), ldda, dd_ref(0), lddwork); } else { cols = n-i-ib; magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, cols, ib, dA(i, i ), ldda, dT(i), nb, dA(i, i+ib), ldda, dd_ref(0), lddwork); /* Fix the diagonal block */ magma_dsetmatrix( ib, ib, ut, ib, d_ref(i), ib ); } old_i = i; old_ib = ib; } } } else { i = 0; } /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; magma_dgetmatrix( rows, ib, dA(i, i), ldda, work, rows ); lhwork = lwork - rows*ib; lapackf77_dgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info); magma_dsetmatrix( rows, ib, work, rows, dA(i, i), ldda ); } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free_pinned( work ); return *info; } /* magma_dgeqrf_gpu */
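/* ------------------------------------------------------------------------
   Illustrative map (hypothetical helper, derived from the dT/d_ref/dd_ref
   macros above) of how magma_dgeqrf3_gpu partitions the dT array: first
   the triangular T factors, then the stored diagonal blocks of R, then
   scratch space for magma_dlarfb_gpu.
   ------------------------------------------------------------------------ */
static void
example_dT_layout( magma_int_t minmn, magma_int_t nb,
                   magma_int_t *t_offset, magma_int_t *r_offset,
                   magma_int_t *work_offset )
{
    *t_offset    = 0;            /* dT(i)     = dT + i*nb           */
    *r_offset    = minmn*nb;     /* d_ref(i)  = dT + (minmn+i)*nb   */
    *work_offset = 2*minmn*nb;   /* dd_ref(i) = dT + (2*minmn+i)*nb */
}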
/**
    Purpose
    -------
    DORGQR generates an M-by-N DOUBLE_PRECISION matrix Q with orthonormal
    columns, which is defined as the first N columns of a product of K
    elementary reflectors of order M

        Q = H(1) H(2) . . . H(k)

    as returned by DGEQRF.

    This version recomputes the T matrices on the CPU and sends them to
    the GPU.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix Q. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix Q. M >= N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines the
            matrix Q. N >= K >= 0.

    @param[in,out]
    A       DOUBLE_PRECISION array A, dimension (LDA,N).
            On entry, the i-th column must contain the vector which
            defines the elementary reflector H(i), for i = 1,2,...,k, as
            returned by DGEQRF_GPU in the first k columns of its array
            argument A.
            On exit, the M-by-N matrix Q.

    @param[in]
    lda     INTEGER
            The first dimension of the array A. LDA >= max(1,M).

    @param[in]
    tau     DOUBLE_PRECISION array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by DGEQRF_GPU.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_dgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_dorgqr2(
    magma_int_t m, magma_int_t n, magma_int_t k,
    double *A, magma_int_t lda,
    double *tau,
    magma_int_t *info)
{
#define  A(i,j) ( A + (i) + (j)*lda )
#define dA(i,j) (dA + (i) + (j)*ldda)

    double c_zero = MAGMA_D_ZERO;
    double c_one  = MAGMA_D_ONE;

    magma_int_t nb = magma_get_dgeqrf_nb(min(m, n));

    magma_int_t m_kk, n_kk, k_kk, mi;
    magma_int_t lwork, ldda;
    magma_int_t i, ib, ki, kk;  //, iinfo;
    magma_int_t lddwork;
    double *dA, *dV, *dW, *dT, *T;
    double *work;

    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if ((n < 0) || (n > m)) {
        *info = -2;
    } else if ((k < 0) || (k > n)) {
        *info = -3;
    } else if (lda < max(1,m)) {
        *info = -5;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    if (n <= 0) {
        return *info;
    }

    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );

    // first kk columns are handled by blocked method.
    // ki is start of 2nd-to-last block
    if ((nb > 1) && (nb < k)) {
        ki = (k - nb - 1) / nb * nb;
        kk = min(k, ki + nb);
    } else {
        ki = 0;
        kk = 0;
    }

    // Allocate GPU work space
    // ldda*n     for matrix dA
    // ldda*nb    for dV
    // lddwork*nb for dW larfb workspace
    ldda    = ((m + 31) / 32) * 32;
    lddwork = ((n + 31) / 32) * 32;
    if (MAGMA_SUCCESS != magma_dmalloc( &dA, ldda*n + ldda*nb + lddwork*nb + nb*nb)) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
    dV = dA + ldda*n;
    dW = dA + ldda*n + ldda*nb;
    dT = dA + ldda*n + ldda*nb + lddwork*nb;

    // Allocate CPU work space
    lwork = (n+m+nb) * nb;
    magma_dmalloc_cpu( &work, lwork );
    T = work;

    if (work == NULL) {
        magma_free( dA );
        magma_free_cpu( work );
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }
    double *V = work + (n+nb)*nb;

    magma_queue_t stream;
    magma_queue_create( &stream );

    // Use unblocked code for the last or only block.
if (kk < n) { m_kk = m - kk; n_kk = n - kk; k_kk = k - kk; /* lapackf77_dorgqr( &m_kk, &n_kk, &k_kk, A(kk, kk), &lda, &tau[kk], work, &lwork, &iinfo ); */ lapackf77_dlacpy( MagmaUpperLowerStr, &m_kk, &k_kk, A(kk,kk), &lda, V, &m_kk); lapackf77_dlaset( MagmaUpperLowerStr, &m_kk, &n_kk, &c_zero, &c_one, A(kk, kk), &lda ); lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &k_kk, V, &m_kk, &tau[kk], work, &k_kk); lapackf77_dlarfb( MagmaLeftStr, MagmaNoTransStr, MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &n_kk, &k_kk, V, &m_kk, work, &k_kk, A(kk, kk), &lda, work+k_kk*k_kk, &n_kk ); if (kk > 0) { magma_dsetmatrix( m_kk, n_kk, A(kk, kk), lda, dA(kk, kk), ldda ); // Set A(1:kk,kk+1:n) to zero. magmablas_dlaset( MagmaFull, kk, n - kk, c_zero, c_zero, dA(0, kk), ldda ); } } if (kk > 0) { // Use blocked code // stream: set Aii (V) --> laset --> laset --> larfb --> [next] // CPU has no computation magmablasSetKernelStream( stream ); for (i = ki; i >= 0; i -= nb) { ib = min(nb, k - i); // Send current panel to the GPU mi = m - i; lapackf77_dlaset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda ); magma_dsetmatrix_async( mi, ib, A(i, i), lda, dV, ldda, stream ); lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &mi, &ib, A(i,i), &lda, &tau[i], T, &nb); magma_dsetmatrix_async( ib, ib, T, nb, dT, nb, stream ); // set panel to identity magmablas_dlaset( MagmaFull, i, ib, c_zero, c_zero, dA(0, i), ldda ); magmablas_dlaset( MagmaFull, mi, ib, c_zero, c_one, dA(i, i), ldda ); magma_queue_sync( stream ); if (i < n) { // Apply H to A(i:m,i:n) from the left magma_dlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, mi, n-i, ib, dV, ldda, dT, nb, dA(i, i), ldda, dW, lddwork ); } } // copy result back to CPU magma_dgetmatrix( m, n, dA(0, 0), ldda, A(0, 0), lda); } magma_queue_destroy( stream ); magma_free( dA ); magma_free_cpu( work ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_dorgqr */
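/* ------------------------------------------------------------------------
   Illustrative restatement (hypothetical helper) of the split computed at
   the top of magma_dorgqr2: the trailing kk..k block of reflectors is
   formed first by the unblocked LAPACK-style path, then reflectors 1..ki
   are applied by the blocked loop, backward, nb at a time.
   ------------------------------------------------------------------------ */
static void
example_dorgqr_split( magma_int_t k, magma_int_t nb,
                      magma_int_t *ki, magma_int_t *kk )
{
    if (nb > 1 && nb < k) {
        *ki = (k - nb - 1) / nb * nb;   /* start of 2nd-to-last block */
        *kk = min( k, *ki + nb );
    } else {
        *ki = 0;                        /* everything goes through the */
        *kk = 0;                        /* unblocked path              */
    }
}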
extern "C" magma_int_t
magma_dgetrf_gpu(magma_int_t m, magma_int_t n,
                 double *dA, magma_int_t ldda,
                 magma_int_t *ipiv, magma_int_t *info)
{
/*  -- MAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2012

    Purpose
    =======
    DGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
       A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    LDDA    (input) INTEGER
            The leading dimension of the array A.  LDDA >= max(1,M).

    IPIV    (output) INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as memory allocation failed.
            > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.
    =====================================================================    */

#define inAT(i,j) (dAT + (i)*nb*lddat + (j)*nb)

    double c_one     = MAGMA_D_ONE;
    double c_neg_one = MAGMA_D_NEG_ONE;

    magma_int_t iinfo, nb;
    magma_int_t maxm, maxn, mindim;
    magma_int_t i, rows, cols, s, lddat, lddwork;
    double *dAT, *dAP, *work;

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (ldda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    nb     = magma_get_dgetrf_nb(m);
    s      = mindim / nb;

    if (nb <= 1 || nb >= min(m,n)) {
        /* Use CPU code. */
        magma_dmalloc_cpu( &work, m * n );
        if ( work == NULL ) {
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        magma_dgetmatrix( m, n, dA, ldda, work, m );
        lapackf77_dgetrf(&m, &n, work, &m, ipiv, info);
        magma_dsetmatrix( m, n, work, m, dA, ldda );
        magma_free_cpu(work);
    }
    else {
        /* Use hybrid blocked code. */
        maxm = ((m + 31)/32)*32;
        maxn = ((n + 31)/32)*32;

        lddat   = maxn;
        lddwork = maxm;

        dAT = dA;

        if (MAGMA_SUCCESS != magma_dmalloc( &dAP, nb*maxm )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }

        if ((m == n) && (m % 32 == 0) && (ldda%32 == 0)) {
            lddat = ldda;
            magmablas_dinplace_transpose( dAT, ldda, m);
        }
        else {
            if (MAGMA_SUCCESS != magma_dmalloc( &dAT, maxm*maxn )) {
                magma_free( dAP );
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }
            magmablas_dtranspose2( dAT, lddat, dA, ldda, m, n );
        }

        if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, maxm*nb )) {
            magma_free( dAP );
            if (!
((m == n) && (m % 32 == 0) && (ldda%32 == 0)) ) magma_free( dAT ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } for( i=0; i<s; i++ ) { // download i-th panel cols = maxm - i*nb; magmablas_dtranspose( dAP, cols, inAT(i,i), lddat, nb, cols ); magma_dgetmatrix( m-i*nb, nb, dAP, cols, work, lddwork ); // make sure that gpu queue is empty magma_device_sync(); if ( i>0 ){ magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n - (i+1)*nb, nb, c_one, inAT(i-1,i-1), lddat, inAT(i-1,i+1), lddat ); magma_dgemm( MagmaNoTrans, MagmaNoTrans, n-(i+1)*nb, m-i*nb, nb, c_neg_one, inAT(i-1,i+1), lddat, inAT(i, i-1), lddat, c_one, inAT(i, i+1), lddat ); } // do the cpu part rows = m - i*nb; lapackf77_dgetrf( &rows, &nb, work, &lddwork, ipiv+i*nb, &iinfo); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + i*nb; magmablas_dpermute_long2( n, dAT, lddat, ipiv, nb, i*nb ); // upload i-th panel magma_dsetmatrix( m-i*nb, nb, work, lddwork, dAP, maxm ); magmablas_dtranspose(inAT(i,i), lddat, dAP, maxm, cols, nb); // do the small non-parallel computations if ( s > (i+1) ) { magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb, nb, c_one, inAT(i, i ), lddat, inAT(i, i+1), lddat); magma_dgemm( MagmaNoTrans, MagmaNoTrans, nb, m-(i+1)*nb, nb, c_neg_one, inAT(i, i+1), lddat, inAT(i+1, i ), lddat, c_one, inAT(i+1, i+1), lddat ); } else { magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb, nb, c_one, inAT(i, i ), lddat, inAT(i, i+1), lddat); magma_dgemm( MagmaNoTrans, MagmaNoTrans, n-(i+1)*nb, m-(i+1)*nb, nb, c_neg_one, inAT(i, i+1), lddat, inAT(i+1, i ), lddat, c_one, inAT(i+1, i+1), lddat ); } } magma_int_t nb0 = min(m - s*nb, n - s*nb); rows = m - s*nb; cols = maxm - s*nb; magmablas_dtranspose2( dAP, maxm, inAT(s,s), lddat, nb0, rows); magma_dgetmatrix( rows, nb0, dAP, maxm, work, lddwork ); // make sure that gpu queue is empty magma_device_sync(); // do the cpu part lapackf77_dgetrf( &rows, &nb0, work, &lddwork, ipiv+s*nb, &iinfo); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + s*nb; magmablas_dpermute_long2( n, dAT, lddat, ipiv, nb0, s*nb ); // upload i-th panel magma_dsetmatrix( rows, nb0, work, lddwork, dAP, maxm ); magmablas_dtranspose2( inAT(s,s), lddat, dAP, maxm, rows, nb0); magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb-nb0, nb0, c_one, inAT(s,s), lddat, inAT(s,s)+nb0, lddat); if ((m == n) && (m % 32 == 0) && (ldda%32 == 0)){ magmablas_dinplace_transpose( dAT, lddat, m ); } else { magmablas_dtranspose2( dA, ldda, dAT, lddat, n, m ); magma_free( dAT ); } magma_free( dAP ); magma_free_pinned( work ); } return *info; /* End of MAGMA_DGETRF_GPU */ }
/** Purpose ------- DPOSV computes the solution to a real system of linear equations A * X = B, where A is an N-by-N symmetric positive definite matrix and X and B are N-by-NRHS matrices. The Cholesky decomposition is used to factor A as A = U**H * U, if UPLO = MagmaUpper, or A = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is a lower triangular matrix. The factored form of A is then used to solve the system of equations A * X = B. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in,out] A DOUBLE PRECISION array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization A = U**H*U or A = L*L**H. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[in,out] B DOUBLE PRECISION array, dimension (LDB,NRHS) On entry, the right hand side matrix B. On exit, the solution matrix X. @param[in] ldb INTEGER The leading dimension of the array B. LDB >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_dposv_driver ********************************************************************/ extern "C" magma_int_t magma_dposv( magma_uplo_t uplo, magma_int_t n, magma_int_t nrhs, double *A, magma_int_t lda, double *B, magma_int_t ldb, magma_int_t *info ) { #ifdef HAVE_clBLAS #define dA(i_, j_) dA, ((i_) + (j_)*ldda) #define dB(i_, j_) dB, ((i_) + (j_)*lddb) #else #define dA(i_, j_) (dA + (i_) + (j_)*ldda) #define dB(i_, j_) (dB + (i_) + (j_)*lddb) #endif magma_int_t ngpu, ldda, lddb; magma_queue_t queue = NULL; magma_device_t cdev; *info = 0; if ( uplo != MagmaUpper && uplo != MagmaLower ) *info = -1; if ( n < 0 ) *info = -2; if ( nrhs < 0) *info = -3; if ( lda < max(1, n) ) *info = -5; if ( ldb < max(1, n) ) *info = -7; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (n == 0 || nrhs == 0) { return *info; } /* If single-GPU and allocation succeeds, use GPU interface.
*/ ngpu = magma_num_gpus(); magmaDouble_ptr dA, dB; if ( ngpu > 1 ) { goto CPU_INTERFACE; } ldda = magma_roundup( n, 32 ); lddb = ldda; if ( MAGMA_SUCCESS != magma_dmalloc( &dA, ldda*n )) { goto CPU_INTERFACE; } if ( MAGMA_SUCCESS != magma_dmalloc( &dB, lddb*nrhs )) { magma_free( dA ); goto CPU_INTERFACE; } magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); magma_dsetmatrix( n, n, A, lda, dA(0,0), ldda, queue ); magma_dpotrf_gpu( uplo, n, dA(0,0), ldda, info ); if ( *info == MAGMA_ERR_DEVICE_ALLOC ) { magma_queue_destroy( queue ); magma_free( dA ); magma_free( dB ); goto CPU_INTERFACE; } magma_dgetmatrix( n, n, dA(0,0), ldda, A, lda, queue ); if ( *info == 0 ) { magma_dsetmatrix( n, nrhs, B, ldb, dB(0,0), lddb, queue ); magma_dpotrs_gpu( uplo, n, nrhs, dA(0,0), ldda, dB(0,0), lddb, info ); magma_dgetmatrix( n, nrhs, dB(0,0), lddb, B, ldb, queue ); } magma_queue_destroy( queue ); magma_free( dA ); magma_free( dB ); return *info; CPU_INTERFACE: /* If multi-GPU or allocation failed, use CPU interface and LAPACK. * Faster to use LAPACK for potrs than to copy A to GPU. */ magma_dpotrf( uplo, n, A, lda, info ); if ( *info == 0 ) { lapackf77_dpotrs( lapack_uplo_const(uplo), &n, &nrhs, A, &lda, B, &ldb, info ); } return *info; }
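/* Minimal end-to-end example for the driver above (illustrative only). The
   matrix is tridiagonal and diagonally dominant, hence symmetric positive
   definite, and the right-hand side is chosen as B = A*ones so the expected
   solution is all ones. */
#include <stdio.h>
#include "magma.h"

int main()
{
    magma_init();
    magma_int_t n = 4, nrhs = 1, info;
    /* Column-major SPD matrix: 4 on the diagonal, 1 on the off-diagonals. */
    double A[16] = { 4,1,0,0,  1,4,1,0,  0,1,4,1,  0,0,1,4 };
    double B[4]  = { 5, 6, 6, 5 };   /* A * [1 1 1 1]^T */

    magma_dposv( MagmaLower, n, nrhs, A, n, B, n, &info );
    printf( "info = %d, x = %g %g %g %g\n",
            (int) info, B[0], B[1], B[2], B[3] );   /* expect 1 1 1 1 */

    magma_finalize();
    return 0;
}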
extern "C" magma_int_t magma_dgehrd(magma_int_t n, magma_int_t ilo, magma_int_t ihi, double *a, magma_int_t lda, double *tau, double *work, magma_int_t lwork, magmaDouble_ptr dT, size_t dT_offset, magma_int_t *info, magma_queue_t queue) { /* -- clMAGMA (version 1.1.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date January 2014 Purpose ======= DGEHRD reduces a DOUBLE_PRECISION general matrix A to upper Hessenberg form H by an orthogonal similarity transformation: Q' * A * Q = H . This version stores the triangular matrices used in the factorization so that they can be applied directly (i.e., without being recomputed) later. As a result, the application of Q is much faster. Arguments ========= N (input) INTEGER The order of the matrix A. N >= 0. ILO (input) INTEGER IHI (input) INTEGER It is assumed that A is already upper triangular in rows and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally set by a previous call to DGEBAL; otherwise they should be set to 1 and N respectively. See Further Details. 1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0. A (input/output) DOUBLE_PRECISION array, dimension (LDA,N) On entry, the N-by-N general matrix to be reduced. On exit, the upper triangle and the first subdiagonal of A are overwritten with the upper Hessenberg matrix H, and the elements below the first subdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). TAU (output) DOUBLE_PRECISION array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). Elements 1:ILO-1 and IHI:N-1 of TAU are set to zero. WORK (workspace/output) DOUBLE_PRECISION array, dimension (LWORK) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The length of the array WORK. LWORK >= max(1,N). For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. dT (output) DOUBLE_PRECISION array on the GPU, dimension N*NB, where NB is the optimal blocksize. It stores the NB*NB blocks of the triangular T matrices, used the the reduction. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value. Further Details =============== The matrix Q is represented as a product of (ihi-ilo) elementary reflectors Q = H(ilo) H(ilo+1) . . . H(ihi-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on exit in A(i+2:ihi,i), and tau in TAU(i). The contents of A are illustrated by the following example, with n = 7, ilo = 2 and ihi = 6: on entry, on exit, ( a a a a a a a ) ( a a h h h h a ) ( a a a a a a ) ( a h h h h a ) ( a a a a a a ) ( h h h h h h ) ( a a a a a a ) ( v2 h h h h h ) ( a a a a a a ) ( v2 v3 h h h h ) ( a a a a a a ) ( v2 v3 v4 h h h ) ( a ) ( a ) where a denotes an element of the original matrix A, h denotes a modified element of the upper Hessenberg matrix H, and vi denotes an element of the vector defining H(i). This implementation follows the hybrid algorithm and notations described in S. Tomov and J. 
Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. ===================================================================== */ double c_one = MAGMA_D_ONE; double c_zero = MAGMA_D_ZERO; magma_int_t nb = magma_get_dgehrd_nb(n); magma_int_t N = n, ldda = n; magma_int_t ib; magma_int_t nh, iws; magma_int_t nbmin, iinfo; magma_int_t ldwork; magma_int_t lquery; --tau; *info = 0; MAGMA_D_SET2REAL( work[0], (double) n * nb ); lquery = lwork == -1; if (n < 0) { *info = -1; } else if (ilo < 1 || ilo > max(1,n)) { *info = -2; } else if (ihi < min(ilo,n) || ihi > n) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if (lwork < max(1,n) && ! lquery) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; /* Quick return if possible */ nh = ihi - ilo + 1; if (nh <= 1) { work[0] = c_one; return *info; } magmaDouble_ptr da; size_t da_offset = 0; if (MAGMA_SUCCESS != magma_dmalloc( &da, (N*ldda + 2*N*nb + nb*nb) )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magmaDouble_ptr d_A = da; size_t d_A_offset = da_offset; //double *d_work = da + (N+nb)*ldda; magmaDouble_ptr d_work = da; size_t d_work_offset = da_offset+(N+nb)*ldda; magma_int_t i__; double *t; magma_dmalloc_cpu( &t, nb*nb ); if ( t == NULL ) { magma_free( da ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } magmaDouble_ptr d_t; d_t = d_work; size_t d_t_offset = d_work_offset+nb*ldda; dzero_nbxnb_block(nb, d_A, d_A_offset+N*ldda, ldda, queue); /* Set elements 1:ILO-1 and IHI:N-1 of TAU to zero */ for (i__ = 1; i__ < ilo; ++i__) tau[i__] = c_zero; for (i__ = max(1,ihi); i__ < n; ++i__) tau[i__] = c_zero; for(i__=0; i__< nb*nb; i__+=4) t[i__] = t[i__+1] = t[i__+2] = t[i__+3] = c_zero; nbmin = 2; iws = 1; if (nb > 1 && nb < nh) { /* Determine when to cross over from blocked to unblocked code (last block is always handled by unblocked code) */ if (nb < nh) { /* Determine if workspace is large enough for blocked code */ iws = n * nb; if (lwork < iws) { /* Not enough workspace to use optimal NB: determine the minimum value of NB, and reduce NB or force use of unblocked code */ nbmin = nb; if (lwork >= n * nbmin) nb = lwork / n; else nb = 1; } } } ldwork = n; if (nb < nbmin || nb >= nh) { /* Use unblocked code below */ i__ = ilo; } else { /* Use blocked code */ /* Copy the matrix to the GPU */ magma_dsetmatrix( N, N-ilo+1, a+(ilo-1)*(lda), 0, lda, d_A, d_A_offset, ldda, queue ); for (i__ = ilo; i__ < ihi - nb; i__ += nb) { /* Computing MIN */ ib = min(nb, ihi - i__); /* Reduce columns i:i+ib-1 to Hessenberg form, returning the matrices V and T of the block reflector H = I - V*T*V' which performs the reduction, and also the matrix Y = A*V*T */ /* Get the current panel (no need for the 1st iteration) */ magma_dgetmatrix( ihi-i__+1, ib, d_A, (d_A_offset + (i__ - ilo)*ldda + i__ - 1), ldda, a + (i__ - 1 )*lda + i__ - 1, 0, lda, queue ); magma_dlahr2(ihi, i__, ib, d_A, d_A_offset +(i__ - ilo)*ldda, d_A, d_A_offset + N*ldda + 1, a + (i__ - 1 )*(lda) , lda, &tau[i__], t, nb, work, ldwork, queue); /* Copy T from the CPU to D_T on the GPU */ //d_t = dT + (i__ - ilo)*nb; d_t = dT; d_t_offset = dT_offset + (i__ - ilo)*nb; magma_dsetmatrix( nb, nb, t, 0, nb, d_t, d_t_offset, nb, queue ); magma_dlahru(n, ihi, i__ - 1, ib, a + (i__ - 1 )*(lda), lda, d_A, d_A_offset + (i__ - ilo)*ldda, d_A, d_A_offset + (i__ - ilo)*ldda + i__ - 1, 
d_A, d_A_offset + N*ldda, d_t, d_t_offset, d_work, d_work_offset, queue); } } /* Use unblocked code to reduce the rest of the matrix */ if (!(nb < nbmin || nb >= nh)) magma_dgetmatrix( n, n-i__+1, d_A, d_A_offset + (i__-ilo)*ldda, ldda, a + (i__-1)*(lda), 0, lda, queue ); lapackf77_dgehd2(&n, &i__, &ihi, a, &lda, &tau[1], work, &iinfo); MAGMA_D_SET2REAL( work[0], (double) iws ); magma_free( da ); magma_free_cpu(t); return *info; } /* magma_dgehrd */
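/* The "Further Details" above define each elementary reflector as
   H(i) = I - tau * v * v'. A self-contained sketch (not MAGMA code) of
   applying one reflector to a vector x of length len:
   x <- H*x = x - tau*(v'*x)*v. */
void apply_reflector( int len, const double *v, double tau, double *x )
{
    double dot = 0.0;
    int i;
    for (i = 0; i < len; ++i)    /* dot = v' * x */
        dot += v[i] * x[i];
    for (i = 0; i < len; ++i)    /* x = x - tau*(v'*x)*v */
        x[i] -= tau * dot * v[i];
}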
extern "C" magma_int_t magma_dorgqr2(magma_int_t m, magma_int_t n, magma_int_t k, double *A, magma_int_t lda, double *tau, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= DORGQR generates an M-by-N DOUBLE_PRECISION matrix Q with orthonormal columns, which is defined as the first N columns of a product of K elementary reflectors of order M Q = H(1) H(2) . . . H(k) as returned by DGEQRF. This version recomputes the T matrices on the CPU and sends them to the GPU. Arguments ========= M (input) INTEGER The number of rows of the matrix Q. M >= 0. N (input) INTEGER The number of columns of the matrix Q. M >= N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. N >= K >= 0. A (input/output) DOUBLE_PRECISION array A, dimension (LDDA,N). On entry, the i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by DGEQRF_GPU in the first k columns of its array argument A. On exit, the M-by-N matrix Q. LDA (input) INTEGER The first dimension of the array A. LDA >= max(1,M). TAU (input) DOUBLE_PRECISION array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by DGEQRF_GPU. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument has an illegal value ===================================================================== */ #define A(i,j) ( A + (i) + (j)*lda ) #define dA(i,j) (dA + (i) + (j)*ldda) double c_zero = MAGMA_D_ZERO; double c_one = MAGMA_D_ONE; magma_int_t nb = magma_get_dgeqrf_nb(min(m, n)); magma_int_t m_kk, n_kk, k_kk, mi; magma_int_t lwork, ldda; magma_int_t i, ib, ki, kk; //, iinfo; magma_int_t lddwork; double *dA, *dV, *dW, *dT, *T; double *work; *info = 0; if (m < 0) { *info = -1; } else if ((n < 0) || (n > m)) { *info = -2; } else if ((k < 0) || (k > n)) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (n <= 0) { return *info; } // first kk columns are handled by blocked method. // ki is start of 2nd-to-last block if ((nb > 1) && (nb < k)) { ki = (k - nb - 1) / nb * nb; kk = min(k, ki + nb); } else { ki = 0; kk = 0; } // Allocate GPU work space // ldda*n for matrix dA // ldda*nb for dV // lddwork*nb for dW larfb workspace ldda = ((m + 31) / 32) * 32; lddwork = ((n + 31) / 32) * 32; if (MAGMA_SUCCESS != magma_dmalloc( &dA, ldda*n + ldda*nb + lddwork*nb + nb*nb)) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dV = dA + ldda*n; dW = dA + ldda*n + ldda*nb; dT = dA + ldda*n + ldda*nb + lddwork*nb; // Allocate CPU work space lwork = (n+m+nb) * nb; magma_dmalloc_cpu( &work, lwork ); T = work; if (work == NULL) { magma_free( dA ); magma_free_cpu( work ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } double *V = work + (n+nb)*nb; magma_queue_t stream; magma_queue_create( &stream ); // Use unblocked code for the last or only block. 
if (kk < n) { m_kk = m - kk; n_kk = n - kk; k_kk = k - kk; /* lapackf77_dorgqr( &m_kk, &n_kk, &k_kk, A(kk, kk), &lda, &tau[kk], work, &lwork, &iinfo ); */ lapackf77_dlacpy( MagmaUpperLowerStr, &m_kk, &k_kk, A(kk,kk), &lda, V, &m_kk); lapackf77_dlaset( MagmaUpperLowerStr, &m_kk, &n_kk, &c_zero, &c_one, A(kk, kk), &lda ); lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &k_kk, V, &m_kk, &tau[kk], work, &k_kk); lapackf77_dlarfb( MagmaLeftStr, MagmaNoTransStr, MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &n_kk, &k_kk, V, &m_kk, work, &k_kk, A(kk, kk), &lda, work+k_kk*k_kk, &n_kk ); if (kk > 0) { magma_dsetmatrix( m_kk, n_kk, A(kk, kk), lda, dA(kk, kk), ldda ); // Set A(1:kk,kk+1:n) to zero. magmablas_dlaset( MagmaUpperLower, kk, n - kk, dA(0, kk), ldda ); } } if (kk > 0) { // Use blocked code // stream: set Aii (V) --> laset --> laset --> larfb --> [next] // CPU has no computation magmablasSetKernelStream( stream ); for (i = ki; i >= 0; i -= nb) { ib = min(nb, k - i); // Send current panel to the GPU mi = m - i; lapackf77_dlaset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda ); magma_dsetmatrix_async( mi, ib, A(i, i), lda, dV, ldda, stream ); lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &mi, &ib, A(i,i), &lda, &tau[i], T, &nb); magma_dsetmatrix_async( ib, ib, T, nb, dT , nb, stream ); // set panel to identity magmablas_dlaset( MagmaUpperLower, i, ib, dA(0, i), ldda ); magmablas_dlaset_identity( mi, ib, dA(i, i), ldda ); magma_queue_sync( stream ); if (i < n) { // Apply H to A(i:m,i:n) from the left magma_dlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, mi, n-i, ib, dV, ldda, dT, nb, dA(i, i), ldda, dW, lddwork ); } } // copy result back to CPU magma_dgetmatrix( m, n, dA(0, 0), ldda, A(0, 0), lda); } magmablasSetKernelStream( NULL ); magma_queue_destroy( stream ); magma_free( dA ); magma_free_cpu( work ); return *info; } /* magma_dorgqr */
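/* Usage sketch (illustrative): factor A = Q*R on the host with LAPACK's
   dgeqrf, then overwrite A with the explicit M-by-N Q via magma_dorgqr2
   above. The helper name form_explicit_q is hypothetical; workspace sizing
   uses the standard lwork = -1 query and error handling is abbreviated. */
#include "magma.h"
#include "magma_lapack.h"

void form_explicit_q(
    magma_int_t m, magma_int_t n, double *A, magma_int_t lda,
    double *tau )    /* tau must have min(m,n) entries */
{
    magma_int_t info, lwork = -1;
    magma_int_t k = (m < n ? m : n);   /* number of reflectors */
    double query, *work;

    /* Workspace query, then the actual QR factorization. */
    lapackf77_dgeqrf( &m, &n, A, &lda, tau, &query, &lwork, &info );
    lwork = (magma_int_t) query;
    magma_dmalloc_cpu( &work, lwork );
    lapackf77_dgeqrf( &m, &n, A, &lda, tau, work, &lwork, &info );
    magma_free_cpu( work );

    /* A now holds the Householder vectors; expand them into Q. */
    magma_dorgqr2( m, n, k, A, lda, tau, &info );
}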
/** Purpose ------- DGEHRD2 reduces a DOUBLE_PRECISION general matrix A to upper Hessenberg form H by an orthogonal similarity transformation: Q' * A * Q = H . Arguments --------- @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] ilo INTEGER @param[in] ihi INTEGER It is assumed that A is already upper triangular in rows and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally set by a previous call to DGEBAL; otherwise they should be set to 1 and N respectively. See Further Details. 1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0. @param[in,out] A DOUBLE_PRECISION array, dimension (LDA,N) On entry, the N-by-N general matrix to be reduced. On exit, the upper triangle and the first subdiagonal of A are overwritten with the upper Hessenberg matrix H, and the elements below the first subdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] tau DOUBLE_PRECISION array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). Elements 1:ILO-1 and IHI:N-1 of TAU are set to zero. @param[out] work (workspace) DOUBLE_PRECISION array, dimension (LWORK) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. LWORK >= max(1,N). For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value. Further Details --------------- The matrix Q is represented as a product of (ihi-ilo) elementary reflectors Q = H(ilo) H(ilo+1) . . . H(ihi-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on exit in A(i+2:ihi,i), and tau in TAU(i). The contents of A are illustrated by the following example, with n = 7, ilo = 2 and ihi = 6: @verbatim on entry, on exit, ( a a a a a a a ) ( a a h h h h a ) ( a a a a a a ) ( a h h h h a ) ( a a a a a a ) ( h h h h h h ) ( a a a a a a ) ( v2 h h h h h ) ( a a a a a a ) ( v2 v3 h h h h ) ( a a a a a a ) ( v2 v3 v4 h h h ) ( a ) ( a ) @endverbatim where a denotes an element of the original matrix A, h denotes a modified element of the upper Hessenberg matrix H, and vi denotes an element of the vector defining H(i). This implementation follows the hybrid algorithm and notations described in S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. 
@ingroup magma_dgeev_comp ********************************************************************/ extern "C" magma_int_t magma_dgehrd2( magma_int_t n, magma_int_t ilo, magma_int_t ihi, double *A, magma_int_t lda, double *tau, double *work, magma_int_t lwork, magma_int_t *info) { #define A(i_,j_) (A + (i_) + (j_)*lda) #define dA(i_,j_) (dA + (i_) + (j_)*ldda) double c_one = MAGMA_D_ONE; double c_zero = MAGMA_D_ZERO; magma_int_t nb = magma_get_dgehrd_nb(n); magma_int_t ldda = ((n+31)/32)*32; magma_int_t i, nh, iws; magma_int_t iinfo; magma_int_t ldwork; magma_int_t lquery; *info = 0; iws = n*nb; work[0] = MAGMA_D_MAKE( iws, 0 ); lquery = (lwork == -1); if (n < 0) { *info = -1; } else if (ilo < 1 || ilo > max(1,n)) { *info = -2; } else if (ihi < min(ilo,n) || ihi > n) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if (lwork < max(1,n) && ! lquery) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; // Adjust from 1-based indexing ilo -= 1; // Quick return if possible nh = ihi - ilo; if (nh <= 1) { work[0] = c_one; return *info; } // If not enough workspace, use unblocked code if ( lwork < iws ) { nb = 1; } if (nb == 1 || nb > nh) { // Use unblocked code below i = ilo; } else { // Use blocked code // GPU workspace is: // nb*ldda for dwork for dlahru // nb*ldda for dV // n*ldda for dA // nb*nb for dT double *dwork; if (MAGMA_SUCCESS != magma_dmalloc( &dwork, 2*nb*ldda + n*ldda + nb*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } double *dV = dwork + nb*ldda; double *dA = dwork + nb*ldda*2; double *dT = dwork + nb*ldda*2 + n*ldda; ldwork = ldda; double *T; magma_dmalloc_cpu( &T, nb*nb ); if ( T == NULL ) { magma_free( dwork ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } // zero first block of V, which is lower triangular magmablas_dlaset( MagmaFull, nb, nb, c_zero, c_zero, dV, ldda ); // Set elements 0:ILO-1 and IHI-1:N-2 of TAU to zero for (i = 0; i < ilo; ++i) tau[i] = c_zero; for (i = max(0,ihi-1); i < n-1; ++i) tau[i] = c_zero; assert( nb % 4 == 0 ); for (i=0; i < nb*nb; i += 4) T[i] = T[i+1] = T[i+2] = T[i+3] = c_zero; // Copy the matrix to the GPU magma_dsetmatrix( n, n-ilo, A(0,ilo), lda, dA, ldda ); for (i = ilo; i < ihi-1 - nb; i += nb) { // Reduce columns i:i+nb-1 to Hessenberg form, returning the // matrices V and T of the block reflector H = I - V*T*V' // which performs the reduction, and also the matrix Y = A*V*T // Get the current panel (no need for the 1st iteration) magma_dgetmatrix( ihi-i, nb, dA(i,i-ilo), ldda, A(i,i), lda ); // add 1 to i for 1-based index magma_dlahr2( ihi, i+1, nb, dA(0,i-ilo), ldda, dV, ldda, A(0,i), lda, &tau[i], T, nb, work, ldwork ); // Copy T from the CPU to dT on the GPU magma_dsetmatrix( nb, nb, T, nb, dT, nb ); magma_dlahru( n, ihi, i, nb, A(0,i), lda, dA(0,i-ilo), ldda, // dA dA(i,i-ilo), ldda, // dY, stored over current panel dV, ldda, dT, dwork ); } // Copy remainder to host magma_dgetmatrix( n, n-i, dA(0,i-ilo), ldda, A(0,i), lda ); magma_free( dwork ); magma_free_cpu( T ); } // Use unblocked code to reduce the rest of the matrix // add 1 to i for 1-based index i += 1; lapackf77_dgehd2(&n, &i, &ihi, A, &lda, tau, work, &iinfo); work[0] = MAGMA_D_MAKE( iws, 0 ); return *info; } /* magma_dgehrd2 */
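/* Sketch of the LWORK = -1 workspace-query convention documented above,
   using magma_dgehrd2 with ilo = 1, ihi = n (no balancing). Illustrative
   only: A, lda, and tau are assumed allocated by the caller, and error
   handling is abbreviated. */
#include "magma.h"

void hessenberg_reduce( magma_int_t n, double *A, magma_int_t lda,
                        double *tau )
{
    magma_int_t info, lwork = -1;
    double query, *work;

    /* First call only computes the optimal size, returned in query. */
    magma_dgehrd2( n, 1, n, A, lda, tau, &query, lwork, &info );

    lwork = (magma_int_t) query;
    magma_dmalloc_cpu( &work, lwork );

    /* Second call performs the reduction Q' * A * Q = H. */
    magma_dgehrd2( n, 1, n, A, lda, tau, work, lwork, &info );
    magma_free_cpu( work );
}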
/* //////////////////////////////////////////////////////////////////////////// -- Testing dtrmm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cublas_perf, cublas_time, cpu_perf, cpu_time; double cublas_error, Cnorm, work[1]; magma_int_t M, N; magma_int_t Ak; magma_int_t sizeA, sizeB; magma_int_t lda, ldb, ldda, lddb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double *h_A, *h_B, *h_Bcublas; double *d_A, *d_B; double c_neg_one = MAGMA_D_NEG_ONE; double alpha = MAGMA_D_MAKE( 0.29, -0.86 ); magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) double tol = opts.tolerance * lapackf77_dlamch("E"); printf("If running lapack (option --lapack), CUBLAS error is computed\n" "relative to CPU BLAS result.\n\n"); printf("side = %s, uplo = %s, transA = %s, diag = %s \n", lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag) ); printf(" M N CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n"); printf("==================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; gflops = FLOPS_DTRMM(opts.side, M, N) / 1e9; if ( opts.side == MagmaLeft ) { lda = M; Ak = M; } else { lda = N; Ak = N; } ldb = M; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; sizeA = lda*Ak; sizeB = ldb*N; TESTING_MALLOC_CPU( h_A, double, lda*Ak ); TESTING_MALLOC_CPU( h_B, double, ldb*N ); TESTING_MALLOC_CPU( h_Bcublas, double, ldb*N ); TESTING_MALLOC_DEV( d_A, double, ldda*Ak ); TESTING_MALLOC_DEV( d_B, double, lddb*N ); /* Initialize the matrices */ lapackf77_dlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_dlarnv( &ione, ISEED, &sizeB, h_B ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_dsetmatrix( Ak, Ak, h_A, lda, d_A, ldda ); magma_dsetmatrix( M, N, h_B, ldb, d_B, lddb ); // note cublas does trmm out-of-place (i.e., adds output matrix C), // but allows C=B to do in-place. 
cublas_time = magma_sync_wtime( NULL ); cublasDtrmm( handle, cublas_side_const(opts.side), cublas_uplo_const(opts.uplo), cublas_trans_const(opts.transA), cublas_diag_const(opts.diag), M, N, &alpha, d_A, ldda, d_B, lddb, d_B, lddb ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_dgetmatrix( M, N, d_B, lddb, h_Bcublas, ldb ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_dtrmm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &alpha, h_A, &lda, h_B, &ldb ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma & cublas, relative to lapack, // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_dlange( "M", &M, &N, h_B, &ldb, work ); blasf77_daxpy( &sizeB, &c_neg_one, h_B, &ione, h_Bcublas, &ione ); cublas_error = lapackf77_dlange( "M", &M, &N, h_Bcublas, &ldb, work ) / Cnorm; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, cublas_error, (cublas_error < tol ? "ok" : "failed")); status += ! (cublas_error < tol); } else { printf("%5d %5d %7.2f (%7.2f) --- ( --- ) --- ---\n", (int) M, (int) N, cublas_perf, 1000.*cublas_time); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_Bcublas ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
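/* The check above is the pattern used throughout these testers: form
   D = C_gpu - C_ref in place with daxpy, then report
   ||D||_max / ||C_ref||_max via dlange("M"). A standalone helper with the
   hypothetical name rel_error_max (note it overwrites Cgpu with the
   difference): */
#include "magma.h"
#include "magma_lapack.h"

double rel_error_max( magma_int_t m, magma_int_t n,
                      double *Cref, double *Cgpu, magma_int_t ldc )
{
    magma_int_t ione = 1, size = ldc*n;
    double c_neg_one = MAGMA_D_NEG_ONE, work[1];

    double Cnorm = lapackf77_dlange( "M", &m, &n, Cref, &ldc, work );
    blasf77_daxpy( &size, &c_neg_one, Cref, &ione, Cgpu, &ione );
    return lapackf77_dlange( "M", &m, &n, Cgpu, &ldc, work ) / Cnorm;
}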
/** Purpose ------- DORMQL overwrites the general real M-by-N matrix C with @verbatim SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q * C C * Q TRANS = MagmaTrans: Q**H * C C * Q**H @endverbatim where Q is a real orthogonal matrix defined as the product of k elementary reflectors Q = H(k) . . . H(2) H(1) as returned by DGEQLF. Q is of order M if SIDE = MagmaLeft and of order N if SIDE = MagmaRight. Arguments --------- @param[in] side magma_side_t - = MagmaLeft: apply Q or Q**H from the Left; - = MagmaRight: apply Q or Q**H from the Right. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q; - = MagmaTrans: Transpose, apply Q**H. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = MagmaLeft, M >= K >= 0; if SIDE = MagmaRight, N >= K >= 0. @param[in] A DOUBLE_PRECISION array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by DGEQLF in the last k columns of its array argument A. A is modified by the routine but restored on exit. @param[in] lda INTEGER The leading dimension of the array A. If SIDE = MagmaLeft, LDA >= max(1,M); if SIDE = MagmaRight, LDA >= max(1,N). @param[in] tau DOUBLE_PRECISION array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by DGEQLF. @param[in,out] C DOUBLE_PRECISION array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q. @param[in] ldc INTEGER The leading dimension of the array C. LDC >= max(1,M). @param[out] work (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. If SIDE = MagmaLeft, LWORK >= max(1,N); if SIDE = MagmaRight, LWORK >= max(1,M). For optimum performance if SIDE = MagmaLeft, LWORK >= N*NB; if SIDE = MagmaRight, LWORK >= M*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_dgeqlf_comp ********************************************************************/ extern "C" magma_int_t magma_dormql(magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, double *A, magma_int_t lda, double *tau, double *C, magma_int_t ldc, double *work, magma_int_t lwork, magma_int_t *info) { #define A(i_,j_) ( A + (i_) + (j_)*lda) #define dC(i_,j_) (dC + (i_) + (j_)*lddc) double *T, *T2; magma_int_t i, i1, i2, ib, nb, mi, ni, nq, nq_i, nw, step; magma_int_t iinfo, ldwork, lwkopt; magma_int_t left, notran, lquery; *info = 0; left = (side == MagmaLeft); notran = (trans == MagmaNoTrans); lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } /* Test the input arguments */ if (! left && side != MagmaRight) { *info = -1; } else if (!
notran && trans != MagmaTrans) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } if (*info == 0) { nb = magma_get_dgelqf_nb( min( m, n )); lwkopt = max(1,nw)*nb; work[0] = MAGMA_D_MAKE( lwkopt, 0 ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = MAGMA_D_ONE; return *info; } ldwork = nw; if ( nb >= k ) { /* Use CPU code */ lapackf77_dormql( lapack_side_const(side), lapack_trans_const(trans), &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, &iinfo); } else { /* Use hybrid CPU-GPU code */ /* Allocate work space on the GPU. * nw*nb for dwork (m or n) by nb * nq*nb for dV (n or m) by nb * nb*nb for dT * lddc*n for dC. */ magma_int_t lddc = ((m+31)/32)*32; double *dwork, *dV, *dT, *dC; magma_dmalloc( &dwork, (nw + nq + nb)*nb + lddc*n ); if ( dwork == NULL ) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dV = dwork + nw*nb; dT = dV + nq*nb; dC = dT + nb*nb; /* work space on CPU. * nb*nb for T * nb*nb for T2, used to save and restore diagonal block of panel */ magma_dmalloc_pinned( &T, 2*nb*nb ); if ( T == NULL ) { magma_free( dwork ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } T2 = T + nb*nb; /* Copy matrix C from the CPU to the GPU */ magma_dsetmatrix( m, n, C, ldc, dC, lddc ); if ( (left && notran) || (! left && ! notran) ) { i1 = 0; i2 = k; step = nb; } else { i1 = ((k - 1) / nb) * nb; i2 = 0; step = -nb; } // silence "uninitialized" warnings mi = 0; ni = 0; if (left) { ni = n; } else { mi = m; } for (i = i1; (step < 0 ? i >= i2 : i < i2); i += step) { ib = min(nb, k - i); /* Form the triangular factor of the block reflector H = H(i+ib-1) . . . H(i+1) H(i) */ nq_i = nq - k + i + ib; lapackf77_dlarft("Backward", "Columnwise", &nq_i, &ib, A(0,i), &lda, &tau[i], T, &ib); /* 1) set lower triangle of panel in A to identity, 2) copy the panel from A to the GPU, and 3) restore A */ dpanel_to_q( MagmaLower, ib, A(nq_i-ib,i), lda, T2 ); magma_dsetmatrix( nq_i, ib, A(0, i), lda, dV, nq_i ); dq_to_panel( MagmaLower, ib, A(nq_i-ib,i), lda, T2 ); if (left) { /* H or H**H is applied to C(1:m-k+i+ib-1,1:n) */ mi = m - k + i + ib; } else { /* H or H**H is applied to C(1:m,1:n-k+i+ib-1) */ ni = n - k + i + ib; } /* Apply H or H**H; First copy T to the GPU */ magma_dsetmatrix( ib, ib, T, ib, dT, ib ); magma_dlarfb_gpu( side, trans, MagmaBackward, MagmaColumnwise, mi, ni, ib, dV, nq_i, dT, ib, dC, lddc, dwork, ldwork ); } magma_dgetmatrix( m, n, dC, lddc, C, ldc ); magma_free( dwork ); magma_free_pinned( T ); } work[0] = MAGMA_D_MAKE( lwkopt, 0 ); return *info; } /* magma_dormql */
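/* Usage sketch (illustrative): compute a QL factorization of an m-by-k
   matrix with LAPACK's dgeqlf, then apply Q**H from the left to an m-by-n
   matrix C with the hybrid magma_dormql above. Both steps use the
   lwork = -1 query documented above; the helper name apply_qh_from_ql is
   hypothetical and error handling is abbreviated. */
#include "magma.h"
#include "magma_lapack.h"

void apply_qh_from_ql(
    magma_int_t m, magma_int_t n, magma_int_t k,
    double *A, magma_int_t lda, double *tau,   /* filled by dgeqlf */
    double *C, magma_int_t ldc )
{
    magma_int_t info, lwork = -1;
    double query, *work;

    /* QL factorization of the m-by-k matrix A. */
    lapackf77_dgeqlf( &m, &k, A, &lda, tau, &query, &lwork, &info );
    lwork = (magma_int_t) query;
    magma_dmalloc_cpu( &work, lwork );
    lapackf77_dgeqlf( &m, &k, A, &lda, tau, work, &lwork, &info );
    magma_free_cpu( work );

    /* Apply Q**H to C; query again since dormql sizes its own workspace. */
    lwork = -1;
    magma_dormql( MagmaLeft, MagmaTrans, m, n, k,
                  A, lda, tau, C, ldc, &query, lwork, &info );
    lwork = (magma_int_t) query;
    magma_dmalloc_cpu( &work, lwork );
    magma_dormql( MagmaLeft, MagmaTrans, m, n, k,
                  A, lda, tau, C, ldc, work, lwork, &info );
    magma_free_cpu( work );
}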
extern "C" magma_int_t magma_dgeqrf_gpu( magma_int_t m, magma_int_t n, double *dA, magma_int_t ldda, double *tau, double *dT, magma_int_t *info ) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= DGEQRF computes a QR factorization of a real M-by-N matrix A: A = Q * R. This version stores the triangular dT matrices used in the block QR factorization so that they can be applied directly (i.e., without being recomputed) later. As a result, the application of Q is much faster. Also, the upper triangular matrices for V have 0s in them. The corresponding parts of the upper triangular R are inverted and stored separately in dT. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. dA (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be dividable by 16. TAU (output) DOUBLE_PRECISION array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). dT (workspace/output) DOUBLE_PRECISION array on the GPU, dimension (2*MIN(M, N) + (N+31)/32*32 )*NB, where NB can be obtained through magma_get_dgeqrf_nb(M). It starts with MIN(M,N)*NB block that store the triangular T matrices, followed by the MIN(M,N)*NB block of the diagonal inverses for the R matrix. The rest of the array is used as workspace. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). 
===================================================================== */ #define a_ref(a_1,a_2) (dA+(a_2)*(ldda) + (a_1)) #define t_ref(a_1) (dT+(a_1)*nb) #define d_ref(a_1) (dT+(minmn+(a_1))*nb) #define dd_ref(a_1) (dT+(2*minmn+(a_1))*nb) #define work_ref(a_1) ( work + (a_1)) #define hwork ( work + (nb)*(m)) magma_int_t i, k, minmn, old_i, old_ib, rows, cols; magma_int_t ib, nb; magma_int_t ldwork, lddwork, lwork, lhwork; double *work, *ut; /* check arguments */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = minmn = min(m,n); if (k == 0) return *info; nb = magma_get_dgeqrf_nb(m); lwork = (m + n + nb)*nb; lhwork = lwork - m*nb; if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, lwork )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } ut = hwork+nb*(n); memset( ut, 0, nb*nb*sizeof(double)); magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); ldwork = m; lddwork= n; if ( (nb > 1) && (nb < k) ) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; i < k-nb; i += nb) { ib = min(k-i, nb); rows = m -i; magma_dgetmatrix_async( rows, ib, a_ref(i,i), ldda, work_ref(i), ldwork, stream[1] ); if (i>0){ /* Apply H' to A(i:m,i+2*ib:n) from the left */ cols = n-old_i-2*old_ib; magma_dlarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, m-old_i, cols, old_ib, a_ref(old_i, old_i ), ldda, t_ref(old_i), nb, a_ref(old_i, old_i+2*old_ib), ldda, dd_ref(0), lddwork); /* store the diagonal */ magma_dsetmatrix_async( old_ib, old_ib, ut, old_ib, d_ref(old_i), old_ib, stream[0] ); } magma_queue_sync( stream[1] ); lapackf77_dgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &ib); /* Put 0s in the upper triangular part of a panel (and 1s on the diagonal); copy the upper triangular in ut and invert it. */ magma_queue_sync( stream[0] ); dsplit_diag_block(ib, work_ref(i), ldwork, ut); magma_dsetmatrix( rows, ib, work_ref(i), ldwork, a_ref(i,i), ldda ); if (i + ib < n) { /* Send the triangular factor T to the GPU */ magma_dsetmatrix( ib, ib, hwork, ib, t_ref(i), nb ); if (i+nb < k-nb){ /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_dlarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, a_ref(i, i ), ldda, t_ref(i), nb, a_ref(i, i+ib), ldda, dd_ref(0), lddwork); } else { cols = n-i-ib; magma_dlarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, cols, ib, a_ref(i, i ), ldda, t_ref(i), nb, a_ref(i, i+ib), ldda, dd_ref(0), lddwork); /* Fix the diagonal block */ magma_dsetmatrix( ib, ib, ut, ib, d_ref(i), ib ); } old_i = i; old_ib = ib; } } } else { i = 0; } /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; magma_dgetmatrix( rows, ib, a_ref(i, i), ldda, work, rows ); lhwork = lwork - rows*ib; lapackf77_dgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info); magma_dsetmatrix( rows, ib, work, rows, a_ref(i, i), ldda ); } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free_pinned( work ); return *info; /* End of MAGMA_DGEQRF */ } /* magma_dgeqrf */
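/* The dT layout documented above implies the allocation size
   (2*MIN(M,N) + (N+31)/32*32) * NB doubles: MIN(M,N)*NB for the T factors,
   MIN(M,N)*NB for the inverted diagonal blocks of R, and the remainder as
   device workspace. Illustrative sketch (helper name qr_with_stored_t is
   hypothetical; dA and tau are assumed allocated): */
#include "magma.h"

void qr_with_stored_t( magma_int_t m, magma_int_t n,
                       double *dA, magma_int_t ldda, double *tau )
{
    magma_int_t info;
    magma_int_t nb      = magma_get_dgeqrf_nb( m );
    magma_int_t minmn   = (m < n ? m : n);
    magma_int_t dt_size = (2*minmn + ((n+31)/32)*32) * nb;

    double *dT;
    magma_dmalloc( &dT, dt_size );
    magma_dgeqrf_gpu( m, n, dA, ldda, tau, dT, &info );
    /* ... Q can now be applied via the stored T factors ... */
    magma_free( dT );
}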
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgemm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf, cpu_time; double magma_error, cublas_error, Cnorm, work[1]; magma_int_t M, N, K; magma_int_t Am, An, Bm, Bn; magma_int_t sizeA, sizeB, sizeC; magma_int_t lda, ldb, ldc, ldda, lddb, lddc; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; double *h_A, *h_B, *h_C, *h_Cmagma, *h_Ccublas; double *d_A, *d_B, *d_C; double c_neg_one = MAGMA_D_NEG_ONE; double alpha = MAGMA_D_MAKE( 0.29, -0.86 ); double beta = MAGMA_D_MAKE( -0.48, 0.38 ); magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("If running lapack (option --lapack), MAGMA and CUBLAS error are both computed\n" "relative to CPU BLAS result. Else, MAGMA error is computed relative to CUBLAS result.\n\n"); printf("transA = %s, transB = %s\n", lapack_trans_const(opts.transA), lapack_trans_const(opts.transB) ); printf(" M N K MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA error CUBLAS error\n"); printf("=========================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; K = opts.ksize[itest]; gflops = FLOPS_DGEMM( M, N, K ) / 1e9; if ( opts.transA == MagmaNoTrans ) { lda = Am = M; An = K; } else { lda = Am = K; An = M; } if ( opts.transB == MagmaNoTrans ) { ldb = Bm = K; Bn = N; } else { ldb = Bm = N; Bn = K; } ldc = M; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; lddc = ((ldc+31)/32)*32; sizeA = lda*An; sizeB = ldb*Bn; sizeC = ldc*N; TESTING_MALLOC_CPU( h_A, double, lda*An ); TESTING_MALLOC_CPU( h_B, double, ldb*Bn ); TESTING_MALLOC_CPU( h_C, double, ldc*N ); TESTING_MALLOC_CPU( h_Cmagma, double, ldc*N ); TESTING_MALLOC_CPU( h_Ccublas, double, ldc*N ); TESTING_MALLOC_DEV( d_A, double, ldda*An ); TESTING_MALLOC_DEV( d_B, double, lddb*Bn ); TESTING_MALLOC_DEV( d_C, double, lddc*N ); /* Initialize the matrices */ lapackf77_dlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_dlarnv( &ione, ISEED, &sizeB, h_B ); lapackf77_dlarnv( &ione, ISEED, &sizeC, h_C ); /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ magma_dsetmatrix( Am, An, h_A, lda, d_A, ldda ); magma_dsetmatrix( Bm, Bn, h_B, ldb, d_B, lddb ); magma_dsetmatrix( M, N, h_C, ldc, d_C, lddc ); magma_time = magma_sync_wtime( NULL ); magmablas_dgemm( opts.transA, opts.transB, M, N, K, alpha, d_A, ldda, d_B, lddb, beta, d_C, lddc ); magma_time = magma_sync_wtime( NULL ) - magma_time; magma_perf = gflops / magma_time; magma_dgetmatrix( M, N, d_C, lddc, h_Cmagma, ldc ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_dsetmatrix( M, N, h_C, ldc, d_C, lddc ); cublas_time = magma_sync_wtime( NULL ); cublasDgemm( handle, cublas_trans_const(opts.transA), cublas_trans_const(opts.transB), M, N, K, &alpha, d_A, ldda, d_B, lddb, &beta, d_C, lddc ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_dgetmatrix( M, N, d_C, lddc, h_Ccublas, ldc ); /* 
===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_dgemm( lapack_trans_const(opts.transA), lapack_trans_const(opts.transB), &M, &N, &K, &alpha, h_A, &lda, h_B, &ldb, &beta, h_C, &ldc ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma & cublas, relative to lapack, // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_dlange( "M", &M, &N, h_C, &ldc, work ); blasf77_daxpy( &sizeC, &c_neg_one, h_C, &ione, h_Cmagma, &ione ); magma_error = lapackf77_dlange( "M", &M, &N, h_Cmagma, &ldc, work ) / Cnorm; blasf77_daxpy( &sizeC, &c_neg_one, h_C, &ione, h_Ccublas, &ione ); cublas_error = lapackf77_dlange( "M", &M, &N, h_Ccublas, &ldc, work ) / Cnorm; printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, (int) K, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error, (magma_error < tol && cublas_error < tol ? "ok" : "failed")); status += ! (magma_error < tol && cublas_error < tol); } else { // compute relative error for magma, relative to cublas Cnorm = lapackf77_dlange( "M", &M, &N, h_Ccublas, &ldc, work ); blasf77_daxpy( &sizeC, &c_neg_one, h_Ccublas, &ione, h_Cmagma, &ione ); magma_error = lapackf77_dlange( "M", &M, &N, h_Cmagma, &ldc, work ) / Cnorm; printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) --- ( --- ) %8.2e --- %s\n", (int) M, (int) N, (int) K, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, magma_error, (magma_error < tol ? "ok" : "failed")); status += ! (magma_error < tol); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_C ); TESTING_FREE_CPU( h_Cmagma ); TESTING_FREE_CPU( h_Ccublas ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( d_C ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqr2_gpu */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_R, *tau, *dtau, *h_work, tmp[1]; magmaDouble_ptr d_A; magmaDouble_ptr dwork; magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); opts.lapack |= opts.check; // check (-c) implies lapack (-l) printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) ||R||_F / ||A||_F\n"); printf("=======================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_DGEQRF( M, N ) / 1e9; // query for workspace size lwork = -1; lapackf77_dgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); TESTING_MALLOC_CPU( tau, double, min_mn ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_CPU( h_work, double, lwork ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( dtau, double, min_mn ); TESTING_MALLOC_DEV( dwork, double, min_mn ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_dsetmatrix( M, N, h_R, lda, d_A, ldda ); // warmup if ( opts.warmup ) { magma_dgeqr2_gpu( M, N, d_A, ldda, dtau, dwork, &info ); magma_dsetmatrix( M, N, h_R, lda, d_A, ldda ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_sync_wtime( 0 ); magma_dgeqr2_gpu( M, N, d_A, ldda, dtau, dwork, &info ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dgeqr2_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_dgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_dgetmatrix( M, N, d_A, ldda, h_R, M ); error = lapackf77_dlange("f", &M, &N, h_A, &lda, work); blasf77_daxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_dlange("f", &M, &N, h_R, &lda, work) / error; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, cpu_perf, 1000.*cpu_time, gpu_perf, 1000.*gpu_time, error, (error < tol ? "ok" : "failed")); status += !
(error < tol); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, 1000.*gpu_time ); } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( dtau ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dlaset Code is very similar to testing_dlacpy.cpp */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_R; magmaDouble_ptr d_A; double offdiag = MAGMA_D_MAKE( 1.2000, 6.7000 ); double diag = MAGMA_D_MAKE( 3.1415, 2.7183 ); magma_int_t M, N, size, lda, ldda; magma_int_t ione = 1; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); magma_uplo_t uplo[] = { MagmaLower, MagmaUpper, MagmaFull }; printf("uplo M N CPU GByte/s (ms) GPU GByte/s (ms) check\n"); printf("=================================================================\n"); for( int iuplo = 0; iuplo < 3; ++iuplo ) { for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; //M += 2; // space for insets //N += 2; lda = M; ldda = ((M+31)/32)*32; size = lda*N; if ( uplo[iuplo] == MagmaLower || uplo[iuplo] == MagmaUpper ) { // save triangle (with diagonal) // TODO wrong for trapezoid gbytes = sizeof(double) * 0.5*N*(N+1) / 1e9; } else { // save entire matrix gbytes = sizeof(double) * 1.*M*N / 1e9; } TESTING_MALLOC_CPU( h_A, double, size ); TESTING_MALLOC_CPU( h_R, double, size ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); /* Initialize the matrix */ for( int j = 0; j < N; ++j ) { for( int i = 0; i < M; ++i ) { h_A[i + j*lda] = MAGMA_D_MAKE( i + j/10000., j ); } } /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_dsetmatrix( M, N, h_A, lda, d_A, 0, ldda, opts.queue ); gpu_time = magma_sync_wtime( 0 ); //magmablas_dlaset( uplo[iuplo], M-2, N-2, offdiag, diag, d_A+1+ldda, 0, ldda, opts.queue ); // inset by 1 row & col magmablas_dlaset( uplo[iuplo], M, N, offdiag, diag, d_A, 0, ldda, opts.queue ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gbytes / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); //magma_int_t M2 = M-2; // inset by 1 row & col //magma_int_t N2 = N-2; //lapackf77_dlaset( lapack_uplo_const( uplo[iuplo] ), &M2, &N2, &offdiag, &diag, h_A+1+lda, &lda ); lapackf77_dlaset( lapack_uplo_const( uplo[iuplo] ), &M, &N, &offdiag, &diag, h_A, &lda ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; if ( opts.verbose ) { printf( "A= " ); magma_dprint( M, N, h_A, lda ); printf( "dA=" ); magma_dprint_gpu( M, N, d_A, 0, ldda, opts.queue ); } /* ===================================================================== Check the result =================================================================== */ magma_dgetmatrix( M, N, d_A, 0, ldda, h_R, lda, opts.queue ); blasf77_daxpy(&size, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_dlange("f", &M, &N, h_R, &lda, work); printf("%5s %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %s\n", lapack_uplo_const( uplo[iuplo] ), (int) M, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., (error == 0. ? "ok" : "failed") ); status += ! (error == 0.); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } printf( "\n" ); } TESTING_FINALIZE(); return status; }
/** Purpose ------- DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] d_lA DOUBLE PRECISION array of pointers on the GPU, dimension (ngpu). On entry, the M-by-N matrix A distributed over GPUs (d_lA[d] points to the local matrix on the d-th GPU). It uses a 1-D block-column-cyclic format with block size nb, and each local matrix is stored by column. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. @param[in] ldda INTEGER The leading dimension of the array d_lA. LDDA >= max(1,M). @param[out] ipiv INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a failed memory allocation. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_dgesv_comp ********************************************************************/ extern "C" magma_int_t magma_dgetrf_mgpu( magma_int_t ngpu, magma_int_t m, magma_int_t n, magmaDouble_ptr d_lA[], magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info) { magma_int_t nb, n_local[MagmaMaxGPUs]; magma_int_t maxm; magma_int_t i, j, d, lddat, lddwork; double *d_lAT[MagmaMaxGPUs]; double *d_panel[MagmaMaxGPUs], *work; magma_queue_t queues[MagmaMaxGPUs][2]; /* Check arguments */ *info = 0; if (m < 0) *info = -2; else if (n < 0) *info = -3; else if (ldda < max(1,m)) *info = -5; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* create the queues */ for( d=0; d < ngpu; d++ ) { magma_queue_create( d, &queues[d][0] ); magma_queue_create( d, &queues[d][1] ); } /* Function Body */ nb = magma_get_dgetrf_nb( m, n ); if (nb <= 1 || nb >= n) { /* Use CPU code. */ magma_dmalloc_cpu( &work, m * n ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_dgetmatrix( m, n, d_lA[0], ldda, work, m, queues[0][0] ); lapackf77_dgetrf(&m, &n, work, &m, ipiv, info); magma_dsetmatrix( m, n, work, m, d_lA[0], ldda, queues[0][0] ); magma_free_cpu(work); } else { /* Use hybrid blocked code.
        */
        magma_device_t orig_dev;
        magma_getdevice( &orig_dev );

        maxm = magma_roundup( m, 32 );

        if ( ngpu > ceil((double)n/nb) ) {
            printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu );
            *info = -1;
            /* destroy the queues created above before returning */
            for( d=0; d < ngpu; d++ ) {
                magma_setdevice(d);
                magma_queue_destroy( queues[d][0] );
                magma_queue_destroy( queues[d][1] );
            }
            return *info;
        }

        /* allocate workspace for each GPU */
        lddat = magma_ceildiv( n, nb );        /* number of block columns         */
        lddat = magma_ceildiv( lddat, ngpu );  /* number of block columns per GPU */
        lddat = nb*lddat;                      /* number of columns per GPU       */
        lddat = magma_roundup( lddat, 32 );    /* make it a multiple of 32        */
        for (i=0; i < ngpu; i++) {
            magma_setdevice(i);

            /* local-n and local-ld */
            n_local[i] = ((n/nb)/ngpu)*nb;
            if (i < (n/nb)%ngpu)
                n_local[i] += nb;
            else if (i == (n/nb)%ngpu)
                n_local[i] += n%nb;

            /* workspaces */
            if (MAGMA_SUCCESS != magma_dmalloc( &d_panel[i], (3+ngpu)*nb*maxm )) {
                /* free everything allocated on the previous GPUs */
                for( j=0; j < i; j++ ) {
                    magma_setdevice(j);
                    magma_free( d_panel[j] );
                    magma_free( d_lAT[j] );
                }
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }

            /* local-matrix storage */
            if (MAGMA_SUCCESS != magma_dmalloc( &d_lAT[i], lddat*maxm )) {
                for( j=0; j <= i; j++ ) {
                    magma_setdevice(j);
                    magma_free( d_panel[j] );
                }
                for( j=0; j < i; j++ ) {
                    magma_setdevice(j);
                    magma_free( d_lAT[j] );
                }
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }

            magmablas_dtranspose( m, n_local[i], d_lA[i], ldda, d_lAT[i], lddat, queues[i][1] );
        }
        for (i=0; i < ngpu; i++) {
            magma_setdevice(i);
            /* wait for the transposes issued on queues[i][1] above */
            magma_queue_sync(queues[i][1]);
        }
        magma_setdevice(0);

        /* cpu workspace */
        lddwork = maxm;
        if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, lddwork*nb*ngpu )) {
            for (i=0; i < ngpu; i++ ) {
                magma_setdevice(i);
                magma_free( d_panel[i] );
                magma_free( d_lAT[i] );
            }
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }

        /* calling multi-gpu interface with allocated workspaces and queues */
        magma_dgetrf2_mgpu(ngpu, m, n, nb, 0, d_lAT, lddat, ipiv, d_panel, work, maxm,
                           queues, info);

        /* clean up */
        for( d=0; d < ngpu; d++ ) {
            magma_setdevice(d);

            /* save on output */
            magmablas_dtranspose( n_local[d], m, d_lAT[d], lddat, d_lA[d], ldda, queues[d][0] );
            magma_queue_sync(queues[d][0]);
            magma_queue_sync(queues[d][1]);

            magma_free( d_lAT[d] );
            magma_free( d_panel[d] );
        } /* end of for d=0,...,ngpu-1 */
        magma_setdevice( orig_dev );

        magma_free_pinned( work );
    }

    /* clean up */
    for( d=0; d < ngpu; d++ ) {
        magma_setdevice(d);
        magma_queue_destroy( queues[d][0] );
        magma_queue_destroy( queues[d][1] );
    }

    return *info;
}
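/* Example (sketch, not part of the MAGMA sources): distributing a host matrix
   over ngpu devices in the 1-D block-column-cyclic layout magma_dgetrf_mgpu
   expects, then factoring it. The helper name lu_factor_mgpu is hypothetical,
   error handling is elided, and the queue-less set-matrix wrappers used by
   the testers in this document are assumed. */
magma_int_t lu_factor_mgpu( magma_int_t ngpu, magma_int_t m, magma_int_t n,
                            double *A, magma_int_t lda, magma_int_t *ipiv )
{
    magma_int_t info, nb = magma_get_dgetrf_nb( m, n );
    magma_int_t ldda = magma_roundup( m, 32 );
    double *d_lA[MagmaMaxGPUs];

    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        /* local column count for device dev (same formula the routine uses) */
        magma_int_t n_local = ((n/nb)/ngpu)*nb;
        if      ( dev <  (n/nb) % ngpu ) n_local += nb;
        else if ( dev == (n/nb) % ngpu ) n_local += n % nb;
        magma_dmalloc( &d_lA[dev], ldda*n_local );
        /* copy every ngpu-th block column of A to this device */
        for( magma_int_t j = dev*nb, jloc = 0; j < n; j += ngpu*nb, jloc += nb ) {
            magma_int_t jb = min( nb, n - j );
            magma_dsetmatrix( m, jb, A + j*lda, lda, d_lA[dev] + jloc*ldda, ldda );
        }
    }

    magma_dgetrf_mgpu( ngpu, m, n, d_lA, ldda, ipiv, &info );

    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        magma_free( d_lA[dev] );   /* real code would copy the factors back first */
    }
    return info;
}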
/* //////////////////////////////////////////////////////////////////////////// -- Testing dswap, dswapblk, dpermute, dlaswp, dlaswpx */ int main( int argc, char** argv) { TESTING_INIT(); double *h_A1, *h_A2; double *d_A1, *d_A2; double *h_R1, *h_R2; // row-major and column-major performance real_Double_t row_perf0, col_perf0; real_Double_t row_perf1, col_perf1; real_Double_t row_perf2, col_perf2; real_Double_t row_perf3; real_Double_t row_perf4; real_Double_t row_perf5, col_perf5; real_Double_t row_perf6, col_perf6; real_Double_t row_perf7; real_Double_t cpu_perf; real_Double_t time, gbytes; magma_int_t N, lda, ldda, nb, j; magma_int_t ione = 1; magma_int_t *ipiv, *ipiv2; magma_int_t *d_ipiv; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); magma_queue_t queue = 0; printf(" cublasDswap dswap dswapblk dlaswp dpermute dlaswp2 dlaswpx dcopymatrix CPU (all in )\n"); printf(" N nb row-maj/col-maj row-maj/col-maj row-maj/col-maj row-maj row-maj row-maj row-maj/col-maj row-blk/col-blk dlaswp (GByte/s)\n"); printf("==================================================================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { // For an N x N matrix, swap nb rows or nb columns using various methods. // Each test is assigned one bit in the 'check' bitmask; bit=1 indicates failure. // The variable 'shift' keeps track of which bit is for current test int shift = 1; int check = 0; N = opts.nsize[itest]; lda = N; ldda = ((N+31)/32)*32; nb = (opts.nb > 0 ? opts.nb : magma_get_dgetrf_nb( N )); nb = min( N, nb ); // each swap does 2N loads and 2N stores, for nb swaps gbytes = sizeof(double) * 4.*N*nb / 1e9; TESTING_MALLOC_PIN( h_A1, double, lda*N ); TESTING_MALLOC_PIN( h_A2, double, lda*N ); TESTING_MALLOC_PIN( h_R1, double, lda*N ); TESTING_MALLOC_PIN( h_R2, double, lda*N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, nb ); TESTING_MALLOC_CPU( ipiv2, magma_int_t, nb ); TESTING_MALLOC_DEV( d_ipiv, magma_int_t, nb ); TESTING_MALLOC_DEV( d_A1, double, ldda*N ); TESTING_MALLOC_DEV( d_A2, double, ldda*N ); for( j=0; j < nb; j++ ) { ipiv[j] = (magma_int_t) ((rand()*1.*N) / (RAND_MAX * 1.)) + 1; } /* ===================================================================== * cublasDswap, row-by-row (2 matrices) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { cublasDswap( N, d_A1+ldda*j, 1, d_A2+ldda*(ipiv[j]-1), 1); } } time = magma_sync_wtime( queue ) - time; row_perf0 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_dswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione); } } magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* Column Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { cublasDswap( N, d_A1+j, ldda, d_A2+ipiv[j]-1, ldda); } } time = magma_sync_wtime( queue ) - time; col_perf0 = gbytes / time; for( j=0; j < nb; j++) { if ( 
j != (ipiv[j]-1)) { blasf77_dswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda); } } magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* ===================================================================== * dswap, row-by-row (2 matrices) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { magmablas_dswap( N, d_A1+ldda*j, 1, d_A2+ldda*(ipiv[j]-1), 1); } } time = magma_sync_wtime( queue ) - time; row_perf1 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_dswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione); } } magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* Column Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { magmablas_dswap( N, d_A1+j, ldda, d_A2+ipiv[j]-1, ldda ); } } time = magma_sync_wtime( queue ) - time; col_perf1 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_dswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda); } } magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* ===================================================================== * dswapblk, blocked version (2 matrices) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); magmablas_dswapblk( MagmaRowMajor, N, d_A1, ldda, d_A2, ldda, 1, nb, ipiv, 1, 0); time = magma_sync_wtime( queue ) - time; row_perf2 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_dswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione); } } magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* Column Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); magmablas_dswapblk( MagmaColMajor, N, d_A1, ldda, d_A2, ldda, 1, nb, ipiv, 1, 0); time = magma_sync_wtime( queue ) - time; col_perf2 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_dswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda); } } magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* ===================================================================== * dpermute_long (1 matrix) */ /* Row Major */ memcpy( ipiv2, ipiv, 
nb*sizeof(magma_int_t) ); // dpermute updates ipiv2 init_matrix( N, N, h_A1, lda, 0 ); magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_dpermute_long2( N, d_A1, ldda, ipiv2, nb, 0 ); time = magma_sync_wtime( queue ) - time; row_perf3 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_dswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* ===================================================================== * LAPACK-style dlaswp (1 matrix) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_dlaswp( N, d_A1, ldda, 1, nb, ipiv, 1); time = magma_sync_wtime( queue ) - time; row_perf4 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_dswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* ===================================================================== * LAPACK-style dlaswp (1 matrix) - d_ipiv on GPU */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magma_setvector( nb, sizeof(magma_int_t), ipiv, 1, d_ipiv, 1 ); magmablas_dlaswp2( N, d_A1, ldda, 1, nb, d_ipiv, 1 ); time = magma_sync_wtime( queue ) - time; row_perf7 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_dswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* ===================================================================== * LAPACK-style dlaswpx (extended for row- and col-major) (1 matrix) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_dlaswpx( N, d_A1, ldda, 1, 1, nb, ipiv, 1); time = magma_sync_wtime( queue ) - time; row_perf5 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_dswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* Col Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_dlaswpx( N, d_A1, 1, ldda, 1, nb, ipiv, 1); time = magma_sync_wtime( queue ) - time; col_perf5 = gbytes / time; time = magma_wtime(); lapackf77_dlaswp( &N, h_A1, &lda, &ione, &nb, ipiv, &ione); time = magma_wtime() - time; cpu_perf = gbytes / time; magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* ===================================================================== * Copy matrix. 
*/ time = magma_sync_wtime( queue ); magma_dcopymatrix( N, nb, d_A1, ldda, d_A2, ldda ); time = magma_sync_wtime( queue ) - time; // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap col_perf6 = 0.5 * gbytes / time; time = magma_sync_wtime( queue ); magma_dcopymatrix( nb, N, d_A1, ldda, d_A2, ldda ); time = magma_sync_wtime( queue ) - time; // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap row_perf6 = 0.5 * gbytes / time; printf("%5d %3d %6.2f%c/ %6.2f%c %6.2f%c/ %6.2f%c %6.2f%c/ %6.2f%c %6.2f%c %6.2f%c %6.2f%c %6.2f%c/ %6.2f%c %6.2f / %6.2f %6.2f %10s\n", (int) N, (int) nb, row_perf0, ((check & 0x001) != 0 ? '*' : ' '), col_perf0, ((check & 0x002) != 0 ? '*' : ' '), row_perf1, ((check & 0x004) != 0 ? '*' : ' '), col_perf1, ((check & 0x008) != 0 ? '*' : ' '), row_perf2, ((check & 0x010) != 0 ? '*' : ' '), col_perf2, ((check & 0x020) != 0 ? '*' : ' '), row_perf3, ((check & 0x040) != 0 ? '*' : ' '), row_perf4, ((check & 0x080) != 0 ? '*' : ' '), row_perf7, ((check & 0x100) != 0 ? '*' : ' '), row_perf5, ((check & 0x200) != 0 ? '*' : ' '), col_perf5, ((check & 0x400) != 0 ? '*' : ' '), row_perf6, col_perf6, cpu_perf, (check == 0 ? "ok" : "* failed") ); status += ! (check == 0); TESTING_FREE_PIN( h_A1 ); TESTING_FREE_PIN( h_A2 ); TESTING_FREE_PIN( h_R1 ); TESTING_FREE_PIN( h_R2 ); TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( ipiv2 ); TESTING_FREE_DEV( d_ipiv ); TESTING_FREE_DEV( d_A1 ); TESTING_FREE_DEV( d_A2 ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
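/* Reference semantics (sketch, hypothetical helper): the host-side check loop
   used throughout this tester. It applies the 1-based pivot vector ipiv to
   rows 0..nb-1 of a column-major N-by-N matrix, exchanging entire rows, which
   is what lapackf77_dlaswp does for unit increments. */
static void laswp_reference( magma_int_t N, double *A, magma_int_t lda,
                             magma_int_t nb, const magma_int_t *ipiv )
{
    for( magma_int_t j = 0; j < nb; ++j ) {
        magma_int_t jp = ipiv[j] - 1;               /* 0-based pivot row */
        if ( jp != j ) {
            for( magma_int_t k = 0; k < N; ++k ) {  /* swap across all columns */
                double tmp    = A[j  + k*lda];
                A[j  + k*lda] = A[jp + k*lda];
                A[jp + k*lda] = tmp;
            }
        }
    }
}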
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgetri */ int main( int argc, char** argv ) { TESTING_INIT(); // constants const double c_zero = MAGMA_D_ZERO; const double c_one = MAGMA_D_ONE; const double c_neg_one = MAGMA_D_NEG_ONE; real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double *h_A, *h_Ainv, *h_R, *work; magmaDouble_ptr d_A, dwork; magma_int_t N, n2, lda, ldda, info, lwork, ldwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double tmp; double error, rwork[1]; magma_int_t *ipiv; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("%% N CPU Gflop/s (sec) GPU Gflop/s (sec) ||I - A*A^{-1}||_1 / (N*cond(A))\n"); printf("%%===============================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; ldda = magma_roundup( N, opts.align ); // multiple of 32 by default ldwork = N * magma_get_dgetri_nb( N ); gflops = FLOPS_DGETRI( N ) / 1e9; // query for workspace size lwork = -1; lapackf77_dgetri( &N, NULL, &lda, NULL, &tmp, &lwork, &info ); if (info != 0) { printf("lapackf77_dgetri returned error %d: %s.\n", (int) info, magma_strerror( info )); } lwork = magma_int_t( MAGMA_D_REAL( tmp )); TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_CPU( work, double, lwork ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_CPU( h_Ainv, double, n2 ); TESTING_MALLOC_CPU( h_R, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( dwork, double, ldwork ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); /* Factor the matrix. Both MAGMA and LAPACK will use this factor. 
*/ magma_dsetmatrix( N, N, h_A, lda, d_A, ldda ); magma_dgetrf_gpu( N, N, d_A, ldda, ipiv, &info ); magma_dgetmatrix( N, N, d_A, ldda, h_Ainv, lda ); if (info != 0) { printf("magma_dgetrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } // check for exact singularity //h_Ainv[ 10 + 10*lda ] = MAGMA_D_MAKE( 0.0, 0.0 ); //magma_dsetmatrix( N, N, h_Ainv, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_dgetri_gpu( N, d_A, ldda, ipiv, dwork, ldwork, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_dgetri_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_dgetri( &N, h_Ainv, &lda, ipiv, work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_dgetri returned error %d: %s.\n", (int) info, magma_strerror( info )); } printf( "%5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf( "%5d --- ( --- ) %7.2f (%7.2f)", (int) N, gpu_perf, gpu_time ); } /* ===================================================================== Check the result =================================================================== */ if ( opts.check ) { magma_dgetmatrix( N, N, d_A, ldda, h_Ainv, lda ); // compute 1-norm condition number estimate, following LAPACK's zget03 double normA, normAinv, rcond; normA = lapackf77_dlange( "1", &N, &N, h_A, &lda, rwork ); normAinv = lapackf77_dlange( "1", &N, &N, h_Ainv, &lda, rwork ); if ( normA <= 0 || normAinv <= 0 ) { rcond = 0; error = 1 / (tol/opts.tolerance); // == 1/eps } else { rcond = (1 / normA) / normAinv; // R = I // R -= A*A^{-1} // err = ||I - A*A^{-1}|| / ( N ||A||*||A^{-1}|| ) = ||R|| * rcond / N, using 1-norm lapackf77_dlaset( "full", &N, &N, &c_zero, &c_one, h_R, &lda ); blasf77_dgemm( "no", "no", &N, &N, &N, &c_neg_one, h_A, &lda, h_Ainv, &lda, &c_one, h_R, &lda ); error = lapackf77_dlange( "1", &N, &N, h_R, &lda, rwork ); error = error * rcond / N; } bool okay = (error < tol); status += ! okay; printf( " %8.2e %s\n", error, (okay ? "ok" : "failed")); } else { printf( "\n" ); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_Ainv ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
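/* Minimal inversion flow (sketch, hypothetical helper) mirroring the tester
   above: LU-factor on the GPU, then invert in place. ldwork follows the
   tester's choice of N * magma_get_dgetri_nb(N); error handling is elided. */
magma_int_t invert_gpu( magma_int_t N, double *A, magma_int_t lda )
{
    magma_int_t info, ldda = magma_roundup( N, 32 );
    magma_int_t ldwork = N * magma_get_dgetri_nb( N );
    magma_int_t *ipiv;
    double *dA, *dwork;

    magma_imalloc_cpu( &ipiv, N );
    magma_dmalloc( &dA,    ldda*N );
    magma_dmalloc( &dwork, ldwork );

    magma_dsetmatrix( N, N, A, lda, dA, ldda );
    magma_dgetrf_gpu( N, N, dA, ldda, ipiv, &info );                  /* A = P*L*U   */
    if ( info == 0 )
        magma_dgetri_gpu( N, dA, ldda, ipiv, dwork, ldwork, &info );  /* dA = A^{-1} */
    magma_dgetmatrix( N, N, dA, ldda, A, lda );

    magma_free( dA );
    magma_free( dwork );
    magma_free_cpu( ipiv );
    return info;
}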
/**
    Purpose
    -------
    DSYGVDX computes selected eigenvalues and, optionally, eigenvectors
    of a real generalized symmetric-definite eigenproblem, of the form
    A*x=(lambda)*B*x, A*B*x=(lambda)*x, or B*A*x=(lambda)*x. Here A and
    B are assumed to be symmetric and B is also positive definite.
    Eigenvalues and eigenvectors can be selected by specifying either a
    range of values or a range of indices for the desired eigenvalues.
    If eigenvectors are desired, it uses a divide and conquer algorithm.

    The divide and conquer algorithm makes very mild assumptions about
    floating point arithmetic. It will work on machines with a guard
    digit in add/subtract, or on those binary machines without guard
    digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
    Cray-2. It could conceivably fail on hexadecimal or decimal machines
    without guard digits, but we know of none.

    Arguments
    ---------
    @param[in]
    itype   INTEGER
            Specifies the problem type to be solved:
            = 1:  A*x = (lambda)*B*x
            = 2:  A*B*x = (lambda)*x
            = 3:  B*A*x = (lambda)*x

    @param[in]
    jobz    magma_vec_t
      -     = MagmaNoVec:  Compute eigenvalues only;
      -     = MagmaVec:    Compute eigenvalues and eigenvectors.

    @param[in]
    range   magma_range_t
      -     = MagmaRangeAll: all eigenvalues will be found.
      -     = MagmaRangeV:   all eigenvalues in the half-open interval (VL,VU]
                             will be found.
      -     = MagmaRangeI:   the IL-th through IU-th eigenvalues will be found.

    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangles of A and B are stored;
      -     = MagmaLower:  Lower triangles of A and B are stored.

    @param[in]
    n       INTEGER
            The order of the matrices A and B.  N >= 0.

    @param[in,out]
    A       DOUBLE PRECISION array, dimension (LDA, N)
            On entry, the symmetric matrix A.  If UPLO = MagmaUpper, the
            leading N-by-N upper triangular part of A contains the
            upper triangular part of the matrix A.  If UPLO = MagmaLower,
            the leading N-by-N lower triangular part of A contains
            the lower triangular part of the matrix A.
    \n
            On exit, if JOBZ = MagmaVec, then if INFO = 0, A contains the
            matrix Z of eigenvectors.  The eigenvectors are normalized
            as follows:
            if ITYPE = 1 or 2, Z**T * B * Z = I;
            if ITYPE = 3,      Z**T * inv(B) * Z = I.
            If JOBZ = MagmaNoVec, then on exit the upper triangle (if
            UPLO=MagmaUpper) or the lower triangle (if UPLO=MagmaLower) of A,
            including the diagonal, is destroyed.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[in,out]
    B       DOUBLE PRECISION array, dimension (LDB, N)
            On entry, the symmetric matrix B.  If UPLO = MagmaUpper, the
            leading N-by-N upper triangular part of B contains the
            upper triangular part of the matrix B.  If UPLO = MagmaLower,
            the leading N-by-N lower triangular part of B contains
            the lower triangular part of the matrix B.
    \n
            On exit, if INFO <= N, the part of B containing the matrix is
            overwritten by the triangular factor U or L from the Cholesky
            factorization B = U**T * U or B = L * L**T.

    @param[in]
    ldb     INTEGER
            The leading dimension of the array B.  LDB >= max(1,N).

    @param[in]
    vl      DOUBLE PRECISION
    @param[in]
    vu      DOUBLE PRECISION
            If RANGE=MagmaRangeV, the lower and upper bounds of the interval to
            be searched for eigenvalues. VL < VU.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeI.

    @param[in]
    il      INTEGER
    @param[in]
    iu      INTEGER
            If RANGE=MagmaRangeI, the indices (in ascending order) of the
            smallest and largest eigenvalues to be returned.
            1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeV.

    @param[out]
    mout    INTEGER
            The total number of eigenvalues found.  0 <= MOUT <= N.
            If RANGE = MagmaRangeAll, MOUT = N, and if RANGE = MagmaRangeI,
            MOUT = IU-IL+1.

    @param[out]
    w       DOUBLE PRECISION array, dimension (N)
            If INFO = 0, the eigenvalues in ascending order.

    @param[out]
    work    (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The length of the array WORK.
            If N <= 1,                      LWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LWORK >= 2*N + N*NB.
            If JOBZ = MagmaVec   and N > 1, LWORK >= max( 2*N + N*NB, 1 + 6*N + 2*N**2 ).
            NB can be obtained through magma_get_dsytrd_nb(N).
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal sizes of the WORK and IWORK
            arrays, returns these values as the first entries of the WORK
            and IWORK arrays, and no error message related to LWORK or
            LIWORK is issued by XERBLA.

    @param[out]
    iwork   (workspace) INTEGER array, dimension (MAX(1,LIWORK))
            On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK.

    @param[in]
    liwork  INTEGER
            The dimension of the array IWORK.
            If N <= 1,                      LIWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LIWORK >= 1.
            If JOBZ = MagmaVec   and N > 1, LIWORK >= 3 + 5*N.
    \n
            If LIWORK = -1, then a workspace query is assumed; the
            routine only calculates the optimal sizes of the WORK and
            IWORK arrays, returns these values as the first entries of
            the WORK and IWORK arrays, and no error message related to
            LWORK or LIWORK is issued by XERBLA.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  DPOTRF or DSYEVD returned an error code:
               <= N: if INFO = i and JOBZ = MagmaNoVec, then the algorithm
                     failed to converge; i off-diagonal elements of an
                     intermediate tridiagonal form did not converge to zero;
                     if INFO = i and JOBZ = MagmaVec, then the algorithm
                     failed to compute an eigenvalue while working on the
                     submatrix lying in rows and columns INFO/(N+1) through
                     mod(INFO,N+1);
               > N:  if INFO = N + i, for 1 <= i <= N, then the leading
                     minor of order i of B is not positive definite.
                     The factorization of B could not be completed and
                     no eigenvalues or eigenvectors were computed.

    Further Details
    ---------------
    Based on contributions by
       Mark Fahey, Department of Mathematics, Univ. of Kentucky, USA

    Modified so that no backsubstitution is performed if DSYEVD fails to
    converge (NEIG in old code could be greater than N causing out of
    bounds reference to A - reported by Ralf Meyer).  Also corrected the
    description of INFO and the test on ITYPE. Sven, 16 Feb 05.
@ingroup magma_dsygv_driver ********************************************************************/ extern "C" magma_int_t magma_dsygvdx( magma_int_t itype, magma_vec_t jobz, magma_range_t range, magma_uplo_t uplo, magma_int_t n, double *A, magma_int_t lda, double *B, magma_int_t ldb, double vl, double vu, magma_int_t il, magma_int_t iu, magma_int_t *mout, double *w, double *work, magma_int_t lwork, #ifdef COMPLEX double *rwork, magma_int_t lrwork, #endif magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { const char* uplo_ = lapack_uplo_const( uplo ); const char* jobz_ = lapack_vec_const( jobz ); double d_one = MAGMA_D_ONE; double *dA=NULL, *dB=NULL; magma_int_t ldda = magma_roundup( n, 32 ); magma_int_t lddb = ldda; magma_int_t lower; magma_trans_t trans; magma_int_t wantz, lquery; magma_int_t alleig, valeig, indeig; magma_int_t lwmin, liwmin; wantz = (jobz == MagmaVec); lower = (uplo == MagmaLower); alleig = (range == MagmaRangeAll); valeig = (range == MagmaRangeV); indeig = (range == MagmaRangeI); lquery = (lwork == -1 || liwork == -1); *info = 0; if (itype < 1 || itype > 3) { *info = -1; } else if (! (alleig || valeig || indeig)) { *info = -2; } else if (! (wantz || (jobz == MagmaNoVec))) { *info = -3; } else if (! (lower || (uplo == MagmaUpper))) { *info = -4; } else if (n < 0) { *info = -5; } else if (lda < max(1,n)) { *info = -7; } else if (ldb < max(1,n)) { *info = -9; } else { if (valeig) { if (n > 0 && vu <= vl) { *info = -11; } } else if (indeig) { if (il < 1 || il > max(1,n)) { *info = -12; } else if (iu < min(n,il) || iu > n) { *info = -13; } } } magma_int_t nb = magma_get_dsytrd_nb( n ); if ( n <= 1 ) { lwmin = 1; liwmin = 1; } else if ( wantz ) { lwmin = max( 2*n + n*nb, 1 + 6*n + 2*n*n ); liwmin = 3 + 5*n; } else { lwmin = 2*n + n*nb; liwmin = 1; } work[0] = magma_dmake_lwork( lwmin ); iwork[0] = liwmin; if (lwork < lwmin && ! lquery) { *info = -17; } else if (liwork < liwmin && ! lquery) { *info = -19; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } /* If matrix is very small, then just call LAPACK on CPU, no need for GPU */ if (n <= 128) { lapackf77_dsygvd( &itype, jobz_, uplo_, &n, A, &lda, B, &ldb, w, work, &lwork, iwork, &liwork, info ); *mout = n; return *info; } if (MAGMA_SUCCESS != magma_dmalloc( &dA, n*ldda ) || MAGMA_SUCCESS != magma_dmalloc( &dB, n*lddb )) { magma_free( dA ); magma_free( dB ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); /* Form a Cholesky factorization of B. */ magma_dsetmatrix( n, n, B, ldb, dB, lddb, queue ); magma_dsetmatrix_async( n, n, A, lda, dA, ldda, queue ); magma_timer_t time=0; timer_start( time ); magma_dpotrf_gpu( uplo, n, dB, lddb, info ); if (*info != 0) { *info = n + *info; return *info; } timer_stop( time ); timer_printf( "time dpotrf_gpu = %6.2f\n", time ); magma_queue_sync( queue ); magma_dgetmatrix_async( n, n, dB, lddb, B, ldb, queue ); timer_start( time ); /* Transform problem to standard eigenvalue problem and solve. */ magma_dsygst_gpu( itype, uplo, n, dA, ldda, dB, lddb, info ); timer_stop( time ); timer_printf( "time dsygst_gpu = %6.2f\n", time ); /* simple fix to be able to run bigger size. * set dB=NULL so we know to re-allocate below * TODO: have dwork here that will be used as dB and then passed to dsyevd. 
*/ if (n > 5000) { magma_queue_sync( queue ); magma_free( dB ); dB=NULL; } timer_start( time ); magma_dsyevdx_gpu( jobz, range, uplo, n, dA, ldda, vl, vu, il, iu, mout, w, A, lda, work, lwork, iwork, liwork, info ); timer_stop( time ); timer_printf( "time dsyevdx_gpu = %6.2f\n", time ); if (wantz && *info == 0) { timer_start( time ); /* allocate and copy dB back */ if (dB == NULL) { if (MAGMA_SUCCESS != magma_dmalloc( &dB, n*lddb ) ) { magma_free( dA ); dA=NULL; *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_dsetmatrix( n, n, B, ldb, dB, lddb, queue ); } /* Backtransform eigenvectors to the original problem. */ if (itype == 1 || itype == 2) { /* For A*x=(lambda)*B*x and A*B*x=(lambda)*x; backtransform eigenvectors: x = inv(L)'*y or inv(U)*y */ if (lower) { trans = MagmaTrans; } else { trans = MagmaNoTrans; } magma_dtrsm( MagmaLeft, uplo, trans, MagmaNonUnit, n, *mout, d_one, dB, lddb, dA, ldda, queue ); } else if (itype == 3) { /* For B*A*x=(lambda)*x; backtransform eigenvectors: x = L*y or U'*y */ if (lower) { trans = MagmaNoTrans; } else { trans = MagmaTrans; } magma_dtrmm( MagmaLeft, uplo, trans, MagmaNonUnit, n, *mout, d_one, dB, lddb, dA, ldda, queue ); } magma_dgetmatrix( n, *mout, dA, ldda, A, lda, queue ); timer_stop( time ); timer_printf( "time dtrsm/mm + getmatrix = %6.2f\n", time ); } magma_queue_sync( queue ); magma_queue_destroy( queue ); work[0] = magma_dmake_lwork( lwmin ); iwork[0] = liwmin; magma_free( dA ); dA=NULL; magma_free( dB ); dB=NULL; return *info; } /* magma_dsygvd */
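/* Usage sketch (hypothetical helper, not from the MAGMA sources): the
   standard two-phase workspace-query pattern for magma_dsygvdx as declared
   above, here computing the five smallest eigenpairs of A*x = lambda*B*x
   (itype = 1). Assumes N >= 5, column-major N-by-N A and B with B symmetric
   positive definite; the real (non-COMPLEX) interface without rwork is used. */
magma_int_t smallest_eigs( magma_int_t N, double *A, double *B,
                           magma_int_t lda, double *w )
{
    magma_int_t info, mout;
    magma_int_t lwork = -1, liwork = -1;
    double      work_query;
    magma_int_t iwork_query;

    /* query the optimal workspace sizes */
    magma_dsygvdx( 1, MagmaVec, MagmaRangeI, MagmaLower, N,
                   A, lda, B, lda, 0, 0, 1, 5, &mout, w,
                   &work_query, lwork, &iwork_query, liwork, &info );
    lwork  = (magma_int_t) work_query;
    liwork = iwork_query;

    double      *work;   magma_dmalloc_cpu( &work,  lwork );
    magma_int_t *iwork;  magma_imalloc_cpu( &iwork, liwork );

    magma_dsygvdx( 1, MagmaVec, MagmaRangeI, MagmaLower, N,
                   A, lda, B, lda, 0, 0, 1, 5, &mout, w,
                   work, lwork, iwork, liwork, &info );

    magma_free_cpu( work );
    magma_free_cpu( iwork );
    return info;   /* on success: w[0..mout-1], eigenvectors in first mout columns of A */
}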
/** Purpose ------- DORGQR generates an M-by-N DOUBLE_PRECISION matrix Q with orthonormal columns, which is defined as the first N columns of a product of K elementary reflectors of order M Q = H(1) H(2) . . . H(k) as returned by DGEQRF_GPU. Arguments --------- @param[in] m INTEGER The number of rows of the matrix Q. M >= 0. @param[in] n INTEGER The number of columns of the matrix Q. M >= N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. N >= K >= 0. @param[in,out] dA DOUBLE_PRECISION array A on the GPU, dimension (LDDA,N). On entry, the i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by DGEQRF_GPU in the first k columns of its array argument A. On exit, the M-by-N matrix Q. @param[in] ldda INTEGER The first dimension of the array A. LDDA >= max(1,M). @param[in] tau DOUBLE_PRECISION array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by DGEQRF_GPU. @param[in] dT (workspace) DOUBLE_PRECISION work space array on the GPU, dimension (2*MIN(M, N) + (N+31)/32*32 )*NB. This must be the 6th argument of magma_dgeqrf_gpu [ note that if N here is bigger than N in magma_dgeqrf_gpu, the workspace requirement DT in magma_dgeqrf_gpu must be as specified in this routine ]. @param[in] nb INTEGER This is the block size used in DGEQRF_GPU, and correspondingly the size of the T matrices, used in the factorization, and stored in DT. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument has an illegal value @ingroup magma_dgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_dorgqr_gpu( magma_int_t m, magma_int_t n, magma_int_t k, magmaDouble_ptr dA, magma_int_t ldda, double *tau, magmaDouble_ptr dT, magma_int_t nb, magma_int_t *info) { #define dA(i,j) (dA + (i) + (j)*ldda) #define dT(j) (dT + (j)*nb) double c_zero = MAGMA_D_ZERO; double c_one = MAGMA_D_ONE; magma_int_t m_kk, n_kk, k_kk, mi; magma_int_t lwork, lpanel; magma_int_t i, ib, ki, kk, iinfo; magma_int_t lddwork; magmaDouble_ptr dV, dW; double *work, *panel; *info = 0; if (m < 0) { *info = -1; } else if ((n < 0) || (n > m)) { *info = -2; } else if ((k < 0) || (k > n)) { *info = -3; } else if (ldda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (n <= 0) { return *info; } magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); // first kk columns are handled by blocked method. // ki is start of 2nd-to-last block if ((nb > 1) && (nb < k)) { ki = (k - nb - 1) / nb * nb; kk = min( k, ki+nb ); } else { ki = 0; kk = 0; } // Allocate CPU work space // n*nb for dorgqr workspace // (m - kk)*(n - kk) for last block's panel lwork = n*nb; lpanel = (m - kk)*(n - kk); magma_dmalloc_cpu( &work, lwork + lpanel ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } panel = work + lwork; // Allocate work space on GPU if (MAGMA_SUCCESS != magma_dmalloc( &dV, ldda*nb )) { magma_free_cpu( work ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } // dT workspace has: // 2*min(m,n)*nb for T and R^{-1} matrices from geqrf // ((n+31)/32*32 )*nb for dW larfb workspace. lddwork = min(m,n); dW = dT + 2*lddwork*nb; magma_queue_t stream; magma_queue_create( &stream ); // Use unblocked code for the last or only block. 
if (kk < n) { m_kk = m - kk; n_kk = n - kk; k_kk = k - kk; magma_dgetmatrix( m_kk, k_kk, dA(kk, kk), ldda, panel, m_kk ); lapackf77_dorgqr( &m_kk, &n_kk, &k_kk, panel, &m_kk, &tau[kk], work, &lwork, &iinfo ); magma_dsetmatrix( m_kk, n_kk, panel, m_kk, dA(kk, kk), ldda ); // Set A(1:kk,kk+1:n) to zero. magmablas_dlaset( MagmaFull, kk, n - kk, c_zero, c_zero, dA(0, kk), ldda ); } if (kk > 0) { // Use blocked code // stream: copy Aii to V --> laset --> laset --> larfb --> [next] // CPU has no computation magmablasSetKernelStream( stream ); for (i = ki; i >= 0; i -= nb) { ib = min( nb, k-i ); mi = m - i; // Copy current panel on the GPU from dA to dV magma_dcopymatrix_async( mi, ib, dA(i,i), ldda, dV, ldda, stream ); // set panel to identity magmablas_dlaset( MagmaFull, i, ib, c_zero, c_zero, dA(0, i), ldda ); magmablas_dlaset( MagmaFull, mi, ib, c_zero, c_one, dA(i, i), ldda ); if (i < n) { // Apply H to A(i:m,i:n) from the left magma_dlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, mi, n-i, ib, dV, ldda, dT(i), nb, dA(i, i), ldda, dW, lddwork ); } } } magma_queue_sync( stream ); magma_free( dV ); magma_free_cpu( work ); magma_queue_destroy( stream ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_dorgqr_gpu */
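/* Forming Q explicitly (sketch, hypothetical helper): factor with
   magma_dgeqrf_gpu, then expand the reflectors with magma_dorgqr_gpu.
   dT is sized as the documentation above specifies,
   (2*min(m,n) + roundup(n,32))*nb, with nb matching the blocking used by the
   factorization; magma_dgeqrf_gpu's signature is assumed from the MAGMA 1.x
   interfaces used elsewhere in this document. */
magma_int_t form_q( magma_int_t m, magma_int_t n,
                    magmaDouble_ptr dA, magma_int_t ldda, double *tau )
{
    magma_int_t info, min_mn = min( m, n );
    magma_int_t nb = magma_get_dgeqrf_nb( m );
    magmaDouble_ptr dT;
    magma_dmalloc( &dT, (2*min_mn + magma_roundup( n, 32 ))*nb );

    magma_dgeqrf_gpu( m, n, dA, ldda, tau, dT, &info );   /* A = Q*R; T factors in dT */
    if ( info == 0 )
        magma_dorgqr_gpu( m, n, min_mn, dA, ldda, tau, dT, nb, &info );  /* dA := Q */

    magma_free( dT );
    return info;
}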
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing dpotri
*/
int main( int argc, char** argv)
{
    TESTING_CUDA_INIT();

    magma_timestr_t  start, end;
    double           flops, gpu_perf, cpu_perf;
    double          *h_A, *h_R;
    double          *d_A;
    magma_int_t      N = 0, n2, lda, ldda;
    magma_int_t      size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112};
    magma_int_t      i, info;
    const char      *uplo = MagmaUpperStr;
    double           c_neg_one = MAGMA_D_NEG_ONE;
    magma_int_t      ione = 1;
    magma_int_t      ISEED[4] = {0,0,0,1};
    double           work[1], matnorm;

    if (argc != 1){
        for(i = 1; i<argc; i++){
            if (strcmp("-N", argv[i])==0)
                N = atoi(argv[++i]);
        }
        if (N>0) size[0] = size[9] = N;
        else exit(1);
    }
    else {
        printf("\nUsage: \n");
        printf("  testing_dpotri_gpu -N %d\n\n", 1024);
    }

    /* Allocate host memory for the matrix */
    n2   = size[9] * size[9];
    ldda = ((size[9]+31)/32) * 32;
    TESTING_MALLOC   ( h_A, double, n2);
    TESTING_HOSTALLOC( h_R, double, n2);
    TESTING_DEVALLOC ( d_A, double, ldda*size[9] );

    printf("  N    CPU GFlop/s    GPU GFlop/s    ||R||_F / ||A||_F\n");
    printf("========================================================\n");
    for(i=0; i<10; i++){
        N   = size[i];
        lda = N;
        n2  = lda*N;
        flops = FLOPS_DPOTRI( (double)N ) / 1000000;
        ldda  = ((N+31)/32)*32;

        /* Initialize the matrix */
        lapackf77_dlarnv( &ione, ISEED, &n2, h_A );
        /* Symmetrize and increase the diagonal */
        {
            magma_int_t i, j;
            for(i=0; i<N; i++) {
                MAGMA_D_SET2REAL( h_A[i*lda+i], ( MAGMA_D_REAL(h_A[i*lda+i]) + 1.*N ) );
                for(j=0; j<i; j++)
                    h_A[i*lda+j] = (h_A[j*lda+i]);
            }
        }
        lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda );

        /* ====================================================================
           Performs operation using MAGMA
           =================================================================== */
        //cublasSetMatrix( N, N, sizeof(double), h_A, lda, d_A, ldda);
        //magma_dpotrf_gpu(uplo[0], N, d_A, ldda, &info);

        /* factorize matrix */
        magma_dsetmatrix( N, N, h_A, lda, d_A, ldda );
        magma_dpotrf_gpu(uplo[0], N, d_A, ldda, &info);

        // check for exact singularity
        //magma_dgetmatrix( N, N, d_A, ldda, h_R, lda );
        //h_R[ 10 + 10*lda ] = MAGMA_D_MAKE( 0.0, 0.0 );
        //magma_dsetmatrix( N, N, h_R, lda, d_A, ldda );

        start = get_current_time();
        magma_dpotri_gpu(uplo[0], N, d_A, ldda, &info);
        end = get_current_time();
        if (info != 0)
            printf("magma_dpotri_gpu returned error %d\n", (int) info);

        gpu_perf = flops / GetTimerValue(start, end);

        /* =====================================================================
           Performs operation using LAPACK
           =================================================================== */
        lapackf77_dpotrf(uplo, &N, h_A, &lda, &info);
        start = get_current_time();
        lapackf77_dpotri(uplo, &N, h_A, &lda, &info);
        end = get_current_time();
        if (info != 0)
            printf("lapackf77_dpotri returned error %d\n", (int) info);

        cpu_perf = flops / GetTimerValue(start, end);

        /* =====================================================================
           Check the result compared to LAPACK
           =================================================================== */
        magma_dgetmatrix( N, N, d_A, ldda, h_R, lda );
        matnorm = lapackf77_dlange("f", &N, &N, h_A, &lda, work);
        blasf77_daxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione);

        printf("%5d    %6.2f         %6.2f        %e\n",
               (int) size[i], cpu_perf, gpu_perf,
               lapackf77_dlange("f", &N, &N, h_R, &lda, work) / matnorm);

        if (argc != 1)
            break;
    }

    /* Memory clean up */
    TESTING_FREE    ( h_A );
    TESTING_HOSTFREE( h_R );
    TESTING_DEVFREE ( d_A );

    /* Shutdown */
    TESTING_CUDA_FINALIZE();
}
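/* SPD inversion flow (sketch, hypothetical helper) matching the tester above:
   Cholesky-factor on the GPU, then invert in place with dpotri. This mirrors
   the old char-style uplo interface the tester uses (MagmaUpperStr[0] == 'U');
   newer MAGMA releases take a magma_uplo_t instead. */
magma_int_t spd_invert_gpu( char uplo, magma_int_t N,
                            double *d_A, magma_int_t ldda )
{
    magma_int_t info;
    magma_dpotrf_gpu( uplo, N, d_A, ldda, &info );      /* A = U**T*U or L*L**T      */
    if ( info == 0 )
        magma_dpotri_gpu( uplo, N, d_A, ldda, &info );  /* d_A := A^{-1} (triangle)  */
    return info;
}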
/**
    Purpose
    -------
    DORMQR overwrites the general real M-by-N matrix C with

    @verbatim
                               SIDE = MagmaLeft    SIDE = MagmaRight
    TRANS = MagmaNoTrans:      Q * C               C * Q
    TRANS = MagmaTrans:        Q**H * C            C * Q**H
    @endverbatim

    where Q is a real unitary matrix defined as the product of k
    elementary reflectors

          Q = H(1) H(2) . . . H(k)

    as returned by DGEQRF.
    Q is of order M if SIDE = MagmaLeft
    and  of order N if SIDE = MagmaRight.

    Arguments
    ---------
    @param[in]
    side    magma_side_t
      -     = MagmaLeft:   apply Q or Q**H from the Left;
      -     = MagmaRight:  apply Q or Q**H from the Right.

    @param[in]
    trans   magma_trans_t
      -     = MagmaNoTrans:  No transpose, apply Q;
      -     = MagmaTrans:    Conjugate transpose, apply Q**H.

    @param[in]
    m       INTEGER
            The number of rows of the matrix C. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix C. N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines
            the matrix Q.
            If SIDE = MagmaLeft,  M >= K >= 0;
            if SIDE = MagmaRight, N >= K >= 0.

    @param[in]
    dA      DOUBLE_PRECISION array, dimension (LDA,K)
            The i-th column must contain the vector which defines the
            elementary reflector H(i), for i = 1,2,...,k, as returned by
            DGEQRF in the first k columns of its array argument A.
            The diagonal and the upper part are destroyed, the reflectors
            are not modified.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array DA.
            LDDA >= max(1,M) if SIDE = MagmaLeft;
            LDDA >= max(1,N) if SIDE = MagmaRight.

    @param[in]
    tau     DOUBLE_PRECISION array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by DGEQRF.

    @param[in,out]
    dC      DOUBLE_PRECISION array, dimension (LDDC,N)
            On entry, the M-by-N matrix C.
            On exit, C is overwritten by (Q*C) or (Q**H * C) or (C * Q**H) or (C*Q).

    @param[in]
    lddc    INTEGER
            The leading dimension of the array C. LDDC >= max(1,M).

    @param[in]
    wA      (workspace) DOUBLE_PRECISION array, dimension
            (LDWA,M) if SIDE = MagmaLeft
            (LDWA,N) if SIDE = MagmaRight
            The vectors which define the elementary reflectors, as
            returned by DSYTRD_GPU.

    @param[in]
    ldwa    INTEGER
            The leading dimension of the array wA.
            LDWA >= max(1,M) if SIDE = MagmaLeft;
            LDWA >= max(1,N) if SIDE = MagmaRight.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_dgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_dormqr2_gpu(magma_side_t side, magma_trans_t trans,
                  magma_int_t m, magma_int_t n, magma_int_t k,
                  double *dA,  magma_int_t ldda,
                  double *tau,
                  double *dC,  magma_int_t lddc,
                  double *wA,  magma_int_t ldwa,
                  magma_int_t *info)
{
    #define dA(i_,j_) (dA + (i_) + (j_)*ldda)
    #define dC(i_,j_) (dC + (i_) + (j_)*lddc)
    #define wA(i_,j_) (wA + (i_) + (j_)*ldwa)

    /* GPU workspace; allocated below, after the argument checks */
    double *dwork;

    double c_zero = MAGMA_D_ZERO;
    double c_one  = MAGMA_D_ONE;

    magma_int_t i, i__4, lddwork;
    double T[2*4160]  /* was [65][64] */;
    magma_int_t i1, i2, step, ib, ic, jc, nb, mi, ni, nq, nw;
    int left, notran;

    wA -= 1 + ldwa;
    dC -= 1 + lddc;
    --tau;

    *info = 0;
    left   = (side == MagmaLeft);
    notran = (trans == MagmaNoTrans);

    /* NQ is the order of Q and NW is the minimum dimension of WORK */
    if (left) {
        nq = m;
        nw = n;
    } else {
        nq = n;
        nw = m;
    }
    if (! left && side != MagmaRight) {
        *info = -1;
    } else if (! notran && trans != MagmaTrans) {
        *info = -2;
    } else if (m < 0) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (k < 0 || k > nq) {
        *info = -5;
    } else if (ldda < max(1,nq)) {
        *info = -7;
    } else if (lddc < max(1,m)) {
        *info = -10;
    } else if (ldwa < max(1,nq)) {
        *info = -12;
    }

    // size of the block
    nb = 64;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0 || k == 0) {
        return *info;
    }

    /* Allocate work space on the GPU, now that the arguments are known to be
       valid; allocating before the checks above would leak on the error returns. */
    if (MAGMA_SUCCESS != magma_dmalloc( &dwork, (nw + 64)*64 )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    /* Use hybrid CPU-GPU code */
    if ( ( left && (! notran) ) || ( (! left) && notran ) ) {
        i1 = 1;
        i2 = k;
        step = nb;
    } else {
        i1 = ((k - 1)/nb)*nb + 1;
        i2 = 1;
        step = -nb;
    }

    // silence "uninitialized" warnings
    mi = 0;
    ni = 0;

    if (left) {
        ni = n;
        jc = 1;
    } else {
        mi = m;
        ic = 1;
    }

    // set nb-1 super-diagonals to 0, and diagonal to 1.
    // This way we can copy V directly to the GPU,
    // with the upper triangle parts already set to identity.
    magmablas_dlaset_band( MagmaUpper, k, k, nb, c_zero, c_one, dA, ldda );

    // for i=i1 to i2 by step
    for (i = i1; (step < 0 ? i >= i2 : i <= i2); i += step) {
        ib = min(nb, k - i + 1);

        /* Form the triangular factor of the block reflector
           H = H(i) H(i+1) . . . H(i+ib-1) */
        i__4 = nq - i + 1;
        lapackf77_dlarft("Forward", "Columnwise", &i__4, &ib,
                         wA(i,i), &ldwa, &tau[i], T, &ib);

        if (left) {
            /* H or H' is applied to C(i:m,1:n) */
            mi = m - i + 1;
            ic = i;
        }
        else {
            /* H or H' is applied to C(1:m,i:n) */
            ni = n - i + 1;
            jc = i;
        }

        if (left)
            lddwork = ni;
        else
            lddwork = mi;

        /* Apply H or H'; First copy T to the GPU */
        magma_dsetmatrix( ib, ib, T, ib, dwork, ib );
        magma_dlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
                          mi, ni, ib,
                          dA(i-1,i-1), ldda, dwork, ib,  // dA using 0-based indices here
                          dC(ic,jc), lddc,
                          dwork + ib*ib, lddwork);
    }

    magma_free( dwork );

    return *info;
} /* magma_dormqr */
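/* Usage sketch (hypothetical helper): applying Q**T from the left with
   magma_dormqr2_gpu. The interface above wants the reflectors both on the
   GPU (dA, whose diagonal and upper part it destroys) and in a host array wA;
   here wA is simply a host copy of the first k reflector columns, and the
   assumption that only those k columns of wA are referenced is ours, not the
   documentation's. */
magma_int_t apply_qt( magma_int_t m, magma_int_t n, magma_int_t k,
                      double *dA, magma_int_t ldda, double *tau,
                      double *dC, magma_int_t lddc )
{
    magma_int_t info, ldwa = m;
    double *wA;
    magma_dmalloc_cpu( &wA, ldwa*k );
    magma_dgetmatrix( m, k, dA, ldda, wA, ldwa );   /* host mirror of V */

    magma_dormqr2_gpu( MagmaLeft, MagmaTrans, m, n, k,
                       dA, ldda, tau, dC, lddc, wA, ldwa, &info );

    magma_free_cpu( wA );
    return info;
}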
/**
    Purpose
    -------
    DGEQRF2_MGPU computes a QR factorization of a real M-by-N matrix A:
    A = Q * R. This is a GPU interface of the routine.

    Arguments
    ---------
    @param[in]
    num_gpus INTEGER
            Number of GPUs to use. num_gpus > 0.

    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    dlA     DOUBLE_PRECISION array of pointers on the GPUs, dimension (num_gpus).
            The M-by-N matrix A is distributed over the GPUs in a 1-D
            block-column-cyclic layout with block size nb (dlA[dev] points
            to the local part on the dev-th GPU).
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further Details).

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dlA.  LDDA >= max(1,M).
            To benefit from coalesced memory accesses LDDA must be
            divisible by 16.

    @param[out]
    tau     DOUBLE_PRECISION array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as a memory allocation failure.

    Further Details
    ---------------
    The matrix Q is represented as a product of elementary reflectors

        Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

        H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).

    @ingroup magma_dgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_dgeqrf2_mgpu( magma_int_t num_gpus, magma_int_t m, magma_int_t n,
                    double **dlA, magma_int_t ldda,
                    double *tau,
                    magma_int_t *info )
{
    #define dlA(dev, i, j)  (dlA[dev] + (i) + (j)*(ldda))
    #define hpanel(i)       (hpanel + (i))

    // set to NULL to make cleanup easy: free(NULL) does nothing.
    double *dwork[MagmaMaxGPUs]={NULL}, *dpanel[MagmaMaxGPUs]={NULL};
    double *hwork=NULL, *hpanel=NULL;
    magma_queue_t stream[MagmaMaxGPUs][2]={{NULL}};
    magma_event_t panel_event[MagmaMaxGPUs]={NULL};

    magma_int_t i, j, min_mn, dev, ldhpanel, lddwork, rows;
    magma_int_t ib, nb;
    magma_int_t lhwork, lwork;
    magma_int_t panel_dev, i_local, i_nb_local, n_local[MagmaMaxGPUs], la_dev, dpanel_offset;

    *info = 0;
    if (m < 0) {
        *info = -2;
    } else if (n < 0) {
        *info = -3;
    } else if (ldda < max(1,m)) {
        *info = -5;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    min_mn = min(m,n);
    if (min_mn == 0)
        return *info;

    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );

    nb = magma_get_dgeqrf_nb( m );

    /* dwork is (n*nb) --- for T (nb*nb) and dlarfb work ((n-nb)*nb) ---
     * + dpanel (ldda*nb), on each GPU.
     * I think dlarfb work could be smaller, max(n_local[:]).
     * Oddly, T and dlarfb work get stacked on top of each other, both with lddwork=n.
     * on GPU that owns panel, set dpanel = dlA(dev,i,i_local).
     * on other GPUs,          set dpanel = dwork[dev] + dpanel_offset. */
    lddwork = n;
    dpanel_offset = lddwork*nb;
    for( dev=0; dev < num_gpus; dev++ ) {
        magma_setdevice( dev );
        if ( MAGMA_SUCCESS != magma_dmalloc( &(dwork[dev]), (lddwork + ldda)*nb )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            goto CLEANUP;
        }
    }

    /* hwork is MAX( workspace for dgeqrf (n*nb), two copies of T (2*nb*nb) )
     * + hpanel (m*nb).
     * for last block, need 2*n*nb total.
*/ ldhpanel = m; lhwork = max( n*nb, 2*nb*nb ); lwork = max( lhwork + ldhpanel*nb, 2*n*nb ); if ( MAGMA_SUCCESS != magma_dmalloc_pinned( &hwork, lwork )) { *info = MAGMA_ERR_HOST_ALLOC; goto CLEANUP; } hpanel = hwork + lhwork; /* Set the number of local n for each GPU */ for( dev=0; dev < num_gpus; dev++ ) { n_local[dev] = ((n/nb)/num_gpus)*nb; if (dev < (n/nb) % num_gpus) n_local[dev] += nb; else if (dev == (n/nb) % num_gpus) n_local[dev] += n % nb; } for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magma_queue_create( &stream[dev][0] ); magma_queue_create( &stream[dev][1] ); magma_event_create( &panel_event[dev] ); } if ( nb < min_mn ) { /* Use blocked code initially */ // Note: as written, ib cannot be < nb. for( i = 0; i < min_mn-nb; i += nb ) { /* Set the GPU number that holds the current panel */ panel_dev = (i/nb) % num_gpus; /* Set the local index where the current panel is (j == i) */ i_local = i/(nb*num_gpus)*nb; ib = min(min_mn-i, nb); rows = m-i; /* Send current panel to the CPU, after panel_event indicates it has been updated */ magma_setdevice( panel_dev ); magma_queue_wait_event( stream[panel_dev][1], panel_event[panel_dev] ); magma_dgetmatrix_async( rows, ib, dlA(panel_dev, i, i_local), ldda, hpanel(i), ldhpanel, stream[panel_dev][1] ); magma_queue_sync( stream[panel_dev][1] ); // Factor panel lapackf77_dgeqrf( &rows, &ib, hpanel(i), &ldhpanel, tau+i, hwork, &lhwork, info ); if ( *info != 0 ) { fprintf( stderr, "error %d\n", (int) *info ); } // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . H(i+ib-1) lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, hpanel(i), &ldhpanel, tau+i, hwork, &ib ); dpanel_to_q( MagmaUpper, ib, hpanel(i), ldhpanel, hwork + ib*ib ); // Send the current panel back to the GPUs for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); if (dev == panel_dev) dpanel[dev] = dlA(dev, i, i_local); else dpanel[dev] = dwork[dev] + dpanel_offset; magma_dsetmatrix_async( rows, ib, hpanel(i), ldhpanel, dpanel[dev], ldda, stream[dev][0] ); } for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magma_queue_sync( stream[dev][0] ); } // TODO: if dpanel_to_q copied whole block, wouldn't need to restore // -- just send the copy to the GPUs. // TODO: also, could zero out the lower triangle and use Azzam's larfb w/ gemm. /* Restore the panel */ dq_to_panel( MagmaUpper, ib, hpanel(i), ldhpanel, hwork + ib*ib ); if (i + ib < n) { /* Send the T matrix to the GPU. 
*/ for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magma_dsetmatrix_async( ib, ib, hwork, ib, dwork[dev], lddwork, stream[dev][0] ); } la_dev = (panel_dev+1) % num_gpus; for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magmablasSetKernelStream( stream[dev][0] ); if (dev == la_dev && i+nb < min_mn-nb) { // If not last panel, // for look-ahead panel, apply H' to A(i:m,i+ib:i+2*ib) i_nb_local = (i+nb)/(nb*num_gpus)*nb; magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local), ldda, // C dwork[dev]+ib, lddwork ); // work magma_event_record( panel_event[dev], stream[dev][0] ); // for trailing matrix, apply H' to A(i:m,i+2*ib:n) magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n_local[dev]-(i_nb_local+ib), ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local+ib), ldda, // C dwork[dev]+ib, lddwork ); // work } else { // for trailing matrix, apply H' to A(i:m,i+ib:n) i_nb_local = i_local; if (dev <= panel_dev) { i_nb_local += ib; } magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n_local[dev]-i_nb_local, ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local), ldda, // C dwork[dev]+ib, lddwork ); // work } } // Restore top of panel (after larfb is done) magma_setdevice( panel_dev ); magma_dsetmatrix_async( ib, ib, hpanel(i), ldhpanel, dlA(panel_dev, i, i_local), ldda, stream[panel_dev][0] ); } } } else { i = 0; } /* Use unblocked code to factor the last or only block row. */ if (i < min_mn) { rows = m-i; for( j=i; j < n; j += nb ) { panel_dev = (j/nb) % num_gpus; i_local = j/(nb*num_gpus)*nb; ib = min( n-j, nb ); magma_setdevice( panel_dev ); magma_dgetmatrix( rows, ib, dlA(panel_dev, i, i_local), ldda, hwork + (j-i)*rows, rows ); } // needs lwork >= 2*n*nb: // needs (m-i)*(n-i) for last block row, bounded by nb*n. // needs (n-i)*nb for dgeqrf work, bounded by n*nb. ib = n-i; // total columns in block row lhwork = lwork - ib*rows; lapackf77_dgeqrf( &rows, &ib, hwork, &rows, tau+i, hwork + ib*rows, &lhwork, info ); if ( *info != 0 ) { fprintf( stderr, "error %d\n", (int) *info ); } for( j=i; j < n; j += nb ) { panel_dev = (j/nb) % num_gpus; i_local = j/(nb*num_gpus)*nb; ib = min( n-j, nb ); magma_setdevice( panel_dev ); magma_dsetmatrix( rows, ib, hwork + (j-i)*rows, rows, dlA(panel_dev, i, i_local), ldda ); } } CLEANUP: // free(NULL) does nothing. for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magma_queue_destroy( stream[dev][0] ); magma_queue_destroy( stream[dev][1] ); magma_event_destroy( panel_event[dev] ); magma_free( dwork[dev] ); } magma_free_pinned( hwork ); magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_dgeqrf2_mgpu */
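/* Multi-GPU QR sketch (hypothetical helper): allocate each GPU's share of the
   block-column-cyclic layout and call magma_dgeqrf2_mgpu. The per-device copy
   of A's block columns is elided; it follows the same pattern as the
   dgetrf_mgpu sketch earlier in this document. */
magma_int_t qr_mgpu( magma_int_t num_gpus, magma_int_t m, magma_int_t n, double *tau )
{
    magma_int_t info, nb = magma_get_dgeqrf_nb( m );
    magma_int_t ldda = magma_roundup( m, 32 );
    double *dlA[MagmaMaxGPUs];

    for( magma_int_t dev = 0; dev < num_gpus; ++dev ) {
        magma_setdevice( dev );
        magma_int_t n_local = ((n/nb)/num_gpus)*nb;        /* block-cyclic share */
        if      ( dev <  (n/nb) % num_gpus ) n_local += nb;
        else if ( dev == (n/nb) % num_gpus ) n_local += n % nb;
        magma_dmalloc( &dlA[dev], ldda*n_local );
        /* ... copy the corresponding block columns of A into dlA[dev] ... */
    }

    magma_dgeqrf2_mgpu( num_gpus, m, n, dlA, ldda, tau, &info );

    for( magma_int_t dev = 0; dev < num_gpus; ++dev ) {
        magma_setdevice( dev );
        magma_free( dlA[dev] );   /* real code would retrieve R and the reflectors first */
    }
    return info;
}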
/* //////////////////////////////////////////////////////////////////////////// -- Testing dsymmetrize Code is very similar to testing_dtranspose.cpp */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_R; double *d_A; magma_int_t N, nb, size, lda, ldda, mstride, nstride, ntile; magma_int_t ione = 1; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); nb = (opts.nb == 0 ? 64 : opts.nb); mstride = 2*nb; nstride = 3*nb; printf("uplo = %s, nb = %d, mstride = %d, nstride = %d\n", lapack_uplo_const(opts.uplo), (int) nb, (int) mstride, (int) nstride ); printf(" N ntile CPU GByte/s (ms) GPU GByte/s (ms) check\n"); printf("===========================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldda = ((N+31)/32)*32; size = lda*N; if ( N < nb ) { ntile = 0; } else { ntile = min( (N - nb)/mstride + 1, (N - nb)/nstride + 1 ); } // load each tile, save each tile gbytes = sizeof(double) * 2.*nb*nb*ntile / 1e9; TESTING_MALLOC_CPU( h_A, double, size ); TESTING_MALLOC_CPU( h_R, double, size ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); /* Initialize the matrix */ for( int j = 0; j < N; ++j ) { for( int i = 0; i < N; ++i ) { h_A[i + j*lda] = MAGMA_D_MAKE( i + j/10000., j ); } } /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_dsetmatrix( N, N, h_A, lda, d_A, ldda ); gpu_time = magma_sync_wtime( 0 ); magmablas_dsymmetrize_tiles( opts.uplo, nb, d_A, ldda, ntile, mstride, nstride ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gbytes / gpu_time; /* ===================================================================== Performs operation using naive in-place algorithm (LAPACK doesn't implement symmetrize) =================================================================== */ cpu_time = magma_wtime(); for( int tile = 0; tile < ntile; ++tile ) { int offset = tile*mstride + tile*nstride*lda; for( int j = 0; j < nb; ++j ) { for( int i = 0; i < j; ++i ) { if ( opts.uplo == MagmaLower ) { h_A[offset + i + j*lda] = MAGMA_D_CNJG( h_A[offset + j + i*lda] ); } else { h_A[offset + j + i*lda] = MAGMA_D_CNJG( h_A[offset + i + j*lda] ); } } } } cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; /* ===================================================================== Check the result =================================================================== */ magma_dgetmatrix( N, N, d_A, ldda, h_R, lda ); blasf77_daxpy(&size, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_dlange("f", &N, &N, h_R, &lda, work); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %s\n", (int) N, (int) ntile, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., (error == 0. ? "ok" : "failed") ); status += ! (error == 0.); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
extern "C" magma_err_t magma_dpotrf_msub(int num_subs, int num_gpus, magma_uplo_t uplo, magma_int_t n, magmaDouble_ptr *d_lA, size_t dA_offset, magma_int_t ldda, magma_int_t *info, magma_queue_t *queues) { /* -- clMAGMA (version 1.1.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date January 2014 Purpose ======= DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix dA. The factorization has the form dA = U**T * U, if UPLO = 'U', or dA = L * L**T, if UPLO = 'L', where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of dA is stored; = 'L': Lower triangle of dA is stored. N (input) INTEGER The order of the matrix dA. N >= 0. dA (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N) On entry, the symmetric matrix dA. If UPLO = 'U', the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**T * U or dA = L * L**T. LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be dividable by 16. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. ===================================================================== */ int tot_subs = num_subs * num_gpus; magma_err_t err; magma_int_t j, nb, d, lddp, h; double *work; magmaDouble_ptr dwork[MagmaMaxGPUs]; *info = 0; nb = magma_get_dpotrf_nb(n); if ( uplo != MagmaUpper && uplo != MagmaLower ) { *info = -1; } else if (n < 0) { *info = -2; } else if (uplo != MagmaUpper) { lddp = nb*(n/(nb*tot_subs)); if( n%(nb*tot_subs) != 0 ) lddp+=min(nb,n-tot_subs*lddp); if( ldda < lddp ) *info = -4; } else if( ldda < n ) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (num_gpus == 1 && ((nb <= 1) || (nb >= n)) ) { /* Use unblocked code. 
*/ err = magma_dmalloc_cpu( &work, n*nb ); if (err != MAGMA_SUCCESS) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_dgetmatrix( n, n, d_lA[0], 0, ldda, work, 0, n, queues[0] ); lapackf77_dpotrf(lapack_uplo_const(uplo), &n, work, &n, info); magma_dsetmatrix( n, n, work, 0, n, d_lA[0], 0, ldda, queues[0] ); magma_free_cpu( work ); } else { lddp = 32*((n+31)/32); for (d=0; d<num_gpus; d++) { if (MAGMA_SUCCESS != magma_dmalloc( &dwork[d], num_gpus*nb*lddp )) { for( j=0; j<d; j++ ) magma_free( dwork[j] ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } h = 1; //num_gpus; //(n+nb-1)/nb; #ifdef USE_PINNED_CLMEMORY cl_mem buffer = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(double)*n*nb*h, NULL, NULL); for (d=0; d<num_gpus; d++) { work = (double*)clEnqueueMapBuffer(queues[2*d], buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(double)*n*nb*h, 0, NULL, NULL, NULL); } #else if (MAGMA_SUCCESS != magma_dmalloc_cpu( &work, n*nb*h )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } #endif if (uplo == MagmaUpper) { /* with two queues for each device */ magma_dpotrf2_msub(num_subs, num_gpus, uplo, n, n, 0, 0, nb, d_lA, 0, ldda, dwork, lddp, work, n, h, info, queues); //magma_dpotrf3_msub(num_subs, num_gpus, uplo, n, n, 0, 0, nb, d_lA, 0, ldda, // dwork, lddp, work, n, h, info, queues); /* with three streams */ //magma_dpotrf3_msub(num_gpus, uplo, n, n, 0, 0, nb, d_lA, ldda, dwork, lddp, work, n, // h, stream, event, info); } else { /* with two queues for each device */ magma_dpotrf2_msub(num_subs, num_gpus, uplo, n, n, 0, 0, nb, d_lA, 0, ldda, dwork, lddp, work, nb*h, h, info, queues); //magma_dpotrf3_msub(num_subs, num_gpus, uplo, n, n, 0, 0, nb, d_lA, 0, ldda, // dwork, lddp, work, nb*h, h, info, queues); //magma_dpotrf4_msub(num_subs, num_gpus, uplo, n, n, 0, 0, nb, d_lA, 0, ldda, // dwork, lddp, work, nb*h, h, info, queues); /* with three streams */ //magma_dpotrf3_msub(num_gpus, uplo, n, n, 0, 0, nb, d_lA, ldda, dwork, lddp, work, nb*h, // h, stream, event, info); } /* clean up */ for (d=0; d<num_gpus; d++) magma_free( dwork[d] ); #ifdef USE_PINNED_CLMEMORY for (d=0; d<num_gpus; d++) { clEnqueueUnmapMemObject(queues[2*d], buffer, work, 0, NULL, NULL); } clReleaseMemObject( buffer ); #else magma_free_cpu( work ); #endif } /* end of not lapack */ return *info; } /* magma_dpotrf_msub */
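/* Illustration (sketch, hypothetical helper) of the minimum local leading
   dimension the MagmaLower argument check above enforces: each of the
   tot_subs sub-matrices owns every tot_subs-th block row of size nb, so the
   largest local row count is the complete rounds of block rows plus, for the
   first sub-matrix, whatever remains of the last partial round. */
magma_int_t min_ldda_lower( magma_int_t n, magma_int_t nb, magma_int_t tot_subs )
{
    magma_int_t lddp = nb*(n/(nb*tot_subs));   /* complete block-row rounds  */
    if ( n % (nb*tot_subs) != 0 )              /* a partial round remains    */
        lddp += min( nb, n - tot_subs*lddp );  /* at most one more block row */
    return lddp;
}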