Beispiel #1
0
//
//    m, n - dimensions in the source (input) matrix.
//             This routine copies the hA matrix from the CPU
//             to dAT on the GPU. In addition, the output matrix
//             is transposed. The routine uses a buffer of size
//             2*lddw*nb pointed to by dwork (lddw > m) on the GPU. 
//             Note that lda >= m and lddat >= n.
//
extern "C" void 
magmablas_ssetmatrix_transpose_mgpu(
    magma_int_t ngpu, magma_queue_t queues[][2],
    const float *hA,       magma_int_t lda, 
    magmaFloat_ptr    dAT[],    magma_int_t ldda, 
    magmaFloat_ptr    dwork[],  magma_int_t lddw,
    magma_int_t m, magma_int_t n, magma_int_t nb)
{
#define hA(j)       (hA         + (j)*lda)
#define dwork(d, j) (dwork[(d)] + (j)*nb*lddw)
#define dAT(d, j)   (dAT[(d)]   + (j)*nb)

    magma_int_t nqueues = 2, d, j, j_local, id, ib;
    
    /* Quick return */
    if ( (m == 0) || (n == 0) )
        return;
    
    if (lda < m || ngpu*ldda < n || lddw < m) {
        fprintf( stderr, "%s: wrong arguments (%d<%d), (%d*%d<%d), or (%d<%d).\n",
                 __func__, (int) lda, (int) m, (int) ngpu, (int) ldda, (int) n, (int) lddw, (int) m );
        return;
    }
    
    /* Move data from CPU to GPU by block columns and transpose it */
    for (j=0; j < n; j += nb) {
        d       = (j/nb)%ngpu;
        j_local = (j/nb)/ngpu;
        id      = j_local%nqueues;
        magma_setdevice(d);
        
        ib = min(n-j, nb);
        magma_ssetmatrix_async( m, ib,
                                hA(j),        lda,
                                dwork(d, id), lddw, 
                                queues[d][id] );
        
        magmablas_stranspose_q( m, ib, dwork(d,id), lddw, dAT(d,j_local), ldda, queues[d][id] );
    }
}
Beispiel #2
0
// QR Factorization of a MxN General Matrix A.
//    a       (IN/OUT - matrix(M,N)) On entry, the coefficient matrix A. On exit , the upper triangle and diagonal is the min(M,N) by N upper triangular matrix R.  The lower triangle, together with the tau vector, is the orthogonal matrix Q as a product of min(M,N) elementary reflectors.
//    tau     (OUT - vector (min(M,N))) Vector of the same numerical type as A. The scalar factors of the elementary reflectors.
//    info    (OUT - int)
//   0   : function completed normally
//   < 0 : The ith argument, where i = abs(return value) had an illegal value.
int geqrf (matrix_t& a, vector_t& tau)
{
	int              _m = int(a.size1());
	int              _n = int(a.size2());
	int              _lda = int(a.size1());
	int              _info;

	// make_sure tau's size is greater than or equal to min(m,n)
	if (int(tau.size()) < (_n<_m ? _n : _m) )
		return -104;

	int ldwork = _n*_n;
	vector_t dwork(ldwork);
	rawLAPACK::geqrf (_m, _n, a.data().begin(), _lda, tau.data().begin(), dwork.data().begin(), ldwork, _info);

	return _info;
}
Beispiel #3
0
/**
    Purpose
    -------
    DGEQLF computes a QL factorization of a DOUBLE_PRECISION M-by-N matrix A:
    A = Q * L.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    A       DOUBLE_PRECISION array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, if m >= n, the lower triangle of the subarray
            A(m-n+1:m,1:n) contains the N-by-N lower triangular matrix L;
            if m <= n, the elements on and below the (n-m)-th
            superdiagonal contain the M-by-N lower trapezoidal matrix L;
            the remaining elements, with the array TAU, represent the
            orthogonal matrix Q as a product of elementary reflectors
            (see Further Details).
    \n
            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    @param[out]
    tau     DOUBLE_PRECISION array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    @param[out]
    work    (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.
    \n
            Higher performance is achieved if WORK is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.  LWORK >= max(1,N,2*NB^2).
            For optimum performance LWORK >= max(N*NB, 2*NB^2) where NB can be obtained
            through magma_get_dgeqlf_nb(M).
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.

    Further Details
    ---------------
    The matrix Q is represented as a product of elementary reflectors

       Q = H(k) . . . H(2) H(1), where k = min(m,n).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(m-k+i+1:m) = 0 and v(m-k+i) = 1; v(1:m-k+i-1) is stored on exit in
    A(1:m-k+i-1,n-k+i), and tau in TAU(i).

    @ingroup magma_dgeqlf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_dgeqlf(
    magma_int_t m, magma_int_t n,
    double *A,    magma_int_t lda, double *tau,
    double *work, magma_int_t lwork,
    magma_int_t *info)
{
    #define  A(i_,j_) ( A + (i_) + (j_)*lda)
    #define dA(i_,j_) (dA + (i_) + (j_)*ldda)
    #define dwork(i_) (dwork + (i_))

    magmaDouble_ptr dA, dwork;
    double c_one = MAGMA_D_ONE;
    magma_int_t i, k, lddwork, old_i, old_ib, nb;
    magma_int_t rows, cols;
    magma_int_t ib, ki, kk, mu, nu, iinfo, ldda;
    int lquery;

    nb = magma_get_dgeqlf_nb(m);
    *info = 0;
    lquery = (lwork == -1);

    // silence "uninitialized" warnings
    old_ib = nb;
    old_i  = 0;
    
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,m)) {
        *info = -4;
    }

    k = min(m,n);
    if (*info == 0) {
        if (k == 0)
            work[0] = c_one;
        else {
            work[0] = MAGMA_D_MAKE( max(n*nb, 2*nb*nb), 0 );
        }

        if (lwork < max(max(1,n), 2*nb*nb) && ! lquery)
            *info = -7;
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;

    /* Quick return if possible */
    if (k == 0)
        return *info;

    lddwork = ((n+31)/32)*32;
    ldda    = ((m+31)/32)*32;

    if (MAGMA_SUCCESS != magma_dmalloc( &dA, (n)*ldda + nb*lddwork )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
    dwork = dA + ldda*n;

    magma_queue_t queues[2];
    magma_queue_create( &queues[0] );
    magma_queue_create( &queues[1] );

    if ( (nb > 1) && (nb < k) ) {
        /*  Use blocked code initially.
            The last kk columns are handled by the block method.
            First, copy the matrix on the GPU except the last kk columns */
        magma_dsetmatrix_async( m, n-nb,
                                A(0, 0),  lda,
                                dA(0, 0), ldda, queues[0] );

        ki = ((k - nb - 1) / nb) * nb;
        kk = min(k, ki + nb);
        for (i = k - kk + ki; i >= k -kk; i -= nb) {
            ib = min(k-i,nb);

            if (i < k - kk + ki) {
                /* 1. Copy asynchronously the current panel to the CPU.
                   2. Copy asynchronously the submatrix below the panel
                   to the CPU)                                        */
                rows = m - k + i + ib;
                magma_dgetmatrix_async( rows, ib,
                                        dA(0, n-k+i), ldda,
                                        A(0, n-k+i),  lda, queues[1] );

                magma_dgetmatrix_async( m-rows, ib,
                                        dA(rows, n-k+i), ldda,
                                        A(rows, n-k+i),  lda, queues[0] );

                /* Apply H' to A(1:m-k+i+ib-1,1:n-k+i-1) from the left in
                   two steps - implementing the lookahead techniques.
                   This is the main update from the lookahead techniques. */
                rows = m - k + old_i + old_ib;
                cols = n - k + old_i - old_ib;
                magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaBackward, MagmaColumnwise,
                                  rows, cols, old_ib,
                                  dA(0, cols+old_ib), ldda, dwork(0),      lddwork,
                                  dA(0, 0          ), ldda, dwork(old_ib), lddwork);
            }

            magma_queue_sync( queues[1] );
            /* Compute the QL factorization of the current block
               A(1:m-k+i+ib-1,n-k+i:n-k+i+ib-1) */
            rows = m - k + i + ib;
            cols = n - k + i;
            lapackf77_dgeqlf( &rows, &ib, A(0,cols), &lda, tau+i, work, &lwork, &iinfo );

            if (cols > 0) {
                /* Form the triangular factor of the block reflector
                   H = H(i+ib-1) . . . H(i+1) H(i) */
                lapackf77_dlarft( MagmaBackwardStr, MagmaColumnwiseStr,
                                  &rows, &ib,
                                  A(0, cols), &lda, tau + i, work, &ib);

                dpanel_to_q( MagmaLower, ib, A(rows-ib,cols), lda, work+ib*ib);
                magma_dsetmatrix( rows, ib,
                                  A(0,cols),  lda,
                                  dA(0,cols), ldda );
                dq_to_panel( MagmaLower, ib, A(rows-ib,cols), lda, work+ib*ib);

                // Send the triangular part on the GPU
                magma_dsetmatrix( ib, ib, work, ib, dwork(0), lddwork );

                /* Apply H' to A(1:m-k+i+ib-1,1:n-k+i-1) from the left in
                   two steps - implementing the lookahead techniques.
                   This is the update of first ib columns.                 */
                if (i-ib >= k -kk)
                    magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaBackward, MagmaColumnwise,
                                      rows, ib, ib,
                                      dA(0, cols),   ldda, dwork(0),  lddwork,
                                      dA(0,cols-ib), ldda, dwork(ib), lddwork);
                else {
                    magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaBackward, MagmaColumnwise,
                                      rows, cols, ib,
                                      dA(0, cols), ldda, dwork(0),  lddwork,
                                      dA(0, 0   ), ldda, dwork(ib), lddwork);
                }

                old_i  = i;
                old_ib = ib;
            }
        }
        mu = m - k + i + nb;
        nu = n - k + i + nb;

        magma_dgetmatrix( m, nu, dA(0,0), ldda, A(0,0), lda );
    } else {
        mu = m;
        nu = n;
    }

    /* Use unblocked code to factor the last or only block */
    if (mu > 0 && nu > 0)
        lapackf77_dgeqlf(&mu, &nu, A(0,0), &lda, tau, work, &lwork, &iinfo);

    magma_queue_destroy( queues[0] );
    magma_queue_destroy( queues[1] );
    magma_free( dA );
    
    return *info;
} /* magma_dgeqlf */
Beispiel #4
0
/**
    Purpose
    -------
    DORMQR overwrites the general real M-by-N matrix C with

    @verbatim
                                SIDE = MagmaLeft    SIDE = MagmaRight
    TRANS = MagmaNoTrans:       Q * C               C * Q
    TRANS = MagmaTrans:    Q**H * C            C * Q**H
    @endverbatim

    where Q is a real unitary matrix defined as the product of k
    elementary reflectors

          Q = H(1) H(2) . . . H(k)

    as returned by DGEQRF. Q is of order M if SIDE = MagmaLeft and of order N
    if SIDE = MagmaRight.

    Arguments
    ---------
    @param[in]
    ngpu    INTEGER
            Number of GPUs to use. ngpu > 0.

    @param[in]
    side    magma_side_t
      -     = MagmaLeft:      apply Q or Q**H from the Left;
      -     = MagmaRight:     apply Q or Q**H from the Right.

    @param[in]
    trans   magma_trans_t
      -     = MagmaNoTrans:    No transpose, apply Q;
      -     = MagmaTrans: Conjugate transpose, apply Q**H.

    @param[in]
    m       INTEGER
            The number of rows of the matrix C. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix C. N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines
            the matrix Q.
            If SIDE = MagmaLeft,  M >= K >= 0;
            if SIDE = MagmaRight, N >= K >= 0.

    @param[in]
    A       DOUBLE_PRECISION array, dimension (LDA,K)
            The i-th column must contain the vector which defines the
            elementary reflector H(i), for i = 1,2,...,k, as returned by
            DGEQRF in the first k columns of its array argument A.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.
            If SIDE = MagmaLeft,  LDA >= max(1,M);
            if SIDE = MagmaRight, LDA >= max(1,N).

    @param[in]
    tau     DOUBLE_PRECISION array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by DGEQRF.

    @param[in,out]
    C       DOUBLE_PRECISION array, dimension (LDC,N)
            On entry, the M-by-N matrix C.
            On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q.

    @param[in]
    ldc     INTEGER
            The leading dimension of the array C. LDC >= max(1,M).

    @param[out]
    work    (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.
            If SIDE = MagmaLeft,  LWORK >= max(1,N);
            if SIDE = MagmaRight, LWORK >= max(1,M).
            For optimum performance LWORK >= N*NB if SIDE = MagmaLeft, and
            LWORK >= M*NB if SIDE = MagmaRight, where NB is the optimal
            blocksize.
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_dgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_dormqr_m(
    magma_int_t ngpu,
    magma_side_t side, magma_trans_t trans,
    magma_int_t m, magma_int_t n, magma_int_t k,
    double *A,    magma_int_t lda,
    double *tau,
    double *C,    magma_int_t ldc,
    double *work, magma_int_t lwork,
    magma_int_t *info)
{
#define  A(i, j) (A + (j)*lda  + (i))
#define  C(i, j) (C + (j)*ldc  + (i))

#define    dC(gpui,      i, j) (dw[gpui] + (j)*lddc + (i))
#define  dA_c(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddac)
#define  dA_r(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddar)
#define    dT(gpui, ind)       (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + (ind)*((nb+1)*nb))
#define dwork(gpui, ind)       (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + 2*((nb+1)*nb) + (ind)*(lddwork*nb))

    double c_zero = MAGMA_D_ZERO;
    double c_one  = MAGMA_D_ONE;

    const char* side_  = lapack_side_const( side );
    const char* trans_ = lapack_trans_const( trans );

    // TODO fix memory leak (alloc after argument checks)
    magma_int_t nb = 128;
    double *T;
    magma_dmalloc_pinned(&T, nb*nb);
    //printf("calling dormqr_m with nb=%d\n", (int) nb);

    double* dw[MagmaMaxGPUs];
    magma_queue_t stream [MagmaMaxGPUs][2];
    magma_event_t  event [MagmaMaxGPUs][2];

    magma_int_t ind_c;
    magma_device_t igpu;
    
    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );

    *info = 0;

    magma_int_t left   = (side == MagmaLeft);
    magma_int_t notran = (trans == MagmaNoTrans);
    magma_int_t lquery = (lwork == -1);

    /* NQ is the order of Q and NW is the minimum dimension of WORK */
    magma_int_t nq, nw;
    if (left) {
        nq = m;
        nw = n;
    } else {
        nq = n;
        nw = m;
    }


    if (! left && side != MagmaRight) {
        *info = -1;
    } else if (! notran && trans != MagmaTrans) {
        *info = -2;
    } else if (m < 0) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (k < 0 || k > nq) {
        *info = -5;
    } else if (lda < max(1,nq)) {
        *info = -7;
    } else if (ldc < max(1,m)) {
        *info = -10;
    } else if (lwork < max(1,nw) && ! lquery) {
        *info = -12;
    }

    magma_int_t lwkopt = max(1,nw) * nb;
    if (*info == 0) {
        work[0] = MAGMA_D_MAKE( lwkopt, 0 );
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0 || k == 0) {
        work[0] = c_one;
        return *info;
    }

    if (nb >= k) {
        /* Use CPU code */
        lapackf77_dormqr(side_, trans_, &m, &n, &k, A, &lda, tau,
                         C, &ldc, work, &lwork, info);
        return *info;
    }

    magma_int_t lddc = (m+63)/64*64;
    magma_int_t lddac = nq;
    magma_int_t lddar = nb;
    magma_int_t lddwork = nw;

    magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 };

    magma_int_t nb_l=256;
    magma_int_t nbl = (n-1)/nb_l+1; // number of blocks
    magma_int_t maxnlocal = (nbl+ngpu-1)/ngpu*nb_l;

    ngpu = min(ngpu, (n+nb_l-1)/nb_l); // Don't use GPU that will not have data.

    magma_int_t ldw = maxnlocal*lddc // dC
                    + 2*lddac*lddar // 2*dA
                    + 2*(nb + 1 + lddwork)*nb; // 2*(dT and dwork)

    for (igpu = 0; igpu < ngpu; ++igpu) {
        magma_setdevice(igpu);
        if (MAGMA_SUCCESS != magma_dmalloc( &dw[igpu], ldw )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            magma_xerbla( __func__, -(*info) );
            return *info;
        }
        magma_queue_create( &stream[igpu][0] );
        magma_queue_create( &stream[igpu][1] );
        magma_event_create( &event[igpu][0] );
        magma_event_create( &event[igpu][1] );
    }

    /* Use hybrid CPU-MGPU code */
    if (left) {
        //copy C to mgpus
        for (magma_int_t i = 0; i < nbl; ++i) {
            magma_int_t igpu = i%ngpu;
            magma_setdevice(igpu);
            magma_int_t kb = min(nb_l, n-i*nb_l);
            magma_dsetmatrix_async( m, kb,
                                   C(0, i*nb_l), ldc,
                                   dC(igpu, 0, i/ngpu*nb_l), lddc, stream[igpu][0] );
            nlocal[igpu] += kb;
        }

        magma_int_t i1, i2, i3;
        if ( !notran ) {
            i1 = 0;
            i2 = k;
            i3 = nb;
        } else {
            i1 = (k - 1) / nb * nb;
            i2 = 0;
            i3 = -nb;
        }

        ind_c = 0;

        for (magma_int_t i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) {
            // start the copy of A panel
            magma_int_t kb = min(nb, k - i);
            for (igpu = 0; igpu < ngpu; ++igpu) {
                magma_setdevice(igpu);
                magma_event_sync(event[igpu][ind_c]); // check if the new data can be copied
                magma_dsetmatrix_async(nq-i, kb,
                                       A(i, i),                 lda,
                                       dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0] );
                // set upper triangular part of dA to identity
                magmablas_dlaset_band_q( MagmaUpper, kb, kb, kb, c_zero, c_one, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0] );
            }

            /* Form the triangular factor of the block reflector
             H = H(i) H(i+1) . . . H(i+ib-1) */
            magma_int_t nqi = nq - i;
            lapackf77_dlarft("F", "C", &nqi, &kb, A(i, i), &lda,
                             &tau[i], T, &kb);

            /* H or H' is applied to C(1:m,i:n) */

            /* Apply H or H'; First copy T to the GPU */
            for (igpu = 0; igpu < ngpu; ++igpu) {
                magma_setdevice(igpu);
                magma_dsetmatrix_async(kb, kb,
                                       T,               kb,
                                       dT(igpu, ind_c), kb, stream[igpu][0] );
            }

            for (igpu = 0; igpu < ngpu; ++igpu) {
                magma_setdevice(igpu);
                magma_queue_sync( stream[igpu][0] ); // check if the data was copied
                magmablasSetKernelStream(stream[igpu][1]);
                magma_dlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
                                 m-i, nlocal[igpu], kb,
                                 dA_c(igpu, ind_c, i, 0), lddac, dT(igpu, ind_c), kb,
                                 dC(igpu, i, 0), lddc,
                                 dwork(igpu, ind_c), lddwork);
                magma_event_record(event[igpu][ind_c], stream[igpu][1] );
            }

            ind_c = (ind_c+1)%2;
        }

        for (igpu = 0; igpu < ngpu; ++igpu) {
            magma_setdevice(igpu);
            magma_queue_sync( stream[igpu][1] );
        }

        //copy C from mgpus
        for (magma_int_t i = 0; i < nbl; ++i) {
            magma_int_t igpu = i%ngpu;
            magma_setdevice(igpu);
            magma_int_t kb = min(nb_l, n-i*nb_l);
            magma_dgetmatrix( m, kb,
                              dC(igpu, 0, i/ngpu*nb_l), lddc,
                              C(0, i*nb_l), ldc );
//            magma_dgetmatrix_async( m, kb,
//                                   dC(igpu, 0, i/ngpu*nb_l), lddc,
//                                   C(0, i*nb_l), ldc, stream[igpu][0] );
        }
    } else {
        // TODO fix memory leak T, dw, event, stream
        fprintf(stderr, "The case (side == right) is not implemented\n");
        *info = MAGMA_ERR_NOT_IMPLEMENTED;
        magma_xerbla( __func__, -(*info) );
        return *info;
        /*
        if ( notran ) {
            i1 = 0;
            i2 = k;
            i3 = nb;
        } else {
            i1 = (k - 1) / nb * nb;
            i2 = 0;
            i3 = -nb;
        }

        mi = m;
        ic = 0;

        for (i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) {
            ib = min(nb, k - i);
            
            // Form the triangular factor of the block reflector
            // H = H(i) H(i+1) . . . H(i+ib-1)
            i__4 = nq - i;
            lapackf77_dlarft("F", "C", &i__4, &ib, A(i, i), &lda,
            &tau[i], T, &ib);
            
            // 1) copy the panel from A to the GPU, and
            // 2) set upper triangular part of dA to identity
            magma_dsetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda );
            magmablas_dlaset_band( MagmaUpper, ib, ib, ib, c_zero, c_one, dA(i, 0), ldda );
            
            // H or H' is applied to C(1:m,i:n)
            ni = n - i;
            jc = i;
            
            // Apply H or H'; First copy T to the GPU
            magma_dsetmatrix( ib, ib, T, ib, dT, ib );
            magma_dlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
            mi, ni, ib,
            dA(i, 0), ldda, dT, ib,
            dC(ic, jc), lddc,
            dwork, lddwork);
        }
        */
    }

    work[0] = MAGMA_D_MAKE( lwkopt, 0 );

    for (igpu = 0; igpu < ngpu; ++igpu) {
        magma_setdevice(igpu);
        magma_event_destroy( event[igpu][0] );
        magma_event_destroy( event[igpu][1] );
        magma_queue_destroy( stream[igpu][0] );
        magma_queue_destroy( stream[igpu][1] );
        magma_free( dw[igpu] );
    }
    magma_setdevice( orig_dev );
    magmablasSetKernelStream( orig_stream );

    return *info;
} /* magma_dormqr */
Beispiel #5
0
void magmablas_ssymm_mgpu_com(
    magma_side_t side, magma_uplo_t uplo, magma_int_t m, magma_int_t n,
    float alpha,
    float *dA[],    magma_int_t ldda,  magma_int_t offset,
    float *dB[],    magma_int_t lddb,
    float beta,     float *dC[], magma_int_t lddc,
    float *dwork[], magma_int_t dworksiz,
    float *C,       magma_int_t ldc,
    float *work[],  magma_int_t worksiz,
    magma_int_t ngpu, magma_int_t nb, 
    magma_queue_t streams[][20], magma_int_t nstream, 
    magma_event_t redevents[][MagmaMaxGPUs*MagmaMaxGPUs+10], magma_int_t nbevents, 
    magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2], magma_int_t nbcmplx )
{
    #define dA(dev, i, j) (dA[dev] + (i) + (j)*ldda)
    #define dB(dev, i, j) (dB[dev] + (i) + (j)*lddb)
    #define dC(dev, i, j) (dC[dev] + (i) + (j)*lddc)
    #define dwork(dev, i, j) (dwork[dev] + (i) + (j)*lddwork)
    #define C(i, j) (C + (i) + (j)*ldc)
    //printf("####################################################\n");
    //printf("                      start ssymm                   \n");
    //printf("####################################################\n");
   
    if ( side != MagmaLeft || uplo != MagmaLower ) {
        fprintf( stderr, "%s: only Left Lower implemented\n", __func__ );
    }
    
    assert( ldda >= m );
    assert( lddb >= m );
    assert( lddc >= m );
    assert( nstream >= ngpu );
    assert( nbevents >= ngpu*ngpu );
   
    
    float c_one  = MAGMA_S_ONE;

    float *dwork1[MagmaMaxGPUs];
    float *dwork2[MagmaMaxGPUs];


    magma_int_t maxgsize    = n*m;
    magma_int_t lddwork = lddc;
    magma_int_t ldwork  = m;
    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        dwork1[dev] = dwork[dev];  // size of dwork1 is n*lddwork
        dwork2[dev] = dwork[dev]+n*lddwork;  // size of dwork2 is maxgsize*ngpu
    }
    assert( dworksiz >= (n*lddwork+maxgsize*ngpu) );
    assert( worksiz  >= (n*ldwork) );

        
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_t cstream;
    magmablasGetKernelStream(&cstream);


    magma_int_t dev, devperm, myblk, mycolsize, myblkoffst;
    magma_int_t gmaster;
    magma_int_t masterdev, lcdev, lccolsize, myngpu;

    magma_int_t stdev       = (offset/nb)%ngpu;  
    magma_int_t blockoffset = offset % nb;  
    magma_int_t fstblksiz   = 0;
    if(blockoffset>0){
        fstblksiz   = min(m, (nb - blockoffset));
    }
    //magma_int_t nbblk       = magma_ceildiv(m, nb);
    magma_int_t nbblk       = magma_ceildiv((m+blockoffset), nb);
    magma_int_t remm        = m- fstblksiz;
    magma_int_t nbblkoffst  = offset/nb;


    magma_int_t nblstblks = -1;
    magma_int_t devlstblk = -1;
    magma_int_t lstblksiz = remm%nb;
    if(lstblksiz>0){
        nblstblks = nbblk%ngpu;
        devlstblk = (nblstblks-1+ngpu)%ngpu;
    }

    magma_int_t nbcmplxactive =  0;
    magma_int_t cmplxisactive[MagmaMaxGPUs];
    magma_int_t gpuisactive[MagmaMaxGPUs];
    memset(gpuisactive, 0, MagmaMaxGPUs*sizeof(magma_int_t));
    memset(cmplxisactive, 0, MagmaMaxGPUs*sizeof(magma_int_t));


    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        magmablasSetKernelStream( streams[ dev ][ 0 ] );
        cudaMemset(dwork(dev,0,0), 0, (lddwork)*(n)*sizeof(float) );
        // put all dC on all dev to 0 except the one which
        // hold i==0 because this one has to multiply by beta.
        if(dev!=stdev){
           cudaMemset(dC(dev,0,0), 0, (lddc)*(n)*sizeof(float) );
        }
    }

    magma_int_t newoffset = offset;
    // 1. symmetrize
    if(blockoffset>0){
        newoffset  = offset+fstblksiz; // newoffset is adjusted over nb
        magma_int_t myblkoffst = (nbblkoffst/ngpu)+(nbblkoffst%ngpu > stdev?1:0);
        //printf("STDEV %d  voici offset %d remm %d   myblockoffset %d    siz %d \n", stdev, offset, remm, myblkoffst, fstblksiz);
        magma_setdevice( stdev );
        magmablasSetKernelStream( streams[ stdev ][ 0 ] );
        magmablas_ssymmetrize_tiles(  MagmaLower,  fstblksiz,  dA(stdev, offset, myblkoffst*nb+blockoffset),  ldda,  1,  ngpu*nb,  nb  );         
    }

    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_int_t newstdev      = (newoffset/nb)%ngpu;
        magma_int_t nbblk = remm/nb; // number of block of size nb. if m%nb>0 then a last block exist and is of size ib=m%nb
        magma_int_t myblk = (nbblk/ngpu) + (nbblk%ngpu > ((dev-newstdev+ngpu)%ngpu) ?  1:0 );
        magma_int_t devperm   = (dev-newstdev+ngpu)%ngpu;
        magma_int_t nbblkoffst = newoffset/nb;
        magma_int_t myblkoffst = (nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0);
        //printf("dev %d  devperm %d   newoffset %d  rowoff %d    coloff %d    myblk %d  \n", dev, devperm, newoffset, newoffset+devperm*nb, myblkoffst*nb, myblk);
        magma_setdevice( dev );
        magmablasSetKernelStream( streams[ dev ][ 0 ] );
        magmablas_ssymmetrize_tiles(  MagmaLower,  nb,  dA(dev, newoffset+devperm*nb, myblkoffst*nb),  ldda,  myblk,  ngpu*nb,  nb  );
        if(remm%nb>0){
            magma_int_t nblstblks = (nbblk+1)%ngpu;
            magma_int_t devlstblk = (nblstblks-1+ngpu)%ngpu;
            //printf("==> siz %d devperm %d,    devlstblk %d,    newoffset+nbblk*nb %d,   myblkoffst*nb+ myblk*nb %d\n", remm % nb, devperm, devlstblk, newoffset+nbblk*nb, myblkoffst*nb+ myblk*nb);
            if(devperm==devlstblk)
                magmablas_ssymmetrize(  MagmaLower,  remm % nb,  dA(dev, newoffset+nbblk*nb, myblkoffst*nb+ myblk*nb),  ldda );  // last partial tile
        }
    }


    

/*
    magma_int_t siz = m+offset;
    float *R;
    magma_smalloc_cpu( &R, siz*siz );
    // collecte back A
    magmablas_sgetmatrix_1D_bcyclic( siz, siz, dA, ldda, R, siz, ngpu, nb );
    magma_setdevice( 0 );
    magmablasSetKernelStream( streams[ dev ][ 0 ] );
    //magma_sgetmatrix( siz, siz, dA[0], ldda, R, siz );
    FILE *trace_file;
    trace_file = fopen("AJETE/Aafter", "w");
    for (int j = 0; j < siz ; j++) 
          for (int i = 0; i < siz ; i++) 
                         fprintf(trace_file, "%10d%10d%40.30e\n", i+1, j+1, R[j*siz+i]);
    fclose(trace_file);
return;
*/
    

    // ROW GEMM transpose a row and make a gemm with a block
    // if only 1 GPU used the ROW GEMM is integrated with the 
    // COL GEMM (better accuracy observed) and better perf
    if(ngpu>1){
        for( magma_int_t i = fstblksiz; i < m; i += nb ) {
            magma_int_t ib     = min( nb, m-i );      // block size
            magma_int_t ioff   = i + offset;          // start global index in parent matrix
            //magma_int_t dev    = (ioff / nb) % ngpu;
            magma_int_t nbblkoffst = offset/nb;
            magma_int_t nbblk      = magma_ceildiv(i, nb);
            for( magma_int_t dev = 0; dev < ngpu; ++dev ) {


                magma_int_t myblk = (nbblk/ngpu) + (nbblk%ngpu > ((dev-stdev+ngpu)%ngpu) ?  1:0 );
                magma_int_t myblkoffst = (nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0);

                magma_int_t myrowsize = myblk * nb;
                magma_int_t coloffset = myblkoffst*nb;
                if(dev==stdev) {
                    myrowsize = myrowsize -blockoffset;
                    coloffset = myblkoffst*nb+blockoffset;
                }
                //printf("ROW GEMM: voici i %d   ib %d    ioff %d   nbblkoffst %d stdev %d  dev %d myblk %d  myblkoffset %d  coloffset %d  rowsize %d\n", i, ib, ioff, nbblkoffst, stdev, dev, myblk, myblkoffst, coloffset, myrowsize);
                if(myrowsize>0){
                    magma_setdevice( dev );
                    magmablasSetKernelStream( streams[ dev ][ 1 ] );    
                    magma_sgemm( MagmaConjTrans, MagmaNoTrans, myrowsize, n, ib,
                                 alpha, dA(dev,ioff,coloffset), ldda,
                                        dB(dev,i,0),    lddb,
                                 c_one, dwork(dev,0,0), lddwork );
                }
            }
        }
        for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
            magma_setdevice( dev );
            magma_event_record(redevents[dev][1], streams[dev][1]);
        }
    }
    

    // COL GEMM
    // blockoffset is offset within first block; for subsequent blocks it is 0
    if(blockoffset>0){
        magma_int_t ib     = min( nb-blockoffset, m );  // block size
        magma_int_t iblock = (offset / nb) / ngpu;          // local block id
        magma_int_t di     = iblock*nb+blockoffset;       // local index in parent matrix
        magma_setdevice( stdev );
        magmablasSetKernelStream( streams[ stdev ][ 0 ] );        
        //printf("DEV %d COL GEMM first   ioff %d  di %d   m %d   n %d   ib %d \n", stdev, offset, di, m, n, ib);
        magma_sgemm( MagmaNoTrans, MagmaNoTrans, m, n, ib,
                        alpha, dA(stdev,offset,di), ldda,
                               dB(stdev,0,0),     lddb,
                        beta,  dC(stdev,0,0),     lddc );
    }
   


    // COL GEMM
    for( magma_int_t i = fstblksiz; i < m; i += nb ) {
        magma_int_t ib     = min( nb, m-i );      // block size
        magma_int_t ioff   = i + offset;          // start global index in parent matrix
        magma_int_t iblock = (ioff / nb) / ngpu;  // local block id
        magma_int_t dev    = (ioff / nb) % ngpu;
        magma_int_t di     = iblock*nb;           // local index in parent matrix
        
        //printf("DEV %d COL GEMM i %d      ioff %d  di %d m-i %d    n %d   ib %d \n", dev, i, ioff, di, m-i, n, ib);
        
        magma_setdevice( dev );
        magmablasSetKernelStream( streams[ dev ][ 0 ] );
        if(i==0){
           magma_sgemm( MagmaNoTrans, MagmaNoTrans, m-i, n, ib,
                        alpha, dA(dev,ioff,di), ldda,
                               dB(dev,i,0),     lddb,
                        beta,  dC(dev,i,0),     lddc );
        }else{
           magma_sgemm( MagmaNoTrans, MagmaNoTrans, m-i, n, ib,
                        alpha, dA(dev,ioff,di), ldda,
                               dB(dev,i,0),        lddb,
                        c_one, dC(dev,i,0),     lddc );
        }
        magma_event_record(redevents[dev][0], streams[dev][0]);
        // if only 1 GPU is used, do the ROW GEMM
        if(ngpu==1){
            // NOTE THAT because the COL gemm write dC below the diagonal (i) 
            // and the ROW GEMM write dC from 0 to diag-1, so they could 
            // run in parallel on different streams. 
            // 
            // NO NO NO because
            // it might happen that col finished i and strated i+1 while row still at i    
            // magmablasSetKernelStream( streams[ dev ][ 0 ] );
            magma_sgemm( MagmaConjTrans, MagmaNoTrans, i, n, ib,
                         alpha, dA(dev,ioff,offset), ldda,
                                dB(dev,i,0),    lddb,
                         c_one, dC(dev,0,0),    lddc );
        }
    }


    
    if(ngpu>1){
        for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
            magma_int_t nbblk    = magma_ceildiv((m+blockoffset), nb);
            magma_int_t nbblkrow = nbblk-1; 
            magma_int_t devperm  = (dev-stdev+ngpu)%ngpu;
            magma_int_t myblk = (nbblkrow/ngpu) + (nbblkrow%ngpu > devperm ?  1:0 );
            magma_int_t myrowsize = myblk * nb;
             if(dev==stdev) {
                myrowsize = myrowsize - blockoffset;
            }
      
            //printf("blockoffset %d nbblkrow %d devperm %d  DEV %d RECEIVING myblk %d  myrowsize %d\n", blockoffset, nbblkrow, devperm, dev, myblk, myrowsize);
            if(myrowsize>0){
                magma_setdevice( dev );
                magmablasSetKernelStream( streams[ dev ][ 0 ] );
                magma_queue_wait_event(streams[ dev ][ 0 ], redevents[dev][1]);
                //magma_queue_sync( streams[ dev ][ 1 ] );
                // for each dev add the computed ROW block each on its placment with dC
                for( magma_int_t blki = 0; blki < myblk; ++blki){
                    magma_int_t gbblki = (blki*ngpu + devperm)*nb - blockoffset;
                    magma_int_t lcblki = blki*nb;
                    magma_int_t ib     = nb;// min(nb, m-gbblki);
                    if(dev==stdev){
                        lcblki = blki*nb-blockoffset;
                        if(blki==0){
                            gbblki = 0;
                            lcblki = 0;
                            ib     = nb-blockoffset;
                        }
                    }
                    magmablas_sgeadd(ib, n, c_one, 
                                    &dwork[dev][lcblki], lddwork, 
                                    &dC[dev][gbblki]   , lddc   );
                }
                magma_event_record(redevents[dev][0], streams[dev][0]);                
            }
        }
    }




    // ===========================================================
    //             COMMUNICATION ALL_REDUCE_SUM 
    // ===========================================================
    if(ngpu==1){
        return;
    }
    // INITIALIZE COMM
    for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
        masterdev     = -1;
        gnode[cmplxid][MagmaMaxGPUs+1] = -1;
        myngpu = gnode[cmplxid][MagmaMaxGPUs];
        for( magma_int_t idev = 0; idev < myngpu; ++idev ) {
            dev         = gnode[cmplxid][idev];
            devperm     = (dev-stdev+ngpu)%ngpu;
            myblk       = (nbblk/ngpu) + (nbblk%ngpu > devperm ?  1:0 );
            mycolsize   = myblk*nb;
            myblkoffst  = nb*((nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0));            
            if(dev==stdev){
                mycolsize  -=  blockoffset;
                myblkoffst +=  blockoffset;     // local index in parent matrix
            }
            if((devperm==devlstblk)&&(lstblksiz>0)){
                mycolsize -=  (nb-(remm%nb));
            }
            mycolsize = min(mycolsize, m);
            if(mycolsize>0){
                gpuisactive[dev] = mycolsize;
                if(masterdev==-1) {
                    masterdev     = dev;
                    nbcmplxactive = nbcmplxactive +1;
                    cmplxisactive[cmplxid] = 1;
                    gnode[cmplxid][MagmaMaxGPUs+1] = masterdev;
                }
            }
        }
    }
/*
    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        magma_device_sync();
    }
*/
    //*******************************
    //  each GPU send its result
    //  to its master. The master make
    //  the addition and then send to 
    //  to the masters of other real
    //  and receive from the masters of 
    //  other real make the addition 
    //  and broadcast locally the final 
    //  result.
    //*******************************
    //printf("=======================================================================\n");
    //printf("                     sending to my master                             \n");
    //printf("=======================================================================\n");
    for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
        myngpu    = gnode[cmplxid][MagmaMaxGPUs];
        masterdev = gnode[cmplxid][MagmaMaxGPUs+1];
        //check if real is active
        if(masterdev!=-1){ 
            for( magma_int_t idev = 0; idev < myngpu; ++idev ) {
                dev         = gnode[cmplxid][idev];
                mycolsize   = gpuisactive[dev];
                if(mycolsize>0){
                    // I am an active GPU. if I am not the master, then send my result to my master.
                    // store result on dwork[masterdev][dev*maxgsize]
                    if(dev!=masterdev){
                        magma_setdevice( dev );        
                        //printf("             GPU %d sending to my master %d\n", dev, masterdev);
                        // wait the geadd of my ROW and COL GEMM is done
                        magma_queue_wait_event(streams[ dev ][ 0 ], redevents[dev][0]);
                        // sending to the master of my real
                        magma_scopymatrix_async(
                            m, n,
                            &dC[dev][0], lddc,
                            &dwork2[masterdev][maxgsize*dev], m, streams[dev][0] );
                        magma_event_record(redevents[dev][masterdev], streams[dev][0]);
                    } // end I am not the masterdev
                }// end if mycolsize>0
            }// for idev
        }// end of if masterdev!=-1 maening real is active
    }// for cmplxid
/*
    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        magma_device_sync();
    }
*/

    //printf("=======================================================================\n");
    //printf(" each master do addition of local result and broadcast to other masters \n");
    //printf("=======================================================================\n");
    for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
        myngpu    = gnode[cmplxid][MagmaMaxGPUs];
        masterdev = gnode[cmplxid][MagmaMaxGPUs+1];
        //check if real is active
        if(masterdev!=-1){ 
            magma_setdevice( masterdev ); 
            // addition is done on stream 0 sequentially
            magmablasSetKernelStream( streams[ masterdev ][ 0 ] );
            // wait the geadd of my ROW and COL GEMM is done
            magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[masterdev][0]);
            // ========================================
            //     local addition
            // ========================================
            for( magma_int_t l = 0; l < myngpu; ++l ) {
                lcdev         = gnode[cmplxid][l];
                lccolsize     = gpuisactive[lcdev];
                if((lcdev!=masterdev)&&(lccolsize>0)){
                    //printf("             master %d receiving from %d and adding \n", masterdev, lcdev);
                    // this is an active GPU of my real. 
                    // wait I received what he send it to me and then do addition.
                    magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[lcdev][masterdev]);
                    magmablas_sgeadd(m, n, c_one, 
                                    &dwork2[masterdev][maxgsize*lcdev], m, 
                                    &dC[masterdev][0]   , lddc   );
                }
            }// for l=1:myngpu
            // because addition is done sequentially on stream 0, 
            // I have to record this to be able to synch using it 
            magma_event_record(redevents[masterdev][masterdev], streams[masterdev][0]);
            // ========================================
            //
            // ========================================
            //      send to other masters
            // ========================================
            for( magma_int_t k = 0; k < nbcmplx; ++k ) {
                if(k!=cmplxid){
                    gmaster = gnode[k][MagmaMaxGPUs+1];
                    if(gmaster!=-1){ //real is active
                         //Master has to  wait until finish the local addition then send using gmaster stream.
                         //use stream 0 to make it sequential or stream gmaster to make it parallel.
                         //Now both re the same.
                        //printf("             master %d from cmplx %d sending to other master %d on cmplx %d \n", masterdev, cmplxid, gmaster, k);
                        magma_queue_wait_event(streams[ masterdev ][ gmaster ], redevents[masterdev][masterdev]);
                        magma_scopymatrix_async(
                            m, n,
                            &dC[masterdev][0], lddc,
                            &dwork2[gmaster][maxgsize*masterdev], m, streams[masterdev][gmaster] );
                        magma_event_record(redevents[masterdev][gmaster], streams[masterdev][gmaster]);
                        magma_event_record(redevents[masterdev][masterdev], streams[masterdev][gmaster]);
                      } // end of gmaster!=-1
                } // end of k!=cmplxid
            }// for k = 0: nbcmplx
            // ========================================
        }// end of if masterdev!=-1 maening real is active
    }// for cmplxid
/*
    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        magma_device_sync();
    }
*/
    //printf("=======================================================================\n");
    //printf(" each master wait receiving other masters results, do the addition and broadcast locally \n");
    //printf("=======================================================================\n");
    for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
        myngpu    = gnode[cmplxid][MagmaMaxGPUs];
        masterdev = gnode[cmplxid][MagmaMaxGPUs+1];
        //check if real is active
        if(masterdev!=-1){ 
            magma_setdevice( masterdev ); 
            // addition is done on stream 0 sequentially
            magmablasSetKernelStream( streams[ masterdev ][ 0 ] );
            // master has to wait until finishing all the send to other masters.
            magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[masterdev][masterdev]);
            // ========================================
            //  addition of results from other masters
            // ========================================
            for( magma_int_t k = 0; k < nbcmplx; ++k ) {
                if(k!=cmplxid){
                    gmaster = gnode[k][MagmaMaxGPUs+1];
                    if(gmaster!=-1){ //real is active
                        //Master has to  wait until receiving from gmaster, then do addition using stream 0
                        //printf("             master %d from cmplx %d receiving from other master %d on cmplx %d and adding \n", masterdev, cmplxid, gmaster, k);
                        magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[gmaster][masterdev]);
                        magmablas_sgeadd(m, n, c_one, 
                                        &dwork2[masterdev][maxgsize*gmaster], m, 
                                        &dC[masterdev][0]   , lddc   );
                    } // end of gmaster!=-1
                } // end of k!=cmplxid
            }// for k = 0: nbcmplx
            // because addition is done sequentially on stream 0, 
            // I have to record this to be able to synch using it 
            magma_event_record(redevents[masterdev][masterdev], streams[masterdev][0]);
            // ========================================
            // ========================================
            //     local broadcast of final results
            // ========================================
            for( magma_int_t l = 0; l < myngpu; ++l ) {
                lcdev         = gnode[cmplxid][l];
                lccolsize     = gpuisactive[lcdev];
                if((lcdev!=masterdev)&&(lccolsize>0)){
                    // this is an active GPU of my real. 
                    // wait the previous addition is done maening stream 0 is finished and broadcast sequentially for now.
                    // to make it parallel put stream lcdev instead of stream 0
                    //printf("             master %d broadcasting local to %d  \n", masterdev, lcdev);
                    magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[masterdev][masterdev]);
                    magma_scopymatrix_async(
                        m, n,
                        &dC[masterdev][0], lddc,
                        &dC[lcdev][0],     lddc, streams[masterdev][0] );
                    magma_event_record(redevents[masterdev][lcdev], streams[masterdev][0]);
                }
            }// for l=1:myngpu
            // ========================================
        }// end of if masterdev!=-1 maening real is active
    }// for cmplxid
/*
    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        magma_device_sync();
    }
*/


    for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
        myngpu    = gnode[cmplxid][MagmaMaxGPUs];
        masterdev = gnode[cmplxid][MagmaMaxGPUs+1];
        //check if real is active
        if(masterdev!=-1){ 
            for( magma_int_t l = 0; l < myngpu; ++l ) {
                lcdev         = gnode[cmplxid][l];
                lccolsize     = gpuisactive[lcdev];
                if(lccolsize>0){
                    magma_setdevice( lcdev );
                    magma_queue_wait_event(streams[ lcdev ][ 0 ], redevents[lcdev][0]);
                    magma_queue_wait_event(streams[ lcdev ][ 0 ], redevents[masterdev][lcdev]);
                }
            }// for l=1:myngpu
        }// end of if masterdev!=-1 maening real is active
    }// for cmplxid


 
   //printf("****************************************************\n");
   //printf("                      finish ssymm                   \n");
   //printf("****************************************************\n");

    magma_setdevice( cdev );
    magmablasSetKernelStream( cstream );

}
Beispiel #6
0
/**
    Purpose
    -------
    SGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.  This version does not
    require work space on the GPU passed as input. GPU memory is allocated
    in the routine.

    The factorization has the form
        A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.

    It uses 2 queues to overlap communication and computation.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    A       REAL array, dimension (LDA,N)
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.
    \n
            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    @param[out]
    ipiv    INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
      -     > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    @ingroup magma_sgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_sgetrf(
    magma_int_t m, magma_int_t n,
    float *A, magma_int_t lda,
    magma_int_t *ipiv,
    magma_int_t *info)
{
    #ifdef HAVE_clBLAS
    #define  dA(i_, j_)     dA, ((i_)*nb  + (j_)*nb*ldda + dA_offset)
    #define dAT(i_, j_)    dAT, ((i_)*nb*lddat + (j_)*nb + dAT_offset)
    #define dwork(i_)    dwork, (i_)
    #else
    #define  dA(i_, j_) (   dA + (i_)*nb  + (j_)*nb*ldda)
    #define dAT(i_, j_) (  dAT + (i_)*nb*lddat + (j_)*nb)
    #define dwork(i_)   (dwork + (i_))
    #endif
    
    // Constants
    const float c_one     = MAGMA_S_ONE;
    const float c_neg_one = MAGMA_S_NEG_ONE;
    
    // Local variables
    float *work;
    magmaFloat_ptr dA, dAT, dwork;
    magma_int_t iinfo, nb;

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (lda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    nb = magma_get_sgetrf_nb( m, n );

    if ( (nb <= 1) || (nb >= min(m,n)) ) {
        /* Use CPU code. */
        lapackf77_sgetrf( &m, &n, A, &lda, ipiv, info );
    }
    else {
        /* Use hybrid blocked code. */
        magma_int_t maxm, maxn, ldda, lddat, maxdim;
        magma_int_t i, j, rows, cols, s = min(m, n)/nb;
        
        maxm = magma_roundup( m, 32 );
        maxn = magma_roundup( n, 32 );
        maxdim = max( maxm, maxn );
        
        lddat = maxn;
        ldda  = maxm;
        
        /* set number of GPUs */
        magma_int_t ngpu = magma_num_gpus();
        if ( ngpu > 1 ) {
            /* call multi-GPU non-GPU-resident interface  */
            magma_sgetrf_m( ngpu, m, n, A, lda, ipiv, info );
            return *info;
        }
        
        magma_queue_t queues[2] = { NULL, NULL };
        magma_device_t cdev;
        magma_getdevice( &cdev );
        magma_queue_create( cdev, &queues[0] );
        magma_queue_create( cdev, &queues[1] );
        
        /* check the memory requirement */
        size_t mem_size = magma_queue_mem_size( queues[0] );
        mem_size /= sizeof(float);

        magma_int_t h = 1+(2+ngpu);
        magma_int_t ngpu2 = ngpu;
        magma_int_t NB = (magma_int_t)(0.8*mem_size/maxm - h*nb);
        const char* ngr_nb_char = getenv("MAGMA_NGR_NB");
        if ( ngr_nb_char != NULL )
            NB = max( nb, min( NB, atoi(ngr_nb_char) ) );

        if ( ngpu > ceil((float)NB/nb) ) {
            ngpu2 = (magma_int_t)ceil((float)NB/nb);
            h = 1+(2+ngpu2);
            NB = (magma_int_t)(0.8*mem_size/maxm - h*nb);
        }
        if ( ngpu2*NB < n ) {
            /* require too much memory, so call non-GPU-resident version */
            magma_sgetrf_m( ngpu, m, n, A, lda, ipiv, info );
            return *info;
        }

        work = A;
        if (maxdim*maxdim < 2*maxm*maxn) {
            // if close to square, allocate square matrix and transpose in-place
            // dwork is nb*maxm for panel, and maxdim*maxdim for A
            if (MAGMA_SUCCESS != magma_smalloc( &dwork, nb*maxm + maxdim*maxdim )) {
                /* alloc failed so call non-GPU-resident version */
                magma_sgetrf_m( ngpu, m, n, A, lda, ipiv, info );
                return *info;
            }
            dA = dwork + nb*maxm;
            
            ldda = lddat = maxdim;
            magma_ssetmatrix( m, n, A, lda, dA(0,0), ldda, queues[0] );
            
            dAT = dA;
            magmablas_stranspose_inplace( maxdim, dAT(0,0), lddat, queues[0] );
        }
        else {
            // if very rectangular, allocate dA and dAT and transpose out-of-place
            // dwork is nb*maxm for panel, and maxm*maxn for A
            if (MAGMA_SUCCESS != magma_smalloc( &dwork, (nb + maxn)*maxm )) {
                /* alloc failed so call non-GPU-resident version */
                magma_sgetrf_m( ngpu, m, n, A, lda, ipiv, info );
                return *info;
            }
            dA = dwork + nb*maxm;
            
            magma_ssetmatrix( m, n, A, lda, dA(0,0), ldda, queues[0] );
            
            if (MAGMA_SUCCESS != magma_smalloc( &dAT, maxm*maxn )) {
                /* alloc failed so call non-GPU-resident version */
                magma_free( dwork );
                magma_sgetrf_m( ngpu, m, n, A, lda, ipiv, info );
                return *info;
            }
            
            magmablas_stranspose( m, n, dA(0,0), ldda, dAT(0,0), lddat, queues[0] );
        }
        
        lapackf77_sgetrf( &m, &nb, work, &lda, ipiv, &iinfo );

        for( j = 0; j < s; j++ ) {
            // get j-th panel from device
            cols = maxm - j*nb;
            
            if (j > 0) {
                magmablas_stranspose( nb, cols, dAT(j,j), lddat, dwork(0), cols, queues[0] );
                magma_queue_sync( queues[0] );
                
                magma_sgetmatrix_async( m-j*nb, nb, dwork(0), cols, work, lda, queues[1] );
                
                magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n - (j+1)*nb, nb,
                             c_one, dAT(j-1,j-1), lddat,
                                    dAT(j-1,j+1), lddat, queues[0] );
                magma_sgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(j+1)*nb, m-j*nb, nb,
                             c_neg_one, dAT(j-1,j+1), lddat,
                                        dAT(j,  j-1), lddat,
                             c_one,     dAT(j,  j+1), lddat, queues[0] );
                
                // do the cpu part
                rows = m - j*nb;
                magma_queue_sync( queues[1] );
                lapackf77_sgetrf( &rows, &nb, work, &lda, ipiv+j*nb, &iinfo );
            }
            if (*info == 0 && iinfo > 0)
                *info = iinfo + j*nb;

            // put j-th panel onto device
            magma_ssetmatrix_async( m-j*nb, nb, work, lda, dwork(0), cols, queues[1] );
            
            for( i=j*nb; i < j*nb + nb; ++i ) {
                ipiv[i] += j*nb;
            }
            magmablas_slaswp( n, dAT(0,0), lddat, j*nb + 1, j*nb + nb, ipiv, 1, queues[0] );

            magma_queue_sync( queues[1] );
            
            magmablas_stranspose( cols, nb, dwork(0), cols, dAT(j,j), lddat, queues[0] );

            // do the small non-parallel computations (next panel update)
            if (s > (j+1)) {
                magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             nb, nb,
                             c_one, dAT(j, j  ), lddat,
                                    dAT(j, j+1), lddat, queues[0] );
                magma_sgemm( MagmaNoTrans, MagmaNoTrans,
                             nb, m-(j+1)*nb, nb,
                             c_neg_one, dAT(j,   j+1), lddat,
                                        dAT(j+1, j  ), lddat,
                             c_one,     dAT(j+1, j+1), lddat, queues[0] );
            }
            else {
                magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n-s*nb, nb,
                             c_one, dAT(j, j  ), lddat,
                                    dAT(j, j+1), lddat, queues[0] );
                magma_sgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(j+1)*nb, m-(j+1)*nb, nb,
                             c_neg_one, dAT(j,   j+1), lddat,
                                        dAT(j+1, j  ), lddat,
                             c_one,     dAT(j+1, j+1), lddat, queues[0] );
            }
        }
        
        magma_int_t nb0 = min( m - s*nb, n - s*nb );
        if ( nb0 > 0 ) {
            rows = m - s*nb;
            cols = maxm - s*nb;
            
            magmablas_stranspose( nb0, rows, dAT(s,s), lddat, dwork(0), cols, queues[0] );
            magma_sgetmatrix_async( rows, nb0, dwork(0), cols, work, lda, queues[0] );
            magma_queue_sync( queues[0] );
            
            // do the cpu part
            lapackf77_sgetrf( &rows, &nb0, work, &lda, ipiv+s*nb, &iinfo );
            if (*info == 0 && iinfo > 0)
                *info = iinfo + s*nb;
            
            for( i=s*nb; i < s*nb + nb0; ++i ) {
                ipiv[i] += s*nb;
            }
            magmablas_slaswp( n, dAT(0,0), lddat, s*nb + 1, s*nb + nb0, ipiv, 1, queues[0] );
            
            // put j-th panel onto device
            magma_ssetmatrix_async( rows, nb0, work, lda, dwork(0), cols, queues[0] );
            magmablas_stranspose( rows, nb0, dwork(0), cols, dAT(s,s), lddat, queues[0] );
    
            magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                         n-s*nb-nb0, nb0,
                         c_one, dAT(s, s),     lddat,
                                dAT(s, s)+nb0, lddat, queues[0] );
        }
        
        // undo transpose
        if (maxdim*maxdim < 2*maxm*maxn) {
            magmablas_stranspose_inplace( maxdim, dAT(0,0), lddat, queues[0] );
            magma_sgetmatrix( m, n, dAT(0,0), lddat, A, lda, queues[0] );
        }
        else {
            magmablas_stranspose( n, m, dAT(0,0), lddat, dA(0,0), ldda, queues[0] );
            magma_sgetmatrix( m, n, dA(0,0), ldda, A, lda, queues[0] );
            magma_free( dAT );
        }
        magma_free( dwork );
 
        magma_queue_destroy( queues[0] );
        magma_queue_destroy( queues[1] );
    }
    
    return *info;
} /* magma_sgetrf */
Beispiel #7
0
/**
    Purpose
    -------
    DGEHRD reduces a DOUBLE PRECISION general matrix A to upper Hessenberg form H by
    an orthogonal similarity transformation:  Q' * A * Q = H . This version
    stores the triangular matrices used in the factorization so that they can
    be applied directly (i.e., without being recomputed) later. As a result,
    the application of Q is much faster.

    Arguments
    ---------
    @param[in]
    n       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in]
    ilo     INTEGER
    @param[in]
    ihi     INTEGER
            It is assumed that A is already upper triangular in rows
            and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally
            set by a previous call to DGEBAL; otherwise they should be
            set to 1 and N respectively. See Further Details.
            1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0.

    @param[in,out]
    A       DOUBLE PRECISION array, dimension (LDA,N)
            On entry, the N-by-N general matrix to be reduced.
            On exit, the upper triangle and the first subdiagonal of A
            are overwritten with the upper Hessenberg matrix H, and the
            elements below the first subdiagonal, with the array TAU,
            represent the orthogonal matrix Q as a product of elementary
            reflectors. See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    tau     DOUBLE PRECISION array, dimension (N-1)
            The scalar factors of the elementary reflectors (see Further
            Details). Elements 1:ILO-1 and IHI:N-1 of TAU are set to
            zero.

    @param[out]
    work    (workspace) DOUBLE PRECISION array, dimension (LWORK)
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The length of the array WORK.  LWORK >= N*NB,
            where NB is the optimal blocksize.
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    @param[out]
    dT      DOUBLE PRECISION array on the GPU, dimension NB*N,
            where NB is the optimal blocksize. It stores the NB*NB blocks
            of the triangular T matrices used in the reduction.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value.

    Further Details
    ---------------
    The matrix Q is represented as a product of (ihi-ilo) elementary
    reflectors

       Q = H(ilo) H(ilo+1) . . . H(ihi-1).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on
    exit in A(i+2:ihi,i), and tau in TAU(i).

    The contents of A are illustrated by the following example, with
    n = 7, ilo = 2 and ihi = 6:

    @verbatim
    on entry,                        on exit,

    ( a   a   a   a   a   a   a )    (  a   a   h   h   h   h   a )
    (     a   a   a   a   a   a )    (      a   h   h   h   h   a )
    (     a   a   a   a   a   a )    (      h   h   h   h   h   h )
    (     a   a   a   a   a   a )    (      v2  h   h   h   h   h )
    (     a   a   a   a   a   a )    (      v2  v3  h   h   h   h )
    (     a   a   a   a   a   a )    (      v2  v3  v4  h   h   h )
    (                         a )    (                          a )
    @endverbatim

    where a denotes an element of the original matrix A, h denotes a
    modified element of the upper Hessenberg matrix H, and vi denotes an
    element of the vector defining H(i).

    This implementation follows the hybrid algorithm and notations described in

    S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg
    form through hybrid GPU-based computing," University of Tennessee Computer
    Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219),
    May 24, 2009.
    
    This version stores the T matrices in dT, for later use in magma_dorghr.

    @ingroup magma_dgeev_comp
    ********************************************************************/
extern "C" magma_int_t
magma_dgehrd(
    magma_int_t n, magma_int_t ilo, magma_int_t ihi,
    double *A, magma_int_t lda,
    double *tau,
    double *work, magma_int_t lwork,
    magmaDouble_ptr dT,
    magma_int_t *info)
{
    #define  A(i_,j_) ( A + (i_) + (j_)*lda)

    #ifdef HAVE_clBLAS
    #define dA(i_,j_)  dwork, ((i_) + (j_)*ldda + nb*ldda*2)
    #define dT(i_,j_)  dT,    ((i_) + (j_)*nb   + dT_offset)
    #define dV(i_,j_)  dwork, ((i_) + (j_)*ldda + nb*ldda)
    #define dwork(i_)  dwork, ((i_))
    #else
    #define dA(i_,j_) (dA    + (i_) + (j_)*ldda)
    #define dT(i_,j_) (dT    + (i_) + (j_)*nb)
    #define dV(i_,j_) (dV    + (i_) + (j_)*ldda)
    #define dwork(i_) (dwork + (i_))
    #endif

    // Constants
    const double c_one  = MAGMA_D_ONE;
    const double c_zero = MAGMA_D_ZERO;

    // Local variables
    magma_int_t nb = magma_get_dgehrd_nb( n );
    magma_int_t ldda = magma_roundup( n, 32 );

    magma_int_t i, nh, iws;
    magma_int_t iinfo;
    magma_int_t lquery;

    *info = 0;
    iws = n*nb;
    work[0] = magma_dmake_lwork( iws );

    lquery = (lwork == -1);
    if (n < 0) {
        *info = -1;
    } else if (ilo < 1 || ilo > max(1,n)) {
        *info = -2;
    } else if (ihi < min(ilo,n) || ihi > n) {
        *info = -3;
    } else if (lda < max(1,n)) {
        *info = -5;
    } else if (lwork < iws && ! lquery) {
        *info = -8;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;

    // Adjust from 1-based indexing
    ilo -= 1;
    
    // Quick return if possible
    nh = ihi - ilo;
    if (nh <= 1) {
        work[0] = c_one;
        return *info;
    }

    // Now requires lwork >= iws; else dT won't be computed in unblocked code.
    // If not enough workspace, use unblocked code
    //if ( lwork < iws ) {
    //    nb = 1;
    //}

    if (nb == 1 || nb > nh) {
        // Use unblocked code below
        i = ilo;
    }
    else {
        // Use blocked code
        magma_queue_t queue;
        magma_device_t cdev;
        magma_getdevice( &cdev );
        magma_queue_create( cdev, &queue );
        
        // GPU workspace is:
        //   nb*ldda for dwork for dlahru
        //   nb*ldda for dV
        //   n*ldda  for dA
        magmaDouble_ptr dwork;
        if (MAGMA_SUCCESS != magma_dmalloc( &dwork, 2*nb*ldda + n*ldda )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
        double *dV = dwork + nb*ldda;
        double *dA = dwork + nb*ldda*2;
        
        double *T;
        magma_dmalloc_cpu( &T, nb*nb );
        if ( T == NULL ) {
            magma_free( dwork );
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        
        // zero first block of V, which is lower triangular
        magmablas_dlaset( MagmaFull, nb, nb, c_zero, c_zero, dV(0,0), ldda, queue );
        
        // Set elements 0:ILO-1 and IHI-1:N-2 of TAU to zero
        for (i = 0; i < ilo; ++i)
            tau[i] = c_zero;
        
        for (i = max(0,ihi-1); i < n-1; ++i)
            tau[i] = c_zero;
        
        assert( nb % 4 == 0 );
        for (i=0; i < nb*nb; i += 4)
            T[i] = T[i+1] = T[i+2] = T[i+3] = c_zero;
        
        magmablas_dlaset( MagmaFull, nb, n, c_zero, c_zero, dT(0,0), nb, queue );
        
        // Copy the matrix to the GPU
        magma_dsetmatrix( n, n-ilo, A(0,ilo), lda, dA(0,0), ldda, queue );
        
        for (i = ilo; i < ihi-1 - nb; i += nb) {
            // Reduce columns i:i+nb-1 to Hessenberg form, returning the
            // matrices V and T of the block reflector H = I - V*T*V'
            // which performs the reduction, and also the matrix Y = A*V*T
            
            // Get the current panel (no need for the 1st iteration)
            magma_dgetmatrix( ihi-i, nb,
                              dA(i,i-ilo), ldda,
                              A(i,i), lda, queue );
            
            // add 1 to i for 1-based index
            magma_dlahr2( ihi, i+1, nb,
                          dA(0,i-ilo), ldda,
                          dV(0,0),     ldda,
                          A(0,i),      lda,
                          &tau[i], T, nb, work, n, queue );
            
            // Copy T from the CPU to dT on the GPU
            magma_dsetmatrix( nb, nb, T, nb, dT(0,i-ilo), nb, queue );
            
            magma_dlahru( n, ihi, i, nb,
                          A(0,i),      lda,
                          dA(0,i-ilo), ldda, // dA
                          dA(i,i-ilo), ldda, // dY, stored over current panel
                          dV(0,0),     ldda,
                          dT(0,i-ilo), dwork(0), queue );
        }
        
        // Copy remainder to host
        magma_dgetmatrix( n, n-i,
                          dA(0,i-ilo), ldda,
                          A(0,i), lda, queue );
        
        magma_free( dwork );
        magma_free_cpu( T );
        
        magma_queue_destroy( queue );
    }

    // Use unblocked code to reduce the rest of the matrix
    // add 1 to i for 1-based index
    i += 1;
    lapackf77_dgehd2(&n, &i, &ihi, A, &lda, tau, work, &iinfo);
    work[0] = magma_dmake_lwork( iws );

    return *info;
} /* magma_dgehrd */
Beispiel #8
0
extern "C" magma_int_t
magma_dormqr_m(magma_int_t nrgpu, char side, char trans,
               magma_int_t m, magma_int_t n, magma_int_t k,
               double *a,    magma_int_t lda,
               double *tau,
               double *c,    magma_int_t ldc,
               double *work, magma_int_t lwork,
               magma_int_t *info)
{
/*  -- MAGMA (version 1.4.1) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       December 2013

    Purpose
    =======
    DORMQR overwrites the general real M-by-N matrix C with

                    SIDE = 'L'     SIDE = 'R'
    TRANS = 'N':      Q * C          C * Q
    TRANS = 'T':      Q**T * C       C * Q**T

    where Q is a real orthogonal matrix defined as the product of k
    elementary reflectors

          Q = H(1) H(2) . . . H(k)

    as returned by DGEQRF. Q is of order M if SIDE = 'L' and of order N
    if SIDE = 'R'.

    Arguments
    =========
    SIDE    (input) CHARACTER*1
            = 'L': apply Q or Q**T from the Left;
            = 'R': apply Q or Q**T from the Right.

    TRANS   (input) CHARACTER*1
            = 'N':  No transpose, apply Q;
            = 'T':  Transpose, apply Q**T.

    M       (input) INTEGER
            The number of rows of the matrix C. M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix C. N >= 0.

    K       (input) INTEGER
            The number of elementary reflectors whose product defines
            the matrix Q.
            If SIDE = 'L', M >= K >= 0;
            if SIDE = 'R', N >= K >= 0.

    A       (input) DOUBLE_PRECISION array, dimension (LDA,K)
            The i-th column must contain the vector which defines the
            elementary reflector H(i), for i = 1,2,...,k, as returned by
            DGEQRF in the first k columns of its array argument A.

    LDA     (input) INTEGER
            The leading dimension of the array A.
            If SIDE = 'L', LDA >= max(1,M);
            if SIDE = 'R', LDA >= max(1,N).

    TAU     (input) DOUBLE_PRECISION array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by DGEQRF.

    C       (input/output) DOUBLE_PRECISION array, dimension (LDC,N)
            On entry, the M-by-N matrix C.
            On exit, C is overwritten by Q*C or Q**T*C or C*Q**T or C*Q.

    LDC     (input) INTEGER
            The leading dimension of the array C. LDC >= max(1,M).

    WORK    (workspace/output) DOUBLE_PRECISION array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK(0) returns the optimal LWORK.

    LWORK   (input) INTEGER
            The dimension of the array WORK.
            If SIDE = 'L', LWORK >= max(1,N);
            if SIDE = 'R', LWORK >= max(1,M).
            For optimum performance LWORK >= N*NB if SIDE = 'L', and
            LWORK >= M*NB if SIDE = 'R', where NB is the optimal
            blocksize.

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
    =====================================================================   */
    double c_one = MAGMA_D_ONE;

    char side_[2] = {side, 0};
    char trans_[2] = {trans, 0};

    magma_int_t nb = 128;
    double *t ;
    magma_dmalloc_pinned (&t, nb*nb);
    //printf("calling dormqr_m with nb=%d\n", (int) nb);

    double* dw[MagmaMaxGPUs];
    magma_queue_t stream [MagmaMaxGPUs][2];
    magma_event_t  event [MagmaMaxGPUs][2];

    magma_int_t ind_c;

    magma_int_t igpu = 0;
    int gpu_b;
    magma_getdevice(&gpu_b);

    *info = 0;

    magma_int_t left = lapackf77_lsame(side_, "L");
    magma_int_t notran = lapackf77_lsame(trans_, "N");
    magma_int_t lquery = (lwork == -1);

    /* NQ is the order of Q and NW is the minimum dimension of WORK */
    magma_int_t nq, nw;
    if (left) {
        nq = m;
        nw = n;
    } else {
        nq = n;
        nw = m;
    }


    if (! left && ! lapackf77_lsame(side_, "R")) {
        *info = -1;
    } else if (! notran && ! lapackf77_lsame(trans_, "T")) {
        *info = -2;
    } else if (m < 0) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (k < 0 || k > nq) {
        *info = -5;
    } else if (lda < max(1,nq)) {
        *info = -7;
    } else if (ldc < max(1,m)) {
        *info = -10;
    } else if (lwork < max(1,nw) && ! lquery) {
        *info = -12;
    }

    magma_int_t lwkopt = max(1,nw) * nb;
    if (*info == 0)
    {
        work[0] = MAGMA_D_MAKE( lwkopt, 0 );
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0 || k == 0) {
        work[0] = c_one;
        return *info;
    }

    if (nb >= k)
    {
        /* Use CPU code */
        lapackf77_dormqr(side_, trans_, &m, &n, &k, a, &lda, tau,
                         c, &ldc, work, &lwork, info);
        return *info;
    }

    magma_int_t lddc = (m+63)/64*64;
    magma_int_t lddac = nq;
    magma_int_t lddar = nb;
    magma_int_t lddwork = nw;

    magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 };

    magma_int_t nb_l=256;
    magma_int_t nbl = (n-1)/nb_l+1; // number of blocks
    magma_int_t maxnlocal = (nbl+nrgpu-1)/nrgpu*nb_l;

    nrgpu = min(nrgpu, (n+nb_l-1)/nb_l); // Don't use GPU that will not have data.

    magma_int_t ldw = maxnlocal*lddc // dC
    + 2*lddac*lddar // 2*dA
    + 2*(nb + 1 + lddwork)*nb; // 2*(dT and dwork)

    for (igpu = 0; igpu < nrgpu; ++igpu){
        magma_setdevice(igpu);
        if (MAGMA_SUCCESS != magma_dmalloc( &dw[igpu], ldw)) {

            magma_xerbla( __func__, -(*info) );
            *info = MAGMA_ERR_DEVICE_ALLOC;

            return *info;
        }
        magma_queue_create( &stream[igpu][0] );
        magma_queue_create( &stream[igpu][1] );
        magma_event_create( &event[igpu][0] );
        magma_event_create( &event[igpu][1] );
    }

    /* Use hybrid CPU-MGPU code */
    if (left) {

        //copy C to mgpus
        for (magma_int_t i = 0; i < nbl; ++i){
            magma_int_t igpu = i%nrgpu;
            magma_setdevice(igpu);
            magma_int_t kb = min(nb_l, n-i*nb_l);
            magma_dsetmatrix_async( m, kb,
                                   C(0, i*nb_l), ldc,
                                   dC(igpu, 0, i/nrgpu*nb_l), lddc, stream[igpu][0] );
            nlocal[igpu] += kb;
        }

        magma_int_t i1, i2, i3;
        if ( !notran ) {
            i1 = 0;
            i2 = k;
            i3 = nb;
        } else {
            i1 = (k - 1) / nb * nb;
            i2 = 0;
            i3 = -nb;
        }

        ind_c = 0;

        for (magma_int_t i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3)
        {
            // start the copy of A panel
            magma_int_t kb = min(nb, k - i);
            for (igpu = 0; igpu < nrgpu; ++igpu){
                magma_setdevice(igpu);
                magma_event_sync(event[igpu][ind_c]); // check if the new data can be copied
                magma_dsetmatrix_async(nq-i, kb,
                                       A(i, i),                 lda,
                                       dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0] );
                // Put 0s in the upper triangular part of dA;
                magmablas_dsetdiag1subdiag0_stream('L', kb, kb, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0]);
            }

            /* Form the triangular factor of the block reflector
             H = H(i) H(i+1) . . . H(i+ib-1) */
            magma_int_t nqi = nq - i;
            lapackf77_dlarft("F", "C", &nqi, &kb, A(i, i), &lda,
                             &tau[i], t, &kb);

            /* H or H' is applied to C(1:m,i:n) */

            /* Apply H or H'; First copy T to the GPU */
            for (igpu = 0; igpu < nrgpu; ++igpu){
                magma_setdevice(igpu);
                magma_dsetmatrix_async(kb, kb,
                                       t,               kb,
                                       dt(igpu, ind_c), kb, stream[igpu][0] );
            }

            for (igpu = 0; igpu < nrgpu; ++igpu){
                magma_setdevice(igpu);
                magma_queue_sync( stream[igpu][0] ); // check if the data was copied
                magmablasSetKernelStream(stream[igpu][1]);
                magma_dlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
                                 m-i, nlocal[igpu], kb,
                                 dA_c(igpu, ind_c, i, 0), lddac, dt(igpu, ind_c), kb,
                                 dC(igpu, i, 0), lddc,
                                 dwork(igpu, ind_c), lddwork);
                magma_event_record(event[igpu][ind_c], stream[igpu][1] );
            }

            ind_c = (ind_c+1)%2;
        }

        for (igpu = 0; igpu < nrgpu; ++igpu){
            magma_setdevice(igpu);
            magma_queue_sync( stream[igpu][1] );
        }

        //copy C from mgpus
        for (magma_int_t i = 0; i < nbl; ++i){
            magma_int_t igpu = i%nrgpu;
            magma_setdevice(igpu);
            magma_int_t kb = min(nb_l, n-i*nb_l);
            magma_dgetmatrix( m, kb,
                              dC(igpu, 0, i/nrgpu*nb_l), lddc,
                              C(0, i*nb_l), ldc );
//            magma_dgetmatrix_async( m, kb,
//                                   dC(igpu, 0, i/nrgpu*nb_l), lddc,
//                                   C(0, i*nb_l), ldc, stream[igpu][0] );
        }

    } else {

        fprintf(stderr, "The case (side == right) is not implemented\n");
        magma_xerbla( __func__, 1 );
        return *info;
        /*
         if ( notran ) {
         i1 = 0;
         i2 = k;
         i3 = nb;
         } else {
         i1 = (k - 1) / nb * nb;
         i2 = 0;
         i3 = -nb;
         }

         mi = m;
         ic = 0;

         for (i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3)
         {
         ib = min(nb, k - i);

         // Form the triangular factor of the block reflector
         // H = H(i) H(i+1) . . . H(i+ib-1)
         i__4 = nq - i;
         lapackf77_dlarft("F", "C", &i__4, &ib, A(i, i), &lda,
         &tau[i], t, &ib);

         // 1) copy the panel from A to the GPU, and
         // 2) Put 0s in the upper triangular part of dA;
         magma_dsetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda );
         magmablas_dsetdiag1subdiag0('L', ib, ib, dA(i, 0), ldda);


         // H or H' is applied to C(1:m,i:n)
         ni = n - i;
         jc = i;

         // Apply H or H'; First copy T to the GPU
         magma_dsetmatrix( ib, ib, t, ib, dt, ib );
         magma_dlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
         mi, ni, ib,
         dA(i, 0), ldda, dt, ib,
         dC(ic, jc), lddc,
         dwork, lddwork);
         }
         */
    }

    work[0] = MAGMA_D_MAKE( lwkopt, 0 );

    for (igpu = 0; igpu < nrgpu; ++igpu){
        magma_setdevice(igpu);
        magmablasSetKernelStream(NULL);
        magma_event_destroy( event[igpu][0] );
        magma_event_destroy( event[igpu][1] );
        magma_queue_destroy( stream[igpu][0] );
        magma_queue_destroy( stream[igpu][1] );
        magma_free( dw[igpu] );
    }

    magma_setdevice(gpu_b);

    return *info;
} /* magma_dormqr */
Beispiel #9
0
/**
    Purpose
    -------
    SGEQRF computes a QR factorization of a REAL M-by-N matrix A:
    A = Q * R. This version does not require work space on the GPU
    passed as input. GPU memory is allocated in the routine.

    This uses 2 queues to overlap communication and computation.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    A       REAL array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further
            Details).
    \n
            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    @param[out]
    tau     REAL array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    @param[out]
    work    (workspace) REAL array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.
    \n
            Higher performance is achieved if WORK is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.  LWORK >= max( N*NB, 2*NB*NB ),
            where NB can be obtained through magma_get_sgeqrf_nb( M, N ).
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.

    Further Details
    ---------------
    The matrix Q is represented as a product of elementary reflectors

        Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

        H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).

    @ingroup magma_sgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_sgeqrf(
    magma_int_t m, magma_int_t n,
    float *A,    magma_int_t lda,
    float *tau,
    float *work, magma_int_t lwork,
    magma_int_t *info )
{
    #define  A(i_,j_)  (A + (i_) + (j_)*lda)
    
    #ifdef HAVE_clBLAS
    #define dA(i_,j_)  dA,    ((i_) + (j_)*ldda + dA_offset)
    #define dT(i_,j_)  dT,    ((i_) + (j_)*nb   + dT_offset)
    #define dwork(i_)  dwork, ((i_)             + dwork_offset)
    #else
    #define dA(i_,j_) (dA    + (i_) + (j_)*ldda)
    #define dT(i_,j_) (dT    + (i_) + (j_)*nb)
    #define dwork(i_) (dwork + (i_))
    #endif
    
    /* Constants */
    const float c_one = MAGMA_S_ONE;
    
    /* Local variables */
    magmaFloat_ptr dA, dT, dwork;
    magma_int_t i, ib, min_mn, ldda, lddwork, old_i, old_ib;
    
    /* Function Body */
    *info = 0;
    magma_int_t nb = magma_get_sgeqrf_nb( m, n );
    
    // need 2*nb*nb to store T and upper triangle of V simultaneously
    magma_int_t lwkopt = max( n*nb, 2*nb*nb );
    work[0] = magma_smake_lwork( lwkopt );
    bool lquery = (lwork == -1);
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,m)) {
        *info = -4;
    } else if (lwork < max(1, lwkopt) && ! lquery) {
        *info = -7;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;
    
    min_mn = min( m, n );
    if (min_mn == 0) {
        work[0] = c_one;
        return *info;
    }
    
    // largest N for larfb is n-nb (trailing matrix lacks 1st panel)
    lddwork = magma_roundup( n, 32 ) - nb;
    ldda    = magma_roundup( m, 32 );
    
    magma_int_t ngpu = magma_num_gpus();
    if ( ngpu > 1 ) {
        /* call multiple-GPU interface  */
        return magma_sgeqrf_m( ngpu, m, n, A, lda, tau, work, lwork, info );
    }
    
    // allocate space for dA, dwork, and dT
    if (MAGMA_SUCCESS != magma_smalloc( &dA, n*ldda + nb*lddwork + nb*nb )) {
        /* alloc failed so call non-GPU-resident version */
        return magma_sgeqrf_ooc( m, n, A, lda, tau, work, lwork, info );
    }
    
    dwork = dA + n*ldda;
    dT    = dA + n*ldda + nb*lddwork;
    
    magma_queue_t queues[2];
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queues[0] );
    magma_queue_create( cdev, &queues[1] );
    
    if ( (nb > 1) && (nb < min_mn) ) {
        /* Use blocked code initially.
           Asynchronously send the matrix to the GPU except the first panel. */
        magma_ssetmatrix_async( m, n-nb,
                                 A(0,nb), lda,
                                dA(0,nb), ldda, queues[0] );
        
        old_i = 0;
        old_ib = nb;
        for (i = 0; i < min_mn-nb; i += nb) {
            ib = min( min_mn-i, nb );
            if (i > 0) {
                /* get i-th panel from device */
                magma_queue_sync( queues[1] );
                magma_sgetmatrix_async( m-i, ib,
                                        dA(i,i), ldda,
                                         A(i,i), lda, queues[0] );
                
                /* Apply H' to A(i:m,i+2*ib:n) from the left */
                magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                  m-old_i, n-old_i-2*old_ib, old_ib,
                                  dA(old_i, old_i),          ldda, dT(0,0),  nb,
                                  dA(old_i, old_i+2*old_ib), ldda, dwork(0), lddwork, queues[1] );
                
                magma_sgetmatrix_async( i, ib,
                                        dA(0,i), ldda,
                                         A(0,i), lda, queues[1] );
                magma_queue_sync( queues[0] );
            }
            
            magma_int_t rows = m-i;
            lapackf77_sgeqrf( &rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info );
            
            /* Form the triangular factor of the block reflector
               H = H(i) H(i+1) . . . H(i+ib-1) */
            lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr,
                              &rows, &ib, A(i,i), &lda, tau+i, work, &ib );
            
            magma_spanel_to_q( MagmaUpper, ib, A(i,i), lda, work+ib*ib );
            
            /* put i-th V matrix onto device */
            magma_ssetmatrix_async( rows, ib, A(i,i), lda, dA(i,i), ldda, queues[0] );
            
            /* put T matrix onto device */
            magma_queue_sync( queues[1] );
            magma_ssetmatrix_async( ib, ib, work, ib, dT(0,0), nb, queues[0] );
            magma_queue_sync( queues[0] );
            
            if (i + ib < n) {
                if (i+ib < min_mn-nb) {
                    /* Apply H' to A(i:m,i+ib:i+2*ib) from the left (look-ahead) */
                    magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                      rows, ib, ib,
                                      dA(i, i   ), ldda, dT(0,0),  nb,
                                      dA(i, i+ib), ldda, dwork(0), lddwork, queues[1] );
                    magma_sq_to_panel( MagmaUpper, ib, A(i,i), lda, work+ib*ib );
                }
                else {
                    /* After last panel, update whole trailing matrix. */
                    /* Apply H' to A(i:m,i+ib:n) from the left */
                    magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                      rows, n-i-ib, ib,
                                      dA(i, i   ), ldda, dT(0,0),  nb,
                                      dA(i, i+ib), ldda, dwork(0), lddwork, queues[1] );
                    magma_sq_to_panel( MagmaUpper, ib, A(i,i), lda, work+ib*ib );
                }
                
                old_i  = i;
                old_ib = ib;
            }
        }
    } else {
        i = 0;
    }
    
    /* Use unblocked code to factor the last or only block. */
    if (i < min_mn) {
        ib = n-i;
        if (i != 0) {
            magma_sgetmatrix( m, ib, dA(0,i), ldda, A(0,i), lda, queues[1] );
        }
        magma_int_t rows = m-i;
        lapackf77_sgeqrf( &rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info );
    }
    
    magma_queue_destroy( queues[0] );
    magma_queue_destroy( queues[1] );
    
    magma_free( dA );
    
    return *info;
} /* magma_sgeqrf */
Beispiel #10
0
/**
    Purpose
    -------
    CGEQRF3 computes a QR factorization of a complex M-by-N matrix A:
    A = Q * R.
    
    This version stores the triangular dT matrices used in
    the block QR factorization so that they can be applied directly (i.e.,
    without being recomputed) later. As a result, the application
    of Q is much faster. Also, the upper triangular matrices for V have 0s
    in them. The corresponding parts of the upper triangular R are
    stored separately in dT.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    dA      COMPLEX array on the GPU, dimension (LDDA,N)
            On entry, the M-by-N matrix A.
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further
            Details).

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).
            To benefit from coalescent memory accesses LDDA must be
            divisible by 16.

    @param[out]
    tau     COMPLEX array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    @param[out]
    dT      (workspace) COMPLEX array on the GPU,
            dimension (2*MIN(M, N) + ceil(N/32)*32 )*NB,
            where NB can be obtained through magma_get_cgeqrf_nb( M, N ).
            It starts with a MIN(M,N)*NB block that stores the triangular T
            matrices, followed by a MIN(M,N)*NB block that stores
            the diagonal blocks of the R matrix.
            The rest of the array is used as workspace.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.

    Further Details
    ---------------
    The matrix Q is represented as a product of elementary reflectors

        Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

        H(i) = I - tau * v * v^H

    where tau is a complex scalar, and v is a complex vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).

    @ingroup magma_cgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_cgeqrf3_gpu(
    magma_int_t m, magma_int_t n,
    magmaFloatComplex_ptr dA, magma_int_t ldda,
    magmaFloatComplex *tau,
    magmaFloatComplex_ptr dT,
    magma_int_t *info )
{
    #ifdef HAVE_clBLAS
    #define dA(i_, j_)  dA, (dA_offset + (i_) + (j_)*(ldda))
    #define dT(i_)      dT, (dT_offset + (i_)*nb)
    #define dR(i_)      dT, (dT_offset + (  minmn + (i_))*nb)
    #define dwork(i_)   dT, (dT_offset + (2*minmn + (i_))*nb)
    #else
    #define dA(i_, j_) (dA + (i_) + (j_)*(ldda))
    #define dT(i_)     (dT + (i_)*nb)
    #define dR(i_)     (dT + (  minmn + (i_))*nb)
    #define dwork(i_)  (dT + (2*minmn + (i_))*nb)
    #endif
    
    magmaFloatComplex *work, *hwork, *R;
    magma_int_t cols, i, ib, ldwork, lddwork, lhwork, lwork, minmn, nb, old_i, old_ib, rows;
    
    // check arguments
    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    
    minmn = min( m, n );
    if (minmn == 0)
        return *info;
    
    // TODO: use min(m,n), but that affects dT
    nb = magma_get_cgeqrf_nb( m, n );
    
    // dT contains 3 blocks:
    // dT    is minmn*nb
    // dR    is minmn*nb
    // dwork is n*nb
    lddwork = n;
    
    // work  is m*nb for panel
    // hwork is n*nb, and at least nb*nb for T in larft
    // R     is nb*nb
    ldwork = m;
    lhwork = max( n*nb, nb*nb );
    lwork  = ldwork*nb + lhwork + nb*nb;
    // last block needs rows*cols for matrix and prefers cols*nb for work
    // worst case is n > m*nb, m a small multiple of nb:
    // needs n*nb + n > (m+n)*nb
    // prefers 2*n*nb, about twice above (m+n)*nb.
    i = ((minmn-1)/nb)*nb;
    lwork = max( lwork, (m-i)*(n-i) + (n-i)*nb );
    
    if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, lwork )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }
    hwork = work + ldwork*nb;
    R     = work + ldwork*nb + lhwork;
    memset( R, 0, nb*nb*sizeof(magmaFloatComplex) );
    
    magma_queue_t queues[2];
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queues[0] );
    magma_queue_create( cdev, &queues[1] );
        
    if ( nb > 1 && nb < minmn ) {
        // need nb*nb for T in larft
        assert( lhwork >= nb*nb );
        
        // Use blocked code initially
        old_i = 0; old_ib = nb;
        for (i = 0; i < minmn-nb; i += nb) {
            ib = min( minmn-i, nb );
            rows = m - i;
            
            // get i-th panel from device
            magma_cgetmatrix_async( rows, ib,
                                    dA(i,i), ldda,
                                    work,    ldwork, queues[1] );
            if (i > 0) {
                // Apply H^H to A(i:m,i+2*ib:n) from the left
                cols = n - old_i - 2*old_ib;
                magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                  m-old_i, cols, old_ib,
                                  dA(old_i, old_i         ), ldda, dT(old_i), nb,
                                  dA(old_i, old_i+2*old_ib), ldda, dwork(0),  lddwork, queues[0] );
                
                // Fix the diagonal block
                magma_csetmatrix_async( old_ib, old_ib,
                                        R,         old_ib,
                                        dR(old_i), old_ib, queues[0] );
            }
            
            magma_queue_sync( queues[1] );  // wait to get work(i)
            lapackf77_cgeqrf( &rows, &ib, work, &ldwork, &tau[i], hwork, &lhwork, info );
            // Form the triangular factor of the block reflector in hwork
            // H = H(i) H(i+1) . . . H(i+ib-1)
            lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr,
                              &rows, &ib,
                              work, &ldwork, &tau[i], hwork, &ib );
            
            // wait for previous trailing matrix update (above) to finish with R
            magma_queue_sync( queues[0] );
            
            // copy the upper triangle of panel to R and invert it, and
            // set  the upper triangle of panel (V) to identity
            csplit_diag_block( ib, work, ldwork, R );
            
            // send i-th V matrix to device
            magma_csetmatrix( rows, ib,
                              work, ldwork,
                              dA(i,i), ldda, queues[1] );
            
            if (i + ib < n) {
                // send T matrix to device
                magma_csetmatrix( ib, ib,
                                  hwork, ib,
                                  dT(i), nb, queues[1] );
                
                if (i+nb < minmn-nb) {
                    // Apply H^H to A(i:m,i+ib:i+2*ib) from the left
                    magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                      rows, ib, ib,
                                      dA(i, i   ), ldda, dT(i),  nb,
                                      dA(i, i+ib), ldda, dwork(0), lddwork, queues[1] );
                    // wait for larfb to finish with dwork before larfb in next iteration starts
                    magma_queue_sync( queues[1] );
                }
                else {
                    // Apply H^H to A(i:m,i+ib:n) from the left
                    magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                      rows, n-i-ib, ib,
                                      dA(i, i   ), ldda, dT(i),  nb,
                                      dA(i, i+ib), ldda, dwork(0), lddwork, queues[1] );
                    // Fix the diagonal block
                    magma_csetmatrix( ib, ib,
                                      R,     ib,
                                      dR(i), ib, queues[1] );
                }
                old_i  = i;
                old_ib = ib;
            }
        }
    } else {
        i = 0;
    }
    
    // Use unblocked code to factor the last or only block.
    if (i < minmn) {
        rows = m-i;
        cols = n-i;
        magma_cgetmatrix( rows, cols, dA(i, i), ldda, work, rows, queues[1] );
        // see comments for lwork above
        lhwork = lwork - rows*cols;
        lapackf77_cgeqrf( &rows, &cols, work, &rows, &tau[i], &work[rows*cols], &lhwork, info );
        magma_csetmatrix( rows, cols, work, rows, dA(i, i), ldda, queues[1] );
    }
    
    magma_queue_destroy( queues[0] );
    magma_queue_destroy( queues[1] );
    
    magma_free_pinned( work );
    
    return *info;
} // magma_cgeqrf_gpu
Beispiel #11
0
/**
    Purpose
    -------
    SGEQLF computes a QL factorization of a REAL M-by-N matrix A:
    A = Q * L.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    A       REAL array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, if m >= n, the lower triangle of the subarray
            A(m-n+1:m,1:n) contains the N-by-N lower triangular matrix L;
            if m <= n, the elements on and below the (n-m)-th
            superdiagonal contain the M-by-N lower trapezoidal matrix L;
            the remaining elements, with the array TAU, represent the
            orthogonal matrix Q as a product of elementary reflectors
            (see Further Details).
    \n
            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    @param[out]
    tau     REAL array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    @param[out]
    work    (workspace) REAL array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.
    \n
            Higher performance is achieved if WORK is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.  LWORK >= max(1,N,2*NB^2).
            For optimum performance LWORK >= max(N*NB, 2*NB^2) where NB can be obtained
            through magma_get_sgeqlf_nb( M, N ).
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.

    Further Details
    ---------------
    The matrix Q is represented as a product of elementary reflectors

        Q = H(k) . . . H(2) H(1), where k = min(m,n).

    Each H(i) has the form

        H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(m-k+i+1:m) = 0 and v(m-k+i) = 1; v(1:m-k+i-1) is stored on exit in
    A(1:m-k+i-1,n-k+i), and tau in TAU(i).

    @ingroup magma_sgeqlf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_sgeqlf(
    magma_int_t m, magma_int_t n,
    float *A,    magma_int_t lda, float *tau,
    float *work, magma_int_t lwork,
    magma_int_t *info)
{
    #define  A(i_,j_) ( A + (i_) + (j_)*lda)
    #define dA(i_,j_) (dA + (i_) + (j_)*ldda)
    #define dwork(i_) (dwork + (i_))

    /* Constants */
    const float c_one = MAGMA_S_ONE;
    
    /* Local variables */
    magmaFloat_ptr dA, dwork;
    magma_int_t i, minmn, lddwork, old_i, old_ib, nb;
    magma_int_t rows, cols;
    magma_int_t ib, ki, kk, mu, nu, iinfo, ldda;

    nb = magma_get_sgeqlf_nb( m, n );
    *info = 0;
    bool lquery = (lwork == -1);

    // silence "uninitialized" warnings
    old_ib = nb;
    old_i  = 0;
    
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,m)) {
        *info = -4;
    }

    minmn = min(m,n);
    if (*info == 0) {
        if (minmn == 0) {
            work[0] = c_one;
        }
        else {
            work[0] = magma_smake_lwork( max(n*nb, 2*nb*nb) );
        }

        if (lwork < max(max(1,n), 2*nb*nb) && ! lquery)
            *info = -7;
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;

    /* Quick return if possible */
    if (minmn == 0)
        return *info;

    lddwork = magma_roundup( n, 32 );
    ldda    = magma_roundup( m, 32 );

    if (MAGMA_SUCCESS != magma_smalloc( &dA, n*ldda + nb*lddwork )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
    dwork = dA + ldda*n;

    magma_queue_t queues[2];
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queues[0] );
    magma_queue_create( cdev, &queues[1] );

    if ( (nb > 1) && (nb < minmn) ) {
        /*  Use blocked code initially.
            The last kk columns are handled by the block method.
            First, copy the matrix on the GPU except the last kk columns */
        magma_ssetmatrix_async( m, n-nb,
                                A(0, 0),  lda,
                                dA(0, 0), ldda, queues[0] );

        ki = ((minmn - nb - 1) / nb) * nb;
        kk = min( minmn, ki + nb );
        for (i = minmn - kk + ki; i >= minmn - kk; i -= nb) {
            ib = min( minmn-i, nb );

            if (i < minmn - kk + ki) {
                // 1. Copy asynchronously the current panel to the CPU.
                // 2. Copy asynchronously the submatrix below the panel to the CPU
                rows = m - minmn + i + ib;
                magma_sgetmatrix_async( rows, ib,
                                        dA(0, n-minmn+i), ldda,
                                        A(0, n-minmn+i),  lda, queues[1] );

                magma_sgetmatrix_async( m-rows, ib,
                                        dA(rows, n-minmn+i), ldda,
                                        A(rows, n-minmn+i),  lda, queues[0] );

                /* Apply H^H to A(1:m-minmn+i+ib-1,1:n-minmn+i-1) from the left in
                   two steps - implementing the lookahead techniques.
                   This is the main update from the lookahead techniques. */
                rows = m - minmn + old_i + old_ib;
                cols = n - minmn + old_i - old_ib;
                magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaBackward, MagmaColumnwise,
                                  rows, cols, old_ib,
                                  dA(0, cols+old_ib), ldda, dwork(0),      lddwork,
                                  dA(0, 0          ), ldda, dwork(old_ib), lddwork, queues[0] );
            }

            magma_queue_sync( queues[1] );  // wait for panel
            /* Compute the QL factorization of the current block
               A(1:m-minmn+i+ib-1,n-minmn+i:n-minmn+i+ib-1) */
            rows = m - minmn + i + ib;
            cols = n - minmn + i;
            lapackf77_sgeqlf( &rows, &ib, A(0,cols), &lda, tau+i, work, &lwork, &iinfo );

            if (cols > 0) {
                /* Form the triangular factor of the block reflector
                   H = H(i+ib-1) . . . H(i+1) H(i) */
                lapackf77_slarft( MagmaBackwardStr, MagmaColumnwiseStr,
                                  &rows, &ib,
                                  A(0, cols), &lda, tau + i, work, &ib );

                magma_spanel_to_q( MagmaLower, ib, A(rows-ib,cols), lda, work+ib*ib );
                magma_ssetmatrix( rows, ib,
                                  A(0,cols),  lda,
                                  dA(0,cols), ldda, queues[1] );
                magma_sq_to_panel( MagmaLower, ib, A(rows-ib,cols), lda, work+ib*ib );

                // wait for main update (above) to finish with dwork
                magma_queue_sync( queues[0] );
                
                // Send the triangular part to the GPU
                magma_ssetmatrix( ib, ib, work, ib, dwork(0), lddwork, queues[1] );

                /* Apply H^H to A(1:m-minmn+i+ib-1,1:n-minmn+i-1) from the left in
                   two steps - implementing the lookahead techniques.
                   This is the update of first ib columns.                 */
                if (i-ib >= minmn - kk) {
                    magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaBackward, MagmaColumnwise,
                                      rows, ib, ib,
                                      dA(0, cols),   ldda, dwork(0),  lddwork,
                                      dA(0,cols-ib), ldda, dwork(ib), lddwork, queues[1] );
                    // wait for larfb to finish with dwork before larfb in next iteration starts
                    magma_queue_sync( queues[1] );
                }
                else {
                    magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaBackward, MagmaColumnwise,
                                      rows, cols, ib,
                                      dA(0, cols), ldda, dwork(0),  lddwork,
                                      dA(0, 0   ), ldda, dwork(ib), lddwork, queues[1] );
                }

                old_i  = i;
                old_ib = ib;
            }
        }
        mu = m - minmn + i + nb;
        nu = n - minmn + i + nb;

        magma_sgetmatrix( m, nu, dA(0,0), ldda, A(0,0), lda, queues[1] );
    } else {
        mu = m;
        nu = n;
    }

    /* Use unblocked code to factor the last or only block */
    if (mu > 0 && nu > 0) {
        lapackf77_sgeqlf( &mu, &nu, A(0,0), &lda, tau, work, &lwork, &iinfo );
    }

    magma_queue_destroy( queues[0] );
    magma_queue_destroy( queues[1] );
    magma_free( dA );
    
    return *info;
} /* magma_sgeqlf */
Beispiel #12
0
/**
    Purpose
    -------
    ZUNMQR overwrites the general complex M-by-N matrix C with

    @verbatim
                                SIDE = MagmaLeft    SIDE = MagmaRight
    TRANS = MagmaNoTrans:       Q * C               C * Q
    TRANS = Magma_ConjTrans:    Q**H * C            C * Q**H
    @endverbatim

    where Q is a complex unitary matrix defined as the product of k
    elementary reflectors

          Q = H(1) H(2) . . . H(k)

    as returned by ZGEQRF. Q is of order M if SIDE = MagmaLeft and of order N
    if SIDE = MagmaRight.

    Arguments
    ---------
    @param[in]
    ngpu    INTEGER
            Number of GPUs to use. ngpu > 0.

    @param[in]
    side    magma_side_t
      -     = MagmaLeft:      apply Q or Q**H from the Left;
      -     = MagmaRight:     apply Q or Q**H from the Right.

    @param[in]
    trans   magma_trans_t
      -     = MagmaNoTrans:    No transpose, apply Q;
      -     = Magma_ConjTrans: Conjugate transpose, apply Q**H.

    @param[in]
    m       INTEGER
            The number of rows of the matrix C. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix C. N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines
            the matrix Q.
            If SIDE = MagmaLeft,  M >= K >= 0;
            if SIDE = MagmaRight, N >= K >= 0.

    @param[in]
    A       COMPLEX_16 array, dimension (LDA,K)
            The i-th column must contain the vector which defines the
            elementary reflector H(i), for i = 1,2,...,k, as returned by
            ZGEQRF in the first k columns of its array argument A.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.
            If SIDE = MagmaLeft,  LDA >= max(1,M);
            if SIDE = MagmaRight, LDA >= max(1,N).

    @param[in]
    tau     COMPLEX_16 array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by ZGEQRF.

    @param[in,out]
    C       COMPLEX_16 array, dimension (LDC,N)
            On entry, the M-by-N matrix C.
            On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q.

    @param[in]
    ldc     INTEGER
            The leading dimension of the array C. LDC >= max(1,M).

    @param[out]
    work    (workspace) COMPLEX_16 array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.
            If SIDE = MagmaLeft,  LWORK >= max(1,N);
            if SIDE = MagmaRight, LWORK >= max(1,M).
            For optimum performance LWORK >= N*NB if SIDE = MagmaLeft, and
            LWORK >= M*NB if SIDE = MagmaRight, where NB is the optimal
            blocksize.
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_zgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zunmqr_m(
    magma_int_t ngpu,
    magma_side_t side, magma_trans_t trans,
    magma_int_t m, magma_int_t n, magma_int_t k,
    magmaDoubleComplex *A,    magma_int_t lda,
    magmaDoubleComplex *tau,
    magmaDoubleComplex *C,    magma_int_t ldc,
    magmaDoubleComplex *work, magma_int_t lwork,
    magma_int_t *info)
{
#define  A(i, j) (A + (j)*lda  + (i))
#define  C(i, j) (C + (j)*ldc  + (i))

#define    dC(gpui,      i, j) (dw[gpui] + (j)*lddc + (i))
#define  dA_c(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddac)
#define  dA_r(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddar)
#define    dT(gpui, ind)       (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + (ind)*((nb+1)*nb))
#define dwork(gpui, ind)       (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + 2*((nb+1)*nb) + (ind)*(lddwork*nb))

    /* Constants */
    magmaDoubleComplex c_zero = MAGMA_Z_ZERO;
    magmaDoubleComplex c_one  = MAGMA_Z_ONE;

    /* Local variables */
    const char* side_  = lapack_side_const( side );
    const char* trans_ = lapack_trans_const( trans );

    magma_int_t nb = 128;
    magmaDoubleComplex *T = NULL;
    magmaDoubleComplex_ptr dw[MagmaMaxGPUs] = { NULL };
    magma_queue_t queues[MagmaMaxGPUs][2] = {{ NULL }};
    magma_event_t events[MagmaMaxGPUs][2] = {{ NULL }};

    magma_int_t ind_c;
    magma_device_t dev;
    
    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );

    *info = 0;

    magma_int_t left   = (side == MagmaLeft);
    magma_int_t notran = (trans == MagmaNoTrans);
    magma_int_t lquery = (lwork == -1);

    /* NQ is the order of Q and NW is the minimum dimension of WORK */
    magma_int_t nq, nw;
    if (left) {
        nq = m;
        nw = n;
    } else {
        nq = n;
        nw = m;
    }

    if (! left && side != MagmaRight) {
        *info = -1;
    } else if (! notran && trans != Magma_ConjTrans) {
        *info = -2;
    } else if (m < 0) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (k < 0 || k > nq) {
        *info = -5;
    } else if (lda < max(1,nq)) {
        *info = -7;
    } else if (ldc < max(1,m)) {
        *info = -10;
    } else if (lwork < max(1,nw) && ! lquery) {
        *info = -12;
    }

    magma_int_t lwkopt = max(1,nw) * nb;
    if (*info == 0) {
        work[0] = magma_zmake_lwork( lwkopt );
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0 || k == 0) {
        work[0] = c_one;
        return *info;
    }

    if (nb >= k) {
        /* Use CPU code */
        lapackf77_zunmqr(side_, trans_, &m, &n, &k, A, &lda, tau,
                         C, &ldc, work, &lwork, info);
        return *info;
    }

    magma_int_t lddc = magma_roundup( m, 64 );  // TODO why 64 instead of 32 ?
    magma_int_t lddac = nq;
    magma_int_t lddar = nb;
    magma_int_t lddwork = nw;

    magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 };

    magma_int_t nb_l=256;
    magma_int_t nbl = magma_ceildiv( n, nb_l ); // number of blocks
    magma_int_t maxnlocal = magma_ceildiv( nbl, ngpu )*nb_l;

    ngpu = min( ngpu, magma_ceildiv( n, nb_l )); // Don't use GPU that will not have data.

    magma_int_t ldw = maxnlocal*lddc // dC
                    + 2*lddac*lddar // 2*dA
                    + 2*(nb + 1 + lddwork)*nb; // 2*(dT and dwork)

    if (MAGMA_SUCCESS != magma_zmalloc_pinned( &T, nb*nb )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        goto cleanup;
    }
    for (dev = 0; dev < ngpu; ++dev) {
        magma_setdevice( dev );
        if (MAGMA_SUCCESS != magma_zmalloc( &dw[dev], ldw )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            goto cleanup;
        }
        magma_queue_create( dev, &queues[dev][0] );
        magma_queue_create( dev, &queues[dev][1] );
        magma_event_create( &events[dev][0] );
        magma_event_create( &events[dev][1] );
    }

    /* Use hybrid CPU-MGPU code */
    if (left) {
        //copy C to mgpus
        for (magma_int_t i = 0; i < nbl; ++i) {
            dev = i % ngpu;
            magma_setdevice( dev );
            magma_int_t kb = min(nb_l, n-i*nb_l);
            magma_zsetmatrix_async( m, kb,
                                   C(0, i*nb_l), ldc,
                                   dC(dev, 0, i/ngpu*nb_l), lddc, queues[dev][0] );
            nlocal[dev] += kb;
        }

        magma_int_t i1, i2, i3;
        if ( !notran ) {
            i1 = 0;
            i2 = k;
            i3 = nb;
        } else {
            i1 = (k - 1) / nb * nb;
            i2 = 0;
            i3 = -nb;
        }

        ind_c = 0;

        for (magma_int_t i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) {
            // start the copy of A panel
            magma_int_t kb = min(nb, k - i);
            for (dev = 0; dev < ngpu; ++dev) {
                magma_setdevice( dev );
                magma_event_sync( events[dev][ind_c] ); // check if the new data can be copied
                magma_zsetmatrix_async(nq-i, kb,
                                       A(i, i),                 lda,
                                       dA_c(dev, ind_c, i, 0), lddac, queues[dev][0] );
                // set upper triangular part of dA to identity
                magmablas_zlaset_band( MagmaUpper, kb, kb, kb, c_zero, c_one, dA_c(dev, ind_c, i, 0), lddac, queues[dev][0] );
            }

            /* Form the triangular factor of the block reflector
             H = H(i) H(i+1) . . . H(i+ib-1) */
            magma_int_t nqi = nq - i;
            lapackf77_zlarft("F", "C", &nqi, &kb, A(i, i), &lda,
                             &tau[i], T, &kb);

            /* H or H' is applied to C(1:m,i:n) */

            /* Apply H or H'; First copy T to the GPU */
            for (dev = 0; dev < ngpu; ++dev) {
                magma_setdevice( dev );
                magma_zsetmatrix_async(kb, kb,
                                       T,               kb,
                                       dT(dev, ind_c), kb, queues[dev][0] );
            }

            for (dev = 0; dev < ngpu; ++dev) {
                magma_setdevice( dev );
                magma_queue_sync( queues[dev][0] ); // check if the data was copied
                magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
                                 m-i, nlocal[dev], kb,
                                 dA_c(dev, ind_c, i, 0), lddac, dT(dev, ind_c), kb,
                                 dC(dev, i, 0), lddc,
                                 dwork(dev, ind_c), lddwork, queues[dev][1] );
                magma_event_record(events[dev][ind_c], queues[dev][1] );
            }

            ind_c = (ind_c+1)%2;
        }

        for (dev = 0; dev < ngpu; ++dev) {
            magma_setdevice( dev );
            magma_queue_sync( queues[dev][1] );
        }

        //copy C from mgpus
        for (magma_int_t i = 0; i < nbl; ++i) {
            dev = i % ngpu;
            magma_setdevice( dev );
            magma_int_t kb = min(nb_l, n-i*nb_l);
            magma_zgetmatrix( m, kb,
                              dC(dev, 0, i/ngpu*nb_l), lddc,
                              C(0, i*nb_l), ldc, queues[dev][1] );
//            magma_zgetmatrix_async( m, kb,
//                                   dC(dev, 0, i/ngpu*nb_l), lddc,
//                                   C(0, i*nb_l), ldc, queues[dev][0] );
        }
    } else {
        *info = MAGMA_ERR_NOT_IMPLEMENTED;
        magma_xerbla( __func__, -(*info) );
        goto cleanup;
        
        /*
        if ( notran ) {
            i1 = 0;
            i2 = k;
            i3 = nb;
        } else {
            i1 = (k - 1) / nb * nb;
            i2 = 0;
            i3 = -nb;
        }

        mi = m;
        ic = 0;

        for (i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) {
            ib = min(nb, k - i);
            
            // Form the triangular factor of the block reflector
            // H = H(i) H(i+1) . . . H(i+ib-1)
            i__4 = nq - i;
            lapackf77_zlarft("F", "C", &i__4, &ib, A(i, i), &lda,
            &tau[i], T, &ib);
            
            // 1) copy the panel from A to the GPU, and
            // 2) set upper triangular part of dA to identity
            magma_zsetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda, queues[dev][1] );
            magmablas_zlaset_band( MagmaUpper, ib, ib, ib, c_zero, c_one, dA(i, 0), ldda, queues[dev][1] );
            
            // H or H' is applied to C(1:m,i:n)
            ni = n - i;
            jc = i;
            
            // Apply H or H'; First copy T to the GPU
            magma_zsetmatrix( ib, ib, T, ib, dT, ib, queues[dev][1] );
            magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
            mi, ni, ib,
            dA(i, 0), ldda, dT, ib,
            dC(ic, jc), lddc,
            dwork, lddwork, queues[dev][1] );
        }
        */
    }

cleanup:
    work[0] = magma_zmake_lwork( lwkopt );

    for (dev = 0; dev < ngpu; ++dev) {
        magma_setdevice( dev );
        magma_event_destroy( events[dev][0] );
        magma_event_destroy( events[dev][1] );
        magma_queue_destroy( queues[dev][0] );
        magma_queue_destroy( queues[dev][1] );
        magma_free( dw[dev] );
    }
    magma_setdevice( orig_dev );
    magma_free_pinned( T );

    return *info;
} /* magma_zunmqr */
Beispiel #13
0
/**
    Purpose
    -------
    CUNMLQ overwrites the general complex M-by-N matrix C with

    @verbatim
                             SIDE = MagmaLeft     SIDE = MagmaRight
    TRANS = MagmaNoTrans:    Q * C                C * Q
    TRANS = Magma_ConjTrans: Q**H * C             C * Q**H
    @endverbatim

    where Q is a complexunitary matrix defined as the product of k
    elementary reflectors

          Q = H(k)**H . . . H(2)**H H(1)**H

    as returned by CGELQF. Q is of order M if SIDE = MagmaLeft and of order N
    if SIDE = MagmaRight.

    Arguments
    ---------
    @param[in]
    side    magma_side_t
      -     = MagmaLeft:      apply Q or Q**H from the Left;
      -     = MagmaRight:     apply Q or Q**H from the Right.

    @param[in]
    trans   magma_trans_t
      -     = MagmaNoTrans:    No transpose, apply Q;
      -     = Magma_ConjTrans: Conjugate transpose, apply Q**H.

    @param[in]
    m       INTEGER
            The number of rows of the matrix C. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix C. N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines
            the matrix Q.
            If SIDE = MagmaLeft,  M >= K >= 0;
            if SIDE = MagmaRight, N >= K >= 0.

    @param[in]
    A       COMPLEX array, dimension
                (LDA,M) if SIDE = MagmaLeft,
                (LDA,N) if SIDE = MagmaRight.
            The i-th row must contain the vector which defines the
            elementary reflector H(i), for i = 1,2,...,k, as returned by
            CGELQF in the first k rows of its array argument A.
            A is modified by the routine but restored on exit.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A. LDA >= max(1,K).

    @param[in]
    tau     COMPLEX array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by CGELQF.

    @param[in,out]
    C       COMPLEX array, dimension (LDC,N)
            On entry, the M-by-N matrix C.
            On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q.

    @param[in]
    ldc     INTEGER
            The leading dimension of the array C. LDC >= max(1,M).

    @param[out]
    work    (workspace) COMPLEX array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.
            If SIDE = MagmaLeft,  LWORK >= max(1,N);
            if SIDE = MagmaRight, LWORK >= max(1,M).
            For optimum performance
            if SIDE = MagmaLeft,  LWORK >= N*NB;
            if SIDE = MagmaRight, LWORK >= M*NB,
            where NB is the optimal blocksize.
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_cgelqf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_cunmlq(
    magma_side_t side, magma_trans_t trans,
    magma_int_t m, magma_int_t n, magma_int_t k,
    magmaFloatComplex *A, magma_int_t lda,
    magmaFloatComplex *tau,
    magmaFloatComplex *C, magma_int_t ldc,
    magmaFloatComplex *work, magma_int_t lwork,
    magma_int_t *info)
{
    #define  A(i_,j_) ( A + (i_) + (j_)*lda)
    #define dC(i_,j_) (dC + (i_) + (j_)*lddc)
    #define dV(i_,j_) (dV + (i_) + (j_)*ib)
    #define dT(i_,j_) (dT + (i_) + (j_)*ib)
    #define dwork(i_) (dwork + (i_))

    magmaFloatComplex *T, *T2;
    magma_int_t i, i1, i2, ib, ic, jc, nb, mi, ni, nq, nq_i, nw, step;
    magma_int_t iinfo, ldwork, lwkopt;
    magma_int_t left, notran, lquery;
    magma_trans_t transt;

    *info = 0;
    left   = (side  == MagmaLeft);
    notran = (trans == MagmaNoTrans);
    lquery = (lwork == -1);

    /* NQ is the order of Q and NW is the minimum dimension of WORK */
    if (left) {
        nq = m;
        nw = n;
    } else {
        nq = n;
        nw = m;
    }
    
    /* Test the input arguments */
    if (! left && side != MagmaRight) {
        *info = -1;
    } else if (! notran && trans != Magma_ConjTrans) {
        *info = -2;
    } else if (m < 0) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (k < 0 || k > nq) {
        *info = -5;
    } else if (lda < max(1,k)) {
        *info = -7;
    } else if (ldc < max(1,m)) {
        *info = -10;
    } else if (lwork < max(1,nw) && ! lquery) {
        *info = -12;
    }

    if (*info == 0) {
        nb = magma_get_cgelqf_nb( min( m, n ));
        lwkopt = max(1,nw)*nb;
        work[0] = MAGMA_C_MAKE( lwkopt, 0 );
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0 || k == 0) {
        work[0] = MAGMA_C_ONE;
        return *info;
    }

    ldwork = nw;
    
    if (nb >= k) {
        /* Use CPU code */
        lapackf77_cunmlq( lapack_side_const(side), lapack_trans_const(trans),
            &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, &iinfo);
    }
    else {
        /* Use hybrid CPU-GPU code */
        /* Allocate work space on the GPU.
         * nw*nb  for dwork (m or n) by nb
         * nq*nb  for dV    (n or m) by nb
         * nb*nb  for dT
         * lddc*n for dC.
         */
        magma_int_t lddc = ((m+31)/32)*32;
        magmaFloatComplex_ptr dwork, dV, dT, dC;
        magma_cmalloc( &dwork, (nw + nq + nb)*nb + lddc*n );
        if ( dwork == NULL ) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
        dV = dwork + nw*nb;
        dT = dV    + nq*nb;
        dC = dT    + nb*nb;
        
        /* work space on CPU.
         * nb*nb for T
         * nb*nb for T2, used to save and restore diagonal block of panel  */
        magma_cmalloc_cpu( &T, 2*nb*nb );
        if ( T == NULL ) {
            magma_free( dwork );
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        T2 = T + nb*nb;
        
        /* Copy matrix C from the CPU to the GPU */
        magma_csetmatrix( m, n, C, ldc, dC(0,0), lddc );
        
        if ( (left && notran) || (! left && ! notran) ) {
            i1 = 0;
            i2 = k;
            step = nb;
        } else {
            i1 = ((k - 1) / nb)*nb;
            i2 = 0;
            step = -nb;
        }

        // silence "uninitialized" warnings
        mi = 0;
        ni = 0;
        
        if (left) {
            ni = n;
            jc = 0;
        } else {
            mi = m;
            ic = 0;
        }

        if (notran) {
            transt = Magma_ConjTrans;
        } else {
            transt = MagmaNoTrans;
        }

        for (i = i1; (step < 0 ? i >= i2 : i < i2); i += step) {
            ib = min(nb, k - i);
            
            /* Form the triangular factor of the block reflector
               H = H(i) H(i + 1) . . . H(i + ib-1) */
            nq_i = nq - i;
            lapackf77_clarft("Forward", "Rowwise", &nq_i, &ib,
                             A(i,i), &lda, &tau[i], T, &ib);

            /* 1) set upper triangle of panel in A to identity,
               2) copy the panel from A to the GPU, and
               3) restore A                                      */
            cpanel_to_q( MagmaLower, ib, A(i,i), lda, T2 );
            magma_csetmatrix( ib, nq_i,  A(i,i), lda, dV(0,0), ib );
            cq_to_panel( MagmaLower, ib, A(i,i), lda, T2 );
            
            if (left) {
                /* H or H**H is applied to C(i:m,1:n) */
                mi = m - i;
                ic = i;
            }
            else {
                /* H or H**H is applied to C(1:m,i:n) */
                ni = n - i;
                jc = i;
            }
            
            /* Apply H or H**H; First copy T to the GPU */
            magma_csetmatrix( ib, ib, T, ib, dT(0,0), ib );
            magma_clarfb_gpu( side, transt, MagmaForward, MagmaRowwise,
                              mi, ni, ib,
                              dV(0,0), ib,
                              dT(0,0), ib,
                              dC(ic,jc), lddc,
                              dwork(0), ldwork );
        }
        magma_cgetmatrix( m, n, dC(0,0), lddc, C, ldc );
        
        magma_free( dwork );
        magma_free_cpu( T );
    }
    work[0] = MAGMA_C_MAKE( lwkopt, 0 );
    
    return *info;
} /* magma_cunmlq */
Beispiel #14
0
/**
    Purpose
    -------
    DGEQRF computes a QR factorization of a DOUBLE PRECISION M-by-N matrix A:
    A = Q * R. This version does not require work space on the GPU
    passed as input. GPU memory is allocated in the routine.

    This uses 2 queues to overlap communication and computation.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    A       DOUBLE PRECISION array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further
            Details).
    \n
            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    @param[out]
    tau     DOUBLE PRECISION array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    @param[out]
    work    (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.
    \n
            Higher performance is achieved if WORK is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.  LWORK >= max( N*NB, 2*NB*NB ),
            where NB can be obtained through magma_get_dgeqrf_nb( M, N ).
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.

    Further Details
    ---------------
    The matrix Q is represented as a product of elementary reflectors

        Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

        H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).

    @ingroup magma_dgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_dgeqrf(
    magma_int_t m, magma_int_t n,
    double *A,    magma_int_t lda,
    double *tau,
    double *work, magma_int_t lwork,
    magma_int_t *info )
{
    #define  A(i_,j_)  (A + (i_) + (j_)*lda)
    
    #ifdef HAVE_clBLAS
    #define dA(i_,j_)  dA,    ((i_) + (j_)*ldda + dA_offset)
    #define dT(i_,j_)  dT,    ((i_) + (j_)*nb   + dT_offset)
    #define dwork(i_)  dwork, ((i_)             + dwork_offset)
    #else
    #define dA(i_,j_) (dA    + (i_) + (j_)*ldda)
    #define dT(i_,j_) (dT    + (i_) + (j_)*nb)
    #define dwork(i_) (dwork + (i_))
    #endif
    
    /* Constants */
    const double c_one = MAGMA_D_ONE;
    
    /* Local variables */
    magmaDouble_ptr dA, dT, dwork;
    magma_int_t i, ib, min_mn, ldda, lddwork, old_i, old_ib;
    
    /* Function Body */
    *info = 0;
    magma_int_t nb = magma_get_dgeqrf_nb( m, n );
    
    // need 2*nb*nb to store T and upper triangle of V simultaneously
    magma_int_t lwkopt = max( n*nb, 2*nb*nb );
    work[0] = magma_dmake_lwork( lwkopt );
    bool lquery = (lwork == -1);
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,m)) {
        *info = -4;
    } else if (lwork < max(1, lwkopt) && ! lquery) {
        *info = -7;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;
    
    min_mn = min( m, n );
    if (min_mn == 0) {
        work[0] = c_one;
        return *info;
    }
    
    // largest N for larfb is n-nb (trailing matrix lacks 1st panel)
    lddwork = magma_roundup( n, 32 ) - nb;
    ldda    = magma_roundup( m, 32 );
    
    magma_int_t ngpu = magma_num_gpus();
    if ( ngpu > 1 ) {
        /* call multiple-GPU interface  */
        return magma_dgeqrf_m( ngpu, m, n, A, lda, tau, work, lwork, info );
    }
    
    // allocate space for dA, dwork, and dT
    if (MAGMA_SUCCESS != magma_dmalloc( &dA, n*ldda + nb*lddwork + nb*nb )) {
        /* alloc failed so call non-GPU-resident version */
        return magma_dgeqrf_ooc( m, n, A, lda, tau, work, lwork, info );
    }
    
    dwork = dA + n*ldda;
    dT    = dA + n*ldda + nb*lddwork;
    
    magma_queue_t queues[2];
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queues[0] );
    magma_queue_create( cdev, &queues[1] );
    

    //used for timing CPU and GPU
    int iter = 0;
    float cpu_time = 0.0;
    float gpu_time = 0.0;

    int affinity = map_cpu(0);
    if(affinity != 0)
    {
        printf("affinity failed\n");
        return -1;
    }


    magma_set_lapack_numthreads(1);

    // for initial setting, better to be automatic in the future
    // SetGPUFreq(324, 324);
    // system("echo 1200000 > /sys/devices/system/cpu/cpu0/cpufreq/scaling_setspeed");
    SetGPUFreq(2600, 705);
    system("echo 2500000 > /sys/devices/system/cpu/cpu0/cpufreq/scaling_setspeed");

    double gpu_iter1_low = 2096.544434;
    double gpu_iter1_high = 478.825226;
    double cpu_iter1_low = 1792.011230;
    double cpu_iter1_high = 1413.732788;

    double gpu_pred_high = gpu_iter1_high;
    double gpu_pred_low = gpu_iter1_low;
    double cpu_pred_high = cpu_iter1_high;
    double cpu_pred_low = cpu_iter1_low;

    double ratio_split_freq = 0;
    double time_until_interrupt = 0;

    cudaEvent_t start_cpu, stop_cpu;
    cudaEvent_t start_gpu, stop_gpu;

    // switches for different modes
    bool timing = false; //for initial setting only, greatly impact performance
    bool dvfs = false; //turn on dvfs energy saving
    bool relax = false; //turn on relax scheme
    bool r2h = false; // turn on race to halt

    //these parameters need to be tuned in future works.
    double dvfs_converage = 0.5;
    double prediction_offset_gpu = 0.65;
    double prediction_offset_cpu = 0.65;


    //for nvprof profiler, brings slight constant performance overhead
    //cudaProfilerStart();

    if ( (nb > 1) && (nb < min_mn) ) {
        /* Use blocked code initially.
           Asynchronously send the matrix to the GPU except the first panel. */
        magma_dsetmatrix_async( m, n-nb,
                                 A(0,nb), lda,
                                dA(0,nb), ldda, queues[0] );
        
        old_i = 0;
        old_ib = nb;
        for (i = 0; i < min_mn-nb; i += nb) {
            ib = min( min_mn-i, nb );
            if (i > 0) {
                /* get i-th panel from device */
                magma_queue_sync( queues[1] );
                magma_dgetmatrix_async( m-i, ib,
                                        dA(i,i), ldda,
                                         A(i,i), lda, queues[0] );
                
                if (timing) {
                    //start gpu timing
                    cudaEventCreate(&start_gpu);
                    cudaEventCreate(&stop_gpu);
                    cudaEventRecord(start_gpu, 0);
                }

                /* Apply H' to A(i:m,i+2*ib:n) from the left */
                magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                  m-old_i, n-old_i-2*old_ib, old_ib,
                                  dA(old_i, old_i),          ldda, dT(0,0),  nb,
                                  dA(old_i, old_i+2*old_ib), ldda, dwork(0), lddwork, queues[1] );
                


                double ratio_slack_pred = 1.0 - (double)nb/(m-iter*nb);
                cpu_pred_high = cpu_pred_high * ratio_slack_pred;
                cpu_pred_low = cpu_pred_low * ratio_slack_pred;
                gpu_pred_high = gpu_pred_high * ratio_slack_pred * ratio_slack_pred;
                gpu_pred_low = gpu_pred_low * ratio_slack_pred * ratio_slack_pred;

                if (timing) {
                    printf("iter:%d GPU time pred:%f\n", iter, gpu_pred_high);
                    printf("iter:%d CPU time pred:%f\n", iter, cpu_pred_high);
                }


                if (iter < dvfs_converage*(min_mn-nb)/nb) {
                    if (cpu_pred_high > gpu_pred_high) { //slack on GPU
                        ratio_split_freq = (cpu_pred_high - gpu_pred_high) / (gpu_pred_high * ((gpu_iter1_low / gpu_iter1_high) - 1));
                        time_until_interrupt = gpu_pred_low * ratio_split_freq;
                         //printf("iter:%d time_until_interrupt:%f\n", iter, time_until_interrupt);
                        // printf("iter:%d ratio_split_freq:%f\n", iter, ratio_split_freq);
                        if (dvfs) {
                            if ((!relax) || (relax && ratio_split_freq > 0.05)) {
                                if (ratio_split_freq < 1)
                                    dvfs_adjust(time_until_interrupt*prediction_offset_gpu, 'g');
                                else
                                    dvfs_adjust(cpu_pred_high, 'g');
                            }
                        } else if (r2h) {
                            r2h_adjust(gpu_pred_high, cpu_pred_high - gpu_pred_high, 'g');
                        }
                    } else { //slack on CPU
                        ratio_split_freq = (gpu_pred_high - cpu_pred_high) / (cpu_pred_high * ((cpu_iter1_low / cpu_iter1_high) - 1));
                        time_until_interrupt = cpu_pred_low * ratio_split_freq;
                        if (dvfs) {
                            if ((!relax) || (relax && ratio_split_freq > 0.05)) {
                                if (ratio_split_freq < 1)
                                    dvfs_adjust(time_until_interrupt*prediction_offset_cpu, 'c');
                                else
                                    dvfs_adjust(gpu_pred_high, 'c');
                            }
                        } else if (r2h) {
                            r2h_adjust(cpu_pred_high, gpu_pred_high - cpu_pred_high, 'c');
                        }
                    }
                }

                if (timing) {
                    //end gpu timing
                    cudaEventRecord(stop_gpu, 0);
                    cudaEventSynchronize(stop_gpu);
                    cudaEventElapsedTime(&gpu_time, start_gpu, stop_gpu);
                    cudaEventDestroy(start_gpu);
                    cudaEventDestroy(stop_gpu);

                    printf("iter:%d GPU time:%f\n", iter, gpu_time);
                }



                magma_dgetmatrix_async( i, ib,
                                        dA(0,i), ldda,
                                         A(0,i), lda, queues[1] );
                magma_queue_sync( queues[0] );
            }
            
            magma_int_t rows = m-i;

            if (timing) {
                //start cpu timing
                cudaEventCreate(&start_cpu);
                cudaEventCreate(&stop_cpu);
                cudaEventRecord(start_cpu, 0);
            }

            lapackf77_dgeqrf( &rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info );
            
            /* Form the triangular factor of the block reflector
               H = H(i) H(i+1) . . . H(i+ib-1) */
            lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr,
                              &rows, &ib, A(i,i), &lda, tau+i, work, &ib );
            

            if (timing) {
                //end cpu timing
                cudaEventRecord(stop_cpu, 0);
                cudaEventSynchronize(stop_cpu);
                cudaEventElapsedTime(&cpu_time, start_cpu, stop_cpu);
                cudaEventDestroy(start_cpu);
                cudaEventDestroy(stop_cpu);
                printf("iter:%d CPU time:%f\n", iter, cpu_time);
                if (gpu_time < cpu_time) {
                    printf("slack: +\n");
                } else {
                    printf("slack: -\n");
                }
            }

            magma_dpanel_to_q( MagmaUpper, ib, A(i,i), lda, work+ib*ib );
            
            /* put i-th V matrix onto device */
            magma_dsetmatrix_async( rows, ib, A(i,i), lda, dA(i,i), ldda, queues[0] );
            
            /* put T matrix onto device */
            magma_queue_sync( queues[1] );
            magma_dsetmatrix_async( ib, ib, work, ib, dT(0,0), nb, queues[0] );
            magma_queue_sync( queues[0] );
            
            if (i + ib < n) {
                if (i+ib < min_mn-nb) {
                    /* Apply H' to A(i:m,i+ib:i+2*ib) from the left (look-ahead) */
                    magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                      rows, ib, ib,
                                      dA(i, i   ), ldda, dT(0,0),  nb,
                                      dA(i, i+ib), ldda, dwork(0), lddwork, queues[1] );
                    magma_dq_to_panel( MagmaUpper, ib, A(i,i), lda, work+ib*ib );
                }
                else {
                    /* After last panel, update whole trailing matrix. */
                    /* Apply H' to A(i:m,i+ib:n) from the left */
                    magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                      rows, n-i-ib, ib,
                                      dA(i, i   ), ldda, dT(0,0),  nb,
                                      dA(i, i+ib), ldda, dwork(0), lddwork, queues[1] );
                    magma_dq_to_panel( MagmaUpper, ib, A(i,i), lda, work+ib*ib );
                }
                
                old_i  = i;
                old_ib = ib;
                
            }
            iter ++;

        }
        //for nvprof profiler.
        //cudaProfilerStop();
    } else {
        i = 0;
    }
    
    /* Use unblocked code to factor the last or only block. */
    if (i < min_mn) {
        ib = n-i;
        if (i != 0) {
            magma_dgetmatrix( m, ib, dA(0,i), ldda, A(0,i), lda, queues[1] );
        }
        magma_int_t rows = m-i;
        lapackf77_dgeqrf( &rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info );
    }
    
    magma_queue_destroy( queues[0] );
    magma_queue_destroy( queues[1] );
    
    magma_free( dA );
    
    return *info;
} /* magma_dgeqrf */
Beispiel #15
0
extern "C" magma_int_t
magma_zunmqr_m(magma_int_t nrgpu, char side, char trans,
               magma_int_t m, magma_int_t n, magma_int_t k,
               cuDoubleComplex *a,    magma_int_t lda,
               cuDoubleComplex *tau,
               cuDoubleComplex *c,    magma_int_t ldc,
               cuDoubleComplex *work, magma_int_t lwork,
               magma_int_t *info)
{
/*  -- MAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2012

    Purpose
    =======
    ZUNMQR overwrites the general complex M-by-N matrix C with

                    SIDE = 'L'     SIDE = 'R'
    TRANS = 'N':      Q * C          C * Q
    TRANS = 'T':      Q**H * C       C * Q**H

    where Q is a complex orthogonal matrix defined as the product of k
    elementary reflectors

          Q = H(1) H(2) . . . H(k)

    as returned by ZGEQRF. Q is of order M if SIDE = 'L' and of order N
    if SIDE = 'R'.

    Arguments
    =========
    SIDE    (input) CHARACTER*1
            = 'L': apply Q or Q**H from the Left;
            = 'R': apply Q or Q**H from the Right.

    TRANS   (input) CHARACTER*1
            = 'N':  No transpose, apply Q;
            = 'T':  Transpose, apply Q**H.

    M       (input) INTEGER
            The number of rows of the matrix C. M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix C. N >= 0.

    K       (input) INTEGER
            The number of elementary reflectors whose product defines
            the matrix Q.
            If SIDE = 'L', M >= K >= 0;
            if SIDE = 'R', N >= K >= 0.

    A       (input) COMPLEX_16 array, dimension (LDA,K)
            The i-th column must contain the vector which defines the
            elementary reflector H(i), for i = 1,2,...,k, as returned by
            ZGEQRF in the first k columns of its array argument A.

    LDA     (input) INTEGER
            The leading dimension of the array A.
            If SIDE = 'L', LDA >= max(1,M);
            if SIDE = 'R', LDA >= max(1,N).

    TAU     (input) COMPLEX_16 array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by ZGEQRF.

    C       (input/output) COMPLEX_16 array, dimension (LDC,N)
            On entry, the M-by-N matrix C.
            On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q.

    LDC     (input) INTEGER
            The leading dimension of the array C. LDC >= max(1,M).

    WORK    (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK(0) returns the optimal LWORK.

    LWORK   (input) INTEGER
            The dimension of the array WORK.
            If SIDE = 'L', LWORK >= max(1,N);
            if SIDE = 'R', LWORK >= max(1,M).
            For optimum performance LWORK >= N*NB if SIDE = 'L', and
            LWORK >= M*NB if SIDE = 'R', where NB is the optimal
            blocksize.

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
    =====================================================================   */
    cuDoubleComplex c_one = MAGMA_Z_ONE;

    char side_[2] = {side, 0};
    char trans_[2] = {trans, 0};

    cuDoubleComplex* dw[MagmaMaxGPUs];
    cudaStream_t stream [MagmaMaxGPUs][2];

    magma_int_t ind_c, kb;

    magma_int_t i__4;
    magma_int_t i;
    cuDoubleComplex t[4160];        /* was [65][64] */
    magma_int_t i1, i2, i3, ib, nb, nq, nw;
    magma_int_t left, notran, lquery;
    magma_int_t iinfo, lwkopt;

    magma_int_t igpu = 0;

    int gpu_b;
    magma_getdevice(&gpu_b);

    *info = 0;
    left = lapackf77_lsame(side_, "L");
    notran = lapackf77_lsame(trans_, "N");
    lquery = (lwork == -1);

    /* NQ is the order of Q and NW is the minimum dimension of WORK */
    if (left) {
        nq = m;
        nw = n;
    } else {
        nq = n;
        nw = m;
    }
    if (! left && ! lapackf77_lsame(side_, "R")) {
        *info = -1;
    } else if (! notran && ! lapackf77_lsame(trans_, "T")) {
        *info = -2;
    } else if (m < 0) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (k < 0 || k > nq) {
        *info = -5;
    } else if (lda < max(1,nq)) {
        *info = -7;
    } else if (ldc < max(1,m)) {
        *info = -10;
    } else if (lwork < max(1,nw) && ! lquery) {
        *info = -12;
    }

    if (*info == 0)
    {
        /* Determine the block size.  NB may be at most NBMAX, where NBMAX
         is used to define the local array T.    */
        nb = 64;
        lwkopt = max(1,nw) * nb;
        MAGMA_Z_SET2REAL( work[0], lwkopt );
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0 || k == 0) {
        work[0] = c_one;
        return *info;
    }

    magma_int_t lddc = (m+63)/64*64;
    magma_int_t lddac = nq;
    magma_int_t lddar =nb;
    magma_int_t lddwork = nw;

    magma_int_t n_l = (n+nrgpu-1)/nrgpu; // local n
    n_l = ((n_l+63)/64)*64;

    if (n_l<256)
       n_l=256;

    nrgpu = min(nrgpu, (n+n_l-1)/n_l); // Don't use GPU that will not have data.

    for (igpu = 0; igpu < nrgpu; ++igpu){
        magma_setdevice(igpu);
        magmablasSetKernelStream(NULL);
        if (MAGMA_SUCCESS != magma_zmalloc( &dw[igpu], (n_l*lddc + 2*lddac*lddar + 2*(nb + 1 + lddwork)*nb))) {
            magma_xerbla( __func__, -(*info) );
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
        magma_queue_create( &stream[igpu][0] );
        magma_queue_create( &stream[igpu][1] );
    }

    if (nb >= k)
    {
        /* Use CPU code */
        lapackf77_zunmqr(side_, trans_, &m, &n, &k, a, &lda, tau,
                         c, &ldc, work, &lwork, &iinfo);
    }
    else
    {
        /* Use hybrid CPU-MGPU code */
        if (left) {

            //copy C to mgpus
            for (igpu = 0; igpu < nrgpu; ++igpu){
                magma_setdevice(igpu);
                kb = min(n_l, n-igpu*n_l);
                magma_zsetmatrix_async( m, kb,
                                        C(0, igpu*n_l), ldc,
                                        dC(igpu, 0, 0), lddc, stream[igpu][0] );
            }

            if ( !notran ) {
                i1 = 0;
                i2 = k;
                i3 = nb;
            } else {
                i1 = (k - 1) / nb * nb;
                i2 = 0;
                i3 = -nb;
            }

            kb = min(nb, k-i1);
            for (igpu = 0; igpu < nrgpu; ++igpu){
                magma_setdevice(igpu);
                magma_zsetmatrix_async( nq-i1, kb,
                                        A(i1, i1),            lda,
                                        dA_c(igpu, 0, i1, 0), lddac, stream[igpu][0] );
            }
            ind_c = 0;

            for (i = i1; i3 < 0 ? i >= i2 : i < i2; i += i3)
            {
                ib = min(nb, k - i);
                /* Form the triangular factor of the block reflector
                   H = H(i) H(i+1) . . . H(i+ib-1) */
                i__4 = nq - i;
                lapackf77_zlarft("F", "C", &i__4, &ib, A(i, i), &lda,
                                 &tau[i], t, &ib);

                /* H or H' is applied to C(1:m,i:n) */

                /* Apply H or H'; First copy T to the GPU */
                for (igpu = 0; igpu < nrgpu; ++igpu){
                    magma_setdevice(igpu);
                    magma_zsetmatrix_async( ib, ib,
                                            t,               ib,
                                            dt(igpu, ind_c), ib, stream[igpu][ind_c] );

                    magma_queue_sync( stream[igpu][ind_c] ); // Makes sure that we can change t next iteration.
                }

                // start the copy of next A panel
                kb = min(nb, k - i - i3);
                if (kb > 0 && i+i3 >= 0){
                    for (igpu = 0; igpu < nrgpu; ++igpu){
                        magma_setdevice(igpu);
                        magma_zsetmatrix_async( nq-(i+i3), kb,
                                                A(i+i3, i+i3),                    lda,
                                                dA_c(igpu, (ind_c+1)%2, i+i3, 0), lddac, stream[igpu][(ind_c+1)%2] );
                    }
                }

                for (igpu = 0; igpu < nrgpu; ++igpu){
                    magma_setdevice(igpu);
                    // Put 0s in the upper triangular part of dA;
                    magmablas_zsetdiag1subdiag0_stream('L', ib, ib, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][ind_c]);

                    kb = min(n_l, n-igpu*n_l);
                    magmablasSetKernelStream(stream[igpu][ind_c]);
                    magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
                                     m-i, kb, ib,
                                     dA_c(igpu, ind_c, i, 0), lddac, dt(igpu, ind_c), ib,
                                     dC(igpu, i, 0), lddc,
                                     dwork(igpu, ind_c), lddwork);
                }

                ind_c = (ind_c+1)%2;
            }

            //copy C from mgpus
            for (igpu = 0; igpu < nrgpu; ++igpu){
                magma_setdevice(igpu);
                magma_queue_sync( stream[igpu][0] );
                magma_queue_sync( stream[igpu][1] );
                kb = min(n_l, n-igpu*n_l);
                //asynchronous copy gives problems sometimes...
//                magma_zgetmatrix_async( m, kb,
//                                        dC(igpu, 0, 0), lddc,
//                                        C(0, igpu*n_l), ldc, stream[igpu][0] );
                magma_zgetmatrix( m, kb,
                                  dC(igpu, 0, 0), lddc,
                                  C(0, igpu*n_l), ldc );
            }

        } else {

            fprintf(stderr, "The case (side == right) is not implemented\n");
            magma_xerbla( __func__, 1 );
            return *info;

            /*if ( notran ) {
                i1 = 0;
                i2 = k;
                i3 = nb;
            } else {
                i1 = (k - 1) / nb * nb;
                i2 = 0;
                i3 = -nb;
            }

            mi = m;
            ic = 0;

            for (i = i1; i3 < 0 ? i >= i2 : i < i2; i += i3)
            {
                ib = min(nb, k - i);

                // Form the triangular factor of the block reflector
                // H = H(i) H(i+1) . . . H(i+ib-1)
                i__4 = nq - i;
                lapackf77_zlarft("F", "C", &i__4, &ib, A(i, i), &lda,
                                 &tau[i], t, &ib);

                // 1) copy the panel from A to the GPU, and
                // 2) Put 0s in the upper triangular part of dA;
                magma_zsetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda );
                magmablas_zsetdiag1subdiag0('L', ib, ib, dA(i, 0), ldda);


                // H or H' is applied to C(1:m,i:n)
                ni = n - i;
                jc = i;

                // Apply H or H'; First copy T to the GPU
                magma_zsetmatrix( ib, ib, t, ib, dt, ib );
                magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
                                 mi, ni, ib,
                                 dA(i, 0), ldda, dt, ib,
                                 dC(ic, jc), lddc,
                                 dwork, lddwork);
            }
            */
        }
    }
    MAGMA_Z_SET2REAL( work[0], lwkopt );

    for (igpu = 0; igpu < nrgpu; ++igpu){
        magma_setdevice(igpu);
        magma_queue_sync( stream[igpu][0] );
        magmablasSetKernelStream(NULL);
        magma_queue_destroy( stream[igpu][0] );
        magma_queue_destroy( stream[igpu][1] );
        magma_free( dw[igpu] );
    }

    magma_setdevice(gpu_b);

    return *info;
} /* magma_zunmqr */