Esempio n. 1
0
extern "C" magma_int_t
magma_dlatrd(char uplo, magma_int_t n, magma_int_t nb,
             double *a,  magma_int_t lda,
             double *e, double *tau,
             double *w,  magma_int_t ldw,
             double *da, magma_int_t ldda,
             double *dw, magma_int_t lddw)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    DLATRD reduces NB rows and columns of a real symmetric matrix A to
    symmetric tridiagonal form by an orthogonal similarity
    transformation Q' * A * Q, and returns the matrices V and W which are
    needed to apply the transformation to the unreduced part of A.

    If UPLO = 'U', DLATRD reduces the last NB rows and columns of a
    matrix, of which the upper triangle is supplied;
    if UPLO = 'L', DLATRD reduces the first NB rows and columns of a
    matrix, of which the lower triangle is supplied.

    This is an auxiliary routine called by DSYTRD.

    Arguments
    =========
    UPLO    (input) CHARACTER*1
            Specifies whether the upper or lower triangular part of the
            symmetric matrix A is stored:
            = 'U': Upper triangular
            = 'L': Lower triangular

    N       (input) INTEGER
            The order of the matrix A.

    NB      (input) INTEGER
            The number of rows and columns to be reduced.

    A       (input/output) DOUBLE_PRECISION array, dimension (LDA,N)
            On entry, the symmetric matrix A.  If UPLO = 'U', the leading
            n-by-n upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = 'L', the
            leading n-by-n lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
            On exit:
            if UPLO = 'U', the last NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements above the diagonal
              with the array TAU, represent the orthogonal matrix Q as a
              product of elementary reflectors;
            if UPLO = 'L', the first NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements below the diagonal
              with the array TAU, represent the  orthogonal matrix Q as a
              product of elementary reflectors.
            See Further Details.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= (1,N).

    E       (output) DOUBLE_PRECISION array, dimension (N-1)
            If UPLO = 'U', E(n-nb:n-1) contains the superdiagonal
            elements of the last NB columns of the reduced matrix;
            if UPLO = 'L', E(1:nb) contains the subdiagonal elements of
            the first NB columns of the reduced matrix.

    TAU     (output) DOUBLE_PRECISION array, dimension (N-1)
            The scalar factors of the elementary reflectors, stored in
            TAU(n-nb:n-1) if UPLO = 'U', and in TAU(1:nb) if UPLO = 'L'.
            See Further Details.

    W       (output) DOUBLE_PRECISION array, dimension (LDW,NB)
            The n-by-nb matrix W required to update the unreduced part
            of A.

    LDW     (input) INTEGER
            The leading dimension of the array W. LDW >= max(1,N).

    Further Details
    ===============
    If UPLO = 'U', the matrix Q is represented as a product of elementary
    reflectors

       Q = H(n) H(n-1) . . . H(n-nb+1).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i),
    and tau in TAU(i-1).

    If UPLO = 'L', the matrix Q is represented as a product of elementary
    reflectors

       Q = H(1) H(2) . . . H(nb).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i),
    and tau in TAU(i).

    The elements of the vectors v together form the n-by-nb matrix V
    which is needed, with W, to apply the transformation to the unreduced
    part of the matrix, using a symmetric rank-2k update of the form:
    A := A - V*W' - W*V'.

    The contents of A on exit are illustrated by the following examples
    with n = 5 and nb = 2:

    if UPLO = 'U':                       if UPLO = 'L':

      (  a   a   a   v4  v5 )              (  d                  )
      (      a   a   v4  v5 )              (  1   d              )
      (          a   1   v5 )              (  v1  1   a          )
      (              d   1  )              (  v1  v2  a   a      )
      (                  d  )              (  v1  v2  a   a   a  )

    where d denotes a diagonal element of the reduced matrix, a denotes
    an element of the original matrix that is unchanged, and vi denotes
    an element of the vector defining H(i).
    =====================================================================    */
    
    char uplo_[2]  = {uplo, 0};

    magma_int_t i;
    
    double c_neg_one = MAGMA_D_NEG_ONE;
    double c_one     = MAGMA_D_ONE;
    double c_zero    = MAGMA_D_ZERO;

    double value = MAGMA_D_ZERO;
    
    magma_int_t ione = 1;

    magma_int_t i_n, i_1, iw;
    
    double alpha;
    double *f;

    if (n <= 0) {
        return 0;
    }

    magma_queue_t stream;
    magma_queue_create( &stream );
    magma_dmalloc_cpu( &f, n );
    assert( f != NULL );  // TODO return error, or allocate outside dlatrd

    if (lapackf77_lsame(uplo_, "U")) {

        /* Reduce last NB columns of upper triangle */
        for (i = n-1; i >= n - nb ; --i) {
            i_1 = i + 1;
            i_n = n - i - 1;
            
            iw = i - n + nb;
            if (i < n-1) {
                /* Update A(1:i,i) */
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv(&i_n, W(i, iw+1), &ldw);
                #endif
                blasf77_dgemv("No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda,
                              W(i, iw+1), &ldw, &c_one, A(0, i), &ione);
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv(&i_n, W(i, iw+1), &ldw);
                lapackf77_dlacgv(&i_n, A(i, i+1), &lda);
                #endif
                blasf77_dgemv("No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                              A(i, i+1), &lda, &c_one, A(0, i), &ione);
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv(&i_n, A(i, i+1), &lda);
                #endif
            }
            if (i > 0) {
                /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */
                
                alpha = *A(i-1, i);
                
                lapackf77_dlarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]);
                
                e[i-1] = MAGMA_D_REAL( alpha );
                MAGMA_D_SET2REAL(*A(i-1, i), 1.);
                
                /* Compute W(1:i-1,i) */
                // 1. Send the block reflector  A(0:n-i-1,i) to the GPU
                magma_dsetvector( i, A(0, i), 1, dA(0, i), 1 );
                
                magma_dsymv(MagmaUpper, i, c_one, dA(0, 0), ldda,
                            dA(0, i), ione, c_zero, dW(0, iw), ione);
                
                // 2. Start putting the result back (asynchronously)
                magma_dgetmatrix_async( i, 1,
                                        dW(0, iw),         lddw,
                                        W(0, iw) /*test*/, ldw, stream );
                
                if (i < n-1) {
                    blasf77_dgemv(MagmaTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw,
                                  A(0, i), &ione, &c_zero, W(i+1, iw), &ione);
                }
                
                // 3. Here is where we need it // TODO find the right place
                magma_queue_sync( stream );
                
                if (i < n-1) {
                    blasf77_dgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda,
                                  W(i+1, iw), &ione, &c_one, W(0, iw), &ione);
                    
                    blasf77_dgemv(MagmaTransStr, &i, &i_n, &c_one, A(0, i+1), &lda,
                                  A(0, i), &ione, &c_zero, W(i+1, iw), &ione);
                    
                    blasf77_dgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                                  W(i+1, iw), &ione, &c_one, W(0, iw), &ione);
                }
                
                blasf77_dscal(&i, &tau[i - 1], W(0, iw), &ione);
                
                #if defined(PRECISION_z) || defined(PRECISION_c)
                cblas_ddot_sub( i, W(0,iw), ione, A(0,i), ione, &value );
                #else
                value = cblas_ddot( i, W(0,iw), ione, A(0,i), ione );
                #endif
                alpha = tau[i - 1] * -0.5f * value;
                blasf77_daxpy(&i, &alpha, A(0, i), &ione,
                              W(0, iw), &ione);
            }
        }

    } else {

        /*  Reduce first NB columns of lower triangle */
        for (i = 0; i < nb; ++i) {
        
            /* Update A(i:n,i) */
            i_n = n - i;
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv(&i, W(i, 0), &ldw);
            #endif
            blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda,
                          W(i, 0), &ldw, &c_one, A(i, i), &ione);
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv(&i, W(i, 0), &ldw);
            lapackf77_dlacgv(&i, A(i ,0), &lda);
            #endif
            blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw,
                          A(i, 0), &lda, &c_one, A(i, i), &ione);
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv(&i, A(i, 0), &lda);
            #endif
        
            if (i < n-1) {
                /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */
                i_n = n - i - 1;
                alpha = *A(i+1, i);
                lapackf77_dlarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]);
                e[i] = MAGMA_D_REAL( alpha );
                MAGMA_D_SET2REAL(*A(i+1, i), 1.);
        
                /* Compute W(i+1:n,i) */
                // 1. Send the block reflector  A(i+1:n,i) to the GPU
                magma_dsetvector( i_n, A(i+1, i), 1, dA(i+1, i), 1 );
            
                magma_dsymv(MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero,
                            dW(i+1, i), ione);
            
                // 2. Start putting the result back (asynchronously)
                magma_dgetmatrix_async( i_n, 1,
                                        dW(i+1, i), lddw,
                                        W(i+1, i),  ldw, stream );
        
                blasf77_dgemv(MagmaTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw,
                              A(i+1, i), &ione, &c_zero, W(0, i), &ione);
        
                blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda,
                              W(0, i), &ione, &c_zero, f, &ione);
                
                blasf77_dgemv(MagmaTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda,
                              A(i+1, i), &ione, &c_zero, W(0, i), &ione);
        
                // 3. Here is where we need it
                magma_queue_sync( stream );
        
                if (i!=0)
                    blasf77_daxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione);
        
                blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw,
                              W(0, i), &ione, &c_one, W(i+1, i), &ione);
                blasf77_dscal(&i_n, &tau[i], W(i+1,i), &ione);
                
                #if defined(PRECISION_z) || defined(PRECISION_c)
                cblas_ddot_sub( i_n, W(i+1,i), ione, A(i+1,i), ione, &value );
                #else
                value = cblas_ddot( i_n, W(i+1,i), ione, A(i+1,i), ione );
                #endif
                alpha = tau[i] * -0.5f * value;
                blasf77_daxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione);
            }
        }
    }

    magma_free_cpu(f);
    magma_queue_destroy( stream );

    return 0;
} /* dlatrd */
Esempio n. 2
0
extern "C" magma_int_t
magma_ssytrd(char uplo, magma_int_t n,
             float *a, magma_int_t lda,
             float *d, float *e, float *tau,
             float *work, magma_int_t lwork,
             magma_int_t *info)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    SSYTRD reduces a real symmetric matrix A to real symmetric
    tridiagonal form T by an orthogonal similarity transformation:
    Q**T * A * Q = T.

    Arguments
    =========
    UPLO    (input) CHARACTER*1
            = 'U':  Upper triangle of A is stored;
            = 'L':  Lower triangle of A is stored.

    N       (input) INTEGER
            The order of the matrix A.  N >= 0.

    A       (input/output) REAL array, dimension (LDA,N)
            On entry, the symmetric matrix A.  If UPLO = 'U', the leading
            N-by-N upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = 'L', the
            leading N-by-N lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
            On exit, if UPLO = 'U', the diagonal and first superdiagonal
            of A are overwritten by the corresponding elements of the
            tridiagonal matrix T, and the elements above the first
            superdiagonal, with the array TAU, represent the orthogonal
            matrix Q as a product of elementary reflectors; if UPLO
            = 'L', the diagonal and first subdiagonal of A are over-
            written by the corresponding elements of the tridiagonal
            matrix T, and the elements below the first subdiagonal, with
            the array TAU, represent the orthogonal matrix Q as a product
            of elementary reflectors. See Further Details.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    D       (output) REAL array, dimension (N)
            The diagonal elements of the tridiagonal matrix T:
            D(i) = A(i,i).

    E       (output) REAL array, dimension (N-1)
            The off-diagonal elements of the tridiagonal matrix T:
            E(i) = A(i,i+1) if UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'.

    TAU     (output) REAL array, dimension (N-1)
            The scalar factors of the elementary reflectors (see Further
            Details).

    WORK    (workspace/output) REAL array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.

    LWORK   (input) INTEGER
            The dimension of the array WORK.  LWORK >= N*NB, where NB is the
            optimal blocksize given by magma_get_ssytrd_nb().

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value

    Further Details
    ===============
    If UPLO = 'U', the matrix Q is represented as a product of elementary
    reflectors

       Q = H(n-1) . . . H(2) H(1).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in
    A(1:i-1,i+1), and tau in TAU(i).

    If UPLO = 'L', the matrix Q is represented as a product of elementary
    reflectors

       Q = H(1) H(2) . . . H(n-1).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i),
    and tau in TAU(i).

    The contents of A on exit are illustrated by the following examples
    with n = 5:

    if UPLO = 'U':                       if UPLO = 'L':

      (  d   e   v2  v3  v4 )              (  d                  )
      (      d   e   v3  v4 )              (  e   d              )
      (          d   e   v4 )              (  v1  e   d          )
      (              d   e  )              (  v1  v2  e   d      )
      (                  d  )              (  v1  v2  v3  e   d  )

    where d and e denote diagonal and off-diagonal elements of T, and vi
    denotes an element of the vector defining H(i).
    =====================================================================    */

    char uplo_[2] = {uplo, 0};

    magma_int_t ldda = lda;
    magma_int_t nb = magma_get_ssytrd_nb(n);

    float c_neg_one = MAGMA_S_NEG_ONE;
    float c_one     = MAGMA_S_ONE;
    float          d_one     = MAGMA_D_ONE;
    
    magma_int_t kk, nx;
    magma_int_t i, j, i_n;
    magma_int_t iinfo;
    magma_int_t ldwork, lddwork, lwkopt;
    magma_int_t lquery;

    *info = 0;
    int upper = lapackf77_lsame(uplo_, "U");
    lquery = lwork == -1;
    if (! upper && ! lapackf77_lsame(uplo_, "L")) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,n)) {
        *info = -4;
    } else if (lwork < nb*n && ! lquery) {
        *info = -9;
    }

    /* Determine the block size. */
    ldwork = lddwork = n;
    lwkopt = n * nb;
    if (*info == 0) {
        MAGMA_S_SET2REAL( work[0], lwkopt );
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;

    /* Quick return if possible */
    if (n == 0) {
        work[0] = c_one;
        return *info;
    }

    float *da;
    if (MAGMA_SUCCESS != magma_smalloc( &da, n*ldda + 2*n*nb )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    float *dwork = da + (n)*ldda;

    if (n < 2048)
        nx = n;
    else
        nx = 512;

    if (upper) {
        /* Copy the matrix to the GPU */
        magma_ssetmatrix( n, n, A(0, 0), lda, dA(0, 0), ldda );

        /*  Reduce the upper triangle of A.
            Columns 1:kk are handled by the unblocked method. */
        kk = n - (n - nx + nb - 1) / nb * nb;

        for (i = n - nb; i >= kk; i -= nb) {
            /* Reduce columns i:i+nb-1 to tridiagonal form and form the
               matrix W which is needed to update the unreduced part of
               the matrix */
            
            /*   Get the current panel (no need for the 1st iteration) */
            if (i!=n-nb)
                magma_sgetmatrix( i+nb, nb, dA(0, i), ldda, A(0, i), lda );
            
            magma_slatrd(uplo, i+nb, nb, A(0, 0), lda, e, tau,
                         work, ldwork, dA(0, 0), ldda, dwork, lddwork);

            /* Update the unreduced submatrix A(0:i-2,0:i-2), using an
               update of the form:  A := A - V*W' - W*V' */
            magma_ssetmatrix( i + nb, nb, work, ldwork, dwork, lddwork );

            magma_ssyr2k(uplo, MagmaNoTrans, i, nb, c_neg_one,
                         dA(0, i), ldda, dwork,
                         lddwork, d_one, dA(0, 0), ldda);
            
            /* Copy superdiagonal elements back into A, and diagonal
               elements into D */
            for (j = i; j < i+nb; ++j) {
                MAGMA_S_SET2REAL( *A(j-1, j), e[j - 1] );
                d[j] = MAGMA_S_REAL( *A(j, j) );
            }
        }
        
        magma_sgetmatrix( kk, kk, dA(0, 0), ldda, A(0, 0), lda );
        
        /*  Use unblocked code to reduce the last or only block */
        lapackf77_ssytd2(uplo_, &kk, A(0, 0), &lda, d, e, tau, &iinfo);
    }
    else {
        /* Copy the matrix to the GPU */
        if (1<=n-nx)
            magma_ssetmatrix( n, n, A(0,0), lda, dA(0,0), ldda );

        #ifdef FAST_HEMV
        // TODO this leaks memory from da, above
        float *dwork2;
        if (MAGMA_SUCCESS != magma_smalloc( &dwork2, n*n )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
        #endif
        /* Reduce the lower triangle of A */
        for (i = 0; i < n-nx; i += nb) {
            /* Reduce columns i:i+nb-1 to tridiagonal form and form the
               matrix W which is needed to update the unreduced part of
               the matrix */

            /*   Get the current panel (no need for the 1st iteration) */
            if (i!=0)
                magma_sgetmatrix( n-i, nb, dA(i, i), ldda, A(i, i), lda );
            #ifdef FAST_HEMV
            magma_slatrd2(uplo, n-i, nb, A(i, i), lda, &e[i],
                         &tau[i], work, ldwork,
                         dA(i, i), ldda,
                         dwork, lddwork, dwork2, n*n);
            #else
            magma_slatrd(uplo, n-i, nb, A(i, i), lda, &e[i],
                         &tau[i], work, ldwork,
                         dA(i, i), ldda,
                         dwork, lddwork);
            #endif
            /* Update the unreduced submatrix A(i+ib:n,i+ib:n), using
               an update of the form:  A := A - V*W' - W*V' */
            magma_ssetmatrix( n-i, nb, work, ldwork, dwork, lddwork );

            magma_ssyr2k(MagmaLower, MagmaNoTrans, n-i-nb, nb, c_neg_one,
                         dA(i+nb, i), ldda,
                         &dwork[nb], lddwork, d_one,
                         dA(i+nb, i+nb), ldda);
            
            /* Copy subdiagonal elements back into A, and diagonal
               elements into D */
            for (j = i; j < i+nb; ++j) {
                MAGMA_S_SET2REAL( *A(j+1, j), e[j] );
                d[j] = MAGMA_S_REAL( *A(j, j) );
            }
        }

        #ifdef FAST_HEMV
        magma_free( dwork2 );
        #endif

        /* Use unblocked code to reduce the last or only block */
        if (1<=n-nx)
            magma_sgetmatrix( n-i, n-i, dA(i, i), ldda, A(i, i), lda );
        i_n = n-i;
        lapackf77_ssytrd(uplo_, &i_n, A(i, i), &lda, &d[i], &e[i],
                         &tau[i], work, &lwork, &iinfo);
        
    }
    
    magma_free( da );
    MAGMA_S_SET2REAL( work[0], lwkopt );

    return *info;
} /* magma_ssytrd */
Esempio n. 3
0
extern "C" magma_int_t
magma_zlauum(char uplo, magma_int_t n,
             cuDoubleComplex *a, magma_int_t lda, magma_int_t *info)
{
/*  -- MAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2012

        Purpose
        =======

        ZLAUUM computes the product U * U' or L' * L, where the triangular
        factor U or L is stored in the upper or lower triangular part of
        the array A.

        If UPLO = 'U' or 'u' then the upper triangle of the result is stored,
        overwriting the factor U in A.
        If UPLO = 'L' or 'l' then the lower triangle of the result is stored,
        overwriting the factor L in A.
        This is the blocked form of the algorithm, calling Level 3 BLAS.

        Arguments
        =========

        UPLO    (input) CHARACTER*1
                        Specifies whether the triangular factor stored in the array A
                        is upper or lower triangular:
                        = 'U':  Upper triangular
                        = 'L':  Lower triangular

        N       (input) INTEGER
                        The order of the triangular factor U or L.  N >= 0.

        A       (input/output) COPLEX_16 array, dimension (LDA,N)
                        On entry, the triangular factor U or L.
                        On exit, if UPLO = 'U', the upper triangle of A is
                        overwritten with the upper triangle of the product U * U';
                        if UPLO = 'L', the lower triangle of A is overwritten with
                        the lower triangle of the product L' * L.

        LDA     (input) INTEGER
                        The leading dimension of the array A.  LDA >= max(1,N).

        INFO    (output) INTEGER
                        = 0: successful exit
                        < 0: if INFO = -k, the k-th argument had an illegal value

        ===================================================================== */


        /* Local variables */
        char uplo_[2] = {uplo, 0};
        magma_int_t     ldda, nb;
        magma_int_t i, ib;
        cuDoubleComplex    c_one = MAGMA_Z_ONE;
        double             d_one = MAGMA_D_ONE;
        cuDoubleComplex    *work;
        int upper = lapackf77_lsame(uplo_, "U");

        *info = 0;
        if ((! upper) && (! lapackf77_lsame(uplo_, "L")))
                *info = -1;
        else if (n < 0)
                *info = -2;
        else if (lda < max(1,n))
                *info = -4;

        if (*info != 0) {
                magma_xerbla( __func__, -(*info) );
                return *info;
        }

        /* Quick return */
        if ( n == 0 )
                return *info;

        ldda = ((n+31)/32)*32;

        if (MAGMA_SUCCESS != magma_zmalloc( &work, (n)*ldda )) {
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
        }

        cudaStream_t stream[2];
        magma_queue_create( &stream[0] );
        magma_queue_create( &stream[1] );

        nb = magma_get_zpotrf_nb(n);

        if (nb <= 1 || nb >= n)
                lapackf77_zlauum(uplo_, &n, a, &lda, info);
        else
        {
                if (upper)
                {
                        /* Compute the product U * U'. */
                        for (i=0; i<n; i=i+nb)
                        {
                                ib=min(nb,n-i);

                                //cublasSetMatrix(ib, (n-i), sizeof(cuDoubleComplex), A(i, i), lda, dA(i, i), ldda);
                                
                                magma_zsetmatrix_async( ib, ib,
                                                        A(i,i),   lda,
                                                        dA(i, i), ldda, stream[1] );

                                magma_zsetmatrix_async( ib, (n-i-ib),
                                                        A(i,i+ib),  lda,
                                                        dA(i,i+ib), ldda, stream[0] );

                                magma_queue_sync( stream[1] );

                                magma_ztrmm( MagmaRight, MagmaUpper,
                                             MagmaConjTrans, MagmaNonUnit, i, ib,
                                             c_one, dA(i,i), ldda, dA(0, i),ldda);


                                lapackf77_zlauum(MagmaUpperStr, &ib, A(i,i), &lda, info);

                                magma_zsetmatrix_async( ib, ib,
                                                        A(i, i),  lda,
                                                        dA(i, i), ldda, stream[0] );

                                if (i+ib < n)
                                {
                                        magma_zgemm( MagmaNoTrans, MagmaConjTrans,
                                                     i, ib, (n-i-ib), c_one, dA(0,i+ib),
                                                     ldda, dA(i, i+ib),ldda, c_one,
                                                     dA(0,i), ldda);

                                        magma_queue_sync( stream[0] );

                                        magma_zherk( MagmaUpper, MagmaNoTrans, ib,(n-i-ib),
                                                     d_one, dA(i, i+ib), ldda,
                                                     d_one, dA(i, i), ldda);
                                }
                                
                                magma_zgetmatrix( i+ib, ib,
                                                  dA(0, i), ldda,
                                                  A(0, i),  lda );
                        }
                }
                else
                {
                        /* Compute the product L' * L. */
                        for(i=0; i<n; i=i+nb)
                        {
                                ib=min(nb,n-i);
                                //cublasSetMatrix((n-i), ib, sizeof(cuDoubleComplex),
                                //                A(i, i), lda, dA(i, i), ldda);

                                magma_zsetmatrix_async( ib, ib,
                                                        A(i,i),   lda,
                                                        dA(i, i), ldda, stream[1] );

                                magma_zsetmatrix_async( (n-i-ib), ib,
                                                        A(i+ib, i),  lda,
                                                        dA(i+ib, i), ldda, stream[0] );

                                magma_queue_sync( stream[1] );

                                magma_ztrmm( MagmaLeft, MagmaLower,
                                             MagmaConjTrans, MagmaNonUnit, ib,
                                             i, c_one, dA(i,i), ldda,
                                             dA(i, 0),ldda);


                                lapackf77_zlauum(MagmaLowerStr, &ib, A(i,i), &lda, info);

                                //cublasSetMatrix(ib, ib, sizeof(cuDoubleComplex),
                                //                A(i, i), lda, dA(i, i), ldda);

                                magma_zsetmatrix_async( ib, ib,
                                                        A(i, i),  lda,
                                                        dA(i, i), ldda, stream[0] );

                                if (i+ib < n)
                                {
                                        magma_zgemm(MagmaConjTrans, MagmaNoTrans,
                                                        ib, i, (n-i-ib), c_one, dA( i+ib,i),
                                                        ldda, dA(i+ib, 0),ldda, c_one,
                                                        dA(i,0), ldda);

                                        magma_queue_sync( stream[0] );
                                        
                                        magma_zherk(MagmaLower, MagmaConjTrans, ib, (n-i-ib),
                                                        d_one, dA(i+ib, i), ldda,
                                                        d_one, dA(i, i), ldda);
                                }
                                magma_zgetmatrix( ib, i+ib,
                                                  dA(i, 0), ldda,
                                                  A(i, 0),  lda );
                        }
                }
        }
        magma_queue_destroy( stream[0] );
        magma_queue_destroy( stream[1] );

        magma_free( work );

        return *info;

}
Esempio n. 4
0
extern "C" magma_int_t
magma_chegvr(magma_int_t itype, char jobz, char range, char uplo, magma_int_t n,
             magmaFloatComplex *a, magma_int_t lda, magmaFloatComplex *b, magma_int_t ldb,
             float vl, float vu, magma_int_t il, magma_int_t iu, float abstol,
             magma_int_t *m, float *w,  magmaFloatComplex *z, magma_int_t ldz,
             magma_int_t *isuppz, magmaFloatComplex *work, magma_int_t lwork,
             float *rwork, magma_int_t lrwork, magma_int_t *iwork,
             magma_int_t liwork, magma_int_t *info)
{
/*  -- MAGMA (version 1.4.1) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       December 2013
    
    Purpose
    =======
    CHEGVR computes all the eigenvalues, and optionally, the eigenvectors
    of a complex generalized Hermitian-definite eigenproblem, of the form
    A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.  Here A and
    B are assumed to be Hermitian and B is also positive definite.

    Whenever possible, CHEEVR calls CSTEGR to compute the
    eigenspectrum using Relatively Robust Representations.  CSTEGR
    computes eigenvalues by the dqds algorithm, while orthogonal
    eigenvectors are computed from various "good" L D L^T representations
    (also known as Relatively Robust Representations). Gram-Schmidt
    orthogonalization is avoided as far as possible. More specifically,
    the various steps of the algorithm are as follows. For the i-th
    unreduced block of T,
       (a) Compute T - sigma_i = L_i D_i L_i^T, such that L_i D_i L_i^T
            is a relatively robust representation,
       (b) Compute the eigenvalues, lambda_j, of L_i D_i L_i^T to high
           relative accuracy by the dqds algorithm,
       (c) If there is a cluster of close eigenvalues, "choose" sigma_i
           close to the cluster, and go to step (a),
       (d) Given the approximate eigenvalue lambda_j of L_i D_i L_i^T,
           compute the corresponding eigenvector by forming a
           rank-revealing twisted factorization.
    The desired accuracy of the output can be specified by the input
    parameter ABSTOL.

    For more details, see "A new O(n^2) algorithm for the symmetric
    tridiagonal eigenvalue/eigenvector problem", by Inderjit Dhillon,
    Computer Science Division Technical Report No. UCB//CSD-97-971,
    UC Berkeley, May 1997.


    Note 1 : CHEEVR calls CSTEGR when the full spectrum is requested
    on machines which conform to the ieee-754 floating point standard.
    CHEEVR calls SSTEBZ and CSTEIN on non-ieee machines and
    when partial spectrum requests are made.

    Normal execution of CSTEGR may create NaNs and infinities and
    hence may abort due to a floating point exception in environments
    which do not handle NaNs and infinities in the ieee standard default
    manner.

    Arguments
    =========
    ITYPE   (input) INTEGER
            Specifies the problem type to be solved:
            = 1:  A*x = (lambda)*B*x
            = 2:  A*B*x = (lambda)*x
            = 3:  B*A*x = (lambda)*x

    RANGE   (input) CHARACTER*1
            = 'A': all eigenvalues will be found.
            = 'V': all eigenvalues in the half-open interval (VL,VU]
                   will be found.
            = 'I': the IL-th through IU-th eigenvalues will be found.

    JOBZ    (input) CHARACTER*1
            = 'N':  Compute eigenvalues only;
            = 'V':  Compute eigenvalues and eigenvectors.

    UPLO    (input) CHARACTER*1
            = 'U':  Upper triangles of A and B are stored;
            = 'L':  Lower triangles of A and B are stored.

    N       (input) INTEGER
            The order of the matrices A and B.  N >= 0.

    A       (input/output) COMPLEX array, dimension (LDA, N)
            On entry, the Hermitian matrix A.  If UPLO = 'U', the
            leading N-by-N upper triangular part of A contains the
            upper triangular part of the matrix A.  If UPLO = 'L',
            the leading N-by-N lower triangular part of A contains
            the lower triangular part of the matrix A.

            On exit, if JOBZ = 'V', then if INFO = 0, A contains the
            matrix Z of eigenvectors.  The eigenvectors are normalized
            as follows:
            if ITYPE = 1 or 2, Z**H*B*Z = I;
            if ITYPE = 3, Z**H*inv(B)*Z = I.
            If JOBZ = 'N', then on exit the upper triangle (if UPLO='U')
            or the lower triangle (if UPLO='L') of A, including the
            diagonal, is destroyed.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    B       (input/output) COMPLEX array, dimension (LDB, N)
            On entry, the Hermitian matrix B.  If UPLO = 'U', the
            leading N-by-N upper triangular part of B contains the
            upper triangular part of the matrix B.  If UPLO = 'L',
            the leading N-by-N lower triangular part of B contains
            the lower triangular part of the matrix B.

            On exit, if INFO <= N, the part of B containing the matrix is
            overwritten by the triangular factor U or L from the Cholesky
            factorization B = U**H*U or B = L*L**H.

    LDB     (input) INTEGER
            The leading dimension of the array B.  LDB >= max(1,N).

    VL      (input) REAL
    VU      (input) REAL
            If RANGE='V', the lower and upper bounds of the interval to
            be searched for eigenvalues. VL < VU.
            Not referenced if RANGE = 'A' or 'I'.

    IL      (input) INTEGER
    IU      (input) INTEGER
            If RANGE='I', the indices (in ascending order) of the
            smallest and largest eigenvalues to be returned.
            1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
            Not referenced if RANGE = 'A' or 'V'.

    ABSTOL  (input) REAL
            The absolute error tolerance for the eigenvalues.
            An approximate eigenvalue is accepted as converged
            when it is determined to lie in an interval [a,b]
            of width less than or equal to

                    ABSTOL + EPS *   max( |a|,|b| ) ,

            where EPS is the machine precision.  If ABSTOL is less than
            or equal to zero, then  EPS*|T|  will be used in its place,
            where |T| is the 1-norm of the tridiagonal matrix obtained
            by reducing A to tridiagonal form.

            See "Computing Small Singular Values of Bidiagonal Matrices
            with Guaranteed High Relative Accuracy," by Demmel and
            Kahan, LAPACK Working Note #3.

            If high relative accuracy is important, set ABSTOL to
            SLAMCH( 'Safe minimum' ).  Doing so will guarantee that
            eigenvalues are computed to high relative accuracy when
            possible in future releases.  The current code does not
            make any guarantees about high relative accuracy, but
            furutre releases will. See J. Barlow and J. Demmel,
            "Computing Accurate Eigensystems of Scaled Diagonally
            Dominant Matrices", LAPACK Working Note #7, for a discussion
            of which matrices define their eigenvalues to high relative
            accuracy.

    M       (output) INTEGER
            The total number of eigenvalues found.  0 <= M <= N.
            If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1.

    W       (output) REAL array, dimension (N)
            If INFO = 0, the eigenvalues in ascending order.

    Z       (output) COMPLEX array, dimension (LDZ, max(1,M))
            If JOBZ = 'V', then if INFO = 0, the first M columns of Z
            contain the orthonormal eigenvectors of the matrix A
            corresponding to the selected eigenvalues, with the i-th
            column of Z holding the eigenvector associated with W(i).
            If JOBZ = 'N', then Z is not referenced.
            Note: the user must ensure that at least max(1,M) columns are
            supplied in the array Z; if RANGE = 'V', the exact value of M
            is not known in advance and an upper bound must be used.

    LDZ     (input) INTEGER
            The leading dimension of the array Z.  LDZ >= 1, and if
            JOBZ = 'V', LDZ >= max(1,N).

    ISUPPZ  (output) INTEGER ARRAY, dimension ( 2*max(1,M) )
            The support of the eigenvectors in Z, i.e., the indices
            indicating the nonzero elements in Z. The i-th eigenvector
            is nonzero only in elements ISUPPZ( 2*i-1 ) through
            ISUPPZ( 2*i ).
   ********* Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1

    WORK    (workspace/output) COMPLEX array, dimension (LWORK)
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.

    LWORK   (input) INTEGER
            The length of the array WORK.  LWORK >= max(1,2*N).
            For optimal efficiency, LWORK >= (NB+1)*N,
            where NB is the max of the blocksize for CHETRD and for
            CUNMTR as returned by ILAENV.

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    RWORK   (workspace/output) REAL array, dimension (LRWORK)
            On exit, if INFO = 0, RWORK(1) returns the optimal
            (and minimal) LRWORK.

    LRWORK  (input) INTEGER
            The length of the array RWORK.  LRWORK >= max(1,24*N).

            If LRWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the RWORK array, returns
            this value as the first entry of the RWORK array, and no error
            message related to LRWORK is issued by XERBLA.

    IWORK   (workspace/output) INTEGER array, dimension (LIWORK)
            On exit, if INFO = 0, IWORK(1) returns the optimal
            (and minimal) LIWORK.

    LIWORK  (input) INTEGER
            The dimension of the array IWORK.  LIWORK >= max(1,10*N).

            If LIWORK = -1, then a workspace query is assumed; the
            routine only calculates the optimal size of the IWORK array,
            returns this value as the first entry of the IWORK array, and
            no error message related to LIWORK is issued by XERBLA.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
            > 0:  Internal error

    Further Details
    ===============
    Based on contributions by
       Inderjit Dhillon, IBM Almaden, USA
       Osni Marques, LBNL/NERSC, USA
       Ken Stanley, Computer Science Division, University of
         California at Berkeley, USA
    =====================================================================  */
    
    char uplo_[2] = {uplo, 0};
    char jobz_[2] = {jobz, 0};
    char range_[2] = {range, 0};
    
    magmaFloatComplex c_one = MAGMA_C_ONE;
    
    magmaFloatComplex *da;
    magmaFloatComplex *db;
    magmaFloatComplex *dz;
    magma_int_t ldda = n;
    magma_int_t lddb = n;
    magma_int_t lddz = n;
    
    magma_int_t lower;
    char trans[1];
    magma_int_t wantz;
    magma_int_t lquery;
    magma_int_t alleig, valeig, indeig;
    
    magma_int_t lwmin, lrwmin, liwmin;
    
    magma_queue_t stream;
    magma_queue_create( &stream );
    
    wantz = lapackf77_lsame(jobz_, MagmaVecStr);
    lower = lapackf77_lsame(uplo_, MagmaLowerStr);
    alleig = lapackf77_lsame(range_, "A");
    valeig = lapackf77_lsame(range_, "V");
    indeig = lapackf77_lsame(range_, "I");
    lquery = lwork == -1;
    
    *info = 0;
    if (itype < 1 || itype > 3) {
        *info = -1;
    } else if (! (alleig || valeig || indeig)) {
        *info = -2;
    } else if (! (wantz || lapackf77_lsame(jobz_, MagmaNoVecStr))) {
        *info = -3;
    } else if (! (lower || lapackf77_lsame(uplo_, MagmaUpperStr))) {
        *info = -4;
    } else if (n < 0) {
        *info = -5;
    } else if (lda < max(1,n)) {
        *info = -7;
    } else if (ldb < max(1,n)) {
        *info = -9;
    } else if (ldz < 1 || (wantz && ldz < n)) {
        *info = -18;
    } else {
        if (valeig) {
            if (n > 0 && vu <= vl) {
                *info = -11;
            }
        } else if (indeig) {
            if (il < 1 || il > max(1,n)) {
                *info = -12;
            } else if (iu < min(n,il) || iu > n) {
                *info = -13;
            }
        }
    }
    
    magma_int_t nb = magma_get_chetrd_nb(n);
    
    lwmin =  n * (nb + 1);
    lrwmin = 24 * n;
    liwmin = 10 * n;
    
    work[0] = MAGMA_C_MAKE( lwmin, 0 );
    rwork[0] = lrwmin;
    iwork[0] = liwmin;
    
    if (lwork < lwmin && ! lquery) {
        *info = -21;
    } else if ((lrwork < lrwmin) && ! lquery) {
        *info = -23;
    } else if ((liwork < liwmin) && ! lquery) {
        *info = -25;
    }
    
    if (*info != 0) {
        magma_xerbla( __func__, -(*info));
        return *info;
    } else if (lquery) {
        return *info;
    }
    
    /* Quick return if possible */
    if (n == 0) {
        return *info;
    }
    
    if (MAGMA_SUCCESS != magma_cmalloc( &da, n*ldda ) ||
        MAGMA_SUCCESS != magma_cmalloc( &db, n*lddb ) ||
        MAGMA_SUCCESS != magma_cmalloc( &dz, n*lddz )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
    
    /* Form a Cholesky factorization of B. */
    magma_csetmatrix( n, n, b, ldb, db, lddb );
    
    magma_csetmatrix_async( n, n,
                            a,  lda,
                            da, ldda, stream );
    
    magma_cpotrf_gpu(uplo_[0], n, db, lddb, info);
    if (*info != 0) {
        *info = n + *info;
        return *info;
    }
    
    magma_queue_sync( stream );
    
    magma_cgetmatrix_async( n, n,
                            db, lddb,
                            b,  ldb, stream );
    
    /* Transform problem to standard eigenvalue problem and solve. */
    magma_chegst_gpu(itype, uplo, n, da, ldda, db, lddb, info);
    
    magma_cheevr_gpu(jobz, range, uplo, n, da, ldda, vl, vu, il, iu, abstol,
                     m, w, dz, lddz, isuppz, a, lda, z, ldz, work, lwork,
                     rwork, lrwork, iwork, liwork, info);
    
    if (wantz && *info == 0) {
        /* Backtransform eigenvectors to the original problem. */
    
        if (itype == 1 || itype == 2) {
            /* For A*x=(lambda)*B*x and A*B*x=(lambda)*x;
               backtransform eigenvectors: x = inv(L)'*y or inv(U)*y */
            if (lower) {
                *(unsigned char *)trans = MagmaConjTrans;
            } else {
                *(unsigned char *)trans = MagmaNoTrans;
            }
            
            magma_ctrsm(MagmaLeft, uplo, *trans, MagmaNonUnit, n, *m, c_one,
                          db, lddb, dz, lddz);
        }
        else if (itype == 3) {
            /* For B*A*x=(lambda)*x;
               backtransform eigenvectors: x = L*y or U'*y */
            if (lower) {
                *(unsigned char *)trans = MagmaNoTrans;
            } else {
                *(unsigned char *)trans = MagmaConjTrans;
            }
            
            magma_ctrmm(MagmaLeft, uplo, *trans, MagmaNonUnit, n, *m, c_one,
                          db, lddb, dz, lddz);
        }
        
        magma_cgetmatrix( n, *m, dz, lddz, z, ldz );
        
    }
    
    magma_queue_sync( stream );
    
    magma_queue_destroy( stream );
    
    magma_free( da );
    magma_free( db );
    magma_free( dz );
    
    return *info;
} /* chegvr */
Esempio n. 5
0
extern "C" magma_int_t
magma_cgetrs_gpu(char trans, magma_int_t n, magma_int_t nrhs, 
                 cuFloatComplex *dA, magma_int_t ldda,
                 magma_int_t *ipiv, 
                 cuFloatComplex *dB, magma_int_t lddb, 
                 magma_int_t *info)
{
/*  -- MAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2012

    Purpose
    =======

    Solves a system of linear equations
      A * X = B  or  A' * X = B
    with a general N-by-N matrix A using the LU factorization computed by CGETRF_GPU.

    Arguments
    =========

    TRANS   (input) CHARACTER*1
            Specifies the form of the system of equations:
            = 'N':  A * X = B  (No transpose)
            = 'T':  A'* X = B  (Transpose)
            = 'C':  A'* X = B  (Conjugate transpose = Transpose)

    N       (input) INTEGER
            The order of the matrix A.  N >= 0.

    NRHS    (input) INTEGER
            The number of right hand sides, i.e., the number of columns
            of the matrix B.  NRHS >= 0.

    A       (input) COMPLEX array on the GPU, dimension (LDA,N)
            The factors L and U from the factorization A = P*L*U as computed
            by CGETRF_GPU.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    IPIV    (input) INTEGER array, dimension (N)
            The pivot indices from CGETRF; for 1<=i<=N, row i of the
            matrix was interchanged with row IPIV(i).

    B       (input/output) COMPLEX array on the GPU, dimension (LDB,NRHS)
            On entry, the right hand side matrix B.
            On exit, the solution matrix X.

    LDB     (input) INTEGER
            The leading dimension of the array B.  LDB >= max(1,N).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value

    HWORK   (workspace) COMPLEX array, dimension N*NRHS
    =====================================================================    */


    cuFloatComplex c_one = MAGMA_C_ONE;
    cuFloatComplex *work = NULL;
    char            trans_[2] = {trans, 0};
    int notran = lapackf77_lsame(trans_, "N");
    magma_int_t i1, i2, inc;

    *info = 0;
    if ( (! notran) && 
         (! lapackf77_lsame(trans_, "T")) && 
         (! lapackf77_lsame(trans_, "C")) ) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (nrhs < 0) {
        *info = -3;
    } else if (ldda < max(1,n)) {
        *info = -5;
    } else if (lddb < max(1,n)) {
        *info = -8;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (n == 0 || nrhs == 0) {
        return *info;
    }

    magma_cmalloc_cpu( &work, n * nrhs );
    if ( work == NULL ) {
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }
      
    i1 = 1;
    i2 = n;
    if (notran) {
        inc = 1;

        /* Solve A * X = B. */
        magma_cgetmatrix( n, nrhs, dB, lddb, work, n );
        lapackf77_claswp(&nrhs, work, &n, &i1, &i2, ipiv, &inc);
        magma_csetmatrix( n, nrhs, work, n, dB, lddb );

        if ( nrhs == 1) {
            magma_ctrsv(MagmaLower, MagmaNoTrans, MagmaUnit,    n, dA, ldda, dB, 1 );
            magma_ctrsv(MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, dA, ldda, dB, 1 );
        } else {
            magma_ctrsm(MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit,    n, nrhs, c_one, dA, ldda, dB, lddb );
            magma_ctrsm(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb );
        }
    } else {
        inc = -1;

        /* Solve A' * X = B. */
        if ( nrhs == 1) {
            magma_ctrsv(MagmaUpper, trans, MagmaNonUnit, n, dA, ldda, dB, 1 );
            magma_ctrsv(MagmaLower, trans, MagmaUnit,    n, dA, ldda, dB, 1 );
        } else {
            magma_ctrsm(MagmaLeft, MagmaUpper, trans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb );
            magma_ctrsm(MagmaLeft, MagmaLower, trans, MagmaUnit,    n, nrhs, c_one, dA, ldda, dB, lddb );
        }

        magma_cgetmatrix( n, nrhs, dB, lddb, work, n );
        lapackf77_claswp(&nrhs, work, &n, &i1, &i2, ipiv, &inc);
        magma_csetmatrix( n, nrhs, work, n, dB, lddb );
    }
    magma_free_cpu(work);

    return *info;
}
Esempio n. 6
0
extern "C" magma_int_t
magma_sormqr_gpu_2stages(const char side, const char trans,
                         magma_int_t m, magma_int_t n, magma_int_t k,
                         float *da,   magma_int_t ldda,
                         float *dc,    magma_int_t lddc,
                         float *dT,    magma_int_t nb,
                         magma_int_t *info)
{
    /*  -- MAGMA (version 1.3.0) --
     Univ. of Tennessee, Knoxville
     Univ. of California, Berkeley
     Univ. of Colorado, Denver
     November 2012

     Purpose
     =======
     SORMQR_GPU overwrites the general real M-by-N matrix C with

     SIDE = 'L'     SIDE = 'R'
     TRANS = 'N':      Q * C          C * Q
     TRANS = 'T':      Q**T * C       C * Q**T

     where Q is a real orthogonal matrix defined as the product of k
     elementary reflectors

     Q = H(1) H(2) . . . H(k)

     as returned by SGEQRF. Q is of order M if SIDE = 'L' and of order N
     if SIDE = 'R'.

     Arguments
     =========
     SIDE    (input) CHARACTER*1
     = 'L': apply Q or Q**T from the Left;
     = 'R': apply Q or Q**T from the Right.

     TRANS   (input) CHARACTER*1
     = 'N':  No transpose, apply Q;
     = 'T':  Transpose, apply Q**T.

     M       (input) INTEGER
     The number of rows of the matrix C. M >= 0.

     N       (input) INTEGER
     The number of columns of the matrix C. N >= 0.

     K       (input) INTEGER
     The number of elementary reflectors whose product defines
     the matrix Q.
     If SIDE = 'L', M >= K >= 0;
     if SIDE = 'R', N >= K >= 0.

     DA      (input) REAL array on the GPU, dimension (LDDA,K)
     The i-th column must contain the vector which defines the
     elementary reflector H(i), for i = 1,2,...,k, as returned by
     SGEQRF in the first k columns of its array argument DA.
     DA is modified by the routine but restored on exit.

     LDDA    (input) INTEGER
     The leading dimension of the array DA.
     If SIDE = 'L', LDDA >= max(1,M);
     if SIDE = 'R', LDDA >= max(1,N).

     DC      (input/output) REAL array on the GPU, dimension (LDDC,N)
     On entry, the M-by-N matrix C.
     On exit, C is overwritten by Q*C or Q**T * C or C * Q**T or C*Q.

     LDDC     (input) INTEGER
     The leading dimension of the array DC. LDDC >= max(1,M).

     DT      (input) REAL array on the GPU that is the output
     (the 9th argument) of magma_sgeqrf_gpu.

     NB      (input) INTEGER
     This is the blocking size that was used in pre-computing DT, e.g.,
     the blocking size used in magma_sgeqrf_gpu.

     INFO    (output) INTEGER
     = 0:  successful exit
     < 0:  if INFO = -i, the i-th argument had an illegal value
     =====================================================================   */

    float c_one = MAGMA_S_ONE;

    char side_[2] = {side, 0};
    char trans_[2] = {trans, 0};

    float *dwork;

    magma_int_t i1, i2, i3, ib, ic, jc, mi, ni, nq, nw, ret;
    int left, notran;
    magma_int_t lwkopt;

    *info = 0;
    left   = lapackf77_lsame(side_, "L");
    notran = lapackf77_lsame(trans_, "N");

    /* NQ is the order of Q and NW is the minimum dimension of WORK */
    if (left) {
        nq = m;
        nw = n;
    } else {
        nq = n;
        nw = m;
    }
    if ( (!left) && (!lapackf77_lsame(side_, "R")) ) {
        *info = -1;
    } else if ( (!notran) && (!lapackf77_lsame(trans_, MagmaTransStr)) ) {
        *info = -2;
    } else if (m < 0) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (k < 0 || k > nq) {
        *info = -5;
    } else if (ldda < max(1,nq)) {
        *info = -7;
    } else if (lddc < max(1,m)) {
        *info = -10;
    }

    if(MAGMA_SUCCESS != magma_smalloc( &dwork, n*nb )) {
        printf ("!!!! sorgqr_2stage magma_alloc failed for: dwork\n" );
        exit(-1);
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0 || k == 0) {
        return *info;
    }

    if ( (left && (! notran)) || ( (!left) && notran ) ) {
        i1 = 0;
        i2 = k;
        i3 = nb;
    } else {
        i1 = (k - 1) / nb * nb;
        i2 = 0;
        i3 = -nb;
    }

    if (left) {
        ni = n;
        jc = 0;
    } else {
        mi = m;
        ic = 0;
    }

    for (magma_int_t i=i1; i3<0 ? i>=i2 : i<i2; i+=i3)
    {
        ib = min(nb, k - i);
        if (left){
            mi = m - i;
            ic = i;
        }
        else {
            ni = n - i;
            jc = i;
        }
        ret = magma_slarfb_gpu( MagmaLeft, trans, MagmaForward, MagmaColumnwise,
                               mi, ni, ib, da+i+i*ldda, ldda, dT+i*nb, nb,
                               dc+ic+jc*lddc, lddc, dwork, nw);

        if ( ret != MAGMA_SUCCESS ){
            magma_free(dwork);
            return ret;
        }
    }

   return MAGMA_SUCCESS;
}   /* End of MAGMA_SORMQR_GPU_2stages */
Esempio n. 7
0
extern "C" magma_int_t
magma_spotrf_m(magma_int_t num_gpus0, char uplo, magma_int_t n,
               float *a, magma_int_t lda, magma_int_t *info)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    SPOTRF_OOC computes the Cholesky factorization of a real symmetric
    positive definite matrix A. This version does not require work
    space on the GPU passed as input. GPU memory is allocated in the
    routine. The matrix A may not fit entirely in the GPU memory.

    The factorization has the form
       A = U**T * U,  if UPLO = 'U', or
       A = L  * L**T, if UPLO = 'L',
    where U is an upper triangular matrix and L is lower triangular.

    This is the block version of the algorithm, calling Level 3 BLAS.

    Arguments
    =========
    UPLO    (input) CHARACTER*1
            = 'U':  Upper triangle of A is stored;
            = 'L':  Lower triangle of A is stored.

    N       (input) INTEGER
            The order of the matrix A.  N >= 0.

    A       (input/output) REAL array, dimension (LDA,N)
            On entry, the symmetric matrix A.  If UPLO = 'U', the leading
            N-by-N upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = 'L', the
            leading N-by-N lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.

            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization A = U**T * U or A = L * L**T.

            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
            > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.

    =====================================================================    */


    /* Local variables */
    float                 d_one     =  1.0;
    float                 d_neg_one = -1.0;
    float     c_one     = MAGMA_S_ONE;
    float     c_neg_one = MAGMA_S_NEG_ONE;
    char                   uplo_[2]  = {uplo, 0};
    int                    upper     = lapackf77_lsame(uplo_, "U");

    float *dwork[MagmaMaxGPUs], *dt[MagmaMaxGPUs];
    magma_int_t     ldda, lddla, nb, iinfo, n_local[MagmaMaxGPUs], J2, d, num_gpus;
    magma_int_t     j, jj, jb, J, JB, NB, MB, h;
    magma_queue_t   stream[MagmaMaxGPUs][3];
    magma_event_t   event[MagmaMaxGPUs][5];
    #ifdef ROW_MAJOR_PROFILE
    magma_timestr_t start, end, start0, end0;
    float chol_time = 1.0;
    #endif
    *info = 0;
    if ((! upper) && (! lapackf77_lsame(uplo_, "L"))) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,n)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return */
    if ( n == 0 )
        return *info;

    nb = magma_get_dpotrf_nb(n);
    if( num_gpus0 > n/nb ) {
        num_gpus = n/nb;
        if( n%nb != 0 ) num_gpus ++;
    } else {
        num_gpus = num_gpus0;
    }
    //ldda  = ((n+31)/32)*32;
    ldda  = ((n+nb-1)/nb)*nb;
    lddla = ((nb*((n+nb*num_gpus-1)/(nb*num_gpus))+31)/32)*32;

    /* figure out NB */
    size_t freeMem, totalMem;
    cudaMemGetInfo( &freeMem, &totalMem );
    freeMem /= sizeof(float);
    
    MB = n;  /* number of rows in the big panel    */
    NB = (magma_int_t)((0.8*freeMem-max(2,num_gpus)*nb*ldda-(n+nb)*nb)/lddla); /* number of columns in the big panel */
    //NB = min(5*nb,n);

    if( NB >= n ) {
        #ifdef CHECK_SPOTRF_OOC
        printf( "      * still fit in GPU memory.\n" );
        #endif
        NB = n;
    } else {
        #ifdef CHECK_SPOTRF_OOC
        printf( "      * don't fit in GPU memory.\n" );
        #endif
        NB = (NB/nb) * nb;   /* making sure it's devisable by nb   */
    }
    #ifdef CHECK_SPOTRF_OOC
    if( NB != n ) printf( "      * running in out-core mode (n=%d, NB=%d, nb=%d, lddla=%d, freeMem=%.2e).\n",n,NB,nb,lddla,(float)freeMem );
    else          printf( "      * running in in-core mode  (n=%d, NB=%d, nb=%d, lddla=%d, freeMem=%.2e).\n",n,NB,nb,lddla,(float)freeMem );
    fflush(stdout);
    #endif
    for (d=0; d<num_gpus; d++ ) {
        magma_setdevice(d);
        if (MAGMA_SUCCESS != magma_smalloc( &dt[d], NB*lddla + max(2,num_gpus)*nb*ldda )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
        dwork[d] = &dt[d][max(2,num_gpus)*nb*ldda];
        
        for( j=0; j<3; j++ )
            magma_queue_create( &stream[d][j] );
        for( j=0; j<5; j++ )
            magma_event_create( &event[d][j]  );
        magma_device_sync(); // synch the device
    }
    magma_setdevice(0);

    #ifdef ROW_MAJOR_PROFILE
    start0 = get_current_time();
    #endif

    if (nb <= 1 || nb >= n) {
        lapackf77_spotrf(uplo_, &n, a, &lda, info);
    } else {

    /* Use hybrid blocked code. */
    if (upper) {
        /* =========================================================== *
         * Compute the Cholesky factorization A = U'*U.                *
         * big panel is divided by block-row and distributed in block  *
         * column cyclic format                                        */
        
        /* for each big-panel */
        for( J=0; J<n; J+=NB ) {
            JB = min(NB,n-J);
            if( num_gpus0 > (n-J)/nb ) {
                num_gpus = (n-J)/nb;
                if( (n-J)%nb != 0 ) num_gpus ++;
            } else {
                num_gpus = num_gpus0;
            }
            
            /* load the new big-panel by block-rows */
            magma_shtodpo( num_gpus, &uplo, JB, n, J, J, nb, a, lda, dwork, NB, stream, &iinfo);
            
            #ifdef ROW_MAJOR_PROFILE
            start = get_current_time();
            #endif      
            /* update with the previous big-panels */
            for( j=0; j<J; j+=nb ) {
                /* upload the diagonal of the block column (broadcast to all GPUs) */
                for( d=0; d<num_gpus; d++ ) {
                    magma_setdevice(d);
                    magma_ssetmatrix_async( nb, JB,
                                            A(j, J),       lda,
                                            dTup(d, 0, J), nb,
                                            stream[d][0] );
                    n_local[d] = 0;
                }
                
                /* distribute off-diagonal blocks to GPUs */
                for( jj=J+JB; jj<n; jj+=nb ) {
                    d  = ((jj-J)/nb)%num_gpus;
                    magma_setdevice(d);
                    
                    jb = min(nb, n-jj);
                    magma_ssetmatrix_async( nb, jb,
                                            A(j, jj),                    lda,
                                            dTup(d, 0, J+JB+n_local[d]), nb,
                                            stream[d][0] );
                    n_local[d] += jb;
                }
                
                /* wait for the communication */
                for( d=0; d<num_gpus; d++ ) {
                    magma_setdevice(d);
                    magma_queue_sync( stream[d][0] );
                }
                
                /* update the current big-panel using the previous block-row */
                /* -- process the big diagonal block of the big panel */
                for( jj=0; jj<JB; jj+=nb ) { // jj is 'local' column index within the big panel
                    d  = (jj/nb)%num_gpus;
                    J2 = jj/(nb*num_gpus);
                    
                    magma_setdevice(d);
                    magmablasSetKernelStream(stream[d][J2%2]); // the last stream (2) used to process off-diagonal
                    J2 = nb*J2;

                    jb = min(nb,JB-jj); // number of columns in this current block-row
                    magma_sgemm( MagmaTrans, MagmaNoTrans,
                                 jj, jb, nb,
                                 c_neg_one, dTup(d, 0, J   ), nb,
                                            dTup(d, 0, J+jj), nb,
                                 c_one,     dAup(d, 0, J2), NB);
                    
                    magma_ssyrk(MagmaUpper, MagmaTrans, jb, nb,
                                d_neg_one, dTup(d, 0,  J+jj), nb,
                                d_one,     dAup(d, jj, J2), NB);
                }
                /* -- process the remaining big off-diagonal block of the big panel */
                if( n > J+JB ) { 
                    for( d=0; d<num_gpus; d++ ) {
                        magma_setdevice(d);
                        magmablasSetKernelStream(stream[d][2]);
                        
                        /* local number of columns in the big panel */
                        n_local[d] = ((n-J)/(nb*num_gpus))*nb;
                        if (d < ((n-J)/nb)%num_gpus)
                            n_local[d] += nb;
                        else if (d == ((n-J)/nb)%num_gpus)
                            n_local[d] += (n-J)%nb;
                        
                        /* subtracting the local number of columns in the diagonal */
                        J2 = nb*(JB/(nb*num_gpus));
                        if( d < (JB/nb)%num_gpus ) J2+=nb;

                        n_local[d] -= J2;
                        
                        magma_sgemm( MagmaTrans, MagmaNoTrans,
                                     JB, n_local[d], nb,
                                     c_neg_one, dTup(d, 0, J   ), nb,
                                                dTup(d, 0, J+JB), nb,
                                     c_one,     dAup(d, 0, J2), NB);
                    }
                }
                
                /* wait for the previous updates */
                for( d=0; d<num_gpus; d++ ) {
                    magma_setdevice(d);
                    for( jj=0; jj<3; jj++ )
                        magma_queue_sync( stream[d][jj] );
                    magmablasSetKernelStream(NULL);
                }
                magma_setdevice(0);
            } /* end of updates with previous rows */
            
            /* factor the big panel */
            h  = (JB+nb-1)/nb; // big diagonal of big panel will be on CPU
            // using two streams
            //magma_spotrf2_mgpu(num_gpus, uplo, JB, n-J, J, J, nb,
            //                   dwork, NB, dt, ldda, a, lda, h, stream, event, &iinfo);
            // using three streams
            magma_spotrf3_mgpu(num_gpus, uplo, JB, n-J, J, J, nb,
                               dwork, NB, dt, ldda, a, lda, h, stream, event, &iinfo);
            if( iinfo != 0 ) {
                *info = J+iinfo;
                break;
            }
            #ifdef ROW_MAJOR_PROFILE
            end = get_current_time();
            chol_time += GetTimerValue(start, end);
            #endif      
            
            /* upload the off-diagonal (and diagonal!!!) big panel */
            magma_sdtohpo(num_gpus, &uplo, JB, n, J, J, nb, NB, a, lda, dwork, NB, stream, &iinfo);
            //magma_sdtohpo(num_gpus, &uplo, JB, n, J, J, nb, 0, a, lda, dwork, NB, stream, &iinfo);
        }
    } else {
        /* ========================================================= *
         * Compute the Cholesky factorization A = L*L'.              */
        
        /* for each big-panel */
        for( J=0; J<n; J+=NB ) {
            JB = min(NB,n-J);
            if( num_gpus0 > (n-J)/nb ) {
                num_gpus = (n-J)/nb;
                if( (n-J)%nb != 0 ) num_gpus ++;
            } else {
                num_gpus = num_gpus0;
            }
            
            /* load the new big-panel by block-columns */
            magma_shtodpo( num_gpus, &uplo, n, JB, J, J, nb, a, lda, dwork, lddla, stream, &iinfo);
            
            /* update with the previous big-panels */
            #ifdef ROW_MAJOR_PROFILE
            start = get_current_time();
            #endif      
            for( j=0; j<J; j+=nb ) {
                /* upload the diagonal of big panel */
                for( d=0; d<num_gpus; d++ ) {
                    magma_setdevice(d);
                    magma_ssetmatrix_async( JB, nb,
                                            A(J, j),     lda,
                                            dT(d, J, 0), ldda,
                                            stream[d][0] );
                    n_local[d] = 0;
                }
                
                /* upload off-diagonals */
                for( jj=J+JB; jj<n; jj+=nb ) {
                    d  = ((jj-J)/nb)%num_gpus;
                    magma_setdevice(d);
                    
                    jb = min(nb, n-jj);
                    magma_ssetmatrix_async( jb, nb,
                                            A(jj, j),                  lda,
                                            dT(d, J+JB+n_local[d], 0), ldda,
                                            stream[d][0] );
                    n_local[d] += jb;
                }
                
                /* wait for the communication */
                for( d=0; d<num_gpus; d++ ) {
                    magma_setdevice(d);
                    magma_queue_sync( stream[d][0] );
                }
                
                /* update the current big-panel using the previous block-row */
                for( jj=0; jj<JB; jj+=nb ) { /* diagonal */
                    d  = (jj/nb)%num_gpus;
                    J2 = jj/(nb*num_gpus);
                    
                    magma_setdevice(d);
                    magmablasSetKernelStream(stream[d][J2%2]);
                    
                    J2 = nb*J2;
                    jb = min(nb,JB-jj);
                    magma_sgemm( MagmaNoTrans, MagmaTrans,
                                 jb, jj, nb,
                                 c_neg_one, dT(d, J+jj, 0), ldda,
                                            dT(d, J,    0), ldda,
                                 c_one,     dA(d, J2,   0), lddla);
                    
                    magma_ssyrk(MagmaLower, MagmaNoTrans, jb, nb,
                                d_neg_one, dT(d, J+jj, 0), ldda,
                                d_one,     dA(d, J2,  jj), lddla);
                }
                
                if( n > J+JB ) { /* off-diagonal */
                    for( d=0; d<num_gpus; d++ ) {
                        magma_setdevice(d);
                        magmablasSetKernelStream(stream[d][2]);
                        
                        /* local number of columns in the big panel */
                        n_local[d] = (((n-J)/nb)/num_gpus)*nb;
                        if (d < ((n-J)/nb)%num_gpus)
                            n_local[d] += nb;
                        else if (d == ((n-J)/nb)%num_gpus)
                            n_local[d] += (n-J)%nb;
                        
                        /* subtracting local number of columns in diagonal */
                        J2 = nb*(JB/(nb*num_gpus));
                        if( d < (JB/nb)%num_gpus ) J2+=nb;

                        n_local[d] -= J2;
                        
                        magma_sgemm( MagmaNoTrans, MagmaTrans,
                                     n_local[d], JB, nb,
                                     c_neg_one, dT(d, J+JB, 0), ldda,
                                                dT(d, J,    0), ldda,
                                     c_one,     dA(d, J2,   0), lddla);
                    }
                }
                /* wait for the previous updates */
                for( d=0; d<num_gpus; d++ ) {
                    magma_setdevice(d);
                    for( jj=0; jj<3; jj++ ) 
                        magma_queue_sync( stream[d][jj] );
                    magmablasSetKernelStream(NULL);
                }
                magma_setdevice(0);
            }
            
            /* factor the big panel */
            h = (JB+nb-1)/nb; // big diagonal of big panel will be on CPU
            // using two streams
            //magma_spotrf2_mgpu(num_gpus, uplo, n-J, JB, J, J, nb,
            //                   dwork, lddla, dt, ldda, a, lda, h, stream, event, &iinfo);
            // using three streams
            magma_spotrf3_mgpu(num_gpus, uplo, n-J, JB, J, J, nb,
                               dwork, lddla, dt, ldda, a, lda, h, stream, event, &iinfo);
            if( iinfo != 0 ) {
                *info = J+iinfo;
                break;
            }
            #ifdef ROW_MAJOR_PROFILE
            end = get_current_time();
            chol_time += GetTimerValue(start, end);
            #endif      
            /* upload the off-diagonal big panel */
            magma_sdtohpo( num_gpus, &uplo, n, JB, J, J, nb, JB, a, lda, dwork, lddla, stream, &iinfo);
        
        } /* end of for J */
    } /* if upper */
    } /* if nb */
    #ifdef ROW_MAJOR_PROFILE
    end0 = get_current_time();
    #endif
    if( num_gpus0 > n/nb ) {
        num_gpus = n/nb;
        if( n%nb != 0 ) num_gpus ++;
    } else {
        num_gpus = num_gpus0;
    }
    for (d=0; d<num_gpus; d++ ) {
        magma_setdevice(d);

        for( j=0; j<3; j++ ) {
            if( stream[d][j] != NULL ) magma_queue_destroy( stream[d][j] );
        }
        magma_free( dt[d] );

        for( j=0; j<5; j++ ) {
            magma_event_destroy( event[d][j] );
        }
    }
    magma_setdevice(0);

    #ifdef ROW_MAJOR_PROFILE
    printf("\n n=%d NB=%d nb=%d\n",n,NB,nb);
    printf(" Without memory allocation: %f / %f = %f GFlop/s\n",
           FLOPS_SPOTRF(n)/1000000, GetTimerValue(start0, end0),
           FLOPS_SPOTRF(n)/(1000000*GetTimerValue(start0, end0)));
    printf(" Performance %f / %f = %f GFlop/s\n",
           FLOPS_SPOTRF(n)/1000000, chol_time,
           FLOPS_SPOTRF(n)/(1000000*chol_time));
    #endif
    return *info;
} /* magma_spotrf_ooc */
Esempio n. 8
0
extern "C" magma_int_t
magma_sstedx_m(magma_int_t nrgpu, char range, magma_int_t n, float vl, float vu,
               magma_int_t il, magma_int_t iu, float* d, float* e, float* z, magma_int_t ldz,
               float* work, magma_int_t lwork, magma_int_t* iwork, magma_int_t liwork,
               magma_int_t* info)
{
/*  -- MAGMA (version 1.4.1) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       December 2013

       .. Scalar Arguments ..
      CHARACTER          RANGE
      INTEGER            IL, IU, INFO, LDZ, LIWORK, LWORK, N
      REAL   VL, VU
       ..
       .. Array Arguments ..
      INTEGER            IWORK( * )
      REAL   D( * ), E( * ), WORK( * ), Z( LDZ, * ),
     $                   DWORK ( * )
       ..

    Purpose
    =======
    SSTEDX computes some eigenvalues and, optionally, eigenvectors of a
    symmetric tridiagonal matrix using the divide and conquer method.

    This code makes very mild assumptions about floating point
    arithmetic. It will work on machines with a guard digit in
    add/subtract, or on those binary machines without guard digits
    which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
    It could conceivably fail on hexadecimal or decimal machines
    without guard digits, but we know of none.  See SLAEX3 for details.

    Arguments
    =========
    RANGE   (input) CHARACTER*1
            = 'A': all eigenvalues will be found.
            = 'V': all eigenvalues in the half-open interval (VL,VU]
                   will be found.
            = 'I': the IL-th through IU-th eigenvalues will be found.

    N       (input) INTEGER
            The dimension of the symmetric tridiagonal matrix.  N >= 0.

    VL      (input) REAL
    VU      (input) REAL
            If RANGE='V', the lower and upper bounds of the interval to
            be searched for eigenvalues. VL < VU.
            Not referenced if RANGE = 'A' or 'I'.

    IL      (input) INTEGER
    IU      (input) INTEGER
            If RANGE='I', the indices (in ascending order) of the
            smallest and largest eigenvalues to be returned.
            1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
            Not referenced if RANGE = 'A' or 'V'.

    D       (input/output) REAL array, dimension (N)
            On entry, the diagonal elements of the tridiagonal matrix.
            On exit, if INFO = 0, the eigenvalues in ascending order.

    E       (input/output) REAL array, dimension (N-1)
            On entry, the subdiagonal elements of the tridiagonal matrix.
            On exit, E has been destroyed.

    Z       (input/output) REAL array, dimension (LDZ,N)
            On exit, if INFO = 0, Z contains the orthonormal eigenvectors
            of the symmetric tridiagonal matrix.

    LDZ     (input) INTEGER
            The leading dimension of the array Z. LDZ >= max(1,N).

    WORK    (workspace/output) REAL array,
                                           dimension (LWORK)
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.

    LWORK   (input) INTEGER
            The dimension of the array WORK.
            If N > 1 then LWORK >= ( 1 + 4*N + N**2 ).
            Note that  if N is less than or
            equal to the minimum divide size, usually 25, then LWORK need
            only be max(1,2*(N-1)).

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    IWORK   (workspace/output) INTEGER array, dimension (MAX(1,LIWORK))
            On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.

    LIWORK  (input) INTEGER
            The dimension of the array IWORK.
            LIWORK >= ( 3 + 5*N ).
            Note that if N is less than or
            equal to the minimum divide size, usually 25, then LIWORK
            need only be 1.

            If LIWORK = -1, then a workspace query is assumed; the
            routine only calculates the optimal size of the IWORK array,
            returns this value as the first entry of the IWORK array, and
            no error message related to LIWORK is issued by XERBLA.

    INFO    (output) INTEGER
            = 0:  successful exit.
            < 0:  if INFO = -i, the i-th argument had an illegal value.
            > 0:  The algorithm failed to compute an eigenvalue while
                  working on the submatrix lying in rows and columns
                  INFO/(N+1) through mod(INFO,N+1).

    Further Details
    ===============
    Based on contributions by
       Jeff Rutter, Computer Science Division, University of California
       at Berkeley, USA
    Modified by Francoise Tisseur, University of Tennessee.

    =====================================================================
*/
    char range_[2] = {range, 0};

    float d_zero = 0.;
    float d_one  = 1.;
    magma_int_t izero = 0;
    magma_int_t ione = 1;


    magma_int_t alleig, indeig, valeig, lquery;
    magma_int_t i, j, k, m;
    magma_int_t liwmin, lwmin;
    magma_int_t start, end, smlsiz;
    float eps, orgnrm, p, tiny;

    // Test the input parameters.

    alleig = lapackf77_lsame(range_, "A");
    valeig = lapackf77_lsame(range_, "V");
    indeig = lapackf77_lsame(range_, "I");
    lquery = lwork == -1 || liwork == -1;

    *info = 0;

    if (! (alleig || valeig || indeig)) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldz < max(1,n)) {
        *info = -10;
    } else {
        if (valeig) {
            if (n > 0 && vu <= vl) {
                *info = -4;
            }
        } else if (indeig) {
            if (il < 1 || il > max(1,n)) {
                *info = -5;
            } else if (iu < min(n,il) || iu > n) {
                *info = -6;
            }
        }
    }

    if (*info == 0) {
        // Compute the workspace requirements

        smlsiz = magma_get_smlsize_divideconquer();
        if( n <= 1 ){
            lwmin = 1;
            liwmin = 1;
        } else {
            lwmin = 1 + 4*n + n*n;
            liwmin = 3 + 5*n;
        }

        work[0] = lwmin;
        iwork[0] = liwmin;

        if (lwork < lwmin && ! lquery) {
            *info = -12;
        } else if (liwork < liwmin && ! lquery) {
            *info = -14;
        }
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info));
        return MAGMA_ERR_ILLEGAL_VALUE;
    } else if (lquery) {
        return MAGMA_SUCCESS;
    }

    // Quick return if possible

    if(n==0)
        return MAGMA_SUCCESS;
    if(n==1){
        *z = 1.;
        return MAGMA_SUCCESS;
    }

    /* determine the number of threads */
    magma_int_t threads = magma_get_numthreads();
    magma_setlapack_numthreads(threads);

#ifdef ENABLE_DEBUG
    printf("  D&C_m is using %d threads\n", threads);
#endif

    // If N is smaller than the minimum divide size (SMLSIZ+1), then
    // solve the problem with another solver.

    if (n < smlsiz){
        char char_I[]= {'I', 0};
        lapackf77_ssteqr(char_I, &n, d, e, z, &ldz, work, info);
    } else {
        char char_F[]= {'F', 0};
        lapackf77_slaset(char_F, &n, &n, &d_zero, &d_one, z, &ldz);

        //Scale.
        char char_M[]= {'M', 0};

        orgnrm = lapackf77_slanst(char_M, &n, d, e);

        if (orgnrm == 0){
            work[0]  = lwmin;
            iwork[0] = liwmin;
            return MAGMA_SUCCESS;
        }

        eps = lapackf77_slamch( "Epsilon" );

        if (alleig){
            start = 0;
            while ( start < n ){

                // Let FINISH be the position of the next subdiagonal entry
                // such that E( END ) <= TINY or FINISH = N if no such
                // subdiagonal exists.  The matrix identified by the elements
                // between START and END constitutes an independent
                // sub-problem.

                for(end = start+1; end < n; ++end){
                    tiny = eps * sqrt( MAGMA_S_ABS(d[end-1]*d[end]));
                    if (MAGMA_S_ABS(e[end-1]) <= tiny)
                        break;
                }

                // (Sub) Problem determined.  Compute its size and solve it.

                m = end - start;
                if (m==1){
                    start = end;
                    continue;
                }
                if (m > smlsiz){

                    // Scale
                    char char_G[] = {'G', 0};
                    orgnrm = lapackf77_slanst(char_M, &m, &d[start], &e[start]);
                    lapackf77_slascl(char_G, &izero, &izero, &orgnrm, &d_one, &m, &ione, &d[start], &m, info);
                    magma_int_t mm = m-1;
                    lapackf77_slascl(char_G, &izero, &izero, &orgnrm, &d_one, &mm, &ione, &e[start], &mm, info);

                    magma_slaex0_m( nrgpu, m, &d[start], &e[start], Z(start, start), ldz, work, iwork, 'A', vl, vu, il, iu, info);

                    if( *info != 0) {
                        return MAGMA_SUCCESS;
                    }

                    // Scale Back
                    lapackf77_slascl(char_G, &izero, &izero, &d_one, &orgnrm, &m, &ione, &d[start], &m, info);

                } else {

                    char char_I[]= {'I', 0};
                    lapackf77_ssteqr( char_I, &m, &d[start], &e[start], Z(start, start), &ldz, work, info);
                    if (*info != 0){
                        *info = (start+1) *(n+1) + end;
                    }
                }

                start = end;
            }


            // If the problem split any number of times, then the eigenvalues
            // will not be properly ordered.  Here we permute the eigenvalues
            // (and the associated eigenvectors) into ascending order.

            if (m < n){

                // Use Selection Sort to minimize swaps of eigenvectors
                for (i = 1; i < n; ++i){
                    k = i-1;
                    p = d[i-1];
                    for (j = i; j < n; ++j){
                        if (d[j] < p){
                            k = j;
                            p = d[j];
                        }
                    }
                    if(k != i-1) {
                        d[k] = d[i-1];
                        d[i-1] = p;
                        blasf77_sswap(&n, Z(0,i-1), &ione, Z(0,k), &ione);
                    }
                }
            }

        } else {

            // Scale
            char char_G[] = {'G', 0};
            lapackf77_slascl(char_G, &izero, &izero, &orgnrm, &d_one, &n, &ione, d, &n, info);
            magma_int_t nm = n-1;
            lapackf77_slascl(char_G, &izero, &izero, &orgnrm, &d_one, &nm, &ione, e, &nm, info);

            magma_slaex0_m(nrgpu, n, d, e, z, ldz, work, iwork, range, vl, vu, il, iu, info);

            if( *info != 0) {
                return MAGMA_SUCCESS;
            }

            // Scale Back
            lapackf77_slascl(char_G, &izero, &izero, &d_one, &orgnrm, &n, &ione, d, &n, info);

        }
    }

    work[0]  = lwmin;
    iwork[0] = liwmin;

    return MAGMA_SUCCESS;

} /* magma_sstedx_m */
Esempio n. 9
0
extern "C" magma_int_t
magma_ssytrd_sy2sb( char uplo, magma_int_t n, magma_int_t nb,
                    float *a, magma_int_t lda, 
                    float *tau,
                    float *work, magma_int_t lwork,
                    float *dT,
                    magma_int_t threads, magma_int_t *info)
{
/*  -- MAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2012

    Purpose   
    =======   
    SSYTRD_HE2HB reduces a real symmetric matrix A to real symmetric   
    band-diagonal form T by an orthogonal similarity transformation:   
    Q**T * A * Q = T.   
    This version stores the triangular matrices T used in the accumulated
    Householder transformations (I - V T V').

    Arguments   
    =========   
    UPLO    (input) CHARACTER*1   
            = 'U':  Upper triangle of A is stored;   
            = 'L':  Lower triangle of A is stored.   

    N       (input) INTEGER   
            The order of the matrix A.  N >= 0.   

    A       (input/output) REAL array, dimension (LDA,N)   
            On entry, the symmetric matrix A.  If UPLO = 'U', the leading   
            N-by-N upper triangular part of A contains the upper   
            triangular part of the matrix A, and the strictly lower   
            triangular part of A is not referenced.  If UPLO = 'L', the   
            leading N-by-N lower triangular part of A contains the lower   
            triangular part of the matrix A, and the strictly upper   
            triangular part of A is not referenced.   
            On exit, if UPLO = 'U', the Upper band-diagonal of A is 
            overwritten by the corresponding elements of the   
            band-diagonal matrix T, and the elements above the band   
            diagonal, with the array TAU, represent the orthogonal   
            matrix Q as a product of elementary reflectors; if UPLO   
            = 'L', the the Lower band-diagonal of A is overwritten by 
            the corresponding elements of the band-diagonal   
            matrix T, and the elements below the band-diagonal, with   
            the array TAU, represent the orthogonal matrix Q as a product   
            of elementary reflectors. See Further Details.   

    LDA     (input) INTEGER   
            The leading dimension of the array A.  LDA >= max(1,N).   

    TAU     (output) REAL array, dimension (N-1)   
            The scalar factors of the elementary reflectors (see Further   
            Details).   

    WORK    (workspace/output) REAL array, dimension (MAX(1,LWORK))   
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.   

    LWORK   (input) INTEGER   
            The dimension of the array WORK.  LWORK >= 1.   
            For optimum performance LWORK >= N*NB, where NB is the   
            optimal blocksize.   

            If LWORK = -1, then a workspace query is assumed; the routine   
            only calculates the optimal size of the WORK array, returns   
            this value as the first entry of the WORK array, and no error   
            message related to LWORK is issued by XERBLA.   

    dT      (output) REAL array on the GPU, dimension N*NB, 
            where NB is the optimal blocksize.
            On exit dT holds the upper triangular matrices T from the 
            accumulated Householder transformations (I - V T V') used
            in the factorization. The nb x nb matrices T are ordered 
            consecutively in memory one after another.

    INFO    (output) INTEGER   
            = 0:  successful exit   
            < 0:  if INFO = -i, the i-th argument had an illegal value   

    Further Details   
    ===============   
    If UPLO = 'U', the matrix Q is represented as a product of elementary   
    reflectors   

       Q = H(n-1) . . . H(2) H(1).   

    Each H(i) has the form   

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with   
    v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in   
    A(1:i-1,i+1), and tau in TAU(i).   

    If UPLO = 'L', the matrix Q is represented as a product of elementary   
    reflectors   

       Q = H(1) H(2) . . . H(n-1).   

    Each H(i) has the form   

       H(i) = I - tau * v * v'   

    where tau is a real scalar, and v is a real vector with   
    v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i),   
    and tau in TAU(i).

    The contents of A on exit are illustrated by the following examples   
    with n = 5:   

    if UPLO = 'U':                       if UPLO = 'L':   

      (  d   e   v2  v3  v4 )              (  d                  )   
      (      d   e   v3  v4 )              (  e   d              )   
      (          d   e   v4 )              (  v1  e   d          )   
      (              d   e  )              (  v1  v2  e   d      )   
      (                  d  )              (  v1  v2  v3  e   d  )   

    where d and e denote diagonal and off-diagonal elements of T, and vi   
    denotes an element of the vector defining H(i).   
    =====================================================================    */

    #define a_ref(a_1,a_2)  ( a  + ((a_2)-1)*( lda) + (a_1)-1)
    #define da_ref(a_1,a_2) (da  + ((a_2)-1)*(ldda) + (a_1)-1)
    #define tau_ref(a_1)    (tau + (a_1)-1)
    #define t_ref(a_1)      (dT  + ((a_1)-1)*(lddt))

    char uplo_[2] = {uplo, 0};

    int ldda = ((n+31)/32)*32;
    int lddt = nb;
   
    float c_neg_one  = MAGMA_S_NEG_ONE;
    float c_neg_half = MAGMA_S_NEG_HALF;
    float c_one  = MAGMA_S_ONE ;
    float c_zero = MAGMA_S_ZERO;
    float  d_one = MAGMA_D_ONE;

    magma_int_t pm, pn, indi, indj, pk;
    magma_int_t pm_old=0, pn_old=0, indi_old=0, indj_old=0;

    int i;
    int lwkopt;
    int lquery;

    *info = 0;
    int upper = lapackf77_lsame(uplo_, "U");
    lquery = lwork == -1;
    if (! upper && ! lapackf77_lsame(uplo_, "L")) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,n)) {
        *info = -4;
    } else if (lwork < 1 && ! lquery) {
        *info = -9;
    }

    if (*info == 0) {
      /* Determine the block size. */
      lwkopt = n * nb;
      MAGMA_S_SET2REAL( work[0], lwkopt );
    }

    if (*info != 0)
      return *info;
    else if (lquery)
      return *info;

    /* Quick return if possible */
    if (n == 0) {
        work[0] = c_one;
        return *info;
    }

    float *da;
    if (MAGMA_SUCCESS != magma_smalloc( &da, (n + 2*nb)*ldda )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    magma_int_t mklth = min(threads,12);
#if defined(USEMKL)
    mkl_set_num_threads(mklth);
#endif
#if defined(USEACML)
    omp_set_num_threads(mklth);
#endif


    /* Use the first panel of da as work space */
    float *dwork = da+n*ldda;
    float *dW    = dwork + nb*ldda;

    #ifdef TRACING
    char buf[80];
    #endif
    cudaStream_t stream[3];
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );
    stream[2] = 0;  // default stream
    
    trace_init( 1, 1, 3, stream );

    float *hT = work + lwork - nb*nb;
    lwork -= nb*nb;
    memset( hT, 0, nb*nb*sizeof(float));

    magmablasSetKernelStream( stream[0] );
    cudaEvent_t Pupdate_event;
    cudaEventCreateWithFlags(&Pupdate_event,cudaEventDisableTiming);
    //cudaEventCreate(&Pupdate_event);


    if (upper) {
      printf("SSYTRD_HE2HB is not yet implemented for upper matrix storage. Exit.\n");
      exit(1);

    }else {
        /* Copy the matrix to the GPU */
        if (1 <= n-nb){
            trace_gpu_start( 0, 0, "set", "set A" );
            magma_ssetmatrix_async( (n-nb), (n-nb),
                                    a_ref(nb+1, nb+1),  lda,
                                    da_ref(nb+1, nb+1), ldda, stream[0] );
            trace_gpu_end( 0, 0 );
        }

        /* Reduce the lower triangle of A */
        for (i = 1; i <= n-nb; i += nb) 
        {
             indi = i+nb;
             indj = i;
             pm   = n - i - nb + 1;
             //pn   = min(i+nb-1, n-nb) -i + 1;
             pn   = nb;
             
             /*   Get the current panel (no need for the 1st iteration) */
             if (i > 1 ){
                 // spanel_to_q copy the upper oof diagonal part of 
                 // the matrix to work to be restored later. acctually
                 //  the zero's and one's putted are not used this is only
                 //   because we don't have a function that copy only the
                 //    upper part of A to be restored after copying the 
                 //    lookahead panel that has been computted from GPU to CPU. 
                 spanel_to_q(MagmaUpper, pn-1, a_ref(i, i+1), lda, work);

                 trace_gpu_start( 0, 1, "get", "get panel" );
                 //magma_queue_sync( stream[0] );
                 cudaStreamWaitEvent(stream[1], Pupdate_event, 0);
                 magma_sgetmatrix_async( (pm+pn), pn,
                                         da_ref( i, i), ldda,
                                         a_ref ( i, i), lda, stream[1] );
                 trace_gpu_end( 0, 1 );

                 trace_gpu_start( 0, 2, "syr2k", "syr2k" );
                 magma_ssyr2k(MagmaLower, MagmaNoTrans, pm_old-pn_old, pn_old, c_neg_one,
                      da_ref(indi_old+pn_old, indj_old), ldda,
                      dW + pn_old           , pm_old, d_one,
                      da_ref(indi_old+pn_old, indi_old+pn_old), ldda);
                 trace_gpu_end( 0, 2 );

                 trace_cpu_start( 0, "sync", "sync on 1" );
                 magma_queue_sync( stream[1] );
                 trace_cpu_end( 0 );
                 sq_to_panel(MagmaUpper, pn-1, a_ref(i, i+1), lda, work);
             }

             /* ==========================================================
                QR factorization on a panel starting nb off of the diagonal.
                Prepare the V and T matrices. 
                ==========================================================  */
             #ifdef TRACING
             snprintf( buf, sizeof(buf), "panel %d", i );
             #endif
             trace_cpu_start( 0, "geqrf", buf );
             lapackf77_sgeqrf(&pm, &pn, a_ref(indi, indj), &lda, 
                        tau_ref(i), work, &lwork, info);
             
             /* Form the matrix T */
                         pk=min(pm,pn);
             lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr,
                           &pm, &pk, a_ref(indi, indj), &lda,
                           tau_ref(i), hT, &nb);

             /* Prepare V - put 0s in the upper triangular part of the panel
                (and 1s on the diagonal), temporaly storing the original in work */
             spanel_to_q(MagmaUpper, pk, a_ref(indi, indj), lda, work);
             trace_cpu_end( 0 );

             /* Send V from the CPU to the GPU */
             trace_gpu_start( 0, 0, "set", "set V and T" );
             magma_ssetmatrix_async( pm, pk,
                                     a_ref(indi, indj),  lda,
                                     da_ref(indi, indj), ldda, stream[0] );

             /* Send the triangular factor T to the GPU */
             magma_ssetmatrix_async( pk, pk,
                                     hT,       nb,
                                     t_ref(i), lddt, stream[0] );
             trace_gpu_end( 0, 0 );
             
             /* ==========================================================
                Compute W:
                1. X = A (V T)
                2. W = X - 0.5* V * (T' * (V' * X)) 
                ==========================================================  */
             /* dwork = V T */
             trace_cpu_start( 0, "sync", "sync on 0" );
             // this sync is done here to be sure that the copy has been finished
             // because below we made a restore sq_to_panel and this restore need
             // to ensure that the copy has been finished. we did it here to allow
             // overlapp of restore with next gemm and symm.
             magma_queue_sync( stream[0] );
             trace_cpu_end( 0 );
             
             trace_gpu_start( 0, 2, "gemm", "work = V*T" );
             magma_sgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk,
                         c_one, da_ref(indi, indj), ldda, 
                         t_ref(i), lddt,
                         c_zero, dwork, pm);
             trace_gpu_end( 0, 2 );
             
             /* dW = X = A*V*T. dW = A*dwork */ 
             trace_gpu_start( 0, 2, "symm", "X = A*work" );
             magma_ssymm(MagmaLeft, uplo, pm, pk,
                         c_one, da_ref(indi, indi), ldda,
                         dwork, pm,
                         c_zero, dW, pm);
             trace_gpu_end( 0, 2 );
             /* restore the panel */
             sq_to_panel(MagmaUpper, pk, a_ref(indi, indj), lda, work);
             
             /* dwork = V*T already ==> dwork' = T'*V'
              * compute T'*V'*X ==> dwork'*W ==>
              * dwork + pm*nb = ((T' * V') * X) = dwork' * X = dwork' * W */
             trace_gpu_start( 0, 2, "gemm", "work = T'*V'*X" );
             magma_sgemm(MagmaTrans, MagmaNoTrans, pk, pk, pm,
                         c_one, dwork, pm, 
                         dW, pm,
                         c_zero, dwork + pm*nb, nb);
             trace_gpu_end( 0, 2 );
             
             /* W = X - 0.5 * V * T'*V'*X
              *   = X - 0.5 * V * (dwork + pm*nb) = W - 0.5 * V * (dwork + pm*nb) */
             trace_gpu_start( 0, 2, "gemm", "W = X - 0.5*V*(T'*V'*X)" );
             magma_sgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk,
                         c_neg_half, da_ref(indi, indj), ldda,
                         dwork + pm*nb, nb, 
                         c_one,     dW, pm);
             trace_gpu_end( 0, 2 );

             /* ==========================================================
                Update the unreduced submatrix A(i+ib:n,i+ib:n), using   
                an update of the form:  A := A - V*W' - W*V' 
                ==========================================================  */
             if (i + nb <= n-nb){
                 /* There would be next iteration;
                    do lookahead - update the next panel */
                 trace_gpu_start( 0, 2, "gemm", "gemm 4 next panel left" );
                 magma_sgemm(MagmaNoTrans, MagmaTrans, pm, pn, pn, c_neg_one,
                             da_ref(indi, indj), ldda,
                             dW                , pm, c_one,
                             da_ref(indi, indi), ldda);
                 trace_gpu_end( 0, 2 );
             
                 trace_gpu_start( 0, 2, "gemm", "gemm 5 next panel right" );
                 magma_sgemm(MagmaNoTrans, MagmaTrans, pm, pn, pn, c_neg_one,
                             dW                , pm,
                             da_ref(indi, indj), ldda, c_one,
                             da_ref(indi, indi), ldda);
                 trace_gpu_end( 0, 2 );
                 cudaEventRecord(Pupdate_event, stream[0]);
             }
             else {
                 /* no look-ahead as this is last iteration */
                 trace_gpu_start( 0, 2, "syr2k", "syr2k last iteration" );
                 magma_ssyr2k(MagmaLower, MagmaNoTrans, pk, pk, c_neg_one,
                              da_ref(indi, indj), ldda,
                              dW                , pm, d_one,
                              da_ref(indi, indi), ldda);
                 trace_gpu_end( 0, 2 );
             }
             
             indi_old = indi;
             indj_old = indj;
             pm_old   = pm;
             pn_old   = pn;
        }  // end loop for(i)

        /* Send the last block to the CPU */
        pk = min(pm,pn);
        if (1 <= n-nb){
            spanel_to_q(MagmaUpper, pk-1, a_ref(n-pk+1, n-pk+2), lda, work);
            trace_gpu_start( 0, 2, "get", "get last block" );
            magma_sgetmatrix( pk, pk,
                              da_ref(n-pk+1, n-pk+1), ldda,
                              a_ref(n-pk+1, n-pk+1),  lda );
            trace_gpu_end( 0, 2 );
            sq_to_panel(MagmaUpper, pk-1, a_ref(n-pk+1, n-pk+2), lda, work);
        }
    }// end of LOWER
    
    trace_finalize( "ssytrd_sy2sb.svg", "trace.css" );

    cudaEventDestroy(Pupdate_event);
    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[1] );
    magma_free( da );
    MAGMA_S_SET2REAL( work[0], lwkopt );
    magmablasSetKernelStream( 0 );
    
#if defined(USEMKL)
    mkl_set_num_threads(1);
#endif
#if defined(USEACML)
    omp_set_num_threads(1);
#endif
    

    return *info;
} /* ssytrd_sy2sb_ */
Esempio n. 10
0
extern "C" magma_int_t
magma_dsgetrs_gpu(char trans, magma_int_t n, magma_int_t nrhs,
                  float  *dA, magma_int_t ldda,
                  magma_int_t        *ipiv,
                  double *dB, magma_int_t lddb,
                  double *dX, magma_int_t lddx,
                  float  *dSX,
                  magma_int_t *info)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    DSGETRS solves a system of linear equations
       A * X = B  or  A' * X = B
    with a general N-by-N matrix A using the LU factorization computed
    by MAGMA_SGETRF_GPU. B and X are in DOUBLE PRECISION, and A is in SINGLE PRECISION.
    This routine is used in the mixed precision iterative solver
    magma_dsgesv.

    Arguments
    =========
    TRANS   (input) CHARACTER*1
            Specifies the form of the system of equations:
            = 'N':  A * X = B  (No transpose)
            = 'T':  A'* X = B  (Transpose)
            = 'C':  A'* X = B  (Conjugate transpose = Transpose)

    N       (input) INTEGER
            The order of the matrix A.  N >= 0.

    NRHS    (input) INTEGER
            The number of right hand sides, i.e., the number of columns
            of the matrix B.  NRHS >= 0.

    dA      (input) SINGLE PRECISION array on the GPU, dimension (LDDA,N)
            The factors L and U from the factorization A = P*L*U
            as computed by CGETRF_GPU.

    LDDA    (input) INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).

    IPIV    (input) INTEGER array on the GPU, dimension (N)
            The pivot indices from CGETRF_GPU; Row i of the
            matrix was moved to row IPIV(i).

    dB      (input) DOUBLE PRECISION array on the GPU, dimension (LDDB,NRHS)
            On entry, the right hand side matrix B.

    LDDB    (input) INTEGER
            The leading dimension of the arrays X and B.  LDDB >= max(1,N).

    dX      (output) DOUBLE PRECISION array on the GPU, dimension (LDDX, NRHS)
            On exit, the solution matrix dX.

    LDDX    (input) INTEGER
            The leading dimension of the array dX, LDDX >= max(1,N).

    dSX     (workspace) SINGLE PRECISION array on the GPU used as workspace,
            dimension (N, NRHS)

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
    =====================================================================    */

    float c_one = MAGMA_S_ONE;
    char            trans_[2] = {trans, 0};
    int notran = lapackf77_lsame(trans_, "N");
    magma_int_t inc;

    *info = 0;
    if ( (! notran) &&
         (! lapackf77_lsame(trans_, "T")) &&
         (! lapackf77_lsame(trans_, "C")) ) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (nrhs < 0) {
        *info = -3;
    } else if (ldda < n) {
        *info = -5;
    } else if (lddb < n) {
        *info = -8;
    } else if (lddx < n) {
        *info = -10;
    } else if (lddx != lddb) { /* TODO: remove it when dslaswp will have the correct interface */
        *info = -10;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (n == 0 || nrhs == 0) {
        return *info;
    }
    
    if (notran) {
        inc = 1;
        
        /* Get X by row applying interchanges to B and cast to single */
        /*
         * TODO: clean dslaswp interface to have interface closer to zlaswp
         */
        //magmablas_dslaswp(nrhs, dB, lddb, dSX, lddbx, 1, n, ipiv);
        magmablas_dslaswp(nrhs, dB, lddb, dSX, n, ipiv, inc);
        
        /* Solve L*X = B, overwriting B with SX. */
        magma_strsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit,
                     n, nrhs, c_one, dA, ldda, dSX, n);
        
        /* Solve U*X = B, overwriting B with X. */
        magma_strsm( MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit,
                     n, nrhs, c_one, dA, ldda, dSX, n);
        
        magmablas_slag2d( n, nrhs, dSX, n, dX, lddx, info );
    }
    else {
        inc = -1;
        
        /* Cast the DOUBLE PRECISION RHS to SINGLE PRECISION */
        magmablas_dlag2s( n, nrhs, dB, lddb, dSX, n, info );
        
        /* Solve A' * X = B. */
        magma_strsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                     n, nrhs, c_one, dA, ldda, dSX, n );
        magma_strsm( MagmaLeft, MagmaLower, MagmaTrans, MagmaUnit,
                     n, nrhs, c_one, dA, ldda, dSX, n );
        
        magmablas_dslaswp( nrhs, dX, lddx, dSX, n, ipiv, inc );
    }

    return *info;
} /* magma_dsgetrs */
Esempio n. 11
0
extern "C" magma_int_t
magma_dlauum_gpu(char uplo, magma_int_t n,
                 double  *dA, magma_int_t ldda, magma_int_t *info)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    DLAUUM computes the product U * U' or L' * L, where the triangular
    factor U or L is stored in the upper or lower triangular part of
    the array dA.

    If UPLO = 'U' or 'u' then the upper triangle of the result is stored,
    overwriting the factor U in dA.
    If UPLO = 'L' or 'l' then the lower triangle of the result is stored,
    overwriting the factor L in dA.
    This is the blocked form of the algorithm, calling Level 3 BLAS.

    Arguments
    =========
    UPLO    (input) CHARACTER*1
            Specifies whether the triangular factor stored in the array dA
            is upper or lower triangular:
            = 'U':  Upper triangular
            = 'L':  Lower triangular

    N       (input) INTEGER
            The order of the triangular factor U or L.  N >= 0.

    dA      (input/output) DOUBLE PRECISION array on the GPU, dimension (LDDA,N)
            On entry, the triangular factor U or L.
            On exit, if UPLO = 'U', the upper triangle of dA is
            overwritten with the upper triangle of the product U * U';
            if UPLO = 'L', the lower triangle of dA is overwritten with
            the lower triangle of the product L' * L.

    LDDA    (input) INTEGER
            The leading dimension of the array A.  LDDA >= max(1,N).

    INFO    (output) INTEGER
            = 0: successful exit
            < 0: if INFO = -k, the k-th argument had an illegal value

    ===================================================================== */

    /* Local variables */
    char uplo_[2] = {uplo, 0};
    magma_int_t         nb, i, ib;
    double              d_one = MAGMA_D_ONE;
    double  c_one = MAGMA_D_ONE;
    double  *work;

    int upper  = lapackf77_lsame(uplo_, "U");

    *info = 0;

    if ((! upper) && (! lapackf77_lsame(uplo_, "L")))
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (ldda < max(1,n))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    nb = magma_get_dpotrf_nb(n);

    if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, nb*nb )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }

    magma_queue_t stream[2];
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );

    if (nb <= 1 || nb >= n)
    {
        magma_dgetmatrix( n, n, dA, ldda, work, n );
        lapackf77_dlauum(uplo_, &n, work, &n, info);
        magma_dsetmatrix( n, n, work, n, dA, ldda );
    }
    else
    {
        if (upper)
        {
            /* Compute inverse of upper triangular matrix */
            for (i=0; i < n; i += nb)
            {
                ib = min(nb, (n-i));

                /* Compute the product U * U'. */
                magma_dtrmm( MagmaRight, MagmaUpper,
                         MagmaTrans, MagmaNonUnit, i, ib,
                         c_one, dA(i,i), ldda, dA(0, i),ldda);

                magma_dgetmatrix( ib, ib,
                                  dA(i, i), ldda,
                                  work,     ib );

                lapackf77_dlauum(MagmaUpperStr, &ib, work, &ib, info);

                magma_dsetmatrix( ib, ib,
                                  work,     ib,
                                  dA(i, i), ldda );

                if(i+ib < n)
                {
                    magma_dgemm( MagmaNoTrans, MagmaTrans,
                                 i, ib, (n-i-ib), c_one, dA(0,i+ib),
                                 ldda, dA(i, i+ib), ldda, c_one,
                                 dA(0,i), ldda);

                    magma_dsyrk( MagmaUpper, MagmaNoTrans, ib,(n-i-ib),
                                 d_one, dA(i, i+ib), ldda,
                                 d_one, dA(i, i),    ldda);
                }
            }
        }
        else
        {
            /* Compute the product L' * L. */
            for(i=0; i<n; i=i+nb)
            {
                ib=min(nb,(n-i));

                magma_dtrmm( MagmaLeft, MagmaLower,
                             MagmaTrans, MagmaNonUnit, ib,
                             i, c_one, dA(i,i), ldda,
                             dA(i, 0),ldda);

                magma_dgetmatrix( ib, ib,
                                  dA(i, i), ldda,
                                  work,     ib );

                lapackf77_dlauum(MagmaLowerStr, &ib, work, &ib, info);

                magma_dsetmatrix( ib, ib,
                                  work,     ib,
                                  dA(i, i), ldda );

                if((i+ib) < n)
                {
                    magma_dgemm( MagmaTrans, MagmaNoTrans,
                                 ib, i, (n-i-ib), c_one, dA( i+ib,i),
                                 ldda, dA(i+ib, 0),ldda, c_one,
                                 dA(i,0), ldda);
                    magma_dsyrk( MagmaLower, MagmaTrans, ib, (n-i-ib),
                                 d_one, dA(i+ib, i), ldda,
                                 d_one, dA(i, i),    ldda);
                }
            }
        }
    }

    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[1] );

    magma_free_pinned( work );

    return *info;
}
Esempio n. 12
0
extern "C" magma_int_t
magma_ssygvd_m(magma_int_t nrgpu, magma_int_t itype, char jobz, char uplo, magma_int_t n,
               float *a, magma_int_t lda, float *b, magma_int_t ldb,
               float *w, float *work, magma_int_t lwork,
               magma_int_t *iwork, magma_int_t liwork, magma_int_t *info)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    SSYGVD computes all the eigenvalues, and optionally, the eigenvectors
    of a real generalized symmetric-definite eigenproblem, of the form
    A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.  Here A and
    B are assumed to be symmetric and B is also positive definite.
    If eigenvectors are desired, it uses a divide and conquer algorithm.

    The divide and conquer algorithm makes very mild assumptions about
    floating point arithmetic. It will work on machines with a guard
    digit in add/subtract, or on those binary machines without guard
    digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
    Cray-2. It could conceivably fail on hexadecimal or decimal machines
    without guard digits, but we know of none.

    Arguments
    =========
    ITYPE   (input) INTEGER
            Specifies the problem type to be solved:
            = 1:  A*x = (lambda)*B*x
            = 2:  A*B*x = (lambda)*x
            = 3:  B*A*x = (lambda)*x

    JOBZ    (input) CHARACTER*1
            = 'N':  Compute eigenvalues only;
            = 'V':  Compute eigenvalues and eigenvectors.

    UPLO    (input) CHARACTER*1
            = 'U':  Upper triangles of A and B are stored;
            = 'L':  Lower triangles of A and B are stored.

    N       (input) INTEGER
            The order of the matrices A and B.  N >= 0.

    A       (input/output) COMPLEX*16 array, dimension (LDA, N)
            On entry, the symmetric matrix A.  If UPLO = 'U', the
            leading N-by-N upper triangular part of A contains the
            upper triangular part of the matrix A.  If UPLO = 'L',
            the leading N-by-N lower triangular part of A contains
            the lower triangular part of the matrix A.

            On exit, if JOBZ = 'V', then if INFO = 0, A contains the
            matrix Z of eigenvectors.  The eigenvectors are normalized
            as follows:
            if ITYPE = 1 or 2, Z**T *   B    * Z = I;
            if ITYPE = 3,      Z**T * inv(B) * Z = I.
            If JOBZ = 'N', then on exit the upper triangle (if UPLO='U')
            or the lower triangle (if UPLO='L') of A, including the
            diagonal, is destroyed.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    B       (input/output) COMPLEX*16 array, dimension (LDB, N)
            On entry, the symmetric matrix B.  If UPLO = 'U', the
            leading N-by-N upper triangular part of B contains the
            upper triangular part of the matrix B.  If UPLO = 'L',
            the leading N-by-N lower triangular part of B contains
            the lower triangular part of the matrix B.

            On exit, if INFO <= N, the part of B containing the matrix is
            overwritten by the triangular factor U or L from the Cholesky
            factorization B = U**T * U or B = L * L**T.

    LDB     (input) INTEGER
            The leading dimension of the array B.  LDB >= max(1,N).

    W       (output) DOUBLE PRECISION array, dimension (N)
            If INFO = 0, the eigenvalues in ascending order.

    WORK    (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.

    LWORK   (input) INTEGER
            The length of the array WORK.
            If N <= 1,                LWORK >= 1.
            If JOBZ  = 'N' and N > 1, LWORK >= N + 1.
            If JOBZ  = 'V' and N > 1, LWORK >= 2*N*nb + N**2.

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal sizes of the WORK, RWORK and
            IWORK arrays, returns these values as the first entries of
            the WORK, RWORK and IWORK arrays, and no error message
            related to LWORK or LRWORK or LIWORK is issued by XERBLA.

    RWORK   (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LRWORK))
            On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK.

    LRWORK  (input) INTEGER
            The dimension of the array RWORK.
            If N <= 1,                LRWORK >= 1.
            If JOBZ  = 'N' and N > 1, LRWORK >= N.
            If JOBZ  = 'V' and N > 1, LRWORK >= 1 + 5*N + 2*N**2.

            If LRWORK = -1, then a workspace query is assumed; the
            routine only calculates the optimal sizes of the WORK, RWORK
            and IWORK arrays, returns these values as the first entries
            of the WORK, RWORK and IWORK arrays, and no error message
            related to LWORK or LRWORK or LIWORK is issued by XERBLA.

    IWORK   (workspace/output) INTEGER array, dimension (MAX(1,LIWORK))
            On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.

    LIWORK  (input) INTEGER
            The dimension of the array IWORK.
            If N <= 1,                LIWORK >= 1.
            If JOBZ  = 'N' and N > 1, LIWORK >= 1.
            If JOBZ  = 'V' and N > 1, LIWORK >= 3 + 5*N.

            If LIWORK = -1, then a workspace query is assumed; the
            routine only calculates the optimal sizes of the WORK, RWORK
            and IWORK arrays, returns these values as the first entries
            of the WORK, RWORK and IWORK arrays, and no error message
            related to LWORK or LRWORK or LIWORK is issued by XERBLA.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
            > 0:  SPOTRF or SSYEVD returned an error code:
               <= N:  if INFO = i and JOBZ = 'N', then the algorithm
                      failed to converge; i off-diagonal elements of an
                      intermediate tridiagonal form did not converge to
                      zero;
                      if INFO = i and JOBZ = 'V', then the algorithm
                      failed to compute an eigenvalue while working on
                      the submatrix lying in rows and columns INFO/(N+1)
                      through mod(INFO,N+1);
               > N:   if INFO = N + i, for 1 <= i <= N, then the leading
                      minor of order i of B is not positive definite.
                      The factorization of B could not be completed and
                      no eigenvalues or eigenvectors were computed.

    Further Details
    ===============
    Based on contributions by
       Mark Fahey, Department of Mathematics, Univ. of Kentucky, USA

    Modified so that no backsubstitution is performed if SSYEVD fails to
    converge (NEIG in old code could be greater than N causing out of
    bounds reference to A - reported by Ralf Meyer).  Also corrected the
    description of INFO and the test on ITYPE. Sven, 16 Feb 05.
    =====================================================================  */

    char uplo_[2] = {uplo, 0};
    char jobz_[2] = {jobz, 0};

    float d_one = MAGMA_S_ONE;

    magma_int_t lower;
    char trans[1];
    magma_int_t wantz;
    magma_int_t lquery;

    magma_int_t lwmin;
    magma_int_t liwmin;

    magma_queue_t stream;
    magma_queue_create( &stream );

    wantz = lapackf77_lsame(jobz_, MagmaVecStr);
    lower = lapackf77_lsame(uplo_, MagmaLowerStr);
    lquery = lwork == -1 || liwork == -1;

    *info = 0;
    if (itype < 1 || itype > 3) {
        *info = -1;
    } else if (! (wantz || lapackf77_lsame(jobz_, MagmaNoVecStr))) {
        *info = -2;
    } else if (! (lower || lapackf77_lsame(uplo_, MagmaUpperStr))) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (lda < max(1,n)) {
        *info = -6;
    } else if (ldb < max(1,n)) {
        *info = -8;
    }

    magma_int_t nb = magma_get_ssytrd_nb( n );
    if ( n <= 1 ) {
        lwmin  = 1;
        liwmin = 1;
    }
    else if ( wantz ) {
        lwmin  = 1 + 6*n + 2*n*n;
        liwmin = 3 + 5*n;
    }
    else {
        lwmin  = 2*n + n*nb;
        liwmin = 1;
    }

    work[0]  = lwmin * (1. + lapackf77_slamch("Epsilon"));  // round up
    iwork[0] = liwmin;

    if (lwork < lwmin && ! lquery) {
        *info = -11;
    } else if (liwork < liwmin && ! lquery) {
        *info = -13;
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /*     Quick return if possible */
    if (n == 0) {
        return *info;
    }
    /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */
    if (n <= 128){
        #ifdef ENABLE_DEBUG
        printf("--------------------------------------------------------------\n");
        printf("  warning matrix too small N=%d NB=%d, calling lapack on CPU  \n", (int) n, (int) nb);
        printf("--------------------------------------------------------------\n");
        #endif
        lapackf77_ssygvd(&itype, jobz_, uplo_,
                         &n, a, &lda, b, &ldb,
                         w, work, &lwork,
                         iwork, &liwork, info);
        return *info;
    }


//
#ifdef ENABLE_TIMER
        magma_timestr_t start, end;
        start = get_current_time();
#endif

    magma_spotrf_m(nrgpu, uplo_[0], n, b, ldb, info);
    if (*info != 0) {
        *info = n + *info;
        return *info;
    }

#ifdef ENABLE_TIMER
        end = get_current_time();
        printf("time spotrf = %6.2f\n", GetTimerValue(start,end)/1000.);
        start = get_current_time();
#endif

    /*  Transform problem to standard eigenvalue problem and solve. */
    magma_ssygst_m(nrgpu, itype, uplo_[0], n, a, lda, b, ldb, info);

#ifdef ENABLE_TIMER
        end = get_current_time();
        printf("time ssygst = %6.2f\n", GetTimerValue(start,end)/1000.);
        start = get_current_time();
#endif

    magma_ssyevd_m(nrgpu, jobz_[0], uplo_[0], n, a, lda, w, work, lwork, iwork, liwork, info);

#ifdef ENABLE_TIMER
        end = get_current_time();
        printf("time ssyevd = %6.2f\n", GetTimerValue(start,end)/1000.);
#endif

    if (wantz && *info == 0)
    {

#ifdef ENABLE_TIMER
        start = get_current_time();
#endif

        /* Backtransform eigenvectors to the original problem. */
        if (itype == 1 || itype == 2) {
            /* For A*x=(lambda)*B*x and A*B*x=(lambda)*x;
               backtransform eigenvectors: x = inv(L)'*y or inv(U)*y */
            if (lower) {
                *(unsigned char *)trans = MagmaTrans;
            } else {
                *(unsigned char *)trans = MagmaNoTrans;
            }

            magma_strsm_m(nrgpu, MagmaLeft, uplo, *trans, MagmaNonUnit,
                          n, n, d_one, b, ldb, a, lda);
        }
        else if (itype == 3) {
            /* For B*A*x=(lambda)*x;
               backtransform eigenvectors: x = L*y or U'*y */
            if (lower) {
                *(unsigned char *)trans = MagmaNoTrans;
            } else {
                *(unsigned char *)trans = MagmaTrans;
            }

            //magma_strmm(MagmaLeft, uplo_[0], *trans, MagmaNonUnit,
            //            n, n, c_one, db, lddb, da, ldda);
        }

#ifdef ENABLE_TIMER
        end = get_current_time();
        printf("time setmatrices trsm/mm + getmatrices = %6.2f\n", GetTimerValue(start,end)/1000.);
#endif
    }

    work[0]  = lwmin * (1. + lapackf77_slamch("Epsilon"));  // round up
    iwork[0] = liwmin;

    return *info;
} /* magma_ssygvd_m */
Esempio n. 13
0
extern "C" magma_int_t
magma_zungtr(char uplo, magma_int_t n, magmaDoubleComplex *a,
             magma_int_t lda, magmaDoubleComplex *tau,
             magmaDoubleComplex *work, magma_int_t lwork,
             magmaDoubleComplex *dT, magma_int_t nb,
             magma_int_t *info)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    ZUNGTR generates a complex unitary matrix Q which is defined as the
    product of n-1 elementary reflectors of order N, as returned by
    ZHETRD:

    if UPLO = 'U', Q = H(n-1) . . . H(2) H(1),

    if UPLO = 'L', Q = H(1) H(2) . . . H(n-1).

    Arguments
    =========
    UPLO    (input) CHARACTER*1
            = 'U': Upper triangle of A contains elementary reflectors
                   from ZHETRD;
            = 'L': Lower triangle of A contains elementary reflectors
                   from ZHETRD.

    N       (input) INTEGER
            The order of the matrix Q. N >= 0.

    A       (input/output) COMPLEX_16 array, dimension (LDA,N)
            On entry, the vectors which define the elementary reflectors,
            as returned by ZHETRD.
            On exit, the N-by-N unitary matrix Q.

    LDA     (input) INTEGER
            The leading dimension of the array A. LDA >= N.

    TAU     (input) COMPLEX_16 array, dimension (N-1)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by ZHETRD.

    WORK    (workspace/output) COMPLEX_16 array, dimension (LWORK)
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.

    LWORK   (input) INTEGER
            The dimension of the array WORK. LWORK >= N-1.
            For optimum performance LWORK >= N*NB, where NB is
            the optimal blocksize.

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    DT      (input) COMPLEX_16 array on the GPU device.
            DT contains the T matrices used in blocking the elementary
            reflectors H(i) as returned by magma_zhetrd.

    NB      (input) INTEGER
            This is the block size used in ZHETRD, and correspondingly
            the size of the T matrices, used in the factorization, and
            stored in DT.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
    =====================================================================    */

#define a_ref(i,j) ( a + (j)*lda+ (i))

    char uplo_[2]  = {uplo, 0};
    
    magma_int_t i__1;
    magma_int_t i, j;
    magma_int_t iinfo;
    magma_int_t upper, lwkopt, lquery;

    *info = 0;
    lquery = lwork == -1;
    upper = lapackf77_lsame(uplo_, "U");
    if (! upper && ! lapackf77_lsame(uplo_, "L")) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,n)) {
        *info = -4;
    } else /* if(complicated condition) */ {
        /* Computing MAX */
        if (lwork < max(1, n-1) && ! lquery) {
            *info = -7;
        }
    }

    lwkopt = max(1, n) * nb;
    if (*info == 0) {
        MAGMA_Z_SET2REAL( work[0], lwkopt);
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info));
        return *info;
    } else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (n == 0) {
        work[0] = MAGMA_Z_ONE;
        return *info;
    }

    if (upper) {
        /*  Q was determined by a call to ZHETRD with UPLO = 'U'
            Shift the vectors which define the elementary reflectors one
            column to the left, and set the last row and column of Q to
            those of the unit matrix                                    */
        for (j = 0; j < n-1; ++j) {
            for (i = 0; i < j-1; ++i)
                *a_ref(i, j) = *a_ref(i, j + 1);

            *a_ref(n-1, j) = MAGMA_Z_ZERO;
        }
        for (i = 0; i < n-1; ++i) {
            *a_ref(i, n-1) = MAGMA_Z_ZERO;
        }
        *a_ref(n-1, n-1) = MAGMA_Z_ONE;
        
        /* Generate Q(1:n-1,1:n-1) */
        i__1 = n - 1;
        lapackf77_zungql(&i__1, &i__1, &i__1, a_ref(0,0), &lda, tau, work,
                         &lwork, &iinfo);
    } else {
        
        /*  Q was determined by a call to ZHETRD with UPLO = 'L'.
            Shift the vectors which define the elementary reflectors one
            column to the right, and set the first row and column of Q to
            those of the unit matrix                                      */
        for (j = n-1; j > 0; --j) {
            *a_ref(0, j) = MAGMA_Z_ZERO;
            for (i = j; i < n-1; ++i)
                *a_ref(i, j) = *a_ref(i, j - 1);
        }

        *a_ref(0, 0) = MAGMA_Z_ONE;
        for (i = 1; i < n-1; ++i)
            *a_ref(i, 0) = MAGMA_Z_ZERO;
        
        if (n > 1) {
            /* Generate Q(2:n,2:n) */
            magma_zungqr(n-1, n-1, n-1, a_ref(1, 1), lda, tau, dT, nb, &iinfo);
        }
    }
    
    MAGMA_Z_SET2REAL( work[0], lwkopt);

    return *info;
} /* magma_zungtr */
Esempio n. 14
0
extern "C" magma_int_t
magma_spotrf_gpu(char uplo, magma_int_t n,
                 float *dA, magma_int_t ldda, magma_int_t *info)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    SPOTRF computes the Cholesky factorization of a real symmetric
    positive definite matrix dA.

    The factorization has the form
       dA = U**T * U,  if UPLO = 'U', or
       dA = L  * L**T,  if UPLO = 'L',
    where U is an upper triangular matrix and L is lower triangular.

    This is the block version of the algorithm, calling Level 3 BLAS.
    If the current stream is NULL, this version replaces it with user defined
    stream to overlap computation with communication.

    Arguments
    =========
    UPLO    (input) CHARACTER*1
            = 'U':  Upper triangle of dA is stored;
            = 'L':  Lower triangle of dA is stored.

    N       (input) INTEGER
            The order of the matrix dA.  N >= 0.

    dA      (input/output) REAL array on the GPU, dimension (LDDA,N)
            On entry, the symmetric matrix dA.  If UPLO = 'U', the leading
            N-by-N upper triangular part of dA contains the upper
            triangular part of the matrix dA, and the strictly lower
            triangular part of dA is not referenced.  If UPLO = 'L', the
            leading N-by-N lower triangular part of dA contains the lower
            triangular part of the matrix dA, and the strictly upper
            triangular part of dA is not referenced.

            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization dA = U**T * U or dA = L * L**T.

    LDDA     (input) INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalescent memory accesses LDDA must be
            dividable by 16.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
            > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.
    =====================================================================   */


    magma_int_t     j, jb, nb;
    char            uplo_[2] = {uplo, 0};
    float c_one     = MAGMA_S_ONE;
    float c_neg_one = MAGMA_S_NEG_ONE;
    float *work;
    float          d_one     =  1.0;
    float          d_neg_one = -1.0;
    int upper = lapackf77_lsame(uplo_, "U");

    *info = 0;
    if ( (! upper) && (! lapackf77_lsame(uplo_, "L")) ) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,n)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    nb = magma_get_spotrf_nb(n);

    if (MAGMA_SUCCESS != magma_smalloc_pinned( &work, nb*nb )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }

    /* Define user stream if current stream is NULL */
    cudaStream_t stream[2], current_stream;
    magmablasGetKernelStream(&current_stream);

    magma_queue_create( &stream[0] );
    if (current_stream == NULL) {
      magma_queue_create( &stream[1] );
      magmablasSetKernelStream(stream[1]);
    }
    else
      stream[1] = current_stream;

    if ((nb <= 1) || (nb >= n)) {
        /*  Use unblocked code. */
        magma_sgetmatrix_async( n, n, dA, ldda, work, n, stream[1] );
        magma_queue_sync( stream[1] );
        lapackf77_spotrf(uplo_, &n, work, &n, info);
        magma_ssetmatrix_async( n, n, work, n, dA, ldda, stream[1] );
    }
    else {

        /* Use blocked code. */
        if (upper) {
            
            /* Compute the Cholesky factorization A = U'*U. */
            for (j=0; j<n; j+=nb) {
                
                /* Update and factorize the current diagonal block and test
                   for non-positive-definiteness. Computing MIN */
                jb = min(nb, (n-j));
                
                magma_ssyrk(MagmaUpper, MagmaTrans, jb, j,
                            d_neg_one, dA(0, j), ldda,
                            d_one,     dA(j, j), ldda);

                magma_queue_sync( stream[1] );
                magma_sgetmatrix_async( jb, jb,
                                        dA(j, j), ldda,
                                        work,     jb, stream[0] );
                
                if ( (j+jb) < n) {
                    /* Compute the current block row. */
                    magma_sgemm(MagmaTrans, MagmaNoTrans,
                                jb, (n-j-jb), j,
                                c_neg_one, dA(0, j   ), ldda,
                                           dA(0, j+jb), ldda,
                                c_one,     dA(j, j+jb), ldda);
                }
                
                magma_queue_sync( stream[0] );
                lapackf77_spotrf(MagmaUpperStr, &jb, work, &jb, info);
                magma_ssetmatrix_async( jb, jb,
                                        work,     jb,
                                        dA(j, j), ldda, stream[1] );
                if (*info != 0) {
                    *info = *info + j;
                    break;
                }

                if ( (j+jb) < n) {
                    magma_strsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                                 jb, (n-j-jb),
                                 c_one, dA(j, j   ), ldda,
                                        dA(j, j+jb), ldda);
                }
            }
        }
        else {
            //=========================================================
            // Compute the Cholesky factorization A = L*L'.
            for (j=0; j<n; j+=nb) {

                //  Update and factorize the current diagonal block and test
                //  for non-positive-definiteness. Computing MIN
                jb = min(nb, (n-j));

                magma_ssyrk(MagmaLower, MagmaNoTrans, jb, j,
                            d_neg_one, dA(j, 0), ldda,
                            d_one,     dA(j, j), ldda);
                
                magma_queue_sync( stream[1] );
                magma_sgetmatrix_async( jb, jb,
                                        dA(j, j), ldda,
                                        work,     jb, stream[0] );
                
                if ( (j+jb) < n) {
                    magma_sgemm( MagmaNoTrans, MagmaTrans,
                                 (n-j-jb), jb, j,
                                 c_neg_one, dA(j+jb, 0), ldda,
                                            dA(j,    0), ldda,
                                 c_one,     dA(j+jb, j), ldda);
                }

                magma_queue_sync( stream[0] );
                lapackf77_spotrf(MagmaLowerStr, &jb, work, &jb, info);
                magma_ssetmatrix_async( jb, jb,
                                        work,     jb,
                                        dA(j, j), ldda, stream[1] );
                if (*info != 0) {
                    *info = *info + j;
                    break;
                }
                
                if ( (j+jb) < n) {
                    magma_strsm(MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit,
                                (n-j-jb), jb,
                                c_one, dA(j,    j), ldda,
                                       dA(j+jb, j), ldda);
                }
            }
        }
    }

    magma_free_pinned( work );

    magma_queue_destroy( stream[0] );
    if (current_stream == NULL) {
      magma_queue_destroy( stream[1] );
      magmablasSetKernelStream(NULL);
    }

    return *info;
} /* magma_spotrf_gpu */
Esempio n. 15
0
extern "C" magma_int_t
magma_dgeev(
    char jobvl, char jobvr, magma_int_t n,
    double *A, magma_int_t lda,
    double *WR, double *WI,
    double *vl, magma_int_t ldvl,
    double *vr, magma_int_t ldvr,
    double *work, magma_int_t lwork,
    magma_int_t *info )
{
/*  -- MAGMA (version 1.4.1) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       December 2013

    Purpose
    =======
    DGEEV computes for an N-by-N real nonsymmetric matrix A, the
    eigenvalues and, optionally, the left and/or right eigenvectors.

    The right eigenvector v(j) of A satisfies
                     A * v(j) = lambda(j) * v(j)
    where lambda(j) is its eigenvalue.
    The left eigenvector u(j) of A satisfies
                  u(j)**T * A = lambda(j) * u(j)**T
    where u(j)**T denotes the transpose of u(j).

    The computed eigenvectors are normalized to have Euclidean norm
    equal to 1 and largest component real.

    Arguments
    =========
    JOBVL   (input) CHARACTER*1
            = 'N': left eigenvectors of A are not computed;
            = 'V': left eigenvectors of are computed.

    JOBVR   (input) CHARACTER*1
            = 'N': right eigenvectors of A are not computed;
            = 'V': right eigenvectors of A are computed.

    N       (input) INTEGER
            The order of the matrix A. N >= 0.

    A       (input/output) DOUBLE PRECISION array, dimension (LDA,N)
            On entry, the N-by-N matrix A.
            On exit, A has been overwritten.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    WR      (output) DOUBLE PRECISION array, dimension (N)
    WI      (output) DOUBLE PRECISION array, dimension (N)
            WR and WI contain the real and imaginary parts,
            respectively, of the computed eigenvalues.  Complex
            conjugate pairs of eigenvalues appear consecutively
            with the eigenvalue having the positive imaginary part
            first.

    VL      (output) DOUBLE PRECISION array, dimension (LDVL,N)
            If JOBVL = 'V', the left eigenvectors u(j) are stored one
            after another in the columns of VL, in the same order
            as their eigenvalues.
            If JOBVL = 'N', VL is not referenced.
            u(j) = VL(:,j), the j-th column of VL.

    LDVL    (input) INTEGER
            The leading dimension of the array VL.  LDVL >= 1; if
            JOBVL = 'V', LDVL >= N.

    VR      (output) DOUBLE PRECISION array, dimension (LDVR,N)
            If JOBVR = 'V', the right eigenvectors v(j) are stored one
            after another in the columns of VR, in the same order
            as their eigenvalues.
            If JOBVR = 'N', VR is not referenced.
            v(j) = VR(:,j), the j-th column of VR.

    LDVR    (input) INTEGER
            The leading dimension of the array VR.  LDVR >= 1; if
            JOBVR = 'V', LDVR >= N.

    WORK    (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.

    LWORK   (input) INTEGER
            The dimension of the array WORK.  LWORK >= (2+nb)*N.
            For optimal performance, LWORK >= (2+2*nb)*N.

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value.
            > 0:  if INFO = i, the QR algorithm failed to compute all the
                  eigenvalues, and no eigenvectors have been computed;
                  elements and i+1:N of W contain eigenvalues which have
                  converged.
    =====================================================================    */

    #define vl(i,j)  (vl + (i) + (j)*ldvl)
    #define vr(i,j)  (vr + (i) + (j)*ldvr)
    
    magma_int_t c_one = 1;
    magma_int_t c_zero = 0;
    
    double d__1, d__2;
    double r, cs, sn, scl;
    double dum[1], eps;
    double anrm, cscale, bignum, smlnum;
    magma_int_t i, k, ilo, ihi;
    magma_int_t ibal, ierr, itau, iwrk, nout, liwrk, i__1, i__2, nb;
    magma_int_t scalea, minwrk, lquery, wantvl, wantvr, select[1];

    char side[2]   = {0, 0};
    char jobvl_[2] = {jobvl, 0};
    char jobvr_[2] = {jobvr, 0};

    *info = 0;
    lquery = lwork == -1;
    wantvl = lapackf77_lsame( jobvl_, "V" );
    wantvr = lapackf77_lsame( jobvr_, "V" );
    if (! wantvl && ! lapackf77_lsame( jobvl_, "N" )) {
        *info = -1;
    } else if (! wantvr && ! lapackf77_lsame( jobvr_, "N" )) {
        *info = -2;
    } else if (n < 0) {
        *info = -3;
    } else if (lda < max(1,n)) {
        *info = -5;
    } else if ( (ldvl < 1) || (wantvl && (ldvl < n))) {
        *info = -9;
    } else if ( (ldvr < 1) || (wantvr && (ldvr < n))) {
        *info = -11;
    }

    /* Compute workspace */
    nb = magma_get_dgehrd_nb( n );
    if (*info == 0) {
        minwrk = (2+nb)*n;
        work[0] = MAGMA_D_MAKE( (double) minwrk, 0. );
        
        if (lwork < minwrk && ! lquery) {
            *info = -13;
        }
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (n == 0) {
        return *info;
    }
    
    #if defined(VERSION3)
    double *dT;
    if (MAGMA_SUCCESS != magma_dmalloc( &dT, nb*n )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
    #endif

    /* Get machine constants */
    eps    = lapackf77_dlamch( "P" );
    smlnum = lapackf77_dlamch( "S" );
    bignum = 1. / smlnum;
    lapackf77_dlabad( &smlnum, &bignum );
    smlnum = magma_dsqrt( smlnum ) / eps;
    bignum = 1. / smlnum;

    /* Scale A if max element outside range [SMLNUM,BIGNUM] */
    anrm = lapackf77_dlange( "M", &n, &n, A, &lda, dum );
    scalea = 0;
    if (anrm > 0. && anrm < smlnum) {
        scalea = 1;
        cscale = smlnum;
    } else if (anrm > bignum) {
        scalea = 1;
        cscale = bignum;
    }
    if (scalea) {
        lapackf77_dlascl( "G", &c_zero, &c_zero, &anrm, &cscale, &n, &n, A, &lda, &ierr );
    }

    /* Balance the matrix
     * (Workspace: need N) */
    ibal = 0;
    lapackf77_dgebal( "B", &n, A, &lda, &ilo, &ihi, &work[ibal], &ierr );

    /* Reduce to upper Hessenberg form
     * (Workspace: need 3*N, prefer 2*N + N*NB) */
    itau = ibal + n;
    iwrk = itau + n;
    liwrk = lwork - iwrk;

    #if defined(VERSION1)
        // Version 1 - LAPACK
        lapackf77_dgehrd( &n, &ilo, &ihi, A, &lda,
                          &work[itau], &work[iwrk], &liwrk, &ierr );
    #elif defined(VERSION2)
        // Version 2 - LAPACK consistent HRD
        magma_dgehrd2( n, ilo, ihi, A, lda,
                       &work[itau], &work[iwrk], liwrk, &ierr );
    #elif defined(VERSION3)
        // Version 3 - LAPACK consistent MAGMA HRD + T matrices stored,
        magma_dgehrd( n, ilo, ihi, A, lda,
                      &work[itau], &work[iwrk], liwrk, dT, &ierr );
    #endif

    if (wantvl) {
        /* Want left eigenvectors
         * Copy Householder vectors to VL */
        side[0] = 'L';
        lapackf77_dlacpy( MagmaLowerStr, &n, &n,
                          A, &lda, vl, &ldvl );

        /* Generate orthogonal matrix in VL
         * (Workspace: need 3*N-1, prefer 2*N + (N-1)*NB) */
        #if defined(VERSION1) || defined(VERSION2)
            // Version 1 & 2 - LAPACK
            lapackf77_dorghr( &n, &ilo, &ihi, vl, &ldvl, &work[itau],
                              &work[iwrk], &liwrk, &ierr );
        #elif defined(VERSION3)
            // Version 3 - LAPACK consistent MAGMA HRD + T matrices stored
            magma_dorghr( n, ilo, ihi, vl, ldvl, &work[itau], dT, nb, &ierr );
        #endif

        /* Perform QR iteration, accumulating Schur vectors in VL
         * (Workspace: need N+1, prefer N+HSWORK (see comments) ) */
        iwrk = itau;
        liwrk = lwork - iwrk;
        lapackf77_dhseqr( "S", "V", &n, &ilo, &ihi, A, &lda, WR, WI,
                          vl, &ldvl, &work[iwrk], &liwrk, info );

        if (wantvr) {
            /* Want left and right eigenvectors
             * Copy Schur vectors to VR */
            side[0] = 'B';
            lapackf77_dlacpy( "F", &n, &n, vl, &ldvl, vr, &ldvr );
        }
    }
    else if (wantvr) {
        /* Want right eigenvectors
         * Copy Householder vectors to VR */
        side[0] = 'R';
        lapackf77_dlacpy( "L", &n, &n, A, &lda, vr, &ldvr );

        /* Generate orthogonal matrix in VR
         * (Workspace: need 3*N-1, prefer 2*N + (N-1)*NB) */
        #if defined(VERSION1) || defined(VERSION2)
            // Version 1 & 2 - LAPACK
            lapackf77_dorghr( &n, &ilo, &ihi, vr, &ldvr, &work[itau],
                              &work[iwrk], &liwrk, &ierr );
        #elif defined(VERSION3)
            // Version 3 - LAPACK consistent MAGMA HRD + T matrices stored
            magma_dorghr( n, ilo, ihi, vr, ldvr, &work[itau], dT, nb, &ierr );
        #endif

        /* Perform QR iteration, accumulating Schur vectors in VR
         * (Workspace: need N+1, prefer N+HSWORK (see comments) ) */
        iwrk = itau;
        liwrk = lwork - iwrk;
        lapackf77_dhseqr( "S", "V", &n, &ilo, &ihi, A, &lda, WR, WI,
                          vr, &ldvr, &work[iwrk], &liwrk, info );
    }
    else {
        /* Compute eigenvalues only
         * (Workspace: need N+1, prefer N+HSWORK (see comments) ) */
        iwrk = itau;
        liwrk = lwork - iwrk;
        lapackf77_dhseqr( "E", "N", &n, &ilo, &ihi, A, &lda, WR, WI,
                          vr, &ldvr, &work[iwrk], &liwrk, info );
    }

    /* If INFO > 0 from DHSEQR, then quit */
    if (*info > 0) {
        goto CLEANUP;
    }

    if (wantvl || wantvr) {
        /* Compute left and/or right eigenvectors
         * (Workspace: need 4*N) */
        liwrk = lwork - iwrk;
        #if TREVC_VERSION == 1
        lapackf77_dtrevc( side, "B", select, &n, A, &lda, vl, &ldvl,
                          vr, &ldvr, &n, &nout, &work[iwrk], &ierr );
        #elif TREVC_VERSION == 2
        lapackf77_dtrevc3( side, "B", select, &n, A, &lda, vl, &ldvl,
                           vr, &ldvr, &n, &nout, &work[iwrk], &liwrk, &ierr );
        #endif
    }

    if (wantvl) {
        /* Undo balancing of left eigenvectors
         * (Workspace: need N) */
        lapackf77_dgebak( "B", "L", &n, &ilo, &ihi, &work[ibal], &n,
                          vl, &ldvl, &ierr );

        /* Normalize left eigenvectors and make largest component real */
        for (i = 0; i < n; ++i) {
            if ( WI[i] == 0. ) {
                scl = 1. / cblas_dnrm2( n, vl(0,i), 1 );
                cblas_dscal( n, scl, vl(0,i), 1 );
            }
            else if ( WI[i] > 0. ) {
                d__1 = cblas_dnrm2( n, vl(0,i),   1 );
                d__2 = cblas_dnrm2( n, vl(0,i+1), 1 );
                scl = 1. / lapackf77_dlapy2( &d__1, &d__2 );
                cblas_dscal( n, scl, vl(0,i),   1 );
                cblas_dscal( n, scl, vl(0,i+1), 1 );
                for (k = 0; k < n; ++k) {
                    /* Computing 2nd power */
                    d__1 = *vl(k,i);
                    d__2 = *vl(k,i+1);
                    work[iwrk + k] = d__1*d__1 + d__2*d__2;
                }
                k = cblas_idamax( n, &work[iwrk], 1 );
                lapackf77_dlartg( vl(k,i), vl(k,i+1), &cs, &sn, &r );
                cblas_drot( n, vl(0,i), 1, vl(0,i+1), 1, cs, sn );
                *vl(k,i+1) = 0.;
            }
        }
    }

    if (wantvr) {
        /* Undo balancing of right eigenvectors
         * (Workspace: need N) */
        lapackf77_dgebak( "B", "R", &n, &ilo, &ihi, &work[ibal], &n,
                          vr, &ldvr, &ierr );

        /* Normalize right eigenvectors and make largest component real */
        for (i = 0; i < n; ++i) {
            if ( WI[i] == 0. ) {
                scl = 1. / cblas_dnrm2( n, vr(0,i), 1 );
                cblas_dscal( n, scl, vr(0,i), 1 );
            }
            else if ( WI[i] > 0. ) {
                d__1 = cblas_dnrm2( n, vr(0,i),   1 );
                d__2 = cblas_dnrm2( n, vr(0,i+1), 1 );
                scl = 1. / lapackf77_dlapy2( &d__1, &d__2 );
                cblas_dscal( n, scl, vr(0,i),   1 );
                cblas_dscal( n, scl, vr(0,i+1), 1 );
                for (k = 0; k < n; ++k) {
                    /* Computing 2nd power */
                    d__1 = *vr(k,i);
                    d__2 = *vr(k,i+1);
                    work[iwrk + k] = d__1*d__1 + d__2*d__2;
                }
                k = cblas_idamax( n, &work[iwrk], 1 );
                lapackf77_dlartg( vr(k,i), vr(k,i+1), &cs, &sn, &r );
                cblas_drot( n, vr(0,i), 1, vr(0,i+1), 1, cs, sn );
                *vr(k,i+1) = 0.;
            }
        }
    }

CLEANUP:
    /* Undo scaling if necessary */
    if (scalea) {
        i__1 = n - (*info);
        i__2 = max( n - (*info), 1 );
        lapackf77_dlascl( "G", &c_zero, &c_zero, &cscale, &anrm, &i__1, &c_one,
                          WR + (*info), &i__2, &ierr );
        lapackf77_dlascl( "G", &c_zero, &c_zero, &cscale, &anrm, &i__1, &c_one,
                          WI + (*info), &i__2, &ierr );
        if (*info > 0) {
            i__1 = ilo - 1;
            lapackf77_dlascl( "G", &c_zero, &c_zero, &cscale, &anrm, &i__1, &c_one,
                              WR, &n, &ierr );
            lapackf77_dlascl( "G", &c_zero, &c_zero, &cscale, &anrm, &i__1, &c_one,
                              WI, &n, &ierr );
        }
    }

    #if defined(VERSION3)
    magma_free( dT );
    #endif
    
    return *info;
} /* magma_dgeev */
Esempio n. 16
0
extern "C" magma_int_t
magma_zunmqr2_gpu(const char side, const char trans,
                  magma_int_t m, magma_int_t n, magma_int_t k,
                  magmaDoubleComplex *da,   magma_int_t ldda,
                  magmaDoubleComplex *tau,
                  magmaDoubleComplex *dc,    magma_int_t lddc,
                  magmaDoubleComplex *wa,    magma_int_t ldwa,
                  magma_int_t *info)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    ZUNMQR overwrites the general complex M-by-N matrix C with

                    SIDE = 'L'     SIDE = 'R'
    TRANS = 'N':      Q * C          C * Q
    TRANS = 'T':      Q**H * C     C * Q**H

    where Q is a complex orthogonal matrix defined as the product of k
    elementary reflectors

          Q = H(1) H(2) . . . H(k)

    as returned by ZGEQRF. Q is of order M if SIDE = 'L' and of order N
    if SIDE = 'R'.

    Arguments
    =========
    SIDE    (input) CHARACTER*1
            = 'L': apply Q or Q**H from the Left;
            = 'R': apply Q or Q**H from the Right.

    TRANS   (input) CHARACTER*1
            = 'N':  No transpose, apply Q;
            = 'T':  Transpose, apply Q**H.

    M       (input) INTEGER
            The number of rows of the matrix C. M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix C. N >= 0.

    K       (input) INTEGER
            The number of elementary reflectors whose product defines
            the matrix Q.
            If SIDE = 'L', M >= K >= 0;
            if SIDE = 'R', N >= K >= 0.

    DA      (input) COMPLEX_16 array, dimension (LDA,K)
            The i-th column must contain the vector which defines the
            elementary reflector H(i), for i = 1,2,...,k, as returned by
            ZGEQRF in the first k columns of its array argument A.
            The diagonal and the upper part
            are destroyed, the reflectors are not modified.

    LDDA    (input) INTEGER
            The leading dimension of the array DA.
            LDDA >= max(1,M) if SIDE = 'L'; LDDA >= max(1,N) if SIDE = 'R'.

    TAU     (input) COMPLEX_16 array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by ZGEQRF.

    DC      (device input/output) COMPLEX_16 array, dimension (LDDC,N)
            On entry, the M-by-N matrix C.
            On exit, C is overwritten by (Q*C) or (Q**H * C) or (C * Q**H) or (C*Q).

    LDDC    (input) INTEGER
            The leading dimension of the array C. LDDC >= max(1,M).

    WA      (input/workspace) COMPLEX_16 array, dimension
                                 (LDWA,M) if SIDE = 'L'
                                 (LDWA,N) if SIDE = 'R'
            The vectors which define the elementary reflectors, as
            returned by ZHETRD_GPU.

    LDWA    (input) INTEGER
            The leading dimension of the array A.
            LDWA >= max(1,M) if SIDE = 'L'; LDWA >= max(1,N) if SIDE = 'R'.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
    =====================================================================   */

    char side_[2]  = {side,  0};
    char trans_[2] = {trans, 0};

    /* Allocate work space on the GPU */
    magmaDoubleComplex *dwork;

    magma_int_t wa_offset, dc_offset, i__4, lddwork;
    magma_int_t i;
    magmaDoubleComplex t[2*4160]        /* was [65][64] */;
    magma_int_t i1, i2, step, ib, ic, jc, nb, mi, ni, nq, nw;
    int left, notran;

    wa_offset = 1 + ldwa;
    wa -= wa_offset;
    --tau;
    dc_offset = 1 + lddc;
    dc -= dc_offset;

    *info = 0;
    left   = lapackf77_lsame(side_,  "L");
    notran = lapackf77_lsame(trans_, "N");

    /* NQ is the order of Q and NW is the minimum dimension of WORK */
    if (left) {
        nq = m;
        nw = n;
        magma_zmalloc( &dwork, (n + 64)*64 );
    } else {
        nq = n;
        nw = m;
        magma_zmalloc( &dwork, (m + 64)*64 );
    }
    if (! left && ! lapackf77_lsame(side_, "R")) {
        *info = -1;
    } else if (! notran && ! lapackf77_lsame(trans_, "T")) {
        *info = -2;
    } else if (m < 0) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (k < 0 || k > nq) {
        *info = -5;
    } else if (ldda < max(1,nq)) {
        *info = -7;
    } else if (lddc < max(1,m)) {
        *info = -10;
    } else if (ldwa < max(1,nq)) {
        *info = -12;
    }

    // size of the block
    nb = 64;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0 || k == 0) {
        return *info;
    }

    /* Use hybrid CPU-GPU code */
    if ( ( left && (! notran) ) ||  ( (! left) && notran ) ) {
        i1 = 1;
        i2 = k;
        step = nb;
    } else {
        i1 = ((k - 1)/nb)*nb + 1;
        i2 = 1;
        step = -nb;
    }

    // silence "uninitialized" warnings
    mi = 0;
    ni = 0;
    
    if (left) {
        ni = n;
        jc = 1;
    } else {
        mi = m;
        ic = 1;
    }

    magmablas_zsetdiag1subdiag0('L', k, nb, da, ldda);

    // for i=i1 to i2 by step
    for (i = i1; (step < 0 ? i >= i2 : i <= i2); i += step) {
        ib = min(nb, k - i + 1);

        /* Form the triangular factor of the block reflector
           H = H(i) H(i+1) . . . H(i+ib-1) */
        i__4 = nq - i + 1;
        lapackf77_zlarft("F", "C", &i__4, &ib, &wa[i + i*ldwa], &ldwa,
                         &tau[i], t, &ib);


        if (left) {
            /* H or H' is applied to C(i:m,1:n) */
            mi = m - i + 1;
            ic = i;
        }
        else {
            /* H or H' is applied to C(1:m,i:n) */
            ni = n - i + 1;
            jc = i;
        }

        if (left)
            lddwork = ni;
        else
            lddwork = mi;

        /* Apply H or H'; First copy T to the GPU */
        magma_zsetmatrix( ib, ib, t, ib, dwork, ib );
        magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
                          mi, ni, ib,
                          da + (i - 1) + (i - 1)*ldda , ldda, dwork, ib,
                          &dc[ic + jc*lddc], lddc,
                          dwork + ib*ib, lddwork);
    }

    magma_free( dwork );

    return *info;
} /* magma_zunmqr */
Esempio n. 17
0
extern "C" magma_int_t
magma_dormql2_gpu(const char side, const char trans,
                  magma_int_t m, magma_int_t n, magma_int_t k,
                  double *da, magma_int_t ldda,
                  double *tau,
                  double *dc, magma_int_t lddc,
                  double *wa, magma_int_t ldwa,
                  magma_int_t *info)
{
/*  -- MAGMA (version 1.4.1) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       December 2013

    Purpose
    =======
    DORMQL overwrites the general real M-by-N matrix C with

                    SIDE = 'L'     SIDE = 'R'
    TRANS = 'N':      Q * C          C * Q
    TRANS = 'C':      Q**T * C       C * Q**T

    where Q is a real unitary matrix defined as the product of k
    elementary reflectors

          Q = H(k) . . . H(2) H(1)

    as returned by DGEQLF. Q is of order M if SIDE = 'L' and of order N
    if SIDE = 'R'.

    Arguments
    =========
    SIDE    (input) CHARACTER*1
            = 'L': apply Q or Q**T from the Left;
            = 'R': apply Q or Q**T from the Right.

    TRANS   (input) CHARACTER*1
            = 'N':  No transpose, apply Q;
            = 'C':  Transpose, apply Q**T.

    M       (input) INTEGER
            The number of rows of the matrix C. M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix C. N >= 0.

    K       (input) INTEGER
            The number of elementary reflectors whose product defines
            the matrix Q.
            If SIDE = 'L', M >= K >= 0;
            if SIDE = 'R', N >= K >= 0.

    DA      (input) DOUBLE_PRECISION array, dimension (LDA,K)
            The i-th column must contain the vector which defines the
            elementary reflector H(i), for i = 1,2,...,k, as returned by
            DGEQLF in the last k columns of its array argument A.
            The diagonal and the lower part
            are destroyed, the reflectors are not modified.

    LDDA    (input) INTEGER
            The leading dimension of the array DA.
            LDDA >= max(1,M) if SIDE = 'L'; LDDA >= max(1,N) if SIDE = 'R'.

    TAU     (input) DOUBLE_PRECISION array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by DGEQLF.

    DC      (device input/output) DOUBLE_PRECISION array, dimension (LDDC,N)
            On entry, the M-by-N matrix C.
            On exit, C is overwritten by Q*C or Q**T*C or C*Q**T or C*Q.

    LDDC    (input) INTEGER
            The leading dimension of the array C. LDDC >= max(1,M).

    WA      (input/workspace) DOUBLE_PRECISION array, dimension
                                 (LDWA,M) if SIDE = 'L'
                                 (LDWA,N) if SIDE = 'R'
            The vectors which define the elementary reflectors, as
            returned by DSYTRD_GPU.

    LDWA    (input) INTEGER
            The leading dimension of the array A.
            LDWA >= max(1,M) if SIDE = 'L'; LDWA >= max(1,N) if SIDE = 'R'.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
    =====================================================================    */
    
    char side_[2] = {side, 0};
    char trans_[2] = {trans, 0};

    /* Allocate work space on the GPU */
    double *dwork;
    magma_dmalloc( &dwork, 2*(m + 64)*64 );

    magma_int_t wa_offset, dc_offset, i__4;
    
    magma_int_t i__;
    double t[2*4160]        /* was [65][64] */;
    magma_int_t i1, i2, i3, ib, nb, mi, ni, nq, nw;
    magma_int_t ldwork;
    int left, notran;

    wa_offset = 1 + ldwa;
    wa -= wa_offset;
    --tau;
    dc_offset = 1 + lddc;
    dc -= dc_offset;

    *info  = 0;
    left   = lapackf77_lsame(side_, "L");
    notran = lapackf77_lsame(trans_, "N");

    /* NQ is the order of Q and NW is the minimum dimension of WORK */
    if (left) {
        nq = m;
        nw = max(1,n);
    } else {
        nq = n;
        nw = max(1,m);
    }
    if (! left && ! lapackf77_lsame(side_, "R")) {
        *info = -1;
    } else if (! notran && ! lapackf77_lsame(trans_, "C")) {
        *info = -2;
    } else if (m < 0) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (k < 0 || k > nq) {
        *info = -5;
    } else if (ldda < max(1,nq)) {
        *info = -7;
    } else if (lddc < max(1,m)) {
        *info = -10;
    } else if (ldwa < max(1,nq)) {
        *info = -12;
    }
    
    // size of the block
    nb = 64;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0) {
        return *info;
    }

    ldwork = nw;
        
    /* Use hybrid CPU-GPU code */
    if ((left && notran) || (! left && ! notran)) {
        i1 = 1;
        i2 = k;
        i3 = nb;
    } else {
        i1 = (k - 1) / nb * nb + 1;
        i2 = 1;
        i3 = -nb;
    }
    
    // silence "uninitialized" warnings
    mi = 0;
    ni = 0;
    
    if (left) {
        ni = n;
    } else {
        mi = m;
    }
    
    magmablas_dsetdiag1subdiag0('U', k, nb, da, ldda);
    
    for (i__ = i1; (i3 < 0 ? i__ >= i2 : i__ <= i2); i__ += i3) {
        ib = min(nb, k - i__ + 1);
        
        /* Form the triangular factor of the block reflector
           H = H(i+ib-1) . . . H(i+1) H(i) */
        i__4 = nq - k + i__ + ib - 1;
        lapackf77_dlarft("Backward", "Columnwise", &i__4, &ib,
                         &wa[i__ * ldwa + 1], &ldwa, &tau[i__], t, &ib);
    
        if (left) {
            /* H or H' is applied to C(1:m-k+i+ib-1,1:n) */
            mi = m - k + i__ + ib - 1;
        }
        else {
            /* H or H' is applied to C(1:m,1:n-k+i+ib-1) */
            ni = n - k + i__ + ib - 1;
        }
        
        /* Apply H or H'; First copy T to the GPU */
        magma_dsetmatrix( ib, ib, t, ib, dwork+i__4*ib, ib );
        magma_dlarfb_gpu(side, trans, MagmaBackward, MagmaColumnwise,
                         mi, ni, ib,
                         &da[(i__-1) * ldda], ldda, dwork+i__4*ib, ib,
                         &dc[1+lddc], lddc,
                         dwork+i__4*ib + ib*ib, ldwork);
    }

    magma_free( dwork );

    return *info;
} /* magma_dormql */
Esempio n. 18
0
extern "C" magma_int_t
magma_cpotri(char uplo, magma_int_t n,
              magmaFloatComplex *A, magma_int_t lda, magma_int_t *info)
{
/*  -- MAGMA (version 1.4.1) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       December 2013

    Purpose
    =======
    CPOTRI computes the inverse of a real symmetric positive definite
    matrix A using the Cholesky factorization A = U**T*U or A = L*L**T
    computed by CPOTRF.

    Arguments
    =========
    UPLO    (input) CHARACTER*1
            = 'U':  Upper triangle of A is stored;
            = 'L':  Lower triangle of A is stored.

    N       (input) INTEGER
            The order of the matrix A.  N >= 0.

    A       (input/output) COMPLEX array, dimension (LDA,N)
            On entry, the triangular factor U or L from the Cholesky
            factorization A = U**T*U or A = L*L**T, as computed by
            CPOTRF.
            On exit, the upper or lower triangle of the (symmetric)
            inverse of A, overwriting the input factor U or L.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
            > 0:  if INFO = i, the (i,i) element of the factor U or L is
                  zero, and the inverse could not be computed.

    ===================================================================== */

    /* Local variables */
    char uplo_[2] = {uplo, 0};

    *info = 0;
    if ((! lapackf77_lsame(uplo_, "U")) && (! lapackf77_lsame(uplo_, "L")))
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (lda < max(1,n))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if ( n == 0 )
        return *info;
    
    /* Invert the triangular Cholesky factor U or L */
    magma_ctrtri( uplo, MagmaNonUnit, n, A, lda, info );
    if ( *info == 0 ) {
        /* Form inv(U) * inv(U)**T or inv(L)**T * inv(L) */
        magma_clauum( uplo, n, A, lda, info );
    }
    
    return *info;
} /* magma_cpotri */
magma_int_t
magma_zsytrf_stapiv_gpu(char uplo, magma_int_t n,
                        cuDoubleComplex *dA, magma_int_t ldda,
                        double criteria, PASTIX_INT * npiv, PASTIX_FLOAT * tmp4, magma_int_t *info)
{
/*  -- MAGMA (version 1.2.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       May 2012

    Purpose
    =======
    ZSYTRF computes the Cholesky factorization of a complex Hermitian
    positive definite matrix dA.

    The factorization has the form
       dA = U**H * U,  if UPLO = 'U', or
       dA = L  * L**H,  if UPLO = 'L',
    where U is an upper triangular matrix and L is lower triangular.

    This is the block version of the algorithm, calling Level 3 BLAS.

    Arguments
    =========
    UPLO    (input) CHARACTER*1
            = 'U':  Upper triangle of dA is stored;
            = 'L':  Lower triangle of dA is stored.

    N       (input) INTEGER
            The order of the matrix dA.  N >= 0.

    dA      (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N)
            On entry, the Hermitian matrix dA.  If UPLO = 'U', the leading
            N-by-N upper triangular part of dA contains the upper
            triangular part of the matrix dA, and the strictly lower
            triangular part of dA is not referenced.  If UPLO = 'L', the
            leading N-by-N lower triangular part of dA contains the lower
            triangular part of the matrix dA, and the strictly upper
            triangular part of dA is not referenced.

            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization dA = U**H * U or dA = L * L**H.

    LDDA     (input) INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalescent memory accesses LDDA must be
            dividable by 16.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
            > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.
    =====================================================================   */


    magma_int_t     j, jb, nb;
    char            uplo_[2] = {uplo, 0};
    cuDoubleComplex c_one     = MAGMA_Z_ONE;
    cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    cuDoubleComplex *work;
    double          d_one     =  1.0;
    double          d_neg_one = -1.0;
    long int        upper = uplo_lapackf77_lsame(uplo_, "U");

    *info = 0;
    if ( (! upper) && (! lapackf77_lsame(uplo_, "L")) ) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,n)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    nb = magma_get_zsytrf_nb(n);

    if (MAGMA_SUCCESS != magma_zmalloc_host( &work, nb*nb )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }

    static cudaStream_t stream[2];
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );

    if ((nb <= 1) || (nb >= n)) {
        /*  Use unblocked code. */
        magma_zgetmatrix( n, n, dA, ldda, work, n );
        assert(!upper); /* PaStiX only works with lower */
        PASTIX_sytrf_block((PASTIX_FLOAT*)work, n, n,
                           npiv,
                           criteria, tmp4);
        magma_zsetmatrix( n, n, work, n, dA, ldda );
    } else {
        /* Use blocked code. */
        if (upper) {
          assert(0); /* PaStiX only works with lower */
      
            /* Compute the Cholesky factorization A = U'*U. */
            for (j=0; j<n; j+=nb) {

                /* Update and factorize the current diagonal block and test
                   for non-positive-definiteness. Computing MIN */
                jb = min(nb, (n-j));

                magma_zherk(MagmaUpper, MagmaConjTrans, jb, j,
                            d_neg_one, dA(0, j), ldda,
                            d_one,     dA(j, j), ldda);

                magma_zgetmatrix_async( jb, jb,
                                        dA(j, j), ldda,
                                        work,     jb, stream[1] );

                if ( (j+jb) < n) {
                    /* Compute the current block row. */
                    magma_zgemm(MagmaConjTrans, MagmaNoTrans,
                                jb, (n-j-jb), j,
                                c_neg_one, dA(0, j   ), ldda,
                                           dA(0, j+jb), ldda,
                                c_one,     dA(j, j+jb), ldda);
                }

                magma_queue_sync( stream[1] );

                /* lapackf77_zsytrf(MagmaUpperStr, &jb, work, &jb, info); */
                magma_zsetmatrix_async( jb, jb,
                                        work,     jb,
                                        dA(j, j), ldda, stream[0] );
                if (*info != 0) {
                  *info = *info + j;
                  break;
                }

                if ( (j+jb) < n)
                    magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                 jb, (n-j-jb),
                                 c_one, dA(j, j   ), ldda,
                                        dA(j, j+jb), ldda);
            }
        } else {
            //=========================================================
            // Compute the Cholesky factorization A = L*L'.
            for (j=0; j<n; j+=nb) {

                //  Update and factorize the current diagonal block and test
                //  for non-positive-definiteness. Computing MIN
                jb = min(nb, (n-j));

                magma_zsyrk(MagmaLower, MagmaNoTrans, jb, j,
                            c_neg_one, dA(j, 0), ldda,
                            c_one,     dA(j, j), ldda);

                magma_zgetmatrix_async( jb, jb,
                                        dA(j, j), ldda,
                                        work,     jb, stream[1] );

                if ( (j+jb) < n) {
                    magma_zgemm( MagmaNoTrans, MagmaConjTrans,
                                 (n-j-jb), jb, j,
                                 c_neg_one, dA(j+jb, 0), ldda,
                                            dA(j,    0), ldda,
                                 c_one,     dA(j+jb, j), ldda);
                }

                magma_queue_sync( stream[1] );
                PASTIX_sytrf_block((PASTIX_FLOAT*)work, jb, jb,
                                   npiv,
                                   criteria, tmp4);

                magma_zsetmatrix_async( jb, jb,
                                        work,     jb,
                                        dA(j, j), ldda, stream[0] );
                if (*info != 0) {
                  *info = *info + j;
                  break;
                }

                if ( (j+jb) < n)
                    magma_ztrsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                (n-j-jb), jb,
                                c_one, dA(j,    j), ldda,
                                       dA(j+jb, j), ldda);
            }

        }
    }

    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[1] );
    magma_free_host( work );

    return *info;
} /* magma_zsytrf_gpu */
extern "C" magma_int_t
magma_sormqr_gpu(magma_side_t side, magma_trans_t trans,
                 magma_int_t m, magma_int_t n, magma_int_t k,
                 magmaFloat_ptr dA, size_t dA_offset, magma_int_t ldda, 
                 float *tau,
                 magmaFloat_ptr dC, size_t dC_offset, magma_int_t lddc,
                 float *hwork, magma_int_t lwork,
                 magmaFloat_ptr dT, size_t dT_offset, magma_int_t nb, 
                 magma_int_t *info, magma_queue_t queue)
{
/*  -- clMAGMA (version 1.0.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       April 2012

    Purpose
    =======
    SORMQR_GPU overwrites the general real M-by-N matrix C with

                    SIDE = 'L'     SIDE = 'R'
    TRANS = 'N':      Q * C          C * Q
    TRANS = 'T':      Q**T * C       C * Q**T

    where Q is a real orthogonal matrix defined as the product of k
    elementary reflectors

          Q = H(1) H(2) . . . H(k)

    as returned by SGEQRF. Q is of order M if SIDE = 'L' and of order N
    if SIDE = 'R'.

    Arguments
    =========
    SIDE    (input) CHARACTER*1
            = 'L': apply Q or Q**T from the Left;
            = 'R': apply Q or Q**T from the Right.

    TRANS   (input) CHARACTER*1
            = 'N':  No transpose, apply Q;
            = 'T':  Transpose, apply Q**T.

    M       (input) INTEGER
            The number of rows of the matrix C. M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix C. N >= 0.

    K       (input) INTEGER
            The number of elementary reflectors whose product defines
            the matrix Q.
            If SIDE = 'L', M >= K >= 0;
            if SIDE = 'R', N >= K >= 0.

    DA      (input) REAL array on the GPU, dimension (LDDA,K)
            The i-th column must contain the vector which defines the
            elementary reflector H(i), for i = 1,2,...,k, as returned by
            SGEQRF in the first k columns of its array argument DA.
            DA is modified by the routine but restored on exit.

    LDDA    (input) INTEGER
            The leading dimension of the array DA.
            If SIDE = 'L', LDDA >= max(1,M);
            if SIDE = 'R', LDDA >= max(1,N).

    TAU     (input) REAL array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by SGEQRF.

    DC      (input/output) REAL array on the GPU, dimension (LDDC,N)
            On entry, the M-by-N matrix C.
            On exit, C is overwritten by Q*C or Q**T * C or C * Q**T or C*Q.

    LDDC     (input) INTEGER
            The leading dimension of the array DC. LDDC >= max(1,M).

    HWORK    (workspace/output) REAL array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, HWORK(1) returns the optimal LWORK.

    LWORK   (input) INTEGER
            The dimension of the array HWORK.
            LWORK >= (M-K+NB)*(N+2*NB) if SIDE = 'L', 
            and LWORK >= (N-K+NB)*(M+2*NB) if SIDE = 'R', where NB is the 
            optimal blocksize.

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the HWORK array, returns
            this value as the first entry of the HWORK array, and no error
            message related to LWORK is issued by XERBLA.

    DT      (input) REAL array on the GPU that is the output 
            (the 9th argument) of magma_sgeqrf_gpu.

    NB      (input) INTEGER
            This is the blocking size that was used in pre-computing DT, e.g.,
            the blocking size used in magma_sgeqrf_gpu.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
    =====================================================================   */

    #define a_ref(a_1,a_2) dA, (dA_offset+(a_1)+(a_2)*(ldda))
    #define c_ref(a_1,a_2) dC, (dC_offset+(a_1)+(a_2)*(lddc))           
    #define t_ref(a_1)     dT, (dT_offset+(a_1)*nb)

    float c_one = MAGMA_S_ONE;

    magma_side_t side_ = side;
    magma_trans_t trans_ = trans;

    magmaFloat_ptr dwork;
    magma_int_t i, lddwork;

    magma_int_t i1, i2, i3, ib, ic, jc, mi, ni, nq, nw, ret;
    long int left, notran, lquery;
    static magma_int_t lwkopt;

    *info = 0;
    left   = lapackf77_lsame(lapack_const(side_), lapack_const(MagmaLeft));
    notran = lapackf77_lsame(lapack_const(trans_), lapack_const(MagmaNoTrans));
    lquery = (lwork == -1);

    if (!left || notran)
      printf("sormqr_gpu called with arguments not yet supported\n");

    /* NQ is the order of Q and NW is the minimum dimension of WORK */
    if (left) {
        nq = m;
        nw = n;
    } else {
        nq = n;
        nw = m;
    }
    if ( (!left) && (!lapackf77_lsame(lapack_const(side_), lapack_const(MagmaRight))) ) {
        *info = -1;
    } else if ( (!notran) && (!lapackf77_lsame(lapack_const(trans_), lapack_const(MagmaTrans))) ) {
        *info = -2;
    } else if (m < 0) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (k < 0 || k > nq) {
        *info = -5;
    } else if (ldda < max(1,nq)) {
        *info = -7;
    } else if (lddc < max(1,m)) {
        *info = -10;
    } else if (lwork < max(1,nw) && ! lquery) {
        *info = -12;
    }

    lwkopt = (m-k+nb)*(n+2*nb);
    hwork[0] = MAGMA_S_MAKE( lwkopt, 0 );

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0 || k == 0) {
        hwork[0] = c_one;
        return *info;
    }

    lddwork= k;
    dwork  = dT;
	size_t dwork_offset = 2*lddwork*nb;

    if ( (left && (! notran)) || ( (!left) && notran ) ) {
        i1 = 0;
        i2 = k-nb;
        i3 = nb;
    } else {
        i1 = (k - 1 - nb) / nb * nb;
        i2 = 0;
        i3 = -nb;
    }

    if (left) {
        ni = n;
        jc = 0;
    } else {
        mi = m;
        ic = 0;
    }

    if (nb < k)
    {
        for (i=i1; i3<0 ? i>i2 : i<i2; i+=i3)
        {
            ib = min(nb, k - i);
            if (left){
                mi = m - i;
                ic = i;
            }
            else {
                ni = n - i;
                jc = i;
            }
            ret = magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise,
                                    mi, ni, ib, 
                                    a_ref(i,  i ), ldda, t_ref(i), nb, 
                                    c_ref(ic, jc), lddc, dwork, dwork_offset, nw, queue);
            if ( ret != MAGMA_SUCCESS )
              return ret;
        }
    }
    else
    {
        i = i1;
    }

    /* Use unblocked code to multiply the last or only block. */
    if (i < k) {
        ib   = k-i;
        if (left){
            mi = m - i;
            ic = i;
        }
        else {
            ni = n - i;
            jc = i;
        }

		magma_sgetmatrix(mi, ib, a_ref(i, i), ldda, hwork, 0, mi, queue);
        magma_sgetmatrix(mi, ni, c_ref(ic, jc), lddc, hwork+mi*ib, 0, mi, queue);

        magma_int_t lhwork = lwork - mi*(ib + ni);
        lapackf77_sormqr( MagmaLeftStr, MagmaTransStr, 
                          &mi, &ni, &ib, 
                          hwork,       &mi, tau+i, 
                          hwork+mi*ib, &mi, 
                          hwork+mi*(ib+ni), &lhwork, info);

        // send the updated part of c back to the GPU
		magma_ssetmatrix(mi, ni, hwork+mi*ib, 0, mi, c_ref(ic, jc), lddc, queue);
    }

    return *info;
    /* End of MAGMA_SORMQR_GPU */
}
Esempio n. 21
0
extern "C" magma_int_t
magma_chetrd2_gpu(char uplo, magma_int_t n,               
                  cuFloatComplex *da, magma_int_t ldda, 
                  float *d, float *e, cuFloatComplex *tau,
                  cuFloatComplex *wa,  magma_int_t ldwa,
                  cuFloatComplex *work, magma_int_t lwork, 
                  cuFloatComplex *dwork, magma_int_t ldwork,
                  magma_int_t *info)
{
/*  -- MAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2012

    Purpose   
    =======   
    CHETRD2_GPU reduces a complex Hermitian matrix A to real symmetric   
    tridiagonal form T by an orthogonal similarity transformation:   
    Q**H * A * Q = T.   
    This version passes a workspace that is used in an optimized 
    GPU matrix-vector product.

    Arguments   
    =========   
    UPLO    (input) CHARACTER*1   
            = 'U':  Upper triangle of A is stored;   
            = 'L':  Lower triangle of A is stored.   

    N       (input) INTEGER   
            The order of the matrix A.  N >= 0.   

    DA      (device input/output) COMPLEX array, dimension (LDA,N)   
            On entry, the Hermitian matrix A.  If UPLO = 'U', the leading   
            N-by-N upper triangular part of A contains the upper   
            triangular part of the matrix A, and the strictly lower   
            triangular part of A is not referenced.  If UPLO = 'L', the   
            leading N-by-N lower triangular part of A contains the lower   
            triangular part of the matrix A, and the strictly upper   
            triangular part of A is not referenced.   
            On exit, if UPLO = 'U', the diagonal and first superdiagonal   
            of A are overwritten by the corresponding elements of the   
            tridiagonal matrix T, and the elements above the first   
            superdiagonal, with the array TAU, represent the orthogonal   
            matrix Q as a product of elementary reflectors; if UPLO   
            = 'L', the diagonal and first subdiagonal of A are over-   
            written by the corresponding elements of the tridiagonal   
            matrix T, and the elements below the first subdiagonal, with   
            the array TAU, represent the orthogonal matrix Q as a product   
            of elementary reflectors. See Further Details.   

    LDDA    (input) INTEGER   
            The leading dimension of the array A.  LDA >= max(1,N).   

    D       (output) COMPLEX array, dimension (N)   
            The diagonal elements of the tridiagonal matrix T:   
            D(i) = A(i,i).   

    E       (output) COMPLEX array, dimension (N-1)   
            The off-diagonal elements of the tridiagonal matrix T:   
            E(i) = A(i,i+1) if UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'.   

    TAU     (output) COMPLEX array, dimension (N-1)   
            The scalar factors of the elementary reflectors (see Further   
            Details).   

    WA      (workspace/output) COMPLEX array, dimension (LDA,N)
            On exit the diagonal, the  upper part (UPLO='U')
            or the lower part (UPLO='L') are copies of DA

    LDWA    (input) INTEGER   
            The leading dimension of the array WA.  LDWA >= max(1,N).
 
    WORK    (workspace/output) COMPLEX array, dimension (MAX(1,LWORK))   
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.   

    LWORK   (input) INTEGER   
            The dimension of the array WORK.  LWORK >= 1.   
            For optimum performance LWORK >= N*NB, where NB is the   
            optimal blocksize.   

            If LWORK = -1, then a workspace query is assumed; the routine   
            only calculates the optimal size of the WORK array, returns   
            this value as the first entry of the WORK array, and no error   
            message related to LWORK is issued by XERBLA.   

    DWORK   (workspace/output) COMPLEX array on the GPU, dim (MAX(1,LDWORK))
 
    LDWORK  (input) INTEGER
            The dimension of the array DWORK.  LDWORK >= 1.
            To be done: determine the precise dimension needed

    INFO    (output) INTEGER   
            = 0:  successful exit   
            < 0:  if INFO = -i, the i-th argument had an illegal value   

    Further Details   
    ===============   
    If UPLO = 'U', the matrix Q is represented as a product of elementary   
    reflectors   

       Q = H(n-1) . . . H(2) H(1).   

    Each H(i) has the form   

       H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with   
    v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in   
    A(1:i-1,i+1), and tau in TAU(i).   

    If UPLO = 'L', the matrix Q is represented as a product of elementary   
    reflectors   

       Q = H(1) H(2) . . . H(n-1).   

    Each H(i) has the form   

       H(i) = I - tau * v * v'   

    where tau is a complex scalar, and v is a complex vector with   
    v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i),   
    and tau in TAU(i).

    The contents of A on exit are illustrated by the following examples   
    with n = 5:   

    if UPLO = 'U':                       if UPLO = 'L':   

      (  d   e   v2  v3  v4 )              (  d                  )   
      (      d   e   v3  v4 )              (  e   d              )   
      (          d   e   v4 )              (  v1  e   d          )   
      (              d   e  )              (  v1  v2  e   d      )   
      (                  d  )              (  v1  v2  v3  e   d  )   

    where d and e denote diagonal and off-diagonal elements of T, and vi   
    denotes an element of the vector defining H(i).   
    =====================================================================    */

    char uplo_[2] = {uplo, 0};

    magma_int_t nb = magma_get_chetrd_nb(n); 

    cuFloatComplex c_neg_one = MAGMA_C_NEG_ONE;
    cuFloatComplex c_one     = MAGMA_C_ONE;
    float          d_one     = MAGMA_D_ONE;
    
    magma_int_t kk, nx;
    magma_int_t i, j, i_n;
    magma_int_t iinfo;
    magma_int_t ldw, lddw, lwkopt;
    magma_int_t lquery;

    *info = 0;
    int upper = lapackf77_lsame(uplo_, "U");
    lquery = lwork == -1;
    if (! upper && ! lapackf77_lsame(uplo_, "L")) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,n)) {
        *info = -4;
    } else if (ldwa < max(1,n)) {
        *info = -9;
    } else if (lwork < 1 && ! lquery) { 
        *info = -11;
    }

    if (*info == 0) {
      /* Determine the block size. */
      ldw = lddw = n;
      lwkopt = n * nb;
      MAGMA_C_SET2REAL( work[0], lwkopt );
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
      return *info;

    /* Quick return if possible */
    if (n == 0) {
        work[0] = c_one;
        return *info;
    }

    if (n < 1024)
      nx = n;
    else
      nx = 300;
  
    if (2*ldw*nb > ldwork){
      printf("Not enough work space passed in chetrd2_gpu. Exit\n");
      exit(1);
    }

  if (upper) {
    /*  Reduce the upper triangle of A.   
     Columns 1:kk are handled by the unblocked method. */
    kk = n - (n - nx + nb - 1) / nb * nb;
    
    for (i = n - nb; i >= kk; i -= nb) 
    {
      /* Reduce columns i:i+nb-1 to tridiagonal form and form the   
       matrix W which is needed to update the unreduced part of   
       the matrix */
      
      /*   Get the current panel */
      magma_cgetmatrix( i+nb, nb, dA(0, i), ldda, A(0, i), ldwa );
      
      magma_clatrd2(uplo, i+nb, nb, A(0, 0), ldwa, e, tau, 
                    work, ldw, dA(0, 0), ldda, dwork, lddw, dwork + 2*ldw*nb, ldwork - 2*ldw*nb);
      
      /* Update the unreduced submatrix A(0:i-2,0:i-2), using an   
       update of the form:  A := A - V*W' - W*V' */
      
      magma_csetmatrix( i + nb, nb, work, ldw, dwork, lddw );
      
      magma_cher2k(uplo, MagmaNoTrans, i, nb, c_neg_one, 
                   dA(0, i), ldda, dwork, 
                   lddw, d_one, dA(0, 0), ldda);
      
      /* Copy superdiagonal elements back into A, and diagonal   
       elements into D */
      for (j = i; j < i+nb; ++j) {
        MAGMA_C_SET2REAL( *A(j-1, j), e[j - 1] );
        d[j] = MAGMA_C_REAL( *A(j, j) );
      }      
    }
    
    magma_cgetmatrix( kk, kk, dA(0, 0), ldda, A(0, 0), ldwa );
    
    /*  Use CPU code to reduce the last or only block */
    lapackf77_chetrd(uplo_, &kk, A(0, 0), &ldwa, d, e, tau, work, &lwork, &iinfo);
    
    magma_csetmatrix( kk, kk, A(0, 0), ldwa, dA(0, 0), ldda );
  } 
  else 
  {
    /* Reduce the lower triangle of A */
    for (i = 0; i < n-nx; i += nb) 
    {
      /* Reduce columns i:i+nb-1 to tridiagonal form and form the
       matrix W which is needed to update the unreduced part of
       the matrix */
      
      /*   Get the current panel */
      magma_cgetmatrix( n-i, nb, dA(i, i), ldda, A(i, i), ldwa );
      
      magma_clatrd2(uplo, n-i, nb, A(i, i), ldwa, &e[i], 
                    &tau[i], work, ldw, 
                    dA(i, i), ldda,
                    dwork, lddw,
                    dwork + 2*ldw*nb, ldwork - 2*ldw*nb);
      
      /* Update the unreduced submatrix A(i+ib:n,i+ib:n), using   
       an update of the form:  A := A - V*W' - W*V' */      
      magma_csetmatrix( n-i, nb, work, ldw, dwork, lddw );
      
      magma_cher2k(MagmaLower, MagmaNoTrans, n-i-nb, nb, c_neg_one, 
                   dA(i+nb, i), ldda, 
                   &dwork[nb], lddw, d_one, 
                   dA(i+nb, i+nb), ldda);
      
      /* Copy subdiagonal elements back into A, and diagonal   
       elements into D */
      for (j = i; j < i+nb; ++j) {
        MAGMA_C_SET2REAL( *A(j+1, j), e[j] );
        d[j] = MAGMA_C_REAL( *A(j, j) );
      }
    }
    /* Use unblocked code to reduce the last or only block */
    magma_cgetmatrix( n-i, n-i, dA(i, i), ldda, A(i, i), ldwa );
    
    i_n = n-i;
    lapackf77_chetrd(uplo_, &i_n, A(i, i), &ldwa, &d[i], &e[i],
                     &tau[i], work, &lwork, &iinfo);
    
    magma_csetmatrix( n-i, n-i, A(i, i), ldwa, dA(i, i), ldda );
  }  
    
    MAGMA_C_SET2REAL( work[0], lwkopt );
    return *info;
} /* chetrd2_gpu */
Esempio n. 22
0
extern "C" magma_int_t
magma_zhegst_gpu(magma_int_t itype, char uplo, magma_int_t n,
                 cuDoubleComplex *da, magma_int_t ldda,
                 cuDoubleComplex *db, magma_int_t lddb, magma_int_t *info)
{
/*
  -- MAGMA (version 1.3.0) --
     Univ. of Tennessee, Knoxville
     Univ. of California, Berkeley
     Univ. of Colorado, Denver
     November 2012
 
   Purpose
   =======
   ZHEGST_GPU reduces a complex Hermitian-definite generalized
   eigenproblem to standard form.
   
   If ITYPE = 1, the problem is A*x = lambda*B*x,
   and A is overwritten by inv(U**H)*A*inv(U) or inv(L)*A*inv(L**H)
   
   If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or
   B*A*x = lambda*x, and A is overwritten by U*A*U**H or L**H*A*L.
   
   B must have been previously factorized as U**H*U or L*L**H by ZPOTRF.
   
   Arguments
   =========
   ITYPE   (input) INTEGER
           = 1: compute inv(U**H)*A*inv(U) or inv(L)*A*inv(L**H);
           = 2 or 3: compute U*A*U**H or L**H*A*L.
   
   UPLO    (input) CHARACTER*1
           = 'U':  Upper triangle of A is stored and B is factored as
                   U**H*U;
           = 'L':  Lower triangle of A is stored and B is factored as
                   L*L**H.
   
   N       (input) INTEGER
           The order of the matrices A and B.  N >= 0.
   
   DA      (device input/output) COMPLEX*16 array, dimension (LDA,N)
           On entry, the Hermitian matrix A.  If UPLO = 'U', the leading
           N-by-N upper triangular part of A contains the upper
           triangular part of the matrix A, and the strictly lower
           triangular part of A is not referenced.  If UPLO = 'L', the
           leading N-by-N lower triangular part of A contains the lower
           triangular part of the matrix A, and the strictly upper
           triangular part of A is not referenced.
   
           On exit, if INFO = 0, the transformed matrix, stored in the
           same format as A.
   
   LDDA    (input) INTEGER
           The leading dimension of the array A.  LDA >= max(1,N).
   
   DB      (device input) COMPLEX*16 array, dimension (LDB,N)
           The triangular factor from the Cholesky factorization of B,
           as returned by ZPOTRF.
   
   LDDB    (input) INTEGER
           The leading dimension of the array B.  LDB >= max(1,N).
   
   INFO    (output) INTEGER
           = 0:  successful exit
           < 0:  if INFO = -i, the i-th argument had an illegal value
   =====================================================================*/
  
  char uplo_[2] = {uplo, 0};
  magma_int_t        nb;
  magma_int_t        k, kb, kb2;
  cuDoubleComplex    c_one      = MAGMA_Z_ONE;
  cuDoubleComplex    c_neg_one  = MAGMA_Z_NEG_ONE;
  cuDoubleComplex    c_half     = MAGMA_Z_HALF;
  cuDoubleComplex    c_neg_half = MAGMA_Z_NEG_HALF;
  cuDoubleComplex   *w;
  magma_int_t        lda;
  magma_int_t        ldb;
  double             d_one = 1.0;
  int upper = lapackf77_lsame(uplo_, "U");
  
  /* Test the input parameters. */
  *info = 0;
  if (itype<1 || itype>3){
    *info = -1;
  }else if ((! upper) && (! lapackf77_lsame(uplo_, "L"))) {
    *info = -2;
  } else if (n < 0) {
    *info = -3;
  } else if (ldda < max(1,n)) {
    *info = -5;
  }else if (lddb < max(1,n)) {
    *info = -7;
  }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
  
  /* Quick return */
  if ( n == 0 )
    return *info;
  
  nb = magma_get_zhegst_nb(n);
  
  lda = nb;
  ldb = nb;
  
  if (MAGMA_SUCCESS != magma_zmalloc_pinned( &w, 2*nb*nb )) {
    *info = MAGMA_ERR_DEVICE_ALLOC;
    return *info;
  }
  
  cudaStream_t stream[3];
  magma_queue_create( &stream[0] );
  magma_queue_create( &stream[1] );
  magma_queue_create( &stream[2] );
  
  /* Use hybrid blocked code */    
  if (itype==1) 
    {
      if (upper) 
        {
          kb = min(n,nb);
        
          /* Compute inv(U')*A*inv(U) */
          magma_zgetmatrix_async( kb, kb,
                                  dB(0, 0), lddb,
                                  B(0, 0),  nb, stream[2] );
          magma_zgetmatrix_async( kb, kb,
                                  dA(0, 0), ldda,
                                  A(0, 0),  nb, stream[1] );
          
          for(k = 0; k<n; k+=nb){
            kb = min(n-k,nb);
            kb2= min(n-k-nb,nb);
            
            /* Update the upper triangle of A(k:n,k:n) */
            
            magma_queue_sync( stream[2] );
            magma_queue_sync( stream[1] );
            
            lapackf77_zhegs2( &itype, uplo_, &kb, A(0,0), &lda, B(0,0), &ldb, info);
            
            magma_zsetmatrix_async( kb, kb,
                                    A(0, 0),  lda,
                                    dA(k, k), ldda, stream[0] );
            
            if(k+kb<n){
              
              // Start copying the new B block
              magma_zgetmatrix_async( kb2, kb2,
                                      dB(k+kb, k+kb), lddb,
                                      B(0, 0),        nb, stream[2] );
            
              magma_ztrsm(MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                          kb, n-k-kb,
                          c_one, dB(k,k), lddb, 
                          dA(k,k+kb), ldda); 
            
              magma_queue_sync( stream[0] );
            
              magma_zhemm(MagmaLeft, MagmaUpper,
                          kb, n-k-kb,
                          c_neg_half, dA(k,k), ldda,
                          dB(k,k+kb), lddb,
                          c_one, dA(k, k+kb), ldda);
              
              magma_zher2k(MagmaUpper, MagmaConjTrans,
                           n-k-kb, kb,
                           c_neg_one, dA(k,k+kb), ldda,
                           dB(k,k+kb), lddb,
                           d_one, dA(k+kb,k+kb), ldda);
            
              magma_zgetmatrix_async( kb2, kb2,
                                      dA(k+kb, k+kb), ldda,
                                      A(0, 0),        lda, stream[1] );
            
              magma_zhemm(MagmaLeft, MagmaUpper,
                          kb, n-k-kb,
                          c_neg_half, dA(k,k), ldda,
                          dB(k,k+kb), lddb,
                          c_one, dA(k, k+kb), ldda);
              
              magma_ztrsm(MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit,
                          kb, n-k-kb,
                          c_one ,dB(k+kb,k+kb), lddb,
                          dA(k,k+kb), ldda);
              
            }
            
          }
          
          magma_queue_sync( stream[0] );
          
        } else {
        
        kb = min(n,nb);
        
        /* Compute inv(L)*A*inv(L') */
        
        magma_zgetmatrix_async( kb, kb,
                                dB(0, 0), lddb,
                                B(0, 0),  nb, stream[2] );
        magma_zgetmatrix_async( kb, kb,
                                dA(0, 0), ldda,
                                A(0, 0),  nb, stream[1] );
        
        for(k = 0; k<n; k+=nb){
          kb= min(n-k,nb);
          kb2= min(n-k-nb,nb);
          
          /* Update the lower triangle of A(k:n,k:n) */
          
          magma_queue_sync( stream[2] );
          magma_queue_sync( stream[1] );
          
          lapackf77_zhegs2( &itype, uplo_, &kb, A(0, 0), &lda, B(0, 0), &ldb, info);
          
          magma_zsetmatrix_async( kb, kb,
                                  A(0, 0),  lda,
                                  dA(k, k), ldda, stream[0] );
          
          if(k+kb<n){
            
            // Start copying the new B block
            magma_zgetmatrix_async( kb2, kb2,
                                    dB(k+kb, k+kb), lddb,
                                    B(0, 0),        nb, stream[2] );
            
            magma_ztrsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                        n-k-kb, kb,
                        c_one, dB(k,k), lddb, 
                        dA(k+kb,k), ldda);
            
            magma_queue_sync( stream[0] );
            
            magma_zhemm(MagmaRight, MagmaLower,
                        n-k-kb, kb,
                        c_neg_half, dA(k,k), ldda,
                        dB(k+kb,k), lddb,
                        c_one, dA(k+kb, k), ldda);
            
            magma_zher2k(MagmaLower, MagmaNoTrans,
                         n-k-kb, kb,
                         c_neg_one, dA(k+kb,k), ldda,
                         dB(k+kb,k), lddb,
                         d_one, dA(k+kb,k+kb), ldda);
            
            magma_zgetmatrix_async( kb2, kb2,
                                    dA(k+kb, k+kb), ldda,
                                    A(0, 0),        lda, stream[1] );
            
            magma_zhemm(MagmaRight, MagmaLower,
                        n-k-kb, kb,
                        c_neg_half, dA(k,k), ldda,
                        dB(k+kb,k), lddb,
                        c_one, dA(k+kb, k), ldda);
            
            magma_ztrsm(MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNonUnit,
                        n-k-kb, kb,
                        c_one, dB(k+kb,k+kb), lddb, 
                        dA(k+kb,k), ldda);            
          }
          
        }
        
      }
      
      magma_queue_sync( stream[0] );
      
    } else {
      
      if (upper) {
        
        /* Compute U*A*U' */
        
        for(k = 0; k<n; k+=nb){
          kb= min(n-k,nb);
          
          magma_zgetmatrix_async( kb, kb,
                                  dB(k, k), lddb,
                                  B(0, 0),  nb, stream[2] );
          
          /* Update the upper triangle of A(1:k+kb-1,1:k+kb-1) */
          if(k>0){
            
            magma_ztrmm(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit,
                        k, kb,
                        c_one ,dB(0,0), lddb,
                        dA(0,k), ldda);
            
            magma_zhemm(MagmaRight, MagmaUpper,
                        k, kb,
                        c_half, dA(k,k), ldda,
                        dB(0,k), lddb,
                        c_one, dA(0, k), ldda);
            
            magma_queue_sync( stream[1] );
            
          }
          
          magma_zgetmatrix_async( kb, kb,
                                  dA(k, k), ldda,
                                  A(0, 0),  lda, stream[0] );
          
          if(k>0){
            
            magma_zher2k(MagmaUpper, MagmaNoTrans,
                         k, kb,
                         c_one, dA(0,k), ldda,
                         dB(0,k), lddb,
                         d_one, dA(0,0), ldda);
            
            magma_zhemm(MagmaRight, MagmaUpper,
                        k, kb,
                        c_half, dA(k,k), ldda,
                        dB(0,k), lddb,
                        c_one, dA(0, k), ldda);
            
            magma_ztrmm(MagmaRight, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                        k, kb,
                        c_one, dB(k,k), lddb, 
                        dA(0,k), ldda);
            
          }

          magma_queue_sync( stream[2] );
          magma_queue_sync( stream[0] );
          
          lapackf77_zhegs2( &itype, uplo_, &kb, A(0, 0), &lda, B(0, 0), &ldb, info);
          
          magma_zsetmatrix_async( kb, kb,
                                  A(0, 0),  lda,
                                  dA(k, k), ldda, stream[1] );
          
        }
        
        magma_queue_sync( stream[1] );
        
      } else {
        
        /* Compute L'*A*L */
        
        for(k = 0; k<n; k+=nb){
          kb= min(n-k,nb);
          
          magma_zgetmatrix_async( kb, kb,
                                  dB(k, k), lddb,
                                  B(0, 0),  nb, stream[2] );
          
          /* Update the lower triangle of A(1:k+kb-1,1:k+kb-1) */
          if(k>0){ 
            
            magma_ztrmm(MagmaRight, MagmaLower, MagmaNoTrans, MagmaNonUnit,
                        kb, k,
                        c_one ,dB(0,0), lddb,
                        dA(k,0), ldda);
            
            magma_zhemm(MagmaLeft, MagmaLower,
                        kb, k,
                        c_half, dA(k,k), ldda,
                        dB(k,0), lddb,
                        c_one, dA(k, 0), ldda);
            
            magma_queue_sync( stream[1] );
            
          }
          
          magma_zgetmatrix_async( kb, kb,
                                  dA(k, k), ldda,
                                  A(0, 0),  lda, stream[0] );
          
          if(k>0){
            
            magma_zher2k(MagmaLower, MagmaConjTrans,
                         k, kb,
                         c_one, dA(k,0), ldda,
                         dB(k,0), lddb,
                         d_one, dA(0,0), ldda);
            
            magma_zhemm(MagmaLeft, MagmaLower,
                        kb, k,
                        c_half, dA(k,k), ldda,
                        dB(k,0), lddb,
                        c_one, dA(k, 0), ldda);
            
            magma_ztrmm(MagmaLeft, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                        kb, k,
                        c_one, dB(k,k), lddb, 
                        dA(k,0), ldda);
          }
          
          magma_queue_sync( stream[2] );
          magma_queue_sync( stream[0] );
          
          lapackf77_zhegs2( &itype, uplo_, &kb, A(0, 0), &lda, B(0, 0), &ldb, info);
          
          magma_zsetmatrix_async( kb, kb,
                                  A(0, 0),  lda,
                                  dA(k, k), ldda, stream[1] );
        }
        
        magma_queue_sync( stream[1] );
        
      }
  }
  magma_queue_destroy( stream[0] );
  magma_queue_destroy( stream[1] ); 
  magma_queue_destroy( stream[2] );
  
  magma_free_pinned( w );
  
  return *info;
} /* magma_zhegst_gpu */
Esempio n. 23
0
extern "C" magma_int_t
magma_zunmtr(char side, char uplo, char trans,
             magma_int_t m, magma_int_t n, 
             cuDoubleComplex *a,    magma_int_t lda, 
             cuDoubleComplex *tau, 
             cuDoubleComplex *c,    magma_int_t ldc,
             cuDoubleComplex *work, magma_int_t lwork, 
             magma_int_t *info)
{
/*  -- MAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2012

    Purpose   
    =======   
    ZUNMTR overwrites the general complex M-by-N matrix C with   

                    SIDE = 'L'     SIDE = 'R'   
    TRANS = 'N':      Q * C          C * Q   
    TRANS = 'T':      Q**H * C       C * Q**H   

    where Q is a complex orthogonal matrix of order nq, with nq = m if   
    SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of   
    nq-1 elementary reflectors, as returned by SSYTRD:   

    if UPLO = 'U', Q = H(nq-1) . . . H(2) H(1);   

    if UPLO = 'L', Q = H(1) H(2) . . . H(nq-1).   

    Arguments   
    =========   
    SIDE    (input) CHARACTER*1   
            = 'L': apply Q or Q**H from the Left;   
            = 'R': apply Q or Q**H from the Right.   

    UPLO    (input) CHARACTER*1   
            = 'U': Upper triangle of A contains elementary reflectors   
                   from SSYTRD;   
            = 'L': Lower triangle of A contains elementary reflectors   
                   from SSYTRD.   

    TRANS   (input) CHARACTER*1   
            = 'N':  No transpose, apply Q;   
            = 'T':  Transpose, apply Q**H.   

    M       (input) INTEGER   
            The number of rows of the matrix C. M >= 0.   

    N       (input) INTEGER   
            The number of columns of the matrix C. N >= 0.   

    A       (input) COMPLEX_16 array, dimension   
                                 (LDA,M) if SIDE = 'L'   
                                 (LDA,N) if SIDE = 'R'   
            The vectors which define the elementary reflectors, as   
            returned by SSYTRD.   

    LDA     (input) INTEGER   
            The leading dimension of the array A.   
            LDA >= max(1,M) if SIDE = 'L'; LDA >= max(1,N) if SIDE = 'R'.   

    TAU     (input) COMPLEX_16 array, dimension   
                                 (M-1) if SIDE = 'L'   
                                 (N-1) if SIDE = 'R'   
            TAU(i) must contain the scalar factor of the elementary   
            reflector H(i), as returned by SSYTRD.   

    C       (input/output) COMPLEX_16 array, dimension (LDC,N)   
            On entry, the M-by-N matrix C.   
            On exit, C is overwritten by Q*C or Q**H * C or C * Q**H or C*Q.   

    LDC     (input) INTEGER   
            The leading dimension of the array C. LDC >= max(1,M).   

    WORK    (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK))   
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.   

    LWORK   (input) INTEGER   
            The dimension of the array WORK.   
            If SIDE = 'L', LWORK >= max(1,N);   
            if SIDE = 'R', LWORK >= max(1,M).   
            For optimum performance LWORK >= N*NB if SIDE = 'L', and   
            LWORK >= M*NB if SIDE = 'R', where NB is the optimal   
            blocksize.   

            If LWORK = -1, then a workspace query is assumed; the routine   
            only calculates the optimal size of the WORK array, returns   
            this value as the first entry of the WORK array, and no error   
            message related to LWORK is issued.   

    INFO    (output) INTEGER   
            = 0:  successful exit   
            < 0:  if INFO = -i, the i-th argument had an illegal value   
    =====================================================================    */
   
    cuDoubleComplex c_one = MAGMA_Z_ONE;

    char side_[2]  = {side, 0};
    char uplo_[2]  = {uplo, 0};
    char trans_[2] = {trans, 0};
    magma_int_t  i__2;
    magma_int_t i1, i2, nb, mi, ni, nq, nw;
    int left, upper, lquery;
    magma_int_t iinfo;
    magma_int_t lwkopt;

    *info = 0;
    left   = lapackf77_lsame(side_, "L");
    upper  = lapackf77_lsame(uplo_, "U");
    lquery = lwork == -1;

    /* NQ is the order of Q and NW is the minimum dimension of WORK */
    if (left) {
        nq = m;
        nw = n;
    } else {
        nq = n;
        nw = m;
    }
    if (! left && ! lapackf77_lsame(side_, "R")) {
        *info = -1;
    } else if (! upper && ! lapackf77_lsame(uplo_, "L")) {
        *info = -2;
    } else if (! lapackf77_lsame(trans_, "N") && 
               ! lapackf77_lsame(trans_, "C")) {
        *info = -3;
    } else if (m < 0) {
        *info = -4;
    } else if (n < 0) {
        *info = -5;
    } else if (lda < max(1,nq)) {
        *info = -7;
    } else if (ldc < max(1,m)) {
        *info = -10;
    } else if (lwork < max(1,nw) && ! lquery) {
        *info = -12;
    }

    if (*info == 0) 
      {
        nb = 32;
        lwkopt = max(1,nw) * nb;
        MAGMA_Z_SET2REAL( work[0], lwkopt );
      }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0 || nq == 1) {
        work[0] = c_one;
        return *info;
    }

    if (left) {
        mi = m - 1;
        ni = n;
    } else {
        mi = m;
        ni = n - 1;
    }

    if (upper) 
      {
        /* Q was determined by a call to SSYTRD with UPLO = 'U' */
        i__2 = nq - 1;
        //lapackf77_zunmql(side_, trans_, &mi, &ni, &i__2, &a[lda], &lda, 
        //                 tau, c, &ldc, work, &lwork, &iinfo);
        magma_zunmql(side, trans, mi, ni, i__2, &a[lda], lda, tau,
                     c, ldc, work, lwork, &iinfo);
      }
    else 
      {
        /* Q was determined by a call to SSYTRD with UPLO = 'L' */
        if (left) {
            i1 = 1;
            i2 = 0;
        } else {
            i1 = 0;
            i2 = 1;
        }
        i__2 = nq - 1;
        magma_zunmqr(side, trans, mi, ni, i__2, &a[1], lda, tau,
                     &c[i1 + i2 * ldc], ldc, work, lwork, &iinfo);
      }

    MAGMA_Z_SET2REAL( work[0], lwkopt );

    return *info;
} /* magma_zunmtr */
Esempio n. 24
0
extern "C" magma_int_t
magma_dpotri_gpu(magma_uplo_t uplo, magma_int_t n,
        magmaDouble_ptr a, size_t offset_a, magma_int_t lda, magma_int_t *info, magma_queue_t queue)
{
/*  -- MAGMA (version 1.1.0) --
    Univ. of Tennessee, Knoxville
    Univ. of California, Berkeley
    Univ. of Colorado, Denver
    @date January 2014

    Purpose
    =======

    DPOTRI computes the inverse of a real symmetric positive definite
    matrix A using the Cholesky factorization A = U**T*U or A = L*L**T
    computed by DPOTRF.

    Arguments
    =========

    UPLO    (input) CHARACTER*1
    = 'U':  Upper triangle of A is stored;
    = 'L':  Lower triangle of A is stored.

    N       (input) INTEGER
    The order of the matrix A.  N >= 0.

    A       (input/output) DOUBLE_PRECISION array, dimension (LDA,N)
    On entry, the triangular factor U or L from the Cholesky
    factorization A = U**T*U or A = L*L**T, as computed by
    DPOTRF.
    On exit, the upper or lower triangle of the (symmetric)
    inverse of A, overwriting the input factor U or L.

    LDA     (input) INTEGER
    The leading dimension of the array A.  LDA >= max(1,N).
    INFO    (output) INTEGER
    = 0:  successful exit
    < 0:  if INFO = -i, the i-th argument had an illegal value
    > 0:  if INFO = i, the (i,i) element of the factor U or L is
    zero, and the inverse could not be computed.

    ===================================================================== */

    /* Local variables */
    magma_uplo_t uplo_ = uplo;

    *info = 0;
    if ((! lapackf77_lsame(lapack_const(uplo_), lapack_const(MagmaUpper))) && (! lapackf77_lsame(lapack_const(uplo_), lapack_const(MagmaLower))))
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (lda < max(1,n))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if ( n == 0 )
        return *info;

    /* Invert the triangular Cholesky factor U or L */
    magma_dtrtri_gpu( uplo, MagmaNonUnit, n, a, offset_a, lda, info );
    
    if ( *info == 0 ) {
        /* Form inv(U) * inv(U)**T or inv(L)**T * inv(L) */
        magma_dlauum_gpu( uplo, n, a, offset_a, lda, info, queue );
    }

    return *info;
} /* magma_dpotri */
Esempio n. 25
0
extern "C" magma_int_t
magma_dpotrf3_mgpu(int num_gpus, char uplo, magma_int_t m, magma_int_t n, 
                   magma_int_t off_i, magma_int_t off_j, magma_int_t nb,
                   double **d_lA,  magma_int_t ldda, 
                   double **d_lP,  magma_int_t lddp, 
                   double *a,      magma_int_t lda,   magma_int_t h,
                   cudaStream_t stream[][3], cudaEvent_t event[][5],
                   magma_int_t *info ) 
{
/*  -- MAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2012

    Purpose   
    =======   
    DPOTRF computes the Cholesky factorization of a real symmetric   
    positive definite matrix dA.   
    Auxiliary subroutine for dpotrf2_ooc. It is multiple gpu interface to compute 
    Cholesky of a "rectangular" matrix.

    The factorization has the form   
       dA = U**T * U,  if UPLO = 'U', or   
       dA = L  * L**T,  if UPLO = 'L',   
    where U is an upper triangular matrix and L is lower triangular.   

    This is the block version of the algorithm, calling Level 3 BLAS.   

    Arguments   
    =========   
    UPLO    (input) CHARACTER*1   
            = 'U':  Upper triangle of dA is stored;   
            = 'L':  Lower triangle of dA is stored.   

    N       (input) INTEGER   
            The order of the matrix dA.  N >= 0.   

    dA      (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N)   
            On entry, the symmetric matrix dA.  If UPLO = 'U', the leading   
            N-by-N upper triangular part of dA contains the upper   
            triangular part of the matrix dA, and the strictly lower   
            triangular part of dA is not referenced.  If UPLO = 'L', the   
            leading N-by-N lower triangular part of dA contains the lower   
            triangular part of the matrix dA, and the strictly upper   
            triangular part of dA is not referenced.   

            On exit, if INFO = 0, the factor U or L from the Cholesky   
            factorization dA = U**T * U or dA = L * L**T.   

    LDDA     (input) INTEGER   
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalescent memory accesses LDDA must be
            dividable by 16.

    INFO    (output) INTEGER   
            = 0:  successful exit   
            < 0:  if INFO = -i, the i-th argument had an illegal value   
            > 0:  if INFO = i, the leading minor of order i is not   
                  positive definite, and the factorization could not be   
                  completed.   
    =====================================================================   */


    magma_int_t     j, jb, nb0, nb2, d, dd, id, j_local, j_local2, buf;
    char            uplo_[2] = {uplo, 0};
    double c_one     = MAGMA_D_ONE;
    double c_neg_one = MAGMA_D_NEG_ONE;
    double          d_one     =  1.0;
    double          d_neg_one = -1.0;
    int upper = lapackf77_lsame(uplo_, "U");
    double *dlpanel;
    magma_int_t n_local[MagmaMaxGPUs], ldpanel;
    //cudaEvent_t event0[MagmaMaxGPUs],  /* send row to CPU    */
    //            event1[MagmaMaxGPUs],  /* send diag to GPU   */
    //            event2[MagmaMaxGPUs],  /* offdiagonal update */
    //            event3[MagmaMaxGPUs],  /* send row to GPU    */
    //            event4[MagmaMaxGPUs];  /* lookahead          */
    const magma_int_t stream1 = 0, stream2 = 1, stream3 = 2;
    double *d_dinvA[MagmaMaxGPUs][2], *d_x[MagmaMaxGPUs][2]; /* used by dtrsm_work */

    *info = 0;
    if ( (! upper) && (! lapackf77_lsame(uplo_, "L")) ) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (!upper && num_gpus*ldda < max(1,n)) {
        *info = -4;
    } else if (upper && ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* initialization */
    for( d=0; d<num_gpus; d++ ) {
      /* local-n and local-ld */
      if (upper) {
        n_local[d] = ((n/nb)/num_gpus)*nb;
        if (d < (n/nb)%num_gpus)
          n_local[d] += nb;
        else if (d == (n/nb)%num_gpus)
          n_local[d] += n%nb;
      } else {
        n_local[d] = ((m/nb)/num_gpus)*nb;
        if (d < (m/nb)%num_gpus)
          n_local[d] += nb;
        else if (d == (m/nb)%num_gpus)
          n_local[d] += m%nb;
      }
      //magma_setdevice(d);
      //magma_event_create( &event0[d] );
      //magma_event_create( &event1[d] );
      //magma_event_create( &event2[d] );
      //magma_event_create( &event3[d] );
      //magma_event_create( &event4[d] );
    }

    /* == initialize the trace */
    trace_init( 1, num_gpus, 3, (CUstream_st**)stream );

    if (upper) 
    {     
      /* ---------------------------------------------- */
      /* Upper-triangular case                          */
      /* > Compute the Cholesky factorization A = U'*U. */
      /* ---------------------------------------------- */
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
      /* invert the diagonals 
       * Allocate device memory for the inversed diagonal blocks, size=m*NB
       */
      for( d=0; d<num_gpus; d++ ) {
          magma_setdevice(d);
          for( j=0; j<2; j++ ) {
              cudaMalloc((void**)&d_dinvA[d][j], nb*nb*sizeof(double));
              cudaMalloc((void**)&d_x[d][j],      n*nb*sizeof(double));
              cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(double));
              cudaMemset(d_x[d][j],     0,  n*nb*sizeof(double));
          }
      }
      magma_setdevice(0);
#endif

      for (j=0; j<m; j+=nb) {

        /* Set the GPU number that holds the current panel */
        id  = (j/nb)%num_gpus;
        buf = (j/nb)%num_gpus;

        /* Set the local index where the current panel is */
        j_local = j/(nb*num_gpus);
        jb = min(nb, (m-j));

        /* Update the current diagonal block on stream1 */
        magma_setdevice(id);
        if( j > 0 ) {
            magmablasSetKernelStream(stream[id][stream1]);
            trace_gpu_start( id, stream1, "syrk", "syrk" );
            magma_dsyrk(MagmaUpper, MagmaTrans, jb, j, 
                        d_neg_one, dlA(id, 0, nb*j_local), ldda, 
                        d_one,     dlA(id, j, nb*j_local), ldda);
            trace_gpu_end( id, stream1 );
        }

        /* send the diagonal to cpu on stream1 */
        trace_gpu_start( id, stream1, "comm", "D to CPU" );
        magma_dgetmatrix_async( jb, jb,
                                dlA(id, j, nb*j_local), ldda,
                                Aup(j,j),               lda, 
                                stream[id][stream1] );
        trace_gpu_end( id, stream1 );

        /* update off-diagonal blocks in the panel */
        if( j > 0 ) {
            d = (j/nb+1)%num_gpus;
            for( dd=0; dd<num_gpus; dd++ ) {
                j_local2 = j_local+1;
                if( d > id ) j_local2 --;
                nb0 = nb*j_local2; 

                if( n_local[d] > nb0 ) {
                    magma_setdevice(d);
                    magmablasSetKernelStream(stream[d][stream2]);
                    if( d == id ) {
                        dlpanel = dlA(d, 0, nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlP(d, jb, 0, buf);
                        ldpanel = lddp;
                        magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu
                    }
                    trace_gpu_start( d, stream2, "gemm", "gemm" );
                    magma_dgemm(MagmaTrans, MagmaNoTrans, 
                                jb, n_local[d]-nb0, j, 
                                c_neg_one, dlpanel,        ldpanel, 
                                           dlA(d, 0, nb0), ldda,
                                c_one,     dlA(d, j, nb0), ldda);
                    trace_gpu_end( d, stream2 );
                    magma_event_record( event[d][2], stream[d][stream2] );
                }
                d = (d+1)%num_gpus;
            }
        }

        /* wait for panel and factorize it on cpu */
        magma_setdevice(id);
        magma_queue_sync( stream[id][stream1] );
        trace_cpu_start( 0, "getrf", "getrf" );
        lapackf77_dpotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info);
        trace_cpu_end( 0 );
        if (*info != 0) {
            *info = *info + j;
            break;
        }

        /* send the diagonal to gpus on stream1 */
        if ( (j+jb) < n) {
          d = (j/nb+1)%num_gpus;
          for( dd=0; dd<num_gpus; dd++ ) {
            if( d == id ) {
                dlpanel = dlA(d, j, nb*j_local);
                ldpanel = ldda;
            } else {
                dlpanel = dlP(d, 0, 0, buf);
                ldpanel = lddp;
            }
            magma_setdevice(d);
            trace_gpu_start( d, stream1, "comm", "comm" );
            magma_dsetmatrix_async( jb, jb,
                                    Aup(j,j), lda,
                                    dlpanel,  ldpanel, 
                                    stream[d][stream1] );
            trace_gpu_end( d, stream1 );
            magma_event_record( event[d][1], stream[d][stream1] );
            d = (d+1)%num_gpus;
          }
        } else {
          magma_setdevice(id);
          trace_gpu_start( id, stream1, "comm", "comm" );
          magma_dsetmatrix_async( jb, jb,
                                  Aup(j,j),               lda,
                                  dlA(id, j, nb*j_local), ldda, 
                                  stream[id][stream1] );
          trace_gpu_end( id, stream1 );
        }

        /* panel-factorize the off-diagonal */
        if ( (j+jb) < n) {
            d = (j/nb+1)%num_gpus;
            for( dd=0; dd<num_gpus; dd++ ) {
                /* next column */
                j_local2 = j_local+1;
                if( d > id ) j_local2--;
                if( d == id ) {
                    dlpanel = dlA(d,j,nb*j_local);
                    ldpanel = ldda;
                } else {
                    dlpanel = dlP(d, 0, 0, buf);
                    ldpanel = lddp;
                }
                nb2 = n_local[d] - j_local2*nb;
                nb0 = min(nb, nb2);
        
                magma_setdevice(d);
                //magma_queue_sync( stream[d][stream1] );  // synch on chol for remaining update
                //magma_queue_sync( stream[d][stream2] );
                if( j+jb < m && d == (j/nb+1)%num_gpus ) { 
                    /* owns the next column, look-ahead next block on stream1 */
                    magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for gemm update 
                    magmablasSetKernelStream(stream[d][stream1]);
                    trace_gpu_start( d, stream1, "trsm", "trsm" );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
                    magmablas_dtrsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, 
                                          jb, nb0, c_one,
                                          dlpanel,                ldpanel, 
                                          dlA(d, j, nb*j_local2), ldda,
                                          d_dinvA[d][0], d_x[d][0] );
#else
                    magma_dtrsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, 
                                 jb, nb0, c_one,
                                 dlpanel,                ldpanel, 
                                 dlA(d, j, nb*j_local2), ldda);
#endif
                    magma_event_record( event[d][4], stream[d][stream1] );
                    trace_gpu_end( d, stream1 );
                } else if( nb2 > 0 ) {
                    /* update all the blocks on stream2 */
                    magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for cholesky factor 
                    trace_gpu_start( d, stream2, "trsm", "trsm" );
                    magmablasSetKernelStream(stream[d][stream2]);
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
                    magmablas_dtrsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, 
                                          jb, nb2, c_one,
                                          dlpanel,                ldpanel, 
                                          dlA(d, j, nb*j_local2), ldda,
                                          d_dinvA[d][1], d_x[d][1] );
#else
                    magma_dtrsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, 
                                 jb, nb2, c_one,
                                 dlpanel,                ldpanel, 
                                 dlA(d, j, nb*j_local2), ldda);
#endif
                    trace_gpu_end( d, stream2 );
                }
                d = (d+1)%num_gpus;
            } /* end of for */

/* ======================================================================================== */

            d = (j/nb+1)%num_gpus;
            /* next column */
            j_local2 = j_local+1;
            if( d > id ) j_local2--;
            nb0 = min(nb, n_local[d]-nb*j_local2 );
        
            /* even on 1 gpu, off-diagonals are copied to cpu (synchronize at the end).      *
             * so we have the Cholesky factor, but only diagonal submatrix of the big panel, *
             * on cpu at the end.                                                            */
            if( j+jb < m ) { 
                int d2, id2, j2, buf2;
                magma_setdevice(d);
                /* make sure all the previous sets are done */
                if( h < num_gpus ) {
                    /* > offdiagonal */
                    for( d2=0; d2<num_gpus; d2++ ) {
                        j2 = j - (1+d2)*nb;
                        if( j2 < 0 ) break;
                        id2  = (j2/nb)%num_gpus;
                        magma_queue_wait_event( stream[d][stream3], event[id2][0] );
                    }

                    /* > diagonal */
                    for( d2=0; d2<num_gpus; d2++ ) {
                        j2 = j - d2*nb;
                        if( j2 < 0 ) break;
                        id2  = (j2/nb)%num_gpus;
                        magma_queue_wait_event( stream[d][stream3], event[id2][1] );
                    }
                }
                /* lookahead */
                magma_queue_wait_event( stream[d][stream3], event[d][4] );

                trace_gpu_start( d, stream3, "comm", "row to CPU" );
                magma_dgetmatrix_async( (j+jb), nb0,
                                        dlA(d, 0, nb*j_local2), ldda,
                                        Aup(0,j+jb),            lda, 
                                        stream[d][stream3] );
                trace_gpu_end( d, stream3 );
                magma_event_record( event[d][3], stream[d][stream3] );
/* needed on pluto */
magma_queue_sync( stream[d][stream3] );

                /* wait for the off-diagonal on cpu */
                //magma_setdevice(id);
                //magma_queue_sync( stream[id][stream3] );

                /* broadcast rows to gpus on stream2 */
                buf2 = ((j+jb)/nb)%num_gpus;
                for( d2=0; d2<num_gpus; d2++ ) {
                    if( d2 != d ) 
                    {
                        magma_setdevice(d2);
                        trace_gpu_start( d2, stream3, "comm", "row to GPUs" );
                        magma_queue_wait_event( stream[d2][stream3], event[d][3] ); // rows arrived at cpu on stream3
                        magma_dsetmatrix_async( j+jb, nb0,
                                                Aup(0,j+jb),        lda,
                                                dlP(d2,nb0,0,buf2), lddp, 
                                                stream[d2][stream3] );
                        trace_gpu_end( d2, stream3 );
                        magma_event_record( event[d2][0], stream[d2][stream3] );
                    }
                } 
            }

/* ======================================================================================== */
            /* gpu owning the next column                    */
            /* after look ahead, update the remaining blocks */
            if( j+jb < m ) /* no update on the last block column */ 
            {
                d = (j/nb+1)%num_gpus;
                /* next column */
                j_local2 = j_local+1;
                if( d > id ) j_local2--;
                if( d == id ) {
                    dlpanel = dlA(d, j, nb*j_local);
                    ldpanel = ldda;
                } else {
                    dlpanel = dlP(d, 0, 0, buf);
                    ldpanel = lddp;
                }
                nb0 = min(nb, n_local[d]-nb*j_local2 );
                nb2 =         n_local[d]-nb*j_local2 - nb0;
        
                /* update the remaining blocks */
                if( nb2 > 0 ) {
                    magma_setdevice(d);
                    magmablasSetKernelStream(stream[d][stream2]);  
                    magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for cholesky factor
                    trace_gpu_start( d, stream2, "trsm", "trsm" );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
                    magmablas_dtrsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, 
                                          jb, nb2, c_one,
                                          dlpanel,                    ldpanel, 
                                          dlA(d, j, nb*j_local2+nb0), ldda,
                                          d_dinvA[d][1], d_x[d][1] );
#else
                    magma_dtrsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, 
                                 jb, nb2, c_one,
                                 dlpanel,                    ldpanel, 
                                 dlA(d, j, nb*j_local2+nb0), ldda);
#endif
                    trace_gpu_end( d, stream2 );
                }
            }
        } /* end of dtrsm */
      } /* end of for j=1, .., n */
    } else { 

      /* ---------------------------------------------- */
      /* Lower-triangular case                          */
      /* > Compute the Cholesky factorization A = L*L'. */
      /* ---------------------------------------------- */
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
      /* 
       * Allocate device memory for the inversed diagonal blocks, size=N*BLOCK_SIZE
       */
      for( d=0; d<num_gpus; d++ ) {
         magma_setdevice(d);
         for( j=0; j<2; j++ ) {
             cudaMalloc((void**)&d_dinvA[d][j], nb*nb*sizeof(double));
             cudaMalloc((void**)&d_x[d][j],     nb*m *sizeof(double));
             cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(double));
             cudaMemset(d_x[d][j],     0, nb* m*sizeof(double));
         }
      }
      magma_setdevice(0);
#endif

      for (j=0; j<n; j+=nb) {

        /* Set the GPU number that holds the current panel */
        id  = (j/nb)%num_gpus;
        buf = (j/nb)%num_gpus;

        /* Set the local index where the current panel is */
        j_local = j/(nb*num_gpus);
        jb = min(nb, (n-j));

        /* Update the current diagonal block on stream1 */
        magma_setdevice(id);
        if( j > 0 ) {
            magmablasSetKernelStream(stream[id][stream1]);
            magma_dsyrk(MagmaLower, MagmaNoTrans, jb, j,
                        d_neg_one, dlA(id, nb*j_local, 0), ldda,
                        d_one,     dlA(id, nb*j_local, j), ldda);
        }

        /* send the diagonal to cpu on stream1 */
        magma_dgetmatrix_async( jb, jb,
                                dlA(id, nb*j_local, j), ldda,
                                Alo(j,j),               lda, 
                                stream[id][stream1] );

        /* update off-diagonal blocks of the panel */
        if( j > 0 ) {
            d = (j/nb+1)%num_gpus;
            for( dd=0; dd<num_gpus; dd++ ) {
                j_local2 = j_local+1;
                if( d > id ) j_local2 --;
                nb0 = nb*j_local2; 

                if( nb0 < n_local[d] ) {
                    magma_setdevice(d);
                    magmablasSetKernelStream(stream[d][stream2]);
                    if( d == id ) {
                        dlpanel = dlA(d, nb*j_local, 0);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlPT(d,0,jb,buf);
                        ldpanel = nb;
                        magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu
                    }
                    magma_dgemm( MagmaNoTrans, MagmaTrans,
                                 n_local[d]-nb0, jb, j,
                                 c_neg_one, dlA(d, nb0, 0), ldda,
                                            dlpanel,        ldpanel,
                                 c_one,     dlA(d, nb0, j), ldda);
                    magma_event_record( event[d][2], stream[d][stream2] );
                }
                d = (d+1)%num_gpus;
            }
        }

        /* wait for the panel and factorized it on cpu */
        magma_setdevice(id);
        magma_queue_sync( stream[id][stream1] );
        lapackf77_dpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info);
        if (*info != 0) {
           *info = *info + j;
           break;
        }

        /* send the diagonal to gpus on stream1 */
        if ( (j+jb) < m) {
          d = (j/nb+1)%num_gpus;
          for( dd=0; dd<num_gpus; dd++ ) {
            if( d == id ) {
                dlpanel = dlA(d, nb*j_local, j);
                ldpanel = ldda;
            } else {
                dlpanel = dlPT(d, 0, 0, buf);
                ldpanel = nb;
            }
            magma_setdevice(d);
            magma_dsetmatrix_async( jb, jb,
                                    Alo(j,j), lda,
                                    dlpanel,  ldpanel, 
                                    stream[d][stream1] );
            magma_event_record( event[d][1], stream[d][stream1] );
            d = (d+1)%num_gpus;
          }
        } else {
          magma_setdevice(id);
          magma_dsetmatrix_async( jb, jb,
                                  Alo(j,j),               lda,
                                  dlA(id, nb*j_local, j), ldda, 
                                  stream[id][stream1] );
        }

        /* panel factorize the off-diagonal */
        if ( (j+jb) < m) {
          d = (j/nb+1)%num_gpus;
          for( dd=0; dd<num_gpus; dd++ ) {
            /* next column */
            j_local2 = j_local+1;
            if( d > id ) j_local2--;
            if( d == id ) {
                dlpanel = dlA(d, nb*j_local, j);
                ldpanel = ldda;
            } else {         
                dlpanel = dlPT(d, 0, 0, buf);
                ldpanel = nb;
            }
            nb2 = n_local[d] - j_local2*nb;
            nb0 = min(nb, nb2 );

            magma_setdevice(d);
            if( j+nb < n && d == (j/nb+1)%num_gpus ) { /* owns next column, look-ahead next block on stream1 */
              magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for gemm update 
              magmablasSetKernelStream(stream[d][stream1]);
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
              magmablas_dtrsm_work( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, 
                                    nb0, jb, c_one,
                                    dlpanel,                ldpanel, 
                                    dlA(d, nb*j_local2, j), ldda,
                                    d_dinvA[d][0], d_x[d][0] );
#else
              magma_dtrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, 
                           nb0, jb, c_one,
                           dlpanel,                ldpanel, 
                           dlA(d, nb*j_local2, j), ldda);
#endif
              magma_event_record( event[d][4], stream[d][stream1] );
            } else if( nb2 > 0 ) { /* other gpus updating all the blocks on stream2 */
              /* update the entire column */
              magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for the cholesky factor
              magmablasSetKernelStream(stream[d][stream2]);
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
              magmablas_dtrsm_work( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, 
                                    nb2, jb, c_one,
                                    dlpanel,                ldpanel, 
                                    dlA(d, nb*j_local2, j), ldda,
                                    d_dinvA[d][1], d_x[d][1] );
#else
              magma_dtrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, 
                           nb2, jb, c_one,
                           dlpanel,                ldpanel, 
                           dlA(d, nb*j_local2, j), ldda);
#endif
            }
            d = (d+1)%num_gpus;
          } /* end for d */

/* ======================================================================================== */

          d = (j/nb+1)%num_gpus;
          /* next column */
          j_local2 = j_local+1;
          if( d > id ) j_local2--;
          nb0 = min(nb, n_local[d]-nb*j_local2 );

          /* even on 1 gpu, we copy off-diagonal to cpu (but don't synchronize).  */
          /* so we have the Cholesky factor on cpu at the end.                    */
          if( j+jb < n ) {
              int d2, id2, j2, buf2;
              magma_setdevice(d);
              /* make sure all the previous sets are done */
              if( h < num_gpus ) {
                  /* > offdiagonal */
                  for( d2=0; d2<num_gpus; d2++ ) {
                      j2 = j - (1+d2)*nb;
                      if( j2 < 0 ) break;
                      id2  = (j2/nb)%num_gpus;
                      magma_queue_wait_event( stream[d][stream3], event[id2][0] );
                  }

                  /* > diagonal */
                  for( d2=0; d2<num_gpus; d2++ ) {
                      j2 = j - d2*nb;
                      if( j2 < 0 ) break;
                      id2  = (j2/nb)%num_gpus;
                      magma_queue_wait_event( stream[d][stream3], event[id2][1] );
                  }
              }
              // lookahead done
              magma_queue_wait_event( stream[d][stream3], event[d][4] );

              magma_dgetmatrix_async( nb0, j+jb,
                                      dlA(d, nb*j_local2, 0), ldda,
                                      Alo(j+jb,0),            lda, 
                                      stream[d][stream3] );
              magma_event_record( event[d][3], stream[d][stream3] );
/* syn on rows on CPU, seem to be needed on Pluto */
magma_queue_sync( stream[d][stream3] );

              /* broadcast the rows to gpus */
              buf2 = ((j+jb)/nb)%num_gpus;
              for( d2=0; d2<num_gpus; d2++ ) {
                  if( d2 != d ) 
                  {
                      magma_setdevice(d2);
                      magma_queue_wait_event( stream[d2][stream3], event[d][3] ); // getmatrix done
                      magma_dsetmatrix_async( nb0, j+jb,
                                              Alo(j+jb,0),         lda,
                                              dlPT(d2,0,nb0,buf2), nb, 
                                              stream[d2][stream3] );
                      magma_event_record( event[d2][0], stream[d2][stream3] );
                  }
              }
          }

/* ======================================================================================== */
          /* gpu owing the next column updates remaining blocks on stream2 */
          if( j+nb < n ) { // no lookahead on the last block column
            d = (j/nb+1)%num_gpus;

            /* next column */
            j_local2 = j_local+1;
            if( d > id ) j_local2--;
            if( d == id ) {
              dlpanel = dlA(d, nb*j_local, j);
              ldpanel = ldda;
            } else {         
              dlpanel = dlPT(d,0,0,buf);
              ldpanel = nb;
            }
            nb0 = min(nb, n_local[d]-nb*j_local2 );
            nb2 = n_local[d] - j_local2*nb - nb0;

            if( nb2 > 0 ) {
                magma_setdevice(d);
                magmablasSetKernelStream(stream[d][stream2]);
                /* update the remaining blocks in the column */
                magma_queue_wait_event( stream[d][stream2], event[d][1] ); // panel received
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
                magmablas_dtrsm_work( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, 
                                      nb2, jb, c_one,
                                      dlpanel,                    ldpanel, 
                                      dlA(d, nb*j_local2+nb0, j), ldda,
                                      d_dinvA[d][1], d_x[d][1] );
#else
                magma_dtrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, 
                             nb2, jb, c_one,
                             dlpanel,                    ldpanel, 
                             dlA(d, nb*j_local2+nb0, j), ldda);
#endif
            }
          }
        }
      }
    } /* end of else not upper */

    /* == finalize the trace == */
    trace_finalize( "dpotrf.svg","trace.css" );
    for( d=0; d<num_gpus; d++ ) {
      magma_setdevice(d);
      magma_queue_sync( stream[d][0] ); 
      magma_queue_sync( stream[d][1] );
      magma_queue_sync( stream[d][2] );
      magmablasSetKernelStream(NULL);

      //magma_event_destroy( event0[d] ); 
      //magma_event_destroy( event1[d] );
      //magma_event_destroy( event2[d] );
      //magma_event_destroy( event3[d] );
      //magma_event_destroy( event4[d] );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
      for( j=0; j<2; j++ ) {
          magma_free( d_dinvA[d][j] );
          magma_free( d_x[d][j] );
      }
#endif
    }
    magma_setdevice(0);

    return *info;
} /* magma_dpotrf_mgpu */
Esempio n. 26
0
extern "C" magma_int_t
magma_zunmqr_m(magma_int_t nrgpu, char side, char trans,
               magma_int_t m, magma_int_t n, magma_int_t k,
               cuDoubleComplex *a,    magma_int_t lda,
               cuDoubleComplex *tau,
               cuDoubleComplex *c,    magma_int_t ldc,
               cuDoubleComplex *work, magma_int_t lwork,
               magma_int_t *info)
{
/*  -- MAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2012

    Purpose
    =======
    ZUNMQR overwrites the general complex M-by-N matrix C with

                    SIDE = 'L'     SIDE = 'R'
    TRANS = 'N':      Q * C          C * Q
    TRANS = 'T':      Q**H * C       C * Q**H

    where Q is a complex orthogonal matrix defined as the product of k
    elementary reflectors

          Q = H(1) H(2) . . . H(k)

    as returned by ZGEQRF. Q is of order M if SIDE = 'L' and of order N
    if SIDE = 'R'.

    Arguments
    =========
    SIDE    (input) CHARACTER*1
            = 'L': apply Q or Q**H from the Left;
            = 'R': apply Q or Q**H from the Right.

    TRANS   (input) CHARACTER*1
            = 'N':  No transpose, apply Q;
            = 'T':  Transpose, apply Q**H.

    M       (input) INTEGER
            The number of rows of the matrix C. M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix C. N >= 0.

    K       (input) INTEGER
            The number of elementary reflectors whose product defines
            the matrix Q.
            If SIDE = 'L', M >= K >= 0;
            if SIDE = 'R', N >= K >= 0.

    A       (input) COMPLEX_16 array, dimension (LDA,K)
            The i-th column must contain the vector which defines the
            elementary reflector H(i), for i = 1,2,...,k, as returned by
            ZGEQRF in the first k columns of its array argument A.

    LDA     (input) INTEGER
            The leading dimension of the array A.
            If SIDE = 'L', LDA >= max(1,M);
            if SIDE = 'R', LDA >= max(1,N).

    TAU     (input) COMPLEX_16 array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by ZGEQRF.

    C       (input/output) COMPLEX_16 array, dimension (LDC,N)
            On entry, the M-by-N matrix C.
            On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q.

    LDC     (input) INTEGER
            The leading dimension of the array C. LDC >= max(1,M).

    WORK    (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK(0) returns the optimal LWORK.

    LWORK   (input) INTEGER
            The dimension of the array WORK.
            If SIDE = 'L', LWORK >= max(1,N);
            if SIDE = 'R', LWORK >= max(1,M).
            For optimum performance LWORK >= N*NB if SIDE = 'L', and
            LWORK >= M*NB if SIDE = 'R', where NB is the optimal
            blocksize.

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
    =====================================================================   */
    cuDoubleComplex c_one = MAGMA_Z_ONE;

    char side_[2] = {side, 0};
    char trans_[2] = {trans, 0};

    cuDoubleComplex* dw[MagmaMaxGPUs];
    cudaStream_t stream [MagmaMaxGPUs][2];

    magma_int_t ind_c, kb;

    magma_int_t i__4;
    magma_int_t i;
    cuDoubleComplex t[4160];        /* was [65][64] */
    magma_int_t i1, i2, i3, ib, nb, nq, nw;
    magma_int_t left, notran, lquery;
    magma_int_t iinfo, lwkopt;

    magma_int_t igpu = 0;

    int gpu_b;
    magma_getdevice(&gpu_b);

    *info = 0;
    left = lapackf77_lsame(side_, "L");
    notran = lapackf77_lsame(trans_, "N");
    lquery = (lwork == -1);

    /* NQ is the order of Q and NW is the minimum dimension of WORK */
    if (left) {
        nq = m;
        nw = n;
    } else {
        nq = n;
        nw = m;
    }
    if (! left && ! lapackf77_lsame(side_, "R")) {
        *info = -1;
    } else if (! notran && ! lapackf77_lsame(trans_, "T")) {
        *info = -2;
    } else if (m < 0) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (k < 0 || k > nq) {
        *info = -5;
    } else if (lda < max(1,nq)) {
        *info = -7;
    } else if (ldc < max(1,m)) {
        *info = -10;
    } else if (lwork < max(1,nw) && ! lquery) {
        *info = -12;
    }

    if (*info == 0)
    {
        /* Determine the block size.  NB may be at most NBMAX, where NBMAX
         is used to define the local array T.    */
        nb = 64;
        lwkopt = max(1,nw) * nb;
        MAGMA_Z_SET2REAL( work[0], lwkopt );
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0 || k == 0) {
        work[0] = c_one;
        return *info;
    }

    magma_int_t lddc = (m+63)/64*64;
    magma_int_t lddac = nq;
    magma_int_t lddar =nb;
    magma_int_t lddwork = nw;

    magma_int_t n_l = (n+nrgpu-1)/nrgpu; // local n
    n_l = ((n_l+63)/64)*64;

    if (n_l<256)
       n_l=256;

    nrgpu = min(nrgpu, (n+n_l-1)/n_l); // Don't use GPU that will not have data.

    for (igpu = 0; igpu < nrgpu; ++igpu){
        magma_setdevice(igpu);
        magmablasSetKernelStream(NULL);
        if (MAGMA_SUCCESS != magma_zmalloc( &dw[igpu], (n_l*lddc + 2*lddac*lddar + 2*(nb + 1 + lddwork)*nb))) {
            magma_xerbla( __func__, -(*info) );
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
        magma_queue_create( &stream[igpu][0] );
        magma_queue_create( &stream[igpu][1] );
    }

    if (nb >= k)
    {
        /* Use CPU code */
        lapackf77_zunmqr(side_, trans_, &m, &n, &k, a, &lda, tau,
                         c, &ldc, work, &lwork, &iinfo);
    }
    else
    {
        /* Use hybrid CPU-MGPU code */
        if (left) {

            //copy C to mgpus
            for (igpu = 0; igpu < nrgpu; ++igpu){
                magma_setdevice(igpu);
                kb = min(n_l, n-igpu*n_l);
                magma_zsetmatrix_async( m, kb,
                                        C(0, igpu*n_l), ldc,
                                        dC(igpu, 0, 0), lddc, stream[igpu][0] );
            }

            if ( !notran ) {
                i1 = 0;
                i2 = k;
                i3 = nb;
            } else {
                i1 = (k - 1) / nb * nb;
                i2 = 0;
                i3 = -nb;
            }

            kb = min(nb, k-i1);
            for (igpu = 0; igpu < nrgpu; ++igpu){
                magma_setdevice(igpu);
                magma_zsetmatrix_async( nq-i1, kb,
                                        A(i1, i1),            lda,
                                        dA_c(igpu, 0, i1, 0), lddac, stream[igpu][0] );
            }
            ind_c = 0;

            for (i = i1; i3 < 0 ? i >= i2 : i < i2; i += i3)
            {
                ib = min(nb, k - i);
                /* Form the triangular factor of the block reflector
                   H = H(i) H(i+1) . . . H(i+ib-1) */
                i__4 = nq - i;
                lapackf77_zlarft("F", "C", &i__4, &ib, A(i, i), &lda,
                                 &tau[i], t, &ib);

                /* H or H' is applied to C(1:m,i:n) */

                /* Apply H or H'; First copy T to the GPU */
                for (igpu = 0; igpu < nrgpu; ++igpu){
                    magma_setdevice(igpu);
                    magma_zsetmatrix_async( ib, ib,
                                            t,               ib,
                                            dt(igpu, ind_c), ib, stream[igpu][ind_c] );

                    magma_queue_sync( stream[igpu][ind_c] ); // Makes sure that we can change t next iteration.
                }

                // start the copy of next A panel
                kb = min(nb, k - i - i3);
                if (kb > 0 && i+i3 >= 0){
                    for (igpu = 0; igpu < nrgpu; ++igpu){
                        magma_setdevice(igpu);
                        magma_zsetmatrix_async( nq-(i+i3), kb,
                                                A(i+i3, i+i3),                    lda,
                                                dA_c(igpu, (ind_c+1)%2, i+i3, 0), lddac, stream[igpu][(ind_c+1)%2] );
                    }
                }

                for (igpu = 0; igpu < nrgpu; ++igpu){
                    magma_setdevice(igpu);
                    // Put 0s in the upper triangular part of dA;
                    magmablas_zsetdiag1subdiag0_stream('L', ib, ib, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][ind_c]);

                    kb = min(n_l, n-igpu*n_l);
                    magmablasSetKernelStream(stream[igpu][ind_c]);
                    magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
                                     m-i, kb, ib,
                                     dA_c(igpu, ind_c, i, 0), lddac, dt(igpu, ind_c), ib,
                                     dC(igpu, i, 0), lddc,
                                     dwork(igpu, ind_c), lddwork);
                }

                ind_c = (ind_c+1)%2;
            }

            //copy C from mgpus
            for (igpu = 0; igpu < nrgpu; ++igpu){
                magma_setdevice(igpu);
                magma_queue_sync( stream[igpu][0] );
                magma_queue_sync( stream[igpu][1] );
                kb = min(n_l, n-igpu*n_l);
                //asynchronous copy gives problems sometimes...
//                magma_zgetmatrix_async( m, kb,
//                                        dC(igpu, 0, 0), lddc,
//                                        C(0, igpu*n_l), ldc, stream[igpu][0] );
                magma_zgetmatrix( m, kb,
                                  dC(igpu, 0, 0), lddc,
                                  C(0, igpu*n_l), ldc );
            }

        } else {

            fprintf(stderr, "The case (side == right) is not implemented\n");
            magma_xerbla( __func__, 1 );
            return *info;

            /*if ( notran ) {
                i1 = 0;
                i2 = k;
                i3 = nb;
            } else {
                i1 = (k - 1) / nb * nb;
                i2 = 0;
                i3 = -nb;
            }

            mi = m;
            ic = 0;

            for (i = i1; i3 < 0 ? i >= i2 : i < i2; i += i3)
            {
                ib = min(nb, k - i);

                // Form the triangular factor of the block reflector
                // H = H(i) H(i+1) . . . H(i+ib-1)
                i__4 = nq - i;
                lapackf77_zlarft("F", "C", &i__4, &ib, A(i, i), &lda,
                                 &tau[i], t, &ib);

                // 1) copy the panel from A to the GPU, and
                // 2) Put 0s in the upper triangular part of dA;
                magma_zsetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda );
                magmablas_zsetdiag1subdiag0('L', ib, ib, dA(i, 0), ldda);


                // H or H' is applied to C(1:m,i:n)
                ni = n - i;
                jc = i;

                // Apply H or H'; First copy T to the GPU
                magma_zsetmatrix( ib, ib, t, ib, dt, ib );
                magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
                                 mi, ni, ib,
                                 dA(i, 0), ldda, dt, ib,
                                 dC(ic, jc), lddc,
                                 dwork, lddwork);
            }
            */
        }
    }
    MAGMA_Z_SET2REAL( work[0], lwkopt );

    for (igpu = 0; igpu < nrgpu; ++igpu){
        magma_setdevice(igpu);
        magma_queue_sync( stream[igpu][0] );
        magmablasSetKernelStream(NULL);
        magma_queue_destroy( stream[igpu][0] );
        magma_queue_destroy( stream[igpu][1] );
        magma_free( dw[igpu] );
    }

    magma_setdevice(gpu_b);

    return *info;
} /* magma_zunmqr */
Esempio n. 27
0
extern "C" magma_int_t
magma_zunmtr_gpu(char side, char uplo, char trans,
                 magma_int_t m, magma_int_t n,
                 magmaDoubleComplex *da,    magma_int_t ldda,
                 magmaDoubleComplex *tau,
                 magmaDoubleComplex *dc,    magma_int_t lddc,
                 magmaDoubleComplex *wa,    magma_int_t ldwa,
                 magma_int_t *info)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    ZUNMTR overwrites the general complex M-by-N matrix C with

                    SIDE = 'L'     SIDE = 'R'
    TRANS = 'N':      Q * C          C * Q
    TRANS = 'T':      Q**H * C       C * Q**H

    where Q is a complex orthogonal matrix of order nq, with nq = m if
    SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
    nq-1 elementary reflectors, as returned by SSYTRD:

    if UPLO = 'U', Q = H(nq-1) . . . H(2) H(1);

    if UPLO = 'L', Q = H(1) H(2) . . . H(nq-1).

    Arguments
    =========
    SIDE    (input) CHARACTER*1
            = 'L': apply Q or Q**H from the Left;
            = 'R': apply Q or Q**H from the Right.

    UPLO    (input) CHARACTER*1
            = 'U': Upper triangle of A contains elementary reflectors
                   from SSYTRD;
            = 'L': Lower triangle of A contains elementary reflectors
                   from SSYTRD.

    TRANS   (input) CHARACTER*1
            = 'N':  No transpose, apply Q;
            = 'T':  Transpose, apply Q**H.

    M       (input) INTEGER
            The number of rows of the matrix C. M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix C. N >= 0.

    DA      (device input) COMPLEX_16 array, dimension
                                 (LDDA,M) if SIDE = 'L'
                                 (LDDA,N) if SIDE = 'R'
            The vectors which define the elementary reflectors, as
            returned by ZHETRD_GPU. On output the diagonal, the subdiagonal and the
            upper part (UPLO='L') or lower part (UPLO='U') are destroyed.

    LDDA    (input) INTEGER
            The leading dimension of the array DA.
            LDDA >= max(1,M) if SIDE = 'L'; LDDA >= max(1,N) if SIDE = 'R'.

    TAU     (input) COMPLEX_16 array, dimension
                                 (M-1) if SIDE = 'L'
                                 (N-1) if SIDE = 'R'
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by SSYTRD.

    DC      (device input/output) COMPLEX_16 array, dimension (LDDC,N)
            On entry, the M-by-N matrix C.
            On exit, C is overwritten by (Q*C) or (Q**H * C) or (C * Q**H) or (C*Q).

    LDDC    (input) INTEGER
            The leading dimension of the array C. LDDC >= max(1,M).

    WA      (input/workspace) COMPLEX_16 array, dimension
                                 (LDWA,M) if SIDE = 'L'
                                 (LDWA,N) if SIDE = 'R'
            The vectors which define the elementary reflectors, as
            returned by ZHETRD_GPU.

    LDWA    (input) INTEGER
            The leading dimension of the array A.
            LDWA >= max(1,M) if SIDE = 'L'; LDWA >= max(1,N) if SIDE = 'R'.

    WORK    (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.

    LWORK   (input) INTEGER
            The dimension of the array WORK.
            If SIDE = 'L', LWORK >= max(1,N);
            if SIDE = 'R', LWORK >= max(1,M).
            For optimum performance LWORK >= N*NB if SIDE = 'L', and
            LWORK >= M*NB if SIDE = 'R', where NB is the optimal
            blocksize.

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
    =====================================================================    */

    char side_[2]  = {side, 0};
    char uplo_[2]  = {uplo, 0};
    char trans_[2] = {trans, 0};
    magma_int_t i1, i2, mi, ni, nq, nw;
    int left, upper;
    magma_int_t iinfo;

    *info = 0;
    left   = lapackf77_lsame(side_, "L");
    upper  = lapackf77_lsame(uplo_, "U");

    /* NQ is the order of Q and NW is the minimum dimension of WORK */
    if (left) {
        nq = m;
        nw = n;
    } else {
        nq = n;
        nw = m;
    }
    if (! left && ! lapackf77_lsame(side_, "R")) {
        *info = -1;
    } else if (! upper && ! lapackf77_lsame(uplo_, "L")) {
        *info = -2;
    } else if (! lapackf77_lsame(trans_, "N") &&
               ! lapackf77_lsame(trans_, "C")) {
        *info = -3;
    } else if (m < 0) {
        *info = -4;
    } else if (n < 0) {
        *info = -5;
    } else if (ldda < max(1,nq)) {
        *info = -7;
    } else if (lddc < max(1,m)) {
        *info = -10;
    } else if (ldwa < max(1,nq)) {
        *info = -12;
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0 || nq == 1) {
        return *info;
    }

    if (left) {
        mi = m - 1;
        ni = n;
    } else {
        mi = m;
        ni = n - 1;
    }

    if (upper) {
        magma_zunmql2_gpu(side, trans, mi, ni, nq-1, &da[ldda], ldda, tau,
                          dc, lddc, &wa[ldwa], ldwa, &iinfo);
    }
    else {
        /* Q was determined by a call to SSYTRD with UPLO = 'L' */
        if (left) {
            i1 = 1;
            i2 = 0;
        } else {
            i1 = 0;
            i2 = 1;
        }
        magma_zunmqr2_gpu(side, trans, mi, ni, nq-1, &da[1], ldda, tau,
                          &dc[i1 + i2*lddc], lddc, &wa[1], ldwa, &iinfo);
    }

    return *info;
} /* zunmtr */
Esempio n. 28
0
extern "C" magma_int_t
magma_dlaex3_m(magma_int_t nrgpu,
               magma_int_t k, magma_int_t n, magma_int_t n1, double* d,
               double* q, magma_int_t ldq, double rho,
               double* dlamda, double* q2, magma_int_t* indx,
               magma_int_t* ctot, double* w, double* s, magma_int_t* indxq,
               double** dwork, magma_queue_t stream[MagmaMaxGPUs][2],
               char range, double vl, double vu, magma_int_t il, magma_int_t iu,
               magma_int_t* info )
{
/*
    Purpose
    =======
    DLAEX3 finds the roots of the secular equation, as defined by the
    values in D, W, and RHO, between 1 and K.  It makes the
    appropriate calls to DLAED4 and then updates the eigenvectors by
    multiplying the matrix of eigenvectors of the pair of eigensystems
    being combined by the matrix of eigenvectors of the K-by-K system
    which is solved here.

    It is used in the last step when only a part of the eigenvectors
    is required.
    It compute only the required part of the eigenvectors and the rest
    is not used.

    This code makes very mild assumptions about floating point
    arithmetic. It will work on machines with a guard digit in
    add/subtract, or on those binary machines without guard digits
    which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
    It could conceivably fail on hexadecimal or decimal machines
    without guard digits, but we know of none.

    Arguments
    =========
    K       (input) INTEGER
            The number of terms in the rational function to be solved by
            DLAED4.  K >= 0.

    N       (input) INTEGER
            The number of rows and columns in the Q matrix.
            N >= K (deflation may result in N>K).

    N1      (input) INTEGER
            The location of the last eigenvalue in the leading submatrix.
            min(1,N) <= N1 <= N/2.

    D       (output) DOUBLE PRECISION array, dimension (N)
            D(I) contains the updated eigenvalues for
            1 <= I <= K.

    Q       (output) DOUBLE PRECISION array, dimension (LDQ,N)
            Initially the first K columns are used as workspace.
            On output the columns ??? to ??? contain
            the updated eigenvectors.

    LDQ     (input) INTEGER
            The leading dimension of the array Q.  LDQ >= max(1,N).

    RHO     (input) DOUBLE PRECISION
            The value of the parameter in the rank one update equation.
            RHO >= 0 required.

    DLAMDA  (input/output) DOUBLE PRECISION array, dimension (K)
            The first K elements of this array contain the old roots
            of the deflated updating problem.  These are the poles
            of the secular equation. May be changed on output by
            having lowest order bit set to zero on Cray X-MP, Cray Y-MP,
            Cray-2, or Cray C-90, as described above.

    Q2      (input) DOUBLE PRECISION array, dimension (LDQ2, N)
            The first K columns of this matrix contain the non-deflated
            eigenvectors for the split problem.

    INDX    (input) INTEGER array, dimension (N)
            The permutation used to arrange the columns of the deflated
            Q matrix into three groups (see DLAED2).
            The rows of the eigenvectors found by DLAED4 must be likewise
            permuted before the matrix multiply can take place.

    CTOT    (input) INTEGER array, dimension (4)
            A count of the total number of the various types of columns
            in Q, as described in INDX.  The fourth column type is any
            column which has been deflated.

    W       (input/output) DOUBLE PRECISION array, dimension (K)
            The first K elements of this array contain the components
            of the deflation-adjusted updating vector. Destroyed on
            output.

    S       (workspace) DOUBLE PRECISION array, dimension (N1 + 1)*K
            Will contain the eigenvectors of the repaired matrix which
            will be multiplied by the previously accumulated eigenvectors
            to update the system.

    INDXQ   (output) INTEGER array, dimension (N)
            On exit, the permutation which will reintegrate the
            subproblems back into sorted order,
            i.e. D( INDXQ( I = 1, N ) ) will be in ascending order.

    DWORK   (devices workspaces) DOUBLE PRECISION array of arrays,
            dimension NRGPU.
            if NRGPU = 1 the dimension of the first workspace
            should be (3*N*N/2+3*N)
            otherwise the NRGPU workspaces should have the size
            ceil((N-N1) * (N-N1) / floor(nrgpu/2)) +
            NB * ((N-N1) + (N-N1) / floor(nrgpu/2))

    STREAM  (device stream) magma_queue_t array,
            dimension (MagmaMaxGPUs,2)

    INFO    (output) INTEGER
            = 0:  successful exit.
            < 0:  if INFO = -i, the i-th argument had an illegal value.
            > 0:  if INFO = 1, an eigenvalue did not converge

    Further Details
    ===============
    Based on contributions by
    Jeff Rutter, Computer Science Division, University of California
    at Berkeley, USA
    Modified by Francoise Tisseur, University of Tennessee.

    ===================================================================== */
    
    if (nrgpu==1){
        magma_setdevice(0);
        magma_dlaex3(k, n, n1, d, q, ldq, rho,
                     dlamda, q2, indx, ctot, w, s, indxq,
                     *dwork, range, vl, vu, il, iu, info );
        return MAGMA_SUCCESS;
    }
    double d_one  = 1.;
    double d_zero = 0.;
    magma_int_t ione = 1;
    magma_int_t ineg_one = -1;
    char range_[] = {range, 0};

    magma_int_t iil, iiu, rk;
    magma_int_t n1_loc, n2_loc, ib, nb, ib2, igpu;
    magma_int_t ni_loc[MagmaMaxGPUs];

    magma_int_t i,ind,iq2,j,n12,n2,n23,tmp,lq2;
    double temp;
    magma_int_t alleig, valeig, indeig;

    alleig = lapackf77_lsame(range_, "A");
    valeig = lapackf77_lsame(range_, "V");
    indeig = lapackf77_lsame(range_, "I");

    *info = 0;

    if(k < 0)
        *info=-1;
    else if(n < k)
        *info=-2;
    else if(ldq < max(1,n))
        *info=-6;
    else if (! (alleig || valeig || indeig))
        *info = -15;
    else {
        if (valeig) {
            if (n > 0 && vu <= vl)
                *info = -17;
        }
        else if (indeig) {
            if (il < 1 || il > max(1,n))
                *info = -18;
            else if (iu < min(n,il) || iu > n)
                *info = -19;
        }
    }


    if(*info != 0){
        magma_xerbla(__func__, -(*info));
        return MAGMA_ERR_ILLEGAL_VALUE;
    }

    // Quick return if possible
    if(k == 0)
        return MAGMA_SUCCESS;
    /*
     Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can
     be computed with high relative accuracy (barring over/underflow).
     This is a problem on machines without a guard digit in
     add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
     The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I),
     which on any of these machines zeros out the bottommost
     bit of DLAMDA(I) if it is 1; this makes the subsequent
     subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation
     occurs. On binary machines with a guard digit (almost all
     machines) it does not change DLAMDA(I) at all. On hexadecimal
     and decimal machines with a guard digit, it slightly
     changes the bottommost bits of DLAMDA(I). It does not account
     for hexadecimal or decimal machines without guard digits
     (we know of none). We use a subroutine call to compute
     2*DLAMBDA(I) to prevent optimizing compilers from eliminating
     this code.*/

//#define CHECK_CPU
#ifdef CHECK_CPU
    double *hwS[2][MagmaMaxGPUs], *hwQ[2][MagmaMaxGPUs], *hwQ2[MagmaMaxGPUs];
    #define hQ2(id) (hwQ2[id])
    #define hS(id, ii) (hwS[ii][id])
    #define hQ(id, ii) (hwQ[ii][id])
#endif
    n2 = n - n1;

    n12 = ctot[0] + ctot[1];
    n23 = ctot[1] + ctot[2];

    iq2 = n1 * n12;
    lq2 = iq2 + n2 * n23;

    n1_loc = (n1-1) / (nrgpu/2) + 1;
    n2_loc = (n2-1) / (nrgpu/2) + 1;

    nb = magma_get_dlaex3_m_nb();

    if (n1 >= magma_get_dlaex3_m_k()){
#ifdef CHECK_CPU
        for (igpu = 0; igpu < nrgpu; ++igpu){
            magma_dmalloc_pinned( &(hwS[0][igpu]), n2*nb );
            magma_dmalloc_pinned( &(hwS[1][igpu]), n2*nb );
            magma_dmalloc_pinned( &(hwQ2[igpu]), n2*n2_loc );
            magma_dmalloc_pinned( &(hwQ[0][igpu]), n2_loc*nb );
            magma_dmalloc_pinned( &(hwQ[1][igpu]), n2_loc*nb );
        }
#endif
        for (igpu = 0; igpu < nrgpu-1; igpu += 2){
            ni_loc[igpu] = min(n1_loc, n1 - igpu/2 * n1_loc);
#ifdef CHECK_CPU
            lapackf77_dlacpy("A", &ni_loc[igpu], &n12, q2+n1_loc*(igpu/2), &n1, hQ2(igpu), &n1_loc);
#endif
            magma_setdevice(igpu);
            magma_dsetmatrix_async( ni_loc[igpu], n12,
                                    q2+n1_loc*(igpu/2), n1,
                                    dQ2(igpu),          n1_loc, stream[igpu][0] );
            ni_loc[igpu+1] = min(n2_loc, n2 - igpu/2 * n2_loc);
#ifdef CHECK_CPU
            lapackf77_dlacpy("A", &ni_loc[igpu+1], &n23, q2+iq2+n2_loc*(igpu/2), &n2, hQ2(igpu+1), &n2_loc);
#endif
            magma_setdevice(igpu+1);
            magma_dsetmatrix_async( ni_loc[igpu+1], n23,
                                    q2+iq2+n2_loc*(igpu/2), n2,
                                    dQ2(igpu+1),            n2_loc, stream[igpu+1][0] );
        }
    }

    //

#ifdef _OPENMP
    /////////////////////////////////////////////////////////////////////////////////
    //openmp implementation
    /////////////////////////////////////////////////////////////////////////////////
#ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER
    magma_timestr_t start, end;
    start = get_current_time();
#endif

#pragma omp parallel private(i, j, tmp, temp)
    {
        magma_int_t id = omp_get_thread_num();
        magma_int_t tot = omp_get_num_threads();

        magma_int_t ib = (  id   * k) / tot; //start index of local loop
        magma_int_t ie = ((id+1) * k) / tot; //end index of local loop
        magma_int_t ik = ie - ib;           //number of local indices

        for(i = ib; i < ie; ++i)
            dlamda[i]=lapackf77_dlamc3(&dlamda[i], &dlamda[i]) - dlamda[i];

        for(j = ib; j < ie; ++j){
            magma_int_t tmpp=j+1;
            magma_int_t iinfo = 0;
            lapackf77_dlaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo);
            // If the zero finder fails, the computation is terminated.
            if(iinfo != 0){
#pragma omp critical (info)
                *info=iinfo;
                break;
            }
        }

#pragma omp barrier

        if(*info == 0){

#pragma omp single
            {
                //Prepare the INDXQ sorting permutation.
                magma_int_t nk = n - k;
                lapackf77_dlamrg( &k, &nk, d, &ione , &ineg_one, indxq);

                //compute the lower and upper bound of the non-deflated eigenvectors
                if (valeig)
                    magma_dvrange(k, d, &iil, &iiu, vl, vu);
                else if (indeig)
                    magma_dirange(k, indxq, &iil, &iiu, il, iu);
                else {
                    iil = 1;
                    iiu = k;
                }
                rk = iiu - iil + 1;
            }

            if (k == 2){
#pragma omp single
                {
                    for(j = 0; j < k; ++j){
                        w[0] = *Q(0,j);
                        w[1] = *Q(1,j);

                        i = indx[0] - 1;
                        *Q(0,j) = w[i];
                        i = indx[1] - 1;
                        *Q(1,j) = w[i];
                    }
                }

            }
            else if(k != 1){

                // Compute updated W.
                blasf77_dcopy( &ik, &w[ib], &ione, &s[ib], &ione);

                // Initialize W(I) = Q(I,I)
                tmp = ldq + 1;
                blasf77_dcopy( &ik, Q(ib,ib), &tmp, &w[ib], &ione);

                for(j = 0; j < k; ++j){
                    magma_int_t i_tmp = min(j, ie);
                    for(i = ib; i < i_tmp; ++i)
                        w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
                    i_tmp = max(j+1, ib);
                    for(i = i_tmp; i < ie; ++i)
                        w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
                }

                for(i = ib; i < ie; ++i)
                    w[i] = copysign( sqrt( -w[i] ), s[i]);

#pragma omp barrier

                //reduce the number of used threads to have enough S workspace
                tot = min(n1, omp_get_num_threads());

                if(id < tot){
                    ib = (  id   * rk) / tot + iil - 1;
                    ie = ((id+1) * rk) / tot + iil - 1;
                    ik = ie - ib;
                }
                else{
                    ib = -1;
                    ie = -1;
                    ik = -1;
                }

                // Compute eigenvectors of the modified rank-1 modification.
                for(j = ib; j < ie; ++j){
                    for(i = 0; i < k; ++i)
                        s[id*k + i] = w[i] / *Q(i,j);
                    temp = cblas_dnrm2( k, s+id*k, 1);
                    for(i = 0; i < k; ++i){
                        magma_int_t iii = indx[i] - 1;
                        *Q(i,j) = s[id*k + iii] / temp;
                    }
                }
            }
        }
    }
    if (*info != 0)
        return MAGMA_SUCCESS; //??????

#ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER
    end = get_current_time();
    printf("eigenvalues/vector D+zzT = %6.2f\n", GetTimerValue(start,end)/1000.);
#endif

#else
    /////////////////////////////////////////////////////////////////////////////////
    // Non openmp implementation
    /////////////////////////////////////////////////////////////////////////////////
#ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER
    magma_timestr_t start, end;
    start = get_current_time();
#endif

    for(i = 0; i < k; ++i)
        dlamda[i]=lapackf77_dlamc3(&dlamda[i], &dlamda[i]) - dlamda[i];

    for(j = 0; j < k; ++j){
        magma_int_t tmpp=j+1;
        magma_int_t iinfo = 0;
        lapackf77_dlaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo);
        // If the zero finder fails, the computation is terminated.
        if(iinfo != 0)
            *info=iinfo;
    }
    if(*info != 0)
        return MAGMA_SUCCESS;

    //Prepare the INDXQ sorting permutation.
    magma_int_t nk = n - k;
    lapackf77_dlamrg( &k, &nk, d, &ione , &ineg_one, indxq);

    //compute the lower and upper bound of the non-deflated eigenvectors
    if (valeig)
        magma_dvrange(k, d, &iil, &iiu, vl, vu);
    else if (indeig)
        magma_dirange(k, indxq, &iil, &iiu, il, iu);
    else {
        iil = 1;
        iiu = k;
    }
    rk = iiu - iil + 1;

    if (k == 2){

        for(j = 0; j < k; ++j){
            w[0] = *Q(0,j);
            w[1] = *Q(1,j);

            i = indx[0] - 1;
            *Q(0,j) = w[i];
            i = indx[1] - 1;
            *Q(1,j) = w[i];
        }

    }
    else if(k != 1){

        // Compute updated W.
        blasf77_dcopy( &k, w, &ione, s, &ione);

        // Initialize W(I) = Q(I,I)
        tmp = ldq + 1;
        blasf77_dcopy( &k, q, &tmp, w, &ione);

        for(j = 0; j < k; ++j){
            for(i = 0; i < j; ++i)
                w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
            for(i = j+1; i < k; ++i)
                w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
        }

        for(i = 0; i < k; ++i)
            w[i] = copysign( sqrt( -w[i] ), s[i]);

        // Compute eigenvectors of the modified rank-1 modification.
        for(j = iil-1; j < iiu; ++j){
            for(i = 0; i < k; ++i)
                s[i] = w[i] / *Q(i,j);
            temp = cblas_dnrm2( k, s, 1);
            for(i = 0; i < k; ++i){
                magma_int_t iii = indx[i] - 1;
                *Q(i,j) = s[iii] / temp;
            }
        }
    }

#ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER
    end = get_current_time();
    printf("eigenvalues/vector D+zzT = %6.2f\n", GetTimerValue(start,end)/1000.);
#endif

#endif //_OPENMP

    // Compute the updated eigenvectors.

#ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER
    start = get_current_time();
#endif

    if(rk > 0){
        if (n1 < magma_get_dlaex3_m_k()){
            // stay on the CPU
            if( n23 != 0 ){
                lapackf77_dlacpy("A", &n23, &rk, Q(ctot[0],iil-1), &ldq, s, &n23);
                blasf77_dgemm("N", "N", &n2, &rk, &n23, &d_one, &q2[iq2], &n2,
                              s, &n23, &d_zero, Q(n1,iil-1), &ldq );
            }
            else
                lapackf77_dlaset("A", &n2, &rk, &d_zero, &d_zero, Q(n1,iil-1), &ldq);

            if( n12 != 0 ) {
                lapackf77_dlacpy("A", &n12, &rk, Q(0,iil-1), &ldq, s, &n12);
                blasf77_dgemm("N", "N", &n1, &rk, &n12, &d_one, q2, &n1,
                              s, &n12, &d_zero, Q(0,iil-1), &ldq);
            }
            else
                lapackf77_dlaset("A", &n1, &rk, &d_zero, &d_zero, Q(0,iil-1), &ldq);
        }
        else {
            //use the gpus
            ib = min(nb, rk);
            for (igpu = 0; igpu < nrgpu-1; igpu += 2){
                if (n23 != 0) {
                    magma_setdevice(igpu+1);
                    magma_dsetmatrix_async( n23, ib,
                                            Q(ctot[0],iil-1), ldq,
                                            dS(igpu+1,0),     n23, stream[igpu+1][0] );
                }
                if (n12 != 0) {
                    magma_setdevice(igpu);
                    magma_dsetmatrix_async( n12, ib,
                                            Q(0,iil-1), ldq,
                                            dS(igpu,0), n12, stream[igpu][0] );
                }
            }

            for (i = 0; i<rk; i+=nb){
                ib = min(nb, rk - i);
                ind = (i/nb)%2;
                if (i+nb<rk){
                    ib2 = min(nb, rk - i - nb);
                    for (igpu = 0; igpu < nrgpu-1; igpu += 2){
                        if (n23 != 0) {
                            magma_setdevice(igpu+1);
                            magma_dsetmatrix_async( n23, ib2,
                                                    Q(ctot[0],iil-1+i+nb), ldq,
                                                    dS(igpu+1,(ind+1)%2),  n23, stream[igpu+1][(ind+1)%2] );
                        }
                        if (n12 != 0) {
                            magma_setdevice(igpu);
                            magma_dsetmatrix_async( n12, ib2,
                                                    Q(0,iil-1+i+nb),    ldq,
                                                    dS(igpu,(ind+1)%2), n12, stream[igpu][(ind+1)%2] );
                        }
                    }
                }

                // Ensure that the data is copied on gpu since we will overwrite it.
                for (igpu = 0; igpu < nrgpu-1; igpu += 2){
                    if (n23 != 0) {
#ifdef CHECK_CPU
                        lapackf77_dlacpy("A", &n23, &ib, Q(ctot[0],iil-1+i), &ldq, hS(igpu+1,ind), &n23);
#endif
                        magma_setdevice(igpu+1);
                        magma_queue_sync( stream[igpu+1][ind] );
                    }
                    if (n12 != 0) {
#ifdef CHECK_CPU
                        lapackf77_dlacpy("A", &n12, &ib, Q(0,iil-1+i), &ldq, hS(igpu,ind), &n12);
#endif
                        magma_setdevice(igpu);
                        magma_queue_sync( stream[igpu][ind] );
                    }

                }
                for (igpu = 0; igpu < nrgpu-1; igpu += 2){
                    if (n23 != 0) {
#ifdef CHECK_CPU
                        blasf77_dgemm("N", "N", &ni_loc[igpu+1], &ib, &n23, &d_one, hQ2(igpu+1), &n2_loc,
                                      hS(igpu+1,ind), &n23, &d_zero, hQ(igpu+1, ind), &n2_loc);
#endif
                        magma_setdevice(igpu+1);
                        magmablasSetKernelStream(stream[igpu+1][ind]);
                        magma_dgemm(MagmaNoTrans, MagmaNoTrans, ni_loc[igpu+1], ib, n23, d_one, dQ2(igpu+1), n2_loc,
                                    dS(igpu+1, ind), n23, d_zero, dQ(igpu+1, ind), n2_loc);
#ifdef CHECK_CPU
                        printf("norm Q %d: %f\n", igpu+1, cpu_gpu_ddiff(ni_loc[igpu+1], ib, hQ(igpu+1, ind), n2_loc, dQ(igpu+1, ind), n2_loc));
#endif
                    }
                    if (n12 != 0) {
#ifdef CHECK_CPU
                        blasf77_dgemm("N", "N", &ni_loc[igpu], &ib, &n12, &d_one, hQ2(igpu), &n1_loc,
                                      hS(igpu,ind%2), &n12, &d_zero, hQ(igpu, ind%2), &n1_loc);
#endif
                        magma_setdevice(igpu);
                        magmablasSetKernelStream(stream[igpu][ind]);
                        magma_dgemm(MagmaNoTrans, MagmaNoTrans, ni_loc[igpu], ib, n12, d_one, dQ2(igpu), n1_loc,
                                    dS(igpu, ind), n12, d_zero, dQ(igpu, ind), n1_loc);
#ifdef CHECK_CPU
                        printf("norm Q %d: %f\n", igpu, cpu_gpu_ddiff(ni_loc[igpu], ib, hQ(igpu, ind), n1_loc, dQ(igpu, ind), n1_loc));
#endif
                    }
                }
                for (igpu = 0; igpu < nrgpu-1; igpu += 2){
                    if (n23 != 0) {
                        magma_setdevice(igpu+1);
                        magma_dgetmatrix( ni_loc[igpu+1], ib, dQ(igpu+1, ind), n2_loc,
                                          Q(n1+n2_loc*(igpu/2),iil-1+i), ldq );
//                        magma_dgetmatrix_async( ni_loc[igpu+1], ib, dQ(igpu+1, ind), n2_loc,
//                                                Q(n1+n2_loc*(igpu/2),iil-1+i), ldq, stream[igpu+1][ind] );
                    }
                    if (n12 != 0) {
                        magma_setdevice(igpu);
                        magma_dgetmatrix( ni_loc[igpu], ib, dQ(igpu, ind), n1_loc,
                                          Q(n1_loc*(igpu/2),iil-1+i), ldq );
//                        magma_dgetmatrix_async( ni_loc[igpu], ib, dQ(igpu, ind), n1_loc,
//                                                Q(n1_loc*(igpu/2),iil-1+i), ldq, stream[igpu][ind] );
                    }
                }
            }
            for (igpu = 0; igpu < nrgpu; ++igpu){
#ifdef CHECK_CPU
                magma_free_pinned( hwS[1][igpu] );
                magma_free_pinned( hwS[0][igpu] );
                magma_free_pinned( hwQ2[igpu] );
                magma_free_pinned( hwQ[1][igpu] );
                magma_free_pinned( hwQ[0][igpu] );
#endif
                magma_setdevice(igpu);
                magmablasSetKernelStream(NULL);
                magma_queue_sync( stream[igpu][0] );
                magma_queue_sync( stream[igpu][1] );
            }
            if( n23 == 0 )
                lapackf77_dlaset("A", &n2, &rk, &d_zero, &d_zero, Q(n1,iil-1), &ldq);

            if( n12 == 0 )
                lapackf77_dlaset("A", &n1, &rk, &d_zero, &d_zero, Q(0,iil-1), &ldq);
        }
    }
#ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER
    end = get_current_time();
    printf("gemms = %6.2f\n", GetTimerValue(start,end)/1000.);
#endif

    return MAGMA_SUCCESS;
} /*magma_dlaed3_m*/
Esempio n. 29
0
extern "C" magma_int_t
magma_ssyevd(magma_vec_t jobz, magma_vec_t uplo,
             magma_int_t n,
             float *a, magma_int_t lda,
             float *w,
             float *work, magma_int_t lwork,
             magma_int_t *iwork, magma_int_t liwork,
             magma_int_t *info, magma_queue_t queue)
{
/*  -- MAGMA (version 1.0.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       September 2012

    Purpose
    =======
    SSYEVD computes all eigenvalues and, optionally, eigenvectors of
    a real symmetric matrix A.  If eigenvectors are desired, it uses a
    divide and conquer algorithm.

    The divide and conquer algorithm makes very mild assumptions about
    floating point arithmetic. It will work on machines with a guard
    digit in add/subtract, or on those binary machines without guard
    digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
    Cray-2. It could conceivably fail on hexadecimal or decimal machines
    without guard digits, but we know of none.

    Arguments
    =========
    JOBZ    (input) CHARACTER*1
            = 'N':  Compute eigenvalues only;
            = 'V':  Compute eigenvalues and eigenvectors.

    UPLO    (input) CHARACTER*1
            = 'U':  Upper triangle of A is stored;
            = 'L':  Lower triangle of A is stored.

    N       (input) INTEGER
            The order of the matrix A.  N >= 0.

    A       (input/output) REAL array, dimension (LDA, N)
            On entry, the symmetric matrix A.  If UPLO = 'U', the
            leading N-by-N upper triangular part of A contains the
            upper triangular part of the matrix A.  If UPLO = 'L',
            the leading N-by-N lower triangular part of A contains
            the lower triangular part of the matrix A.
            On exit, if JOBZ = 'V', then if INFO = 0, A contains the
            orthonormal eigenvectors of the matrix A.
            If JOBZ = 'N', then on exit the lower triangle (if UPLO='L')
            or the upper triangle (if UPLO='U') of A, including the
            diagonal, is destroyed.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    W       (output) DOUBLE PRECISION array, dimension (N)
            If INFO = 0, the eigenvalues in ascending order.

    WORK    (workspace/output) REAL array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    LWORK   (input) INTEGER
            The length of the array WORK.
            If N <= 1,                LWORK must be at least 1.
            If JOBZ  = 'N' and N > 1, LWORK must be at least 2*N + N*NB.
            If JOBZ  = 'V' and N > 1, LWORK must be at least 1 + 6*N + 2*N**2.
            NB can be obtained through magma_get_ssytrd_nb(N).

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal sizes of the WORK and IWORK
            arrays, returns these values as the first entries of the WORK
            and IWORK arrays, and no error message related to LWORK or
            LIWORK is issued by XERBLA.

    IWORK   (workspace/output) INTEGER array, dimension (MAX(1,LIWORK))
            On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK.

    LIWORK  (input) INTEGER
            The dimension of the array IWORK.
            If N <= 1,                LIWORK must be at least 1.
            If JOBZ  = 'N' and N > 1, LIWORK must be at least 1.
            If JOBZ  = 'V' and N > 1, LIWORK must be at least 3 + 5*N.

            If LIWORK = -1, then a workspace query is assumed; the
            routine only calculates the optimal sizes of the WORK and
            IWORK arrays, returns these values as the first entries of
            the WORK and IWORK arrays, and no error message related to
            LWORK or LIWORK is issued by XERBLA.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
            > 0:  if INFO = i and JOBZ = 'N', then the algorithm failed
                  to converge; i off-diagonal elements of an intermediate
                  tridiagonal form did not converge to zero;
                  if INFO = i and JOBZ = 'V', then the algorithm failed
                  to compute an eigenvalue while working on the submatrix
                  lying in rows and columns INFO/(N+1) through
                  mod(INFO,N+1).

    Further Details
    ===============
    Based on contributions by
       Jeff Rutter, Computer Science Division, University of California
       at Berkeley, USA

    Modified description of INFO. Sven, 16 Feb 05.
    =====================================================================   */

    magma_uplo_t uplo_ = uplo;
    magma_vec_t jobz_ = jobz;
    magma_int_t ione = 1;
    magma_int_t izero = 0;
    float d_one = 1.;
    
    float d__1;

    float eps;
    magma_int_t inde;
    float anrm;
    float rmin, rmax;
    float sigma;
    magma_int_t iinfo, lwmin;
    magma_int_t lower;
    magma_int_t wantz;
    magma_int_t indwk2, llwrk2;
    magma_int_t iscale;
    float safmin;
    float bignum;
    magma_int_t indtau;
    magma_int_t indwrk, liwmin;
    magma_int_t llwork;
    float smlnum;
    magma_int_t lquery;

    //magmaFloat_ptr dwork;

    wantz = lapackf77_lsame(lapack_const(jobz_), MagmaVectorsStr);
    lower = lapackf77_lsame(lapack_const(uplo_), MagmaLowerStr);
    lquery = lwork == -1 || liwork == -1;

    *info = 0;
    if (! (wantz || lapackf77_lsame(lapack_const(jobz_), MagmaNoVectorsStr))) {
        *info = -1;
    } else if (! (lower || lapackf77_lsame(lapack_const(uplo_), MagmaUpperStr))) {
        *info = -2;
    } else if (n < 0) {
        *info = -3;
    } else if (lda < max(1,n)) {
        *info = -5;
    }

    magma_int_t nb = magma_get_ssytrd_nb( n );
    if ( n <= 1 ) {
        lwmin  = 1;
        liwmin = 1;
    }
    else if ( wantz ) {
        lwmin  = 1 + 6*n + 2*n*n;
        liwmin = 3 + 5*n;
    }
    else {
        lwmin  = 2*n + n*nb;
        liwmin = 1;
    }
    // multiply by 1+eps to ensure length gets rounded up,
    // if it cannot be exactly represented in floating point.
// ACD
//    work[0]  = lwmin * (1. + lapackf77_slamch("Epsilon"));
    work[0]  = (float)( lwmin * (1. + lapackf77_slamch("Epsilon")) );
    iwork[0] = liwmin;

    if ((lwork < lwmin) && !lquery) {
        *info = -8;
    } else if ((liwork < liwmin) && ! lquery) {
        *info = -10;
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (n == 0) {
        return *info;
    }

    if (n == 1) {
        w[0] = a[0];
        if (wantz) {
            a[0] = 1.;
        }
        return *info;
    }

    /* Get machine constants. */
    safmin = lapackf77_slamch("Safe minimum");
    eps    = lapackf77_slamch("Precision");
    smlnum = safmin / eps;
// ACD
//    bignum = 1. / smlnum;
    bignum = 1.F / smlnum;
    rmin = magma_ssqrt(smlnum);
    rmax = magma_ssqrt(bignum);

    /* Scale matrix to allowable range, if necessary. */
    anrm = lapackf77_slansy("M", lapack_const(uplo_), &n, a, &lda, work );
    iscale = 0;
    if (anrm > 0. && anrm < rmin) {
        iscale = 1;
        sigma = rmin / anrm;
    } else if (anrm > rmax) {
        iscale = 1;
        sigma = rmax / anrm;
    }
    if (iscale == 1) {
        lapackf77_slascl(lapack_const(uplo_), &izero, &izero, &d_one, &sigma, &n, &n, a,
                &lda, info);
    }

    /* Call SSYTRD to reduce symmetric matrix to tridiagonal form. */
    // ssytrd work: e (n) + tau (n) + llwork (n*nb)  ==>  2n + n*nb
    // sstedx work: e (n) + tau (n) + z (n*n) + llwrk2 (1 + 4*n + n^2)  ==>  1 + 6n + 2n^2
    inde   = 0;
    indtau = inde   + n;
    indwrk = indtau + n;
    indwk2 = indwrk + n*n;
    llwork = lwork - indwrk;
    llwrk2 = lwork - indwk2;

//#define ENABLE_TIMER
#ifdef ENABLE_TIMER
    magma_timestr_t start, end;
    start = get_current_time();
#endif
	//char _uplo_[2] = { lapack_const(uplo)[0], 0 };
	//lapackf77_ssytrd(_uplo_, &n, a, &lda, w, &work[inde],
	//	&work[indtau], &work[indwrk], &llwork, &iinfo);
    magma_ssytrd(lapack_const(uplo)[0], n, a, lda, w, &work[inde],
                 &work[indtau], &work[indwrk], llwork, &iinfo, queue);
    
#ifdef ENABLE_TIMER
    end = get_current_time();
    printf("time ssytrd = %6.2f\n", GetTimerValue(start,end)/1000.);
#endif

    /* For eigenvalues only, call SSTERF.  For eigenvectors, first call
       SSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the
       tridiagonal matrix, then call SORMTR to multiply it to the Householder
       transformations represented as Householder vectors in A. */
    if (! wantz) {
        lapackf77_ssterf(&n, w, &work[inde], info);
    } else {

#ifdef ENABLE_TIMER
        start = get_current_time();
#endif
        
        //if (MAGMA_SUCCESS != magma_malloc( &dwork, 3*n*(n/2 + 1)*sizeof(float) )) {
        //    *info = MAGMA_ERR_DEVICE_ALLOC;
        //    return *info;
        //}

		//magma_sstedx(MagmaAllVec, n, 0., 0., 0, 0, w, &work[inde],
        //             &work[indwrk], n, &work[indwk2],
        //             llwrk2, iwork, liwork, dwork, info, queue);
		lapackf77_sstevd(V_char, &n, w, &work[inde],
			&work[indwrk], &n, &work[indwk2],
			&llwrk2, iwork, &liwork, info
			);
        
        //magma_free( dwork );
        
#ifdef ENABLE_TIMER
        end = get_current_time();
        printf("time sstedx = %6.2f\n", GetTimerValue(start,end)/1000.);
        
        start = get_current_time();
#endif
		lapackf77_sormtr(L_char, lapack_const(uplo), N_char, &n, &n, a, &lda, &work[indtau],
			&work[indwrk], &n, &work[indwk2], &llwrk2, &iinfo);
        //magma_sormtr(MagmaLeft, uplo, MagmaNoTrans, n, n, a, lda, &work[indtau],
        //             &work[indwrk], n, &work[indwk2], llwrk2, &iinfo, queue);
        
        lapackf77_slacpy("A", &n, &n, &work[indwrk], &n, a, &lda);

#ifdef ENABLE_TIMER
        end = get_current_time();
        printf("time sormtr + copy = %6.2f\n", GetTimerValue(start,end)/1000.);
#endif
    }

    /* If matrix was scaled, then rescale eigenvalues appropriately. */
    if (iscale == 1) {
// ACD
//        d__1 = 1. / sigma;
        d__1 = 1.F / sigma;
        blasf77_sscal(&n, &d__1, w, &ione);
    }

// ACD
//    work[0]  = lwmin * (1. + lapackf77_slamch("Epsilon"));  // round up
    work[0]  = (float)( lwmin * (1. + lapackf77_slamch("Epsilon")) );  // round up
    iwork[0] = liwmin;

    return *info;
} /* magma_ssyevd */
Esempio n. 30
0
magma_int_t
magmablas_zhemv_mgpu( magma_int_t num_gpus, magma_int_t k, char uplo,
                      magma_int_t n, magma_int_t nb,
                      magmaDoubleComplex alpha,
                      magmaDoubleComplex **da, magma_int_t ldda, magma_int_t offset,
                      magmaDoubleComplex **dx, magma_int_t incx,
                      magmaDoubleComplex beta,
                      magmaDoubleComplex **dy, magma_int_t incy,
                      magmaDoubleComplex **dwork, magma_int_t ldwork,
                      magmaDoubleComplex *work, magmaDoubleComplex *w,
                      magma_queue_t stream[][10] )
{

#define dX(id, i)    (dx[(id)]+incx*(i))
#define dY(id, i, j) (dy[(id)]+incy*(i)+n*(j))

    magma_int_t id;

#ifdef MAGMABLAS_ZHEMV_MGPU
    for( id=0; id<num_gpus; id++ ) {
        magma_setdevice(id);
        magmablasSetKernelStream(stream[id][0]);
        trace_gpu_start( id, 0, "memset", "memset" );
        cudaMemset( dwork[id], 0, ldwork*sizeof(magmaDoubleComplex) );
        trace_gpu_end( id, 0 );
        trace_gpu_start( id, 0, "symv", "symv" );
    }

    if( nb == 32 ) {
        magmablas_zhemv_mgpu_32_offset( uplo, offset+n, alpha, da, ldda,
                                        dx, incx,
                                        beta,
                                        dy, incy,
                                        dwork, ldwork,
                                        num_gpus, nb, offset,
                                        stream );
    } else {
        magmablas_zhemv_mgpu_offset( uplo, offset+n, alpha, da, ldda,
                                     dx, incx,
                                     beta,
                                     dy, incy,
                                     dwork, ldwork,
                                     num_gpus, nb, offset,
                                     stream );
    }
    for( id=0; id<num_gpus; id++ ) {
        magma_setdevice(id);
        trace_gpu_end( id, 0 );
        magmablasSetKernelStream(NULL);
    }
    //magma_setdevice(0);
    //magmablasSetKernelStream(stream[0][0]);
    //magma_zhemv('L', n, alpha, &da[0][offset+offset*ldda], ldda, &dx[0][offset], incx, beta, &dy[0][offset], incy );
    //magmablasSetKernelStream(NULL);

    /* send to CPU */
    magma_setdevice(0);
    trace_gpu_start( 0, 0, "comm", "comm" );
    magma_zgetvector_async( n, dY(0, offset, 0), 1, w, 1, stream[0][0] );
    trace_gpu_end( 0, 0 );
    magmablasSetKernelStream(NULL);

    for( id=1; id<num_gpus; id++ ) {
        magma_setdevice(id);
        trace_gpu_start(  id, 0, "comm", "comm" );
        magma_zgetvector_async( n, dY(id, offset, 0), 1, &work[id*n], 1, stream[id][0] );
        trace_gpu_end( id, 0 );
        magmablasSetKernelStream(NULL);
    }
#else
    magmaDoubleComplex c_one = MAGMA_Z_ONE;
    char uplo_[2]  = {uplo, 0};
    magma_int_t i, ii, j, kk, ib, ib0, i_1, i_local, idw;
    magma_int_t i_0=n;
    magma_int_t loffset0 = nb*(offset/(nb*num_gpus));
    magma_int_t loffset1 = offset%nb;
    magma_int_t loffset;    
    
    //magma_zhemv(uplo, n, alpha, da, ldda, dx, incx, beta, dy, incy );

    idw = (offset/nb)%num_gpus;

    for( id=0; id<num_gpus; id++ ) {
        magma_setdevice(id);
        magmablasSetKernelStream(stream[id][0]);
        cudaMemset( dy[id], 0, n*k*sizeof(magmaDoubleComplex) );
    }

    if( lapackf77_lsame( uplo_, "L" ) ) {
        /* the first block */
        if( loffset1 > 0 ) {
            id = idw;
            kk = 0;

            magma_setdevice(id);
            magmablasSetKernelStream(stream[id][kk]);

            loffset = loffset0+loffset1;
            ib0 = min(nb-loffset1,n);
            // diagonal
            magma_zhemv(MagmaLower, ib0, c_one, dA(id, 0, 0 ), ldda,
                        dX(id, 0), incx, c_one, dY(id, 0, kk), incy);
            // off-diagonl
            if( ib0 < n ) {
                for( j=ib0; j<n; j+= i_0 ) {
                    i_1 = min(i_0, n-j);
                    magma_zgemv(MagmaNoTrans, i_1, ib0, c_one, dA(id, j, 0), ldda,
                                dX(id, 0), incx, c_one, dY(id, j, kk), incy);
                    magma_zgemv(MagmaConjTrans, i_1, ib0, c_one, dA(id, j, 0), ldda,
                                dX(id, j), incx, c_one, dY(id, 0, kk), incy);
                }
            }
        }
        else {
            ib0 = 0;
        }

        /* diagonal */
        for( i=ib0; i<n; i+=nb ) {
            id = ((i+offset)/nb)%num_gpus;
            kk = ((i+loffset1)/(nb*num_gpus))%k;

            magma_setdevice(id);
            magmablasSetKernelStream(stream[id][kk]);

            i_local = (i+loffset1)/(nb*num_gpus);
            ib = min(nb,n-i);

            ii = nb*i_local;

            loffset = loffset0;
            if( id < idw ) loffset += nb;
            magma_zhemv(MagmaLower,  ib, c_one, dA(id, i, ii), ldda,
                        dX(id, i), incx, c_one, dY(id, i, kk), incy);
        }

        /* off-diagonal */
        for( i=ib0; i<n-nb; i+=nb ) {
            id = ((i+offset)/nb)%num_gpus;
            kk = ((i+loffset1)/(nb*num_gpus))%k;
            magma_setdevice(id);
            magmablasSetKernelStream(stream[id][kk]);

            i_local = ((i+loffset1)/nb)/num_gpus;
            ii = nb*i_local;
            ib = min(nb,n-i);
            loffset = loffset0;
            if( id < idw ) loffset += nb;

            for( j=i+ib; j<n; j+= i_0 ) {
                i_1 = min(i_0, n-j);
                magma_zgemv(MagmaNoTrans, i_1, ib, c_one, dA(id, j, ii), ldda,
                            dX(id, i), incx, c_one, dY(id, j, kk), incy);
                magma_zgemv(MagmaConjTrans, i_1, ib, c_one, dA(id, j, ii), ldda,
                            dX(id, j), incx, c_one, dY(id, i, kk), incy);
            }
        }
    } else { /* upper-triangular storage */
        loffset = 0;
        /* diagonal */
        for( i=0; i<n; i+=nb ) {
            id = (i/nb)%num_gpus;
            kk = (i/(nb*num_gpus))%k;
            ib = min(nb,n-i);

            magma_setdevice(id);
            magmablasSetKernelStream(stream[id][kk]);

            i_local = i/(nb*num_gpus);
            ii = nb*i_local;

            magma_zhemv(MagmaUpper, ib, c_one, dA(id, i, ii), ldda,
                        dX(id, i), incx, c_one, dY(id, i, kk), incy);
        }

        /* off-diagonal */
        for( i=nb; i<n; i+=nb ) {
            id = (i/nb)%num_gpus;
            kk = (i/(nb*num_gpus))%k;
            magma_setdevice(id);
            magmablasSetKernelStream(stream[id][kk]);

            i_local = (i/nb)/num_gpus;
            ii = nb*i_local;
            ib = min(nb,n-i);

            magma_zgemv(MagmaNoTrans, i, ib, c_one, dA(id, 0, ii), ldda,
                        dX(id, i), incx, c_one, dY(id, 0, kk), incy);
            magma_zgemv(MagmaConjTrans, i, ib, c_one, dA(id, 0, ii), ldda,
                        dX(id, 0), incx, c_one, dY(id, i, kk), incy);
        }
    }
    /* send to CPU */
    magma_setdevice(0);
    magma_zgetvector_async( n, dY(0, 0, 0), 1, w, 1, stream[0][0] );
    for( kk=1; kk<k; kk++ ) {
        magma_zgetvector_async( n, dY(0, 0, kk), 1, &work[kk*n], 1, stream[0][kk] );
    }
    magmablasSetKernelStream(NULL);

    for( id=1; id<num_gpus; id++ ) {
        magma_setdevice(id);
        for( kk=0; kk<k; kk++ ) {
            magma_zgetvector_async( n, dY(id, 0, kk), 1, &work[id*k*n + kk*n], 1, stream[id][kk] );
        }
        magmablasSetKernelStream(NULL);
    }
#endif
    return 0;
}