Esempio n. 1
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing sormbr
*/
int main( int argc, char** argv )
{
    TESTING_INIT();
    
    real_Double_t   gflops, gpu_perf, gpu_time, cpu_perf, cpu_time;
    float Cnorm, error, dwork[1];
    float c_neg_one = MAGMA_S_NEG_ONE;
    magma_int_t ione = 1;
    magma_int_t m, n, k, mi, ni, mm, nn, nq, size, info;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t nb, ldc, lda, lwork, lwork_max;
    float *C, *R, *A, *work, *tau, *tauq, *taup;
    float *d, *e;
    magma_int_t status = 0;
    
    magma_opts opts;
    opts.parse_opts( argc, argv );
    
    // need slightly looser bound (60*eps instead of 30*eps) for some tests
    opts.tolerance = max( 60., opts.tolerance );
    float tol = opts.tolerance * lapackf77_slamch("E");
    
    // test all combinations of input parameters
    magma_vect_t  vect [] = { MagmaQ,          MagmaP       };
    magma_side_t  side [] = { MagmaLeft,       MagmaRight   };
    magma_trans_t trans[] = { MagmaTrans, MagmaNoTrans };

    printf("%%   M     N     K   vect side   trans   CPU Gflop/s (sec)   GPU Gflop/s (sec)   ||R||_F / ||QC||_F\n");
    printf("%%==============================================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
      for( int ivect = 0; ivect < 2; ++ivect ) {
      for( int iside = 0; iside < 2; ++iside ) {
      for( int itran = 0; itran < 2; ++itran ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            m = opts.msize[itest];
            n = opts.nsize[itest];
            k = opts.ksize[itest];
            nb  = magma_get_sgebrd_nb( m, n );
            ldc = m;
            // A is nq x k (vect=Q) or k x nq (vect=P)
            // where nq=m (left) or nq=n (right)
            nq  = (side[iside] == MagmaLeft ? m  : n );
            mm  = (vect[ivect] == MagmaQ    ? nq : k );
            nn  = (vect[ivect] == MagmaQ    ? k  : nq);
            lda = mm;
            
            // MBR calls either MQR or MLQ in various ways
            if ( vect[ivect] == MagmaQ ) {
                if ( nq >= k ) {
                    gflops = FLOPS_SORMQR( m, n, k, side[iside] ) / 1e9;
                }
                else {
                    if ( side[iside] == MagmaLeft ) {
                        mi = m - 1;
                        ni = n;
                    }
                    else {
                        mi = m;
                        ni = n - 1;
                    }
                    gflops = FLOPS_SORMQR( mi, ni, nq-1, side[iside] ) / 1e9;
                }
            }
            else {
                if ( nq > k ) {
                    gflops = FLOPS_SORMLQ( m, n, k, side[iside] ) / 1e9;
                }
                else {
                    if ( side[iside] == MagmaLeft ) {
                        mi = m - 1;
                        ni = n;
                    }
                    else {
                        mi = m;
                        ni = n - 1;
                    }
                    gflops = FLOPS_SORMLQ( mi, ni, nq-1, side[iside] ) / 1e9;
                }
            }
            
            // workspace for gebrd is (mm + nn)*nb
            // workspace for unmbr is m*nb or n*nb, depending on side
            lwork_max = max( (mm + nn)*nb, max( m*nb, n*nb ));
            // this rounds it up slightly if needed to agree with lwork query below
            lwork_max = int( real( magma_smake_lwork( lwork_max )));
            
            TESTING_MALLOC_CPU( C,    float, ldc*n );
            TESTING_MALLOC_CPU( R,    float, ldc*n );
            TESTING_MALLOC_CPU( A,    float, lda*nn );
            TESTING_MALLOC_CPU( work, float, lwork_max );
            TESTING_MALLOC_CPU( d,    float,             min(mm,nn) );
            TESTING_MALLOC_CPU( e,    float,             min(mm,nn) );
            TESTING_MALLOC_CPU( tauq, float, min(mm,nn) );
            TESTING_MALLOC_CPU( taup, float, min(mm,nn) );
            
            // C is full, m x n
            size = ldc*n;
            lapackf77_slarnv( &ione, ISEED, &size, C );
            lapackf77_slacpy( "Full", &m, &n, C, &ldc, R, &ldc );
            
            size = lda*nn;
            lapackf77_slarnv( &ione, ISEED, &size, A );
            
            // compute BRD factorization to get Householder vectors in A, tauq, taup
            //lapackf77_sgebrd( &mm, &nn, A, &lda, d, e, tauq, taup, work, &lwork_max, &info );
            magma_sgebrd( mm, nn, A, lda, d, e, tauq, taup, work, lwork_max, &info );
            if (info != 0) {
                printf("magma_sgebrd returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            }
            
            if ( vect[ivect] == MagmaQ ) {
                tau = tauq;
            } else {
                tau = taup;
            }
            
            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            cpu_time = magma_wtime();
            lapackf77_sormbr( lapack_vect_const( vect[ivect] ),
                              lapack_side_const( side[iside] ),
                              lapack_trans_const( trans[itran] ),
                              &m, &n, &k,
                              A, &lda, tau, C, &ldc, work, &lwork_max, &info );
            cpu_time = magma_wtime() - cpu_time;
            cpu_perf = gflops / cpu_time;
            if (info != 0) {
                printf("lapackf77_sormbr returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            }
            
            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            // query for workspace size
            lwork = -1;
            magma_sormbr( vect[ivect], side[iside], trans[itran],
                          m, n, k,
                          A, lda, tau, R, ldc, work, lwork, &info );
            if (info != 0) {
                printf("magma_sormbr (lwork query) returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            }
            lwork = (magma_int_t) MAGMA_S_REAL( work[0] );
            if ( lwork < 0 || lwork > lwork_max ) {
                printf("Warning: optimal lwork %d > allocated lwork_max %d\n", (int) lwork, (int) lwork_max );
                lwork = lwork_max;
            }
            
            gpu_time = magma_wtime();
            magma_sormbr( vect[ivect], side[iside], trans[itran],
                          m, n, k,
                          A, lda, tau, R, ldc, work, lwork, &info );
            gpu_time = magma_wtime() - gpu_time;
            gpu_perf = gflops / gpu_time;
            if (info != 0) {
                printf("magma_sormbr returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            }
            
            /* =====================================================================
               compute relative error |QC_magma - QC_lapack| / |QC_lapack|
               =================================================================== */
            size = ldc*n;
            blasf77_saxpy( &size, &c_neg_one, C, &ione, R, &ione );
            Cnorm = lapackf77_slange( "Fro", &m, &n, C, &ldc, dwork );
            error = lapackf77_slange( "Fro", &m, &n, R, &ldc, dwork ) / (magma_ssqrt(m*n) * Cnorm);
            
            printf( "%5d %5d %5d   %c   %4c   %5c   %7.2f (%7.2f)   %7.2f (%7.2f)   %8.2e   %s\n",
                    (int) m, (int) n, (int) k,
                    lapacke_vect_const( vect[ivect] ),
                    lapacke_side_const( side[iside] ),
                    lapacke_trans_const( trans[itran] ),
                    cpu_perf, cpu_time, gpu_perf, gpu_time,
                    error, (error < tol ? "ok" : "failed") );
            status += ! (error < tol);
            
            TESTING_FREE_CPU( C );
            TESTING_FREE_CPU( R );
            TESTING_FREE_CPU( A );
            TESTING_FREE_CPU( work );
            TESTING_FREE_CPU( d );
            TESTING_FREE_CPU( e );
            TESTING_FREE_CPU( taup );
            TESTING_FREE_CPU( tauq );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
      }}}  // end ivect, iside, itran
      printf( "\n" );
    }
    
    opts.cleanup();
    TESTING_FINALIZE();
    return status;
}
Esempio n. 2
0
/**
    Purpose
    -------
    SGEBRD reduces a general real M-by-N matrix A to upper or lower
    bidiagonal form B by an orthogonal transformation: Q**H * A * P = B.

    If m >= n, B is upper bidiagonal; if m < n, B is lower bidiagonal.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows in the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns in the matrix A.  N >= 0.

    @param[in,out]
    A       REAL array, dimension (LDA,N)
            On entry, the M-by-N general matrix to be reduced.
            On exit,
            if m >= n, the diagonal and the first superdiagonal are
              overwritten with the upper bidiagonal matrix B; the
              elements below the diagonal, with the array TAUQ, represent
              the orthogonal matrix Q as a product of elementary
              reflectors, and the elements above the first superdiagonal,
              with the array TAUP, represent the orthogonal matrix P as
              a product of elementary reflectors;
    \n
            if m < n, the diagonal and the first subdiagonal are
              overwritten with the lower bidiagonal matrix B; the
              elements below the first subdiagonal, with the array TAUQ,
              represent the orthogonal matrix Q as a product of
              elementary reflectors, and the elements above the diagonal,
              with the array TAUP, represent the orthogonal matrix P as
              a product of elementary reflectors.
            See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    @param[out]
    d       real array, dimension (min(M,N))
            The diagonal elements of the bidiagonal matrix B:
            D(i) = A(i,i).

    @param[out]
    e       real array, dimension (min(M,N)-1)
            The off-diagonal elements of the bidiagonal matrix B:
            if m >= n, E(i) = A(i,i+1) for i = 1,2,...,n-1;
            if m < n, E(i) = A(i+1,i) for i = 1,2,...,m-1.

    @param[out]
    tauq    REAL array dimension (min(M,N))
            The scalar factors of the elementary reflectors which
            represent the orthogonal matrix Q. See Further Details.

    @param[out]
    taup    REAL array, dimension (min(M,N))
            The scalar factors of the elementary reflectors which
            represent the orthogonal matrix P. See Further Details.

    @param[out]
    work    (workspace) REAL array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The length of the array WORK.  LWORK >= (M+N)*NB, where NB
            is the optimal blocksize.
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value.

    Further Details
    ---------------
    The matrices Q and P are represented as products of elementary
    reflectors:

    If m >= n,

       Q = H(1) H(2) . . . H(n)  and  P = G(1) G(2) . . . G(n-1)

    Each H(i) and G(i) has the form:

       H(i) = I - tauq * v * v'  and G(i) = I - taup * u * u'

    where tauq and taup are real scalars, and v and u are real vectors;
    v(1:i-1) = 0, v(i) = 1, and v(i+1:m) is stored on exit in A(i+1:m,i);
    u(1:i) = 0, u(i+1) = 1, and u(i+2:n) is stored on exit in A(i,i+2:n);
    tauq is stored in TAUQ(i) and taup in TAUP(i).

    If m < n,

       Q = H(1) H(2) . . . H(m-1)  and  P = G(1) G(2) . . . G(m)

    Each H(i) and G(i) has the form:

       H(i) = I - tauq * v * v'  and G(i) = I - taup * u * u'

    where tauq and taup are real scalars, and v and u are real vectors;
    v(1:i) = 0, v(i+1) = 1, and v(i+2:m) is stored on exit in A(i+2:m,i);
    u(1:i-1) = 0, u(i) = 1, and u(i+1:n) is stored on exit in A(i,i+1:n);
    tauq is stored in TAUQ(i) and taup in TAUP(i).

    The contents of A on exit are illustrated by the following examples:

    @verbatim
    m = 6 and n = 5 (m > n):          m = 5 and n = 6 (m < n):

      (  d   e   u1  u1  u1 )           (  d   u1  u1  u1  u1  u1 )
      (  v1  d   e   u2  u2 )           (  e   d   u2  u2  u2  u2 )
      (  v1  v2  d   e   u3 )           (  v1  e   d   u3  u3  u3 )
      (  v1  v2  v3  d   e  )           (  v1  v2  e   d   u4  u4 )
      (  v1  v2  v3  v4  d  )           (  v1  v2  v3  e   d   u5 )
      (  v1  v2  v3  v4  v5 )
    @endverbatim

    where d and e denote diagonal and off-diagonal elements of B, vi
    denotes an element of the vector defining H(i), and ui an element of
    the vector defining G(i).

    @ingroup magma_sgesvd_comp
    ********************************************************************/
extern "C" magma_int_t
magma_sgebrd(
    magma_int_t m, magma_int_t n,
    float *A, magma_int_t lda, float *d, float *e,
    float *tauq, float *taup,
    float *work, magma_int_t lwork,
    magma_int_t *info)
{
#define  A(i, j) (A  + (j)*lda  + (i))
#define dA(i, j) (dA + (j)*ldda + (i))

    float c_neg_one = MAGMA_S_NEG_ONE;
    float c_one     = MAGMA_S_ONE;
    float *dA, *dwork;

    magma_int_t ncol, nrow, jmax, nb, ldda;

    magma_int_t i, j, nx;
    magma_int_t iinfo;

    magma_int_t minmn;
    magma_int_t ldwrkx, ldwrky, lwkopt;
    magma_int_t lquery;

    nb   = magma_get_sgebrd_nb(n);
    ldda = m;

    lwkopt = (m + n) * nb;
    work[0] = MAGMA_S_MAKE( lwkopt, 0. );
    lquery = (lwork == -1);
    
    /* Check arguments */
    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,m)) {
        *info = -4;
    } else if (lwork < lwkopt && (! lquery) ) {
        *info = -10;
    }
    if (*info < 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;

    /* Quick return if possible */
    minmn = min(m,n);
    if (minmn == 0) {
        work[0] = c_one;
        return *info;
    }

    if (MAGMA_SUCCESS != magma_smalloc( &dA, n*ldda + (m + n)*nb )) {
        fprintf (stderr, "!!!! device memory allocation error in sgebrd\n" );
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
    dwork = dA + n*ldda;

    ldwrkx = m;
    ldwrky = n;

    /* Set the block/unblock crossover point NX. */
    nx = 128;

    /* Copy the matrix to the GPU */
    if (minmn - nx >= 1) {
        magma_ssetmatrix( m, n, A, lda, dA, ldda );
    }
    
    for (i=0; i < (minmn - nx); i += nb) {
        /*  Reduce rows and columns i:i+nb-1 to bidiagonal form and return
            the matrices X and Y which are needed to update the unreduced
            part of the matrix */
        nrow = m - i;
        ncol = n - i;

        /*   Get the current panel (no need for the 1st iteration) */
        if ( i > 0 ) {
            magma_sgetmatrix( nrow, nb, dA(i, i), ldda, A( i, i), lda );
            magma_sgetmatrix( nb, ncol - nb,
                              dA(i, i+nb), ldda,
                              A( i, i+nb), lda );
        }

        magma_slabrd_gpu(nrow, ncol, nb,
                         A(i, i),          lda,    dA(i, i),          ldda,
                         d+i, e+i, tauq+i, taup+i,
                         work,             ldwrkx, dwork,             ldwrkx,  // x, dx
                         work+(ldwrkx*nb), ldwrky, dwork+(ldwrkx*nb), ldwrky); // y, dy

        /*  Update the trailing submatrix A(i+nb:m,i+nb:n), using an update
            of the form  A := A - V*Y' - X*U' */
        nrow = m - i - nb;
        ncol = n - i - nb;

        // Send Y back to the GPU
        magma_ssetmatrix( nrow, nb, work  + nb, ldwrkx, dwork + nb, ldwrkx );
        magma_ssetmatrix( ncol, nb,
                          work  + (ldwrkx+1)*nb, ldwrky,
                          dwork + (ldwrkx+1)*nb, ldwrky );

        magma_sgemm( MagmaNoTrans, MagmaConjTrans,
                     nrow, ncol, nb,
                     c_neg_one, dA(i+nb, i   ),      ldda,
                                dwork+(ldwrkx+1)*nb, ldwrky,
                     c_one,     dA(i+nb, i+nb),      ldda);

        magma_sgemm( MagmaNoTrans, MagmaNoTrans,
                     nrow, ncol, nb,
                     c_neg_one, dwork+nb,         ldwrkx,
                                dA( i,    i+nb ), ldda,
                     c_one,     dA( i+nb, i+nb ), ldda);

        /* Copy diagonal and off-diagonal elements of B back into A */
        if (m >= n) {
            jmax = i + nb;
            for (j = i; j < jmax; ++j) {
                *A(j, j  ) = MAGMA_S_MAKE( d[j], 0. );
                *A(j, j+1) = MAGMA_S_MAKE( e[j], 0. );
            }
        } else {
            jmax = i + nb;
            for (j = i; j < jmax; ++j) {
                *A(j,   j ) = MAGMA_S_MAKE( d[j], 0. );
                *A(j+1, j ) = MAGMA_S_MAKE( e[j], 0. );
            }
        }
    }

    /* Use unblocked code to reduce the remainder of the matrix */
    nrow = m - i;
    ncol = n - i;

    if ( 0 < minmn - nx ) {
        magma_sgetmatrix( nrow, ncol, dA(i, i), ldda, A(i, i), lda );
    }
    
    lapackf77_sgebrd( &nrow, &ncol,
                      A(i, i), &lda, d+i, e+i,
                      tauq+i, taup+i, work, &lwork, &iinfo);
    work[0] = MAGMA_S_MAKE( lwkopt, 0. );

    magma_free( dA );
    return *info;
} /* magma_sgebrd */
Esempio n. 3
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Return nb for gesvd based on m
*/
magma_int_t magma_get_sgesvd_nb( magma_int_t m )
{
    return magma_get_sgebrd_nb( m );
}
Esempio n. 4
0
magma_int_t magmaf_get_sgebrd_nb( magma_int_t *m )
{
    return magma_get_sgebrd_nb( *m );
}
Esempio n. 5
0
/***************************************************************************//**
    Purpose
    -------
    SORMBR multiplies by Q or P as part of the SVD decomposition.
    
    If VECT = MagmaQ, SORMBR overwrites the general real M-by-N matrix C with
                                 SIDE = MagmaLeft     SIDE = MagmaRight
    TRANS = MagmaNoTrans:        Q*C                  C*Q
    TRANS = MagmaTrans:     Q**H*C               C*Q**H
    
    If VECT = MagmaP, SORMBR overwrites the general real M-by-N matrix C with
                                 SIDE = MagmaLeft     SIDE = MagmaRight
    TRANS = MagmaNoTrans:        P*C                  C*P
    TRANS = MagmaTrans:     P**H*C               C*P**H
    
    Here Q and P**H are the orthogonal matrices determined by SGEBRD when
    reducing A real matrix A to bidiagonal form: A = Q*B * P**H. Q
    and P**H are defined as products of elementary reflectors H(i) and
    G(i) respectively.
    
    Let nq = m if SIDE = MagmaLeft and nq = n if SIDE = MagmaRight. Thus nq is the
    order of the orthogonal matrix Q or P**H that is applied.
    
    If VECT = MagmaQ, A is assumed to have been an NQ-by-K matrix:
    if nq >= k, Q = H(1) H(2) . . . H(k);
    if nq <  k, Q = H(1) H(2) . . . H(nq-1).
    
    If VECT = MagmaP, A is assumed to have been A K-by-NQ matrix:
    if k <  nq, P = G(1) G(2) . . . G(k);
    if k >= nq, P = G(1) G(2) . . . G(nq-1).
    
    Arguments
    ---------
    @param[in]
    vect    magma_vect_t
      -     = MagmaQ: apply Q or Q**H;
      -     = MagmaP: apply P or P**H.
    
    @param[in]
    side    magma_side_t
      -     = MagmaLeft:  apply Q, Q**H, P or P**H from the Left;
      -     = MagmaRight: apply Q, Q**H, P or P**H from the Right.
    
    @param[in]
    trans   magma_trans_t
      -     = MagmaNoTrans:    No transpose, apply Q or P;
      -     = MagmaTrans: Conjugate transpose, apply Q**H or P**H.
    
    @param[in]
    m       INTEGER
            The number of rows of the matrix C. M >= 0.
    
    @param[in]
    n       INTEGER
            The number of columns of the matrix C. N >= 0.
    
    @param[in]
    k       INTEGER
            If VECT = MagmaQ, the number of columns in the original
            matrix reduced by SGEBRD.
            If VECT = MagmaP, the number of rows in the original
            matrix reduced by SGEBRD.
            K >= 0.
    
    @param[in]
    A       REAL array, dimension
                                  (LDA,min(nq,K)) if VECT = MagmaQ
                                  (LDA,nq)        if VECT = MagmaP
            The vectors which define the elementary reflectors H(i) and
            G(i), whose products determine the matrices Q and P, as
            returned by SGEBRD.
    
    @param[in]
    lda     INTEGER
            The leading dimension of the array A.
            If VECT = MagmaQ, LDA >= max(1,nq);
            if VECT = MagmaP, LDA >= max(1,min(nq,K)).
    
    @param[in]
    tau     REAL array, dimension (min(nq,K))
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i) or G(i) which determines Q or P, as returned
            by SGEBRD in the array argument TAUQ or TAUP.
    
    @param[in,out]
    C       REAL array, dimension (LDC,N)
            On entry, the M-by-N matrix C.
            On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q
            or P*C or P**H*C or C*P or C*P**H.
    
    @param[in]
    ldc     INTEGER
            The leading dimension of the array C. LDC >= max(1,M).
    
    @param[out]
    work    (workspace) REAL array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.
    
    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.
            If SIDE = MagmaLeft,  LWORK >= max(1,N);
            if SIDE = MagmaRight, LWORK >= max(1,M);
            if N = 0 or M = 0, LWORK >= 1.
            For optimum performance
            if SIDE = MagmaLeft,  LWORK >= max(1,N*NB);
            if SIDE = MagmaRight, LWORK >= max(1,M*NB),
            where NB is the optimal blocksize. (NB = 0 if M = 0 or N = 0.)
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.
    
    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
    

    @ingroup magma_unmbr
*******************************************************************************/
extern "C" magma_int_t
magma_sormbr(
    magma_vect_t vect, magma_side_t side, magma_trans_t trans,
    magma_int_t m, magma_int_t n, magma_int_t k,
    float *A, magma_int_t lda,
    float *tau,
    float *C, magma_int_t ldc,
    float *work, magma_int_t lwork,
    magma_int_t *info)
{
    #define A(i,j)  (A + (i) + (j)*lda)
    #define C(i,j)  (C + (i) + (j)*ldc)
            
    magma_int_t i1, i2, nb, mi, ni, nq, nq_1, minwrk, iinfo, lwkopt;
    magma_int_t left, notran, applyq, lquery;
    magma_trans_t transt;
    
    MAGMA_UNUSED( nq_1 );  // used only in version 1

    *info = 0;
    applyq = (vect  == MagmaQ);
    left   = (side  == MagmaLeft);
    notran = (trans == MagmaNoTrans);
    lquery = (lwork == -1);

    /* NQ is the order of Q or P and MINWRK (previously "nw") is the minimum dimension of WORK */
    if (left) {
        nq = m;
        minwrk = n;
    }
    else {
        nq = n;
        minwrk = m;
    }
    if (m == 0 || n == 0) {
        minwrk = 0;
    }
    
    /* check arguments */
    if (! applyq && vect != MagmaP) {
        *info = -1;
    }
    else if (! left && side != MagmaRight) {
        *info = -2;
    }
    else if (! notran && trans != MagmaTrans) {
        *info = -3;
    }
    else if (m < 0) {
        *info = -4;
    }
    else if (n < 0) {
        *info = -5;
    }
    else if (k < 0) {
        *info = -6;
    }
    else if ( (   applyq && lda < max(1,nq)        ) ||
              ( ! applyq && lda < max(1,min(nq,k)) ) ) {
        *info = -8;
    }
    else if (ldc < max(1,m)) {
        *info = -11;
    }
    else if (lwork < max(1,minwrk) && ! lquery) {
        *info = -13;
    }

    if (*info == 0) {
        if (minwrk > 0) {
            // TODO have get_sormqr_nb and get_sormlq_nb routines? see original LAPACK sormbr.
            // TODO make them dependent on m, n, and k?
            nb = magma_get_sgebrd_nb( m, n );
            lwkopt = max(1, minwrk*nb);
        }
        else {
            lwkopt = 1;
        }
        work[0] = magma_smake_lwork( lwkopt );
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0) {
        return *info;
    }

    if (applyq) {
        /* Apply Q */
        if (nq >= k) {
            /* Q was determined by a call to SGEBRD with nq >= k */
            #if VERSION == 1
            lapackf77_sormqr( lapack_side_const(side), lapack_trans_const(trans),
                              &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, &iinfo);
            #else
            magma_sormqr( side, trans,
                          m, n, k, A, lda, tau, C, ldc, work, lwork, &iinfo);
            #endif
        }
        else if (nq > 1) {
            /* Q was determined by a call to SGEBRD with nq < k */
            if (left) {
                mi = m - 1;
                ni = n;
                i1 = 1;
                i2 = 0;
            }
            else {
                mi = m;
                ni = n - 1;
                i1 = 0;
                i2 = 1;
            }
            #if VERSION == 1
            nq_1 = nq - 1;
            lapackf77_sormqr( lapack_side_const(side), lapack_trans_const(trans),
                              &mi, &ni, &nq_1, A(1,0), &lda, tau, C(i1,i2), &ldc, work, &lwork, &iinfo);
            #else
            magma_sormqr( side, trans,
                          mi, ni, nq-1, A(1,0), lda, tau, C(i1,i2), ldc, work, lwork, &iinfo);
            #endif
        }
    }
    else {
        /* Apply P */
        if (notran) {
            transt = MagmaTrans;
        }
        else {
            transt = MagmaNoTrans;
        }
        if (nq > k) {
            /* P was determined by a call to SGEBRD with nq > k */
            #if VERSION == 1
            lapackf77_sormlq( lapack_side_const(side), lapack_trans_const(transt),
                              &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, &iinfo);
            #else
            magma_sormlq( side, transt,
                          m, n, k, A, lda, tau, C, ldc, work, lwork, &iinfo);
            #endif
        }
        else if (nq > 1) {
            /* P was determined by a call to SGEBRD with nq <= k */
            if (left) {
                mi = m - 1;
                ni = n;
                i1 = 1;
                i2 = 0;
            }
            else {
                mi = m;
                ni = n - 1;
                i1 = 0;
                i2 = 1;
            }
            #if VERSION == 1
            nq_1 = nq - 1;
            lapackf77_sormlq( lapack_side_const(side), lapack_trans_const(transt),
                              &mi, &ni, &nq_1, A(0,1), &lda, tau, C(i1,i2), &ldc, work, &lwork, &iinfo);
            #else
            magma_sormlq( side, transt,
                          mi, ni, nq-1, A(0,1), lda, tau, C(i1,i2), ldc, work, lwork, &iinfo);
            #endif
        }
    }
    work[0] = magma_smake_lwork( lwkopt );
    return *info;
} /* magma_sormbr */
Esempio n. 6
0
extern "C" magma_int_t
magma_sgebrd(magma_int_t m, magma_int_t n,
             float *a, magma_int_t lda, float *d, float *e,
             float *tauq, float *taup, 
             float *work, magma_int_t lwork, 
             magma_int_t *info)
{
/*  -- MAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2012

    Purpose
    =======
    SGEBRD reduces a general real M-by-N matrix A to upper or lower
    bidiagonal form B by an orthogonal transformation: Q**T * A * P = B.

    If m >= n, B is upper bidiagonal; if m < n, B is lower bidiagonal.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows in the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns in the matrix A.  N >= 0.

    A       (input/output) REAL array, dimension (LDA,N)
            On entry, the M-by-N general matrix to be reduced.
            On exit,
            if m >= n, the diagonal and the first superdiagonal are
              overwritten with the upper bidiagonal matrix B; the
              elements below the diagonal, with the array TAUQ, represent
              the orthogonal matrix Q as a product of elementary
              reflectors, and the elements above the first superdiagonal,
              with the array TAUP, represent the orthogonal matrix P as
              a product of elementary reflectors;
            if m < n, the diagonal and the first subdiagonal are
              overwritten with the lower bidiagonal matrix B; the
              elements below the first subdiagonal, with the array TAUQ,
              represent the orthogonal matrix Q as a product of
              elementary reflectors, and the elements above the diagonal,
              with the array TAUP, represent the orthogonal matrix P as
              a product of elementary reflectors.
            See Further Details.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    D       (output) real array, dimension (min(M,N))
            The diagonal elements of the bidiagonal matrix B:
            D(i) = A(i,i).

    E       (output) real array, dimension (min(M,N)-1)
            The off-diagonal elements of the bidiagonal matrix B:
            if m >= n, E(i) = A(i,i+1) for i = 1,2,...,n-1;
            if m < n, E(i) = A(i+1,i) for i = 1,2,...,m-1.

    TAUQ    (output) REAL array dimension (min(M,N))
            The scalar factors of the elementary reflectors which
            represent the orthogonal matrix Q. See Further Details.

    TAUP    (output) REAL array, dimension (min(M,N))
            The scalar factors of the elementary reflectors which
            represent the orthogonal matrix P. See Further Details.

    WORK    (workspace/output) REAL array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    LWORK   (input) INTEGER
            The length of the array WORK.  LWORK >= (M+N)*NB, where NB
            is the optimal blocksize.

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value.

    Further Details
    ===============
    The matrices Q and P are represented as products of elementary
    reflectors:

    If m >= n,
       Q = H(1) H(2) . . . H(n)  and  P = G(1) G(2) . . . G(n-1)
    Each H(i) and G(i) has the form:
       H(i) = I - tauq * v * v'  and G(i) = I - taup * u * u'
    where tauq and taup are real scalars, and v and u are real vectors;
    v(1:i-1) = 0, v(i) = 1, and v(i+1:m) is stored on exit in A(i+1:m,i);
    u(1:i) = 0, u(i+1) = 1, and u(i+2:n) is stored on exit in A(i,i+2:n);
    tauq is stored in TAUQ(i) and taup in TAUP(i).

    If m < n,
       Q = H(1) H(2) . . . H(m-1)  and  P = G(1) G(2) . . . G(m)
    Each H(i) and G(i) has the form:
       H(i) = I - tauq * v * v'  and G(i) = I - taup * u * u'
    where tauq and taup are real scalars, and v and u are real vectors;
    v(1:i) = 0, v(i+1) = 1, and v(i+2:m) is stored on exit in A(i+2:m,i);
    u(1:i-1) = 0, u(i) = 1, and u(i+1:n) is stored on exit in A(i,i+1:n);
    tauq is stored in TAUQ(i) and taup in TAUP(i).

    The contents of A on exit are illustrated by the following examples:

    m = 6 and n = 5 (m > n):          m = 5 and n = 6 (m < n):

      (  d   e   u1  u1  u1 )           (  d   u1  u1  u1  u1  u1 )
      (  v1  d   e   u2  u2 )           (  e   d   u2  u2  u2  u2 )
      (  v1  v2  d   e   u3 )           (  v1  e   d   u3  u3  u3 )
      (  v1  v2  v3  d   e  )           (  v1  v2  e   d   u4  u4 )
      (  v1  v2  v3  v4  d  )           (  v1  v2  v3  e   d   u5 )
      (  v1  v2  v3  v4  v5 )

    where d and e denote diagonal and off-diagonal elements of B, vi
    denotes an element of the vector defining H(i), and ui an element of
    the vector defining G(i).
    =====================================================================    */

    float c_neg_one = MAGMA_S_NEG_ONE;
    float c_one     = MAGMA_S_ONE;
    float *da, *dwork;

    magma_int_t ncol, nrow, jmax, nb, ldda;

    magma_int_t i, j, nx;
    magma_int_t iinfo;

    magma_int_t minmn;
    magma_int_t ldwrkx, ldwrky, lwkopt;
    magma_int_t lquery;

    nb   = magma_get_sgebrd_nb(n);
    ldda = m;

    lwkopt = (m + n) * nb;
    work[0] = MAGMA_S_MAKE( lwkopt, 0. );
    lquery = (lwork == -1);
    
    /* Check arguments */
    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,m)) {
        *info = -4;
    } else if (lwork < lwkopt && (! lquery) ) {
        *info = -10;
    }
    if (*info < 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;

    /* Quick return if possible */
    minmn = min(m,n);
    if (minmn == 0) {
        work[0] = c_one;
        return *info;
    }

    if (MAGMA_SUCCESS != magma_smalloc( &da, n*ldda + (m + n)*nb )) {
        fprintf (stderr, "!!!! device memory allocation error in sgebrd\n" );
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info; 
    }
    dwork = da + (n)*ldda;

    ldwrkx = m;
    ldwrky = n;

    /* Set the block/unblock crossover point NX. */
    nx = 128;

    /* Copy the matrix to the GPU */
    if (minmn - nx >= 1) {
        magma_ssetmatrix( m, n, a, lda, da, ldda );
    }
    
    for (i=0; i< (minmn - nx); i += nb) {

        /*  Reduce rows and columns i:i+nb-1 to bidiagonal form and return
            the matrices X and Y which are needed to update the unreduced
            part of the matrix */
        nrow = m - i;
        ncol = n - i;

        /*   Get the current panel (no need for the 1st iteration) */
        if ( i > 0 ) {
            magma_sgetmatrix( nrow, nb, dA(i, i), ldda, A( i, i), lda );
            magma_sgetmatrix( nb, ncol - nb,
                              dA(i, i+nb), ldda,
                              A( i, i+nb), lda );
        }

        magma_slabrd_gpu(nrow, ncol, nb,
                         A(i, i),          lda,    dA(i, i),          ldda,
                         d+i, e+i, tauq+i, taup+i,
                         work,             ldwrkx, dwork,             ldwrkx,  // x, dx
                         work+(ldwrkx*nb), ldwrky, dwork+(ldwrkx*nb), ldwrky); // y, dy

        /*  Update the trailing submatrix A(i+nb:m,i+nb:n), using an update
            of the form  A := A - V*Y' - X*U' */
        nrow = m - i - nb;
        ncol = n - i - nb;

        // Send Y back to the GPU
        magma_ssetmatrix( nrow, nb, work  + nb, ldwrkx, dwork + nb, ldwrkx );
        magma_ssetmatrix( ncol, nb,
                          work  + (ldwrkx+1)*nb, ldwrky,
                          dwork + (ldwrkx+1)*nb, ldwrky );

        magma_sgemm( MagmaNoTrans, MagmaTrans, 
                     nrow, ncol, nb, 
                     c_neg_one, dA(i+nb, i   ),      ldda,
                                dwork+(ldwrkx+1)*nb, ldwrky,
                     c_one,     dA(i+nb, i+nb),      ldda);

        magma_sgemm( MagmaNoTrans, MagmaNoTrans, 
                     nrow, ncol, nb, 
                     c_neg_one, dwork+nb,         ldwrkx,
                                dA( i,    i+nb ), ldda,
                     c_one,     dA( i+nb, i+nb ), ldda);

        /* Copy diagonal and off-diagonal elements of B back into A */
        if (m >= n) {
            jmax = i + nb;
            for (j = i; j < jmax; ++j) {
                *A(j, j  ) = MAGMA_S_MAKE( d[j], 0. );
                *A(j, j+1) = MAGMA_S_MAKE( e[j], 0. );
            }
        } else {
            jmax = i + nb;
            for (j = i; j < jmax; ++j) {
                *A(j,   j ) = MAGMA_S_MAKE( d[j], 0. );
                *A(j+1, j ) = MAGMA_S_MAKE( e[j], 0. );
            }
        }
    }

    /* Use unblocked code to reduce the remainder of the matrix */
    nrow = m - i;
    ncol = n - i;

    if ( 0 < minmn - nx ) {
        magma_sgetmatrix( nrow, ncol, dA(i, i), ldda, A(i, i), lda );
    }
    
    lapackf77_sgebrd( &nrow, &ncol, 
                      A(i, i), &lda, d+i, e+i,
                      tauq+i, taup+i, work, &lwork, &iinfo);
    work[0] = MAGMA_S_MAKE( lwkopt, 0. );

    magma_free( da );
    return *info;
} /* sgebrd */