コード例 #1
0
ファイル: testing_cblas_z.cpp プロジェクト: xulunfan/magma
// ----------------------------------------
int main( int argc, char** argv )
{
    TESTING_INIT();
    
    //real_Double_t   t_m, t_c, t_f;
    magma_int_t ione = 1;
    
    magmaDoubleComplex  *A, *B;
    double error_cblas, error_fblas, error_inline;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t i, j, k, m, n, size, maxn, ld;
    
    // complex x for magma, cblas, fortran, inline blas respectively
    magmaDoubleComplex x2_m, x2_c, x2_f, x2_i;
    
    // real    x for magma, cblas, fortran, inline blas respectively
    double x_m, x_c, x_f, x_i;
    
    MAGMA_UNUSED( x_c  );
    MAGMA_UNUSED( x_f  );
    MAGMA_UNUSED( x2_c );
    MAGMA_UNUSED( x2_f );
    MAGMA_UNUSED( x2_m );
    
    magma_opts opts;
    opts.parse_opts( argc, argv );
    
    opts.tolerance = max( 100., opts.tolerance );
    double tol = opts.tolerance * lapackf77_dlamch("E");
    gTol = tol;
    
    magma_int_t inc[] = { -2, -1, 1, 2 };  //{ 1 };  //{ -1, 1 };
    magma_int_t ninc = sizeof(inc)/sizeof(*inc);
    magma_int_t maxinc = 0;
    for( i=0; i < ninc; ++i ) {
        maxinc = max( maxinc, abs(inc[i]) );
    }
    
    printf( "!! Calling these CBLAS and Fortran BLAS sometimes crashes (segfaults), which !!\n"
            "!! is why we use wrappers. It does not necesarily indicate a bug in MAGMA.   !!\n"
            "!! If MAGMA_WITH_MKL or __APPLE__ are defined, known failures are skipped.   !!\n"
            "\n" );
    
    // tell user about disabled functions
    #ifndef HAVE_CBLAS
        printf( "n/a: HAVE_CBLAS not defined, so no cblas functions tested.\n\n" );
    #endif
    
    #if defined(MAGMA_WITH_MKL)
        printf( "n/a: cblas_zdotc, cblas_zdotu, blasf77_zdotc, and blasf77_zdotu are disabled with MKL, due to segfaults.\n\n" );
    #endif
    
    #if defined(__APPLE__)
        printf( "n/a: blasf77_zdotc and blasf77_zdotu are disabled on MacOS, due to segfaults.\n\n" );
    #endif
    
    printf( "%%                                          Error w.r.t.   Error w.r.t.   Error w.r.t.\n"
            "%%   M     N     K  incx  incy   Function   CBLAS          Fortran BLAS   inline\n"
            "%%====================================================================================\n" );
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        if ( itest > 0 ) {
            printf( "%%----------------------------------------------------------------------\n" );
        }
        
        m = opts.msize[itest];
        n = opts.nsize[itest];
        k = opts.ksize[itest];
        
        // allocate matrices
        // over-allocate so they can be any combination of
        // {m,n,k} * {abs(incx), abs(incy)} by
        // {m,n,k} * {abs(incx), abs(incy)}
        maxn = max( max( m, n ), k ) * maxinc;
        ld = max( 1, maxn );
        size = ld*maxn;
        TESTING_MALLOC_CPU( A, magmaDoubleComplex, size );
        TESTING_MALLOC_CPU( B, magmaDoubleComplex, size );
        
        // initialize matrices
        lapackf77_zlarnv( &ione, ISEED, &size, A );
        lapackf77_zlarnv( &ione, ISEED, &size, B );
        
        // ----- test DZASUM
        for( int iincx = 0; iincx < ninc; ++iincx ) {
            magma_int_t incx = inc[iincx];
            
            for( int iincy = 0; iincy < ninc; ++iincy ) {
                magma_int_t incy = inc[iincy];
                
                // get one-norm of column j of A
                if ( incx > 0 && incx == incy ) {  // positive, no incy
                    error_cblas  = 0;
                    error_fblas  = 0;
                    error_inline = 0;
                    for( j=0; j < k; ++j ) {
                        x_m = magma_cblas_dzasum( m, A(0,j), incx );
                        
                        #ifdef HAVE_CBLAS
                            x_c = cblas_dzasum( m, A(0,j), incx );
                            error_cblas = max( error_cblas, fabs(x_m - x_c) / fabs(m*x_c) );
                        #else
                            x_c = 0;
                            error_cblas = SKIPPED_FLAG;
                        #endif
                        
                        x_f = blasf77_dzasum( &m, A(0,j), &incx );
                        error_fblas = max( error_fblas, fabs(x_m - x_f) / fabs(m*x_f) );
                        
                        // inline implementation
                        x_i = 0;
                        for( i=0; i < m; ++i ) {
                            x_i += MAGMA_Z_ABS1( *A(i*incx,j) );  // |real(Aij)| + |imag(Aij)|
                        }
                        error_inline = max( error_inline, fabs(x_m - x_i) / fabs(m*x_i) );
                        
                        //printf( "dzasum xm %.8e, xc %.8e, xf %.8e, xi %.8e\n", x_m, x_c, x_f, x_i );
                    }
                    output( "dzasum", m, n, k, incx, incy, error_cblas, error_fblas, error_inline );
                }
            }
        }
        printf( "\n" );
        
        // ----- test DZNRM2
        // get two-norm of column j of A
        for( int iincx = 0; iincx < ninc; ++iincx ) {
            magma_int_t incx = inc[iincx];
            
            for( int iincy = 0; iincy < ninc; ++iincy ) {
                magma_int_t incy = inc[iincy];
                
                if ( incx > 0 && incx == incy ) {  // positive, no incy
                    error_cblas  = 0;
                    error_fblas  = 0;
                    error_inline = 0;
                    for( j=0; j < k; ++j ) {
                        x_m = magma_cblas_dznrm2( m, A(0,j), incx );
                        
                        #ifdef HAVE_CBLAS
                            x_c = cblas_dznrm2( m, A(0,j), incx );
                            error_cblas = max( error_cblas, fabs(x_m - x_c) / fabs(m*x_c) );
                        #else
                            x_c = 0;
                            error_cblas = SKIPPED_FLAG;
                        #endif
                        
                        x_f = blasf77_dznrm2( &m, A(0,j), &incx );
                        error_fblas = max( error_fblas, fabs(x_m - x_f) / fabs(m*x_f) );
                        
                        // inline implementation (poor -- doesn't scale)
                        x_i = 0;
                        for( i=0; i < m; ++i ) {
                            x_i += real( *A(i*incx,j) ) * real( *A(i*incx,j) )
                                +  imag( *A(i*incx,j) ) * imag( *A(i*incx,j) );
                            // same: real( conj( *A(i*incx,j) ) * *A(i*incx,j) );
                        }
                        x_i = sqrt( x_i );
                        error_inline = max( error_inline, fabs(x_m - x_i) / fabs(m*x_i) );
                        
                        //printf( "dznrm2 xm %.8e, xc %.8e, xf %.8e, xi %.8e\n", x_m, x_c, x_f, x_i );
                    }
                    output( "dznrm2", m, n, k, incx, incy, error_cblas, error_fblas, error_inline );
                }
            }
        }
        printf( "\n" );
        
        // ----- test ZDOTC
        // dot columns, Aj^H Bj
        for( int iincx = 0; iincx < ninc; ++iincx ) {
            magma_int_t incx = inc[iincx];
            
            for( int iincy = 0; iincy < ninc; ++iincy ) {
                magma_int_t incy = inc[iincy];
                
                error_cblas  = 0;
                error_fblas  = 0;
                error_inline = 0;
                for( j=0; j < k; ++j ) {
                    // MAGMA implementation, not just wrapper
                    x2_m = magma_cblas_zdotc( m, A(0,j), incx, B(0,j), incy );
                    
                    // crashes with MKL 11.1.2, ILP64
                    #if defined(HAVE_CBLAS) && ! defined(MAGMA_WITH_MKL)
                        #ifdef COMPLEX
                        cblas_zdotc_sub( m, A(0,j), incx, B(0,j), incy, &x2_c );
                        #else
                        x2_c = cblas_zdotc( m, A(0,j), incx, B(0,j), incy );
                        #endif
                        error_cblas = max( error_cblas, fabs(x2_m - x2_c) / fabs(m*x2_c) );
                    #else
                        x2_c = MAGMA_Z_ZERO;
                        error_cblas = SKIPPED_FLAG;
                    #endif
                    
                    // crashes with MKL 11.2.3 and MacOS 10.9
                    #if (! defined(COMPLEX) || ! defined(MAGMA_WITH_MKL)) && ! defined(__APPLE__)
                        x2_f = blasf77_zdotc( &m, A(0,j), &incx, B(0,j), &incy );
                        error_fblas = max( error_fblas, fabs(x2_m - x2_f) / fabs(m*x2_f) );
                    #else
                        x2_f = MAGMA_Z_ZERO;
                        error_fblas = SKIPPED_FLAG;
                    #endif
                    
                    // inline implementation
                    x2_i = MAGMA_Z_ZERO;
                    magma_int_t A_offset = (incx > 0 ? 0 : (-n + 1)*incx);
                    magma_int_t B_offset = (incy > 0 ? 0 : (-n + 1)*incy);
                    for( i=0; i < m; ++i ) {
                        x2_i += conj( *A(A_offset + i*incx,j) ) * *B(B_offset + i*incy,j);
                    }
                    error_inline = max( error_inline, fabs(x2_m - x2_i) / fabs(m*x2_i) );
                    
                    //printf( "zdotc xm %.8e + %.8ei, xc %.8e + %.8ei, xf %.8e + %.8ei, xi %.8e + %.8ei\n",
                    //        real(x2_m), imag(x2_m),
                    //        real(x2_c), imag(x2_c),
                    //        real(x2_f), imag(x2_f),
                    //        real(x2_i), imag(x2_i) );
                }
                output( "zdotc", m, n, k, incx, incy, error_cblas, error_fblas, error_inline );
            }
        }
        printf( "\n" );
        
        // ----- test ZDOTU
        // dot columns, Aj^T * Bj
        for( int iincx = 0; iincx < ninc; ++iincx ) {
            magma_int_t incx = inc[iincx];
            
            for( int iincy = 0; iincy < ninc; ++iincy ) {
                magma_int_t incy = inc[iincy];
                
                error_cblas  = 0;
                error_fblas  = 0;
                error_inline = 0;
                for( j=0; j < k; ++j ) {
                    // MAGMA implementation, not just wrapper
                    x2_m = magma_cblas_zdotu( m, A(0,j), incx, B(0,j), incy );
                    
                    // crashes with MKL 11.1.2, ILP64
                    #if defined(HAVE_CBLAS) && ! defined(MAGMA_WITH_MKL)
                        #ifdef COMPLEX
                        cblas_zdotu_sub( m, A(0,j), incx, B(0,j), incy, &x2_c );
                        #else
                        x2_c = cblas_zdotu( m, A(0,j), incx, B(0,j), incy );
                        #endif
                        error_cblas = max( error_cblas, fabs(x2_m - x2_c) / fabs(m*x2_c) );
                    #else
                        x2_c = MAGMA_Z_ZERO;
                        error_cblas = SKIPPED_FLAG;
                    #endif
                    
                    // crashes with MKL 11.2.3 and MacOS 10.9
                    #if (! defined(COMPLEX) || ! defined(MAGMA_WITH_MKL)) && ! defined(__APPLE__)
                        x2_f = blasf77_zdotu( &m, A(0,j), &incx, B(0,j), &incy );
                        error_fblas = max( error_fblas, fabs(x2_m - x2_f) / fabs(m*x2_f) );
                    #else
                        x2_f = MAGMA_Z_ZERO;
                        error_fblas = SKIPPED_FLAG;
                    #endif
                    
                    // inline implementation
                    x2_i = MAGMA_Z_ZERO;
                    magma_int_t A_offset = (incx > 0 ? 0 : (-n + 1)*incx);
                    magma_int_t B_offset = (incy > 0 ? 0 : (-n + 1)*incy);
                    for( i=0; i < m; ++i ) {
                        x2_i += *A(A_offset + i*incx,j) * *B(B_offset + i*incy,j);
                    }
                    error_inline = max( error_inline, fabs(x2_m - x2_i) / fabs(m*x2_i) );
                    
                    //printf( "zdotu xm %.8e + %.8ei, xc %.8e + %.8ei, xf %.8e + %.8ei, xi %.8e + %.8ei\n",
                    //        real(x2_m), imag(x2_m),
                    //        real(x2_c), imag(x2_c),
                    //        real(x2_f), imag(x2_f),
                    //        real(x2_i), imag(x2_i) );
                }
                output( "zdotu", m, n, k, incx, incy, error_cblas, error_fblas, error_inline );
            }
        }
        
        // cleanup
        TESTING_FREE_CPU( A );
        TESTING_FREE_CPU( B );
        fflush( stdout );
    }  // itest, incx, incy
    
    opts.cleanup();
    TESTING_FINALIZE();
    return gStatus;
}
コード例 #2
0
/***************************************************************************//**
    Purpose
    -------
    DLARFB applies a real block reflector H or its transpose H^H to a
    DOUBLE PRECISION m by n matrix C, from the left.
    
    __Note that this function assumes__ that the upper part of dV_array is 0
    because it is referenced. Same for upper/lower part of dT_array.

    Arguments
    ---------
    @param[in]
    side    magma_side_t
      -     = MagmaLeft:      apply H or H^H from the Left
      -     = MagmaRight:     apply H or H^H from the Right

    @param[in]
    trans   magma_trans_t
      -     = MagmaNoTrans:    apply H   (No transpose)
      -     = MagmaTrans: apply H^H (Conjugate transpose)

    @param[in]
    direct  magma_direct_t
            Indicates how H is formed from a product of elementary
            reflectors
      -     = MagmaForward:  H = H(1) H(2) . . . H(k) (Forward)
      -     = MagmaBackward: H = H(k) . . . H(2) H(1) (Backward)

    @param[in]
    storev  magma_storev_t
            Indicates how the vectors which define the elementary
            reflectors are stored:
      -     = MagmaColumnwise: Columnwise
      -     = MagmaRowwise:    Rowwise

    @param[in]
    m       INTEGER
            The number of rows of the matrix C.

    @param[in]
    n       INTEGER
            The number of columns of the matrix C.

    @param[in]
    k       INTEGER
            The order of the matrix T (= the number of elementary
            reflectors whose product defines the block reflector).

    @param[in]
    dV_array      DOUBLE PRECISION array on the GPU, dimension
                (LDDV,K) if STOREV = MagmaColumnwise
                (LDDV,M) if STOREV = MagmaRowwise and SIDE = MagmaLeft
                (LDDV,N) if STOREV = MagmaRowwise and SIDE = MagmaRight
            The matrix V. See further details.

    @param[in]
    lddv    INTEGER
            The leading dimension of the array V.
            If STOREV = MagmaColumnwise and SIDE = MagmaLeft, LDDV >= max(1,M);
            if STOREV = MagmaColumnwise and SIDE = MagmaRight, LDDV >= max(1,N);
            if STOREV = MagmaRowwise, LDDV >= K.

    @param[in]
    dT_array      DOUBLE PRECISION array on the GPU, dimension (LDDT,K)
            The triangular k by k matrix T in the representation of the
            block reflector.

    @param[in]
    lddt    INTEGER
            The leading dimension of the array T. LDDT >= K.

    @param[in,out]
    dC_array      DOUBLE PRECISION array on the GPU, dimension (LDDC,N)
            On entry, the m by n matrix C.
            On exit, C is overwritten by H*C, or H^H*C, or C*H, or C*H^H.

    @param[in]
    lddc    INTEGER
            The leading dimension of the array C. LDDC >= max(1,M).

    @param
    dwork_array   (workspace) DOUBLE PRECISION array, dimension (LDWORK,K)

    @param[in]
    ldwork  INTEGER
            The leading dimension of the array WORK.
            If SIDE = MagmaLeft,  LDWORK >= max(1,N);
            if SIDE = MagmaRight, LDWORK >= max(1,M);

    @param
    dworkvt_array (workspace) DOUBLE PRECISION array, dimension (LDWORKT,K)

    @param[in]
    ldworkvt INTEGER
            The leading dimension of the array WORKVT.
            LDWORKVT >= max(1,min(M,N));

    @param[in]
    batchCount  INTEGER
                The number of matrices to operate on.

    @param[in]
    queue   magma_queue_t
            Queue to execute in.

    Further Details
    ---------------
    The shape of the matrix V and the storage of the vectors which define
    the H(i) is best illustrated by the following example with n = 5 and
    k = 3.
    All elements including 0's and 1's are stored, unlike LAPACK.

        DIRECT = MagmaForward and         DIRECT = MagmaForward and
        STOREV = MagmaColumnwise:         STOREV = MagmaRowwise:

                 V = (  1  0  0 )                 V = (  1 v1 v1 v1 v1 )
                     ( v1  1  0 )                     (  0  1 v2 v2 v2 )
                     ( v1 v2  1 )                     (  0  0  1 v3 v3 )
                     ( v1 v2 v3 )
                     ( v1 v2 v3 )

        DIRECT = MagmaBackward and        DIRECT = MagmaBackward and
        STOREV = MagmaColumnwise:         STOREV = MagmaRowwise:

                 V = ( v1 v2 v3 )                 V = ( v1 v1  1  0  0 )
                     ( v1 v2 v3 )                     ( v2 v2 v2  1  0 )
                     (  1 v2 v3 )                     ( v3 v3 v3 v3  1 )
                     (  0  1 v3 )
                     (  0  0  1 )

    @ingroup magma_larfb_batched
*******************************************************************************/
extern "C" magma_int_t
magma_dlarfb_gemm_batched(
    magma_side_t side, magma_trans_t trans, magma_direct_t direct, magma_storev_t storev,
    magma_int_t m, magma_int_t n, magma_int_t k,
    magmaDouble_const_ptr dV_array[],    magma_int_t lddv,
    magmaDouble_const_ptr dT_array[],    magma_int_t lddt,
    magmaDouble_ptr dC_array[],          magma_int_t lddc,
    magmaDouble_ptr dwork_array[],       magma_int_t ldwork,
    magmaDouble_ptr dworkvt_array[],     magma_int_t ldworkvt,
    magma_int_t batchCount, magma_queue_t queue)
{
    // Constants
    const double c_zero    = MAGMA_D_ZERO;
    const double c_one     = MAGMA_D_ONE;
    const double c_neg_one = MAGMA_D_NEG_ONE;

    /* Function Body */
    magma_int_t info = 0;
    if (m <= 0 || n <= 0) {
        return info;
    }
    
    // Local variables
    magma_int_t ldwvt = (m > n ?  k : m);
    magma_int_t ldw;
    if ( side == MagmaLeft ) {
        ldw = k;
    } else {
        ldw = m;
    }
    // opposite of trans
    magma_trans_t transt;
    if (trans == MagmaNoTrans)
        transt = MagmaTrans;
    else
        transt = MagmaNoTrans;
    
    MAGMA_UNUSED( transt );  // TODO: is this a bug that it isn't used?
    
    // whether V is stored transposed or not
    magma_trans_t notransV, transV;
    if (storev == MagmaColumnwise) {
        notransV = MagmaNoTrans;
        transV   = MagmaTrans;
    }
    else {
        notransV = MagmaTrans;
        transV   = MagmaNoTrans;
    }

    if ( side == MagmaLeft ) {
        // Form H C or H^H C
        // Comments assume H C.
        // When forming H^H C, T gets transposed via transt for m >= n or by trans for m < n.
        
        // W = V^H C                              
        magma_dgemm_batched( MagmaTrans,notransV, /*NontransLeft*/
                     k, n, m,
                     c_one,  dV_array,    lddv,
                             dC_array,    lddc,
                     c_zero, dwork_array, ldw,
                     batchCount, queue );

        if (m <= n) {
            // W2 = V T
            magma_dgemm_batched( notransV, trans, /* (NoTrans), trans(ConjTrans),*/
                         m, k, k,
                         c_one,  dV_array, lddv,
                                 dT_array, lddt,
                         c_zero, dworkvt_array, ldwvt,
                         batchCount, queue );

            // C = C - W2 W = C - V T V^H C = (I - V T V^H) C = H C
            magma_dgemm_batched( MagmaNoTrans, MagmaNoTrans,
                         m, n, k,
                         c_neg_one, dworkvt_array,  ldwvt,
                                    dwork_array,    ldw,
                         c_one,     dC_array,       lddc,
                         batchCount, queue );
        }
        else {
            // W2 = T W  = T  V^H C
            magma_dgemm_batched( trans, MagmaNoTrans,
                         k, n, k,
                         c_one,  dT_array, lddt,
                                 dwork_array, ldw,
                         c_zero, dworkvt_array, ldwvt,
                         batchCount, queue );

            // C = C - V W2 = C - V T V^H C = (I - V T V^H) C = H C
            magma_dgemm_batched( notransV, MagmaNoTrans,
                         m, n, k,
                         c_neg_one, dV_array,  lddv,
                                    dworkvt_array,  ldwvt,
                         c_one,     dC_array,       lddc,
                         batchCount, queue );
        }
    }
    else {
        // Form C H or C H^H
        // Comments assume C H.
        // When forming C H^H, T gets transposed via trans.
        
        // W = C V
        magma_dgemm_batched( MagmaNoTrans, notransV,
                     m, k, n,
                     c_one,  dC_array,    lddc,
                             dV_array,    lddv,
                     c_zero, dwork_array, ldw,
                     batchCount, queue );

        if (m <= n) {
            // W2 = W T = C V T
            magma_dgemm_batched( MagmaNoTrans, trans,
                         m, k, k,
                         c_one,  dwork_array, ldw,
                                 dT_array, lddt,
                         c_zero, dworkvt_array, ldwvt,
                         batchCount, queue );

            // C = C - W2 V^H = C - C V T V^H = C (I - V T V^H) = C H
            magma_dgemm_batched( MagmaNoTrans, transV,
                         m, n, k,
                         c_neg_one, dworkvt_array, ldwvt,
                                    dV_array,    lddv,
                         c_one,     dC_array,    lddc,
                         batchCount, queue );
        }
        else {
            // W2 = T V^H
            magma_dgemm_batched( trans, transV,
                         k, n, k,
                         c_one,  dT_array, lddt,
                                 dV_array, lddv,
                         c_zero, dworkvt_array, ldwvt,
                         batchCount, queue );

            // C = C - W W2 = C - C V T V^H = C (I - V T V^H) = C H
            magma_dgemm_batched( MagmaNoTrans, MagmaNoTrans,
                         m, n, k,
                         c_neg_one, dwork_array,   ldw,
                                    dworkvt_array, ldwvt,
                         c_one,     dC_array,      lddc,
                         batchCount, queue );
        }
    }

    return info;
} /* magma_dlarfb */
コード例 #3
0
ファイル: zssssm_gpu.cpp プロジェクト: cjy7117/FT-MAGMA
/**
    Purpose
    -------
    ZSSSSM applies the LU factorization update from a complex
    matrix formed by a lower triangular IB-by-K tile L1 on top of a
    M2-by-K tile L2 to a second complex matrix formed by a M1-by-N1
    tile A1 on top of a M2-by-N2 tile A2 (N1 == N2).

    This is the right-looking Level 2.5 BLAS version of the algorithm.

    Arguments
    ---------
    @param[in]
    m1      INTEGER
            The number of rows of the matrix A1.  M1 >= 0.

    @param[in]
    n1      INTEGER
            The number of columns of the matrix A1.  N1 >= 0.

    @param[in]
    m2      INTEGER
            The number of rows of the matrix A2.  M2 >= 0.

    @param[in]
    n2      INTEGER
            The number of columns of the matrix A2.  N2 >= 0.

    @param[in]
    k       INTEGER
            The number of columns of the matrix L1 and L2.  K >= 0.

    @param[in]
    ib      INTEGER
            The inner-blocking size.  IB >= 0.

    @param[in,out]
    dA1     COMPLEX_16 array, dimension(LDDA1, N), on gpu.
            On entry, the M1-by-N1 tile dA1.
            On exit, dA1 is updated by the application of dL (dL1 dL2).

    @param[in]
    ldda1   INTEGER
            The leading dimension of the array dA1.  LDDA1 >= max(1,M1).

    @param[in,out]
    dA2     COMPLEX_16 array, dimension(LDDA2, N), on gpu.
            On entry, the M2-by-N2 tile dA2.
            On exit, dA2 is updated by the application of dL (dL1 dL2).

    @param[in]
    ldda2   INTEGER
            The leading dimension of the array dA2.  LDDA2 >= max(1,M2).

    @param[in]
    dL1     COMPLEX_16 array, dimension(LDDL1, K), on gpu.
            The inverse of the IB-by-K lower triangular tile as returned by
            ZTSTRF.

    @param[in]
    lddl1   INTEGER
            The leading dimension of the array L1.  LDDL1 >= max(1,2*IB).

    @param[in]
    dL2     COMPLEX_16 array, dimension(LDDL2, K)
            The M2-by-K tile as returned by ZTSTRF.

    @param[in]
    lddl2   INTEGER
            The leading dimension of the array L2.  LDDL2 >= max(1,M2).

    @param[in]
    ipiv    INTEGER array on the cpu.
            The pivot indices array of size K as returned by ZTSTRF

    @ingroup magma_zgesv_tile
    ********************************************************************/
extern "C" magma_int_t
magma_zssssm_gpu(
    magma_order_t order, magma_int_t m1, magma_int_t n1,
    magma_int_t m2, magma_int_t n2, magma_int_t k, magma_int_t ib,
    magmaDoubleComplex_ptr dA1, magma_int_t ldda1,
    magmaDoubleComplex_ptr dA2, magma_int_t ldda2,
    magmaDoubleComplex_ptr dL1, magma_int_t lddl1,
    magmaDoubleComplex_ptr dL2, magma_int_t lddl2,
    magma_int_t *ipiv,
    magma_int_t *info)
{
#define A1T(i,j) (dA1T + (i)*ldda1 + (j))
#define A2T(i,j) (dA2T + (i)*ldda2 + (j))
#define L1(i)    (dL1  + (i)*lddl1      )
#define L2(i,j)  (dL2  + (i)*lddl2i + (j)*lddl2j)

    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;

    int ip, ii, sb;
    magmaDoubleComplex_ptr dA1T, dA2T;
    magma_trans_t transL;
    int lddl2i, lddl2j;
    
    MAGMA_UNUSED( ip );  // used only if NOSWAPBLK

    /* Check input arguments */
    *info = 0;
    if (m1 < 0) {
        *info = -1;
    }
    else if (n1 < 0) {
        *info = -2;
    }
    else if (m2 < 0) {
        *info = -3;
    }
    else if (n2 < 0) {
        *info = -4;
    }
    else if (k < 0) {
        *info = -5;
    }
    else if (ib < 0) {
        *info = -6;
    }
    else if (ldda1 < max(1,m1)) {
        *info = -8;
    }
    else if (ldda2 < max(1,m2)) {
        *info = -10;
    }
    else if (lddl1 < max(1,ib)) {
        *info = -12;
    }
    else if (lddl2 < max(1,m2)) {
        *info = -14;
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return */
    if ((m1 == 0) || (n1 == 0) || (m2 == 0) || (n2 == 0) || (k == 0) || (ib == 0))
        return *info;

    if ( order == MagmaColMajor ) {
        magmablas_zgetmo_in( dA1, dA1T, ldda1, m1, n1 );
        magmablas_zgetmo_in( dA2, dA2T, ldda2, m2, n2 );
        transL = MagmaTrans;
        lddl2i = 1; lddl2j = lddl2;
    } else {
        dA1T = dA1;
        dA2T = dA2;
        transL = MagmaNoTrans;
        lddl2i = lddl2; lddl2j = 1;
    }

    ip = 0;
    for( ii=0; ii < k; ii += ib ) {
        sb = min( k-ii, ib);

#ifndef NOSWAPBLK
        magmablas_zswapblk( MagmaRowMajor, n1,
                            A1T(0, 0), ldda1,
                            A2T(0, 0), ldda2,
                            ii+1, ii+ib, ipiv, 1, m1 );
#else
        {
            int im;
            for (i=0; i < ib; i++) {
                im = ipiv[ip]-1;

                if (im != (ii+i)) {
                    im = im - m1;

                    assert( (im >= 0) && (im < m1) && (im < m2) );
                    magmablas_zswap( n1, A1T(ii+i, 0), 1, A2T(im, 0), 1 );
                }
                ip++;
            }
        }
#endif

#ifndef WITHOUTTRTRI
        /* Lower, Trans, because L1 is not transposed */
        magma_ztrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                     n1, sb,
                     c_one, L1( ii),    lddl1,
                            A1T(ii, 0), ldda1);
#else
        /* Lower, Trans, because L1 is not transposed */
        magma_ztrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                     n1, sb,
                     c_one, L1( ii),    lddl1,
                            A1T(ii, 0), ldda1);
#endif

        /* Second parameter is trans because L2 is not transposed */
        magma_zgemm( MagmaNoTrans, transL,
                     n2, m2, sb,
                     c_neg_one, A1T(ii, 0), ldda1,
                                L2( 0, ii), lddl2,
                     c_one,     A2T(0, 0 ), ldda2 );
    }

    if ( order == MagmaColMajor ) {
        magmablas_zgetmo_out( dA1, dA1T, ldda1, m1, n1 );
        magmablas_zgetmo_out( dA2, dA2T, ldda2, m2, n2 );
    }
    return *info;
}
コード例 #4
0
ファイル: dormbr.cpp プロジェクト: cjy7117/FT-MAGMA
/**
    Purpose
    -------
    If VECT = MagmaQ, DORMBR overwrites the general real M-by-N matrix C with
                                 SIDE = MagmaLeft     SIDE = MagmaRight
    TRANS = MagmaNoTrans:        Q*C                  C*Q
    TRANS = MagmaTrans:     Q**H*C               C*Q**H
    
    If VECT = MagmaP, DORMBR overwrites the general real M-by-N matrix C with
                                 SIDE = MagmaLeft     SIDE = MagmaRight
    TRANS = MagmaNoTrans:        P*C                  C*P
    TRANS = MagmaTrans:     P**H*C               C*P**H
    
    Here Q and P**H are the unitary matrices determined by DGEBRD when
    reducing A real matrix A to bidiagonal form: A = Q*B * P**H. Q
    and P**H are defined as products of elementary reflectors H(i) and
    G(i) respectively.
    
    Let nq = m if SIDE = MagmaLeft and nq = n if SIDE = MagmaRight. Thus nq is the
    order of the unitary matrix Q or P**H that is applied.
    
    If VECT = MagmaQ, A is assumed to have been an NQ-by-K matrix:
    if nq >= k, Q = H(1) H(2) . . . H(k);
    if nq <  k, Q = H(1) H(2) . . . H(nq-1).
    
    If VECT = MagmaP, A is assumed to have been A K-by-NQ matrix:
    if k <  nq, P = G(1) G(2) . . . G(k);
    if k >= nq, P = G(1) G(2) . . . G(nq-1).
    
    Arguments
    ---------
    @param[in]
    vect    magma_vect_t
      -     = MagmaQ: apply Q or Q**H;
      -     = MagmaP: apply P or P**H.
    
    @param[in]
    side    magma_side_t
      -     = MagmaLeft:  apply Q, Q**H, P or P**H from the Left;
      -     = MagmaRight: apply Q, Q**H, P or P**H from the Right.
    
    @param[in]
    trans   magma_trans_t
      -     = MagmaNoTrans:    No transpose, apply Q or P;
      -     = MagmaTrans: Conjugate transpose, apply Q**H or P**H.
    
    @param[in]
    m       INTEGER
            The number of rows of the matrix C. M >= 0.
    
    @param[in]
    n       INTEGER
            The number of columns of the matrix C. N >= 0.
    
    @param[in]
    k       INTEGER
            If VECT = MagmaQ, the number of columns in the original
            matrix reduced by DGEBRD.
            If VECT = MagmaP, the number of rows in the original
            matrix reduced by DGEBRD.
            K >= 0.
    
    @param[in]
    A       DOUBLE_PRECISION array, dimension
                                  (LDA,min(nq,K)) if VECT = MagmaQ
                                  (LDA,nq)        if VECT = MagmaP
            The vectors which define the elementary reflectors H(i) and
            G(i), whose products determine the matrices Q and P, as
            returned by DGEBRD.
    
    @param[in]
    lda     INTEGER
            The leading dimension of the array A.
            If VECT = MagmaQ, LDA >= max(1,nq);
            if VECT = MagmaP, LDA >= max(1,min(nq,K)).
    
    @param[in]
    tau     DOUBLE_PRECISION array, dimension (min(nq,K))
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i) or G(i) which determines Q or P, as returned
            by DGEBRD in the array argument TAUQ or TAUP.
    
    @param[in,out]
    C       DOUBLE_PRECISION array, dimension (LDC,N)
            On entry, the M-by-N matrix C.
            On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q
            or P*C or P**H*C or C*P or C*P**H.
    
    @param[in]
    ldc     INTEGER
            The leading dimension of the array C. LDC >= max(1,M).
    
    @param[out]
    work    (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.
    
    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.
            If SIDE = MagmaLeft,  LWORK >= max(1,N);
            if SIDE = MagmaRight, LWORK >= max(1,M);
            if N = 0 or M = 0, LWORK >= 1.
            For optimum performance
            if SIDE = MagmaLeft,  LWORK >= max(1,N*NB);
            if SIDE = MagmaRight, LWORK >= max(1,M*NB),
            where NB is the optimal blocksize. (NB = 0 if M = 0 or N = 0.)
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.
    
    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
    

    @ingroup magma_dgesvd_comp
    ********************************************************************/
extern "C" magma_int_t
magma_dormbr(
    magma_vect_t vect, magma_side_t side, magma_trans_t trans,
    magma_int_t m, magma_int_t n, magma_int_t k,
    double *A, magma_int_t lda,
    double *tau,
    double *C, magma_int_t ldc,
    double *work, magma_int_t lwork,
    magma_int_t *info)
{
    #define A(i,j)  (A + (i) + (j)*lda)
    #define C(i,j)  (C + (i) + (j)*ldc)
            
    magma_int_t i1, i2, nb, mi, ni, nq, nq_1, nw, iinfo, lwkopt;
    magma_int_t left, notran, applyq, lquery;
    magma_trans_t transt;
    
    MAGMA_UNUSED( nq_1 );  // used only in version 1

    *info = 0;
    applyq = (vect  == MagmaQ);
    left   = (side  == MagmaLeft);
    notran = (trans == MagmaNoTrans);
    lquery = (lwork == -1);

    /* NQ is the order of Q or P and NW is the minimum dimension of WORK */
    if (left) {
        nq = m;
        nw = n;
    }
    else {
        nq = n;
        nw = m;
    }
    if (m == 0 || n == 0) {
        nw = 0;
    }
    
    /* check arguments */
    if (! applyq && vect != MagmaP) {
        *info = -1;
    }
    else if (! left && side != MagmaRight) {
        *info = -2;
    }
    else if (! notran && trans != MagmaTrans) {
        *info = -3;
    }
    else if (m < 0) {
        *info = -4;
    }
    else if (n < 0) {
        *info = -5;
    }
    else if (k < 0) {
        *info = -6;
    }
    else if ( (   applyq && lda < max(1,nq)        ) ||
              ( ! applyq && lda < max(1,min(nq,k)) ) ) {
        *info = -8;
    }
    else if (ldc < max(1,m)) {
        *info = -11;
    }
    else if (lwork < max(1,nw) && ! lquery) {
        *info = -13;
    }

    if (*info == 0) {
        if (nw > 0) {
            // TODO have get_dormqr_nb and get_dormlq_nb routines? see original LAPACK dormbr.
            // TODO make them dependent on m, n, and k?
            nb = magma_get_dgebrd_nb( min( m, n ));
            lwkopt = max(1, nw*nb);
        }
        else {
            lwkopt = 1;
        }
        work[0] = MAGMA_D_MAKE( lwkopt, 0 );
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0) {
        return *info;
    }

    if (applyq) {
        /* Apply Q */
        if (nq >= k) {
            /* Q was determined by a call to DGEBRD with nq >= k */
            #if VERSION == 1
            lapackf77_dormqr( lapack_side_const(side), lapack_trans_const(trans),
                              &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, &iinfo);
            #else
            magma_dormqr( side, trans,
                          m, n, k, A, lda, tau, C, ldc, work, lwork, &iinfo);
            #endif
        }
        else if (nq > 1) {
            /* Q was determined by a call to DGEBRD with nq < k */
            if (left) {
                mi = m - 1;
                ni = n;
                i1 = 1;
                i2 = 0;
            }
            else {
                mi = m;
                ni = n - 1;
                i1 = 0;
                i2 = 1;
            }
            #if VERSION == 1
            nq_1 = nq - 1;
            lapackf77_dormqr( lapack_side_const(side), lapack_trans_const(trans),
                              &mi, &ni, &nq_1, A(1,0), &lda, tau, C(i1,i2), &ldc, work, &lwork, &iinfo);
            #else
            magma_dormqr( side, trans,
                          mi, ni, nq-1, A(1,0), lda, tau, C(i1,i2), ldc, work, lwork, &iinfo);
            #endif
        }
    }
    else {
        /* Apply P */
        if (notran) {
            transt = MagmaTrans;
        }
        else {
            transt = MagmaNoTrans;
        }
        if (nq > k) {
            /* P was determined by a call to DGEBRD with nq > k */
            #if VERSION == 1
            lapackf77_dormlq( lapack_side_const(side), lapack_trans_const(transt),
                              &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, &iinfo);
            #else
            magma_dormlq( side, transt,
                          m, n, k, A, lda, tau, C, ldc, work, lwork, &iinfo);
            #endif
        }
        else if (nq > 1) {
            /* P was determined by a call to DGEBRD with nq <= k */
            if (left) {
                mi = m - 1;
                ni = n;
                i1 = 1;
                i2 = 0;
            }
            else {
                mi = m;
                ni = n - 1;
                i1 = 0;
                i2 = 1;
            }
            #if VERSION == 1
            nq_1 = nq - 1;
            lapackf77_dormlq( lapack_side_const(side), lapack_trans_const(transt),
                              &mi, &ni, &nq_1, A(0,1), &lda, tau, C(i1,i2), &ldc, work, &lwork, &iinfo);
            #else
            magma_dormlq( side, transt,
                          mi, ni, nq-1, A(0,1), lda, tau, C(i1,i2), ldc, work, lwork, &iinfo);
            #endif
        }
    }
    work[0] = MAGMA_D_MAKE( lwkopt, 0 );
    return *info;
} /* magma_dormbr */
コード例 #5
0
ファイル: sgelqf_gpu.cpp プロジェクト: maxhutch/magma
/***************************************************************************//**
    Purpose
    -------
    SGELQF computes an LQ factorization of a REAL M-by-N matrix dA:
    dA = L * Q.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    dA      REAL array on the GPU, dimension (LDDA,N)
            On entry, the M-by-N matrix dA.
            On exit, the elements on and below the diagonal of the array
            contain the m-by-min(m,n) lower trapezoidal matrix L (L is
            lower triangular if m <= n); the elements above the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of elementary reflectors (see Further Details).

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).

    @param[out]
    tau     REAL array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    @param[out]
    work    (workspace) REAL array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.
    \n
            Higher performance is achieved if WORK is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.  LWORK >= max(1,M).
            For optimum performance LWORK >= M*NB, where NB is the
            optimal blocksize.
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.

    Further Details
    ---------------
    The matrix Q is represented as a product of elementary reflectors

       Q = H(k) . . . H(2) H(1), where k = min(m,n).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:n) is stored on exit in A(i,i+1:n),
    and tau in TAU(i).

    @ingroup magma_gelqf
*******************************************************************************/
extern "C" magma_int_t
magma_sgelqf_gpu(
    magma_int_t m, magma_int_t n,
    magmaFloat_ptr dA, magma_int_t ldda,
    float *tau,
    float *work, magma_int_t lwork,
    magma_int_t *info)
{
    /* Constants */
    const float c_one = MAGMA_S_ONE;
    const magma_int_t ione = 1;
    MAGMA_UNUSED( ione );  // used only for real

    /* Local variables */
    magmaFloat_ptr dAT=NULL;
    magma_int_t min_mn, maxm, maxn, nb;
    magma_int_t iinfo;

    *info = 0;
    nb = magma_get_sgelqf_nb( m, n );
    min_mn = min( m, n );

    work[0] = magma_smake_lwork( m*nb );
    bool lquery = (lwork == -1);
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,m)) {
        *info = -4;
    } else if (lwork < max(1,m) && ! lquery) {
        *info = -7;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /*  Quick return if possible */
    if (min_mn == 0) {
        work[0] = c_one;
        return *info;
    }

    maxm = magma_roundup( m, 32 );
    maxn = magma_roundup( n, 32 );

    magma_int_t lddat = maxn;

    magma_queue_t queue;
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );
    
    if ( m == n ) {
        dAT = dA;
        lddat = ldda;
        magmablas_stranspose_inplace( m, dAT, ldda, queue );
    }
    else {
        if (MAGMA_SUCCESS != magma_smalloc( &dAT, maxm*maxn ) ) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            goto cleanup;
        }
        
        magmablas_stranspose( m, n, dA, ldda, dAT, lddat, queue );
    }
    
    magma_sgeqrf2_gpu( n, m, dAT, lddat, tau, &iinfo );
    assert( iinfo >= 0 );
    if ( iinfo > 0 ) {
        *info = iinfo;
    }
    
    // conjugate tau
    #ifdef COMPLEX
    lapackf77_slacgv( &min_mn, tau, &ione );
    #endif
    
    if ( m == n ) {
        magmablas_stranspose_inplace( m, dAT, lddat, queue );
    }
    else {
        magmablas_stranspose( n, m, dAT, lddat, dA, ldda, queue );
        magma_free( dAT );
    }

cleanup:
    magma_queue_destroy( queue );
    
    return *info;
} /* magma_sgelqf_gpu */
コード例 #6
0
ファイル: clarfb_gpu_gemm.cpp プロジェクト: cjy7117/FT-MAGMA
/**
    Purpose
    -------
    CLARFB applies a complex block reflector H or its transpose H^H to a
    COMPLEX m by n matrix C, from the left.
    
    __Note that this function assumes__ that the upper part of dV is 0
    because it is referenced. Same for upper/lower part of dT.

    Arguments
    ---------
    @param[in]
    side    magma_side_t
      -     = MagmaLeft:      apply H or H^H from the Left
      -     = MagmaRight:     apply H or H^H from the Right

    @param[in]
    trans   magma_trans_t
      -     = MagmaNoTrans:    apply H   (No transpose)
      -     = Magma_ConjTrans: apply H^H (Conjugate transpose)

    @param[in]
    direct  magma_direct_t
            Indicates how H is formed from a product of elementary
            reflectors
      -     = MagmaForward:  H = H(1) H(2) . . . H(k) (Forward)
      -     = MagmaBackward: H = H(k) . . . H(2) H(1) (Backward)

    @param[in]
    storev  magma_storev_t
            Indicates how the vectors which define the elementary
            reflectors are stored:
      -     = MagmaColumnwise: Columnwise
      -     = MagmaRowwise:    Rowwise

    @param[in]
    m       INTEGER
            The number of rows of the matrix C.

    @param[in]
    n       INTEGER
            The number of columns of the matrix C.

    @param[in]
    k       INTEGER
            The order of the matrix T (= the number of elementary
            reflectors whose product defines the block reflector).

    @param[in]
    dV      COMPLEX array on the GPU, dimension
                (LDDV,K) if STOREV = MagmaColumnwise
                (LDDV,M) if STOREV = MagmaRowwise and SIDE = MagmaLeft
                (LDDV,N) if STOREV = MagmaRowwise and SIDE = MagmaRight
            The matrix V. See further details.

    @param[in]
    lddv    INTEGER
            The leading dimension of the array V.
            If STOREV = MagmaColumnwise and SIDE = MagmaLeft, LDDV >= max(1,M);
            if STOREV = MagmaColumnwise and SIDE = MagmaRight, LDDV >= max(1,N);
            if STOREV = MagmaRowwise, LDDV >= K.

    @param[in]
    dT      COMPLEX array on the GPU, dimension (LDDT,K)
            The triangular k by k matrix T in the representation of the
            block reflector.

    @param[in]
    lddt    INTEGER
            The leading dimension of the array T. LDDT >= K.

    @param[in,out]
    dC      COMPLEX array on the GPU, dimension (LDDC,N)
            On entry, the m by n matrix C.
            On exit, C is overwritten by H*C, or H^H*C, or C*H, or C*H^H.

    @param[in]
    lddc    INTEGER
            The leading dimension of the array C. LDA >= max(1,M).

    @param
    dwork   (workspace) COMPLEX array, dimension (LDWORK,K)

    @param[in]
    ldwork  INTEGER
            The leading dimension of the array WORK.
            If SIDE = MagmaLeft,  LDWORK >= max(1,N);
            if SIDE = MagmaRight, LDWORK >= max(1,M);

    @param
    dworkvt (workspace) COMPLEX array, dimension (LDWORKT,K)

    @param[in]
    ldworkvt INTEGER
            The leading dimension of the array WORKVT.
            LDWORKVT >= max(1,min(M,N));

    Further Details
    ---------------
    The shape of the matrix V and the storage of the vectors which define
    the H(i) is best illustrated by the following example with n = 5 and
    k = 3.
    All elements including 0's and 1's are stored, unlike LAPACK.

        DIRECT = MagmaForward and         DIRECT = MagmaForward and
        STOREV = MagmaColumnwise:         STOREV = MagmaRowwise:

                 V = (  1  0  0 )                 V = (  1 v1 v1 v1 v1 )
                     ( v1  1  0 )                     (  0  1 v2 v2 v2 )
                     ( v1 v2  1 )                     (  0  0  1 v3 v3 )
                     ( v1 v2 v3 )
                     ( v1 v2 v3 )

        DIRECT = MagmaBackward and        DIRECT = MagmaBackward and
        STOREV = MagmaColumnwise:         STOREV = MagmaRowwise:

                 V = ( v1 v2 v3 )                 V = ( v1 v1  1  0  0 )
                     ( v1 v2 v3 )                     ( v2 v2 v2  1  0 )
                     (  1 v2 v3 )                     ( v3 v3 v3 v3  1 )
                     (  0  1 v3 )
                     (  0  0  1 )

    @ingroup magma_caux3
    ********************************************************************/
extern "C" magma_int_t
magma_clarfb_gpu_gemm(
    magma_side_t side, magma_trans_t trans, magma_direct_t direct, magma_storev_t storev,
    magma_int_t m, magma_int_t n, magma_int_t k,
    magmaFloatComplex_const_ptr dV,    magma_int_t lddv,
    magmaFloatComplex_const_ptr dT,    magma_int_t lddt,
    magmaFloatComplex_ptr dC,          magma_int_t lddc,
    magmaFloatComplex_ptr dwork,       magma_int_t ldwork,
    magmaFloatComplex_ptr dworkvt,     magma_int_t ldworkvt)
{
    magmaFloatComplex c_zero    = MAGMA_C_ZERO;
    magmaFloatComplex c_one     = MAGMA_C_ONE;
    magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;

    magma_int_t info = 0;
    
    /* Function Body */
    if (m <= 0 || n <= 0) {
        return info;
    }
    
    // internal variable
    magma_int_t ldwvt = (m > n ?  k : m);
    magma_int_t ldw;
    if ( side == MagmaLeft ) {
        ldw = k;
    } else {
        ldw = m;
    }
    
    // opposite of trans
    magma_trans_t transt;
    if (trans == MagmaNoTrans)
        transt = Magma_ConjTrans;
    else
        transt = MagmaNoTrans;
    
    MAGMA_UNUSED( transt );  // TODO: is this a bug that it isn't used?
    
    // whether V is stored transposed or not
    magma_trans_t notransV, transV;
    if (storev == MagmaColumnwise) {
        notransV = MagmaNoTrans;
        transV   = Magma_ConjTrans;
    }
    else {
        notransV = Magma_ConjTrans;
        transV   = MagmaNoTrans;
    }

    if ( side == MagmaLeft ) {
        // Form H C or H^H C
        // Comments assume H C.
        // When forming H^H C, T gets transposed via transt for m >= n or by trans for m < n.
        
        // W = V^H C
        magma_cgemm( Magma_ConjTrans, notransV,
                     k, n, m,
                     c_one,  dV,    lddv,
                             dC,    lddc,
                     c_zero, dwork, ldw);

        if (m <= n) {
            // W2 = V T
            magma_cgemm( notransV, trans,
                         m, k, k,
                         c_one,  dV, lddv,
                                 dT, lddt,
                         c_zero, dworkvt, ldwvt);
            // C = C - W2 W = C - V T V^H C = (I - V T V^H) C = H C
            magma_cgemm( MagmaNoTrans, MagmaNoTrans,
                         m, n, k,
                         c_neg_one, dworkvt,  ldwvt,
                                    dwork,    ldw,
                         c_one,     dC,       lddc);
        } else {
            // W2 = T W  = T  V^H C
            magma_cgemm( trans, MagmaNoTrans,
                         k, n, k,
                         c_one,  dT, lddt,
                                 dwork, ldw,
                         c_zero, dworkvt, ldwvt);
            // C = C - V W2 = C - V T V^H C = (I - V T V^H) C = H C
            magma_cgemm( notransV, MagmaNoTrans,
                         m, n, k,
                         c_neg_one, dV,  lddv,
                                    dworkvt,  ldwvt,
                         c_one,     dC,       lddc);
        }
    }
    else {
        // Form C H or C H^H
        // Comments assume C H.
        // When forming C H^H, T gets transposed via trans.
        
        // W = C V
        magma_cgemm( MagmaNoTrans, notransV,
                     m, k, n,
                     c_one,  dC,    lddc,
                             dV,    lddv,
                     c_zero, dwork, ldw);
        if (m <= n) {
            // W2 = W T = C V T
            magma_cgemm( MagmaNoTrans, trans,
                         m, k, k,
                         c_one,  dwork, ldw,
                                 dT, lddt,
                         c_zero, dworkvt, ldwvt);
            // C = C - W2 V^H = C - C V T V^H = C (I - V T V^H) = C H
            magma_cgemm( MagmaNoTrans, transV,
                         m, n, k,
                         c_neg_one, dworkvt, ldwvt,
                                    dV,    lddv,
                         c_one,     dC,    lddc);
        } else {
            // W2 = T V^H
            magma_cgemm( trans, transV,
                         k, n, k,
                         c_one,  dT, lddt,
                                 dV, lddv,
                         c_zero, dworkvt, ldwvt);
            // C = C - W W2 = C - C V T V^H = C (I - V T V^H) = C H
            magma_cgemm( MagmaNoTrans, MagmaNoTrans,
                         m, n, k,
                         c_neg_one, dwork,   ldw,
                                    dworkvt, ldwvt,
                         c_one,     dC,      lddc);
        }
    }

    return info;
} /* magma_clarfb */
コード例 #7
0
ファイル: dgelqf_gpu.cpp プロジェクト: cjy7117/FT-MAGMA
/**
    Purpose
    -------
    DGELQF computes an LQ factorization of a DOUBLE_PRECISION M-by-N matrix dA:
    dA = L * Q.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    dA      DOUBLE_PRECISION array on the GPU, dimension (LDDA,N)
            On entry, the M-by-N matrix dA.
            On exit, the elements on and below the diagonal of the array
            contain the m-by-min(m,n) lower trapezoidal matrix L (L is
            lower triangular if m <= n); the elements above the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of elementary reflectors (see Further Details).

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).

    @param[out]
    tau     DOUBLE_PRECISION array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    @param[out]
    work    (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.
    \n
            Higher performance is achieved if WORK is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.  LWORK >= max(1,M).
            For optimum performance LWORK >= M*NB, where NB is the
            optimal blocksize.
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.

    Further Details
    ---------------
    The matrix Q is represented as a product of elementary reflectors

       Q = H(k) . . . H(2) H(1), where k = min(m,n).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:n) is stored on exit in A(i,i+1:n),
    and tau in TAU(i).

    @ingroup magma_dgelqf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_dgelqf_gpu(
    magma_int_t m, magma_int_t n,
    magmaDouble_ptr dA, magma_int_t ldda,
    double *tau,
    double *work, magma_int_t lwork,
    magma_int_t *info)
{
    const double c_one = MAGMA_D_ONE;
    const magma_int_t        ione  = 1;
    MAGMA_UNUSED( ione );  // used only for real

    double *dAT;
    magma_int_t min_mn, maxm, maxn, nb;
    magma_int_t iinfo;
    int lquery;

    *info = 0;
    nb = magma_get_dgelqf_nb(m);
    min_mn = min(m,n);

    work[0] = MAGMA_D_MAKE( (double)(m*nb), 0 );
    lquery = (lwork == -1);
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,m)) {
        *info = -4;
    } else if (lwork < max(1,m) && ! lquery) {
        *info = -7;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /*  Quick return if possible */
    if (min_mn == 0) {
        work[0] = c_one;
        return *info;
    }

    maxm = ((m + 31)/32)*32;
    maxn = ((n + 31)/32)*32;

    magma_int_t lddat = maxn;

    dAT = dA;
    
    if ( m == n ) {
        lddat = ldda;
        magmablas_dtranspose_inplace( m, dAT, ldda );
    }
    else {
        if (MAGMA_SUCCESS != magma_dmalloc( &dAT, maxm*maxn ) ) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
        
        magmablas_dtranspose( m, n, dA, ldda, dAT, lddat );
    }
    
    magma_dgeqrf2_gpu( n, m, dAT, lddat, tau, &iinfo );
    assert( iinfo >= 0 );
    if ( iinfo > 0 ) {
        *info = iinfo;
    }
    
    // conjugate tau
    #ifdef COMPLEX
    lapackf77_dlacgv( &min_mn, tau, &ione );
    #endif
    
    if ( m == n ) {
        magmablas_dtranspose_inplace( m, dAT, lddat );
    }
    else {
        magmablas_dtranspose( n, m, dAT, lddat, dA, ldda );
        magma_free( dAT );
    }

    return *info;
} /* magma_dgelqf_gpu */
コード例 #8
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing cgesdd (SVD with Divide & Conquer)
      Please keep code in testing_cgesdd.cpp and testing_cgesvd.cpp similar.
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    real_Double_t   gpu_time, cpu_time;
    magmaFloatComplex *h_A, *h_R, *U, *VT, *h_work;
    magmaFloatComplex dummy[1];
    float *S1, *S2;
    #ifdef COMPLEX
    magma_int_t lrwork=0;
    float *rwork;
    #endif
    magma_int_t *iwork;
    
    magma_int_t M, N, N_U, M_VT, lda, ldu, ldv, n2, min_mn, max_mn, info, nb, lwork;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_vec_t jobz;
    magma_int_t status = 0;

    MAGMA_UNUSED( max_mn );  // used only in complex
    
    magma_opts opts;
    opts.parse_opts( argc, argv );
    
    float tol = opts.tolerance * lapackf77_slamch("E");
    
    jobz = opts.jobu;
    
    magma_vec_t jobs[] = { MagmaNoVec, MagmaSomeVec, MagmaOverwriteVec, MagmaAllVec };
    
    if ( opts.check && ! opts.all && (jobz == MagmaNoVec)) {
        printf( "%% NOTE: some checks require that singular vectors are computed;\n"
                "%%       set jobz (option -U[NASO]) to be S, O, or A.\n\n" );
    }
    printf("%% jobz   M     N  CPU time (sec)  GPU time (sec)   |S1-S2|   |A-USV^H|   |I-UU^H|/M   |I-VV^H|/N   S sorted\n");
    printf("%%==========================================================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int ijobz = 0; ijobz < 4; ++ijobz ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            if ( opts.all ) {
                jobz = jobs[ ijobz ];
            }
            else if ( ijobz > 0 ) {
                // if not testing all, run only once, when ijobz = 0,
                // but jobz come from opts (above loops).
                continue;
            }
            
            M = opts.msize[itest];
            N = opts.nsize[itest];
            min_mn = min(M, N);
            max_mn = max(M, N);
            N_U  = (jobz == MagmaAllVec ? M : min_mn);
            M_VT = (jobz == MagmaAllVec ? N : min_mn);
            lda = M;
            ldu = M;
            ldv = M_VT;
            n2 = lda*N;
            nb = magma_get_cgesvd_nb( M, N );
            
            // x and y abbreviations used in cgesdd and dgesdd documentation
            magma_int_t x = max(M,N);
            magma_int_t y = min(M,N);
            #ifdef COMPLEX
            bool tall = (x >= int(y*17/9.));  // true if tall (m >> n) or wide (n >> m)
            #else
            bool tall = (x >= int(y*11/6.));  // true if tall (m >> n) or wide (n >> m)
            #endif
            
            // query or use formula for workspace size
            switch( opts.svd_work ) {
                case 0: {  // query for workspace size
                    lwork = -1;
                    magma_cgesdd( jobz, M, N,
                                  NULL, lda, NULL, NULL, ldu, NULL, ldv, dummy, lwork,
                                  #ifdef COMPLEX
                                  NULL,
                                  #endif
                                  NULL, &info );
                    lwork = (int) MAGMA_C_REAL( dummy[0] );
                    break;
                }
                
                case 1:    // minimum
                case 2:    // optimal
                case 3: {  // optimal (for gesdd, 2 & 3 are same; for gesvd, they differ)
                    // formulas from cgesdd and dgesdd documentation
                    bool sml = (opts.svd_work == 1);  // 1 is small workspace, 2,3 are large workspace
                    
                    #ifdef COMPLEX  // ----------------------------------------
                        if (jobz == MagmaNoVec) {
                            if (tall)    { lwork = 2*y + (2*y)*nb; }
                            else         { lwork = 2*y + (x+y)*nb; }
                        }
                        if (jobz == MagmaOverwriteVec) {
                            if (tall)    {
                                if (sml) { lwork = 2*y*y     + 2*y + (2*y)*nb; }
                                else     { lwork = y*y + x*y + 2*y + (2*y)*nb; }  // not big deal
                            }
                            else         {
                                //if (sml) { lwork = 2*y + max( (x+y)*nb, y*y + y    ); }
                                //else     { lwork = 2*y + max( (x+y)*nb, x*y + y*nb ); }
                                // LAPACK 3.4.2 over-estimates workspaces. For compatability, use these:
                                if (sml) { lwork = 2*y + max( (x+y)*nb, y*y + x ); }
                                else     { lwork = 2*y +      (x+y)*nb + x*y; }
                            }
                        }
                        if (jobz == MagmaSomeVec) {
                            if (tall)    { lwork = y*y + 2*y + (2*y)*nb; }
                            else         { lwork =       2*y + (x+y)*nb; }
                        }
                        if (jobz == MagmaAllVec) {
                            if (tall) {
                                if (sml) { lwork = y*y + 2*y + max( (2*y)*nb, x    ); }
                                else     { lwork = y*y + 2*y + max( (2*y)*nb, x*nb ); }
                            }
                            else         { lwork =       2*y +      (x+y)*nb; }
                        }
                    #else // REAL ----------------------------------------
                        if (jobz == MagmaNoVec) {
                            if (tall)    { lwork =       3*y + max( (2*y)*nb, 7*y ); }
                            else         { lwork =       3*y + max( (x+y)*nb, 7*y ); }
                        }
                        if (jobz == MagmaOverwriteVec) {
                            if (tall)    {
                                if (sml) { lwork = y*y + 3*y +      max( (2*y)*nb, 4*y*y + 4*y ); }
                                else     { lwork = y*y + 3*y + max( max( (2*y)*nb, 4*y*y + 4*y ), y*y + y*nb ); }
                            }
                            else         {
                                if (sml) { lwork =       3*y + max( (x+y)*nb, 4*y*y + 4*y ); }
                                else     { lwork =       3*y + max( (x+y)*nb, 3*y*y + 4*y + x*y ); }  // extra space not too important?
                            }
                        }
                        if (jobz == MagmaSomeVec) {
                            if (tall)    { lwork = y*y + 3*y + max( (2*y)*nb, 3*y*y + 4*y ); }
                            else         { lwork =       3*y + max( (x+y)*nb, 3*y*y + 4*y ); }
                        }
                        if (jobz == MagmaAllVec) {
                            if (tall) {
                                if (sml) { lwork = y*y + max( 3*y + max( (2*y)*nb, 3*y*y + 4*y ), y + x    ); }
                                else     { lwork = y*y + max( 3*y + max( (2*y)*nb, 3*y*y + 4*y ), y + x*nb ); }
                                // LAPACK 3.4.2 over-estimates workspaces. For compatability, use these:
                                //if (sml) { lwork = y*y +      3*y + max( (2*y)*nb,      3*y*y + 3*y + x ); }
                                //else     { lwork = y*y + max( 3*y + max( (2*y)*nb, max( 3*y*y + 3*y + x, 3*y*y + 4*y )), y + x*nb ); }
                            }
                            else         { lwork =            3*y + max( (x+y)*nb, 3*y*y + 4*y ); }
                        }
                    #endif
                    break;
                }
                
                default: {
                    fprintf( stderr, "Error: unknown option svd_work %d\n", (int) opts.svd_work );
                    return -1;
                    break;
                }
            }
            
            TESTING_MALLOC_CPU( h_A,   magmaFloatComplex, lda*N );
            TESTING_MALLOC_CPU( VT,    magmaFloatComplex, ldv*N );   // N x N (jobz=A) or min(M,N) x N
            TESTING_MALLOC_CPU( U,     magmaFloatComplex, ldu*N_U ); // M x M (jobz=A) or M x min(M,N)
            TESTING_MALLOC_CPU( S1,    float, min_mn );
            TESTING_MALLOC_CPU( S2,    float, min_mn );
            TESTING_MALLOC_CPU( iwork, magma_int_t, 8*min_mn );
            
            TESTING_MALLOC_PIN( h_R,    magmaFloatComplex, lda*N );
            TESTING_MALLOC_PIN( h_work, magmaFloatComplex, lwork );
            
            #ifdef COMPLEX
                if (jobz == MagmaNoVec) {
                    // requires  5*min_mn, but MKL (11.1) seems to have a bug
                    // requiring 7*min_mn in some cases (e.g., jobz=N, m=100, n=170)
                    lrwork = 7*min_mn;
                }
                else if (tall) {
                    lrwork = 5*min_mn*min_mn + 5*min_mn;
                }
                else {
                    lrwork = max( 5*min_mn*min_mn + 5*min_mn,
                                  2*max_mn*min_mn + 2*min_mn*min_mn + min_mn );
                }
                TESTING_MALLOC_CPU( rwork, float, lrwork );
            #endif
            
            /* Initialize the matrix */
            lapackf77_clarnv( &ione, ISEED, &n2, h_A );
            lapackf77_clacpy( MagmaFullStr, &M, &N, h_A, &lda, h_R, &lda );
            
            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            gpu_time = magma_wtime();
            magma_cgesdd( jobz, M, N,
                          h_R, lda, S1, U, ldu, VT, ldv, h_work, lwork,
                          #ifdef COMPLEX
                          rwork,
                          #endif
                          iwork, &info );
            gpu_time = magma_wtime() - gpu_time;
            if (info != 0) {
                printf("magma_cgesdd returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            }

            float eps = lapackf77_slamch( "E" );
            float result[5] = { -1/eps, -1/eps, -1/eps, -1/eps, -1/eps };
            if ( opts.check ) {
                /* =====================================================================
                   Check the results following the LAPACK's [zcds]drvbd routine.
                   A is factored as A = U diag(S) VT and the following 4 tests computed:
                   (1)    | A - U diag(S) VT | / ( |A| max(M,N) )
                   (2)    | I - U^H U   | / ( M )
                   (3)    | I - VT VT^H | / ( N )
                   (4)    S contains MNMIN nonnegative values in decreasing order.
                          (Return 0 if true, 1/ULP if false.)
                   =================================================================== */
                magma_int_t izero = 0;
                
                // get size and location of U and V^T depending on jobz
                // U2=NULL and VT2=NULL if they were not computed (e.g., jobz=N)
                magmaFloatComplex *U2  = NULL;
                magmaFloatComplex *VT2 = NULL;
                if ( jobz == MagmaSomeVec || jobz == MagmaAllVec ) {
                    U2  = U;
                    VT2 = VT;
                }
                else if ( jobz == MagmaOverwriteVec ) {
                    if ( M >= N ) {
                        U2  = h_R;
                        ldu = lda;
                        VT2 = VT;
                    }
                    else {
                        U2  = U;
                        VT2 = h_R;
                        ldv = lda;
                    }
                }
                
                // cbdt01 needs M+N
                // cunt01 prefers N*(N+1) to check U; M*(M+1) to check V
                magma_int_t lwork_err = M+N;
                if ( U2 != NULL ) {
                    lwork_err = max( lwork_err, N_U*(N_U+1) );
                }
                if ( VT2 != NULL ) {
                    lwork_err = max( lwork_err, M_VT*(M_VT+1) );
                }
                magmaFloatComplex *h_work_err;
                TESTING_MALLOC_CPU( h_work_err, magmaFloatComplex, lwork_err );
                
                // cbdt01 and cunt01 need max(M,N), depending
                float *rwork_err;
                TESTING_MALLOC_CPU( rwork_err, float, max(M,N) );
                
                if ( U2 != NULL && VT2 != NULL ) {
                    // since KD=0 (3rd arg), E is not referenced so pass NULL (9th arg)
                    lapackf77_cbdt01(&M, &N, &izero, h_A, &lda,
                                     U2, &ldu, S1, NULL, VT2, &ldv,
                                     h_work_err,
                                     #ifdef COMPLEX
                                     rwork_err,
                                     #endif
                                     &result[0]);
                }
                if ( U2 != NULL ) {
                    lapackf77_cunt01("Columns", &M,  &N_U, U2,  &ldu, h_work_err, &lwork_err,
                                     #ifdef COMPLEX
                                     rwork_err,
                                     #endif
                                     &result[1]);
                }
                if ( VT2 != NULL ) {
                    lapackf77_cunt01("Rows",    &M_VT, &N, VT2, &ldv, h_work_err, &lwork_err,
                                     #ifdef COMPLEX
                                     rwork_err,
                                     #endif
                                     &result[2]);
                }
                
                result[3] = 0.;
                for (int j=0; j < min_mn-1; j++) {
                    if ( S1[j] < S1[j+1] )
                        result[3] = 1.;
                    if ( S1[j] < 0. )
                        result[3] = 1.;
                }
                if (min_mn > 1 && S1[min_mn-1] < 0.)
                    result[3] = 1.;
                
                result[0] *= eps;
                result[1] *= eps;
                result[2] *= eps;
                
                TESTING_FREE_CPU( h_work_err );
                TESTING_FREE_CPU( rwork_err );
            }
            
            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            if ( opts.lapack ) {
                cpu_time = magma_wtime();
                lapackf77_cgesdd( lapack_vec_const(jobz), &M, &N,
                                  h_A, &lda, S2, U, &ldu, VT, &ldv, h_work, &lwork,
                                  #ifdef COMPLEX
                                  rwork,
                                  #endif
                                  iwork, &info);
                cpu_time = magma_wtime() - cpu_time;
                if (info != 0) {
                    printf("lapackf77_cgesdd returned error %d: %s.\n",
                           (int) info, magma_strerror( info ));
                }
                
                /* =====================================================================
                   Check the result compared to LAPACK
                   =================================================================== */
                float work[1], c_neg_one = -1;
                
                blasf77_saxpy(&min_mn, &c_neg_one, S1, &ione, S2, &ione);
                result[4]  = lapackf77_slange("f", &min_mn, &ione, S2, &min_mn, work);
                result[4] /= lapackf77_slange("f", &min_mn, &ione, S1, &min_mn, work);
                
                printf("   %c %5d %5d  %7.2f         %7.2f          %8.2e",
                       lapack_vec_const(jobz)[0],
                       (int) M, (int) N, cpu_time, gpu_time, result[4] );
            }
            else {
                printf("   %c %5d %5d    ---           %7.2f            ---   ",
                       lapack_vec_const(jobz)[0],
                       (int) M, (int) N, gpu_time );
            }
            if ( opts.check ) {
                if ( result[0] < 0. ) { printf("     ---   ");   } else { printf("  %#9.3g",   result[0]); }
                if ( result[1] < 0. ) { printf("      ---   ");  } else { printf("   %#9.3g",  result[1]); }
                if ( result[2] < 0. ) { printf("       ---   "); } else { printf("    %#9.3g", result[2]); }
                bool okay = (result[0] < tol) && (result[1] < tol) && (result[2] < tol) && (result[3] == 0.) && (result[4] < tol);
                printf("    %3s   %s\n", (result[3] == 0. ? "yes" : "no"), (okay ? "ok" : "failed"));
                status += ! okay;
            }
            else {
                printf("\n");
            }
            
            TESTING_FREE_CPU( h_A );
            TESTING_FREE_CPU( VT  );
            TESTING_FREE_CPU( U   );
            TESTING_FREE_CPU( S1  );
            TESTING_FREE_CPU( S2  );
            TESTING_FREE_CPU( iwork );
            
            #ifdef COMPLEX
            TESTING_FREE_CPU( rwork );
            #endif
            
            TESTING_FREE_PIN( h_R    );
            TESTING_FREE_PIN( h_work );
            
            fflush( stdout );
        }}
        if ( opts.all || opts.niter > 1 ) {
            printf("\n");
        }
    }

    opts.cleanup();
    TESTING_FINALIZE();
    return status;
}
コード例 #9
0
ファイル: sgelqf.cpp プロジェクト: maxhutch/magma
/***************************************************************************//**
    Purpose
    -------
    SGELQF computes an LQ factorization of a REAL M-by-N matrix A:
    A = L * Q.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    A       REAL array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, the elements on and below the diagonal of the array
            contain the m-by-min(m,n) lower trapezoidal matrix L (L is
            lower triangular if m <= n); the elements above the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of elementary reflectors (see Further Details).
    \n
            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    @param[out]
    tau     REAL array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    @param[out]
    work    (workspace) REAL array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.  LWORK >= max(1,M).
            For optimum performance LWORK >= M*NB, where NB is the
            optimal blocksize.
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued.
    \n
            TODO: work is currently unused. sgeqrf2 allocates its own work of (m + n)*nb.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.

    Further Details
    ---------------
    The matrix Q is represented as a product of elementary reflectors

       Q = H(k) . . . H(2) H(1), where k = min(m,n).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:n) is stored on exit in A(i,i+1:n),
    and tau in TAU(i).

    @ingroup magma_gelqf
*******************************************************************************/
extern "C" magma_int_t
magma_sgelqf(
    magma_int_t m, magma_int_t n,
    float *A,    magma_int_t lda,   float *tau,
    float *work, magma_int_t lwork,
    magma_int_t *info)
{
    #define  dA(i_, j_)  (dA  + (i_) + (j_)*ldda)
    #define dAT(i_, j_)  (dAT + (i_) + (j_)*ldda)
    
    /* Constants */
    const float c_one = MAGMA_S_ONE;
    const magma_int_t ione = 1;
    MAGMA_UNUSED( ione );  // used only for real
    
    /* Local variables */
    magmaFloat_ptr dA=NULL, dAT=NULL;
    magma_int_t min_mn, maxm, maxn, maxdim, nb;
    magma_int_t iinfo, ldda, lddat;

    /* Function Body */
    *info = 0;
    nb = magma_get_sgelqf_nb( m, n );
    min_mn = min( m, n );

    work[0] = magma_smake_lwork( m*nb );
    bool lquery = (lwork == -1);
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,m)) {
        *info = -4;
    } else if (lwork < max(1,m) && ! lquery) {
        *info = -7;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (min_mn == 0) {
        work[0] = c_one;
        return *info;
    }

    maxm = magma_roundup( m, 32 );
    maxn = magma_roundup( n, 32 );
    maxdim = max( maxm, maxn );

    magma_queue_t queue = NULL;
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );
    
    // copy to GPU and transpose
    if (maxdim*maxdim < 2*maxm*maxn) {
        // close to square, do everything in-place
        ldda  = maxdim;
        lddat = maxdim;

        if (MAGMA_SUCCESS != magma_smalloc( &dA, maxdim*maxdim )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            goto cleanup;
        }

        magma_ssetmatrix( m, n, A, lda, dA(0,0), ldda, queue );
        dAT = dA;
        magmablas_stranspose_inplace( lddat, dAT(0,0), lddat, queue );
    }
    else {
        // rectangular, do everything out-of-place
        ldda  = maxm;
        lddat = maxn;

        if (MAGMA_SUCCESS != magma_smalloc( &dA, 2*maxn*maxm )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            goto cleanup;
        }

        magma_ssetmatrix( m, n, A, lda, dA(0,0), ldda, queue );

        dAT = dA + maxn * maxm;
        magmablas_stranspose( m, n, dA(0,0), ldda, dAT(0,0), lddat, queue );
    }

    // factor QR
    magma_sgeqrf2_gpu( n, m, dAT(0,0), lddat, tau, &iinfo );
    assert( iinfo >= 0 );
    if ( iinfo > 0 ) {
        *info = iinfo;
    }
    
    // conjugate tau
    #ifdef COMPLEX
    lapackf77_slacgv( &min_mn, tau, &ione );
    #endif

    // undo transpose
    if (maxdim*maxdim < 2*maxm*maxn) {
        magmablas_stranspose_inplace( lddat, dAT(0,0), lddat, queue );
        magma_sgetmatrix( m, n, dA(0,0), ldda, A, lda, queue );
    } else {
        magmablas_stranspose( n, m, dAT(0,0), lddat, dA(0,0), ldda, queue );
        magma_sgetmatrix( m, n, dA(0,0), ldda, A, lda, queue );
    }

cleanup:
    magma_queue_destroy( queue );
    magma_free( dA );

    return *info;
} /* magma_sgelqf */