Example #1
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing cungqr
*/
int main( int argc, char** argv )
{
    TESTING_INIT();

    real_Double_t    gflops, gpu_perf, gpu_time, cpu_perf, cpu_time;
    float           error, work[1];
    magmaFloatComplex  c_neg_one = MAGMA_C_NEG_ONE;
    magmaFloatComplex *hA, *hR, *tau, *h_work;
    magmaFloatComplex *dA, *dT;
    magma_int_t m, n, k;
    magma_int_t n2, lda, ldda, lwork, min_mn, nb, info;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t status = 0;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );

    float tol = opts.tolerance * lapackf77_slamch("E");
    opts.lapack |= opts.check;  // check (-c) implies lapack (-l)
    
    printf("Running version %d; available are (specified through --version num):\n",
           (int) opts.version);
    printf("1 - uses precomputed clarft matrices (default)\n");
    printf("2 - recomputes the clarft matrices on the fly\n\n");

    printf("    m     n     k   CPU GFlop/s (sec)   GPU GFlop/s (sec)   ||R|| / ||A||\n");
    printf("=========================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            m = opts.msize[itest];
            n = opts.nsize[itest];
            k = opts.ksize[itest];
            if ( m < n || n < k ) {
                printf( "%5d %5d %5d   skipping because m < n or n < k\n", (int) m, (int) n, (int) k );
                continue;
            }
            
            lda  = m;
            ldda = ((m + 31)/32)*32;
            n2 = lda*n;
            min_mn = min(m, n);
            nb = magma_get_cgeqrf_nb( m );
            lwork  = (m + 2*n+nb)*nb;
            gflops = FLOPS_CUNGQR( m, n, k ) / 1e9;
            
            TESTING_MALLOC_PIN( h_work, magmaFloatComplex, lwork  );
            TESTING_MALLOC_PIN( hR,     magmaFloatComplex, lda*n  );
            
            TESTING_MALLOC_CPU( hA,     magmaFloatComplex, lda*n  );
            TESTING_MALLOC_CPU( tau,    magmaFloatComplex, min_mn );
            
            TESTING_MALLOC_DEV( dA,     magmaFloatComplex, ldda*n );
            TESTING_MALLOC_DEV( dT,     magmaFloatComplex, ( 2*min_mn + ((n + 31)/32)*32 )*nb );
            
            lapackf77_clarnv( &ione, ISEED, &n2, hA );
            lapackf77_clacpy( MagmaUpperLowerStr, &m, &n, hA, &lda, hR, &lda );
            
            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            // first, get QR factors
            magma_csetmatrix( m, n, hA, lda, dA, ldda );
            magma_cgeqrf_gpu( m, n, dA, ldda, tau, dT, &info );
            if (info != 0)
                printf("magma_cgeqrf_gpu returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            magma_cgetmatrix( m, n, dA, ldda, hR, lda );
            
            gpu_time = magma_wtime();
            if (opts.version == 1)
                magma_cungqr( m, n, k, hR, lda, tau, dT, nb, &info );
            else
                magma_cungqr2(m, n, k, hR, lda, tau, &info );
            gpu_time = magma_wtime() - gpu_time;
            gpu_perf = gflops / gpu_time;
            if (info != 0)
                printf("magma_cungqr_gpu returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            
            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            if ( opts.lapack ) {
                error = lapackf77_clange("f", &m, &n, hA, &lda, work );
                
                lapackf77_cgeqrf( &m, &n, hA, &lda, tau, h_work, &lwork, &info );
                if (info != 0)
                    printf("lapackf77_cgeqrf returned error %d: %s.\n",
                           (int) info, magma_strerror( info ));
                
                cpu_time = magma_wtime();
                lapackf77_cungqr( &m, &n, &k, hA, &lda, tau, h_work, &lwork, &info );
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
                if (info != 0)
                    printf("lapackf77_cungqr returned error %d: %s.\n",
                           (int) info, magma_strerror( info ));
                
                // compute relative error |R|/|A| := |Q_magma - Q_lapack|/|A|
                blasf77_caxpy( &n2, &c_neg_one, hA, &ione, hR, &ione );
                error = lapackf77_clange("f", &m, &n, hR, &lda, work) / error;
                
                printf("%5d %5d %5d   %7.1f (%7.2f)   %7.1f (%7.2f)   %8.2e   %s\n",
                       (int) m, (int) n, (int) k,
                       cpu_perf, cpu_time, gpu_perf, gpu_time,
                       error, (error < tol ? "ok" : "failed"));
                status += ! (error < tol);
            }
            else {
                printf("%5d %5d %5d     ---   (  ---  )   %7.1f (%7.2f)     ---  \n",
                       (int) m, (int) n, (int) k,
                       gpu_perf, gpu_time );
            }
            
            TESTING_FREE_PIN( h_work );
            TESTING_FREE_PIN( hR     );
            
            TESTING_FREE_CPU( hA  );
            TESTING_FREE_CPU( tau );
            
            TESTING_FREE_DEV( dA );
            TESTING_FREE_DEV( dT );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }
    
    TESTING_FINALIZE();
    return status;
}
Example #2
0
/**
    Purpose
    -------
    Solves the overdetermined, least squares problem
           min || A*X - C ||
    using the QR factorization A.
    The underdetermined problem (m < n) is not currently handled.


    Arguments
    ---------
    @param[in]
    trans   magma_trans_t
      -     = MagmaNoTrans:   the linear system involves A.
            Only TRANS=MagmaNoTrans is currently handled.

    @param[in]
    m       INTEGER
            The number of rows of the matrix A. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A. M >= N >= 0.

    @param[in]
    nrhs    INTEGER
            The number of columns of the matrix C. NRHS >= 0.

    @param[in,out]
    dA       COMPLEX array on the GPU, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, A is overwritten by details of its QR
            factorization as returned by CGEQRF.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array A, LDDA >= M.

    @param[in,out]
    dB      COMPLEX array on the GPU, dimension (LDDB,NRHS)
            On entry, the M-by-NRHS matrix C.
            On exit, the N-by-NRHS solution matrix X.

    @param[in]
    lddb    INTEGER
            The leading dimension of the array dB. LDDB >= M.

    @param[out]
    hwork   (workspace) COMPLEX array, dimension MAX(1,LWORK).
            On exit, if INFO = 0, HWORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The dimension of the array HWORK,
            LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB,
            where NB is the blocksize given by magma_get_cgeqrf_nb( M ).
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the HWORK array, returns
            this value as the first entry of the HWORK array.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_cgels_driver
    ********************************************************************/
extern "C" magma_int_t
magma_cgels_gpu( magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t nrhs,
                 magmaFloatComplex *dA,    magma_int_t ldda,
                 magmaFloatComplex *dB,    magma_int_t lddb,
                 magmaFloatComplex *hwork, magma_int_t lwork,
                 magma_int_t *info)
{
    magmaFloatComplex *dT;
    magmaFloatComplex *tau;
    magma_int_t k;

    magma_int_t nb     = magma_get_cgeqrf_nb(m);
    magma_int_t lwkopt = (m - n + nb)*(nrhs + nb) + nrhs*nb;
    int lquery = (lwork == -1);

    hwork[0] = MAGMA_C_MAKE( (float)lwkopt, 0. );

    *info = 0;
    /* For now, N is the only case working */
    if ( trans != MagmaNoTrans )
        *info = -1;
    else if (m < 0)
        *info = -2;
    else if (n < 0 || m < n) /* LQ is not handle for now*/
        *info = -3;
    else if (nrhs < 0)
        *info = -4;
    else if (ldda < max(1,m))
        *info = -6;
    else if (lddb < max(1,m))
        *info = -8;
    else if (lwork < lwkopt && ! lquery)
        *info = -10;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;

    k = min(m,n);
    if (k == 0) {
        hwork[0] = MAGMA_C_ONE;
        return *info;
    }

    /*
     * Allocate temporary buffers
     */
    int ldtwork = ( 2*k + ((n+31)/32)*32 )*nb;
    if (nb < nrhs)
        ldtwork = ( 2*k + ((n+31)/32)*32 )*nrhs;
    if (MAGMA_SUCCESS != magma_cmalloc( &dT, ldtwork )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
    
    magma_cmalloc_cpu( &tau, k );
    if ( tau == NULL ) {
        magma_free( dT );
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }

    magma_cgeqrf_gpu( m, n, dA, ldda, tau, dT, info );

    if ( *info == 0 ) {
        magma_cgeqrs_gpu( m, n, nrhs,
                          dA, ldda, tau, dT,
                          dB, lddb, hwork, lwork, info );
    }
    
    magma_free( dT );
    magma_free_cpu(tau);
    return *info;
}
Example #3
0
extern "C" magma_int_t
magma_cgeqrf_ooc(magma_int_t m, magma_int_t n,
                 magmaFloatComplex *a,    magma_int_t lda, magmaFloatComplex *tau,
                 magmaFloatComplex *work, magma_int_t lwork,
                 magma_int_t *info )
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    CGEQRF_OOC computes a QR factorization of a COMPLEX M-by-N matrix A:
    A = Q * R. This version does not require work space on the GPU
    passed as input. GPU memory is allocated in the routine.
    This is an out-of-core (ooc) version that is similar to magma_cgeqrf but
    the difference is that this version can use a GPU even if the matrix
    does not fit into the GPU memory at once.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) COMPLEX array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further
            Details).

            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    TAU     (output) COMPLEX array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    WORK    (workspace/output) COMPLEX array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.

            Higher performance is achieved if WORK is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    LWORK   (input) INTEGER
            The dimension of the array WORK.  LWORK >= N*NB,
            where NB can be obtained through magma_get_cgeqrf_nb(M).

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.

    Further Details
    ===============
    The matrix Q is represented as a product of elementary reflectors

       Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).
    =====================================================================    */

    #define  a_ref(a_1,a_2) ( a+(a_2)*(lda) + (a_1))
    #define da_ref(a_1,a_2) (da+(a_2)*ldda  + (a_1))

    magmaFloatComplex *da, *dwork;
    magmaFloatComplex c_one = MAGMA_C_ONE;

    int  k, lddwork, ldda;

    *info = 0;
    int nb = magma_get_cgeqrf_nb(min(m, n));

    int lwkopt = n * nb;
    work[0] = MAGMA_C_MAKE( (float)lwkopt, 0 );
    int lquery = (lwork == -1);
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,m)) {
        *info = -4;
    } else if (lwork < max(1,n) && ! lquery) {
        *info = -7;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;

    /* Check how much memory do we have */
    size_t freeMem, totalMem;
    cudaMemGetInfo( &freeMem, &totalMem );
    freeMem /= sizeof(magmaFloatComplex);
    
    magma_int_t IB, NB = (magma_int_t)(0.8*freeMem/m);
    NB = (NB / nb) * nb;

    if (NB >= n)
        return magma_cgeqrf(m, n, a, lda, tau, work, lwork, info);

    k = min(m,n);
    if (k == 0) {
        work[0] = c_one;
        return *info;
    }

    lddwork = ((NB+31)/32)*32+nb;
    ldda    = ((m+31)/32)*32;

    if (MAGMA_SUCCESS != magma_cmalloc( &da, (NB + nb)*ldda + nb*lddwork )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    magma_queue_t stream[2];
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );

    //   magmablasSetKernelStream(stream[1]);

    magmaFloatComplex *ptr = da + ldda * NB;
    dwork = da + ldda*(NB + nb);

    /* start the main loop over the blocks that fit in the GPU memory */
    for(int i=0; i<n; i+=NB) {
        IB = min(n-i, NB);
        //printf("Processing %5d columns -- %5d to %5d ... \n", IB, i, i+IB);

        /* 1. Copy the next part of the matrix to the GPU */
        magma_csetmatrix_async( (m), IB,
                                a_ref(0,i),  lda,
                                da_ref(0,0), ldda, stream[0] );
        magma_queue_sync( stream[0] );

        /* 2. Update it with the previous transformations */
        for(int j=0; j<min(i,k); j+=nb) {
            magma_int_t ib = min(k-j, nb);

            /* Get a panel in ptr.                                           */
            //   1. Form the triangular factor of the block reflector
            //   2. Send it to the GPU.
            //   3. Put 0s in the upper triangular part of V.
            //   4. Send V to the GPU in ptr.
            //   5. Update the matrix.
            //   6. Restore the upper part of V.
            magma_int_t rows = m-j;
            lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr,
                              &rows, &ib, a_ref(j,j), &lda, tau+j, work, &ib);
            magma_csetmatrix_async( ib, ib,
                                    work,  ib,
                                    dwork, lddwork, stream[1] );

            cpanel_to_q(MagmaUpper, ib, a_ref(j,j), lda, work+ib*ib);
            magma_csetmatrix_async( rows, ib,
                                    a_ref(j,j), lda,
                                    ptr,        rows, stream[1] );
            magma_queue_sync( stream[1] );

            magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                              rows, IB, ib,
                              ptr, rows, dwork,    lddwork,
                              da_ref(j, 0), ldda, dwork+ib, lddwork);

            cq_to_panel(MagmaUpper, ib, a_ref(j,j), lda, work+ib*ib);
        }

        /* 3. Do a QR on the current part */
        if (i<k)
            magma_cgeqrf2_gpu(m-i, IB, da_ref(i,0), ldda, tau+i, info);

        /* 4. Copy the current part back to the CPU */
        magma_cgetmatrix_async( (m), IB,
                                da_ref(0,0), ldda,
                                a_ref(0,i),  lda, stream[0] );
    }

    magma_queue_sync( stream[0] );

    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[1] );
    magma_free( da );

    return *info;
} /* magma_cgeqrf_ooc */
Example #4
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing cgelqf_gpu
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    const float             d_neg_one = MAGMA_D_NEG_ONE;
    const float             d_one     = MAGMA_D_ONE;
    const magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;
    const magmaFloatComplex c_one     = MAGMA_C_ONE;
    const magmaFloatComplex c_zero    = MAGMA_C_ZERO;
    const magma_int_t        ione      = 1;
    
    real_Double_t    gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0;
    float           Anorm, error=0, error2=0;
    magmaFloatComplex *h_A, *h_R, *tau, *h_work, tmp[1];
    magmaFloatComplex_ptr d_A;
    magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn, nb;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t status = 0;

    magma_opts opts;
    opts.parse_opts( argc, argv );

    float tol = opts.tolerance * lapackf77_slamch("E");
    
    printf("%%   M     N   CPU Gflop/s (sec)   GPU Gflop/s (sec)   |L - A*Q^H|   |I - Q*Q^H|\n");
    printf("%%==============================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[itest];
            N = opts.nsize[itest];
            min_mn = min(M, N);
            lda    = M;
            ldda   = magma_roundup( M, opts.align );  // multiple of 32 by default
            n2     = lda*N;
            nb     = magma_get_cgeqrf_nb( M, N );
            gflops = FLOPS_CGELQF( M, N ) / 1e9;
            
            // query for workspace size
            lwork = -1;
            lapackf77_cgelqf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info);
            lwork = (magma_int_t)MAGMA_C_REAL( tmp[0] );
            lwork = max( lwork, M*nb );
            
            TESTING_MALLOC_CPU( tau,    magmaFloatComplex, min_mn );
            TESTING_MALLOC_CPU( h_A,    magmaFloatComplex, n2     );
            
            TESTING_MALLOC_PIN( h_R,    magmaFloatComplex, n2     );
            TESTING_MALLOC_PIN( h_work, magmaFloatComplex, lwork  );
            
            TESTING_MALLOC_DEV( d_A,    magmaFloatComplex, ldda*N );
            
            /* Initialize the matrix */
            lapackf77_clarnv( &ione, ISEED, &n2, h_A );
            lapackf77_clacpy( MagmaFullStr, &M, &N, h_A, &lda, h_R, &lda );
            
            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            magma_csetmatrix( M, N, h_R, lda, d_A, ldda, opts.queue );
            gpu_time = magma_wtime();
            magma_cgelqf_gpu( M, N, d_A, ldda, tau, h_work, lwork, &info);
            gpu_time = magma_wtime() - gpu_time;
            gpu_perf = gflops / gpu_time;
            if (info != 0) {
                printf("magma_cgelqf_gpu returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            }
            
            /* =====================================================================
               Check the result, following zlqt01 except using the reduced Q.
               This works for any M,N (square, tall, wide).
               =================================================================== */
            if ( opts.check ) {
                magma_cgetmatrix( M, N, d_A, ldda, h_R, lda, opts.queue );
                
                magma_int_t ldq = min_mn;
                magma_int_t ldl = M;
                magmaFloatComplex *Q, *L;
                float *work;
                TESTING_MALLOC_CPU( Q,    magmaFloatComplex, ldq*N );       // K by N
                TESTING_MALLOC_CPU( L,    magmaFloatComplex, ldl*min_mn );  // M by K
                TESTING_MALLOC_CPU( work, float,             min_mn );
                
                // generate K by N matrix Q, where K = min(M,N)
                lapackf77_clacpy( "Upper", &min_mn, &N, h_R, &lda, Q, &ldq );
                lapackf77_cunglq( &min_mn, &N, &min_mn, Q, &ldq, tau, h_work, &lwork, &info );
                assert( info == 0 );
                
                // copy N by K matrix L
                lapackf77_claset( "Upper", &M, &min_mn, &c_zero, &c_zero, L, &ldl );
                lapackf77_clacpy( "Lower", &M, &min_mn, h_R, &lda,        L, &ldl );
                
                // error = || L - A*Q^H || / (N * ||A||)
                blasf77_cgemm( "NoTrans", "Conj", &M, &min_mn, &N,
                               &c_neg_one, h_A, &lda, Q, &ldq, &c_one, L, &ldl );
                Anorm = lapackf77_clange( "1", &M, &N,      h_A, &lda, work );
                error = lapackf77_clange( "1", &M, &min_mn, L,   &ldl, work );
                if ( N > 0 && Anorm > 0 )
                    error /= (N*Anorm);
                
                // set L = I (K by K), then L = I - Q*Q^H
                // error = || I - Q*Q^H || / N
                lapackf77_claset( "Upper", &min_mn, &min_mn, &c_zero, &c_one, L, &ldl );
                blasf77_cherk( "Upper", "NoTrans", &min_mn, &N, &d_neg_one, Q, &ldq, &d_one, L, &ldl );
                error2 = safe_lapackf77_clanhe( "1", "Upper", &min_mn, L, &ldl, work );
                if ( N > 0 )
                    error2 /= N;
                
                TESTING_FREE_CPU( Q    );  Q    = NULL;
                TESTING_FREE_CPU( L    );  L    = NULL;
                TESTING_FREE_CPU( work );  work = NULL;
            }
            
            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            if ( opts.lapack ) {
                cpu_time = magma_wtime();
                lapackf77_cgelqf( &M, &N, h_A, &lda, tau, h_work, &lwork, &info );
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
                if (info != 0) {
                    printf("lapack_cgelqf returned error %d: %s.\n",
                           (int) info, magma_strerror( info ));
                }
            }
            
            /* =====================================================================
               Print performance and error.
               =================================================================== */
            printf("%5d %5d   ", (int) M, (int) N );
            if ( opts.lapack ) {
                printf( "%7.2f (%7.2f)", cpu_perf, cpu_time );
            }
            else {
                printf("  ---   (  ---  )" );
            }
            printf( "   %7.2f (%7.2f)   ", gpu_perf, gpu_time );
            if ( opts.check ) {
                bool okay = (error < tol && error2 < tol);
                printf( "error %.4g, error2 %.4g, tol %.4g, okay %d\n", error, error2, tol, okay );
                status += ! okay;
                printf( "%11.2e   %11.2e   %s\n", error, error2, (okay ? "ok" : "failed") );
            }
            else {
                printf( "    ---\n" );
            }
            
            TESTING_FREE_CPU( tau );
            TESTING_FREE_CPU( h_A );
            
            TESTING_FREE_PIN( h_R    );
            TESTING_FREE_PIN( h_work );
            
            TESTING_FREE_DEV( d_A );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    opts.cleanup();
    TESTING_FINALIZE();
    return status;
}
Example #5
0
extern "C" magma_int_t
magma_cgeqrf_gpu( magma_int_t m, magma_int_t n,
                  magmaFloatComplex *dA,   magma_int_t ldda,
                  magmaFloatComplex *tau, magmaFloatComplex *dT,
                  magma_int_t *info )
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    CGEQRF computes a QR factorization of a complex M-by-N matrix A:
    A = Q * R.
    
    This version stores the triangular dT matrices used in
    the block QR factorization so that they can be applied directly (i.e.,
    without being recomputed) later. As a result, the application
    of Q is much faster. Also, the upper triangular matrices for V have 0s
    in them. The corresponding parts of the upper triangular R are inverted
    and stored separately in dT.
    
    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    dA      (input/output) COMPLEX array on the GPU, dimension (LDDA,N)
            On entry, the M-by-N matrix A.
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further
            Details).

    LDDA     (input) INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).
            To benefit from coalescent memory accesses LDDA must be
            dividable by 16.

    TAU     (output) COMPLEX array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    dT      (workspace/output)  COMPLEX array on the GPU,
            dimension (2*MIN(M, N) + (N+31)/32*32 )*NB,
            where NB can be obtained through magma_get_cgeqrf_nb(M).
            It starts with MIN(M,N)*NB block that store the triangular T
            matrices, followed by the MIN(M,N)*NB block of the diagonal
            inverses for the R matrix. The rest of the array is used as workspace.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.

    Further Details
    ===============
    The matrix Q is represented as a product of elementary reflectors

       Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).
    =====================================================================    */

    #define a_ref(a_1,a_2) (dA+(a_2)*(ldda) + (a_1))
    #define t_ref(a_1)     (dT+(a_1)*nb)
    #define d_ref(a_1)     (dT+(minmn+(a_1))*nb)
    #define dd_ref(a_1)    (dT+(2*minmn+(a_1))*nb)
    #define work_ref(a_1)  ( work + (a_1))
    #define hwork          ( work + (nb)*(m))

    magma_int_t i, k, minmn, old_i, old_ib, rows, cols;
    magma_int_t ib, nb;
    magma_int_t ldwork, lddwork, lwork, lhwork;
    magmaFloatComplex *work, *ut;

    /* check arguments */
    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    k = minmn = min(m,n);
    if (k == 0)
        return *info;

    nb = magma_get_cgeqrf_nb(m);

    lwork  = (m + n + nb)*nb;
    lhwork = lwork - m*nb;

    if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, lwork )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }
    
    ut = hwork+nb*(n);
    memset( ut, 0, nb*nb*sizeof(magmaFloatComplex));

    magma_queue_t stream[2];
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );

    ldwork = m;
    lddwork= n;

    if ( (nb > 1) && (nb < k) ) {
        /* Use blocked code initially */
        old_i = 0; old_ib = nb;
        for (i = 0; i < k-nb; i += nb) {
            ib = min(k-i, nb);
            rows = m -i;
            magma_cgetmatrix_async( rows, ib,
                                    a_ref(i,i),  ldda,
                                    work_ref(i), ldwork, stream[1] );
            if (i>0){
                /* Apply H' to A(i:m,i+2*ib:n) from the left */
                cols = n-old_i-2*old_ib;
                magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                  m-old_i, cols, old_ib,
                                  a_ref(old_i, old_i         ), ldda, t_ref(old_i), nb,
                                  a_ref(old_i, old_i+2*old_ib), ldda, dd_ref(0),    lddwork);
                
                /* store the diagonal */
                magma_csetmatrix_async( old_ib, old_ib,
                                        ut,           old_ib,
                                        d_ref(old_i), old_ib, stream[0] );
            }

            magma_queue_sync( stream[1] );
            lapackf77_cgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info);
            /* Form the triangular factor of the block reflector
               H = H(i) H(i+1) . . . H(i+ib-1) */
            lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr,
                              &rows, &ib,
                              work_ref(i), &ldwork, tau+i, hwork, &ib);

            /* Put 0s in the upper triangular part of a panel (and 1s on the
               diagonal); copy the upper triangular in ut and invert it. */
            magma_queue_sync( stream[0] );
            csplit_diag_block(ib, work_ref(i), ldwork, ut);
            magma_csetmatrix( rows, ib, work_ref(i), ldwork, a_ref(i,i), ldda );

            if (i + ib < n) {
                /* Send the triangular factor T to the GPU */
                magma_csetmatrix( ib, ib, hwork, ib, t_ref(i), nb );

                if (i+nb < k-nb){
                    /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */
                    magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                      rows, ib, ib,
                                      a_ref(i, i   ), ldda, t_ref(i),  nb,
                                      a_ref(i, i+ib), ldda, dd_ref(0), lddwork);
                }
                else {
                    cols = n-i-ib;
                    magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                      rows, cols, ib,
                                      a_ref(i, i   ), ldda, t_ref(i),  nb,
                                      a_ref(i, i+ib), ldda, dd_ref(0), lddwork);
                    /* Fix the diagonal block */
                    magma_csetmatrix( ib, ib, ut, ib, d_ref(i), ib );
                }
                old_i  = i;
                old_ib = ib;
            }
        }
    } else {
        i = 0;
    }

    /* Use unblocked code to factor the last or only block. */
    if (i < k) {
        ib   = n-i;
        rows = m-i;
        magma_cgetmatrix( rows, ib, a_ref(i, i), ldda, work, rows );
        lhwork = lwork - rows*ib;
        lapackf77_cgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info);
        
        magma_csetmatrix( rows, ib, work, rows, a_ref(i, i), ldda );
    }

    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[1] );
    magma_free_pinned( work );
    return *info;

/*     End of MAGMA_CGEQRF */

} /* magma_cgeqrf */
Example #6
0
/***************************************************************************//**
    Purpose
    -------
    CGEQRS solves the least squares problem
           min || A*X - C ||
    using the QR factorization A = Q*R computed by CGEQRF_GPU.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A. M >= N >= 0.

    @param[in]
    nrhs    INTEGER
            The number of columns of the matrix C. NRHS >= 0.

    @param[in]
    dA      COMPLEX array on the GPU, dimension (LDDA,N)
            The i-th column must contain the vector which defines the
            elementary reflector H(i), for i = 1,2,...,n, as returned by
            CGEQRF_GPU in the first n columns of its array argument A.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array A, LDDA >= M.

    @param[in]
    tau     COMPLEX array, dimension (N)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by MAGMA_CGEQRF_GPU.

    @param[in,out]
    dB      COMPLEX array on the GPU, dimension (LDDB,NRHS)
            On entry, the M-by-NRHS matrix C.
            On exit, the N-by-NRHS solution matrix X.

    @param[in,out]
    dT      COMPLEX array that is the output (the 6th argument)
            of magma_cgeqrf_gpu of size
            2*MIN(M, N)*NB + ceil(N/32)*32 )* MAX(NB, NRHS).
            The array starts with a block of size MIN(M,N)*NB that stores
            the triangular T matrices used in the QR factorization,
            followed by MIN(M,N)*NB block storing the diagonal block
            inverses for the R matrix, followed by work space of size
            (ceil(N/32)*32)* MAX(NB, NRHS).

    @param[in]
    lddb    INTEGER
            The leading dimension of the array dB. LDDB >= M.

    @param[out]
    hwork   (workspace) COMPLEX array, dimension (LWORK)
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK,
            LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB,
            where NB is the blocksize given by magma_get_cgeqrf_nb( M, N ).
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the HWORK array, returns
            this value as the first entry of the WORK array.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_geqrs
*******************************************************************************/
extern "C" magma_int_t
magma_cgeqrs_gpu(
    magma_int_t m, magma_int_t n, magma_int_t nrhs,
    magmaFloatComplex_const_ptr dA,    magma_int_t ldda,
    magmaFloatComplex const *tau,
    magmaFloatComplex_ptr dT,
    magmaFloatComplex_ptr dB, magma_int_t lddb,
    magmaFloatComplex *hwork, magma_int_t lwork,
    magma_int_t *info)
{
    #define dA(i_,j_) (dA + (i_) + (j_)*ldda)
    #define dT(i_)    (dT + (lddwork + (i_))*nb)

    /* Constants */
    const magmaFloatComplex c_zero    = MAGMA_C_ZERO;
    const magmaFloatComplex c_one     = MAGMA_C_ONE;
    const magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;
    const magma_int_t ione = 1;
    
    /* Local variables */
    magmaFloatComplex_ptr dwork;
    magma_int_t i, min_mn, lddwork, rows, ib;

    magma_int_t nb     = magma_get_cgeqrf_nb( m, n );
    magma_int_t lwkopt = (m - n + nb)*(nrhs + nb) + nrhs*nb;
    bool lquery = (lwork == -1);

    hwork[0] = magma_cmake_lwork( lwkopt );

    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0 || m < n)
        *info = -2;
    else if (nrhs < 0)
        *info = -3;
    else if (ldda < max(1,m))
        *info = -5;
    else if (lddb < max(1,m))
        *info = -9;
    else if (lwork < lwkopt && ! lquery)
        *info = -11;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;

    min_mn = min(m,n);
    if (min_mn == 0) {
        hwork[0] = c_one;
        return *info;
    }

    magma_queue_t queue;
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );
    
    /* B := Q^H * B */
    magma_cunmqr_gpu( MagmaLeft, Magma_ConjTrans,
                      m, nrhs, n,
                      dA(0,0), ldda, tau,
                      dB, lddb, hwork, lwork, dT, nb, info );
    if ( *info != 0 ) {
        magma_queue_destroy( queue );
        return *info;
    }

    /* Solve R*X = B(1:n,:) */
    lddwork= min_mn;
    if (nb < min_mn)
        dwork = dT+2*lddwork*nb;
    else
        dwork = dT;
    // To do: Why did we have this line originally; seems to be a bug (Stan)?
    // dwork = dT;

    i    = (min_mn - 1)/nb * nb;
    ib   = n-i;
    rows = m-i;

    // TODO: this assumes that, on exit from magma_cunmqr_gpu, hwork contains
    // the last block of A and B (i.e., C in cunmqr). This should be fixed.
    // Seems this data should already be on the GPU, so could switch to
    // magma_ctrsm and drop the csetmatrix.
    if ( nrhs == 1 ) {
        blasf77_ctrsv( MagmaUpperStr, MagmaNoTransStr, MagmaNonUnitStr,
                       &ib, hwork,         &rows,
                            hwork+rows*ib, &ione);
    } else {
        blasf77_ctrsm( MagmaLeftStr, MagmaUpperStr, MagmaNoTransStr, MagmaNonUnitStr,
                       &ib, &nrhs,
                       &c_one, hwork,         &rows,
                               hwork+rows*ib, &rows);
    }
    
    // update the solution vector
    magma_csetmatrix( ib, nrhs,
                      hwork+rows*ib, rows,
                      dwork+i,       lddwork, queue );

    // update c
    if (nrhs == 1) {
        magma_cgemv( MagmaNoTrans, i, ib,
                     c_neg_one, dA(0, i), ldda,
                                dwork + i,   1,
                     c_one,     dB,           1, queue );
    }
    else {
        magma_cgemm( MagmaNoTrans, MagmaNoTrans, i, nrhs, ib,
                     c_neg_one, dA(0, i),  ldda,
                                dwork + i, lddwork,
                     c_one,     dB,        lddb, queue );
    }

    magma_int_t start = i-nb;
    if (nb < min_mn) {
        for (i = start; i >= 0; i -= nb) {
            ib = min(min_mn - i, nb);
            rows = m - i;

            if (i + ib < n) {
                if (nrhs == 1) {
                    magma_cgemv( MagmaNoTrans, ib, ib,
                                 c_one,  dT(i), ib,
                                         dB+i,      1,
                                 c_zero, dwork+i,  1, queue );
                    magma_cgemv( MagmaNoTrans, i, ib,
                                 c_neg_one, dA(0, i), ldda,
                                            dwork + i,   1,
                                 c_one,     dB,           1, queue );
                }
                else {
                    magma_cgemm( MagmaNoTrans, MagmaNoTrans, ib, nrhs, ib,
                                 c_one,  dT(i),   ib,
                                         dB+i,    lddb,
                                 c_zero, dwork+i, lddwork, queue );
                    magma_cgemm( MagmaNoTrans, MagmaNoTrans, i, nrhs, ib,
                                 c_neg_one, dA(0, i),  ldda,
                                            dwork + i, lddwork,
                                 c_one,     dB,        lddb, queue );
                }
            }
        }
    }

    magma_ccopymatrix( n, nrhs,
                       dwork, lddwork,
                       dB,    lddb, queue );
    
    magma_queue_destroy( queue );
    return *info;
}
Example #7
0
/**
    Purpose
    -------
    CGEQRF computes a QR factorization of a COMPLEX M-by-N matrix A:
    A = Q * R. This version does not require work space on the GPU
    passed as input. GPU memory is allocated in the routine.

    If the current stream is NULL, this version replaces it with user defined
    stream to overlap computation with communication.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    A       COMPLEX array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further
            Details).
    \n
            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    @param[out]
    tau     COMPLEX array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    @param[out]
    work    (workspace) COMPLEX array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
    \n
            Higher performance is achieved if WORK is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.  LWORK >= max( N*NB, 2*NB*NB ),
            where NB can be obtained through magma_get_cgeqrf_nb(M).
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.

    Further Details
    ---------------
    The matrix Q is represented as a product of elementary reflectors

        Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

        H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).

    @ingroup magma_cgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_cgeqrf(magma_int_t m, magma_int_t n,
             magmaFloatComplex *A,    magma_int_t lda, magmaFloatComplex *tau,
             magmaFloatComplex *work, magma_int_t lwork,
             magma_int_t *info )
{
    #define  A(i,j) ( A + (i) + (j)*lda )
    #define dA(i,j) (dA + (i) + (j)*ldda)

    magmaFloatComplex *dA, *dwork, *dT;
    magmaFloatComplex c_one = MAGMA_C_ONE;

    magma_int_t i, k, lddwork, old_i, old_ib;
    magma_int_t ib, ldda;

    /* Function Body */
    *info = 0;
    magma_int_t nb = magma_get_cgeqrf_nb(min(m, n));

    // need 2*nb*nb to store T and upper triangle of V simultaneously
    magma_int_t lwkopt = max(n*nb, 2*nb*nb);
    work[0] = MAGMA_C_MAKE( (float)lwkopt, 0 );
    int lquery = (lwork == -1);
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,m)) {
        *info = -4;
    } else if (lwork < max(1, lwkopt) && ! lquery) {
        *info = -7;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;

    k = min(m,n);
    if (k == 0) {
        work[0] = c_one;
        return *info;
    }

    // largest N for larfb is n-nb (trailing matrix lacks 1st panel)
    lddwork = ((n+31)/32)*32 - nb;
    ldda    = ((m+31)/32)*32;

    magma_int_t num_gpus = magma_num_gpus();
    if ( num_gpus > 1 ) {
        /* call multiple-GPU interface  */
        return magma_cgeqrf4(num_gpus, m, n, A, lda, tau, work, lwork, info);
    }

    // allocate space for dA, dwork, and dT
    if (MAGMA_SUCCESS != magma_cmalloc( &dA, n*ldda + nb*lddwork + nb*nb )) {
        /* Switch to the "out-of-core" (out of GPU-memory) version */
        return magma_cgeqrf_ooc(m, n, A, lda, tau, work, lwork, info);
    }

    /* Define user stream if current stream is NULL */
    magma_queue_t stream[2], current_stream;
    magmablasGetKernelStream(&current_stream);

    magma_queue_create( &stream[0] );
    if (current_stream == NULL) {
        magma_queue_create( &stream[1] );
        magmablasSetKernelStream(stream[1]);
    }
    else {
        stream[1] = current_stream;
    }

    dwork = dA + n*ldda;
    dT    = dA + n*ldda + nb*lddwork;

    if ( (nb > 1) && (nb < k) ) {
        /* Use blocked code initially.
           Asynchronously send the matrix to the GPU except the first panel. */
        magma_csetmatrix_async( m, n-nb,
                                A(0,nb),  lda,
                                dA(0,nb), ldda, stream[0] );

        old_i = 0;
        old_ib = nb;
        for (i = 0; i < k-nb; i += nb) {
            ib = min(k-i, nb);
            if (i > 0) {
                /* download i-th panel */
                magma_queue_sync( stream[1] );
                magma_cgetmatrix_async( m-i, ib,
                                        dA(i,i), ldda,
                                        A(i,i),  lda, stream[0] );

                /* Apply H' to A(i:m,i+2*ib:n) from the left */
                magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                  m-old_i, n-old_i-2*old_ib, old_ib,
                                  dA(old_i, old_i),          ldda, dT,    nb,
                                  dA(old_i, old_i+2*old_ib), ldda, dwork, lddwork);

                magma_cgetmatrix_async( i, ib,
                                        dA(0,i), ldda,
                                        A(0,i),  lda, stream[1] );
                magma_queue_sync( stream[0] );
            }

            magma_int_t rows = m-i;
            lapackf77_cgeqrf(&rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info);
            
            /* Form the triangular factor of the block reflector
               H = H(i) H(i+1) . . . H(i+ib-1) */
            lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr,
                              &rows, &ib, A(i,i), &lda, tau+i, work, &ib);

            cpanel_to_q(MagmaUpper, ib, A(i,i), lda, work+ib*ib);

            /* download the i-th V matrix */
            magma_csetmatrix_async( rows, ib, A(i,i), lda, dA(i,i), ldda, stream[0] );

            /* download the T matrix */
            magma_queue_sync( stream[1] );
            magma_csetmatrix_async( ib, ib, work, ib, dT, nb, stream[0] );
            magma_queue_sync( stream[0] );

            if (i + ib < n) {
                if (i+ib < k-nb) {
                    /* Apply H' to A(i:m,i+ib:i+2*ib) from the left (look-ahead) */
                    magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                      rows, ib, ib,
                                      dA(i, i   ), ldda, dT,    nb,
                                      dA(i, i+ib), ldda, dwork, lddwork);
                    cq_to_panel(MagmaUpper, ib, A(i,i), lda, work+ib*ib);
                }
                else {
                    /* After last panel, update whole trailing matrix. */
                    /* Apply H' to A(i:m,i+ib:n) from the left */
                    magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                      rows, n-i-ib, ib,
                                      dA(i, i   ), ldda, dT,    nb,
                                      dA(i, i+ib), ldda, dwork, lddwork);
                    cq_to_panel(MagmaUpper, ib, A(i,i), lda, work+ib*ib);
                }

                old_i  = i;
                old_ib = ib;
            }
        }
    } else {
        i = 0;
    }
    
    /* Use unblocked code to factor the last or only block. */
    if (i < k) {
        ib = n-i;
        if (i != 0) {
            magma_cgetmatrix_async( m, ib, dA(0,i), ldda, A(0,i), lda, stream[1] );
            magma_queue_sync( stream[1] );
        }
        magma_int_t rows = m-i;
        lapackf77_cgeqrf(&rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info);
    }

    magma_queue_destroy( stream[0] );
    if (current_stream == NULL) {
        magma_queue_destroy( stream[1] );
        magmablasSetKernelStream(NULL);
    }

    magma_free( dA );
    
    return *info;
} /* magma_cgeqrf */
Example #8
0
/**
    Purpose
    -------
    CGEQRF3 computes a QR factorization of a complex M-by-N matrix A:
    A = Q * R.
    
    This version stores the triangular dT matrices used in
    the block QR factorization so that they can be applied directly (i.e.,
    without being recomputed) later. As a result, the application
    of Q is much faster. Also, the upper triangular matrices for V have 0s
    in them and the corresponding parts of the upper triangular R are
    stored separately in dT.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    dA      COMPLEX array on the GPU, dimension (LDDA,N)
            On entry, the M-by-N matrix A.
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further
            Details).

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).
            To benefit from coalescent memory accesses LDDA must be
            divisible by 16.

    @param[out]
    tau     COMPLEX array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    @param[out]
    dT      (workspace) COMPLEX array on the GPU,
            dimension (2*MIN(M, N) + (N+31)/32*32 )*NB,
            where NB can be obtained through magma_get_cgeqrf_nb(M).
            It starts with MIN(M,N)*NB block that store the triangular T
            matrices, followed by the MIN(M,N)*NB block of the diagonal
            matrices for the R matrix. The rest of the array is used as workspace.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.

    Further Details
    ---------------
    The matrix Q is represented as a product of elementary reflectors

       Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).

    @ingroup magma_cgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_cgeqrf3_gpu( magma_int_t m, magma_int_t n,
                  magmaFloatComplex *dA,   magma_int_t ldda,
                  magmaFloatComplex *tau, magmaFloatComplex *dT,
                  magma_int_t *info )
{
    #define dA(a_1,a_2) (dA + (a_2)*(ldda) + (a_1))
    #define dT(a_1)     (dT + (a_1)*nb)
    #define d_ref(a_1)  (dT + (  minmn+(a_1))*nb)
    #define dd_ref(a_1) (dT + (2*minmn+(a_1))*nb)
    #define work(a_1)   (work + (a_1))
    #define hwork       (work + (nb)*(m))

    magma_int_t i, k, minmn, old_i, old_ib, rows, cols;
    magma_int_t ib, nb;
    magma_int_t ldwork, lddwork, lwork, lhwork;
    magmaFloatComplex *work, *ut;

    /* check arguments */
    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    k = minmn = min(m,n);
    if (k == 0)
        return *info;

    nb = magma_get_cgeqrf_nb(m);

    lwork  = (m + n + nb)*nb;
    lhwork = lwork - m*nb;

    if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, lwork )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }
    
    ut = hwork+nb*(n);
    memset( ut, 0, nb*nb*sizeof(magmaFloatComplex));

    magma_queue_t stream[2];
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );

    ldwork = m;
    lddwork= n;

    if ( (nb > 1) && (nb < k) ) {
        /* Use blocked code initially */
        old_i = 0; old_ib = nb;
        for (i = 0; i < k-nb; i += nb) {
            ib = min(k-i, nb);
            rows = m -i;
            magma_cgetmatrix_async( rows, ib,
                                    dA(i,i),  ldda,
                                    work(i), ldwork, stream[1] );
            if (i > 0) {
                /* Apply H' to A(i:m,i+2*ib:n) from the left */
                cols = n-old_i-2*old_ib;
                magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                  m-old_i, cols, old_ib,
                                  dA(old_i, old_i         ), ldda, dT(old_i), nb,
                                  dA(old_i, old_i+2*old_ib), ldda, dd_ref(0),    lddwork);
                
                /* store the diagonal */
                magma_csetmatrix_async( old_ib, old_ib,
                                        ut,           old_ib,
                                        d_ref(old_i), old_ib, stream[0] );
            }

            magma_queue_sync( stream[1] );
            lapackf77_cgeqrf(&rows, &ib, work(i), &ldwork, tau+i, hwork, &lhwork, info);
            /* Form the triangular factor of the block reflector
               H = H(i) H(i+1) . . . H(i+ib-1) */
            lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr,
                              &rows, &ib,
                              work(i), &ldwork, tau+i, hwork, &ib);

            /* Put 0s in the upper triangular part of a panel (and 1s on the
               diagonal); copy the upper triangular in ut.     */
            magma_queue_sync( stream[0] );
            csplit_diag_block3(ib, work(i), ldwork, ut);
            magma_csetmatrix( rows, ib, work(i), ldwork, dA(i,i), ldda );

            if (i + ib < n) {
                /* Send the triangular factor T to the GPU */
                magma_csetmatrix( ib, ib, hwork, ib, dT(i), nb );

                if (i+nb < k-nb) {
                    /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */
                    magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                      rows, ib, ib,
                                      dA(i, i   ), ldda, dT(i),  nb,
                                      dA(i, i+ib), ldda, dd_ref(0), lddwork);
                }
                else {
                    cols = n-i-ib;
                    magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                      rows, cols, ib,
                                      dA(i, i   ), ldda, dT(i),  nb,
                                      dA(i, i+ib), ldda, dd_ref(0), lddwork);
                    /* Fix the diagonal block */
                    magma_csetmatrix( ib, ib, ut, ib, d_ref(i), ib );
                }
                old_i  = i;
                old_ib = ib;
            }
        }
    } else {
        i = 0;
    }

    /* Use unblocked code to factor the last or only block. */
    if (i < k) {
        ib   = n-i;
        rows = m-i;
        magma_cgetmatrix( rows, ib, dA(i, i), ldda, work, rows );
        lhwork = lwork - rows*ib;
        lapackf77_cgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info);
        
        magma_csetmatrix( rows, ib, work, rows, dA(i, i), ldda );
    }

    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[1] );
    magma_free_pinned( work );
    return *info;
} /* magma_cgeqrf_gpu */
Example #9
0
/***************************************************************************//**
    Purpose
    -------
    CUNGQR generates an M-by-N COMPLEX matrix Q with orthonormal columns,
    which is defined as the first N columns of a product of K elementary
    reflectors of order M

          Q  =  H(1) H(2) . . . H(k)

    as returned by CGEQRF.

    This version recomputes the T matrices on the CPU and sends them to the GPU.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix Q. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix Q. M >= N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines the
            matrix Q. N >= K >= 0.

    @param[in,out]
    A       COMPLEX array A, dimension (LDDA,N).
            On entry, the i-th column must contain the vector
            which defines the elementary reflector H(i), for
            i = 1,2,...,k, as returned by CGEQRF_GPU in the
            first k columns of its array argument A.
            On exit, the M-by-N matrix Q.

    @param[in]
    lda     INTEGER
            The first dimension of the array A. LDA >= max(1,M).

    @param[in]
    tau     COMPLEX array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by CGEQRF_GPU.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument has an illegal value

    @ingroup magma_ungqr
*******************************************************************************/
extern "C" magma_int_t
magma_cungqr2(
    magma_int_t m, magma_int_t n, magma_int_t k,
    magmaFloatComplex *A, magma_int_t lda,
    magmaFloatComplex *tau,
    magma_int_t *info)
{
#define  A(i,j) ( A + (i) + (j)*lda )
#define dA(i,j) (dA + (i) + (j)*ldda)

    magmaFloatComplex c_zero = MAGMA_C_ZERO;
    magmaFloatComplex c_one  = MAGMA_C_ONE;

    magma_int_t nb = magma_get_cgeqrf_nb( m, n );

    magma_int_t  m_kk, n_kk, k_kk, mi;
    magma_int_t lwork, ldda;
    magma_int_t i, ib, ki, kk;  //, iinfo;
    magma_int_t lddwork;
    magmaFloatComplex *dA, *dV, *dW, *dT, *T;
    magmaFloatComplex *work;

    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if ((n < 0) || (n > m)) {
        *info = -2;
    } else if ((k < 0) || (k > n)) {
        *info = -3;
    } else if (lda < max(1,m)) {
        *info = -5;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    if (n <= 0) {
        return *info;
    }

    // first kk columns are handled by blocked method.
    // ki is start of 2nd-to-last block
    if ((nb > 1) && (nb < k)) {
        ki = (k - nb - 1) / nb * nb;
        kk = min(k, ki + nb);
    } else {
        ki = 0;
        kk = 0;
    }

    // Allocate GPU work space
    // ldda*n     for matrix dA
    // ldda*nb    for dV
    // lddwork*nb for dW larfb workspace
    ldda    = magma_roundup( m, 32 );
    lddwork = magma_roundup( n, 32 );
    if (MAGMA_SUCCESS != magma_cmalloc( &dA, ldda*n + ldda*nb + lddwork*nb + nb*nb)) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
    dV = dA + ldda*n;
    dW = dA + ldda*n + ldda*nb;
    dT = dA + ldda*n + ldda*nb + lddwork*nb;

    // Allocate CPU work space
    lwork = (n+m+nb) * nb;
    magma_cmalloc_cpu( &work, lwork );

    T = work;

    if (work == NULL) {
        magma_free( dA );
        magma_free_cpu( work );
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }
    magmaFloatComplex *V = work + (n+nb)*nb;

    magma_queue_t queue;
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );

    // Use unblocked code for the last or only block.
    if (kk < n) {
        m_kk = m - kk;
        n_kk = n - kk;
        k_kk = k - kk;
        /*
            lapackf77_cungqr( &m_kk, &n_kk, &k_kk,
                              A(kk, kk), &lda,
                              &tau[kk], work, &lwork, &iinfo );
        */
        lapackf77_clacpy( MagmaFullStr, &m_kk, &k_kk, A(kk,kk), &lda, V, &m_kk);
        lapackf77_claset( MagmaFullStr, &m_kk, &n_kk, &c_zero, &c_one, A(kk, kk), &lda );

        lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr,
                          &m_kk, &k_kk,
                          V, &m_kk, &tau[kk], work, &k_kk);
        lapackf77_clarfb( MagmaLeftStr, MagmaNoTransStr, MagmaForwardStr, MagmaColumnwiseStr,
                          &m_kk, &n_kk, &k_kk,
                          V, &m_kk, work, &k_kk, A(kk, kk), &lda, work+k_kk*k_kk, &n_kk );
        
        if (kk > 0) {
            magma_csetmatrix( m_kk, n_kk,
                              A(kk, kk),  lda,
                              dA(kk, kk), ldda, queue );
        
            // Set A(1:kk,kk+1:n) to zero.
            magmablas_claset( MagmaFull, kk, n - kk, c_zero, c_zero, dA(0, kk), ldda, queue );
        }
    }

    if (kk > 0) {
        // Use blocked code
        // queue: set Aii (V) --> laset --> laset --> larfb --> [next]
        // CPU has no computation
        
        for (i = ki; i >= 0; i -= nb) {
            ib = min(nb, k - i);

            // Send current panel to the GPU
            mi = m - i;
            lapackf77_claset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda );
            magma_csetmatrix_async( mi, ib,
                                    A(i, i), lda,
                                    dV,      ldda, queue );
            lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr,
                              &mi, &ib,
                              A(i,i), &lda, &tau[i], T, &nb);
            magma_csetmatrix_async( ib, ib,
                                    T, nb,
                                    dT, nb, queue );

            // set panel to identity
            magmablas_claset( MagmaFull, i,  ib, c_zero, c_zero, dA(0, i), ldda, queue );
            magmablas_claset( MagmaFull, mi, ib, c_zero, c_one,  dA(i, i), ldda, queue );
            
            magma_queue_sync( queue );
            if (i < n) {
                // Apply H to A(i:m,i:n) from the left
                magma_clarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise,
                                  mi, n-i, ib,
                                  dV,       ldda, dT, nb,
                                  dA(i, i), ldda, dW, lddwork, queue );
            }
        }
    
        // copy result back to CPU
        magma_cgetmatrix( m, n,
                          dA(0, 0), ldda, A(0, 0), lda, queue );
    }

    magma_queue_destroy( queue );
    magma_free( dA );
    magma_free_cpu( work );

    return *info;
} /* magma_cungqr */
Example #10
0
/**
    Purpose
    -------
    CGEQRF4 computes a QR factorization of a COMPLEX M-by-N matrix A:
    A = Q * R using multiple GPUs. This version does not require work space on the GPU
    passed as input. GPU memory is allocated in the routine.

    Arguments
    ---------
    @param[in]
    num_gpus INTEGER
            The number of GPUs to be used for the factorization.

    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    A       COMPLEX array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further
            Details).
    \n
            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    @param[out]
    tau     COMPLEX array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    @param[out]
    work    (workspace) COMPLEX array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
    \n
            Higher performance is achieved if WORK is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.  LWORK >= N*NB,
            where NB can be obtained through magma_get_cgeqrf_nb(M).
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.

    Further Details
    ---------------
    The matrix Q is represented as a product of elementary reflectors

       Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).

    @ingroup magma_cgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_cgeqrf4(magma_int_t num_gpus, magma_int_t m, magma_int_t n,
              magmaFloatComplex *A,    magma_int_t lda, magmaFloatComplex *tau,
              magmaFloatComplex *work, magma_int_t lwork,
              magma_int_t *info )
{
    magmaFloatComplex *da[MagmaMaxGPUs];
    magmaFloatComplex c_one = MAGMA_C_ONE;

    int i, k, ldda;

    *info = 0;
    int nb = magma_get_cgeqrf_nb(min(m, n));

    int lwkopt = n * nb;
    work[0] = MAGMA_C_MAKE( (float)lwkopt, 0 );
    int lquery = (lwork == -1);
    if (num_gpus < 0 || num_gpus > MagmaMaxGPUs) {
        *info = -1;
    } else if (m < 0) {
        *info = -2;
    } else if (n < 0) {
        *info = -3;
    } else if (lda < max(1,m)) {
        *info = -5;
    } else if (lwork < max(1,n) && ! lquery) {
        *info = -8;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;

    k = min(m,n);
    if (k == 0) {
        work[0] = c_one;
        return *info;
    }

    ldda    = ((m+31)/32)*32;

    magma_int_t  n_local[MagmaMaxGPUs];
    for (i=0; i < num_gpus; i++) {
        n_local[i] = ((n/nb)/num_gpus)*nb;
        if (i < (n/nb)%num_gpus)
            n_local[i] += nb;
        else if (i == (n/nb)%num_gpus)
            n_local[i] += n%nb;

        magma_setdevice(i);
        
        // TODO on failure, free previously allocated memory
        if (MAGMA_SUCCESS != magma_cmalloc( &da[i], ldda*n_local[i] )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
    }

    if (m > nb && n > nb) {
        /* Copy the matrix to the GPUs in 1D block cyclic distribution */
        magma_csetmatrix_1D_col_bcyclic(m, n, A, lda, da, ldda, num_gpus, nb);

        /* Factor using the GPU interface */
        magma_cgeqrf2_mgpu( num_gpus, m, n, da, ldda, tau, info);

        /* Copy the matrix back from the GPUs to the CPU */
        magma_cgetmatrix_1D_col_bcyclic(m, n, da, ldda, A, lda, num_gpus, nb);
    }
    else {
        lapackf77_cgeqrf(&m, &n, A, &lda, tau, work, &lwork, info);
    }


    /* Free the allocated GPU memory */
    for (i=0; i < num_gpus; i++) {
        magma_setdevice(i);
        magma_free( da[i] );
    }

    return *info;
} /* magma_cgeqrf4 */
Example #11
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing cunmqr
*/
int main( int argc, char** argv )
{
    TESTING_INIT();
    
    real_Double_t   gflops, gpu_perf, gpu_time, cpu_perf, cpu_time;
    float error, work[1];
    magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;
    magma_int_t ione = 1;
    magma_int_t m, n, k, size, info;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t nb, ldc, lda, lwork, lwork_max;
    magmaFloatComplex *C, *R, *A, *W, *tau;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    // test all combinations of input parameters
    const char* side[]   = { MagmaLeftStr,      MagmaRightStr   };
    const char* trans[]  = { MagmaConjTransStr, MagmaNoTransStr };

    printf("    M     N     K  side   trans      CPU GFlop/s (sec)   GPU GFlop/s (sec)   ||R||_F / ||QC||_F\n");
    printf("===============================================================================================\n");
    for( int i = 0; i < opts.ntest; ++i ) {
        for( int iside = 0; iside < 2; ++iside ) {
        for( int itran = 0; itran < 2; ++itran ) {
            m = opts.msize[i];
            n = opts.nsize[i];
            k = opts.ksize[i];
            nb  = magma_get_cgeqrf_nb( m );
            ldc = ((m + 31)/32)*32;
            lda = ((max(m,n) + 31)/32)*32;
            gflops = FLOPS_CUNMQR( m, n, k, *side[iside] ) / 1e9;
            
            if ( *side[iside] == 'L' && m < k ) {
                printf( "%5d %5d %5d  %-5s  %-9s   skipping because side=left and m < k\n",
                        (int) m, (int) n, (int) k, side[iside], trans[itran] );
                continue;
            }
            if ( *side[iside] == 'R' && n < k ) {
                printf( "%5d %5d %5d  %-5s  %-9s   skipping because side=right and n < k\n",
                        (int) m, (int) n, (int) k, side[iside], trans[itran] );
                continue;
            }
            
            lwork_max = max( m*nb, n*nb );
            
            TESTING_MALLOC( C, magmaFloatComplex, ldc*n );
            TESTING_MALLOC( R, magmaFloatComplex, ldc*n );
            TESTING_MALLOC( A, magmaFloatComplex, lda*k );
            TESTING_MALLOC( W, magmaFloatComplex, lwork_max );
            TESTING_MALLOC( tau, magmaFloatComplex, k );
            
            // C is full, m x n
            size = ldc*n;
            lapackf77_clarnv( &ione, ISEED, &size, C );
            lapackf77_clacpy( "Full", &m, &n, C, &ldc, R, &ldc );
            //magma_csetmatrix( m,   n, C, ldc, dC, ldc );
            
            // A is m x k (left) or n x k (right)
            lda = (*side[iside] == 'L' ? m : n);
            size = lda*k;
            lapackf77_clarnv( &ione, ISEED, &size, A );
            
            // compute QR factorization to get Householder vectors in A, tau
            magma_cgeqrf( lda, k, A, lda, tau, W, lwork_max, &info );
            if (info != 0)
                printf("magma_cgeqrf returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            
            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            cpu_time = magma_wtime();
            lapackf77_cunmqr( side[iside], trans[itran],
                              &m, &n, &k,
                              A, &lda, tau, C, &ldc, W, &lwork_max, &info );
            cpu_time = magma_wtime() - cpu_time;
            cpu_perf = gflops / cpu_time;
            if (info != 0)
                printf("lapackf77_cunmqr returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            
            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            // query for work size
            lwork = -1;
            magma_cunmqr( *side[iside], *trans[itran],
                          m, n, k,
                          A, lda, tau, R, ldc, W, lwork, &info );
            if (info != 0)
                printf("magma_cunmqr (lwork query) returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            lwork = (magma_int_t) MAGMA_C_REAL( W[0] );
            if ( lwork < 0 || lwork > lwork_max )
                printf("invalid lwork %d, lwork_max %d\n", (int) lwork, (int) lwork_max );
            
            gpu_time = magma_wtime();
            magma_cunmqr( *side[iside], *trans[itran],
                          m, n, k,
                          A, lda, tau, R, ldc, W, lwork, &info );
            gpu_time = magma_wtime() - gpu_time;
            gpu_perf = gflops / gpu_time;
            if (info != 0)
                printf("magma_cunmqr returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            
            //magma_cgetmatrix( m, n, dC, ldc, R, ldc );
            
            /* =====================================================================
               compute relative error |QC_magma - QC_lapack| / |QC_lapack|
               =================================================================== */
            error = lapackf77_clange( "Fro", &m, &n, C, &ldc, work );
            size = ldc*n;
            blasf77_caxpy( &size, &c_neg_one, C, &ione, R, &ione );
            error = lapackf77_clange( "Fro", &m, &n, R, &ldc, work ) / error;
            
            printf( "%5d %5d %5d  %-5s  %-9s  %7.2f (%7.2f)   %7.2f (%7.2f)   %8.2e\n",
                    (int) m, (int) n, (int) k, side[iside], trans[itran],
                    cpu_perf, cpu_time, gpu_perf, gpu_time, error );
            
            TESTING_FREE( C );
            TESTING_FREE( R );
            TESTING_FREE( A );
            TESTING_FREE( W );
            TESTING_FREE( tau );
        }}  // end iside, itran
        printf( "\n" );
    }
    
    TESTING_FINALIZE();
    return 0;
}
Example #12
0
extern "C" magma_int_t
magma_zcgeqrsv_gpu(magma_int_t m, magma_int_t n, magma_int_t nrhs,
                   magmaDoubleComplex *dA,  magma_int_t ldda,
                   magmaDoubleComplex *dB,  magma_int_t lddb,
                   magmaDoubleComplex *dX,  magma_int_t lddx,
                   magma_int_t *iter, magma_int_t *info)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    ZCGEQRSV solves the least squares problem
       min || A*X - B ||,
    where A is an M-by-N matrix and X and B are M-by-NRHS matrices.

    ZCGEQRSV first attempts to factorize the matrix in complex SINGLE PRECISION
    and use this factorization within an iterative refinement procedure
    to produce a solution with complex DOUBLE PRECISION norm-wise backward error
    quality (see below). If the approach fails the method switches to a
    complex DOUBLE PRECISION factorization and solve.

    The iterative refinement is not going to be a winning strategy if
    the ratio complex SINGLE PRECISION performance over complex DOUBLE PRECISION
    performance is too small. A reasonable strategy should take the
    number of right-hand sides and the size of the matrix into account.
    This might be done with a call to ILAENV in the future. Up to now, we
    always try iterative refinement.
    
    The iterative refinement process is stopped if
        ITER > ITERMAX
    or for all the RHS we have:
        RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX
    where
        o ITER is the number of the current iteration in the iterative
          refinement process
        o RNRM is the infinity-norm of the residual
        o XNRM is the infinity-norm of the solution
        o ANRM is the infinity-operator-norm of the matrix A
        o EPS is the machine epsilon returned by DLAMCH('Epsilon')
    The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A. M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A. M >= N >= 0.

    NRHS    (input) INTEGER
            The number of right hand sides, i.e., the number of columns
            of the matrix B.  NRHS >= 0.

    dA      (input or input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N)
            On entry, the M-by-N coefficient matrix A.
            On exit, if iterative refinement has been successfully used
            (info.EQ.0 and ITER.GE.0, see description below), A is
            unchanged. If double precision factorization has been used
            (info.EQ.0 and ITER.LT.0, see description below), then the
            array dA contains the QR factorization of A as returned by
            function DGEQRF_GPU.

    LDDA    (input) INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).

    dB      (input or input/output) COMPLEX_16 array on the GPU, dimension (LDDB,NRHS)
            The M-by-NRHS right hand side matrix B.
            May be overwritten (e.g., if refinement fails).

    LDDB    (input) INTEGER
            The leading dimension of the array dB.  LDDB >= max(1,M).

    dX      (output) COMPLEX_16 array on the GPU, dimension (LDDX,NRHS)
            If info = 0, the N-by-NRHS solution matrix X.

    LDDX    (input) INTEGER
            The leading dimension of the array dX.  LDDX >= max(1,N).

    ITER    (output) INTEGER
            < 0: iterative refinement has failed, double precision
                 factorization has been performed
                 -1 : the routine fell back to full precision for
                      implementation- or machine-specific reasons
                 -2 : narrowing the precision induced an overflow,
                      the routine fell back to full precision
                 -3 : failure of SGEQRF
                 -31: stop the iterative refinement after the 30th iteration
            > 0: iterative refinement has been successfully used.
                 Returns the number of iterations

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if info = -i, the i-th argument had an illegal value

    =====================================================================    */

    #define dB(i,j)     (dB + (i) + (j)*lddb)
    #define dX(i,j)     (dX + (i) + (j)*lddx)
    #define dR(i,j)     (dR + (i) + (j)*lddr)
    #define dSX(i,j)    (dSX + (i) + (j)*lddsx)
    
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magma_int_t     ione  = 1;
    magmaDoubleComplex *dworkd, *hworkd;
    magmaFloatComplex  *dworks, *hworks;
    magmaDoubleComplex *dR, *tau, *dT;
    magmaFloatComplex  *dSA, *dSX, *dST, *stau;
    magmaDoubleComplex Xnrmv, Rnrmv;
    double          Anrm, Xnrm, Rnrm, cte, eps;
    magma_int_t     i, j, iiter, lddsa, lddsx, lddr, nb, lhwork, minmn, size, ldworkd;

    /* Check arguments */
    *iter = 0;
    *info = 0;
    if ( m < 0 )
        *info = -1;
    else if ( n < 0 || n > m )
        *info = -2;
    else if ( nrhs < 0 )
        *info = -3;
    else if ( ldda < max(1,m))
        *info = -5;
    else if ( lddb < max(1,m))
        *info = -7;
    else if ( lddx < max(1,n))
        *info = -9;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    if ( m == 0 || n == 0 || nrhs == 0 )
        return *info;

    nb   = magma_get_cgeqrf_nb(m);
    minmn= min(m, n);
    
    /* dSX contains both B and X, so must be max(m or lddb,n). */
    lddsa = ldda;
    lddsx = max(lddb,n);
    lddr  = lddb;
    
    /*
     * Allocate temporary buffers
     */
    /* dworks(dSA + dSX + dST) */
    size = lddsa*n + lddsx*nrhs + ( 2*minmn + ((n+31)/32)*32 )*nb;
    if (MAGMA_SUCCESS != magma_cmalloc( &dworks, size )) {
        fprintf(stderr, "Allocation of dworks failed (%d)\n", (int) size);
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
    dSA = dworks;
    dSX = dSA + lddsa*n;
    dST = dSX + lddsx*nrhs;

    /* dworkd(dR) = lddr*nrhs */
    ldworkd = lddr*nrhs;
    if (MAGMA_SUCCESS != magma_zmalloc( &dworkd, ldworkd )) {
        magma_free( dworks );
        fprintf(stderr, "Allocation of dworkd failed\n");
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
    dR = dworkd;

    /* hworks(workspace for cgeqrs + stau) = min(m,n) + lhworks */
    lhwork = (m - n + nb)*(nrhs + nb) + nrhs*nb;
    size = lhwork + minmn;
    magma_cmalloc_cpu( &hworks, size );
    if ( hworks == NULL ) {
        magma_free( dworks );
        magma_free( dworkd );
        fprintf(stderr, "Allocation of hworks failed\n");
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }
    stau = hworks + lhwork;

    eps  = lapackf77_dlamch("Epsilon");
    Anrm = magmablas_zlange('I', m, n, dA, ldda, (double*)dworkd );
    cte  = Anrm * eps * pow((double)n, 0.5) * BWDMAX;

    /*
     * Convert to single precision
     */
    magmablas_zlag2c( m, nrhs, dB, lddb, dSX, lddsx, info );
    if (*info != 0) {
        *iter = -2;
        goto FALLBACK;
    }

    magmablas_zlag2c( m, n, dA, ldda, dSA, lddsa, info );
    if (*info != 0) {
        *iter = -2;
        goto FALLBACK;
    }

    // factor dSA in single precision
    magma_cgeqrf_gpu( m, n, dSA, lddsa, stau, dST, info );
    if (*info != 0) {
        *iter = -3;
        goto FALLBACK;
    }

    // solve dSA*dSX = dB in single precision
    magma_cgeqrs_gpu( m, n, nrhs, dSA, lddsa, stau, dST, dSX, lddsx, hworks, lhwork, info );
    if (*info != 0) {
        *iter = -3;
        goto FALLBACK;
    }

    // residual dR = dB - dA*dX in double precision
    magmablas_clag2z( n, nrhs, dSX, lddsx, dX, lddx, info );
    magmablas_zlacpy( MagmaUpperLower, m, nrhs, dB, lddb, dR, lddr );
    if ( nrhs == 1 ) {
        magma_zgemv( MagmaNoTrans, m, n,
                     c_neg_one, dA, ldda,
                                dX, 1,
                     c_one,     dR, 1 );
    }
    else {
        magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, nrhs, n,
                     c_neg_one, dA, ldda,
                                dX, lddx,
                     c_one,     dR, lddr );
    }

    // TODO: use MAGMA_Z_ABS( dX(i,j) ) instead of zlange?
    for( j=0; j < nrhs; j++ ) {
        i = magma_izamax( n, dX(0,j), 1) - 1;
        magma_zgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 );
        Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, NULL );

        i = magma_izamax ( m, dR(0,j), 1 ) - 1;
        magma_zgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 );
        Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, NULL );

        if ( Rnrm >  Xnrm*cte ) {
            goto REFINEMENT;
        }
    }

    *iter = 0;

    /* Free workspaces */
    magma_free( dworks );
    magma_free( dworkd );
    magma_free_cpu( hworks );
    return *info;

REFINEMENT:
    /* TODO: this iterative refinement algorithm works only for compatibile
     * systems (B in colspan of A).
     * See Matrix Computations (3rd ed) p. 267 for correct algorithm. */
    for( iiter=1; iiter < ITERMAX; ) {
        *info = 0;
        // convert residual dR to single precision dSX
        magmablas_zlag2c( m, nrhs, dR, lddr, dSX, lddsx, info );
        if (*info != 0) {
            *iter = -2;
            goto FALLBACK;
        }
        // solve dSA*dSX = R in single precision
        magma_cgeqrs_gpu( m, n, nrhs, dSA, lddsa, stau, dST, dSX, lddsx, hworks, lhwork, info );
        if (*info != 0) {
            *iter = -3;
            goto FALLBACK;
        }

        // Add correction and setup residual
        // dX += dSX [including conversion]  --and--
        // dR[1:n] = dB[1:n]   (only n rows, not whole m rows! -- useless if m > n)
        for( j=0; j < nrhs; j++ ) {
            magmablas_zcaxpycp( n, dSX(0,j), dX(0,j), dB(0,j), dR(0,j) );
        }
        // dR = dB  (whole m rows)
        magmablas_zlacpy( MagmaUpperLower, m, nrhs, dB, lddb, dR, lddr );
        
        // residual dR = dB - dA*dX in double precision
        if ( nrhs == 1 ) {
            magma_zgemv( MagmaNoTrans, m, n,
                         c_neg_one, dA, ldda,
                                    dX, 1,
                         c_one,     dR, 1 );
        }
        else {
            magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, nrhs, n,
                         c_neg_one, dA, ldda,
                                    dX, lddx,
                         c_one,     dR, lddr );
        }

        /*  Check whether the nrhs normwise backward errors satisfy the
         *  stopping criterion. If yes, set ITER=IITER>0 and return. */
        for( j=0; j < nrhs; j++ ) {
            i = magma_izamax( n, dX(0,j), 1) - 1;
            magma_zgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 );
            Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, NULL );

            i = magma_izamax ( m, dR(0,j), 1 ) - 1;
            magma_zgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 );
            Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, NULL );

            if ( Rnrm >  Xnrm*cte ) {
                goto L20;
            }
        }

        /*  If we are here, the nrhs normwise backward errors satisfy
         *  the stopping criterion, we are good to exit. */
        *iter = iiter;

        /* Free workspaces */
        magma_free( dworks );
        magma_free( dworkd );
        magma_free_cpu( hworks );
        return *info;
        
      L20:
        iiter++;
    }

    /* If we are at this place of the code, this is because we have
     * performed ITER=ITERMAX iterations and never satisified the
     * stopping criterion. Set up the ITER flag accordingly and follow
     * up on double precision routine. */
    *iter = -ITERMAX - 1;
    
FALLBACK:
    /* Single-precision iterative refinement failed to converge to a
     * satisfactory solution, so we resort to double precision. */
    magma_free( dworks );
    magma_free_cpu( hworks );

    /*
     * Allocate temporary buffers
     */
    /* dworkd = dT for zgeqrf */
    nb   = magma_get_zgeqrf_nb( m );
    size = (2*min(m, n) + (n+31)/32*32 )*nb;
    if ( size > ldworkd ) {
        magma_free( dworkd );
        if (MAGMA_SUCCESS != magma_zmalloc( &dworkd, size )) {
            fprintf(stderr, "Allocation of dworkd2 failed\n");
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
    }
    dT = dworkd;

    /* hworkd(dtau + workspace for zgeqrs) = min(m,n) + lhwork */
    size = lhwork + minmn;
    magma_zmalloc_cpu( &hworkd, size );
    if ( hworkd == NULL ) {
        magma_free( dworkd );
        fprintf(stderr, "Allocation of hworkd2 failed\n");
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }
    tau = hworkd + lhwork;

    magma_zgeqrf_gpu( m, n, dA, ldda, tau, dT, info );
    if (*info == 0) {
        // if m > n, then dB won't fit in dX, so solve with dB and copy n rows to dX
        magma_zgeqrs_gpu( m, n, nrhs, dA, ldda, tau, dT, dB, lddb, hworkd, lhwork, info );
        magmablas_zlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dX, lddx );
    }

    magma_free( dworkd );
    magma_free_cpu( hworkd );
    return *info;
}
Example #13
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing cgelqf_gpu
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    real_Double_t    gflops, gpu_perf, gpu_time, cpu_perf, cpu_time;
    float           error, work[1];
    magmaFloatComplex  c_neg_one = MAGMA_C_NEG_ONE;
    magmaFloatComplex *h_A, *h_R, *tau, *h_work, tmp[1];
    magmaFloatComplex *d_A;
    magma_int_t M, N, n2, lda, lwork, info, min_mn, nb;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};

    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    printf("    M     N   CPU GFlop/s (sec)   GPU GFlop/s (sec)   ||R||_F / ||A||_F\n");
    printf("=======================================================================\n");
    for( int i = 0; i < opts.ntest; ++i ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[i];
            N = opts.nsize[i];
            min_mn = min(M, N);
            lda    = M;
            n2     = lda*N;
            nb     = magma_get_cgeqrf_nb(M);
            gflops = FLOPS_CGELQF( M, N ) / 1e9;
            
            // query for workspace size
            lwork = -1;
            lapackf77_cgelqf(&M, &N, h_A, &M, tau, tmp, &lwork, &info);
            lwork = (magma_int_t)MAGMA_C_REAL( tmp[0] );
            lwork = max( lwork, M*nb );
            
            TESTING_MALLOC(    tau,    magmaFloatComplex, min_mn );
            TESTING_MALLOC(    h_A,    magmaFloatComplex, n2     );
            TESTING_HOSTALLOC( h_R,    magmaFloatComplex, n2     );
            TESTING_DEVALLOC(  d_A,    magmaFloatComplex, lda*N  );
            TESTING_HOSTALLOC( h_work, magmaFloatComplex, lwork  );
            
            /* Initialize the matrix */
            lapackf77_clarnv( &ione, ISEED, &n2, h_A );
            lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda );
            
            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            magma_csetmatrix( M, N, h_R, lda, d_A, lda );
            gpu_time = magma_wtime();
            magma_cgelqf_gpu( M, N, d_A, lda, tau, h_work, lwork, &info);
            gpu_time = magma_wtime() - gpu_time;
            gpu_perf = gflops / gpu_time;
            if (info != 0)
                printf("magma_cgelqf_gpu returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            
            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            cpu_time = magma_wtime();
            lapackf77_cgelqf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info);
            cpu_time = magma_wtime() - cpu_time;
            cpu_perf = gflops / cpu_time;
            if (info != 0)
                printf("lapack_cgelqf returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            
            /* =====================================================================
               Check the result compared to LAPACK
               =================================================================== */
            magma_cgetmatrix( M, N, d_A, lda, h_R, lda );
            error = lapackf77_clange("f", &M, &N, h_A, &lda, work);
            blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione);
            error = lapackf77_clange("f", &M, &N, h_R, &lda, work) / error;
            
            printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)   %8.2e\n",
                   (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error );
            
            TESTING_FREE( tau );
            TESTING_FREE( h_A );
            TESTING_HOSTFREE( h_R );
            TESTING_HOSTFREE( h_work );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    TESTING_FINALIZE();
    return 0;
}
Example #14
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing cgelqf
*/
int main( int argc, char** argv)
{
    TESTING_CUDA_INIT();

    magma_timestr_t       start, end;
    float           flops, gpu_perf, cpu_perf;
    float           matnorm, work[1];
    cuFloatComplex  c_neg_one = MAGMA_C_NEG_ONE;
    cuFloatComplex *h_A, *h_R, *tau, *h_work, tmp[1];

    /* Matrix size */
    magma_int_t M = 0, N = 0, n2, lda, lwork;
    magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112};

    magma_int_t i, info, min_mn, nb;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};

    if (argc != 1){
        for(i = 1; i<argc; i++){
            if (strcmp("-N", argv[i])==0)
                N = atoi(argv[++i]);
            else if (strcmp("-M", argv[i])==0)
                M = atoi(argv[++i]);
        }
        if ( M == 0 ) {
            M = N;
        }
        if ( N == 0 ) {
            N = M;
        }
        if (N>0 && M>0)
            printf("  testing_cgelqf -M %d -N %d\n\n", (int) M, (int) N);
        else
            {
                printf("\nUsage: \n");
                printf("  testing_cgelqf -M %d -N %d\n\n", (int) M, (int) N);
                exit(1);
            }
    }
    else {
        printf("\nUsage: \n");
        printf("  testing_cgelqf -M %d -N %d\n\n", 1024, 1024);
        M = N = size[9];
    }

    n2  = M * N;
    min_mn = min(M, N);
    nb = magma_get_cgeqrf_nb(M);

    TESTING_MALLOC(    tau, cuFloatComplex, min_mn );
    TESTING_MALLOC(    h_A, cuFloatComplex, n2     );
    TESTING_HOSTALLOC( h_R, cuFloatComplex, n2     );

    lwork = -1;
    lapackf77_cgelqf(&M, &N, h_A, &M, tau, tmp, &lwork, &info);
    lwork = (magma_int_t)MAGMA_C_REAL( tmp[0] );
    lwork = max( lwork, M*nb );

    TESTING_HOSTALLOC( h_work, cuFloatComplex, lwork );

    printf("  M     N   CPU GFlop/s   GPU GFlop/s    ||R||_F / ||A||_F\n");
    printf("==========================================================\n");
    for(i=0; i<10; i++){
        if (argc == 1){
            M = N = size[i];
        }
        min_mn= min(M, N);
        lda   = M;
        n2    = lda*N;
        flops = FLOPS( (float)M, (float)N ) / 1000000;

        /* Initialize the matrix */
        lapackf77_clarnv( &ione, ISEED, &n2, h_A );
        lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda );

        /* ====================================================================
           Performs operation using MAGMA
           =================================================================== */
        start = get_current_time();
        magma_cgelqf( M, N, h_R, lda, tau, h_work, lwork, &info);
        end = get_current_time();
        if (info < 0)
            printf("Argument %d of magma_cgelqf had an illegal value.\n", (int) -info);
        
        gpu_perf = flops / GetTimerValue(start, end);

        /* =====================================================================
           Performs operation using LAPACK
           =================================================================== */
        start = get_current_time();
        lapackf77_cgelqf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info);
        end = get_current_time();
        if (info < 0)
            printf("Argument %d of lapack_cgelqf had an illegal value.\n", (int) -info);
        
        cpu_perf = flops / GetTimerValue(start, end);

        /* =====================================================================
           Check the result compared to LAPACK
           =================================================================== */
        matnorm = lapackf77_clange("f", &M, &N, h_A, &lda, work);
        blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione);

        printf("%5d %5d  %6.2f         %6.2f        %e\n",
               (int) M, (int) N, cpu_perf, gpu_perf,
               lapackf77_clange("f", &M, &N, h_R, &lda, work) / matnorm);

        if (argc != 1)
            break;
    }

    /* Memory clean up */
    TESTING_FREE( tau );
    TESTING_FREE( h_A );
    TESTING_HOSTFREE( h_R );
    TESTING_HOSTFREE( h_work );

    /* Shutdown */
    TESTING_CUDA_FINALIZE();
}
Example #15
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing cunmqr_gpu
*/
int main( int argc, char** argv )
{
    TESTING_INIT();
    
    real_Double_t   gflops, gpu_perf, gpu_time, cpu_perf, cpu_time;
    float error, work[1];
    magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;
    magma_int_t ione = 1;
    magma_int_t m, n, k, size, info;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t nb, ldc, lda, lwork, lwork_max, dt_size;
    magmaFloatComplex *C, *R, *A, *W, *tau;
    magmaFloatComplex_ptr dC, dA, dT;
    magma_int_t status = 0;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    float tol = 2. * opts.tolerance * lapackf77_slamch("E");
    
    // test all combinations of input parameters
    magma_side_t  side [] = { MagmaLeft,       MagmaRight   };
    magma_trans_t trans[] = { Magma_ConjTrans, MagmaNoTrans };

    printf("    M     N     K   side   trans   CPU GFlop/s (sec)   GPU GFlop/s (sec)   ||R||_F / ||QC||_F\n");
    printf("===============================================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
      for( int iside = 0; iside < 2; ++iside ) {
      for( int itran = 0; itran < 2; ++itran ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {        
            m = opts.msize[itest];
            n = opts.nsize[itest];
            k = opts.ksize[itest];
            nb  = magma_get_cgeqrf_nb( m );
            ldc = ((m + 31)/32)*32;
            lda = ((max(m,n) + 31)/32)*32;
            gflops = FLOPS_CUNMQR( m, n, k, side[iside] ) / 1e9;
            
            if ( side[iside] == MagmaLeft && m < k ) {
                printf( "%5d %5d %5d   %4c   %5c   skipping because side=left  and m < k\n",
                        (int) m, (int) n, (int) k,
                        lapacke_side_const( side[iside] ),
                        lapacke_trans_const( trans[itran] ) );
                continue;
            }
            if ( side[iside] == MagmaRight && n < k ) {
                printf( "%5d %5d %5d   %4c   %5c   skipping because side=right and n < k\n",
                        (int) m, (int) n, (int) k,
                        lapacke_side_const( side[iside] ),
                        lapacke_trans_const( trans[itran] ) );
                continue;
            }
            
            if ( side[iside] == MagmaLeft ) {
                // side = left
                lwork_max = (m - k + nb)*(n + nb) + n*nb;
                dt_size = ( 2*min(m,k) + ((max(m,n) + 31)/32)*32 )*nb;
            }
            else {
                // side = right
                lwork_max = (n - k + nb)*(m + nb) + m*nb;
                dt_size = ( 2*min(n,k) + ((max(m,n) + 31)/32)*32 )*nb;
            }
            
            TESTING_MALLOC_CPU( C,   magmaFloatComplex, ldc*n );
            TESTING_MALLOC_CPU( R,   magmaFloatComplex, ldc*n );
            TESTING_MALLOC_CPU( A,   magmaFloatComplex, lda*k );
            TESTING_MALLOC_CPU( W,   magmaFloatComplex, lwork_max );
            TESTING_MALLOC_CPU( tau, magmaFloatComplex, k );
            
            TESTING_MALLOC_DEV( dC, magmaFloatComplex, ldc*n );
            TESTING_MALLOC_DEV( dA, magmaFloatComplex, lda*k );
            TESTING_MALLOC_DEV( dT, magmaFloatComplex, dt_size );
            
            // C is full, m x n
            size = ldc*n;
            lapackf77_clarnv( &ione, ISEED, &size, C );
            magma_csetmatrix( m, n, C, ldc, dC, ldc );
            
            // A is m x k (left) or n x k (right)
            lda = (side[iside] == MagmaLeft ? m : n);
            size = lda*k;
            lapackf77_clarnv( &ione, ISEED, &size, A );
            
            // compute QR factorization to get Householder vectors in dA, tau, dT
            magma_csetmatrix( lda, k, A,  lda, dA, lda );
            magma_cgeqrf_gpu( lda, k, dA, lda, tau, dT, &info );
            magma_cgetmatrix( lda, k, dA, lda, A,  lda );
            if (info != 0)
                printf("magma_cgeqrf_gpu returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            
            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            cpu_time = magma_wtime();
            lapackf77_cunmqr( lapack_side_const( side[iside] ), lapack_trans_const( trans[itran] ),
                              &m, &n, &k,
                              A, &lda, tau, C, &ldc, W, &lwork_max, &info );
            cpu_time = magma_wtime() - cpu_time;
            cpu_perf = gflops / cpu_time;
            if (info != 0)
                printf("lapackf77_cunmqr returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            
            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            // query for workspace size
            lwork = -1;
            magma_cunmqr_gpu( side[iside], trans[itran],
                              m, n, k,
                              dA, lda, tau, dC, ldc, W, lwork, dT, nb, &info );
            if (info != 0)
                printf("magma_cunmqr_gpu (lwork query) returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            lwork = (magma_int_t) MAGMA_C_REAL( W[0] );
            if ( lwork < 0 || lwork > lwork_max )
                printf("invalid lwork %d, lwork_max %d\n", (int) lwork, (int) lwork_max );
            
            gpu_time = magma_sync_wtime( 0 );  // sync needed for L,N and R,T cases
            magma_cunmqr_gpu( side[iside], trans[itran],
                              m, n, k,
                              dA, lda, tau, dC, ldc, W, lwork, dT, nb, &info );
            gpu_time = magma_sync_wtime( 0 ) - gpu_time;
            gpu_perf = gflops / gpu_time;
            if (info != 0)
                printf("magma_cunmqr_gpu returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            
            magma_cgetmatrix( m, n, dC, ldc, R, ldc );
            
            /* =====================================================================
               compute relative error |QC_magma - QC_lapack| / |QC_lapack|
               =================================================================== */
            error = lapackf77_clange( "Fro", &m, &n, C, &ldc, work );
            size = ldc*n;
            blasf77_caxpy( &size, &c_neg_one, C, &ione, R, &ione );
            error = lapackf77_clange( "Fro", &m, &n, R, &ldc, work ) / error;
            
            printf( "%5d %5d %5d   %4c   %5c   %7.2f (%7.2f)   %7.2f (%7.2f)   %8.2e   %s\n",
                    (int) m, (int) n, (int) k,
                    lapacke_side_const( side[iside] ),
                    lapacke_trans_const( trans[itran] ),
                    cpu_perf, cpu_time, gpu_perf, gpu_time,
                    error, (error < tol ? "ok" : "failed") );
            status += ! (error < tol);
            
            TESTING_FREE_CPU( C );
            TESTING_FREE_CPU( R );
            TESTING_FREE_CPU( A );
            TESTING_FREE_CPU( W );
            TESTING_FREE_CPU( tau );
            
            TESTING_FREE_DEV( dC );
            TESTING_FREE_DEV( dA );
            TESTING_FREE_DEV( dT );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
      }}  // end iside, itran
      printf( "\n" );
    }
    
    TESTING_FINALIZE();
    return status;
}
Example #16
0
/**
    Purpose
    -------
    ZCGEQRSV solves the least squares problem
       min || A*X - B ||,
    where A is an M-by-N matrix and X and B are M-by-NRHS matrices.

    ZCGEQRSV first attempts to factorize the matrix in complex SINGLE PRECISION
    and use this factorization within an iterative refinement procedure
    to produce a solution with complex DOUBLE PRECISION norm-wise backward error
    quality (see below). If the approach fails the method switches to a
    complex DOUBLE PRECISION factorization and solve.

    The iterative refinement is not going to be a winning strategy if
    the ratio complex SINGLE PRECISION performance over complex DOUBLE PRECISION
    performance is too small. A reasonable strategy should take the
    number of right-hand sides and the size of the matrix into account.
    This might be done with a call to ILAENV in the future. Up to now, we
    always try iterative refinement.
    
    The iterative refinement process is stopped if
        ITER > ITERMAX
    or for all the RHS we have:
        RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX
    where
        o ITER is the number of the current iteration in the iterative
          refinement process
        o RNRM is the infinity-norm of the residual
        o XNRM is the infinity-norm of the solution
        o ANRM is the infinity-operator-norm of the matrix A
        o EPS is the machine epsilon returned by DLAMCH('Epsilon')
    The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A. M >= N >= 0.

    @param[in]
    nrhs    INTEGER
            The number of right hand sides, i.e., the number of columns
            of the matrix B.  NRHS >= 0.

    @param[in,out]
    dA      COMPLEX_16 array on the GPU, dimension (LDDA,N)
            On entry, the M-by-N coefficient matrix A.
            On exit, if iterative refinement has been successfully used
            (info.EQ.0 and ITER.GE.0, see description below), A is
            unchanged. If double precision factorization has been used
            (info.EQ.0 and ITER.LT.0, see description below), then the
            array dA contains the QR factorization of A as returned by
            function DGEQRF_GPU.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).

    @param[in,out]
    dB      COMPLEX_16 array on the GPU, dimension (LDDB,NRHS)
            The M-by-NRHS right hand side matrix B.
            May be overwritten (e.g., if refinement fails).

    @param[in]
    lddb    INTEGER
            The leading dimension of the array dB.  LDDB >= max(1,M).

    @param[out]
    dX      COMPLEX_16 array on the GPU, dimension (LDDX,NRHS)
            If info = 0, the N-by-NRHS solution matrix X.

    @param[in]
    lddx    INTEGER
            The leading dimension of the array dX.  LDDX >= max(1,N).

    @param[out]
    iter    INTEGER
      -     < 0: iterative refinement has failed, double precision
                 factorization has been performed
        +        -1 : the routine fell back to full precision for
                      implementation- or machine-specific reasons
        +        -2 : narrowing the precision induced an overflow,
                      the routine fell back to full precision
        +        -3 : failure of SGEQRF
        +        -31: stop the iterative refinement after the 30th iteration
      -     > 0: iterative refinement has been successfully used.
                 Returns the number of iterations

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if info = -i, the i-th argument had an illegal value

    @ingroup magma_zgels_driver
    ********************************************************************/
extern "C" magma_int_t
magma_zcgeqrsv_gpu(
    magma_int_t m, magma_int_t n, magma_int_t nrhs,
    magmaDoubleComplex_ptr dA,  magma_int_t ldda,
    magmaDoubleComplex_ptr dB,  magma_int_t lddb,
    magmaDoubleComplex_ptr dX,  magma_int_t lddx,
    magma_int_t *iter,
    magma_int_t *info)
{
    #define dB(i,j)     (dB + (i) + (j)*lddb)
    #define dX(i,j)     (dX + (i) + (j)*lddx)
    #define dR(i,j)     (dR + (i) + (j)*lddr)
    #define dSX(i,j)    (dSX + (i) + (j)*lddsx)
    
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magma_int_t     ione  = 1;
    magmaDoubleComplex *hworkd;
    magmaFloatComplex  *hworks;
    magmaDoubleComplex *tau;
    magmaFloatComplex  *stau;
    magmaDoubleComplex_ptr dworkd;
    magmaFloatComplex_ptr  dworks;
    magmaDoubleComplex_ptr dR, dT;
    magmaFloatComplex_ptr  dSA, dSX, dST;
    magmaDoubleComplex Xnrmv, Rnrmv;
    double          Anrm, Xnrm, Rnrm, cte, eps;
    magma_int_t     i, j, iiter, lddsa, lddsx, lddr, nb, lhwork, minmn, size, ldworkd;

    /* Check arguments */
    *iter = 0;
    *info = 0;
    if ( m < 0 )
        *info = -1;
    else if ( n < 0 || n > m )
        *info = -2;
    else if ( nrhs < 0 )
        *info = -3;
    else if ( ldda < max(1,m))
        *info = -5;
    else if ( lddb < max(1,m))
        *info = -7;
    else if ( lddx < max(1,n))
        *info = -9;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    if ( m == 0 || n == 0 || nrhs == 0 )
        return *info;

    nb   = magma_get_cgeqrf_nb(m);
    minmn= min(m, n);
    
    /* dSX contains both B and X, so must be max(m or lddb,n). */
    lddsa = ldda;
    lddsx = max(lddb,n);
    lddr  = lddb;
    
    /*
     * Allocate temporary buffers
     */
    /* dworks(dSA + dSX + dST) */
    size = lddsa*n + lddsx*nrhs + ( 2*minmn + ((n+31)/32)*32 )*nb;
    if (MAGMA_SUCCESS != magma_cmalloc( &dworks, size )) {
        fprintf(stderr, "Allocation of dworks failed (%d)\n", (int) size);
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
    dSA = dworks;
    dSX = dSA + lddsa*n;
    dST = dSX + lddsx*nrhs;

    /* dworkd(dR) = lddr*nrhs */
    ldworkd = lddr*nrhs;
    if (MAGMA_SUCCESS != magma_zmalloc( &dworkd, ldworkd )) {
        magma_free( dworks );
        fprintf(stderr, "Allocation of dworkd failed\n");
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
    dR = dworkd;

    /* hworks(workspace for cgeqrs + stau) = min(m,n) + lhworks */
    lhwork = (m - n + nb)*(nrhs + nb) + nrhs*nb;
    size = lhwork + minmn;
    magma_cmalloc_cpu( &hworks, size );
    if ( hworks == NULL ) {
        magma_free( dworks );
        magma_free( dworkd );
        fprintf(stderr, "Allocation of hworks failed\n");
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }
    stau = hworks + lhwork;

    eps  = lapackf77_dlamch("Epsilon");
    Anrm = magmablas_zlange(MagmaInfNorm, m, n, dA, ldda, (double*)dworkd );
    cte  = Anrm * eps * pow((double)n, 0.5) * BWDMAX;

    /*
     * Convert to single precision
     */
    magmablas_zlag2c( m, nrhs, dB, lddb, dSX, lddsx, info );
    if (*info != 0) {
        *iter = -2;
        goto FALLBACK;
    }

    magmablas_zlag2c( m, n, dA, ldda, dSA, lddsa, info );
    if (*info != 0) {
        *iter = -2;
        goto FALLBACK;
    }

    // factor dSA in single precision
    magma_cgeqrf_gpu( m, n, dSA, lddsa, stau, dST, info );
    if (*info != 0) {
        *iter = -3;
        goto FALLBACK;
    }

    // solve dSA*dSX = dB in single precision
    magma_cgeqrs_gpu( m, n, nrhs, dSA, lddsa, stau, dST, dSX, lddsx, hworks, lhwork, info );
    if (*info != 0) {
        *iter = -3;
        goto FALLBACK;
    }

    // residual dR = dB - dA*dX in double precision
    magmablas_clag2z( n, nrhs, dSX, lddsx, dX, lddx, info );
    magmablas_zlacpy( MagmaUpperLower, m, nrhs, dB, lddb, dR, lddr );
    if ( nrhs == 1 ) {
        magma_zgemv( MagmaNoTrans, m, n,
                     c_neg_one, dA, ldda,
                                dX, 1,
                     c_one,     dR, 1 );
    }
    else {
        magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, nrhs, n,
                     c_neg_one, dA, ldda,
                                dX, lddx,
                     c_one,     dR, lddr );
    }

    // TODO: use MAGMA_Z_ABS( dX(i,j) ) instead of zlange?
    for( j=0; j < nrhs; j++ ) {
        i = magma_izamax( n, dX(0,j), 1) - 1;
        magma_zgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 );
        Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, NULL );

        i = magma_izamax ( m, dR(0,j), 1 ) - 1;
        magma_zgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 );
        Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, NULL );

        if ( Rnrm >  Xnrm*cte ) {
            goto REFINEMENT;
        }
    }

    *iter = 0;

    /* Free workspaces */
    magma_free( dworks );
    magma_free( dworkd );
    magma_free_cpu( hworks );
    return *info;

REFINEMENT:
    /* TODO: this iterative refinement algorithm works only for compatibile
     * systems (B in colspan of A).
     * See Matrix Computations (3rd ed) p. 267 for correct algorithm. */
    for( iiter=1; iiter < ITERMAX; ) {
        *info = 0;
        // convert residual dR to single precision dSX
        magmablas_zlag2c( m, nrhs, dR, lddr, dSX, lddsx, info );
        if (*info != 0) {
            *iter = -2;
            goto FALLBACK;
        }
        // solve dSA*dSX = R in single precision
        magma_cgeqrs_gpu( m, n, nrhs, dSA, lddsa, stau, dST, dSX, lddsx, hworks, lhwork, info );
        if (*info != 0) {
            *iter = -3;
            goto FALLBACK;
        }

        // Add correction and setup residual
        // dX += dSX [including conversion]  --and--
        // dR[1:n] = dB[1:n]   (only n rows, not whole m rows! -- useless if m > n)
        for( j=0; j < nrhs; j++ ) {
            magmablas_zcaxpycp( n, dSX(0,j), dX(0,j), dB(0,j), dR(0,j) );
        }
        // dR = dB  (whole m rows)
        magmablas_zlacpy( MagmaUpperLower, m, nrhs, dB, lddb, dR, lddr );
        
        // residual dR = dB - dA*dX in double precision
        if ( nrhs == 1 ) {
            magma_zgemv( MagmaNoTrans, m, n,
                         c_neg_one, dA, ldda,
                                    dX, 1,
                         c_one,     dR, 1 );
        }
        else {
            magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, nrhs, n,
                         c_neg_one, dA, ldda,
                                    dX, lddx,
                         c_one,     dR, lddr );
        }

        /*  Check whether the nrhs normwise backward errors satisfy the
         *  stopping criterion. If yes, set ITER=IITER > 0 and return. */
        for( j=0; j < nrhs; j++ ) {
            i = magma_izamax( n, dX(0,j), 1) - 1;
            magma_zgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 );
            Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, NULL );

            i = magma_izamax ( m, dR(0,j), 1 ) - 1;
            magma_zgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 );
            Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, NULL );

            if ( Rnrm >  Xnrm*cte ) {
                goto L20;
            }
        }

        /*  If we are here, the nrhs normwise backward errors satisfy
         *  the stopping criterion, we are good to exit. */
        *iter = iiter;

        /* Free workspaces */
        magma_free( dworks );
        magma_free( dworkd );
        magma_free_cpu( hworks );
        return *info;
        
      L20:
        iiter++;
    }

    /* If we are at this place of the code, this is because we have
     * performed ITER=ITERMAX iterations and never satisified the
     * stopping criterion. Set up the ITER flag accordingly and follow
     * up on double precision routine. */
    *iter = -ITERMAX - 1;
    
FALLBACK:
    /* Single-precision iterative refinement failed to converge to a
     * satisfactory solution, so we resort to double precision. */
    magma_free( dworks );
    magma_free_cpu( hworks );

    /*
     * Allocate temporary buffers
     */
    /* dworkd = dT for zgeqrf */
    nb   = magma_get_zgeqrf_nb( m );
    size = (2*min(m, n) + (n+31)/32*32 )*nb;
    if ( size > ldworkd ) {
        magma_free( dworkd );
        if (MAGMA_SUCCESS != magma_zmalloc( &dworkd, size )) {
            fprintf(stderr, "Allocation of dworkd2 failed\n");
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
    }
    dT = dworkd;

    /* hworkd(dtau + workspace for zgeqrs) = min(m,n) + lhwork */
    size = lhwork + minmn;
    magma_zmalloc_cpu( &hworkd, size );
    if ( hworkd == NULL ) {
        magma_free( dworkd );
        fprintf(stderr, "Allocation of hworkd2 failed\n");
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }
    tau = hworkd + lhwork;

    magma_zgeqrf_gpu( m, n, dA, ldda, tau, dT, info );
    if (*info == 0) {
        // if m > n, then dB won't fit in dX, so solve with dB and copy n rows to dX
        magma_zgeqrs_gpu( m, n, nrhs, dA, ldda, tau, dT, dB, lddb, hworkd, lhwork, info );
        magmablas_zlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dX, lddx );
    }

    magma_free( dworkd );
    magma_free_cpu( hworkd );
    return *info;
}
Example #17
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing cgels
*/
int main( int argc, char** argv)
{
    TESTING_INIT();
    
    real_Double_t    gflops, gpu_perf, gpu_time, cpu_perf, cpu_time;
    float           gpu_error, cpu_error, error, Anorm, work[1];
    magmaFloatComplex  c_one     = MAGMA_C_ONE;
    magmaFloatComplex  c_neg_one = MAGMA_C_NEG_ONE;
    magmaFloatComplex *h_A, *h_A2, *h_B, *h_X, *h_R, *tau, *h_work, tmp[1];
    magmaFloatComplex *d_A, *d_B;
    magma_int_t M, N, size, nrhs, lda, ldb, ldda, lddb, min_mn, max_mn, nb, info;
    magma_int_t lworkgpu, lhwork, lhwork2;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};

    magma_opts opts;
    parse_opts( argc, argv, &opts );
 
    magma_int_t status = 0;
    float tol = opts.tolerance * lapackf77_slamch("E");

    nrhs = opts.nrhs;
    
    printf("                                                            ||b-Ax|| / (N||A||)   ||dx-x||/(N||A||)\n");
    printf("    M     N  NRHS   CPU GFlop/s (sec)   GPU GFlop/s (sec)   CPU        GPU                         \n");
    printf("===================================================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[itest];
            N = opts.nsize[itest];
            if ( M < N ) {
                printf( "%5d %5d %5d   skipping because M < N is not yet supported.\n", (int) M, (int) N, (int) nrhs );
                continue;
            }
            min_mn = min(M, N);
            max_mn = max(M, N);
            lda    = M;
            ldb    = max_mn;
            size   = lda*N;
            ldda   = ((M+31)/32)*32;
            lddb   = ((max_mn+31)/32)*32;
            nb     = magma_get_cgeqrf_nb(M);
            gflops = (FLOPS_CGEQRF( M, N ) + FLOPS_CGEQRS( M, N, nrhs )) / 1e9;
            
            lworkgpu = (M - N + nb)*(nrhs + nb) + nrhs*nb;
            
            // query for workspace size
            lhwork = -1;
            lapackf77_cgeqrf(&M, &N, NULL, &M, NULL, tmp, &lhwork, &info);
            lhwork2 = (magma_int_t) MAGMA_C_REAL( tmp[0] );
            
            lhwork = -1;
            lapackf77_cunmqr( MagmaLeftStr, MagmaConjTransStr,
                              &M, &nrhs, &min_mn, NULL, &lda, NULL,
                              NULL, &ldb, tmp, &lhwork, &info);
            lhwork = (magma_int_t) MAGMA_C_REAL( tmp[0] );
            lhwork = max( max( lhwork, lhwork2 ), lworkgpu );
            
            TESTING_MALLOC_CPU( tau,    magmaFloatComplex, min_mn    );
            TESTING_MALLOC_CPU( h_A,    magmaFloatComplex, lda*N     );
            TESTING_MALLOC_CPU( h_A2,   magmaFloatComplex, lda*N     );
            TESTING_MALLOC_CPU( h_B,    magmaFloatComplex, ldb*nrhs  );
            TESTING_MALLOC_CPU( h_X,    magmaFloatComplex, ldb*nrhs  );
            TESTING_MALLOC_CPU( h_R,    magmaFloatComplex, ldb*nrhs  );
            TESTING_MALLOC_CPU( h_work, magmaFloatComplex, lhwork    );
            
            TESTING_MALLOC_DEV( d_A,    magmaFloatComplex, ldda*N    );
            TESTING_MALLOC_DEV( d_B,    magmaFloatComplex, lddb*nrhs );
            
            /* Initialize the matrices */
            lapackf77_clarnv( &ione, ISEED, &size, h_A );
            lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_A2, &lda );
            
            // make random RHS
            size = M*nrhs;
            lapackf77_clarnv( &ione, ISEED, &size, h_B );
            lapackf77_clacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb );
            
            // make consistent RHS
            //size = N*nrhs;
            //lapackf77_clarnv( &ione, ISEED, &size, h_X );
            //blasf77_cgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N,
            //               &c_one,  h_A, &lda,
            //                        h_X, &ldb,
            //               &c_zero, h_B, &ldb );
            //lapackf77_clacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb );
            
            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            magma_csetmatrix( M, N,    h_A, lda, d_A, ldda );
            magma_csetmatrix( M, nrhs, h_B, ldb, d_B, lddb );
            
            gpu_time = magma_wtime();
            magma_cgels3_gpu( MagmaNoTrans, M, N, nrhs, d_A, ldda,
                              d_B, lddb, h_work, lworkgpu, &info);
            gpu_time = magma_wtime() - gpu_time;
            gpu_perf = gflops / gpu_time;
            if (info != 0)
                printf("magma_cgels3_gpu returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            
            // Get the solution in h_X
            magma_cgetmatrix( N, nrhs, d_B, lddb, h_X, ldb );
            
            // compute the residual
            blasf77_cgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N,
                           &c_neg_one, h_A, &lda,
                                       h_X, &ldb,
                           &c_one,     h_R, &ldb);
            Anorm = lapackf77_clange("f", &M, &N, h_A, &lda, work);
            
            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            lapackf77_clacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_X, &ldb );
            
            cpu_time = magma_wtime();
            lapackf77_cgels( MagmaNoTransStr, &M, &N, &nrhs,
                             h_A, &lda, h_X, &ldb, h_work, &lhwork, &info);
            cpu_time = magma_wtime() - cpu_time;
            cpu_perf = gflops / cpu_time;
            if (info != 0)
                printf("lapackf77_cgels returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            
            blasf77_cgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N,
                           &c_neg_one, h_A2, &lda,
                                       h_X,  &ldb,
                           &c_one,     h_B,  &ldb);
            
            cpu_error = lapackf77_clange("f", &M, &nrhs, h_B, &ldb, work) / (min_mn*Anorm);
            gpu_error = lapackf77_clange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*Anorm);
            
            // error relative to LAPACK
            size = M*nrhs;
            blasf77_caxpy( &size, &c_neg_one, h_B, &ione, h_R, &ione );
            error = lapackf77_clange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*Anorm);
            
            printf("%5d %5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)   %8.2e   %8.2e   %8.2e",
                   (int) M, (int) N, (int) nrhs,
                   cpu_perf, cpu_time, gpu_perf, gpu_time, cpu_error, gpu_error, error );
                        
            if ( M == N ) {
                printf( "   %s\n", (gpu_error < tol && error < tol ? "ok" : "failed"));
                status += ! (gpu_error < tol && error < tol);
            }
            else {
                printf( "   %s\n", (error < tol ? "ok" : "failed"));
                status += ! (error < tol);
            }

            TESTING_FREE_CPU( tau    );
            TESTING_FREE_CPU( h_A    );
            TESTING_FREE_CPU( h_A2   );
            TESTING_FREE_CPU( h_B    );
            TESTING_FREE_CPU( h_X    );
            TESTING_FREE_CPU( h_R    );
            TESTING_FREE_CPU( h_work );
            
            TESTING_FREE_DEV( d_A    );
            TESTING_FREE_DEV( d_B    );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    TESTING_FINALIZE();
    return status;
}
Example #18
0
extern "C" magma_int_t
magma_cqr(
    magma_int_t m, magma_int_t n,
    magma_c_matrix A, 
    magma_int_t lda, 
    magma_c_matrix *Q, 
    magma_c_matrix *R,
    magma_queue_t queue )
{
    magma_int_t info = 0;

    // local constants
    const magmaFloatComplex c_zero = MAGMA_C_ZERO;

    // local variables
    magma_int_t inc = 1;
    magma_int_t k = min(m,n);
    magma_int_t ldt;
    magma_int_t nb;
    magmaFloatComplex *tau = NULL;
    magmaFloatComplex *dT = NULL;
    magmaFloatComplex *dA = NULL;
    magma_c_matrix dR1 = {Magma_CSR};

    // allocate CPU resources
    CHECK( magma_cmalloc_pinned( &tau, k ) );

    // query number of blocks required for QR factorization
    nb = magma_get_cgeqrf_nb( m, n );
    ldt = (2 * k + magma_roundup(n, 32)) * nb;
    CHECK( magma_cmalloc( &dT, ldt ) );

    // get copy of matrix array
    if ( A.memory_location == Magma_DEV ) {
        dA = A.dval;
    } else {
        CHECK( magma_cmalloc( &dA, lda * n ) );
        magma_csetvector( lda * n, A.val, inc, dA, inc, queue );
    }

    // QR factorization
    magma_cgeqrf_gpu( m, n, dA, lda, tau, dT, &info );  

    // construct R matrix
    if ( R != NULL ) {
        if ( A.memory_location == Magma_DEV ) {
            CHECK( magma_cvinit( R, Magma_DEV, lda, n, c_zero, queue ) );
            magmablas_clacpy( MagmaUpper, k, n, dA, lda, R->dval, lda, queue );
        } else {
            CHECK( magma_cvinit( &dR1, Magma_DEV, lda, n, c_zero, queue ) );
            magmablas_clacpy( MagmaUpper, k, n, dA, lda, dR1.dval, lda, queue );
            CHECK( magma_cvinit( R, Magma_CPU, lda, n, c_zero, queue ) );
            magma_cgetvector( lda * n, dR1.dval, inc, R->val, inc, queue );
        }
    }

    // construct Q matrix
    if ( Q != NULL ) {
        magma_cungqr_gpu( m, n, k, dA, lda, tau, dT, nb, &info ); 

        if ( A.memory_location == Magma_DEV ) {
            CHECK( magma_cvinit( Q, Magma_DEV, lda, n, c_zero, queue ) );
            magma_ccopyvector( lda * n, dA, inc, Q->dval, inc, queue );
        } else {
            CHECK( magma_cvinit( Q, Magma_CPU, lda, n, c_zero, queue ) );
            magma_cgetvector( lda * n, dA, inc, Q->val, inc, queue );
        }
    }

cleanup:
    if( info != 0 ){
        magma_cmfree( Q, queue );
        magma_cmfree( R, queue );
        magma_cmfree( &dR1, queue );
    }

    // free resources
    magma_free_pinned( tau );
    magma_free( dT );
    if ( A.memory_location == Magma_CPU ) {
        magma_free( dA );
    }

    return info;
}
Example #19
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing cunmqr
*/
int main( int argc, char** argv )
{
    TESTING_CUDA_INIT();
    
    real_Double_t   gflops, gpu_perf, gpu_time, cpu_perf, cpu_time;    
    float error, work[1];
    cuFloatComplex c_neg_one = MAGMA_C_NEG_ONE;
    magma_int_t ione =  1;
    
    /* Matrix size */
    magma_int_t m, n, k;
    const int MAXTESTS = 10;
    magma_int_t msize[MAXTESTS] = { 1024, 2048, 3072, 4032, 5184, 6016, 7040, 8064, 9088, 10112 };
    magma_int_t nsize[MAXTESTS] = { 1024, 2048, 3072, 4032, 5184, 6016, 7040, 8064, 9088, 10112 };
    magma_int_t ksize[MAXTESTS] = { 1024, 2048, 3072, 4032, 5184, 6016, 7040, 8064, 9088, 10112 };
    magma_int_t size;
    
    magma_int_t info;
    magma_int_t iseed[4] = {0,0,0,1};
    
    printf( "Usage: %s -N m,n,k -c\n"
            "    -N can be repeated %d times. m > 0, n > 0, k > 0 is required.\n"
            "    If only m,n is given, then n=k. If only m is given, then m=n=k.\n"
            "    -c or setting $MAGMA_TESTINGS_CHECK runs LAPACK and checks result.\n\n",
            argv[0], MAXTESTS );

    int checkres = (getenv("MAGMA_TESTINGS_CHECK") != NULL);

    int ntest = 0;
    magma_int_t nmax = 0;
    magma_int_t mmax = 0;
    magma_int_t kmax = 0;
    for( int i = 1; i < argc; i++ ) {
        if ( strcmp("-N", argv[i]) == 0 && i+1 < argc ) {
            magma_assert( ntest < MAXTESTS, "error: -N repeated more than maximum %d tests\n", MAXTESTS );
            info = sscanf( argv[++i], "%d,%d,%d", &m, &n, &k );
            if ( info == 3 && m > 0 && n > 0 && k > 0 ) {
                msize[ ntest ] = m;
                nsize[ ntest ] = n;
                ksize[ ntest ] = k;
            }
            else if ( info == 2 && m > 0 && n > 0 ) {
                msize[ ntest ] = m;
                nsize[ ntest ] = n;
                ksize[ ntest ] = n;  // implicitly
            }
            else if ( info == 1 && m > 0 ) {
                msize[ ntest ] = m;
                nsize[ ntest ] = m;  // implicitly
                ksize[ ntest ] = m;  // implicitly
            }
            else {
                printf( "error: -N %s is invalid; ensure m > 0, n > 0, k > 0.\n", argv[i] );
                exit(1);
            }
            mmax = max( mmax, msize[ntest] );
            nmax = max( nmax, nsize[ntest] );
            kmax = max( kmax, ksize[ntest] );
            ntest++;
        }
        else if ( strcmp("-c", argv[i]) == 0 ) {
            checkres = true;
        }
        else {
            printf( "invalid argument: %s\n", argv[i] );
            exit(1);
        }
    }
    if ( ntest == 0 ) {
        ntest = MAXTESTS;
        nmax = nsize[ntest-1];
        mmax = msize[ntest-1];
        kmax = ksize[ntest-1];
    }
    m = mmax;
    n = nmax;
    k = kmax;
    assert( n > 0 && m > 0 && k > 0 );
    
    magma_int_t nb = magma_get_cgeqrf_nb( m );
    magma_int_t ldc = m;
    magma_int_t lda = max(m,n);
    ldc = ((ldc+31)/32)*32;
    lda = ((lda+31)/32)*32;
    
    // Allocate memory for matrices
    cuFloatComplex *C, *R, *A, *W, *tau;
    magma_int_t lwork = max( m*nb, n*nb );
    magma_int_t lwork_max = lwork;
    TESTING_MALLOC( C, cuFloatComplex, ldc*n );
    TESTING_MALLOC( R, cuFloatComplex, ldc*n );
    TESTING_MALLOC( A, cuFloatComplex, lda*k );
    TESTING_MALLOC( W, cuFloatComplex, lwork_max );
    TESTING_MALLOC( tau, cuFloatComplex, k   );
    
    // test all combinations of input parameters
    const char* side[]   = { MagmaLeftStr,      MagmaRightStr   };
    const char* trans[]  = { MagmaConjTransStr, MagmaNoTransStr };

    printf("    M     N     K  side   trans      CPU GFlop/s (sec)   GPU GFlop/s (sec)   ||R||_F / ||QC||_F\n");
    printf("===============================================================================================\n");
    for( int i = 0; i < ntest; ++i ) {
        for( int iside = 0; iside < 2; ++iside ) {
        for( int itran = 0; itran < 2; ++itran ) {
            m = msize[i];
            n = nsize[i];
            k = ksize[i];
            
            if ( *side[iside] == 'L' && m < k ) {
                printf( "%5d %5d %5d  %-5s  %-9s   skipping because side=left and m < k\n",
                        (int) m, (int) n, (int) k, side[iside], trans[itran] );
                continue;
            }
            if ( *side[iside] == 'R' && n < k ) {
                printf( "%5d %5d %5d  %-5s  %-9s   skipping because side=right and n < k\n",
                        (int) m, (int) n, (int) k, side[iside], trans[itran] );
                continue;
            }
            
            gflops = FLOPS_CUNMQR( m, n, k, *side[iside] ) / 1e9;
            
            // C is full, m x n
            size = ldc*n;
            lapackf77_clarnv( &ione, iseed, &size, C );
            lapackf77_clacpy( "Full", &m, &n, C, &ldc, R, &ldc );
            //magma_csetmatrix( m,   n, C, ldc, dC, ldc );
            
            // A is m x k (left) or n x k (right)
            lda = (*side[iside] == 'L' ? m : n);
            size = lda*k;
            lapackf77_clarnv( &ione, iseed, &size, A );
            
            // compute QR factorization to get Householder vectors in A, tau
            magma_cgeqrf( lda, k, A, lda, tau, W, lwork_max, &info );
            if ( info != 0 )
                printf("magma_cgeqrf returned error %d\n", info);
            
            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            cpu_time = magma_wtime();
            lapackf77_cunmqr( side[iside], trans[itran],
                              &m, &n, &k,
                              A, &lda, tau, C, &ldc, W, &lwork_max, &info );
            cpu_time = magma_wtime() - cpu_time;
            cpu_perf = gflops / cpu_time;
            if (info != 0)
                printf("lapackf77_cunmqr returned error %d.\n", (int) info);
            
            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            // query for work size
            lwork = -1;
            magma_cunmqr( *side[iside], *trans[itran],
                          m, n, k,
                          A, lda, tau, R, ldc, W, lwork, &info );
            if (info != 0)
                printf("magma_cunmqr returned error %d (lwork query).\n", (int) info);
            lwork = (magma_int_t) MAGMA_C_REAL( W[0] );
            if ( lwork < 0 || lwork > lwork_max )
                printf("invalid lwork %d, lwork_max %d\n", lwork, lwork_max );
            
            gpu_time = magma_wtime();
            magma_cunmqr( *side[iside], *trans[itran],
                          m, n, k,
                          A, lda, tau, R, ldc, W, lwork, &info );
            gpu_time = magma_wtime() - gpu_time;
            gpu_perf = gflops / gpu_time;
            if (info != 0)
                printf("magma_cunmqr returned error %d.\n", (int) info);
            
            //magma_cgetmatrix( m, n, dC, ldc, R, ldc );
            
            /* =====================================================================
               compute relative error |QC_magma - QC_lapack| / |QC_lapack|
               =================================================================== */
            error = lapackf77_clange( "Fro", &m, &n, C, &ldc, work );
            size = ldc*n;
            blasf77_caxpy( &size, &c_neg_one, C, &ione, R, &ione );
            error = lapackf77_clange( "Fro", &m, &n, R, &ldc, work ) / error;
            
            printf( "%5d %5d %5d  %-5s  %-9s  %7.2f (%7.2f)   %7.2f (%7.2f)   %8.2e\n",
                    (int) m, (int) n, (int) k, side[iside], trans[itran],
                    cpu_perf, cpu_time, gpu_perf, gpu_time, error );
        }}  // end iside, itran
        printf( "\n" );
    }  // end i
    
    // Memory clean up
    TESTING_FREE( C );
    TESTING_FREE( R );
    TESTING_FREE( A );
    TESTING_FREE( W );
    TESTING_FREE( tau );
    
    // Shutdown
    TESTING_CUDA_FINALIZE();
    return 0;
}
Example #20
0
/**
    Purpose
    -------
    CGEQRF computes a QR factorization of a COMPLEX M-by-N matrix A:
    A = Q * R. This version does not require work space on the GPU
    passed as input. GPU memory is allocated in the routine.

    This uses 2 queues to overlap communication and computation.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    A       COMPLEX array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further
            Details).
    \n
            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    @param[out]
    tau     COMPLEX array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    @param[out]
    work    (workspace) COMPLEX array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.
    \n
            Higher performance is achieved if WORK is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.  LWORK >= max( N*NB, 2*NB*NB ),
            where NB can be obtained through magma_get_cgeqrf_nb( M, N ).
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.

    Further Details
    ---------------
    The matrix Q is represented as a product of elementary reflectors

        Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

        H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).

    @ingroup magma_cgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_cgeqrf(
    magma_int_t m, magma_int_t n,
    magmaFloatComplex *A,    magma_int_t lda,
    magmaFloatComplex *tau,
    magmaFloatComplex *work, magma_int_t lwork,
    magma_int_t *info )
{
    #define  A(i_,j_)  (A + (i_) + (j_)*lda)
    
    #ifdef HAVE_clBLAS
    #define dA(i_,j_)  dA,    ((i_) + (j_)*ldda + dA_offset)
    #define dT(i_,j_)  dT,    ((i_) + (j_)*nb   + dT_offset)
    #define dwork(i_)  dwork, ((i_)             + dwork_offset)
    #else
    #define dA(i_,j_) (dA    + (i_) + (j_)*ldda)
    #define dT(i_,j_) (dT    + (i_) + (j_)*nb)
    #define dwork(i_) (dwork + (i_))
    #endif
    
    /* Constants */
    const magmaFloatComplex c_one = MAGMA_C_ONE;
    
    /* Local variables */
    magmaFloatComplex_ptr dA, dT, dwork;
    magma_int_t i, ib, min_mn, ldda, lddwork, old_i, old_ib;
    
    /* Function Body */
    *info = 0;
    magma_int_t nb = magma_get_cgeqrf_nb( m, n );
    
    // need 2*nb*nb to store T and upper triangle of V simultaneously
    magma_int_t lwkopt = max( n*nb, 2*nb*nb );
    work[0] = magma_cmake_lwork( lwkopt );
    bool lquery = (lwork == -1);
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,m)) {
        *info = -4;
    } else if (lwork < max(1, lwkopt) && ! lquery) {
        *info = -7;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;
    
    min_mn = min( m, n );
    if (min_mn == 0) {
        work[0] = c_one;
        return *info;
    }
    
    // largest N for larfb is n-nb (trailing matrix lacks 1st panel)
    lddwork = magma_roundup( n, 32 ) - nb;
    ldda    = magma_roundup( m, 32 );
    
    magma_int_t ngpu = magma_num_gpus();
    if ( ngpu > 1 ) {
        /* call multiple-GPU interface  */
        return magma_cgeqrf_m( ngpu, m, n, A, lda, tau, work, lwork, info );
    }
    
    // allocate space for dA, dwork, and dT
    if (MAGMA_SUCCESS != magma_cmalloc( &dA, n*ldda + nb*lddwork + nb*nb )) {
        /* alloc failed so call non-GPU-resident version */
        return magma_cgeqrf_ooc( m, n, A, lda, tau, work, lwork, info );
    }
    
    dwork = dA + n*ldda;
    dT    = dA + n*ldda + nb*lddwork;
    
    magma_queue_t queues[2];
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queues[0] );
    magma_queue_create( cdev, &queues[1] );
    
    if ( (nb > 1) && (nb < min_mn) ) {
        /* Use blocked code initially.
           Asynchronously send the matrix to the GPU except the first panel. */
        magma_csetmatrix_async( m, n-nb,
                                 A(0,nb), lda,
                                dA(0,nb), ldda, queues[0] );
        
        old_i = 0;
        old_ib = nb;
        for (i = 0; i < min_mn-nb; i += nb) {
            ib = min( min_mn-i, nb );
            if (i > 0) {
                /* get i-th panel from device */
                magma_queue_sync( queues[1] );
                magma_cgetmatrix_async( m-i, ib,
                                        dA(i,i), ldda,
                                         A(i,i), lda, queues[0] );
                
                /* Apply H' to A(i:m,i+2*ib:n) from the left */
                magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                  m-old_i, n-old_i-2*old_ib, old_ib,
                                  dA(old_i, old_i),          ldda, dT(0,0),  nb,
                                  dA(old_i, old_i+2*old_ib), ldda, dwork(0), lddwork, queues[1] );
                
                magma_cgetmatrix_async( i, ib,
                                        dA(0,i), ldda,
                                         A(0,i), lda, queues[1] );
                magma_queue_sync( queues[0] );
            }
            
            magma_int_t rows = m-i;
            lapackf77_cgeqrf( &rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info );
            
            /* Form the triangular factor of the block reflector
               H = H(i) H(i+1) . . . H(i+ib-1) */
            lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr,
                              &rows, &ib, A(i,i), &lda, tau+i, work, &ib );
            
            magma_cpanel_to_q( MagmaUpper, ib, A(i,i), lda, work+ib*ib );
            
            /* put i-th V matrix onto device */
            magma_csetmatrix_async( rows, ib, A(i,i), lda, dA(i,i), ldda, queues[0] );
            
            /* put T matrix onto device */
            magma_queue_sync( queues[1] );
            magma_csetmatrix_async( ib, ib, work, ib, dT(0,0), nb, queues[0] );
            magma_queue_sync( queues[0] );
            
            if (i + ib < n) {
                if (i+ib < min_mn-nb) {
                    /* Apply H' to A(i:m,i+ib:i+2*ib) from the left (look-ahead) */
                    magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                      rows, ib, ib,
                                      dA(i, i   ), ldda, dT(0,0),  nb,
                                      dA(i, i+ib), ldda, dwork(0), lddwork, queues[1] );
                    magma_cq_to_panel( MagmaUpper, ib, A(i,i), lda, work+ib*ib );
                }
                else {
                    /* After last panel, update whole trailing matrix. */
                    /* Apply H' to A(i:m,i+ib:n) from the left */
                    magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                      rows, n-i-ib, ib,
                                      dA(i, i   ), ldda, dT(0,0),  nb,
                                      dA(i, i+ib), ldda, dwork(0), lddwork, queues[1] );
                    magma_cq_to_panel( MagmaUpper, ib, A(i,i), lda, work+ib*ib );
                }
                
                old_i  = i;
                old_ib = ib;
            }
        }
    } else {
        i = 0;
    }
    
    /* Use unblocked code to factor the last or only block. */
    if (i < min_mn) {
        ib = n-i;
        if (i != 0) {
            magma_cgetmatrix( m, ib, dA(0,i), ldda, A(0,i), lda, queues[1] );
        }
        magma_int_t rows = m-i;
        lapackf77_cgeqrf( &rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info );
    }
    
    magma_queue_destroy( queues[0] );
    magma_queue_destroy( queues[1] );
    
    magma_free( dA );
    
    return *info;
} /* magma_cgeqrf */
Example #21
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing cgeqrf
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    real_Double_t    gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0;
    float           error, work[1];
    magmaFloatComplex  c_neg_one = MAGMA_C_NEG_ONE;
    magmaFloatComplex *h_A, *h_R, *tau, *h_work, tmp[1];
    magmaFloatComplex_ptr d_A, dT;
    magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn, nb, size;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1}, ISEED2[4];
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    magma_int_t status = 0;
    float tol;
    opts.lapack |= (opts.version == 2 && opts.check == 2);  // check (-c2) implies lapack (-l)

    if ( opts.version != 2 && opts.check == 1 ) {
        printf( "NOTE: version %d requires -c2 check due to the special structure of the\n"
                "MAGMA cgeqrf results; using -c2.\n\n", (int) opts.version );
        opts.check = 2;
    }
    printf( "version %d\n", (int) opts.version );
    if ( opts.version == 2 ) {
        if ( opts.check == 1 ) {
            printf("    M     N   CPU GFlop/s (sec)   GPU GFlop/s (sec)   ||R-Q'A||_1 / (M*||A||_1*eps) ||I-Q'Q||_1 / (M*eps)\n");
            printf("=========================================================================================================\n");
        } else {
            printf("    M     N   CPU GFlop/s (sec)   GPU GFlop/s (sec)   ||R||_F / ||A||_F\n");
            printf("=======================================================================\n");
        }
        tol = 1.0;
    } else {
        printf("    M     N   CPU GFlop/s (sec)   GPU GFlop/s (sec)   ||Ax-b||_F/(N*||A||_F*||x||_F)\n");
        printf("====================================================================================\n");
        tol = opts.tolerance * lapackf77_slamch("E");
    }
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[itest];
            N = opts.nsize[itest];
            min_mn = min(M, N);
            lda    = M;
            n2     = lda*N;
            ldda   = ((M+31)/32)*32;
            gflops = FLOPS_CGEQRF( M, N ) / 1e9;
            
            // query for workspace size
            lwork = -1;
            lapackf77_cgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info);
            lwork = (magma_int_t)MAGMA_C_REAL( tmp[0] );
            
            TESTING_MALLOC_CPU( tau,    magmaFloatComplex, min_mn );
            TESTING_MALLOC_CPU( h_A,    magmaFloatComplex, n2     );
            TESTING_MALLOC_CPU( h_work, magmaFloatComplex, lwork  );
            
            TESTING_MALLOC_PIN( h_R,    magmaFloatComplex, n2     );
            
            TESTING_MALLOC_DEV( d_A,    magmaFloatComplex, ldda*N );
            
            /* Initialize the matrix */
            for ( int j=0; j<4; j++ )
                ISEED2[j] = ISEED[j]; // save seeds
            lapackf77_clarnv( &ione, ISEED, &n2, h_A );
            lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda );
            magma_csetmatrix( M, N, h_R, lda, d_A, 0, ldda, opts.queue );
            
            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            gpu_time = magma_wtime();
            if ( opts.version == 2 ) {
                magma_cgeqrf2_gpu( M, N, d_A, 0, ldda, tau, opts.queues2, &info );
            }
            else {
                nb = magma_get_cgeqrf_nb( M );
                size = (2*min(M, N) + (N+31)/32*32 )*nb;
                TESTING_MALLOC_DEV( dT, magmaFloatComplex, size );
                if ( opts.version == 1 ) {
                    magma_cgeqrf_gpu( M, N, d_A, 0, ldda, tau, dT, 0, opts.queue, &info );
                }
                #ifdef HAVE_CUBLAS
                else if ( opts.version == 3 ) {
                    magma_cgeqrf3_gpu( M, N, d_A, 0, ldda, tau, dT, opts.queue, &info );
                }
                #endif
                else {
                    printf( "Unknown version %d\n", opts.version );
                    exit(1);
                }
            }
            gpu_time = magma_wtime() - gpu_time;
            gpu_perf = gflops / gpu_time;
            if (info != 0)
                printf("magma_cgeqrf returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            
            if ( opts.lapack ) {
                /* =====================================================================
                   Performs operation using LAPACK
                   =================================================================== */
                magmaFloatComplex *tau2;
                TESTING_MALLOC_CPU( tau2, magmaFloatComplex, min_mn );
                cpu_time = magma_wtime();
                lapackf77_cgeqrf(&M, &N, h_A, &lda, tau2, h_work, &lwork, &info);
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
                if (info != 0)
                    printf("lapackf77_cgeqrf returned error %d: %s.\n",
                           (int) info, magma_strerror( info ));
                TESTING_FREE_CPU( tau2 );
            }

            if ( opts.check == 1 && M >= N ) {
                /* =====================================================================
                   Check the result -- only version 1, cqrt02 requires M >= N
                   =================================================================== */
                magma_int_t lwork = n2+N;
                magmaFloatComplex *h_W1, *h_W2, *h_W3;
                float *h_RW, results[2];
                
                magma_cgetmatrix( M, N, d_A, 0, ldda, h_R, M, opts.queue );

                TESTING_MALLOC_CPU( h_W1, magmaFloatComplex, n2    ); // Q
                TESTING_MALLOC_CPU( h_W2, magmaFloatComplex, n2    ); // R
                TESTING_MALLOC_CPU( h_W3, magmaFloatComplex, lwork ); // WORK
                TESTING_MALLOC_CPU( h_RW, float, M );  // RWORK
                lapackf77_clarnv( &ione, ISEED2, &n2, h_A );
                lapackf77_cqrt02( &M, &N, &min_mn, h_A, h_R, h_W1, h_W2, &lda, tau, h_W3, &lwork,
                                  h_RW, results );

                if ( opts.lapack ) {
                    printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)   %8.2e                      %8.2e",
                           (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, results[0], results[1] );
                } else {
                    printf("%5d %5d     ---   (  ---  )   %7.2f (%7.2f)    %8.2e                      %8.2e",
                           (int) M, (int) N, gpu_perf, gpu_time, results[0], results[1] );
                } 
                // todo also check results[1] < tol?
                printf("   %s\n", (results[0] < tol ? "ok" : "failed"));
                status += ! (results[0] < tol);
            
                TESTING_FREE_CPU( h_W1 );
                TESTING_FREE_CPU( h_W2 );
                TESTING_FREE_CPU( h_W3 );
                TESTING_FREE_CPU( h_RW );
            }
            else if ( opts.check == 2 && opts.version == 2 ) {
                /* =====================================================================
                   Check the result compared to LAPACK -- only version 2
                   =================================================================== */
                magma_cgetmatrix( M, N, d_A, 0, ldda, h_R, M, opts.queue );
                error = lapackf77_clange("f", &M, &N, h_A, &lda, work);
                blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione);
                error = lapackf77_clange("f", &M, &N, h_R, &lda, work) / error;

                if ( opts.lapack ) {
                    printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)   %8.2e",
                           (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error );
                } else {
                    printf("%5d %5d     ---   (  ---  )   %7.2f (%7.2f)   %8.2e",
                           (int) M, (int) N, gpu_perf, gpu_time, error );
                }
                printf("   %s\n", (error < tol ? "ok" : "failed"));
                status += ! (error < tol);
            }
            else if ( opts.check == 2 && M >= N ) {
                /* =====================================================================
                   Check the result by solving linear system -- only versions 1 & 3, M >= N
                   =================================================================== */
                magma_int_t lwork;
                magmaFloatComplex *x, *b, *hwork;
                magmaFloatComplex_ptr d_B;
                const magmaFloatComplex c_zero    = MAGMA_C_ZERO;
                const magmaFloatComplex c_one     = MAGMA_C_ONE;
                const magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;
                const magma_int_t ione = 1;

                // initialize RHS, b = A*random
                TESTING_MALLOC_CPU( x, magmaFloatComplex, N );
                TESTING_MALLOC_CPU( b, magmaFloatComplex, M );
                lapackf77_clarnv( &ione, ISEED, &N, x );
                blasf77_cgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_zero, b, &ione );
                // copy to GPU
                TESTING_MALLOC_DEV( d_B, magmaFloatComplex, M );
                magma_csetvector( M, b, 1, d_B, 0, 1, opts.queue );

                if ( opts.version == 1 ) {
                    // allocate hwork
                    magma_cgeqrs_gpu( M, N, 1,
                                      d_A, 0, ldda, tau, dT, 0,
                                      d_B, 0, M, tmp, -1, opts.queue, &info );
                    lwork = (magma_int_t)MAGMA_C_REAL( tmp[0] );
                    TESTING_MALLOC_CPU( hwork, magmaFloatComplex, lwork );

                    // solve linear system
                    magma_cgeqrs_gpu( M, N, 1,
                                      d_A, 0, ldda, tau, dT, 0,
                                      d_B, 0, M, hwork, lwork, opts.queue, &info );
                    if (info != 0)
                        printf("magma_cgeqrs returned error %d: %s.\n",
                               (int) info, magma_strerror( info ));
                    TESTING_FREE_CPU( hwork );
                }
                #ifdef HAVE_CUBLAS
                else if ( opts.version == 3 ) {
                    // allocate hwork
                    magma_cgeqrs3_gpu( M, N, 1,
                                       d_A, 0, ldda, tau, dT, 0,
                                       d_B, 0, M, tmp, -1, opts.queue, &info );
                    lwork = (magma_int_t)MAGMA_C_REAL( tmp[0] );
                    TESTING_MALLOC_CPU( hwork, magmaFloatComplex, lwork );

                    // solve linear system
                    magma_cgeqrs3_gpu( M, N, 1,
                                       d_A, 0, ldda, tau, dT, 0,
                                       d_B, 0, M, hwork, lwork, opts.queue, &info );
                    if (info != 0)
                        printf("magma_cgeqrs3 returned error %d: %s.\n",
                               (int) info, magma_strerror( info ));
                    TESTING_FREE_CPU( hwork );
                }
                #endif
                else {
                    printf( "Unknown version %d\n", opts.version );
                    exit(1);
                }
                magma_cgetvector( N, d_B, 0, 1, x, 1, opts.queue );

                // compute r = Ax - b, saved in b
                lapackf77_clarnv( &ione, ISEED2, &n2, h_A );
                blasf77_cgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_neg_one, b, &ione );

                // compute residual |Ax - b| / (n*|A|*|x|)
                float norm_x, norm_A, norm_r, work[1];
                norm_A = lapackf77_clange( "F", &M, &N, h_A, &lda, work );
                norm_r = lapackf77_clange( "F", &M, &ione, b, &M, work );
                norm_x = lapackf77_clange( "F", &N, &ione, x, &N, work );

                TESTING_FREE_CPU( x );
                TESTING_FREE_CPU( b );
                TESTING_FREE_DEV( d_B );

                error = norm_r / (N * norm_A * norm_x);
                if ( opts.lapack ) {
                    printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)   %8.2e",
                           (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error );
                } else {
                    printf("%5d %5d     ---   (  ---  )   %7.2f (%7.2f)   %8.2e",
                           (int) M, (int) N, gpu_perf, gpu_time, error );
                }
                printf("   %s\n", (error < tol ? "ok" : "failed"));
                status += ! (error < tol);
            }
            else {
                if ( opts.lapack ) {
                    printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)   ---",
                           (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time );
                } else {
                    printf("%5d %5d     ---   (  ---  )   %7.2f (%7.2f)     ---",
                           (int) M, (int) N, gpu_perf, gpu_time);
                }
                printf("%s\n", (opts.check != 0 ? "  (error check only for M >= N)" : ""));
            }
            
            TESTING_FREE_CPU( tau    );
            TESTING_FREE_CPU( h_A    );
            TESTING_FREE_CPU( h_work );
            
            TESTING_FREE_PIN( h_R );
            
            TESTING_FREE_DEV( d_A );
            
            if ( opts.version != 2 )
                TESTING_FREE_DEV( dT );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }
    
    TESTING_FINALIZE();
    return status;
}
Example #22
0
int main( int argc, char** argv)
{
    real_Double_t    gflops, gpu_perf, cpu_perf, gpu_time, cpu_time, error;

    float           matnorm, work[1];
    magmaFloatComplex  c_neg_one = MAGMA_C_NEG_ONE;
    magmaFloatComplex *h_A, *h_R, *tau, *h_work, tmp[1];
    magmaFloatComplex_ptr d_lA[MagmaMaxGPUs];

    /* Matrix size */
    magma_int_t M = 0, N = 0, n2, n_local[4], lda, ldda, lhwork;
    magma_int_t size[10] = {1000,2000,3000,4000,5000,6000,7000,8000,9000,10000};

    magma_int_t i, k, nk, info, min_mn;
    int max_num_gpus = 2, num_gpus = 2;
    
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};

    if (argc != 1){
        for(i = 1; i<argc; i++){
            if (strcmp("-N", argv[i])==0)
                N = atoi(argv[++i]);
            else if (strcmp("-M", argv[i])==0)
                M = atoi(argv[++i]);
            else if (strcmp("-NGPU", argv[i])==0)
              num_gpus = atoi(argv[++i]);
        }
        if ( M == 0 ) {
            M = N;
        }
        if ( N == 0 ) {
            N = M;
        }
        if (M>0 && N>0)
          printf("  testing_cgeqrf_gpu -M %d -N %d -NGPU %d\n\n", (int) M, (int) N, (int) num_gpus);
        else
            {
                printf("\nUsage: \n");
                printf("  testing_cgeqrf_gpu -M %d -N %d -NGPU %d\n\n", 
                       1024, 1024, 1);
                exit(1);
            }
    }
    else {
        printf("\nUsage: \n");
        printf("  testing_cgeqrf_gpu -M %d -N %d -NGPU %d\n\n", 1024, 1024, 1);
        M = N = size[9];
    }
    
    ldda   = ((M+31)/32)*32;
    n2     = M * N;
    min_mn = min(M, N);

    magma_int_t nb  = magma_get_cgeqrf_nb(M);

    if (num_gpus > max_num_gpus){
      printf("More GPUs requested than available. Have to change it.\n");
      num_gpus = max_num_gpus;
    }
    printf("Number of GPUs to be used = %d\n", (int) num_gpus);

    /* Initialize */
    magma_queue_t  queues[MagmaMaxGPUs * 2];
    magma_device_t devices[ MagmaMaxGPUs ];
    magma_int_t num = 0;
    magma_int_t err;
    magma_init();
    err = magma_getdevices( devices, MagmaMaxGPUs, &num );
    if ( err != 0 || num < 1 ) {
        fprintf( stderr, "magma_getdevices failed: %d\n", (int) err );
        exit(-1);
    }
    for(i=0;i<num_gpus;i++){
        err = magma_queue_create( devices[i], &queues[2*i] );
        if ( err != 0 ) {
            fprintf( stderr, "magma_queue_create failed: %d\n", (int) err );
            exit(-1);
        }
        err = magma_queue_create( devices[i], &queues[2*i+1] );
        if ( err != 0 ) {
            fprintf( stderr, "magma_queue_create failed: %d\n", (int) err );
            exit(-1);
        }
    }
    
    /* Allocate host memory for the matrix */
    TESTING_MALLOC_CPU( tau, magmaFloatComplex, min_mn );
    TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2     );
    TESTING_MALLOC_CPU( h_R, magmaFloatComplex, n2     );

    for(i=0; i<num_gpus; i++){      
        n_local[i] = ((N/nb)/num_gpus)*nb;
        if (i < (N/nb)%num_gpus)
            n_local[i] += nb;
        else if (i == (N/nb)%num_gpus)
            n_local[i] += N%nb;
        
        TESTING_MALLOC_DEV( d_lA[i], magmaFloatComplex, ldda*n_local[i] );
        printf("device %2d n_local = %4d\n", (int) i, (int) n_local[i]);  
    }

    lhwork = -1;
    lapackf77_cgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info);
    lhwork = (magma_int_t)MAGMA_C_REAL( tmp[0] );

    TESTING_MALLOC_CPU( h_work, magmaFloatComplex, lhwork );

    printf("  M     N   CPU GFlop/s (sec)   GPU GFlop/s (sec)   ||R||_F / ||A||_F\n");
    printf("======================================================================\n");
    for(i=0; i<10; i++){
        if (argc == 1){
            M = N = size[i];
        }
        min_mn= min(M, N);
        lda   = M;
        n2    = lda*N;
        ldda  = ((M+31)/32)*32;
        gflops = FLOPS( (float)M, (float)N ) * 1e-9;

        /* Initialize the matrix */
        lapackf77_clarnv( &ione, ISEED, &n2, h_A );
        lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda );

        /* =====================================================================
           Performs operation using LAPACK
           =================================================================== */
        cpu_time = magma_wtime();
        lapackf77_cgeqrf(&M, &N, h_A, &M, tau, h_work, &lhwork, &info);
        cpu_time = magma_wtime() - cpu_time;
        if (info < 0)
            printf("Argument %d of lapack_cgeqrf had an illegal value.\n", (int) -info);

        cpu_perf = gflops / cpu_time;

        /* ====================================================================
           Performs operation using MAGMA
           =================================================================== */
        int j;
        magma_queue_t *trans_queues = (magma_queue_t*)malloc(num_gpus*sizeof(magma_queue_t));
        for(j=0;j<num_gpus;j++){
            trans_queues[j] = queues[2*j];
        }
        
        // warm-up
        magma_csetmatrix_1D_col_bcyclic(M, N, h_R, lda, d_lA, ldda, num_gpus, nb, trans_queues);
        magma_cgeqrf2_mgpu( num_gpus, M, N, d_lA, ldda, tau, queues, &info);

        magma_csetmatrix_1D_col_bcyclic(M, N, h_R, lda, d_lA, ldda, num_gpus, nb, trans_queues);
        gpu_time = magma_wtime();
        magma_cgeqrf2_mgpu( num_gpus, M, N, d_lA, ldda, tau, queues, &info);
        gpu_time = magma_wtime() - gpu_time;

        if (info < 0)
          printf("Argument %d of magma_cgeqrf2 had an illegal value.\n", (int) -info);
        
        gpu_perf = gflops / gpu_time;
        
        /* =====================================================================
           Check the result compared to LAPACK
           =================================================================== */
        magma_cgetmatrix_1D_col_bcyclic(M, N, d_lA, ldda, h_R, lda, num_gpus, nb, trans_queues);
        
        matnorm = lapackf77_clange("f", &M, &N, h_A, &M, work);
        blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione);
        
        printf("%5d %5d  %6.2f (%6.2f)        %6.2f (%6.2f)       %e\n",
               (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time,
               lapackf77_clange("f", &M, &N, h_R, &M, work) / matnorm);
        
        if (argc != 1)
          break;
    }
    
    /* Memory clean up */
    TESTING_FREE_PIN( tau );
    TESTING_FREE_PIN( h_A );
    TESTING_FREE_PIN( h_work );
    TESTING_FREE_PIN( h_R );

    for(i=0; i<num_gpus; i++){
        TESTING_FREE_DEV( d_lA[i] );
        magma_queue_destroy(queues[2*i]);
        magma_queue_destroy(queues[2*i+1]);
    }

    /* Shutdown */
    magma_finalize();
}
Example #23
0
extern "C" magma_int_t
magma_cgeqrf2_mgpu( magma_int_t num_gpus, magma_int_t m, magma_int_t n,
                    magmaFloatComplex **dlA, magma_int_t ldda,
                    magmaFloatComplex *tau,
                    magma_int_t *info )
{
/*  -- MAGMA (version 1.4.1) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       December 2013

    Purpose
    =======
    CGEQRF2_MGPU computes a QR factorization of a complex M-by-N matrix A:
    A = Q * R. This is a GPU interface of the routine.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    dA      (input/output) COMPLEX array on the GPU, dimension (LDDA,N)
            On entry, the M-by-N matrix dA.
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further
            Details).

    LDDA    (input) INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).
            To benefit from coalescent memory accesses LDDA must be
            dividable by 16.

    TAU     (output) COMPLEX array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.

    Further Details
    ===============
    The matrix Q is represented as a product of elementary reflectors

       Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).
    =====================================================================    */

    #define dlA(dev, i, j)   (dlA[dev] + (i) + (j)*(ldda))
    #define hpanel(i)        (hpanel + (i))

    // set to NULL to make cleanup easy: free(NULL) does nothing.
    magmaFloatComplex *dwork[MagmaMaxGPUs]={NULL}, *dpanel[MagmaMaxGPUs]={NULL};
    magmaFloatComplex *hwork=NULL, *hpanel=NULL;
    magma_queue_t stream[MagmaMaxGPUs][2]={{NULL}};
    magma_event_t panel_event[MagmaMaxGPUs]={NULL};

    magma_int_t i, j, min_mn, dev, ldhpanel, lddwork, rows;
    magma_int_t ib, nb;
    magma_int_t lhwork, lwork;
    magma_int_t panel_dev, i_local, i_nb_local, n_local[MagmaMaxGPUs], la_dev, dpanel_offset;

    magma_queue_t cqueue;
    magmablasGetKernelStream( &cqueue );
    
    magma_device_t cdevice;
    magma_getdevice( &cdevice );

    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    min_mn = min(m,n);
    if (min_mn == 0)
        return *info;

    nb = magma_get_cgeqrf_nb( m );

    /* dwork is (n*nb) --- for T (nb*nb) and clarfb work ((n-nb)*nb) ---
     *        + dpanel (ldda*nb), on each GPU.
     * I think clarfb work could be smaller, max(n_local[:]).
     * Oddly, T and clarfb work get stacked on top of each other, both with lddwork=n.
     * on GPU that owns panel, set dpanel = dlA(dev,i,i_local).
     * on other GPUs,          set dpanel = dwork[dev] + dpanel_offset. */
    lddwork = n;
    dpanel_offset = lddwork*nb;
    for( dev=0; dev < num_gpus; dev++ ) {
        magma_setdevice( dev );
        if ( MAGMA_SUCCESS != magma_cmalloc( &(dwork[dev]), (lddwork + ldda)*nb )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            goto CLEANUP;
        }
    }

    /* hwork is MAX( workspace for cgeqrf (n*nb), two copies of T (2*nb*nb) )
     *        + hpanel (m*nb).
     * for last block, need 2*n*nb total. */
    ldhpanel = m;
    lhwork = max( n*nb, 2*nb*nb );
    lwork = max( lhwork + ldhpanel*nb, 2*n*nb );
    if ( MAGMA_SUCCESS != magma_cmalloc_pinned( &hwork, lwork )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        goto CLEANUP;
    }
    hpanel = hwork + lhwork;

    /* Set the number of local n for each GPU */
    for( dev=0; dev < num_gpus; dev++ ) {
        n_local[dev] = ((n/nb)/num_gpus)*nb;
        if (dev < (n/nb) % num_gpus)
            n_local[dev] += nb;
        else if (dev == (n/nb) % num_gpus)
            n_local[dev] += n % nb;
    }

    for( dev=0; dev < num_gpus; dev++ ) {
        magma_setdevice( dev );
        magma_queue_create( &stream[dev][0] );
        magma_queue_create( &stream[dev][1] );
        magma_event_create( &panel_event[dev] );
    }

    if ( nb < min_mn ) {
        /* Use blocked code initially */
        // Note: as written, ib cannot be < nb.
        for( i = 0; i < min_mn-nb; i += nb ) {
            /* Set the GPU number that holds the current panel */
            panel_dev = (i/nb) % num_gpus;
            
            /* Set the local index where the current panel is (j==i) */
            i_local = i/(nb*num_gpus)*nb;
            
            ib = min(min_mn-i, nb);
            rows = m-i;
            
            /* Send current panel to the CPU, after panel_event indicates it has been updated */
            magma_setdevice( panel_dev );
            magma_queue_wait_event( stream[panel_dev][1], panel_event[panel_dev] );
            magma_cgetmatrix_async( rows, ib,
                                    dlA(panel_dev, i, i_local), ldda,
                                    hpanel(i),                  ldhpanel, stream[panel_dev][1] );
            magma_queue_sync( stream[panel_dev][1] );

            // Factor panel
            lapackf77_cgeqrf( &rows, &ib, hpanel(i), &ldhpanel, tau+i,
                              hwork, &lhwork, info );
            if ( *info != 0 ) {
                fprintf( stderr, "error %d\n", (int) *info );
            }

            // Form the triangular factor of the block reflector
            // H = H(i) H(i+1) . . . H(i+ib-1)
            lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr,
                              &rows, &ib,
                              hpanel(i), &ldhpanel, tau+i, hwork, &ib );

            cpanel_to_q( MagmaUpper, ib, hpanel(i), ldhpanel, hwork + ib*ib );
            // Send the current panel back to the GPUs
            for( dev=0; dev < num_gpus; dev++ ) {
                magma_setdevice( dev );
                if (dev == panel_dev)
                    dpanel[dev] = dlA(dev, i, i_local);
                else
                    dpanel[dev] = dwork[dev] + dpanel_offset;
                magma_csetmatrix_async( rows, ib,
                                        hpanel(i),   ldhpanel,
                                        dpanel[dev], ldda, stream[dev][0] );
            }
            for( dev=0; dev < num_gpus; dev++ ) {
                magma_setdevice( dev );
                magma_queue_sync( stream[dev][0] );
            }

            // TODO: if cpanel_to_q copied whole block, wouldn't need to restore
            // -- just send the copy to the GPUs.
            // TODO: also, could zero out the lower triangle and use Azzam's larfb w/ gemm.
            
            /* Restore the panel */
            cq_to_panel( MagmaUpper, ib, hpanel(i), ldhpanel, hwork + ib*ib );

            if (i + ib < n) {
                /* Send the T matrix to the GPU. */
                for( dev=0; dev < num_gpus; dev++ ) {
                    magma_setdevice( dev );
                    magma_csetmatrix_async( ib, ib,
                                            hwork,      ib,
                                            dwork[dev], lddwork, stream[dev][0] );
                }
                
                la_dev = (panel_dev+1) % num_gpus;
                for( dev=0; dev < num_gpus; dev++ ) {
                    magma_setdevice( dev );
                    magmablasSetKernelStream( stream[dev][0] );
                    if (dev == la_dev && i+nb < min_mn-nb) {
                        // If not last panel,
                        // for look-ahead panel, apply H' to A(i:m,i+ib:i+2*ib)
                        i_nb_local = (i+nb)/(nb*num_gpus)*nb;
                        magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                          rows, ib, ib,
                                          dpanel[dev],             ldda,       // V
                                          dwork[dev],              lddwork,    // T
                                          dlA(dev, i, i_nb_local), ldda,       // C
                                          dwork[dev]+ib,           lddwork );  // work
                        magma_event_record( panel_event[dev], stream[dev][0] );
                        // for trailing matrix, apply H' to A(i:m,i+2*ib:n)
                        magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                          rows, n_local[dev]-(i_nb_local+ib), ib,
                                          dpanel[dev],                ldda,       // V
                                          dwork[dev],                 lddwork,    // T
                                          dlA(dev, i, i_nb_local+ib), ldda,       // C
                                          dwork[dev]+ib,              lddwork );  // work
                    }
                    else {
                        // for trailing matrix, apply H' to A(i:m,i+ib:n)
                        i_nb_local = i_local;
                        if (dev <= panel_dev) {
                            i_nb_local += ib;
                        }
                        magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                          rows, n_local[dev]-i_nb_local, ib,
                                          dpanel[dev],             ldda,       // V
                                          dwork[dev],              lddwork,    // T
                                          dlA(dev, i, i_nb_local), ldda,       // C
                                          dwork[dev]+ib,           lddwork );  // work
                    }
                }
                // Restore top of panel (after larfb is done)
                magma_setdevice( panel_dev );
                magma_csetmatrix_async( ib, ib,
                                        hpanel(i),                  ldhpanel,
                                        dlA(panel_dev, i, i_local), ldda, stream[panel_dev][0] );
            }
        }
    }
    else {
        i = 0;
    }
    
    /* Use unblocked code to factor the last or only block row. */
    if (i < min_mn) {
        rows = m-i;
        for( j=i; j < n; j += nb ) {
            panel_dev = (j/nb) % num_gpus;
            i_local = j/(nb*num_gpus)*nb;
            ib = min( n-j, nb );
            magma_setdevice( panel_dev );
            magma_cgetmatrix( rows, ib,
                              dlA(panel_dev, i, i_local), ldda,
                              hwork + (j-i)*rows,         rows );
        }

        // needs lwork >= 2*n*nb:
        // needs (m-i)*(n-i) for last block row, bounded by nb*n.
        // needs (n-i)*nb    for cgeqrf work,    bounded by n*nb.
        ib = n-i;  // total columns in block row
        lhwork = lwork - ib*rows;
        lapackf77_cgeqrf( &rows, &ib, hwork, &rows, tau+i, hwork + ib*rows, &lhwork, info );
        if ( *info != 0 ) {
            fprintf( stderr, "error %d\n", (int) *info );
        }
        
        for( j=i; j < n; j += nb ) {
            panel_dev = (j/nb) % num_gpus;
            i_local = j/(nb*num_gpus)*nb;
            ib = min( n-j, nb );
            magma_setdevice( panel_dev );
            magma_csetmatrix( rows, ib,
                              hwork + (j-i)*rows,         rows,
                              dlA(panel_dev, i, i_local), ldda );
        }
    }

CLEANUP:
    // free(NULL) does nothing.
    // check that queues and events are non-zero before destroying them, though.
    for( dev=0; dev < num_gpus; dev++ ) {
        magma_setdevice( dev );
        if ( stream[dev][0]   ) { magma_queue_destroy( stream[dev][0]   ); }
        if ( stream[dev][1]   ) { magma_queue_destroy( stream[dev][1]   ); }
        if ( panel_event[dev] ) { magma_event_destroy( panel_event[dev] ); }
        magma_free( dwork[dev] );
    }
    magma_free_pinned( hwork );
    magma_setdevice( cdevice );
    magmablasSetKernelStream( cqueue );

    return *info;
} /* magma_cgeqrf2_mgpu */
Example #24
0
/***************************************************************************//**
    Purpose
    -------
    CGEQRF computes a QR factorization of a COMPLEX M-by-N matrix A:
    A = Q * R using multiple GPUs. This version does not require work space on the GPU
    passed as input. GPU memory is allocated in the routine.

    Arguments
    ---------
    @param[in]
    ngpu    INTEGER
            Number of GPUs to use. ngpu > 0.

    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    A       COMPLEX array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further
            Details).
    \n
            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    @param[out]
    tau     COMPLEX array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    @param[out]
    work    (workspace) COMPLEX array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.
    \n
            Higher performance is achieved if WORK is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.  LWORK >= N*NB,
            where NB can be obtained through magma_get_cgeqrf_nb( M, N ).
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.

    Further Details
    ---------------
    The matrix Q is represented as a product of elementary reflectors

       Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).

    @ingroup magma_geqrf
*******************************************************************************/
extern "C" magma_int_t
magma_cgeqrf_m(
    magma_int_t ngpu,
    magma_int_t m, magma_int_t n,
    magmaFloatComplex *A,    magma_int_t lda, magmaFloatComplex *tau,
    magmaFloatComplex *work, magma_int_t lwork,
    magma_int_t *info )
{
    magmaFloatComplex *da[MagmaMaxGPUs];
    magmaFloatComplex c_one = MAGMA_C_ONE;

    magma_int_t i, min_mn, ldda;

    *info = 0;
    magma_int_t nb = magma_get_cgeqrf_nb( m, n );

    magma_int_t lwkopt = n * nb;
    work[0] = magma_cmake_lwork( lwkopt );
    bool lquery = (lwork == -1);
    if (ngpu < 0 || ngpu > MagmaMaxGPUs) {
        *info = -1;
    } else if (m < 0) {
        *info = -2;
    } else if (n < 0) {
        *info = -3;
    } else if (lda < max(1,m)) {
        *info = -5;
    } else if (lwork < max(1,n) && ! lquery) {
        *info = -8;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;

    min_mn = min(m,n);
    if (min_mn == 0) {
        work[0] = c_one;
        return *info;
    }

    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    
    ldda = magma_roundup( m, 32 );

    magma_int_t  n_local[MagmaMaxGPUs];
    for (i=0; i < ngpu; i++) {
        n_local[i] = ((n/nb)/ngpu)*nb;
        if (i < (n/nb)%ngpu)
            n_local[i] += nb;
        else if (i == (n/nb)%ngpu)
            n_local[i] += n%nb;

        magma_setdevice(i);
        
        // TODO on failure, free previously allocated memory
        if (MAGMA_SUCCESS != magma_cmalloc( &da[i], ldda*n_local[i] )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
    }

    if (m > nb && n > nb) {
        magma_queue_t queues[MagmaMaxGPUs];
        for( magma_int_t dev=0; dev < ngpu; dev++ ) {
            magma_setdevice( dev );
            magma_queue_create( dev, &queues[dev] );
        }

        /* Copy the matrix to the GPUs in 1D block cyclic distribution */
        magma_csetmatrix_1D_col_bcyclic( ngpu, m, n, nb, A, lda, da, ldda, queues );
        for( magma_int_t dev=0; dev < ngpu; dev++ ) {
            magma_setdevice( dev );
            magma_queue_sync( queues[dev] );
        }

        /* Factor using the GPU interface */
        magma_cgeqrf2_mgpu( ngpu, m, n, da, ldda, tau, info);

        /* Copy the matrix back from the GPUs to the CPU */
        magma_cgetmatrix_1D_col_bcyclic( ngpu, m, n, nb, da, ldda, A, lda, queues );
        for( magma_int_t dev=0; dev < ngpu; dev++ ) {
            magma_setdevice( dev );
            magma_queue_sync( queues[dev] );
            magma_queue_destroy( queues[dev] );
        }
    }
    else {
        lapackf77_cgeqrf(&m, &n, A, &lda, tau, work, &lwork, info);
    }


    /* Free the allocated GPU memory */
    for (i=0; i < ngpu; i++) {
        magma_setdevice(i);
        magma_free( da[i] );
    }
    magma_setdevice( orig_dev );

    return *info;
} /* magma_cgeqrf4 */
Example #25
0
extern "C" magma_err_t
magma_cgeqrf(magma_int_t m, magma_int_t n,
             magmaFloatComplex *A,    magma_int_t lda, magmaFloatComplex *tau,
             magmaFloatComplex *work, magma_int_t lwork,
             magma_int_t *info,
             magma_queue_t* queue )
{
    /*  -- clMAGMA (version 1.1.0) --
           Univ. of Tennessee, Knoxville
           Univ. of California, Berkeley
           Univ. of Colorado, Denver
           @date January 2014

        Purpose
        =======
        CGEQRF computes a QR factorization of a COMPLEX M-by-N matrix A:
        A = Q * R. This version does not require work space on the GPU
        passed as input. GPU memory is allocated in the routine.

        If the current stream is NULL, this version replaces it with user defined
        stream to overlap computation with communication.

        Arguments
        =========
        M       (input) INTEGER
                The number of rows of the matrix A.  M >= 0.

        N       (input) INTEGER
                The number of columns of the matrix A.  N >= 0.

        A       (input/output) COMPLEX array, dimension (LDA,N)
                On entry, the M-by-N matrix A.
                On exit, the elements on and above the diagonal of the array
                contain the min(M,N)-by-N upper trapezoidal matrix R (R is
                upper triangular if m >= n); the elements below the diagonal,
                with the array TAU, represent the orthogonal matrix Q as a
                product of min(m,n) elementary reflectors (see Further
                Details).

                Higher performance is achieved if A is in pinned memory, e.g.
                allocated using magma_malloc_pinned.

        LDA     (input) INTEGER
                The leading dimension of the array A.  LDA >= max(1,M).

        TAU     (output) COMPLEX array, dimension (min(M,N))
                The scalar factors of the elementary reflectors (see Further
                Details).

        WORK    (workspace/output) COMPLEX array, dimension (MAX(1,LWORK))
                On exit, if INFO = 0, WORK(1) returns the optimal LWORK.

                Higher performance is achieved if WORK is in pinned memory, e.g.
                allocated using magma_malloc_pinned.

        LWORK   (input) INTEGER
                The dimension of the array WORK.  LWORK >= max( N*NB, 2*NB*NB ),
                where NB can be obtained through magma_get_cgeqrf_nb(M).

                If LWORK = -1, then a workspace query is assumed; the routine
                only calculates the optimal size of the WORK array, returns
                this value as the first entry of the WORK array, and no error
                message related to LWORK is issued.

        INFO    (output) INTEGER
                = 0:  successful exit
                < 0:  if INFO = -i, the i-th argument had an illegal value
                      or another error occured, such as memory allocation failed.

        Further Details
        ===============
        The matrix Q is represented as a product of elementary reflectors

           Q = H(1) H(2) . . . H(k), where k = min(m,n).

        Each H(i) has the form

           H(i) = I - tau * v * v'

        where tau is a complex scalar, and v is a complex vector with
        v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
        and tau in TAU(i).
        =====================================================================    */

#define  A(i,j) ( A + (i) + (j)*lda )
#define dA(i,j) dA, dA_offset + (i) + (j)*ldda

    magmaFloatComplex_ptr dA, dwork, dT;
    size_t dA_offset, dwork_offset, dT_offset;

    magmaFloatComplex c_one = MAGMA_C_ONE;

    magma_int_t i, k, lddwork, old_i, old_ib;
    magma_int_t ib, ldda;

    *info = 0;
    magma_int_t nb = magma_get_cgeqrf_nb(min(m, n));

    // need 2*nb*nb to store T and upper triangle of V simultaneously
    magma_int_t lwkopt = max(n*nb, 2*nb*nb);
    work[0] = MAGMA_C_MAKE( (float)lwkopt, 0 );
    int lquery = (lwork == -1);
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,m)) {
        *info = -4;
    } else if (lwork < max(1, lwkopt) && ! lquery) {
        *info = -7;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;

    k = min(m,n);
    if (k == 0) {
        work[0] = c_one;
        return *info;
    }

    // largest N for larfb is n-nb (trailing matrix lacks 1st panel)
    lddwork = ((n+31)/32)*32 - nb;
    ldda    = ((m+31)/32)*32;

    magma_int_t num_gpus = magma_num_gpus();
    if( num_gpus > 1 ) {
        /* call multiple-GPU interface  */
        printf("multiple-GPU verison not implemented\n");
        return MAGMA_ERR_NOT_IMPLEMENTED;
        //return magma_cgeqrf4(num_gpus, m, n, A, lda, tau, work, lwork, info);
    }

    // allocate space for dA, dwork, and dT
    if (MAGMA_SUCCESS != magma_cmalloc( &dA, (n*ldda + nb*lddwork + nb*nb) )) {
        /* Switch to the "out-of-core" (out of GPU-memory) version */
        printf("non-GPU-resident version not implemented\n");
        return MAGMA_ERR_NOT_IMPLEMENTED;
        //return magma_cgeqrf_ooc(m, n, A, lda, tau, work, lwork, info);
    }

    dA_offset = 0;

    dwork = dA;
    dwork_offset = n*ldda;

    dT    = dA;
    dT_offset = n*ldda + nb*lddwork;

    if ( (nb > 1) && (nb < k) ) {
        /* Use blocked code initially.
           Asynchronously send the matrix to the GPU except the first panel. */
        magma_csetmatrix_async( m, n-nb,
                                A(0,nb), 0, lda,
                                dA(0,nb), ldda, queue[0], NULL );

        old_i = 0;
        old_ib = nb;
        for (i = 0; i < k-nb; i += nb) {
            ib = min(k-i, nb);
            if (i>0) {
                /* download i-th panel */
                magma_queue_sync( queue[1] );
                magma_cgetmatrix_async( m-i, ib,
                                        dA(i,i), ldda,
                                        A(i,i), 0, lda, queue[0], NULL );

                /* Apply H' to A(i:m,i+2*ib:n) from the left */
                magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                  m-old_i, n-old_i-2*old_ib, old_ib,
                                  dA(old_i, old_i),          ldda, dT, dT_offset,    nb,
                                  dA(old_i, old_i+2*old_ib), ldda, dwork, dwork_offset, lddwork, queue[1]);

                magma_cgetmatrix_async( i, ib,
                                        dA(0,i), ldda,
                                        A(0,i), 0, lda, queue[1], NULL );
                magma_queue_sync( queue[0] );
            }

            magma_int_t rows = m-i;
            lapackf77_cgeqrf(&rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info);

            /* Form the triangular factor of the block reflector
               H = H(i) H(i+1) . . . H(i+ib-1) */
            lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr,
                              &rows, &ib, A(i,i), &lda, tau+i, work, &ib);

            cpanel_to_q(MagmaUpper, ib, A(i,i), lda, work+ib*ib);

            /* download the i-th V matrix */
            magma_csetmatrix_async( rows, ib, A(i,i), 0, lda, dA(i,i), ldda, queue[0], NULL );

            /* download the T matrix */
            magma_queue_sync( queue[1] );
            magma_csetmatrix_async( ib, ib, work, 0, ib, dT, dT_offset, nb, queue[0], NULL );
            magma_queue_sync( queue[0] );

            if (i + ib < n) {

                if (i+ib < k-nb) {
                    /* Apply H' to A(i:m,i+ib:i+2*ib) from the left (look-ahead) */
                    magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                      rows, ib, ib,
                                      dA(i, i   ), ldda, dT, dT_offset,   nb,
                                      dA(i, i+ib), ldda, dwork, dwork_offset, lddwork, queue[1]);
                    cq_to_panel(MagmaUpper, ib, A(i,i), lda, work+ib*ib);
                }
                else {
                    /* After last panel, update whole trailing matrix. */
                    /* Apply H' to A(i:m,i+ib:n) from the left */
                    magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                      rows, n-i-ib, ib,
                                      dA(i, i   ), ldda, dT, dT_offset,   nb,
                                      dA(i, i+ib), ldda, dwork, dwork_offset, lddwork, queue[1]);
                    cq_to_panel(MagmaUpper, ib, A(i,i), lda, work+ib*ib);
                }

                old_i  = i;
                old_ib = ib;
            }
        }
    } else {
        i = 0;
    }

    /* Use unblocked code to factor the last or only block. */
    if (i < k) {
        ib = n-i;
        if (i != 0) {
            magma_cgetmatrix( m, ib, dA(0,i), ldda, A(0,i), 0, lda, queue[1] );
        }
        magma_int_t rows = m-i;
        lapackf77_cgeqrf(&rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info);
    }

    magma_queue_sync(queue[0]);
    magma_queue_sync(queue[1]);
    magma_free( dA );

    return *info;
} /* magma_cgeqrf */
Example #26
0
/**
    Purpose
    -------
    CGEQRF3 computes a QR factorization of a complex M-by-N matrix A:
    A = Q * R.
    
    This version stores the triangular dT matrices used in
    the block QR factorization so that they can be applied directly (i.e.,
    without being recomputed) later. As a result, the application
    of Q is much faster. Also, the upper triangular matrices for V have 0s
    in them. The corresponding parts of the upper triangular R are
    stored separately in dT.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    dA      COMPLEX array on the GPU, dimension (LDDA,N)
            On entry, the M-by-N matrix A.
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further
            Details).

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).
            To benefit from coalescent memory accesses LDDA must be
            divisible by 16.

    @param[out]
    tau     COMPLEX array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    @param[out]
    dT      (workspace) COMPLEX array on the GPU,
            dimension (2*MIN(M, N) + ceil(N/32)*32 )*NB,
            where NB can be obtained through magma_get_cgeqrf_nb( M, N ).
            It starts with a MIN(M,N)*NB block that stores the triangular T
            matrices, followed by a MIN(M,N)*NB block that stores
            the diagonal blocks of the R matrix.
            The rest of the array is used as workspace.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.

    Further Details
    ---------------
    The matrix Q is represented as a product of elementary reflectors

        Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

        H(i) = I - tau * v * v^H

    where tau is a complex scalar, and v is a complex vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).

    @ingroup magma_cgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_cgeqrf3_gpu(
    magma_int_t m, magma_int_t n,
    magmaFloatComplex_ptr dA, magma_int_t ldda,
    magmaFloatComplex *tau,
    magmaFloatComplex_ptr dT,
    magma_int_t *info )
{
    #ifdef HAVE_clBLAS
    #define dA(i_, j_)  dA, (dA_offset + (i_) + (j_)*(ldda))
    #define dT(i_)      dT, (dT_offset + (i_)*nb)
    #define dR(i_)      dT, (dT_offset + (  minmn + (i_))*nb)
    #define dwork(i_)   dT, (dT_offset + (2*minmn + (i_))*nb)
    #else
    #define dA(i_, j_) (dA + (i_) + (j_)*(ldda))
    #define dT(i_)     (dT + (i_)*nb)
    #define dR(i_)     (dT + (  minmn + (i_))*nb)
    #define dwork(i_)  (dT + (2*minmn + (i_))*nb)
    #endif
    
    magmaFloatComplex *work, *hwork, *R;
    magma_int_t cols, i, ib, ldwork, lddwork, lhwork, lwork, minmn, nb, old_i, old_ib, rows;
    
    // check arguments
    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    
    minmn = min( m, n );
    if (minmn == 0)
        return *info;
    
    // TODO: use min(m,n), but that affects dT
    nb = magma_get_cgeqrf_nb( m, n );
    
    // dT contains 3 blocks:
    // dT    is minmn*nb
    // dR    is minmn*nb
    // dwork is n*nb
    lddwork = n;
    
    // work  is m*nb for panel
    // hwork is n*nb, and at least nb*nb for T in larft
    // R     is nb*nb
    ldwork = m;
    lhwork = max( n*nb, nb*nb );
    lwork  = ldwork*nb + lhwork + nb*nb;
    // last block needs rows*cols for matrix and prefers cols*nb for work
    // worst case is n > m*nb, m a small multiple of nb:
    // needs n*nb + n > (m+n)*nb
    // prefers 2*n*nb, about twice above (m+n)*nb.
    i = ((minmn-1)/nb)*nb;
    lwork = max( lwork, (m-i)*(n-i) + (n-i)*nb );
    
    if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, lwork )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }
    hwork = work + ldwork*nb;
    R     = work + ldwork*nb + lhwork;
    memset( R, 0, nb*nb*sizeof(magmaFloatComplex) );
    
    magma_queue_t queues[2];
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queues[0] );
    magma_queue_create( cdev, &queues[1] );
        
    if ( nb > 1 && nb < minmn ) {
        // need nb*nb for T in larft
        assert( lhwork >= nb*nb );
        
        // Use blocked code initially
        old_i = 0; old_ib = nb;
        for (i = 0; i < minmn-nb; i += nb) {
            ib = min( minmn-i, nb );
            rows = m - i;
            
            // get i-th panel from device
            magma_cgetmatrix_async( rows, ib,
                                    dA(i,i), ldda,
                                    work,    ldwork, queues[1] );
            if (i > 0) {
                // Apply H^H to A(i:m,i+2*ib:n) from the left
                cols = n - old_i - 2*old_ib;
                magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                  m-old_i, cols, old_ib,
                                  dA(old_i, old_i         ), ldda, dT(old_i), nb,
                                  dA(old_i, old_i+2*old_ib), ldda, dwork(0),  lddwork, queues[0] );
                
                // Fix the diagonal block
                magma_csetmatrix_async( old_ib, old_ib,
                                        R,         old_ib,
                                        dR(old_i), old_ib, queues[0] );
            }
            
            magma_queue_sync( queues[1] );  // wait to get work(i)
            lapackf77_cgeqrf( &rows, &ib, work, &ldwork, &tau[i], hwork, &lhwork, info );
            // Form the triangular factor of the block reflector in hwork
            // H = H(i) H(i+1) . . . H(i+ib-1)
            lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr,
                              &rows, &ib,
                              work, &ldwork, &tau[i], hwork, &ib );
            
            // wait for previous trailing matrix update (above) to finish with R
            magma_queue_sync( queues[0] );
            
            // copy the upper triangle of panel to R and invert it, and
            // set  the upper triangle of panel (V) to identity
            csplit_diag_block( ib, work, ldwork, R );
            
            // send i-th V matrix to device
            magma_csetmatrix( rows, ib,
                              work, ldwork,
                              dA(i,i), ldda, queues[1] );
            
            if (i + ib < n) {
                // send T matrix to device
                magma_csetmatrix( ib, ib,
                                  hwork, ib,
                                  dT(i), nb, queues[1] );
                
                if (i+nb < minmn-nb) {
                    // Apply H^H to A(i:m,i+ib:i+2*ib) from the left
                    magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                      rows, ib, ib,
                                      dA(i, i   ), ldda, dT(i),  nb,
                                      dA(i, i+ib), ldda, dwork(0), lddwork, queues[1] );
                    // wait for larfb to finish with dwork before larfb in next iteration starts
                    magma_queue_sync( queues[1] );
                }
                else {
                    // Apply H^H to A(i:m,i+ib:n) from the left
                    magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                      rows, n-i-ib, ib,
                                      dA(i, i   ), ldda, dT(i),  nb,
                                      dA(i, i+ib), ldda, dwork(0), lddwork, queues[1] );
                    // Fix the diagonal block
                    magma_csetmatrix( ib, ib,
                                      R,     ib,
                                      dR(i), ib, queues[1] );
                }
                old_i  = i;
                old_ib = ib;
            }
        }
    } else {
        i = 0;
    }
    
    // Use unblocked code to factor the last or only block.
    if (i < minmn) {
        rows = m-i;
        cols = n-i;
        magma_cgetmatrix( rows, cols, dA(i, i), ldda, work, rows, queues[1] );
        // see comments for lwork above
        lhwork = lwork - rows*cols;
        lapackf77_cgeqrf( &rows, &cols, work, &rows, &tau[i], &work[rows*cols], &lhwork, info );
        magma_csetmatrix( rows, cols, work, rows, dA(i, i), ldda, queues[1] );
    }
    
    magma_queue_destroy( queues[0] );
    magma_queue_destroy( queues[1] );
    
    magma_free_pinned( work );
    
    return *info;
} // magma_cgeqrf_gpu
Example #27
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing zcgeqrsv
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    real_Double_t   gflops, gpu_perf, gpu_time, cpu_perf, cpu_time, gpu_perfd, gpu_perfs;
    double          error, gpu_error, cpu_error, Anorm, work[1];
    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex *h_A, *h_A2, *h_B, *h_X, *h_R;
    magmaDoubleComplex_ptr d_A, d_B, d_X, d_T;
    magmaFloatComplex  *d_SA, *d_SB;
    magmaDoubleComplex *h_workd, *tau, tmp[1];
    magmaFloatComplex  *h_works;
    magma_int_t lda,  ldb, lhwork, lworkgpu;
    magma_int_t ldda, lddb, lddx;
    magma_int_t M, N, nrhs, qrsv_iters, info, size, min_mn, max_mn, nb;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};

    printf("Epsilon(double): %8.6e\n"
           "Epsilon(single): %8.6e\n\n",
           lapackf77_dlamch("Epsilon"), lapackf77_slamch("Epsilon") );
    magma_int_t status = 0;

    magma_opts opts;
    parse_opts( argc, argv, &opts );

    double tol = opts.tolerance * lapackf77_dlamch("E");
    
    nrhs = opts.nrhs;
    
    printf("                    CPU Gflop/s   GPU  Gflop/s                         |b-Ax|| / (N||A||)   ||dx-x||/(N||A||)\n");
    printf("    M     N  NRHS    double        double    single     mixed   Iter   CPU        GPU                        \n");
    printf("=============================================================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[itest];
            N = opts.nsize[itest];
            if ( M < N ) {
                printf( "%5d %5d %5d   skipping because M < N is not yet supported.\n", (int) M, (int) N, (int) nrhs );
                continue;
            }
            min_mn = min(M, N);
            max_mn = max(M, N);
            lda    = M;
            ldb    = max_mn;
            ldda   = ((M+31)/32) * 32;
            lddb   = ((max_mn+31)/32)*32;
            lddx   = ((N+31)/32) * 32;
            nb     = max( magma_get_zgeqrf_nb( M ), magma_get_cgeqrf_nb( M ) );
            gflops = (FLOPS_ZGEQRF( M, N ) + FLOPS_ZGEQRS( M, N, nrhs )) / 1e9;
            
            lworkgpu = (M - N + nb)*(nrhs + nb) + nrhs*nb;
            
            // query for workspace size
            lhwork = -1;
            lapackf77_zgels( MagmaNoTransStr, &M, &N, &nrhs,
                             NULL, &lda, NULL, &ldb, tmp, &lhwork, &info );
            lhwork = (magma_int_t) MAGMA_Z_REAL( tmp[0] );
            lhwork = max( lhwork, lworkgpu );
            
            TESTING_MALLOC_CPU( tau,     magmaDoubleComplex, min_mn   );
            TESTING_MALLOC_CPU( h_A,     magmaDoubleComplex, lda*N    );
            TESTING_MALLOC_CPU( h_A2,    magmaDoubleComplex, lda*N    );
            TESTING_MALLOC_CPU( h_B,     magmaDoubleComplex, ldb*nrhs );
            TESTING_MALLOC_CPU( h_X,     magmaDoubleComplex, ldb*nrhs );
            TESTING_MALLOC_CPU( h_R,     magmaDoubleComplex, ldb*nrhs );
            TESTING_MALLOC_CPU( h_workd, magmaDoubleComplex, lhwork   );
            h_works = (magmaFloatComplex*)h_workd;
            
            TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N      );
            TESTING_MALLOC_DEV( d_B, magmaDoubleComplex, lddb*nrhs   );
            TESTING_MALLOC_DEV( d_X, magmaDoubleComplex, lddx*nrhs   );
            TESTING_MALLOC_DEV( d_T, magmaDoubleComplex, ( 2*min_mn + (N+31)/32*32 )*nb );
            
            /* Initialize the matrices */
            size = lda*N;
            lapackf77_zlarnv( &ione, ISEED, &size, h_A );
            lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_A2, &lda );
            
            // make random RHS
            size = ldb*nrhs;
            lapackf77_zlarnv( &ione, ISEED, &size, h_B );
            lapackf77_zlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb );
            
            magma_zsetmatrix( M, N,    h_A, lda, d_A, ldda );
            magma_zsetmatrix( M, nrhs, h_B, ldb, d_B, lddb );
            
            //=====================================================================
            //              Mixed Precision Iterative Refinement - GPU
            //=====================================================================
            gpu_time = magma_wtime();
            magma_zcgeqrsv_gpu( M, N, nrhs,
                                d_A, ldda, d_B, lddb,
                                d_X, lddx, &qrsv_iters, &info );
            gpu_time = magma_wtime() - gpu_time;
            gpu_perf = gflops / gpu_time;
            if (info != 0)
                printf("magma_zcgeqrsv returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            
            // compute the residual
            magma_zgetmatrix( N, nrhs, d_X, lddx, h_X, ldb );
            blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N,
                           &c_neg_one, h_A, &lda,
                                       h_X, &ldb,
                           &c_one,     h_R, &ldb);
            Anorm = lapackf77_zlange("f", &M, &N,    h_A, &lda, work);
            
            //=====================================================================
            //                 Double Precision Solve
            //=====================================================================
            magma_zsetmatrix( M, N,    h_A, lda, d_A, ldda );
            magma_zsetmatrix( M, nrhs, h_B, ldb, d_B, lddb );
            
            gpu_time = magma_wtime();
            magma_zgels_gpu( MagmaNoTrans, M, N, nrhs, d_A, ldda,
                             d_B, lddb, h_workd, lworkgpu, &info);
            gpu_time = magma_wtime() - gpu_time;
            gpu_perfd = gflops / gpu_time;
            
            //=====================================================================
            //                 Single Precision Solve
            //=====================================================================
            magma_zsetmatrix( M, N,    h_A, lda, d_A, ldda );
            magma_zsetmatrix( M, nrhs, h_B, ldb, d_B, lddb );
            
            /* The allocation of d_SA and d_SB is done here to avoid
             * to double the memory used on GPU with zcgeqrsv */
            TESTING_MALLOC_DEV( d_SA, magmaFloatComplex, ldda*N    );
            TESTING_MALLOC_DEV( d_SB, magmaFloatComplex, lddb*nrhs );
            magmablas_zlag2c( M, N,    d_A, ldda, d_SA, ldda, &info );
            magmablas_zlag2c( N, nrhs, d_B, lddb, d_SB, lddb, &info );
            
            gpu_time = magma_wtime();
            magma_cgels_gpu( MagmaNoTrans, M, N, nrhs, d_SA, ldda,
                             d_SB, lddb, h_works, lhwork, &info);
            gpu_time = magma_wtime() - gpu_time;
            gpu_perfs = gflops / gpu_time;
            TESTING_FREE_DEV( d_SA );
            TESTING_FREE_DEV( d_SB );
            
            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            lapackf77_zlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_X, &ldb );
            
            cpu_time = magma_wtime();
            lapackf77_zgels( MagmaNoTransStr, &M, &N, &nrhs,
                             h_A, &lda, h_X, &ldb, h_workd, &lhwork, &info );
            cpu_time = magma_wtime() - cpu_time;
            cpu_perf = gflops / cpu_time;
            if (info != 0)
                printf("lapackf77_zgels returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            
            blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N,
                           &c_neg_one, h_A2, &lda,
                                       h_X,  &ldb,
                           &c_one,     h_B,  &ldb );
            
            cpu_error = lapackf77_zlange("f", &M, &nrhs, h_B, &ldb, work) / (min_mn*Anorm);
            gpu_error = lapackf77_zlange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*Anorm);
            
            // error relative to LAPACK
            size = M*nrhs;
            blasf77_zaxpy( &size, &c_neg_one, h_B, &ione, h_R, &ione );
            error = lapackf77_zlange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*Anorm);
            
            printf("%5d %5d %5d   %7.2f       %7.2f   %7.2f   %7.2f   %4d   %8.2e   %8.2e   %8.2e   %s\n",
                   (int) M, (int) N, (int) nrhs,
                   cpu_perf, gpu_perfd, gpu_perfs, gpu_perf,
                   (int) qrsv_iters,
                   cpu_error, gpu_error, error, (error < tol ? "ok" : "failed"));
            status += ! (error < tol);
            
            TESTING_FREE_CPU( tau  );
            TESTING_FREE_CPU( h_A  );
            TESTING_FREE_CPU( h_A2 );
            TESTING_FREE_CPU( h_B  );
            TESTING_FREE_CPU( h_X  );
            TESTING_FREE_CPU( h_R  );
            TESTING_FREE_CPU( h_workd );
            
            TESTING_FREE_DEV( d_A );
            TESTING_FREE_DEV( d_B );
            TESTING_FREE_DEV( d_X );
            TESTING_FREE_DEV( d_T );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }
    
    TESTING_FINALIZE();
    return status;
}
Example #28
0
extern "C" magma_int_t
magma_cgeqrf4(magma_int_t num_gpus, magma_int_t m, magma_int_t n,
              magmaFloatComplex *a,    magma_int_t lda, magmaFloatComplex *tau,
              magmaFloatComplex *work, magma_int_t lwork,
              magma_int_t *info )
{
/*  -- MAGMA (version 1.4.1) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       December 2013

    Purpose
    =======
    CGEQRF4 computes a QR factorization of a COMPLEX M-by-N matrix A:
    A = Q * R using multiple GPUs. This version does not require work space on the GPU
    passed as input. GPU memory is allocated in the routine.

    Arguments
    =========
    NUM_GPUS
            (input) INTEGER
            The number of GPUs to be used for the factorization.

    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) COMPLEX array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further
            Details).

            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    TAU     (output) COMPLEX array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    WORK    (workspace/output) COMPLEX array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.

            Higher performance is achieved if WORK is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    LWORK   (input) INTEGER
            The dimension of the array WORK.  LWORK >= N*NB,
            where NB can be obtained through magma_get_cgeqrf_nb(M).

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.

    Further Details
    ===============
    The matrix Q is represented as a product of elementary reflectors

       Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).
    =====================================================================    */

    magmaFloatComplex *da[MagmaMaxGPUs];
    magmaFloatComplex c_one = MAGMA_C_ONE;

    int i, k, ldda;

    *info = 0;
    int nb = magma_get_cgeqrf_nb(min(m, n));

    int lwkopt = n * nb;
    work[0] = MAGMA_C_MAKE( (float)lwkopt, 0 );
    int lquery = (lwork == -1);
    if (num_gpus <0 || num_gpus > 4) {
        *info = -1;
    } else if (m < 0) {
        *info = -2;
    } else if (n < 0) {
        *info = -3;
    } else if (lda < max(1,m)) {
        *info = -5;
    } else if (lwork < max(1,n) && ! lquery) {
        *info = -8;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;

    k = min(m,n);
    if (k == 0) {
        work[0] = c_one;
        return *info;
    }

    ldda    = ((m+31)/32)*32;

    magma_int_t  n_local[MagmaMaxGPUs];
    for(i=0; i<num_gpus; i++){
        n_local[i] = ((n/nb)/num_gpus)*nb;
        if (i < (n/nb)%num_gpus)
            n_local[i] += nb;
        else if (i == (n/nb)%num_gpus)
            n_local[i] += n%nb;

        magma_setdevice(i);
        
        // TODO on failure, free previously allocated memory
        if (MAGMA_SUCCESS != magma_cmalloc( &da[i], ldda*n_local[i] )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
    }

    if (m > nb && n > nb) {

        /* Copy the matrix to the GPUs in 1D block cyclic distribution */
        magma_csetmatrix_1D_col_bcyclic(m, n, a, lda, da, ldda, num_gpus, nb);

        /* Factor using the GPU interface */
        magma_cgeqrf2_mgpu( num_gpus, m, n, da, ldda, tau, info);

        /* Copy the matrix back from the GPUs to the CPU */
        magma_cgetmatrix_1D_col_bcyclic(m, n, da, ldda, a, lda, num_gpus, nb);
    }
    else {
        lapackf77_cgeqrf(&m, &n, a, &lda, tau, work, &lwork, info);
    }


    /* Free the allocated GPU memory */
    for(i=0; i<num_gpus; i++){
        magma_setdevice(i);
        magma_free( da[i] );
    }

    return *info;
} /* magma_cgeqrf4 */
Example #29
0
/**
    Purpose
    -------
    CGEQRF_OOC computes a QR factorization of a COMPLEX M-by-N matrix A:
    A = Q * R. This version does not require work space on the GPU
    passed as input. GPU memory is allocated in the routine.
    This is an out-of-core (ooc) version that is similar to magma_cgeqrf but
    the difference is that this version can use a GPU even if the matrix
    does not fit into the GPU memory at once.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    A       COMPLEX array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further
            Details).
    \n
            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    @param[out]
    tau     COMPLEX array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    @param[out]
    work    (workspace) COMPLEX array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.
    \n
            Higher performance is achieved if WORK is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.  LWORK >= N*NB,
            where NB can be obtained through magma_get_cgeqrf_nb( M, N ).
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.

    Further Details
    ---------------
    The matrix Q is represented as a product of elementary reflectors

        Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

        H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).

    @ingroup magma_cgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_cgeqrf_ooc(
    magma_int_t m, magma_int_t n,
    magmaFloatComplex *A,    magma_int_t lda, magmaFloatComplex *tau,
    magmaFloatComplex *work, magma_int_t lwork,
    magma_int_t *info )
{
    #define  A(i_,j_) ( A + (i_) + (j_)*lda )
    #define dA(i_,j_) (dA + (i_) + (j_)*ldda)

    /* Constants */
    const magmaFloatComplex c_one = MAGMA_C_ONE;
    
    /* Local variables */
    magmaFloatComplex_ptr dA, dwork;
    magma_int_t i, ib, IB, j, min_mn, lddwork, ldda, rows;

    magma_int_t nb = magma_get_cgeqrf_nb( m, n );

    magma_int_t lwkopt = n * nb;
    work[0] = magma_cmake_lwork( lwkopt );
    bool lquery = (lwork == -1);
    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,m)) {
        *info = -4;
    } else if (lwork < max(1,n) && ! lquery) {
        *info = -7;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Check how much memory do we have */
    size_t freeMem, totalMem;
    cudaMemGetInfo( &freeMem, &totalMem );
    freeMem /= sizeof(magmaFloatComplex);
    
    magma_int_t NB = magma_int_t(0.8*freeMem/m);
    NB = (NB / nb) * nb;

    if (NB >= n)
        return magma_cgeqrf(m, n, A, lda, tau, work, lwork, info);

    min_mn = min(m,n);
    if (min_mn == 0) {
        work[0] = c_one;
        return *info;
    }

    lddwork = magma_roundup( NB, 32 ) + nb;
    ldda    = magma_roundup( m, 32 );

    if (MAGMA_SUCCESS != magma_cmalloc( &dA, (NB + nb)*ldda + nb*lddwork )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    magma_queue_t queues[2];
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queues[0] );
    magma_queue_create( cdev, &queues[1] );

    magmaFloatComplex_ptr ptr = dA + ldda*NB;
    dwork = dA + ldda*(NB + nb);

    /* start the main loop over the blocks that fit in the GPU memory */
    for (i=0; i < n; i += NB) {
        IB = min( n-i, NB );
        //printf("Processing %5d columns -- %5d to %5d ... \n", IB, i, i+IB);

        /* 1. Copy the next part of the matrix to the GPU */
        magma_csetmatrix_async( m, IB,
                                A(0,i),  lda,
                                dA(0,0), ldda, queues[0] );
        magma_queue_sync( queues[0] );

        /* 2. Update it with the previous transformations */
        for (j=0; j < min(i,min_mn); j += nb) {
            ib = min( min_mn-j, nb );

            /* Get a panel in ptr.                                           */
            //   1. Form the triangular factor of the block reflector
            //   2. Send it to the GPU.
            //   3. Put 0s in the upper triangular part of V.
            //   4. Send V to the GPU in ptr.
            //   5. Update the matrix.
            //   6. Restore the upper part of V.
            rows = m-j;
            lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr,
                              &rows, &ib, A(j,j), &lda, tau+j, work, &ib);
            magma_csetmatrix_async( ib, ib,
                                    work,  ib,
                                    dwork, lddwork, queues[1] );

            magma_cpanel_to_q( MagmaUpper, ib, A(j,j), lda, work+ib*ib );
            magma_csetmatrix_async( rows, ib,
                                    A(j,j), lda,
                                    ptr,    rows, queues[1] );
            magma_queue_sync( queues[1] );

            magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                              rows, IB, ib,
                              ptr, rows, dwork,    lddwork,
                              dA(j, 0), ldda, dwork+ib, lddwork, queues[1] );

            magma_cq_to_panel( MagmaUpper, ib, A(j,j), lda, work+ib*ib );
        }

        /* 3. Do a QR on the current part */
        if (i < min_mn)
            magma_cgeqrf2_gpu( m-i, IB, dA(i,0), ldda, tau+i, info );

        /* 4. Copy the current part back to the CPU */
        magma_cgetmatrix_async( m, IB,
                                dA(0,0), ldda,
                                A(0,i),  lda, queues[0] );
    }

    magma_queue_sync( queues[0] );

    magma_queue_destroy( queues[0] );
    magma_queue_destroy( queues[1] );
    magma_free( dA );
    
    return *info;
} /* magma_cgeqrf_ooc */
Example #30
0
/**
    Purpose
    -------
    Solves the least squares problem
           min || A*X - C ||
    using the QR factorization A = Q*R computed by CGEQRF3_GPU.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A. M >= N >= 0.

    @param[in]
    nrhs    INTEGER
            The number of columns of the matrix C. NRHS >= 0.

    @param[in]
    dA      COMPLEX array on the GPU, dimension (LDDA,N)
            The i-th column must contain the vector which defines the
            elementary reflector H(i), for i = 1,2,...,n, as returned by
            CGEQRF3_GPU in the first n columns of its array argument A.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array A, LDDA >= M.

    @param[in]
    tau     COMPLEX array, dimension (N)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by MAGMA_CGEQRF_GPU.

    @param[in,out]
    dB      COMPLEX array on the GPU, dimension (LDDB,NRHS)
            On entry, the M-by-NRHS matrix C.
            On exit, the N-by-NRHS solution matrix X.

    @param[in]
    dT      COMPLEX array that is the output (the 6th argument)
            of magma_cgeqrf_gpu of size
            2*MIN(M, N)*NB + ((N+31)/32*32 )* MAX(NB, NRHS).
            The array starts with a block of size MIN(M,N)*NB that stores
            the triangular T matrices used in the QR factorization,
            followed by MIN(M,N)*NB block storing the diagonal block
            matrices for the R matrix, followed by work space of size
            ((N+31)/32*32 )* MAX(NB, NRHS).

    @param[in]
    lddb    INTEGER
            The leading dimension of the array dB. LDDB >= M.

    @param[out]
    hwork   (workspace) COMPLEX array, dimension (LWORK)
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK,
            LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB,
            where NB is the blocksize given by magma_get_cgeqrf_nb( M ).
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the HWORK array, returns
            this value as the first entry of the WORK array.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_cgels_comp
    ********************************************************************/
extern "C" magma_int_t
magma_cgeqrs3_gpu(magma_int_t m, magma_int_t n, magma_int_t nrhs,
                  magmaFloatComplex *dA,    magma_int_t ldda,
                  magmaFloatComplex *tau,   magmaFloatComplex *dT,
                  magmaFloatComplex *dB,    magma_int_t lddb,
                  magmaFloatComplex *hwork, magma_int_t lwork,
                  magma_int_t *info)
{
    #define dA(a_1,a_2) (dA + (a_2)*(ldda) + (a_1))
    #define dT(a_1)     (dT + (lddwork+(a_1))*nb)

    magmaFloatComplex c_one     = MAGMA_C_ONE;
    magma_int_t k, lddwork;

    magma_int_t nb     = magma_get_cgeqrf_nb(m);
    magma_int_t lwkopt = (m - n + nb)*(nrhs + nb) + nrhs*nb;
    int lquery = (lwork == -1);

    hwork[0] = MAGMA_C_MAKE( (float)lwkopt, 0. );

    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0 || m < n)
        *info = -2;
    else if (nrhs < 0)
        *info = -3;
    else if (ldda < max(1,m))
        *info = -5;
    else if (lddb < max(1,m))
        *info = -8;
    else if (lwork < lwkopt && ! lquery)
        *info = -10;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;

    k = min(m,n);
    if (k == 0) {
        hwork[0] = c_one;
        return *info;
    }
    lddwork = k;

    /* B := Q' * B */
    magma_cunmqr_gpu( MagmaLeft, Magma_ConjTrans,
                      m, nrhs, n,
                      dA(0,0), ldda, tau,
                      dB, lddb, hwork, lwork, dT, nb, info );
    if ( *info != 0 ) {
        return *info;
    }

    /* Solve R*X = B(1:n,:)
       1. Move the (k-1)/nb block diagonal submatrices from dT to R
       2. Solve
       3. Restore the data format moving data from R back to dT
    */
    magmablas_cswapdblk(k-1, nb, dA(0,0), ldda, 1, dT(0), nb, 0);
    if ( nrhs == 1 ) {
        magma_ctrsv(MagmaUpper, MagmaNoTrans, MagmaNonUnit,
                    n, dA(0,0), ldda, dB, 1);
    } else {
        magma_ctrsm(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit,
                    n, nrhs, c_one, dA(0,0), ldda, dB, lddb);
    }
    magmablas_cswapdblk(k-1, nb, dT(0), nb, 0, dA(0,0), ldda, 1);

    return *info;
}