Example no. 1
extern "C" magma_int_t
magma_sgeqrf2_mgpu( magma_int_t num_gpus, magma_int_t m, magma_int_t n,
                    float **dlA, magma_int_t ldda,
                    float *tau,
                    magma_int_t *info )
{
/*  -- MAGMA (version 1.4.1) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       December 2013

    Purpose
    =======
    SGEQRF2_MGPU computes a QR factorization of a real M-by-N matrix A:
    A = Q * R. This is a GPU interface of the routine.

    Arguments
    =========
    NUM_GPUS (input) INTEGER
            The number of GPUs to be used.  NUM_GPUS > 0.

    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    dlA     (input/output) REAL array of pointers on the GPU, dimension (NUM_GPUS)
            On entry, the M-by-N matrix A, distributed over the GPUs in a
            1-D block-cyclic column layout with block size nb
            (dlA[dev] points to the local columns on the dev-th GPU).
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further
            Details).

    LDDA    (input) INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).
            To benefit from coalescent memory accesses LDDA must be
            divisible by 16.

    TAU     (output) REAL array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as a failed memory allocation.

    Further Details
    ===============
    The matrix Q is represented as a product of elementary reflectors

       Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).
    =====================================================================    */

    #define dlA(dev, i, j)   (dlA[dev] + (i) + (j)*(ldda))
    #define hpanel(i)        (hpanel + (i))

    // set to NULL to make cleanup easy: free(NULL) does nothing.
    float *dwork[MagmaMaxGPUs]={NULL}, *dpanel[MagmaMaxGPUs]={NULL};
    float *hwork=NULL, *hpanel=NULL;
    magma_queue_t stream[MagmaMaxGPUs][2]={{NULL}};
    magma_event_t panel_event[MagmaMaxGPUs]={NULL};

    magma_int_t i, j, min_mn, dev, ldhpanel, lddwork, rows;
    magma_int_t ib, nb;
    magma_int_t lhwork, lwork;
    magma_int_t panel_dev, i_local, i_nb_local, n_local[MagmaMaxGPUs], la_dev, dpanel_offset;

    magma_queue_t cqueue;
    magmablasGetKernelStream( &cqueue );
    
    magma_device_t cdevice;
    magma_getdevice( &cdevice );

    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    min_mn = min(m,n);
    if (min_mn == 0)
        return *info;

    nb = magma_get_sgeqrf_nb( m );

    /* dwork is (n*nb) --- for T (nb*nb) and slarfb work ((n-nb)*nb) ---
     *        + dpanel (ldda*nb), on each GPU.
     * I think slarfb work could be smaller, max(n_local[:]).
     * Oddly, T and slarfb work get stacked on top of each other, both with lddwork=n.
     * on GPU that owns panel, set dpanel = dlA(dev,i,i_local).
     * on other GPUs,          set dpanel = dwork[dev] + dpanel_offset. */
    lddwork = n;
    dpanel_offset = lddwork*nb;
    for( dev=0; dev < num_gpus; dev++ ) {
        magma_setdevice( dev );
        if ( MAGMA_SUCCESS != magma_smalloc( &(dwork[dev]), (lddwork + ldda)*nb )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            goto CLEANUP;
        }
    }

    /* hwork is MAX( workspace for sgeqrf (n*nb), two copies of T (2*nb*nb) )
     *        + hpanel (m*nb).
     * for last block, need 2*n*nb total. */
    ldhpanel = m;
    lhwork = max( n*nb, 2*nb*nb );
    lwork = max( lhwork + ldhpanel*nb, 2*n*nb );
    if ( MAGMA_SUCCESS != magma_smalloc_pinned( &hwork, lwork )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        goto CLEANUP;
    }
    hpanel = hwork + lhwork;

    /* Set the number of local n for each GPU */
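    /* Illustration: with n=100, nb=32, num_gpus=2 there are n/nb = 3 full
       panels, dealt round-robin to devices 0,1,0; the 4 leftover columns go
       to device (n/nb) % num_gpus = 1, giving n_local = { 64, 36 }. */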
    for( dev=0; dev < num_gpus; dev++ ) {
        n_local[dev] = ((n/nb)/num_gpus)*nb;
        if (dev < (n/nb) % num_gpus)
            n_local[dev] += nb;
        else if (dev == (n/nb) % num_gpus)
            n_local[dev] += n % nb;
    }

    for( dev=0; dev < num_gpus; dev++ ) {
        magma_setdevice( dev );
        magma_queue_create( &stream[dev][0] );
        magma_queue_create( &stream[dev][1] );
        magma_event_create( &panel_event[dev] );
    }

    if ( nb < min_mn ) {
        /* Use blocked code initially */
        // Note: as written, ib cannot be < nb.
        for( i = 0; i < min_mn-nb; i += nb ) {
            /* Set the GPU number that holds the current panel */
            panel_dev = (i/nb) % num_gpus;
            
            /* Set the local index where the current panel is (j==i) */
            i_local = i/(nb*num_gpus)*nb;
            
            ib = min(min_mn-i, nb);
            rows = m-i;
            
            /* Send current panel to the CPU, after panel_event indicates it has been updated */
            magma_setdevice( panel_dev );
            magma_queue_wait_event( stream[panel_dev][1], panel_event[panel_dev] );
            magma_sgetmatrix_async( rows, ib,
                                    dlA(panel_dev, i, i_local), ldda,
                                    hpanel(i),                  ldhpanel, stream[panel_dev][1] );
            magma_queue_sync( stream[panel_dev][1] );

            // Factor panel
            lapackf77_sgeqrf( &rows, &ib, hpanel(i), &ldhpanel, tau+i,
                              hwork, &lhwork, info );
            if ( *info != 0 ) {
                fprintf( stderr, "error %d\n", (int) *info );
            }

            // Form the triangular factor of the block reflector
            // H = H(i) H(i+1) . . . H(i+ib-1)
            lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr,
                              &rows, &ib,
                              hpanel(i), &ldhpanel, tau+i, hwork, &ib );

            spanel_to_q( MagmaUpper, ib, hpanel(i), ldhpanel, hwork + ib*ib );
            // Send the current panel back to the GPUs
            for( dev=0; dev < num_gpus; dev++ ) {
                magma_setdevice( dev );
                if (dev == panel_dev)
                    dpanel[dev] = dlA(dev, i, i_local);
                else
                    dpanel[dev] = dwork[dev] + dpanel_offset;
                magma_ssetmatrix_async( rows, ib,
                                        hpanel(i),   ldhpanel,
                                        dpanel[dev], ldda, stream[dev][0] );
            }
            for( dev=0; dev < num_gpus; dev++ ) {
                magma_setdevice( dev );
                magma_queue_sync( stream[dev][0] );
            }

            // TODO: if spanel_to_q copied whole block, wouldn't need to restore
            // -- just send the copy to the GPUs.
            // TODO: also, could zero out the lower triangle and use Azzam's larfb w/ gemm.
            
            /* Restore the panel */
            sq_to_panel( MagmaUpper, ib, hpanel(i), ldhpanel, hwork + ib*ib );

            if (i + ib < n) {
                /* Send the T matrix to the GPU. */
                for( dev=0; dev < num_gpus; dev++ ) {
                    magma_setdevice( dev );
                    magma_ssetmatrix_async( ib, ib,
                                            hwork,      ib,
                                            dwork[dev], lddwork, stream[dev][0] );
                }
                
                la_dev = (panel_dev+1) % num_gpus;
                for( dev=0; dev < num_gpus; dev++ ) {
                    magma_setdevice( dev );
                    magmablasSetKernelStream( stream[dev][0] );
                    if (dev == la_dev && i+nb < min_mn-nb) {
                        // If not last panel,
                        // for look-ahead panel, apply H' to A(i:m,i+ib:i+2*ib)
                        i_nb_local = (i+nb)/(nb*num_gpus)*nb;
                        magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise,
                                          rows, ib, ib,
                                          dpanel[dev],             ldda,       // V
                                          dwork[dev],              lddwork,    // T
                                          dlA(dev, i, i_nb_local), ldda,       // C
                                          dwork[dev]+ib,           lddwork );  // work
                        magma_event_record( panel_event[dev], stream[dev][0] );
                        // for trailing matrix, apply H' to A(i:m,i+2*ib:n)
                        magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise,
                                          rows, n_local[dev]-(i_nb_local+ib), ib,
                                          dpanel[dev],                ldda,       // V
                                          dwork[dev],                 lddwork,    // T
                                          dlA(dev, i, i_nb_local+ib), ldda,       // C
                                          dwork[dev]+ib,              lddwork );  // work
                    }
                    else {
                        // for trailing matrix, apply H' to A(i:m,i+ib:n)
                        i_nb_local = i_local;
                        if (dev <= panel_dev) {
                            i_nb_local += ib;
                        }
                        magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise,
                                          rows, n_local[dev]-i_nb_local, ib,
                                          dpanel[dev],             ldda,       // V
                                          dwork[dev],              lddwork,    // T
                                          dlA(dev, i, i_nb_local), ldda,       // C
                                          dwork[dev]+ib,           lddwork );  // work
                    }
                }
                // Restore top of panel (after larfb is done)
                magma_setdevice( panel_dev );
                magma_ssetmatrix_async( ib, ib,
                                        hpanel(i),                  ldhpanel,
                                        dlA(panel_dev, i, i_local), ldda, stream[panel_dev][0] );
            }
        }
    }
    else {
        i = 0;
    }
    
    /* Use unblocked code to factor the last or only block row. */
    if (i < min_mn) {
        rows = m-i;
        for( j=i; j < n; j += nb ) {
            panel_dev = (j/nb) % num_gpus;
            i_local = j/(nb*num_gpus)*nb;
            ib = min( n-j, nb );
            magma_setdevice( panel_dev );
            magma_sgetmatrix( rows, ib,
                              dlA(panel_dev, i, i_local), ldda,
                              hwork + (j-i)*rows,         rows );
        }

        // needs lwork >= 2*n*nb:
        // needs (m-i)*(n-i) for last block row, bounded by nb*n.
        // needs (n-i)*nb    for sgeqrf work,    bounded by n*nb.
        ib = n-i;  // total columns in block row
        lhwork = lwork - ib*rows;
        lapackf77_sgeqrf( &rows, &ib, hwork, &rows, tau+i, hwork + ib*rows, &lhwork, info );
        if ( *info != 0 ) {
            fprintf( stderr, "error %d\n", (int) *info );
        }
        
        for( j=i; j < n; j += nb ) {
            panel_dev = (j/nb) % num_gpus;
            i_local = j/(nb*num_gpus)*nb;
            ib = min( n-j, nb );
            magma_setdevice( panel_dev );
            magma_ssetmatrix( rows, ib,
                              hwork + (j-i)*rows,         rows,
                              dlA(panel_dev, i, i_local), ldda );
        }
    }

CLEANUP:
    // free(NULL) does nothing.
    // check that queues and events are non-zero before destroying them, though.
    for( dev=0; dev < num_gpus; dev++ ) {
        magma_setdevice( dev );
        if ( stream[dev][0]   ) { magma_queue_destroy( stream[dev][0]   ); }
        if ( stream[dev][1]   ) { magma_queue_destroy( stream[dev][1]   ); }
        if ( panel_event[dev] ) { magma_event_destroy( panel_event[dev] ); }
        magma_free( dwork[dev] );
    }
    magma_free_pinned( hwork );
    magma_setdevice( cdevice );
    magmablasSetKernelStream( cqueue );

    return *info;
} /* magma_sgeqrf2_mgpu */
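
A minimal driver sketch for the routine above. It is not part of the MAGMA sources: it assumes two visible GPUs, and the sizes, test data, and rounding of ldda to a multiple of 32 are illustrative; error handling is abbreviated. It distributes the host matrix in the same 1-D block-cyclic column layout the routine uses internally.

#include <stdio.h>
#include <stdlib.h>
#include "magma.h"

int main()
{
    magma_init();

    magma_int_t num_gpus = 2, m = 1024, n = 1024, info;
    magma_int_t nb     = magma_get_sgeqrf_nb( m );
    magma_int_t ldda   = ((m + 31)/32)*32;   // multiple of 32, hence of 16
    magma_int_t min_mn = (m < n ? m : n);

    float *A, *tau, *dlA[MagmaMaxGPUs];
    magma_smalloc_cpu( &A,   m*n );
    magma_smalloc_cpu( &tau, min_mn );
    for( magma_int_t i = 0; i < m*n; i++ )
        A[i] = rand() / (float) RAND_MAX;     // arbitrary test data

    // allocate each GPU's local columns (same n_local formula as the routine)
    for( magma_int_t dev = 0; dev < num_gpus; dev++ ) {
        magma_int_t nlocal = ((n/nb)/num_gpus)*nb;
        if      (dev <  (n/nb) % num_gpus) nlocal += nb;
        else if (dev == (n/nb) % num_gpus) nlocal += n % nb;
        magma_setdevice( dev );
        magma_smalloc( &dlA[dev], ldda*nlocal );
    }

    // scatter the host matrix, one nb-wide panel at a time
    for( magma_int_t j = 0; j < n; j += nb ) {
        magma_int_t dev  = (j/nb) % num_gpus;
        magma_int_t jloc = j/(nb*num_gpus)*nb;
        magma_int_t jb   = (n-j < nb ? n-j : nb);
        magma_setdevice( dev );
        magma_ssetmatrix( m, jb, A + j*m, m, dlA[dev] + jloc*ldda, ldda );
    }

    magma_sgeqrf2_mgpu( num_gpus, m, n, dlA, ldda, tau, &info );
    printf( "magma_sgeqrf2_mgpu returned info = %d\n", (int) info );

    for( magma_int_t dev = 0; dev < num_gpus; dev++ ) {
        magma_setdevice( dev );
        magma_free( dlA[dev] );
    }
    magma_free_cpu( A );
    magma_free_cpu( tau );
    magma_finalize();
    return 0;
}
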
Example no. 2
/**
    Purpose
    -------
    SGEQRF computes a QR factorization of a real M-by-N matrix A:
    A = Q * R.
    
    This version has LAPACK-compliant arguments.
    
    If the current stream is NULL, this version replaces it with a new
    stream to overlap computation with communication.

    Other versions (magma_sgeqrf_gpu and magma_sgeqrf3_gpu) store the
    intermediate T matrices.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    dA      REAL array on the GPU, dimension (LDDA,N)
            On entry, the M-by-N matrix A.
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further
            Details).

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).
            To benefit from coalescent memory accesses LDDA must be
            divisible by 16.

    @param[out]
    tau     REAL array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as a failed memory allocation.

    Further Details
    ---------------
    The matrix Q is represented as a product of elementary reflectors

        Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

        H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).

    @ingroup magma_sgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_sgeqrf2_gpu(
    magma_int_t m, magma_int_t n,
    magmaFloat_ptr dA, magma_int_t ldda,
    float *tau,
    magma_int_t *info )
{
    #define dA(a_1,a_2)    ( dA+(a_2)*(ldda) + (a_1))
    #define work_ref(a_1)  ( work + (a_1))
    #define hwork          ( work + (nb)*(m))
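    // work holds the m-by-nb CPU panel; hwork aliases the tail of the same
    // pinned buffer (the remaining n*nb entries), used as sgeqrf/slarft workspace.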

    magmaFloat_ptr dwork;
    float *work;
    magma_int_t i, k, ldwork, lddwork, old_i, old_ib, rows;
    magma_int_t nbmin, nx, ib, nb;
    magma_int_t lhwork, lwork;

    /* Function Body */
    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    k = min(m,n);
    if (k == 0)
        return *info;

    nb = magma_get_sgeqrf_nb(m);

    lwork  = (m+n) * nb;
    lhwork = lwork - (m)*nb;

    if (MAGMA_SUCCESS != magma_smalloc( &dwork, n*nb )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    if (MAGMA_SUCCESS != magma_smalloc_pinned( &work, lwork )) {
        magma_free( dwork );
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }

    /* Define user stream if current stream is NULL */
    magma_queue_t stream[2];
    
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );

    magma_queue_create( &stream[0] );
    if (orig_stream == NULL) {
        magma_queue_create( &stream[1] );
        magmablasSetKernelStream(stream[1]);
    }
    else {
        stream[1] = orig_stream;
    }

    nbmin = 2;
    nx    = nb;
    ldwork = m;
    lddwork= n;

    if (nb >= nbmin && nb < k && nx < k) {
        /* Use blocked code initially */
        old_i = 0; old_ib = nb;
        for (i = 0; i < k-nx; i += nb) {
            ib = min(k-i, nb);
            rows = m -i;

            /* download i-th panel */
            magma_queue_sync( stream[1] );
            magma_sgetmatrix_async( rows, ib,
                                    dA(i,i),       ldda,
                                    work_ref(i), ldwork, stream[0] );
            if (i > 0) {
                /* Apply H' to A(i:m,i+2*ib:n) from the left */
                magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                  m-old_i, n-old_i-2*old_ib, old_ib,
                                  dA(old_i, old_i         ), ldda, dwork,        lddwork,
                                  dA(old_i, old_i+2*old_ib), ldda, dwork+old_ib, lddwork);

                magma_ssetmatrix_async( old_ib, old_ib,
                                        work_ref(old_i),  ldwork,
                                        dA(old_i, old_i), ldda, stream[1] );
            }

            magma_queue_sync( stream[0] );
            lapackf77_sgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info);
            /* Form the triangular factor of the block reflector
               H = H(i) H(i+1) . . . H(i+ib-1) */
            lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr,
                              &rows, &ib,
                              work_ref(i), &ldwork, tau+i, hwork, &ib);

            spanel_to_q( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib );

            /* upload the i-th V matrix to the GPU */
            magma_ssetmatrix_async( rows, ib, work_ref(i), ldwork, dA(i,i), ldda, stream[0] );

            /* upload the T matrix to the GPU */
            magma_queue_sync( stream[1] );
            magma_ssetmatrix_async( ib, ib, hwork, ib, dwork, lddwork, stream[0] );
            magma_queue_sync( stream[0] );

            if (i + ib < n) {
                if (i+nb < k-nx) {
                    /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */
                    magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                      rows, ib, ib,
                                      dA(i, i   ), ldda, dwork,    lddwork,
                                      dA(i, i+ib), ldda, dwork+ib, lddwork);
                    sq_to_panel( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib );
                }
                else {
                    magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                      rows, n-i-ib, ib,
                                      dA(i, i   ), ldda, dwork,    lddwork,
                                      dA(i, i+ib), ldda, dwork+ib, lddwork);
                    sq_to_panel( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib );
                    magma_ssetmatrix_async( ib, ib,
                                            work_ref(i), ldwork,
                                            dA(i,i),     ldda, stream[1] );
                }
                old_i  = i;
                old_ib = ib;
            }
        }
    } else {
        i = 0;
    }
    magma_free( dwork );

    /* Use unblocked code to factor the last or only block. */
    if (i < k) {
        ib   = n-i;
        rows = m-i;
        magma_sgetmatrix_async( rows, ib, dA(i, i), ldda, work, rows, stream[1] );
        magma_queue_sync( stream[1] );
        lhwork = lwork - rows*ib;
        lapackf77_sgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info);
        
        magma_ssetmatrix_async( rows, ib, work, rows, dA(i, i), ldda, stream[1] );
    }

    magma_free_pinned( work );

    magma_queue_destroy( stream[0] );
    if (orig_stream == NULL) {
        magma_queue_destroy( stream[1] );
    }
    magmablasSetKernelStream( orig_stream );

    return *info;
} /* magma_sgeqrf2_gpu */
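
A minimal single-GPU driver for the routine above (an illustrative sketch, not from the MAGMA distribution; the sizes and test data are arbitrary assumptions): upload A, factor in place, copy the result back.

#include <stdio.h>
#include "magma.h"

int main()
{
    magma_init();

    magma_int_t m = 1024, n = 512, info;
    magma_int_t lda  = m;
    magma_int_t ldda = ((m + 31)/32)*32;   // padded leading dimension
    float *A, *tau;
    magmaFloat_ptr dA;

    magma_smalloc_pinned( &A, lda*n );
    magma_smalloc_cpu( &tau, (m < n ? m : n) );
    magma_smalloc( &dA, ldda*n );

    for( magma_int_t i = 0; i < lda*n; i++ )
        A[i] = 1.0f / (1.0f + i);          // arbitrary test data

    magma_ssetmatrix( m, n, A, lda, dA, ldda );
    magma_sgeqrf2_gpu( m, n, dA, ldda, tau, &info );
    // R is now in the upper triangle of dA, the reflectors V below it
    magma_sgetmatrix( m, n, dA, ldda, A, lda );

    printf( "magma_sgeqrf2_gpu returned info = %d\n", (int) info );

    magma_free( dA );
    magma_free_pinned( A );
    magma_free_cpu( tau );
    magma_finalize();
    return 0;
}
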
Example no. 3
/**
    Purpose
    -------
    SSYTRD reduces a real symmetric matrix A to real symmetric
    tridiagonal form T by an orthogonal similarity transformation:
    Q**H * A * Q = T.

    Arguments
    ---------
    @param[in]
    num_gpus INTEGER
             The number of GPUs.  num_gpus > 0.

    @param[in]
    num_streams INTEGER
             The number of GPU streams used for the update.  2 >= num_streams > 0.

    @param[in]
    uplo     magma_uplo_t
      -      = MagmaUpper:  Upper triangle of A is stored;
      -      = MagmaLower:  Lower triangle of A is stored.

    @param[in]
    n        INTEGER
             The order of the matrix A.  N >= 0.

    @param[in,out]
    A        REAL array, dimension (LDA,N)
             On entry, the symmetric matrix A.  If UPLO = MagmaUpper, the leading
             N-by-N upper triangular part of A contains the upper
             triangular part of the matrix A, and the strictly lower
             triangular part of A is not referenced.  If UPLO = MagmaLower, the
             leading N-by-N lower triangular part of A contains the lower
             triangular part of the matrix A, and the strictly upper
             triangular part of A is not referenced.
             On exit, if UPLO = MagmaUpper, the diagonal and first superdiagonal
             of A are overwritten by the corresponding elements of the
             tridiagonal matrix T, and the elements above the first
             superdiagonal, with the array TAU, represent the orthogonal
             matrix Q as a product of elementary reflectors; if UPLO
             = MagmaLower, the diagonal and first subdiagonal of A are over-
             written by the corresponding elements of the tridiagonal
             matrix T, and the elements below the first subdiagonal, with
             the array TAU, represent the orthogonal matrix Q as a product
             of elementary reflectors. See Further Details.

    @param[in]
    lda      INTEGER
             The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    d        REAL array, dimension (N)
             The diagonal elements of the tridiagonal matrix T:
             D(i) = A(i,i).
 
    @param[out]
    e        REAL array, dimension (N-1)
             The off-diagonal elements of the tridiagonal matrix T:
             E(i) = A(i,i+1) if UPLO = MagmaUpper, E(i) = A(i+1,i) if UPLO = MagmaLower.

    @param[out]
    tau      REAL array, dimension (N-1)
             The scalar factors of the elementary reflectors (see Further
             Details).

    @param[out]
    work     (workspace) REAL array, dimension (MAX(1,LWORK))
             On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork    INTEGER
             The dimension of the array WORK.  LWORK >= 1.
             For optimum performance LWORK >= N*NB, where NB is the
             optimal blocksize.
    \n
             If LWORK = -1, then a workspace query is assumed; the routine
             only calculates the optimal size of the WORK array, returns
             this value as the first entry of the WORK array, and no error
             message related to LWORK is issued by XERBLA.

    @param[out]
    info     INTEGER
      -      = 0:  successful exit
      -      < 0:  if INFO = -i, the i-th argument had an illegal value

    Further Details
    ---------------
    If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary
    reflectors

       Q = H(n-1) . . . H(2) H(1).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in
    A(1:i-1,i+1), and tau in TAU(i).

    If UPLO = MagmaLower, the matrix Q is represented as a product of elementary
    reflectors

       Q = H(1) H(2) . . . H(n-1).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i),
    and tau in TAU(i).

    The contents of A on exit are illustrated by the following examples
    with n = 5:

    if UPLO = MagmaUpper:                if UPLO = MagmaLower:

      (  d   e   v2  v3  v4 )              (  d                  )
      (      d   e   v3  v4 )              (  e   d              )
      (          d   e   v4 )              (  v1  e   d          )
      (              d   e  )              (  v1  v2  e   d      )
      (                  d  )              (  v1  v2  v3  e   d  )

    where d and e denote diagonal and off-diagonal elements of T, and vi
    denotes an element of the vector defining H(i).

    @ingroup magma_ssyev_comp
    ********************************************************************/
extern "C" magma_int_t
magma_ssytrd_mgpu(
    magma_int_t num_gpus, magma_int_t num_streams, magma_uplo_t uplo, magma_int_t n,
    float *A, magma_int_t lda,
    float *d, float *e, float *tau,
    float *work, magma_int_t lwork,
    magma_int_t *info)
{
#define  A(i, j)     (A           + (j)*lda  + (i))
#define dA(id, i, j) (dA[(id)]    + (j)*ldda + (i))
#define dW(id, i, j) (dwork[(id)] + (j)*ldda + (i))

    const char* uplo_ = lapack_uplo_const( uplo );
    
    magma_int_t ln, ldda;
    magma_int_t nb = magma_get_ssytrd_nb(n), ib;

    float c_neg_one = MAGMA_S_NEG_ONE;
    float c_one = MAGMA_S_ONE;
    float  d_one = MAGMA_D_ONE;
    //float mv_time = 0.0;
#ifdef PROFILE_SY2RK
    float up_time = 0.0;
#endif

    magma_int_t kk, nx;
    magma_int_t i = 0, ii, iii, j, did, i_n;
    magma_int_t iinfo;
    magma_int_t ldwork, lddwork, lwkopt, ldwork2;
    magma_int_t lquery;
    magma_queue_t stream[MagmaMaxGPUs][10];
    float *dx[MagmaMaxGPUs], *dy[MagmaMaxGPUs], *hwork;
    float *dwork2[MagmaMaxGPUs];

    *info = 0;
    int upper = (uplo == MagmaUpper);
    lquery = (lwork == -1);
    if (! upper && uplo != MagmaLower) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,n)) {
        *info = -4;
    } else if (lwork < nb*n && ! lquery) {
        *info = -9;
    } else if ( num_streams > 2 ) {
        *info = 2;  // TODO fix
    }

    /* Determine the block size. */
    ldwork = lddwork = n;
    lwkopt = n * nb;
    if (*info == 0) {
        work[0] = MAGMA_S_MAKE( lwkopt, 0 );
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;

    /* Quick return if possible */
    if (n == 0) {
        work[0] = c_one;
        return *info;
    }

    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );
    
    float *dA[MagmaMaxGPUs];
    float *dwork[MagmaMaxGPUs];

    float times[11];
    for( did=0; did < 11; did++ )
        times[did] = 0;
//#define PROFILE_SY2RK
#ifdef PROFILE_SY2RK
    magma_event_t start, stop;
    float etime;
    magma_setdevice(0);
    magma_event_create( &start );
    magma_event_create( &stop  );
#endif
    ldda = lda;
    ln = ((nb*(1+n/(nb*num_gpus))+31)/32)*32;
    ldwork2 = (1+ n / nb + (n % nb != 0)) * ldda;
    for( did=0; did < num_gpus; did++ ) {
        magma_setdevice(did);
        // TODO fix memory leak
        if ( MAGMA_SUCCESS != magma_smalloc(&dA[did],     ln*ldda+3*lddwork*nb) ||
             MAGMA_SUCCESS != magma_smalloc(&dx[did],     num_streams*n) ||
             MAGMA_SUCCESS != magma_smalloc(&dy[did],     num_streams*n) ||
             MAGMA_SUCCESS != magma_smalloc(&dwork2[did], ldwork2 ) ) {
            for( i=0; i < did; i++ ) {
                magma_setdevice(i);
                magma_free(dA[i]);
                magma_free(dx[i]);
                magma_free(dy[i]);
            }
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
        dwork[did] = dA[did] + ln*ldda;
        
        for( kk=0; kk < num_streams; kk++ )
            magma_queue_create(&stream[did][kk]);
    }
    magma_setdevice(0);
    // TODO fix memory leak dwork2
    if ( MAGMA_SUCCESS != magma_smalloc_pinned( &hwork, num_streams*num_gpus*n ) ) {
        for( i=0; i < num_gpus; i++ ) {
            magma_setdevice(i);
            magma_free(dA[i]);
            magma_free(dx[i]);
            magma_free(dy[i]);
        }
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }

    if (n < 2048)
        nx = n;
    else
        nx = 512;

    if (upper) {
        /* Copy the matrix to the GPU */
        if (1 <= n-nx) {
            magma_shtodhe(num_gpus, uplo, n, nb, A, lda, dA, ldda, stream, &iinfo );
        }

        /*  Reduce the upper triangle of A.
            Columns 1:kk are handled by the unblocked method. */
        for (i = nb*((n-1)/nb); i >= nx; i -= nb) {
            ib = min(nb, n-i);

            ii  = nb*(i/(nb*num_gpus));
            did = (i/nb)%num_gpus;

            /* wait for the next panel */
            if (i != nb*((n-1)/nb)) {
                magma_setdevice(did);
                magma_queue_sync(stream[did][0]);
            }

            magma_slatrd_mgpu(num_gpus, uplo, n, i+ib, ib, nb,
                              A(0, 0), lda, e, tau,
                              work, ldwork,
                              dA, ldda, 0,
                              dwork, i+ib,
                              dwork2, ldwork2,
                              1, dx, dy, hwork,
                              stream, times);

            magma_ssyr2k_mgpu(num_gpus, MagmaUpper, MagmaNoTrans, nb, i, ib,
                         c_neg_one, dwork, i+ib, 0,
                         d_one,     dA,    ldda, 0,
                         num_streams, stream);

            /* get the next panel */
            if (i-nb >= nx ) {
                ib = min(nb, n-(i-nb));
                
                ii  = nb*((i-nb)/(nb*num_gpus));
                did = ((i-nb)/nb)%num_gpus;
                magma_setdevice(did);
                
                magma_sgetmatrix_async( (i-nb)+ib, ib,
                                        dA(did, 0, ii), ldda,
                                         A(0, i-nb),    lda,
                                        stream[did][0] );
            }

            /* Copy superdiagonal elements back into A, and diagonal
               elements into D */
            for (j = i; j < i+ib; ++j) {
                if ( j > 0 ) {
                    *A(j-1,j) = MAGMA_S_MAKE( e[j - 1], 0 );
                }
                d[j] = MAGMA_S_REAL( *A(j, j) );
            }
        } /* end of for i=... */
      
        if ( nx > 0 ) {
            if (1 <= n-nx) { /* else A is already on CPU */
                for (i=0; i < nx; i += nb) {
                    ib = min(nb, n-i);
                    ii  = nb*(i/(nb*num_gpus));
                    did = (i/nb)%num_gpus;
                
                    magma_setdevice(did);
                    magma_sgetmatrix_async( nx, ib,
                                            dA(did, 0, ii), ldda,
                                            A(0, i),        lda,
                                            stream[did][0] );
                }
            }
            
            for( did=0; did < num_gpus; did++ ) {
                magma_setdevice(did);
                magma_queue_sync(stream[did][0]);
            }
            /*  Use unblocked code to reduce the last or only block */
            lapackf77_ssytd2(uplo_, &nx, A(0, 0), &lda, d, e, tau, &iinfo);
        }
    }
    else {
        trace_init( 1, num_gpus, num_streams, (CUstream_st**)stream );
        /* Copy the matrix to the GPU */
        if (1 <= n-nx) {
            magma_shtodhe(num_gpus, uplo, n, nb, A, lda, dA, ldda, stream, &iinfo );
        }

        /* Reduce the lower triangle of A */
        for (i = 0; i < n-nx; i += nb) {
            ib = min(nb, n-i);

            ii  = nb*(i/(nb*num_gpus));
            did = (i/nb)%num_gpus;
            /* Reduce columns i:i+ib-1 to tridiagonal form and form the
               matrix W which is needed to update the unreduced part of
               the matrix */

            /*   Get the current panel (no need for the 1st iteration) */
            if (i != 0) {
                magma_setdevice(did);
                trace_gpu_start( did, 0, "comm", "get" );
                magma_sgetmatrix_async( n-i, ib,
                                        dA(did, i, ii), ldda,
                                         A(i,i),        lda,
                                        stream[did][0] );
                trace_gpu_end( did, 0 );
                magma_queue_sync(stream[did][0]);
                magma_setdevice(0);
            }
            
            magma_slatrd_mgpu(num_gpus, uplo, n, n-i, ib, nb,
                              A(i, i), lda, &e[i],
                              &tau[i], work, ldwork,
                              dA, ldda, i,
                              dwork,  (n-i),
                              dwork2, ldwork2,
                              1, dx, dy, hwork,
                              stream, times );

#ifdef PROFILE_SY2RK
            magma_setdevice(0);
            if ( i > 0 ) {
                cudaEventElapsedTime(&etime, start, stop);
                up_time += (etime/1000.0);
            }
            magma_event_record(start, 0);
#endif
            magma_ssyr2k_mgpu(num_gpus, MagmaLower, MagmaNoTrans, nb, n-i-ib, ib,
                         c_neg_one, dwork, n-i, ib,
                         d_one, dA, ldda, i+ib, num_streams, stream);
#ifdef PROFILE_SY2RK
            magma_setdevice(0);
            magma_event_record(stop, 0);
#endif

            /* Copy subdiagonal elements back into A, and diagonal
               elements into D */
            for (j = i; j < i+ib; ++j) {
                if ( j+1 < n ) {
                    *A(j+1,j) = MAGMA_S_MAKE( e[j], 0 );
                }
                d[j] = MAGMA_S_REAL( *A(j, j) );
            }
        } /* for i=... */

        /* Use unblocked code to reduce the last or only block */
        if ( i < n ) {
            iii = i;
            i_n = n-i;
            if ( i > 0 ) {
                for (; i < n; i += nb) {
                    ib = min(nb, n-i);
                    ii  = nb*(i/(nb*num_gpus));
                    did = (i/nb)%num_gpus;
                
                    magma_setdevice(did);
                    magma_sgetmatrix_async( i_n, ib,
                                            dA(did, iii, ii), ldda,
                                             A(iii, i),       lda,
                                            stream[did][0] );
                }
                for( did=0; did < num_gpus; did++ ) {
                    magma_setdevice(did);
                    magma_queue_sync(stream[did][0]);
                }
            }
            lapackf77_ssytrd(uplo_, &i_n, A(iii, iii), &lda, &d[iii], &e[iii],
                             &tau[iii], work, &lwork, &iinfo);
        }
    }
#ifdef PROFILE_SY2RK
    magma_setdevice(0);
    if ( n > nx ) {
        cudaEventElapsedTime(&etime, start, stop);
        up_time += (etime/1000.0);
    }
    magma_event_destroy( start );
    magma_event_destroy( stop  );
#endif

    trace_finalize( "ssytrd.svg", "trace.css" );
    for( did=0; did < num_gpus; did++ ) {
        magma_setdevice(did);
        for( kk=0; kk < num_streams; kk++ )
            magma_queue_sync(stream[did][kk]);
        for( kk=0; kk < num_streams; kk++ )
            magma_queue_destroy(stream[did][kk]);
        magma_free(dA[did]);
        magma_free(dx[did]);
        magma_free(dy[did]);
        magma_free(dwork2[did]);
    }
    magma_free_pinned(hwork);
    magma_setdevice( orig_dev );
    magmablasSetKernelStream( orig_stream );
    
    work[0] = MAGMA_S_MAKE( lwkopt, 0 );

#ifdef PROFILE_SY2RK
    printf( " n=%d nb=%d\n", n, nb );
    printf( " Time in SLARFG: %.2e seconds\n", times[0] );
    //printf( " Time in SSYMV : %.2e seconds\n", mv_time );
    printf( " Time in SSYR2K: %.2e seconds\n", up_time );
#endif
    return *info;
} /* magma_ssytrd */
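
A sketch of the documented two-call pattern for the routine above (illustrative only; the test matrix and sizes are assumptions): first an LWORK = -1 workspace query, then the actual reduction with the optimal workspace.

#include <stdio.h>
#include "magma.h"

int main()
{
    magma_int_t num_gpus = 1, num_streams = 1, n = 2048, info;
    magma_int_t lda = n;
    float *A, *d, *e, *tau, *work;
    float lwkopt;

    magma_init();
    magma_smalloc_pinned( &A, lda*n );
    magma_smalloc_cpu( &d,   n );
    magma_smalloc_cpu( &e,   n-1 );
    magma_smalloc_cpu( &tau, n-1 );

    // symmetric test matrix; only the lower triangle is referenced
    for( magma_int_t j = 0; j < n; j++ )
        for( magma_int_t i = j; i < n; i++ )
            A[i + j*lda] = (i == j ? (float) n : 1.0f / (1 + i + j));

    // workspace query: only work[0] is written
    magma_ssytrd_mgpu( num_gpus, num_streams, MagmaLower, n,
                       A, lda, d, e, tau, &lwkopt, -1, &info );
    magma_int_t lwork = (magma_int_t) lwkopt;
    magma_smalloc_pinned( &work, lwork );

    magma_ssytrd_mgpu( num_gpus, num_streams, MagmaLower, n,
                       A, lda, d, e, tau, work, lwork, &info );
    printf( "magma_ssytrd_mgpu returned info = %d\n", (int) info );

    magma_free_pinned( work );
    magma_free_pinned( A );
    magma_free_cpu( d );  magma_free_cpu( e );  magma_free_cpu( tau );
    magma_finalize();
    return 0;
}
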
Example no. 4
/**
    Purpose
    -------
    SSYTRF_NOPIV_GPU computes the LDLt factorization of a real symmetric
    matrix A.

    The factorization has the form   
       A = U^H * D * U,  if UPLO = 'U', or
       A = L  * D * L^H, if UPLO = 'L',   
    where U is an upper triangular matrix, L is lower triangular, and
    D is a diagonal matrix.

    This is the block version of the algorithm, calling Level 3 BLAS.   

    Arguments
    ---------
    @param[in]
    UPLO    CHARACTER*1   
      -     = 'U':  Upper triangle of A is stored;   
      -     = 'L':  Lower triangle of A is stored.   

    @param[in]
    N       INTEGER   
            The order of the matrix A.  N >= 0.   

    @param[in,out]
    dA      REAL array on the GPU, dimension (LDDA,N)
            On entry, the symmetric matrix A.  If UPLO = 'U', the leading   
            N-by-N upper triangular part of A contains the upper   
            triangular part of the matrix A, and the strictly lower   
            triangular part of A is not referenced.  If UPLO = 'L', the   
            leading N-by-N lower triangular part of A contains the lower   
            triangular part of the matrix A, and the strictly upper   
            triangular part of A is not referenced.   
    \n
            On exit, if INFO = 0, the factor U or L from the LDLt
            factorization A = U^H * D * U or A = L * D * L^H.
    \n 
            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using cudaMallocHost.

    @param[in]
    LDDA    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).

    @param[out]
    INFO    INTEGER   
      -     = 0:  successful exit   
      -     < 0:  if INFO = -i, the i-th argument had an illegal value 
                  if INFO = -6, the GPU memory allocation failed 
      -     > 0:  if INFO = i, D(i,i) is exactly zero, and the
                  factorization could not be completed.
    
    @ingroup magma_ssytrf_comp
    ******************************************************************* */
extern "C" magma_int_t
magma_ssytrf_nopiv_gpu(
    magma_uplo_t uplo, magma_int_t n,
    magmaFloat_ptr dA, magma_int_t ldda,
    magma_int_t *info)
{
    #define  A(i, j)  (A)
    #define dA(i, j)  (dA +(j)*ldda + (i))
    #define dW(i, j)  (dW +(j)*ldda + (i))
    #define dWt(i, j) (dW +(j)*nb   + (i))
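    // Note: A(i,j) deliberately ignores its indices; the pinned host buffer
    // holds only the current nb-by-nb diagonal block.  dW and dWt are two
    // views of the same GPU workspace, with leading dimensions ldda and nb
    // respectively (dWt holds the copied block row in the upper case).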

    /* Local variables */
    float zone  = MAGMA_S_ONE;
    float mzone = MAGMA_S_NEG_ONE;
    int                upper = (uplo == MagmaUpper);
    magma_int_t j, k, jb, nb, ib, iinfo;

    *info = 0;
    if (! upper && uplo != MagmaLower) {
      *info = -1;
    } else if (n < 0) {
      *info = -2;
    } else if (ldda < max(1,n)) {
      *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return MAGMA_ERR_ILLEGAL_VALUE;
    }

    /* Quick return */
    if ( n == 0 )
      return MAGMA_SUCCESS;

    nb = magma_get_ssytrf_nopiv_nb(n);
    ib = min(32, nb); // inner-block for diagonal factorization

    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );


    magma_queue_t stream[2];
    magma_event_t event;
    magma_queue_create(&stream[0]);
    magma_queue_create(&stream[1]);
    magma_event_create( &event );
    trace_init( 1, 1, 2, stream );

    // CPU workspace
    float *A;
    if (MAGMA_SUCCESS != magma_smalloc_pinned( &A, nb*nb )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }

    // GPU workspace
    magmaFloat_ptr dW;
    if (MAGMA_SUCCESS != magma_smalloc( &dW, (1+nb)*ldda )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    /* Use hybrid blocked code. */
    if (upper) {
        //=========================================================
        // Compute the LDLt factorization A = U'*D*U without pivoting.
        // main loop
        for (j=0; j<n; j += nb) {
            jb = min(nb, (n-j));
            
            // copy A(j,j) back to CPU
            trace_gpu_start( 0, 0, "get", "get" );
            //magma_queue_wait_event( stream[1], event );                                                                
            magma_event_sync(event);
            magma_sgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), nb, stream[1]);
            trace_gpu_end( 0, 0 );

            // factorize the diagonal block
            magma_queue_sync(stream[1]);
            trace_cpu_start( 0, "potrf", "potrf" );
            ssytrf_nopiv_cpu(MagmaUpper, jb, ib, A(j, j), nb, info);
            trace_cpu_end( 0 );
            if (*info != 0){
                *info = *info + j;
                break;
            }
            
            // copy A(j,j) back to GPU
            trace_gpu_start( 0, 0, "set", "set" );
            magma_ssetmatrix_async(jb, jb, A(j, j), nb, dA(j, j), ldda, stream[0]);
            trace_gpu_end( 0, 0 );
                
            if ( (j+jb) < n) {
                // compute the off-diagonal blocks of current block column
                magmablasSetKernelStream( stream[0] );
                trace_gpu_start( 0, 0, "trsm", "trsm" );
                magma_strsm(MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaUnit, 
                            jb, (n-j-jb), 
                            zone, dA(j, j),    ldda, 
                            dA(j, j+jb), ldda);
                magma_scopymatrix( jb, n-j-jb, dA( j, j+jb ), ldda, dWt( 0, j+jb ), nb );
                
                // update the trailing submatrix with D
                magmablas_slascl_diag(MagmaUpper, jb, n-j-jb,
                                      dA(j,    j), ldda,
                                      dA(j, j+jb), ldda,
                                      &iinfo);
                trace_gpu_end( 0, 0 );
                
                // update the trailing submatrix with U and W
                trace_gpu_start( 0, 0, "gemm", "gemm" );
                for (k=j+jb; k<n; k+=nb) {
                    magma_int_t kb = min(nb,n-k);
                    magma_sgemm(MagmaConjTrans, MagmaNoTrans, kb, n-k, jb,
                                mzone, dWt(0, k), nb, 
                                       dA(j, k), ldda,
                                zone,  dA(k, k), ldda);
                    if (k==j+jb)
                        magma_event_record( event, stream[0] );
                }
                trace_gpu_end( 0, 0 );
            }
        }
    } else {
        //=========================================================
        // Compute the LDLt factorization A = L*D*L' without pivoting.
        // main loop
        for (j=0; j<n; j+=nb) {
            jb = min(nb, (n-j));
            
            // copy A(j,j) back to CPU
            trace_gpu_start( 0, 0, "get", "get" );
            //magma_queue_wait_event( stream[0], event );                                                                
            magma_event_sync(event);
            magma_sgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), nb, stream[1]);
            trace_gpu_end( 0, 0 );
            
            // factorize the diagonal block
            magma_queue_sync(stream[1]);
            trace_cpu_start( 0, "potrf", "potrf" );
            ssytrf_nopiv_cpu(MagmaLower, jb, ib, A(j, j), nb, info);
            trace_cpu_end( 0 );
            if (*info != 0){
                *info = *info + j;
                break;
            }

            // copy A(j,j) back to GPU
            trace_gpu_start( 0, 0, "set", "set" );
            magma_ssetmatrix_async(jb, jb, A(j, j), nb, dA(j, j), ldda, stream[0]);
            trace_gpu_end( 0, 0 );
            
            if ( (j+jb) < n) {
                // compute the off-diagonal blocks of current block column
                magmablasSetKernelStream( stream[0] );
                trace_gpu_start( 0, 0, "trsm", "trsm" );
                magma_strsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaUnit, 
                            (n-j-jb), jb, 
                            zone, dA(j,    j), ldda, 
                            dA(j+jb, j), ldda);
                magma_scopymatrix( n-j-jb,jb, dA( j+jb, j ), ldda, dW( j+jb, 0 ), ldda );
                
                // update the trailing submatrix with D
                magmablas_slascl_diag(MagmaLower, n-j-jb, jb,
                                      dA(j,    j), ldda,
                                      dA(j+jb, j), ldda,
                                      &iinfo);
                trace_gpu_end( 0, 0 );
                
                // update the trailing submatrix with L and W
                trace_gpu_start( 0, 0, "gemm", "gemm" );
                for (k=j+jb; k<n; k+=nb) {
                    magma_int_t kb = min(nb,n-k);
                    magma_sgemm(MagmaNoTrans, MagmaConjTrans, n-k, kb, jb,
                                mzone, dA(k, j), ldda, 
                                       dW(k, 0), ldda,
                                zone,  dA(k, k), ldda);
                    if (k==j+jb)
                        magma_event_record( event, stream[0] );
                }
                trace_gpu_end( 0, 0 );
            }
        }
    }
    
    trace_finalize( "ssytrf.svg","trace.css" );
    magma_queue_destroy(stream[0]);
    magma_queue_destroy(stream[1]);
    magma_event_destroy( event );
    magma_free( dW );
    magma_free_pinned( A );
    
    magmablasSetKernelStream( orig_stream );
    return MAGMA_SUCCESS;
} /* magma_ssytrf_nopiv */
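
A minimal driver for the routine above (an assumed sketch, not from the MAGMA sources; sizes are illustrative): the test matrix is made diagonally dominant so the pivot-free LDLt factorization cannot break down.

#include <stdio.h>
#include "magma.h"

int main()
{
    magma_int_t n = 1024, info;
    magma_int_t lda  = n;
    magma_int_t ldda = ((n + 31)/32)*32;
    float *A;
    magmaFloat_ptr dA;

    magma_init();
    magma_smalloc_pinned( &A, lda*n );   // pinned, per the note in the docs
    magma_smalloc( &dA, ldda*n );

    // diagonally dominant symmetric test matrix
    for( magma_int_t j = 0; j < n; j++ )
        for( magma_int_t i = 0; i < n; i++ )
            A[i + j*lda] = (i == j ? 2.0f*n : 1.0f);

    magma_ssetmatrix( n, n, A, lda, dA, ldda );
    magma_ssytrf_nopiv_gpu( MagmaLower, n, dA, ldda, &info );
    printf( "magma_ssytrf_nopiv_gpu returned info = %d\n", (int) info );

    magma_free( dA );
    magma_free_pinned( A );
    magma_finalize();
    return 0;
}
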
Example no. 5
/**
    Purpose
    -------
    SGETRF_NOPIV_GPU computes an LU factorization of a general M-by-N
    matrix A without any pivoting.

    The factorization has the form
       A = L * U
    where L is lower triangular with unit diagonal elements (lower
    trapezoidal if m > n), and U is upper triangular (upper trapezoidal
    if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    dA      REAL array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = L*U; the unit diagonal elements of L are not stored.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as a failed memory allocation.
      -     > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    @ingroup magma_sgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_sgetrf_nopiv_gpu(
    magma_int_t m, magma_int_t n,
    magmaFloat_ptr dA, magma_int_t ldda,
    magma_int_t *info)
{
#define dA(i,j) (dA + (i)*nb + (j)*nb*ldda)
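// Note: unlike the element-wise dA() macros in the previous examples, this
// one indexes nb-by-nb blocks: dA(i,j) is block row i, block column j.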

    float c_one     = MAGMA_S_ONE;
    float c_neg_one = MAGMA_S_NEG_ONE;

    magma_int_t iinfo, nb;
    magma_int_t maxm, mindim;
    magma_int_t i, rows, s, lddwork;
    float *work;

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (ldda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    nb     = magma_get_sgetrf_nb(m);
    s      = mindim / nb;

    if (nb <= 1 || nb >= min(m,n)) {
        /* Use CPU code. */
        magma_smalloc_cpu( &work, m * n );
        if ( work == NULL ) {
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        magma_sgetmatrix( m, n, dA, ldda, work, m );
        magma_sgetrf_nopiv( m, n, work, m, info);
        magma_ssetmatrix( m, n, work, m, dA, ldda );
        magma_free_cpu(work);
    }
    else {
        /* Use hybrid blocked code. */
        maxm = ((m + 31)/32)*32;

        lddwork = maxm;

        if (MAGMA_SUCCESS != magma_smalloc_pinned( &work, maxm*nb )) {
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }

        /* Define user stream if current stream is NULL */
        magma_queue_t stream[2];
        
        magma_queue_t orig_stream;
        magmablasGetKernelStream( &orig_stream );

        magma_queue_create( &stream[0] );
        if (orig_stream == NULL) {
            magma_queue_create( &stream[1] );
            magmablasSetKernelStream(stream[1]);
        }
        else {
            stream[1] = orig_stream;
        }

        for( i=0; i < s; i++ ) {
            // download i-th panel
            magma_queue_sync( stream[1] );
            magma_sgetmatrix_async( m-i*nb, nb, dA(i,i), ldda, work, lddwork, stream[0] );
            
            if ( i > 0 ) {
                magma_strsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit,
                             nb, n - (i+1)*nb,
                             c_one, dA(i-1,i-1), ldda,
                             dA(i-1,i+1), ldda );
                magma_sgemm( MagmaNoTrans, MagmaNoTrans,
                             m-i*nb, n-(i+1)*nb, nb,
                             c_neg_one, dA(i,  i-1), ldda, dA(i-1,i+1), ldda,
                             c_one,     dA(i,  i+1), ldda );
            }

            // do the cpu part
            rows = m - i*nb;
            magma_queue_sync( stream[0] );
            magma_sgetrf_nopiv( rows, nb, work, lddwork, &iinfo );
            if ( (*info == 0) && (iinfo > 0) )
                *info = iinfo + i*nb;

            // upload i-th panel
            magma_ssetmatrix_async( m-i*nb, nb, work, lddwork, dA(i, i), ldda, stream[0] );
            magma_queue_sync( stream[0] );

            // do the small non-parallel computations
            if ( s > (i+1) ) {
                magma_strsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit,
                             nb, nb,
                             c_one, dA(i, i  ), ldda,
                             dA(i, i+1), ldda);
                magma_sgemm( MagmaNoTrans, MagmaNoTrans,
                             m-(i+1)*nb, nb, nb,
                             c_neg_one, dA(i+1, i  ), ldda, dA(i,   i+1), ldda,
                             c_one,     dA(i+1, i+1), ldda );
            }
            else {
                magma_strsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit,
                             nb, n-s*nb,
                             c_one, dA(i, i  ), ldda,
                             dA(i, i+1), ldda);
                magma_sgemm( MagmaNoTrans, MagmaNoTrans,
                             m-(i+1)*nb, n-(i+1)*nb, nb,
                             c_neg_one, dA(i+1, i  ), ldda, dA(i,   i+1), ldda,
                             c_one,     dA(i+1, i+1), ldda );
            }
        }

        magma_int_t nb0 = min(m - s*nb, n - s*nb);
        rows = m - s*nb;
        magma_sgetmatrix( rows, nb0, dA(s,s), ldda, work, lddwork );

        // make sure that gpu queue is empty
        magma_device_sync();

        // do the cpu part
        magma_sgetrf_nopiv( rows, nb0, work, lddwork, &iinfo );
        if ( (*info == 0) && (iinfo > 0) )
            *info = iinfo + s*nb;

        // upload the last panel
        magma_ssetmatrix( rows, nb0, work, lddwork, dA(s,s), ldda );

        magma_strsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit,
                     nb0, n-s*nb-nb0,
                     c_one, dA(s,s),     ldda,
                            dA(s,s)+nb0, ldda);

        magma_free_pinned( work );

        magma_queue_destroy( stream[0] );
        if (orig_stream == NULL) {
            magma_queue_destroy( stream[1] );
        }
        magmablasSetKernelStream( orig_stream );
    }

    return *info;
} /* magma_sgetrf_nopiv_gpu */
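
A minimal driver for the routine above (an assumed sketch; sizes are illustrative): since there is no pivoting, a diagonally dominant test matrix is used so no zero pivot can occur.

#include <stdio.h>
#include "magma.h"

int main()
{
    magma_int_t m = 1500, n = 1000, info;
    magma_int_t lda  = m;
    magma_int_t ldda = ((m + 31)/32)*32;
    float *A;
    magmaFloat_ptr dA;

    magma_init();
    magma_smalloc_cpu( &A, lda*n );
    magma_smalloc( &dA, ldda*n );

    // diagonally dominant test matrix: safe to factor without pivoting
    for( magma_int_t j = 0; j < n; j++ )
        for( magma_int_t i = 0; i < m; i++ )
            A[i + j*lda] = (i == j ? (float)(m + n) : 1.0f);

    magma_ssetmatrix( m, n, A, lda, dA, ldda );
    magma_sgetrf_nopiv_gpu( m, n, dA, ldda, &info );
    printf( "magma_sgetrf_nopiv_gpu returned info = %d\n", (int) info );

    magma_free( dA );
    magma_free_cpu( A );
    magma_finalize();
    return 0;
}
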
Example no. 6
/**
    Purpose
    -------
    SPOTRF computes the Cholesky factorization of a real symmetric
    positive definite matrix dA.

    The factorization has the form
       dA = U**H * U,   if UPLO = MagmaUpper, or
       dA = L  * L**H,  if UPLO = MagmaLower,
    where U is an upper triangular matrix and L is lower triangular.

    This is the block version of the algorithm, calling Level 3 BLAS.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of dA is stored;
      -     = MagmaLower:  Lower triangle of dA is stored.

    @param[in]
    n       INTEGER
            The order of the matrix dA.  N >= 0.

    @param[in,out]
    d_lA    REAL array of pointers on the GPU, dimension (ngpu)
            On entry, the symmetric matrix dA distributed over GPUs
            (d_lA[d] points to the local matrix on the d-th GPU).
            It is distributed in 1D block column or row cyclic (with the
            block size of nb) if UPLO = MagmaUpper or MagmaLower, respectively.
            If UPLO = MagmaUpper, the leading N-by-N upper triangular
            part of dA contains the upper triangular part of the matrix dA,
            and the strictly lower triangular part of dA is not referenced.
            If UPLO = MagmaLower, the leading N-by-N lower triangular part
            of dA contains the lower triangular part of the matrix dA, and
            the strictly upper triangular part of dA is not referenced.
    \n
            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization dA = U**H * U or dA = L * L**H.

    @param[in]
    ldda     INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalescent memory accesses LDDA must be
            divisible by 16.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.

    @ingroup magma_sposv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_spotrf_mgpu_right(
    magma_int_t ngpu,
    magma_uplo_t uplo, magma_int_t n,
    magmaFloat_ptr d_lA[], magma_int_t ldda,
    magma_int_t *info )
{
    #define dlA(id, i, j)  (d_lA[(id)] + (j) * ldda + (i))
    #define dlP(id, i, j)  (d_lP[(id)] + (j) * ldda + (i))

    #define panel(j)  (panel + (j))
    #define tmppanel(j)  (tmppanel + (j))
    #define tmpprevpanel(j)  (tmpprevpanel + (j))
    #define STREAM_ID(i) (nqueue > 1 ? 1+((i)/nb)%(nqueue-1) : 0)
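    // STREAM_ID maps block column i round-robin onto queues 1 .. nqueue-1
    // when nqueue > 1, leaving queue 0 for other work.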

    float z_one = MAGMA_S_MAKE(  1.0, 0.0 );
    float mz_one = MAGMA_S_MAKE( -1.0, 0.0 );
    float             one =  1.0;
    float             m_one = -1.0;
    const char* uplo_ = lapack_uplo_const( uplo );

    magma_int_t j, nb, d, id, j_local, blkid, crosspoint, prevtrsmrows=0, nqueue = 5;
    float *panel, *tmppanel0, *tmppanel1, *tmppanel, *tmpprevpanel;
    float *d_lP[MagmaMaxGPUs], *dlpanel, *dlpanels[MagmaMaxGPUs];
    magma_int_t rows, trsmrows, igpu, n_local[MagmaMaxGPUs], ldpanel;
    magma_queue_t queues[MagmaMaxGPUs][10];

    *info = 0;
    if ( uplo != MagmaUpper && uplo != MagmaLower ) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,n)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );

    nb = magma_get_spotrf_nb(n);

    ldpanel = ldda;
    magma_setdevice(0);
    if (MAGMA_SUCCESS != magma_smalloc_pinned( &panel, 2 * nb * ldpanel )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }

    tmppanel0 = panel;
    tmppanel1 = tmppanel0 + nb * ldpanel;

    if ((nb <= 1) || (nb >= n)) {
        // Use unblocked code.
        magma_sgetmatrix( n, n, dlA(0, 0, 0), ldda, panel, ldpanel);
        lapackf77_spotrf( uplo_, &n, panel, &ldpanel, info);
        magma_ssetmatrix( n, n, panel, ldpanel, dlA(0, 0, 0), ldda );
    } else {
        for( d = 0; d < ngpu; d++ ) {
            // local matrix width (number of columns) owned by GPU d
            n_local[d] = ((n / nb) / ngpu) * nb;
            if (d < (n / nb) % ngpu)
                n_local[d] += nb;
            else if (d == (n / nb) % ngpu)
                n_local[d] += n % nb;

            magma_setdevice(d);
            magma_device_sync();
            if (MAGMA_SUCCESS != magma_smalloc( &d_lP[d], nb * ldda )) {
                for( j = 0; j < d; j++ ) {
                    magma_setdevice(j);
                    // free the panel buffers already allocated on previous GPUs
                    magma_free( d_lP[j] );
                }
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }
            for( j=0; j < nqueue; j++ ) {
                magma_queue_create( &queues[d][j] );
            }
        }

        //#define ENABLE_TIMER
        #if defined (ENABLE_TIMER)
        real_Double_t therk[4], tmtc, tcchol, tctrsm, tctm, tmnp, tcnp;
        real_Double_t ttot_herk[4] = {0,0,0,0}, ttot_mtc = 0, ttot_cchol = 0, ttot_ctrsm = 0, ttot_ctm = 0, ttot_mnp = 0, ttot_cnp = 0;
        printf("\n\n %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n",
                "j", "nb", "row", "mtc", "CPU_np", "panel", "ctrsm", "CH+TRSM", "CPU", "dsyrk[0]", "dsyrk[1]", "dsyrk[2]", "dsyrk[3]", "ctm P", "gpu_np");
        printf("     ====================================================================================================\n");
        #endif

        // Use blocked code.
        if (uplo == MagmaUpper) {
            printf( " === not supported, yet ===\n" );
        } else {
            blkid = -1;
            // crosspoint decides who updates the next panel: the CPU does the
            // update when prevtrsmrows > crosspoint, the GPU when trsmrows <=
            // crosspoint. Earlier per-ngpu tunings (n for 3 or 4 GPUs, 20160
            // for 2 GPUs) and the debug override 0 were all superseded; the
            // effective setting is n, i.e. the GPU always does the look-ahead
            // update of the next panel.
            crosspoint = n;

            #if defined (ENABLE_TIMER)
            real_Double_t tget = magma_wtime(), tset = 0.0, ttot = 0.0;
            #endif
            if ( n > nb ) {
                // send first panel to cpu
                magma_setdevice(0);
                tmppanel = tmppanel0;
                magma_sgetmatrix_async(n, nb,
                        dlA(0, 0, 0), ldda,
                        tmppanel(0),  ldpanel,
                        queues[0][0] );
            }
            #if defined (ENABLE_TIMER)
            for( d=0; d < ngpu; d++ ) {
                magma_setdevice(d);
                magma_device_sync();
            }
            tget = magma_wtime()-tget;
            #endif

            // Compute the Cholesky factorization A = L*L'
            for (j = 0; (j + nb) < n; j += nb) {
                #if defined (ENABLE_TIMER)
                therk[0] = therk[1] = therk[2] = therk[3] = tmtc = tcchol = tctrsm = tctm = tmnp = tcnp = 0.0;
                #endif

                blkid += 1;
                tmppanel = (blkid % 2 == 0) ? tmppanel0 : tmppanel1;
                // Set the gpu number that holds the current panel
                id = (j / nb) % ngpu;
                magma_setdevice(id);

                // Set the local index where the current panel is
                j_local = j / (nb * ngpu) * nb;
                
                rows = n - j;
                // Wait for the panel on cpu
                magma_queue_sync( queues[id][0] );
                if (j > 0 && prevtrsmrows > crosspoint) {
                    #if defined (ENABLE_TIMER)
                    tcnp = magma_wtime();
                    #endif

                    tmpprevpanel = ((blkid - 1) % 2) == 0 ? tmppanel0 : tmppanel1;

                    blasf77_sgemm( MagmaNoTransStr, MagmaConjTransStr,
                            &rows, &nb, &nb,
                            &mz_one, tmpprevpanel(j), &ldpanel,
                                     tmpprevpanel(j), &ldpanel,
                            &z_one,      tmppanel(j), &ldpanel );

                    #if defined (ENABLE_TIMER)
                    tcnp = magma_wtime() - tcnp;
                    ttot_cnp += tcnp;
                    #endif
                }

                #if defined (ENABLE_TIMER)
                tcchol = magma_wtime();
                #endif
                lapackf77_spotrf(MagmaLowerStr, &nb, tmppanel(j), &ldpanel, info);
                if (*info != 0) {
                    *info = *info + j;
                    break;
                }

                #if defined (ENABLE_TIMER)
                tcchol = magma_wtime() - tcchol;
                ttot_cchol += tcchol;
                tctrsm = magma_wtime();
                #endif

                trsmrows = rows - nb;

                if (trsmrows > 0) {
                    blasf77_strsm(MagmaRightStr, MagmaLowerStr, MagmaConjTransStr, MagmaNonUnitStr,
                                  &trsmrows, &nb,
                                  &z_one, tmppanel(j), &ldpanel,
                                          tmppanel(j + nb), &ldpanel);
                }

                #if defined (ENABLE_TIMER)
                tctrsm = magma_wtime() - tctrsm;
                ttot_ctrsm += tctrsm;
                tctm = magma_wtime();
                #endif

                d = (id + 1) % ngpu;
                // send current panel to gpus
                for (igpu = 0; igpu < ngpu; igpu++, d = (d + 1) % ngpu ) {
                    magma_int_t myrows = 0;
                    magma_int_t row_offset = 0;
                    if ( d == id ) {
                        dlpanel = dlA(d, j, j_local);
                        myrows = rows;
                        row_offset = 0;
                    } else {
                        dlpanel = dlP(d, 0, 0);
                        myrows = trsmrows;
                        row_offset = nb;
                    }

                    if (myrows > 0) {
                        magma_setdevice(d);
                        magma_ssetmatrix_async(myrows, nb,
                                tmppanel(j + row_offset),    ldpanel,
                                dlpanel, ldda, queues[d][0] );
                    }
                }
                /* make sure panel is on GPUs */
                d = (id + 1) % ngpu;
                for (igpu = 0; igpu < ngpu; igpu++, d = (d + 1) % ngpu ) {
                    magma_setdevice(d);
                    magma_queue_sync( queues[d][0] );
                }

                #if defined (ENABLE_TIMER)
                tctm = magma_wtime() - tctm;
                ttot_ctm += tctm;
                #endif

                if ( (j + nb) < n) {
                    magma_int_t offset = 0;
                    magma_int_t row_offset = 0;
                    if (j + nb + nb < n) {
                        d = (id + 1) % ngpu;
                        magma_setdevice(d);
                        magma_int_t j_local2 = (j + nb) / (nb * ngpu) * nb;
                        if (trsmrows <= crosspoint) {
                            #if defined (ENABLE_TIMER)
                            tmnp = magma_wtime();
                            #endif

                            // do gemm on look ahead panel
                            if ( d == id ) {
                                dlpanel = dlA(d, j + nb, j_local);
                            } else {
                                dlpanel = dlP(d, 0, 0);
                            }

                            magmablasSetKernelStream( queues[d][STREAM_ID(j_local2)] );
                            #define SSYRK_ON_DIAG
                            #ifdef  SSYRK_ON_DIAG
                            magma_ssyrk( MagmaLower, MagmaNoTrans,
                                    nb, nb,
                                    m_one, dlpanel, ldda,
                                     one,  dlA(d, j + nb, j_local2), ldda);
                            magma_sgemm( MagmaNoTrans, MagmaConjTrans,
                                    trsmrows-nb, nb, nb,
                                    mz_one, dlpanel+nb, ldda,
                                            dlpanel,    ldda,
                                     z_one, dlA(d, j + nb +nb, j_local2), ldda);
                            #else
                            magma_sgemm( MagmaNoTrans, MagmaConjTrans,
                                    trsmrows, nb, nb,
                                    mz_one, dlpanel, ldda,
                                            dlpanel, ldda,
                                     z_one, dlA(d, j + nb, j_local2), ldda);
                            #endif

                            #if defined (ENABLE_TIMER)
                            magma_device_sync();
                            tmnp = magma_wtime() - tmnp;
                            ttot_mnp += tmnp;
                            #endif
                        }
                        // send next panel to cpu
                        magma_queue_sync( queues[d][STREAM_ID(j_local2)] ); // make sure lookahead is done
                        tmppanel = ((blkid+1) % 2 == 0) ? tmppanel0 : tmppanel1;
                        magma_sgetmatrix_async(rows-nb, nb,
                                dlA(d, j+nb, j_local2), ldda,
                                tmppanel(j+nb),  ldpanel,
                                queues[d][0] );
                        tmppanel = (blkid % 2 == 0) ? tmppanel0 : tmppanel1;

                        offset = j + nb + nb;
                        row_offset = nb;
                    } else {
                        offset = j + nb;
                        row_offset = 0;
                    }

                    if (n - offset > 0) {
                        // syrk on multiple gpu
                        for (d = 0; d < ngpu; d++ ) {
                            if ( d == id ) {
                                dlpanels[d] = dlA(d, j + nb + row_offset, j_local);
                            } else {
                                dlpanels[d] = dlP(d, row_offset, 0);
                            }
                        }

                        #if defined (ENABLE_TIMER)
                        for( d=0; d < ngpu; d++ ) therk[d] = magma_wtime();
                        #endif

                        //magmablasSetKernelStream( queues[d] );
                        //magma_ssyrk(MagmaLower, MagmaNoTrans, n - offset, nb,
                        //        m_one, dlpanel, ldda,
                        //        one, &d_lA[d][offset + offset*ldda], ldda );
                        #ifdef  SSYRK_ON_DIAG
                        magma_ssyrk_mgpu
                        #else
                        magma_ssyrk_mgpu2
                        #endif
                                        (ngpu, MagmaLower, MagmaNoTrans,
                                         nb, n - offset, nb,
                                         m_one, dlpanels, ldda, 0,
                                         one,   d_lA,     ldda, offset,
                                         nqueue, queues );
                        #if defined (ENABLE_TIMER)
                        for( d=0; d < ngpu; d++ ) {
                            magma_setdevice(d);
                            magma_device_sync();
                            therk[d] = magma_wtime() - therk[d];
                            ttot_herk[d] += therk[d];
                        }
                        #endif
                    }

                    prevtrsmrows = trsmrows;

                    #if defined (ENABLE_TIMER)
                    ttot += (tcnp+tcchol+tctrsm+therk[0]+therk[1]+therk[2]+tctm+tmnp);
                    printf("%10d %10d %10d %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf(%d) %10.3lf\n",
                            j, nb, rows, tmtc,
                            tcnp,     // gemm
                            tcchol,   // potrf
                            tctrsm,   // trsm
                            (tcchol + tctrsm),
                            (tmtc+tcnp+tcchol+tctrsm),
                            therk[0], therk[1], therk[2], therk[3], // syrk
                            tctm, // copy panel to GPU
                            tmnp, // lookahead on GPU
                            (id + 1) % ngpu,
                            (tcnp+tcchol+tctrsm+therk[0]+therk[1]+therk[2]+tctm+tmnp));
                    fflush(0);
                    #endif
                }
            }
            for( d = 0; d < ngpu; d++ ) {
                magma_setdevice(d);
                for( id=0; id < nqueue; id++ ) {
                    magma_queue_sync( queues[d][id] );
                }
            }
            #if defined (ENABLE_TIMER)
            printf("\n%10d %10d %10d %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf(-) %10.3lf\n",
                    n, n, 0, ttot_mtc,
                    ttot_cnp,     // gemm
                    ttot_cchol,   // potrf
                    ttot_ctrsm,   // trsm
                    (ttot_cchol + ttot_ctrsm),
                    (ttot_mtc+ttot_cnp+ttot_cchol+ttot_ctrsm),
                    ttot_herk[0], ttot_herk[1], ttot_herk[2], ttot_herk[3], // syrk
                    ttot_ctm, // copy panel to GPU
                    ttot_mnp, // lookahead on GPU
                    (ttot_cnp+ttot_cchol+ttot_ctrsm+ttot_herk[0]+ttot_herk[1]+ttot_herk[2]+ttot_ctm+ttot_mnp));
            printf("%10d %10d %10d %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf(-) %10.3lf (ratio)\n",
                    n, n, 0, ttot_mtc/ttot,
                    ttot_cnp/ttot,     // gemm
                    ttot_cchol/ttot,   // potrf
                    ttot_ctrsm/ttot,   // trsm
                    (ttot_cchol + ttot_ctrsm)/ttot,
                    (ttot_mtc+ttot_cnp+ttot_cchol+ttot_ctrsm)/ttot,
                    ttot_herk[0]/ttot, ttot_herk[1]/ttot, ttot_herk[2]/ttot, ttot_herk[3]/ttot, // syrk
                    ttot_ctm/ttot, // copy panel to GPU
                    ttot_mnp/ttot, // lookahead on GPU
                    (ttot_cnp+ttot_cchol+ttot_ctrsm+ttot_herk[0]+ttot_herk[1]+ttot_herk[2]+ttot_ctm+ttot_mnp)/ttot);
            #endif

            // Cholesky factorization of the last (trailing) block
            if (j < n && *info == 0) {
                rows = n - j;
                id = (j / nb) % ngpu;

                // Set the local index where the current panel is
                j_local = j / (nb * ngpu) * nb;
                
                magma_setdevice(id);
                #if defined (ENABLE_TIMER)
                tset = magma_wtime();
                #endif
                magma_sgetmatrix(rows, rows, dlA(id, j, j_local), ldda, panel(j), ldpanel);
                lapackf77_spotrf(MagmaLowerStr, &rows, panel(j), &ldpanel, info);
                magma_ssetmatrix(rows, rows, panel(j), ldpanel, dlA(id, j, j_local), ldda);
                #if defined (ENABLE_TIMER)
                tset = magma_wtime() - tset;
                #endif
            }
            #if defined (ENABLE_TIMER)
            printf( " matrix_get,set: %10.3lf %10.3lf -> %10.3lf\n",tget,tset,ttot+tget+tset );
            #endif
        } // end of else not upper

        // clean up
        for( d = 0; d < ngpu; d++ ) {
            magma_setdevice(d);
            for( j=0; j < nqueue; j++ ) {
                magma_queue_destroy( queues[d][j] );
            }
            magma_free( d_lP[d] );
        }
    } // end of not lapack

    // free workspace
    magma_free_pinned( panel );
    magma_setdevice( orig_dev );
    magmablasSetKernelStream( orig_stream );

    return *info;
} /* magma_spotrf_mgpu_right */
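A minimal driver sketch for the routine above, assuming the 1-D block-column-cyclic layout implied by its internal indexing and the stream-less copy API it uses; the host matrix A, its leading dimension lda, the order n, and the GPU count ngpu are caller-supplied placeholders, and error handling is omitted:

    // Hedged sketch: distribute a symmetric positive definite matrix over
    // ngpu GPUs (block columns dealt out cyclically) and factor it.
    magma_int_t nb   = magma_get_spotrf_nb( n );
    magma_int_t ldda = ((n + 31)/32)*32;          // keeps LDDA divisible by 16
    float *d_lA[MagmaMaxGPUs];
    magma_int_t n_local[MagmaMaxGPUs];

    for (magma_int_t d = 0; d < ngpu; ++d) {
        // same local-width formula the factorization uses internally
        n_local[d] = ((n / nb) / ngpu) * nb;
        if (d < (n / nb) % ngpu)
            n_local[d] += nb;
        else if (d == (n / nb) % ngpu)
            n_local[d] += n % nb;
        magma_setdevice( d );
        magma_smalloc( &d_lA[d], ldda * n_local[d] );
    }

    // block column b lives on GPU b % ngpu, at local block index b / ngpu;
    // the full column height is copied, the upper part is simply not referenced
    for (magma_int_t b = 0; b*nb < n; ++b) {
        magma_int_t jb = min( nb, n - b*nb );
        magma_setdevice( b % ngpu );
        magma_ssetmatrix( n, jb,
                          A + b*nb*lda, lda,
                          d_lA[b % ngpu] + (b/ngpu)*nb*ldda, ldda );
    }

    magma_int_t info;
    magma_spotrf_mgpu_right( ngpu, MagmaLower, n, d_lA, ldda, &info );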
Example no. 7
/***************************************************************************//**
    Purpose
    -------
    SPOTRF computes the Cholesky factorization of a real symmetric
    positive definite matrix dA.

    The factorization has the form
        dA = U**H * U,   if UPLO = MagmaUpper, or
        dA = L  * L**H,  if UPLO = MagmaLower,
    where U is an upper triangular matrix and L is lower triangular.

    This is the block version of the algorithm, calling Level 3 BLAS.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of dA is stored;
      -     = MagmaLower:  Lower triangle of dA is stored.

    @param[in]
    n       INTEGER
            The order of the matrix dA.  N >= 0.

    @param[in,out]
    dA      REAL array on the GPU, dimension (LDDA,N)
            On entry, the symmetric matrix dA.  If UPLO = MagmaUpper, the leading
            N-by-N upper triangular part of dA contains the upper
            triangular part of the matrix dA, and the strictly lower
            triangular part of dA is not referenced.  If UPLO = MagmaLower, the
            leading N-by-N lower triangular part of dA contains the lower
            triangular part of the matrix dA, and the strictly upper
            triangular part of dA is not referenced.
    \n
            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization dA = U**H * U or dA = L * L**H.

    @param[in]
    ldda     INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalesced memory accesses, LDDA must be
            divisible by 16.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.

    @ingroup magma_potrf
*******************************************************************************/
extern "C" magma_int_t
magma_spotrf_gpu(
    magma_uplo_t uplo, magma_int_t n,
    magmaFloat_ptr dA, magma_int_t ldda,
    magma_int_t *info )
{
    #ifdef HAVE_clBLAS
    #define dA(i_, j_)  dA, ((i_) + (j_)*ldda + dA_offset)
    #else
    #define dA(i_, j_) (dA + (i_) + (j_)*ldda)
    #endif

    /* Constants */
    const float c_one     = MAGMA_S_ONE;
    const float c_neg_one = MAGMA_S_NEG_ONE;
    const float d_one     =  1.0;
    const float d_neg_one = -1.0;
    
    /* Local variables */
    const char* uplo_ = lapack_uplo_const( uplo );
    bool upper = (uplo == MagmaUpper);
    
    magma_int_t j, jb, nb;
    float *work;

    *info = 0;
    if (! upper && uplo != MagmaLower) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,n)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    
    nb = magma_get_spotrf_nb( n );
    
    if (MAGMA_SUCCESS != magma_smalloc_pinned( &work, nb*nb )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }
    
    magma_queue_t queues[2];
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queues[0] );
    magma_queue_create( cdev, &queues[1] );
    
    if (nb <= 1 || nb >= n) {
        /* Use unblocked code. */
        magma_sgetmatrix( n, n, dA(0,0), ldda, work, n, queues[0] );
        lapackf77_spotrf( uplo_, &n, work, &n, info );
        magma_ssetmatrix( n, n, work, n, dA(0,0), ldda, queues[0] );
    }
    else {
        /* Use blocked code. */
        if (upper) {
            //=========================================================
            /* Compute the Cholesky factorization A = U'*U. */
            for (j=0; j < n; j += nb) {
                // apply all previous updates to diagonal block,
                // then transfer it to CPU
                jb = min( nb, n-j );
                magma_ssyrk( MagmaUpper, MagmaConjTrans, jb, j,
                             d_neg_one, dA(0, j), ldda,
                             d_one,     dA(j, j), ldda, queues[1] );
                
                magma_queue_sync( queues[1] );
                magma_sgetmatrix_async( jb, jb,
                                        dA(j, j), ldda,
                                        work,     jb, queues[0] );
                
                // apply all previous updates to block row right of diagonal block
                if (j+jb < n) {
                    magma_sgemm( MagmaConjTrans, MagmaNoTrans,
                                 jb, n-j-jb, j,
                                 c_neg_one, dA(0, j   ), ldda,
                                            dA(0, j+jb), ldda,
                                 c_one,     dA(j, j+jb), ldda, queues[1] );
                }
                
                // simultaneous with above sgemm, transfer diagonal block,
                // factor it on CPU, and test for positive definiteness
                magma_queue_sync( queues[0] );
                lapackf77_spotrf( MagmaUpperStr, &jb, work, &jb, info );
                magma_ssetmatrix_async( jb, jb,
                                        work,     jb,
                                        dA(j, j), ldda, queues[1] );
                if (*info != 0) {
                    *info = *info + j;
                    break;
                }
                
                // apply diagonal block to block row right of diagonal block
                if (j+jb < n) {
                    magma_strsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                 jb, n-j-jb,
                                 c_one, dA(j, j),    ldda,
                                        dA(j, j+jb), ldda, queues[1] );
                }
            }
        }
        else {
            //=========================================================
            // Compute the Cholesky factorization A = L*L'.
            for (j=0; j < n; j += nb) {
                // apply all previous updates to diagonal block,
                // then transfer it to CPU
                jb = min( nb, n-j );
                magma_ssyrk( MagmaLower, MagmaNoTrans, jb, j,
                             d_neg_one, dA(j, 0), ldda,
                             d_one,     dA(j, j), ldda, queues[1] );
                
                magma_queue_sync( queues[1] );
                magma_sgetmatrix_async( jb, jb,
                                        dA(j, j), ldda,
                                        work,     jb, queues[0] );
                
                // apply all previous updates to block column below diagonal block
                if (j+jb < n) {
                    magma_sgemm( MagmaNoTrans, MagmaConjTrans,
                                 n-j-jb, jb, j,
                                 c_neg_one, dA(j+jb, 0), ldda,
                                            dA(j,    0), ldda,
                                 c_one,     dA(j+jb, j), ldda, queues[1] );
                }
                
                // simultaneous with above sgemm, transfer diagonal block,
                // factor it on CPU, and test for positive definiteness
                magma_queue_sync( queues[0] );
                lapackf77_spotrf( MagmaLowerStr, &jb, work, &jb, info );
                magma_ssetmatrix_async( jb, jb,
                                        work,     jb,
                                        dA(j, j), ldda, queues[1] );
                if (*info != 0) {
                    *info = *info + j;
                    break;
                }
                
                // apply diagonal block to block column below diagonal
                if (j+jb < n) {
                    magma_strsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                 n-j-jb, jb,
                                 c_one, dA(j,    j), ldda,
                                        dA(j+jb, j), ldda, queues[1] );
                }
            }
        }
    }
    
    magma_queue_destroy( queues[0] );
    magma_queue_destroy( queues[1] );
    
    magma_free_pinned( work );
    
    return *info;
} /* magma_spotrf_gpu */
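For context, a minimal factor-then-solve sketch around the routine above, pairing it with the companion solver magma_spotrs_gpu; hA, hB and the problem sizes are placeholders, and the fill-in of the matrix is elided:

    // Hedged sketch: solve A*X = B for a symmetric positive definite A.
    magma_int_t n = 1000, nrhs = 1, info;
    magma_int_t lda  = n, ldb = n;
    magma_int_t ldda = magma_roundup( n, 32 );
    magma_int_t lddb = ldda;
    float *hA, *hB;
    magmaFloat_ptr dA, dB;

    magma_init();
    magma_smalloc_cpu( &hA, lda*n );
    magma_smalloc_cpu( &hB, ldb*nrhs );
    // ... fill hA (symmetric positive definite) and hB ...

    magma_queue_t queue;
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );

    magma_smalloc( &dA, ldda*n );
    magma_smalloc( &dB, lddb*nrhs );
    magma_ssetmatrix( n, n,    hA, lda, dA, ldda, queue );
    magma_ssetmatrix( n, nrhs, hB, ldb, dB, lddb, queue );

    magma_spotrf_gpu( MagmaLower, n, dA, ldda, &info );      // dA <- L
    if (info == 0)
        magma_spotrs_gpu( MagmaLower, n, nrhs, dA, ldda, dB, lddb, &info );
    magma_sgetmatrix( n, nrhs, dB, lddb, hB, ldb, queue );   // hB <- X

    magma_queue_destroy( queue );
    magma_free( dA );      magma_free( dB );
    magma_free_cpu( hA );  magma_free_cpu( hB );
    magma_finalize();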
Example no. 8
/**
    Purpose
    -------
    SGETRF_NOPIV_GPU computes an LU factorization of a general M-by-N
    matrix A without any pivoting.

    The factorization has the form
        A = L * U
    where L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    dA      REAL array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = L * U; the unit diagonal elements of L are not stored.

    @param[in]
    ldda     INTEGER
            The leading dimension of the array A.  LDDA >= max(1,M).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as a memory allocation failure.
      -     > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    @ingroup magma_sgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_sgetrf_nopiv_gpu(
    magma_int_t m, magma_int_t n,
    magmaFloat_ptr dA, magma_int_t ldda,
    magma_int_t *info )
{
    #ifdef HAVE_clBLAS
    #define  dA(i_, j_) dA,  (dA_offset  + (i_)*nb       + (j_)*nb*ldda)
    #else
    #define  dA(i_, j_) (dA  + (i_)*nb       + (j_)*nb*ldda)
    #endif

    float c_one     = MAGMA_S_ONE;
    float c_neg_one = MAGMA_S_NEG_ONE;

    magma_int_t iinfo, nb;
    magma_int_t maxm, mindim;
    magma_int_t j, rows, s, ldwork;
    float *work;

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (ldda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min( m, n );
    nb     = magma_get_sgetrf_nb( m, n );
    s      = mindim / nb;

    magma_queue_t queues[2];
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queues[0] );
    magma_queue_create( cdev, &queues[1] );

    if (nb <= 1 || nb >= min(m,n)) {
        /* Use CPU code. */
        if ( MAGMA_SUCCESS != magma_smalloc_cpu( &work, m*n )) {
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        magma_sgetmatrix( m, n, dA(0,0), ldda, work, m, queues[0] );
        magma_sgetrf_nopiv( m, n, work, m, info );
        magma_ssetmatrix( m, n, work, m, dA(0,0), ldda, queues[0] );
        magma_free_cpu( work );
    }
    else {
        /* Use hybrid blocked code. */
        maxm = magma_roundup( m, 32 );

        ldwork = maxm;
        if (MAGMA_SUCCESS != magma_smalloc_pinned( &work, ldwork*nb )) {
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }

        for( j=0; j < s; j++ ) {
            // get j-th panel from device
            magma_queue_sync( queues[1] );
            magma_sgetmatrix_async( m-j*nb, nb, dA(j,j), ldda, work, ldwork, queues[0] );
            
            if ( j > 0 ) {
                magma_strsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit,
                             nb, n - (j+1)*nb,
                             c_one, dA(j-1,j-1), ldda,
                                    dA(j-1,j+1), ldda, queues[1] );
                magma_sgemm( MagmaNoTrans, MagmaNoTrans,
                             m-j*nb, n-(j+1)*nb, nb,
                             c_neg_one, dA(j,  j-1), ldda,
                                        dA(j-1,j+1), ldda,
                             c_one,     dA(j,  j+1), ldda, queues[1] );
            }

            // do the cpu part
            rows = m - j*nb;
            magma_queue_sync( queues[0] );
            magma_sgetrf_nopiv( rows, nb, work, ldwork, &iinfo );
            if ( *info == 0 && iinfo > 0 )
                *info = iinfo + j*nb;

            // send j-th panel to device
            magma_ssetmatrix_async( m-j*nb, nb, work, ldwork, dA(j, j), ldda, queues[0] );
            magma_queue_sync( queues[0] );

            // do the small non-parallel computations (next panel update)
            if ( s > j+1 ) {
                magma_strsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit,
                             nb, nb,
                             c_one, dA(j, j  ), ldda,
                                    dA(j, j+1), ldda, queues[1] );
                magma_sgemm( MagmaNoTrans, MagmaNoTrans,
                             m-(j+1)*nb, nb, nb,
                             c_neg_one, dA(j+1, j  ), ldda,
                                        dA(j,   j+1), ldda,
                             c_one,     dA(j+1, j+1), ldda, queues[1] );
            }
            else {
                magma_strsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit,
                             nb, n-s*nb,
                             c_one, dA(j, j  ), ldda,
                                    dA(j, j+1), ldda, queues[1] );
                magma_sgemm( MagmaNoTrans, MagmaNoTrans,
                             m-(j+1)*nb, n-(j+1)*nb, nb,
                             c_neg_one, dA(j+1, j  ), ldda,
                                        dA(j,   j+1), ldda,
                             c_one,     dA(j+1, j+1), ldda, queues[1] );
            }
        }

        magma_int_t nb0 = min( m - s*nb, n - s*nb );
        if ( nb0 > 0 ) {
            rows = m - s*nb;
            
            magma_sgetmatrix( rows, nb0, dA(s,s), ldda, work, ldwork, queues[1] );
            
            // do the cpu part
            magma_sgetrf_nopiv( rows, nb0, work, ldwork, &iinfo );
            if ( *info == 0 && iinfo > 0 )
                *info = iinfo + s*nb;
    
            // send j-th panel to device
            magma_ssetmatrix( rows, nb0, work, ldwork, dA(s,s), ldda, queues[1] );
    
            magma_strsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit,
                         nb0, n-s*nb-nb0,
                         c_one, dA(s,s),     ldda,
                                dA(s,s)+nb0*ldda, ldda, queues[1] );  // column offset past the final panel
        }
        
        magma_free_pinned( work );
    }
    
    magma_queue_destroy( queues[0] );
    magma_queue_destroy( queues[1] );
    
    return *info;
} /* magma_sgetrf_nopiv_gpu */
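Because no pivoting is performed, the factorization can break down or lose accuracy unless the matrix needs no row interchanges (e.g., it is diagonally dominant). Given the factors, solving A*X = B is two triangular solves, since L carries an implicit unit diagonal; a hedged sketch with placeholder buffers dA, dB and an already-created queue:

    // Hedged sketch: factor A = L*U (no pivoting), then solve A*X = B in place.
    magma_int_t info;
    magma_sgetrf_nopiv_gpu( n, n, dA, ldda, &info );
    if (info == 0) {
        // L*Y = B   (L is unit lower triangular)
        magma_strsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit,
                     n, nrhs, MAGMA_S_ONE, dA, ldda, dB, lddb, queue );
        // U*X = Y
        magma_strsm( MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit,
                     n, nrhs, MAGMA_S_ONE, dA, ldda, dB, lddb, queue );
    }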
Example no. 9
/***************************************************************************//**
    Purpose
    -------
    SORMQR overwrites the general real M-by-N matrix C with

    @verbatim
                                SIDE = MagmaLeft    SIDE = MagmaRight
    TRANS = MagmaNoTrans:       Q * C               C * Q
    TRANS = MagmaTrans:         Q**H * C            C * Q**H
    @endverbatim

    where Q is a real orthogonal matrix defined as the product of k
    elementary reflectors

          Q = H(1) H(2) . . . H(k)

    as returned by SGEQRF. Q is of order M if SIDE = MagmaLeft and of order N
    if SIDE = MagmaRight.

    Arguments
    ---------
    @param[in]
    ngpu    INTEGER
            Number of GPUs to use. ngpu > 0.

    @param[in]
    side    magma_side_t
      -     = MagmaLeft:      apply Q or Q**H from the Left;
      -     = MagmaRight:     apply Q or Q**H from the Right.

    @param[in]
    trans   magma_trans_t
      -     = MagmaNoTrans:    No transpose, apply Q;
      -     = MagmaTrans:      Transpose, apply Q**H.

    @param[in]
    m       INTEGER
            The number of rows of the matrix C. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix C. N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines
            the matrix Q.
            If SIDE = MagmaLeft,  M >= K >= 0;
            if SIDE = MagmaRight, N >= K >= 0.

    @param[in]
    A       REAL array, dimension (LDA,K)
            The i-th column must contain the vector which defines the
            elementary reflector H(i), for i = 1,2,...,k, as returned by
            SGEQRF in the first k columns of its array argument A.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.
            If SIDE = MagmaLeft,  LDA >= max(1,M);
            if SIDE = MagmaRight, LDA >= max(1,N).

    @param[in]
    tau     REAL array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by SGEQRF.

    @param[in,out]
    C       REAL array, dimension (LDC,N)
            On entry, the M-by-N matrix C.
            On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q.

    @param[in]
    ldc     INTEGER
            The leading dimension of the array C. LDC >= max(1,M).

    @param[out]
    work    (workspace) REAL array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.
            If SIDE = MagmaLeft,  LWORK >= max(1,N);
            if SIDE = MagmaRight, LWORK >= max(1,M).
            For optimum performance LWORK >= N*NB if SIDE = MagmaLeft, and
            LWORK >= M*NB if SIDE = MagmaRight, where NB is the optimal
            blocksize.
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_unmqr
*******************************************************************************/
extern "C" magma_int_t
magma_sormqr_m(
    magma_int_t ngpu,
    magma_side_t side, magma_trans_t trans,
    magma_int_t m, magma_int_t n, magma_int_t k,
    float *A,    magma_int_t lda,
    float *tau,
    float *C,    magma_int_t ldc,
    float *work, magma_int_t lwork,
    magma_int_t *info)
{
#define  A(i, j) (A + (j)*lda  + (i))
#define  C(i, j) (C + (j)*ldc  + (i))

#define    dC(gpui,      i, j) (dw[gpui] + (j)*lddc + (i))
#define  dA_c(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddac)
#define  dA_r(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddar)
#define    dT(gpui, ind)       (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + (ind)*((nb+1)*nb))
#define dwork(gpui, ind)       (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + 2*((nb+1)*nb) + (ind)*(lddwork*nb))

    /* Constants */
    float c_zero = MAGMA_S_ZERO;
    float c_one  = MAGMA_S_ONE;

    /* Local variables */
    const char* side_  = lapack_side_const( side );
    const char* trans_ = lapack_trans_const( trans );

    magma_int_t nb = 128;
    float *T = NULL;
    magmaFloat_ptr dw[MagmaMaxGPUs] = { NULL };
    magma_queue_t queues[MagmaMaxGPUs][2] = {{ NULL }};
    magma_event_t events[MagmaMaxGPUs][2] = {{ NULL }};

    magma_int_t ind_c;
    magma_device_t dev;
    
    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );

    *info = 0;

    magma_int_t left   = (side == MagmaLeft);
    magma_int_t notran = (trans == MagmaNoTrans);
    magma_int_t lquery = (lwork == -1);

    /* NQ is the order of Q and NW is the minimum dimension of WORK */
    magma_int_t nq, nw;
    if (left) {
        nq = m;
        nw = n;
    } else {
        nq = n;
        nw = m;
    }

    if (! left && side != MagmaRight) {
        *info = -1;
    } else if (! notran && trans != MagmaTrans) {
        *info = -2;
    } else if (m < 0) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (k < 0 || k > nq) {
        *info = -5;
    } else if (lda < max(1,nq)) {
        *info = -7;
    } else if (ldc < max(1,m)) {
        *info = -10;
    } else if (lwork < max(1,nw) && ! lquery) {
        *info = -12;
    }

    magma_int_t lwkopt = max(1,nw) * nb;
    if (*info == 0) {
        work[0] = magma_smake_lwork( lwkopt );
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0 || k == 0) {
        work[0] = c_one;
        return *info;
    }

    if (nb >= k) {
        /* Use CPU code */
        lapackf77_sormqr(side_, trans_, &m, &n, &k, A, &lda, tau,
                         C, &ldc, work, &lwork, info);
        return *info;
    }

    magma_int_t lddc = magma_roundup( m, 64 );  // TODO why 64 instead of 32 ?
    magma_int_t lddac = nq;
    magma_int_t lddar = nb;
    magma_int_t lddwork = nw;

    magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 };

    magma_int_t nb_l=256;
    magma_int_t nbl = magma_ceildiv( n, nb_l ); // number of blocks
    magma_int_t maxnlocal = magma_ceildiv( nbl, ngpu )*nb_l;

    ngpu = min( ngpu, magma_ceildiv( n, nb_l )); // Don't use GPU that will not have data.

    magma_int_t ldw = maxnlocal*lddc // dC
                    + 2*lddac*lddar // 2*dA
                    + 2*(nb + 1 + lddwork)*nb; // 2*(dT and dwork)

    if (MAGMA_SUCCESS != magma_smalloc_pinned( &T, nb*nb )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        goto cleanup;
    }
    for (dev = 0; dev < ngpu; ++dev) {
        magma_setdevice( dev );
        if (MAGMA_SUCCESS != magma_smalloc( &dw[dev], ldw )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            goto cleanup;
        }
        magma_queue_create( dev, &queues[dev][0] );
        magma_queue_create( dev, &queues[dev][1] );
        magma_event_create( &events[dev][0] );
        magma_event_create( &events[dev][1] );
    }

    /* Use hybrid CPU-MGPU code */
    if (left) {
        //copy C to mgpus
        for (magma_int_t i = 0; i < nbl; ++i) {
            dev = i % ngpu;
            magma_setdevice( dev );
            magma_int_t kb = min(nb_l, n-i*nb_l);
            magma_ssetmatrix_async( m, kb,
                                   C(0, i*nb_l), ldc,
                                   dC(dev, 0, i/ngpu*nb_l), lddc, queues[dev][0] );
            nlocal[dev] += kb;
        }

        magma_int_t i1, i2, i3;
        if ( !notran ) {
            i1 = 0;
            i2 = k;
            i3 = nb;
        } else {
            i1 = (k - 1) / nb * nb;
            i2 = 0;
            i3 = -nb;
        }

        ind_c = 0;

        for (magma_int_t i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) {
            // start the copy of A panel
            magma_int_t kb = min(nb, k - i);
            for (dev = 0; dev < ngpu; ++dev) {
                magma_setdevice( dev );
                magma_event_sync( events[dev][ind_c] ); // check if the new data can be copied
                magma_ssetmatrix_async(nq-i, kb,
                                       A(i, i),                 lda,
                                       dA_c(dev, ind_c, i, 0), lddac, queues[dev][0] );
                // set upper triangular part of dA to identity
                magmablas_slaset_band( MagmaUpper, kb, kb, kb, c_zero, c_one, dA_c(dev, ind_c, i, 0), lddac, queues[dev][0] );
            }

            /* Form the triangular factor of the block reflector
             H = H(i) H(i+1) . . . H(i+ib-1) */
            magma_int_t nqi = nq - i;
            lapackf77_slarft("F", "C", &nqi, &kb, A(i, i), &lda,
                             &tau[i], T, &kb);

            /* H or H' is applied to C(1:m,i:n) */

            /* Apply H or H'; First copy T to the GPU */
            for (dev = 0; dev < ngpu; ++dev) {
                magma_setdevice( dev );
                magma_ssetmatrix_async(kb, kb,
                                       T,               kb,
                                       dT(dev, ind_c), kb, queues[dev][0] );
            }

            for (dev = 0; dev < ngpu; ++dev) {
                magma_setdevice( dev );
                magma_queue_sync( queues[dev][0] ); // check if the data was copied
                magma_slarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
                                 m-i, nlocal[dev], kb,
                                 dA_c(dev, ind_c, i, 0), lddac, dT(dev, ind_c), kb,
                                 dC(dev, i, 0), lddc,
                                 dwork(dev, ind_c), lddwork, queues[dev][1] );
                magma_event_record(events[dev][ind_c], queues[dev][1] );
            }

            ind_c = (ind_c+1)%2;
        }

        for (dev = 0; dev < ngpu; ++dev) {
            magma_setdevice( dev );
            magma_queue_sync( queues[dev][1] );
        }

        //copy C from mgpus
        for (magma_int_t i = 0; i < nbl; ++i) {
            dev = i % ngpu;
            magma_setdevice( dev );
            magma_int_t kb = min(nb_l, n-i*nb_l);
            magma_sgetmatrix( m, kb,
                              dC(dev, 0, i/ngpu*nb_l), lddc,
                              C(0, i*nb_l), ldc, queues[dev][1] );
//            magma_sgetmatrix_async( m, kb,
//                                   dC(dev, 0, i/ngpu*nb_l), lddc,
//                                   C(0, i*nb_l), ldc, queues[dev][0] );
        }
    } else {
        *info = MAGMA_ERR_NOT_IMPLEMENTED;
        magma_xerbla( __func__, -(*info) );
        goto cleanup;
        
        /*
        if ( notran ) {
            i1 = 0;
            i2 = k;
            i3 = nb;
        } else {
            i1 = (k - 1) / nb * nb;
            i2 = 0;
            i3 = -nb;
        }

        mi = m;
        ic = 0;

        for (i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) {
            ib = min(nb, k - i);
            
            // Form the triangular factor of the block reflector
            // H = H(i) H(i+1) . . . H(i+ib-1)
            i__4 = nq - i;
            lapackf77_slarft("F", "C", &i__4, &ib, A(i, i), &lda,
            &tau[i], T, &ib);
            
            // 1) copy the panel from A to the GPU, and
            // 2) set upper triangular part of dA to identity
            magma_ssetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda, queues[dev][1] );
            magmablas_slaset_band( MagmaUpper, ib, ib, ib, c_zero, c_one, dA(i, 0), ldda, queues[dev][1] );
            
            // H or H' is applied to C(1:m,i:n)
            ni = n - i;
            jc = i;
            
            // Apply H or H'; First copy T to the GPU
            magma_ssetmatrix( ib, ib, T, ib, dT, ib, queues[dev][1] );
            magma_slarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
            mi, ni, ib,
            dA(i, 0), ldda, dT, ib,
            dC(ic, jc), lddc,
            dwork, lddwork, queues[dev][1] );
        }
        */
    }

cleanup:
    work[0] = magma_smake_lwork( lwkopt );

    for (dev = 0; dev < ngpu; ++dev) {
        magma_setdevice( dev );
        magma_event_destroy( events[dev][0] );
        magma_event_destroy( events[dev][1] );
        magma_queue_destroy( queues[dev][0] );
        magma_queue_destroy( queues[dev][1] );
        magma_free( dw[dev] );
    }
    magma_setdevice( orig_dev );
    magma_free_pinned( T );

    return *info;
} /* magma_sormqr */
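A hedged end-to-end sketch for the left, transpose case: factor A = Q*R on the host with LAPACK's sgeqrf, then apply Q**H to C across ngpu GPUs using the LWORK = -1 query described above; m, n, k, A, lda, tau, C and ldc are placeholders:

    // Hedged sketch: C <- Q**H * C on ngpu GPUs (A is m-by-k, C is m-by-n).
    magma_int_t info, lwork = -1;
    float query, *work;

    // host QR factorization first; tau has length k (m >= k is required)
    lapackf77_sgeqrf( &m, &k, A, &lda, tau, &query, &lwork, &info );
    lwork = (magma_int_t) query;
    magma_smalloc_cpu( &work, lwork );
    lapackf77_sgeqrf( &m, &k, A, &lda, tau, work, &lwork, &info );
    magma_free_cpu( work );

    // workspace query for the multi-GPU apply, then the apply itself
    lwork = -1;
    magma_sormqr_m( ngpu, MagmaLeft, MagmaTrans, m, n, k,
                    A, lda, tau, C, ldc, &query, lwork, &info );
    lwork = (magma_int_t) query;
    magma_smalloc_cpu( &work, lwork );
    magma_sormqr_m( ngpu, MagmaLeft, MagmaTrans, m, n, k,
                    A, lda, tau, C, ldc, work, lwork, &info );
    magma_free_cpu( work );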
Example no. 10
/**
    Purpose
    -------
    SGEQRF computes a QR factorization of a real M-by-N matrix A:
    A = Q * R.
    
    This version stores the triangular dT matrices used in
    the block QR factorization so that they can be applied directly (i.e.,
    without being recomputed) later. As a result, the application
    of Q is much faster. Also, the upper triangular parts of the reflector
    blocks V are zeroed out; the corresponding parts of the upper triangular
    R are inverted and stored separately in dT.
    
    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    dA      REAL array on the GPU, dimension (LDDA,N)
            On entry, the M-by-N matrix A.
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further
            Details).

    @param[in]
    ldda     INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).
            To benefit from coalesced memory accesses, LDDA must be
            divisible by 16.

    @param[out]
    tau     REAL array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    @param[out]
    dT      (workspace) REAL array on the GPU,
            dimension (2*MIN(M,N) + ((N+31)/32)*32)*NB,
            where NB can be obtained through magma_get_sgeqrf_nb(M).
            It starts with a MIN(M,N)*NB block that stores the triangular T
            matrices, followed by a MIN(M,N)*NB block of the diagonal
            inverses for the R matrix. The rest of the array is used as workspace.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as a memory allocation failure.

    Further Details
    ---------------
    The matrix Q is represented as a product of elementary reflectors

       Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).

    @ingroup magma_sgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_sgeqrf_gpu(
    magma_int_t m, magma_int_t n,
    magmaFloat_ptr dA,   magma_int_t ldda,
    float *tau,
    magmaFloat_ptr dT,
    magma_int_t *info )
{
    #define dA(a_1,a_2) (dA + (a_2)*(ldda) + (a_1))
    #define dT(a_1)     (dT + (a_1)*nb)
    #define d_ref(a_1)  (dT + (  minmn+(a_1))*nb)
    #define dd_ref(a_1) (dT + (2*minmn+(a_1))*nb)
    #define work(a_1)   (work + (a_1))
    #define hwork       (work + (nb)*(m))

    magma_int_t i, k, minmn, old_i, old_ib, rows, cols;
    magma_int_t ib, nb;
    magma_int_t ldwork, lddwork, lwork, lhwork;
    float *work, *ut;

    /* check arguments */
    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    k = minmn = min(m,n);
    if (k == 0)
        return *info;

    nb = magma_get_sgeqrf_nb(m);

    lwork  = (m + n + nb)*nb;
    lhwork = lwork - m*nb;

    if (MAGMA_SUCCESS != magma_smalloc_pinned( &work, lwork )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }
    
    ut = hwork+nb*(n);
    memset( ut, 0, nb*nb*sizeof(float));

    magma_queue_t stream[2];
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );

    ldwork = m;
    lddwork= n;

    if ( (nb > 1) && (nb < k) ) {
        /* Use blocked code initially */
        old_i = 0; old_ib = nb;
        for (i = 0; i < k-nb; i += nb) {
            ib = min(k-i, nb);
            rows = m -i;
            magma_sgetmatrix_async( rows, ib,
                                    dA(i,i),  ldda,
                                    work(i), ldwork, stream[1] );
            if (i > 0) {
                /* Apply H' to A(i:m,i+2*ib:n) from the left */
                cols = n-old_i-2*old_ib;
                magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                  m-old_i, cols, old_ib,
                                  dA(old_i, old_i         ), ldda, dT(old_i), nb,
                                  dA(old_i, old_i+2*old_ib), ldda, dd_ref(0),    lddwork);
                
                /* store the diagonal */
                magma_ssetmatrix_async( old_ib, old_ib,
                                        ut,           old_ib,
                                        d_ref(old_i), old_ib, stream[0] );
            }

            magma_queue_sync( stream[1] );
            lapackf77_sgeqrf(&rows, &ib, work(i), &ldwork, tau+i, hwork, &lhwork, info);
            /* Form the triangular factor of the block reflector
               H = H(i) H(i+1) . . . H(i+ib-1) */
            lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr,
                              &rows, &ib,
                              work(i), &ldwork, tau+i, hwork, &ib);

            /* Put 0s in the upper triangular part of the panel (and 1s on the
               diagonal); copy the upper triangle into ut and invert it. */
            magma_queue_sync( stream[0] );
            ssplit_diag_block(ib, work(i), ldwork, ut);
            magma_ssetmatrix( rows, ib, work(i), ldwork, dA(i,i), ldda );

            if (i + ib < n) {
                /* Send the triangular factor T to the GPU */
                magma_ssetmatrix( ib, ib, hwork, ib, dT(i), nb );

                if (i+nb < k-nb) {
                    /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */
                    magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                      rows, ib, ib,
                                      dA(i, i   ), ldda, dT(i),  nb,
                                      dA(i, i+ib), ldda, dd_ref(0), lddwork);
                }
                else {
                    cols = n-i-ib;
                    magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                      rows, cols, ib,
                                      dA(i, i   ), ldda, dT(i),  nb,
                                      dA(i, i+ib), ldda, dd_ref(0), lddwork);
                    /* Fix the diagonal block */
                    magma_ssetmatrix( ib, ib, ut, ib, d_ref(i), ib );
                }
                old_i  = i;
                old_ib = ib;
            }
        }
    } else {
        i = 0;
    }

    /* Use unblocked code to factor the last or only block. */
    if (i < k) {
        ib   = n-i;
        rows = m-i;
        magma_sgetmatrix( rows, ib, dA(i, i), ldda, work, rows );
        lhwork = lwork - rows*ib;
        lapackf77_sgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info);
        
        magma_ssetmatrix( rows, ib, work, rows, dA(i, i), ldda );
    }

    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[1] );
    magma_free_pinned( work );
    return *info;
} /* magma_sgeqrf_gpu */
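The dT layout above fixes the allocation size. A minimal sizing sketch, following the docstring's formula and the stream-less API of this example; dA, ldda, m and n are placeholders:

    // Allocate dT per the docstring: (2*MIN(M,N) + ((N+31)/32)*32) * NB floats.
    magma_int_t minmn = min( m, n );
    magma_int_t nb    = magma_get_sgeqrf_nb( m );
    magma_int_t ldt   = 2*minmn + ((n + 31)/32)*32;

    float *tau;
    magmaFloat_ptr dT;
    magma_smalloc_cpu( &tau, minmn );
    magma_smalloc( &dT, ldt * nb );

    magma_int_t info;
    magma_sgeqrf_gpu( m, n, dA, ldda, tau, dT, &info );
    // dT now holds the T factors (first minmn*nb floats), then the inverted
    // diagonal blocks of R (next minmn*nb), then workspace; it can be reused
    // later, e.g. when applying Q with magma_sormqr_gpu.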
Example no. 11
/**
    Purpose
    -------
    SPOTRF computes the Cholesky factorization of a real symmetric
    positive definite matrix dA.

    The factorization has the form
        dA = U**H * U,   if UPLO = MagmaUpper, or
        dA = L  * L**H,  if UPLO = MagmaLower,
    where U is an upper triangular matrix and L is lower triangular.

    This is the block version of the algorithm, calling Level 3 BLAS.
    If the current stream is NULL, this version replaces it with a new
    stream to overlap computation with communication.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of dA is stored;
      -     = MagmaLower:  Lower triangle of dA is stored.

    @param[in]
    n       INTEGER
            The order of the matrix dA.  N >= 0.

    @param[in,out]
    dA      REAL array on the GPU, dimension (LDDA,N)
            On entry, the symmetric matrix dA.  If UPLO = MagmaUpper, the leading
            N-by-N upper triangular part of dA contains the upper
            triangular part of the matrix dA, and the strictly lower
            triangular part of dA is not referenced.  If UPLO = MagmaLower, the
            leading N-by-N lower triangular part of dA contains the lower
            triangular part of the matrix dA, and the strictly upper
            triangular part of dA is not referenced.
    \n
            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization dA = U**H * U or dA = L * L**H.

    @param[in]
    ldda     INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalesced memory accesses, LDDA must be
            divisible by 16.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.

    @ingroup magma_sposv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_spotrf_gpu(
    magma_uplo_t uplo, magma_int_t n,
    magmaFloat_ptr dA, magma_int_t ldda,
    magma_int_t *info)
{
#define dA(i, j) (dA + (j)*ldda + (i))

    magma_int_t     j, jb, nb;
    const char* uplo_ = lapack_uplo_const( uplo );
    float c_one     = MAGMA_S_ONE;
    float c_neg_one = MAGMA_S_NEG_ONE;
    float *work;
    float          d_one     =  1.0;
    float          d_neg_one = -1.0;
    int upper = (uplo == MagmaUpper);

    *info = 0;
    if (! upper && uplo != MagmaLower) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,n)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    nb = magma_get_spotrf_nb(n);

    if (MAGMA_SUCCESS != magma_smalloc_pinned( &work, nb*nb )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }

    /* Define user stream if current stream is NULL */
    magma_queue_t stream[2];
    
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );
    
    magma_queue_create( &stream[0] );
    if (orig_stream == NULL) {
        magma_queue_create( &stream[1] );
        magmablasSetKernelStream(stream[1]);
    }
    else {
        stream[1] = orig_stream;
    }
    
    if ((nb <= 1) || (nb >= n)) {
        /* Use unblocked code. */
        magma_sgetmatrix_async( n, n, dA, ldda, work, n, stream[1] );
        magma_queue_sync( stream[1] );
        lapackf77_spotrf(uplo_, &n, work, &n, info);
        magma_ssetmatrix_async( n, n, work, n, dA, ldda, stream[1] );
    }
    else {
        /* Use blocked code. */
        if (upper) {
            
            /* Compute the Cholesky factorization A = U'*U. */
            for (j=0; j < n; j += nb) {
                
                /* Update and factorize the current diagonal block and test
                   for non-positive-definiteness. Computing MIN */
                jb = min(nb, (n-j));
                
                magma_ssyrk(MagmaUpper, MagmaConjTrans, jb, j,
                            d_neg_one, dA(0, j), ldda,
                            d_one,     dA(j, j), ldda);

                magma_queue_sync( stream[1] );
                magma_sgetmatrix_async( jb, jb,
                                        dA(j, j), ldda,
                                        work,     jb, stream[0] );
                
                if ( (j+jb) < n) {
                    /* Compute the current block row. */
                    magma_sgemm(MagmaConjTrans, MagmaNoTrans,
                                jb, (n-j-jb), j,
                                c_neg_one, dA(0, j   ), ldda,
                                           dA(0, j+jb), ldda,
                                c_one,     dA(j, j+jb), ldda);
                }
                
                magma_queue_sync( stream[0] );
                lapackf77_spotrf(MagmaUpperStr, &jb, work, &jb, info);
                magma_ssetmatrix_async( jb, jb,
                                        work,     jb,
                                        dA(j, j), ldda, stream[1] );
                if (*info != 0) {
                    *info = *info + j;
                    break;
                }

                if ( (j+jb) < n) {
                    magma_strsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                 jb, (n-j-jb),
                                 c_one, dA(j, j   ), ldda,
                                        dA(j, j+jb), ldda);
                }
            }
        }
        else {
            //=========================================================
            // Compute the Cholesky factorization A = L*L'.
            for (j=0; j < n; j += nb) {
                //  Update and factorize the current diagonal block and test
                //  for non-positive-definiteness.
                jb = min(nb, (n-j));

                magma_ssyrk(MagmaLower, MagmaNoTrans, jb, j,
                            d_neg_one, dA(j, 0), ldda,
                            d_one,     dA(j, j), ldda);
                
                magma_queue_sync( stream[1] );
                magma_sgetmatrix_async( jb, jb,
                                        dA(j, j), ldda,
                                        work,     jb, stream[0] );
                
                if ( (j+jb) < n) {
                    magma_sgemm( MagmaNoTrans, MagmaConjTrans,
                                 (n-j-jb), jb, j,
                                 c_neg_one, dA(j+jb, 0), ldda,
                                            dA(j,    0), ldda,
                                 c_one,     dA(j+jb, j), ldda);
                }

                magma_queue_sync( stream[0] );
                lapackf77_spotrf(MagmaLowerStr, &jb, work, &jb, info);
                magma_ssetmatrix_async( jb, jb,
                                        work,     jb,
                                        dA(j, j), ldda, stream[1] );
                if (*info != 0) {
                    *info = *info + j;
                    break;
                }
                
                if ( (j+jb) < n) {
                    magma_strsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                (n-j-jb), jb,
                                c_one, dA(j,    j), ldda,
                                       dA(j+jb, j), ldda);
                }
            }
        }
    }

    magma_free_pinned( work );

    magma_queue_destroy( stream[0] );
    if (orig_stream == NULL) {
        magma_queue_destroy( stream[1] );
    }
    magmablasSetKernelStream( orig_stream );

    return *info;
} /* magma_spotrf_gpu */
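A minimal host-side driver for the routine above might look as follows. This is a sketch, not part of the original file: it assumes the MAGMA 1.x-era transfer API used throughout this listing (no queue arguments), and the function name, padding choice, and error handling are illustrative.

// Sketch: factorize an n-by-n SPD matrix on the GPU with magma_spotrf_gpu.
// Assumes h_A holds a symmetric positive definite matrix in column-major order.
#include <cstdio>
#include "magma.h"

void cholesky_example( magma_int_t n, float *h_A )
{
    magma_init();

    float *dA = NULL;
    magma_int_t ldda = ((n + 31)/32)*32;   // pad leading dimension to a multiple of 32
    magma_int_t info = 0;

    if ( MAGMA_SUCCESS != magma_smalloc( &dA, (size_t) ldda*n ) ) {
        fprintf( stderr, "device allocation failed\n" );
        magma_finalize();
        return;
    }

    magma_ssetmatrix( n, n, h_A, n, dA, ldda );        // host -> device
    magma_spotrf_gpu( MagmaLower, n, dA, ldda, &info );
    if ( info != 0 )
        fprintf( stderr, "magma_spotrf_gpu failed: info = %d\n", (int) info );
    else
        magma_sgetmatrix( n, n, dA, ldda, h_A, n );    // L overwrites h_A

    magma_free( dA );
    magma_finalize();
}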
Example n. 12

// Access macros assumed by the test code below; they follow the convention of
// the dA(i,j) macro in the previous example (ld is set inside main):
#define dA(i,j)  (dA + (i) + (j)*ld)
#define dB(i,j)  (dB + (i) + (j)*ld)
#define LU(i,j)  (LU + (i) + (j)*ld)
int main( int argc, char** argv )
{
    TESTING_INIT();
    
    real_Double_t   gflops, t1, t2;
    float c_neg_one = MAGMA_S_NEG_ONE;
    magma_int_t ione = 1;
    const char trans[] = { 'N', 'C', 'T' };
    const char uplo[]  = { 'L', 'U' };
    const char diag[]  = { 'U', 'N' };
    const char side[]  = { 'L', 'R' };
    
    float  *A,  *B,  *C,   *C2, *LU;
    float *dA, *dB, *dC1, *dC2;
    float alpha = MAGMA_S_MAKE( 0.5, 0.1 );
    float beta  = MAGMA_S_MAKE( 0.7, 0.2 );
    float dalpha = 0.6;
    float dbeta  = 0.8;
    float work[1], error, total_error;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t m, n, k, size, maxn, ld, info;
    magma_int_t *piv;
    magma_err_t err;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" );
    
    total_error = 0.;
    for( int i = 0; i < opts.ntest; ++i ) {
        m = opts.msize[i];
        n = opts.nsize[i];
        k = opts.ksize[i];
        printf("=========================================================================\n");
        printf( "M %d, N %d, K %d\n", (int) m, (int) n, (int) k );
        
        // allocate matrices
        // over-allocate so they can be any combination of {m,n,k} x {m,n,k}.
        maxn = max( max( m, n ), k );
        ld = maxn;
        size = maxn*maxn;
        err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) );  assert( err == 0 );
        err = magma_smalloc_pinned( &A,  size );  assert( err == 0 );
        err = magma_smalloc_pinned( &B,  size );  assert( err == 0 );
        err = magma_smalloc_pinned( &C,  size );  assert( err == 0 );
        err = magma_smalloc_pinned( &C2, size );  assert( err == 0 );
        err = magma_smalloc_pinned( &LU, size );  assert( err == 0 );
        err = magma_smalloc( &dA,  size );        assert( err == 0 );
        err = magma_smalloc( &dB,  size );        assert( err == 0 );
        err = magma_smalloc( &dC1, size );        assert( err == 0 );
        err = magma_smalloc( &dC2, size );        assert( err == 0 );
        
        // initialize matrices
        size = maxn*maxn;
        lapackf77_slarnv( &ione, ISEED, &size, A  );
        lapackf77_slarnv( &ione, ISEED, &size, B  );
        lapackf77_slarnv( &ione, ISEED, &size, C  );
        
        printf( "========== Level 1 BLAS ==========\n" );
        
        // ----- test SSWAP
        // swap 2nd and 3rd columns of dA (via magma) and dB (via cublas),
        // then diff the two results; they should be identical
        assert( n >= 4 );
        magma_ssetmatrix( m, n, A, ld, dA, ld );
        magma_ssetmatrix( m, n, A, ld, dB, ld );
        magma_sswap( m, dA(0,1), 1, dA(0,2), 1 );
        cublasSswap( m, dB(0,1), 1, dB(0,2), 1 );
        
        // check results, storing diff between magma and cuda calls in C2
        cublasSaxpy( ld*n, c_neg_one, dA, 1, dB, 1 );
        magma_sgetmatrix( m, n, dB, ld, C2, ld );
        error = lapackf77_slange( "F", &m, &k, C2, &ld, work );
        total_error += error;
        printf( "sswap             diff %.2g\n", error );
        
        // ----- test ISAMAX
        // get argmax of column of A
        magma_ssetmatrix( m, k, A, ld, dA, ld );
        error = 0;
        for( int j = 0; j < k; ++j ) {
            magma_int_t i1 = magma_isamax( m, dA(0,j), 1 );
            magma_int_t i2 = cublasIsamax( m, dA(0,j), 1 );
            assert( i1 == i2 );
            error += abs( i1 - i2 );
        }
        total_error += error;
        gflops = (float)m * k / 1e9;
        printf( "isamax            diff %.2g\n", error );
        printf( "\n" );
        
        printf( "========== Level 2 BLAS ==========\n" );
        
        // ----- test SGEMV
        // c = alpha*A*b + beta*c,  with A m*n; b,c m or n-vectors
        // try no-trans/trans
        for( int ia = 0; ia < 3; ++ia ) {
            magma_ssetmatrix( m, n, A,  ld, dA,  ld );
            magma_ssetvector( maxn, B, 1, dB,  1 );
            magma_ssetvector( maxn, C, 1, dC1, 1 );
            magma_ssetvector( maxn, C, 1, dC2, 1 );
            t1 = magma_sync_wtime( 0 );
            magma_sgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasSgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC2, 1 );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            size = (trans[ia] == 'N' ? m : n);
            cublasSaxpy( size, c_neg_one, dC1, 1, dC2, 1 );
            magma_sgetvector( size, dC2, 1, C2, 1 );
            error = lapackf77_slange( "F", &size, &ione, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_SGEMV( m, n ) / 1e9;
            printf( "sgemv( %c )        diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    trans[ia], error, gflops/t1, gflops/t2 );
        }
        printf( "\n" );
        
        // ----- test SSYMV
        // c = alpha*A*b + beta*c,  with A m*m symmetric; b,c m-vectors
        // try upper/lower
        for( int iu = 0; iu < 2; ++iu ) {
            magma_ssetmatrix( m, m, A, ld, dA, ld );
            magma_ssetvector( m, B, 1, dB,  1 );
            magma_ssetvector( m, C, 1, dC1, 1 );
            magma_ssetvector( m, C, 1, dC2, 1 );
            t1 = magma_sync_wtime( 0 );
            magma_ssymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasSsymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC2, 1 );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasSaxpy( m, c_neg_one, dC1, 1, dC2, 1 );
            magma_sgetvector( m, dC2, 1, C2, 1 );
            error = lapackf77_slange( "F", &m, &ione, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_SSYMV( m ) / 1e9;
            printf( "ssymv( %c )        diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], error, gflops/t1, gflops/t2 );
        }
        printf( "\n" );
        
        // ----- test STRSV
        // solve A*c = c,  with A m*m triangular; c m-vector
        // try upper/lower, no-trans/trans, unit/non-unit diag
        // Factor A into LU to get well-conditioned triangles, else solve yields garbage.
        // Still can give garbage if solves aren't consistent with LU factors,
        // e.g., using unit diag for U, so copy lower triangle to upper triangle.
        // Also used for trsm later.
        lapackf77_slacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld );
        lapackf77_sgetrf( &maxn, &maxn, LU, &ld, piv, &info );
        for( int j = 0; j < maxn; ++j ) {
            for( int i = 0; i < j; ++i ) {
                *LU(i,j) = *LU(j,i);
            }
        }
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
        for( int id = 0; id < 2; ++id ) {
            magma_ssetmatrix( m, m, LU, ld, dA, ld );
            magma_ssetvector( m, C, 1, dC1, 1 );
            magma_ssetvector( m, C, 1, dC2, 1 );
            t1 = magma_sync_wtime( 0 );
            magma_strsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasStrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC2, 1 );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasSaxpy( m, c_neg_one, dC1, 1, dC2, 1 );
            magma_sgetvector( m, dC2, 1, C2, 1 );
            error = lapackf77_slange( "F", &m, &ione, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_STRSM( MagmaLeft, m, 1 ) / 1e9;
            printf( "strsv( %c, %c, %c )  diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], diag[id], error, gflops/t1, gflops/t2 );
        }}}
        printf( "\n" );
        
        printf( "========== Level 3 BLAS ==========\n" );
        
        // ----- test SGEMM
        // C = alpha*A*B + beta*C,  with A m*k or k*m; B k*n or n*k; C m*n
        // try combinations of no-trans/trans
        for( int ia = 0; ia < 3; ++ia ) {
        for( int ib = 0; ib < 3; ++ib ) {
            bool nta = (trans[ia] == 'N');
            bool ntb = (trans[ib] == 'N');
            magma_ssetmatrix( (nta ? m : k), (nta ? m : k), A, ld, dA,  ld );
            magma_ssetmatrix( (ntb ? k : n), (ntb ? n : k), B, ld, dB,  ld );
            magma_ssetmatrix( m, n, C, ld, dC1, ld );
            magma_ssetmatrix( m, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_sgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasSgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasSaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_sgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_slange( "F", &m, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_SGEMM( m, n, k ) / 1e9;
            printf( "sgemm( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    trans[ia], trans[ib], error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test SSYMM
        // C = alpha*A*B + beta*C  (left)  with A m*m symmetric; B,C m*n; or
        // C = alpha*B*A + beta*C  (right) with A n*n symmetric; B,C m*n
        // try left/right, upper/lower
        for( int is = 0; is < 2; ++is ) {
        for( int iu = 0; iu < 2; ++iu ) {
            magma_ssetmatrix( m, m, A, ld, dA,  ld );
            magma_ssetmatrix( m, n, B, ld, dB,  ld );
            magma_ssetmatrix( m, n, C, ld, dC1, ld );
            magma_ssetmatrix( m, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_ssymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasSsymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasSaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_sgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_slange( "F", &m, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_SSYMM( side[is], m, n ) / 1e9;
            printf( "ssymm( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    side[is], uplo[iu], error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test SSYRK
        // C = alpha*A*A^T + beta*C  (no-trans) with A n*k and C n*n symmetric; or
        // C = alpha*A^T*A + beta*C  (trans)    with A k*n and C n*n symmetric
        // try upper/lower, no-trans/trans
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
            magma_ssetmatrix( n, k, A, ld, dA,  ld );
            magma_ssetmatrix( n, n, C, ld, dC1, ld );
            magma_ssetmatrix( n, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_ssyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasSsyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasSaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_sgetmatrix( n, n, dC2, ld, C2, ld );
            error = lapackf77_slange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_SSYRK( k, n ) / 1e9;
            printf( "ssyrk( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test SSYR2K
        // C = alpha*A*B^T + alpha*B*A^T + beta*C  (no-trans) with A,B n*k; C n*n symmetric; or
        // C = alpha*A^T*B + alpha*B^T*A + beta*C  (trans)    with A,B k*n; C n*n symmetric
        // try upper/lower, no-trans/trans
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
            bool nt = (trans[it] == 'N');
            magma_ssetmatrix( (nt ? n : k), (nt ? n : k), A, ld, dA,  ld );
            magma_ssetmatrix( n, n, C, ld, dC1, ld );
            magma_ssetmatrix( n, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_ssyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasSsyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasSaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_sgetmatrix( n, n, dC2, ld, C2, ld );
            error = lapackf77_slange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_SSYR2K( k, n ) / 1e9;
            printf( "ssyr2k( %c, %c )    diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test STRMM
        // C = alpha*A*C  (left)  with A m*m triangular; C m*n; or
        // C = alpha*C*A  (right) with A n*n triangular; C m*n
        // try left/right, upper/lower, no-trans/trans, unit/non-unit
        for( int is = 0; is < 2; ++is ) {
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
        for( int id = 0; id < 2; ++id ) {
            bool left = (side[is] == 'L');
            magma_ssetmatrix( (left ? m : n), (left ? m : n), A, ld, dA,  ld );
            magma_ssetmatrix( m, n, C, ld, dC1, ld );
            magma_ssetmatrix( m, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_strmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasStrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasSaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_sgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_slange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_STRMM( side[is], m, n ) / 1e9;
            printf( "strmm( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], error, gflops/t1, gflops/t2 );
        }}}}
        printf( "\n" );
        
        // ----- test STRSM
        // solve A*X = alpha*B  (left)  with A m*m triangular; B m*n; or
        // solve X*A = alpha*B  (right) with A n*n triangular; B m*n
        // try left/right, upper/lower, no-trans/trans, unit/non-unit
        for( int is = 0; is < 2; ++is ) {
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
        for( int id = 0; id < 2; ++id ) {
            bool left = (side[is] == 'L');
            magma_ssetmatrix( (left ? m : n), (left ? m : n), LU, ld, dA,  ld );
            magma_ssetmatrix( m, n, C, ld, dC1, ld );
            magma_ssetmatrix( m, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_strsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasStrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasSaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_sgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_slange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_STRSM( side[is], m, n ) / 1e9;
            printf( "strsm( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], error, gflops/t1, gflops/t2 );
        }}}}
        printf( "\n" );
        
        // cleanup
        magma_free_cpu( piv );
        magma_free_pinned( A  );
        magma_free_pinned( B  );
        magma_free_pinned( C  );
        magma_free_pinned( C2 );
        magma_free_pinned( LU );
        magma_free( dA  );
        magma_free( dB  );
        magma_free( dC1 );
        magma_free( dC2 );
    }
    
    if ( total_error != 0. ) {
        printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n",
                total_error );
    }
    else {
        printf( "all tests passed\n" );
    }
    
    TESTING_FINALIZE();
    return 0;
}
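Every check in the test above follows the same pattern: run the MAGMA wrapper into dC1 and the raw CUBLAS routine into dC2, subtract on the device, and take the Frobenius norm of the difference, which the test expects to be exactly zero. A sketch of that pattern as a standalone helper (the name diff_norm and the hC scratch argument are illustrative, not part of the original file):

// Sketch: device-side diff followed by a host-side Frobenius norm.
// dC2 is overwritten with dC2 - dC1; hC is ld*n host scratch space.
static float diff_norm( magma_int_t m, magma_int_t n,
                        float *dC1, float *dC2, magma_int_t ld,
                        float *hC )
{
    float c_neg_one = MAGMA_S_NEG_ONE;
    float work[1];
    cublasSaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );          // dC2 -= dC1
    magma_sgetmatrix( m, n, dC2, ld, hC, ld );               // copy diff to host
    return lapackf77_slange( "F", &m, &n, hC, &ld, work );   // || dC2 - dC1 ||_F
}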
Example n. 13
extern "C" magma_int_t
magma_sidr_strms(
    magma_s_matrix A, magma_s_matrix b, magma_s_matrix *x,
    magma_s_solver_par *solver_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;

    // prepare solver feedback
    solver_par->solver = Magma_IDRMERGE;
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;
    solver_par->init_res = 0.0;
    solver_par->final_res = 0.0;
    solver_par->iter_res = 0.0;
    solver_par->runtime = 0.0;

    // constants
    const float c_zero = MAGMA_S_ZERO;
    const float c_one = MAGMA_S_ONE;
    const float c_n_one = MAGMA_S_NEG_ONE;

    // internal user options
    const magma_int_t smoothing = 1;   // 0 = disable, 1 = enable
    const float angle = 0.7;          // [0-1]

    // local variables
    magma_int_t iseed[4] = {0, 0, 0, 1};
    magma_int_t dof;
    magma_int_t s;
    magma_int_t distr;
    magma_int_t k, i, sk;
    magma_int_t innerflag;
    magma_int_t ldd;
    magma_int_t q;
    float residual;
    float nrm;
    float nrmb;
    float nrmr;
    float nrmt;
    float rho;
    float om;
    float gamma;

    // matrices and vectors
    magma_s_matrix dxs = {Magma_CSR};
    magma_s_matrix dr = {Magma_CSR}, drs = {Magma_CSR};
    magma_s_matrix dP = {Magma_CSR}, dP1 = {Magma_CSR};
    magma_s_matrix dG = {Magma_CSR}, dGcol = {Magma_CSR};
    magma_s_matrix dU = {Magma_CSR};
    magma_s_matrix dM = {Magma_CSR};
    magma_s_matrix df = {Magma_CSR};
    magma_s_matrix dt = {Magma_CSR}, dtt = {Magma_CSR};
    magma_s_matrix dc = {Magma_CSR};
    magma_s_matrix dv = {Magma_CSR};
    magma_s_matrix dskp = {Magma_CSR};
    magma_s_matrix dalpha = {Magma_CSR};
    magma_s_matrix dbeta = {Magma_CSR};
    float *hMdiag = NULL;
    float *hskp = NULL;
    float *halpha = NULL;
    float *hbeta = NULL;
    float *d1 = NULL, *d2 = NULL;
    
    // queue variables
    const magma_int_t nqueues = 3;     // number of queues
    magma_queue_t queues[nqueues];    

    // chronometry
    real_Double_t tempo1, tempo2;

    // create additional queues
    queues[0] = queue;
    for ( q = 1; q < nqueues; q++ ) {
        magma_queue_create( queue->device(), &(queues[q]) );
    }

    // initial shadow space dimension s
    // TODO: add a dedicated option for 's' (the shadow space dimension).
    // Hack: reuses the '--restart' option as the shadow space dimension.
    //       This is fragile because the default value of the restart option is
    //       used to detect whether the user supplied a custom value; if that
    //       default ever changes, the 'if' statement below must be updated too.
    s = 1;
    if ( solver_par->restart != 50 ) {
        if ( solver_par->restart > A.num_cols ) {
            s = A.num_cols;
        } else {
            s = solver_par->restart;
        }
    }
    solver_par->restart = s;

    // set max iterations
    solver_par->maxiter = min( 2 * A.num_cols, solver_par->maxiter );

    // check if matrix A is square
    if ( A.num_rows != A.num_cols ) {
        //printf("Matrix A is not square.\n");
        info = MAGMA_ERR_NOT_SUPPORTED;
        goto cleanup;
    }

    // |b|
    nrmb = magma_snrm2( b.num_rows, b.dval, 1, queue );
    if ( nrmb == 0.0 ) {
        magma_sscal( x->num_rows, MAGMA_S_ZERO, x->dval, 1, queue );
        info = MAGMA_SUCCESS;
        goto cleanup;
    }

    // t = 0
    // make t twice as large so it can hold both dt and dr; storing them
    // contiguously lets a single fused dot-product kernel later compute
    // t't and t'r in one pass (see the magma_sgemvmdot_shfl call on dt below)
    ldd = magma_roundup( b.num_rows, 32 );
    CHECK( magma_svinit( &dt, Magma_DEV, ldd, 2, c_zero, queue ));
    dt.num_rows = b.num_rows;
    dt.num_cols = 1;
    dt.nnz = dt.num_rows;

    // redirect the dr.dval to the second part of dt
    CHECK( magma_svinit( &dr, Magma_DEV, b.num_rows, 1, c_zero, queue ));
    magma_free( dr.dval );
    dr.dval = dt.dval + ldd;

    // r = b - A x
    CHECK( magma_sresidualvec( A, b, *x, &dr, &nrmr, queue ));
    
    // |r|
    solver_par->init_res = nrmr;
    solver_par->final_res = solver_par->init_res;
    solver_par->iter_res = solver_par->init_res;
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = (real_Double_t)nrmr;
    }

    // check if initial guess is good enough
    if ( nrmr <= solver_par->atol ||
        nrmr/nrmb <= solver_par->rtol ) {
        info = MAGMA_SUCCESS;
        goto cleanup;
    }

    // P = randn(n, s)
    // P = ortho(P)
//---------------------------------------
    // P = 0.0
    CHECK( magma_svinit( &dP, Magma_CPU, A.num_cols, s, c_zero, queue ));

    // P = randn(n, s)
    distr = 3;        // 1 = unif (0,1), 2 = unif (-1,1), 3 = normal (0,1) 
    dof = dP.num_rows * dP.num_cols;
    lapackf77_slarnv( &distr, iseed, &dof, dP.val );

    // transfer P to device
    CHECK( magma_smtransfer( dP, &dP1, Magma_CPU, Magma_DEV, queue ));
    magma_smfree( &dP, queue );

    // P = ortho(P1)
    if ( dP1.num_cols > 1 ) {
        // P = magma_sqr(P1), QR factorization
        CHECK( magma_sqr( dP1.num_rows, dP1.num_cols, dP1, dP1.ld, &dP, NULL, queue ));
    } else {
        // P = P1 / |P1|
        nrm = magma_snrm2( dof, dP1.dval, 1, queue );
        nrm = 1.0 / nrm;
        magma_sscal( dof, nrm, dP1.dval, 1, queue );
        CHECK( magma_smtransfer( dP1, &dP, Magma_DEV, Magma_DEV, queue ));
    }
    magma_smfree( &dP1, queue );
//---------------------------------------

    // allocate memory for the scalar products
    CHECK( magma_smalloc_pinned( &hskp, 5 ));
    CHECK( magma_svinit( &dskp, Magma_DEV, 4, 1, c_zero, queue ));

    CHECK( magma_smalloc_pinned( &halpha, s ));
    CHECK( magma_svinit( &dalpha, Magma_DEV, s, 1, c_zero, queue ));

    CHECK( magma_smalloc_pinned( &hbeta, s ));
    CHECK( magma_svinit( &dbeta, Magma_DEV, s, 1, c_zero, queue ));
    
    // workspace for merged dot product
    CHECK( magma_smalloc( &d1, max(2, s) * b.num_rows ));
    CHECK( magma_smalloc( &d2, max(2, s) * b.num_rows ));

    // smoothing enabled
    if ( smoothing > 0 ) {
        // set smoothing solution vector
        CHECK( magma_smtransfer( *x, &dxs, Magma_DEV, Magma_DEV, queue ));

        // tt = 0
        // make tt twice as large so it can hold both dtt and drs
        ldd = magma_roundup( b.num_rows, 32 );
        CHECK( magma_svinit( &dtt, Magma_DEV, ldd, 2, c_zero, queue ));
        dtt.num_rows = dr.num_rows;
        dtt.num_cols = 1;
        dtt.nnz = dtt.num_rows;

        // redirect the drs.dval to the second part of dtt
        CHECK( magma_svinit( &drs, Magma_DEV, dr.num_rows, 1, c_zero, queue ));
        magma_free( drs.dval );
        drs.dval = dtt.dval + ldd;

        // set smoothing residual vector
        magma_scopyvector( dr.num_rows, dr.dval, 1, drs.dval, 1, queue );
    }

    // G(n,s) = 0
    if ( s > 1 ) {
        ldd = magma_roundup( A.num_rows, 32 );
        CHECK( magma_svinit( &dG, Magma_DEV, ldd, s, c_zero, queue ));
        dG.num_rows = A.num_rows;
    } else {
        CHECK( magma_svinit( &dG, Magma_DEV, A.num_rows, s, c_zero, queue ));
    }

    // dGcol represents a single column of dG, array pointer is set inside loop
    CHECK( magma_svinit( &dGcol, Magma_DEV, dG.num_rows, 1, c_zero, queue ));
    magma_free( dGcol.dval );

    // U(n,s) = 0
    if ( s > 1 ) {
        ldd = magma_roundup( A.num_cols, 32 );
        CHECK( magma_svinit( &dU, Magma_DEV, ldd, s, c_zero, queue ));
        dU.num_rows = A.num_cols;
    } else {
        CHECK( magma_svinit( &dU, Magma_DEV, A.num_cols, s, c_zero, queue ));
    }

    // M(s,s) = I
    CHECK( magma_svinit( &dM, Magma_DEV, s, s, c_zero, queue ));
    CHECK( magma_smalloc_pinned( &hMdiag, s ));
    magmablas_slaset( MagmaFull, dM.num_rows, dM.num_cols, c_zero, c_one, dM.dval, dM.ld, queue );

    // f = 0
    CHECK( magma_svinit( &df, Magma_DEV, dP.num_cols, 1, c_zero, queue ));

    // c = 0
    CHECK( magma_svinit( &dc, Magma_DEV, dM.num_cols, 1, c_zero, queue ));

    // v = r
    CHECK( magma_smtransfer( dr, &dv, Magma_DEV, Magma_DEV, queue ));

    //--------------START TIME---------------
    // chronometry
    tempo1 = magma_sync_wtime( queue );
    if ( solver_par->verbose > 0 ) {
        solver_par->timing[0] = 0.0;
    }

cudaProfilerStart();

    om = MAGMA_S_ONE;
    gamma = MAGMA_S_ZERO;
    innerflag = 0;

    // new RHS for small systems
    // f = P' r
    // Q1
    magma_sgemvmdot_shfl( dP.num_rows, dP.num_cols, dP.dval, dr.dval, d1, d2, df.dval, queues[1] );

    // skp[4] = f(k)
    // Q1
    magma_sgetvector_async( 1, df.dval, 1, &hskp[4], 1, queues[1] );

    // c(k:s) = f(k:s)
    // Q1
    magma_scopyvector_async( s, df.dval, 1, dc.dval, 1, queues[1] );

    // c(k:s) = M(k:s,k:s) \ f(k:s)
    // Q1
    magma_strsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, s, dM.dval, dM.ld, dc.dval, 1, queues[1] );

    // start iteration
    do
    {
        solver_par->numiter++;

        // shadow space loop
        for ( k = 0; k < s; ++k ) {
            sk = s - k;
            dGcol.dval = dG.dval + k * dG.ld;

            // v = r - G(:,k:s) c(k:s)
            // Q1
            magmablas_sgemv( MagmaNoTrans, dG.num_rows, sk, c_n_one, dGcol.dval, dG.ld, &dc.dval[k], 1, c_one, dv.dval, 1, queues[1] );

            // U(:,k) = om * v + U(:,k:s) c(k:s)
            // Q1
            magmablas_sgemv( MagmaNoTrans, dU.num_rows, sk, c_one, &dU.dval[k*dU.ld], dU.ld, &dc.dval[k], 1, om, dv.dval, 1, queues[1] );

            // G(:,k) = A U(:,k)
            // Q1
            CHECK( magma_s_spmv( c_one, A, dv, c_zero, dGcol, queues[1] ));
            solver_par->spmv_count++;

            // bi-orthogonalize the new basis vectors
            for ( i = 0; i < k; ++i ) {
                // alpha = P(:,i)' G(:,k)
                // Q1
                halpha[i] = magma_sdot( dP.num_rows, &dP.dval[i*dP.ld], 1, dGcol.dval, 1, queues[1] );
                // implicit sync Q1 --> alpha = P(:,i)' G(:,k) 

                // alpha = alpha / M(i,i)
                halpha[i] = halpha[i] / hMdiag[i];
                    
                // G(:,k) = G(:,k) - alpha * G(:,i)
                // Q1
                magma_saxpy( dG.num_rows, -halpha[i], &dG.dval[i*dG.ld], 1, dGcol.dval, 1, queues[1] );
            }

            // sync Q1 --> G(:,k) = G(:,k) - alpha * G(:,i), skp[4] = f(k)
            magma_queue_sync( queues[1] );

            // new column of M = P'G, first k-1 entries are zero
            // M(k:s,k) = P(:,k:s)' G(:,k)
            // Q2
            magma_sgemvmdot_shfl( dP.num_rows, sk, &dP.dval[k*dP.ld], dGcol.dval, d1, d2, &dM.dval[k*dM.ld+k], queues[2] );

            // non-first s iteration
            if ( k > 0 ) {
                // alpha = dalpha
                // Q0
                magma_ssetvector_async( k, halpha, 1, dalpha.dval, 1, queues[0] );

                // U update outside of loop using GEMV
                // U(:,k) = U(:,k) - U(:,1:k) * alpha(1:k)
                // Q0
                magmablas_sgemv( MagmaNoTrans, dU.num_rows, k, c_n_one, dU.dval, dU.ld, dalpha.dval, 1, c_one, dv.dval, 1, queues[0] );
            }

            // Mdiag(k) = M(k,k)
            // Q2
            magma_sgetvector( 1, &dM.dval[k*dM.ld+k], 1, &hMdiag[k], 1, queues[2] );
            // implicit sync Q2 --> Mdiag(k) = M(k,k)

            // U(:,k) = v
            // Q0
            magma_scopyvector_async( dU.num_rows, dv.dval, 1, &dU.dval[k*dU.ld], 1, queues[0] );

            // check M(k,k) == 0
            if ( MAGMA_S_EQUAL(hMdiag[k], MAGMA_S_ZERO) ) {
                innerflag = 1;
                info = MAGMA_DIVERGENCE;
                break;
            }

            // beta = f(k) / M(k,k)
            hbeta[k] = hskp[4] / hMdiag[k];

            // check for nan
            if ( magma_s_isnan( hbeta[k] ) || magma_s_isinf( hbeta[k] )) {
                innerflag = 1;
                info = MAGMA_DIVERGENCE;
                break;
            }

            // r = r - beta * G(:,k)
            // Q2
            magma_saxpy( dr.num_rows, -hbeta[k], dGcol.dval, 1, dr.dval, 1, queues[2] );

            // non-last s iteration 
            if ( (k + 1) < s ) {
                // f(k+1:s) = f(k+1:s) - beta * M(k+1:s,k)
                // Q1
                magma_saxpy( sk-1, -hbeta[k], &dM.dval[k*dM.ld+(k+1)], 1, &df.dval[k+1], 1, queues[1] );

                // c(k+1:s) = f(k+1:s)
                // Q1
                magma_scopyvector_async( sk-1, &df.dval[k+1], 1, &dc.dval[k+1], 1, queues[1] );

                // c(k+1:s) = M(k+1:s,k+1:s) \ f(k+1:s)
                // Q1
                magma_strsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, sk-1, &dM.dval[(k+1)*dM.ld+(k+1)], dM.ld, &dc.dval[k+1], 1, queues[1] );

                // skp[4] = f(k+1)
                // Q1
                magma_sgetvector_async( 1, &df.dval[k+1], 1, &hskp[4], 1, queues[1] ); 
            }

            // smoothing disabled
            if ( smoothing <= 0 ) {
                // |r|
                // Q2
                nrmr = magma_snrm2( dr.num_rows, dr.dval, 1, queues[2] );           
                // implicit sync Q2 --> |r|

            // smoothing enabled
            } else {
                // smoothing operation
//---------------------------------------
                // t = rs - r
                // Q2
                magma_sidr_smoothing_1( drs.num_rows, drs.num_cols, drs.dval, dr.dval, dtt.dval, queues[2] );

                // x = x + beta * U(:,k)
                // Q0
                magma_saxpy( x->num_rows, hbeta[k], &dU.dval[k*dU.ld], 1, x->dval, 1, queues[0] );

                // t't
                // t'rs
                // Q2
                CHECK( magma_sgemvmdot_shfl( dt.ld, 2, dtt.dval, dtt.dval, d1, d2, &dskp.dval[2], queues[2] ));

                // skp[2-3] = dskp[2-3]
                // Q2
                magma_sgetvector( 2, &dskp.dval[2], 1, &hskp[2], 1, queues[2] );
                // implicit sync Q2 --> skp = dskp

                // gamma = (t' * rs) / (t' * t)
                gamma = hskp[3] / hskp[2];
                
                // rs = rs - gamma * t 
                // Q1
                magma_saxpy( drs.num_rows, -gamma, dtt.dval, 1, drs.dval, 1, queues[1] );

                // xs = xs - gamma * (xs - x) 
                // Q0
                magma_sidr_smoothing_2( dxs.num_rows, dxs.num_cols, -gamma, x->dval, dxs.dval, queues[0] );

                // |rs|
                // Q1
                nrmr = magma_snrm2( drs.num_rows, drs.dval, 1, queues[1] );
                // implicit sync Q1 --> |rs|
//---------------------------------------
            }

            // v = r
            // Q1
            magma_scopyvector_async( dr.num_rows, dr.dval, 1, dv.dval, 1, queues[1] );

            // last s iteration
            if ( (k + 1) == s ) {
               // t = A r
               // Q2
               CHECK( magma_s_spmv( c_one, A, dr, c_zero, dt, queues[2] ));
               solver_par->spmv_count++;

               // t't
               // t'r
               // Q2
               CHECK( magma_sgemvmdot_shfl( dt.ld, 2, dt.dval, dt.dval, d1, d2, dskp.dval, queues[2] ));
            }

            // store current timing and residual
            if ( solver_par->verbose > 0 ) {
                tempo2 = magma_sync_wtime( queue );
                if ( (solver_par->numiter) % solver_par->verbose == 0 ) {
                    solver_par->res_vec[(solver_par->numiter) / solver_par->verbose]
                            = (real_Double_t)nrmr;
                    solver_par->timing[(solver_par->numiter) / solver_par->verbose]
                            = (real_Double_t)tempo2 - tempo1;
                }
            }

            // check convergence or iteration limit
            if ( nrmr <= solver_par->atol ||
                nrmr/nrmb <= solver_par->rtol ) { 
                s = k + 1; // for the x-update outside the loop
                innerflag = 2;
                info = MAGMA_SUCCESS;
                break;
            }
        }

        // smoothing disabled
        if ( smoothing <= 0 && innerflag != 1 ) {
            // dbeta(1:s) = beta(1:s)
            // Q0
            magma_ssetvector_async( s, hbeta, 1, dbeta.dval, 1, queues[0] );

            // x = x + U(:,1:s) * beta(1:s)
            // Q0
            magmablas_sgemv( MagmaNoTrans, dU.num_rows, s, c_one, dU.dval, dU.ld, dbeta.dval, 1, c_one, x->dval, 1, queues[0] );
        }

        // check convergence or iteration limit or invalid result of inner loop
        if ( innerflag > 0 ) {
            break;
        }

        // computation of a new omega
//---------------------------------------
        // skp[0-1] = dskp[0-1]
        // Q2
        magma_sgetvector( 2, dskp.dval, 1, hskp, 1, queues[2] );
        // implicit sync Q2 --> skp = dskp

        // |t|
        nrmt = magma_ssqrt( MAGMA_S_REAL(hskp[0]) );
        
        // rho = abs((t' * r) / (|t| * |r|))
        rho = MAGMA_D_ABS( MAGMA_S_REAL(hskp[1]) / (nrmt * nrmr) );

        // om = (t' * r) / (|t| * |t|)
        om = hskp[1] / hskp[0]; 
        if ( rho < angle ) {
            om = (om * angle) / rho;
        }
//---------------------------------------
        if ( MAGMA_S_EQUAL(om, MAGMA_S_ZERO) ) {
            info = MAGMA_DIVERGENCE;
            break;
        }

        // sync Q1 --> v = r
        magma_queue_sync( queues[1] );

        // r = r - om * t
        // Q2
        magma_saxpy( dr.num_rows, -om, dt.dval, 1, dr.dval, 1, queues[2] );

        // x = x + om * v
        // Q0
        magma_saxpy( x->num_rows, om, dv.dval, 1, x->dval, 1, queues[0] );

        // smoothing disabled
        if ( smoothing <= 0 ) {
            // |r|
            // Q2
            nrmr = magma_snrm2( dr.num_rows, dr.dval, 1, queues[2] );           
            // implicit sync Q2 --> |r|

            // v = r
            // Q0
            magma_scopyvector_async( dr.num_rows, dr.dval, 1, dv.dval, 1, queues[0] );

            // new RHS for small systems
            // f = P' r
            // Q1
            magma_sgemvmdot_shfl( dP.num_rows, dP.num_cols, dP.dval, dr.dval, d1, d2, df.dval, queues[1] );

            // skp[4] = f(k)
            // Q1
            magma_sgetvector_async( 1, df.dval, 1, &hskp[4], 1, queues[1] );

            // c(k:s) = f(k:s)
            // Q1
            magma_scopyvector_async( s, df.dval, 1, dc.dval, 1, queues[1] );

            // c(k:s) = M(k:s,k:s) \ f(k:s)
            // Q1
            magma_strsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, s, dM.dval, dM.ld, dc.dval, 1, queues[1] );

        // smoothing enabled
        } else {
            // smoothing operation
//---------------------------------------
            // t = rs - r
            // Q2
            magma_sidr_smoothing_1( drs.num_rows, drs.num_cols, drs.dval, dr.dval, dtt.dval, queues[2] );

            // t't
            // t'rs
            // Q2
            CHECK( magma_sgemvmdot_shfl( dt.ld, 2, dtt.dval, dtt.dval, d1, d2, &dskp.dval[2], queues[2] ));

            // skp[2-3] = dskp[2-3]
            // Q2
            magma_sgetvector( 2, &dskp.dval[2], 1, &hskp[2], 1, queues[2] );
            // implicit sync Q2 --> skp = dskp

            // gamma = (t' * rs) / (t' * t)
            gamma = hskp[3] / hskp[2];

            // rs = rs - gamma * (rs - r) 
            // Q2
            magma_saxpy( drs.num_rows, -gamma, dtt.dval, 1, drs.dval, 1, queues[2] );

            // xs = xs - gamma * (xs - x) 
            // Q0
            magma_sidr_smoothing_2( dxs.num_rows, dxs.num_cols, -gamma, x->dval, dxs.dval, queues[0] );

            // v = r
            // Q0
            magma_scopyvector_async( dr.num_rows, dr.dval, 1, dv.dval, 1, queues[0] );

            // new RHS for small systems
            // f = P' r
            // Q1
            magma_sgemvmdot_shfl( dP.num_rows, dP.num_cols, dP.dval, dr.dval, d1, d2, df.dval, queues[1] );

            // skp[4] = f(k)
            // Q1
            magma_sgetvector_async( 1, df.dval, 1, &hskp[4], 1, queues[1] );

            // c(k:s) = f(k:s)
            // Q1
            magma_scopyvector_async( s, df.dval, 1, dc.dval, 1, queues[1] );

            // |rs|
            // Q2
            nrmr = magma_snrm2( drs.num_rows, drs.dval, 1, queues[2] );           
            // implicit sync Q2 --> |r|

            // c(k:s) = M(k:s,k:s) \ f(k:s)
            // Q1
            magma_strsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, s, dM.dval, dM.ld, dc.dval, 1, queues[1] );
//---------------------------------------
        }

        // store current timing and residual
        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            magma_queue_sync( queue );
            if ( (solver_par->numiter) % solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter) / solver_par->verbose]
                        = (real_Double_t)nrmr;
                solver_par->timing[(solver_par->numiter) / solver_par->verbose]
                        = (real_Double_t)tempo2 - tempo1;
            }
        }

        // check convergence or iteration limit
        if ( nrmr <= solver_par->atol ||
            nrmr/nrmb <= solver_par->rtol ) { 
            info = MAGMA_SUCCESS;
            break;
        }

        // sync Q0 --> v = r
        magma_queue_sync( queues[0] );
    }
    while ( solver_par->numiter + 1 <= solver_par->maxiter );

    // sync all queues
    for ( q = 0; q < nqueues; q++ ) {
        magma_queue_sync( queues[q] );
    }

    // smoothing enabled
    if ( smoothing > 0 ) {
        // x = xs
        magma_scopyvector_async( x->num_rows, dxs.dval, 1, x->dval, 1, queue );

        // r = rs
        magma_scopyvector_async( dr.num_rows, drs.dval, 1, dr.dval, 1, queue );
    }

cudaProfilerStop();

    // get last iteration timing
    tempo2 = magma_sync_wtime( queue );
    magma_queue_sync( queue );
    solver_par->runtime = (real_Double_t)tempo2 - tempo1;
//--------------STOP TIME----------------

    // get final stats
    solver_par->iter_res = nrmr;
    CHECK( magma_sresidualvec( A, b, *x, &dr, &residual, queue ));
    solver_par->final_res = residual;

    // set solver conclusion
    if ( info != MAGMA_SUCCESS && info != MAGMA_DIVERGENCE ) {
        if ( solver_par->init_res > solver_par->final_res ) {
            info = MAGMA_SLOW_CONVERGENCE;
        }
    }


cleanup:
    // free resources
    // sync all queues, destroy additional queues
    magma_queue_sync( queues[0] );
    for ( q = 1; q < nqueues; q++ ) {
        magma_queue_sync( queues[q] );
        magma_queue_destroy( queues[q] );
    }

    // smoothing enabled
    if ( smoothing > 0 ) {
        drs.dval = NULL;  // needed because its pointer is redirected to dtt
        magma_smfree( &dxs, queue );
        magma_smfree( &drs, queue ); 
        magma_smfree( &dtt, queue );
    }
    dr.dval = NULL;       // needed because its pointer is redirected to dt
    dGcol.dval = NULL;    // needed because its pointer is redirected to dG
    magma_smfree( &dr, queue );
    magma_smfree( &dP, queue );
    magma_smfree( &dP1, queue );
    magma_smfree( &dG, queue );
    magma_smfree( &dGcol, queue );
    magma_smfree( &dU, queue );
    magma_smfree( &dM, queue );
    magma_smfree( &df, queue );
    magma_smfree( &dt, queue );
    magma_smfree( &dc, queue );
    magma_smfree( &dv, queue );
    magma_smfree( &dskp, queue );
    magma_smfree( &dalpha, queue );
    magma_smfree( &dbeta, queue );
    magma_free_pinned( hMdiag );
    magma_free_pinned( hskp );
    magma_free_pinned( halpha );
    magma_free_pinned( hbeta );
    magma_free( d1 );
    magma_free( d2 );

    solver_par->info = info;
    return info;
}   /* magma_sidr_strms */
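A minimal driver for magma_sidr_strms might look as follows. This is a sketch under assumptions: dA is a CSR matrix already transferred to the device (e.g. with magma_smtransfer), the solver_par fields are the ones the routine above reads, and the chosen values and function name are illustrative.

// Sketch: solve dA * x = b with IDR(s), b = ones, x0 = 0.
magma_int_t solve_with_idr( magma_s_matrix dA, magma_queue_t queue )
{
    magma_s_matrix b = {Magma_CSR}, x = {Magma_CSR};
    magma_s_solver_par solver_par = {};   // zero-init, then set what we use

    solver_par.maxiter = 1000;
    solver_par.restart = 4;      // reused as the shadow space size s (see hack above)
    solver_par.atol    = 1e-6;
    solver_par.rtol    = 1e-6;
    solver_par.verbose = 0;      // > 0 would require res_vec/timing buffers

    magma_svinit( &b, Magma_DEV, dA.num_rows, 1, MAGMA_S_ONE,  queue );
    magma_svinit( &x, Magma_DEV, dA.num_cols, 1, MAGMA_S_ZERO, queue );

    magma_int_t info = magma_sidr_strms( dA, b, &x, &solver_par, queue );
    printf( "info = %d, iterations = %d, final residual = %e\n",
            (int) info, (int) solver_par.numiter, (double) solver_par.final_res );

    magma_smfree( &b, queue );
    magma_smfree( &x, queue );
    return info;
}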
Example n. 14
int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
                              PyGpuArrayObject **S,
                              PyGpuArrayObject **U, // may be NULL
                              PyGpuArrayObject **VT, // may be NULL
                              PARAMS_TYPE* params) {
  bool compute_uv = (U != NULL);
  magma_int_t *iwork = NULL, iunused[1];
  magma_int_t M, N, K, ldu, ldv, M_U, N_VT, info;
  magma_vec_t jobz;
  size_t s_dims[1], u_dims[2], vt_dims[2];
  float *a_data = NULL, *s_data = NULL, *u_data = NULL, *vt_data = NULL,
        *work = NULL;
  float dummy[1];
  int res = -1, lwork;

  if (A->ga.typecode != GA_FLOAT) {
    PyErr_SetString(PyExc_TypeError,
                    "GpuMagmaSVD: Unsupported data type");
    return -1;
  }

  // Entered early so that the cuda_exit() in the fail label always balances it.
  cuda_enter(params->context->ctx);
  magma_init();

  if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaSVD: requires data to be C-contiguous");
    goto fail;
  }
  if (PyGpuArray_NDIM(A) != 2) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaSVD: matrix rank error");
    goto fail;
  }

  // magma matrix svd
  // reverse dimensions because MAGMA expects column-major matrices:
  M = PyGpuArray_DIM(A, 1);
  N = PyGpuArray_DIM(A, 0);
  K = std::min(M, N);

  if (MAGMA_SUCCESS !=  magma_smalloc_pinned(&a_data, M * N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
  cudaMemcpy(a_data, PyGpuArray_DEV_DATA(A), M * N * sizeof(float),
             cudaMemcpyDeviceToDevice);

  if (MAGMA_SUCCESS !=  magma_smalloc_pinned(&s_data, K)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }

  if (compute_uv) {
    if (params->full_matrices) {
      jobz = MagmaAllVec;
    } else {
      jobz = MagmaSomeVec;
    }
    M_U  = (jobz == MagmaAllVec ? M : K);
    N_VT = (jobz == MagmaAllVec ? N : K);
    ldu = M;
    ldv = N_VT;

    if (MAGMA_SUCCESS != magma_smalloc_pinned(&u_data, M_U * M)) {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaSVD: failed to allocate memory");
      goto fail;
    }
    if (MAGMA_SUCCESS != magma_smalloc_pinned(&vt_data, N * N_VT)) {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaSVD: failed to allocate memory");
      goto fail;
    }
  } else {
    jobz = MagmaNoVec;
    ldu = M;
    ldv = N;
  }

  // query for workspace size
  magma_sgesdd(jobz, M, N, NULL, M, NULL, NULL, ldu, NULL, ldv,
               dummy, -1, iunused, &info);

  lwork = (magma_int_t) MAGMA_S_REAL(dummy[0]);
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&work, lwork)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate working memory");
    goto fail;
  }

  if (MAGMA_SUCCESS != magma_imalloc_cpu(&iwork, 8*K)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate working memory");
    goto fail;
  }

  // compute svd
  magma_sgesdd(jobz, M, N, a_data, M, s_data,
               u_data, ldu, vt_data, ldv, work, lwork, iwork, &info);
  if (info > 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaSVD: the updating process of SBDSDC did not converge (error: %d)",
        info);
    goto fail;
  } else if (info < 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaSVD: magma_sgesdd_gpu argument %d has an illegal value", -info);
    goto fail;
  }

  s_dims[0] = K;
  if (theano_prep_output(S, 1, s_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
  cudaMemcpy(PyGpuArray_DEV_DATA(*S), s_data, K * sizeof(float),
             cudaMemcpyDeviceToDevice);

  if (compute_uv) {
    u_dims[0] = N; u_dims[1] = N_VT;
    if (theano_prep_output(U, 2, u_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaSVD: failed to allocate memory");
      goto fail;
    }
    // MAGMA works on the column-major view, i.e. it factors A^T = U' S V'^T.
    // Transposing gives A = V' S U'^T, so the roles of the two factors swap:
    // vt_data becomes U and u_data becomes VT. Reading the column-major
    // buffers as C-contiguous arrays performs the transposes implicitly,
    // matching numpy.linalg.svd output.
    cudaMemcpy(PyGpuArray_DEV_DATA(*U), vt_data, N * N_VT * sizeof(float),
               cudaMemcpyDeviceToDevice);

    vt_dims[0] = M_U; vt_dims[1] = M;
    if (theano_prep_output(VT, 2, vt_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaSVD: failed to allocate memory");
      goto fail;
    }
    // magma expects column-major matrices. Exchange u_data -> VT and vt_data -> U
    // to match numpy.linalg.svd output
    cudaMemcpy(PyGpuArray_DEV_DATA(*VT), u_data, M_U * M * sizeof(float),
               cudaMemcpyDeviceToDevice);
  }
  res = 0;
fail:
  if (a_data != NULL)
    magma_free_pinned(a_data);
  if (s_data != NULL)
    magma_free_pinned(s_data);
  if (u_data != NULL)
    magma_free_pinned(u_data);
  if (vt_data != NULL)
    magma_free_pinned(vt_data);
  if (work != NULL)
    magma_free_pinned(work);
  if (iwork != NULL)
    magma_free_cpu(iwork);
  magma_finalize();
  cuda_exit(params->context->ctx);
  return res;
}
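To make the dimension bookkeeping above concrete, a small worked example (illustrative values, not from the source): a C-contiguous 3x5 input gives N = 3, M = 5, K = 3. With full_matrices set, jobz = MagmaAllVec yields M_U = 5 and N_VT = 3, so MAGMA produces a 5x5 U' and a 3x3 V'^T for the column-major view A^T. After the swap, the op returns S of length 3, U of shape (3, 3), and VT of shape (5, 5), exactly what numpy.linalg.svd returns for a 3x5 matrix with full_matrices=True.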
Example n. 15
int APPLY_SPECIFIC(magma_eigh)(PyGpuArrayObject *A_,
                               PyGpuArrayObject **D,
                               PyGpuArrayObject **V, // may be NULL
                               PARAMS_TYPE *params) {
  PyGpuArrayObject *A = NULL;
  magma_int_t N, liwork, *iwork_data = NULL;
  size_t d_dims[1], v_dims[2];
  magma_uplo_t uplo;
  magma_vec_t jobz;
  float *w_data = NULL, *wA_data = NULL, *work_data = NULL, lwork;
  int res = -1, info;

  if (A_->ga.typecode != GA_FLOAT) {
    PyErr_SetString(PyExc_TypeError,
                    "GpuMagmaEigh: Unsupported data type");
    return -1;
  }

  // Entered early so that the cuda_exit() in the fail label always balances it.
  cuda_enter(params->context->ctx);

  if (!GpuArray_IS_C_CONTIGUOUS(&A_->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaEigh: requires data to be C-contiguous");
    goto fail;
  }
  if (PyGpuArray_NDIM(A_) != 2) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaEigh: matrix rank error");
    goto fail;
  }
  if (PyGpuArray_DIM(A_, 0) != PyGpuArray_DIM(A_, 1)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaEigh: matrix is not square");
    goto fail;
  }

  A = pygpu_copy(A_, GA_F_ORDER);
  if (A == NULL) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to change to column-major order");
    goto fail;
  }

  // magma matrix eigen decomposition of a symmetric matrix
  N = PyGpuArray_DIM(A, 0);

  if (params->lower) {
    uplo = MagmaLower;
  } else {
    uplo = MagmaUpper;
  }
  if (params->compute_v) {
    jobz = MagmaVec;
  } else {
    jobz = MagmaNoVec;
  }

  if (MAGMA_SUCCESS != magma_smalloc_pinned(&w_data, N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&wA_data, N * N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }

  // query for workspace size
  magma_ssyevd_gpu(jobz, uplo, N, NULL, N, NULL, NULL, N, &lwork,
                   -1, &liwork, -1, &info);

  if (MAGMA_SUCCESS != magma_smalloc_pinned(&work_data, (size_t)lwork)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }
  if (MAGMA_SUCCESS != magma_imalloc_cpu(&iwork_data, liwork)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }

  magma_ssyevd_gpu(jobz, uplo, N, (float *)PyGpuArray_DEV_DATA(A), N, w_data,
                   wA_data, N, work_data, (size_t)lwork, iwork_data, liwork,
                   &info);
  if (info > 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaEigh: %d off-diagonal elements of an intermediate "
        "tridiagonal form did not converge to zero",
        info);
    goto fail;
  } else if (info < 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaEigh: magma_ssyevd_gpu argument %d has an illegal value", -info);
    goto fail;
  }

  d_dims[0] = N;
  if (theano_prep_output(D, 1, d_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate memory for the output");
    goto fail;
  }
  cudaMemcpy(PyGpuArray_DEV_DATA(*D), w_data, N * sizeof(float),
             cudaMemcpyDeviceToDevice);

  if (params->compute_v) {
    *V = theano_try_copy(*V, A);
    if (*V == NULL) {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaEigh: failed to allocate memory for the output");
      goto fail;
    }
  }
  res = 0;
fail:
  if (w_data != NULL)
    magma_free_pinned(w_data);
  if (wA_data != NULL)
    magma_free_pinned(wA_data);
  if (work_data != NULL)
    magma_free_pinned(work_data);
  if (iwork_data != NULL)
    magma_free_cpu(iwork_data);
  Py_XDECREF(A);
  cuda_exit(params->context->ctx);
  return res;
}
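Both of the last two ops use the LAPACK workspace-query convention: calling the routine once with lwork = -1 performs no computation and instead writes the optimal workspace sizes into the work and iwork arguments. A compact sketch of the idiom, mirroring the magma_ssyevd_gpu call shape above (variable names are illustrative):

// Workspace-query idiom: the first call asks for sizes, the second computes.
float       lwork_opt;    // receives the optimal float workspace size
magma_int_t liwork_opt;   // receives the optimal integer workspace size
magma_int_t info;

magma_ssyevd_gpu( MagmaNoVec, MagmaLower, N, NULL, N, NULL, NULL, N,
                  &lwork_opt, -1, &liwork_opt, -1, &info );
// ... then allocate work arrays of the returned sizes and call again.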
Example n. 16

// Tolerance and access macros assumed from the original spgmres source;
// they are required by the RTOLERANCE/ATOLERANCE, H, dH, q, z references below:
#define RTOLERANCE  lapackf77_slamch( "E" )
#define ATOLERANCE  lapackf77_slamch( "E" )
#define  q(i)    (q.val + (i)*dofs)
#define  z(i)    (z.val + (i)*dofs)
#define  H(i,j)   H[(i) + (j)*(1+ldh)]
#define dH(i,j)  dH[(i) + (j)*(1+ldh)]

magma_int_t
magma_spgmres( magma_s_sparse_matrix A, magma_s_vector b, magma_s_vector *x,  
               magma_s_solver_par *solver_par, 
               magma_s_preconditioner *precond_par ){

    // prepare solver feedback
    solver_par->solver = Magma_PGMRES;
    solver_par->numiter = 0;
    solver_par->info = 0;

    // local variables
    float c_zero = MAGMA_S_ZERO, c_one = MAGMA_S_ONE, c_mone = MAGMA_S_NEG_ONE;
    magma_int_t dofs = A.num_rows;
    magma_int_t i, j, k, m = 0;
    magma_int_t restart = min( dofs-1, solver_par->restart );
    magma_int_t ldh = restart+1;
    float nom, rNorm, RNorm, nom0, betanom, r0 = 0.;

    // CPU workspace
    //magma_setdevice(0);
    float *H, *HH, *y, *h1;
    magma_smalloc_pinned( &H, (ldh+1)*ldh );
    magma_smalloc_pinned( &y, ldh );
    magma_smalloc_pinned( &HH, ldh*ldh );
    magma_smalloc_pinned( &h1, ldh );

    // GPU workspace
    magma_s_vector r, q, q_t, z, z_t, t;
    magma_s_vinit( &t, Magma_DEV, dofs, c_zero );
    magma_s_vinit( &r, Magma_DEV, dofs, c_zero );
    magma_s_vinit( &q, Magma_DEV, dofs*(ldh+1), c_zero );
    magma_s_vinit( &z, Magma_DEV, dofs*(ldh+1), c_zero );
    magma_s_vinit( &z_t, Magma_DEV, dofs, c_zero );
    q_t.memory_location = Magma_DEV; 
    q_t.val = NULL; 
    q_t.num_rows = q_t.nnz = dofs;

    float *dy, *dH = NULL;
    if (MAGMA_SUCCESS != magma_smalloc( &dy, ldh )) 
        return MAGMA_ERR_DEVICE_ALLOC;
    if (MAGMA_SUCCESS != magma_smalloc( &dH, (ldh+1)*ldh )) 
        return MAGMA_ERR_DEVICE_ALLOC;

    // GPU stream
    magma_queue_t stream[2];
    magma_event_t event[1];
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );
    magma_event_create( &event[0] );
    magmablasSetKernelStream(stream[0]);

    magma_sscal( dofs, c_zero, x->val, 1 );              //  x = 0
    magma_scopy( dofs, b.val, 1, r.val, 1 );             //  r = b
    nom0 = betanom = magma_snrm2( dofs, r.val, 1 );     //  nom0= || r||
    nom = nom0  * nom0;
    solver_par->init_res = nom0;
    H(1,0) = MAGMA_S_MAKE( nom0, 0. ); 
    magma_ssetvector(1, &H(1,0), 1, &dH(1,0), 1);
    if ( (r0 = nom0 * RTOLERANCE ) < ATOLERANCE ) 
        r0 = solver_par->epsilon;
    if ( nom < r0 )
        return MAGMA_SUCCESS;

    //Chronometry
    real_Double_t tempo1, tempo2;
    magma_device_sync(); tempo1=magma_wtime();
    if( solver_par->verbose > 0 ){
        solver_par->res_vec[0] = nom0;
        solver_par->timing[0] = 0.0;
    }
    // start iteration
    for( solver_par->numiter= 1; solver_par->numiter<solver_par->maxiter; 
                                                    solver_par->numiter++ ){

        for(k=1; k<=restart; k++) {

            magma_scopy(dofs, r.val, 1, q(k-1), 1);       //  q[k-1] = r
            magma_sscal(dofs, 1./H(k,k-1), q(k-1), 1);    //  q[k-1] = r / H(k,k-1)  (to be fused)
            q_t.val = q(k-1);
            magmablasSetKernelStream(stream[0]);
            // preconditioner
            //  z[k] = M^(-1) q(k)
            magma_s_applyprecond_left( A, q_t, &t, precond_par );      
            magma_s_applyprecond_right( A, t, &z_t, precond_par );     
  
            magma_scopy(dofs, z_t.val, 1, z(k-1), 1);                  

            // r = A q[k] 
            magma_s_spmv( c_one, A, z_t, c_zero, r );


    //      if (solver_par->ortho == Magma_MGS ) {
                // modified Gram-Schmidt
                for (i=1; i<=k; i++) {
                    H(i,k) = magma_sdot(dofs, q(i-1), 1, r.val, 1);
                        //  H(i,k) = q[i] . r
                    magma_saxpy(dofs, -H(i,k), q(i-1), 1, r.val, 1);
                        //  r = r - H(i,k) q[i]
                }
                H(k+1,k) = MAGMA_S_MAKE( magma_snrm2(dofs, r.val, 1), 0. ); // H(k+1,k) = ||r|| 


            /*}else if (solver_par->ortho == Magma_FUSED_CGS ) {
                // fusing sgemv with snrm2 in classical Gram-Schmidt
                magmablasSetKernelStream(stream[0]);
                magma_scopy(dofs, r.val, 1, q(k), 1);  
                    // dH(1:k+1,k) = q[0:k] . r
                magmablas_sgemv(MagmaTrans, dofs, k+1, c_one, q(0), 
                                dofs, r.val, 1, c_zero, &dH(1,k), 1);
                    // r = r - q[0:k-1] dH(1:k,k)
                magmablas_sgemv(MagmaNoTrans, dofs, k, c_mone, q(0), 
                                dofs, &dH(1,k), 1, c_one, r.val, 1);
                   // 1) dH(k+1,k) = sqrt( dH(k+1,k) - dH(1:k,k) )
                magma_scopyscale(  dofs, k, r.val, q(k), &dH(1,k) );  
                   // 2) q[k] = q[k] / dH(k+1,k) 

                magma_event_record( event[0], stream[0] );
                magma_queue_wait_event( stream[1], event[0] );
                magma_sgetvector_async(k+1, &dH(1,k), 1, &H(1,k), 1, stream[1]); 
                    // asynch copy dH(1:(k+1),k) to H(1:(k+1),k)
            } else {
                // classical Gram-Schmidt (default)
                // > explicitly calling magmablas
                magmablasSetKernelStream(stream[0]);                                                  
                magmablas_sgemv(MagmaTrans, dofs, k, c_one, q(0), 
                                dofs, r.val, 1, c_zero, &dH(1,k), 1); 
                                // dH(1:k,k) = q[0:k-1] . r
                #ifndef SNRM2SCALE 
                // start copying dH(1:k,k) to H(1:k,k)
                magma_event_record( event[0], stream[0] );
                magma_queue_wait_event( stream[1], event[0] );
                magma_sgetvector_async(k, &dH(1,k), 1, &H(1,k), 
                                                    1, stream[1]);
                #endif
                                  // r = r - q[0:k-1] dH(1:k,k)
                magmablas_sgemv(MagmaNoTrans, dofs, k, c_mone, q(0), 
                                    dofs, &dH(1,k), 1, c_one, r.val, 1);
                #ifdef SNRM2SCALE
                magma_scopy(dofs, r.val, 1, q(k), 1);                 
                    //  q[k] = r / H(k,k-1) 
                magma_snrm2scale(dofs, q(k), dofs, &dH(k+1,k) );     
                    //  dH(k+1,k) = sqrt(r . r) and r = r / dH(k+1,k)

                magma_event_record( event[0], stream[0] );            
                            // start sending dH(1:k,k) to H(1:k,k)
                magma_queue_wait_event( stream[1], event[0] );        
                            // can we keep H(k+1,k) on GPU and combine?
                magma_sgetvector_async(k+1, &dH(1,k), 1, &H(1,k), 1, stream[1]);
                #else
                H(k+1,k) = MAGMA_S_MAKE( magma_snrm2(dofs, r.val, 1), 0. );   
                            //  H(k+1,k) = sqrt(r . r) 
                if( k<solver_par->restart ){
                        magmablasSetKernelStream(stream[0]);
                        magma_scopy(dofs, r.val, 1, q(k), 1);                  
                            //  q[k]    = 1.0/H[k][k-1] r
                        magma_sscal(dofs, 1./H(k+1,k), q(k), 1);              
                            //  (to be fused)   
                 }
                #endif
            }*/
            /*     Minimization of  || b-Ax ||  in H_k       */ 
            for (i=1; i<=k; i++) {
                HH(k,i) = magma_cblas_sdot( i+1, &H(1,k), 1, &H(1,i), 1 );
            }
            h1[k] = H(1,k)*H(1,0); 
            if (k != 1){
                for (i=1; i<k; i++) {
                    HH(k,i) = HH(k,i)/HH(i,i);
                    for (m=i+1; m<=k; m++){
                        HH(k,m) -= HH(k,i) * HH(m,i) * HH(i,i);
                    }
                    h1[k] -= h1[i] * HH(k,i);   
                }    
            }
            y[k] = h1[k]/HH(k,k); 
            if (k != 1)  
                for (i=k-1; i>=1; i--) {
                    y[i] = h1[i]/HH(i,i);
                    for (j=i+1; j<=k; j++)
                        y[i] -= y[j] * HH(j,i);
                }                    
            m = k;
            rNorm = fabs(MAGMA_S_REAL(H(k+1,k)));
        }/*     Minimization done       */ 
        // compute solution approximation
        magma_ssetmatrix(m, 1, y+1, m, dy, m );

        magma_sgemv(MagmaNoTrans, dofs, m, c_one, z(0), dofs, dy, 1, 
                                                    c_one, x->val, 1); 

        // compute residual
        magma_s_spmv( c_mone, A, *x, c_zero, r );      //  r = - A * x
        magma_saxpy(dofs, c_one, b.val, 1, r.val, 1);  //  r = r + b
        H(1,0) = MAGMA_S_MAKE( magma_snrm2(dofs, r.val, 1), 0. ); 
                                            //  RNorm = H[1][0] = || r ||
        RNorm = MAGMA_S_REAL( H(1,0) );
        betanom = fabs(RNorm);  

        if( solver_par->verbose > 0 ){
            magma_device_sync(); tempo2=magma_wtime();
            if( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] 
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose] 
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        if (  betanom  < r0 ) {
            break;
        } 
    }

    magma_device_sync(); tempo2=magma_wtime();
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    float residual;
    magma_sresidual( A, b, *x, &residual );
    solver_par->iter_res = betanom;
    solver_par->final_res = residual;

    if( solver_par->numiter < solver_par->maxiter){
        solver_par->info = 0;
    }else if( solver_par->init_res > solver_par->final_res ){
        if( solver_par->verbose > 0 ){
            if( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] 
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose] 
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        solver_par->info = -2;
    }
    else{
        if( solver_par->verbose > 0 ){
            if( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] 
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose] 
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        solver_par->info = -1;
    }
    // free pinned memory
    magma_free_pinned( H );
    magma_free_pinned( y );
    magma_free_pinned( HH );
    magma_free_pinned( h1 );
    // free GPU memory
    magma_free(dy); 
    if (dH != NULL ) magma_free(dH); 
    magma_s_vfree(&t);
    magma_s_vfree(&r);
    magma_s_vfree(&q);
    magma_s_vfree(&z);
    magma_s_vfree(&z_t);

    // free GPU streams and events
    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[1] );
    magma_event_destroy( event[0] );
    magmablasSetKernelStream(NULL);

    return MAGMA_SUCCESS;
}   /* magma_spgmres */
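
/* Usage sketch (added; not part of the original source): a minimal driver for
   the routine above, assuming A, b and the preconditioner are already set up
   as in the MAGMA sparse-iter test programs.  The parameter values below are
   illustrative only. */
static void example_spgmres( magma_s_sparse_matrix A, magma_s_vector b,
                             magma_s_preconditioner *precond_par )
{
    magma_s_vector x;
    magma_s_vinit( &x, Magma_DEV, A.num_rows, MAGMA_S_ZERO );   // x = 0 on the GPU

    magma_s_solver_par solver_par;
    solver_par.restart = 30;        // Krylov restart length
    solver_par.maxiter = 1000;      // iteration limit
    solver_par.epsilon = 1e-6f;     // stopping tolerance
    solver_par.verbose = 0;         // res_vec/timing arrays are not touched when 0

    magma_spgmres( A, b, &x, &solver_par, precond_par );
    printf( "iterations: %d   final residual: %e\n",
            (int) solver_par.numiter, (double) solver_par.final_res );

    magma_s_vfree( &x );
}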
Example no. 17
extern "C" magma_int_t
magma_sgetrf_nopiv_gpu(magma_int_t m, magma_int_t n,
                       float *dA, magma_int_t ldda,
                       magma_int_t *info)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    SGETRF_NOPIV_GPU computes an LU factorization of a general M-by-N
    matrix A without any pivoting.

    The factorization has the form
       A = L * U
    where L is lower triangular with unit diagonal elements (lower
    trapezoidal if m > n), and U is upper triangular (upper trapezoidal
    if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) REAL array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = L*U; the unit diagonal elements of L are not stored.

    LDDA     (input) INTEGER
            The leading dimension of the array A.  LDDA >= max(1,M).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as a failed memory allocation.
            > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.
    =====================================================================    */

#define inA(i,j) (dA + (i)*nb + (j)*nb*ldda)

    float c_one     = MAGMA_S_ONE;
    float c_neg_one = MAGMA_S_NEG_ONE;

    magma_int_t iinfo, nb;
    magma_int_t maxm, maxn, mindim;
    magma_int_t i, rows, cols, s, lddwork;
    float *work;

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (ldda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    nb     = 2*magma_get_sgetrf_nb(m);
    s      = mindim / nb;

    if (nb <= 1 || nb >= min(m,n)) {
        /* Use CPU code. */
        magma_smalloc_cpu( &work, m * n );
        if ( work == NULL ) {
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        magma_sgetmatrix( m, n, dA, ldda, work, m );
        magma_sgetrf_nopiv(&m, &n, work, &m, info);
        magma_ssetmatrix( m, n, work, m, dA, ldda );
        magma_free_cpu(work);
    }
    else {
        /* Use hybrid blocked code. */
        maxm = ((m + 31)/32)*32;
        maxn = ((n + 31)/32)*32;

        lddwork = maxm;

        if (MAGMA_SUCCESS != magma_smalloc_pinned( &work, maxm*nb )) {
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }

        for( i=0; i<s; i++ ) {
            // download i-th panel
            cols = maxm - i*nb;
            magma_sgetmatrix( m-i*nb, nb, inA(i,i), ldda, work, lddwork );
            
            // make sure that gpu queue is empty
            magma_device_sync();
            
            if ( i>0 ){
                magma_strsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit,
                             nb, n - (i+1)*nb,
                             c_one, inA(i-1,i-1), ldda,
                             inA(i-1,i+1), ldda );
                magma_sgemm( MagmaNoTrans, MagmaNoTrans,
                             m-i*nb, n-(i+1)*nb, nb,
                             c_neg_one, inA(i,  i-1), ldda, inA(i-1,i+1), ldda,
                             c_one,     inA(i,  i+1), ldda );
            }

            // do the cpu part
            rows = m - i*nb;
            magma_sgetrf_nopiv(&rows, &nb, work, &lddwork, &iinfo);
            if ( (*info == 0) && (iinfo > 0) )
                *info = iinfo + i*nb;

            // upload i-th panel
            magma_ssetmatrix( m-i*nb, nb, work, lddwork, inA(i, i), ldda );
            
            // do the small non-parallel computations
            if ( s > (i+1) ) {
                magma_strsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit,
                             nb, nb,
                             c_one, inA(i, i  ), ldda,
                             inA(i, i+1), ldda);
                magma_sgemm( MagmaNoTrans, MagmaNoTrans,
                             m-(i+1)*nb, nb, nb,
                             c_neg_one, inA(i+1, i  ), ldda, inA(i,   i+1), ldda,
                             c_one,     inA(i+1, i+1), ldda );
            }
            else {
                magma_strsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit,
                             nb, n-s*nb,
                             c_one, inA(i, i  ), ldda,
                             inA(i, i+1), ldda);
                magma_sgemm( MagmaNoTrans, MagmaNoTrans,
                             m-(i+1)*nb, n-(i+1)*nb, nb,
                             c_neg_one, inA(i+1, i  ), ldda, inA(i,   i+1), ldda,
                             c_one,     inA(i+1, i+1), ldda );
            }
        }

        magma_int_t nb0 = min(m - s*nb, n - s*nb);
        rows = m - s*nb;
        cols = maxm - s*nb;
        magma_sgetmatrix( rows, nb0, inA(s,s), ldda, work, lddwork );

        // make sure that gpu queue is empty
        magma_device_sync();

        // do the cpu part
        magma_sgetrf_nopiv( &rows, &nb0, work, &lddwork, &iinfo);
        if ( (*info == 0) && (iinfo > 0) )
            *info = iinfo + s*nb;

        // upload the last panel
        magma_ssetmatrix( rows, nb0, work, lddwork, inA(s,s), ldda );

        magma_strsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit,
                     nb0, n-s*nb-nb0,
                     c_one, inA(s,s),     ldda,
                            inA(s,s)+nb0, ldda);

        magma_free_pinned( work );
    }

    return *info;
} /* magma_sgetrf_nopiv_gpu */
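
/* Usage sketch (added; not part of the original source): factor an m-by-n
   host matrix hA (leading dimension m) on the GPU with the routine above.
   ldda is padded to a multiple of 32, matching the blocked code. */
static magma_int_t example_sgetrf_nopiv( magma_int_t m, magma_int_t n, float *hA )
{
    float *dA;
    magma_int_t info = 0;
    magma_int_t ldda = ((m + 31)/32)*32;
    if (MAGMA_SUCCESS != magma_smalloc( &dA, ldda*n ))
        return MAGMA_ERR_DEVICE_ALLOC;
    magma_ssetmatrix( m, n, hA, m, dA, ldda );       // upload A
    magma_sgetrf_nopiv_gpu( m, n, dA, ldda, &info );
    if (info > 0)
        printf( "U(%d,%d) is exactly zero\n", (int) info, (int) info );
    magma_sgetmatrix( m, n, dA, ldda, hA, m );       // download the L and U factors
    magma_free( dA );
    return info;
}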
Example no. 18
/**
    Purpose
    -------
    SORMQR overwrites the general real M-by-N matrix C with

    @verbatim
                                SIDE = MagmaLeft    SIDE = MagmaRight
    TRANS = MagmaNoTrans:       Q * C               C * Q
    TRANS = MagmaTrans:         Q**H * C            C * Q**H
    @endverbatim

    where Q is a real orthogonal matrix defined as the product of k
    elementary reflectors

          Q = H(1) H(2) . . . H(k)

    as returned by SGEQRF. Q is of order M if SIDE = MagmaLeft and of order N
    if SIDE = MagmaRight.

    Arguments
    ---------
    @param[in]
    ngpu    INTEGER
            Number of GPUs to use. ngpu > 0.

    @param[in]
    side    magma_side_t
      -     = MagmaLeft:      apply Q or Q**H from the Left;
      -     = MagmaRight:     apply Q or Q**H from the Right.

    @param[in]
    trans   magma_trans_t
      -     = MagmaNoTrans:    No transpose, apply Q;
      -     = MagmaTrans:      Transpose, apply Q**H.

    @param[in]
    m       INTEGER
            The number of rows of the matrix C. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix C. N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines
            the matrix Q.
            If SIDE = MagmaLeft,  M >= K >= 0;
            if SIDE = MagmaRight, N >= K >= 0.

    @param[in]
    A       REAL array, dimension (LDA,K)
            The i-th column must contain the vector which defines the
            elementary reflector H(i), for i = 1,2,...,k, as returned by
            SGEQRF in the first k columns of its array argument A.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.
            If SIDE = MagmaLeft,  LDA >= max(1,M);
            if SIDE = MagmaRight, LDA >= max(1,N).

    @param[in]
    tau     REAL array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by SGEQRF.

    @param[in,out]
    C       REAL array, dimension (LDC,N)
            On entry, the M-by-N matrix C.
            On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q.

    @param[in]
    ldc     INTEGER
            The leading dimension of the array C. LDC >= max(1,M).

    @param[out]
    work    (workspace) REAL array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.
            If SIDE = MagmaLeft,  LWORK >= max(1,N);
            if SIDE = MagmaRight, LWORK >= max(1,M).
            For optimum performance LWORK >= N*NB if SIDE = MagmaLeft, and
            LWORK >= M*NB if SIDE = MagmaRight, where NB is the optimal
            blocksize.
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_sgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_sormqr_m(
    magma_int_t ngpu,
    magma_side_t side, magma_trans_t trans,
    magma_int_t m, magma_int_t n, magma_int_t k,
    float *A,    magma_int_t lda,
    float *tau,
    float *C,    magma_int_t ldc,
    float *work, magma_int_t lwork,
    magma_int_t *info)
{
#define  A(i, j) (A + (j)*lda  + (i))
#define  C(i, j) (C + (j)*ldc  + (i))

#define    dC(gpui,      i, j) (dw[gpui] + (j)*lddc + (i))
#define  dA_c(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddac)
#define  dA_r(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddar)
#define    dT(gpui, ind)       (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + (ind)*((nb+1)*nb))
#define dwork(gpui, ind)       (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + 2*((nb+1)*nb) + (ind)*(lddwork*nb))

    float c_zero = MAGMA_S_ZERO;
    float c_one  = MAGMA_S_ONE;

    const char* side_  = lapack_side_const( side );
    const char* trans_ = lapack_trans_const( trans );

    // TODO fix memory leak (alloc after argument checks)
    magma_int_t nb = 128;
    float *T;
    magma_smalloc_pinned(&T, nb*nb);
    //printf("calling sormqr_m with nb=%d\n", (int) nb);

    float* dw[MagmaMaxGPUs];
    magma_queue_t stream [MagmaMaxGPUs][2];
    magma_event_t  event [MagmaMaxGPUs][2];

    magma_int_t ind_c;
    magma_device_t igpu;
    
    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );

    *info = 0;

    magma_int_t left   = (side == MagmaLeft);
    magma_int_t notran = (trans == MagmaNoTrans);
    magma_int_t lquery = (lwork == -1);

    /* NQ is the order of Q and NW is the minimum dimension of WORK */
    magma_int_t nq, nw;
    if (left) {
        nq = m;
        nw = n;
    } else {
        nq = n;
        nw = m;
    }


    if (! left && side != MagmaRight) {
        *info = -1;
    } else if (! notran && trans != MagmaTrans) {
        *info = -2;
    } else if (m < 0) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (k < 0 || k > nq) {
        *info = -5;
    } else if (lda < max(1,nq)) {
        *info = -7;
    } else if (ldc < max(1,m)) {
        *info = -10;
    } else if (lwork < max(1,nw) && ! lquery) {
        *info = -12;
    }

    magma_int_t lwkopt = max(1,nw) * nb;
    if (*info == 0) {
        work[0] = MAGMA_S_MAKE( lwkopt, 0 );
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0 || k == 0) {
        work[0] = c_one;
        return *info;
    }

    if (nb >= k) {
        /* Use CPU code */
        lapackf77_sormqr(side_, trans_, &m, &n, &k, A, &lda, tau,
                         C, &ldc, work, &lwork, info);
        return *info;
    }

    magma_int_t lddc = (m+63)/64*64;
    magma_int_t lddac = nq;
    magma_int_t lddar = nb;
    magma_int_t lddwork = nw;

    magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 };

    magma_int_t nb_l=256;
    magma_int_t nbl = (n-1)/nb_l+1; // number of blocks
    magma_int_t maxnlocal = (nbl+ngpu-1)/ngpu*nb_l;

    ngpu = min(ngpu, (n+nb_l-1)/nb_l); // Don't use GPU that will not have data.

    magma_int_t ldw = maxnlocal*lddc // dC
                    + 2*lddac*lddar // 2*dA
                    + 2*(nb + 1 + lddwork)*nb; // 2*(dT and dwork)

    for (igpu = 0; igpu < ngpu; ++igpu) {
        magma_setdevice(igpu);
        if (MAGMA_SUCCESS != magma_smalloc( &dw[igpu], ldw )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            magma_xerbla( __func__, -(*info) );
            return *info;
        }
        magma_queue_create( &stream[igpu][0] );
        magma_queue_create( &stream[igpu][1] );
        magma_event_create( &event[igpu][0] );
        magma_event_create( &event[igpu][1] );
    }

    /* Use hybrid CPU-MGPU code */
    if (left) {
        //copy C to mgpus
        for (magma_int_t i = 0; i < nbl; ++i) {
            magma_int_t igpu = i%ngpu;
            magma_setdevice(igpu);
            magma_int_t kb = min(nb_l, n-i*nb_l);
            magma_ssetmatrix_async( m, kb,
                                   C(0, i*nb_l), ldc,
                                   dC(igpu, 0, i/ngpu*nb_l), lddc, stream[igpu][0] );
            nlocal[igpu] += kb;
        }

        magma_int_t i1, i2, i3;
        if ( !notran ) {
            i1 = 0;
            i2 = k;
            i3 = nb;
        } else {
            i1 = (k - 1) / nb * nb;
            i2 = 0;
            i3 = -nb;
        }

        ind_c = 0;

        for (magma_int_t i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) {
            // start the copy of A panel
            magma_int_t kb = min(nb, k - i);
            for (igpu = 0; igpu < ngpu; ++igpu) {
                magma_setdevice(igpu);
                magma_event_sync(event[igpu][ind_c]); // check if the new data can be copied
                magma_ssetmatrix_async(nq-i, kb,
                                       A(i, i),                 lda,
                                       dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0] );
                // set upper triangular part of dA to identity
                magmablas_slaset_band_q( MagmaUpper, kb, kb, kb, c_zero, c_one, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0] );
            }

            /* Form the triangular factor of the block reflector
             H = H(i) H(i+1) . . . H(i+ib-1) */
            magma_int_t nqi = nq - i;
            lapackf77_slarft("F", "C", &nqi, &kb, A(i, i), &lda,
                             &tau[i], T, &kb);

            /* H or H' is applied to C(1:m,i:n) */

            /* Apply H or H'; First copy T to the GPU */
            for (igpu = 0; igpu < ngpu; ++igpu) {
                magma_setdevice(igpu);
                magma_ssetmatrix_async(kb, kb,
                                       T,               kb,
                                       dT(igpu, ind_c), kb, stream[igpu][0] );
            }

            for (igpu = 0; igpu < ngpu; ++igpu) {
                magma_setdevice(igpu);
                magma_queue_sync( stream[igpu][0] ); // check if the data was copied
                magmablasSetKernelStream(stream[igpu][1]);
                magma_slarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
                                 m-i, nlocal[igpu], kb,
                                 dA_c(igpu, ind_c, i, 0), lddac, dT(igpu, ind_c), kb,
                                 dC(igpu, i, 0), lddc,
                                 dwork(igpu, ind_c), lddwork);
                magma_event_record(event[igpu][ind_c], stream[igpu][1] );
            }

            ind_c = (ind_c+1)%2;
        }

        for (igpu = 0; igpu < ngpu; ++igpu) {
            magma_setdevice(igpu);
            magma_queue_sync( stream[igpu][1] );
        }

        //copy C from mgpus
        for (magma_int_t i = 0; i < nbl; ++i) {
            magma_int_t igpu = i%ngpu;
            magma_setdevice(igpu);
            magma_int_t kb = min(nb_l, n-i*nb_l);
            magma_sgetmatrix( m, kb,
                              dC(igpu, 0, i/ngpu*nb_l), lddc,
                              C(0, i*nb_l), ldc );
//            magma_sgetmatrix_async( m, kb,
//                                   dC(igpu, 0, i/ngpu*nb_l), lddc,
//                                   C(0, i*nb_l), ldc, stream[igpu][0] );
        }
    } else {
        // TODO fix memory leak T, dw, event, stream
        fprintf(stderr, "The case (side == right) is not implemented\n");
        *info = MAGMA_ERR_NOT_IMPLEMENTED;
        magma_xerbla( __func__, -(*info) );
        return *info;
        /*
        if ( notran ) {
            i1 = 0;
            i2 = k;
            i3 = nb;
        } else {
            i1 = (k - 1) / nb * nb;
            i2 = 0;
            i3 = -nb;
        }

        mi = m;
        ic = 0;

        for (i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) {
            ib = min(nb, k - i);
            
            // Form the triangular factor of the block reflector
            // H = H(i) H(i+1) . . . H(i+ib-1)
            i__4 = nq - i;
            lapackf77_slarft("F", "C", &i__4, &ib, A(i, i), &lda,
            &tau[i], T, &ib);
            
            // 1) copy the panel from A to the GPU, and
            // 2) set upper triangular part of dA to identity
            magma_ssetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda );
            magmablas_slaset_band( MagmaUpper, ib, ib, ib, c_zero, c_one, dA(i, 0), ldda );
            
            // H or H' is applied to C(1:m,i:n)
            ni = n - i;
            jc = i;
            
            // Apply H or H'; First copy T to the GPU
            magma_ssetmatrix( ib, ib, T, ib, dT, ib );
            magma_slarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
            mi, ni, ib,
            dA(i, 0), ldda, dT, ib,
            dC(ic, jc), lddc,
            dwork, lddwork);
        }
        */
    }

    work[0] = MAGMA_S_MAKE( lwkopt, 0 );

    for (igpu = 0; igpu < ngpu; ++igpu) {
        magma_setdevice(igpu);
        magma_event_destroy( event[igpu][0] );
        magma_event_destroy( event[igpu][1] );
        magma_queue_destroy( stream[igpu][0] );
        magma_queue_destroy( stream[igpu][1] );
        magma_free( dw[igpu] );
    }
    magma_setdevice( orig_dev );
    magmablasSetKernelStream( orig_stream );

    return *info;
} /* magma_sormqr */
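
/* Usage sketch (added; not part of the original source): the usual two-phase
   call pattern, first a workspace query with lwork = -1, then the actual
   application of Q**H from the left.  A and tau are assumed to come from a
   prior SGEQRF factorization. */
static magma_int_t example_sormqr_m(
    magma_int_t ngpu, magma_int_t m, magma_int_t n, magma_int_t k,
    float *A, magma_int_t lda, float *tau, float *C, magma_int_t ldc )
{
    magma_int_t info;
    float query;
    // 1) workspace query: the optimal lwork is returned in query
    magma_sormqr_m( ngpu, MagmaLeft, MagmaTrans, m, n, k,
                    A, lda, tau, C, ldc, &query, -1, &info );
    magma_int_t lwork = (magma_int_t) query;
    float *work;
    magma_smalloc_cpu( &work, lwork );
    if ( work == NULL )
        return MAGMA_ERR_HOST_ALLOC;
    // 2) apply Q**H to C from the left
    magma_sormqr_m( ngpu, MagmaLeft, MagmaTrans, m, n, k,
                    A, lda, tau, C, ldc, work, lwork, &info );
    magma_free_cpu( work );
    return info;
}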
Example no. 19
extern "C" int calc_numerical_range(magmaFloatComplex *M, magma_int_t M_lead_dim, float _from, float _step, magma_int_t _steps, magmaFloatComplex *pts)
{
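	/* NOTE (assumption): this routine relies on a file-scope magma_queue_t
	   named `queue` that must be created (magma_queue_create) before the
	   first call; std::exp on std::complex<float> also requires <complex>. */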
	magma_int_t idx = 0, rslt = 0;

	magmaFloatComplex p, scalar;
	std::complex<float> vtmp;

	float j;

	magmaFloatComplex *dA = nullptr;
	magmaFloatComplex *dAth = NULL, *dAthT = NULL,
				*dX = NULL, *dY = NULL;

	float *dE = NULL;
	//float *hE = NULL;


	//magma_int_t *ipiv = NULL;
	magma_int_t lda = M_lead_dim;
	//magma_int_t ldx = lda;
	magma_int_t info = 0;

	magma_int_t nb = 0;

	//magma_vec_t jobvl;
	//magma_vec_t jobvr;

	magmaFloatComplex *work = nullptr;
	magma_int_t  lwork = 0;

	float *rwork = nullptr;
	magma_int_t lrwork = 0;

	magma_int_t *iwork = nullptr;
	magma_int_t liwork = 0;

	nb = magma_get_cgehrd_nb( M_lead_dim );

	lwork = 2 * max(M_lead_dim + M_lead_dim*nb, 2 * M_lead_dim + M_lead_dim*M_lead_dim); // MagmaVec

	lrwork = 1 + 5 * M_lead_dim + 2 * M_lead_dim*M_lead_dim; // MagmaVec

	liwork = (3 + 5 * M_lead_dim); // MagmaVec

	magma_imalloc_cpu(&iwork, liwork);
	magma_smalloc_cpu(&rwork, lrwork);

	magma_cmalloc_pinned(&work, lwork);

	magma_cmalloc_pinned(&dA, lda*M_lead_dim);
	magma_cmalloc_pinned(&dAth, lda*M_lead_dim);
	magma_cmalloc_pinned(&dAthT, lda*M_lead_dim);

	magma_smalloc_pinned(&dE, M_lead_dim);
	//magma_smalloc_cpu(&hE, M_lead_dim);

	magma_cmalloc_pinned(&dX, M_lead_dim);
	magma_cmalloc_pinned(&dY, M_lead_dim);

	magma_csetmatrix(M_lead_dim, M_lead_dim, M, lda, dA, M_lead_dim, queue);

	// th=[0:resolution:2*pi]
	j = _from;
	for (idx = 0; idx < _steps; idx++)
	{
		//scalar = exp( 1im * -j);
		vtmp.real( 0.0f );
		vtmp.imag(  -j  );
		//vtmp = _FCbuild(0.0f, -j);
		//printf("vtmp = %f + i%f\n", vtmp._Val[0], vtmp._Val[1]);

		vtmp = exp(vtmp);
		scalar.x = vtmp.real();
		scalar.y = vtmp.imag();

		//printf("scalar = %f + i%f\n", scalar.x, scalar.y);

		magma_ccopy(lda * M_lead_dim, dA, 1, dAth, 1, queue);
		// Ath = exp(1im * -j) * As
		magma_cscal(lda * M_lead_dim, scalar, dAth, 1, queue);

		//magma_cprint_gpu(N, N, dA, lda);
		//magma_cprint_gpu(N, N, dAth, lda);

		// AthT = (Ath + Ath')
		magmablas_ctranspose_conj(M_lead_dim, M_lead_dim, dAth, M_lead_dim, dAthT, M_lead_dim, queue);
		magmablas_cgeadd(M_lead_dim, M_lead_dim, MAGMA_C_MAKE(1.0f, 0.0f), dAth, M_lead_dim, dAthT, M_lead_dim, queue);
		// AthT = AthT / 2
		magma_cscal(lda*M_lead_dim, MAGMA_C_MAKE(0.5f, 0.0f), dAthT, 1, queue);
		magma_sync_wtime(queue);

		//magma_cprint_gpu(M_lead_dim, M_lead_dim, dAthT, lda);

		// e, r = eig(AthT)
		rslt = magma_cheevd(MagmaVec, MagmaLower,
			M_lead_dim,
			dAthT, lda,
			dE,
			work, lwork,
			rwork, lrwork,
			iwork, liwork,
			&info);
		magma_sync_wtime(queue);

		//printf("magma_cheevd info=%d\n", info);

		//magma_cprint_gpu(M_lead_dim, M_lead_dim, dAthT, lda);
		//magma_sprint_gpu(M_lead_dim, 1, dE, M_lead_dim);

		//magma_sgetvector(M_lead_dim, dE, 1, hE, 1, queue);

		//printf("%f %f\n", hE[0], hE[1]);

		// p = r[:,s]' * A * r[:,s]
		// r = r[:,s]
		magma_ccopy(
			M_lead_dim,
			dAthT + (M_lead_dim*(M_lead_dim-1)), 1, // dAthT + (N), where (N) is a column offset
			dX, 1,
			queue);
		magma_sync_wtime(queue);

		//magma_cprint_gpu(M_lead_dim, 1, dX, M_lead_dim);

		// pp = A * r[:,s]
		magma_cgemv(MagmaNoTrans,
			M_lead_dim, M_lead_dim,
			MAGMA_C_MAKE(1.0f, 0.0f),
			dA, lda,
			dX, 1,
			MAGMA_C_MAKE(0.0f, 0.0f),
			dY, 1, queue);
		magma_sync_wtime(queue);

		//magma_cprint_gpu(M_lead_dim, 1, dY, M_lead_dim);

		// p = r' * pp
		p = magma_cdotc(M_lead_dim, dX, 1, dY, 1, queue);
		magma_sync_wtime(queue);

		pts[idx] = p;

		//printf("p = %f %fi\n", p.x, p.y);

		j += _step;
	} // end of for (idx = 0; idx < _steps; idx++)

	magma_free_pinned(dY);
	magma_free_pinned(dX);

	//magma_free_cpu(hE);
	magma_free_pinned(dE);

	magma_free_pinned(dAthT);
	magma_free_pinned(dAth);
	magma_free_pinned(dA);

	magma_free_pinned(work);

	magma_free_cpu(rwork);
	magma_free_cpu(iwork);
	//magma_free_cpu(w);
	//magma_free_cpu(A);

	return rslt;
}
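
/* Hypothetical driver (added; not part of the original source): sample the
   boundary of the numerical range of a small test matrix by sweeping th over
   [0, 2*pi).  The 2x2 matrix and the step count are illustrative only. */
static void example_numerical_range( void )
{
	const magma_int_t n = 2, steps = 64;
	const float two_pi = 6.2831853f;
	// column-major 2x2 test matrix
	magmaFloatComplex M[4] = { MAGMA_C_MAKE(1.0f, 0.0f), MAGMA_C_MAKE(0.0f, 0.0f),
	                           MAGMA_C_MAKE(0.0f, 0.0f), MAGMA_C_MAKE(0.0f, 1.0f) };
	magmaFloatComplex pts[64];
	calc_numerical_range( M, n, 0.0f, two_pi/steps, steps, pts );
	for (magma_int_t s = 0; s < steps; s++)
		printf( "%f %f\n", pts[s].x, pts[s].y );
}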
Example no. 20
extern "C" magma_int_t
magma_sgeqrf3_gpu( magma_int_t m, magma_int_t n,
                  float *dA,   magma_int_t ldda,
                  float *tau, float *dT,
                  magma_int_t *info )
{
/*  -- MAGMA (version 1.4.1) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       December 2013

    Purpose
    =======
    SGEQRF3 computes a QR factorization of a real M-by-N matrix A:
    A = Q * R.
    
    This version stores the triangular dT matrices used in
    the block QR factorization so that they can be applied directly (i.e.,
    without being recomputed) later. As a result, the application
    of Q is much faster. Also, the upper triangular matrices for V have 0s
    in them and the corresponding parts of the upper triangular R are
    stored separately in dT.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    dA      (input/output) REAL array on the GPU, dimension (LDDA,N)
            On entry, the M-by-N matrix A.
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further
            Details).

    LDDA    (input) INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).
            To benefit from coalesced memory accesses, LDDA must be
            divisible by 16.

    TAU     (output) REAL array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    dT      (workspace/output)  REAL array on the GPU,
            dimension (2*MIN(M, N) + (N+31)/32*32 )*NB,
            where NB can be obtained through magma_get_sgeqrf_nb(M).
            It starts with a MIN(M,N)*NB block that stores the triangular T
            matrices, followed by a MIN(M,N)*NB block holding the diagonal
            blocks of the R matrix. The rest of the array is used as workspace.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as a failed memory allocation.

    Further Details
    ===============
    The matrix Q is represented as a product of elementary reflectors

       Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).
    =====================================================================    */

    #define a_ref(a_1,a_2) (dA+(a_2)*(ldda) + (a_1))
    #define t_ref(a_1)     (dT+(a_1)*nb)
    #define d_ref(a_1)     (dT+(minmn+(a_1))*nb)
    #define dd_ref(a_1)    (dT+(2*minmn+(a_1))*nb)
    #define work_ref(a_1)  ( work + (a_1))
    #define hwork          ( work + (nb)*(m))

    magma_int_t i, k, minmn, old_i, old_ib, rows, cols;
    magma_int_t ib, nb;
    magma_int_t ldwork, lddwork, lwork, lhwork;
    float *work, *ut;

    /* check arguments */
    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    k = minmn = min(m,n);
    if (k == 0)
        return *info;

    nb = magma_get_sgeqrf_nb(m);

    lwork  = (m + n + nb)*nb;
    lhwork = lwork - m*nb;

    if (MAGMA_SUCCESS != magma_smalloc_pinned( &work, lwork )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }
    
    ut = hwork+nb*(n);
    memset( ut, 0, nb*nb*sizeof(float));

    magma_queue_t stream[2];
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );

    ldwork = m;
    lddwork= n;

    if ( (nb > 1) && (nb < k) ) {
        /* Use blocked code initially */
        old_i = 0; old_ib = nb;
        for (i = 0; i < k-nb; i += nb) {
            ib = min(k-i, nb);
            rows = m -i;
            magma_sgetmatrix_async( rows, ib,
                                    a_ref(i,i),  ldda,
                                    work_ref(i), ldwork, stream[1] );
            if (i>0){
                /* Apply H' to A(i:m,i+2*ib:n) from the left */
                cols = n-old_i-2*old_ib;
                magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise,
                                  m-old_i, cols, old_ib,
                                  a_ref(old_i, old_i         ), ldda, t_ref(old_i), nb,
                                  a_ref(old_i, old_i+2*old_ib), ldda, dd_ref(0),    lddwork);
                
                /* store the diagonal */
                magma_ssetmatrix_async( old_ib, old_ib,
                                        ut,           old_ib,
                                        d_ref(old_i), old_ib, stream[0] );
            }

            magma_queue_sync( stream[1] );
            lapackf77_sgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info);
            /* Form the triangular factor of the block reflector
               H = H(i) H(i+1) . . . H(i+ib-1) */
            lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr,
                              &rows, &ib,
                              work_ref(i), &ldwork, tau+i, hwork, &ib);

            /* Put 0s in the upper triangular part of a panel (and 1s on the
               diagonal); copy the upper triangular in ut.     */
            magma_queue_sync( stream[0] );
            ssplit_diag_block3(ib, work_ref(i), ldwork, ut);
            magma_ssetmatrix( rows, ib, work_ref(i), ldwork, a_ref(i,i), ldda );

            if (i + ib < n) {
                /* Send the triangular factor T to the GPU */
                magma_ssetmatrix( ib, ib, hwork, ib, t_ref(i), nb );

                if (i+nb < k-nb){
                    /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */
                    magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise,
                                      rows, ib, ib,
                                      a_ref(i, i   ), ldda, t_ref(i),  nb,
                                      a_ref(i, i+ib), ldda, dd_ref(0), lddwork);
                }
                else {
                    cols = n-i-ib;
                    magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise,
                                      rows, cols, ib,
                                      a_ref(i, i   ), ldda, t_ref(i),  nb,
                                      a_ref(i, i+ib), ldda, dd_ref(0), lddwork);
                    /* Fix the diagonal block */
                    magma_ssetmatrix( ib, ib, ut, ib, d_ref(i), ib );
                }
                old_i  = i;
                old_ib = ib;
            }
        }
    } else {
        i = 0;
    }

    /* Use unblocked code to factor the last or only block. */
    if (i < k) {
        ib   = n-i;
        rows = m-i;
        magma_sgetmatrix( rows, ib, a_ref(i, i), ldda, work, rows );
        lhwork = lwork - rows*ib;
        lapackf77_sgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info);
        
        magma_ssetmatrix( rows, ib, work, rows, a_ref(i, i), ldda );
    }

    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[1] );
    magma_free_pinned( work );
    return *info;

/*     End of MAGMA_SGEQRF3 */

} /* magma_sgeqrf3_gpu */
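
/* Sizing sketch (added; not part of the original source): allocate the dT
   workspace with the dimension given in the documentation above, then factor.
   dT is freed here for brevity; normally it is kept so Q can be applied
   later without recomputation. */
static magma_int_t example_sgeqrf3( magma_int_t m, magma_int_t n,
                                    float *dA, magma_int_t ldda, float *tau )
{
    magma_int_t info = 0;
    magma_int_t nb    = magma_get_sgeqrf_nb( m );
    magma_int_t minmn = min( m, n );
    float *dT;
    // T factors + R diagonal blocks + workspace, as documented above
    if (MAGMA_SUCCESS != magma_smalloc( &dT, (2*minmn + ((n + 31)/32)*32)*nb ))
        return MAGMA_ERR_DEVICE_ALLOC;
    magma_sgeqrf3_gpu( m, n, dA, ldda, tau, dT, &info );
    magma_free( dT );
    return info;
}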
Example no. 21
extern "C" int calc_bounding_box(magmaFloatComplex *M, magma_int_t M_lead_dim, float *wReEig, float *wImEig)
{
	magma_int_t rslt = 0;

	//magmaFloatComplex *AT = nullptr;
	magmaFloatComplex *dA = nullptr, *dAT = nullptr,
		*dreA = nullptr, *dimA = nullptr;

	float *dreEig = nullptr;
	float *dimEig = nullptr;

	//magma_int_t *ipiv = NULL;
	magma_int_t lda = M_lead_dim;
	//magma_int_t ldx = lda;
	magma_int_t info = 0;

	magma_int_t nb = 0;

	//magma_vec_t jobvl;
	//magma_vec_t jobvr;

	magmaFloatComplex *work = nullptr;
	magma_int_t  lwork = 0;

	float *rwork = nullptr;
	magma_int_t lrwork = 0;

	magma_int_t *iwork = nullptr;
	magma_int_t liwork = 0;


	nb = magma_get_cgehrd_nb( M_lead_dim );


	lwork = 2 * (M_lead_dim + M_lead_dim*nb); // MagmaNoVec
	//lwork = 2 * max(M_lead_dim + M_lead_dim*nb, 2*M_lead_dim + M_lead_dim*M_lead_dim); // MagmaVec

	lrwork = M_lead_dim; // MagmaNoVec
	//lrwork = 1 + 5 * M_lead_dim + 2*M_lead_dim*M_lead_dim; // MagmaVec

	liwork = 1; // MagmaNoVec
	//liwork = 3 + 5*M_lead_dim; // MagmaVec

	magma_imalloc_cpu(&iwork, liwork);

	magma_smalloc_cpu(&rwork, lrwork);

	//magma_cmalloc_cpu(&A, lda*M_lead_dim);
	//magma_cmalloc_cpu(&AT, lda*M_lead_dim);

	//magma_smalloc_cpu(&reEig, M_lead_dim);
	//magma_smalloc_cpu(&imEig, M_lead_dim);


	magma_cmalloc_pinned(&dA, lda*M_lead_dim);
	magma_cmalloc_pinned(&dAT, lda*M_lead_dim);

	magma_cmalloc_pinned(&dreA, lda*M_lead_dim);
	magma_cmalloc_pinned(&dimA, lda*M_lead_dim);

	//magma_cmalloc_pinned(&VL, lda*M_lead_dim);
	//magma_cmalloc_pinned(&VR, lda*M_lead_dim);

	magma_cmalloc_pinned(&work, lwork);

	magma_smalloc_pinned(&dreEig, M_lead_dim);
	magma_smalloc_pinned(&dimEig, M_lead_dim);


	//matrix_fillzero(AT, M_lead_dim);

	//vector_fillzero(reEig, M_lead_dim);
	//vector_fillzero(imEig, M_lead_dim);

	//prepare_matrix_2(M);

	magma_csetmatrix(M_lead_dim, M_lead_dim, M, lda, dA, M_lead_dim, queue);
	//magma_csetmatrix(M_lead_dim, M_lead_dim, AT, lda, dAT, M_lead_dim, queue);

	//magma_ssetvector(M_lead_dim, wReEig, 1, dreEig, 1, queue);
	//magma_ssetvector(M_lead_dim, wImEig, 1, dimEig, 1, queue);

	//magma_cprint_gpu(M_lead_dim, M_lead_dim, dA, lda);

	// reA = ( (A + A')/2.0 )  -- Hermitian part of A
	// A' (conjugate transpose; a plain magmablas_ctranspose would drop the conjugation)
	magmablas_ctranspose_conj(M_lead_dim, M_lead_dim, dA, M_lead_dim, dAT, M_lead_dim, queue);
	//magma_cprint_gpu(M_lead_dim, M_lead_dim, dAT, lda);

	// AT = A + A'
	magmablas_cgeadd(M_lead_dim, M_lead_dim, MAGMA_C_MAKE(1.0f, 0.0f), dA, M_lead_dim, dAT, M_lead_dim, queue);
	//magma_cprint_gpu(M_lead_dim, M_lead_dim, dAT, lda);
	// AT=AT*0.5
	magma_cscal(lda*M_lead_dim, MAGMA_C_MAKE(0.5f, 0.0f), dAT, 1, queue);
	//magma_cprint_gpu(M_lead_dim, M_lead_dim, dAT, lda);
	// reA = AT
	magma_ccopy(lda*M_lead_dim, dAT, 1, dreA, 1, queue);
	//magma_cprint_gpu(M_lead_dim, M_lead_dim, dreA, lda);
	magma_sync_wtime(queue);


	// imA = ( -1im*(A - A')/2.0 )  -- skew-Hermitian part of A
	// A' (conjugate transpose)
	magmablas_ctranspose_conj(M_lead_dim, M_lead_dim, dA, M_lead_dim, dAT, M_lead_dim, queue);
	//magma_cprint_gpu(M_lead_dim, M_lead_dim, dAT, lda);
	// A = A - A'
	magmablas_cgeadd(M_lead_dim, M_lead_dim, MAGMA_C_MAKE(-1.0f, 0.0f), dAT, M_lead_dim, dA, M_lead_dim, queue);
	// A=A*-1j*0.5
	magma_cscal(lda*M_lead_dim, MAGMA_C_MAKE(0.0f, -0.5f), dA, 1, queue);
	// imA = A
	magma_ccopy(lda*M_lead_dim, dA, 1, dimA, 1, queue);
	magma_sync_wtime(queue);

	//magma_cprint_gpu(M_lead_dim, M_lead_dim, dreA, lda);
	//magma_cprint_gpu(M_lead_dim, M_lead_dim, dimA, lda);


	// reEig::Vector=eigvals(reA)
	rslt = magma_cheevd(MagmaNoVec, MagmaLower,
		M_lead_dim,
		dreA, lda,
		dreEig,
		work, lwork,
		rwork, lrwork,
		iwork, liwork,
		&info);

	// imEig::Vector=eigvals(imA)
	rslt = magma_cheevd(MagmaNoVec, MagmaLower,
		M_lead_dim,
		dimA, lda,
		dimEig,
		work, lwork,
		rwork, lrwork,
		iwork, liwork,
		&info);


	//magma_sprint_gpu(M_lead_dim, 1, dreEig, M_lead_dim);
	//magma_sprint_gpu(M_lead_dim, 1, dimEig, M_lead_dim);


	magma_sgetvector(M_lead_dim, dreEig, 1, wReEig, 1, queue);
	//magma_sync_wtime(queue);

	magma_sgetvector(M_lead_dim, dimEig, 1, wImEig, 1, queue);
	//magma_sync_wtime(queue);

	/*
	maxReIdx = magma_isamax(M_lead_dim, dreEig, 1, queue) - 1;
	minReIdx = magma_isamin(M_lead_dim, dreEig, 1, queue) - 1;

	maxImIdx = magma_isamax(M_lead_dim, dimEig, 1, queue) - 1;
	minImIdx = magma_isamin(M_lead_dim, dimEig, 1, queue) - 1;


	printf("max re idx = %d\nmin re idx = %d\n", maxReIdx, minReIdx);
	printf("%f %f\n", wReEig[maxReIdx], wReEig[minReIdx]);

	printf("max im idx = %d\nmin im idx = %d\n", maxImIdx, minImIdx);
	printf("%f %f\n", wImEig[maxImIdx], wImEig[minImIdx]);
	*/

	//printf("test wReEig: %f %f\n", wReEig[0], wReEig[1]);
	//printf("test wImEig: %f %f\n", wImEig[0], wImEig[1]);


	magma_free_cpu(iwork);
	magma_free_cpu(rwork);
	//magma_free_cpu(AT);

	magma_free_pinned(dA);
	magma_free_pinned(dAT);

	magma_free_pinned(dreA);
	magma_free_pinned(dimA);

	magma_free_pinned(work);

	magma_free_pinned(dreEig);
	magma_free_pinned(dimEig);

	return rslt;
}
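
/* Hypothetical driver (added; not part of the original source): magma_cheevd
   returns eigenvalues in ascending order, so the first/last entries of
   wRe/wIm give the bounding box of the numerical range of M. */
static void example_bounding_box( magmaFloatComplex *M, magma_int_t n )
{
	float *wRe = nullptr, *wIm = nullptr;
	magma_smalloc_cpu( &wRe, n );
	magma_smalloc_cpu( &wIm, n );
	calc_bounding_box( M, n, wRe, wIm );
	printf( "Re in [%f, %f], Im in [%f, %f]\n",
	        wRe[0], wRe[n-1], wIm[0], wIm[n-1] );
	magma_free_cpu( wRe );
	magma_free_cpu( wIm );
}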
Example no. 22
extern "C" magma_int_t
magma_slobpcg( magma_s_sparse_matrix A, magma_s_solver_par *solver_par ) {


#define  residualNorms(i,iter)  ( residualNorms + (i) + (iter)*n )
#define magmablas_swap(x, y)    { pointer = x; x = y; y = pointer; }
#define hresidualNorms(i,iter)  (hresidualNorms + (i) + (iter)*n )

#define gramA(    m, n)   (gramA     + (m) + (n)*ldgram)
#define gramB(    m, n)   (gramB     + (m) + (n)*ldgram)
#define gevectors(m, n)   (gevectors + (m) + (n)*ldgram)
#define h_gramB(  m, n)   (h_gramB   + (m) + (n)*ldgram)

#define magma_s_bspmv_tuned(m, n, alpha, A, X, beta, AX)       {        \
            magmablas_stranspose( m, n, X, m, blockW, n );        	\
            magma_s_vector x, ax;                                       \
            x.memory_location = Magma_DEV;  x.num_rows = m*n;  x.nnz = m*n;  x.val = blockW; \
            ax.memory_location= Magma_DEV; ax.num_rows = m*n; ax.nnz = m*n; ax.val = AX;     \
            magma_s_spmv(alpha, A, x, beta, ax );                           \
            magmablas_stranspose( n, m, blockW, n, X, m );            		\
}
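
// The macro above applies A to a block of n vectors at once: it transposes
// the m-by-n block X into blockW, views the transposed block as one long
// vector for a single SpMV into AX, and then transposes blockW back so that
// X itself is left unchanged.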




//**************************************************************

    // Memory allocation for the eigenvectors, eigenvalues, and workspace
    solver_par->solver = Magma_LOBPCG;
    magma_int_t m = A.num_rows;
    magma_int_t n =(solver_par->num_eigenvalues);
    float *blockX = solver_par->eigenvectors;
    float *evalues = solver_par->eigenvalues;


    float *dwork, *hwork;
    float *blockP, *blockAP, *blockR, *blockAR, *blockAX, *blockW;
    float *gramA, *gramB, *gramM;
    float *gevectors, *h_gramB;

    float *pointer, *origX = blockX;
    float *eval_gpu;

    magma_int_t lwork = max( 2*n + n*magma_get_ssytrd_nb(n),
                             1 + 6*3*n + 2*3*n*3*n );

    magma_smalloc_pinned( &hwork   ,        lwork );
    magma_smalloc(        &blockAX   ,        m*n );
    magma_smalloc(        &blockAR   ,        m*n );
    magma_smalloc(        &blockAP   ,        m*n );
    magma_smalloc(        &blockR    ,        m*n );
    magma_smalloc(        &blockP    ,        m*n );
    magma_smalloc(        &blockW    ,        m*n );
    magma_smalloc(        &dwork     ,        m*n );
    magma_smalloc(        &eval_gpu  ,        3*n );




//**********************************************************+

    magma_int_t verbosity = 1;
    magma_int_t *iwork, liwork = 15*n+9;

    // === Set solver parameters ===
    float residualTolerance  = solver_par->epsilon;
    magma_int_t maxIterations = solver_par->maxiter;

    // === Set some constants & defaults ===
    float c_one = MAGMA_S_ONE, c_zero = MAGMA_S_ZERO;

    float *residualNorms, *condestGhistory, condestG;
    float *gevalues;
    magma_int_t *activeMask;

    // === Check some parameters for possible quick exit ===
    solver_par->info = 0;
    if (m < 2)
        solver_par->info = -1;
    else if (n > m)
        solver_par->info = -2;

    if (solver_par->info != 0) {
        magma_xerbla( __func__, -(solver_par->info) );
        return solver_par->info;
    }
    magma_int_t *info = &(solver_par->info); // local info variable;

    // === Allocate GPU memory for the residual norms' history ===
    magma_smalloc(&residualNorms, (maxIterations+1) * n);
    magma_malloc( (void **)&activeMask, (n+1) * sizeof(magma_int_t) );

    // === Allocate CPU work space ===
    magma_smalloc_cpu(&condestGhistory, maxIterations+1);
    magma_smalloc_cpu(&gevalues, 3 * n);
    magma_malloc_cpu((void **)&iwork, liwork * sizeof(magma_int_t));

    float *hW;
    magma_smalloc_pinned(&hW, n*n);
    magma_smalloc_pinned(&gevectors, 9*n*n);
    magma_smalloc_pinned(&h_gramB  , 9*n*n);

    // === Allocate GPU workspace ===
    magma_smalloc(&gramM, n * n);
    magma_smalloc(&gramA, 9 * n * n);
    magma_smalloc(&gramB, 9 * n * n);

#if defined(PRECISION_z) || defined(PRECISION_c)
    float *rwork;
    magma_int_t lrwork = 1 + 5*(3*n) + 2*(3*n)*(3*n);

    magma_smalloc_cpu(&rwork, lrwork);
#endif

    // === Set activemask to one ===
    for(int k = 0; k < n; k++)
        iwork[k] = 1;
    magma_setmatrix(n, 1, sizeof(magma_int_t), iwork, n, activeMask, n);

    magma_int_t gramDim, ldgram  = 3*n, ikind = 4;

    // === Make the initial vectors orthonormal ===
    magma_sgegqr_gpu(ikind, m, n, blockX, m, dwork, hwork, info );
    //magma_sorthomgs( m, n, blockX );

    magma_s_bspmv_tuned(m, n, c_one, A, blockX, c_zero, blockAX );

    // === Compute the Gram matrix = (X, AX) & its eigenstates ===
    magma_sgemm(MagmaTrans, MagmaNoTrans, n, n, m,
                c_one,  blockX, m, blockAX, m, c_zero, gramM, n);

    magma_ssyevd_gpu( MagmaVec, MagmaUpper,
                      n, gramM, n, evalues, hW, n, hwork, lwork,
#if defined(PRECISION_z) || defined(PRECISION_c)
                      rwork, lrwork,
#endif
                      iwork, liwork, info );

    // === Update  X =  X * evectors ===
    magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, n,
                c_one,  blockX, m, gramM, n, c_zero, blockW, m);
    magmablas_swap(blockW, blockX);

    // === Update AX = AX * evectors ===
    magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, n,
                c_one,  blockAX, m, gramM, n, c_zero, blockW, m);
    magmablas_swap(blockW, blockAX);

    condestGhistory[1] = 7.82;
    magma_int_t iterationNumber, cBlockSize, restart = 1, iter;

    //Chronometry
    real_Double_t tempo1, tempo2;
    magma_device_sync();
    tempo1=magma_wtime();
    // === Main LOBPCG loop ============================================================
    for(iterationNumber = 1; iterationNumber < maxIterations; iterationNumber++)
    {
        // === compute the residuals (R = Ax - x evalues )
        magmablas_slacpy( MagmaUpperLower, m, n, blockAX, m, blockR, m);

        /*
                    for(int i=0; i<n; i++){
                       magma_saxpy(m, MAGMA_S_MAKE(-evalues[i],0), blockX+i*m, 1, blockR+i*m, 1);
                    }
          */
#if defined(PRECISION_z) || defined(PRECISION_d)
        magma_dsetmatrix( 3*n, 1, evalues, 3*n, eval_gpu, 3*n );
#else
        magma_ssetmatrix( 3*n, 1, evalues, 3*n, eval_gpu, 3*n );
#endif

        magma_slobpcg_res( m, n, eval_gpu, blockX, blockR, eval_gpu);

        magmablas_snrm2_cols(m, n, blockR, m, residualNorms(0, iterationNumber));

        // === remove the residuals corresponding to already converged evectors
        magma_scompact(m, n, blockR, m,
                       residualNorms(0, iterationNumber), residualTolerance,
                       activeMask, &cBlockSize);

        if (cBlockSize == 0)
            break;

        // === apply a preconditioner P to the active residuals: R_new = P R_old
        // === for now set P to be identity (no preconditioner => nothing to be done )
        // magmablas_slacpy( MagmaUpperLower, m, cBlockSize, blockR, m, blockW, m);

        /*
        // === make the preconditioned residuals orthogonal to X
        magma_sgemm(MagmaTrans, MagmaNoTrans, n, cBlockSize, m,
                    c_one, blockX, m, blockR, m, c_zero, gramB(0,0), ldgram);
        magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, cBlockSize, n,
                    c_mone, blockX, m, gramB(0,0), ldgram, c_one, blockR, m);
        */

        // === make the active preconditioned residuals orthonormal
        magma_sgegqr_gpu(ikind, m, cBlockSize, blockR, m, dwork, hwork, info );
        //magma_sorthomgs( m, cBlockSize, blockR );

        // === compute AR
        magma_s_bspmv_tuned(m, cBlockSize, c_one, A, blockR, c_zero, blockAR );

        if (!restart) {
            // === compact P & AP as well
            magma_scompactActive(m, n, blockP,  m, activeMask);
            magma_scompactActive(m, n, blockAP, m, activeMask);

            /*
            // === make P orthogonal to X ?
            magma_sgemm(MagmaTrans, MagmaNoTrans, n, cBlockSize, m,
                        c_one, blockX, m, blockP, m, c_zero, gramB(0,0), ldgram);
            magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, cBlockSize, n,
                        c_mone, blockX, m, gramB(0,0), ldgram, c_one, blockP, m);

            // === make P orthogonal to R ?
            magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, cBlockSize, m,
                        c_one, blockR, m, blockP, m, c_zero, gramB(0,0), ldgram);
            magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, cBlockSize, cBlockSize,
                        c_mone, blockR, m, gramB(0,0), ldgram, c_one, blockP, m);
            */

            // === Make P orthonormal & properly change AP (without multiplication by A)
            magma_sgegqr_gpu(ikind, m, cBlockSize, blockP, m, dwork, hwork, info );
            //magma_sorthomgs( m, cBlockSize, blockP );

            //magma_s_bspmv_tuned(m, cBlockSize, c_one, A, blockP, c_zero, blockAP );
            magma_ssetmatrix( cBlockSize, cBlockSize, hwork, cBlockSize, dwork, cBlockSize);


            //  magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit,
            //               m, cBlockSize, c_one, dwork, cBlockSize, blockAP, m);

            // replacement according to Stan
#if defined(PRECISION_s) || defined(PRECISION_d)
            magmablas_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit,
                             m, cBlockSize, c_one, dwork, cBlockSize, blockAP, m);
#else
            magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, m,
                         cBlockSize, c_one, dwork, cBlockSize, blockAP, m);
#endif
        }

        iter = max(1,iterationNumber-10- (int)(log(1.*cBlockSize)));
        float condestGmean = 0.;
        for(int i = 0; i<iterationNumber-iter+1; i++)
            condestGmean += condestGhistory[i];
        condestGmean = condestGmean / (iterationNumber-iter+1);

        if (restart)
            gramDim = n+cBlockSize;
        else
            gramDim = n+2*cBlockSize;

        /* --- The Rayleigh-Ritz method for [X R P] -----------------------
           [ X R P ]'  [AX  AR  AP] y = evalues [ X R P ]' [ X R P ], i.e.,

                  GramA                                 GramB
            / X'AX  X'AR  X'AP \                 / X'X  X'R  X'P \
           |  R'AX  R'AR  R'AP  | y   = evalues |  R'X  R'R  R'P  |
            \ P'AX  P'AR  P'AP /                 \ P'X  P'R  P'P /
           -----------------------------------------------------------------   */

        // === assemble GramB; first, set it to I
        magmablas_slaset(MagmaFull, ldgram, ldgram, c_zero, c_one, gramB, ldgram);  // identity

        if (!restart) {
            magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, n, m,
                        c_one, blockP, m, blockX, m, c_zero, gramB(n+cBlockSize,0), ldgram);
            magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, cBlockSize, m,
                        c_one, blockP, m, blockR, m, c_zero, gramB(n+cBlockSize,n), ldgram);
        }
        magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, n, m,
                    c_one, blockR, m, blockX, m, c_zero, gramB(n,0), ldgram);

        // === get GramB from the GPU to the CPU and compute its eigenvalues only
        magma_sgetmatrix(gramDim, gramDim, gramB, ldgram, h_gramB, ldgram);
        lapackf77_ssyev("N", "L", &gramDim, h_gramB, &ldgram, gevalues,
                        hwork, &lwork,
#if defined(PRECISION_z) || defined(PRECISION_c)
                        rwork,
#endif
                        info);

        // === check stability criteria if we need to restart
        condestG = log10( gevalues[gramDim-1]/gevalues[0] ) + 1.;
        if ((condestG/condestGmean>2 && condestG>2) || condestG>8) {
            // Steepest descent restart for stability
            restart=1;
            printf("restart at step #%d\n", (int) iterationNumber);
        }

        // === assemble GramA; first, set it to I
        magmablas_slaset(MagmaFull, ldgram, ldgram, c_zero, c_one, gramA, ldgram);  // identity

        magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, n, m,
                    c_one, blockR, m, blockAX, m, c_zero, gramA(n,0), ldgram);
        magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, cBlockSize, m,
                    c_one, blockR, m, blockAR, m, c_zero, gramA(n,n), ldgram);

        if (!restart) {
            magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, n, m,
                        c_one, blockP, m, blockAX, m, c_zero,
                        gramA(n+cBlockSize,0), ldgram);
            magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, cBlockSize, m,
                        c_one, blockP, m, blockAR, m, c_zero,
                        gramA(n+cBlockSize,n), ldgram);
            magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, cBlockSize, m,
                        c_one, blockP, m, blockAP, m, c_zero,
                        gramA(n+cBlockSize,n+cBlockSize), ldgram);
        }

        /*
        // === Compute X' AX or just use the eigenvalues below ?
        magma_sgemm(MagmaTrans, MagmaNoTrans, n, n, m,
                    c_one, blockX, m, blockAX, m, c_zero,
                    gramA(0,0), ldgram);
        */

        if (restart==0) {
            magma_sgetmatrix(gramDim, gramDim, gramA, ldgram, gevectors, ldgram);
        }
        else {
            gramDim = n+cBlockSize;
            magma_sgetmatrix(gramDim, gramDim, gramA, ldgram, gevectors, ldgram);
        }

        for(int k=0; k<n; k++)
            *gevectors(k,k) = MAGMA_S_MAKE(evalues[k], 0);

        // === the previous eigensolver destroyed what is in h_gramB => must copy it again
        magma_sgetmatrix(gramDim, gramDim, gramB, ldgram, h_gramB, ldgram);

        magma_int_t itype = 1;
        lapackf77_ssygvd(&itype, "V", "L", &gramDim,
                         gevectors, &ldgram, h_gramB, &ldgram,
                         gevalues, hwork, &lwork,
#if defined(PRECISION_z) || defined(PRECISION_c)
                         rwork, &lrwork,
#endif
                         iwork, &liwork, info);

        for(int k=0; k<n; k++)
            evalues[k] = gevalues[k];

        // === copy back the result to gramA on the GPU and use it for the updates
        magma_ssetmatrix(gramDim, gramDim, gevectors, ldgram, gramA, ldgram);

        if (restart == 0) {
            // === contribution from P to the new X (in new search direction P)
            magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize,
                        c_one, blockP, m, gramA(n+cBlockSize,0), ldgram, c_zero, dwork, m);
            magmablas_swap(dwork, blockP);

            // === contribution from R to the new X (in new search direction P)
            magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize,
                        c_one, blockR, m, gramA(n,0), ldgram, c_one, blockP, m);

            // === corresponding contribution from AP to the new AX (in AP)
            magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize,
                        c_one, blockAP, m, gramA(n+cBlockSize,0), ldgram, c_zero, dwork, m);
            magmablas_swap(dwork, blockAP);

            // === corresponding contribution from AR to the new AX (in AP)
            magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize,
                        c_one, blockAR, m, gramA(n,0), ldgram, c_one, blockAP, m);
        }
        else {
            // === contribution from R (only) to the new X
            magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize,
                        c_one, blockR, m, gramA(n,0), ldgram, c_zero, blockP, m);

            // === corresponding contribution from AR (only) to the new AX
            magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize,
                        c_one, blockAR, m, gramA(n,0), ldgram, c_zero, blockAP, m);
        }

        // === contribution from old X to the new X + the new search direction P
        magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, n,
                    c_one, blockX, m, gramA, ldgram, c_zero, dwork, m);
        magmablas_swap(dwork, blockX);
        //magma_saxpy(m*n, c_one, blockP, 1, blockX, 1);
        magma_slobpcg_maxpy( m, n, blockP, blockX );


        // === corresponding contribution from old AX to new AX + AP
        magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, n,
                    c_one, blockAX, m, gramA, ldgram, c_zero, dwork, m);
        magmablas_swap(dwork, blockAX);
        //magma_saxpy(m*n, c_one, blockAP, 1, blockAX, 1);
        magma_slobpcg_maxpy( m, n, blockAP, blockAX );

        condestGhistory[iterationNumber+1]=condestG;
        if (verbosity==1) {
            // float res;
            // magma_sgetmatrix(1, 1,
            //                  (float*)residualNorms(0, iterationNumber), 1,
            //                  (float*)&res, 1);
            //
            //  printf("Iteration %4d, CBS %4d, Residual: %10.7f\n",
            //         iterationNumber, cBlockSize, res);
            printf("%4d-%2d ", (int) iterationNumber, (int) cBlockSize);
            magma_sprint_gpu(1, n, residualNorms(0, iterationNumber), 1);
        }

        restart = 0;
    }   // === end for iterationNumber = 1,maxIterations =======================


    // fill solver info
    magma_device_sync();
    tempo2 = magma_wtime();
    solver_par->runtime = (real_Double_t)(tempo2 - tempo1);
    solver_par->numiter = iterationNumber;
    if ( solver_par->numiter < solver_par->maxiter ) {
        solver_par->info = 0;
    } else if ( solver_par->init_res > solver_par->final_res ) {
        solver_par->info = -2;
    } else {
        solver_par->info = -1;
    }

    // =============================================================================
    // === postprocessing;
    // =============================================================================

    // === compute the real AX and corresponding eigenvalues
    magma_s_bspmv_tuned(m, n, c_one, A, blockX, c_zero, blockAX );
    magma_sgemm(MagmaTrans, MagmaNoTrans, n, n, m,
                c_one,  blockX, m, blockAX, m, c_zero, gramM, n);
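    // gramM = X'*A*X is the n-by-n Rayleigh-quotient matrix of the computed
    // basis; the eigensolve below returns the final Ritz values in gevalues
    // and overwrites gramM with the eigenvectors used to rotate X and AX.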

    magma_ssyevd_gpu( MagmaVec, MagmaUpper,
                      n, gramM, n, gevalues, dwork, n, hwork, lwork,
#if defined(PRECISION_z) || defined(PRECISION_c)
                      rwork, lrwork,
#endif
                      iwork, liwork, info );

    for(int k=0; k<n; k++)
        evalues[k] = gevalues[k];

    // === update X = X * evectors
    magmablas_swap(blockX, dwork);
    magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, n,
                c_one, dwork, m, gramM, n, c_zero, blockX, m);

    // === update AX = AX * evectors to compute the final residual
    magmablas_swap(blockAX, dwork);
    magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, n,
                c_one, dwork, m, gramM, n, c_zero, blockAX, m);

    // === compute R = AX - evalues X
    magmablas_slacpy( MagmaUpperLower, m, n, blockAX, m, blockR, m);
    for(int i=0; i<n; i++)
        magma_saxpy(m, MAGMA_S_MAKE(-evalues[i], 0), blockX+i*m, 1, blockR+i*m, 1);

    // === residualNorms[iterationNumber] = || R ||
    magmablas_snrm2_cols(m, n, blockR, m, residualNorms(0, iterationNumber));

    // === restore blockX if needed
    if (blockX != origX)
        magmablas_slacpy( MagmaUpperLower, m, n, blockX, m, origX, m);

    printf("Eigenvalues:\n");
    for(int i=0; i<n; i++)
        printf("%e  ", evalues[i]);
    printf("\n\n");

    printf("Final residuals:\n");
    magma_sprint_gpu(1, n, residualNorms(0, iterationNumber), 1);
    printf("\n\n");

    //=== Print residual history in a file for plotting ====
    float *hresidualNorms;
    magma_smalloc_cpu(&hresidualNorms, (iterationNumber+1) * n);
    magma_sgetmatrix(n, iterationNumber,
                     (float*)residualNorms, n,
                     (float*)hresidualNorms, n);

    printf("Residuals are stored in file residualNorms\n");
    printf("Plot the residuals using: myplot \n");

    FILE *residuals_file;
    residuals_file = fopen("residualNorms", "w");
    for(int i=1; i<iterationNumber; i++) {
        for(int j = 0; j<n; j++)
            fprintf(residuals_file, "%f ", *hresidualNorms(j,i));
        fprintf(residuals_file, "\n");
    }
    fclose(residuals_file);
    magma_free_cpu(hresidualNorms);

    // === free work space
    magma_free(     residualNorms   );
    magma_free_cpu( condestGhistory );
    magma_free_cpu( gevalues        );
    magma_free_cpu( iwork           );

    magma_free_pinned( hW           );
    magma_free_pinned( gevectors    );
    magma_free_pinned( h_gramB      );

    magma_free(     gramM           );
    magma_free(     gramA           );
    magma_free(     gramB           );
    magma_free(  activeMask         );

    magma_free(     blockAX    );
    magma_free(     blockAR    );
    magma_free(     blockAP    );
    magma_free(     blockR    );
    magma_free(     blockP    );
    magma_free(     blockW    );
    magma_free(     dwork    );
    magma_free(     eval_gpu    );

    magma_free_pinned( hwork    );


#if defined(PRECISION_z) || defined(PRECISION_c)
    magma_free_cpu( rwork           );
#endif

    return MAGMA_SUCCESS;
}
Esempio n. 23
0
/**
    Purpose
    -------
    SGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
        A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.

    Arguments
    ---------
    @param[in]
    ngpu    INTEGER
            Number of GPUs to use. ngpu > 0.

    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    d_lA    REAL array of pointers on the GPU, dimension (ngpu).
            On entry, the M-by-N matrix A distributed over GPUs
            (d_lA[d] points to the local matrix on d-th GPU).
            It uses a 1D block-column cyclic format with block size nb,
            and each local matrix is stored by column.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    @param[in]
    ldda     INTEGER
            The leading dimension of the array d_lA.  LDDA >= max(1,M).

    @param[out]
    ipiv    INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as a memory allocation failure.
      -     > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    @ingroup magma_sgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_sgetrf_mgpu(
    magma_int_t ngpu,
    magma_int_t m, magma_int_t n,
    magmaFloat_ptr d_lA[], magma_int_t ldda, magma_int_t *ipiv,
    magma_int_t *info)
{
    magma_int_t nb, n_local[MagmaMaxGPUs];
    magma_int_t maxm;
    magma_int_t i, j, d, lddat, lddwork;
    float *d_lAT[MagmaMaxGPUs];
    float *d_panel[MagmaMaxGPUs], *work;
    magma_queue_t queues[MagmaMaxGPUs][2];

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -2;
    else if (n < 0)
        *info = -3;
    else if (ldda < max(1,m))
        *info = -5;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* create the queues */
    for( d=0; d < ngpu; d++ ) {
        magma_queue_create( d, &queues[d][0] );
        magma_queue_create( d, &queues[d][1] );
    }

    /* Function Body */
    nb = magma_get_sgetrf_nb( m, n );

    if (nb <= 1 || nb >= n) {
        /* Use CPU code. */
        magma_smalloc_cpu( &work, m * n );
        if ( work == NULL ) {
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        magma_sgetmatrix( m, n, d_lA[0], ldda, work, m, queues[0][0] );
        lapackf77_sgetrf(&m, &n, work, &m, ipiv, info);
        magma_ssetmatrix( m, n, work, m, d_lA[0], ldda, queues[0][0] );
        magma_free_cpu(work);
    } else {
        /* Use hybrid blocked code. */
        magma_device_t orig_dev;
        magma_getdevice( &orig_dev );
        
        maxm = magma_roundup( m, 32 );
        if ( ngpu > ceil((float)n/nb) ) {
            printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu );
            *info = -1;
            return *info;
        }

        /* allocate workspace for each GPU */
        lddat = magma_ceildiv( n, nb );        /* number of block columns         */
        lddat = magma_ceildiv( lddat, ngpu );  /* number of block columns per GPU */
        lddat = nb*lddat;                      /* number of columns per GPU       */
        lddat = magma_roundup( lddat, 32 );    /* make it a multiple of 32        */
        for (i=0; i < ngpu; i++) {
            magma_setdevice(i);
            
            /* local-n and local-ld */
            n_local[i] = ((n/nb)/ngpu)*nb;
            if (i < (n/nb)%ngpu)
                n_local[i] += nb;
            else if (i == (n/nb)%ngpu)
                n_local[i] += n%nb;
            
            /* workspaces */
            if (MAGMA_SUCCESS != magma_smalloc( &d_panel[i], (3+ngpu)*nb*maxm )) {
                for( j=0; j < i; j++ ) {
                    magma_setdevice(j);
                    magma_free( d_panel[j] );
                    magma_free( d_lAT[j]   );
                }
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }
            
            /* local-matrix storage */
            if (MAGMA_SUCCESS != magma_smalloc( &d_lAT[i], lddat*maxm )) {
                for( j=0; j <= i; j++ ) {
                    magma_setdevice(j);
                    magma_free( d_panel[j] );
                }
                for( j=0; j < i; j++ ) {
                    magma_setdevice(j);
                    magma_free( d_lAT[j] );
                }
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }
            
            magmablas_stranspose( m, n_local[i], d_lA[i], ldda, d_lAT[i], lddat, queues[i][1] );
        }
        for (i=0; i < ngpu; i++) {
            magma_setdevice(i);
            magma_queue_sync(queues[i][1]);  /* wait for the transposes enqueued above */
        }
        magma_setdevice(0);

        /* cpu workspace */
        lddwork = maxm;
        if (MAGMA_SUCCESS != magma_smalloc_pinned( &work, lddwork*nb*ngpu )) {
            for (i=0; i < ngpu; i++ ) {
                magma_setdevice(i);
                magma_free( d_panel[i] );
                magma_free( d_lAT[i]   );
            }
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }

        /* calling multi-gpu interface with allocated workspaces and queues */
        magma_sgetrf2_mgpu(ngpu, m, n, nb, 0, d_lAT, lddat, ipiv, d_panel, work, maxm,
                           queues, info);

        /* clean up */
        for( d=0; d < ngpu; d++ ) {
            magma_setdevice(d);
            
            /* save on output */
            magmablas_stranspose( n_local[d], m, d_lAT[d], lddat, d_lA[d], ldda, queues[d][0] );
            magma_queue_sync(queues[d][0]);
            magma_queue_sync(queues[d][1]);

            magma_free( d_lAT[d]   );
            magma_free( d_panel[d] );
        } /* end of for d=1,..,ngpu */
        magma_setdevice( orig_dev );
        magma_free_pinned( work );
    }

    /* clean up */
    for( d=0; d < ngpu; d++ ) {
        magma_setdevice(d);
        magma_queue_destroy( queues[d][0] );
        magma_queue_destroy( queues[d][1] );
    }

    return *info;
}
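
A minimal usage sketch for the routine above (an illustration, not part of the MAGMA sources): it assumes the same MAGMA 2.x-style API as the example (two-argument magma_get_sgetrf_nb), and it elides the host-to-device copies and most error handling.

/* Sketch: factor an m-by-n matrix distributed over ngpu devices in the
   1D block-column-cyclic layout expected by magma_sgetrf_mgpu. */
#include "magma_v2.h"

magma_int_t factor_mgpu( magma_int_t ngpu, magma_int_t m, magma_int_t n,
                         magma_int_t *ipiv, magma_int_t *info )
{
    magma_int_t nb   = magma_get_sgetrf_nb( m, n );
    magma_int_t ldda = magma_roundup( m, 32 );     /* padded leading dimension */
    magmaFloat_ptr d_lA[MagmaMaxGPUs];

    for( magma_int_t d = 0; d < ngpu; ++d ) {
        /* local column count on device d (same formula as in the routine) */
        magma_int_t n_loc = ((n/nb)/ngpu)*nb;
        if      ( d <  (n/nb)%ngpu ) n_loc += nb;
        else if ( d == (n/nb)%ngpu ) n_loc += n%nb;

        magma_setdevice( d );
        if ( MAGMA_SUCCESS != magma_smalloc( &d_lA[d], ldda*n_loc ) )
            return MAGMA_ERR_DEVICE_ALLOC;
        /* ... copy the local block columns of A into d_lA[d] here ... */
    }

    magma_sgetrf_mgpu( ngpu, m, n, d_lA, ldda, ipiv, info );

    for( magma_int_t d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magma_free( d_lA[d] );
    }
    return *info;
}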
Esempio n. 24
0
/**
    Purpose
    -------
    SLAEX3 finds the roots of the secular equation, as defined by the
    values in D, W, and RHO, between 1 and K.  It makes the
    appropriate calls to SLAED4 and then updates the eigenvectors by
    multiplying the matrix of eigenvectors of the pair of eigensystems
    being combined by the matrix of eigenvectors of the K-by-K system
    which is solved here.

    It is used in the last step, when only a part of the eigenvectors
    is required. It computes only the required part of the eigenvectors;
    the rest is not used.
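
    For reference, SLAED4 computes the j-th updated eigenvalue as the j-th
    root lambda_j of the secular equation

        f(lambda) = 1 + rho * sum_{i=1..K} w(i)^2 / ( dlamda(i) - lambda ) = 0,

    where dlamda holds the poles (the old roots) and w the
    deflation-adjusted updating vector.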

    This code makes very mild assumptions about floating point
    arithmetic. It will work on machines with a guard digit in
    add/subtract, or on those binary machines without guard digits
    which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
    It could conceivably fail on hexadecimal or decimal machines
    without guard digits, but we know of none.

    Arguments
    ---------
    @param[in]
    ngpu    INTEGER
            Number of GPUs to use. ngpu > 0.

    @param[in]
    k       INTEGER
            The number of terms in the rational function to be solved by
            SLAED4.  K >= 0.

    @param[in]
    n       INTEGER
            The number of rows and columns in the Q matrix.
            N >= K (deflation may result in N > K).

    @param[in]
    n1      INTEGER
            The location of the last eigenvalue in the leading submatrix.
            min(1,N) <= N1 <= N/2.

    @param[out]
    d       REAL array, dimension (N)
            D(I) contains the updated eigenvalues for
            1 <= I <= K.

    @param[out]
    Q       REAL array, dimension (LDQ,N)
            Initially the first K columns are used as workspace.
            On output the columns ??? to ??? contain
            the updated eigenvectors.

    @param[in]
    ldq     INTEGER
            The leading dimension of the array Q.  LDQ >= max(1,N).

    @param[in]
    rho     REAL
            The value of the parameter in the rank one update equation.
            RHO >= 0 required.

    @param[in,out]
    dlamda  REAL array, dimension (K)
            The first K elements of this array contain the old roots
            of the deflated updating problem.  These are the poles
            of the secular equation. May be changed on output by
            having lowest order bit set to zero on Cray X-MP, Cray Y-MP,
            Cray-2, or Cray C-90, as described above.

    @param[in]
    Q2      REAL array, dimension (LDQ2, N)
            The first K columns of this matrix contain the non-deflated
            eigenvectors for the split problem.

    @param[in]
    indx    INTEGER array, dimension (N)
            The permutation used to arrange the columns of the deflated
            Q matrix into three groups (see SLAED2).
            The rows of the eigenvectors found by SLAED4 must be likewise
            permuted before the matrix multiply can take place.

    @param[in]
    ctot    INTEGER array, dimension (4)
            A count of the total number of the various types of columns
            in Q, as described in INDX.  The fourth column type is any
            column which has been deflated.

    @param[in,out]
    w       REAL array, dimension (K)
            The first K elements of this array contain the components
            of the deflation-adjusted updating vector. Destroyed on
            output.

    @param
    s       (workspace) REAL array, dimension (N1 + 1)*K
            Will contain the eigenvectors of the repaired matrix which
            will be multiplied by the previously accumulated eigenvectors
            to update the system.

    @param[out]
    indxq   INTEGER array, dimension (N)
            On exit, the permutation which will reintegrate the
            subproblems back into sorted order,
            i.e. D( INDXQ( I = 1, N ) ) will be in ascending order.
    
    @param
    dwork   (device workspaces) REAL array of arrays,
            dimension (ngpu).
            If ngpu = 1, the dimension of the first workspace
            should be (3*N*N/2+3*N); otherwise, each of the ngpu
            workspaces should have size
            ceil((N-N1) * (N-N1) / floor(ngpu/2)) +
            NB * ((N-N1) + (N-N1) / floor(ngpu/2))
    
    @param
    queues  (device queues) magma_queue_t array,
            dimension (MagmaMaxGPUs,2)

    @param[in]
    range   magma_range_t
      -     = MagmaRangeAll: all eigenvalues will be found.
      -     = MagmaRangeV:   all eigenvalues in the half-open interval (VL,VU]
                             will be found.
      -     = MagmaRangeI:   the IL-th through IU-th eigenvalues will be found.
            TODO verify range, vl, vu, il, iu -- copied from slaex1.

    @param[in]
    vl      REAL
    @param[in]
    vu      REAL
            if RANGE=MagmaRangeV, the lower and upper bounds of the interval to
            be searched for eigenvalues. VL < VU.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeI.

    @param[in]
    il      INTEGER
    @param[in]
    iu      INTEGER
            if RANGE=MagmaRangeI, the indices (in ascending order) of the
            smallest and largest eigenvalues to be returned.
            1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeV.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit.
      -     < 0:  if INFO = -i, the i-th argument had an illegal value.
      -     > 0:  if INFO = 1, an eigenvalue did not converge

    Further Details
    ---------------
    Based on contributions by
    Jeff Rutter, Computer Science Division, University of California
    at Berkeley, USA
    Modified by Francoise Tisseur, University of Tennessee.

    @ingroup magma_ssyev_aux
    ********************************************************************/
extern "C" magma_int_t
magma_slaex3_m(
    magma_int_t ngpu,
    magma_int_t k, magma_int_t n, magma_int_t n1, float *d,
    float *Q, magma_int_t ldq, float rho,
    float *dlamda, float *Q2, magma_int_t *indx,
    magma_int_t *ctot, float *w, float *s, magma_int_t *indxq,
    magmaFloat_ptr dwork[],
    magma_queue_t queues[MagmaMaxGPUs][2],
    magma_range_t range, float vl, float vu, magma_int_t il, magma_int_t iu,
    magma_int_t *info )
{
#define Q(i_,j_) (Q + (i_) + (j_)*ldq)

#define dQ2(id)    (dwork[id])
#define dS(id, ii) (dwork[id] + n2*n2_loc + (ii)*(n2*nb))
#define dQ(id, ii) (dwork[id] + n2*n2_loc +    2*(n2*nb) + (ii)*(n2_loc*nb))

    if (ngpu == 1) {
        magma_setdevice(0);
        magma_slaex3(k, n, n1, d, Q, ldq, rho,
                     dlamda, Q2, indx, ctot, w, s, indxq,
                     *dwork, range, vl, vu, il, iu, info );
        return *info;
    }
    float d_one  = 1.;
    float d_zero = 0.;
    magma_int_t ione = 1;
    magma_int_t ineg_one = -1;

    magma_int_t iil, iiu, rk;
    magma_int_t n1_loc, n2_loc, ib, nb, ib2, igpu;
    magma_int_t ni_loc[MagmaMaxGPUs];

    magma_int_t i, ind, iq2, j, n12, n2, n23, tmp;
    float temp;
    magma_int_t alleig, valeig, indeig;

    alleig = (range == MagmaRangeAll);
    valeig = (range == MagmaRangeV);
    indeig = (range == MagmaRangeI);

    *info = 0;

    if (k < 0)
        *info=-1;
    else if (n < k)
        *info=-2;
    else if (ldq < max(1,n))
        *info=-6;
    else if (! (alleig || valeig || indeig))
        *info = -15;
    else {
        if (valeig) {
            if (n > 0 && vu <= vl)
                *info = -17;
        }
        else if (indeig) {
            if (il < 1 || il > max(1,n))
                *info = -18;
            else if (iu < min(n,il) || iu > n)
                *info = -19;
        }
    }

    if (*info != 0) {
        magma_xerbla(__func__, -(*info));
        return *info;
    }

    // Quick return if possible
    if (k == 0)
        return *info;

    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );
    
    /*
     Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can
     be computed with high relative accuracy (barring over/underflow).
     This is a problem on machines without a guard digit in
     add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
     The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I),
     which on any of these machines zeros out the bottommost
     bit of DLAMDA(I) if it is 1; this makes the subsequent
     subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation
     occurs. On binary machines with a guard digit (almost all
     machines) it does not change DLAMDA(I) at all. On hexadecimal
     and decimal machines with a guard digit, it slightly
     changes the bottommost bits of DLAMDA(I). It does not account
     for hexadecimal or decimal machines without guard digits
     (we know of none). We use a subroutine call to compute
     2*DLAMDA(I) to prevent optimizing compilers from eliminating
     this code.*/

//#define CHECK_CPU
#ifdef CHECK_CPU
    float *hwS[2][MagmaMaxGPUs], *hwQ[2][MagmaMaxGPUs], *hwQ2[MagmaMaxGPUs];
    #define hQ2(id) (hwQ2[id])
    #define hS(id, ii) (hwS[ii][id])
    #define hQ(id, ii) (hwQ[ii][id])
#endif
    n2 = n - n1;

    n12 = ctot[0] + ctot[1];
    n23 = ctot[1] + ctot[2];

    iq2 = n1 * n12;
    //lq2 = iq2 + n2 * n23;

    n1_loc = (n1-1) / (ngpu/2) + 1;
    n2_loc = (n2-1) / (ngpu/2) + 1;
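    // rows per device for each half: GPUs are used in pairs, even-indexed
    // devices holding row-slices of the top (n1) block of Q2 and odd-indexed
    // devices row-slices of the bottom (n2) block, so each half is spread
    // over ngpu/2 devices.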

    nb = magma_get_slaex3_m_nb();

    if (n1 >= magma_get_slaex3_m_k()) {
#ifdef CHECK_CPU
        for (igpu = 0; igpu < ngpu; ++igpu) {
            magma_smalloc_pinned( &(hwS[0][igpu]), n2*nb );
            magma_smalloc_pinned( &(hwS[1][igpu]), n2*nb );
            magma_smalloc_pinned( &(hwQ2[igpu]), n2*n2_loc );
            magma_smalloc_pinned( &(hwQ[0][igpu]), n2_loc*nb );
            magma_smalloc_pinned( &(hwQ[1][igpu]), n2_loc*nb );
        }
#endif
        for (igpu = 0; igpu < ngpu-1; igpu += 2) {
            ni_loc[igpu] = min(n1_loc, n1 - igpu/2 * n1_loc);
#ifdef CHECK_CPU
            lapackf77_slacpy("A", &ni_loc[igpu], &n12, Q2+n1_loc*(igpu/2), &n1, hQ2(igpu), &n1_loc);
#endif
            magma_setdevice(igpu);
            magma_ssetmatrix_async( ni_loc[igpu], n12,
                                    Q2+n1_loc*(igpu/2), n1,
                                    dQ2(igpu),          n1_loc, queues[igpu][0] );
            ni_loc[igpu+1] = min(n2_loc, n2 - igpu/2 * n2_loc);
#ifdef CHECK_CPU
            lapackf77_slacpy("A", &ni_loc[igpu+1], &n23, Q2+iq2+n2_loc*(igpu/2), &n2, hQ2(igpu+1), &n2_loc);
#endif
            magma_setdevice(igpu+1);
            magma_ssetmatrix_async( ni_loc[igpu+1], n23,
                                    Q2+iq2+n2_loc*(igpu/2), n2,
                                    dQ2(igpu+1),            n2_loc, queues[igpu+1][0] );
        }
    }


#ifdef _OPENMP
    /////////////////////////////////////////////////////////////////////////////////
    //openmp implementation
    /////////////////////////////////////////////////////////////////////////////////
    magma_timer_t time=0;
    timer_start( time );

#pragma omp parallel private(i, j, tmp, temp)
    {
        magma_int_t id = omp_get_thread_num();
        magma_int_t tot = omp_get_num_threads();

        magma_int_t ib = (  id   * k) / tot; //start index of local loop
        magma_int_t ie = ((id+1) * k) / tot; //end index of local loop
        magma_int_t ik = ie - ib;           //number of local indices

        for (i = ib; i < ie; ++i)
            dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i];

        for (j = ib; j < ie; ++j) {
            magma_int_t tmpp=j+1;
            magma_int_t iinfo = 0;
            lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo);
            // If the zero finder fails, the computation is terminated.
            if (iinfo != 0) {
#pragma omp critical (info)
                *info = iinfo;
                break;
            }
        }

#pragma omp barrier

        if (*info == 0) {
#pragma omp single
            {
                //Prepare the INDXQ sorting permutation.
                magma_int_t nk = n - k;
                lapackf77_slamrg( &k, &nk, d, &ione, &ineg_one, indxq);

                //compute the lower and upper bound of the non-deflated eigenvectors
                if (valeig)
                    magma_svrange(k, d, &iil, &iiu, vl, vu);
                else if (indeig)
                    magma_sirange(k, indxq, &iil, &iiu, il, iu);
                else {
                    iil = 1;
                    iiu = k;
                }
                rk = iiu - iil + 1;
            }

            if (k == 2) {
#pragma omp single
                {
                    for (j = 0; j < k; ++j) {
                        w[0] = *Q(0,j);
                        w[1] = *Q(1,j);

                        i = indx[0] - 1;
                        *Q(0,j) = w[i];
                        i = indx[1] - 1;
                        *Q(1,j) = w[i];
                    }
                }
            }
            else if (k != 1) {
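                // Gu/Eisenstat approach: recompute the updating vector w from
                // the differences dlamda(i) - d(j) that SLAED4 returned in Q,
                // so that the eigenvectors computed below come out numerically
                // orthogonal; the original signs are restored from the copy
                // kept in s.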
                // Compute updated W.
                blasf77_scopy( &ik, &w[ib], &ione, &s[ib], &ione);

                // Initialize W(I) = Q(I,I)
                tmp = ldq + 1;
                blasf77_scopy( &ik, Q(ib,ib), &tmp, &w[ib], &ione);

                for (j = 0; j < k; ++j) {
                    magma_int_t i_tmp = min(j, ie);
                    for (i = ib; i < i_tmp; ++i)
                        w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
                    i_tmp = max(j+1, ib);
                    for (i = i_tmp; i < ie; ++i)
                        w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
                }

                for (i = ib; i < ie; ++i)
                    w[i] = copysign( sqrt( -w[i] ), s[i]);

#pragma omp barrier

                //reduce the number of used threads to have enough S workspace
                tot = min(n1, omp_get_num_threads());

                if (id < tot) {
                    ib = (  id   * rk) / tot + iil - 1;
                    ie = ((id+1) * rk) / tot + iil - 1;
                    ik = ie - ib;
                }
                else {
                    ib = -1;
                    ie = -1;
                    ik = -1;
                }

                // Compute eigenvectors of the modified rank-1 modification.
                for (j = ib; j < ie; ++j) {
                    for (i = 0; i < k; ++i)
                        s[id*k + i] = w[i] / *Q(i,j);
                    temp = magma_cblas_snrm2( k, s+id*k, 1 );
                    for (i = 0; i < k; ++i) {
                        magma_int_t iii = indx[i] - 1;
                        *Q(i,j) = s[id*k + iii] / temp;
                    }
                }
            }
        }
    }
    if (*info != 0)
        return *info;

    timer_stop( time );
    timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time );

#else
    /////////////////////////////////////////////////////////////////////////////////
    // Non openmp implementation
    /////////////////////////////////////////////////////////////////////////////////
    magma_timer_t time=0;
    timer_start( time );

    for (i = 0; i < k; ++i)
        dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i];

    for (j = 0; j < k; ++j) {
        magma_int_t tmpp=j+1;
        magma_int_t iinfo = 0;
        lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo);
        // If the zero finder fails, the computation is terminated.
        if (iinfo != 0)
            *info=iinfo;
    }
    if (*info != 0)
        return *info;

    //Prepare the INDXQ sorting permutation.
    magma_int_t nk = n - k;
    lapackf77_slamrg( &k, &nk, d, &ione, &ineg_one, indxq);

    //compute the lower and upper bound of the non-deflated eigenvectors
    if (valeig)
        magma_svrange(k, d, &iil, &iiu, vl, vu);
    else if (indeig)
        magma_sirange(k, indxq, &iil, &iiu, il, iu);
    else {
        iil = 1;
        iiu = k;
    }
    rk = iiu - iil + 1;

    if (k == 2) {
        for (j = 0; j < k; ++j) {
            w[0] = *Q(0,j);
            w[1] = *Q(1,j);

            i = indx[0] - 1;
            *Q(0,j) = w[i];
            i = indx[1] - 1;
            *Q(1,j) = w[i];
        }
    }
    else if (k != 1) {
        // Compute updated W.
        blasf77_scopy( &k, w, &ione, s, &ione);

        // Initialize W(I) = Q(I,I)
        tmp = ldq + 1;
        blasf77_scopy( &k, Q, &tmp, w, &ione);

        for (j = 0; j < k; ++j) {
            for (i = 0; i < j; ++i)
                w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
            for (i = j+1; i < k; ++i)
                w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
        }

        for (i = 0; i < k; ++i)
            w[i] = copysign( sqrt( -w[i] ), s[i]);

        // Compute eigenvectors of the modified rank-1 modification.
        for (j = iil-1; j < iiu; ++j) {
            for (i = 0; i < k; ++i)
                s[i] = w[i] / *Q(i,j);
            temp = magma_cblas_snrm2( k, s, 1 );
            for (i = 0; i < k; ++i) {
                magma_int_t iii = indx[i] - 1;
                *Q(i,j) = s[iii] / temp;
            }
        }
    }

    timer_stop( time );
    timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time );

#endif //_OPENMP

    // Compute the updated eigenvectors.

    timer_start( time );

    if (rk > 0) {
        if (n1 < magma_get_slaex3_m_k()) {
            // stay on the CPU
            if ( n23 != 0 ) {
                lapackf77_slacpy("A", &n23, &rk, Q(ctot[0],iil-1), &ldq, s, &n23);
                blasf77_sgemm("N", "N", &n2, &rk, &n23, &d_one, &Q2[iq2], &n2,
                              s, &n23, &d_zero, Q(n1,iil-1), &ldq );
            }
            else
                lapackf77_slaset("A", &n2, &rk, &d_zero, &d_zero, Q(n1,iil-1), &ldq);

            if ( n12 != 0 ) {
                lapackf77_slacpy("A", &n12, &rk, Q(0,iil-1), &ldq, s, &n12);
                blasf77_sgemm("N", "N", &n1, &rk, &n12, &d_one, Q2, &n1,
                              s, &n12, &d_zero, Q(0,iil-1), &ldq);
            }
            else
                lapackf77_slaset("A", &n1, &rk, &d_zero, &d_zero, Q(0,iil-1), &ldq);
        }
        else {
            //use the gpus
            ib = min(nb, rk);
            for (igpu = 0; igpu < ngpu-1; igpu += 2) {
                if (n23 != 0) {
                    magma_setdevice(igpu+1);
                    magma_ssetmatrix_async( n23, ib,
                                            Q(ctot[0],iil-1), ldq,
                                            dS(igpu+1,0),     n23, queues[igpu+1][0] );
                }
                if (n12 != 0) {
                    magma_setdevice(igpu);
                    magma_ssetmatrix_async( n12, ib,
                                            Q(0,iil-1), ldq,
                                            dS(igpu,0), n12, queues[igpu][0] );
                }
            }

            for (i = 0; i < rk; i += nb) {
                ib = min(nb, rk - i);
                ind = (i/nb)%2;
                if (i+nb < rk) {
                    ib2 = min(nb, rk - i - nb);
                    for (igpu = 0; igpu < ngpu-1; igpu += 2) {
                        if (n23 != 0) {
                            magma_setdevice(igpu+1);
                            magma_ssetmatrix_async( n23, ib2,
                                                    Q(ctot[0],iil-1+i+nb), ldq,
                                                    dS(igpu+1,(ind+1)%2),  n23, queues[igpu+1][(ind+1)%2] );
                        }
                        if (n12 != 0) {
                            magma_setdevice(igpu);
                            magma_ssetmatrix_async( n12, ib2,
                                                    Q(0,iil-1+i+nb),    ldq,
                                                    dS(igpu,(ind+1)%2), n12, queues[igpu][(ind+1)%2] );
                        }
                    }
                }

                // Ensure that the data has been copied to the GPU, since we will overwrite it.
                for (igpu = 0; igpu < ngpu-1; igpu += 2) {
                    if (n23 != 0) {
#ifdef CHECK_CPU
                        lapackf77_slacpy("A", &n23, &ib, Q(ctot[0],iil-1+i), &ldq, hS(igpu+1,ind), &n23);
#endif
                        magma_setdevice(igpu+1);
                        magma_queue_sync( queues[igpu+1][ind] );
                    }
                    if (n12 != 0) {
#ifdef CHECK_CPU
                        lapackf77_slacpy("A", &n12, &ib, Q(0,iil-1+i), &ldq, hS(igpu,ind), &n12);
#endif
                        magma_setdevice(igpu);
                        magma_queue_sync( queues[igpu][ind] );
                    }
                }
                for (igpu = 0; igpu < ngpu-1; igpu += 2) {
                    if (n23 != 0) {
#ifdef CHECK_CPU
                        blasf77_sgemm("N", "N", &ni_loc[igpu+1], &ib, &n23, &d_one, hQ2(igpu+1), &n2_loc,
                                      hS(igpu+1,ind), &n23, &d_zero, hQ(igpu+1, ind), &n2_loc);
#endif
                        magma_setdevice(igpu+1);
                        magmablasSetKernelStream(queues[igpu+1][ind]);
                        magma_sgemm(MagmaNoTrans, MagmaNoTrans, ni_loc[igpu+1], ib, n23, d_one, dQ2(igpu+1), n2_loc,
                                    dS(igpu+1, ind), n23, d_zero, dQ(igpu+1, ind), n2_loc);
#ifdef CHECK_CPU
                        printf("norm Q %d: %f\n", igpu+1, cpu_gpu_sdiff(ni_loc[igpu+1], ib, hQ(igpu+1, ind), n2_loc, dQ(igpu+1, ind), n2_loc));
#endif
                    }
                    if (n12 != 0) {
#ifdef CHECK_CPU
                        blasf77_sgemm("N", "N", &ni_loc[igpu], &ib, &n12, &d_one, hQ2(igpu), &n1_loc,
                                      hS(igpu,ind%2), &n12, &d_zero, hQ(igpu, ind%2), &n1_loc);
#endif
                        magma_setdevice(igpu);
                        magmablasSetKernelStream(queues[igpu][ind]);
                        magma_sgemm(MagmaNoTrans, MagmaNoTrans, ni_loc[igpu], ib, n12, d_one, dQ2(igpu), n1_loc,
                                    dS(igpu, ind), n12, d_zero, dQ(igpu, ind), n1_loc);
#ifdef CHECK_CPU
                        printf("norm Q %d: %f\n", igpu, cpu_gpu_sdiff(ni_loc[igpu], ib, hQ(igpu, ind), n1_loc, dQ(igpu, ind), n1_loc));
#endif
                    }
                }
                for (igpu = 0; igpu < ngpu-1; igpu += 2) {
                    if (n23 != 0) {
                        magma_setdevice(igpu+1);
                        magma_sgetmatrix( ni_loc[igpu+1], ib, dQ(igpu+1, ind), n2_loc,
                                          Q(n1+n2_loc*(igpu/2),iil-1+i), ldq );
//                        magma_sgetmatrix_async( ni_loc[igpu+1], ib, dQ(igpu+1, ind), n2_loc,
//                                                Q(n1+n2_loc*(igpu/2),iil-1+i), ldq, queues[igpu+1][ind] );
                    }
                    if (n12 != 0) {
                        magma_setdevice(igpu);
                        magma_sgetmatrix( ni_loc[igpu], ib, dQ(igpu, ind), n1_loc,
                                          Q(n1_loc*(igpu/2),iil-1+i), ldq );
//                        magma_sgetmatrix_async( ni_loc[igpu], ib, dQ(igpu, ind), n1_loc,
//                                                Q(n1_loc*(igpu/2),iil-1+i), ldq, queues[igpu][ind] );
                    }
                }
            }
            for (igpu = 0; igpu < ngpu; ++igpu) {
#ifdef CHECK_CPU
                magma_free_pinned( hwS[1][igpu] );
                magma_free_pinned( hwS[0][igpu] );
                magma_free_pinned( hwQ2[igpu] );
                magma_free_pinned( hwQ[1][igpu] );
                magma_free_pinned( hwQ[0][igpu] );
#endif
                magma_setdevice(igpu);
                magma_queue_sync( queues[igpu][0] );
                magma_queue_sync( queues[igpu][1] );
            }
            if ( n23 == 0 )
                lapackf77_slaset("A", &n2, &rk, &d_zero, &d_zero, Q(n1,iil-1), &ldq);

            if ( n12 == 0 )
                lapackf77_slaset("A", &n1, &rk, &d_zero, &d_zero, Q(0,iil-1), &ldq);
        }
    }
    timer_stop( time );
    timer_printf( "gemms = %6.2f\n", time );

    magma_setdevice( orig_dev );
    magmablasSetKernelStream( orig_stream );
    
    return *info;
} /* magma_slaex3_m */
Esempio n. 25
0
extern "C" magma_int_t
magma_slauum_gpu(char uplo, magma_int_t n,
                 float  *dA, magma_int_t ldda, magma_int_t *info)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    SLAUUM computes the product U * U' or L' * L, where the triangular
    factor U or L is stored in the upper or lower triangular part of
    the array dA.

    If UPLO = 'U' or 'u' then the upper triangle of the result is stored,
    overwriting the factor U in dA.
    If UPLO = 'L' or 'l' then the lower triangle of the result is stored,
    overwriting the factor L in dA.
    This is the blocked form of the algorithm, calling Level 3 BLAS.

    Arguments
    =========
    UPLO    (input) CHARACTER*1
            Specifies whether the triangular factor stored in the array dA
            is upper or lower triangular:
            = 'U':  Upper triangular
            = 'L':  Lower triangular

    N       (input) INTEGER
            The order of the triangular factor U or L.  N >= 0.

    dA      (input/output) REAL array on the GPU, dimension (LDDA,N)
            On entry, the triangular factor U or L.
            On exit, if UPLO = 'U', the upper triangle of dA is
            overwritten with the upper triangle of the product U * U';
            if UPLO = 'L', the lower triangle of dA is overwritten with
            the lower triangle of the product L' * L.

    LDDA    (input) INTEGER
            The leading dimension of the array A.  LDDA >= max(1,N).

    INFO    (output) INTEGER
            = 0: successful exit
            < 0: if INFO = -k, the k-th argument had an illegal value

    ===================================================================== */

    /* Local variables */
    char uplo_[2] = {uplo, 0};
    magma_int_t         nb, i, ib;
    float              d_one = MAGMA_D_ONE;
    float  c_one = MAGMA_S_ONE;
    float  *work;

    int upper  = lapackf77_lsame(uplo_, "U");

    *info = 0;

    if ((! upper) && (! lapackf77_lsame(uplo_, "L")))
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (ldda < max(1,n))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    nb = magma_get_spotrf_nb(n);

    if (MAGMA_SUCCESS != magma_smalloc_pinned( &work, nb*nb )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }

    magma_queue_t stream[2];
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );

    if (nb <= 1 || nb >= n)
    {
        magma_sgetmatrix( n, n, dA, ldda, work, n );
        lapackf77_slauum(uplo_, &n, work, &n, info);
        magma_ssetmatrix( n, n, work, n, dA, ldda );
    }
    else
    {
        if (upper)
        {
            /* Compute the product U * U', one diagonal block at a time */
            for (i=0; i < n; i += nb)
            {
                ib = min(nb, (n-i));

                /* Compute the product U * U'. */
                magma_strmm( MagmaRight, MagmaUpper,
                         MagmaTrans, MagmaNonUnit, i, ib,
                         c_one, dA(i,i), ldda, dA(0, i),ldda);

                magma_sgetmatrix( ib, ib,
                                  dA(i, i), ldda,
                                  work,     ib );

                lapackf77_slauum(MagmaUpperStr, &ib, work, &ib, info);

                magma_ssetmatrix( ib, ib,
                                  work,     ib,
                                  dA(i, i), ldda );

                if(i+ib < n)
                {
                    magma_sgemm( MagmaNoTrans, MagmaTrans,
                                 i, ib, (n-i-ib), c_one, dA(0,i+ib),
                                 ldda, dA(i, i+ib), ldda, c_one,
                                 dA(0,i), ldda);

                    magma_ssyrk( MagmaUpper, MagmaNoTrans, ib,(n-i-ib),
                                 d_one, dA(i, i+ib), ldda,
                                 d_one, dA(i, i),    ldda);
                }
            }
        }
        else
        {
            /* Compute the product L' * L. */
            for(i=0; i<n; i=i+nb)
            {
                ib=min(nb,(n-i));

                magma_strmm( MagmaLeft, MagmaLower,
                             MagmaTrans, MagmaNonUnit, ib,
                             i, c_one, dA(i,i), ldda,
                             dA(i, 0),ldda);

                magma_sgetmatrix( ib, ib,
                                  dA(i, i), ldda,
                                  work,     ib );

                lapackf77_slauum(MagmaLowerStr, &ib, work, &ib, info);

                magma_ssetmatrix( ib, ib,
                                  work,     ib,
                                  dA(i, i), ldda );

                if((i+ib) < n)
                {
                    magma_sgemm( MagmaTrans, MagmaNoTrans,
                                 ib, i, (n-i-ib), c_one, dA( i+ib,i),
                                 ldda, dA(i+ib, 0),ldda, c_one,
                                 dA(i,0), ldda);
                    magma_ssyrk( MagmaLower, MagmaTrans, ib, (n-i-ib),
                                 d_one, dA(i+ib, i), ldda,
                                 d_one, dA(i, i),    ldda);
                }
            }
        }
    }

    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[1] );

    magma_free_pinned( work );

    return *info;
}
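
For context: SLAUUM is the final step of the classical SPD inversion pipeline POTRF -> TRTRI -> LAUUM (the sequence that magma_spotri_gpu wraps). A hedged sketch, assuming the same MAGMA 1.4-era char-based API as this example and that magma_strtri_gpu is available with the analogous signature:

/* Sketch: overwrite the upper triangle of an SPD matrix dA with inv(A). */
magma_int_t spd_inverse_gpu( magma_int_t n, float *dA, magma_int_t ldda )
{
    magma_int_t info = 0;
    magma_spotrf_gpu( 'U', n, dA, ldda, &info );           /* A = U'*U      */
    if ( info == 0 )
        magma_strtri_gpu( 'U', 'N', n, dA, ldda, &info );  /* U := inv(U)   */
    if ( info == 0 )
        magma_slauum_gpu( 'U', n, dA, ldda, &info );       /* U*U' = inv(A) */
    return info;
}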
Esempio n. 26
0
extern "C" magma_int_t
magma_sgeqrf2_mgpu( magma_int_t num_gpus, magma_int_t m, magma_int_t n,
                    float **dlA, magma_int_t ldda,
                    float *tau, 
                    magma_int_t *info )
{
/*  -- MAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2012

    Purpose
    =======
    SGEQRF2_MGPU computes a QR factorization of a real M-by-N matrix A:
    A = Q * R. This is a GPU interface of the routine.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    dA      (input/output) REAL array on the GPU, dimension (LDDA,N)
            On entry, the M-by-N matrix dA.
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further
            Details).

    LDDA    (input) INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).
            To benefit from coalesced memory accesses, LDDA must be
            divisible by 16.

    TAU     (output) REAL array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as a memory allocation failure.

    Further Details
    ===============

    The matrix Q is represented as a product of elementary reflectors

       Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).
    =====================================================================    */

    #define dlA(gpu,a_1,a_2) ( dlA[gpu]+(a_2)*(ldda) + (a_1))
    #define work_ref(a_1)    ( work + (a_1))
    #define hwork            ( work + (nb)*(m))

    #define hwrk_ref(a_1)    ( local_work + (a_1))
    #define lhwrk            ( local_work + (nb)*(m))

    float *dwork[4], *panel[4], *local_work;

    magma_int_t i, j, k, ldwork, lddwork, old_i, old_ib, rows;
    magma_int_t nbmin, nx, ib, nb;
    magma_int_t lhwork, lwork;

    magma_device_t cdevice;
    magma_getdevice(&cdevice);

    int panel_gpunum, i_local, n_local[4], la_gpu, displacement; 

    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    k = min(m,n);
    if (k == 0)
        return *info;

    nb = magma_get_sgeqrf_nb(m);

    displacement = n * nb;
    lwork  = (m+n+64) * nb;
    lhwork = lwork - (m)*nb;

    for(i=0; i<num_gpus; i++){
      #ifdef  MultiGPUs
         magma_setdevice(i);
      #endif
         if (MAGMA_SUCCESS != magma_smalloc( &(dwork[i]), (n + ldda)*nb )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
      }
    }

    /* Set the number of local n for each GPU */
    for(i=0; i<num_gpus; i++){
      n_local[i] = ((n/nb)/num_gpus)*nb;
      if (i < (n/nb)%num_gpus)
        n_local[i] += nb;
      else if (i == (n/nb)%num_gpus)
        n_local[i] += n%nb;
    }

    if (MAGMA_SUCCESS != magma_smalloc_pinned( &local_work, lwork )) {
      for(i=0; i<num_gpus; i++){
        #ifdef  MultiGPUs
          magma_setdevice(i);
        #endif
        magma_free( dwork[i] );
      }

      *info = MAGMA_ERR_HOST_ALLOC;
      return *info;
    }

    cudaStream_t streaml[4][2];
    for(i=0; i<num_gpus; i++){
      #ifdef  MultiGPUs
         magma_setdevice(i);
      #endif
      magma_queue_create( &streaml[i][0] );
      magma_queue_create( &streaml[i][1] );
    }  

    nbmin = 2;
    nx    = nb;
    ldwork = m;
    lddwork= n;

    if (nb >= nbmin && nb < k && nx < k) {
        /* Use blocked code initially */
        old_i = 0; old_ib = nb;
        for (i = 0; i < k-nx; i += nb) 
          {
            /* Set the GPU number that holds the current panel */
            panel_gpunum = (i/nb)%num_gpus;
            
            /* Set the local index where the current panel is */
            i_local = i/(nb*num_gpus)*nb;
            
            ib = min(k-i, nb);
            rows = m -i;
            /* Send current panel to the CPU */
            #ifdef  MultiGPUs
               magma_setdevice(panel_gpunum);
            #endif
            magma_sgetmatrix_async( rows, ib,
                                    dlA(panel_gpunum, i, i_local), ldda,
                                    hwrk_ref(i),                   ldwork, streaml[panel_gpunum][1] );

            if (i>0){
                /* Apply H' to A(i:m,i+2*ib:n) from the left; this is the look-ahead
                   application to the trailing matrix                                     */
                la_gpu = panel_gpunum;

                /* only the GPU that holds the next panel performs the look-ahead */
                #ifdef  MultiGPUs
                     magma_setdevice(la_gpu);
                #endif
                   
                magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise,
                                  m-old_i, n_local[la_gpu]-i_local-old_ib, old_ib,
                                  panel[la_gpu], ldda, dwork[la_gpu],      lddwork,
                                  dlA(la_gpu, old_i, i_local+old_ib), ldda, 
                                  dwork[la_gpu]+old_ib, lddwork);
                  
                la_gpu = ((i-nb)/nb)%num_gpus;
                #ifdef  MultiGPUs
                magma_setdevice(la_gpu);
                #endif
                magma_ssetmatrix_async( old_ib, old_ib,
                                        hwrk_ref(old_i), ldwork,
                                        panel[la_gpu],   ldda, streaml[la_gpu][0] );
            }
            
            #ifdef  MultiGPUs
               magma_setdevice(panel_gpunum);
            #endif
            magma_queue_sync( streaml[panel_gpunum][1] );

            lapackf77_sgeqrf(&rows, &ib, hwrk_ref(i), &ldwork, tau+i, lhwrk, &lhwork, info);

            // Form the triangular factor of the block reflector
            // H = H(i) H(i+1) . . . H(i+ib-1) 
            lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr,
                              &rows, &ib,
                              hwrk_ref(i), &ldwork, tau+i, lhwrk, &ib);

            spanel_to_q( MagmaUpper, ib, hwrk_ref(i), ldwork, lhwrk+ib*ib );
            // Send the current panel back to the GPUs 
            // Has to be done with asynchronous copies
            for(j=0; j<num_gpus; j++)
              {  
                #ifdef  MultiGPUs
                   magma_setdevice(j);
                #endif
                if (j == panel_gpunum)
                  panel[j] = dlA(j, i, i_local);
                else
                  panel[j] = dwork[j]+displacement;
                magma_ssetmatrix_async( rows, ib,
                                        hwrk_ref(i), ldwork,
                                        panel[j],    ldda, streaml[j][0] );
              }
            for(j=0; j<num_gpus; j++)
              {
                #ifdef  MultiGPUs
                magma_setdevice(j);
                #endif
                magma_queue_sync( streaml[j][0] );
              }

            /* Restore the panel */
            sq_to_panel( MagmaUpper, ib, hwrk_ref(i), ldwork, lhwrk+ib*ib );

            if (i + ib < n) 
              {
                /* Send the T matrix to the GPU. 
                   Has to be done with asynchronous copies */
                for(j=0; j<num_gpus; j++)
                  {
                    #ifdef  MultiGPUs
                       magma_setdevice(j);
                    #endif
                       magma_ssetmatrix_async( ib, ib,
                                               lhwrk,    ib,
                                               dwork[j], lddwork, streaml[j][0] );
                  }

                if (i+nb < k-nx)
                  {
                    /* Apply H' to A(i:m,i+ib:i+2*ib) from the left;
                       This is update for the next panel; part of the look-ahead    */
                    la_gpu = (panel_gpunum+1)%num_gpus;
                    int i_loc = (i+nb)/(nb*num_gpus)*nb;
                    for(j=0; j<num_gpus; j++){
                      #ifdef  MultiGPUs
                      magma_setdevice(j);
                      #endif
                      //magma_queue_sync( streaml[j][0] );
                      if (j==la_gpu)
                        magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise,
                                          rows, ib, ib,
                                          panel[j], ldda, dwork[j],    lddwork,
                                          dlA(j, i, i_loc), ldda, dwork[j]+ib, lddwork);
                      else if (j<=panel_gpunum)
                        magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise,
                                          rows, n_local[j]-i_local-ib, ib,
                                          panel[j], ldda, dwork[j],    lddwork,
                                          dlA(j, i, i_local+ib), ldda, dwork[j]+ib, lddwork);
                      else
                        magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise,
                                          rows, n_local[j]-i_local, ib,
                                          panel[j], ldda, dwork[j],    lddwork,
                                          dlA(j, i, i_local), ldda, dwork[j]+ib, lddwork);
                    }     
                  }
                else {
                  /* do the entire update as we exit; there will be no look-ahead */
                  la_gpu = (panel_gpunum+1)%num_gpus;
                  int i_loc = (i+nb)/(nb*num_gpus)*nb;

                  #ifdef  MultiGPUs
                     magma_setdevice(la_gpu);
                  #endif
                  magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise,
                                    rows, n_local[la_gpu]-i_loc, ib,
                                    panel[la_gpu], ldda, dwork[la_gpu],    lddwork,
                                    dlA(la_gpu, i, i_loc), ldda, dwork[la_gpu]+ib, lddwork);
                  #ifdef  MultiGPUs
                     magma_setdevice(panel_gpunum);
                  #endif
                  magma_ssetmatrix( ib, ib,
                                    hwrk_ref(i),                   ldwork,
                                    dlA(panel_gpunum, i, i_local), ldda );
                }
                old_i  = i;
                old_ib = ib;
              }
          }
    } else {
      i = 0;
    }
    
    for(j=0; j<num_gpus; j++){
      #ifdef  MultiGPUs
      magma_setdevice(j);
      #endif
      magma_free( dwork[j] );
    }
    
    /* Use unblocked code to factor the last or only block. */
    if (i < k) {
        ib   = n-i;
        rows = m-i;
        lhwork = lwork - rows*ib;

        panel_gpunum = (i/nb)%num_gpus;  /* owner of the remaining block column;
                                            well defined even when the blocked
                                            path was skipped and i = 0 */
        int i_loc = (i)/(nb*num_gpus)*nb;

        #ifdef  MultiGPUs
           magma_setdevice(panel_gpunum);
        #endif
        magma_sgetmatrix( rows, ib,
                          dlA(panel_gpunum, i, i_loc), ldda,
                          lhwrk,                       rows );

        lapackf77_sgeqrf(&rows, &ib, lhwrk, &rows, tau+i, lhwrk+ib*rows, &lhwork, info);

        magma_ssetmatrix( rows, ib,
                          lhwrk,                       rows,
                          dlA(panel_gpunum, i, i_loc), ldda );
    }

    for(i=0; i<num_gpus; i++){
      #ifdef  MultiGPUs
         magma_setdevice(i);
      #endif
      magma_queue_destroy( streaml[i][0] );
      magma_queue_destroy( streaml[i][1] );
    }

    magma_setdevice(cdevice);
    magma_free_pinned( local_work );

    return *info;
} /* magma_sgeqrf2_mgpu */
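A note on the index arithmetic above: the multi-GPU routine distributes
column panels of width nb round-robin across the GPUs, so expressions such
as (i+nb)/(nb*num_gpus)*nb compute a panel's local column offset on the
device that owns it. The standalone sketch below (not MAGMA code; the two
helper names are hypothetical) prints that mapping for a small
configuration:

#include <stdio.h>

/* GPU that owns global column i under an nb-wide 1-D block-cyclic layout */
static int owner_gpu(int i, int nb, int num_gpus)
{
    return (i/nb) % num_gpus;
}

/* Column index of global column i within its owner's local storage */
static int local_offset(int i, int nb, int num_gpus)
{
    return (i/(nb*num_gpus))*nb + i%nb;
}

int main(void)
{
    int nb = 2, num_gpus = 3;
    for (int i = 0; i < 12; ++i)
        printf("global col %2d -> GPU %d, local col %d\n",
               i, owner_gpu(i, nb, num_gpus), local_offset(i, nb, num_gpus));
    return 0;
}

With nb = 2 and num_gpus = 3, columns 0-1 land on GPU 0 at local columns
0-1, columns 2-3 on GPU 1, and columns 6-7 wrap back to GPU 0 at local
columns 2-3, which is exactly the layout the dlA(dev, i, j) macro indexes.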
Example n. 27
0
extern "C" magma_int_t
magma_spotrf_gpu(char uplo, magma_int_t n,
                 float *dA, magma_int_t ldda, magma_int_t *info)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    SPOTRF computes the Cholesky factorization of a real symmetric
    positive definite matrix dA.

    The factorization has the form
       dA = U**T * U,  if UPLO = 'U', or
       dA = L  * L**T,  if UPLO = 'L',
    where U is an upper triangular matrix and L is lower triangular.

    This is the block version of the algorithm, calling Level 3 BLAS.
    If the current stream is NULL, this version replaces it with a
    user-defined stream so that computation overlaps with communication.

    Arguments
    =========
    UPLO    (input) CHARACTER*1
            = 'U':  Upper triangle of dA is stored;
            = 'L':  Lower triangle of dA is stored.

    N       (input) INTEGER
            The order of the matrix dA.  N >= 0.

    dA      (input/output) REAL array on the GPU, dimension (LDDA,N)
            On entry, the symmetric matrix dA.  If UPLO = 'U', the leading
            N-by-N upper triangular part of dA contains the upper
            triangular part of the matrix dA, and the strictly lower
            triangular part of dA is not referenced.  If UPLO = 'L', the
            leading N-by-N lower triangular part of dA contains the lower
            triangular part of the matrix dA, and the strictly upper
            triangular part of dA is not referenced.

            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization dA = U**T * U or dA = L * L**T.

    LDDA    (input) INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalesced memory accesses, LDDA must be
            divisible by 16.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
            > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.
    =====================================================================   */


    magma_int_t     j, jb, nb;
    char            uplo_[2] = {uplo, 0};
    float c_one     = MAGMA_S_ONE;
    float c_neg_one = MAGMA_S_NEG_ONE;
    float *work;
    float          d_one     =  1.0;
    float          d_neg_one = -1.0;
    int upper = lapackf77_lsame(uplo_, "U");

    *info = 0;
    if ( (! upper) && (! lapackf77_lsame(uplo_, "L")) ) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,n)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    nb = magma_get_spotrf_nb(n);

    if (MAGMA_SUCCESS != magma_smalloc_pinned( &work, nb*nb )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }

    /* Define user stream if current stream is NULL */
    magma_queue_t stream[2], current_stream;
    magmablasGetKernelStream(&current_stream);

    magma_queue_create( &stream[0] );
    if (current_stream == NULL) {
      magma_queue_create( &stream[1] );
      magmablasSetKernelStream(stream[1]);
    }
    else
      stream[1] = current_stream;

    if ((nb <= 1) || (nb >= n)) {
        /*  Use unblocked code. */
        magma_sgetmatrix_async( n, n, dA, ldda, work, n, stream[1] );
        magma_queue_sync( stream[1] );
        lapackf77_spotrf(uplo_, &n, work, &n, info);
        magma_ssetmatrix_async( n, n, work, n, dA, ldda, stream[1] );
    }
    else {

        /* Use blocked code. */
        if (upper) {
            
            /* Compute the Cholesky factorization A = U'*U. */
            for (j=0; j<n; j+=nb) {
                
                /* Update and factorize the current diagonal block and test
                   for non-positive-definiteness. */
                jb = min(nb, (n-j));
                
                magma_ssyrk(MagmaUpper, MagmaTrans, jb, j,
                            d_neg_one, dA(0, j), ldda,
                            d_one,     dA(j, j), ldda);

                /* Once the SYRK update is done, stage the diagonal block to
                   the host on stream[0]; the CPU factors it while the GPU
                   works on the trailing block row on stream[1]. */
                magma_queue_sync( stream[1] );
                magma_sgetmatrix_async( jb, jb,
                                        dA(j, j), ldda,
                                        work,     jb, stream[0] );
                
                if ( (j+jb) < n) {
                    /* Compute the current block row. */
                    magma_sgemm(MagmaTrans, MagmaNoTrans,
                                jb, (n-j-jb), j,
                                c_neg_one, dA(0, j   ), ldda,
                                           dA(0, j+jb), ldda,
                                c_one,     dA(j, j+jb), ldda);
                }
                
                magma_queue_sync( stream[0] );  /* wait for the diagonal block */
                lapackf77_spotrf(MagmaUpperStr, &jb, work, &jb, info);
                magma_ssetmatrix_async( jb, jb,
                                        work,     jb,
                                        dA(j, j), ldda, stream[1] );
                if (*info != 0) {
                    *info = *info + j;  /* make the failing minor index global */
                    break;
                }

                if ( (j+jb) < n) {
                    magma_strsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                                 jb, (n-j-jb),
                                 c_one, dA(j, j   ), ldda,
                                        dA(j, j+jb), ldda);
                }
            }
        }
        else {
            //=========================================================
            // Compute the Cholesky factorization A = L*L'.
            for (j=0; j<n; j+=nb) {

                //  Update and factorize the current diagonal block and test
                //  for non-positive-definiteness.
                jb = min(nb, (n-j));

                magma_ssyrk(MagmaLower, MagmaNoTrans, jb, j,
                            d_neg_one, dA(j, 0), ldda,
                            d_one,     dA(j, j), ldda);
                
                /* As in the upper-triangular case: overlap the host
                   factorization of the diagonal block with the GPU update. */
                magma_queue_sync( stream[1] );
                magma_sgetmatrix_async( jb, jb,
                                        dA(j, j), ldda,
                                        work,     jb, stream[0] );
                
                if ( (j+jb) < n) {
                    magma_sgemm( MagmaNoTrans, MagmaTrans,
                                 (n-j-jb), jb, j,
                                 c_neg_one, dA(j+jb, 0), ldda,
                                            dA(j,    0), ldda,
                                 c_one,     dA(j+jb, j), ldda);
                }

                magma_queue_sync( stream[0] );
                lapackf77_spotrf(MagmaLowerStr, &jb, work, &jb, info);
                magma_ssetmatrix_async( jb, jb,
                                        work,     jb,
                                        dA(j, j), ldda, stream[1] );
                if (*info != 0) {
                    *info = *info + j;
                    break;
                }
                
                if ( (j+jb) < n) {
                    magma_strsm(MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit,
                                (n-j-jb), jb,
                                c_one, dA(j,    j), ldda,
                                       dA(j+jb, j), ldda);
                }
            }
        }
    }

    magma_free_pinned( work );

    magma_queue_destroy( stream[0] );
    if (current_stream == NULL) {
      magma_queue_destroy( stream[1] );
      magmablasSetKernelStream(NULL);
    }

    return *info;
} /* magma_spotrf_gpu */
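To round out the example, here is a hedged host-side usage sketch for
magma_spotrf_gpu, assuming the MAGMA 1.x API used above (magma_init,
magma_smalloc, magma_ssetmatrix/magma_sgetmatrix); the test matrix is made
positive definite via a dominant diagonal, and error handling is
abbreviated:

#include <stdio.h>
#include "magma.h"

int main(void)
{
    magma_init();

    magma_int_t n = 1024, info;
    magma_int_t ldda = ((n+31)/32)*32;   /* pad LDDA to a multiple of 32 */
    float *hA, *dA;

    magma_smalloc_pinned( &hA, n*n );    /* pinned host buffer */
    magma_smalloc( &dA, ldda*n );        /* device matrix      */

    /* Symmetric positive definite test matrix: dominant diagonal. */
    for (magma_int_t j = 0; j < n; ++j)
        for (magma_int_t i = 0; i < n; ++i)
            hA[i + j*n] = (i == j) ? (float) n : 0.01f;

    magma_ssetmatrix( n, n, hA, n, dA, ldda );    /* host -> device */
    magma_spotrf_gpu( 'L', n, dA, ldda, &info );  /* dA = L * L**T  */
    magma_sgetmatrix( n, n, dA, ldda, hA, n );    /* device -> host */

    printf( "magma_spotrf_gpu: info = %d\n", (int) info );

    magma_free( dA );
    magma_free_pinned( hA );
    magma_finalize();
    return 0;
}

On success info is 0 and the lower triangle of hA holds the Cholesky
factor L; a positive info reports the order of the first non-positive
leading minor, matching the documentation block above.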