Example #1
0
/**
    Purpose
    -------
    SORGQR generates an M-by-N REAL matrix Q with orthonormal columns,
    which is defined as the first N columns of a product of K elementary
    reflectors of order M

        Q  =  H(1) H(2) . . . H(k)

    as returned by SGEQRF.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix Q. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix Q. M >= N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines the
            matrix Q. N >= K >= 0.

    @param[in,out]
    A       REAL array A, dimension (LDDA,N).
            On entry, the i-th column must contain the vector
            which defines the elementary reflector H(i), for
            i = 1,2,...,k, as returned by SGEQRF_GPU in the
            first k columns of its array argument A.
            On exit, the M-by-N matrix Q.

    @param[in]
    lda     INTEGER
            The first dimension of the array A. LDA >= max(1,M).

    @param[in]
    tau     REAL array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by SGEQRF_GPU.

    @param[in]
    T       REAL array, dimension (NB, min(M,N)).
            T contains the T matrices used in blocking the elementary
            reflectors H(i), e.g., this can be the 6th argument of
            magma_sgeqrf_gpu (except stored on the CPU, not the GPU).

    @param[in]
    nb      INTEGER
            This is the block size used in SGEQRF_GPU, and correspondingly
            the size of the T matrices, used in the factorization, and
            stored in T.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_sgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_sorgqr_m(
    magma_int_t m, magma_int_t n, magma_int_t k,
    float *A, magma_int_t lda,
    float *tau,
    float *T, magma_int_t nb,
    magma_int_t *info)
{
#define  A(i,j)   ( A    + (i) + (j)*lda )
#define dA(d,i,j) (dA[d] + (i) + (j)*ldda)
#define dT(d,i,j) (dT[d] + (i) + (j)*nb)

    float c_zero = MAGMA_S_ZERO;
    float c_one  = MAGMA_S_ONE;

    magma_int_t m_kk, n_kk, k_kk, mi;
    magma_int_t lwork, ldwork;
    magma_int_t d, i, ib, j, jb, ki, kk;
    float *work=NULL;

    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if ((n < 0) || (n > m)) {
        *info = -2;
    } else if ((k < 0) || (k > n)) {
        *info = -3;
    } else if (lda < max(1,m)) {
        *info = -5;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    if (n <= 0) {
        return *info;
    }
    
    magma_int_t di, dn;
    magma_int_t dpanel;

    magma_int_t ngpu = magma_num_gpus();
    
    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    
    // Allocate memory on GPUs for A and workspaces
    magma_int_t ldda    = magma_roundup( m, 32 );
    magma_int_t lddwork = magma_roundup( n, 32 );
    magma_int_t min_lblocks = (n / nb) / ngpu;  // min. blocks per gpu
    magma_int_t last_dev    = (n / nb) % ngpu;  // device with last block
    
    magma_int_t  nlocal[ MagmaMaxGPUs ] = { 0 };
    float *dA[ MagmaMaxGPUs ] = { NULL };
    float *dT[ MagmaMaxGPUs ] = { NULL };
    float *dV[ MagmaMaxGPUs ] = { NULL };
    float *dW[ MagmaMaxGPUs ] = { NULL };
    magma_queue_t queues[ MagmaMaxGPUs ] = { NULL };
    
    for( d = 0; d < ngpu; ++d ) {
        // example with n = 75, nb = 10, ngpu = 3
        // min_lblocks = 2
        // last_dev    = 1
        // gpu 0: 2  blocks, cols:  0- 9, 30-39, 60-69
        // gpu 1: 1+ blocks, cols: 10-19, 40-49, 70-74 (partial)
        // gpu 2: 1  block,  cols: 20-29, 50-59
        magma_setdevice( d );
        nlocal[d] = min_lblocks*nb;
        if ( d < last_dev ) {
            nlocal[d] += nb;
        }
        else if ( d == last_dev ) {
            nlocal[d] += (n % nb);
        }
        
        ldwork = nlocal[d]*ldda  // dA
               + nb*m            // dT
               + nb*ldda         // dV
               + nb*lddwork;     // dW
        if ( MAGMA_SUCCESS != magma_smalloc( &dA[d], ldwork )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            goto cleanup;
        }
        dT[d] = dA[d] + nlocal[d]*ldda;
        dV[d] = dT[d] + nb*m;
        dW[d] = dV[d] + nb*ldda;
        
        magma_queue_create( d, &queues[d] );
    }
    
    trace_init( 1, ngpu, 1, queues );
    
    // first kk columns are handled by blocked method.
    // ki is start of 2nd-to-last block
    if ((nb > 1) && (nb < k)) {
        ki = (k - nb - 1) / nb * nb;
        kk = min(k, ki + nb);
    } else {
        ki = 0;
        kk = 0;
    }

    // Allocate CPU work space
    // n*nb  for larfb work
    // m*nb  for V
    // nb*nb for T
    lwork = (n + m + nb) * nb;
    magma_smalloc_cpu( &work, lwork );
    if (work == NULL) {
        *info = MAGMA_ERR_HOST_ALLOC;
        goto cleanup;
    }
    float *work_T, *work_V;
    work_T = work + n*nb;
    work_V = work + n*nb + nb*nb;

    // Use unblocked code for the last or only block.
    if (kk < n) {
        trace_cpu_start( 0, "ungqr", "ungqr last block" );
        m_kk = m - kk;
        n_kk = n - kk;
        k_kk = k - kk;
        
        // sorgqr requires less workspace (n*nb), but is slow if k < sorgqr's block size.
        // replacing it with the 4 routines below is much faster (e.g., 60x).
        //magma_int_t iinfo;
        //lapackf77_sorgqr( &m_kk, &n_kk, &k_kk,
        //                  A(kk, kk), &lda,
        //                  &tau[kk], work, &lwork, &iinfo );
        
        lapackf77_slacpy( MagmaFullStr, &m_kk, &k_kk, A(kk,kk), &lda, work_V, &m_kk);
        lapackf77_slaset( MagmaFullStr, &m_kk, &n_kk, &c_zero, &c_one, A(kk, kk), &lda );
        
        lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr,
                          &m_kk, &k_kk,
                          work_V, &m_kk, &tau[kk], work_T, &k_kk);
        lapackf77_slarfb( MagmaLeftStr, MagmaNoTransStr, MagmaForwardStr, MagmaColumnwiseStr,
                          &m_kk, &n_kk, &k_kk,
                          work_V, &m_kk, work_T, &k_kk, A(kk, kk), &lda, work, &n_kk );
        
        if (kk > 0) {
            for( j=kk; j < n; j += nb ) {
                jb = min( n-j, nb );
                d  =  (j / nb) % ngpu;
                di = ((j / nb) / ngpu) * nb;
                magma_setdevice( d );
                magma_ssetmatrix( m_kk, jb,
                                  A(kk, j),  lda,
                                  dA(d, kk, di), ldda, queues[d] );
                
                // Set A(1:kk,kk+1:n) to zero.
                magmablas_slaset( MagmaFull, kk, jb, c_zero, c_zero, dA(d, 0, di), ldda, queues[d] );
            }
        }
        trace_cpu_end( 0 );
    }

    if (kk > 0) {
        // Use blocked code
        // send T to all GPUs
        for( d = 0; d < ngpu; ++d ) {
            magma_setdevice( d );
            trace_gpu_start( d, 0, "set", "set T" );
            magma_ssetmatrix_async( nb, min(m,n), T, nb, dT[d], nb, queues[d] );
            trace_gpu_end( d, 0 );
        }
        
        // queue: set Aii (V) --> laset --> laset --> larfb --> [next]
        // CPU has no computation
        for( i = ki; i >= 0; i -= nb ) {
            ib = min(nb, k - i);
            mi = m - i;
            dpanel =  (i / nb) % ngpu;
            di     = ((i / nb) / ngpu) * nb;

            // Send current panel to dV on the GPUs
            lapackf77_slaset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda );
            for( d = 0; d < ngpu; ++d ) {
                magma_setdevice( d );
                trace_gpu_start( d, 0, "set", "set V" );
                magma_ssetmatrix_async( mi, ib,
                                        A(i, i), lda,
                                        dV[d],   ldda, queues[d] );
                trace_gpu_end( d, 0 );
            }
            
            // set panel to identity
            magma_setdevice( dpanel );
            trace_gpu_start( dpanel, 0, "laset", "laset" );
            magmablas_slaset( MagmaFull, i,  ib, c_zero, c_zero, dA(dpanel, 0, di), ldda, queues[dpanel] );
            magmablas_slaset( MagmaFull, mi, ib, c_zero, c_one,  dA(dpanel, i, di), ldda, queues[dpanel] );
            trace_gpu_end( dpanel, 0 );
            
            if (i < n) {
                // Apply H to A(i:m,i:n) from the left
                for( d = 0; d < ngpu; ++d ) {
                    magma_setdevice( d );
                    magma_indices_1D_bcyclic( nb, ngpu, d, i, n, &di, &dn );
                    trace_gpu_start( d, 0, "larfb", "larfb" );
                    magma_slarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise,
                                      mi, dn-di, ib,
                                      dV[d],        ldda, dT(d,0,i), nb,
                                      dA(d, i, di), ldda, dW[d], lddwork, queues[d] );
                    trace_gpu_end( d, 0 );
                }
            }
        }
        
        // copy result back to CPU
        trace_cpu_start( 0, "get", "get A" );
        magma_sgetmatrix_1D_col_bcyclic( m, n, dA, ldda, A, lda, ngpu, nb, queues );
        trace_cpu_end( 0 );
    }
    
    #ifdef TRACING
    char name[80];
    snprintf( name, sizeof(name), "sorgqr-n%d-ngpu%d.svg", m, ngpu );
    trace_finalize( name, "trace.css" );
    #endif
    
cleanup:
    for( d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magma_free( dA[d] );
        magma_queue_destroy( queues[d] );
    }
    magma_free_cpu( work );
    magma_setdevice( orig_dev );
    
    return *info;
} /* magma_sorgqr */
Example #2
0
extern "C" magma_int_t
magma_zpotrf2_mgpu(int num_gpus, char uplo, magma_int_t m, magma_int_t n,
                   magma_int_t off_i, magma_int_t off_j, magma_int_t nb,
                   magmaDoubleComplex **d_lA,  magma_int_t ldda,
                   magmaDoubleComplex **d_lP,  magma_int_t lddp,
                   magmaDoubleComplex *a,      magma_int_t lda,   magma_int_t h,
                   magma_queue_t stream[][3], magma_event_t event[][5],
                   magma_int_t *info )
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    ZPOTRF computes the Cholesky factorization of a complex Hermitian
    positive definite matrix dA.

    The factorization has the form
       dA = U**H * U,  if UPLO = 'U', or
       dA = L  * L**H,  if UPLO = 'L',
    where U is an upper triangular matrix and L is lower triangular.

    This is the block version of the algorithm, calling Level 3 BLAS.

    Arguments
    =========
    UPLO    (input) CHARACTER*1
            = 'U':  Upper triangle of dA is stored;
            = 'L':  Lower triangle of dA is stored.

    N       (input) INTEGER
            The order of the matrix dA.  N >= 0.

    dA      (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N)
            On entry, the Hermitian matrix dA.  If UPLO = 'U', the leading
            N-by-N upper triangular part of dA contains the upper
            triangular part of the matrix dA, and the strictly lower
            triangular part of dA is not referenced.  If UPLO = 'L', the
            leading N-by-N lower triangular part of dA contains the lower
            triangular part of the matrix dA, and the strictly upper
            triangular part of dA is not referenced.

            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization dA = U**H * U or dA = L * L**H.

    LDDA     (input) INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalescent memory accesses LDDA must be
            dividable by 16.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
            > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.
    =====================================================================   */


    magma_int_t     j, jb, nb0, nb2, dd, d, id, j_local, j_local2, buf;
    char            uplo_[2] = {uplo, 0};
    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    double          d_one     =  1.0;
    double          d_neg_one = -1.0;
    int upper = lapackf77_lsame(uplo_, "U");
    magmaDoubleComplex *dlpanel;
    //magma_event_t event0[MagmaMaxGPUs], // syrk
    //            event1[MagmaMaxGPUs], // send off-diagonal
    //            event2[MagmaMaxGPUs], // send diagonal
    //            event3[MagmaMaxGPUs]; // trsm
    magma_int_t n_local[MagmaMaxGPUs], ldpanel;
    int stream0 = 0, stream1 = 1;
    #ifdef ZTRSM_WORK
    magmaDoubleComplex *d_dinvA[MagmaMaxGPUs][2], *d_x[MagmaMaxGPUs][2]; /* used by ztrsm_work */
    #endif
    
    *info = 0;
    if ( (! upper) && (! lapackf77_lsame(uplo_, "L")) ) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (!upper && num_gpus*ldda < max(1,n)) {
        *info = -4;
    } else if (upper && ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    for( d=0; d<num_gpus; d++ ) {
        /* local-n and local-ld */
        if (upper) {
            n_local[d] = ((n/nb)/num_gpus)*nb;
            if (d < (n/nb)%num_gpus)
                n_local[d] += nb;
            else if (d == (n/nb)%num_gpus)
                n_local[d] += n%nb;
        } else {
            n_local[d] = ((m/nb)/num_gpus)*nb;
            if (d < (m/nb)%num_gpus)
                n_local[d] += nb;
            else if (d == (m/nb)%num_gpus)
                n_local[d] += m%nb;
        }
        //magma_setdevice(d);
        //magma_event_create( &event0[d] );
        //magma_event_create( &event1[d] );
        //magma_event_create( &event2[d] );
        //magma_event_create( &event3[d] );
    }
    magma_setdevice(0);

    /* == initialize the trace */
    trace_init( 1, num_gpus, 3, stream );

    /* Use blocked code. */
    if (upper) {
        /* ---------------------------------------------- */
        /* Upper-triangular case                          */
        /* > Compute the Cholesky factorization A = U'*U. */
        /* ---------------------------------------------- */
        
#if defined(PRECISION_d) && defined(ZTRSM_WORK)
        /* invert the diagonals
         * Allocate device memory for the inversed diagonal blocks, size=m*NB
         */
        for( d=0; d<num_gpus; d++ ) {
            magma_setdevice(d);
            for( j=0; j<2; j++ ) {
                magma_zmalloc( &d_dinvA[d][j], nb*nb );
                magma_zmalloc( &d_x[d][j],      n*nb );
                cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(magmaDoubleComplex));
                cudaMemset(d_x[d][j],     0,  n*nb*sizeof(magmaDoubleComplex));
            }
        }
        magma_setdevice(0);
#endif
        
        for (j=0; j<m; j+=nb) {

            /* Set the GPU number that holds the current panel */
            id  = (j/nb)%num_gpus;
            buf = (j/nb)%num_gpus;
            
            /* Set the local index where the current panel is */
            j_local = j/(nb*num_gpus);
            jb = min(nb, (m-j));
            
            if( j > 0 ) {
                /* needed on pluto... */
                magma_setdevice(id);
                magma_queue_sync( stream[id][stream0] ); // wait for the column on CPU

                /* broadcast off-diagonal column to all gpus */
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    if( d != id ) {
                        magma_setdevice(d);
                
                        /* wait for it on CPU */
                        magma_queue_wait_event( stream[d][stream0], event[id][1] );
                
                        /* send it to GPU */
                        trace_gpu_start( d, stream0, "comm", "rows to GPUs" );
                        magma_zsetmatrix_async( j, jb,
                                                Aup(0,j),        lda,
                                                dlP(d,jb,0,buf), lddp,
                                                stream[d][stream0] );
                        trace_gpu_end( d, stream0 );
                        magma_event_record( event[d][1], stream[d][stream0] );
                    }
                    d = (d+1)%num_gpus;
                }
            }
            
            /* Update the current diagonal block */
            magma_setdevice(id);
            if( j > 0 ) {
                magmablasSetKernelStream(stream[id][stream1]);
                trace_gpu_start( id, stream1, "syrk", "syrk" );
                magma_zherk(MagmaUpper, MagmaConjTrans, jb, j,
                            d_neg_one, dlA(id, 0, nb*j_local), ldda,
                            d_one,     dlA(id, j, nb*j_local), ldda);
                trace_gpu_end( id, stream1 );
                magma_event_record( event[id][0], stream[id][stream1] );
            }

            /* send the diagonal to cpu */
            magma_queue_wait_event( stream[id][stream0], event[id][0] ); // wait for syrk
            trace_gpu_start( id, stream0, "comm", "D to CPU" );
            magma_zgetmatrix_async( jb, jb,
                                    dlA(id, j, nb*j_local), ldda,
                                    Aup(j,j),               lda,
                                    stream[id][stream0] );
            trace_gpu_end( id, stream0 );

            if ( j > 0 ) {

                /* Compute the local block column of the panel. */
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    j_local2 = j_local+1;
                    if( d > id ) j_local2 --;
                    nb0 = nb*j_local2;
                
                    if( n_local[d] > nb0 ) {
                        /* wait for the off-diagonal */
                        if( d != id ) {
                            //magma_queue_sync( stream[id][3] );
                            dlpanel = dlP(d, jb, 0, buf);
                            ldpanel = lddp;
                
                            /* wait for the offdiagonal column */
                            magma_queue_wait_event( stream[d][stream1], event[d][1] );
                        } else {
                            dlpanel = dlA(d, 0, nb*j_local);
                            ldpanel = ldda;
                        }
                        
                        /* update the panel */
                        magma_setdevice(d);
                        magmablasSetKernelStream(stream[d][stream1]);
                        trace_gpu_start( d, stream1, "gemm", "gemm" );
                        magma_zgemm(MagmaConjTrans, MagmaNoTrans,
                                    jb, n_local[d]-nb0, j,
                                    c_neg_one, dlpanel,        ldpanel,
                                               dlA(d, 0, nb0), ldda,
                                    c_one,     dlA(d, j, nb0), ldda);
                        trace_gpu_end( d, stream1 );
                    }
                    d = (d+1)%num_gpus;
                }
            }
            
            /* factor the diagonal */
            magma_setdevice(id);
            magma_queue_sync( stream[id][stream0] ); // wait for the diagonal
            trace_cpu_start( 0, "getrf", "getrf" );
            lapackf77_zpotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info);
            trace_cpu_end( 0 );
            if (*info != 0) {
                *info = *info + j;
                break;
            }

            /* send the diagonal to gpus */
            if ( (j+jb) < n) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    magma_setdevice(d);
                    if( d == id ) {
                        dlpanel = dlA(d, j, nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlP(d, 0, 0, buf);
                        ldpanel = lddp;
                    }
                    
                    trace_gpu_start( d, stream0, "comm", "D to GPUs" );
                    magma_zsetmatrix_async( jb, jb,
                                            Aup(j,j), lda,
                                            dlpanel,  ldpanel,
                                            stream[d][stream0] );
                    trace_gpu_end( d, stream0 );
                    magma_event_record( event[d][2], stream[d][stream0] );
                    d = (d+1)%num_gpus;
                }
            } else {
                magma_setdevice(id);
                trace_gpu_start( id, stream0, "comm", "D to GPUs" );
                magma_zsetmatrix_async( jb, jb,
                                        Aup(j,j),               lda,
                                        dlA(id, j, nb*j_local), ldda,
                                        stream[id][stream0] );
                trace_gpu_end( id, stream0 );
            }
            
            /* panel-factorize the off-diagonal */
            if ( (j+jb) < n) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    /* next column */
                    j_local2 = j_local+1;
                    if( d > id ) j_local2--;
                    if( d == id ) {
                        dlpanel = dlA(d, j, nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlP(d, 0, 0, buf);
                        ldpanel = lddp;
                    }
                    nb2 = n_local[d]-nb*j_local2;
                    nb0 = min(nb, nb2 );
                    
                    magma_setdevice(d);
                    magmablasSetKernelStream(stream[d][stream1]);
                    magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for the diagonal
                    if( j+jb < m && d == (j/nb+1)%num_gpus ) {
                        /* owns the next column, look-ahead the column */
                        trace_gpu_start( d, stream1, "trsm", "trsm" );
#if defined(PRECISION_d) && defined(ZTRSM_WORK)
                        magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                              jb, nb0, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, j, nb*j_local2), ldda,
                                              d_dinvA[d][0], d_x[d][0] );
                        /*nb2 = n_local[d] - j_local2*nb;
                        magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                              jb, nb2, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, j, nb*j_local2), ldda,
                                              d_dinvA[d], d_x[d] );*/
#else
                        /*nb2 = n_local[d] - j_local2*nb;
                        magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel,                ldda,
                                     dlA(d, j, nb*j_local2), ldda);
                        */
                        magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                     jb, nb0, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, j, nb*j_local2), ldda);
#endif
                        trace_gpu_end( d, stream1 );
                        magma_event_record( event[d][3], stream[d][stream1] );
                        
                        /* send the column to cpu */
                        if( j+jb < m ) {
                            trace_gpu_start( d, stream0, "comm", "rows to CPU" );
                            magma_queue_wait_event( stream[d][stream0], event[d][3] ); // wait for lookahead
                            magma_zgetmatrix_async( (j+jb), nb0,
                                                    dlA(d, 0, nb*j_local2), ldda,
                                                    Aup(0,j+jb),            lda,
                                                    stream[d][stream0] );
                            trace_gpu_end( d, stream0 );
                            magma_event_record( event[d][1], stream[d][stream0] );
                        }
                        
                        /* update the remaining blocks */
                        nb2 = nb2 - nb0;
#if defined(PRECISION_d) && defined(ZTRSM_WORK)
                        magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                              jb, nb2, c_one,
                                              dlpanel,                    ldpanel,
                                              dlA(d, j, nb*j_local2+nb0), ldda,
                                              d_dinvA[d][1], d_x[d][1] );
#else
                        magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel,                    ldpanel,
                                     dlA(d, j, nb*j_local2+nb0), ldda);
#endif
                    } else if( nb2 > 0 ) {
                        /* update the entire trailing matrix */
                        trace_gpu_start( d, stream1, "trsm", "trsm" );
#if defined(PRECISION_d) && defined(ZTRSM_WORK)
                        magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                              jb, nb2, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, j, nb*j_local2), ldda,
                                    d_dinvA[d][1], d_x[d][1] );
#else
                        magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, j, nb*j_local2), ldda);
#endif
                        trace_gpu_end( d, stream1 );
                    }
                    d = (d+1)%num_gpus;
                }
            } /* end of ztrsm */
        } /* end of for j=1, .., n */
    } else {
        /* -------------------------------------------- */
        /* Lower-triangular case                        */
        /* Compute the Cholesky factorization A = L*L'. */
        /* -------------------------------------------- */
#if defined(PRECISION_d) && defined(ZTRSM_WORK)
        /*
         * Allocate device memory for the inversed diagonal blocks, size=N*BLOCK_SIZE
         */
        for( d=0; d<num_gpus; d++ ) {
            magma_setdevice(d);
            for( j=0; j<2; j++ ) {
                magma_zmalloc( &d_dinvA[d][j], nb*nb );
                magma_zmalloc( &d_x[d][j],     nb*m  );
                cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(magmaDoubleComplex));
                cudaMemset(d_x[d][j],     0, nb* m*sizeof(magmaDoubleComplex));
            }
        }
        magma_setdevice(0);
#endif

        for (j=0; j<n; j+=nb) {

            /* Set the GPU number that holds the current panel */
            id  = (j/nb)%num_gpus;
            buf = (j/nb)%num_gpus;
            
            /* Set the local index where the current panel is */
            j_local = j/(nb*num_gpus);
            jb = min(nb, (n-j));
            
            if( j > 0 ) {
                /* needed on pluto... */
                magma_setdevice(id);
                magma_queue_sync( stream[id][stream0] ); // wait for the column on CPU

                /* broadcast offdiagonal row to all gpus */
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    if( d != id ) {
                        magma_setdevice(d);
                        /* wait for it on CPU */
                        magma_queue_wait_event( stream[d][stream0], event[id][1] );
            
                        /* send it to GPU */
                        magma_zsetmatrix_async( jb, j,
                                                Alo(j,0),         lda,
                                                dlPT(d,0,jb,buf), nb,
                                                stream[d][stream0] );
                        magma_event_record( event[d][1], stream[d][stream0] );
                    }
                    d = (d+1)%num_gpus;
                }
            }

            /* Update the current diagonal block */
            magma_setdevice(id);
            if( j > 0 ) {
                magmablasSetKernelStream(stream[id][stream1]);
                magma_zherk(MagmaLower, MagmaNoTrans, jb, j,
                            d_neg_one, dlA(id, nb*j_local, 0), ldda,
                            d_one,     dlA(id, nb*j_local, j), ldda);
                magma_event_record( event[id][0], stream[id][stream1] );
            }
            
            /* send the diagonal to cpu */
            magma_queue_wait_event( stream[id][stream0], event[id][0] ); // wait for syrk
            magma_zgetmatrix_async( jb, jb,
                                    dlA(id, nb*j_local, j), ldda,
                                    Alo(j,j),               lda,
                                    stream[id][stream0] );

            /* update the offdiagonal blocks */
            if ( j > 0 ) {
                /* compute the block-rows of the panel */
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    j_local2 = j_local+1;
                    if( d > id ) j_local2 --;
                    nb0 = nb*j_local2;
            
                    if( nb0 < n_local[d] ) {
                        if( d != id ) {
                            dlpanel = dlPT(d, 0, jb, buf);
                            ldpanel = nb;
            
                            /* wait for offdiagonal row */
                            magma_queue_wait_event( stream[d][stream1], event[d][1] );
                        } else {
                            dlpanel = dlA(d, nb*j_local, 0);
                            ldpanel = ldda;
                        }
            
                        magma_setdevice(d);
                        magmablasSetKernelStream(stream[d][stream1]);
                        magma_zgemm( MagmaNoTrans, MagmaConjTrans,
                                     n_local[d]-nb0, jb, j,
                                     c_neg_one, dlA(d, nb0, 0), ldda,
                                                dlpanel,        ldpanel,
                                     c_one,     dlA(d, nb0, j), ldda);
                    }
                    d = (d+1)%num_gpus;
                }
            }

            /* factor the diagonal */
            magma_setdevice(id);
            magma_queue_sync( stream[id][stream0] );
            lapackf77_zpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info);
            if (*info != 0) {
                *info = *info + j;
                break;
            }

            /* send the diagonal to gpus */
            if ( (j+jb) < m ) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    magma_setdevice(d);
                    if( d == id ) {
                        dlpanel = dlA(d, nb*j_local, j);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlPT(d, 0, 0, buf);
                        ldpanel = nb;
                    }
                    magma_zsetmatrix_async( jb, jb,
                                            Alo(j,j), lda,
                                            dlpanel,  ldpanel,
                                            stream[d][stream0] );
                    magma_event_record( event[d][2], stream[d][stream0] );
                    d = (d+1)%num_gpus;
                }
            } else {
                magma_setdevice(id);
                magma_zsetmatrix_async( jb, jb,
                                        Alo(j,j),               lda,
                                        dlA(id, nb*j_local, j), ldda,
                                        stream[id][stream0] );
            }

            /* factorize off-diagonal blocks */
            if ( (j+jb) < m ) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    /* next column */
                    j_local2 = j_local+1;
                    if( d > id ) j_local2--;
                    if( d == id ) {
                        dlpanel = dlA(d, nb*j_local, j);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlPT(d, 0, 0, buf);
                        ldpanel = nb;
                    }
                    nb2 = n_local[d] - j_local2*nb;
                    nb0 = min(nb, nb2 );
            
                    magma_setdevice(d);
                    magmablasSetKernelStream(stream[d][stream1]);
                    magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for the diagonal
                    if( j+jb < n && d == (j/nb+1)%num_gpus ) {
                        /* owns the next column, look-ahead the column */
#if defined(PRECISION_d) && defined(ZTRSM_WORK)
                        magmablas_ztrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                              nb0, jb, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, nb*j_local2, j), ldda,
                                              d_dinvA[d][0], d_x[d][0]);
#else
                        magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                     nb0, jb, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, nb*j_local2, j), ldda);
#endif
                        magma_event_record( event[d][3], stream[d][stream1] );

                        /* send the column to cpu */
                        if( j+jb < n ) {
                            magma_queue_wait_event( stream[d][stream0], event[d][3] ); // wait for lookahead
                            magma_zgetmatrix_async( nb0, j+jb,
                                                    dlA(d, nb*j_local2, 0), ldda,
                                                    Alo(j+jb,0),            lda,
                                                    stream[d][stream0] );
                            magma_event_record( event[d][1], stream[d][stream0] );
                        }

                        /* update the remaining blocks */
                        nb2 = nb2 - nb0;
#if defined(PRECISION_d) && defined(ZTRSM_WORK)
                        magmablas_ztrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                              nb2, jb, c_one,
                                              dlpanel,                    ldpanel,
                                              dlA(d, nb*j_local2+nb0, j), ldda,
                                              d_dinvA[d][1], d_x[d][1] );
#else
                        magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                     nb2, jb, c_one,
                                     dlpanel,                    ldpanel,
                                     dlA(d, nb*j_local2+nb0, j), ldda);
#endif
                    } else if( nb2 > 0 ) {
                        /* update the entire trailing matrix */
#if defined(PRECISION_d) && defined(ZTRSM_WORK)
                        magmablas_ztrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                              nb2, jb, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, nb*j_local2, j), ldda,
                                              d_dinvA[d][1], d_x[d][1] );
#else
                        magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                     nb2, jb, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, nb*j_local2, j), ldda);
#endif
                    }
                    d = (d+1)%num_gpus;
                }
            }
        }
    } /* end of else not upper */

    /* == finalize the trace == */
    trace_finalize( "zpotrf.svg","trace.css" );

    /* clean up */
    for( d=0; d<num_gpus; d++ ) {
        magma_setdevice(d);
        magma_queue_sync( stream[d][0] );
        magma_queue_sync( stream[d][1] );
        magmablasSetKernelStream(NULL);

        //magma_event_destroy( event0[d] );
        //magma_event_destroy( event1[d] );
        //magma_event_destroy( event2[d] );
        //magma_event_destroy( event3[d] );
    }
    magma_setdevice(0);

    return *info;
} /* magma_zpotrf_mgpu */
Example #3
0
extern "C" magma_int_t
magma_zpotrf3_mgpu(magma_int_t num_gpus, char uplo, magma_int_t m, magma_int_t n,
                   magma_int_t off_i, magma_int_t off_j, magma_int_t nb,
                   magmaDoubleComplex *d_lA[],  magma_int_t ldda,
                   magmaDoubleComplex *d_lP[],  magma_int_t lddp,
                   magmaDoubleComplex *a,       magma_int_t lda, magma_int_t h,
                   magma_queue_t stream[][3], magma_event_t event[][5],
                   magma_int_t *info )
{
    /*  -- MAGMA (version 1.4.0) --
           Univ. of Tennessee, Knoxville
           Univ. of California, Berkeley
           Univ. of Colorado, Denver
           August 2013

        Purpose
        =======
        ZPOTRF computes the Cholesky factorization of a complex Hermitian
        positive definite matrix dA.
        Auxiliary subroutine for zpotrf2_ooc. It is multiple gpu interface to compute
        Cholesky of a "rectangular" matrix.

        The factorization has the form
           dA = U**H * U,  if UPLO = 'U', or
           dA = L  * L**H,  if UPLO = 'L',
        where U is an upper triangular matrix and L is lower triangular.

        This is the block version of the algorithm, calling Level 3 BLAS.

        Arguments
        =========
        UPLO    (input) CHARACTER*1
                = 'U':  Upper triangle of dA is stored;
                = 'L':  Lower triangle of dA is stored.

        N       (input) INTEGER
                The order of the matrix dA.  N >= 0.

        dA      (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N)
                On entry, the Hermitian matrix dA.  If UPLO = 'U', the leading
                N-by-N upper triangular part of dA contains the upper
                triangular part of the matrix dA, and the strictly lower
                triangular part of dA is not referenced.  If UPLO = 'L', the
                leading N-by-N lower triangular part of dA contains the lower
                triangular part of the matrix dA, and the strictly upper
                triangular part of dA is not referenced.

                On exit, if INFO = 0, the factor U or L from the Cholesky
                factorization dA = U**H * U or dA = L * L**H.

        LDDA     (input) INTEGER
                The leading dimension of the array dA.  LDDA >= max(1,N).
                To benefit from coalescent memory accesses LDDA must be
                dividable by 16.

        INFO    (output) INTEGER
                = 0:  successful exit
                < 0:  if INFO = -i, the i-th argument had an illegal value
                > 0:  if INFO = i, the leading minor of order i is not
                      positive definite, and the factorization could not be
                      completed.
        =====================================================================   */


    magma_int_t     j, jb, nb0, nb2, d, dd, id, j_local, j_local2, buf;
    char            uplo_[2] = {uplo, 0};
    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    double          d_one     =  1.0;
    double          d_neg_one = -1.0;
    int upper = lapackf77_lsame(uplo_, "U");
    magmaDoubleComplex *dlpanel;
    magma_int_t n_local[MagmaMaxGPUs], ldpanel;
    const magma_int_t stream1 = 0, stream2 = 1, stream3 = 2;
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK)
    /* used by ztrsm_work */
    int trsm_nb = 128;
    int trsm_n = trsm_nb*((nb+trsm_nb-1)/trsm_nb);
    magmaDoubleComplex *d_dinvA[MagmaMaxGPUs];
    magmaDoubleComplex *d_x[MagmaMaxGPUs];
#define dinvA(d,j) &(d_dinvA[(d)][(j)*trsm_nb*trsm_n])
#define dx(d,j) &(d_x[(d)][(j)*nb*m])
    /*
     * Allocate device memory for the inversed diagonal blocks, size=N*BLOCK_SIZE
     */
    for( d=0; d<num_gpus; d++ ) {
        magma_setdevice(d);
        if ( (MAGMA_SUCCESS != magma_zmalloc( &d_dinvA[d], 2*trsm_nb*trsm_n )) ||
                (MAGMA_SUCCESS != magma_zmalloc( &d_x[d],     2*nb*(upper ? n : m) )) ) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
    }
    magma_setdevice(0);
#endif

    *info = 0;
    if ( (! upper) && (! lapackf77_lsame(uplo_, "L")) ) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (!upper && num_gpus*ldda < max(1,n)) {
        *info = -4;
    } else if (upper && ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* initialization */
    for( d=0; d<num_gpus; d++ ) {
        /* local-n and local-ld */
        if (upper) {
            n_local[d] = (n/(nb*num_gpus))*nb;
            if (d < (n/nb)%num_gpus)
                n_local[d] += nb;
            else if (d == (n/nb)%num_gpus)
                n_local[d] += n%nb;
        } else {
            n_local[d] = (m/(nb*num_gpus))*nb;
            if (d < (m/nb)%num_gpus)
                n_local[d] += nb;
            else if (d == (m/nb)%num_gpus)
                n_local[d] += m%nb;
        }
    }

    /* == initialize the trace */
    trace_init( 1, num_gpus, 3, (CUstream_st**)stream );

    if (upper)
    {
        /* ---------------------------------------------- */
        /* Upper-triangular case                          */
        /* > Compute the Cholesky factorization A = U'*U. */
        /* ---------------------------------------------- */
        for (j=0; j<m; j+=nb) {

            /* Set the GPU number that holds the current panel */
            id  = (j/nb)%num_gpus;
            buf = (j/nb)%num_gpus; // right now, we have num_gpu buffers, so id and buf are the same..

            /* Set the local index where the current panel is */
            j_local = j/(nb*num_gpus);
            jb = min(nb, (m-j));

            /* Update the current diagonal block on stream1 */
            magma_setdevice(id);
            if( j > 0 ) {
                magmablasSetKernelStream(stream[id][stream1]);
                trace_gpu_start( id, stream1, "syrk", "syrk" );
                magma_zherk(MagmaUpper, MagmaConjTrans, jb, j,
                            d_neg_one, dlA(id, 0, nb*j_local), ldda,
                            d_one,     dlA(id, j, nb*j_local), ldda);
                trace_gpu_end( id, stream1 );
            }

            /* send the diagonal to cpu on stream1 */
            trace_gpu_start( id, stream1, "comm", "D to CPU" );
            magma_zgetmatrix_async( jb, jb,
                                    dlA(id, j, nb*j_local), ldda,
                                    Aup(j,j),               lda,
                                    stream[id][stream1] );
            trace_gpu_end( id, stream1 );

            /* update off-diagonal blocks in the panel */
            if( j > 0 ) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    j_local2 = j_local+1;
                    if( d > id ) j_local2 --;
                    nb0 = nb*j_local2; // number of local columns in the panel, while jb is panel-size (number of rows)

                    if( n_local[d] > nb0 ) {
                        magma_setdevice(d);
                        magmablasSetKernelStream(stream[d][stream2]);
                        if( d == id ) {
                            dlpanel = dlA(d,0,nb*j_local);
                            ldpanel = ldda;
                            // the GPU owns the row from start, and no need of synch.
                            //magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu
                        } else {
                            dlpanel = dlP(d,nb,0,buf);
                            ldpanel = lddp;
                            magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu
                        }
                        trace_gpu_start( d, stream2, "gemm", "gemm" );
                        magma_zgemm(MagmaConjTrans, MagmaNoTrans,
                                    jb, n_local[d]-nb0, j,
                                    c_neg_one, dlpanel,        ldpanel,
                                    dlA(d, 0, nb0), ldda,
                                    c_one,     dlA(d, j, nb0), ldda);
                        trace_gpu_end( d, stream2 );
                        magma_event_record( event[d][2], stream[d][stream2] );
                    }
                    d = (d+1)%num_gpus;
                }
            }

            /* wait for panel and factorize it on cpu */
            magma_setdevice(id);
            magma_queue_sync( stream[id][stream1] );
            trace_cpu_start( 0, "getrf", "getrf" );
            lapackf77_zpotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info);
            trace_cpu_end( 0 );
            if (*info != 0) {
                *info = *info + j;
                break;
            }

            /* send the diagonal to gpus on stream1 */
            if ( (j+jb) < n) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    if( d == id ) {
                        dlpanel = dlA(d, j, nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlP(d,0,0,buf);
                        ldpanel = lddp;
                    }
                    magma_setdevice(d);
                    trace_gpu_start( d, stream1, "comm", "comm" );
                    magma_zsetmatrix_async( jb, jb,
                                            Aup(j,j), lda,
                                            dlpanel,  ldpanel,
                                            stream[d][stream1] );
                    trace_gpu_end( d, stream1 );
                    magma_event_record( event[d][1], stream[d][stream1] );
                    d = (d+1)%num_gpus;
                }
            } else {
                magma_setdevice(id);
                trace_gpu_start( id, stream1, "comm", "comm" );
                magma_zsetmatrix_async( jb, jb,
                                        Aup(j,j),               lda,
                                        dlA(id, j, nb*j_local), ldda,
                                        stream[id][stream1] );
                trace_gpu_end( id, stream1 );
            }

            /* panel-factorize the off-diagonal */
            if ( (j+jb) < n) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    /* next column */
                    j_local2 = j_local+1;
                    if( d > id ) j_local2--;
                    if( d == id ) {
                        dlpanel = dlA(d,j,nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlP(d,0,0,buf);
                        ldpanel = lddp;
                    }
                    nb2 = n_local[d] - j_local2*nb;

                    magma_setdevice(d);
                    if( j+jb < m && d == (j/nb+1)%num_gpus ) {
                        /* owns the next column, look-ahead next block on stream1 */
                        nb0 = min(nb, nb2);
                        magmablasSetKernelStream(stream[d][stream1]);
                        magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for gemm update
                        trace_gpu_start( d, stream1, "trsm", "trsm" );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK)
                        magmablas_zlaset( MagmaUpperLower, trsm_nb, trsm_n, dinvA(d,0),trsm_nb );
                        magmablas_zlaset( MagmaUpperLower, nb0,jb, dx(d,0),nb0 );
                        magmablas_ztrsm_work( MagmaLeft, MagmaUpper,
                                              MagmaConjTrans, MagmaNonUnit,
                                              jb, nb0, c_one,
                                              dlpanel, ldpanel,
                                              dlA(d, j, nb*j_local2), ldda,
                                              1, dinvA(d,0), dx(d,0) );
#else
                        magma_ztrsm( MagmaLeft, MagmaUpper,
                                     MagmaConjTrans, MagmaNonUnit,
                                     jb, nb0, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, j, nb*j_local2), ldda);
#endif
                        magma_event_record( event[d][4], stream[d][stream1] );
                        trace_gpu_end( d, stream1 );
                    } else if( nb2 > 0 ) {
                        /* update all the blocks on stream2 */
                        magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for cholesky factor
                        trace_gpu_start( d, stream2, "trsm", "trsm" );
                        magmablasSetKernelStream(stream[d][stream2]);
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK)
                        magmablas_zlaset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,0),trsm_nb );
                        magmablas_zlaset( MagmaUpperLower, nb2,jb, dx(d,0),nb2 );
                        magmablas_ztrsm_work( MagmaLeft, MagmaUpper,
                                              MagmaConjTrans, MagmaNonUnit,
                                              jb, nb2, c_one,
                                              dlpanel, ldpanel,
                                              dlA(d, j, nb*j_local2), ldda,
                                              1, dinvA(d,0), dx(d,0) );
#else
                        magma_ztrsm( MagmaLeft, MagmaUpper,
                                     MagmaConjTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, j, nb*j_local2), ldda);
#endif
                        trace_gpu_end( d, stream2 );
                    }
                    d = (d+1)%num_gpus;
                } /* end of for */

                /* ========================================================== */
                if( j+jb < m ) {
                    d = (j/nb+1)%num_gpus;
                    /* next column */
                    j_local2 = j_local+1;
                    if( d > id ) j_local2--;
                    nb0 = min(nb, n_local[d]-nb*j_local2 );

                    /* even on 1 gpu, off-diagonals are copied to cpu (synchronize at the end).      *
                     * so we have the Cholesky factor, but only diagonal submatrix of the big panel, *
                     * on cpu at the end.                                                            */
                    int d2, buf2;
                    magma_setdevice(d);
                    /* lookahead done */
                    magma_queue_wait_event( stream[d][stream3], event[d][4] );

                    trace_gpu_start( d, stream3, "comm", "row to CPU" );
                    magma_zgetmatrix_async( (j+jb), nb0,
                                            dlA(d, 0, nb*j_local2), ldda,
                                            Aup(0,j+jb),            lda,
                                            stream[d][stream3] );
                    trace_gpu_end( d, stream3 );
                    magma_event_record( event[d][3], stream[d][stream3] );
                    /* needed on pluto */
                    //magma_queue_sync( stream[d][stream3] );

                    /* broadcast rows to gpus on stream2 */
                    buf2 = ((j+jb)/nb)%num_gpus;
                    for( d2=0; d2<num_gpus; d2++ ) {
                        if( d2 != d )
                        {
                            magma_setdevice(d2);
                            trace_gpu_start( d2, stream3, "comm", "row to GPUs" );
                            magma_queue_wait_event( stream[d2][stream3], event[d][3] ); // rows arrived at cpu on stream3
                            magma_zsetmatrix_async( j+jb, nb0,
                                                    Aup(0,j+jb),       lda,
                                                    dlP(d2,nb,0,buf2), lddp,
                                                    stream[d2][stream3] );
                            trace_gpu_end( d2, stream3 );
                            magma_event_record( event[d2][0], stream[d2][stream3] );
                        }
                    }

                    /* =========================== */
                    /* update the remaining blocks */
                    nb2 = n_local[d]-(nb*j_local2 + nb0);
                    if( nb2 > 0 ) {
                        if( d == id ) {
                            dlpanel = dlA(d, j, nb*j_local);
                            ldpanel = ldda;
                        } else {
                            dlpanel = dlP(d,0,0,buf);
                            ldpanel = lddp;
                        }
                        magma_setdevice(d);
                        magmablasSetKernelStream(stream[d][stream2]);
                        trace_gpu_start( d, stream2, "trsm", "trsm" );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK)
                        int flag = 0;
                        if (flag == 0) {
                            magma_queue_wait_event( stream[d][stream2], event[d][4] ); // lookahead -> diagonal inversion
                        } else {
                            magmablas_zlaset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,flag),trsm_nb );
                            magma_queue_wait_event( stream[d][stream2], event[d][1] ); // panel received
                        }
                        magmablas_zlaset( MagmaUpperLower, nb2,jb, dx(d,1),nb2 );
                        magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                              jb, nb2, c_one,
                                              dlpanel, ldpanel,
                                              dlA(d, j, nb*j_local2+nb0), ldda,
                                              flag, dinvA(d,flag), dx(d,1) );
#else
                        magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for cholesky factor
                        magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel, ldpanel,
                                     dlA(d, j, nb*j_local2+nb0), ldda);
#endif
                        trace_gpu_end( d, stream2 );
                    }
                }
            } /* end of ztrsm */
        } /* end of for j=1, .., n */
    } else {

        /* ---------------------------------------------- */
        /* Lower-triangular case                          */
        /* > Compute the Cholesky factorization A = L*L'. */
        /* ---------------------------------------------- */
        for (j=0; j<n; j+=nb) {

            /* Set the GPU number that holds the current panel */
            id  = (j/nb)%num_gpus;
            buf = (j/nb)%num_gpus;

            /* Set the local index where the current panel is */
            j_local = j/(nb*num_gpus);
            jb = min(nb, (n-j));

            /* Update the current diagonal block on stream1 */
            magma_setdevice(id);
            if( j > 0 ) {
                magmablasSetKernelStream(stream[id][stream1]);
                magma_zherk(MagmaLower, MagmaNoTrans, jb, j,
                            d_neg_one, dlA(id, nb*j_local, 0), ldda,
                            d_one,     dlA(id, nb*j_local, j), ldda);
            }

            /* send the diagonal to cpu on stream1 */
            magma_zgetmatrix_async( jb, jb,
                                    dlA(id, nb*j_local, j), ldda,
                                    Alo(j,j),               lda,
                                    stream[id][stream1] );

            /* update off-diagonal blocks of the panel */
            if( j > 0 ) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    j_local2 = j_local+1;
                    if( d > id ) j_local2 --;
                    nb0 = nb*j_local2;

                    if( nb0 < n_local[d] ) {
                        magma_setdevice(d);
                        magmablasSetKernelStream(stream[d][stream2]);
                        if( d == id ) {
                            dlpanel = dlA(d, nb*j_local, 0);
                            ldpanel = ldda;
                        } else {
                            dlpanel = dlPT(d,0,nb,buf);
                            ldpanel = nb;
                            magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu
                        }
                        magma_zgemm( MagmaNoTrans, MagmaConjTrans,
                                     n_local[d]-nb0, jb, j,
                                     c_neg_one, dlA(d, nb0, 0), ldda,
                                     dlpanel,        ldpanel,
                                     c_one,     dlA(d, nb0, j), ldda);
                        magma_event_record( event[d][2], stream[d][stream2] );
                    }
                    d = (d+1)%num_gpus;
                }
            }

            /* wait for the panel and factorized it on cpu */
            magma_setdevice(id);
            magma_queue_sync( stream[id][stream1] );
            lapackf77_zpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info);
            if (*info != 0) {
                *info = *info + j;
                break;
            }

            /* send the diagonal to gpus on stream1 */
            if ( (j+jb) < m) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    if( d == id ) {
                        dlpanel = dlA(d, nb*j_local, j);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlPT(d, 0, 0, buf);
                        ldpanel = nb;
                    }
                    magma_setdevice(d);
                    magma_zsetmatrix_async( jb, jb,
                                            Alo(j,j), lda,
                                            dlpanel,  ldpanel,
                                            stream[d][stream1] );
                    magma_event_record( event[d][1], stream[d][stream1] );
                    d = (d+1)%num_gpus;
                }
            } else {
                magma_setdevice(id);
                magma_zsetmatrix_async( jb, jb,
                                        Alo(j,j),               lda,
                                        dlA(id, nb*j_local, j), ldda,
                                        stream[id][stream1] );
            }

            /* panel factorize the off-diagonal */
            if ( (j+jb) < m) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    /* next column */
                    j_local2 = j_local+1;
                    if( d > id ) j_local2--;
                    if( d == id ) {
                        dlpanel = dlA(d, nb*j_local, j);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlPT(d, 0, 0, buf);
                        ldpanel = nb;
                    }
                    nb2 = n_local[d] - j_local2*nb;
                    nb0 = min(nb, nb2);

                    magma_setdevice(d);
                    if( j+nb < n && d == (j/nb+1)%num_gpus ) { /* owns next column, look-ahead next block on stream1 */
                        if ( j > 0 ) magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for gemm update
                        magmablasSetKernelStream(stream[d][stream1]);
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK)
                        magmablas_zlaset( MagmaUpperLower, trsm_nb, trsm_n, dinvA(d,0),trsm_nb );
                        magmablas_zlaset( MagmaUpperLower, nb0,jb, dx(d,0),nb0 );
                        magmablas_ztrsm_work( MagmaRight, MagmaLower,
                                              MagmaConjTrans, MagmaNonUnit,
                                              nb0, jb, c_one,
                                              dlpanel, ldpanel,
                                              dlA(d, nb*j_local2, j), ldda,
                                              1, dinvA(d,0), dx(d,0) );
#else
                        magma_ztrsm( MagmaRight, MagmaLower,
                                     MagmaConjTrans, MagmaNonUnit,
                                     nb0, jb, c_one,
                                     dlpanel, ldpanel,
                                     dlA(d, nb*j_local2, j), ldda);
#endif
                        magma_event_record( event[d][4], stream[d][stream1] );
                    } else if( nb2 > 0 ) { /* other gpus updating all the blocks on stream2 */
                        /* update the entire column */
                        magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for the cholesky factor
                        magmablasSetKernelStream(stream[d][stream2]);
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK)
                        magmablas_zlaset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,0),trsm_nb );
                        magmablas_zlaset( MagmaUpperLower, nb2,jb, dx(d,0),nb2 );
                        magmablas_ztrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                              nb2, jb, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, nb*j_local2, j), ldda,
                                              1, dinvA(d,0), dx(d,0) );
#else
                        magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                     nb2, jb, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, nb*j_local2, j), ldda);
#endif
                    }
                    d = (d+1)%num_gpus;
                } /* end for d */

                /* ========================================================== */
                if( j+jb < n ) {
                    d = (j/nb+1)%num_gpus;
                    /* next column */
                    j_local2 = j_local+1;
                    if( d > id ) j_local2--;
                    nb0 = min(nb, n_local[d]-nb*j_local2 );

                    /* even on 1 gpu, we copy off-diagonal to cpu (but don't synchronize).  */
                    /* so we have the Cholesky factor on cpu at the end.                    */
                    int d2, buf2;
//#define ZPOTRF_DEVICE_TO_DEVICE
#ifdef ZPOTRF_DEVICE_TO_DEVICE
                    // lookahead done

                    /* broadcast the rows to gpus */
                    buf2 = ((j+jb)/nb)%num_gpus;
                    for( d2=0; d2<num_gpus; d2++ ) {
                        magma_setdevice(d2);
                        magma_queue_wait_event( stream[d2][stream3], event[d][4] );
                        if( d2 != d ) {
                            magma_zcopymatrix_async( nb0, j+jb,
                                                     dlPT(d2,0,nb,buf2), nb, // first nbxnb reserved for diagonal block
                                                     dlA(d, nb*j_local2, 0), ldda,
                                                     stream[d2][stream3] );
                            magma_event_record( event[d2][0], stream[d2][stream3] );
                        } else {
                            magma_zgetmatrix_async( nb0, j+jb,
                                                    dlA(d, nb*j_local2, 0), ldda,
                                                    Alo(j+jb,0),            lda,
                                                    stream[d][stream3] );
                        }
                    }
#else
                    // lookahead done
                    magma_setdevice(d);
                    magma_queue_wait_event( stream[d][stream3], event[d][4] );
                    magma_zgetmatrix_async( nb0, j+jb,
                                            dlA(d, nb*j_local2, 0), ldda,
                                            Alo(j+jb,0),            lda,
                                            stream[d][stream3] );
                    magma_event_record( event[d][3], stream[d][stream3] );
                    /* syn on rows on CPU, seem to be needed on Pluto */
                    //magma_queue_sync( stream[d][stream3] );

                    /* broadcast the rows to gpus */
                    buf2 = ((j+jb)/nb)%num_gpus;
                    for( d2=0; d2<num_gpus; d2++ ) {
                        if( d2 != d )
                        {
                            magma_setdevice(d2);
                            magma_queue_wait_event( stream[d2][stream3], event[d][3] ); // getmatrix done
                            magma_zsetmatrix_async( nb0, j+jb,
                                                    Alo(j+jb,0),        lda,
                                                    dlPT(d2,0,nb,buf2), nb, // first nbxnb reserved for diagonal block
                                                    stream[d2][stream3] );
                            magma_event_record( event[d2][0], stream[d2][stream3] );
                        }
                    }
#endif
                    /* =================================== */
                    /* updates remaining blocks on stream2 */
                    nb2 = n_local[d] - (j_local2*nb + nb0);
                    if( nb2 > 0 ) {
                        if( d == id ) {
                            dlpanel = dlA(d, nb*j_local, j);
                            ldpanel = ldda;
                        } else {
                            dlpanel = dlPT(d,0,0,buf);
                            ldpanel = nb;
                        }
                        magma_setdevice(d);
                        magmablasSetKernelStream(stream[d][stream2]);
                        /* update the remaining blocks in the column */
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK)
                        int flag = 0;
                        if (flag == 0) {
                            magma_queue_wait_event( stream[d][stream2], event[d][4] ); // lookahead -> diagonal inversion
                        } else {
                            magmablas_zlaset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,flag),trsm_nb );
                            magma_queue_wait_event( stream[d][stream2], event[d][1] ); // panel received
                        }
                        magmablas_zlaset( MagmaUpperLower, nb2,jb, dx(d,1),nb2 );
                        magmablas_ztrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                              nb2, jb, c_one,
                                              dlpanel,                    ldpanel,
                                              dlA(d, nb*j_local2+nb0, j), ldda,
                                              flag, dinvA(d,flag), dx(d,1) );
#else
                        magma_queue_wait_event( stream[d][stream2], event[d][1] ); // panel received
                        magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                     nb2, jb, c_one,
                                     dlpanel,                    ldpanel,
                                     dlA(d, nb*j_local2+nb0, j), ldda);
#endif
                    }
                }
            }
        }
    } /* end of else not upper */

    /* == finalize the trace == */
    trace_finalize( "zpotrf.svg","trace.css" );
    for( d=0; d<num_gpus; d++ ) {
        magma_setdevice(d);
        for( j=0; j<3; j++ ) {
            magma_queue_sync( stream[d][j] );
        }
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK)
        magma_free( d_dinvA[d] );
        magma_free( d_x[d] );
#endif
        magmablasSetKernelStream(NULL);
    }
    magma_setdevice(0);

    return *info;
} /* magma_zpotrf_mgpu */
/**
    Purpose   
    =======   

    SSYTRF_nopiv_gpu computes the LDLt factorization of a real symmetric   
    matrix A.

    The factorization has the form   
       A = U^H * D * U , if UPLO = 'U', or   
       A = L  * D * L^H, if UPLO = 'L',   
    where U is an upper triangular matrix, L is lower triangular, and
    D is a diagonal matrix.

    This is the block version of the algorithm, calling Level 3 BLAS.   

    Arguments
    ---------
    @param[in]
    UPLO    CHARACTER*1   
      -     = 'U':  Upper triangle of A is stored;   
      -     = 'L':  Lower triangle of A is stored.   

    @param[in]
    N       INTEGER   
            The order of the matrix A.  N >= 0.   

    @param[in,out]
    dA      REAL array on the GPU, dimension (LDA,N)   
            On entry, the symmetric matrix A.  If UPLO = 'U', the leading   
            N-by-N upper triangular part of A contains the upper   
            triangular part of the matrix A, and the strictly lower   
            triangular part of A is not referenced.  If UPLO = 'L', the   
            leading N-by-N lower triangular part of A contains the lower   
            triangular part of the matrix A, and the strictly upper   
            triangular part of A is not referenced.   
    \n
            On exit, if INFO = 0, the factor U or L from the Cholesky   
            factorization A = U^H D U or A = L D L^H.   
    \n 
            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using cudaMallocHost.

    @param[in]
    LDA     INTEGER   
            The leading dimension of the array A.  LDA >= max(1,N).   

    @param[out]
    INFO    INTEGER   
      -     = 0:  successful exit   
      -     < 0:  if INFO = -i, the i-th argument had an illegal value 
                  if INFO = -6, the GPU memory allocation failed 
      -     > 0:  if INFO = i, the leading minor of order i is not   
                  positive definite, and the factorization could not be   
                  completed.   
    
    @ingroup magma_ssytrf_comp
    ******************************************************************* */
extern "C" magma_int_t
magma_ssytrf_nopiv_gpu(
    magma_uplo_t uplo, magma_int_t n,
    magmaFloat_ptr dA, magma_int_t ldda,
    magma_int_t *info)
{
    #define  A(i, j)  (A)
    #define dA(i, j)  (dA +(j)*ldda + (i))
    #define dW(i, j)  (dW +(j)*ldda + (i))
    #define dWt(i, j) (dW +(j)*nb   + (i))

    /* Local variables */
    float zone  = MAGMA_S_ONE;
    float mzone = MAGMA_S_NEG_ONE;
    int                upper = (uplo == MagmaUpper);
    magma_int_t j, k, jb, nb, ib, iinfo;

    *info = 0;
    if (! upper && uplo != MagmaLower) {
      *info = -1;
    } else if (n < 0) {
      *info = -2;
    } else if (ldda < max(1,n)) {
      *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return MAGMA_ERR_ILLEGAL_VALUE;
    }

    /* Quick return */
    if ( n == 0 )
      return MAGMA_SUCCESS;

    nb = magma_get_ssytrf_nopiv_nb(n);
    ib = min(32, nb); // inner-block for diagonal factorization

    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );


    magma_queue_t stream[2];
    magma_event_t event;
    magma_queue_create(&stream[0]);
    magma_queue_create(&stream[1]);
    magma_event_create( &event );
    trace_init( 1, 1, 2, stream );

    // CPU workspace
    float *A;
    if (MAGMA_SUCCESS != magma_smalloc_pinned( &A, nb*nb )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }

    // GPU workspace
    magmaFloat_ptr dW;
    if (MAGMA_SUCCESS != magma_smalloc( &dW, (1+nb)*ldda )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    /* Use hybrid blocked code. */
    if (upper) {
        //=========================================================
        // Compute the LDLt factorization A = U'*D*U without pivoting.
        // main loop
        for (j=0; j<n; j += nb) {
            jb = min(nb, (n-j));
            
            // copy A(j,j) back to CPU
            trace_gpu_start( 0, 0, "get", "get" );
            //magma_queue_wait_event( stream[1], event );                                                                
            magma_event_sync(event);
            magma_sgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), nb, stream[1]);
            trace_gpu_end( 0, 0 );

            // factorize the diagonal block
            magma_queue_sync(stream[1]);
            trace_cpu_start( 0, "potrf", "potrf" );
            ssytrf_nopiv_cpu(MagmaUpper, jb, ib, A(j, j), nb, info);
            trace_cpu_end( 0 );
            if (*info != 0){
                *info = *info + j;
                break;
            }
            
            // copy A(j,j) back to GPU
            trace_gpu_start( 0, 0, "set", "set" );
            magma_ssetmatrix_async(jb, jb, A(j, j), nb, dA(j, j), ldda, stream[0]);
            trace_gpu_end( 0, 0 );
                
            if ( (j+jb) < n) {
                // compute the off-diagonal blocks of current block column
                magmablasSetKernelStream( stream[0] );
                trace_gpu_start( 0, 0, "trsm", "trsm" );
                magma_strsm(MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaUnit, 
                            jb, (n-j-jb), 
                            zone, dA(j, j),    ldda, 
                            dA(j, j+jb), ldda);
                magma_scopymatrix( jb, n-j-jb, dA( j, j+jb ), ldda, dWt( 0, j+jb ), nb );
                
                // update the trailing submatrix with D
                magmablas_slascl_diag(MagmaUpper, jb, n-j-jb,
                                      dA(j,    j), ldda,
                                      dA(j, j+jb), ldda,
                                      &iinfo);
                trace_gpu_end( 0, 0 );
                
                // update the trailing submatrix with U and W
                trace_gpu_start( 0, 0, "gemm", "gemm" );
                for (k=j+jb; k<n; k+=nb) {
                    magma_int_t kb = min(nb,n-k);
                    magma_sgemm(MagmaConjTrans, MagmaNoTrans, kb, n-k, jb,
                                mzone, dWt(0, k), nb, 
                                       dA(j, k), ldda,
                                zone,  dA(k, k), ldda);
                    if (k==j+jb)
                        magma_event_record( event, stream[0] );
                }
                trace_gpu_end( 0, 0 );
            }
        }
    } else {
        //=========================================================
        // Compute the LDLt factorization A = L*D*L' without pivoting.
        // main loop
        for (j=0; j<n; j+=nb) {
            jb = min(nb, (n-j));
            
            // copy A(j,j) back to CPU
            trace_gpu_start( 0, 0, "get", "get" );
            //magma_queue_wait_event( stream[0], event );                                                                
            magma_event_sync(event);
            magma_sgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), nb, stream[1]);
            trace_gpu_end( 0, 0 );
            
            // factorize the diagonal block
            magma_queue_sync(stream[1]);
            trace_cpu_start( 0, "potrf", "potrf" );
            ssytrf_nopiv_cpu(MagmaLower, jb, ib, A(j, j), nb, info);
            trace_cpu_end( 0 );
            if (*info != 0){
                *info = *info + j;
                break;
            }

            // copy A(j,j) back to GPU
            trace_gpu_start( 0, 0, "set", "set" );
            magma_ssetmatrix_async(jb, jb, A(j, j), nb, dA(j, j), ldda, stream[0]);
            trace_gpu_end( 0, 0 );
            
            if ( (j+jb) < n) {
                // compute the off-diagonal blocks of current block column
                magmablasSetKernelStream( stream[0] );
                trace_gpu_start( 0, 0, "trsm", "trsm" );
                magma_strsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaUnit, 
                            (n-j-jb), jb, 
                            zone, dA(j,    j), ldda, 
                            dA(j+jb, j), ldda);
                magma_scopymatrix( n-j-jb,jb, dA( j+jb, j ), ldda, dW( j+jb, 0 ), ldda );
                
                // update the trailing submatrix with D
                magmablas_slascl_diag(MagmaLower, n-j-jb, jb,
                                      dA(j,    j), ldda,
                                      dA(j+jb, j), ldda,
                                      &iinfo);
                trace_gpu_end( 0, 0 );
                
                // update the trailing submatrix with L and W
                trace_gpu_start( 0, 0, "gemm", "gemm" );
                for (k=j+jb; k<n; k+=nb) {
                    magma_int_t kb = min(nb,n-k);
                    magma_sgemm(MagmaNoTrans, MagmaConjTrans, n-k, kb, jb,
                                mzone, dA(k, j), ldda, 
                                       dW(k, 0), ldda,
                                zone,  dA(k, k), ldda);
                    if (k==j+jb)
                        magma_event_record( event, stream[0] );
                }
                trace_gpu_end( 0, 0 );
            }
        }
    }
    
    trace_finalize( "ssytrf.svg","trace.css" );
    magma_queue_destroy(stream[0]);
    magma_queue_destroy(stream[1]);
    magma_event_destroy( event );
    magma_free( dW );
    magma_free_pinned( A );
    
    magmablasSetKernelStream( orig_stream );
    return MAGMA_SUCCESS;
} /* magma_ssytrf_nopiv */
Example #5
0
/**
    Purpose
    -------
    SPOTRF computes the Cholesky factorization of a real symmetric
    positive definite matrix dA.
    Auxiliary subroutine for spotrf2_ooc. It is multiple gpu interface to compute
    Cholesky of a "rectangular" matrix.

    The factorization has the form
       dA = U**H * U,   if UPLO = MagmaUpper, or
       dA = L  * L**H,  if UPLO = MagmaLower,
    where U is an upper triangular matrix and L is lower triangular.

    This is the block version of the algorithm, calling Level 3 BLAS.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of dA is stored;
      -     = MagmaLower:  Lower triangle of dA is stored.

    @param[in]
    n       INTEGER
            The order of the matrix dA.  N >= 0.

    @param[in,out]
    dA      REAL array on the GPU, dimension (LDDA,N)
            On entry, the symmetric matrix dA.  If UPLO = MagmaUpper, the leading
            N-by-N upper triangular part of dA contains the upper
            triangular part of the matrix dA, and the strictly lower
            triangular part of dA is not referenced.  If UPLO = MagmaLower, the
            leading N-by-N lower triangular part of dA contains the lower
            triangular part of the matrix dA, and the strictly upper
            triangular part of dA is not referenced.
    \n
            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization dA = U**H * U or dA = L * L**H.

    @param[in]
    ldda     INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalescent memory accesses LDDA must be
            divisible by 16.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.

    @ingroup magma_sposv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_spotrf3_mgpu(
    magma_int_t ngpu,
    magma_uplo_t uplo, magma_int_t m, magma_int_t n,
    magma_int_t off_i, magma_int_t off_j, magma_int_t nb,
    magmaFloat_ptr d_lA[],  magma_int_t ldda,
    magmaFloat_ptr d_lP[],  magma_int_t lddp,
    float *A,          magma_int_t lda, magma_int_t h,
    magma_queue_t queues[][3], magma_event_t events[][5],
    magma_int_t *info )
{
#define Alo(i, j)  (A +             ((j)+off_j)*lda  + (nb*(((i)/nb)%h)+off_i))
#define Aup(i, j)  (A + (nb*(((j)/nb)%h)+off_j)*lda  +               (i+off_i))

#define dlA(id, i, j)     (d_lA[(id)] + (j)*ldda + (i))
#define dlP(id, i, j, k)  (d_lP[(id)] + (k)*nb*lddp + (j)*lddp + (i))
#define dlPT(id, i, j, k) (d_lP[(id)] + (k)*nb*lddp + (j)*nb   + (i))

    magma_int_t     j, jb, nb0, nb2, d, dd, id, j_local, j_local2, buf;
    float c_one     = MAGMA_S_ONE;
    float c_neg_one = MAGMA_S_NEG_ONE;
    float          d_one     =  1.0;
    float          d_neg_one = -1.0;
    int upper = (uplo == MagmaUpper);
    float *dlpanel;
    magma_int_t n_local[MagmaMaxGPUs], ldpanel;
    const magma_int_t stream1 = 0, stream2 = 1, stream3 = 2;
    
    *info = 0;
    if (! upper && uplo != MagmaLower) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (!upper && ngpu*ldda < max(1,n)) {
        *info = -4;
    } else if (upper && ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );
    
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK)
    /* used by strsm_work */
    float c_zero    = MAGMA_S_ZERO;
    int trsm_nb = 128;
    int trsm_n = trsm_nb*((nb+trsm_nb-1)/trsm_nb);
    float *d_dinvA[MagmaMaxGPUs];
    float *d_x[MagmaMaxGPUs];
    #define dinvA(d,j) &(d_dinvA[(d)][(j)*trsm_nb*trsm_n])
    #define dx(d,j) &(d_x[(d)][(j)*nb*m])
    /*
     * Allocate device memory for the inversed diagonal blocks, size=N*BLOCK_SIZE
     */
    // TODO free memory on failure.
    for( d=0; d < ngpu; d++ ) {
        magma_setdevice(d);
        if ( (MAGMA_SUCCESS != magma_smalloc( &d_dinvA[d], 2*trsm_nb*trsm_n )) ||
             (MAGMA_SUCCESS != magma_smalloc( &d_x[d],     2*nb*(upper ? n : m) )) ) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
    }
    magma_setdevice(0);
#endif
    
    /* initialization */
    for( d=0; d < ngpu; d++ ) {
        /* local-n and local-ld */
        if (upper) {
            n_local[d] = (n/(nb*ngpu))*nb;
            if (d < (n/nb)%ngpu)
                n_local[d] += nb;
            else if (d == (n/nb)%ngpu)
                n_local[d] += n%nb;
        } else {
            n_local[d] = (m/(nb*ngpu))*nb;
            if (d < (m/nb)%ngpu)
                n_local[d] += nb;
            else if (d == (m/nb)%ngpu)
                n_local[d] += m%nb;
        }
    }

    /* == initialize the trace */
    trace_init( 1, ngpu, 3, (CUstream_st**)queues );

    if (upper) {
        /* ---------------------------------------------- */
        /* Upper-triangular case                          */
        /* > Compute the Cholesky factorization A = U'*U. */
        /* ---------------------------------------------- */
        for (j=0; j < m; j += nb) {
            /* Set the GPU number that holds the current panel */
            id  = (j/nb)%ngpu;
            buf = (j/nb)%ngpu; // right now, we have ngpu buffers, so id and buf are the same..
            
            /* Set the local index where the current panel is */
            j_local = j/(nb*ngpu);
            jb = min(nb, (m-j));
 
            /* Update the current diagonal block on stream1 */
            magma_setdevice(id);
            if ( j > 0 ) {
                magmablasSetKernelStream( queues[id][stream1] );
                trace_gpu_start( id, stream1, "syrk", "syrk" );
                magma_ssyrk(MagmaUpper, MagmaConjTrans, jb, j,
                            d_neg_one, dlA(id, 0, nb*j_local), ldda,
                            d_one,     dlA(id, j, nb*j_local), ldda);
                trace_gpu_end( id, stream1 );
            }
            
            /* send the diagonal to cpu on stream1 */
            trace_gpu_start( id, stream1, "comm", "D to CPU" );
            magma_sgetmatrix_async( jb, jb,
                                    dlA(id, j, nb*j_local), ldda,
                                    Aup(j,j),               lda,
                                    queues[id][stream1] );
            trace_gpu_end( id, stream1 );

            /* update off-diagonal blocks in the panel */
            if ( j > 0 ) {
                d = (j/nb+1)%ngpu;
                for( dd=0; dd < ngpu; dd++ ) {
                    j_local2 = j_local+1;
                    if ( d > id ) j_local2 --;
                    nb0 = nb*j_local2; // number of local columns in the panel, while jb is panel-size (number of rows)
            
                    if ( n_local[d] > nb0 ) {
                        magma_setdevice(d);
                        magmablasSetKernelStream( queues[d][stream2] );
                        if ( d == id ) {
                            dlpanel = dlA(d,0,nb*j_local);
                            ldpanel = ldda;
                            // the GPU owns the row from start, and no need of synch.
                            //magma_queue_wait_event( queues[d][stream2], events[d][0] ); // rows arrived at gpu
                            magma_queue_wait_event( queues[d][stream2], events[d][4] ); // wait for look-ahead trsm to finish
                        } else {
                            dlpanel = dlP(d,nb,0,buf);
                            ldpanel = lddp;
                            magma_queue_wait_event( queues[d][stream2], events[d][0] ); // rows arrived at gpu
                        }
                        trace_gpu_start( d, stream2, "gemm", "gemm" );
                        magma_sgemm(MagmaConjTrans, MagmaNoTrans,
                                    jb, n_local[d]-nb0, j,
                                    c_neg_one, dlpanel,        ldpanel,
                                               dlA(d, 0, nb0), ldda,
                                    c_one,     dlA(d, j, nb0), ldda);
                        trace_gpu_end( d, stream2 );
                        magma_event_record( events[d][2], queues[d][stream2] );
                    }
                    d = (d+1)%ngpu;
                }
            }

            /* wait for panel and factorize it on cpu */
            magma_setdevice(id);
            magma_queue_sync( queues[id][stream1] );
            trace_cpu_start( 0, "getrf", "getrf" );
            lapackf77_spotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info);
            trace_cpu_end( 0 );
            if (*info != 0) {
                *info = *info + j;
                break;
            }
            
            /* send the diagonal to gpus on stream1 */
            if ( (j+jb) < n) {
                d = (j/nb+1)%ngpu;
                for( dd=0; dd < ngpu; dd++ ) {
                    if ( d == id ) {
                        dlpanel = dlA(d, j, nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlP(d,0,0,buf);
                        ldpanel = lddp;
                    }
                    magma_setdevice(d);
                    trace_gpu_start( d, stream1, "comm", "comm" );
                    magma_ssetmatrix_async( jb, jb,
                                            Aup(j,j), lda,
                                            dlpanel,  ldpanel,
                                            queues[d][stream1] );
                    trace_gpu_end( d, stream1 );
                    magma_event_record( events[d][1], queues[d][stream1] );
                    d = (d+1)%ngpu;
                }
            } else {
                magma_setdevice(id);
                trace_gpu_start( id, stream1, "comm", "comm" );
                magma_ssetmatrix_async( jb, jb,
                                        Aup(j,j),               lda,
                                        dlA(id, j, nb*j_local), ldda,
                                        queues[id][stream1] );
                trace_gpu_end( id, stream1 );
            }
            
            /* panel-factorize the off-diagonal */
            if ( (j+jb) < n) {
                d = (j/nb+1)%ngpu;
                for( dd=0; dd < ngpu; dd++ ) {
                    /* next column */
                    j_local2 = j_local+1;
                    if ( d > id ) j_local2--;
                    if ( d == id ) {
                        dlpanel = dlA(d,j,nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlP(d,0,0,buf);
                        ldpanel = lddp;
                    }
                    nb2 = n_local[d] - j_local2*nb;
                    
                    magma_setdevice(d);
                    if ( j+jb < m && d == (j/nb+1)%ngpu ) {
                        /* owns the next column, look-ahead next block on stream1 */
                        nb0 = min(nb, nb2);
                        magmablasSetKernelStream( queues[d][stream1] );
                        magma_queue_wait_event( queues[d][stream1], events[d][2] ); // wait for gemm update
                        trace_gpu_start( d, stream1, "trsm", "trsm" );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK)
                        magmablas_slaset( MagmaFull, trsm_nb, trsm_n, c_zero, c_zero, dinvA(d,0), trsm_nb );
                        magmablas_slaset( MagmaFull, nb0,     jb,     c_zero, c_zero, dx(d,0), nb0 );
                        magmablas_strsm_work( MagmaLeft, MagmaUpper,
                                              MagmaConjTrans, MagmaNonUnit,
                                              jb, nb0, c_one,
                                              dlpanel, ldpanel,
                                              dlA(d, j, nb*j_local2), ldda,
                                              1, dinvA(d,0), dx(d,0) );
#else
                        magma_strsm( MagmaLeft, MagmaUpper,
                                     MagmaConjTrans, MagmaNonUnit,
                                     jb, nb0, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, j, nb*j_local2), ldda);
#endif
                        magma_event_record( events[d][4], queues[d][stream1] );
                        trace_gpu_end( d, stream1 );
                    } else if ( nb2 > 0 ) {
                        /* update all the blocks on stream2 */
                        magma_queue_wait_event( queues[d][stream2], events[d][1] ); // wait for cholesky factor
                        trace_gpu_start( d, stream2, "trsm", "trsm" );
                        magmablasSetKernelStream( queues[d][stream2] );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK)
                        magmablas_slaset( MagmaFull, trsm_nb, trsm_n, c_zero, c_zero, dinvA(d,0), trsm_nb );
                        magmablas_slaset( MagmaFull, nb2,     jb,     c_zero, c_zero, dx(d,0), nb2 );
                        magmablas_strsm_work( MagmaLeft, MagmaUpper,
                                              MagmaConjTrans, MagmaNonUnit,
                                              jb, nb2, c_one,
                                              dlpanel, ldpanel,
                                              dlA(d, j, nb*j_local2), ldda,
                                              1, dinvA(d,0), dx(d,0) );
#else
                        magma_strsm( MagmaLeft, MagmaUpper,
                                     MagmaConjTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, j, nb*j_local2), ldda);
#endif
                        trace_gpu_end( d, stream2 );
                    }
                    d = (d+1)%ngpu;
                } /* end of for */

                /* ========================================================== */
                if ( j+jb < m ) {
                    d = (j/nb+1)%ngpu;
                    /* next column */
                    j_local2 = j_local+1;
                    if ( d > id ) j_local2--;
                    nb0 = min(nb, n_local[d]-nb*j_local2 );
                
                    /* even on 1 gpu, off-diagonals are copied to cpu (synchronize at the end).      *
                     * so we have the Cholesky factor, but only diagonal submatrix of the big panel, *
                     * on cpu at the end.                                                            */
                    int d2, buf2;
                    magma_setdevice(d);
                    /* lookahead done */
                    magma_queue_wait_event( queues[d][stream3], events[d][4] );
                
                    trace_gpu_start( d, stream3, "comm", "row to CPU" );
                    magma_sgetmatrix_async( (j+jb), nb0,
                                            dlA(d, 0, nb*j_local2), ldda,
                                            Aup(0,j+jb),            lda,
                                            queues[d][stream3] );
                    trace_gpu_end( d, stream3 );
                    magma_event_record( events[d][3], queues[d][stream3] );
                    /* needed on pluto */
                    //magma_queue_sync( queues[d][stream3] );
                
                    /* broadcast rows to gpus on stream2 */
                    buf2 = ((j+jb)/nb)%ngpu;
                    for( d2=0; d2 < ngpu; d2++ ) {
                        if ( d2 != d ) {
                            magma_setdevice(d2);
                            trace_gpu_start( d2, stream3, "comm", "row to GPUs" );
                            magma_queue_wait_event( queues[d2][stream3], events[d][3] ); // rows arrived at cpu on stream3
                            magma_ssetmatrix_async( j+jb, nb0,
                                                    Aup(0,j+jb),       lda,
                                                    dlP(d2,nb,0,buf2), lddp,
                                                    queues[d2][stream3] );
                            trace_gpu_end( d2, stream3 );
                            magma_event_record( events[d2][0], queues[d2][stream3] );
                        }
                    }

                    /* =========================== */
                    /* update the remaining blocks */
                    nb2 = n_local[d]-(nb*j_local2 + nb0);
                    if ( nb2 > 0 ) {
                        if ( d == id ) {
                            dlpanel = dlA(d, j, nb*j_local);
                            ldpanel = ldda;
                        } else {
                            dlpanel = dlP(d,0,0,buf);
                            ldpanel = lddp;
                        }
                        magma_setdevice(d);
                        magmablasSetKernelStream( queues[d][stream2] );
                        trace_gpu_start( d, stream2, "trsm", "trsm" );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK)
                        int flag = 0;
                        if (flag == 0) {
                            magma_queue_wait_event( queues[d][stream2], events[d][4] ); // lookahead -> diagonal inversion
                        } else {
                            magmablas_slaset( MagmaFull, trsm_nb, trsm_n, c_zero, c_zero, dinvA(d,flag), trsm_nb );
                            magma_queue_wait_event( queues[d][stream2], events[d][1] ); // panel received
                        }
                        magmablas_slaset( MagmaFull, nb2, jb, c_zero, c_zero, dx(d,1), nb2 );
                        magmablas_strsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                              jb, nb2, c_one,
                                              dlpanel, ldpanel,
                                              dlA(d, j, nb*j_local2+nb0), ldda,
                                              flag, dinvA(d,flag), dx(d,1) );
#else
                        magma_queue_wait_event( queues[d][stream2], events[d][1] ); // wait for cholesky factor
                        magma_strsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel, ldpanel,
                                     dlA(d, j, nb*j_local2+nb0), ldda);
#endif
                        trace_gpu_end( d, stream2 );
                    }
                }
            } /* end of strsm */
        } /* end of for j=1, .., n */
    } else {
        /* ---------------------------------------------- */
        /* Lower-triangular case                          */
        /* > Compute the Cholesky factorization A = L*L'. */
        /* ---------------------------------------------- */
        for (j=0; j < n; j += nb) {
        
            /* Set the GPU number that holds the current panel */
            id  = (j/nb)%ngpu;
            buf = (j/nb)%ngpu;
            
            /* Set the local index where the current panel is */
            j_local = j/(nb*ngpu);
            jb = min(nb, (n-j));

            /* Update the current diagonal block on stream1 */
            magma_setdevice(id);
            if ( j > 0 ) {
                magmablasSetKernelStream( queues[id][stream1] );
                magma_ssyrk(MagmaLower, MagmaNoTrans, jb, j,
                            d_neg_one, dlA(id, nb*j_local, 0), ldda,
                            d_one,     dlA(id, nb*j_local, j), ldda);
            }

            /* send the diagonal to cpu on stream1 */
            magma_sgetmatrix_async( jb, jb,
                                    dlA(id, nb*j_local, j), ldda,
                                    Alo(j,j),               lda,
                                    queues[id][stream1] );

            /* update off-diagonal blocks of the panel */
            if ( j > 0 ) {
                d = (j/nb+1)%ngpu;
                for( dd=0; dd < ngpu; dd++ ) {
                    j_local2 = j_local+1;
                    if ( d > id ) j_local2 --;
                    nb0 = nb*j_local2;
            
                    if ( nb0 < n_local[d] ) {
                        magma_setdevice(d);
                        magmablasSetKernelStream( queues[d][stream2] );
                        if ( d == id ) {
                            dlpanel = dlA(d, nb*j_local, 0);
                            ldpanel = ldda;
                            magma_queue_wait_event( queues[d][stream2], events[d][4] ); // wait for look-ahead trsm to finish
                        } else {
                            dlpanel = dlPT(d,0,nb,buf);
                            ldpanel = nb;
                            magma_queue_wait_event( queues[d][stream2], events[d][0] ); // rows arrived at gpu
                        }
                        magma_sgemm( MagmaNoTrans, MagmaConjTrans,
                                     n_local[d]-nb0, jb, j,
                                     c_neg_one, dlA(d, nb0, 0), ldda,
                                                dlpanel,        ldpanel,
                                     c_one,     dlA(d, nb0, j), ldda);
                        magma_event_record( events[d][2], queues[d][stream2] );
                    }
                    d = (d+1)%ngpu;
                }
            }

            /* wait for the panel and factorized it on cpu */
            magma_setdevice(id);
            magma_queue_sync( queues[id][stream1] );
            lapackf77_spotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info);
            if (*info != 0) {
                *info = *info + j;
                break;
            }

            /* send the diagonal to gpus on stream1 */
            if ( (j+jb) < m) {
                d = (j/nb+1)%ngpu;
                for( dd=0; dd < ngpu; dd++ ) {
                    if ( d == id ) {
                        dlpanel = dlA(d, nb*j_local, j);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlPT(d, 0, 0, buf);
                        ldpanel = nb;
                    }
                    magma_setdevice(d);
                    magma_ssetmatrix_async( jb, jb,
                                            Alo(j,j), lda,
                                            dlpanel,  ldpanel,
                                            queues[d][stream1] );
                    magma_event_record( events[d][1], queues[d][stream1] );
                    d = (d+1)%ngpu;
                }
            } else {
                magma_setdevice(id);
                magma_ssetmatrix_async( jb, jb,
                                        Alo(j,j),               lda,
                                        dlA(id, nb*j_local, j), ldda,
                                        queues[id][stream1] );
            }

            /* panel factorize the off-diagonal */
            if ( (j+jb) < m) {
                d = (j/nb+1)%ngpu;
                for( dd=0; dd < ngpu; dd++ ) {
                    /* next column */
                    j_local2 = j_local+1;
                    if ( d > id ) j_local2--;
                    if ( d == id ) {
                        dlpanel = dlA(d, nb*j_local, j);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlPT(d, 0, 0, buf);
                        ldpanel = nb;
                    }
                    nb2 = n_local[d] - j_local2*nb;
                    nb0 = min(nb, nb2);
                    
                    magma_setdevice(d);
                    if ( j+nb < n && d == (j/nb+1)%ngpu ) { /* owns next column, look-ahead next block on stream1 */
                        if ( j > 0 ) magma_queue_wait_event( queues[d][stream1], events[d][2] ); // wait for gemm update
                        magmablasSetKernelStream( queues[d][stream1] );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK)
                        magmablas_slaset( MagmaFull, trsm_nb, trsm_n, c_zero, c_zero, dinvA(d,0), trsm_nb );
                        magmablas_slaset( MagmaFull, nb0,     jb,     c_zero, c_zero, dx(d,0), nb0 );
                        magmablas_strsm_work( MagmaRight, MagmaLower,
                                              MagmaConjTrans, MagmaNonUnit,
                                              nb0, jb, c_one,
                                              dlpanel, ldpanel,
                                              dlA(d, nb*j_local2, j), ldda,
                                              1, dinvA(d,0), dx(d,0) );
#else
                        magma_strsm( MagmaRight, MagmaLower,
                                     MagmaConjTrans, MagmaNonUnit,
                                     nb0, jb, c_one,
                                     dlpanel, ldpanel,
                                     dlA(d, nb*j_local2, j), ldda);
#endif
                        magma_event_record( events[d][4], queues[d][stream1] );
                    } else if ( nb2 > 0 ) { /* other gpus updating all the blocks on stream2 */
                        /* update the entire column */
                        magma_queue_wait_event( queues[d][stream2], events[d][1] ); // wait for the cholesky factor
                        magmablasSetKernelStream( queues[d][stream2] );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK)
                        magmablas_slaset( MagmaFull, trsm_nb, trsm_n, c_zero, c_zero, dinvA(d,0), trsm_nb );
                        magmablas_slaset( MagmaFull, nb2,     jb,     c_zero, c_zero, dx(d,0), nb2 );
                        magmablas_strsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                              nb2, jb, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, nb*j_local2, j), ldda,
                                              1, dinvA(d,0), dx(d,0) );
#else
                        magma_strsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                     nb2, jb, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, nb*j_local2, j), ldda);
#endif
                    }
                    d = (d+1)%ngpu;
                } /* end for d */

                /* ========================================================== */
                if ( j+jb < n ) {
                    d = (j/nb+1)%ngpu;
                    /* next column */
                    j_local2 = j_local+1;
                    if ( d > id ) j_local2--;
                    nb0 = min(nb, n_local[d]-nb*j_local2 );
                
                    /* even on 1 gpu, we copy off-diagonal to cpu (but don't synchronize).  */
                    /* so we have the Cholesky factor on cpu at the end.                    */
                    int d2, buf2;
//#define SPOTRF_DEVICE_TO_DEVICE
#ifdef SPOTRF_DEVICE_TO_DEVICE
                    // lookahead done
                
                    /* broadcast the rows to gpus */
                    buf2 = ((j+jb)/nb)%ngpu;
                    for( d2=0; d2 < ngpu; d2++ ) {
                        magma_setdevice(d2);
                        magma_queue_wait_event( queues[d2][stream3], events[d][4] );
                        if ( d2 != d ) {
                            magma_scopymatrix_async( nb0, j+jb,
                                                     dlPT(d2,0,nb,buf2), nb, // first nbxnb reserved for diagonal block
                                                     dlA(d, nb*j_local2, 0), ldda,
                                                     queues[d2][stream3] );
                            magma_event_record( events[d2][0], queues[d2][stream3] );
                        } else {
                            magma_sgetmatrix_async( nb0, j+jb,
                                                    dlA(d, nb*j_local2, 0), ldda,
                                                    Alo(j+jb,0),            lda,
                                                    queues[d][stream3] );
                        }
                    }
#else
                    // lookahead done
                    magma_setdevice(d);
                    magma_queue_wait_event( queues[d][stream3], events[d][4] );
                    magma_sgetmatrix_async( nb0, j+jb,
                                            dlA(d, nb*j_local2, 0), ldda,
                                            Alo(j+jb,0),            lda,
                                            queues[d][stream3] );
                    magma_event_record( events[d][3], queues[d][stream3] );
                    /* syn on rows on CPU, seem to be needed on Pluto */
                    //magma_queue_sync( queues[d][stream3] );
                
                    /* broadcast the rows to gpus */
                    buf2 = ((j+jb)/nb)%ngpu;
                    for( d2=0; d2 < ngpu; d2++ ) {
                        if ( d2 != d ) {
                            magma_setdevice(d2);
                            magma_queue_wait_event( queues[d2][stream3], events[d][3] ); // getmatrix done
                            magma_ssetmatrix_async( nb0, j+jb,
                                                    Alo(j+jb,0),        lda,
                                                    dlPT(d2,0,nb,buf2), nb, // first nbxnb reserved for diagonal block
                                                    queues[d2][stream3] );
                            magma_event_record( events[d2][0], queues[d2][stream3] );
                        }
                    }
#endif
                    /* =================================== */
                    /* updates remaining blocks on stream2 */
                    nb2 = n_local[d] - (j_local2*nb + nb0);
                    if ( nb2 > 0 ) {
                        if ( d == id ) {
                            dlpanel = dlA(d, nb*j_local, j);
                            ldpanel = ldda;
                        } else {
                            dlpanel = dlPT(d,0,0,buf);
                            ldpanel = nb;
                        }
                        magma_setdevice(d);
                        magmablasSetKernelStream( queues[d][stream2] );
                        /* update the remaining blocks in the column */
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK)
                        int flag = 0;
                        if (flag == 0) {
                            magma_queue_wait_event( queues[d][stream2], events[d][4] ); // lookahead -> diagonal inversion
                        } else {
                            magmablas_slaset( MagmaFull, trsm_nb, trsm_n, c_zero, c_zero, dinvA(d,flag), trsm_nb );
                            magma_queue_wait_event( queues[d][stream2], events[d][1] ); // panel received
                        }
                        magmablas_slaset( MagmaFull, nb2, jb, c_zero, c_zero, dx(d,1), nb2 );
                        magmablas_strsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                              nb2, jb, c_one,
                                              dlpanel,                    ldpanel,
                                              dlA(d, nb*j_local2+nb0, j), ldda,
                                              flag, dinvA(d,flag), dx(d,1) );
#else
                        magma_queue_wait_event( queues[d][stream2], events[d][1] ); // panel received
                        magma_strsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                     nb2, jb, c_one,
                                     dlpanel,                    ldpanel,
                                     dlA(d, nb*j_local2+nb0, j), ldda);
#endif
                    }
                }
            }
        }
    } /* end of else not upper */

    /* == finalize the trace == */
    trace_finalize( "spotrf.svg", "trace.css" );
    for( d=0; d < ngpu; d++ ) {
        magma_setdevice(d);
        for( j=0; j < 3; j++ ) {
            magma_queue_sync( queues[d][j] );
        }
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK)
        magma_free( d_dinvA[d] );
        magma_free( d_x[d] );
#endif
    }
    magma_setdevice( orig_dev );
    magmablasSetKernelStream( orig_stream );

    return *info;
} /* magma_spotrf_mgpu */
Example #6
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing magma_zhemm_mgpu
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex calpha    = MAGMA_Z_MAKE( 3.456, 5.678 );
    magmaDoubleComplex cbeta     = MAGMA_Z_MAKE( 1.234, 2.456 );
    
    real_Double_t    gflops, gpu_perf=0., cpu_perf=0., gpu_time=0., cpu_time=0.;
    real_Double_t    gpu_perf2=0., gpu_time2=0.;
    double           error=0., errorbis=0., work[1];
    magmaDoubleComplex *hA, *hX, *hB, *hR;
    magmaDoubleComplex_ptr dA[MagmaMaxGPUs], dX[MagmaMaxGPUs], dB[MagmaMaxGPUs], dwork[MagmaMaxGPUs], hwork[MagmaMaxGPUs+1];
    magmaDoubleComplex_ptr dA2;
    magma_int_t M, N, size, lda, ldda, msize, nb, nstream;
    magma_int_t ione     = 1;
    magma_int_t iseed[4] = {0,0,0,1};
    magma_int_t status = 0;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    double tol = opts.tolerance * lapackf77_dlamch("E");
    
    // default values
    nb      = (opts.nb      > 0 ? opts.nb      : 64);
    nstream = (opts.nstream > 0 ? opts.nstream :  2);
    
    magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2];
    magma_int_t nbcmplx = 0;
    magma_buildconnection_mgpu(gnode, &nbcmplx, opts.ngpu);
    printf("Initializing communication pattern... GPU-ncmplx %d\n\n", (int) nbcmplx);

    for (int i=0; i < nbcmplx; ++i) {
        int myngpu = gnode[i][MagmaMaxGPUs];
        printf("cmplx %d has %d gpu ", i, myngpu);
        for(int j=0; j < myngpu; ++j)
            printf("  %d", (int) gnode[i][j]);
        printf("\n");
    }

    magma_int_t nbevents = 2;
    magma_queue_t streams[MagmaMaxGPUs][20];
    magma_event_t redevents[MagmaMaxGPUs][20];
    magma_event_t redevents2[MagmaMaxGPUs][MagmaMaxGPUs*MagmaMaxGPUs+10];
    for( int d = 0; d < opts.ngpu; ++d ) {
        for( magma_int_t i = 0; i < nstream; ++i ) {
            magma_queue_create( &streams[d][i] );
        }
        for( magma_int_t i = 0; i < nbevents; ++i ) {
            cudaEventCreateWithFlags(&redevents[d][i],  cudaEventDisableTiming);
            cudaEventCreateWithFlags(&redevents2[d][i], cudaEventDisableTiming);
        }
    }

    printf( "nb %d, ngpu %d, nstream %d version %d\n", (int) nb, (int) opts.ngpu, (int) nstream, (int) opts.version );
    printf("    M     N    nb offset  CPU GFlop/s (sec)   GPU GFlop/s (sec)   CUBLAS hemm (sec)   ||R|| / ||A||*||X||\n");
    printf("=========================================================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
      M = opts.msize[itest];
      N = opts.nsize[itest];
      for( int offset = 0; offset < N; offset += min(N,nb) ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            msize = M - offset;
            lda   = M;
            ldda  = ((M + 31)/32)*32;
            size  = lda*M;
            gflops = FLOPS_ZHEMM( MagmaLeft, (double)msize, (double)N ) / 1e9;
            
            magma_int_t dworksiz = ldda*N*3;
            magma_int_t hworksiz = lda*N;
            
            TESTING_MALLOC_CPU( hA, magmaDoubleComplex, lda*M );
            TESTING_MALLOC_CPU( hX, magmaDoubleComplex, lda*N );
            TESTING_MALLOC_CPU( hB, magmaDoubleComplex, lda*N );
            
            TESTING_MALLOC_PIN( hR, magmaDoubleComplex, lda*N );

            for( int d = 0; d < opts.ngpu; ++d ) {
                magma_int_t mlocal = ((M / nb) / opts.ngpu + 1) * nb;
                magma_setdevice( d );
                TESTING_MALLOC_DEV( dA[d],    magmaDoubleComplex, ldda*mlocal );
                TESTING_MALLOC_DEV( dX[d],    magmaDoubleComplex, ldda*N      );
                TESTING_MALLOC_DEV( dB[d],    magmaDoubleComplex, ldda*N      );
                TESTING_MALLOC_DEV( dwork[d], magmaDoubleComplex, dworksiz    );
                
                TESTING_MALLOC_PIN( hwork[d], magmaDoubleComplex, hworksiz    );
            }
            TESTING_MALLOC_PIN( hwork[opts.ngpu], magmaDoubleComplex, lda*N );
        
            if ( opts.check ) {
                magma_setdevice( 0 );
                TESTING_MALLOC_DEV( dA2, magmaDoubleComplex, ldda*M );
            }

            lapackf77_zlarnv( &ione, iseed, &size, hA );
            magma_zmake_hermitian( M, hA, lda );
            
            size = lda*N;
            lapackf77_zlarnv( &ione, iseed, &size, hX );
            lapackf77_zlarnv( &ione, iseed, &size, hB );
            lapackf77_zlacpy( "Full", &M, &N, hB, &lda, hR, &lda );
            
            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            magma_zsetmatrix_1D_col_bcyclic( M, M, hA, lda, dA, ldda, opts.ngpu, nb );
            for( int d = 0; d < opts.ngpu; ++d ) {
                magma_setdevice( d );
                //magmablasSetKernelStream( streams[ d ][  0 ] );
                magma_zsetmatrix( M, N, hX, lda, dX[d], ldda );
                //if (d == 0) magma_zsetmatrix( M, N, hB, lda, dB[d], ldda ); // this is wrong coz when offset != 0 the gpu who do the beta*C may be not 0 so this should be related to stdev(starting device who own i=0 first col)
                magma_zsetmatrix( M, N, hB, lda, dB[d], ldda );
            }
        
            //memset(hR, 0, lda*N*sizeof(magmaDoubleComplex));
    
            trace_init( 1, opts.ngpu, nstream, (magma_queue_t*) streams );
    
            //magma_int_t offset = 0; //nb;
    
            gpu_time = magma_sync_wtime(0);
        
            magmablas_zhemm_mgpu_com(
                MagmaLeft, MagmaLower, msize, N,
                calpha,    dA, ldda, offset,
                           dX, ldda,
                cbeta,     dB, ldda, dwork, dworksiz, hR, lda, hwork, hworksiz,
                opts.ngpu, nb, streams, nstream, redevents2, nbevents, gnode, nbcmplx);
           
            gpu_time = magma_sync_wtime(0) - gpu_time;
            gpu_perf = gflops / gpu_time;
                
            #ifdef TRACING
            char buf[80];
            snprintf( buf, sizeof(buf), "zhemm-m%d-n%d-nb%d-stream%d-ngpu%d-run%d.svg",
                      (int) M, (int) N, (int) nb, (int) nstream, (int) opts.ngpu, (int) iter );
            trace_finalize( buf, "trace.css" );
            #endif
            
            /* ====================================================================
               Performs operation using CUBLAS
               =================================================================== */
            if ( opts.check && iter == 0 ) {
                magma_setdevice( 0 );
                magmablasSetKernelStream(  0  );
                magma_zsetmatrix( M, M, hA, lda, dA2, ldda );
                magma_zsetmatrix( M, N, hX, lda, dX[0], ldda );
                magma_zsetmatrix( M, N, hB, lda, dwork[0], ldda );
                
                gpu_time2 = magma_sync_wtime(0);
                magma_zhemm(
                    MagmaLeft, MagmaLower, msize, N,
                    calpha,    dA2+offset*ldda+offset, ldda,
                               dX[0],    ldda,
                    cbeta,     dwork[0], ldda );
                gpu_time2 = magma_sync_wtime(0) - gpu_time2;
                gpu_perf2 = gflops / gpu_time2;
            }
            
            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            if ( opts.check ) {
                // store ||A||*||X||
                errorbis  = lapackf77_zlange("fro", &msize, &msize, hA+offset*lda+offset, &lda, work );
                errorbis *= lapackf77_zlange("fro", &msize, &N, hX, &lda, work );
                
                //printf( "A =" ); magma_zprint( M, M, hA, lda );
                //printf( "X =" ); magma_zprint( M, N, hX, lda );
                //printf( "B =" ); magma_zprint( M, N, hB, lda );
                
                cpu_time = magma_wtime();
                blasf77_zhemm( "Left", "Lower", &msize, &N,
                                &calpha, hA+offset*lda+offset, &lda,
                                         hX, &lda,
                                &cbeta,  hB, &lda );
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
                /*
                trace_file = fopen("AJETE/C", "w");
                for (int j = 0; j < N; j++)
                    for (int i = 0; i < siz; i++)
                        fprintf(trace_file, "%10d%10d%40.30e\n", i+1, j+1, hB[j*lda+i]);
                fclose(trace_file);
                */
                magma_int_t firstprint=0;
                for(magma_int_t dev=0; dev < opts.ngpu; ++dev) {
                    magma_setdevice( dev );
                    magma_zgetmatrix( M, N, dB[dev], ldda, hR, lda );
    
                    // compute relative error ||R||/||A||*||X||, where R := B_magma - B_lapack = R - B
                    size = lda*N;
                    blasf77_zaxpy( &size, &c_neg_one, hB, &ione, hR, &ione );
                    error = lapackf77_zlange("fro", &msize, &N, hR, &lda, work) / errorbis;
                    
                    //printf( "R ="  ); magma_zprint( M, N, hR, lda );
                    if (firstprint == 0) {
                        printf( "%5d %5d %5d %5d   %7.1f (%7.4f)   %7.1f (%7.4f)   %7.1f (%7.4f)   %8.2e   %s\n",
                                (int) M, (int) N, (int) nb, (int) offset,
                                cpu_perf, cpu_time,
                                gpu_perf, gpu_time,
                                gpu_perf2, gpu_time2,
                                error, (error < tol ? "ok" : "failed") );
                    }
                    else {
                        printf( "%89s  %8.2e   %s\n", " ",
                                error, (error < tol ? "ok" : "failed") );
                    }
                    status += ! (error < tol);
                    firstprint =1;
                }
            } else {
                printf( "%5d %5d %5d %5d     ---   (  ---  )   %7.1f (%7.4f)     ---   (  ---  )   ---\n",
                        (int) M, (int) N, (int) nb, (int) offset,
                        gpu_perf, gpu_time );
            }
    
            TESTING_FREE_CPU( hA );
            TESTING_FREE_CPU( hX );
            TESTING_FREE_CPU( hB );
            
            TESTING_FREE_PIN( hR );
        
            for( int d = 0; d < opts.ngpu; ++d ) {
                magma_setdevice( d );
                TESTING_FREE_DEV( dA[d]    );
                TESTING_FREE_DEV( dX[d]    );
                TESTING_FREE_DEV( dB[d]    );
                TESTING_FREE_DEV( dwork[d] );
                
                TESTING_FREE_PIN( hwork[d] );
            }
            TESTING_FREE_PIN( hwork[opts.ngpu] );
        
            if ( opts.check ) {
                magma_setdevice( 0 );
                TESTING_FREE_DEV( dA2 );
            }
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
      }  // offset
      printf( "\n" );
    }

    for( int d = 0; d < opts.ngpu; ++d ) {
        magma_setdevice( d );
        for( magma_int_t i = 0; i < nstream; ++i ) {
            magma_queue_destroy( streams[d][i] );
        }
        for( magma_int_t i = 0; i < nbevents; ++i ) {
            magma_event_destroy( redevents[d][i]  );
            magma_event_destroy( redevents2[d][i] );
        }
    }
    
    TESTING_FINALIZE();
    return status;
}
Example #7
0
/**
    Purpose
    -------
    SSYTRD reduces a real symmetric matrix A to real symmetric
    tridiagonal form T by an orthogonal similarity transformation:
    Q**H * A * Q = T.

    Arguments
    ---------
    @param[in]
    num_gpus INTEGER
             The number of GPUs.  num_gpus > 0.

    @param[in]
    num_streams INTEGER
             The number of GPU streams used for update.  10 >= num_streams > 0.

    @param[in]
    uplo     magma_uplo_t
      -      = MagmaUpper:  Upper triangle of A is stored;
      -      = MagmaLower:  Lower triangle of A is stored.

    @param[in]
    n        INTEGER
             The order of the matrix A.  N >= 0.

    @param[in,out]
    A        REAL array, dimension (LDA,N)
             On entry, the symmetric matrix A.  If UPLO = MagmaUpper, the leading
             N-by-N upper triangular part of A contains the upper
             triangular part of the matrix A, and the strictly lower
             triangular part of A is not referenced.  If UPLO = MagmaLower, the
             leading N-by-N lower triangular part of A contains the lower
             triangular part of the matrix A, and the strictly upper
             triangular part of A is not referenced.
             On exit, if UPLO = MagmaUpper, the diagonal and first superdiagonal
             of A are overwritten by the corresponding elements of the
             tridiagonal matrix T, and the elements above the first
             superdiagonal, with the array TAU, represent the orthogonal
             matrix Q as a product of elementary reflectors; if UPLO
             = MagmaLower, the diagonal and first subdiagonal of A are over-
             written by the corresponding elements of the tridiagonal
             matrix T, and the elements below the first subdiagonal, with
             the array TAU, represent the orthogonal matrix Q as a product
             of elementary reflectors. See Further Details.

    @param[in]
    lda      INTEGER
             The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    d        REAL array, dimension (N)
             The diagonal elements of the tridiagonal matrix T:
             D(i) = A(i,i).
 
    @param[out]
    e        REAL array, dimension (N-1)
             The off-diagonal elements of the tridiagonal matrix T:
             E(i) = A(i,i+1) if UPLO = MagmaUpper, E(i) = A(i+1,i) if UPLO = MagmaLower.

    @param[out]
    tau      REAL array, dimension (N-1)
             The scalar factors of the elementary reflectors (see Further
             Details).

    @param[out]
    work     (workspace) REAL array, dimension (MAX(1,LWORK))
             On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork    INTEGER
             The dimension of the array WORK.  LWORK >= 1.
             For optimum performance LWORK >= N*NB, where NB is the
             optimal blocksize.
    \n
             If LWORK = -1, then a workspace query is assumed; the routine
             only calculates the optimal size of the WORK array, returns
             this value as the first entry of the WORK array, and no error
             message related to LWORK is issued by XERBLA.

    @param[out]
    info     INTEGER
      -      = 0:  successful exit
      -      < 0:  if INFO = -i, the i-th argument had an illegal value

    Further Details
    ---------------
    If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary
    reflectors

       Q = H(n-1) . . . H(2) H(1).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in
    A(1:i-1,i+1), and tau in TAU(i).

    If UPLO = MagmaLower, the matrix Q is represented as a product of elementary
    reflectors

       Q = H(1) H(2) . . . H(n-1).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i),
    and tau in TAU(i).

    The contents of A on exit are illustrated by the following examples
    with n = 5:

    if UPLO = MagmaUpper:                if UPLO = MagmaLower:

      (  d   e   v2  v3  v4 )              (  d                  )
      (      d   e   v3  v4 )              (  e   d              )
      (          d   e   v4 )              (  v1  e   d          )
      (              d   e  )              (  v1  v2  e   d      )
      (                  d  )              (  v1  v2  v3  e   d  )

    where d and e denote diagonal and off-diagonal elements of T, and vi
    denotes an element of the vector defining H(i).

    @ingroup magma_ssyev_comp
    ********************************************************************/
extern "C" magma_int_t
magma_ssytrd_mgpu(
    magma_int_t num_gpus, magma_int_t num_streams, magma_uplo_t uplo, magma_int_t n,
    float *A, magma_int_t lda,
    float *d, float *e, float *tau,
    float *work, magma_int_t lwork,
    magma_int_t *info)
{
#define  A(i, j)     (A           + (j)*lda  + (i))
#define dA(id, i, j) (dA[(id)]    + (j)*ldda + (i))
#define dW(id, i, j) (dwork[(id)] + (j)*ldda + (i))

    const char* uplo_ = lapack_uplo_const( uplo );
    
    magma_int_t ln, ldda;
    magma_int_t nb = magma_get_ssytrd_nb(n), ib;

    float c_neg_one = MAGMA_S_NEG_ONE;
    float c_one = MAGMA_S_ONE;
    float  d_one = MAGMA_D_ONE;
    //float mv_time = 0.0;
#ifdef PROFILE_SY2RK
    float up_time = 0.0;
#endif

    magma_int_t kk, nx;
    magma_int_t i = 0, ii, iii, j, did, i_n;
    magma_int_t iinfo;
    magma_int_t ldwork, lddwork, lwkopt, ldwork2;
    magma_int_t lquery;
    magma_queue_t stream[MagmaMaxGPUs][10];
    float *dx[MagmaMaxGPUs], *dy[MagmaMaxGPUs], *hwork;
    float *dwork2[MagmaMaxGPUs];

    *info = 0;
    int upper = (uplo == MagmaUpper);
    lquery = (lwork == -1);
    if (! upper && uplo != MagmaLower) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,n)) {
        *info = -4;
    } else if (lwork < nb*n && ! lquery) {
        *info = -9;
    } else if ( num_streams > 2 ) {
        *info = 2;  // TODO fix
    }

    /* Determine the block size. */
    ldwork = lddwork = n;
    lwkopt = n * nb;
    if (*info == 0) {
        work[0] = MAGMA_S_MAKE( lwkopt, 0 );
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;

    /* Quick return if possible */
    if (n == 0) {
        work[0] = c_one;
        return *info;
    }

    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );
    
    float *dA[MagmaMaxGPUs];
    float *dwork[MagmaMaxGPUs];

    float times[11];
    for( did=0; did < 11; did++ )
        times[did] = 0;
//#define PROFILE_SY2RK
#ifdef PROFILE_SY2RK
    magma_event_t start, stop;
    float etime;
    magma_setdevice(0);
    magma_event_create( &start );
    magma_event_create( &stop  );
#endif
    ldda = lda;
    ln = ((nb*(1+n/(nb*num_gpus))+31)/32)*32;
    ldwork2 = (1+ n / nb + (n % nb != 0)) * ldda;
    for( did=0; did < num_gpus; did++ ) {
        magma_setdevice(did);
        // TODO fix memory leak
        if ( MAGMA_SUCCESS != magma_smalloc(&dA[did],     ln*ldda+3*lddwork*nb) ||
             MAGMA_SUCCESS != magma_smalloc(&dx[did],     num_streams*n) ||
             MAGMA_SUCCESS != magma_smalloc(&dy[did],     num_streams*n) ||
             MAGMA_SUCCESS != magma_smalloc(&dwork2[did], ldwork2 ) ) {
            for( i=0; i < did; i++ ) {
                magma_setdevice(i);
                magma_free(dA[i]);
                magma_free(dx[i]);
                magma_free(dy[i]);
            }
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
        dwork[did] = dA[did] + ln*ldda;
        
        for( kk=0; kk < num_streams; kk++ )
            magma_queue_create(&stream[did][kk]);
    }
    magma_setdevice(0);
    // TODO fix memory leak dwork2
    if ( MAGMA_SUCCESS != magma_smalloc_pinned( &hwork, num_streams*num_gpus*n ) ) {
        for( i=0; i < num_gpus; i++ ) {
            magma_setdevice(i);
            magma_free(dA[i]);
            magma_free(dx[i]);
            magma_free(dy[i]);
        }
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }

    if (n < 2048)
        nx = n;
    else
        nx = 512;

    if (upper) {
        /* Copy the matrix to the GPU */
        if (1 <= n-nx) {
            magma_shtodhe(num_gpus, uplo, n, nb, A, lda, dA, ldda, stream, &iinfo );
        }

        /*  Reduce the upper triangle of A.
            Columns 1:kk are handled by the unblocked method. */
        for (i = nb*((n-1)/nb); i >= nx; i -= nb) {
            ib = min(nb, n-i);

            ii  = nb*(i/(nb*num_gpus));
            did = (i/nb)%num_gpus;

            /* wait for the next panel */
            if (i != nb*((n-1)/nb)) {
                magma_setdevice(did);
                magma_queue_sync(stream[did][0]);
            }

            magma_slatrd_mgpu(num_gpus, uplo, n, i+ib, ib, nb,
                              A(0, 0), lda, e, tau,
                              work, ldwork,
                              dA, ldda, 0,
                              dwork, i+ib,
                              dwork2, ldwork2,
                              1, dx, dy, hwork,
                              stream, times);

            magma_ssyr2k_mgpu(num_gpus, MagmaUpper, MagmaNoTrans, nb, i, ib,
                         c_neg_one, dwork, i+ib, 0,
                         d_one,     dA,    ldda, 0,
                         num_streams, stream);

            /* get the next panel */
            if (i-nb >= nx ) {
                ib = min(nb, n-(i-nb));
                
                ii  = nb*((i-nb)/(nb*num_gpus));
                did = ((i-nb)/nb)%num_gpus;
                magma_setdevice(did);
                
                magma_sgetmatrix_async( (i-nb)+ib, ib,
                                        dA(did, 0, ii), ldda,
                                         A(0, i-nb),    lda,
                                        stream[did][0] );
            }

            /* Copy superdiagonal elements back into A, and diagonal
               elements into D */
            for (j = i; j < i+ib; ++j) {
                if ( j > 0 ) {
                    *A(j-1,j) = MAGMA_S_MAKE( e[j - 1], 0 );
                }
                d[j] = MAGMA_S_REAL( *A(j, j) );
            }
        } /* end of for i=... */
      
        if ( nx > 0 ) {
            if (1 <= n-nx) { /* else A is already on CPU */
                for (i=0; i < nx; i += nb) {
                    ib = min(nb, n-i);
                    ii  = nb*(i/(nb*num_gpus));
                    did = (i/nb)%num_gpus;
                
                    magma_setdevice(did);
                    magma_sgetmatrix_async( nx, ib,
                                            dA(did, 0, ii), ldda,
                                            A(0, i),        lda,
                                            stream[did][0] );
                }
            }
            
            for( did=0; did < num_gpus; did++ ) {
                magma_setdevice(did);
                magma_queue_sync(stream[did][0]);
            }
            /*  Use unblocked code to reduce the last or only block */
            lapackf77_ssytd2(uplo_, &nx, A(0, 0), &lda, d, e, tau, &iinfo);
        }
    }
    else {
        trace_init( 1, num_gpus, num_streams, (CUstream_st**)stream );
        /* Copy the matrix to the GPU */
        if (1 <= n-nx) {
            magma_shtodhe(num_gpus, uplo, n, nb, A, lda, dA, ldda, stream, &iinfo );
        }

        /* Reduce the lower triangle of A */
        for (i = 0; i < n-nx; i += nb) {
            ib = min(nb, n-i);

            ii  = nb*(i/(nb*num_gpus));
            did = (i/nb)%num_gpus;
            /* Reduce columns i:i+ib-1 to tridiagonal form and form the
               matrix W which is needed to update the unreduced part of
               the matrix */

            /*   Get the current panel (no need for the 1st iteration) */
            if (i != 0) {
                magma_setdevice(did);
                trace_gpu_start( did, 0, "comm", "get" );
                magma_sgetmatrix_async( n-i, ib,
                                        dA(did, i, ii), ldda,
                                         A(i,i),        lda,
                                        stream[did][0] );
                trace_gpu_end( did, 0 );
                magma_queue_sync(stream[did][0]);
                magma_setdevice(0);
            }
            
            magma_slatrd_mgpu(num_gpus, uplo, n, n-i, ib, nb,
                              A(i, i), lda, &e[i],
                              &tau[i], work, ldwork,
                              dA, ldda, i,
                              dwork,  (n-i),
                              dwork2, ldwork2,
                              1, dx, dy, hwork,
                              stream, times );

#ifdef PROFILE_SY2RK
            magma_setdevice(0);
            if ( i > 0 ) {
                cudaEventElapsedTime(&etime, start, stop);
                up_time += (etime/1000.0);
            }
            magma_event_record(start, 0);
#endif
            magma_ssyr2k_mgpu(num_gpus, MagmaLower, MagmaNoTrans, nb, n-i-ib, ib,
                         c_neg_one, dwork, n-i, ib,
                         d_one, dA, ldda, i+ib, num_streams, stream);
#ifdef PROFILE_SY2RK
            magma_setdevice(0);
            magma_event_record(stop, 0);
#endif

            /* Copy subdiagonal elements back into A, and diagonal
               elements into D */
            for (j = i; j < i+ib; ++j) {
                if ( j+1 < n ) {
                    *A(j+1,j) = MAGMA_S_MAKE( e[j], 0 );
                }
                d[j] = MAGMA_S_REAL( *A(j, j) );
            }
        } /* for i=... */

        /* Use unblocked code to reduce the last or only block */
        if ( i < n ) {
            iii = i;
            i_n = n-i;
            if ( i > 0 ) {
                for (; i < n; i += nb) {
                    ib = min(nb, n-i);
                    ii  = nb*(i/(nb*num_gpus));
                    did = (i/nb)%num_gpus;
                
                    magma_setdevice(did);
                    magma_sgetmatrix_async( i_n, ib,
                                            dA(did, iii, ii), ldda,
                                             A(iii, i),       lda,
                                            stream[did][0] );
                }
                for( did=0; did < num_gpus; did++ ) {
                    magma_setdevice(did);
                    magma_queue_sync(stream[did][0]);
                }
            }
            lapackf77_ssytrd(uplo_, &i_n, A(iii, iii), &lda, &d[iii], &e[iii],
                             &tau[iii], work, &lwork, &iinfo);
        }
    }
#ifdef PROFILE_SY2RK
    magma_setdevice(0);
    if ( n > nx ) {
        cudaEventElapsedTime(&etime, start, stop);
        up_time += (etime/1000.0);
    }
    magma_event_destroy( start );
    magma_event_destroy( stop  );
#endif

    trace_finalize( "ssytrd.svg", "trace.css" );
    for( did=0; did < num_gpus; did++ ) {
        magma_setdevice(did);
        for( kk=0; kk < num_streams; kk++ )
            magma_queue_sync(stream[did][kk]);
        for( kk=0; kk < num_streams; kk++ )
            magma_queue_destroy(stream[did][kk]);
        magma_free(dA[did]);
        magma_free(dx[did]);
        magma_free(dy[did]);
        magma_free(dwork2[did]);
    }
    magma_free_pinned(hwork);
    magma_setdevice( orig_dev );
    magmablasSetKernelStream( orig_stream );
    
    work[0] = MAGMA_S_MAKE( lwkopt, 0 );

#ifdef PROFILE_SY2RK
    printf( " n=%d nb=%d\n", n, nb );
    printf( " Time in SLARFG: %.2e seconds\n", times[0] );
    //printf( " Time in SSYMV : %.2e seconds\n", mv_time );
    printf( " Time in SSYR2K: %.2e seconds\n", up_time );
#endif
    return *info;
} /* magma_ssytrd */
Example #8
0
/**
    Purpose
    =======
 
    DSYTRF computes the factorization of a real symmetric matrix A
    using the Bunch-Kaufman diagonal pivoting method.  The form of the
    factorization is
 
        A = U*D*U^H  or  A = L*D*L^H
 
    where U (or L) is a product of permutation and unit upper (lower)
    triangular matrices, and D is symmetric and block diagonal with
    1-by-1 and 2-by-2 diagonal blocks.

    This is the blocked version of the algorithm, calling Level 3 BLAS.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of A is stored;
      -     = MagmaLower:  Lower triangle of A is stored.
 
    @param[in]
    n       INTEGER
            The order of the matrix A.  N >= 0.
  
    @param[in,out]
    A       DOUBLE PRECISION array, dimension (LDA,N)
            On entry, the symmetric matrix A.  If UPLO = MagmaUpper, the leading
            N-by-N upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = MagmaLower, the
            leading N-by-N lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
    \n
            On exit, the block diagonal matrix D and the multipliers used
            to obtain the factor U or L (see below for further details).
 
    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).
 
    @param[out]
    ipiv    INTEGER array, dimension (N)
            Details of the interchanges and the block structure of D.
            If IPIV(k) > 0, then rows and columns k and IPIV(k) were
            interchanged and D(k,k) is a 1-by-1 diagonal block.
            If UPLO = MagmaUpper and IPIV(k) = IPIV(k-1) < 0, then rows and
            columns k-1 and -IPIV(k) were interchanged and D(k-1:k,k-1:k)
            is a 2-by-2 diagonal block.  If UPLO = MagmaLower and IPIV(k) =
            IPIV(k+1) < 0, then rows and columns k+1 and -IPIV(k) were
            interchanged and D(k:k+1,k:k+1) is a 2-by-2 diagonal block.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  if INFO = i, D(i,i) is exactly zero.  The factorization
                  has been completed, but the block diagonal matrix D is
                  exactly singular, and division by zero will occur if it
                  is used to solve a system of equations.

    Further Details
    ===============
    If UPLO = MagmaUpper, then A = U*D*U', where
    U = P(n)*U(n)* ... *P(k)U(k)* ...,
    i.e., U is a product of terms P(k)*U(k), where k decreases from n to
    1 in steps of 1 or 2, and D is a block diagonal matrix with 1-by-1
    and 2-by-2 diagonal blocks D(k).  P(k) is a permutation matrix as
    defined by IPIV(k), and U(k) is a unit upper triangular matrix, such
    that if the diagonal block D(k) is of order s (s = 1 or 2), then
 
               (   I    v    0   )   k-s
       U(k) =  (   0    I    0   )   s
               (   0    0    I   )   n-k
                  k-s   s   n-k
 
    If s = 1, D(k) overwrites A(k,k), and v overwrites A(1:k-1,k).
    If s = 2, the upper triangle of D(k) overwrites A(k-1,k-1), A(k-1,k),
    and A(k,k), and v overwrites A(1:k-2,k-1:k).
  
    If UPLO = MagmaLower, then A = L*D*L', where
       L = P(1)*L(1)* ... *P(k)*L(k)* ...,
    i.e., L is a product of terms P(k)*L(k), where k increases from 1 to
    n in steps of 1 or 2, and D is a block diagonal matrix with 1-by-1
    and 2-by-2 diagonal blocks D(k).  P(k) is a permutation matrix as
    defined by IPIV(k), and L(k) is a unit lower triangular matrix, such
    that if the diagonal block D(k) is of order s (s = 1 or 2), then
  
               (   I    0     0   )  k-1
       L(k) =  (   0    I     0   )  s
               (   0    v     I   )  n-k-s+1
                  k-1   s  n-k-s+1
  
    If s = 1, D(k) overwrites A(k,k), and v overwrites A(k+1:n,k).
    If s = 2, the lower triangle of D(k) overwrites A(k,k), A(k+1,k),
    and A(k+1,k+1), and v overwrites A(k+2:n,k:k+1).
 
    @ingroup magma_dsysv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_dsytrf(
    magma_uplo_t uplo, magma_int_t n,
    double *A, magma_int_t lda,
    magma_int_t *ipiv, magma_int_t *info)
{
    #define  A(i_, j_) ( A + (i_) + (j_)*lda )
    #define dA(i_, j_) (dA + (i_) + (j_)*ldda)

    /* .. Local Scalars .. */
    magma_int_t nb = magma_get_dsytrf_nb(n);
    magma_int_t iinfo = 0, nk, kb, j, k;

    /* Test the input parameters. */
    *info = 0;
    bool upper = (uplo == MagmaUpper);
    if ( ! upper && uplo != MagmaLower ) {
        *info = -1;
    } else if ( n < 0 ) {
        *info = -2;
    } else if ( lda < max( 1, n ) ) {
        *info = -4;
    }
    if ( *info != 0 ) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    // TODO fix memory leak of dA if dW fails
    magma_int_t ldda = magma_roundup( n, 32 );
    magmaDouble_ptr dA, dW;
    if ((MAGMA_SUCCESS != magma_dmalloc( &dA, n*ldda  )) ||
        (MAGMA_SUCCESS != magma_dmalloc( &dW, (1+nb)*ldda ))) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
    magma_device_t cdev;
    magma_getdevice( &cdev );

    magma_queue_t queues[2];
    magma_event_t event[2];
    magma_queue_create( cdev, &queues[0] );
    magma_queue_create( cdev, &queues[1] );
    magma_event_create( &event[0] );
    magma_event_create( &event[1] );
    trace_init( 1, 1, 2, queues );

    /* copy matrix to GPU */
    trace_gpu_start( 0, 0, "set", "setA" );
    //magma_dsetmatrix_async( n, n, A(0,0), lda, dA(0,0), ldda, queues[0] );
    if ( upper ) {
        for (k = 0; k < n; k += nb ) {
            kb = min(nb, n-k);
            magma_dsetmatrix_async( k+kb, kb, A(0,k), lda, dA(0,k), ldda, queues[0] );
        }
    } else {
        for (k = 0; k < n; k += nb ) {
            kb = min(nb, n-k);
            magma_dsetmatrix_async( n-k, kb, A(k,k), lda, dA(k,k), ldda, queues[0] );
        }
    }
    trace_gpu_end( 0, 0 );

    if ( upper ) {
        /* Factorize A as U*D*U' using the upper triangle of A

           K is the main loop index, decreasing from N to 1 in steps of
           KB, where KB is the number of columns factorized by DLASYF;
           KB is either NB or NB-1, or K for the last block */

        kb = min(n,nb);
        for (k = n-1; k >= 0; k -= kb ) {
            nk = k+1;
            kb = min(nb, nk);

            if ( k+1 > nb ) {
                /* Factorize columns k-kb+1:k of A and use blocked code to
                   update columns 1:k-kb */

                magma_dlasyf_gpu( MagmaUpper, nk, kb, &kb, A( 0, 0 ), lda, dA( 0, 0 ), ldda,
                                  &ipiv[0], dW, ldda, queues, event, &iinfo );
            } else {
                /* Use unblocked code to factorize columns 1:k of A */

                magma_queue_sync( queues[0] );
                magma_dgetmatrix( nk, nk, dA( 0, 0 ), ldda, A( 0, 0 ), lda, queues[0] );
                lapackf77_dsytf2( MagmaUpperStr, &nk, A( 0, 0 ), &lda, &ipiv[0], &iinfo );
                kb = k+1;
            }

            /* Set INFO on the first occurrence of a zero pivot */

            if ( *info == 0 && iinfo > 0 ) *info = iinfo;
        }
    } else {
        /* Factorize A as L*D*L' using the lower triangle of A

           K is the main loop index, increasing from 1 to N in steps of
           KB, where KB is the number of columns factorized by DLASYF;
           KB is either NB or NB-1, or N-K+1 for the last block */

        for (k = 0; k < n; k += kb ) {
            nk = n-k;
            kb = min(nb, n - k);
            if ( k < n-nb ) {
                /* Factorize columns k:k+kb-1 of A and use blocked code to
                   update columns k+kb:n */
                magma_dlasyf_gpu( MagmaLower, nk, nb, &kb, A( k, k ), lda, dA( k, k ), ldda,
                                  &ipiv[k], dW, ldda, queues, event, &iinfo );
            }
            else {
                /* Use unblocked code to factorize columns k:n of A */
                magma_queue_sync( queues[0] );
                magma_dgetmatrix( nk, nk, dA(k,k), ldda, A(k,k), lda, queues[0] );
                lapackf77_dsytf2( MagmaLowerStr, &nk, A( k, k ), &lda, &ipiv[k], &iinfo );
            }
            /* Set INFO on the first occurrence of a zero pivot */
            if ( *info == 0 && iinfo > 0 ) *info = iinfo + k;
            /* Adjust IPIV */
            for (j = k; j < k + kb; j ++) {
                if ( ipiv[j] > 0 ) {
                    ipiv[j] = ipiv[j] + k;
                } else {
                    ipiv[j] = ipiv[j] - k;
                }
            }
        }
    }

    trace_finalize( "dsytrf.svg", "trace.css" );
    magma_queue_sync( queues[0] );
    magma_queue_sync( queues[1] );
    magma_event_destroy( event[0] );
    magma_event_destroy( event[1] );
    magma_queue_destroy( queues[0] );
    magma_queue_destroy( queues[1] );
    magma_free( dA );
    magma_free( dW );
    
    return *info;
}   /* End of DSYTRF */
Example #9
0
/**
    Purpose
    -------
    DSYTRD_HE2HB reduces a real symmetric matrix A to real symmetric
    band-diagonal form T by an orthogonal similarity transformation:
    Q**H * A * Q = T.
    This version stores the triangular matrices T used in the accumulated
    Householder transformations (I - V T V').

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of A is stored;
      -     = MagmaLower:  Lower triangle of A is stored.

    @param[in]
    n       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in,out]
    A       DOUBLE_PRECISION array, dimension (LDA,N)
            On entry, the symmetric matrix A.  If UPLO = MagmaUpper, the leading
            N-by-N upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = MagmaLower, the
            leading N-by-N lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
            On exit, if UPLO = MagmaUpper, the Upper band-diagonal of A is
            overwritten by the corresponding elements of the
            band-diagonal matrix T, and the elements above the band
            diagonal, with the array TAU, represent the orthogonal
            matrix Q as a product of elementary reflectors; if UPLO
            = MagmaLower, the the Lower band-diagonal of A is overwritten by
            the corresponding elements of the band-diagonal
            matrix T, and the elements below the band-diagonal, with
            the array TAU, represent the orthogonal matrix Q as a product
            of elementary reflectors. See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    tau     DOUBLE_PRECISION array, dimension (N-1)
            The scalar factors of the elementary reflectors (see Further
            Details).

    @param[out]
    work    (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.  LWORK >= 1.
            For optimum performance LWORK >= N*NB, where NB is the
            optimal blocksize.
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    @param[out]
    dT      DOUBLE_PRECISION array on the GPU, dimension N*NB,
            where NB is the optimal blocksize.
            On exit dT holds the upper triangular matrices T from the
            accumulated Householder transformations (I - V T V') used
            in the factorization. The nb x nb matrices T are ordered
            consecutively in memory one after another.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    Further Details
    ---------------
    If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary
    reflectors

       Q = H(n-1) . . . H(2) H(1).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in
    A(1:i-1,i+1), and tau in TAU(i).

    If UPLO = MagmaLower, the matrix Q is represented as a product of elementary
    reflectors

       Q = H(1) H(2) . . . H(n-1).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i),
    and tau in TAU(i).

    The contents of A on exit are illustrated by the following examples
    with n = 5:

    if UPLO = MagmaUpper:                if UPLO = MagmaLower:

      (  d   e   v2  v3  v4 )              (  d                  )
      (      d   e   v3  v4 )              (  e   d              )
      (          d   e   v4 )              (  v1  e   d          )
      (              d   e  )              (  v1  v2  e   d      )
      (                  d  )              (  v1  v2  v3  e   d  )

    where d and e denote diagonal and off-diagonal elements of T, and vi
    denotes an element of the vector defining H(i).

    @ingroup magma_dsyev_2stage
    ********************************************************************/
extern "C" magma_int_t
magma_dsytrd_sy2sb( magma_uplo_t uplo, magma_int_t n, magma_int_t nb,
                    double *A, magma_int_t lda,
                    double *tau,
                    double *work, magma_int_t lwork,
                    double *dT,
                    magma_int_t *info)
{
    #define  A(a_1,a_2)  ( A + ((a_2)-1)*( lda) + (a_1)-1)
    #define dA(a_1,a_2)  (dA + ((a_2)-1)*(ldda) + (a_1)-1)
    #define tau_ref(a_1) (tau + (a_1)-1)
    #define dT(a_1)      (dT + ((a_1)-1)*(lddt))

    int ldda = ((n+31)/32)*32;
    int lddt = nb;
   
    double c_neg_one  = MAGMA_D_NEG_ONE;
    double c_neg_half = MAGMA_D_NEG_HALF;
    double c_one  = MAGMA_D_ONE;
    double c_zero = MAGMA_D_ZERO;
    double  d_one = MAGMA_D_ONE;

    magma_int_t pm, pn, indi, indj, pk;
    magma_int_t pm_old=0, pn_old=0, indi_old=0, indj_old=0;

    int i;
    int lwkopt;
    int lquery;

    *info = 0;
    int upper = (uplo == MagmaUpper);
    lquery = (lwork == -1);
    if (! upper && uplo != MagmaLower) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,n)) {
        *info = -4;
    } else if (lwork < 1 && ! lquery) {
        *info = -9;
    }

    /* Determine the block size. */
    lwkopt = n * nb;
    if (*info == 0) {
        work[0] = MAGMA_D_MAKE( lwkopt, 0 );
    }

    if (*info != 0)
        return *info;
    else if (lquery)
        return *info;

    /* Quick return if possible */
    if (n == 0) {
        work[0] = c_one;
        return *info;
    }

    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );
    
    double *dA;
    if (MAGMA_SUCCESS != magma_dmalloc( &dA, (n + 2*nb)*ldda )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    // limit to 16 threads
    magma_int_t orig_threads = magma_get_lapack_numthreads();
    magma_set_lapack_numthreads( min(orig_threads,16) );

    /* Use the first panel of dA as work space */
    double *dwork = dA + n*ldda;
    double *dW    = dwork + nb*ldda;

    #ifdef TRACING
    char buf[80];
    #endif
    magma_queue_t stream[3];
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );
    stream[2] = 0;  // default stream
    
    trace_init( 1, 1, 3, stream );

    double *hT = work + lwork - nb*nb;
    lwork -= nb*nb;
    memset( hT, 0, nb*nb*sizeof(double));

    magmablasSetKernelStream( stream[0] );
    magma_event_t Pupdate_event;
    cudaEventCreateWithFlags(&Pupdate_event,cudaEventDisableTiming);
    //magma_event_create(&Pupdate_event);


    if (upper) {
        printf("DSYTRD_HE2HB is not yet implemented for upper matrix storage. Exit.\n");
        exit(1);
    } else {
        /* Copy the matrix to the GPU */
        if (1 <= n-nb) {
            trace_gpu_start( 0, 0, "set", "set A" );
            magma_dsetmatrix_async( (n-nb), (n-nb),
                                    A(nb+1, nb+1),  lda,
                                    dA(nb+1, nb+1), ldda, stream[0] );
            trace_gpu_end( 0, 0 );
        }

        /* Reduce the lower triangle of A */
        for (i = 1; i <= n-nb; i += nb) {
             indi = i+nb;
             indj = i;
             pm   = n - i - nb + 1;
             //pn   = min(i+nb-1, n-nb) -i + 1;
             pn   = nb;
             
             /*   Get the current panel (no need for the 1st iteration) */
             if (i > 1 ) {
                 // dpanel_to_q copy the upper oof diagonal part of
                 // the matrix to work to be restored later. acctually
                 //  the zero's and one's putted are not used this is only
                 //   because we don't have a function that copy only the
                 //    upper part of A to be restored after copying the
                 //    lookahead panel that has been computted from GPU to CPU.
                 dpanel_to_q(MagmaUpper, pn-1, A(i, i+1), lda, work);

                 trace_gpu_start( 0, 1, "get", "get panel" );
                 //magma_queue_sync( stream[0] );
                 magma_queue_wait_event(stream[1], Pupdate_event);  //, 0);
                 magma_dgetmatrix_async( (pm+pn), pn,
                                         dA( i, i), ldda,
                                         A ( i, i), lda, stream[1] );
                 trace_gpu_end( 0, 1 );

                 trace_gpu_start( 0, 2, "her2k", "her2k" );
                 magma_dsyr2k(MagmaLower, MagmaNoTrans, pm_old-pn_old, pn_old, c_neg_one,
                      dA(indi_old+pn_old, indj_old), ldda,
                      dW + pn_old,            pm_old, d_one,
                      dA(indi_old+pn_old, indi_old+pn_old), ldda);
                 trace_gpu_end( 0, 2 );

                 trace_cpu_start( 0, "sync", "sync on 1" );
                 magma_queue_sync( stream[1] );
                 trace_cpu_end( 0 );
                 dq_to_panel(MagmaUpper, pn-1, A(i, i+1), lda, work);
             }

             /* ==========================================================
                QR factorization on a panel starting nb off of the diagonal.
                Prepare the V and T matrices.
                ==========================================================  */
             #ifdef TRACING
             snprintf( buf, sizeof(buf), "panel %d", i );
             #endif
             trace_cpu_start( 0, "geqrf", buf );
             lapackf77_dgeqrf(&pm, &pn, A(indi, indj), &lda,
                        tau_ref(i), work, &lwork, info);
             
             /* Form the matrix T */
                         pk=min(pm,pn);
             lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr,
                           &pm, &pk, A(indi, indj), &lda,
                           tau_ref(i), hT, &nb);

             /* Prepare V - put 0s in the upper triangular part of the panel
                (and 1s on the diagonal), temporaly storing the original in work */
             dpanel_to_q(MagmaUpper, pk, A(indi, indj), lda, work);
             trace_cpu_end( 0 );

             /* Send V from the CPU to the GPU */
             trace_gpu_start( 0, 0, "set", "set V and T" );
             magma_dsetmatrix_async( pm, pk,
                                     A(indi, indj),  lda,
                                     dA(indi, indj), ldda, stream[0] );

             /* Send the triangular factor T to the GPU */
             magma_dsetmatrix_async( pk, pk,
                                     hT,       nb,
                                     dT(i), lddt, stream[0] );
             trace_gpu_end( 0, 0 );
             
             /* ==========================================================
                Compute W:
                1. X = A (V T)
                2. W = X - 0.5* V * (T' * (V' * X))
                ==========================================================  */
             /* dwork = V T */
             trace_cpu_start( 0, "sync", "sync on 0" );
             // this sync is done here to be sure that the copy has been finished
             // because below we made a restore dq_to_panel and this restore need
             // to ensure that the copy has been finished. we did it here to allow
             // overlapp of restore with next gemm and symm.
             magma_queue_sync( stream[0] );
             trace_cpu_end( 0 );
             
             trace_gpu_start( 0, 2, "gemm", "work = V*T" );
             magma_dgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk,
                         c_one, dA(indi, indj), ldda,
                         dT(i), lddt,
                         c_zero, dwork, pm);
             trace_gpu_end( 0, 2 );
             
             /* dW = X = A*V*T. dW = A*dwork */
             trace_gpu_start( 0, 2, "hemm", "X = A*work" );
             magma_dsymm(MagmaLeft, uplo, pm, pk,
                         c_one, dA(indi, indi), ldda,
                         dwork, pm,
                         c_zero, dW, pm);
             trace_gpu_end( 0, 2 );
             /* restore the panel */
             dq_to_panel(MagmaUpper, pk, A(indi, indj), lda, work);
             
             /* dwork = V*T already ==> dwork' = T'*V'
              * compute T'*V'*X ==> dwork'*W ==>
              * dwork + pm*nb = ((T' * V') * X) = dwork' * X = dwork' * W */
             trace_gpu_start( 0, 2, "gemm", "work = T'*V'*X" );
             magma_dgemm(MagmaConjTrans, MagmaNoTrans, pk, pk, pm,
                         c_one, dwork, pm,
                         dW, pm,
                         c_zero, dwork + pm*nb, nb);
             trace_gpu_end( 0, 2 );
             
             /* W = X - 0.5 * V * T'*V'*X
              *   = X - 0.5 * V * (dwork + pm*nb) = W - 0.5 * V * (dwork + pm*nb) */
             trace_gpu_start( 0, 2, "gemm", "W = X - 0.5*V*(T'*V'*X)" );
             magma_dgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk,
                         c_neg_half, dA(indi, indj), ldda,
                         dwork + pm*nb, nb,
                         c_one,     dW, pm);
             trace_gpu_end( 0, 2 );

             /* ==========================================================
                Update the unreduced submatrix A(i+ib:n,i+ib:n), using
                an update of the form:  A := A - V*W' - W*V'
                ==========================================================  */
             if (i + nb <= n-nb) {
                 /* There would be next iteration;
                    do lookahead - update the next panel */
                 trace_gpu_start( 0, 2, "gemm", "gemm 4 next panel left" );
                 magma_dgemm(MagmaNoTrans, MagmaConjTrans, pm, pn, pn, c_neg_one,
                             dA(indi, indj), ldda,
                             dW,                 pm, c_one,
                             dA(indi, indi), ldda);
                 trace_gpu_end( 0, 2 );
             
                 trace_gpu_start( 0, 2, "gemm", "gemm 5 next panel right" );
                 magma_dgemm(MagmaNoTrans, MagmaConjTrans, pm, pn, pn, c_neg_one,
                             dW,                 pm,
                             dA(indi, indj), ldda, c_one,
                             dA(indi, indi), ldda);
                 trace_gpu_end( 0, 2 );
                 magma_event_record(Pupdate_event, stream[0]);
             }
             else {
                 /* no look-ahead as this is last iteration */
                 trace_gpu_start( 0, 2, "her2k", "her2k last iteration" );
                 magma_dsyr2k(MagmaLower, MagmaNoTrans, pk, pk, c_neg_one,
                              dA(indi, indj), ldda,
                              dW,                 pm, d_one,
                              dA(indi, indi), ldda);
                 trace_gpu_end( 0, 2 );
             }
             
             indi_old = indi;
             indj_old = indj;
             pm_old   = pm;
             pn_old   = pn;
        }  // end loop for (i)

        /* Send the last block to the CPU */
        pk = min(pm,pn);
        if (1 <= n-nb) {
            dpanel_to_q(MagmaUpper, pk-1, A(n-pk+1, n-pk+2), lda, work);
            trace_gpu_start( 0, 2, "get", "get last block" );
            magma_dgetmatrix( pk, pk,
                              dA(n-pk+1, n-pk+1), ldda,
                              A(n-pk+1, n-pk+1),  lda );
            trace_gpu_end( 0, 2 );
            dq_to_panel(MagmaUpper, pk-1, A(n-pk+1, n-pk+2), lda, work);
        }
    }// end of LOWER
    
    trace_finalize( "dsytrd_sy2sb.svg", "trace.css" );

    magma_event_destroy( Pupdate_event );
    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[1] );
    magma_free( dA );
    work[0] = MAGMA_D_MAKE( lwkopt, 0 );

    magmablasSetKernelStream( orig_stream );    
    magma_set_lapack_numthreads( orig_threads );

    return *info;
} /* magma_dsytrd_sy2sb */
Example #10
0
/**
    Purpose
    -------
    ZGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
       A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.
    Use two buffer to send panels.

    Arguments
    ---------
    @param[in]
    ngpu    INTEGER
            Number of GPUs to use. ngpu > 0.

    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    d_lAT   COMPLEX_16 array of pointers on the GPU, dimension (ngpu).
            On entry, the M-by-N matrix A distributed over GPUs
            (d_lAT[d] points to the local matrix on d-th GPU).
            It uses a 1D block column cyclic format (with the block size
            nb), and each local matrix is stored by row.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    @param[in]
    lddat   INTEGER
            The leading dimension of the array d_lAT[d]. LDDA >= max(1,M).

    @param[out]
    ipiv    INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    @param (workspace) on device
    d_lAP   COMPLEX_16 array of pointers on the GPU, dimension (ngpu).
            d_lAP[d] is the workspace on d-th GPU. Each local workspace
            must be of size (3+ngpu)*nb*maxm, where maxm is m rounded
            up to a multiple of 32 and nb is the block size.

    @param (workspace)
    W       COMPLEX_16 array, dimension (ngpu*nb*maxm).
            It is used to store panel on CPU.

    @param[in]
    ldw     INTEGER
            The leading dimension of the workspace w.

    @param[in]
    queues  magma_queue_t
            queues[d] points to the streams for the d-th GPU to execute
            in. Each GPU require two streams.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
      -     > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    @ingroup magma_zgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zgetrf2_mgpu(
    magma_int_t ngpu,
    magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t offset,
    magmaDoubleComplex_ptr d_lAT[], magma_int_t lddat, magma_int_t *ipiv,
    magmaDoubleComplex_ptr d_lAP[],
    magmaDoubleComplex *W, magma_int_t ldw,
    magma_queue_t queues[][2],
    magma_int_t *info)
{
#define dAT(id,i,j)  (d_lAT[(id)] + ((offset)+(i)*nb)*lddat + (j)*nb)
#define W(j) (W + ((j)%ngpu)*nb*ldw)

    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;

    magma_int_t block_size = 32;
    magma_int_t iinfo, n_local[MagmaMaxGPUs];
    magma_int_t maxm, mindim;
    magma_int_t i, j, d, dd, rows, cols, s, ldpan[MagmaMaxGPUs];
    magma_int_t id, j_local, j_local2, nb0, nb1, h = 2+ngpu;
    magmaDoubleComplex *d_panel[MagmaMaxGPUs], *panel_local[MagmaMaxGPUs];

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -2;
    else if (n < 0)
        *info = -3;
    else if (ngpu*lddat < max(1,n))
        *info = -5;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    if ( ngpu > ceil((double)n/nb) ) {
        *info = -1;
        return *info;
    }

    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );
    
    /* Use hybrid blocked code. */
    maxm  = ((m + block_size-1)/block_size)*block_size;

    /* some initializations */
    for (d=0; d < ngpu; d++) {
        magma_setdevice(d);
        
        n_local[d] = ((n/nb)/ngpu)*nb;
        if (d < (n/nb) % ngpu)
            n_local[d] += nb;
        else if (d == (n/nb) % ngpu)
            n_local[d] += n % nb;
        
        /* workspaces */
        d_panel[d] = &(d_lAP[d][h*nb*maxm]);   /* temporary panel storage */
    }
    trace_init( 1, ngpu, 2, (CUstream_st**)queues );

    /* start sending the panel to cpu */
    nb0 = min(mindim, nb);
    magma_setdevice(0);
    magmablasSetKernelStream(queues[0][1]);
    trace_gpu_start( 0, 1, "comm", "get" );
    magmablas_ztranspose( nb0, m, dAT(0,0,0), lddat, d_lAP[0], maxm );
    magma_zgetmatrix_async( m, nb0,
                            d_lAP[0], maxm,
                            W(0),     ldw, queues[0][1] );
    trace_gpu_end( 0, 1 );

    /* ------------------------------------------------------------------------------------- */
    magma_timer_t time=0;
    timer_start( time );

    s = mindim / nb;
    for( j=0; j < s; j++ ) {
        /* Set the GPU number that holds the current panel */
        id = j % ngpu;
        magma_setdevice(id);
        
        /* Set the local index where the current panel is */
        j_local = j/ngpu;
        cols  = maxm - j*nb;
        rows  = m - j*nb;
        
        /* synchronize j-th panel from id-th gpu into work */
        magma_queue_sync( queues[id][1] );
        
        /* j-th panel factorization */
        trace_cpu_start( 0, "getrf", "getrf" );
        lapackf77_zgetrf( &rows, &nb, W(j), &ldw, ipiv+j*nb, &iinfo);
        if ( (*info == 0) && (iinfo > 0) ) {
            *info = iinfo + j*nb;
        }
        trace_cpu_end( 0 );
        
        /* start sending the panel to all the gpus */
        d = (j+1) % ngpu;
        for( dd=0; dd < ngpu; dd++ ) {
            magma_setdevice(d);
            trace_gpu_start( 0, 1, "comm", "set" );
            magma_zsetmatrix_async( rows, nb,
                                    W(j),     ldw,
                                    &d_lAP[d][(j%h)*nb*maxm], cols,
                                    queues[d][1] );
            trace_gpu_end( 0, 1 );
            d = (d+1) % ngpu;
        }
        
        /* apply the pivoting */
        d = (j+1) % ngpu;
        for( dd=0; dd < ngpu; dd++ ) {
            magma_setdevice(d);
            
            trace_gpu_start( d, 1, "pivot", "pivot" );
            if ( dd == 0 ) {
                for( i=j*nb; i < j*nb + nb; ++i ) {
                    ipiv[i] += j*nb;
                }
            }
            magmablas_zlaswp_q( lddat, dAT(d,0,0), lddat, j*nb + 1, j*nb + nb, ipiv, 1, queues[d][0] );
            trace_gpu_end( d, 1 );
            d = (d+1) % ngpu;
        }
    
    
        /* update the trailing-matrix/look-ahead */
        d = (j+1) % ngpu;
        for( dd=0; dd < ngpu; dd++ ) {
            magma_setdevice(d);
            
            /* storage for panel */
            if ( d == id ) {
                /* the panel belond to this gpu */
                panel_local[d] = dAT(d,j,j_local);
                ldpan[d] = lddat;
                /* next column */
                j_local2 = j_local+1;
            } else {
                /* the panel belong to another gpu */
                panel_local[d] = d_panel[d];
                ldpan[d] = nb;
                /* next column */
                j_local2 = j_local;
                if ( d < id ) j_local2 ++;
            }
            /* the size of the next column */
            if ( s > (j+1) ) {
                nb0 = nb;
            } else {
                nb0 = n_local[d]-nb*(s/ngpu);
                if ( d < s % ngpu ) nb0 -= nb;
            }
            if ( d == (j+1) % ngpu) {
                /* owns the next column, look-ahead the column */
                nb1 = nb0;
                magmablasSetKernelStream(queues[d][1]);
                
                /* make sure all the pivoting has been applied */
                magma_queue_sync(queues[d][0]);
                trace_gpu_start( d, 1, "gemm", "gemm" );
                /* transpose panel on GPU */
                magmablas_ztranspose( rows, nb, &d_lAP[d][(j%h)*nb*maxm], cols, panel_local[d], ldpan[d] );
                /* synch for remaining update */
                magma_queue_sync(queues[d][1]);
            } else {
                /* update the entire trailing matrix */
                nb1 = n_local[d] - j_local2*nb;
                magmablasSetKernelStream(queues[d][0]);
                
                /* synchronization to make sure panel arrived on gpu */
                magma_queue_sync(queues[d][1]);
                trace_gpu_start( d, 0, "gemm", "gemm" );
                /* transpose panel on GPU */
                magmablas_ztranspose( rows, nb, &d_lAP[d][(j%h)*nb*maxm], cols, panel_local[d], ldpan[d] );
            }
            
            /* gpu updating the trailing matrix */
            magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                         nb1, nb, c_one,
                         panel_local[d],       ldpan[d],
                         dAT(d, j, j_local2), lddat);
            magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                         nb1, m-(j+1)*nb, nb,
                         c_neg_one, dAT(d, j,   j_local2),         lddat,
                                    &(panel_local[d][nb*ldpan[d]]), ldpan[d],
                         c_one,     dAT(d, j+1, j_local2),         lddat );
        
            if ( d == (j+1) % ngpu ) {
                /* Set the local index where the current panel is */
                int loff    = j+1;
                int j_local = (j+1)/ngpu;
                int ldda    = maxm - (j+1)*nb;
                int cols    = m - (j+1)*nb;
                nb0 = min(nb, mindim - (j+1)*nb); /* size of the diagonal block */
                trace_gpu_end( d, 1 );
                
                if ( nb0 > 0 ) {
                    /* transpose the panel for sending it to cpu */
                    trace_gpu_start( d, 1, "comm", "get" );
                    magmablas_ztranspose( nb0, m-(j+1)*nb, dAT(d,loff,j_local), lddat, &d_lAP[d][((j+1)%h)*nb*maxm], ldda );
             
                    /* send the panel to cpu */
                    magma_zgetmatrix_async( cols, nb0,
                                            &d_lAP[d][((j+1)%h)*nb*maxm], ldda,
                                            W(j+1), ldw, queues[d][1] );

                    trace_gpu_end( d, 1 );
                }
            } else {
                trace_gpu_end( d, 0 );
            }
            
            d = (d+1) % ngpu;
        }
    
        /* update the remaining matrix by gpu owning the next panel */
        if ( (j+1) < s ) {
            int j_local = (j+1)/ngpu;
            int rows  = m - (j+1)*nb;
            
            d = (j+1) % ngpu;
            magma_setdevice(d);
            magmablasSetKernelStream(queues[d][0]);
            trace_gpu_start( d, 0, "gemm", "gemm" );

            magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                         n_local[d] - (j_local+1)*nb, nb,
                         c_one, panel_local[d],       ldpan[d],
                                dAT(d,j,j_local+1),  lddat );
            magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                         n_local[d]-(j_local+1)*nb, rows, nb,
                         c_neg_one, dAT(d,j,j_local+1),            lddat,
                                    &(panel_local[d][nb*ldpan[d]]), ldpan[d],
                         c_one,     dAT(d,j+1,  j_local+1),        lddat );
            trace_gpu_end( d, 0 );
        }
    } /* end of for j=1..s */
    /* ------------------------------------------------------------------------------ */
    
    /* Set the GPU number that holds the last panel */
    id = s % ngpu;
    
    /* Set the local index where the last panel is */
    j_local = s/ngpu;
    
    /* size of the last diagonal-block */
    nb0 = min(m - s*nb, n - s*nb);
    rows = m    - s*nb;
    cols = maxm - s*nb;
    
    if ( nb0 > 0 ) {
        magma_setdevice(id);
        
        /* wait for the last panel on cpu */
        magma_queue_sync( queues[id][1] );
    
        /* factor on cpu */
        lapackf77_zgetrf( &rows, &nb0, W(s), &ldw, ipiv+s*nb, &iinfo);
        if ( (*info == 0) && (iinfo > 0) )
            *info = iinfo + s*nb;
        
        /* send the factor to gpus */
        for( d=0; d < ngpu; d++ ) {
            magma_setdevice(d);
            j_local2 = j_local;
            if ( d < id ) j_local2 ++;
            
            if ( d == id || n_local[d] > j_local2*nb ) {
                magma_zsetmatrix_async( rows, nb0,
                                        W(s),     ldw,
                                        &d_lAP[d][(s%h)*nb*maxm],
                                        cols, queues[d][1] );
            }
        }
        
        for( d=0; d < ngpu; d++ ) {
            magma_setdevice(d);
            if ( d == 0 ) {
                for( i=s*nb; i < s*nb + nb0; ++i ) {
                    ipiv[i] += s*nb;
                }
            }
            magmablas_zlaswp_q( lddat, dAT(d,0,0), lddat, s*nb + 1, s*nb + nb0, ipiv, 1, queues[d][0] );
        }
        
        for( d=0; d < ngpu; d++ ) {
            magma_setdevice(d);
            magmablasSetKernelStream(queues[d][1]);
            
            /* wait for the pivoting to be done */
            magma_queue_sync( queues[d][0] );
            
            j_local2 = j_local;
            if ( d < id ) j_local2++;
            if ( d == id ) {
                /* the panel belond to this gpu */
                panel_local[d] = dAT(d,s,j_local);
                
                /* next column */
                nb1 = n_local[d] - j_local*nb-nb0;
                
                magmablas_ztranspose( rows, nb0, &d_lAP[d][(s%h)*nb*maxm], cols, panel_local[d], lddat );
                
                if ( nb1 > 0 ) {
                    magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                                 nb1, nb0, c_one,
                                 panel_local[d],        lddat,
                                 dAT(d,s,j_local)+nb0, lddat);
                }
            } else if ( n_local[d] > j_local2*nb ) {
                /* the panel belong to another gpu */
                panel_local[d] = d_panel[d];
                
                /* next column */
                nb1 = n_local[d] - j_local2*nb;
                
                magmablas_ztranspose( rows, nb0, &d_lAP[d][(s%h)*nb*maxm], cols, panel_local[d], nb );
                magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             nb1, nb0, c_one,
                             panel_local[d],     nb,
                             dAT(d,s,j_local2), lddat);
            }
        }
    } /* if ( nb0 > 0 ) */
    
    /* clean up */
    trace_finalize( "zgetrf_mgpu.svg","trace.css" );
    for( d=0; d < ngpu; d++ ) {
        magma_setdevice(d);
        magma_queue_sync( queues[d][0] );
        magma_queue_sync( queues[d][1] );
    }
    magma_setdevice( orig_dev );
    magmablasSetKernelStream( orig_stream );
    
    timer_start( time );
    timer_printf("\n Performance %f GFlop/s\n", FLOPS_ZGETRF(m,n) / 1e9 / time );

    return *info;
} /* magma_zgetrf2_mgpu */
Example #11
0
/**
    Purpose
    =======
 
    ZHETRF computes the factorization of a complex Hermitian matrix A
    using the Bunch-Kaufman diagonal pivoting method.  The form of the
    factorization is
 
     A = U*D*U**H  or  A = L*D*L**H
 
    where U (or L) is a product of permutation and unit upper (lower)
    triangular matrices, and D is Hermitian and block diagonal with
    1-by-1 and 2-by-2 diagonal blocks.

    This is the blocked version of the algorithm, calling Level 3 BLAS.

    Arguments
    ---------
    @param[in]
    UPLO    CHARACTER*1
      -     = 'U':  Upper triangle of A is stored;
      -     = 'L':  Lower triangle of A is stored.
 
    @param[in]
    N       INTEGER
            The order of the matrix A.  N >= 0.
  
    @param[in,out]
    A       COMPLEX*16 array, dimension (LDA,N)
            On entry, the Hermitian matrix A.  If UPLO = 'U', the leading
            N-by-N upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = 'L', the
            leading N-by-N lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
    \n
            On exit, the block diagonal matrix D and the multipliers used
            to obtain the factor U or L (see below for further details).
 
    @param[in]
    LDA     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).
 
    @param[out]
    IPIV    INTEGER array, dimension (N)
            Details of the interchanges and the block structure of D.
            If IPIV(k) > 0, then rows and columns k and IPIV(k) were
            interchanged and D(k,k) is a 1-by-1 diagonal block.
            If UPLO = 'U' and IPIV(k) = IPIV(k-1) < 0, then rows and
            columns k-1 and -IPIV(k) were interchanged and D(k-1:k,k-1:k)
            is a 2-by-2 diagonal block.  If UPLO = 'L' and IPIV(k) =
            IPIV(k+1) < 0, then rows and columns k+1 and -IPIV(k) were
            interchanged and D(k:k+1,k:k+1) is a 2-by-2 diagonal block.

    @param[out]
    INFO    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  if INFO = i, D(i,i) is exactly zero.  The factorization
                  has been completed, but the block diagonal matrix D is
                  exactly singular, and division by zero will occur if it
                  is used to solve a system of equations.

    Further Details
    ===============
    If UPLO = 'U', then A = U*D*U', where
    U = P(n)*U(n)* ... *P(k)U(k)* ...,
    i.e., U is a product of terms P(k)*U(k), where k decreases from n to
    1 in steps of 1 or 2, and D is a block diagonal matrix with 1-by-1
    and 2-by-2 diagonal blocks D(k).  P(k) is a permutation matrix as
    defined by IPIV(k), and U(k) is a unit upper triangular matrix, such
    that if the diagonal block D(k) is of order s (s = 1 or 2), then
 
               (   I    v    0   )   k-s
       U(k) =  (   0    I    0   )   s
               (   0    0    I   )   n-k
                  k-s   s   n-k
 
    If s = 1, D(k) overwrites A(k,k), and v overwrites A(1:k-1,k).
    If s = 2, the upper triangle of D(k) overwrites A(k-1,k-1), A(k-1,k),
    and A(k,k), and v overwrites A(1:k-2,k-1:k).
  
    If UPLO = 'L', then A = L*D*L', where
       L = P(1)*L(1)* ... *P(k)*L(k)* ...,
    i.e., L is a product of terms P(k)*L(k), where k increases from 1 to
    n in steps of 1 or 2, and D is a block diagonal matrix with 1-by-1
    and 2-by-2 diagonal blocks D(k).  P(k) is a permutation matrix as
    defined by IPIV(k), and L(k) is a unit lower triangular matrix, such
    that if the diagonal block D(k) is of order s (s = 1 or 2), then
  
               (   I    0     0   )  k-1
       L(k) =  (   0    I     0   )  s
               (   0    v     I   )  n-k-s+1
                  k-1   s  n-k-s+1
  
    If s = 1, D(k) overwrites A(k,k), and v overwrites A(k+1:n,k).
    If s = 2, the lower triangle of D(k) overwrites A(k,k), A(k+1,k),
    and A(k+1,k+1), and v overwrites A(k+2:n,k:k+1).
 
    @ingroup magma_zhetrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zhetrf(
    magma_uplo_t uplo, magma_int_t n,
    magmaDoubleComplex *A, magma_int_t lda,
    magma_int_t *ipiv, magma_int_t *info)
{
    #define  A(i, j) ( A + (j)*lda  + (i))
    #define dA(i, j) (dA + (j)*ldda + (i))

    /* .. Local Scalars .. */
    magma_int_t upper;
    magma_int_t nb = magma_get_zhetrf_nb(n);
    magma_int_t iinfo = 0, nk, kb;

    /* .. Executable Statements .. */
    /* Test the input parameters. */
    *info = 0;
    upper = (uplo == MagmaUpper);
    if ( !upper && uplo != MagmaLower ) {
        *info = -1;
    } else if ( n < 0 ) {
        *info = -2;
    } else if ( lda < max( 1, n ) ) {
        *info = -4;
    }
    if ( *info != 0 ) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    magma_int_t ldda = 32*((n+31)/32);
    magmaDoubleComplex *dA, *dW;
    if ((MAGMA_SUCCESS != magma_zmalloc( &dA, n*ldda  )) ||
        (MAGMA_SUCCESS != magma_zmalloc( &dW, (1+nb)*ldda ))) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
    magma_queue_t stream[2];
    magma_event_t event[2];
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );
    magma_event_create( &event[0] );
    magma_event_create( &event[1] );
    trace_init( 1, 1, 2, (CUstream_st**)stream );

    /* copy matrix to GPU */
    trace_gpu_start( 0, 0, "set", "setA" );
    //magma_zsetmatrix_async( n, n, A(0,0), lda, dA(0,0), ldda, stream[0] );
    if ( upper ) {
       for (int k = 0; k < n; k+=nb ) {
           kb = min(nb, n-k);
           magma_zsetmatrix_async( k+kb, kb, A(0,k), lda, dA(0,k), ldda, stream[0] );
       }
    } else {
       for (int k = 0; k < n; k+=nb ) {
           kb = min(nb, n-k);
           magma_zsetmatrix_async( n-k, kb, A(k,k), lda, dA(k,k), ldda, stream[0] );
       }
    }
    trace_gpu_end( 0, 0 );

    if ( upper ) {

        /* Factorize A as U*D*U' using the upper triangle of A

           K is the main loop index, decreasing from N to 1 in steps of
           KB, where KB is the number of columns factorized by ZLAHEF;
           KB is either NB or NB-1, or K for the last block */

        kb = min(n,nb);
        for (int k = n-1; k >= 0; k-=kb ) {
            nk = k+1;
            kb = min(nb, nk);

            if ( k+1 > nb ) {

                /* Factorize columns k-kb+1:k of A and use blocked code to
                   update columns 1:k-kb */

                magma_zlahef_gpu( MagmaUpper, nk, kb, &kb, A( 0, 0 ), lda, dA( 0, 0 ), ldda, 
                                  &ipiv[0], dW, ldda, stream, event, &iinfo );
            } else {

                /* Use unblocked code to factorize columns 1:k of A */

                magma_queue_sync( stream[0] );
                magma_zgetmatrix( nk, nk, dA( 0, 0 ),ldda, A( 0, 0 ),lda );
                lapackf77_zhetf2( MagmaUpperStr, &nk, A( 0, 0 ), &lda, &ipiv[0], &iinfo );
                kb = k+1;
            }

            /* Set INFO on the first occurrence of a zero pivot */

            if ( *info == 0 && iinfo > 0 ) *info = iinfo;
        }
    } else {

        /* Factorize A as L*D*L' using the lower triangle of A

           K is the main loop index, increasing from 1 to N in steps of
           KB, where KB is the number of columns factorized by ZLAHEF;
           KB is either NB or NB-1, or N-K+1 for the last block */

        for (int k = 0; k < n; k += kb ) {
            nk = n-k;
            kb = min(nb, n - k);
            if ( k < n-nb ) {
                /* Factorize columns k:k+kb-1 of A and use blocked code to
                   update columns k+kb:n */
                magma_zlahef_gpu( MagmaLower, nk, nb, &kb, A( k, k ), lda, dA( k, k ), ldda,
                                  &ipiv[k], dW, ldda, stream, event, &iinfo );

            } else {
                /* Use unblocked code to factorize columns k:n of A */
                magma_queue_sync( stream[0] );
                magma_zgetmatrix( nk,nk, dA(k,k),ldda, A(k,k),lda );
                lapackf77_zhetf2( MagmaLowerStr, &nk, A( k, k ), &lda, &ipiv[k], &iinfo );
            }
            /* Set INFO on the first occurrence of a zero pivot */
            if ( *info == 0 && iinfo > 0 ) *info = iinfo + k;
            /* Adjust IPIV */
            for (int j = k; j < k + kb; j ++) {
                if ( ipiv[j] > 0 ) {
                    ipiv[j] = ipiv[j] + k;
                } else {
                    ipiv[j] = ipiv[j] - k;
                }
            }
        }
    }

    trace_finalize( "zhetrf.svg","trace.css" );
    magma_queue_sync( stream[0] );
    magma_queue_sync( stream[1] );
    magmablasSetKernelStream( NULL );
    magma_event_destroy( event[0] );
    magma_event_destroy( event[1] );
    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[1] );
    magma_free( dA );
    magma_free( dW );
    return *info;
    /* End of ZHETRF */
}
Example #12
0
extern "C" magma_int_t
magma_dpotrf3_mgpu(int num_gpus, char uplo, magma_int_t m, magma_int_t n, 
                   magma_int_t off_i, magma_int_t off_j, magma_int_t nb,
                   double **d_lA,  magma_int_t ldda, 
                   double **d_lP,  magma_int_t lddp, 
                   double *a,      magma_int_t lda,   magma_int_t h,
                   cudaStream_t stream[][3], cudaEvent_t event[][5],
                   magma_int_t *info ) 
{
/*  -- MAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2012

    Purpose   
    =======   
    DPOTRF computes the Cholesky factorization of a real symmetric   
    positive definite matrix dA.   
    Auxiliary subroutine for dpotrf2_ooc. It is multiple gpu interface to compute 
    Cholesky of a "rectangular" matrix.

    The factorization has the form   
       dA = U**T * U,  if UPLO = 'U', or   
       dA = L  * L**T,  if UPLO = 'L',   
    where U is an upper triangular matrix and L is lower triangular.   

    This is the block version of the algorithm, calling Level 3 BLAS.   

    Arguments   
    =========   
    UPLO    (input) CHARACTER*1   
            = 'U':  Upper triangle of dA is stored;   
            = 'L':  Lower triangle of dA is stored.   

    N       (input) INTEGER   
            The order of the matrix dA.  N >= 0.   

    dA      (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N)   
            On entry, the symmetric matrix dA.  If UPLO = 'U', the leading   
            N-by-N upper triangular part of dA contains the upper   
            triangular part of the matrix dA, and the strictly lower   
            triangular part of dA is not referenced.  If UPLO = 'L', the   
            leading N-by-N lower triangular part of dA contains the lower   
            triangular part of the matrix dA, and the strictly upper   
            triangular part of dA is not referenced.   

            On exit, if INFO = 0, the factor U or L from the Cholesky   
            factorization dA = U**T * U or dA = L * L**T.   

    LDDA     (input) INTEGER   
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalescent memory accesses LDDA must be
            dividable by 16.

    INFO    (output) INTEGER   
            = 0:  successful exit   
            < 0:  if INFO = -i, the i-th argument had an illegal value   
            > 0:  if INFO = i, the leading minor of order i is not   
                  positive definite, and the factorization could not be   
                  completed.   
    =====================================================================   */


    magma_int_t     j, jb, nb0, nb2, d, dd, id, j_local, j_local2, buf;
    char            uplo_[2] = {uplo, 0};
    double c_one     = MAGMA_D_ONE;
    double c_neg_one = MAGMA_D_NEG_ONE;
    double          d_one     =  1.0;
    double          d_neg_one = -1.0;
    int upper = lapackf77_lsame(uplo_, "U");
    double *dlpanel;
    magma_int_t n_local[MagmaMaxGPUs], ldpanel;
    //cudaEvent_t event0[MagmaMaxGPUs],  /* send row to CPU    */
    //            event1[MagmaMaxGPUs],  /* send diag to GPU   */
    //            event2[MagmaMaxGPUs],  /* offdiagonal update */
    //            event3[MagmaMaxGPUs],  /* send row to GPU    */
    //            event4[MagmaMaxGPUs];  /* lookahead          */
    const magma_int_t stream1 = 0, stream2 = 1, stream3 = 2;
    double *d_dinvA[MagmaMaxGPUs][2], *d_x[MagmaMaxGPUs][2]; /* used by dtrsm_work */

    *info = 0;
    if ( (! upper) && (! lapackf77_lsame(uplo_, "L")) ) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (!upper && num_gpus*ldda < max(1,n)) {
        *info = -4;
    } else if (upper && ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* initialization */
    for( d=0; d<num_gpus; d++ ) {
      /* local-n and local-ld */
      if (upper) {
        n_local[d] = ((n/nb)/num_gpus)*nb;
        if (d < (n/nb)%num_gpus)
          n_local[d] += nb;
        else if (d == (n/nb)%num_gpus)
          n_local[d] += n%nb;
      } else {
        n_local[d] = ((m/nb)/num_gpus)*nb;
        if (d < (m/nb)%num_gpus)
          n_local[d] += nb;
        else if (d == (m/nb)%num_gpus)
          n_local[d] += m%nb;
      }
      //magma_setdevice(d);
      //magma_event_create( &event0[d] );
      //magma_event_create( &event1[d] );
      //magma_event_create( &event2[d] );
      //magma_event_create( &event3[d] );
      //magma_event_create( &event4[d] );
    }

    /* == initialize the trace */
    trace_init( 1, num_gpus, 3, (CUstream_st**)stream );

    if (upper) 
    {     
      /* ---------------------------------------------- */
      /* Upper-triangular case                          */
      /* > Compute the Cholesky factorization A = U'*U. */
      /* ---------------------------------------------- */
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
      /* invert the diagonals 
       * Allocate device memory for the inversed diagonal blocks, size=m*NB
       */
      for( d=0; d<num_gpus; d++ ) {
          magma_setdevice(d);
          for( j=0; j<2; j++ ) {
              cudaMalloc((void**)&d_dinvA[d][j], nb*nb*sizeof(double));
              cudaMalloc((void**)&d_x[d][j],      n*nb*sizeof(double));
              cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(double));
              cudaMemset(d_x[d][j],     0,  n*nb*sizeof(double));
          }
      }
      magma_setdevice(0);
#endif

      for (j=0; j<m; j+=nb) {

        /* Set the GPU number that holds the current panel */
        id  = (j/nb)%num_gpus;
        buf = (j/nb)%num_gpus;

        /* Set the local index where the current panel is */
        j_local = j/(nb*num_gpus);
        jb = min(nb, (m-j));

        /* Update the current diagonal block on stream1 */
        magma_setdevice(id);
        if( j > 0 ) {
            magmablasSetKernelStream(stream[id][stream1]);
            trace_gpu_start( id, stream1, "syrk", "syrk" );
            magma_dsyrk(MagmaUpper, MagmaTrans, jb, j, 
                        d_neg_one, dlA(id, 0, nb*j_local), ldda, 
                        d_one,     dlA(id, j, nb*j_local), ldda);
            trace_gpu_end( id, stream1 );
        }

        /* send the diagonal to cpu on stream1 */
        trace_gpu_start( id, stream1, "comm", "D to CPU" );
        magma_dgetmatrix_async( jb, jb,
                                dlA(id, j, nb*j_local), ldda,
                                Aup(j,j),               lda, 
                                stream[id][stream1] );
        trace_gpu_end( id, stream1 );

        /* update off-diagonal blocks in the panel */
        if( j > 0 ) {
            d = (j/nb+1)%num_gpus;
            for( dd=0; dd<num_gpus; dd++ ) {
                j_local2 = j_local+1;
                if( d > id ) j_local2 --;
                nb0 = nb*j_local2; 

                if( n_local[d] > nb0 ) {
                    magma_setdevice(d);
                    magmablasSetKernelStream(stream[d][stream2]);
                    if( d == id ) {
                        dlpanel = dlA(d, 0, nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlP(d, jb, 0, buf);
                        ldpanel = lddp;
                        magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu
                    }
                    trace_gpu_start( d, stream2, "gemm", "gemm" );
                    magma_dgemm(MagmaTrans, MagmaNoTrans, 
                                jb, n_local[d]-nb0, j, 
                                c_neg_one, dlpanel,        ldpanel, 
                                           dlA(d, 0, nb0), ldda,
                                c_one,     dlA(d, j, nb0), ldda);
                    trace_gpu_end( d, stream2 );
                    magma_event_record( event[d][2], stream[d][stream2] );
                }
                d = (d+1)%num_gpus;
            }
        }

        /* wait for panel and factorize it on cpu */
        magma_setdevice(id);
        magma_queue_sync( stream[id][stream1] );
        trace_cpu_start( 0, "getrf", "getrf" );
        lapackf77_dpotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info);
        trace_cpu_end( 0 );
        if (*info != 0) {
            *info = *info + j;
            break;
        }

        /* send the diagonal to gpus on stream1 */
        if ( (j+jb) < n) {
          d = (j/nb+1)%num_gpus;
          for( dd=0; dd<num_gpus; dd++ ) {
            if( d == id ) {
                dlpanel = dlA(d, j, nb*j_local);
                ldpanel = ldda;
            } else {
                dlpanel = dlP(d, 0, 0, buf);
                ldpanel = lddp;
            }
            magma_setdevice(d);
            trace_gpu_start( d, stream1, "comm", "comm" );
            magma_dsetmatrix_async( jb, jb,
                                    Aup(j,j), lda,
                                    dlpanel,  ldpanel, 
                                    stream[d][stream1] );
            trace_gpu_end( d, stream1 );
            magma_event_record( event[d][1], stream[d][stream1] );
            d = (d+1)%num_gpus;
          }
        } else {
          magma_setdevice(id);
          trace_gpu_start( id, stream1, "comm", "comm" );
          magma_dsetmatrix_async( jb, jb,
                                  Aup(j,j),               lda,
                                  dlA(id, j, nb*j_local), ldda, 
                                  stream[id][stream1] );
          trace_gpu_end( id, stream1 );
        }

        /* panel-factorize the off-diagonal */
        if ( (j+jb) < n) {
            d = (j/nb+1)%num_gpus;
            for( dd=0; dd<num_gpus; dd++ ) {
                /* next column */
                j_local2 = j_local+1;
                if( d > id ) j_local2--;
                if( d == id ) {
                    dlpanel = dlA(d,j,nb*j_local);
                    ldpanel = ldda;
                } else {
                    dlpanel = dlP(d, 0, 0, buf);
                    ldpanel = lddp;
                }
                nb2 = n_local[d] - j_local2*nb;
                nb0 = min(nb, nb2);
        
                magma_setdevice(d);
                //magma_queue_sync( stream[d][stream1] );  // synch on chol for remaining update
                //magma_queue_sync( stream[d][stream2] );
                if( j+jb < m && d == (j/nb+1)%num_gpus ) { 
                    /* owns the next column, look-ahead next block on stream1 */
                    magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for gemm update 
                    magmablasSetKernelStream(stream[d][stream1]);
                    trace_gpu_start( d, stream1, "trsm", "trsm" );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
                    magmablas_dtrsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, 
                                          jb, nb0, c_one,
                                          dlpanel,                ldpanel, 
                                          dlA(d, j, nb*j_local2), ldda,
                                          d_dinvA[d][0], d_x[d][0] );
#else
                    magma_dtrsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, 
                                 jb, nb0, c_one,
                                 dlpanel,                ldpanel, 
                                 dlA(d, j, nb*j_local2), ldda);
#endif
                    magma_event_record( event[d][4], stream[d][stream1] );
                    trace_gpu_end( d, stream1 );
                } else if( nb2 > 0 ) {
                    /* update all the blocks on stream2 */
                    magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for cholesky factor 
                    trace_gpu_start( d, stream2, "trsm", "trsm" );
                    magmablasSetKernelStream(stream[d][stream2]);
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
                    magmablas_dtrsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, 
                                          jb, nb2, c_one,
                                          dlpanel,                ldpanel, 
                                          dlA(d, j, nb*j_local2), ldda,
                                          d_dinvA[d][1], d_x[d][1] );
#else
                    magma_dtrsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, 
                                 jb, nb2, c_one,
                                 dlpanel,                ldpanel, 
                                 dlA(d, j, nb*j_local2), ldda);
#endif
                    trace_gpu_end( d, stream2 );
                }
                d = (d+1)%num_gpus;
            } /* end of for */

/* ======================================================================================== */

            d = (j/nb+1)%num_gpus;
            /* next column */
            j_local2 = j_local+1;
            if( d > id ) j_local2--;
            nb0 = min(nb, n_local[d]-nb*j_local2 );
        
            /* even on 1 gpu, off-diagonals are copied to cpu (synchronize at the end).      *
             * so we have the Cholesky factor, but only diagonal submatrix of the big panel, *
             * on cpu at the end.                                                            */
            if( j+jb < m ) { 
                int d2, id2, j2, buf2;
                magma_setdevice(d);
                /* make sure all the previous sets are done */
                if( h < num_gpus ) {
                    /* > offdiagonal */
                    for( d2=0; d2<num_gpus; d2++ ) {
                        j2 = j - (1+d2)*nb;
                        if( j2 < 0 ) break;
                        id2  = (j2/nb)%num_gpus;
                        magma_queue_wait_event( stream[d][stream3], event[id2][0] );
                    }

                    /* > diagonal */
                    for( d2=0; d2<num_gpus; d2++ ) {
                        j2 = j - d2*nb;
                        if( j2 < 0 ) break;
                        id2  = (j2/nb)%num_gpus;
                        magma_queue_wait_event( stream[d][stream3], event[id2][1] );
                    }
                }
                /* lookahead */
                magma_queue_wait_event( stream[d][stream3], event[d][4] );

                trace_gpu_start( d, stream3, "comm", "row to CPU" );
                magma_dgetmatrix_async( (j+jb), nb0,
                                        dlA(d, 0, nb*j_local2), ldda,
                                        Aup(0,j+jb),            lda, 
                                        stream[d][stream3] );
                trace_gpu_end( d, stream3 );
                magma_event_record( event[d][3], stream[d][stream3] );
/* needed on pluto */
magma_queue_sync( stream[d][stream3] );

                /* wait for the off-diagonal on cpu */
                //magma_setdevice(id);
                //magma_queue_sync( stream[id][stream3] );

                /* broadcast rows to gpus on stream2 */
                buf2 = ((j+jb)/nb)%num_gpus;
                for( d2=0; d2<num_gpus; d2++ ) {
                    if( d2 != d ) 
                    {
                        magma_setdevice(d2);
                        trace_gpu_start( d2, stream3, "comm", "row to GPUs" );
                        magma_queue_wait_event( stream[d2][stream3], event[d][3] ); // rows arrived at cpu on stream3
                        magma_dsetmatrix_async( j+jb, nb0,
                                                Aup(0,j+jb),        lda,
                                                dlP(d2,nb0,0,buf2), lddp, 
                                                stream[d2][stream3] );
                        trace_gpu_end( d2, stream3 );
                        magma_event_record( event[d2][0], stream[d2][stream3] );
                    }
                } 
            }

/* ======================================================================================== */
            /* gpu owning the next column                    */
            /* after look ahead, update the remaining blocks */
            if( j+jb < m ) /* no update on the last block column */ 
            {
                d = (j/nb+1)%num_gpus;
                /* next column */
                j_local2 = j_local+1;
                if( d > id ) j_local2--;
                if( d == id ) {
                    dlpanel = dlA(d, j, nb*j_local);
                    ldpanel = ldda;
                } else {
                    dlpanel = dlP(d, 0, 0, buf);
                    ldpanel = lddp;
                }
                nb0 = min(nb, n_local[d]-nb*j_local2 );
                nb2 =         n_local[d]-nb*j_local2 - nb0;
        
                /* update the remaining blocks */
                if( nb2 > 0 ) {
                    magma_setdevice(d);
                    magmablasSetKernelStream(stream[d][stream2]);  
                    magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for cholesky factor
                    trace_gpu_start( d, stream2, "trsm", "trsm" );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
                    magmablas_dtrsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, 
                                          jb, nb2, c_one,
                                          dlpanel,                    ldpanel, 
                                          dlA(d, j, nb*j_local2+nb0), ldda,
                                          d_dinvA[d][1], d_x[d][1] );
#else
                    magma_dtrsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, 
                                 jb, nb2, c_one,
                                 dlpanel,                    ldpanel, 
                                 dlA(d, j, nb*j_local2+nb0), ldda);
#endif
                    trace_gpu_end( d, stream2 );
                }
            }
        } /* end of dtrsm */
      } /* end of for j=1, .., n */
    } else { 

      /* ---------------------------------------------- */
      /* Lower-triangular case                          */
      /* > Compute the Cholesky factorization A = L*L'. */
      /* ---------------------------------------------- */
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
      /* 
       * Allocate device memory for the inversed diagonal blocks, size=N*BLOCK_SIZE
       */
      for( d=0; d<num_gpus; d++ ) {
         magma_setdevice(d);
         for( j=0; j<2; j++ ) {
             cudaMalloc((void**)&d_dinvA[d][j], nb*nb*sizeof(double));
             cudaMalloc((void**)&d_x[d][j],     nb*m *sizeof(double));
             cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(double));
             cudaMemset(d_x[d][j],     0, nb* m*sizeof(double));
         }
      }
      magma_setdevice(0);
#endif

      for (j=0; j<n; j+=nb) {

        /* Set the GPU number that holds the current panel */
        id  = (j/nb)%num_gpus;
        buf = (j/nb)%num_gpus;

        /* Set the local index where the current panel is */
        j_local = j/(nb*num_gpus);
        jb = min(nb, (n-j));

        /* Update the current diagonal block on stream1 */
        magma_setdevice(id);
        if( j > 0 ) {
            magmablasSetKernelStream(stream[id][stream1]);
            magma_dsyrk(MagmaLower, MagmaNoTrans, jb, j,
                        d_neg_one, dlA(id, nb*j_local, 0), ldda,
                        d_one,     dlA(id, nb*j_local, j), ldda);
        }

        /* send the diagonal to cpu on stream1 */
        magma_dgetmatrix_async( jb, jb,
                                dlA(id, nb*j_local, j), ldda,
                                Alo(j,j),               lda, 
                                stream[id][stream1] );

        /* update off-diagonal blocks of the panel */
        if( j > 0 ) {
            d = (j/nb+1)%num_gpus;
            for( dd=0; dd<num_gpus; dd++ ) {
                j_local2 = j_local+1;
                if( d > id ) j_local2 --;
                nb0 = nb*j_local2; 

                if( nb0 < n_local[d] ) {
                    magma_setdevice(d);
                    magmablasSetKernelStream(stream[d][stream2]);
                    if( d == id ) {
                        dlpanel = dlA(d, nb*j_local, 0);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlPT(d,0,jb,buf);
                        ldpanel = nb;
                        magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu
                    }
                    magma_dgemm( MagmaNoTrans, MagmaTrans,
                                 n_local[d]-nb0, jb, j,
                                 c_neg_one, dlA(d, nb0, 0), ldda,
                                            dlpanel,        ldpanel,
                                 c_one,     dlA(d, nb0, j), ldda);
                    magma_event_record( event[d][2], stream[d][stream2] );
                }
                d = (d+1)%num_gpus;
            }
        }

        /* wait for the panel and factorized it on cpu */
        magma_setdevice(id);
        magma_queue_sync( stream[id][stream1] );
        lapackf77_dpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info);
        if (*info != 0) {
           *info = *info + j;
           break;
        }

        /* send the diagonal to gpus on stream1 */
        if ( (j+jb) < m) {
          d = (j/nb+1)%num_gpus;
          for( dd=0; dd<num_gpus; dd++ ) {
            if( d == id ) {
                dlpanel = dlA(d, nb*j_local, j);
                ldpanel = ldda;
            } else {
                dlpanel = dlPT(d, 0, 0, buf);
                ldpanel = nb;
            }
            magma_setdevice(d);
            magma_dsetmatrix_async( jb, jb,
                                    Alo(j,j), lda,
                                    dlpanel,  ldpanel, 
                                    stream[d][stream1] );
            magma_event_record( event[d][1], stream[d][stream1] );
            d = (d+1)%num_gpus;
          }
        } else {
          magma_setdevice(id);
          magma_dsetmatrix_async( jb, jb,
                                  Alo(j,j),               lda,
                                  dlA(id, nb*j_local, j), ldda, 
                                  stream[id][stream1] );
        }

        /* panel factorize the off-diagonal */
        if ( (j+jb) < m) {
          d = (j/nb+1)%num_gpus;
          for( dd=0; dd<num_gpus; dd++ ) {
            /* next column */
            j_local2 = j_local+1;
            if( d > id ) j_local2--;
            if( d == id ) {
                dlpanel = dlA(d, nb*j_local, j);
                ldpanel = ldda;
            } else {         
                dlpanel = dlPT(d, 0, 0, buf);
                ldpanel = nb;
            }
            nb2 = n_local[d] - j_local2*nb;
            nb0 = min(nb, nb2 );

            magma_setdevice(d);
            if( j+nb < n && d == (j/nb+1)%num_gpus ) { /* owns next column, look-ahead next block on stream1 */
              magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for gemm update 
              magmablasSetKernelStream(stream[d][stream1]);
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
              magmablas_dtrsm_work( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, 
                                    nb0, jb, c_one,
                                    dlpanel,                ldpanel, 
                                    dlA(d, nb*j_local2, j), ldda,
                                    d_dinvA[d][0], d_x[d][0] );
#else
              magma_dtrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, 
                           nb0, jb, c_one,
                           dlpanel,                ldpanel, 
                           dlA(d, nb*j_local2, j), ldda);
#endif
              magma_event_record( event[d][4], stream[d][stream1] );
            } else if( nb2 > 0 ) { /* other gpus updating all the blocks on stream2 */
              /* update the entire column */
              magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for the cholesky factor
              magmablasSetKernelStream(stream[d][stream2]);
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
              magmablas_dtrsm_work( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, 
                                    nb2, jb, c_one,
                                    dlpanel,                ldpanel, 
                                    dlA(d, nb*j_local2, j), ldda,
                                    d_dinvA[d][1], d_x[d][1] );
#else
              magma_dtrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, 
                           nb2, jb, c_one,
                           dlpanel,                ldpanel, 
                           dlA(d, nb*j_local2, j), ldda);
#endif
            }
            d = (d+1)%num_gpus;
          } /* end for d */

/* ======================================================================================== */

          d = (j/nb+1)%num_gpus;
          /* next column */
          j_local2 = j_local+1;
          if( d > id ) j_local2--;
          nb0 = min(nb, n_local[d]-nb*j_local2 );

          /* even on 1 gpu, we copy off-diagonal to cpu (but don't synchronize).  */
          /* so we have the Cholesky factor on cpu at the end.                    */
          if( j+jb < n ) {
              int d2, id2, j2, buf2;
              magma_setdevice(d);
              /* make sure all the previous sets are done */
              if( h < num_gpus ) {
                  /* > offdiagonal */
                  for( d2=0; d2<num_gpus; d2++ ) {
                      j2 = j - (1+d2)*nb;
                      if( j2 < 0 ) break;
                      id2  = (j2/nb)%num_gpus;
                      magma_queue_wait_event( stream[d][stream3], event[id2][0] );
                  }

                  /* > diagonal */
                  for( d2=0; d2<num_gpus; d2++ ) {
                      j2 = j - d2*nb;
                      if( j2 < 0 ) break;
                      id2  = (j2/nb)%num_gpus;
                      magma_queue_wait_event( stream[d][stream3], event[id2][1] );
                  }
              }
              // lookahead done
              magma_queue_wait_event( stream[d][stream3], event[d][4] );

              magma_dgetmatrix_async( nb0, j+jb,
                                      dlA(d, nb*j_local2, 0), ldda,
                                      Alo(j+jb,0),            lda, 
                                      stream[d][stream3] );
              magma_event_record( event[d][3], stream[d][stream3] );
/* syn on rows on CPU, seem to be needed on Pluto */
magma_queue_sync( stream[d][stream3] );

              /* broadcast the rows to gpus */
              buf2 = ((j+jb)/nb)%num_gpus;
              for( d2=0; d2<num_gpus; d2++ ) {
                  if( d2 != d ) 
                  {
                      magma_setdevice(d2);
                      magma_queue_wait_event( stream[d2][stream3], event[d][3] ); // getmatrix done
                      magma_dsetmatrix_async( nb0, j+jb,
                                              Alo(j+jb,0),         lda,
                                              dlPT(d2,0,nb0,buf2), nb, 
                                              stream[d2][stream3] );
                      magma_event_record( event[d2][0], stream[d2][stream3] );
                  }
              }
          }

/* ======================================================================================== */
          /* gpu owing the next column updates remaining blocks on stream2 */
          if( j+nb < n ) { // no lookahead on the last block column
            d = (j/nb+1)%num_gpus;

            /* next column */
            j_local2 = j_local+1;
            if( d > id ) j_local2--;
            if( d == id ) {
              dlpanel = dlA(d, nb*j_local, j);
              ldpanel = ldda;
            } else {         
              dlpanel = dlPT(d,0,0,buf);
              ldpanel = nb;
            }
            nb0 = min(nb, n_local[d]-nb*j_local2 );
            nb2 = n_local[d] - j_local2*nb - nb0;

            if( nb2 > 0 ) {
                magma_setdevice(d);
                magmablasSetKernelStream(stream[d][stream2]);
                /* update the remaining blocks in the column */
                magma_queue_wait_event( stream[d][stream2], event[d][1] ); // panel received
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
                magmablas_dtrsm_work( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, 
                                      nb2, jb, c_one,
                                      dlpanel,                    ldpanel, 
                                      dlA(d, nb*j_local2+nb0, j), ldda,
                                      d_dinvA[d][1], d_x[d][1] );
#else
                magma_dtrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, 
                             nb2, jb, c_one,
                             dlpanel,                    ldpanel, 
                             dlA(d, nb*j_local2+nb0, j), ldda);
#endif
            }
          }
        }
      }
    } /* end of else not upper */

    /* == finalize the trace == */
    trace_finalize( "dpotrf.svg","trace.css" );
    for( d=0; d<num_gpus; d++ ) {
      magma_setdevice(d);
      magma_queue_sync( stream[d][0] ); 
      magma_queue_sync( stream[d][1] );
      magma_queue_sync( stream[d][2] );
      magmablasSetKernelStream(NULL);

      //magma_event_destroy( event0[d] ); 
      //magma_event_destroy( event1[d] );
      //magma_event_destroy( event2[d] );
      //magma_event_destroy( event3[d] );
      //magma_event_destroy( event4[d] );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
      for( j=0; j<2; j++ ) {
          magma_free( d_dinvA[d][j] );
          magma_free( d_x[d][j] );
      }
#endif
    }
    magma_setdevice(0);

    return *info;
} /* magma_dpotrf_mgpu */
Example #13
0
extern "C" magma_int_t
magma_zpotrf2_mgpu(
    int num_gpus, magma_uplo_t uplo, magma_int_t m, magma_int_t n, 
    magma_int_t off_i, magma_int_t off_j, magma_int_t nb,
    magmaDoubleComplex_ptr *d_lA, size_t d_lA_offset, magma_int_t ldda, 
    magmaDoubleComplex_ptr *d_lP,  magma_int_t lddp, 
    magmaDoubleComplex *a,      magma_int_t lda,   magma_int_t h,
    magma_queue_t *queues,
    magma_int_t *info )
{
/*  -- clMAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date November 2014

    Purpose   
    =======   
    ZPOTRF computes the Cholesky factorization of a complex Hermitian   
    positive definite matrix dA.   

    The factorization has the form   
       dA = U**H * U,  if UPLO = 'U', or   
       dA = L  * L**H,  if UPLO = 'L',   
    where U is an upper triangular matrix and L is lower triangular.   

    This is the block version of the algorithm, calling Level 3 BLAS.   

    Arguments   
    =========   
    UPLO    (input) CHARACTER*1   
            = 'U':  Upper triangle of dA is stored;   
            = 'L':  Lower triangle of dA is stored.   

    N       (input) INTEGER   
            The order of the matrix dA.  N >= 0.   

    dA      (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N)   
            On entry, the Hermitian matrix dA.  If UPLO = 'U', the leading   
            N-by-N upper triangular part of dA contains the upper   
            triangular part of the matrix dA, and the strictly lower   
            triangular part of dA is not referenced.  If UPLO = 'L', the   
            leading N-by-N lower triangular part of dA contains the lower   
            triangular part of the matrix dA, and the strictly upper   
            triangular part of dA is not referenced.   

            On exit, if INFO = 0, the factor U or L from the Cholesky   
            factorization dA = U**H * U or dA = L * L**H.   

    LDDA     (input) INTEGER   
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalescent memory accesses LDDA must be
            divisible by 16.

    INFO    (output) INTEGER   
            = 0:  successful exit   
            < 0:  if INFO = -i, the i-th argument had an illegal value   
            > 0:  if INFO = i, the leading minor of order i is not   
                  positive definite, and the factorization could not be   
                  completed.   
    =====================================================================   */

    magma_int_t     j, jb, nb0, nb2, dd, d, id, j_local, j_local2, buf;
    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    double          d_one     =  1.0;
    double          d_neg_one = -1.0;
    magmaDoubleComplex_ptr dlpanel;
    size_t dlpanel_offset;
    magma_int_t n_local[MagmaMaxGPUs], ldpanel;
    magma_event_t* events[MagmaMaxGPUs][2];

    *info = 0;
    if ( (uplo != MagmaUpper) && (uplo != MagmaLower) ) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if ((uplo != MagmaUpper) && num_gpus*ldda < max(1,n)) {
        *info = -4;
    } else if ((uplo == MagmaUpper) && ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    {

      for( d=0; d<num_gpus; d++ ) {
        /* local-n and local-ld */
        if (uplo == MagmaUpper) {
            n_local[d] = ((n/nb)/num_gpus)*nb;
            if (d < (n/nb)%num_gpus)
               n_local[d] += nb;
            else if (d == (n/nb)%num_gpus)
              n_local[d] += n%nb;
        } else {
            n_local[d] = ((m/nb)/num_gpus)*nb;
            if (d < (m/nb)%num_gpus)
               n_local[d] += nb;
            else if (d == (m/nb)%num_gpus)
              n_local[d] += m%nb;
        }
      }

      /* Use blocked code. */
      if (uplo == MagmaUpper) 
        {
          /* ---------------------------------------------- */
          /* Upper-triangular case                          */
          /* > Compute the Cholesky factorization A = U'*U. */
          /* ---------------------------------------------- */
            printf("support Lower case only!\n");
        } else { 
            trace_init(1, num_gpus, 2, queues);
            /* -------------------------------------------- */
            /* Lower-triangular case                        */
            /* Compute the Cholesky factorization A = L*L'. */
            /* -------------------------------------------- */
            // try one queue per device firstly!
            for (j=0; j<n; j+=nb) {

              /* Set the GPU number that holds the current panel */
              id  = (j/nb)%num_gpus;
              buf = (j/nb)%num_gpus;

              /* Set the local index where the current panel is */
              j_local = j/(nb*num_gpus);
              jb = min(nb, (n-j));

              if( j > 0 ) {
                  /* needed on pluto... */
                  //magma_setdevice(id);
                  magma_queue_sync( queues[id*2] ); // wait for the column on CPU

                  /* broadcast offdiagonal row to all gpus */
                  d = (j/nb+1)%num_gpus;
                  for( dd=0; dd<num_gpus; dd++ ) {
                      if( d != id ) {
                          /* wait for it on CPU */
                          //magma_queue_sync( queues[d*2] );

                          /* send it to GPU */
                          trace_gpu_start(d, 0, "comm_set", "set");
                          events[d][0] = trace_gpu_end(d, 0);
                          magma_zsetmatrix_async( jb, j,
                                                  Alo(j,0), 0,      lda,
                                                  dlPT(d,0,jb,buf), nb, 
                                                  queues[d*2], events[d][0] );
                         clFlush(queues[d*2]);
                      }
                      d = (d+1)%num_gpus;
                  }
              }

              /* Update the current diagonal block */
              if( j > 0 ) {
                  trace_gpu_start(id, 1, "herk", "herk");
                  events[id][1] = trace_gpu_end(id, 1);
                  magma_zherk_trace(MagmaLower, MagmaNoTrans, jb, j,
                              d_neg_one, dlA(id, nb*j_local, 0), ldda,
                              d_one,     dlA(id, nb*j_local, j), ldda,
                              queues[id*2+1], events[id][1]);
                 // magma_queue_sync( queues[id*2+1] ); // wait for syrk
                  clWaitForEvents(1, events[id][1]);
              }

              /* send the diagonal to cpu */
              trace_gpu_start(id, 0, "comm_get", "get");
              events[id][0] = trace_gpu_end(id, 0);
              magma_zgetmatrix_async( jb, jb,
                                      dlA(id, nb*j_local, j), ldda,
                                      Alo(j,j), 0,            lda, 
                                      queues[id*2], events[id][0] );
              clFlush(queues[id*2]);


              /* update the offdiagonal blocks */
              if ( j > 0 ) {
                  /* compute the block-rows of the panel */
                  d = (j/nb+1)%num_gpus;
                  for( dd=0; dd<num_gpus; dd++ ) {
                      j_local2 = j_local+1;
                      if( d > id ) j_local2 --;
                      nb0 = nb*j_local2;

                      if( nb0 < n_local[d] ) {
                          if( d != id ) {
                              //dlpanel = dlPT(d);
                              dlpanel = d_lP[d];
                              dlpanel_offset = dlPT_offset(0, jb, buf);
                              ldpanel = nb;

                              /* wait for offdiagonal row */
                              magma_queue_sync(queues[d*2]);
                          } else {
                              dlpanel = d_lA[d];
                              dlpanel_offset = dlA_offset(nb*j_local, 0);
                              ldpanel = ldda;
                          }
                        
                          trace_gpu_start(d, 1, "gemm", "gemm");
                          events[d][1] = trace_gpu_end(d, 1);
                          magma_zgemm_trace( MagmaNoTrans, MagmaConjTrans,
                                       n_local[d]-nb0, jb, j,
                                       c_neg_one, dlA(d, nb0, 0), ldda,
                                                  dlpanel, dlpanel_offset, ldpanel,
                                       c_one,     dlA(d, nb0, j), ldda, 
                                       queues[d*2+1], events[d][1]);
                      }
                      d = (d+1)%num_gpus;
                  }
              }

              /* send the diagonal to cpu */
              /*
              trace_gpu_start(id, 0, "comm_get", "get");
              events[id][0] = trace_gpu_end(id, 0);
              magma_zgetmatrix_async( jb, jb,
                                      dlA(id, nb*j_local, j), ldda,
                                      Alo(j,j), 0,            lda, 
                                      queues[id*2], events[id][0] );
              clFlush(queues[id*2]);
              */
              /* factor the diagonal */
              magma_queue_sync( queues[id*2] );
              trace_cpu_start(0, "potrf", "potrf");
              lapackf77_zpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info);
              trace_cpu_end(0);
              if (*info != 0) {
                  printf("row number: %d\n", j);
                  *info = *info + j;
                  break;
              }

              /* send the diagonal to gpus */
              if ( (j+jb) < m ) {
                  d = (j/nb+1)%num_gpus;
                  for( dd=0; dd<num_gpus; dd++ ) {
                      if( d == id ) {
                          dlpanel = d_lA[d];
                          dlpanel_offset = dlA_offset(nb*j_local, j);
                          ldpanel = ldda;
                      } else {
                          //dlpanel = dlPT(d);
                          dlpanel = d_lP[d];
                          dlpanel_offset = dlPT_offset(0, 0, buf);
                          ldpanel = nb;
                      }
                      trace_gpu_start(d, 0, "comm_set", "set");
                      events[d][0] = trace_gpu_end(d, 0);
                      magma_zsetmatrix_async( jb, jb,
                                              Alo(j,j), 0, lda,
                                              dlpanel,  dlpanel_offset, ldpanel, 
                                              queues[d*2], events[d][0] );
                      clFlush(queues[d*2]);
                      d = (d+1)%num_gpus;
                  }
              } else {
                  trace_gpu_start(id, 0, "comm_set", "set");
                  events[id][0] = trace_gpu_end(id, 0);
                  magma_zsetmatrix_async( jb, jb,
                                          Alo(j,j),       0,      lda,
                                          dlA(id, nb*j_local, j), ldda, 
                                          queues[id*2], events[id][0] );
                  clFlush(queues[id*2]);
              }

              /* factorize off-diagonal blocks */
              if ( (j+jb) < m ) {
                  d = (j/nb+1)%num_gpus;
                  for( dd=0; dd<num_gpus; dd++ ) {
                      /* next column */
                      j_local2 = j_local+1;
                      if( d > id ) j_local2--;
                      if( d == id ) {
                          dlpanel = d_lA[d];
                          dlpanel_offset = dlA_offset(nb*j_local, j);
                          ldpanel = ldda;
                      } else {         
                          //dlpanel = dlPT(d);
                          dlpanel = d_lP[d];
                          dlpanel_offset = dlPT_offset(0, 0, buf);
                          ldpanel = nb;
                      }
                      nb2 = n_local[d] - j_local2*nb;
                      nb0 = min(nb, nb2 );
                        
                      magma_queue_sync(queues[d*2]);
                      // wait for the diagonal
                      if( j+jb < n && d == (j/nb+1)%num_gpus ) {
                          /* owns the next column, look-ahead the column */
                          trace_gpu_start(d, 1, "trsm", "trsm");
                          events[d][1] = trace_gpu_end(d, 1);
                          magma_ztrsm_trace( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, 
                                       nb0, jb, c_one,
                                       dlpanel,  dlpanel_offset, ldpanel, 
                                       dlA(d, nb*j_local2, j), ldda,
                                       queues[d*2+1], events[d][1]);
                          /* send the column to cpu */
                          if( j+jb < n ) {
                              magma_queue_sync( queues[d*2+1] ); // wait for lookahead
                              trace_gpu_start(d, 0, "comm_get", "get");
                              events[d][0] = trace_gpu_end(d, 0);
                              magma_zgetmatrix_async( nb0, j+jb,
                                                      dlA(d, nb*j_local2, 0), ldda,
                                                      Alo(j+jb,0), 0,           lda, 
                                                      queues[d*2], events[d][0]);
                              clFlush(queues[d*2]);
                          }
                          /* update the remaining blocks */
                          nb2 = nb2 - nb0;
                          trace_gpu_start(d, 1, "trsm", "trsm");
                          events[d][1] = trace_gpu_end(d, 1);
                          magma_ztrsm_trace( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, 
                                       nb2, jb, c_one,
                                       dlpanel, dlpanel_offset, ldpanel, 
                                       dlA(d, nb*j_local2+nb0, j), ldda, 
                                       queues[d*2+1], events[d][1]);
                      } else if( nb2 > 0 ) {
                          /* update the entire trailing matrix */
                          trace_gpu_start(d, 1, "trsm", "trsm");
                          events[d][1] = trace_gpu_end(d, 1);
                          magma_ztrsm_trace( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, 
                                       nb2, jb, c_one,
                                       dlpanel, dlpanel_offset, ldpanel, 
                                       dlA(d, nb*j_local2, j), ldda, 
                                       queues[d*2+1], events[d][1]);
                      }
                      d = (d+1)%num_gpus;
                  }
              }
            }
          } /* end of else not upper */

          /* clean up */
          for( d=0; d<num_gpus; d++ ) {
              magma_queue_sync( queues[d*2] );
              magma_queue_sync( queues[d*2+1] );
          }
          trace_finalize("dpotrf2_mgpu.svg", "trace.css");

    } /* end of not lapack */

    return *info;
} /* magma_zpotrf_mgpu */
Example #14
0
extern "C" magma_int_t
magma_ssytrd_sy2sb( char uplo, magma_int_t n, magma_int_t nb,
                    float *a, magma_int_t lda, 
                    float *tau,
                    float *work, magma_int_t lwork,
                    float *dT,
                    magma_int_t threads, magma_int_t *info)
{
/*  -- MAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2012

    Purpose   
    =======   
    SSYTRD_HE2HB reduces a real symmetric matrix A to real symmetric   
    band-diagonal form T by an orthogonal similarity transformation:   
    Q**T * A * Q = T.   
    This version stores the triangular matrices T used in the accumulated
    Householder transformations (I - V T V').

    Arguments   
    =========   
    UPLO    (input) CHARACTER*1   
            = 'U':  Upper triangle of A is stored;   
            = 'L':  Lower triangle of A is stored.   

    N       (input) INTEGER   
            The order of the matrix A.  N >= 0.   

    A       (input/output) REAL array, dimension (LDA,N)   
            On entry, the symmetric matrix A.  If UPLO = 'U', the leading   
            N-by-N upper triangular part of A contains the upper   
            triangular part of the matrix A, and the strictly lower   
            triangular part of A is not referenced.  If UPLO = 'L', the   
            leading N-by-N lower triangular part of A contains the lower   
            triangular part of the matrix A, and the strictly upper   
            triangular part of A is not referenced.   
            On exit, if UPLO = 'U', the Upper band-diagonal of A is 
            overwritten by the corresponding elements of the   
            band-diagonal matrix T, and the elements above the band   
            diagonal, with the array TAU, represent the orthogonal   
            matrix Q as a product of elementary reflectors; if UPLO   
            = 'L', the the Lower band-diagonal of A is overwritten by 
            the corresponding elements of the band-diagonal   
            matrix T, and the elements below the band-diagonal, with   
            the array TAU, represent the orthogonal matrix Q as a product   
            of elementary reflectors. See Further Details.   

    LDA     (input) INTEGER   
            The leading dimension of the array A.  LDA >= max(1,N).   

    TAU     (output) REAL array, dimension (N-1)   
            The scalar factors of the elementary reflectors (see Further   
            Details).   

    WORK    (workspace/output) REAL array, dimension (MAX(1,LWORK))   
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.   

    LWORK   (input) INTEGER   
            The dimension of the array WORK.  LWORK >= 1.   
            For optimum performance LWORK >= N*NB, where NB is the   
            optimal blocksize.   

            If LWORK = -1, then a workspace query is assumed; the routine   
            only calculates the optimal size of the WORK array, returns   
            this value as the first entry of the WORK array, and no error   
            message related to LWORK is issued by XERBLA.   

    dT      (output) REAL array on the GPU, dimension N*NB, 
            where NB is the optimal blocksize.
            On exit dT holds the upper triangular matrices T from the 
            accumulated Householder transformations (I - V T V') used
            in the factorization. The nb x nb matrices T are ordered 
            consecutively in memory one after another.

    INFO    (output) INTEGER   
            = 0:  successful exit   
            < 0:  if INFO = -i, the i-th argument had an illegal value   

    Further Details   
    ===============   
    If UPLO = 'U', the matrix Q is represented as a product of elementary   
    reflectors   

       Q = H(n-1) . . . H(2) H(1).   

    Each H(i) has the form   

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with   
    v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in   
    A(1:i-1,i+1), and tau in TAU(i).   

    If UPLO = 'L', the matrix Q is represented as a product of elementary   
    reflectors   

       Q = H(1) H(2) . . . H(n-1).   

    Each H(i) has the form   

       H(i) = I - tau * v * v'   

    where tau is a real scalar, and v is a real vector with   
    v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i),   
    and tau in TAU(i).

    The contents of A on exit are illustrated by the following examples   
    with n = 5:   

    if UPLO = 'U':                       if UPLO = 'L':   

      (  d   e   v2  v3  v4 )              (  d                  )   
      (      d   e   v3  v4 )              (  e   d              )   
      (          d   e   v4 )              (  v1  e   d          )   
      (              d   e  )              (  v1  v2  e   d      )   
      (                  d  )              (  v1  v2  v3  e   d  )   

    where d and e denote diagonal and off-diagonal elements of T, and vi   
    denotes an element of the vector defining H(i).   
    =====================================================================    */

    #define a_ref(a_1,a_2)  ( a  + ((a_2)-1)*( lda) + (a_1)-1)
    #define da_ref(a_1,a_2) (da  + ((a_2)-1)*(ldda) + (a_1)-1)
    #define tau_ref(a_1)    (tau + (a_1)-1)
    #define t_ref(a_1)      (dT  + ((a_1)-1)*(lddt))

    char uplo_[2] = {uplo, 0};

    int ldda = ((n+31)/32)*32;
    int lddt = nb;
   
    float c_neg_one  = MAGMA_S_NEG_ONE;
    float c_neg_half = MAGMA_S_NEG_HALF;
    float c_one  = MAGMA_S_ONE ;
    float c_zero = MAGMA_S_ZERO;
    float  d_one = MAGMA_D_ONE;

    magma_int_t pm, pn, indi, indj, pk;
    magma_int_t pm_old=0, pn_old=0, indi_old=0, indj_old=0;

    int i;
    int lwkopt;
    int lquery;

    *info = 0;
    int upper = lapackf77_lsame(uplo_, "U");
    lquery = lwork == -1;
    if (! upper && ! lapackf77_lsame(uplo_, "L")) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,n)) {
        *info = -4;
    } else if (lwork < 1 && ! lquery) {
        *info = -9;
    }

    if (*info == 0) {
      /* Determine the block size. */
      lwkopt = n * nb;
      MAGMA_S_SET2REAL( work[0], lwkopt );
    }

    if (*info != 0)
      return *info;
    else if (lquery)
      return *info;

    /* Quick return if possible */
    if (n == 0) {
        work[0] = c_one;
        return *info;
    }

    float *da;
    if (MAGMA_SUCCESS != magma_smalloc( &da, (n + 2*nb)*ldda )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    magma_int_t mklth = min(threads,12);
#if defined(USEMKL)
    mkl_set_num_threads(mklth);
#endif
#if defined(USEACML)
    omp_set_num_threads(mklth);
#endif


    /* Use the first panel of da as work space */
    float *dwork = da+n*ldda;
    float *dW    = dwork + nb*ldda;

    #ifdef TRACING
    char buf[80];
    #endif
    cudaStream_t stream[3];
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );
    stream[2] = 0;  // default stream
    
    trace_init( 1, 1, 3, stream );

    float *hT = work + lwork - nb*nb;
    lwork -= nb*nb;
    memset( hT, 0, nb*nb*sizeof(float));

    magmablasSetKernelStream( stream[0] );
    cudaEvent_t Pupdate_event;
    cudaEventCreateWithFlags(&Pupdate_event,cudaEventDisableTiming);
    //cudaEventCreate(&Pupdate_event);


    if (upper) {
      printf("SSYTRD_HE2HB is not yet implemented for upper matrix storage. Exit.\n");
      exit(1);

    }else {
        /* Copy the matrix to the GPU */
        if (1 <= n-nb){
            trace_gpu_start( 0, 0, "set", "set A" );
            magma_ssetmatrix_async( (n-nb), (n-nb),
                                    a_ref(nb+1, nb+1),  lda,
                                    da_ref(nb+1, nb+1), ldda, stream[0] );
            trace_gpu_end( 0, 0 );
        }

        /* Reduce the lower triangle of A */
        for (i = 1; i <= n-nb; i += nb) 
        {
             indi = i+nb;
             indj = i;
             pm   = n - i - nb + 1;
             //pn   = min(i+nb-1, n-nb) -i + 1;
             pn   = nb;
             
             /*   Get the current panel (no need for the 1st iteration) */
             if (i > 1 ){
                 // spanel_to_q copy the upper oof diagonal part of 
                 // the matrix to work to be restored later. acctually
                 //  the zero's and one's putted are not used this is only
                 //   because we don't have a function that copy only the
                 //    upper part of A to be restored after copying the 
                 //    lookahead panel that has been computted from GPU to CPU. 
                 spanel_to_q(MagmaUpper, pn-1, a_ref(i, i+1), lda, work);

                 trace_gpu_start( 0, 1, "get", "get panel" );
                 //magma_queue_sync( stream[0] );
                 cudaStreamWaitEvent(stream[1], Pupdate_event, 0);
                 magma_sgetmatrix_async( (pm+pn), pn,
                                         da_ref( i, i), ldda,
                                         a_ref ( i, i), lda, stream[1] );
                 trace_gpu_end( 0, 1 );

                 trace_gpu_start( 0, 2, "syr2k", "syr2k" );
                 magma_ssyr2k(MagmaLower, MagmaNoTrans, pm_old-pn_old, pn_old, c_neg_one,
                      da_ref(indi_old+pn_old, indj_old), ldda,
                      dW + pn_old           , pm_old, d_one,
                      da_ref(indi_old+pn_old, indi_old+pn_old), ldda);
                 trace_gpu_end( 0, 2 );

                 trace_cpu_start( 0, "sync", "sync on 1" );
                 magma_queue_sync( stream[1] );
                 trace_cpu_end( 0 );
                 sq_to_panel(MagmaUpper, pn-1, a_ref(i, i+1), lda, work);
             }

             /* ==========================================================
                QR factorization on a panel starting nb off of the diagonal.
                Prepare the V and T matrices. 
                ==========================================================  */
             #ifdef TRACING
             snprintf( buf, sizeof(buf), "panel %d", i );
             #endif
             trace_cpu_start( 0, "geqrf", buf );
             lapackf77_sgeqrf(&pm, &pn, a_ref(indi, indj), &lda, 
                        tau_ref(i), work, &lwork, info);
             
             /* Form the matrix T */
                         pk=min(pm,pn);
             lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr,
                           &pm, &pk, a_ref(indi, indj), &lda,
                           tau_ref(i), hT, &nb);

             /* Prepare V - put 0s in the upper triangular part of the panel
                (and 1s on the diagonal), temporaly storing the original in work */
             spanel_to_q(MagmaUpper, pk, a_ref(indi, indj), lda, work);
             trace_cpu_end( 0 );

             /* Send V from the CPU to the GPU */
             trace_gpu_start( 0, 0, "set", "set V and T" );
             magma_ssetmatrix_async( pm, pk,
                                     a_ref(indi, indj),  lda,
                                     da_ref(indi, indj), ldda, stream[0] );

             /* Send the triangular factor T to the GPU */
             magma_ssetmatrix_async( pk, pk,
                                     hT,       nb,
                                     t_ref(i), lddt, stream[0] );
             trace_gpu_end( 0, 0 );
             
             /* ==========================================================
                Compute W:
                1. X = A (V T)
                2. W = X - 0.5* V * (T' * (V' * X)) 
                ==========================================================  */
             /* dwork = V T */
             trace_cpu_start( 0, "sync", "sync on 0" );
             // this sync is done here to be sure that the copy has been finished
             // because below we made a restore sq_to_panel and this restore need
             // to ensure that the copy has been finished. we did it here to allow
             // overlapp of restore with next gemm and symm.
             magma_queue_sync( stream[0] );
             trace_cpu_end( 0 );
             
             trace_gpu_start( 0, 2, "gemm", "work = V*T" );
             magma_sgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk,
                         c_one, da_ref(indi, indj), ldda, 
                         t_ref(i), lddt,
                         c_zero, dwork, pm);
             trace_gpu_end( 0, 2 );
             
             /* dW = X = A*V*T. dW = A*dwork */ 
             trace_gpu_start( 0, 2, "symm", "X = A*work" );
             magma_ssymm(MagmaLeft, uplo, pm, pk,
                         c_one, da_ref(indi, indi), ldda,
                         dwork, pm,
                         c_zero, dW, pm);
             trace_gpu_end( 0, 2 );
             /* restore the panel */
             sq_to_panel(MagmaUpper, pk, a_ref(indi, indj), lda, work);
             
             /* dwork = V*T already ==> dwork' = T'*V'
              * compute T'*V'*X ==> dwork'*W ==>
              * dwork + pm*nb = ((T' * V') * X) = dwork' * X = dwork' * W */
             trace_gpu_start( 0, 2, "gemm", "work = T'*V'*X" );
             magma_sgemm(MagmaTrans, MagmaNoTrans, pk, pk, pm,
                         c_one, dwork, pm, 
                         dW, pm,
                         c_zero, dwork + pm*nb, nb);
             trace_gpu_end( 0, 2 );
             
             /* W = X - 0.5 * V * T'*V'*X
              *   = X - 0.5 * V * (dwork + pm*nb) = W - 0.5 * V * (dwork + pm*nb) */
             trace_gpu_start( 0, 2, "gemm", "W = X - 0.5*V*(T'*V'*X)" );
             magma_sgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk,
                         c_neg_half, da_ref(indi, indj), ldda,
                         dwork + pm*nb, nb, 
                         c_one,     dW, pm);
             trace_gpu_end( 0, 2 );

             /* ==========================================================
                Update the unreduced submatrix A(i+ib:n,i+ib:n), using   
                an update of the form:  A := A - V*W' - W*V' 
                ==========================================================  */
             if (i + nb <= n-nb){
                 /* There would be next iteration;
                    do lookahead - update the next panel */
                 trace_gpu_start( 0, 2, "gemm", "gemm 4 next panel left" );
                 magma_sgemm(MagmaNoTrans, MagmaTrans, pm, pn, pn, c_neg_one,
                             da_ref(indi, indj), ldda,
                             dW                , pm, c_one,
                             da_ref(indi, indi), ldda);
                 trace_gpu_end( 0, 2 );
             
                 trace_gpu_start( 0, 2, "gemm", "gemm 5 next panel right" );
                 magma_sgemm(MagmaNoTrans, MagmaTrans, pm, pn, pn, c_neg_one,
                             dW                , pm,
                             da_ref(indi, indj), ldda, c_one,
                             da_ref(indi, indi), ldda);
                 trace_gpu_end( 0, 2 );
                 cudaEventRecord(Pupdate_event, stream[0]);
             }
             else {
                 /* no look-ahead as this is last iteration */
                 trace_gpu_start( 0, 2, "syr2k", "syr2k last iteration" );
                 magma_ssyr2k(MagmaLower, MagmaNoTrans, pk, pk, c_neg_one,
                              da_ref(indi, indj), ldda,
                              dW                , pm, d_one,
                              da_ref(indi, indi), ldda);
                 trace_gpu_end( 0, 2 );
             }
             
             indi_old = indi;
             indj_old = indj;
             pm_old   = pm;
             pn_old   = pn;
        }  // end loop for(i)

        /* Send the last block to the CPU */
        pk = min(pm,pn);
        if (1 <= n-nb){
            spanel_to_q(MagmaUpper, pk-1, a_ref(n-pk+1, n-pk+2), lda, work);
            trace_gpu_start( 0, 2, "get", "get last block" );
            magma_sgetmatrix( pk, pk,
                              da_ref(n-pk+1, n-pk+1), ldda,
                              a_ref(n-pk+1, n-pk+1),  lda );
            trace_gpu_end( 0, 2 );
            sq_to_panel(MagmaUpper, pk-1, a_ref(n-pk+1, n-pk+2), lda, work);
        }
    }// end of LOWER
    
    trace_finalize( "ssytrd_sy2sb.svg", "trace.css" );

    cudaEventDestroy(Pupdate_event);
    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[1] );
    magma_free( da );
    MAGMA_S_SET2REAL( work[0], lwkopt );
    magmablasSetKernelStream( 0 );
    
#if defined(USEMKL)
    mkl_set_num_threads(1);
#endif
#if defined(USEACML)
    omp_set_num_threads(1);
#endif
    

    return *info;
} /* ssytrd_sy2sb_ */
Example #15
0
/**
    Purpose
    -------
    ZUNGQR generates an M-by-N COMPLEX_16 matrix Q with orthonormal columns,
    which is defined as the first N columns of a product of K elementary
    reflectors of order M

        Q  =  H(1) H(2) . . . H(k)

    as returned by ZGEQRF.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix Q. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix Q. M >= N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines the
            matrix Q. N >= K >= 0.

    @param[in,out]
    A       COMPLEX_16 array A, dimension (LDDA,N).
            On entry, the i-th column must contain the vector
            which defines the elementary reflector H(i), for
            i = 1,2,...,k, as returned by ZGEQRF_GPU in the
            first k columns of its array argument A.
            On exit, the M-by-N matrix Q.

    @param[in]
    lda     INTEGER
            The first dimension of the array A. LDA >= max(1,M).

    @param[in]
    tau     COMPLEX_16 array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by ZGEQRF_GPU.

    @param[in]
    T       COMPLEX_16 array, dimension (NB, min(M,N)).
            T contains the T matrices used in blocking the elementary
            reflectors H(i), e.g., this can be the 6th argument of
            magma_zgeqrf_gpu (except stored on the CPU, not the GPU).

    @param[in]
    nb      INTEGER
            This is the block size used in ZGEQRF_GPU, and correspondingly
            the size of the T matrices, used in the factorization, and
            stored in T.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument has an illegal value

    @ingroup magma_zgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zungqr_m(
    magma_int_t m, magma_int_t n, magma_int_t k,
    magmaDoubleComplex *A, magma_int_t lda,
    magmaDoubleComplex *tau,
    magmaDoubleComplex *T, magma_int_t nb,
    magma_int_t *info)
{
#define  A(i,j)   ( A    + (i) + (j)*lda )
#define dA(d,i,j) (dA[d] + (i) + (j)*ldda)
#define dT(d,i,j) (dT[d] + (i) + (j)*nb)

    magmaDoubleComplex c_zero = MAGMA_Z_ZERO;
    magmaDoubleComplex c_one  = MAGMA_Z_ONE;

    magma_int_t m_kk, n_kk, k_kk, mi;
    magma_int_t lwork, ldwork;
    magma_int_t i, ib, ki, kk, iinfo;
    magmaDoubleComplex *work;

    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if ((n < 0) || (n > m)) {
        *info = -2;
    } else if ((k < 0) || (k > n)) {
        *info = -3;
    } else if (lda < max(1,m)) {
        *info = -5;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    if (n <= 0) {
        return *info;
    }
    
    magma_int_t di, dn;
    magma_int_t dpanel;

    magma_int_t ngpu = magma_num_gpus();
    
    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );
    
    // Allocate memory on GPUs for A and workspaces
    magma_int_t ldda    = ((m + 31) / 32) * 32;
    magma_int_t lddwork = ((n + 31) / 32) * 32;
    magma_int_t min_lblocks = (n / nb) / ngpu;  // min. blocks per gpu
    magma_int_t last_dev    = (n / nb) % ngpu;  // device with last block
    
    magma_int_t  nlocal[ MagmaMaxGPUs ] = { 0 };
    magmaDoubleComplex *dA[ MagmaMaxGPUs ] = { NULL };
    magmaDoubleComplex *dT[ MagmaMaxGPUs ] = { NULL };
    magmaDoubleComplex *dV[ MagmaMaxGPUs ] = { NULL };
    magmaDoubleComplex *dW[ MagmaMaxGPUs ] = { NULL };
    magma_queue_t stream[ MagmaMaxGPUs ] = { NULL };
    
    for( int d = 0; d < ngpu; ++d ) {
        // example with n = 75, nb = 10, ngpu = 3
        // min_lblocks = 2
        // last_dev    = 1
        // gpu 0: 2  blocks, cols:  0- 9, 30-39, 60-69
        // gpu 1: 1+ blocks, cols: 10-19, 40-49, 70-74 (partial)
        // gpu 2: 1  block,  cols: 20-29, 50-59
        magma_setdevice( d );
        nlocal[d] = min_lblocks*nb;
        if ( d < last_dev ) {
            nlocal[d] += nb;
        }
        else if ( d == last_dev ) {
            nlocal[d] += (n % nb);
        }
        
        ldwork = nlocal[d]*ldda  // dA
               + nb*m            // dT
               + nb*ldda         // dV
               + nb*lddwork;     // dW
        if ( MAGMA_SUCCESS != magma_zmalloc( &dA[d], ldwork )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            goto CLEANUP;
        }
        dT[d] = dA[d] + nlocal[d]*ldda;
        dV[d] = dT[d] + nb*m;
        dW[d] = dV[d] + nb*ldda;
        
        magma_queue_create( &stream[d] );
    }
    
    trace_init( 1, ngpu, 1, stream );
    
    // first kk columns are handled by blocked method.
    // ki is start of 2nd-to-last block
    if ((nb > 1) && (nb < k)) {
        ki = (k - nb - 1) / nb * nb;
        kk = min(k, ki + nb);
    } else {
        ki = 0;
        kk = 0;
    }

    // Allocate CPU work space
    // n*nb for zungqr workspace
    lwork = n * nb;
    magma_zmalloc_cpu( &work, lwork );
    if (work == NULL) {
        *info = MAGMA_ERR_HOST_ALLOC;
        goto CLEANUP;
    }

    // Use unblocked code for the last or only block.
    if (kk < n) {
        trace_cpu_start( 0, "ungqr", "ungqr last block" );
        m_kk = m - kk;
        n_kk = n - kk;
        k_kk = k - kk;
        dpanel =  (kk / nb) % ngpu;
        di     = ((kk / nb) / ngpu) * nb;
        magma_setdevice( dpanel );
        
        lapackf77_zungqr( &m_kk, &n_kk, &k_kk,
                          A(kk, kk), &lda,
                          &tau[kk], work, &lwork, &iinfo );

        magma_zsetmatrix( m_kk, n_kk,
                          A(kk, kk),  lda,
                          dA(dpanel, kk, di), ldda );
        
        // Set A(1:kk,kk+1:n) to zero.
        magmablas_zlaset( MagmaFull, kk, n - kk, c_zero, c_zero, dA(dpanel, 0, di), ldda );
        trace_cpu_end( 0 );
    }

    if (kk > 0) {
        // Use blocked code
        // send T to all GPUs
        for( int d = 0; d < ngpu; ++d ) {
            magma_setdevice( d );
            trace_gpu_start( d, 0, "set", "set T" );
            magma_zsetmatrix_async( nb, min(m,n), T, nb, dT[d], nb, stream[d] );
            trace_gpu_end( d, 0 );
        }
        
        // stream: set Aii (V) --> laset --> laset --> larfb --> [next]
        // CPU has no computation
        for( i = ki; i >= 0; i -= nb ) {
            ib = min(nb, k - i);
            mi = m - i;
            dpanel =  (i / nb) % ngpu;
            di     = ((i / nb) / ngpu) * nb;

            // Send current panel to the GPUs
            lapackf77_zlaset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda );
            for( int d = 0; d < ngpu; ++d ) {
                magma_setdevice( d );
                trace_gpu_start( d, 0, "set", "set V" );
                magma_zsetmatrix_async( mi, ib,
                                        A(i, i), lda,
                                        dV[d],   ldda, stream[d] );
                trace_gpu_end( d, 0 );
            }
            
            // set panel to identity
            magma_setdevice( dpanel );
            magmablasSetKernelStream( stream[dpanel] );
            trace_gpu_start( dpanel, 0, "laset", "laset" );
            magmablas_zlaset( MagmaFull, i,  ib, c_zero, c_zero, dA(dpanel, 0, di), ldda );
            magmablas_zlaset( MagmaFull, mi, ib, c_zero, c_one,  dA(dpanel, i, di), ldda );
            trace_gpu_end( dpanel, 0 );
            
            if (i < n) {
                // Apply H to A(i:m,i:n) from the left
                for( int d = 0; d < ngpu; ++d ) {
                    magma_setdevice( d );
                    magmablasSetKernelStream( stream[d] );
                    magma_indices_1D_bcyclic( nb, ngpu, d, i, n, &di, &dn );
                    trace_gpu_start( d, 0, "larfb", "larfb" );
                    magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise,
                                      mi, dn-di, ib,
                                      dV[d],        ldda, dT(d,0,i), nb,
                                      dA(d, i, di), ldda, dW[d], lddwork );
                    trace_gpu_end( d, 0 );
                }
            }
        }
    }
    
    // copy result back to CPU
    trace_cpu_start( 0, "get", "get A" );
    magma_zgetmatrix_1D_col_bcyclic( m, n, dA, ldda, A, lda, ngpu, nb );
    trace_cpu_end( 0 );
    
    #ifdef TRACING
    char name[80];
    snprintf( name, sizeof(name), "zungqr-n%d-ngpu%d.svg", m, ngpu );
    trace_finalize( name, "trace.css" );
    #endif
    
CLEANUP:
    for( int d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magma_free( dA[d] );
        magma_queue_destroy( stream[d] );
    }
    magma_free_cpu( work );
    magma_setdevice( orig_dev );
    magmablasSetKernelStream( orig_stream );
    
    return *info;
} /* magma_zungqr */
Example #16
0
/**
    Purpose
    -------
    SGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
       A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.
    Use two buffer to send panels.

    Arguments
    ---------
    @param[in]
    num_gpus INTEGER
            The number of GPUs to be used for the factorization.

    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    A       REAL array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array A.  LDDA >= max(1,M).

    @param[out]
    ipiv    INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
      -     > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    @ingroup magma_sgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_sgetrf2_mgpu(magma_int_t num_gpus,
         magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t offset,
         float *d_lAT[], magma_int_t lddat, magma_int_t *ipiv,
         float *d_lAP[], float *w, magma_int_t ldw,
         magma_queue_t streaml[][2], magma_int_t *info)
{
#define dAT(id,i,j)  (d_lAT[(id)] + ((offset)+(i)*nb)*lddat + (j)*nb)
#define W(j) (w+((j)%num_gpus)*nb*ldw)

    float c_one     = MAGMA_S_ONE;
    float c_neg_one = MAGMA_S_NEG_ONE;

    magma_int_t block_size = 32;
    magma_int_t iinfo, n_local[MagmaMaxGPUs];
    magma_int_t maxm, mindim;
    magma_int_t i, d, dd, rows, cols, s, ldpan[MagmaMaxGPUs];
    magma_int_t id, i_local, i_local2, nb0, nb1, h = 2+num_gpus;
    float *d_panel[MagmaMaxGPUs], *panel_local[MagmaMaxGPUs];

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -2;
    else if (n < 0)
        *info = -3;
    else if (num_gpus*lddat < max(1,n))
        *info = -5;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    if ( num_gpus > ceil((float)n/nb) ) {
        *info = -1;
        return *info;
    }

    /* Use hybrid blocked code. */
    maxm  = ((m + block_size-1)/block_size)*block_size;

    /* some initializations */
    for (i=0; i < num_gpus; i++) {
        magma_setdevice(i);
        
        n_local[i] = ((n/nb)/num_gpus)*nb;
        if (i < (n/nb)%num_gpus)
            n_local[i] += nb;
        else if (i == (n/nb)%num_gpus)
            n_local[i] += n%nb;
        
        /* workspaces */
        d_panel[i] = &(d_lAP[i][h*nb*maxm]);   /* temporary panel storage */
    }
    trace_init( 1, num_gpus, 2, (CUstream_st**)streaml );

    /* start sending the panel to cpu */
    nb0 = min(mindim, nb);
    magma_setdevice(0);
    magmablasSetKernelStream(streaml[0][1]);
    trace_gpu_start( 0, 1, "comm", "get" );
    magmablas_stranspose( nb0, m, dAT(0,0,0), lddat, d_lAP[0], maxm );
    magma_sgetmatrix_async( m, nb0,
                            d_lAP[0], maxm,
                            W(0),     ldw, streaml[0][1] );
    trace_gpu_end( 0, 1 );

    /* ------------------------------------------------------------------------------------- */
    magma_timer_t time=0;
    timer_start( time );

    s = mindim / nb;
    for( i=0; i < s; i++ ) {
        /* Set the GPU number that holds the current panel */
        id = i%num_gpus;
        magma_setdevice(id);
        
        /* Set the local index where the current panel is */
        i_local = i/num_gpus;
        cols  = maxm - i*nb;
        rows  = m - i*nb;
        
        /* synchrnoize i-th panel from id-th gpu into work */
        magma_queue_sync( streaml[id][1] );
        
        /* i-th panel factorization */
        trace_cpu_start( 0, "getrf", "getrf" );
        lapackf77_sgetrf( &rows, &nb, W(i), &ldw, ipiv+i*nb, &iinfo);
        if ( (*info == 0) && (iinfo > 0) ) {
            *info = iinfo + i*nb;
        }
        trace_cpu_end( 0 );
        
        /* start sending the panel to all the gpus */
        d = (i+1)%num_gpus;
        for( dd=0; dd < num_gpus; dd++ ) {
            magma_setdevice(d);
            trace_gpu_start( 0, 1, "comm", "set" );
            magma_ssetmatrix_async( rows, nb,
                                    W(i),     ldw,
                                    &d_lAP[d][(i%h)*nb*maxm], cols,
                                    streaml[d][1] );
            trace_gpu_end( 0, 1 );
            d = (d+1)%num_gpus;
        }
        
        /* apply the pivoting */
        d = (i+1)%num_gpus;
        for( dd=0; dd < num_gpus; dd++ ) {
            magma_setdevice(d);
            magmablasSetKernelStream(streaml[d][0]);
            
            trace_gpu_start( d, 1, "pivot", "pivot" );
            if ( dd == 0 )
                magmablas_spermute_long2( lddat, dAT(d,0,0), lddat, ipiv, nb, i*nb );
            else
                magmablas_spermute_long3(        dAT(d,0,0), lddat, ipiv, nb, i*nb );
            trace_gpu_end( d, 1 );
            d = (d+1)%num_gpus;
        }
    
    
        /* update the trailing-matrix/look-ahead */
        d = (i+1)%num_gpus;
        for( dd=0; dd < num_gpus; dd++ ) {
            magma_setdevice(d);
            
            /* storage for panel */
            if ( d == id ) {
                /* the panel belond to this gpu */
                panel_local[d] = dAT(d,i,i_local);
                ldpan[d] = lddat;
                /* next column */
                i_local2 = i_local+1;
            } else {
                /* the panel belong to another gpu */
                panel_local[d] = d_panel[d];
                ldpan[d] = nb;
                /* next column */
                i_local2 = i_local;
                if ( d < id ) i_local2 ++;
            }
            /* the size of the next column */
            if ( s > (i+1) ) {
                nb0 = nb;
            } else {
                nb0 = n_local[d]-nb*(s/num_gpus);
                if ( d < s%num_gpus ) nb0 -= nb;
            }
            if ( d == (i+1)%num_gpus) {
                /* owns the next column, look-ahead the column */
                nb1 = nb0;
                magmablasSetKernelStream(streaml[d][1]);
                
                /* make sure all the pivoting has been applied */
                magma_queue_sync(streaml[d][0]);
                trace_gpu_start( d, 1, "gemm", "gemm" );
                /* transpose panel on GPU */
                magmablas_stranspose( rows, nb, &d_lAP[d][(i%h)*nb*maxm], cols, panel_local[d], ldpan[d] );
                /* synch for remaining update */
                magma_queue_sync(streaml[d][1]);
            } else {
                /* update the entire trailing matrix */
                nb1 = n_local[d] - i_local2*nb;
                magmablasSetKernelStream(streaml[d][0]);
                
                /* synchronization to make sure panel arrived on gpu */
                magma_queue_sync(streaml[d][1]);
                trace_gpu_start( d, 0, "gemm", "gemm" );
                /* transpose panel on GPU */
                magmablas_stranspose( rows, nb, &d_lAP[d][(i%h)*nb*maxm], cols, panel_local[d], ldpan[d] );
            }
            
            /* gpu updating the trailing matrix */
            magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                         nb1, nb, c_one,
                         panel_local[d],       ldpan[d],
                         dAT(d, i, i_local2), lddat);
            magma_sgemm( MagmaNoTrans, MagmaNoTrans,
                         nb1, m-(i+1)*nb, nb,
                         c_neg_one, dAT(d, i,   i_local2),         lddat,
                                    &(panel_local[d][nb*ldpan[d]]), ldpan[d],
                         c_one,     dAT(d, i+1, i_local2),         lddat );
        
            if ( d == (i+1)%num_gpus ) {
                /* Set the local index where the current panel is */
                int loff    = i+1;
                int i_local = (i+1)/num_gpus;
                int ldda    = maxm - (i+1)*nb;
                int cols    = m - (i+1)*nb;
                nb0 = min(nb, mindim - (i+1)*nb); /* size of the diagonal block */
                trace_gpu_end( d, 1 );
                
                if ( nb0 > 0 ) {
                    /* transpose the panel for sending it to cpu */
                    trace_gpu_start( d, 1, "comm", "get" );
                    magmablas_stranspose( nb0, m-(i+1)*nb, dAT(d,loff,i_local), lddat, &d_lAP[d][((i+1)%h)*nb*maxm], ldda );
             
                    /* send the panel to cpu */
                    magma_sgetmatrix_async( cols, nb0,
                                            &d_lAP[d][((i+1)%h)*nb*maxm], ldda,
                                            W(i+1), ldw, streaml[d][1] );

                    trace_gpu_end( d, 1 );
                }
            } else {
                trace_gpu_end( d, 0 );
            }
            
            d = (d+1)%num_gpus;
        }
    
        /* update the remaining matrix by gpu owning the next panel */
        if ( (i+1) < s ) {
            int i_local = (i+1)/num_gpus;
            int rows  = m - (i+1)*nb;
            
            d = (i+1)%num_gpus;
            magma_setdevice(d);
            magmablasSetKernelStream(streaml[d][0]);
            trace_gpu_start( d, 0, "gemm", "gemm" );

            magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                         n_local[d] - (i_local+1)*nb, nb,
                         c_one, panel_local[d],       ldpan[d],
                                dAT(d,i,i_local+1),  lddat );
            magma_sgemm( MagmaNoTrans, MagmaNoTrans,
                         n_local[d]-(i_local+1)*nb, rows, nb,
                         c_neg_one, dAT(d,i,i_local+1),            lddat,
                                    &(panel_local[d][nb*ldpan[d]]), ldpan[d],
                         c_one,     dAT(d,i+1,  i_local+1),        lddat );
            trace_gpu_end( d, 0 );
        }
    } /* end of for i=1..s */
    /* ------------------------------------------------------------------------------ */
    
    /* Set the GPU number that holds the last panel */
    id = s%num_gpus;
    
    /* Set the local index where the last panel is */
    i_local = s/num_gpus;
    
    /* size of the last diagonal-block */
    nb0 = min(m - s*nb, n - s*nb);
    rows = m    - s*nb;
    cols = maxm - s*nb;
    
    if ( nb0 > 0 ) {
        magma_setdevice(id);
        
        /* wait for the last panel on cpu */
        magma_queue_sync( streaml[id][1] );
    
        /* factor on cpu */
        lapackf77_sgetrf( &rows, &nb0, W(s), &ldw, ipiv+s*nb, &iinfo);
        if ( (*info == 0) && (iinfo > 0) )
            *info = iinfo + s*nb;
        
        /* send the factor to gpus */
        for( d=0; d < num_gpus; d++ ) {
            magma_setdevice(d);
            i_local2 = i_local;
            if ( d < id ) i_local2 ++;
            
            if ( d == id || n_local[d] > i_local2*nb ) {
                magma_ssetmatrix_async( rows, nb0,
                                        W(s),     ldw,
                                        &d_lAP[d][(s%h)*nb*maxm],
                                        cols, streaml[d][1] );
            }
        }
        
        for( d=0; d < num_gpus; d++ ) {
            magma_setdevice(d);
            magmablasSetKernelStream(streaml[d][0]);
            if ( d == 0 )
                magmablas_spermute_long2( lddat, dAT(d,0,0), lddat, ipiv, nb0, s*nb );
            else
                magmablas_spermute_long3(        dAT(d,0,0), lddat, ipiv, nb0, s*nb );
        }
        
        for( d=0; d < num_gpus; d++ ) {
            magma_setdevice(d);
            magmablasSetKernelStream(streaml[d][1]);
            
            /* wait for the pivoting to be done */
            magma_queue_sync( streaml[d][0] );
            
            i_local2 = i_local;
            if ( d < id ) i_local2++;
            if ( d == id ) {
                /* the panel belond to this gpu */
                panel_local[d] = dAT(d,s,i_local);
                
                /* next column */
                nb1 = n_local[d] - i_local*nb-nb0;
                
                magmablas_stranspose( rows, nb0, &d_lAP[d][(s%h)*nb*maxm], cols, panel_local[d], lddat );
                
                if ( nb1 > 0 ) {
                    magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                                 nb1, nb0, c_one,
                                 panel_local[d],        lddat,
                                 dAT(d,s,i_local)+nb0, lddat);
                }
            } else if ( n_local[d] > i_local2*nb ) {
                /* the panel belong to another gpu */
                panel_local[d] = d_panel[d];
                
                /* next column */
                nb1 = n_local[d] - i_local2*nb;
                
                magmablas_stranspose( rows, nb0, &d_lAP[d][(s%h)*nb*maxm], cols, panel_local[d], nb );
                magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             nb1, nb0, c_one,
                             panel_local[d],     nb,
                             dAT(d,s,i_local2), lddat);
            }
        }
    } /* if ( nb0 > 0 ) */
    
    /* clean up */
    trace_finalize( "sgetrf_mgpu.svg","trace.css" );
    for( d=0; d < num_gpus; d++ ) {
        magma_setdevice(d);
        magma_queue_sync( streaml[d][0] );
        magma_queue_sync( streaml[d][1] );
        magmablasSetKernelStream(NULL);
    }
    magma_setdevice(0);
    
    timer_start( time );
    timer_printf("\n Performance %f GFlop/s\n", FLOPS_SGETRF(m,n) / 1e9 / time );

    return *info;
} /* magma_sgetrf2_mgpu */
Example #17
0
/**
    Purpose
    =======

    SSYTRF_nopiv computes the LDLt factorization of a real symmetric
    matrix A. This version does not require work space on the GPU passed
    as input. GPU memory is allocated in the routine.

    The factorization has the form
       A = U^H * D * U,   if UPLO = MagmaUpper, or
       A = L   * D * L^H, if UPLO = MagmaLower,
    where U is an upper triangular matrix, L is lower triangular, and
    D is a diagonal matrix.

    This is the block version of the algorithm, calling Level 3 BLAS.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of A is stored;
      -     = MagmaLower:  Lower triangle of A is stored.

    @param[in]
    n       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in,out]
    A       REAL array, dimension (LDA,N)
            On entry, the symmetric matrix A.  If UPLO = MagmaUpper, the leading
            N-by-N upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = MagmaLower, the
            leading N-by-N lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
    \n
            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization A = U^H D U or A = L D L^H.
    \n
            Higher performance is achieved if A is in pinned memory.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  if INFO = -6, the GPU memory allocation failed
      -     > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.

    @ingroup magma_ssysv_comp
    ******************************************************************* */
extern "C" magma_int_t
magma_ssytrf_nopiv(
    magma_uplo_t uplo, magma_int_t n,
    float *A, magma_int_t lda,
    magma_int_t *info)
{
    #define  A(i, j)  ( A +(j)*lda  + (i))
    #define dA(i, j)  (dA +(j)*ldda + (i))
    #define dW(i, j)  (dW +(j)*ldda + (i))
    #define dWt(i, j) (dW +(j)*nb   + (i))

    /* Constants */
    const float c_one     = MAGMA_S_ONE;
    const float c_neg_one = MAGMA_S_NEG_ONE;
    
    /* Local variables */
    bool upper = (uplo == MagmaUpper);
    magma_int_t j, k, jb, ldda, nb, ib, iinfo;
    magmaFloat_ptr dA;
    magmaFloat_ptr dW;

    *info = 0;
    if (! upper && uplo != MagmaLower) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,n)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return MAGMA_ERR_ILLEGAL_VALUE;
    }

    /* Quick return */
    if ( n == 0 )
      return MAGMA_SUCCESS;

    ldda = magma_roundup( n, 32 );
    nb = magma_get_ssytrf_nopiv_nb(n);
    ib = min(32, nb); // inner-block for diagonal factorization

    if ((MAGMA_SUCCESS != magma_smalloc(&dA, n *ldda)) ||
        (MAGMA_SUCCESS != magma_smalloc(&dW, nb*ldda))) {
        /* alloc failed so call the non-GPU-resident version */
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    magma_device_t cdev;
    magma_queue_t queues[2];
    magma_event_t event;

    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queues[0] );
    magma_queue_create( cdev, &queues[1] );
    magma_event_create( &event );
    trace_init( 1, 1, 2, queues );

    /* Use hybrid blocked code. */
    if (upper) {
        //=========================================================
        // Compute the LDLt factorization A = U'*D*U without pivoting.
        // copy matrix to GPU
        for (j=0; j < n; j += nb) {
            jb = min(nb, (n-j));
            trace_gpu_start( 0, 0, "set", "set" );
            magma_ssetmatrix_async(j+jb, jb, A(0, j), lda, dA(0, j), ldda, queues[0]);
            trace_gpu_end( 0, 0 );
        }
        
        // main loop
        for (j=0; j < n; j += nb) {
            jb = min(nb, (n-j));
            
            // copy A(j,j) back to CPU
            trace_gpu_start( 0, 0, "get", "get" );
            if ( j != 0) {
                //magma_event_sync(event);
                magma_sgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, queues[1]);
            }
            trace_gpu_end( 0, 0 );
            
            // factorize the diagonal block
            magma_queue_sync(queues[1]);
            trace_cpu_start( 0, "potrf", "potrf" );
            magma_ssytrf_nopiv_cpu( MagmaUpper, jb, ib, A(j, j), lda, info );
            trace_cpu_end( 0 );
            if (*info != 0) {
                *info = *info + j;
                break;
            }
            
            // copy A(j,j) back to GPU
            trace_gpu_start( 0, 0, "set", "set" );
            magma_ssetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, queues[0]);
            trace_gpu_end( 0, 0 );
            
            // copy j-th column of U back to CPU
            trace_gpu_start( 0, 1, "get", "get" );
            magma_sgetmatrix_async(j, jb, dA(0, j), ldda, A(0, j), lda, queues[1]);
            trace_gpu_end( 0, 1 );

            if ( (j+jb) < n) {
                // compute the off-diagonal blocks of current block column
                trace_gpu_start( 0, 0, "trsm", "trsm" );
                magma_strsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaUnit,
                             jb, (n-j-jb),
                             c_one, dA(j, j), ldda,
                             dA(j, j+jb), ldda,
                             queues[0] );
                magma_scopymatrix( jb, n-j-jb, dA( j, j+jb ), ldda, dWt( 0, j+jb ), nb, queues[0] );
                
                // update the trailing submatrix with D
                magmablas_slascl_diag( MagmaUpper, jb, n-j-jb,
                                       dA(j,    j), ldda,
                                       dA(j, j+jb), ldda,
                                       queues[0], &iinfo);
                trace_gpu_end( 0, 0 );
                
                // update the trailing submatrix with U and W
                trace_gpu_start( 0, 0, "gemm", "gemm" );
                for (k=j+jb; k < n; k += nb) {
                    magma_int_t kb = min(nb,n-k);
                    magma_sgemm( MagmaConjTrans, MagmaNoTrans, kb, n-k, jb,
                                 c_neg_one, dWt(0, k), nb,
                                            dA(j, k),  ldda,
                                 c_one,     dA(k, k),  ldda,
                                 queues[0]);
                    if (k == j+jb) {
                        // magma_event_record( event, queues[0] );
                        magma_queue_sync( queues[0] );
                    }
                }
                trace_gpu_end( 0, 0 );
            }
        }
    } else {
        //=========================================================
        // Compute the LDLt factorization A = L*D*L' without pivoting.
        // copy the matrix to GPU
        for (j=0; j < n; j += nb) {
            jb = min(nb, (n-j));
            trace_gpu_start( 0, 0, "set", "set" );
            magma_ssetmatrix_async((n-j), jb, A(j, j), lda, dA(j, j), ldda, queues[0]);
            trace_gpu_end( 0, 0 );
        }
        
        // main loop
        for (j=0; j < n; j += nb) {
            jb = min(nb, (n-j));
            
            // copy A(j,j) back to CPU
            trace_gpu_start( 0, 0, "get", "get" );
            if (j != 0) {
                //magma_event_sync(event);
                magma_sgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, queues[1]);
            }
            trace_gpu_end( 0, 0 );
            
            // factorize the diagonal block
            magma_queue_sync(queues[1]);
            trace_cpu_start( 0, "potrf", "potrf" );
            magma_ssytrf_nopiv_cpu( MagmaLower, jb, ib, A(j, j), lda, info );
            trace_cpu_end( 0 );
            if (*info != 0) {
                *info = *info + j;
                break;
            }

            // copy A(j,j) back to GPU
            trace_gpu_start( 0, 0, "set", "set" );
            magma_ssetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, queues[0]);
            trace_gpu_end( 0, 0 );
            
            // copy j-th row of L back to CPU
            trace_gpu_start( 0, 1, "get", "get" );
            magma_sgetmatrix_async(jb, j, dA(j, 0), ldda, A(j, 0), lda, queues[1]);
            trace_gpu_end( 0, 1 );
            
            if ( (j+jb) < n) {
                // compute the off-diagonal blocks of current block column
                trace_gpu_start( 0, 0, "trsm", "trsm" );
                magma_strsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaUnit,
                             (n-j-jb), jb,
                             c_one, dA(j, j), ldda,
                             dA(j+jb, j), ldda,
                             queues[0] );
                magma_scopymatrix( n-j-jb,jb, dA( j+jb, j ), ldda, dW( j+jb, 0 ), ldda, queues[0] );
                
                // update the trailing submatrix with D
                magmablas_slascl_diag( MagmaLower, n-j-jb, jb,
                                       dA(j,    j), ldda,
                                       dA(j+jb, j), ldda,
                                       queues[0], &iinfo );
                trace_gpu_end( 0, 0 );
                
                // update the trailing submatrix with L and W
                trace_gpu_start( 0, 0, "gemm", "gemm" );
                for (k=j+jb; k < n; k += nb) {
                    magma_int_t kb = min(nb,n-k);
                    magma_sgemm( MagmaNoTrans, MagmaConjTrans, n-k, kb, jb,
                                 c_neg_one, dA(k, j), ldda,
                                            dW(k, 0), ldda,
                                 c_one,     dA(k, k), ldda,
                                 queues[0] );
                    if (k == j+jb) {
                        //magma_event_record( event, queues[0] );
                        magma_queue_sync(queues[0]);
                    }
                }
                trace_gpu_end( 0, 0 );
            }
        }
    }
    
    trace_finalize( "ssytrf.svg","trace.css" );
    magma_queue_destroy(queues[0]);
    magma_queue_destroy(queues[1]);
    magma_event_destroy( event );
    magma_free(dW);
    magma_free(dA);
    
    return MAGMA_SUCCESS;
} /* magma_ssytrf_nopiv */
Example #18
0
/**
    Purpose
    -------
    ZHETRD reduces a complex Hermitian matrix A to real symmetric
    tridiagonal form T by an orthogonal similarity transformation:
    Q**H * A * Q = T.

    Arguments
    ---------
    @param[in]
    ngpu    INTEGER
            Number of GPUs to use. ngpu > 0.

    @param[in]
    nqueue  INTEGER
            The number of GPU streams used for update.  10 >= nqueue > 0.

    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of A is stored;
      -     = MagmaLower:  Lower triangle of A is stored.

    @param[in]
    n       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in,out]
    A       COMPLEX_16 array, dimension (LDA,N)
            On entry, the Hermitian matrix A.  If UPLO = MagmaUpper, the leading
            N-by-N upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = MagmaLower, the
            leading N-by-N lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
            On exit, if UPLO = MagmaUpper, the diagonal and first superdiagonal
            of A are overwritten by the corresponding elements of the
            tridiagonal matrix T, and the elements above the first
            superdiagonal, with the array TAU, represent the orthogonal
            matrix Q as a product of elementary reflectors; if UPLO
            = MagmaLower, the diagonal and first subdiagonal of A are over-
            written by the corresponding elements of the tridiagonal
            matrix T, and the elements below the first subdiagonal, with
            the array TAU, represent the orthogonal matrix Q as a product
            of elementary reflectors. See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    d       COMPLEX_16 array, dimension (N)
            The diagonal elements of the tridiagonal matrix T:
            D(i) = A(i,i).
 
    @param[out]
    e       COMPLEX_16 array, dimension (N-1)
            The off-diagonal elements of the tridiagonal matrix T:
            E(i) = A(i,i+1) if UPLO = MagmaUpper, E(i) = A(i+1,i) if UPLO = MagmaLower.

    @param[out]
    tau     COMPLEX_16 array, dimension (N-1)
            The scalar factors of the elementary reflectors (see Further
            Details).

    @param[out]
    work    (workspace) COMPLEX_16 array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.  LWORK >= N*NB, where NB is the
            optimal blocksize given by magma_get_zhetrd_nb().
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    Further Details
    ---------------
    If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary
    reflectors

        Q = H(n-1) . . . H(2) H(1).

    Each H(i) has the form

        H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in
    A(1:i-1,i+1), and tau in TAU(i).

    If UPLO = MagmaLower, the matrix Q is represented as a product of elementary
    reflectors

        Q = H(1) H(2) . . . H(n-1).

    Each H(i) has the form

        H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i),
    and tau in TAU(i).

    The contents of A on exit are illustrated by the following examples
    with n = 5:

    if UPLO = MagmaUpper:                if UPLO = MagmaLower:

        (  d   e   v2  v3  v4 )              (  d                  )
        (      d   e   v3  v4 )              (  e   d              )
        (          d   e   v4 )              (  v1  e   d          )
        (              d   e  )              (  v1  v2  e   d      )
        (                  d  )              (  v1  v2  v3  e   d  )

    where d and e denote diagonal and off-diagonal elements of T, and vi
    denotes an element of the vector defining H(i).

    @ingroup magma_zheev_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zhetrd_mgpu(
    magma_int_t ngpu,
    magma_int_t nqueue, magma_uplo_t uplo, magma_int_t n,
    magmaDoubleComplex *A, magma_int_t lda,
    double *d, double *e, magmaDoubleComplex *tau,
    magmaDoubleComplex *work, magma_int_t lwork,
    magma_int_t *info)
{
#define  A(i, j)     (A           + (j)*lda  + (i))
#define dA(id, i, j) (dA[(id)]    + (j)*ldda + (i))
#define dW(id, i, j) (dW[(id)] + (j)*ldda + (i))

    const char* uplo_ = lapack_uplo_const( uplo );
    
    magma_int_t nlocal, ldda;
    magma_int_t nb = magma_get_zhetrd_nb(n), ib, ib2;

    const magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    const magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    const double             d_one     = MAGMA_D_ONE;
    
    #ifdef PROFILE_SY2RK
    double mv_time = 0.0;
    double up_time = 0.0;
    #endif

    magma_int_t kk, nx;
    magma_int_t i, ii, iii, j, dev, i_n;
    magma_int_t iinfo;
    magma_int_t ldwork, lddw, lwkopt, ldwork2, lhwork;
    magma_int_t lquery;
    
    // set pointers to NULL so it is safe to goto CLEANUP if any malloc fails.
    magma_queue_t queues[MagmaMaxGPUs][10] = { { NULL, NULL } };
    magma_queue_t queues0[MagmaMaxGPUs]    = { NULL };
    magmaDoubleComplex *hwork = NULL;
    magmaDoubleComplex_ptr dwork2[MagmaMaxGPUs] = { NULL };
    magmaDoubleComplex_ptr dA[MagmaMaxGPUs]     = { NULL };
    magmaDoubleComplex_ptr dW[MagmaMaxGPUs]     = { NULL };

    *info = 0;
    int upper = (uplo == MagmaUpper);
    lquery = (lwork == -1);
    if (! upper && uplo != MagmaLower) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,n)) {
        *info = -4;
    } else if (lwork < nb*n && ! lquery) {
        *info = -9;
    } else if ( nqueue > 2 ) {
        *info = 2;  // TODO fix
    }

    /* Determine the block size. */
    ldwork = n;
    lwkopt = n * nb;
    if (*info == 0) {
        work[0] = MAGMA_Z_MAKE( lwkopt, 0 );
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (n == 0) {
        work[0] = c_one;
        return *info;
    }

    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );

    //#define PROFILE_SY2RK
    #ifdef PROFILE_SY2RK
    double times[11] = { 0 };
    magma_event_t start, stop;
    float etime;
    magma_setdevice( 0 );
    magma_event_create( &start );
    magma_event_create( &stop  );
    #endif

    ldda = roundup( lda, 32 );
    lddw = ldda;
    nlocal = nb*(1 + n/(nb*ngpu));
    ldwork2 = ldda*( ((n - 1)/nb + 1) + 1);  // i.e., ldda*(blocks + 1)
    for( dev=0; dev < ngpu; dev++ ) {
        magma_setdevice( dev );
        // TODO fix memory leak
        if ( MAGMA_SUCCESS != magma_zmalloc( &dA[dev],     nlocal*ldda + 3*lddw*nb ) ||
             MAGMA_SUCCESS != magma_zmalloc( &dwork2[dev], ldwork2 ) ) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            goto CLEANUP;
        }
        dW[dev] = dA[dev] + nlocal*ldda;
        
        for( kk=0; kk < nqueue; kk++ ) {
            magma_queue_create( &queues[dev][kk] );
        }
        queues0[dev] = queues[dev][0];
    }
    
    lhwork = nqueue*ngpu*n;
    if ( MAGMA_SUCCESS != magma_zmalloc_pinned( &hwork, lhwork ) ) {
        *info = MAGMA_ERR_HOST_ALLOC;
        goto CLEANUP;
    }

    // crossover point: use CPU code for last nx columns
    //if (n < 2048)
    //    nx = n;
    //else
    //    nx = 512;
    nx = min( 128, n );  // nx <= n is required

    if (upper) {
        /* Copy the matrix to the GPU */
        if (1 <= n-nx) {
            magma_zhtodhe( ngpu, uplo, n, nb, A, lda, dA, ldda, queues, &iinfo );
        }

        /*  Reduce the upper triangle of A.
            Columns 1:kk are handled by the unblocked method. */
        for (i = nb*((n-1)/nb); i >= nx; i -= nb) {
            ib = min(nb, n-i);

            ii  = nb*(i/(nb*ngpu));
            dev = (i/nb)%ngpu;

            /* wait for the next panel */
            if (i != nb*((n-1)/nb)) {
                magma_setdevice( dev );
                magma_queue_sync( queues[dev][0] );
            }

            magma_zlatrd_mgpu( ngpu, uplo, i+ib, ib, nb,
                               A(0, 0), lda, e, tau,
                               work, ldwork,
                               dA, ldda, 0,
                               dW, i+ib,
                               hwork,  lhwork,
                               dwork2, ldwork2,
                               queues0 );

            magma_zher2k_mgpu( ngpu, MagmaUpper, MagmaNoTrans, nb, i, ib,
                               c_neg_one, dW, i+ib, 0,
                               d_one,     dA, ldda, 0,
                               nqueue, queues);

            /* get the next panel */
            if (i-nb >= nx ) {
                ib2 = min(nb, n-(i-nb));
                
                ii  = nb*((i-nb)/(nb*ngpu));
                dev = ((i-nb)/nb)%ngpu;
                magma_setdevice( dev );
                
                magma_zgetmatrix_async( (i-nb)+ib2, ib2,
                                        dA(dev, 0, ii), ldda,
                                        A(0, i-nb),     lda,
                                        queues[dev][0] );
            }

            /* Copy superdiagonal elements back into A, and diagonal
               elements into D */
            for (j = i; j < i+ib; ++j) {
                if ( j > 0 ) {
                    *A(j-1,j) = MAGMA_Z_MAKE( e[j - 1], 0 );
                }
                d[j] = MAGMA_Z_REAL( *A(j, j) );
            }
        } /* end of for i=... */
      
        if ( nx > 0 ) {
            if (1 <= n-nx) { /* else A is already on CPU */
                for (i=0; i < nx; i += nb) {
                    ib = min(nb, n-i);
                    ii  = nb*(i/(nb*ngpu));
                    dev = (i/nb)%ngpu;
                
                    magma_setdevice( dev );
                    magma_zgetmatrix_async( nx, ib,
                                            dA(dev, 0, ii), ldda,
                                            A(0, i),        lda,
                                            queues[dev][0] );
                }
            }
            
            for( dev=0; dev < ngpu; dev++ ) {
                magma_setdevice( dev );
                magma_queue_sync( queues[dev][0] );
            }
            /* Use CPU code to reduce the last or only block */
            lapackf77_zhetrd( uplo_, &nx, A(0, 0), &lda, d, e, tau,
                              work, &lwork, &iinfo );
        }
    }
    else {
        trace_init( 1, ngpu, nqueue, (CUstream_st**)queues );
        /* Copy the matrix to the GPU */
        if (1 <= n-nx) {
            magma_zhtodhe( ngpu, uplo, n, nb, A, lda, dA, ldda, queues, &iinfo );
        }

        /* Reduce the lower triangle of A */
        for (i = 0; i < n-nx; i += nb) {
            ib = min(nb, n-i);

            ii  = nb*(i/(nb*ngpu));
            dev = (i/nb)%ngpu;
            /* Reduce columns i:i+ib-1 to tridiagonal form and form the
               matrix W which is needed to update the unreduced part of
               the matrix */

            /*   Get the current panel (no need for the 1st iteration) */
            if (i != 0) {
                magma_setdevice( dev );
                trace_gpu_start( dev, 0, "comm", "get" );
                magma_zgetmatrix_async( n-i, ib,
                                        dA(dev, i, ii), ldda,
                                        A(i,i),         lda,
                                        queues[dev][0] );
                trace_gpu_end( dev, 0 );
                magma_queue_sync( queues[dev][0] );
                magma_setdevice( 0 );
            }
            
            magma_zlatrd_mgpu( ngpu, uplo, n-i, ib, nb,
                               A(i, i), lda, &e[i], &tau[i],
                               work, ldwork,
                               dA, ldda, i,
                               dW, n-i,
                               hwork,  lhwork,
                               dwork2, ldwork2,
                               queues0 );

            #ifdef PROFILE_SY2RK
            magma_setdevice( 0 );
            if ( i > 0 ) {
                cudaEventElapsedTime( &etime, start, stop );
                up_time += (etime/1000.0);
            }
            magma_event_record( start, 0 );
            #endif
            
            magma_zher2k_mgpu( ngpu, MagmaLower, MagmaNoTrans, nb, n-i-ib, ib,
                               c_neg_one, dW, n-i, ib,
                               d_one, dA, ldda, i+ib, nqueue, queues);
            
            #ifdef PROFILE_SY2RK
            magma_setdevice( 0 );
            magma_event_record( stop, 0 );
            #endif

            /* Copy subdiagonal elements back into A, and diagonal
               elements into D */
            for (j = i; j < i+ib; ++j) {
                if ( j+1 < n ) {
                    *A(j+1,j) = MAGMA_Z_MAKE( e[j], 0 );
                }
                d[j] = MAGMA_Z_REAL( *A(j, j) );
            }
        } /* for i=... */

        /* Use CPU code to reduce the last or only block */
        if ( i < n ) {
            iii = i;
            i_n = n-i;
            if ( i > 0 ) {
                for (; i < n; i += nb) {
                    ib = min(nb, n-i);
                    ii  = nb*(i/(nb*ngpu));
                    dev = (i/nb)%ngpu;
                
                    magma_setdevice( dev );
                    magma_zgetmatrix_async( i_n, ib,
                                            dA(dev, iii, ii), ldda,
                                            A(iii, i),        lda,
                                            queues[dev][0] );
                }
                for( dev=0; dev < ngpu; dev++ ) {
                    magma_setdevice( dev );
                    magma_queue_sync( queues[dev][0] );
                }
            }
            lapackf77_zhetrd( uplo_, &i_n, A(iii, iii), &lda, &d[iii], &e[iii],
                              &tau[iii], work, &lwork, &iinfo );
        }
    }
    
    for( dev=0; dev < ngpu; dev++ ) {
        magma_setdevice( dev );
        for( kk=0; kk < nqueue; kk++ ) {
            magma_queue_sync( queues[dev][kk] );
        }
    }
    
    #ifdef PROFILE_SY2RK
    magma_setdevice( 0 );
    if ( n > nx ) {
        cudaEventElapsedTime( &etime, start, stop );
        up_time += (etime/1000.0);
    }
    magma_event_destroy( start );
    magma_event_destroy( stop  );
    #endif

    trace_finalize( "zhetrd.svg", "trace.css" );
    
    #ifdef PROFILE_SY2RK
    printf( " n=%d nb=%d\n", n, nb );
    printf( " Time in ZLARFG: %.2e seconds\n", times[0] );
    //printf( " Time in ZHEMV : %.2e seconds\n", mv_time );
    printf( " Time in ZHER2K: %.2e seconds\n", up_time );
    #endif
    
CLEANUP:
    for( dev=0; dev < ngpu; dev++ ) {
        magma_setdevice( dev );
        for( kk=0; kk < nqueue; kk++ ) {
            magma_queue_destroy( queues[dev][kk] );
        }
        magma_free( dA[dev] );
        magma_free( dwork2[dev] );
    }
    magma_free_pinned( hwork );
    
    magma_setdevice( orig_dev );
    magmablasSetKernelStream( orig_stream );
    
    work[0] = MAGMA_Z_MAKE( lwkopt, 0 );
    
    return *info;
} /* magma_zhetrd */
Example #19
0
extern "C" magma_int_t 
magma_dsytrf_nopiv(magma_uplo_t uplo, magma_int_t n, 
                   double *A, magma_int_t lda, 
                   magma_int_t *info)
{
/*  -- MAGMA (version 1.6.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2011

    Purpose   
    =======   

    DSYTRF_nopiv computes the LDLt factorization of a real symmetric   
    matrix A. This version does not require work space on the GPU passed 
    as input. GPU memory is allocated in the routine.

    The factorization has the form   
       A = U\*\*H * D * U,  if UPLO = 'U', or   
       A = L  * D * L\*\*H, if UPLO = 'L',   
    where U is an upper triangular matrix, L is lower triangular, and
    D is a diagonal matrix.

    This is the block version of the algorithm, calling Level 3 BLAS.   

    Arguments   
    =========   

    UPLO    (input) CHARACTER*1   
            = 'U':  Upper triangle of A is stored;   
            = 'L':  Lower triangle of A is stored.   

    N       (input) INTEGER   
            The order of the matrix A.  N >= 0.   

    A       (input/output) DOUBLE_PRECISION array, dimension (LDA,N)   
            On entry, the symmetric matrix A.  If UPLO = 'U', the leading   
            N-by-N upper triangular part of A contains the upper   
            triangular part of the matrix A, and the strictly lower   
            triangular part of A is not referenced.  If UPLO = 'L', the   
            leading N-by-N lower triangular part of A contains the lower   
            triangular part of the matrix A, and the strictly upper   
            triangular part of A is not referenced.   

            On exit, if INFO = 0, the factor U or L from the Cholesky   
            factorization A = U\*\*H*U or A = L*L\*\*H.   

            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using cudaMallocHost.

    LDA     (input) INTEGER   
            The leading dimension of the array A.  LDA >= max(1,N).   

    INFO    (output) INTEGER   
            = 0:  successful exit   
            < 0:  if INFO = -i, the i-th argument had an illegal value 
                  if INFO = -6, the GPU memory allocation failed 
            > 0:  if INFO = i, the leading minor of order i is not   
                  positive definite, and the factorization could not be   
                  completed.   

    =====================================================================    */


    /* Local variables */
    double zone  = MAGMA_D_ONE;
    double mzone = MAGMA_D_NEG_ONE;
    int                upper = (uplo == MagmaUpper);
    magma_int_t j, k, jb, ldda, nb, ib, iinfo;
    magmaDouble_ptr dA;
    magmaDouble_ptr dW;

    *info = 0;
    if (! upper && uplo != MagmaLower) {
      *info = -1;
    } else if (n < 0) {
      *info = -2;
    } else if (lda < max(1,n)) {
      *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return MAGMA_ERR_ILLEGAL_VALUE;
    }

    /* Quick return */
    if ( n == 0 )
      return MAGMA_SUCCESS;

    ldda = ((n+31)/32)*32;
    nb = magma_get_dsytrf_nopiv_nb(n);
    ib = min(32, nb); // inner-block for diagonal factorization

    if ((MAGMA_SUCCESS != magma_dmalloc(&dA, n *ldda)) ||
        (MAGMA_SUCCESS != magma_dmalloc(&dW, nb*ldda))) {
        /* alloc failed so call the non-GPU-resident version */
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    magma_queue_t stream[2];
    magma_event_t event;
    magma_queue_create(&stream[0]);
    magma_queue_create(&stream[1]);
    magma_event_create( &event );
    trace_init( 1, 1, 2, (CUstream_st**)stream );

    //if (nb <= 1 || nb >= n) 
    //{
    //    lapackf77_dpotrf(uplo_, &n, a, &lda, info);
    //} else 
    {
        /* Use hybrid blocked code. */
        if (upper) {
            //=========================================================
            // Compute the LDLt factorization A = U'*D*U without pivoting.
            // copy matrix to GPU
            for (j=0; j<n; j+=nb) {
                jb = min(nb, (n-j));
                trace_gpu_start( 0, 0, "set", "set" );
                magma_dsetmatrix_async(j+jb, jb, A(0, j), lda, dA(0, j), ldda, stream[0]);
                trace_gpu_end( 0, 0 );
            }

            // main loop
            for (j=0; j<n; j += nb) {
                jb = min(nb, (n-j));

                // copy A(j,j) back to CPU
                trace_gpu_start( 0, 0, "get", "get" );
                magma_dgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, stream[0]);
                trace_gpu_end( 0, 0 );

                // copy j-th column of U back to CPU
                magma_queue_wait_event( stream[1], event );
                trace_gpu_start( 0, 1, "get", "get" );
                magma_dgetmatrix_async(j, jb, dA(0, j), ldda, A(0, j), lda, stream[1]);
                trace_gpu_end( 0, 1 );

                // factorize the diagonal block
                magma_queue_sync(stream[0]);
                trace_cpu_start( 0, "potrf", "potrf" );
                dsytrf_nopiv_cpu(MagmaUpper, jb, ib, A(j, j), lda, info);
                trace_cpu_end( 0 );
                if (*info != 0){
                    *info = *info + j;
                    break;
                }

                // copy A(j,j) back to GPU
                trace_gpu_start( 0, 0, "set", "set" );
                magma_dsetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, stream[0]);
                trace_gpu_end( 0, 0 );
                
                if ( (j+jb) < n) {
                    // compute the off-diagonal blocks of current block column
                    magmablasSetKernelStream( stream[0] );
                    trace_gpu_start( 0, 0, "trsm", "trsm" );
                    magma_dtrsm(MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaUnit, 
                                jb, (n-j-jb), 
                                zone, dA(j, j),    ldda, 
                                      dA(j, j+jb), ldda);
                    magma_dcopymatrix( jb, n-j-jb, dA( j, j+jb ), ldda, dWt( 0, j+jb ), nb );

                    // update the trailing submatrix with D
                    magmablas_dlascl_diag(MagmaUpper, jb, n-j-jb,
                                          dA(j,    j), ldda,
                                          dA(j, j+jb), ldda,
                                          &iinfo);
                    magma_event_record( event, stream[0] );
                    trace_gpu_end( 0, 0 );

                    // update the trailing submatrix with U and W
                    trace_gpu_start( 0, 0, "gemm", "gemm" );
                    for (k=j+jb; k<n; k+=nb)
                    {
                        magma_int_t kb = min(nb,n-k);
                        magma_dgemm(MagmaConjTrans, MagmaNoTrans, kb, n-k, jb,
                                    mzone, dWt(0, k), nb, 
                                           dA(j, k), ldda,
                                    zone,  dA(k, k), ldda);
                    }
                    trace_gpu_end( 0, 0 );
                }
            }
        } else {
            //=========================================================
            // Compute the LDLt factorization A = L*D*L' without pivoting.
            // copy the matrix to GPU
            for (j=0; j<n; j+=nb) {
                jb = min(nb, (n-j));
                trace_gpu_start( 0, 0, "set", "set" );
                magma_dsetmatrix_async((n-j), jb, A(j, j), lda, dA(j, j), ldda, stream[0]);
                trace_gpu_end( 0, 0 );
            }

            // main loop
            for (j=0; j<n; j+=nb) {
                jb = min(nb, (n-j));

                // copy A(j,j) back to CPU
                trace_gpu_start( 0, 0, "get", "get" );
                magma_dgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, stream[0]);
                trace_gpu_end( 0, 0 );

                // copy j-th row of L back to CPU
                magma_queue_wait_event( stream[1], event );
                trace_gpu_start( 0, 1, "get", "get" );
                magma_dgetmatrix_async(jb, j, dA(j, 0), ldda, A(j, 0), lda, stream[1]);
                trace_gpu_end( 0, 1 );

                // factorize the diagonal block
                magma_queue_sync(stream[0]);
                trace_cpu_start( 0, "potrf", "potrf" );
                dsytrf_nopiv_cpu(MagmaLower, jb, ib, A(j, j), lda, info);
                trace_cpu_end( 0 );
                if (*info != 0){
                    *info = *info + j;
                    break;
                }
                // copy A(j,j) back to GPU
                trace_gpu_start( 0, 0, "set", "set" );
                magma_dsetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, stream[0]);
                trace_gpu_end( 0, 0 );
                
                if ( (j+jb) < n) {
                    // compute the off-diagonal blocks of current block column
                    magmablasSetKernelStream( stream[0] );
                    trace_gpu_start( 0, 0, "trsm", "trsm" );
                    magma_dtrsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaUnit, 
                                (n-j-jb), jb, 
                                zone, dA(j,    j), ldda, 
                                      dA(j+jb, j), ldda);
                    magma_dcopymatrix( n-j-jb,jb, dA( j+jb, j ), ldda, dW( j+jb, 0 ), ldda );

                    // update the trailing submatrix with D
                    magmablas_dlascl_diag(MagmaLower, n-j-jb, jb,
                                          dA(j,    j), ldda,
                                          dA(j+jb, j), ldda,
                                          &iinfo);
                    magma_event_record( event, stream[0] );
                    trace_gpu_end( 0, 0 );

                    // update the trailing submatrix with L and W
                    trace_gpu_start( 0, 0, "gemm", "gemm" );
                    for (k=j+jb; k<n; k+=nb)
                    {
                        magma_int_t kb = min(nb,n-k);
                        magma_dgemm(MagmaNoTrans, MagmaConjTrans, n-k, kb, jb,
                                    mzone, dA(k, j), ldda, 
                                           dW(k, 0), ldda,
                                    zone,  dA(k, k), ldda);
                    }
                    trace_gpu_end( 0, 0 );
                }
            }
        }
    }
    
    trace_finalize( "dsytrf.svg","trace.css" );
    magma_queue_destroy(stream[0]);
    magma_queue_destroy(stream[1]);
    magma_event_destroy( event );
    magma_free(dW);
    magma_free(dA);
    
    return MAGMA_SUCCESS;
} /* magma_dsytrf_nopiv */
Example #20
0
extern "C" magma_int_t
magma_cungqr_m(
    magma_int_t m, magma_int_t n, magma_int_t k,
    magmaFloatComplex *A, magma_int_t lda,
    magmaFloatComplex *tau,
    magmaFloatComplex *T, magma_int_t nb,
    magma_int_t *info)
{
/*  -- MAGMA (version 1.4.1) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       December 2013

    Purpose
    =======
    CUNGQR generates an M-by-N COMPLEX matrix Q with orthonormal columns,
    which is defined as the first N columns of a product of K elementary
    reflectors of order M

          Q  =  H(1) H(2) . . . H(k)

    as returned by CGEQRF.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix Q. M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix Q. M >= N >= 0.

    K       (input) INTEGER
            The number of elementary reflectors whose product defines the
            matrix Q. N >= K >= 0.

    A       (input/output) COMPLEX array A, dimension (LDDA,N).
            On entry, the i-th column must contain the vector
            which defines the elementary reflector H(i), for
            i = 1,2,...,k, as returned by CGEQRF_GPU in the
            first k columns of its array argument A.
            On exit, the M-by-N matrix Q.

    LDA     (input) INTEGER
            The first dimension of the array A. LDA >= max(1,M).

    TAU     (input) COMPLEX array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by CGEQRF_GPU.

    T       (input) COMPLEX array, dimension (NB, min(M,N)).
            T contains the T matrices used in blocking the elementary
            reflectors H(i), e.g., this can be the 6th argument of
            magma_cgeqrf_gpu (except stored on the CPU, not the GPU).

    NB      (input) INTEGER
            This is the block size used in CGEQRF_GPU, and correspondingly
            the size of the T matrices, used in the factorization, and
            stored in T.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument has an illegal value
    =====================================================================    */

#define  A(i,j)   ( A    + (i) + (j)*lda )
#define dA(d,i,j) (dA[d] + (i) + (j)*ldda)
#define dT(d,i,j) (dT[d] + (i) + (j)*nb)

    magmaFloatComplex c_zero = MAGMA_C_ZERO;
    magmaFloatComplex c_one  = MAGMA_C_ONE;

    magma_int_t m_kk, n_kk, k_kk, mi;
    magma_int_t lwork, ldwork;
    magma_int_t i, ib, ki, kk, iinfo;
    magmaFloatComplex *work;

    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if ((n < 0) || (n > m)) {
        *info = -2;
    } else if ((k < 0) || (k > n)) {
        *info = -3;
    } else if (lda < max(1,m)) {
        *info = -5;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    if (n <= 0) {
        return *info;
    }
    
    magma_int_t di, dn;
    int dpanel;

    int ngpu = magma_num_gpus();
    int doriginal;
    magma_getdevice( &doriginal );
    
    // Allocate memory on GPUs for A and workspaces
    magma_int_t ldda    = ((m + 31) / 32) * 32;
    magma_int_t lddwork = ((n + 31) / 32) * 32;
    magma_int_t min_lblocks = (n / nb) / ngpu;  // min. blocks per gpu
    magma_int_t last_dev    = (n / nb) % ngpu;  // device with last block
    
    magma_int_t  nlocal[ MagmaMaxGPUs ] = { 0 };
    magmaFloatComplex *dA[ MagmaMaxGPUs ] = { NULL };
    magmaFloatComplex *dT[ MagmaMaxGPUs ] = { NULL };
    magmaFloatComplex *dV[ MagmaMaxGPUs ] = { NULL };
    magmaFloatComplex *dW[ MagmaMaxGPUs ] = { NULL };
    magma_queue_t stream[ MagmaMaxGPUs ] = { NULL };
    
    for( int d = 0; d < ngpu; ++d ) {
        // example with n = 75, nb = 10, ngpu = 3
        // min_lblocks = 2
        // last_dev    = 1
        // gpu 0: 2  blocks, cols:  0- 9, 30-39, 60-69
        // gpu 1: 1+ blocks, cols: 10-19, 40-49, 70-74 (partial)
        // gpu 2: 1  block , cols: 20-29, 50-59
        magma_setdevice( d );
        nlocal[d] = min_lblocks*nb;
        if ( d < last_dev ) {
            nlocal[d] += nb;
        }
        else if ( d == last_dev ) {
            nlocal[d] += (n % nb);
        }
        
        ldwork = nlocal[d]*ldda  // dA
               + nb*m            // dT
               + nb*ldda         // dV
               + nb*lddwork;     // dW
        if ( MAGMA_SUCCESS != magma_cmalloc( &dA[d], ldwork )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            goto CLEANUP;
        }
        dT[d] = dA[d] + nlocal[d]*ldda;
        dV[d] = dT[d] + nb*m;
        dW[d] = dV[d] + nb*ldda;
        
        magma_queue_create( &stream[d] );
    }
    
    trace_init( 1, ngpu, 1, stream );
    
    // first kk columns are handled by blocked method.
    // ki is start of 2nd-to-last block
    if ((nb > 1) && (nb < k)) {
        ki = (k - nb - 1) / nb * nb;
        kk = min(k, ki + nb);
    } else {
        ki = 0;
        kk = 0;
    }

    // Allocate CPU work space
    // n*nb for cungqr workspace
    lwork = n * nb;
    magma_cmalloc_cpu( &work, lwork );
    if (work == NULL) {
        *info = MAGMA_ERR_HOST_ALLOC;
        goto CLEANUP;
    }

    // Use unblocked code for the last or only block.
    if (kk < n) {
        trace_cpu_start( 0, "ungqr", "ungqr last block" );
        m_kk = m - kk;
        n_kk = n - kk;
        k_kk = k - kk;
        dpanel =  (kk / nb) % ngpu;
        di     = ((kk / nb) / ngpu) * nb;
        magma_setdevice( dpanel );
        
        lapackf77_cungqr( &m_kk, &n_kk, &k_kk,
                          A(kk, kk), &lda,
                          &tau[kk], work, &lwork, &iinfo );

        magma_csetmatrix( m_kk, n_kk,
                          A(kk, kk),  lda,
                          dA(dpanel, kk, di), ldda );
        
        // Set A(1:kk,kk+1:n) to zero.
        magmablas_claset( MagmaUpperLower, kk, n - kk, dA(dpanel, 0, di), ldda );
        trace_cpu_end( 0 );
    }

    if (kk > 0) {
        // Use blocked code
        // send T to all GPUs
        for( int d = 0; d < ngpu; ++d ) {
            magma_setdevice( d );
            trace_gpu_start( d, 0, "set", "set T" );
            magma_csetmatrix_async( nb, min(m,n), T, nb, dT[d], nb, stream[d] );
            trace_gpu_end( d, 0 );
        }
        
        // stream: set Aii (V) --> laset --> laset --> larfb --> [next]
        // CPU has no computation
        for( i = ki; i >= 0; i -= nb ) {
            ib = min(nb, k - i);
            mi = m - i;
            dpanel =  (i / nb) % ngpu;
            di     = ((i / nb) / ngpu) * nb;

            // Send current panel to the GPUs
            lapackf77_claset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda );
            for( int d = 0; d < ngpu; ++d ) {
                magma_setdevice( d );
                trace_gpu_start( d, 0, "set", "set V" );
                magma_csetmatrix_async( mi, ib,
                                        A(i, i), lda,
                                        dV[d],   ldda, stream[d] );
                trace_gpu_end( d, 0 );
            }
            
            // set panel to identity
            magma_setdevice( dpanel );
            magmablasSetKernelStream( stream[dpanel] );
            trace_gpu_start( dpanel, 0, "laset", "laset" );
            magmablas_claset( MagmaUpperLower, i, ib, dA(dpanel, 0, di), ldda );
            magmablas_claset_identity( mi, ib, dA(dpanel, i, di), ldda );
            trace_gpu_end( dpanel, 0 );
            
            if (i < n) {
                // Apply H to A(i:m,i:n) from the left
                for( int d = 0; d < ngpu; ++d ) {
                    magma_setdevice( d );
                    magmablasSetKernelStream( stream[d] );
                    magma_indices_1D_bcyclic( nb, ngpu, d, i, n, &di, &dn );
                    trace_gpu_start( d, 0, "larfb", "larfb" );
                    magma_clarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise,
                                      mi, dn-di, ib,
                                      dV[d],        ldda, dT(d,0,i), nb,
                                      dA(d, i, di), ldda, dW[d], lddwork );
                    trace_gpu_end( d, 0 );
                }
            }
        }
    }
    
    // copy result back to CPU
    trace_cpu_start( 0, "get", "get A" );
    magma_cgetmatrix_1D_col_bcyclic( m, n, dA, ldda, A, lda, ngpu, nb );
    trace_cpu_end( 0 );
    
    #ifdef TRACING
    char name[80];
    snprintf( name, sizeof(name), "cungqr-n%d-ngpu%d.svg", m, ngpu );
    trace_finalize( name, "trace.css" );
    #endif
    
CLEANUP:
    for( int d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magmablasSetKernelStream( NULL );
        magma_free( dA[d] );
        dA[d] = NULL;
        if ( stream[d] != NULL ) {
            magma_queue_destroy( stream[d] );
        }
    }
    magma_free_cpu( work );
    magma_setdevice( doriginal );
    
    return *info;
} /* magma_cungqr */
Example #21
0
extern "C" magma_int_t
magma_zgetrf2_mgpu(magma_int_t num_gpus, 
         magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t offset,
         cuDoubleComplex **d_lAT, magma_int_t lddat, magma_int_t *ipiv,
         cuDoubleComplex **d_lAP, cuDoubleComplex *w, magma_int_t ldw,
         cudaStream_t streaml[][2], magma_int_t *info)
#endif
{
/*  -- MAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2010

    Purpose
    =======

    ZGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
       A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.
    Use two buffer to send panels..

    Arguments
    =========

    NUM_GPUS 
            (input) INTEGER
            The number of GPUS to be used for the factorization.

    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    LDDA     (input) INTEGER
            The leading dimension of the array A.  LDDA >= max(1,M).

    IPIV    (output) INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  if INFO = -7, internal GPU memory allocation failed.
            > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.
    =====================================================================    */

#define inAT(id,i,j)  (d_lAT[(id)] + ((offset)+(i)*nb)*lddat + (j)*nb)
#define W(j) (w+((j)%num_gpus)*nb*ldw)

    cuDoubleComplex c_one     = MAGMA_Z_ONE;
    cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;

    magma_int_t block_size = 32;
    magma_int_t iinfo, n_local[4]; 
    magma_int_t maxm, mindim;
    magma_int_t i, ii, d, dd, rows, cols, s, ldpan[4];
    magma_int_t id, i_local, i_local2, nb0, nb1;
    cuDoubleComplex *d_panel[4], *panel_local[4];
    //cudaStream_t streaml[4][2];

    /* Check arguments */
    *info = 0;
    if (m < 0)
    *info = -2;
    else if (n < 0)
    *info = -3;
    else if (num_gpus*lddat < max(1,n))
    *info = -5;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    //nb     = magma_get_zgetrf_nb(m);
    if( num_gpus > ceil((double)n/nb) ) {
      *info = -1;
      return *info;
    }

    {
      /* Use hybrid blocked code. */
      maxm  = ((m + block_size-1)/block_size)*block_size;

      /* some initializations */
      for(i=0; i<num_gpus; i++){
        magmaSetDevice(i);

        n_local[i] = ((n/nb)/num_gpus)*nb;
        if (i < (n/nb)%num_gpus)
           n_local[i] += nb;
        else if (i == (n/nb)%num_gpus)
           n_local[i] += n%nb;

        /* workspaces */
        d_panel[i] = &(d_lAP[i][nb*maxm]);   /* temporary panel storage */

        /* create local streams */
        //magma_queue_create(&streaml[i][0]);
        //magma_queue_create(&streaml[i][1]);
      }
      trace_init( 1, num_gpus, 2, (CUstream_st**)streaml );

      /* start sending the panel to cpu */
      nb0 = min(mindim, nb);
      magmaSetDevice(0);
      magmablasSetKernelStream(streaml[0][1]);
      trace_gpu_start( 0, 1, "comm", "get" );
      if( nb0 == nb )
        magmablas_ztranspose(  d_lAP[0], maxm, inAT(0,0,0), lddat, nb0, maxm );
      else
        magmablas_ztranspose2( d_lAP[0], maxm, inAT(0,0,0), lddat, nb0, maxm );
      magma_zgetmatrix_async( m, nb0,
                              d_lAP[0], maxm,
                              W(0),     ldw, streaml[0][1] );
      trace_gpu_end( 0, 1 );

      /* ------------------------------------------------------------------------------------- */
#ifdef  PROFILE
      magma_timestr_t start_timer, end_timer;
      start_timer = get_current_time();
#endif
      s = mindim / nb;
      for( i=0; i<s; i++ )
            {
                /* Set the GPU number that holds the current panel */
                id = i%num_gpus;
                magmaSetDevice(id);

                /* Set the local index where the current panel is */
                i_local = i/num_gpus;
                cols  = maxm - i*nb;
                rows  = m - i*nb;

                /* synchrnoize i-th panel from id-th gpu into work */
                magma_queue_sync( streaml[id][1] );

                /* i-th panel factorization */
                trace_cpu_start( 0, "getrf", "getrf" );
#ifdef PANEL_FACT_MC
                cntxt->nb = 12;
                magma_zgetrf_mc(cntxt, &rows, &nb, W(i), &ldw, ipiv+i*nb, &iinfo);
#else
                lapackf77_zgetrf( &rows, &nb, W(i), &ldw, ipiv+i*nb, &iinfo);
#endif
                if ( (*info == 0) && (iinfo > 0) ) {
                    *info = iinfo + i*nb;
                    //break;
                }
                trace_cpu_end( 0 );

                /* start sending the panel to all the gpus */
                d = (i+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                  magmaSetDevice(d);
                  trace_gpu_start( 0, 1, "comm", "set" );
                  magma_zsetmatrix_async( rows, nb,
                                          W(i),     ldw,
                                          d_lAP[d], cols, streaml[d][1] );
                  trace_gpu_end( 0, 1 );
                  d = (d+1)%num_gpus;
                }

                /* apply the pivoting */
                d = (i+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                  magmaSetDevice(d);
                  magmablasSetKernelStream(streaml[d][0]);

                  trace_gpu_start( d, 1, "pivot", "pivot" );
                  if( dd == 0 )
                    magmablas_zpermute_long2( lddat, inAT(d,0,0), lddat, ipiv, nb, i*nb );
                  else
                    magmablas_zpermute_long3(        inAT(d,0,0), lddat, ipiv, nb, i*nb );
                  trace_gpu_end( d, 1 );
                  d = (d+1)%num_gpus;
                }


                /* update the trailing-matrix/look-ahead */
                d = (i+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                  magmaSetDevice(d);

                  /* storage for panel */
                  if( d == id ) {
                    /* the panel belond to this gpu */
                    panel_local[d] = inAT(d,i,i_local);
                    ldpan[d] = lddat;
                    /* next column */
                    i_local2 = i_local+1;
                  } else {
                    /* the panel belong to another gpu */
                    panel_local[d] = &d_panel[d][(i%2)*nb*maxm];
                    //panel_local[d] = d_panel[d];
                    ldpan[d] = nb;
                    /* next column */
                    i_local2 = i_local;
                    if( d < id ) i_local2 ++;
                  }
                  /* the size of the next column */
                  if ( s > (i+1) ) {
                    nb0 = nb;
                  } else {
                    nb0 = n_local[d]-nb*(s/num_gpus);
                    if( d < s%num_gpus ) nb0 -= nb;
                  }
                  if( d == (i+1)%num_gpus) {
                    /* owns the next column, look-ahead the column */
                    nb1 = nb0;
                    magmablasSetKernelStream(streaml[d][1]);

                    /* make sure all the pivoting has been applied */
                    magma_queue_sync(streaml[d][0]);
                    trace_gpu_start( d, 1, "gemm", "gemm" );
                  } else {
                    /* update the entire trailing matrix */
                    nb1 = n_local[d] - i_local2*nb;
                    magmablasSetKernelStream(streaml[d][0]);

                    /* synchronization to make sure panel arrived on gpu */
                    magma_queue_sync(streaml[d][1]);
                    trace_gpu_start( d, 0, "gemm", "gemm" );
                  }
                  magmablas_ztranspose(panel_local[d], ldpan[d], d_lAP[d], cols, cols, nb);

                  /* gpu updating the trailing matrix */
                  //magmablas_ztrsm(
                  magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                               nb1, nb, c_one,
                               panel_local[d],       ldpan[d],
                               inAT(d, i, i_local2), lddat);
                  //cublasZgemm
                  magma_zgemm( MagmaNoTrans, MagmaNoTrans, 
                               nb1, m-(i+1)*nb, nb, 
                               c_neg_one, inAT(d, i,   i_local2),         lddat,
                                          &(panel_local[d][nb*ldpan[d]]), ldpan[d], 
                               c_one,     inAT(d, i+1, i_local2),         lddat );

                  if( d == (i+1)%num_gpus ) 
                  {
                    /* Set the local index where the current panel is */
                    int loff    = i+1;
                    int i_local = (i+1)/num_gpus;
                    int ldda    = maxm - (i+1)*nb;
                    int cols    = m - (i+1)*nb;
                    nb0 = min(nb, mindim - (i+1)*nb); /* size of the diagonal block */
                    trace_gpu_end( d, 1 );

                    if( nb0 > 0 ) {
                        /* transpose the panel for sending it to cpu */
                        trace_gpu_start( d, 1, "comm", "get" );
                        if( i+1 < s ) 
                          magmablas_ztranspose(  d_lAP[d], ldda, inAT(d,loff,i_local), lddat, nb0, ldda );
                        else
                          magmablas_ztranspose2(  d_lAP[d], ldda, inAT(d,loff,i_local), lddat, nb0, ldda );

                        /* send the panel to cpu */
                        magma_zgetmatrix_async( cols, nb0,
                                                d_lAP[d], ldda,
                                                W(i+1),   ldw, streaml[d][1] );
                        trace_gpu_end( d, 1 );
                    }
                  } else {
                    trace_gpu_end( d, 0 );
                  }

                  d = (d+1)%num_gpus;
                }

                /* update the remaining matrix by gpu owning the next panel */
                if( (i+1) < s ) {
                  int i_local = (i+1)/num_gpus;
                  int rows  = m - (i+1)*nb;

                  d = (i+1)%num_gpus;
                  magmaSetDevice(d);
                  magmablasSetKernelStream(streaml[d][0]);
                  trace_gpu_start( d, 0, "gemm", "gemm" );
                  //magmablas_ztrsm
                  magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                               n_local[d] - (i_local+1)*nb, nb, 
                               c_one, panel_local[d],       ldpan[d], 
                                      inAT(d,i,i_local+1),  lddat );
                  //cublasZgemm
                  magma_zgemm( MagmaNoTrans, MagmaNoTrans, 
                               n_local[d]-(i_local+1)*nb, rows, nb, 
                               c_neg_one, inAT(d,i,i_local+1),            lddat, 
                                          &(panel_local[d][nb*ldpan[d]]), ldpan[d], 
                               c_one,     inAT(d,i+1,  i_local+1),        lddat );
                  trace_gpu_end( d, 0 );
                }
             } /* end of for i=1..s */
             /* ------------------------------------------------------------------------------ */

            /* Set the GPU number that holds the last panel */
            id = s%num_gpus;

            /* Set the local index where the last panel is */
            i_local = s/num_gpus;

            /* size of the last diagonal-block */
            nb0 = min(m - s*nb, n - s*nb);
            rows = m    - s*nb;
            cols = maxm - s*nb;

            if( nb0 > 0 ) {
              magmaSetDevice(id);

              /* wait for the last panel on cpu */
              magma_queue_sync( streaml[id][1] );

              /* factor on cpu */
              lapackf77_zgetrf( &rows, &nb0, W(s), &ldw, ipiv+s*nb, &iinfo);
              if ( (*info == 0) && (iinfo > 0) )
              *info = iinfo + s*nb;

              /* send the factor to gpus */
              for( d=0; d<num_gpus; d++ ) {
                magmaSetDevice(d);
                i_local2 = i_local;
                if( d < id ) i_local2 ++;

                if( d == id || n_local[d] > i_local2*nb ) {
                  magma_zsetmatrix_async( rows, nb0,
                                          W(s),     ldw,
                                          d_lAP[d], cols, streaml[d][1] );
                }
              }

              for( d=0; d<num_gpus; d++ ) {
                magmaSetDevice(d);
                magmablasSetKernelStream(streaml[d][0]);
                if( d == 0 )
                  magmablas_zpermute_long2( lddat, inAT(d,0,0), lddat, ipiv, nb0, s*nb );
                else
                  magmablas_zpermute_long3(        inAT(d,0,0), lddat, ipiv, nb0, s*nb );
              }

              for( d=0; d<num_gpus; d++ ) {
                magmaSetDevice(d);
                magmablasSetKernelStream(streaml[d][1]);

                /* wait for the pivoting to be done */
                magma_queue_sync( streaml[d][0] );

                i_local2 = i_local;
                if( d < id ) i_local2++;
                if( d == id ) {
                  /* the panel belond to this gpu */
                  panel_local[d] = inAT(d,s,i_local);

                  /* next column */
                  nb1 = n_local[d] - i_local*nb-nb0;

                  magmablas_ztranspose2( panel_local[d], lddat, d_lAP[d], cols, rows, nb0);

                  if( nb1 > 0 )
                  //cublasZtrsm
                  magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                               nb1, nb0, c_one,
                               panel_local[d],        lddat, 
                               inAT(d,s,i_local)+nb0, lddat);
                } else if( n_local[d] > i_local2*nb ) {
                  /* the panel belong to another gpu */
                  panel_local[d] = &d_panel[d][(s%2)*nb*maxm];
                  //panel_local[d] = d_panel[d];

                  /* next column */
                  nb1 = n_local[d] - i_local2*nb;

                  magmablas_ztranspose2( panel_local[d], nb, d_lAP[d], cols, rows, nb0);
                  //cublasZtrsm
                  magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                               nb1, nb0, c_one,
                               panel_local[d],     nb, 
                               inAT(d,s,i_local2), lddat);
                }
              }
            } /* if( nb0 > 0 ) */

            /* clean up */
            trace_finalize( "zgetrf_mgpu.svg","trace.css" );
            for( d=0; d<num_gpus; d++ ) {
              magmaSetDevice(d);
              magma_queue_sync( streaml[d][0] );
              magma_queue_sync( streaml[d][1] );
              //magma_queue_destroy(streaml[d][0]);
              //magma_queue_destroy(streaml[d][1]);
              magmablasSetKernelStream(NULL);
            } 
            magmaSetDevice(0);
#ifdef  PROFILE
            end_timer = get_current_time();
            printf("\n Performance %f GFlop/s\n", (2./3.*n*n*n /1000000.) / GetTimerValue(start_timer, end_timer));
#endif
    }

    return *info;
    /* End of MAGMA_ZGETRF2_MGPU */
}
Example #22
0
extern "C" magma_int_t
magma_cpotrf2_msub(
    magma_int_t num_subs, magma_int_t num_gpus, magma_uplo_t uplo, magma_int_t m, magma_int_t n, 
    magma_int_t off_i, magma_int_t off_j, magma_int_t nb,
    magmaFloatComplex_ptr *d_lA, size_t d_lA_offset, magma_int_t ldda, 
    magmaFloatComplex_ptr *d_lP, magma_int_t lddp, 
    magmaFloatComplex *a, magma_int_t lda, magma_int_t h,
    magma_queue_t *queues,
    magma_int_t *info )
{
/*  -- clMAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date November 2014

    Purpose   
    =======   
    CPOTRF computes the Cholesky factorization of a complex Hermitian   
    positive definite matrix dA.   

    The factorization has the form   
       dA = U**H * U,  if UPLO = 'U', or   
       dA = L  * L**H,  if UPLO = 'L',   
    where U is an upper triangular matrix and L is lower triangular.   

    This is the block version of the algorithm, calling Level 3 BLAS.   

    Arguments   
    =========   
    UPLO    (input) CHARACTER*1   
            = 'U':  Upper triangle of dA is stored;   
            = 'L':  Lower triangle of dA is stored.   

    N       (input) INTEGER   
            The order of the matrix dA.  N >= 0.   

    dA      (input/output) COMPLEX array on the GPU, dimension (LDDA,N)   
            On entry, the Hermitian matrix dA.  If UPLO = 'U', the leading   
            N-by-N upper triangular part of dA contains the upper   
            triangular part of the matrix dA, and the strictly lower   
            triangular part of dA is not referenced.  If UPLO = 'L', the   
            leading N-by-N lower triangular part of dA contains the lower   
            triangular part of the matrix dA, and the strictly upper   
            triangular part of dA is not referenced.   

            On exit, if INFO = 0, the factor U or L from the Cholesky   
            factorization dA = U**H * U or dA = L * L**H.   

    LDDA     (input) INTEGER   
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalescent memory accesses LDDA must be
            divisible by 16.

    INFO    (output) INTEGER   
            = 0:  successful exit   
            < 0:  if INFO = -i, the i-th argument had an illegal value   
            > 0:  if INFO = i, the leading minor of order i is not   
                  positive definite, and the factorization could not be   
                  completed.   
    =====================================================================   */

    int tot_subs = num_subs*num_gpus;
    magma_int_t     j, jb, nb0, nb2, dd, d, id, j_local, j_local2;
    magmaFloatComplex c_one     = MAGMA_C_ONE;
    magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;
    float          d_one     =  1.0;
    float          d_neg_one = -1.0;
    magmaFloatComplex_ptr dlpanel;
    size_t dlpanel_offset;
    magma_int_t n_local[MagmaMaxSubs * MagmaMaxGPUs], ldpanel;

    // initialize trace
    trace_init(1, num_gpus, 2, queues);

    *info = 0;
    if ( (uplo != MagmaUpper) && (uplo != MagmaLower) ) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if ((uplo != MagmaUpper) && tot_subs*ldda < max(1,n)) {
        *info = -4;
    } else if ((uplo == MagmaUpper) && ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    for (d=0; d<tot_subs; d++) {
        /* local-n and local-ld */
        if (uplo == MagmaUpper) {
            n_local[d] = ((n/nb)/tot_subs)*nb;
            if (d < (n/nb)%tot_subs)
               n_local[d] += nb;
            else if (d == (n/nb)%tot_subs)
              n_local[d] += n%nb;
        } else {
            n_local[d] = ((m/nb)/tot_subs)*nb;
            if (d < (m/nb)%tot_subs)
               n_local[d] += nb;
            else if (d == (m/nb)%tot_subs)
              n_local[d] += m%nb;
        }
    }

    /* Use blocked code. */
    if (uplo == MagmaUpper) {
        /* ---------------------------------------------- */
        /* Upper-triangular case                          */
        /* > Compute the Cholesky factorization A = U'*U. */
        /* ---------------------------------------------- */
        for (j=0; j<m; j+=nb) {
            /* Set the GPU number that holds the current panel */
            id  = (j/nb)%tot_subs;
            /* Set the local index where the current panel is */
            j_local = j/(nb*tot_subs);
            jb = min(nb, (m-j));
            if (j > 0) {
                // Wait for the column on CPU
                magma_queue_sync(queues[2*(id%num_gpus)]); 
                /* broadcast off-diagonal column to all gpus */
                d = (j/nb+1)%num_gpus;
                for (dd=0; dd<num_gpus; dd++) {
                    if (d != id%num_gpus) {
                        magma_csetmatrix_async( j, jb, 
                                                Aup(0,j),                lda, 
                                                dlP(d,jb,0,id%num_gpus), lddp, 
                                                queues[2*d], 
                                                trace_gpu_event(d, 0, "set", "set-col") );
                    }
                    d = (d+1)%num_gpus;
                }
                /* Update the current diagonal block */
                trace_gpu_start(id%num_gpus, 1, "herk", "herk");
                magma_cherk(MagmaUpper, MagmaConjTrans, jb, j, 
                            d_neg_one, dlA(id, 0, nb*j_local), ldda,
                            d_one,     dlA(id, j, nb*j_local), ldda,
                            queues[2*(id%num_gpus)+1]);
                magma_queue_sync(queues[2*(id%num_gpus)+1]); // Wait for syrk
            }
            /* Send the diagonal to cpu */
            magma_cgetmatrix_async( jb, jb, 
                                    dlA(id, j, nb*j_local), ldda,
                                    Aup(j,j),               lda,
                                    queues[2*(id%num_gpus)], 
                                    trace_gpu_event(id%num_gpus, 0, "get", "get-diag") );
            if (j > 0) {
                /* Compute the local block column of the panel. */
                d = (j/nb+1)%tot_subs;
                for (dd=0; dd<tot_subs; dd++) {
                    j_local2 = j_local+1;
                    if (d > id) j_local2 --;
                    nb0 = nb*j_local2;
                    if (n_local[d] > nb0) {
                        if (d%num_gpus != id%num_gpus) {
                            dlpanel = d_lP[d%num_gpus];
                            dlpanel_offset = dlP_offset(jb, 0, id%num_gpus);
                            ldpanel = lddp;
                            /* Wait for the offdiagonal column */
                            if (dd < num_gpus) magma_queue_sync(queues[2*(d%num_gpus)]);
                        } else {
                            dlpanel = d_lA[id];
                            dlpanel_offset = dlA_offset(0, nb*j_local);
                            ldpanel = ldda;
                        }
                        /* update the panel */
                        trace_gpu_start(d%num_gpus, 1, "gemm", "gemm");
                        magma_cgemm(MagmaConjTrans, MagmaNoTrans, 
                                    jb, n_local[d]-nb0, j, 
                                    c_neg_one, dlpanel, dlpanel_offset, ldpanel,
                                               dlA(d, 0, nb0), ldda, 
                                    c_one,     dlA(d, j, nb0), ldda,
                                    queues[2*(d%num_gpus)+1]);
                    }
                    d = (d+1)%tot_subs;
                }
            }
            /* factor the diagonal */
            magma_queue_sync( queues[2*(id%num_gpus)] ); // wait for the diagonal
            trace_cpu_start(0, "potrf", "potrf");
            lapackf77_cpotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info);
            trace_cpu_end(0);
            if (*info != 0) {
                *info = *info + j;
                break;
            }

            /* send the diagonal to gpus */
            if ((j+jb) < n) {
                d = (j/nb+1)%num_gpus;
                for (dd=0; dd<num_gpus; dd++) {
                    if (d == id%num_gpus) {
                        dlpanel = d_lA[id];
                        dlpanel_offset = dlA_offset(j, nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = d_lP[d];
                        dlpanel_offset = dlP_offset(0, 0, id%num_gpus);
                        ldpanel = lddp;
                    }
                    magma_csetmatrix_async( jb, jb, 
                                            Aup(j,j),                lda,
                                            dlpanel, dlpanel_offset, ldpanel, 
                                            queues[2*d], 
                                            trace_gpu_event(d, 0, "set", "set-diag"));
                    d = (d+1)%num_gpus;
                }
            } else {
                magma_csetmatrix_async( jb, jb, 
                                        Aup(j,j),               lda, 
                                        dlA(id, j, nb*j_local), ldda,
                                        queues[2*(id%num_gpus)], 
                                        trace_gpu_event(id%num_gpus, 0, "set", "set-diag") );
            }

            /* panel-factorize the off-diagonal */
            if ((j+jb) < n) {
                d = (j/nb+1)%tot_subs;
                for (dd=0; dd<tot_subs; dd++) {
                    /* next column */
                    j_local2 = j_local+1;
                    if (d > id) j_local2--;
                    if (d%num_gpus == id%num_gpus) {
                        dlpanel = d_lA[id];
                        dlpanel_offset = dlA_offset(j, nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = d_lP[d%num_gpus];
                        dlpanel_offset = dlP_offset(0, 0, id%num_gpus);
                        ldpanel = lddp;
                    }
                    nb2 = n_local[d]-nb*j_local2;
                    nb0 = min(nb, nb2);
                    if (dd < num_gpus) magma_queue_sync( queues[2*(d%num_gpus)] ); // wait for the diagonal
                    if (j+jb < m && d == (j/nb+1)%tot_subs) {
                        /* owns the next column, look-ahead the column */
                        trace_gpu_start(d%num_gpus, 1, "trsm", "trsm");
                        magma_ctrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                     jb, nb0, c_one,
                                     dlpanel, dlpanel_offset, ldpanel,
                                     dlA(d, j, nb*j_local2), ldda, 
                                     queues[2*(d%num_gpus)+1] );
                        /* send the column to cpu */
                        magma_queue_sync(queues[2*(d%num_gpus)+1]);  // wait for lookahead
                        magma_cgetmatrix_async( (j+jb), nb0, 
                                                dlA(d, 0, nb*j_local2), ldda, 
                                                Aup(0,j+jb),            lda,
                                                queues[2*(d%num_gpus)], 
                                                trace_gpu_event(d%num_gpus, 0, "get", "get-col") );
                        /* update the remaining blocks */
                        nb2 = nb2 - nb0;
                        trace_gpu_start(d%num_gpus, 1, "trsm", "trsm");
                        magma_ctrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                     jb, nb2, c_one, 
                                     dlpanel, dlpanel_offset, ldpanel,
                                     dlA(d, j, nb*j_local2+nb0), ldda, 
                                     queues[2*(d%num_gpus)+1] );
                    } else if (nb2 > 0) {
                        /* update the entire trailing matrix */
                        trace_gpu_start(d%num_gpus, 1, "trsm", "trsm");
                        magma_ctrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, 
                                     jb, nb2, c_one, 
                                     dlpanel, dlpanel_offset, ldpanel,
                                     dlA(d, j, nb*j_local2), ldda,
                                     queues[2*(d%num_gpus)+1] );
                    }
                    d = (d+1)%tot_subs;
                }
            }
        }
    } else { 
        /* -------------------------------------------- */
        /* Lower-triangular case                        */
        /* Compute the Cholesky factorization A = L*L'. */
        /* -------------------------------------------- */
        for (j=0; j<n; j+=nb) {
            /* Set the GPU number that holds the current panel */
            id  = (j/nb)%tot_subs;
            /* Set the local index where the current panel is */
            j_local = j/(nb*tot_subs);
            jb = min(nb, (n-j));
            if (j > 0) {
                if (num_gpus > 1) {
                    // Wait for the row on CPU to broadcast
                    magma_queue_sync(queues[2*(id%num_gpus)]); 
                }
                /* broadcast off-diagonal row to all the GPUs */
                d = (j/nb+1)%num_gpus;
                for (dd=0; dd<num_gpus; dd++) {
                    if (d != id%num_gpus) {
                        /* send it to GPU-d */
                        magma_csetmatrix_async( jb, j,
                                                Alo(j,0),                 lda,
                                                dlPT(d,0,jb,id%num_gpus), nb, 
                                                queues[2*d], 
                                                trace_gpu_event(d, 0, "set", "set-row") );
                    }
                    d = (d+1)%num_gpus;
                }
                /* Update the current diagonal block */
                trace_gpu_start(id%num_gpus, 1, "herk", "herk");
                magma_cherk(MagmaLower, MagmaNoTrans, jb, j,
                            d_neg_one, dlA(id, nb*j_local, 0), ldda,
                            d_one,     dlA(id, nb*j_local, j), ldda,
                            queues[2*(id%num_gpus)+1]);
                magma_queue_sync(queues[2*(id%num_gpus)+1]); // wait for syrk
            }
            /* send the diagonal to cpu */
            magma_cgetmatrix_async( jb, jb,
                                    dlA(id, nb*j_local, j), ldda,
                                    Alo(j,j),               lda, 
                                    queues[2*(id%num_gpus)], 
                                    trace_gpu_event(id%num_gpus, 0, "get", "get") );
            /* update the offdiagonal blocks */
            if (j > 0) {
                /* compute the block-rows of the panel */
                d = (j/nb+1)%tot_subs;
                for (dd=0; dd<tot_subs; dd++) {
                    j_local2 = j_local+1;
                    if (d > id) j_local2 --;
                    nb0 = nb*j_local2;
                    if (nb0 < n_local[d]) {
                        if (d%num_gpus != id%num_gpus) {
                            dlpanel = d_lP[d%num_gpus];
                            dlpanel_offset = dlPT_offset(0, jb, id%num_gpus);
                            ldpanel = nb;
                            /* Wait for offdiagonal row */
                            if (dd < num_gpus) magma_queue_sync(queues[2*(d%num_gpus)]);
                        } else {
                            dlpanel = d_lA[id];
                            dlpanel_offset = dlA_offset(nb*j_local, 0);
                            ldpanel = ldda;
                        }
                        /* Update the panel */
                        trace_gpu_start(d%num_gpus, 1, "gemm", "gemm");
                        magma_cgemm( MagmaNoTrans, MagmaConjTrans,
                                     n_local[d]-nb0, jb, j,
                                     c_neg_one, dlA(d, nb0, 0), ldda,
                                                dlpanel, dlpanel_offset, ldpanel,
                                     c_one,     dlA(d, nb0, j), ldda, 
                                     queues[2*(d%num_gpus)+1]);
                    }
                    d = (d+1)%tot_subs;
                }
            }

            /* factor the diagonal */
            magma_queue_sync( queues[2*(id%num_gpus)] );
            trace_cpu_start(0, "potrf", "potrf");
            lapackf77_cpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info);
            trace_cpu_end(0);
            if (*info != 0) {
                printf( " cpotrf returned %d (id=%d,j=%d,j_local=%d,jb=%d)\n",*info,id,j,j_local,jb );
                *info = *info + j;
                break;
            }

            /* send the diagonal to gpus */
            if ((j+jb) < m) {
                d = (j/nb+1)%num_gpus;
                for (dd=0; dd<num_gpus; dd++) {
                    if (d == id%num_gpus) {
                        dlpanel = d_lA[id];
                        dlpanel_offset = dlA_offset(nb*j_local, j);
                        ldpanel = ldda;
                    } else {
                        dlpanel = d_lP[d];
                        dlpanel_offset = dlPT_offset(0, 0, id%num_gpus);
                        ldpanel = nb;
                    }
                    magma_csetmatrix_async( jb, jb,
                                            Alo(j,j), lda,
                                            dlpanel,  dlpanel_offset, ldpanel, 
                                            queues[2*d], 
                                            trace_gpu_event(d, 0, "set", "set-diag") );
                    d = (d+1)%num_gpus;
                }
            } else {
                magma_csetmatrix_async( jb, jb,
                                        Alo(j,j),               lda,
                                        dlA(id, nb*j_local, j), ldda, 
                                        queues[2*(id%num_gpus)],
                                        trace_gpu_event(id%num_gpus, 0, "set", "set-diag") );
            }

            /* factorize off-diagonal blocks */
            if ((j+jb) < m) {
                d = (j/nb+1)%tot_subs;
                for (dd=0; dd<tot_subs; dd++) {
                    /* next column */
                    j_local2 = j_local+1;
                    if (d > id) j_local2--;
                    if (d%num_gpus == id%num_gpus) {
                        dlpanel = d_lA[id];
                        dlpanel_offset = dlA_offset(nb*j_local, j);
                        ldpanel = ldda;
                    } else {         
                        dlpanel = d_lP[d%num_gpus];
                        dlpanel_offset = dlPT_offset(0, 0, id%num_gpus);
                        ldpanel = nb;
                    }
                    nb2 = n_local[d] - j_local2*nb;
                    nb0 = min(nb, nb2 );
                    // wait for the diagonal
                    if (dd < num_gpus) magma_queue_sync(queues[2*(d%num_gpus)]);
                    if (j+jb < n && d == (j/nb+1)%tot_subs) {
                        /* owns the next column, look-ahead the column */
                        trace_gpu_start(d%num_gpus, 1, "trsm", "trsm");
                        magma_ctrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, 
                                     nb0, jb, c_one,
                                     dlpanel,  dlpanel_offset, ldpanel, 
                                     dlA(d, nb*j_local2, j), ldda,
                                     queues[2*(d%num_gpus)+1]);
                        /* send the column to cpu */
                        magma_queue_sync( queues[2*(d%num_gpus)+1] ); // wait for lookahead
                        magma_cgetmatrix_async( nb0, j+jb,
                                                dlA(d, nb*j_local2, 0), ldda,
                                                Alo(j+jb,0),            lda, 
                                                queues[2*(d%num_gpus)], 
                                                trace_gpu_event(d%num_gpus, 0, "get", "get") );
                        /* update the remaining blocks */
                        nb2 = nb2 - nb0;
                        trace_gpu_start(d%num_gpus, 1, "trsm", "trsm");
                        magma_ctrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, 
                                     nb2, jb, c_one,
                                     dlpanel, dlpanel_offset, ldpanel, 
                                     dlA(d, nb*j_local2+nb0, j), ldda, 
                                     queues[2*(d%num_gpus)+1]);
                    } else if (nb2 > 0) {
                        /* update the entire trailing matrix */
                        trace_gpu_start(d%num_gpus, 1, "trsm", "trsm");
                        magma_ctrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, 
                                     nb2, jb, c_one,
                                     dlpanel, dlpanel_offset, ldpanel, 
                                     dlA(d, nb*j_local2, j), ldda, 
                                     queues[2*(d%num_gpus)+1]);
                    }
                    d = (d+1)%tot_subs;
                }
            }
        }
    } /* end of else not upper */

    /* clean up */
    for( d=0; d<num_gpus; d++ ) {
        magma_queue_sync( queues[2*d] );
        magma_queue_sync( queues[2*d+1] );
    }

    trace_finalize("cpotrf_msub.svg", "trace.css");
    return *info;
} /* magma_cpotrf2_msub */
Example #23
0
/**
    Purpose
    -------
    SPOTRF computes the Cholesky factorization of a real symmetric
    positive definite matrix dA.

    The factorization has the form
       dA = U**T * U,   if UPLO = MagmaUpper, or
       dA = L  * L**T,  if UPLO = MagmaLower,
    where U is an upper triangular matrix and L is lower triangular.

    This is the block version of the algorithm, calling Level 3 BLAS.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of dA is stored;
      -     = MagmaLower:  Lower triangle of dA is stored.

    @param[in]
    n       INTEGER
            The order of the matrix dA.  N >= 0.

    @param[in,out]
    dA      REAL array on the GPU, dimension (LDDA,N)
            On entry, the symmetric matrix dA.  If UPLO = MagmaUpper, the leading
            N-by-N upper triangular part of dA contains the upper
            triangular part of the matrix dA, and the strictly lower
            triangular part of dA is not referenced.  If UPLO = MagmaLower, the
            leading N-by-N lower triangular part of dA contains the lower
            triangular part of the matrix dA, and the strictly upper
            triangular part of dA is not referenced.
    \n
            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization dA = U**T * U or dA = L * L**T.

    @param[in]
    ldda     INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalescent memory accesses LDDA must be
            divisible by 16.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.

    @ingroup magma_sposv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_spotrf2_mgpu(int num_gpus, magma_uplo_t uplo, magma_int_t m, magma_int_t n,
                   magma_int_t off_i, magma_int_t off_j, magma_int_t nb,
                   float **d_lA,  magma_int_t ldda,
                   float **d_lP,  magma_int_t lddp,
                   float *A,      magma_int_t lda,   magma_int_t h,
                   magma_queue_t stream[][3], magma_event_t event[][5],
                   magma_int_t *info )
{
#define Alo(i, j)  (A +             ((j)+off_j)*lda  + (nb*(((i)/nb)%h)+off_i))
#define Aup(i, j)  (A + (nb*(((j)/nb)%h)+off_j)*lda  +               (i+off_i))

#define  dlA(id, i, j)    (d_lA[(id)] + (j)*ldda + (i))
#define  dlP(id, i, j, k) (d_lP[(id)] + (k)*nb*lddp + (j)*lddp + (i))
#define dlPT(id, i, j, k) (d_lP[(id)] + (k)*nb*lddp + (j)*nb   + (i))

    magma_int_t     j, jb, nb0, nb2, dd, d, id, j_local, j_local2, buf;
    float c_one     = MAGMA_S_ONE;
    float c_neg_one = MAGMA_S_NEG_ONE;
    float          d_one     =  1.0;
    float          d_neg_one = -1.0;
    int upper = (uplo == MagmaUpper);
    float *dlpanel;
    //magma_event_t event0[MagmaMaxGPUs], // syrk
    //            event1[MagmaMaxGPUs], // send off-diagonal
    //            event2[MagmaMaxGPUs], // send diagonal
    //            event3[MagmaMaxGPUs]; // trsm
    magma_int_t n_local[MagmaMaxGPUs], ldpanel;
    int stream0 = 0, stream1 = 1;
    #ifdef STRSM_WORK
    float *d_dinvA[MagmaMaxGPUs][2], *d_x[MagmaMaxGPUs][2]; /* used by strsm_work */
    #endif
    
    *info = 0;
    if (! upper && uplo != MagmaLower) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (!upper && num_gpus*ldda < max(1,n)) {
        *info = -4;
    } else if (upper && ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    for( d=0; d < num_gpus; d++ ) {
        /* local-n and local-ld */
        if (upper) {
            n_local[d] = ((n/nb)/num_gpus)*nb;
            if (d < (n/nb)%num_gpus)
                n_local[d] += nb;
            else if (d == (n/nb)%num_gpus)
                n_local[d] += n%nb;
        } else {
            n_local[d] = ((m/nb)/num_gpus)*nb;
            if (d < (m/nb)%num_gpus)
                n_local[d] += nb;
            else if (d == (m/nb)%num_gpus)
                n_local[d] += m%nb;
        }
        //magma_setdevice(d);
        //magma_event_create( &event0[d] );
        //magma_event_create( &event1[d] );
        //magma_event_create( &event2[d] );
        //magma_event_create( &event3[d] );
    }
    magma_setdevice(0);

    /* == initialize the trace */
    trace_init( 1, num_gpus, 3, (magma_queue_t*)stream );

    /* Use blocked code. */
    if (upper) {
        /* ---------------------------------------------- */
        /* Upper-triangular case                          */
        /* > Compute the Cholesky factorization A = U'*U. */
        /* ---------------------------------------------- */
        
#if defined(PRECISION_d) && defined(STRSM_WORK)
        /* invert the diagonals
         * Allocate device memory for the inversed diagonal blocks, size=m*NB
         */
        for( d=0; d < num_gpus; d++ ) {
            magma_setdevice(d);
            for( j=0; j < 2; j++ ) {
                magma_smalloc( &d_dinvA[d][j], nb*nb );
                magma_smalloc( &d_x[d][j],      n*nb );
                cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(float));
                cudaMemset(d_x[d][j],     0,  n*nb*sizeof(float));
            }
        }
        magma_setdevice(0);
#endif
        
        for (j=0; j < m; j += nb) {
            /* Set the GPU number that holds the current panel */
            id  = (j/nb)%num_gpus;
            buf = (j/nb)%num_gpus;
            
            /* Set the local index where the current panel is */
            j_local = j/(nb*num_gpus);
            jb = min(nb, (m-j));
            
            if ( j > 0 ) {
                /* needed on pluto... */
                magma_setdevice(id);
                magma_queue_sync( stream[id][stream0] ); // wait for the column on CPU

                /* broadcast off-diagonal column to all gpus */
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd < num_gpus; dd++ ) {
                    if ( d != id ) {
                        magma_setdevice(d);
                
                        /* wait for it on CPU */
                        magma_queue_wait_event( stream[d][stream0], event[id][1] );
                
                        /* send it to GPU */
                        trace_gpu_start( d, stream0, "comm", "rows to GPUs" );
                        magma_ssetmatrix_async( j, jb,
                                                Aup(0,j),        lda,
                                                dlP(d,jb,0,buf), lddp,
                                                stream[d][stream0] );
                        trace_gpu_end( d, stream0 );
                        magma_event_record( event[d][1], stream[d][stream0] );
                    }
                    d = (d+1)%num_gpus;
                }
            }
            
            /* Update the current diagonal block */
            magma_setdevice(id);
            if ( j > 0 ) {
                magmablasSetKernelStream(stream[id][stream1]);
                trace_gpu_start( id, stream1, "syrk", "syrk" );
                magma_ssyrk(MagmaUpper, MagmaTrans, jb, j,
                            d_neg_one, dlA(id, 0, nb*j_local), ldda,
                            d_one,     dlA(id, j, nb*j_local), ldda);
                trace_gpu_end( id, stream1 );
                magma_event_record( event[id][0], stream[id][stream1] );
            }

            /* send the diagonal to cpu */
            magma_queue_wait_event( stream[id][stream0], event[id][0] ); // wait for syrk
            trace_gpu_start( id, stream0, "comm", "D to CPU" );
            magma_sgetmatrix_async( jb, jb,
                                    dlA(id, j, nb*j_local), ldda,
                                    Aup(j,j),               lda,
                                    stream[id][stream0] );
            trace_gpu_end( id, stream0 );

            if ( j > 0 ) {
                /* Compute the local block column of the panel. */
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd < num_gpus; dd++ ) {
                    j_local2 = j_local+1;
                    if ( d > id ) j_local2 --;
                    nb0 = nb*j_local2;
                
                    if ( n_local[d] > nb0 ) {
                        /* wait for the off-diagonal */
                        if ( d != id ) {
                            //magma_queue_sync( stream[id][3] );
                            dlpanel = dlP(d, jb, 0, buf);
                            ldpanel = lddp;
                
                            /* wait for the offdiagonal column */
                            magma_queue_wait_event( stream[d][stream1], event[d][1] );
                        } else {
                            dlpanel = dlA(d, 0, nb*j_local);
                            ldpanel = ldda;
                        }
                        
                        /* update the panel */
                        magma_setdevice(d);
                        magmablasSetKernelStream(stream[d][stream1]);
                        trace_gpu_start( d, stream1, "gemm", "gemm" );
                        magma_sgemm(MagmaTrans, MagmaNoTrans,
                                    jb, n_local[d]-nb0, j,
                                    c_neg_one, dlpanel,        ldpanel,
                                               dlA(d, 0, nb0), ldda,
                                    c_one,     dlA(d, j, nb0), ldda);
                        trace_gpu_end( d, stream1 );
                    }
                    d = (d+1)%num_gpus;
                }
            }
            
            /* factor the diagonal */
            magma_setdevice(id);
            magma_queue_sync( stream[id][stream0] ); // wait for the diagonal
            trace_cpu_start( 0, "getrf", "getrf" );
            lapackf77_spotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info);
            trace_cpu_end( 0 );
            if (*info != 0) {
                *info = *info + j;
                break;
            }

            /* send the diagonal to gpus */
            if ( (j+jb) < n) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd < num_gpus; dd++ ) {
                    magma_setdevice(d);
                    if ( d == id ) {
                        dlpanel = dlA(d, j, nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlP(d, 0, 0, buf);
                        ldpanel = lddp;
                    }
                    
                    trace_gpu_start( d, stream0, "comm", "D to GPUs" );
                    magma_ssetmatrix_async( jb, jb,
                                            Aup(j,j), lda,
                                            dlpanel,  ldpanel,
                                            stream[d][stream0] );
                    trace_gpu_end( d, stream0 );
                    magma_event_record( event[d][2], stream[d][stream0] );
                    d = (d+1)%num_gpus;
                }
            } else {
                magma_setdevice(id);
                trace_gpu_start( id, stream0, "comm", "D to GPUs" );
                magma_ssetmatrix_async( jb, jb,
                                        Aup(j,j),               lda,
                                        dlA(id, j, nb*j_local), ldda,
                                        stream[id][stream0] );
                trace_gpu_end( id, stream0 );
            }
            
            /* panel-factorize the off-diagonal */
            if ( (j+jb) < n) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd < num_gpus; dd++ ) {
                    /* next column */
                    j_local2 = j_local+1;
                    if ( d > id ) j_local2--;
                    if ( d == id ) {
                        dlpanel = dlA(d, j, nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlP(d, 0, 0, buf);
                        ldpanel = lddp;
                    }
                    nb2 = n_local[d]-nb*j_local2;
                    nb0 = min(nb, nb2 );
                    
                    magma_setdevice(d);
                    magmablasSetKernelStream(stream[d][stream1]);
                    magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for the diagonal
                    if ( j+jb < m && d == (j/nb+1)%num_gpus ) {
                        /* owns the next column, look-ahead the column */
                        trace_gpu_start( d, stream1, "trsm", "trsm" );
#if defined(PRECISION_d) && defined(STRSM_WORK)
                        magmablas_strsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                                              jb, nb0, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, j, nb*j_local2), ldda,
                                              d_dinvA[d][0], d_x[d][0] );
                        /*nb2 = n_local[d] - j_local2*nb;
                        magmablas_strsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                                              jb, nb2, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, j, nb*j_local2), ldda,
                                              d_dinvA[d], d_x[d] ); */
#else
                        /*nb2 = n_local[d] - j_local2*nb;
                        magma_strsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel,                ldda,
                                     dlA(d, j, nb*j_local2), ldda);
                        */
                        magma_strsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                                     jb, nb0, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, j, nb*j_local2), ldda);
#endif
                        trace_gpu_end( d, stream1 );
                        magma_event_record( event[d][3], stream[d][stream1] );
                        
                        /* send the column to cpu */
                        if ( j+jb < m ) {
                            trace_gpu_start( d, stream0, "comm", "rows to CPU" );
                            magma_queue_wait_event( stream[d][stream0], event[d][3] ); // wait for lookahead
                            magma_sgetmatrix_async( (j+jb), nb0,
                                                    dlA(d, 0, nb*j_local2), ldda,
                                                    Aup(0,j+jb),            lda,
                                                    stream[d][stream0] );
                            trace_gpu_end( d, stream0 );
                            magma_event_record( event[d][1], stream[d][stream0] );
                        }
                        
                        /* update the remaining blocks */
                        nb2 = nb2 - nb0;
#if defined(PRECISION_d) && defined(STRSM_WORK)
                        magmablas_strsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                                              jb, nb2, c_one,
                                              dlpanel,                    ldpanel,
                                              dlA(d, j, nb*j_local2+nb0), ldda,
                                              d_dinvA[d][1], d_x[d][1] );
#else
                        magma_strsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel,                    ldpanel,
                                     dlA(d, j, nb*j_local2+nb0), ldda);
#endif
                    } else if ( nb2 > 0 ) {
                        /* update the entire trailing matrix */
                        trace_gpu_start( d, stream1, "trsm", "trsm" );
#if defined(PRECISION_d) && defined(STRSM_WORK)
                        magmablas_strsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                                              jb, nb2, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, j, nb*j_local2), ldda,
                                    d_dinvA[d][1], d_x[d][1] );
#else
                        magma_strsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, j, nb*j_local2), ldda);
#endif
                        trace_gpu_end( d, stream1 );
                    }
                    d = (d+1)%num_gpus;
                }
            } /* end of strsm */
        } /* end of for j=1, .., n */
    } else {
        /* -------------------------------------------- */
        /* Lower-triangular case                        */
        /* Compute the Cholesky factorization A = L*L'. */
        /* -------------------------------------------- */
#if defined(PRECISION_d) && defined(STRSM_WORK)
        /*
         * Allocate device memory for the inversed diagonal blocks, size=N*BLOCK_SIZE
         */
        for( d=0; d < num_gpus; d++ ) {
            magma_setdevice(d);
            for( j=0; j < 2; j++ ) {
                magma_smalloc( &d_dinvA[d][j], nb*nb );
                magma_smalloc( &d_x[d][j],     nb*m  );
                cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(float));
                cudaMemset(d_x[d][j],     0, nb* m*sizeof(float));
            }
        }
        magma_setdevice(0);
#endif

        for (j=0; j < n; j += nb) {
            /* Set the GPU number that holds the current panel */
            id  = (j/nb)%num_gpus;
            buf = (j/nb)%num_gpus;
            
            /* Set the local index where the current panel is */
            j_local = j/(nb*num_gpus);
            jb = min(nb, (n-j));
            
            if ( j > 0 ) {
                /* needed on pluto... */
                magma_setdevice(id);
                magma_queue_sync( stream[id][stream0] ); // wait for the column on CPU

                /* broadcast offdiagonal row to all gpus */
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd < num_gpus; dd++ ) {
                    if ( d != id ) {
                        magma_setdevice(d);
                        /* wait for it on CPU */
                        magma_queue_wait_event( stream[d][stream0], event[id][1] );
            
                        /* send it to GPU */
                        magma_ssetmatrix_async( jb, j,
                                                Alo(j,0),         lda,
                                                dlPT(d,0,jb,buf), nb,
                                                stream[d][stream0] );
                        magma_event_record( event[d][1], stream[d][stream0] );
                    }
                    d = (d+1)%num_gpus;
                }
            }

            /* Update the current diagonal block */
            magma_setdevice(id);
            if ( j > 0 ) {
                magmablasSetKernelStream(stream[id][stream1]);
                magma_ssyrk(MagmaLower, MagmaNoTrans, jb, j,
                            d_neg_one, dlA(id, nb*j_local, 0), ldda,
                            d_one,     dlA(id, nb*j_local, j), ldda);
                magma_event_record( event[id][0], stream[id][stream1] );
            }
            
            /* send the diagonal to cpu */
            magma_queue_wait_event( stream[id][stream0], event[id][0] ); // wait for syrk
            magma_sgetmatrix_async( jb, jb,
                                    dlA(id, nb*j_local, j), ldda,
                                    Alo(j,j),               lda,
                                    stream[id][stream0] );

            /* update the offdiagonal blocks */
            if ( j > 0 ) {
                /* compute the block-rows of the panel */
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd < num_gpus; dd++ ) {
                    j_local2 = j_local+1;
                    if ( d > id ) j_local2 --;
                    nb0 = nb*j_local2;
            
                    if ( nb0 < n_local[d] ) {
                        if ( d != id ) {
                            dlpanel = dlPT(d, 0, jb, buf);
                            ldpanel = nb;
            
                            /* wait for offdiagonal row */
                            magma_queue_wait_event( stream[d][stream1], event[d][1] );
                        } else {
                            dlpanel = dlA(d, nb*j_local, 0);
                            ldpanel = ldda;
                        }
            
                        magma_setdevice(d);
                        magmablasSetKernelStream(stream[d][stream1]);
                        magma_sgemm( MagmaNoTrans, MagmaTrans,
                                     n_local[d]-nb0, jb, j,
                                     c_neg_one, dlA(d, nb0, 0), ldda,
                                                dlpanel,        ldpanel,
                                     c_one,     dlA(d, nb0, j), ldda);
                    }
                    d = (d+1)%num_gpus;
                }
            }

            /* factor the diagonal */
            magma_setdevice(id);
            magma_queue_sync( stream[id][stream0] );
            lapackf77_spotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info);
            if (*info != 0) {
                *info = *info + j;
                break;
            }

            /* send the diagonal to gpus */
            if ( (j+jb) < m ) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd < num_gpus; dd++ ) {
                    magma_setdevice(d);
                    if ( d == id ) {
                        dlpanel = dlA(d, nb*j_local, j);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlPT(d, 0, 0, buf);
                        ldpanel = nb;
                    }
                    magma_ssetmatrix_async( jb, jb,
                                            Alo(j,j), lda,
                                            dlpanel,  ldpanel,
                                            stream[d][stream0] );
                    magma_event_record( event[d][2], stream[d][stream0] );
                    d = (d+1)%num_gpus;
                }
            } else {
                magma_setdevice(id);
                magma_ssetmatrix_async( jb, jb,
                                        Alo(j,j),               lda,
                                        dlA(id, nb*j_local, j), ldda,
                                        stream[id][stream0] );
            }

            /* factorize off-diagonal blocks */
            if ( (j+jb) < m ) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd < num_gpus; dd++ ) {
                    /* next column */
                    j_local2 = j_local+1;
                    if ( d > id ) j_local2--;
                    if ( d == id ) {
                        dlpanel = dlA(d, nb*j_local, j);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlPT(d, 0, 0, buf);
                        ldpanel = nb;
                    }
                    nb2 = n_local[d] - j_local2*nb;
                    nb0 = min(nb, nb2 );
            
                    magma_setdevice(d);
                    magmablasSetKernelStream(stream[d][stream1]);
                    magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for the diagonal
                    if ( j+jb < n && d == (j/nb+1)%num_gpus ) {
                        /* owns the next column, look-ahead the column */
#if defined(PRECISION_d) && defined(STRSM_WORK)
                        magmablas_strsm_work( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit,
                                              nb0, jb, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, nb*j_local2, j), ldda,
                                              d_dinvA[d][0], d_x[d][0]);
#else
                        magma_strsm( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit,
                                     nb0, jb, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, nb*j_local2, j), ldda);
#endif
                        magma_event_record( event[d][3], stream[d][stream1] );

                        /* send the column to cpu */
                        if ( j+jb < n ) {
                            magma_queue_wait_event( stream[d][stream0], event[d][3] ); // wait for lookahead
                            magma_sgetmatrix_async( nb0, j+jb,
                                                    dlA(d, nb*j_local2, 0), ldda,
                                                    Alo(j+jb,0),            lda,
                                                    stream[d][stream0] );
                            magma_event_record( event[d][1], stream[d][stream0] );
                        }

                        /* update the remaining blocks */
                        nb2 = nb2 - nb0;
#if defined(PRECISION_d) && defined(STRSM_WORK)
                        magmablas_strsm_work( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit,
                                              nb2, jb, c_one,
                                              dlpanel,                    ldpanel,
                                              dlA(d, nb*j_local2+nb0, j), ldda,
                                              d_dinvA[d][1], d_x[d][1] );
#else
                        magma_strsm( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit,
                                     nb2, jb, c_one,
                                     dlpanel,                    ldpanel,
                                     dlA(d, nb*j_local2+nb0, j), ldda);
#endif
                    } else if ( nb2 > 0 ) {
                        /* update the entire trailing matrix */
#if defined(PRECISION_d) && defined(STRSM_WORK)
                        magmablas_strsm_work( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit,
                                              nb2, jb, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, nb*j_local2, j), ldda,
                                              d_dinvA[d][1], d_x[d][1] );
#else
                        magma_strsm( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit,
                                     nb2, jb, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, nb*j_local2, j), ldda);
#endif
                    }
                    d = (d+1)%num_gpus;
                }
            }
        }
    } /* end of else not upper */

    /* == finalize the trace == */
    trace_finalize( "spotrf.svg", "trace.css" );

    /* clean up */
    for( d=0; d < num_gpus; d++ ) {
        magma_setdevice(d);
        magma_queue_sync( stream[d][0] );
        magma_queue_sync( stream[d][1] );
        magmablasSetKernelStream(NULL);

        //magma_event_destroy( event0[d] );
        //magma_event_destroy( event1[d] );
        //magma_event_destroy( event2[d] );
        //magma_event_destroy( event3[d] );
    }
    magma_setdevice(0);

    return *info;
} /* magma_spotrf_mgpu */
Example #24
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing magma_dsymm_mgpu
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    double c_neg_one = MAGMA_D_NEG_ONE;
    double alpha     = MAGMA_D_MAKE( 3.456, 5.678 );
    double beta      = MAGMA_D_MAKE( 1.234, 2.456 );
    
    real_Double_t    gflops, gpu_perf=0., cpu_perf=0., gpu_time=0., cpu_time=0.;
    real_Double_t    gpu_perf2=0., gpu_time2=0.;
    double           Anorm, error, work[1];
    double *hA, *hB, *hC, *hR;
    magmaDouble_ptr dA[MagmaMaxGPUs], dB[MagmaMaxGPUs], dC[MagmaMaxGPUs], dwork[MagmaMaxGPUs];
    magmaDouble_ptr dA2;
    magma_int_t i, j, dev, M, N, size, lda, ldb, ldc, ldda, lddb, lddc, msize, nb;
    magma_int_t ione     = 1;
    magma_int_t iseed[4] = {0,0,0,1};
    magma_int_t status = 0;
    
    magma_opts opts;
    opts.parse_opts( argc, argv );
    opts.ngpu = abs( opts.ngpu );  // always uses multi-GPU code
    
    double tol = opts.tolerance * lapackf77_dlamch("E");
    
    // default values
    nb = (opts.nb > 0 ? opts.nb : 64);
    
    magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2];
    magma_int_t ncmplx = 0;
    magma_buildconnection_mgpu( gnode, &ncmplx, opts.ngpu );
    
    printf("%% Initializing communication pattern... GPU-ncmplx %d\n", (int) ncmplx);
    for (i=0; i < ncmplx; ++i) {
        magma_int_t myngpu = gnode[i][MagmaMaxGPUs];
        printf("%% cmplx %d has %d GPUs:", i, myngpu);
        for (j=0; j < myngpu; ++j) {
            printf(" %d", (int) gnode[i][j]);
            if (j < myngpu-1) {
                printf(",");
            }
        }
        printf("\n");
    }

    // number of queues per GPU. Requires ngpu.
    magma_int_t nqueue  = opts.ngpu;
    // number of events per GPU. Require ngpu*ngpu.
    magma_int_t nevents = opts.ngpu*opts.ngpu;
    magma_queue_t queues[MagmaMaxGPUs][20], queues0[MagmaMaxGPUs];
    magma_event_t events[MagmaMaxGPUs][MagmaMaxGPUs*MagmaMaxGPUs + 10];
    for( dev = 0; dev < opts.ngpu; ++dev ) {
        magma_setdevice( dev );
        for( i = 0; i < nqueue; ++i ) {
            magma_queue_create( dev, &queues[dev][i] );
        }
        queues0[dev] = queues[dev][0];
        for( i = 0; i < nevents; ++i ) {
            cudaEventCreateWithFlags( &events[dev][i], cudaEventDisableTiming );
        }
    }

    printf("%% nb %d, ngpu %d, version %d\n", (int) nb, (int) opts.ngpu, (int) opts.version );
    printf("%%   M     N    nb offset  CPU Gflop/s (sec)   GPU Gflop/s (sec)   CUBLAS hemm (sec)   ||R|| / ||A||*||B||\n");
    printf("%%========================================================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
      M = opts.msize[itest];
      N = opts.nsize[itest];
      for( int offset = 0; offset < N; offset += min(N,nb) ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            msize = M - offset;
            lda   = M;  // TODO depends on side
            ldb   = M;
            ldc   = M;
            ldda  = magma_roundup( lda, opts.align );  // multiple of 32 by default
            lddb  = magma_roundup( ldb, opts.align );  // multiple of 32 by default
            lddc  = magma_roundup( ldc, opts.align );  // multiple of 32 by default
            gflops = FLOPS_DSYMM( MagmaLeft, (double)msize, (double)N ) / 1e9;
            
            magma_int_t dworksiz = lddc*N + (M*N)*opts.ngpu;
            
            TESTING_MALLOC_CPU( hA, double, lda*M );
            TESTING_MALLOC_CPU( hB, double, ldb*N );
            TESTING_MALLOC_CPU( hC, double, ldc*N );
            
            TESTING_MALLOC_PIN( hR, double, ldc*N );

            for( dev = 0; dev < opts.ngpu; ++dev ) {
                magma_int_t mlocal = ((M / nb) / opts.ngpu + 1) * nb;
                magma_setdevice( dev );
                TESTING_MALLOC_DEV( dA[dev],    double, ldda*mlocal );
                TESTING_MALLOC_DEV( dB[dev],    double, lddb*N      );
                TESTING_MALLOC_DEV( dC[dev],    double, lddc*N      );
                TESTING_MALLOC_DEV( dwork[dev], double, dworksiz    );
            }
            
            if ( opts.check ) {
                magma_setdevice( 0 );
                TESTING_MALLOC_DEV( dA2, double, ldda*M );
            }

            size = lda*M;
            lapackf77_dlarnv( &ione, iseed, &size, hA );
            magma_dmake_symmetric( M, hA, lda );
            
            size = ldb*N;
            lapackf77_dlarnv( &ione, iseed, &size, hB );
            size = ldc*N;
            lapackf77_dlarnv( &ione, iseed, &size, hC );
            lapackf77_dlacpy( "Full", &M, &N, hC, &ldc, hR, &lda );
            
            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            magma_dsetmatrix_1D_col_bcyclic( M, M, hA, lda, dA, ldda, opts.ngpu, nb, queues0 );
            for( dev = 0; dev < opts.ngpu; ++dev ) {
                magma_setdevice( dev );
                magma_dsetmatrix( M, N, hB, lda, dB[dev], ldda, opts.queue );
                // since when offset != 0, the GPU that does beta*C may not be 0,
                // send initial hC to all GPUs.
                magma_dsetmatrix( M, N, hC, lda, dC[dev], ldda, opts.queue );
            }
            
            trace_init( 1, opts.ngpu, nqueue, (magma_queue_t*) queues );
            
            gpu_time = magma_sync_wtime(0);
            
            magmablas_dsymm_mgpu(
                MagmaLeft, MagmaLower, msize, N,
                alpha, dA, ldda, offset,
                       dB, ldda,
                beta,  dC, ldda, dwork, dworksiz,
                opts.ngpu, nb, queues, nqueue, events, nevents, gnode, ncmplx);
            
            gpu_time = magma_sync_wtime(0) - gpu_time;
            gpu_perf = gflops / gpu_time;
            
            #ifdef TRACING
            char buf[80];
            snprintf( buf, sizeof(buf), "dsymm-m%d-n%d-nb%d-ngpu%d-run%d.svg",
                      (int) M, (int) N, (int) nb, (int) opts.ngpu, (int) iter );
            trace_finalize( buf, "trace.css" );
            #endif
            
            /* ====================================================================
               Performs operation using CUBLAS
               =================================================================== */
            if ( opts.check && iter == 0 ) {
                magma_setdevice( 0 );
                magma_dsetmatrix( M, M, hA, lda, dA2, ldda, opts.queue );
                magma_dsetmatrix( M, N, hB, lda, dB[0], ldda, opts.queue );
                magma_dsetmatrix( M, N, hC, lda, dwork[0], ldda, opts.queue );
                
                gpu_time2 = magma_sync_wtime(0);
                magma_dsymm(
                    MagmaLeft, MagmaLower, msize, N,
                    alpha, dA2 + offset + offset*ldda, ldda,
                           dB[0],    ldda,
                    beta,  dwork[0], ldda, opts.queue );
                gpu_time2 = magma_sync_wtime(0) - gpu_time2;
                gpu_perf2 = gflops / gpu_time2;
            }
            
            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            if ( opts.check ) {
                // store ||A||*||B||
                Anorm  = lapackf77_dlange("fro", &msize, &msize, hA + offset + offset*lda, &lda, work );
                Anorm *= lapackf77_dlange("fro", &msize, &N, hB, &lda, work );
                
                //printf( "A =" ); magma_dprint( M, M, hA, lda );
                //printf( "B =" ); magma_dprint( M, N, hB, lda );
                //printf( "C =" ); magma_dprint( M, N, hC, lda );
                
                cpu_time = magma_wtime();
                blasf77_dsymm( "Left", "Lower", &msize, &N,
                                &alpha, hA + offset + offset*lda, &lda,
                                        hB, &lda,
                                &beta,  hC, &lda );
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
                
                for (dev=0; dev < opts.ngpu; ++dev) {
                    magma_setdevice( dev );
                    magma_dgetmatrix( M, N, dC[dev], ldda, hR, lda, opts.queue );
                    
                    // compute relative error ||R||/||A||*||B||, where R := C_magma - C_lapack = R - C
                    size = ldc*N;
                    blasf77_daxpy( &size, &c_neg_one, hC, &ione, hR, &ione );
                    error = lapackf77_dlange("fro", &msize, &N, hR, &lda, work) / Anorm;
                    
                    //printf( "R ="  ); magma_dprint( M, N, hR, lda );
                    bool okay = (error < tol);
                    status += ! okay;
                    if (dev == 0) {
                        printf( "%5d %5d %5d %5d   %7.1f (%7.4f)   %7.1f (%7.4f)   %7.1f (%7.4f)   %8.2e   %s\n",
                                (int) M, (int) N, (int) nb, (int) offset,
                                cpu_perf, cpu_time,
                                gpu_perf, gpu_time,
                                gpu_perf2, gpu_time2,
                                error, (okay ? "ok" : "failed") );
                    }
                    else {
                        printf( "    dev %d %74s  %8.2e   %s\n", dev, "",
                                error, (okay ? "ok" : "failed") );
                    }
                }
            } else {
                printf( "%5d %5d %5d %5d     ---   (  ---  )   %7.1f (%7.4f)     ---   (  ---  )   ---\n",
                        (int) M, (int) N, (int) nb, (int) offset,
                        gpu_perf, gpu_time );
            }
            
            TESTING_FREE_CPU( hA );
            TESTING_FREE_CPU( hB );
            TESTING_FREE_CPU( hC );
            
            TESTING_FREE_PIN( hR );
            
            for( dev = 0; dev < opts.ngpu; ++dev ) {
                magma_setdevice( dev );
                TESTING_FREE_DEV( dA[dev]    );
                TESTING_FREE_DEV( dB[dev]    );
                TESTING_FREE_DEV( dC[dev]    );
                TESTING_FREE_DEV( dwork[dev] );
            }
            
            if ( opts.check ) {
                magma_setdevice( 0 );
                TESTING_FREE_DEV( dA2 );
            }
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
      }  // offset
      printf( "\n" );
    }

    for( dev = 0; dev < opts.ngpu; ++dev ) {
        magma_setdevice( dev );
        for( i = 0; i < nqueue; ++i ) {
            magma_queue_destroy( queues[dev][i] );
        }
        for( i = 0; i < nevents; ++i ) {
            magma_event_destroy( events[dev][i] );
        }
    }
    
    opts.cleanup();
    TESTING_FINALIZE();
    return status;
}