예제 #1
0
extern "C" magma_int_t
magma_zpotrf3_mgpu(magma_int_t num_gpus, char uplo, magma_int_t m, magma_int_t n,
                   magma_int_t off_i, magma_int_t off_j, magma_int_t nb,
                   magmaDoubleComplex *d_lA[],  magma_int_t ldda,
                   magmaDoubleComplex *d_lP[],  magma_int_t lddp,
                   magmaDoubleComplex *a,       magma_int_t lda, magma_int_t h,
                   magma_queue_t stream[][3], magma_event_t event[][5],
                   magma_int_t *info )
{
    /*  -- MAGMA (version 1.4.0) --
           Univ. of Tennessee, Knoxville
           Univ. of California, Berkeley
           Univ. of Colorado, Denver
           August 2013

        Purpose
        =======
        ZPOTRF computes the Cholesky factorization of a complex Hermitian
        positive definite matrix dA.
        Auxiliary subroutine for zpotrf2_ooc. It is multiple gpu interface to compute
        Cholesky of a "rectangular" matrix.

        The factorization has the form
           dA = U**H * U,  if UPLO = 'U', or
           dA = L  * L**H,  if UPLO = 'L',
        where U is an upper triangular matrix and L is lower triangular.

        This is the block version of the algorithm, calling Level 3 BLAS.

        Arguments
        =========
        UPLO    (input) CHARACTER*1
                = 'U':  Upper triangle of dA is stored;
                = 'L':  Lower triangle of dA is stored.

        N       (input) INTEGER
                The order of the matrix dA.  N >= 0.

        dA      (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N)
                On entry, the Hermitian matrix dA.  If UPLO = 'U', the leading
                N-by-N upper triangular part of dA contains the upper
                triangular part of the matrix dA, and the strictly lower
                triangular part of dA is not referenced.  If UPLO = 'L', the
                leading N-by-N lower triangular part of dA contains the lower
                triangular part of the matrix dA, and the strictly upper
                triangular part of dA is not referenced.

                On exit, if INFO = 0, the factor U or L from the Cholesky
                factorization dA = U**H * U or dA = L * L**H.

        LDDA     (input) INTEGER
                The leading dimension of the array dA.  LDDA >= max(1,N).
                To benefit from coalescent memory accesses LDDA must be
                dividable by 16.

        INFO    (output) INTEGER
                = 0:  successful exit
                < 0:  if INFO = -i, the i-th argument had an illegal value
                > 0:  if INFO = i, the leading minor of order i is not
                      positive definite, and the factorization could not be
                      completed.
        =====================================================================   */


    magma_int_t     j, jb, nb0, nb2, d, dd, id, j_local, j_local2, buf;
    char            uplo_[2] = {uplo, 0};
    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    double          d_one     =  1.0;
    double          d_neg_one = -1.0;
    int upper = lapackf77_lsame(uplo_, "U");
    magmaDoubleComplex *dlpanel;
    magma_int_t n_local[MagmaMaxGPUs], ldpanel;
    const magma_int_t stream1 = 0, stream2 = 1, stream3 = 2;
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK)
    /* used by ztrsm_work */
    int trsm_nb = 128;
    int trsm_n = trsm_nb*((nb+trsm_nb-1)/trsm_nb);
    magmaDoubleComplex *d_dinvA[MagmaMaxGPUs];
    magmaDoubleComplex *d_x[MagmaMaxGPUs];
#define dinvA(d,j) &(d_dinvA[(d)][(j)*trsm_nb*trsm_n])
#define dx(d,j) &(d_x[(d)][(j)*nb*m])
    /*
     * Allocate device memory for the inversed diagonal blocks, size=N*BLOCK_SIZE
     */
    for( d=0; d<num_gpus; d++ ) {
        magma_setdevice(d);
        if ( (MAGMA_SUCCESS != magma_zmalloc( &d_dinvA[d], 2*trsm_nb*trsm_n )) ||
                (MAGMA_SUCCESS != magma_zmalloc( &d_x[d],     2*nb*(upper ? n : m) )) ) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
    }
    magma_setdevice(0);
#endif

    *info = 0;
    if ( (! upper) && (! lapackf77_lsame(uplo_, "L")) ) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (!upper && num_gpus*ldda < max(1,n)) {
        *info = -4;
    } else if (upper && ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* initialization */
    for( d=0; d<num_gpus; d++ ) {
        /* local-n and local-ld */
        if (upper) {
            n_local[d] = (n/(nb*num_gpus))*nb;
            if (d < (n/nb)%num_gpus)
                n_local[d] += nb;
            else if (d == (n/nb)%num_gpus)
                n_local[d] += n%nb;
        } else {
            n_local[d] = (m/(nb*num_gpus))*nb;
            if (d < (m/nb)%num_gpus)
                n_local[d] += nb;
            else if (d == (m/nb)%num_gpus)
                n_local[d] += m%nb;
        }
    }

    /* == initialize the trace */
    trace_init( 1, num_gpus, 3, (CUstream_st**)stream );

    if (upper)
    {
        /* ---------------------------------------------- */
        /* Upper-triangular case                          */
        /* > Compute the Cholesky factorization A = U'*U. */
        /* ---------------------------------------------- */
        for (j=0; j<m; j+=nb) {

            /* Set the GPU number that holds the current panel */
            id  = (j/nb)%num_gpus;
            buf = (j/nb)%num_gpus; // right now, we have num_gpu buffers, so id and buf are the same..

            /* Set the local index where the current panel is */
            j_local = j/(nb*num_gpus);
            jb = min(nb, (m-j));

            /* Update the current diagonal block on stream1 */
            magma_setdevice(id);
            if( j > 0 ) {
                magmablasSetKernelStream(stream[id][stream1]);
                trace_gpu_start( id, stream1, "syrk", "syrk" );
                magma_zherk(MagmaUpper, MagmaConjTrans, jb, j,
                            d_neg_one, dlA(id, 0, nb*j_local), ldda,
                            d_one,     dlA(id, j, nb*j_local), ldda);
                trace_gpu_end( id, stream1 );
            }

            /* send the diagonal to cpu on stream1 */
            trace_gpu_start( id, stream1, "comm", "D to CPU" );
            magma_zgetmatrix_async( jb, jb,
                                    dlA(id, j, nb*j_local), ldda,
                                    Aup(j,j),               lda,
                                    stream[id][stream1] );
            trace_gpu_end( id, stream1 );

            /* update off-diagonal blocks in the panel */
            if( j > 0 ) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    j_local2 = j_local+1;
                    if( d > id ) j_local2 --;
                    nb0 = nb*j_local2; // number of local columns in the panel, while jb is panel-size (number of rows)

                    if( n_local[d] > nb0 ) {
                        magma_setdevice(d);
                        magmablasSetKernelStream(stream[d][stream2]);
                        if( d == id ) {
                            dlpanel = dlA(d,0,nb*j_local);
                            ldpanel = ldda;
                            // the GPU owns the row from start, and no need of synch.
                            //magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu
                        } else {
                            dlpanel = dlP(d,nb,0,buf);
                            ldpanel = lddp;
                            magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu
                        }
                        trace_gpu_start( d, stream2, "gemm", "gemm" );
                        magma_zgemm(MagmaConjTrans, MagmaNoTrans,
                                    jb, n_local[d]-nb0, j,
                                    c_neg_one, dlpanel,        ldpanel,
                                    dlA(d, 0, nb0), ldda,
                                    c_one,     dlA(d, j, nb0), ldda);
                        trace_gpu_end( d, stream2 );
                        magma_event_record( event[d][2], stream[d][stream2] );
                    }
                    d = (d+1)%num_gpus;
                }
            }

            /* wait for panel and factorize it on cpu */
            magma_setdevice(id);
            magma_queue_sync( stream[id][stream1] );
            trace_cpu_start( 0, "getrf", "getrf" );
            lapackf77_zpotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info);
            trace_cpu_end( 0 );
            if (*info != 0) {
                *info = *info + j;
                break;
            }

            /* send the diagonal to gpus on stream1 */
            if ( (j+jb) < n) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    if( d == id ) {
                        dlpanel = dlA(d, j, nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlP(d,0,0,buf);
                        ldpanel = lddp;
                    }
                    magma_setdevice(d);
                    trace_gpu_start( d, stream1, "comm", "comm" );
                    magma_zsetmatrix_async( jb, jb,
                                            Aup(j,j), lda,
                                            dlpanel,  ldpanel,
                                            stream[d][stream1] );
                    trace_gpu_end( d, stream1 );
                    magma_event_record( event[d][1], stream[d][stream1] );
                    d = (d+1)%num_gpus;
                }
            } else {
                magma_setdevice(id);
                trace_gpu_start( id, stream1, "comm", "comm" );
                magma_zsetmatrix_async( jb, jb,
                                        Aup(j,j),               lda,
                                        dlA(id, j, nb*j_local), ldda,
                                        stream[id][stream1] );
                trace_gpu_end( id, stream1 );
            }

            /* panel-factorize the off-diagonal */
            if ( (j+jb) < n) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    /* next column */
                    j_local2 = j_local+1;
                    if( d > id ) j_local2--;
                    if( d == id ) {
                        dlpanel = dlA(d,j,nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlP(d,0,0,buf);
                        ldpanel = lddp;
                    }
                    nb2 = n_local[d] - j_local2*nb;

                    magma_setdevice(d);
                    if( j+jb < m && d == (j/nb+1)%num_gpus ) {
                        /* owns the next column, look-ahead next block on stream1 */
                        nb0 = min(nb, nb2);
                        magmablasSetKernelStream(stream[d][stream1]);
                        magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for gemm update
                        trace_gpu_start( d, stream1, "trsm", "trsm" );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK)
                        magmablas_zlaset( MagmaUpperLower, trsm_nb, trsm_n, dinvA(d,0),trsm_nb );
                        magmablas_zlaset( MagmaUpperLower, nb0,jb, dx(d,0),nb0 );
                        magmablas_ztrsm_work( MagmaLeft, MagmaUpper,
                                              MagmaConjTrans, MagmaNonUnit,
                                              jb, nb0, c_one,
                                              dlpanel, ldpanel,
                                              dlA(d, j, nb*j_local2), ldda,
                                              1, dinvA(d,0), dx(d,0) );
#else
                        magma_ztrsm( MagmaLeft, MagmaUpper,
                                     MagmaConjTrans, MagmaNonUnit,
                                     jb, nb0, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, j, nb*j_local2), ldda);
#endif
                        magma_event_record( event[d][4], stream[d][stream1] );
                        trace_gpu_end( d, stream1 );
                    } else if( nb2 > 0 ) {
                        /* update all the blocks on stream2 */
                        magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for cholesky factor
                        trace_gpu_start( d, stream2, "trsm", "trsm" );
                        magmablasSetKernelStream(stream[d][stream2]);
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK)
                        magmablas_zlaset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,0),trsm_nb );
                        magmablas_zlaset( MagmaUpperLower, nb2,jb, dx(d,0),nb2 );
                        magmablas_ztrsm_work( MagmaLeft, MagmaUpper,
                                              MagmaConjTrans, MagmaNonUnit,
                                              jb, nb2, c_one,
                                              dlpanel, ldpanel,
                                              dlA(d, j, nb*j_local2), ldda,
                                              1, dinvA(d,0), dx(d,0) );
#else
                        magma_ztrsm( MagmaLeft, MagmaUpper,
                                     MagmaConjTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, j, nb*j_local2), ldda);
#endif
                        trace_gpu_end( d, stream2 );
                    }
                    d = (d+1)%num_gpus;
                } /* end of for */

                /* ========================================================== */
                if( j+jb < m ) {
                    d = (j/nb+1)%num_gpus;
                    /* next column */
                    j_local2 = j_local+1;
                    if( d > id ) j_local2--;
                    nb0 = min(nb, n_local[d]-nb*j_local2 );

                    /* even on 1 gpu, off-diagonals are copied to cpu (synchronize at the end).      *
                     * so we have the Cholesky factor, but only diagonal submatrix of the big panel, *
                     * on cpu at the end.                                                            */
                    int d2, buf2;
                    magma_setdevice(d);
                    /* lookahead done */
                    magma_queue_wait_event( stream[d][stream3], event[d][4] );

                    trace_gpu_start( d, stream3, "comm", "row to CPU" );
                    magma_zgetmatrix_async( (j+jb), nb0,
                                            dlA(d, 0, nb*j_local2), ldda,
                                            Aup(0,j+jb),            lda,
                                            stream[d][stream3] );
                    trace_gpu_end( d, stream3 );
                    magma_event_record( event[d][3], stream[d][stream3] );
                    /* needed on pluto */
                    //magma_queue_sync( stream[d][stream3] );

                    /* broadcast rows to gpus on stream2 */
                    buf2 = ((j+jb)/nb)%num_gpus;
                    for( d2=0; d2<num_gpus; d2++ ) {
                        if( d2 != d )
                        {
                            magma_setdevice(d2);
                            trace_gpu_start( d2, stream3, "comm", "row to GPUs" );
                            magma_queue_wait_event( stream[d2][stream3], event[d][3] ); // rows arrived at cpu on stream3
                            magma_zsetmatrix_async( j+jb, nb0,
                                                    Aup(0,j+jb),       lda,
                                                    dlP(d2,nb,0,buf2), lddp,
                                                    stream[d2][stream3] );
                            trace_gpu_end( d2, stream3 );
                            magma_event_record( event[d2][0], stream[d2][stream3] );
                        }
                    }

                    /* =========================== */
                    /* update the remaining blocks */
                    nb2 = n_local[d]-(nb*j_local2 + nb0);
                    if( nb2 > 0 ) {
                        if( d == id ) {
                            dlpanel = dlA(d, j, nb*j_local);
                            ldpanel = ldda;
                        } else {
                            dlpanel = dlP(d,0,0,buf);
                            ldpanel = lddp;
                        }
                        magma_setdevice(d);
                        magmablasSetKernelStream(stream[d][stream2]);
                        trace_gpu_start( d, stream2, "trsm", "trsm" );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK)
                        int flag = 0;
                        if (flag == 0) {
                            magma_queue_wait_event( stream[d][stream2], event[d][4] ); // lookahead -> diagonal inversion
                        } else {
                            magmablas_zlaset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,flag),trsm_nb );
                            magma_queue_wait_event( stream[d][stream2], event[d][1] ); // panel received
                        }
                        magmablas_zlaset( MagmaUpperLower, nb2,jb, dx(d,1),nb2 );
                        magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                              jb, nb2, c_one,
                                              dlpanel, ldpanel,
                                              dlA(d, j, nb*j_local2+nb0), ldda,
                                              flag, dinvA(d,flag), dx(d,1) );
#else
                        magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for cholesky factor
                        magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel, ldpanel,
                                     dlA(d, j, nb*j_local2+nb0), ldda);
#endif
                        trace_gpu_end( d, stream2 );
                    }
                }
            } /* end of ztrsm */
        } /* end of for j=1, .., n */
    } else {

        /* ---------------------------------------------- */
        /* Lower-triangular case                          */
        /* > Compute the Cholesky factorization A = L*L'. */
        /* ---------------------------------------------- */
        for (j=0; j<n; j+=nb) {

            /* Set the GPU number that holds the current panel */
            id  = (j/nb)%num_gpus;
            buf = (j/nb)%num_gpus;

            /* Set the local index where the current panel is */
            j_local = j/(nb*num_gpus);
            jb = min(nb, (n-j));

            /* Update the current diagonal block on stream1 */
            magma_setdevice(id);
            if( j > 0 ) {
                magmablasSetKernelStream(stream[id][stream1]);
                magma_zherk(MagmaLower, MagmaNoTrans, jb, j,
                            d_neg_one, dlA(id, nb*j_local, 0), ldda,
                            d_one,     dlA(id, nb*j_local, j), ldda);
            }

            /* send the diagonal to cpu on stream1 */
            magma_zgetmatrix_async( jb, jb,
                                    dlA(id, nb*j_local, j), ldda,
                                    Alo(j,j),               lda,
                                    stream[id][stream1] );

            /* update off-diagonal blocks of the panel */
            if( j > 0 ) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    j_local2 = j_local+1;
                    if( d > id ) j_local2 --;
                    nb0 = nb*j_local2;

                    if( nb0 < n_local[d] ) {
                        magma_setdevice(d);
                        magmablasSetKernelStream(stream[d][stream2]);
                        if( d == id ) {
                            dlpanel = dlA(d, nb*j_local, 0);
                            ldpanel = ldda;
                        } else {
                            dlpanel = dlPT(d,0,nb,buf);
                            ldpanel = nb;
                            magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu
                        }
                        magma_zgemm( MagmaNoTrans, MagmaConjTrans,
                                     n_local[d]-nb0, jb, j,
                                     c_neg_one, dlA(d, nb0, 0), ldda,
                                     dlpanel,        ldpanel,
                                     c_one,     dlA(d, nb0, j), ldda);
                        magma_event_record( event[d][2], stream[d][stream2] );
                    }
                    d = (d+1)%num_gpus;
                }
            }

            /* wait for the panel and factorized it on cpu */
            magma_setdevice(id);
            magma_queue_sync( stream[id][stream1] );
            lapackf77_zpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info);
            if (*info != 0) {
                *info = *info + j;
                break;
            }

            /* send the diagonal to gpus on stream1 */
            if ( (j+jb) < m) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    if( d == id ) {
                        dlpanel = dlA(d, nb*j_local, j);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlPT(d, 0, 0, buf);
                        ldpanel = nb;
                    }
                    magma_setdevice(d);
                    magma_zsetmatrix_async( jb, jb,
                                            Alo(j,j), lda,
                                            dlpanel,  ldpanel,
                                            stream[d][stream1] );
                    magma_event_record( event[d][1], stream[d][stream1] );
                    d = (d+1)%num_gpus;
                }
            } else {
                magma_setdevice(id);
                magma_zsetmatrix_async( jb, jb,
                                        Alo(j,j),               lda,
                                        dlA(id, nb*j_local, j), ldda,
                                        stream[id][stream1] );
            }

            /* panel factorize the off-diagonal */
            if ( (j+jb) < m) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    /* next column */
                    j_local2 = j_local+1;
                    if( d > id ) j_local2--;
                    if( d == id ) {
                        dlpanel = dlA(d, nb*j_local, j);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlPT(d, 0, 0, buf);
                        ldpanel = nb;
                    }
                    nb2 = n_local[d] - j_local2*nb;
                    nb0 = min(nb, nb2);

                    magma_setdevice(d);
                    if( j+nb < n && d == (j/nb+1)%num_gpus ) { /* owns next column, look-ahead next block on stream1 */
                        if ( j > 0 ) magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for gemm update
                        magmablasSetKernelStream(stream[d][stream1]);
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK)
                        magmablas_zlaset( MagmaUpperLower, trsm_nb, trsm_n, dinvA(d,0),trsm_nb );
                        magmablas_zlaset( MagmaUpperLower, nb0,jb, dx(d,0),nb0 );
                        magmablas_ztrsm_work( MagmaRight, MagmaLower,
                                              MagmaConjTrans, MagmaNonUnit,
                                              nb0, jb, c_one,
                                              dlpanel, ldpanel,
                                              dlA(d, nb*j_local2, j), ldda,
                                              1, dinvA(d,0), dx(d,0) );
#else
                        magma_ztrsm( MagmaRight, MagmaLower,
                                     MagmaConjTrans, MagmaNonUnit,
                                     nb0, jb, c_one,
                                     dlpanel, ldpanel,
                                     dlA(d, nb*j_local2, j), ldda);
#endif
                        magma_event_record( event[d][4], stream[d][stream1] );
                    } else if( nb2 > 0 ) { /* other gpus updating all the blocks on stream2 */
                        /* update the entire column */
                        magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for the cholesky factor
                        magmablasSetKernelStream(stream[d][stream2]);
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK)
                        magmablas_zlaset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,0),trsm_nb );
                        magmablas_zlaset( MagmaUpperLower, nb2,jb, dx(d,0),nb2 );
                        magmablas_ztrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                              nb2, jb, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, nb*j_local2, j), ldda,
                                              1, dinvA(d,0), dx(d,0) );
#else
                        magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                     nb2, jb, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, nb*j_local2, j), ldda);
#endif
                    }
                    d = (d+1)%num_gpus;
                } /* end for d */

                /* ========================================================== */
                if( j+jb < n ) {
                    d = (j/nb+1)%num_gpus;
                    /* next column */
                    j_local2 = j_local+1;
                    if( d > id ) j_local2--;
                    nb0 = min(nb, n_local[d]-nb*j_local2 );

                    /* even on 1 gpu, we copy off-diagonal to cpu (but don't synchronize).  */
                    /* so we have the Cholesky factor on cpu at the end.                    */
                    int d2, buf2;
//#define ZPOTRF_DEVICE_TO_DEVICE
#ifdef ZPOTRF_DEVICE_TO_DEVICE
                    // lookahead done

                    /* broadcast the rows to gpus */
                    buf2 = ((j+jb)/nb)%num_gpus;
                    for( d2=0; d2<num_gpus; d2++ ) {
                        magma_setdevice(d2);
                        magma_queue_wait_event( stream[d2][stream3], event[d][4] );
                        if( d2 != d ) {
                            magma_zcopymatrix_async( nb0, j+jb,
                                                     dlPT(d2,0,nb,buf2), nb, // first nbxnb reserved for diagonal block
                                                     dlA(d, nb*j_local2, 0), ldda,
                                                     stream[d2][stream3] );
                            magma_event_record( event[d2][0], stream[d2][stream3] );
                        } else {
                            magma_zgetmatrix_async( nb0, j+jb,
                                                    dlA(d, nb*j_local2, 0), ldda,
                                                    Alo(j+jb,0),            lda,
                                                    stream[d][stream3] );
                        }
                    }
#else
                    // lookahead done
                    magma_setdevice(d);
                    magma_queue_wait_event( stream[d][stream3], event[d][4] );
                    magma_zgetmatrix_async( nb0, j+jb,
                                            dlA(d, nb*j_local2, 0), ldda,
                                            Alo(j+jb,0),            lda,
                                            stream[d][stream3] );
                    magma_event_record( event[d][3], stream[d][stream3] );
                    /* syn on rows on CPU, seem to be needed on Pluto */
                    //magma_queue_sync( stream[d][stream3] );

                    /* broadcast the rows to gpus */
                    buf2 = ((j+jb)/nb)%num_gpus;
                    for( d2=0; d2<num_gpus; d2++ ) {
                        if( d2 != d )
                        {
                            magma_setdevice(d2);
                            magma_queue_wait_event( stream[d2][stream3], event[d][3] ); // getmatrix done
                            magma_zsetmatrix_async( nb0, j+jb,
                                                    Alo(j+jb,0),        lda,
                                                    dlPT(d2,0,nb,buf2), nb, // first nbxnb reserved for diagonal block
                                                    stream[d2][stream3] );
                            magma_event_record( event[d2][0], stream[d2][stream3] );
                        }
                    }
#endif
                    /* =================================== */
                    /* updates remaining blocks on stream2 */
                    nb2 = n_local[d] - (j_local2*nb + nb0);
                    if( nb2 > 0 ) {
                        if( d == id ) {
                            dlpanel = dlA(d, nb*j_local, j);
                            ldpanel = ldda;
                        } else {
                            dlpanel = dlPT(d,0,0,buf);
                            ldpanel = nb;
                        }
                        magma_setdevice(d);
                        magmablasSetKernelStream(stream[d][stream2]);
                        /* update the remaining blocks in the column */
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK)
                        int flag = 0;
                        if (flag == 0) {
                            magma_queue_wait_event( stream[d][stream2], event[d][4] ); // lookahead -> diagonal inversion
                        } else {
                            magmablas_zlaset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,flag),trsm_nb );
                            magma_queue_wait_event( stream[d][stream2], event[d][1] ); // panel received
                        }
                        magmablas_zlaset( MagmaUpperLower, nb2,jb, dx(d,1),nb2 );
                        magmablas_ztrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                              nb2, jb, c_one,
                                              dlpanel,                    ldpanel,
                                              dlA(d, nb*j_local2+nb0, j), ldda,
                                              flag, dinvA(d,flag), dx(d,1) );
#else
                        magma_queue_wait_event( stream[d][stream2], event[d][1] ); // panel received
                        magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                     nb2, jb, c_one,
                                     dlpanel,                    ldpanel,
                                     dlA(d, nb*j_local2+nb0, j), ldda);
#endif
                    }
                }
            }
        }
    } /* end of else not upper */

    /* == finalize the trace == */
    trace_finalize( "zpotrf.svg","trace.css" );
    for( d=0; d<num_gpus; d++ ) {
        magma_setdevice(d);
        for( j=0; j<3; j++ ) {
            magma_queue_sync( stream[d][j] );
        }
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK)
        magma_free( d_dinvA[d] );
        magma_free( d_x[d] );
#endif
        magmablasSetKernelStream(NULL);
    }
    magma_setdevice(0);

    return *info;
} /* magma_zpotrf_mgpu */
예제 #2
0
/**
    Purpose
    -------
    SPOTRF computes the Cholesky factorization of a real symmetric
    positive definite matrix dA.

    The factorization has the form
       dA = U**T * U,   if UPLO = MagmaUpper, or
       dA = L  * L**T,  if UPLO = MagmaLower,
    where U is an upper triangular matrix and L is lower triangular.

    This is the block version of the algorithm, calling Level 3 BLAS.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of dA is stored;
      -     = MagmaLower:  Lower triangle of dA is stored.

    @param[in]
    n       INTEGER
            The order of the matrix dA.  N >= 0.

    @param[in,out]
    dA      REAL array on the GPU, dimension (LDDA,N)
            On entry, the symmetric matrix dA.  If UPLO = MagmaUpper, the leading
            N-by-N upper triangular part of dA contains the upper
            triangular part of the matrix dA, and the strictly lower
            triangular part of dA is not referenced.  If UPLO = MagmaLower, the
            leading N-by-N lower triangular part of dA contains the lower
            triangular part of the matrix dA, and the strictly upper
            triangular part of dA is not referenced.
    \n
            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization dA = U**T * U or dA = L * L**T.

    @param[in]
    ldda     INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalescent memory accesses LDDA must be
            divisible by 16.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.

    @ingroup magma_sposv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_spotrf2_mgpu(int num_gpus, magma_uplo_t uplo, magma_int_t m, magma_int_t n,
                   magma_int_t off_i, magma_int_t off_j, magma_int_t nb,
                   float **d_lA,  magma_int_t ldda,
                   float **d_lP,  magma_int_t lddp,
                   float *A,      magma_int_t lda,   magma_int_t h,
                   magma_queue_t stream[][3], magma_event_t event[][5],
                   magma_int_t *info )
{
#define Alo(i, j)  (A +             ((j)+off_j)*lda  + (nb*(((i)/nb)%h)+off_i))
#define Aup(i, j)  (A + (nb*(((j)/nb)%h)+off_j)*lda  +               (i+off_i))

#define  dlA(id, i, j)    (d_lA[(id)] + (j)*ldda + (i))
#define  dlP(id, i, j, k) (d_lP[(id)] + (k)*nb*lddp + (j)*lddp + (i))
#define dlPT(id, i, j, k) (d_lP[(id)] + (k)*nb*lddp + (j)*nb   + (i))

    magma_int_t     j, jb, nb0, nb2, dd, d, id, j_local, j_local2, buf;
    float c_one     = MAGMA_S_ONE;
    float c_neg_one = MAGMA_S_NEG_ONE;
    float          d_one     =  1.0;
    float          d_neg_one = -1.0;
    int upper = (uplo == MagmaUpper);
    float *dlpanel;
    //magma_event_t event0[MagmaMaxGPUs], // syrk
    //            event1[MagmaMaxGPUs], // send off-diagonal
    //            event2[MagmaMaxGPUs], // send diagonal
    //            event3[MagmaMaxGPUs]; // trsm
    magma_int_t n_local[MagmaMaxGPUs], ldpanel;
    int stream0 = 0, stream1 = 1;
    #ifdef STRSM_WORK
    float *d_dinvA[MagmaMaxGPUs][2], *d_x[MagmaMaxGPUs][2]; /* used by strsm_work */
    #endif
    
    *info = 0;
    if (! upper && uplo != MagmaLower) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (!upper && num_gpus*ldda < max(1,n)) {
        *info = -4;
    } else if (upper && ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    for( d=0; d < num_gpus; d++ ) {
        /* local-n and local-ld */
        if (upper) {
            n_local[d] = ((n/nb)/num_gpus)*nb;
            if (d < (n/nb)%num_gpus)
                n_local[d] += nb;
            else if (d == (n/nb)%num_gpus)
                n_local[d] += n%nb;
        } else {
            n_local[d] = ((m/nb)/num_gpus)*nb;
            if (d < (m/nb)%num_gpus)
                n_local[d] += nb;
            else if (d == (m/nb)%num_gpus)
                n_local[d] += m%nb;
        }
        //magma_setdevice(d);
        //magma_event_create( &event0[d] );
        //magma_event_create( &event1[d] );
        //magma_event_create( &event2[d] );
        //magma_event_create( &event3[d] );
    }
    magma_setdevice(0);

    /* == initialize the trace */
    trace_init( 1, num_gpus, 3, (magma_queue_t*)stream );

    /* Use blocked code. */
    if (upper) {
        /* ---------------------------------------------- */
        /* Upper-triangular case                          */
        /* > Compute the Cholesky factorization A = U'*U. */
        /* ---------------------------------------------- */
        
#if defined(PRECISION_d) && defined(STRSM_WORK)
        /* invert the diagonals
         * Allocate device memory for the inversed diagonal blocks, size=m*NB
         */
        for( d=0; d < num_gpus; d++ ) {
            magma_setdevice(d);
            for( j=0; j < 2; j++ ) {
                magma_smalloc( &d_dinvA[d][j], nb*nb );
                magma_smalloc( &d_x[d][j],      n*nb );
                cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(float));
                cudaMemset(d_x[d][j],     0,  n*nb*sizeof(float));
            }
        }
        magma_setdevice(0);
#endif
        
        for (j=0; j < m; j += nb) {
            /* Set the GPU number that holds the current panel */
            id  = (j/nb)%num_gpus;
            buf = (j/nb)%num_gpus;
            
            /* Set the local index where the current panel is */
            j_local = j/(nb*num_gpus);
            jb = min(nb, (m-j));
            
            if ( j > 0 ) {
                /* needed on pluto... */
                magma_setdevice(id);
                magma_queue_sync( stream[id][stream0] ); // wait for the column on CPU

                /* broadcast off-diagonal column to all gpus */
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd < num_gpus; dd++ ) {
                    if ( d != id ) {
                        magma_setdevice(d);
                
                        /* wait for it on CPU */
                        magma_queue_wait_event( stream[d][stream0], event[id][1] );
                
                        /* send it to GPU */
                        trace_gpu_start( d, stream0, "comm", "rows to GPUs" );
                        magma_ssetmatrix_async( j, jb,
                                                Aup(0,j),        lda,
                                                dlP(d,jb,0,buf), lddp,
                                                stream[d][stream0] );
                        trace_gpu_end( d, stream0 );
                        magma_event_record( event[d][1], stream[d][stream0] );
                    }
                    d = (d+1)%num_gpus;
                }
            }
            
            /* Update the current diagonal block */
            magma_setdevice(id);
            if ( j > 0 ) {
                magmablasSetKernelStream(stream[id][stream1]);
                trace_gpu_start( id, stream1, "syrk", "syrk" );
                magma_ssyrk(MagmaUpper, MagmaTrans, jb, j,
                            d_neg_one, dlA(id, 0, nb*j_local), ldda,
                            d_one,     dlA(id, j, nb*j_local), ldda);
                trace_gpu_end( id, stream1 );
                magma_event_record( event[id][0], stream[id][stream1] );
            }

            /* send the diagonal to cpu */
            magma_queue_wait_event( stream[id][stream0], event[id][0] ); // wait for syrk
            trace_gpu_start( id, stream0, "comm", "D to CPU" );
            magma_sgetmatrix_async( jb, jb,
                                    dlA(id, j, nb*j_local), ldda,
                                    Aup(j,j),               lda,
                                    stream[id][stream0] );
            trace_gpu_end( id, stream0 );

            if ( j > 0 ) {
                /* Compute the local block column of the panel. */
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd < num_gpus; dd++ ) {
                    j_local2 = j_local+1;
                    if ( d > id ) j_local2 --;
                    nb0 = nb*j_local2;
                
                    if ( n_local[d] > nb0 ) {
                        /* wait for the off-diagonal */
                        if ( d != id ) {
                            //magma_queue_sync( stream[id][3] );
                            dlpanel = dlP(d, jb, 0, buf);
                            ldpanel = lddp;
                
                            /* wait for the offdiagonal column */
                            magma_queue_wait_event( stream[d][stream1], event[d][1] );
                        } else {
                            dlpanel = dlA(d, 0, nb*j_local);
                            ldpanel = ldda;
                        }
                        
                        /* update the panel */
                        magma_setdevice(d);
                        magmablasSetKernelStream(stream[d][stream1]);
                        trace_gpu_start( d, stream1, "gemm", "gemm" );
                        magma_sgemm(MagmaTrans, MagmaNoTrans,
                                    jb, n_local[d]-nb0, j,
                                    c_neg_one, dlpanel,        ldpanel,
                                               dlA(d, 0, nb0), ldda,
                                    c_one,     dlA(d, j, nb0), ldda);
                        trace_gpu_end( d, stream1 );
                    }
                    d = (d+1)%num_gpus;
                }
            }
            
            /* factor the diagonal */
            magma_setdevice(id);
            magma_queue_sync( stream[id][stream0] ); // wait for the diagonal
            trace_cpu_start( 0, "getrf", "getrf" );
            lapackf77_spotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info);
            trace_cpu_end( 0 );
            if (*info != 0) {
                *info = *info + j;
                break;
            }

            /* send the diagonal to gpus */
            if ( (j+jb) < n) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd < num_gpus; dd++ ) {
                    magma_setdevice(d);
                    if ( d == id ) {
                        dlpanel = dlA(d, j, nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlP(d, 0, 0, buf);
                        ldpanel = lddp;
                    }
                    
                    trace_gpu_start( d, stream0, "comm", "D to GPUs" );
                    magma_ssetmatrix_async( jb, jb,
                                            Aup(j,j), lda,
                                            dlpanel,  ldpanel,
                                            stream[d][stream0] );
                    trace_gpu_end( d, stream0 );
                    magma_event_record( event[d][2], stream[d][stream0] );
                    d = (d+1)%num_gpus;
                }
            } else {
                magma_setdevice(id);
                trace_gpu_start( id, stream0, "comm", "D to GPUs" );
                magma_ssetmatrix_async( jb, jb,
                                        Aup(j,j),               lda,
                                        dlA(id, j, nb*j_local), ldda,
                                        stream[id][stream0] );
                trace_gpu_end( id, stream0 );
            }
            
            /* panel-factorize the off-diagonal */
            if ( (j+jb) < n) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd < num_gpus; dd++ ) {
                    /* next column */
                    j_local2 = j_local+1;
                    if ( d > id ) j_local2--;
                    if ( d == id ) {
                        dlpanel = dlA(d, j, nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlP(d, 0, 0, buf);
                        ldpanel = lddp;
                    }
                    nb2 = n_local[d]-nb*j_local2;
                    nb0 = min(nb, nb2 );
                    
                    magma_setdevice(d);
                    magmablasSetKernelStream(stream[d][stream1]);
                    magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for the diagonal
                    if ( j+jb < m && d == (j/nb+1)%num_gpus ) {
                        /* owns the next column, look-ahead the column */
                        trace_gpu_start( d, stream1, "trsm", "trsm" );
#if defined(PRECISION_d) && defined(STRSM_WORK)
                        magmablas_strsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                                              jb, nb0, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, j, nb*j_local2), ldda,
                                              d_dinvA[d][0], d_x[d][0] );
                        /*nb2 = n_local[d] - j_local2*nb;
                        magmablas_strsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                                              jb, nb2, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, j, nb*j_local2), ldda,
                                              d_dinvA[d], d_x[d] ); */
#else
                        /*nb2 = n_local[d] - j_local2*nb;
                        magma_strsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel,                ldda,
                                     dlA(d, j, nb*j_local2), ldda);
                        */
                        magma_strsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                                     jb, nb0, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, j, nb*j_local2), ldda);
#endif
                        trace_gpu_end( d, stream1 );
                        magma_event_record( event[d][3], stream[d][stream1] );
                        
                        /* send the column to cpu */
                        if ( j+jb < m ) {
                            trace_gpu_start( d, stream0, "comm", "rows to CPU" );
                            magma_queue_wait_event( stream[d][stream0], event[d][3] ); // wait for lookahead
                            magma_sgetmatrix_async( (j+jb), nb0,
                                                    dlA(d, 0, nb*j_local2), ldda,
                                                    Aup(0,j+jb),            lda,
                                                    stream[d][stream0] );
                            trace_gpu_end( d, stream0 );
                            magma_event_record( event[d][1], stream[d][stream0] );
                        }
                        
                        /* update the remaining blocks */
                        nb2 = nb2 - nb0;
#if defined(PRECISION_d) && defined(STRSM_WORK)
                        magmablas_strsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                                              jb, nb2, c_one,
                                              dlpanel,                    ldpanel,
                                              dlA(d, j, nb*j_local2+nb0), ldda,
                                              d_dinvA[d][1], d_x[d][1] );
#else
                        magma_strsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel,                    ldpanel,
                                     dlA(d, j, nb*j_local2+nb0), ldda);
#endif
                    } else if ( nb2 > 0 ) {
                        /* update the entire trailing matrix */
                        trace_gpu_start( d, stream1, "trsm", "trsm" );
#if defined(PRECISION_d) && defined(STRSM_WORK)
                        magmablas_strsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                                              jb, nb2, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, j, nb*j_local2), ldda,
                                    d_dinvA[d][1], d_x[d][1] );
#else
                        magma_strsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, j, nb*j_local2), ldda);
#endif
                        trace_gpu_end( d, stream1 );
                    }
                    d = (d+1)%num_gpus;
                }
            } /* end of strsm */
        } /* end of for j=1, .., n */
    } else {
        /* -------------------------------------------- */
        /* Lower-triangular case                        */
        /* Compute the Cholesky factorization A = L*L'. */
        /* -------------------------------------------- */
#if defined(PRECISION_d) && defined(STRSM_WORK)
        /*
         * Allocate device memory for the inversed diagonal blocks, size=N*BLOCK_SIZE
         */
        for( d=0; d < num_gpus; d++ ) {
            magma_setdevice(d);
            for( j=0; j < 2; j++ ) {
                magma_smalloc( &d_dinvA[d][j], nb*nb );
                magma_smalloc( &d_x[d][j],     nb*m  );
                cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(float));
                cudaMemset(d_x[d][j],     0, nb* m*sizeof(float));
            }
        }
        magma_setdevice(0);
#endif

        for (j=0; j < n; j += nb) {
            /* Set the GPU number that holds the current panel */
            id  = (j/nb)%num_gpus;
            buf = (j/nb)%num_gpus;
            
            /* Set the local index where the current panel is */
            j_local = j/(nb*num_gpus);
            jb = min(nb, (n-j));
            
            if ( j > 0 ) {
                /* needed on pluto... */
                magma_setdevice(id);
                magma_queue_sync( stream[id][stream0] ); // wait for the column on CPU

                /* broadcast offdiagonal row to all gpus */
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd < num_gpus; dd++ ) {
                    if ( d != id ) {
                        magma_setdevice(d);
                        /* wait for it on CPU */
                        magma_queue_wait_event( stream[d][stream0], event[id][1] );
            
                        /* send it to GPU */
                        magma_ssetmatrix_async( jb, j,
                                                Alo(j,0),         lda,
                                                dlPT(d,0,jb,buf), nb,
                                                stream[d][stream0] );
                        magma_event_record( event[d][1], stream[d][stream0] );
                    }
                    d = (d+1)%num_gpus;
                }
            }

            /* Update the current diagonal block */
            magma_setdevice(id);
            if ( j > 0 ) {
                magmablasSetKernelStream(stream[id][stream1]);
                magma_ssyrk(MagmaLower, MagmaNoTrans, jb, j,
                            d_neg_one, dlA(id, nb*j_local, 0), ldda,
                            d_one,     dlA(id, nb*j_local, j), ldda);
                magma_event_record( event[id][0], stream[id][stream1] );
            }
            
            /* send the diagonal to cpu */
            magma_queue_wait_event( stream[id][stream0], event[id][0] ); // wait for syrk
            magma_sgetmatrix_async( jb, jb,
                                    dlA(id, nb*j_local, j), ldda,
                                    Alo(j,j),               lda,
                                    stream[id][stream0] );

            /* update the offdiagonal blocks */
            if ( j > 0 ) {
                /* compute the block-rows of the panel */
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd < num_gpus; dd++ ) {
                    j_local2 = j_local+1;
                    if ( d > id ) j_local2 --;
                    nb0 = nb*j_local2;
            
                    if ( nb0 < n_local[d] ) {
                        if ( d != id ) {
                            dlpanel = dlPT(d, 0, jb, buf);
                            ldpanel = nb;
            
                            /* wait for offdiagonal row */
                            magma_queue_wait_event( stream[d][stream1], event[d][1] );
                        } else {
                            dlpanel = dlA(d, nb*j_local, 0);
                            ldpanel = ldda;
                        }
            
                        magma_setdevice(d);
                        magmablasSetKernelStream(stream[d][stream1]);
                        magma_sgemm( MagmaNoTrans, MagmaTrans,
                                     n_local[d]-nb0, jb, j,
                                     c_neg_one, dlA(d, nb0, 0), ldda,
                                                dlpanel,        ldpanel,
                                     c_one,     dlA(d, nb0, j), ldda);
                    }
                    d = (d+1)%num_gpus;
                }
            }

            /* factor the diagonal */
            magma_setdevice(id);
            magma_queue_sync( stream[id][stream0] );
            lapackf77_spotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info);
            if (*info != 0) {
                *info = *info + j;
                break;
            }

            /* send the diagonal to gpus */
            if ( (j+jb) < m ) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd < num_gpus; dd++ ) {
                    magma_setdevice(d);
                    if ( d == id ) {
                        dlpanel = dlA(d, nb*j_local, j);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlPT(d, 0, 0, buf);
                        ldpanel = nb;
                    }
                    magma_ssetmatrix_async( jb, jb,
                                            Alo(j,j), lda,
                                            dlpanel,  ldpanel,
                                            stream[d][stream0] );
                    magma_event_record( event[d][2], stream[d][stream0] );
                    d = (d+1)%num_gpus;
                }
            } else {
                magma_setdevice(id);
                magma_ssetmatrix_async( jb, jb,
                                        Alo(j,j),               lda,
                                        dlA(id, nb*j_local, j), ldda,
                                        stream[id][stream0] );
            }

            /* factorize off-diagonal blocks */
            if ( (j+jb) < m ) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd < num_gpus; dd++ ) {
                    /* next column */
                    j_local2 = j_local+1;
                    if ( d > id ) j_local2--;
                    if ( d == id ) {
                        dlpanel = dlA(d, nb*j_local, j);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlPT(d, 0, 0, buf);
                        ldpanel = nb;
                    }
                    nb2 = n_local[d] - j_local2*nb;
                    nb0 = min(nb, nb2 );
            
                    magma_setdevice(d);
                    magmablasSetKernelStream(stream[d][stream1]);
                    magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for the diagonal
                    if ( j+jb < n && d == (j/nb+1)%num_gpus ) {
                        /* owns the next column, look-ahead the column */
#if defined(PRECISION_d) && defined(STRSM_WORK)
                        magmablas_strsm_work( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit,
                                              nb0, jb, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, nb*j_local2, j), ldda,
                                              d_dinvA[d][0], d_x[d][0]);
#else
                        magma_strsm( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit,
                                     nb0, jb, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, nb*j_local2, j), ldda);
#endif
                        magma_event_record( event[d][3], stream[d][stream1] );

                        /* send the column to cpu */
                        if ( j+jb < n ) {
                            magma_queue_wait_event( stream[d][stream0], event[d][3] ); // wait for lookahead
                            magma_sgetmatrix_async( nb0, j+jb,
                                                    dlA(d, nb*j_local2, 0), ldda,
                                                    Alo(j+jb,0),            lda,
                                                    stream[d][stream0] );
                            magma_event_record( event[d][1], stream[d][stream0] );
                        }

                        /* update the remaining blocks */
                        nb2 = nb2 - nb0;
#if defined(PRECISION_d) && defined(STRSM_WORK)
                        magmablas_strsm_work( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit,
                                              nb2, jb, c_one,
                                              dlpanel,                    ldpanel,
                                              dlA(d, nb*j_local2+nb0, j), ldda,
                                              d_dinvA[d][1], d_x[d][1] );
#else
                        magma_strsm( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit,
                                     nb2, jb, c_one,
                                     dlpanel,                    ldpanel,
                                     dlA(d, nb*j_local2+nb0, j), ldda);
#endif
                    } else if ( nb2 > 0 ) {
                        /* update the entire trailing matrix */
#if defined(PRECISION_d) && defined(STRSM_WORK)
                        magmablas_strsm_work( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit,
                                              nb2, jb, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, nb*j_local2, j), ldda,
                                              d_dinvA[d][1], d_x[d][1] );
#else
                        magma_strsm( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit,
                                     nb2, jb, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, nb*j_local2, j), ldda);
#endif
                    }
                    d = (d+1)%num_gpus;
                }
            }
        }
    } /* end of else not upper */

    /* == finalize the trace == */
    trace_finalize( "spotrf.svg", "trace.css" );

    /* clean up */
    for( d=0; d < num_gpus; d++ ) {
        magma_setdevice(d);
        magma_queue_sync( stream[d][0] );
        magma_queue_sync( stream[d][1] );
        magmablasSetKernelStream(NULL);

        //magma_event_destroy( event0[d] );
        //magma_event_destroy( event1[d] );
        //magma_event_destroy( event2[d] );
        //magma_event_destroy( event3[d] );
    }
    magma_setdevice(0);

    return *info;
} /* magma_spotrf_mgpu */
예제 #3
0
extern "C" magma_int_t
magma_zpotrf2_mgpu(int num_gpus, char uplo, magma_int_t m, magma_int_t n,
                   magma_int_t off_i, magma_int_t off_j, magma_int_t nb,
                   magmaDoubleComplex **d_lA,  magma_int_t ldda,
                   magmaDoubleComplex **d_lP,  magma_int_t lddp,
                   magmaDoubleComplex *a,      magma_int_t lda,   magma_int_t h,
                   magma_queue_t stream[][3], magma_event_t event[][5],
                   magma_int_t *info )
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    ZPOTRF computes the Cholesky factorization of a complex Hermitian
    positive definite matrix dA.

    The factorization has the form
       dA = U**H * U,  if UPLO = 'U', or
       dA = L  * L**H,  if UPLO = 'L',
    where U is an upper triangular matrix and L is lower triangular.

    This is the block version of the algorithm, calling Level 3 BLAS.

    Arguments
    =========
    UPLO    (input) CHARACTER*1
            = 'U':  Upper triangle of dA is stored;
            = 'L':  Lower triangle of dA is stored.

    N       (input) INTEGER
            The order of the matrix dA.  N >= 0.

    dA      (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N)
            On entry, the Hermitian matrix dA.  If UPLO = 'U', the leading
            N-by-N upper triangular part of dA contains the upper
            triangular part of the matrix dA, and the strictly lower
            triangular part of dA is not referenced.  If UPLO = 'L', the
            leading N-by-N lower triangular part of dA contains the lower
            triangular part of the matrix dA, and the strictly upper
            triangular part of dA is not referenced.

            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization dA = U**H * U or dA = L * L**H.

    LDDA     (input) INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalescent memory accesses LDDA must be
            dividable by 16.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
            > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.
    =====================================================================   */


    magma_int_t     j, jb, nb0, nb2, dd, d, id, j_local, j_local2, buf;
    char            uplo_[2] = {uplo, 0};
    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    double          d_one     =  1.0;
    double          d_neg_one = -1.0;
    int upper = lapackf77_lsame(uplo_, "U");
    magmaDoubleComplex *dlpanel;
    //magma_event_t event0[MagmaMaxGPUs], // syrk
    //            event1[MagmaMaxGPUs], // send off-diagonal
    //            event2[MagmaMaxGPUs], // send diagonal
    //            event3[MagmaMaxGPUs]; // trsm
    magma_int_t n_local[MagmaMaxGPUs], ldpanel;
    int stream0 = 0, stream1 = 1;
    #ifdef ZTRSM_WORK
    magmaDoubleComplex *d_dinvA[MagmaMaxGPUs][2], *d_x[MagmaMaxGPUs][2]; /* used by ztrsm_work */
    #endif
    
    *info = 0;
    if ( (! upper) && (! lapackf77_lsame(uplo_, "L")) ) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (!upper && num_gpus*ldda < max(1,n)) {
        *info = -4;
    } else if (upper && ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    for( d=0; d<num_gpus; d++ ) {
        /* local-n and local-ld */
        if (upper) {
            n_local[d] = ((n/nb)/num_gpus)*nb;
            if (d < (n/nb)%num_gpus)
                n_local[d] += nb;
            else if (d == (n/nb)%num_gpus)
                n_local[d] += n%nb;
        } else {
            n_local[d] = ((m/nb)/num_gpus)*nb;
            if (d < (m/nb)%num_gpus)
                n_local[d] += nb;
            else if (d == (m/nb)%num_gpus)
                n_local[d] += m%nb;
        }
        //magma_setdevice(d);
        //magma_event_create( &event0[d] );
        //magma_event_create( &event1[d] );
        //magma_event_create( &event2[d] );
        //magma_event_create( &event3[d] );
    }
    magma_setdevice(0);

    /* == initialize the trace */
    trace_init( 1, num_gpus, 3, stream );

    /* Use blocked code. */
    if (upper) {
        /* ---------------------------------------------- */
        /* Upper-triangular case                          */
        /* > Compute the Cholesky factorization A = U'*U. */
        /* ---------------------------------------------- */
        
#if defined(PRECISION_d) && defined(ZTRSM_WORK)
        /* invert the diagonals
         * Allocate device memory for the inversed diagonal blocks, size=m*NB
         */
        for( d=0; d<num_gpus; d++ ) {
            magma_setdevice(d);
            for( j=0; j<2; j++ ) {
                magma_zmalloc( &d_dinvA[d][j], nb*nb );
                magma_zmalloc( &d_x[d][j],      n*nb );
                cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(magmaDoubleComplex));
                cudaMemset(d_x[d][j],     0,  n*nb*sizeof(magmaDoubleComplex));
            }
        }
        magma_setdevice(0);
#endif
        
        for (j=0; j<m; j+=nb) {

            /* Set the GPU number that holds the current panel */
            id  = (j/nb)%num_gpus;
            buf = (j/nb)%num_gpus;
            
            /* Set the local index where the current panel is */
            j_local = j/(nb*num_gpus);
            jb = min(nb, (m-j));
            
            if( j > 0 ) {
                /* needed on pluto... */
                magma_setdevice(id);
                magma_queue_sync( stream[id][stream0] ); // wait for the column on CPU

                /* broadcast off-diagonal column to all gpus */
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    if( d != id ) {
                        magma_setdevice(d);
                
                        /* wait for it on CPU */
                        magma_queue_wait_event( stream[d][stream0], event[id][1] );
                
                        /* send it to GPU */
                        trace_gpu_start( d, stream0, "comm", "rows to GPUs" );
                        magma_zsetmatrix_async( j, jb,
                                                Aup(0,j),        lda,
                                                dlP(d,jb,0,buf), lddp,
                                                stream[d][stream0] );
                        trace_gpu_end( d, stream0 );
                        magma_event_record( event[d][1], stream[d][stream0] );
                    }
                    d = (d+1)%num_gpus;
                }
            }
            
            /* Update the current diagonal block */
            magma_setdevice(id);
            if( j > 0 ) {
                magmablasSetKernelStream(stream[id][stream1]);
                trace_gpu_start( id, stream1, "syrk", "syrk" );
                magma_zherk(MagmaUpper, MagmaConjTrans, jb, j,
                            d_neg_one, dlA(id, 0, nb*j_local), ldda,
                            d_one,     dlA(id, j, nb*j_local), ldda);
                trace_gpu_end( id, stream1 );
                magma_event_record( event[id][0], stream[id][stream1] );
            }

            /* send the diagonal to cpu */
            magma_queue_wait_event( stream[id][stream0], event[id][0] ); // wait for syrk
            trace_gpu_start( id, stream0, "comm", "D to CPU" );
            magma_zgetmatrix_async( jb, jb,
                                    dlA(id, j, nb*j_local), ldda,
                                    Aup(j,j),               lda,
                                    stream[id][stream0] );
            trace_gpu_end( id, stream0 );

            if ( j > 0 ) {

                /* Compute the local block column of the panel. */
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    j_local2 = j_local+1;
                    if( d > id ) j_local2 --;
                    nb0 = nb*j_local2;
                
                    if( n_local[d] > nb0 ) {
                        /* wait for the off-diagonal */
                        if( d != id ) {
                            //magma_queue_sync( stream[id][3] );
                            dlpanel = dlP(d, jb, 0, buf);
                            ldpanel = lddp;
                
                            /* wait for the offdiagonal column */
                            magma_queue_wait_event( stream[d][stream1], event[d][1] );
                        } else {
                            dlpanel = dlA(d, 0, nb*j_local);
                            ldpanel = ldda;
                        }
                        
                        /* update the panel */
                        magma_setdevice(d);
                        magmablasSetKernelStream(stream[d][stream1]);
                        trace_gpu_start( d, stream1, "gemm", "gemm" );
                        magma_zgemm(MagmaConjTrans, MagmaNoTrans,
                                    jb, n_local[d]-nb0, j,
                                    c_neg_one, dlpanel,        ldpanel,
                                               dlA(d, 0, nb0), ldda,
                                    c_one,     dlA(d, j, nb0), ldda);
                        trace_gpu_end( d, stream1 );
                    }
                    d = (d+1)%num_gpus;
                }
            }
            
            /* factor the diagonal */
            magma_setdevice(id);
            magma_queue_sync( stream[id][stream0] ); // wait for the diagonal
            trace_cpu_start( 0, "getrf", "getrf" );
            lapackf77_zpotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info);
            trace_cpu_end( 0 );
            if (*info != 0) {
                *info = *info + j;
                break;
            }

            /* send the diagonal to gpus */
            if ( (j+jb) < n) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    magma_setdevice(d);
                    if( d == id ) {
                        dlpanel = dlA(d, j, nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlP(d, 0, 0, buf);
                        ldpanel = lddp;
                    }
                    
                    trace_gpu_start( d, stream0, "comm", "D to GPUs" );
                    magma_zsetmatrix_async( jb, jb,
                                            Aup(j,j), lda,
                                            dlpanel,  ldpanel,
                                            stream[d][stream0] );
                    trace_gpu_end( d, stream0 );
                    magma_event_record( event[d][2], stream[d][stream0] );
                    d = (d+1)%num_gpus;
                }
            } else {
                magma_setdevice(id);
                trace_gpu_start( id, stream0, "comm", "D to GPUs" );
                magma_zsetmatrix_async( jb, jb,
                                        Aup(j,j),               lda,
                                        dlA(id, j, nb*j_local), ldda,
                                        stream[id][stream0] );
                trace_gpu_end( id, stream0 );
            }
            
            /* panel-factorize the off-diagonal */
            if ( (j+jb) < n) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    /* next column */
                    j_local2 = j_local+1;
                    if( d > id ) j_local2--;
                    if( d == id ) {
                        dlpanel = dlA(d, j, nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlP(d, 0, 0, buf);
                        ldpanel = lddp;
                    }
                    nb2 = n_local[d]-nb*j_local2;
                    nb0 = min(nb, nb2 );
                    
                    magma_setdevice(d);
                    magmablasSetKernelStream(stream[d][stream1]);
                    magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for the diagonal
                    if( j+jb < m && d == (j/nb+1)%num_gpus ) {
                        /* owns the next column, look-ahead the column */
                        trace_gpu_start( d, stream1, "trsm", "trsm" );
#if defined(PRECISION_d) && defined(ZTRSM_WORK)
                        magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                              jb, nb0, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, j, nb*j_local2), ldda,
                                              d_dinvA[d][0], d_x[d][0] );
                        /*nb2 = n_local[d] - j_local2*nb;
                        magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                              jb, nb2, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, j, nb*j_local2), ldda,
                                              d_dinvA[d], d_x[d] );*/
#else
                        /*nb2 = n_local[d] - j_local2*nb;
                        magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel,                ldda,
                                     dlA(d, j, nb*j_local2), ldda);
                        */
                        magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                     jb, nb0, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, j, nb*j_local2), ldda);
#endif
                        trace_gpu_end( d, stream1 );
                        magma_event_record( event[d][3], stream[d][stream1] );
                        
                        /* send the column to cpu */
                        if( j+jb < m ) {
                            trace_gpu_start( d, stream0, "comm", "rows to CPU" );
                            magma_queue_wait_event( stream[d][stream0], event[d][3] ); // wait for lookahead
                            magma_zgetmatrix_async( (j+jb), nb0,
                                                    dlA(d, 0, nb*j_local2), ldda,
                                                    Aup(0,j+jb),            lda,
                                                    stream[d][stream0] );
                            trace_gpu_end( d, stream0 );
                            magma_event_record( event[d][1], stream[d][stream0] );
                        }
                        
                        /* update the remaining blocks */
                        nb2 = nb2 - nb0;
#if defined(PRECISION_d) && defined(ZTRSM_WORK)
                        magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                              jb, nb2, c_one,
                                              dlpanel,                    ldpanel,
                                              dlA(d, j, nb*j_local2+nb0), ldda,
                                              d_dinvA[d][1], d_x[d][1] );
#else
                        magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel,                    ldpanel,
                                     dlA(d, j, nb*j_local2+nb0), ldda);
#endif
                    } else if( nb2 > 0 ) {
                        /* update the entire trailing matrix */
                        trace_gpu_start( d, stream1, "trsm", "trsm" );
#if defined(PRECISION_d) && defined(ZTRSM_WORK)
                        magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                              jb, nb2, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, j, nb*j_local2), ldda,
                                    d_dinvA[d][1], d_x[d][1] );
#else
                        magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, j, nb*j_local2), ldda);
#endif
                        trace_gpu_end( d, stream1 );
                    }
                    d = (d+1)%num_gpus;
                }
            } /* end of ztrsm */
        } /* end of for j=1, .., n */
    } else {
        /* -------------------------------------------- */
        /* Lower-triangular case                        */
        /* Compute the Cholesky factorization A = L*L'. */
        /* -------------------------------------------- */
#if defined(PRECISION_d) && defined(ZTRSM_WORK)
        /*
         * Allocate device memory for the inversed diagonal blocks, size=N*BLOCK_SIZE
         */
        for( d=0; d<num_gpus; d++ ) {
            magma_setdevice(d);
            for( j=0; j<2; j++ ) {
                magma_zmalloc( &d_dinvA[d][j], nb*nb );
                magma_zmalloc( &d_x[d][j],     nb*m  );
                cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(magmaDoubleComplex));
                cudaMemset(d_x[d][j],     0, nb* m*sizeof(magmaDoubleComplex));
            }
        }
        magma_setdevice(0);
#endif

        for (j=0; j<n; j+=nb) {

            /* Set the GPU number that holds the current panel */
            id  = (j/nb)%num_gpus;
            buf = (j/nb)%num_gpus;
            
            /* Set the local index where the current panel is */
            j_local = j/(nb*num_gpus);
            jb = min(nb, (n-j));
            
            if( j > 0 ) {
                /* needed on pluto... */
                magma_setdevice(id);
                magma_queue_sync( stream[id][stream0] ); // wait for the column on CPU

                /* broadcast offdiagonal row to all gpus */
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    if( d != id ) {
                        magma_setdevice(d);
                        /* wait for it on CPU */
                        magma_queue_wait_event( stream[d][stream0], event[id][1] );
            
                        /* send it to GPU */
                        magma_zsetmatrix_async( jb, j,
                                                Alo(j,0),         lda,
                                                dlPT(d,0,jb,buf), nb,
                                                stream[d][stream0] );
                        magma_event_record( event[d][1], stream[d][stream0] );
                    }
                    d = (d+1)%num_gpus;
                }
            }

            /* Update the current diagonal block */
            magma_setdevice(id);
            if( j > 0 ) {
                magmablasSetKernelStream(stream[id][stream1]);
                magma_zherk(MagmaLower, MagmaNoTrans, jb, j,
                            d_neg_one, dlA(id, nb*j_local, 0), ldda,
                            d_one,     dlA(id, nb*j_local, j), ldda);
                magma_event_record( event[id][0], stream[id][stream1] );
            }
            
            /* send the diagonal to cpu */
            magma_queue_wait_event( stream[id][stream0], event[id][0] ); // wait for syrk
            magma_zgetmatrix_async( jb, jb,
                                    dlA(id, nb*j_local, j), ldda,
                                    Alo(j,j),               lda,
                                    stream[id][stream0] );

            /* update the offdiagonal blocks */
            if ( j > 0 ) {
                /* compute the block-rows of the panel */
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    j_local2 = j_local+1;
                    if( d > id ) j_local2 --;
                    nb0 = nb*j_local2;
            
                    if( nb0 < n_local[d] ) {
                        if( d != id ) {
                            dlpanel = dlPT(d, 0, jb, buf);
                            ldpanel = nb;
            
                            /* wait for offdiagonal row */
                            magma_queue_wait_event( stream[d][stream1], event[d][1] );
                        } else {
                            dlpanel = dlA(d, nb*j_local, 0);
                            ldpanel = ldda;
                        }
            
                        magma_setdevice(d);
                        magmablasSetKernelStream(stream[d][stream1]);
                        magma_zgemm( MagmaNoTrans, MagmaConjTrans,
                                     n_local[d]-nb0, jb, j,
                                     c_neg_one, dlA(d, nb0, 0), ldda,
                                                dlpanel,        ldpanel,
                                     c_one,     dlA(d, nb0, j), ldda);
                    }
                    d = (d+1)%num_gpus;
                }
            }

            /* factor the diagonal */
            magma_setdevice(id);
            magma_queue_sync( stream[id][stream0] );
            lapackf77_zpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info);
            if (*info != 0) {
                *info = *info + j;
                break;
            }

            /* send the diagonal to gpus */
            if ( (j+jb) < m ) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    magma_setdevice(d);
                    if( d == id ) {
                        dlpanel = dlA(d, nb*j_local, j);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlPT(d, 0, 0, buf);
                        ldpanel = nb;
                    }
                    magma_zsetmatrix_async( jb, jb,
                                            Alo(j,j), lda,
                                            dlpanel,  ldpanel,
                                            stream[d][stream0] );
                    magma_event_record( event[d][2], stream[d][stream0] );
                    d = (d+1)%num_gpus;
                }
            } else {
                magma_setdevice(id);
                magma_zsetmatrix_async( jb, jb,
                                        Alo(j,j),               lda,
                                        dlA(id, nb*j_local, j), ldda,
                                        stream[id][stream0] );
            }

            /* factorize off-diagonal blocks */
            if ( (j+jb) < m ) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    /* next column */
                    j_local2 = j_local+1;
                    if( d > id ) j_local2--;
                    if( d == id ) {
                        dlpanel = dlA(d, nb*j_local, j);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlPT(d, 0, 0, buf);
                        ldpanel = nb;
                    }
                    nb2 = n_local[d] - j_local2*nb;
                    nb0 = min(nb, nb2 );
            
                    magma_setdevice(d);
                    magmablasSetKernelStream(stream[d][stream1]);
                    magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for the diagonal
                    if( j+jb < n && d == (j/nb+1)%num_gpus ) {
                        /* owns the next column, look-ahead the column */
#if defined(PRECISION_d) && defined(ZTRSM_WORK)
                        magmablas_ztrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                              nb0, jb, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, nb*j_local2, j), ldda,
                                              d_dinvA[d][0], d_x[d][0]);
#else
                        magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                     nb0, jb, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, nb*j_local2, j), ldda);
#endif
                        magma_event_record( event[d][3], stream[d][stream1] );

                        /* send the column to cpu */
                        if( j+jb < n ) {
                            magma_queue_wait_event( stream[d][stream0], event[d][3] ); // wait for lookahead
                            magma_zgetmatrix_async( nb0, j+jb,
                                                    dlA(d, nb*j_local2, 0), ldda,
                                                    Alo(j+jb,0),            lda,
                                                    stream[d][stream0] );
                            magma_event_record( event[d][1], stream[d][stream0] );
                        }

                        /* update the remaining blocks */
                        nb2 = nb2 - nb0;
#if defined(PRECISION_d) && defined(ZTRSM_WORK)
                        magmablas_ztrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                              nb2, jb, c_one,
                                              dlpanel,                    ldpanel,
                                              dlA(d, nb*j_local2+nb0, j), ldda,
                                              d_dinvA[d][1], d_x[d][1] );
#else
                        magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                     nb2, jb, c_one,
                                     dlpanel,                    ldpanel,
                                     dlA(d, nb*j_local2+nb0, j), ldda);
#endif
                    } else if( nb2 > 0 ) {
                        /* update the entire trailing matrix */
#if defined(PRECISION_d) && defined(ZTRSM_WORK)
                        magmablas_ztrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                              nb2, jb, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, nb*j_local2, j), ldda,
                                              d_dinvA[d][1], d_x[d][1] );
#else
                        magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                     nb2, jb, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, nb*j_local2, j), ldda);
#endif
                    }
                    d = (d+1)%num_gpus;
                }
            }
        }
    } /* end of else not upper */

    /* == finalize the trace == */
    trace_finalize( "zpotrf.svg","trace.css" );

    /* clean up */
    for( d=0; d<num_gpus; d++ ) {
        magma_setdevice(d);
        magma_queue_sync( stream[d][0] );
        magma_queue_sync( stream[d][1] );
        magmablasSetKernelStream(NULL);

        //magma_event_destroy( event0[d] );
        //magma_event_destroy( event1[d] );
        //magma_event_destroy( event2[d] );
        //magma_event_destroy( event3[d] );
    }
    magma_setdevice(0);

    return *info;
} /* magma_zpotrf_mgpu */
예제 #4
0
extern "C" magma_int_t
magma_cpotrf2_msub(
    magma_int_t num_subs, magma_int_t num_gpus, magma_uplo_t uplo, magma_int_t m, magma_int_t n, 
    magma_int_t off_i, magma_int_t off_j, magma_int_t nb,
    magmaFloatComplex_ptr *d_lA, size_t d_lA_offset, magma_int_t ldda, 
    magmaFloatComplex_ptr *d_lP, magma_int_t lddp, 
    magmaFloatComplex *a, magma_int_t lda, magma_int_t h,
    magma_queue_t *queues,
    magma_int_t *info )
{
/*  -- clMAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date November 2014

    Purpose   
    =======   
    CPOTRF computes the Cholesky factorization of a complex Hermitian   
    positive definite matrix dA.   

    The factorization has the form   
       dA = U**H * U,  if UPLO = 'U', or   
       dA = L  * L**H,  if UPLO = 'L',   
    where U is an upper triangular matrix and L is lower triangular.   

    This is the block version of the algorithm, calling Level 3 BLAS.   

    Arguments   
    =========   
    UPLO    (input) CHARACTER*1   
            = 'U':  Upper triangle of dA is stored;   
            = 'L':  Lower triangle of dA is stored.   

    N       (input) INTEGER   
            The order of the matrix dA.  N >= 0.   

    dA      (input/output) COMPLEX array on the GPU, dimension (LDDA,N)   
            On entry, the Hermitian matrix dA.  If UPLO = 'U', the leading   
            N-by-N upper triangular part of dA contains the upper   
            triangular part of the matrix dA, and the strictly lower   
            triangular part of dA is not referenced.  If UPLO = 'L', the   
            leading N-by-N lower triangular part of dA contains the lower   
            triangular part of the matrix dA, and the strictly upper   
            triangular part of dA is not referenced.   

            On exit, if INFO = 0, the factor U or L from the Cholesky   
            factorization dA = U**H * U or dA = L * L**H.   

    LDDA     (input) INTEGER   
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalescent memory accesses LDDA must be
            divisible by 16.

    INFO    (output) INTEGER   
            = 0:  successful exit   
            < 0:  if INFO = -i, the i-th argument had an illegal value   
            > 0:  if INFO = i, the leading minor of order i is not   
                  positive definite, and the factorization could not be   
                  completed.   
    =====================================================================   */

    int tot_subs = num_subs*num_gpus;
    magma_int_t     j, jb, nb0, nb2, dd, d, id, j_local, j_local2;
    magmaFloatComplex c_one     = MAGMA_C_ONE;
    magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;
    float          d_one     =  1.0;
    float          d_neg_one = -1.0;
    magmaFloatComplex_ptr dlpanel;
    size_t dlpanel_offset;
    magma_int_t n_local[MagmaMaxSubs * MagmaMaxGPUs], ldpanel;

    // initialize trace
    trace_init(1, num_gpus, 2, queues);

    *info = 0;
    if ( (uplo != MagmaUpper) && (uplo != MagmaLower) ) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if ((uplo != MagmaUpper) && tot_subs*ldda < max(1,n)) {
        *info = -4;
    } else if ((uplo == MagmaUpper) && ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    for (d=0; d<tot_subs; d++) {
        /* local-n and local-ld */
        if (uplo == MagmaUpper) {
            n_local[d] = ((n/nb)/tot_subs)*nb;
            if (d < (n/nb)%tot_subs)
               n_local[d] += nb;
            else if (d == (n/nb)%tot_subs)
              n_local[d] += n%nb;
        } else {
            n_local[d] = ((m/nb)/tot_subs)*nb;
            if (d < (m/nb)%tot_subs)
               n_local[d] += nb;
            else if (d == (m/nb)%tot_subs)
              n_local[d] += m%nb;
        }
    }

    /* Use blocked code. */
    if (uplo == MagmaUpper) {
        /* ---------------------------------------------- */
        /* Upper-triangular case                          */
        /* > Compute the Cholesky factorization A = U'*U. */
        /* ---------------------------------------------- */
        for (j=0; j<m; j+=nb) {
            /* Set the GPU number that holds the current panel */
            id  = (j/nb)%tot_subs;
            /* Set the local index where the current panel is */
            j_local = j/(nb*tot_subs);
            jb = min(nb, (m-j));
            if (j > 0) {
                // Wait for the column on CPU
                magma_queue_sync(queues[2*(id%num_gpus)]); 
                /* broadcast off-diagonal column to all gpus */
                d = (j/nb+1)%num_gpus;
                for (dd=0; dd<num_gpus; dd++) {
                    if (d != id%num_gpus) {
                        magma_csetmatrix_async( j, jb, 
                                                Aup(0,j),                lda, 
                                                dlP(d,jb,0,id%num_gpus), lddp, 
                                                queues[2*d], 
                                                trace_gpu_event(d, 0, "set", "set-col") );
                    }
                    d = (d+1)%num_gpus;
                }
                /* Update the current diagonal block */
                trace_gpu_start(id%num_gpus, 1, "herk", "herk");
                magma_cherk(MagmaUpper, MagmaConjTrans, jb, j, 
                            d_neg_one, dlA(id, 0, nb*j_local), ldda,
                            d_one,     dlA(id, j, nb*j_local), ldda,
                            queues[2*(id%num_gpus)+1]);
                magma_queue_sync(queues[2*(id%num_gpus)+1]); // Wait for syrk
            }
            /* Send the diagonal to cpu */
            magma_cgetmatrix_async( jb, jb, 
                                    dlA(id, j, nb*j_local), ldda,
                                    Aup(j,j),               lda,
                                    queues[2*(id%num_gpus)], 
                                    trace_gpu_event(id%num_gpus, 0, "get", "get-diag") );
            if (j > 0) {
                /* Compute the local block column of the panel. */
                d = (j/nb+1)%tot_subs;
                for (dd=0; dd<tot_subs; dd++) {
                    j_local2 = j_local+1;
                    if (d > id) j_local2 --;
                    nb0 = nb*j_local2;
                    if (n_local[d] > nb0) {
                        if (d%num_gpus != id%num_gpus) {
                            dlpanel = d_lP[d%num_gpus];
                            dlpanel_offset = dlP_offset(jb, 0, id%num_gpus);
                            ldpanel = lddp;
                            /* Wait for the offdiagonal column */
                            if (dd < num_gpus) magma_queue_sync(queues[2*(d%num_gpus)]);
                        } else {
                            dlpanel = d_lA[id];
                            dlpanel_offset = dlA_offset(0, nb*j_local);
                            ldpanel = ldda;
                        }
                        /* update the panel */
                        trace_gpu_start(d%num_gpus, 1, "gemm", "gemm");
                        magma_cgemm(MagmaConjTrans, MagmaNoTrans, 
                                    jb, n_local[d]-nb0, j, 
                                    c_neg_one, dlpanel, dlpanel_offset, ldpanel,
                                               dlA(d, 0, nb0), ldda, 
                                    c_one,     dlA(d, j, nb0), ldda,
                                    queues[2*(d%num_gpus)+1]);
                    }
                    d = (d+1)%tot_subs;
                }
            }
            /* factor the diagonal */
            magma_queue_sync( queues[2*(id%num_gpus)] ); // wait for the diagonal
            trace_cpu_start(0, "potrf", "potrf");
            lapackf77_cpotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info);
            trace_cpu_end(0);
            if (*info != 0) {
                *info = *info + j;
                break;
            }

            /* send the diagonal to gpus */
            if ((j+jb) < n) {
                d = (j/nb+1)%num_gpus;
                for (dd=0; dd<num_gpus; dd++) {
                    if (d == id%num_gpus) {
                        dlpanel = d_lA[id];
                        dlpanel_offset = dlA_offset(j, nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = d_lP[d];
                        dlpanel_offset = dlP_offset(0, 0, id%num_gpus);
                        ldpanel = lddp;
                    }
                    magma_csetmatrix_async( jb, jb, 
                                            Aup(j,j),                lda,
                                            dlpanel, dlpanel_offset, ldpanel, 
                                            queues[2*d], 
                                            trace_gpu_event(d, 0, "set", "set-diag"));
                    d = (d+1)%num_gpus;
                }
            } else {
                magma_csetmatrix_async( jb, jb, 
                                        Aup(j,j),               lda, 
                                        dlA(id, j, nb*j_local), ldda,
                                        queues[2*(id%num_gpus)], 
                                        trace_gpu_event(id%num_gpus, 0, "set", "set-diag") );
            }

            /* panel-factorize the off-diagonal */
            if ((j+jb) < n) {
                d = (j/nb+1)%tot_subs;
                for (dd=0; dd<tot_subs; dd++) {
                    /* next column */
                    j_local2 = j_local+1;
                    if (d > id) j_local2--;
                    if (d%num_gpus == id%num_gpus) {
                        dlpanel = d_lA[id];
                        dlpanel_offset = dlA_offset(j, nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = d_lP[d%num_gpus];
                        dlpanel_offset = dlP_offset(0, 0, id%num_gpus);
                        ldpanel = lddp;
                    }
                    nb2 = n_local[d]-nb*j_local2;
                    nb0 = min(nb, nb2);
                    if (dd < num_gpus) magma_queue_sync( queues[2*(d%num_gpus)] ); // wait for the diagonal
                    if (j+jb < m && d == (j/nb+1)%tot_subs) {
                        /* owns the next column, look-ahead the column */
                        trace_gpu_start(d%num_gpus, 1, "trsm", "trsm");
                        magma_ctrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                     jb, nb0, c_one,
                                     dlpanel, dlpanel_offset, ldpanel,
                                     dlA(d, j, nb*j_local2), ldda, 
                                     queues[2*(d%num_gpus)+1] );
                        /* send the column to cpu */
                        magma_queue_sync(queues[2*(d%num_gpus)+1]);  // wait for lookahead
                        magma_cgetmatrix_async( (j+jb), nb0, 
                                                dlA(d, 0, nb*j_local2), ldda, 
                                                Aup(0,j+jb),            lda,
                                                queues[2*(d%num_gpus)], 
                                                trace_gpu_event(d%num_gpus, 0, "get", "get-col") );
                        /* update the remaining blocks */
                        nb2 = nb2 - nb0;
                        trace_gpu_start(d%num_gpus, 1, "trsm", "trsm");
                        magma_ctrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                     jb, nb2, c_one, 
                                     dlpanel, dlpanel_offset, ldpanel,
                                     dlA(d, j, nb*j_local2+nb0), ldda, 
                                     queues[2*(d%num_gpus)+1] );
                    } else if (nb2 > 0) {
                        /* update the entire trailing matrix */
                        trace_gpu_start(d%num_gpus, 1, "trsm", "trsm");
                        magma_ctrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, 
                                     jb, nb2, c_one, 
                                     dlpanel, dlpanel_offset, ldpanel,
                                     dlA(d, j, nb*j_local2), ldda,
                                     queues[2*(d%num_gpus)+1] );
                    }
                    d = (d+1)%tot_subs;
                }
            }
        }
    } else { 
        /* -------------------------------------------- */
        /* Lower-triangular case                        */
        /* Compute the Cholesky factorization A = L*L'. */
        /* -------------------------------------------- */
        for (j=0; j<n; j+=nb) {
            /* Set the GPU number that holds the current panel */
            id  = (j/nb)%tot_subs;
            /* Set the local index where the current panel is */
            j_local = j/(nb*tot_subs);
            jb = min(nb, (n-j));
            if (j > 0) {
                if (num_gpus > 1) {
                    // Wait for the row on CPU to broadcast
                    magma_queue_sync(queues[2*(id%num_gpus)]); 
                }
                /* broadcast off-diagonal row to all the GPUs */
                d = (j/nb+1)%num_gpus;
                for (dd=0; dd<num_gpus; dd++) {
                    if (d != id%num_gpus) {
                        /* send it to GPU-d */
                        magma_csetmatrix_async( jb, j,
                                                Alo(j,0),                 lda,
                                                dlPT(d,0,jb,id%num_gpus), nb, 
                                                queues[2*d], 
                                                trace_gpu_event(d, 0, "set", "set-row") );
                    }
                    d = (d+1)%num_gpus;
                }
                /* Update the current diagonal block */
                trace_gpu_start(id%num_gpus, 1, "herk", "herk");
                magma_cherk(MagmaLower, MagmaNoTrans, jb, j,
                            d_neg_one, dlA(id, nb*j_local, 0), ldda,
                            d_one,     dlA(id, nb*j_local, j), ldda,
                            queues[2*(id%num_gpus)+1]);
                magma_queue_sync(queues[2*(id%num_gpus)+1]); // wait for syrk
            }
            /* send the diagonal to cpu */
            magma_cgetmatrix_async( jb, jb,
                                    dlA(id, nb*j_local, j), ldda,
                                    Alo(j,j),               lda, 
                                    queues[2*(id%num_gpus)], 
                                    trace_gpu_event(id%num_gpus, 0, "get", "get") );
            /* update the offdiagonal blocks */
            if (j > 0) {
                /* compute the block-rows of the panel */
                d = (j/nb+1)%tot_subs;
                for (dd=0; dd<tot_subs; dd++) {
                    j_local2 = j_local+1;
                    if (d > id) j_local2 --;
                    nb0 = nb*j_local2;
                    if (nb0 < n_local[d]) {
                        if (d%num_gpus != id%num_gpus) {
                            dlpanel = d_lP[d%num_gpus];
                            dlpanel_offset = dlPT_offset(0, jb, id%num_gpus);
                            ldpanel = nb;
                            /* Wait for offdiagonal row */
                            if (dd < num_gpus) magma_queue_sync(queues[2*(d%num_gpus)]);
                        } else {
                            dlpanel = d_lA[id];
                            dlpanel_offset = dlA_offset(nb*j_local, 0);
                            ldpanel = ldda;
                        }
                        /* Update the panel */
                        trace_gpu_start(d%num_gpus, 1, "gemm", "gemm");
                        magma_cgemm( MagmaNoTrans, MagmaConjTrans,
                                     n_local[d]-nb0, jb, j,
                                     c_neg_one, dlA(d, nb0, 0), ldda,
                                                dlpanel, dlpanel_offset, ldpanel,
                                     c_one,     dlA(d, nb0, j), ldda, 
                                     queues[2*(d%num_gpus)+1]);
                    }
                    d = (d+1)%tot_subs;
                }
            }

            /* factor the diagonal */
            magma_queue_sync( queues[2*(id%num_gpus)] );
            trace_cpu_start(0, "potrf", "potrf");
            lapackf77_cpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info);
            trace_cpu_end(0);
            if (*info != 0) {
                printf( " cpotrf returned %d (id=%d,j=%d,j_local=%d,jb=%d)\n",*info,id,j,j_local,jb );
                *info = *info + j;
                break;
            }

            /* send the diagonal to gpus */
            if ((j+jb) < m) {
                d = (j/nb+1)%num_gpus;
                for (dd=0; dd<num_gpus; dd++) {
                    if (d == id%num_gpus) {
                        dlpanel = d_lA[id];
                        dlpanel_offset = dlA_offset(nb*j_local, j);
                        ldpanel = ldda;
                    } else {
                        dlpanel = d_lP[d];
                        dlpanel_offset = dlPT_offset(0, 0, id%num_gpus);
                        ldpanel = nb;
                    }
                    magma_csetmatrix_async( jb, jb,
                                            Alo(j,j), lda,
                                            dlpanel,  dlpanel_offset, ldpanel, 
                                            queues[2*d], 
                                            trace_gpu_event(d, 0, "set", "set-diag") );
                    d = (d+1)%num_gpus;
                }
            } else {
                magma_csetmatrix_async( jb, jb,
                                        Alo(j,j),               lda,
                                        dlA(id, nb*j_local, j), ldda, 
                                        queues[2*(id%num_gpus)],
                                        trace_gpu_event(id%num_gpus, 0, "set", "set-diag") );
            }

            /* factorize off-diagonal blocks */
            if ((j+jb) < m) {
                d = (j/nb+1)%tot_subs;
                for (dd=0; dd<tot_subs; dd++) {
                    /* next column */
                    j_local2 = j_local+1;
                    if (d > id) j_local2--;
                    if (d%num_gpus == id%num_gpus) {
                        dlpanel = d_lA[id];
                        dlpanel_offset = dlA_offset(nb*j_local, j);
                        ldpanel = ldda;
                    } else {         
                        dlpanel = d_lP[d%num_gpus];
                        dlpanel_offset = dlPT_offset(0, 0, id%num_gpus);
                        ldpanel = nb;
                    }
                    nb2 = n_local[d] - j_local2*nb;
                    nb0 = min(nb, nb2 );
                    // wait for the diagonal
                    if (dd < num_gpus) magma_queue_sync(queues[2*(d%num_gpus)]);
                    if (j+jb < n && d == (j/nb+1)%tot_subs) {
                        /* owns the next column, look-ahead the column */
                        trace_gpu_start(d%num_gpus, 1, "trsm", "trsm");
                        magma_ctrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, 
                                     nb0, jb, c_one,
                                     dlpanel,  dlpanel_offset, ldpanel, 
                                     dlA(d, nb*j_local2, j), ldda,
                                     queues[2*(d%num_gpus)+1]);
                        /* send the column to cpu */
                        magma_queue_sync( queues[2*(d%num_gpus)+1] ); // wait for lookahead
                        magma_cgetmatrix_async( nb0, j+jb,
                                                dlA(d, nb*j_local2, 0), ldda,
                                                Alo(j+jb,0),            lda, 
                                                queues[2*(d%num_gpus)], 
                                                trace_gpu_event(d%num_gpus, 0, "get", "get") );
                        /* update the remaining blocks */
                        nb2 = nb2 - nb0;
                        trace_gpu_start(d%num_gpus, 1, "trsm", "trsm");
                        magma_ctrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, 
                                     nb2, jb, c_one,
                                     dlpanel, dlpanel_offset, ldpanel, 
                                     dlA(d, nb*j_local2+nb0, j), ldda, 
                                     queues[2*(d%num_gpus)+1]);
                    } else if (nb2 > 0) {
                        /* update the entire trailing matrix */
                        trace_gpu_start(d%num_gpus, 1, "trsm", "trsm");
                        magma_ctrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, 
                                     nb2, jb, c_one,
                                     dlpanel, dlpanel_offset, ldpanel, 
                                     dlA(d, nb*j_local2, j), ldda, 
                                     queues[2*(d%num_gpus)+1]);
                    }
                    d = (d+1)%tot_subs;
                }
            }
        }
    } /* end of else not upper */

    /* clean up */
    for( d=0; d<num_gpus; d++ ) {
        magma_queue_sync( queues[2*d] );
        magma_queue_sync( queues[2*d+1] );
    }

    trace_finalize("cpotrf_msub.svg", "trace.css");
    return *info;
} /* magma_cpotrf2_msub */
예제 #5
0
extern "C" magma_int_t
magma_dpotrf2_mgpu(
    magma_int_t num_gpus, magma_uplo_t uplo, magma_int_t m, magma_int_t n, 
    magma_int_t off_i, magma_int_t off_j, magma_int_t nb,
    magmaDouble_ptr *d_lA, size_t d_lA_offset, magma_int_t ldda, 
    magmaDouble_ptr *d_lP,  magma_int_t lddp, 
    double *a,      magma_int_t lda,   magma_int_t h,
    magma_queue_t *queues,
    magma_int_t *info )
{
/*  -- clMAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date November 2014

    Purpose   
    =======   
    DPOTRF computes the Cholesky factorization of a real symmetric   
    positive definite matrix dA.   

    The factorization has the form   
       dA = U**H * U,  if UPLO = 'U', or   
       dA = L  * L**H,  if UPLO = 'L',   
    where U is an upper triangular matrix and L is lower triangular.   

    This is the block version of the algorithm, calling Level 3 BLAS.   

    Arguments   
    =========   
    UPLO    (input) CHARACTER*1   
            = 'U':  Upper triangle of dA is stored;   
            = 'L':  Lower triangle of dA is stored.   

    N       (input) INTEGER   
            The order of the matrix dA.  N >= 0.   

    dA      (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N)   
            On entry, the symmetric matrix dA.  If UPLO = 'U', the leading   
            N-by-N upper triangular part of dA contains the upper   
            triangular part of the matrix dA, and the strictly lower   
            triangular part of dA is not referenced.  If UPLO = 'L', the   
            leading N-by-N lower triangular part of dA contains the lower   
            triangular part of the matrix dA, and the strictly upper   
            triangular part of dA is not referenced.   

            On exit, if INFO = 0, the factor U or L from the Cholesky   
            factorization dA = U**H * U or dA = L * L**H.   

    LDDA     (input) INTEGER   
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalescent memory accesses LDDA must be
            divisible by 16.

    INFO    (output) INTEGER   
            = 0:  successful exit   
            < 0:  if INFO = -i, the i-th argument had an illegal value   
            > 0:  if INFO = i, the leading minor of order i is not   
                  positive definite, and the factorization could not be   
                  completed.   
    =====================================================================   */

    magma_int_t     j, jb, nb0, nb2, dd, d, id, j_local, j_local2, buf;
    double c_one     = MAGMA_D_ONE;
    double c_neg_one = MAGMA_D_NEG_ONE;
    double          d_one     =  1.0;
    double          d_neg_one = -1.0;
    magmaDouble_ptr dlpanel;
    size_t dlpanel_offset;
    magma_int_t n_local[MagmaMaxGPUs], ldpanel;
    magma_event_t events[MagmaMaxGPUs];

    *info = 0;
    if ( (uplo != MagmaUpper) && (uplo != MagmaLower) ) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if ((uplo != MagmaUpper) && num_gpus*ldda < max(1,n)) {
        *info = -4;
    } else if ((uplo == MagmaUpper) && ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    {

      for( d=0; d<num_gpus; d++ ) {
        /* local-n and local-ld */
        if (uplo == MagmaUpper) {
            n_local[d] = ((n/nb)/num_gpus)*nb;
            if (d < (n/nb)%num_gpus)
               n_local[d] += nb;
            else if (d == (n/nb)%num_gpus)
              n_local[d] += n%nb;
        } else {
            n_local[d] = ((m/nb)/num_gpus)*nb;
            if (d < (m/nb)%num_gpus)
               n_local[d] += nb;
            else if (d == (m/nb)%num_gpus)
              n_local[d] += m%nb;
        }
      }

      /* Use blocked code. */
      if (uplo == MagmaUpper) 
        {
          /* ---------------------------------------------- */
          /* Upper-triangular case                          */
          /* > Compute the Cholesky factorization A = U'*U. */
          /* ---------------------------------------------- */
            for(j=0;j<m;j+=nb){
                /* Set the GPU number that holds the current panel */
                id  = (j/nb)%num_gpus;
                buf = (j/nb)%num_gpus;
                
                /* Set the local index where the current panel is */
                j_local = j/(nb*num_gpus);
                jb = min(nb, (m-j));

                if(j>0){
                    magma_queue_sync(queues[id*2]); // wait for the column on CPU
                    /* broadcast off-diagonal column to all gpus */
                    d = (j/nb+1)%num_gpus;
                    for(dd=0;dd<num_gpus;dd++){
                        if(d != id){
                            //magma_queue_sync(queues[2*d]);
                            magma_dsetmatrix_async( j, jb, 
                                                    Aup(0,j), lda, 
                                                    dlP(d,jb,0,buf), lddp, 
                                                    queues[d*2], NULL );
                        }
                        d = (d+1)%num_gpus;
                    }
                }
                /* Update the current diagonal block */
                if( j > 0 ) {
                    magma_dsyrk(MagmaUpper, MagmaConjTrans, jb, j, 
                                d_neg_one, dlA(id, 0, nb*j_local), ldda,
                                d_one,     dlA(id, j, nb*j_local), ldda,
                                queues[2*id+1]);
                                                                                                }
                /* send the diagonal to cpu */
                magma_queue_sync(queues[2*id+1]);// wait for syrk
                magma_dgetmatrix_async( jb, jb, 
                                        dlA(id, j, nb*j_local), ldda,
                                        Aup(j,j), lda,
                                        queues[2*id], NULL);
                if(j>0){
                    /* Compute the local block column of the panel. */
                    d = (j/nb+1)%num_gpus;
                    for(dd=0;dd<num_gpus;dd++){
                        j_local2 = j_local+1;
                        if(d>id) j_local2 --;
                        nb0 = nb*j_local2;

                        if(n_local[d]>nb0){
                            /* wait for the off-diagonal */
                            if(d!=id){
                                dlpanel = d_lP[d];
                                dlpanel_offset = dlP_offset(jb, 0, buf);
                                ldpanel = lddp;
                        
                                /* wait for the offdiagonal column */
                                magma_queue_sync(queues[d*2]);
                            }else{
                                dlpanel = d_lA[d];
                                dlpanel_offset = dlA_offset(0, nb*j_local);
                                ldpanel = ldda;
                            }

                            /* update the panel */
                            magma_dgemm(MagmaConjTrans, MagmaNoTrans, 
                                        jb, n_local[d]-nb0, j, 
                                        c_neg_one, dlpanel, dlpanel_offset, ldpanel,
                                        dlA(d, 0, nb0), ldda, 
                                        c_one, dlA(d, j, nb0), ldda,
                                        queues[2*d+1]);

                        }
                        d = (d+1)%num_gpus;
                    }
                }

                /* factor the diagonal */
                magma_queue_sync( queues[id*2] ); // wait for the diagonal
                lapackf77_dpotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info);
                if (*info != 0) {
                    *info = *info + j;
                    break;
                }

                /* send the diagonal to gpus */
                if ( (j+jb) < n) {
                    d = (j/nb+1)%num_gpus;
                    for( dd=0; dd<num_gpus; dd++ ) {
                        if( d == id ) {
                            dlpanel = d_lA[d];
                            dlpanel_offset = dlA_offset(j, nb*j_local);
                            ldpanel = ldda;
                        } else {
                            dlpanel = d_lP[d];
                            dlpanel_offset = dlP_offset(0, 0, buf);                                            
                            ldpanel = lddp;
                        }
                        magma_dsetmatrix_async( jb, jb, 
                                                Aup(j,j), lda,
                                                dlpanel, dlpanel_offset,  ldpanel, 
                                                queues[d*2], NULL);
                        d = (d+1)%num_gpus;
                    }
                } else {
                    magma_dsetmatrix_async( jb, jb, 
                                            Aup(j,j), lda, 
                                            dlA(id, j, nb*j_local), ldda,
                                            queues[id*2], NULL );
                }

                /* panel-factorize the off-diagonal */
                if((j+jb)<n){
                    d = (j/nb+1)%num_gpus;
                    for(dd=0;dd<num_gpus;dd++){
                        /* next column */
                        j_local2 = j_local+1;
                        if(d>id) j_local2--;
                        if( d == id ) {
                            dlpanel = d_lA[d];
                            dlpanel_offset = dlA_offset(j, nb*j_local);
                            ldpanel = ldda;
                        } else {
                            dlpanel = d_lP[d];
                            dlpanel_offset = dlP_offset(0, 0, buf);
                            ldpanel = lddp;
                        }
                        nb2 = n_local[d]-nb*j_local2;
                        nb0 = min(nb, nb2 );
                        magma_queue_sync( queues[2*d]); // wait for the diagonal
                        if(j+jb < m && d == (j/nb+1)%num_gpus){
                            /* owns the next column, look-ahead the column */
                            magma_dtrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                         jb, nb0, c_one,
                                         dlpanel, dlpanel_offset, ldpanel,
                                         dlA(d, j, nb*j_local2), ldda, 
                                         queues[2*d+1]);
                            /* send the column to cpu */
                            if(j+jb < m){
                                magma_queue_sync(queues[2*d+1]);  // wait for lookahead
                                 magma_dgetmatrix_async( (j+jb), nb0, 
                                                         dlA(d, 0, nb*j_local2), ldda, 
                                                         Aup(0,j+jb), lda,
                                                         queues[2*d], NULL);
                            }

                            /* update the remaining blocks */
                            nb2 = nb2 - nb0;

                            magma_dtrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                         jb, nb2, c_one, 
                                         dlpanel, dlpanel_offset, ldpanel,
                                         dlA(d, j, nb*j_local2+nb0), ldda, 
                                         queues[2*d+1]);
                        }else if(nb2 > 0){
                            /* update the entire trailing matrix */
                            magma_dtrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, 
                                         jb, nb2, c_one, 
                                         dlpanel, dlpanel_offset, ldpanel,
                                         dlA(d, j, nb*j_local2), ldda,
                                         queues[d*2+1]);
                        }
                        d = (d+1)%num_gpus;
                    }
                }
            }            
        } else { 
            /* -------------------------------------------- */
            /* Lower-triangular case                        */
            /* Compute the Cholesky factorization A = L*L'. */
            /* -------------------------------------------- */
            for (j=0; j<n; j+=nb) {

              /* Set the GPU number that holds the current panel */
              id  = (j/nb)%num_gpus;
              buf = (j/nb)%num_gpus;

              /* Set the local index where the current panel is */
              j_local = j/(nb*num_gpus);
              jb = min(nb, (n-j));

              if( j > 0 ) {
/* needed on pluto... */
//magma_setdevice(id);
                   magma_queue_sync( queues[id*2] ); // wait for the column on CPU

                  /* broadcast offdiagonal row to all gpus */
                  d = (j/nb+1)%num_gpus;
                  for( dd=0; dd<num_gpus; dd++ ) {
                      if( d != id ) {
                          /* wait for it on CPU */
                          //magma_queue_sync( queues[d*2] );

                          /* send it to GPU */
                          magma_dsetmatrix_async( jb, j,
                                                  Alo(j,0), lda,
                                                  dlPT(d,0,jb,buf), nb, 
                                                  queues[d*2], NULL );
                         clFlush(queues[d*2]);
                      }
                      d = (d+1)%num_gpus;
                  }
              }

              /* Update the current diagonal block */
              if( j > 0 ) {
                  magma_dsyrk(MagmaLower, MagmaNoTrans, jb, j,
                              d_neg_one, dlA(id, nb*j_local, 0), ldda,
                              d_one,     dlA(id, nb*j_local, j), ldda,
                              queues[id*2+1]);
                magma_queue_sync( queues[id*2+1] ); // wait for syrk
              }

              /* update the offdiagonal blocks */
              if ( j > 0 ) {
                  /* compute the block-rows of the panel */
                  d = (j/nb+1)%num_gpus;
                  for( dd=0; dd<num_gpus; dd++ ) {
                      j_local2 = j_local+1;
                      if( d > id ) j_local2 --;
                      nb0 = nb*j_local2;

                      if( nb0 < n_local[d] ) {
                          if( d != id ) {
                              //dlpanel = dlPT(d);
                              dlpanel = d_lP[d];
                              dlpanel_offset = dlPT_offset(0, jb, buf);
                              ldpanel = nb;

                              /* wait for offdiagonal row */
                              magma_queue_sync(queues[d*2]);
                          } else {
                              dlpanel = d_lA[d];
                              dlpanel_offset = dlA_offset(nb*j_local, 0);
                              ldpanel = ldda;
                          }

                          magma_dgemm( MagmaNoTrans, MagmaConjTrans,
                                       n_local[d]-nb0, jb, j,
                                       c_neg_one, dlA(d, nb0, 0), ldda,
                                                  dlpanel, dlpanel_offset, ldpanel,
                                       c_one,     dlA(d, nb0, j), ldda, 
                                       queues[d*2+1]);
                      }
                      d = (d+1)%num_gpus;
                  }
              }

              /* send the diagonal to cpu */
              magma_dgetmatrix_async( jb, jb,
                                      dlA(id, nb*j_local, j), ldda,
                                      Alo(j,j),               lda, 
                                      queues[id*2], &events[id] );
              clFlush(queues[id*2]);
              /* factor the diagonal */
              magma_queue_sync( queues[id*2] );
              lapackf77_dpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info);
              if (*info != 0) {
                  printf("row number: %d\n", (int) j);
                  *info = *info + j;
                  break;
              }

              /* send the diagonal to gpus */
              if ( (j+jb) < m ) {
                  d = (j/nb+1)%num_gpus;
                  for( dd=0; dd<num_gpus; dd++ ) {
                      if( d == id ) {
                          dlpanel = d_lA[d];
                          dlpanel_offset = dlA_offset(nb*j_local, j);
                          ldpanel = ldda;
                      } else {
                          //dlpanel = dlPT(d);
                          dlpanel = d_lP[d];
                          dlpanel_offset = dlPT_offset(0, 0, buf);
                          ldpanel = nb;
                      }
                      magma_dsetmatrix_async( jb, jb,
                                              Alo(j,j), lda,
                                              dlpanel,  dlpanel_offset, ldpanel, 
                                              queues[d*2], NULL );
                      clFlush(queues[d*2]);
                      d = (d+1)%num_gpus;
                  }
              } else {
                  magma_dsetmatrix_async( jb, jb,
                                          Alo(j,j),       lda,
                                          dlA(id, nb*j_local, j), ldda, 
                                          queues[id*2], NULL );
                  clFlush(queues[id*2]);
              }

              /* factorize off-diagonal blocks */
              if ( (j+jb) < m ) {
                  d = (j/nb+1)%num_gpus;
                  for( dd=0; dd<num_gpus; dd++ ) {
                      /* next column */
                      j_local2 = j_local+1;
                      if( d > id ) j_local2--;
                      if( d == id ) {
                          dlpanel = d_lA[d];
                          dlpanel_offset = dlA_offset(nb*j_local, j);
                          ldpanel = ldda;
                      } else {         
                          //dlpanel = dlPT(d);
                          dlpanel = d_lP[d];
                          dlpanel_offset = dlPT_offset(0, 0, buf);
                          ldpanel = nb;
                      }
                      nb2 = n_local[d] - j_local2*nb;
                      nb0 = min(nb, nb2 );
                        
                      magma_queue_sync(queues[d*2]);
                      // wait for the diagonal
                      if( j+jb < n && d == (j/nb+1)%num_gpus ) {
                          /* owns the next column, look-ahead the column */
                          magma_dtrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, 
                                       nb0, jb, c_one,
                                       dlpanel,  dlpanel_offset, ldpanel, 
                                       dlA(d, nb*j_local2, j), ldda,
                                       queues[d*2+1]);
                          /* send the column to cpu */
                          if( j+jb < n ) {
                              magma_queue_sync( queues[d*2+1] ); // wait for lookahead
                              magma_dgetmatrix_async( nb0, j+jb,
                                                      dlA(d, nb*j_local2, 0), ldda,
                                                      Alo(j+jb,0),            lda, 
                                                      queues[d*2], NULL);
                              clFlush(queues[d*2]);
                          }
                          /* update the remaining blocks */
                          nb2 = nb2 - nb0;
                          magma_dtrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, 
                                       nb2, jb, c_one,
                                       dlpanel, dlpanel_offset, ldpanel, 
                                       dlA(d, nb*j_local2+nb0, j), ldda, 
                                       queues[d*2+1]);
                      } else if( nb2 > 0 ) {
                          /* update the entire trailing matrix */
                          magma_dtrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, 
                                       nb2, jb, c_one,
                                       dlpanel, dlpanel_offset, ldpanel, 
                                       dlA(d, nb*j_local2, j), ldda, 
                                       queues[d*2+1]);
                      }
                      d = (d+1)%num_gpus;
                  }
              }
            }
          } /* end of else not upper */

          /* clean up */
          for( d=0; d<num_gpus; d++ ) {
              magma_queue_sync( queues[d*2] );
              magma_queue_sync( queues[d*2+1] );
          }

    } /* end of not lapack */

    return *info;
} /* magma_dpotrf_mgpu */
예제 #6
0
extern "C" magma_int_t
magma_dpotrf3_mgpu(int num_gpus, char uplo, magma_int_t m, magma_int_t n, 
                   magma_int_t off_i, magma_int_t off_j, magma_int_t nb,
                   double **d_lA,  magma_int_t ldda, 
                   double **d_lP,  magma_int_t lddp, 
                   double *a,      magma_int_t lda,   magma_int_t h,
                   cudaStream_t stream[][3], cudaEvent_t event[][5],
                   magma_int_t *info ) 
{
/*  -- MAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2012

    Purpose   
    =======   
    DPOTRF computes the Cholesky factorization of a real symmetric   
    positive definite matrix dA.   
    Auxiliary subroutine for dpotrf2_ooc. It is multiple gpu interface to compute 
    Cholesky of a "rectangular" matrix.

    The factorization has the form   
       dA = U**T * U,  if UPLO = 'U', or   
       dA = L  * L**T,  if UPLO = 'L',   
    where U is an upper triangular matrix and L is lower triangular.   

    This is the block version of the algorithm, calling Level 3 BLAS.   

    Arguments   
    =========   
    UPLO    (input) CHARACTER*1   
            = 'U':  Upper triangle of dA is stored;   
            = 'L':  Lower triangle of dA is stored.   

    N       (input) INTEGER   
            The order of the matrix dA.  N >= 0.   

    dA      (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N)   
            On entry, the symmetric matrix dA.  If UPLO = 'U', the leading   
            N-by-N upper triangular part of dA contains the upper   
            triangular part of the matrix dA, and the strictly lower   
            triangular part of dA is not referenced.  If UPLO = 'L', the   
            leading N-by-N lower triangular part of dA contains the lower   
            triangular part of the matrix dA, and the strictly upper   
            triangular part of dA is not referenced.   

            On exit, if INFO = 0, the factor U or L from the Cholesky   
            factorization dA = U**T * U or dA = L * L**T.   

    LDDA     (input) INTEGER   
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalescent memory accesses LDDA must be
            dividable by 16.

    INFO    (output) INTEGER   
            = 0:  successful exit   
            < 0:  if INFO = -i, the i-th argument had an illegal value   
            > 0:  if INFO = i, the leading minor of order i is not   
                  positive definite, and the factorization could not be   
                  completed.   
    =====================================================================   */


    magma_int_t     j, jb, nb0, nb2, d, dd, id, j_local, j_local2, buf;
    char            uplo_[2] = {uplo, 0};
    double c_one     = MAGMA_D_ONE;
    double c_neg_one = MAGMA_D_NEG_ONE;
    double          d_one     =  1.0;
    double          d_neg_one = -1.0;
    int upper = lapackf77_lsame(uplo_, "U");
    double *dlpanel;
    magma_int_t n_local[MagmaMaxGPUs], ldpanel;
    //cudaEvent_t event0[MagmaMaxGPUs],  /* send row to CPU    */
    //            event1[MagmaMaxGPUs],  /* send diag to GPU   */
    //            event2[MagmaMaxGPUs],  /* offdiagonal update */
    //            event3[MagmaMaxGPUs],  /* send row to GPU    */
    //            event4[MagmaMaxGPUs];  /* lookahead          */
    const magma_int_t stream1 = 0, stream2 = 1, stream3 = 2;
    double *d_dinvA[MagmaMaxGPUs][2], *d_x[MagmaMaxGPUs][2]; /* used by dtrsm_work */

    *info = 0;
    if ( (! upper) && (! lapackf77_lsame(uplo_, "L")) ) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (!upper && num_gpus*ldda < max(1,n)) {
        *info = -4;
    } else if (upper && ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* initialization */
    for( d=0; d<num_gpus; d++ ) {
      /* local-n and local-ld */
      if (upper) {
        n_local[d] = ((n/nb)/num_gpus)*nb;
        if (d < (n/nb)%num_gpus)
          n_local[d] += nb;
        else if (d == (n/nb)%num_gpus)
          n_local[d] += n%nb;
      } else {
        n_local[d] = ((m/nb)/num_gpus)*nb;
        if (d < (m/nb)%num_gpus)
          n_local[d] += nb;
        else if (d == (m/nb)%num_gpus)
          n_local[d] += m%nb;
      }
      //magma_setdevice(d);
      //magma_event_create( &event0[d] );
      //magma_event_create( &event1[d] );
      //magma_event_create( &event2[d] );
      //magma_event_create( &event3[d] );
      //magma_event_create( &event4[d] );
    }

    /* == initialize the trace */
    trace_init( 1, num_gpus, 3, (CUstream_st**)stream );

    if (upper) 
    {     
      /* ---------------------------------------------- */
      /* Upper-triangular case                          */
      /* > Compute the Cholesky factorization A = U'*U. */
      /* ---------------------------------------------- */
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
      /* invert the diagonals 
       * Allocate device memory for the inversed diagonal blocks, size=m*NB
       */
      for( d=0; d<num_gpus; d++ ) {
          magma_setdevice(d);
          for( j=0; j<2; j++ ) {
              cudaMalloc((void**)&d_dinvA[d][j], nb*nb*sizeof(double));
              cudaMalloc((void**)&d_x[d][j],      n*nb*sizeof(double));
              cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(double));
              cudaMemset(d_x[d][j],     0,  n*nb*sizeof(double));
          }
      }
      magma_setdevice(0);
#endif

      for (j=0; j<m; j+=nb) {

        /* Set the GPU number that holds the current panel */
        id  = (j/nb)%num_gpus;
        buf = (j/nb)%num_gpus;

        /* Set the local index where the current panel is */
        j_local = j/(nb*num_gpus);
        jb = min(nb, (m-j));

        /* Update the current diagonal block on stream1 */
        magma_setdevice(id);
        if( j > 0 ) {
            magmablasSetKernelStream(stream[id][stream1]);
            trace_gpu_start( id, stream1, "syrk", "syrk" );
            magma_dsyrk(MagmaUpper, MagmaTrans, jb, j, 
                        d_neg_one, dlA(id, 0, nb*j_local), ldda, 
                        d_one,     dlA(id, j, nb*j_local), ldda);
            trace_gpu_end( id, stream1 );
        }

        /* send the diagonal to cpu on stream1 */
        trace_gpu_start( id, stream1, "comm", "D to CPU" );
        magma_dgetmatrix_async( jb, jb,
                                dlA(id, j, nb*j_local), ldda,
                                Aup(j,j),               lda, 
                                stream[id][stream1] );
        trace_gpu_end( id, stream1 );

        /* update off-diagonal blocks in the panel */
        if( j > 0 ) {
            d = (j/nb+1)%num_gpus;
            for( dd=0; dd<num_gpus; dd++ ) {
                j_local2 = j_local+1;
                if( d > id ) j_local2 --;
                nb0 = nb*j_local2; 

                if( n_local[d] > nb0 ) {
                    magma_setdevice(d);
                    magmablasSetKernelStream(stream[d][stream2]);
                    if( d == id ) {
                        dlpanel = dlA(d, 0, nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlP(d, jb, 0, buf);
                        ldpanel = lddp;
                        magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu
                    }
                    trace_gpu_start( d, stream2, "gemm", "gemm" );
                    magma_dgemm(MagmaTrans, MagmaNoTrans, 
                                jb, n_local[d]-nb0, j, 
                                c_neg_one, dlpanel,        ldpanel, 
                                           dlA(d, 0, nb0), ldda,
                                c_one,     dlA(d, j, nb0), ldda);
                    trace_gpu_end( d, stream2 );
                    magma_event_record( event[d][2], stream[d][stream2] );
                }
                d = (d+1)%num_gpus;
            }
        }

        /* wait for panel and factorize it on cpu */
        magma_setdevice(id);
        magma_queue_sync( stream[id][stream1] );
        trace_cpu_start( 0, "getrf", "getrf" );
        lapackf77_dpotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info);
        trace_cpu_end( 0 );
        if (*info != 0) {
            *info = *info + j;
            break;
        }

        /* send the diagonal to gpus on stream1 */
        if ( (j+jb) < n) {
          d = (j/nb+1)%num_gpus;
          for( dd=0; dd<num_gpus; dd++ ) {
            if( d == id ) {
                dlpanel = dlA(d, j, nb*j_local);
                ldpanel = ldda;
            } else {
                dlpanel = dlP(d, 0, 0, buf);
                ldpanel = lddp;
            }
            magma_setdevice(d);
            trace_gpu_start( d, stream1, "comm", "comm" );
            magma_dsetmatrix_async( jb, jb,
                                    Aup(j,j), lda,
                                    dlpanel,  ldpanel, 
                                    stream[d][stream1] );
            trace_gpu_end( d, stream1 );
            magma_event_record( event[d][1], stream[d][stream1] );
            d = (d+1)%num_gpus;
          }
        } else {
          magma_setdevice(id);
          trace_gpu_start( id, stream1, "comm", "comm" );
          magma_dsetmatrix_async( jb, jb,
                                  Aup(j,j),               lda,
                                  dlA(id, j, nb*j_local), ldda, 
                                  stream[id][stream1] );
          trace_gpu_end( id, stream1 );
        }

        /* panel-factorize the off-diagonal */
        if ( (j+jb) < n) {
            d = (j/nb+1)%num_gpus;
            for( dd=0; dd<num_gpus; dd++ ) {
                /* next column */
                j_local2 = j_local+1;
                if( d > id ) j_local2--;
                if( d == id ) {
                    dlpanel = dlA(d,j,nb*j_local);
                    ldpanel = ldda;
                } else {
                    dlpanel = dlP(d, 0, 0, buf);
                    ldpanel = lddp;
                }
                nb2 = n_local[d] - j_local2*nb;
                nb0 = min(nb, nb2);
        
                magma_setdevice(d);
                //magma_queue_sync( stream[d][stream1] );  // synch on chol for remaining update
                //magma_queue_sync( stream[d][stream2] );
                if( j+jb < m && d == (j/nb+1)%num_gpus ) { 
                    /* owns the next column, look-ahead next block on stream1 */
                    magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for gemm update 
                    magmablasSetKernelStream(stream[d][stream1]);
                    trace_gpu_start( d, stream1, "trsm", "trsm" );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
                    magmablas_dtrsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, 
                                          jb, nb0, c_one,
                                          dlpanel,                ldpanel, 
                                          dlA(d, j, nb*j_local2), ldda,
                                          d_dinvA[d][0], d_x[d][0] );
#else
                    magma_dtrsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, 
                                 jb, nb0, c_one,
                                 dlpanel,                ldpanel, 
                                 dlA(d, j, nb*j_local2), ldda);
#endif
                    magma_event_record( event[d][4], stream[d][stream1] );
                    trace_gpu_end( d, stream1 );
                } else if( nb2 > 0 ) {
                    /* update all the blocks on stream2 */
                    magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for cholesky factor 
                    trace_gpu_start( d, stream2, "trsm", "trsm" );
                    magmablasSetKernelStream(stream[d][stream2]);
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
                    magmablas_dtrsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, 
                                          jb, nb2, c_one,
                                          dlpanel,                ldpanel, 
                                          dlA(d, j, nb*j_local2), ldda,
                                          d_dinvA[d][1], d_x[d][1] );
#else
                    magma_dtrsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, 
                                 jb, nb2, c_one,
                                 dlpanel,                ldpanel, 
                                 dlA(d, j, nb*j_local2), ldda);
#endif
                    trace_gpu_end( d, stream2 );
                }
                d = (d+1)%num_gpus;
            } /* end of for */

/* ======================================================================================== */

            d = (j/nb+1)%num_gpus;
            /* next column */
            j_local2 = j_local+1;
            if( d > id ) j_local2--;
            nb0 = min(nb, n_local[d]-nb*j_local2 );
        
            /* even on 1 gpu, off-diagonals are copied to cpu (synchronize at the end).      *
             * so we have the Cholesky factor, but only diagonal submatrix of the big panel, *
             * on cpu at the end.                                                            */
            if( j+jb < m ) { 
                int d2, id2, j2, buf2;
                magma_setdevice(d);
                /* make sure all the previous sets are done */
                if( h < num_gpus ) {
                    /* > offdiagonal */
                    for( d2=0; d2<num_gpus; d2++ ) {
                        j2 = j - (1+d2)*nb;
                        if( j2 < 0 ) break;
                        id2  = (j2/nb)%num_gpus;
                        magma_queue_wait_event( stream[d][stream3], event[id2][0] );
                    }

                    /* > diagonal */
                    for( d2=0; d2<num_gpus; d2++ ) {
                        j2 = j - d2*nb;
                        if( j2 < 0 ) break;
                        id2  = (j2/nb)%num_gpus;
                        magma_queue_wait_event( stream[d][stream3], event[id2][1] );
                    }
                }
                /* lookahead */
                magma_queue_wait_event( stream[d][stream3], event[d][4] );

                trace_gpu_start( d, stream3, "comm", "row to CPU" );
                magma_dgetmatrix_async( (j+jb), nb0,
                                        dlA(d, 0, nb*j_local2), ldda,
                                        Aup(0,j+jb),            lda, 
                                        stream[d][stream3] );
                trace_gpu_end( d, stream3 );
                magma_event_record( event[d][3], stream[d][stream3] );
/* needed on pluto */
magma_queue_sync( stream[d][stream3] );

                /* wait for the off-diagonal on cpu */
                //magma_setdevice(id);
                //magma_queue_sync( stream[id][stream3] );

                /* broadcast rows to gpus on stream2 */
                buf2 = ((j+jb)/nb)%num_gpus;
                for( d2=0; d2<num_gpus; d2++ ) {
                    if( d2 != d ) 
                    {
                        magma_setdevice(d2);
                        trace_gpu_start( d2, stream3, "comm", "row to GPUs" );
                        magma_queue_wait_event( stream[d2][stream3], event[d][3] ); // rows arrived at cpu on stream3
                        magma_dsetmatrix_async( j+jb, nb0,
                                                Aup(0,j+jb),        lda,
                                                dlP(d2,nb0,0,buf2), lddp, 
                                                stream[d2][stream3] );
                        trace_gpu_end( d2, stream3 );
                        magma_event_record( event[d2][0], stream[d2][stream3] );
                    }
                } 
            }

/* ======================================================================================== */
            /* gpu owning the next column                    */
            /* after look ahead, update the remaining blocks */
            if( j+jb < m ) /* no update on the last block column */ 
            {
                d = (j/nb+1)%num_gpus;
                /* next column */
                j_local2 = j_local+1;
                if( d > id ) j_local2--;
                if( d == id ) {
                    dlpanel = dlA(d, j, nb*j_local);
                    ldpanel = ldda;
                } else {
                    dlpanel = dlP(d, 0, 0, buf);
                    ldpanel = lddp;
                }
                nb0 = min(nb, n_local[d]-nb*j_local2 );
                nb2 =         n_local[d]-nb*j_local2 - nb0;
        
                /* update the remaining blocks */
                if( nb2 > 0 ) {
                    magma_setdevice(d);
                    magmablasSetKernelStream(stream[d][stream2]);  
                    magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for cholesky factor
                    trace_gpu_start( d, stream2, "trsm", "trsm" );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
                    magmablas_dtrsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, 
                                          jb, nb2, c_one,
                                          dlpanel,                    ldpanel, 
                                          dlA(d, j, nb*j_local2+nb0), ldda,
                                          d_dinvA[d][1], d_x[d][1] );
#else
                    magma_dtrsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, 
                                 jb, nb2, c_one,
                                 dlpanel,                    ldpanel, 
                                 dlA(d, j, nb*j_local2+nb0), ldda);
#endif
                    trace_gpu_end( d, stream2 );
                }
            }
        } /* end of dtrsm */
      } /* end of for j=1, .., n */
    } else { 

      /* ---------------------------------------------- */
      /* Lower-triangular case                          */
      /* > Compute the Cholesky factorization A = L*L'. */
      /* ---------------------------------------------- */
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
      /* 
       * Allocate device memory for the inversed diagonal blocks, size=N*BLOCK_SIZE
       */
      for( d=0; d<num_gpus; d++ ) {
         magma_setdevice(d);
         for( j=0; j<2; j++ ) {
             cudaMalloc((void**)&d_dinvA[d][j], nb*nb*sizeof(double));
             cudaMalloc((void**)&d_x[d][j],     nb*m *sizeof(double));
             cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(double));
             cudaMemset(d_x[d][j],     0, nb* m*sizeof(double));
         }
      }
      magma_setdevice(0);
#endif

      for (j=0; j<n; j+=nb) {

        /* Set the GPU number that holds the current panel */
        id  = (j/nb)%num_gpus;
        buf = (j/nb)%num_gpus;

        /* Set the local index where the current panel is */
        j_local = j/(nb*num_gpus);
        jb = min(nb, (n-j));

        /* Update the current diagonal block on stream1 */
        magma_setdevice(id);
        if( j > 0 ) {
            magmablasSetKernelStream(stream[id][stream1]);
            magma_dsyrk(MagmaLower, MagmaNoTrans, jb, j,
                        d_neg_one, dlA(id, nb*j_local, 0), ldda,
                        d_one,     dlA(id, nb*j_local, j), ldda);
        }

        /* send the diagonal to cpu on stream1 */
        magma_dgetmatrix_async( jb, jb,
                                dlA(id, nb*j_local, j), ldda,
                                Alo(j,j),               lda, 
                                stream[id][stream1] );

        /* update off-diagonal blocks of the panel */
        if( j > 0 ) {
            d = (j/nb+1)%num_gpus;
            for( dd=0; dd<num_gpus; dd++ ) {
                j_local2 = j_local+1;
                if( d > id ) j_local2 --;
                nb0 = nb*j_local2; 

                if( nb0 < n_local[d] ) {
                    magma_setdevice(d);
                    magmablasSetKernelStream(stream[d][stream2]);
                    if( d == id ) {
                        dlpanel = dlA(d, nb*j_local, 0);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlPT(d,0,jb,buf);
                        ldpanel = nb;
                        magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu
                    }
                    magma_dgemm( MagmaNoTrans, MagmaTrans,
                                 n_local[d]-nb0, jb, j,
                                 c_neg_one, dlA(d, nb0, 0), ldda,
                                            dlpanel,        ldpanel,
                                 c_one,     dlA(d, nb0, j), ldda);
                    magma_event_record( event[d][2], stream[d][stream2] );
                }
                d = (d+1)%num_gpus;
            }
        }

        /* wait for the panel and factorized it on cpu */
        magma_setdevice(id);
        magma_queue_sync( stream[id][stream1] );
        lapackf77_dpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info);
        if (*info != 0) {
           *info = *info + j;
           break;
        }

        /* send the diagonal to gpus on stream1 */
        if ( (j+jb) < m) {
          d = (j/nb+1)%num_gpus;
          for( dd=0; dd<num_gpus; dd++ ) {
            if( d == id ) {
                dlpanel = dlA(d, nb*j_local, j);
                ldpanel = ldda;
            } else {
                dlpanel = dlPT(d, 0, 0, buf);
                ldpanel = nb;
            }
            magma_setdevice(d);
            magma_dsetmatrix_async( jb, jb,
                                    Alo(j,j), lda,
                                    dlpanel,  ldpanel, 
                                    stream[d][stream1] );
            magma_event_record( event[d][1], stream[d][stream1] );
            d = (d+1)%num_gpus;
          }
        } else {
          magma_setdevice(id);
          magma_dsetmatrix_async( jb, jb,
                                  Alo(j,j),               lda,
                                  dlA(id, nb*j_local, j), ldda, 
                                  stream[id][stream1] );
        }

        /* panel factorize the off-diagonal */
        if ( (j+jb) < m) {
          d = (j/nb+1)%num_gpus;
          for( dd=0; dd<num_gpus; dd++ ) {
            /* next column */
            j_local2 = j_local+1;
            if( d > id ) j_local2--;
            if( d == id ) {
                dlpanel = dlA(d, nb*j_local, j);
                ldpanel = ldda;
            } else {         
                dlpanel = dlPT(d, 0, 0, buf);
                ldpanel = nb;
            }
            nb2 = n_local[d] - j_local2*nb;
            nb0 = min(nb, nb2 );

            magma_setdevice(d);
            if( j+nb < n && d == (j/nb+1)%num_gpus ) { /* owns next column, look-ahead next block on stream1 */
              magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for gemm update 
              magmablasSetKernelStream(stream[d][stream1]);
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
              magmablas_dtrsm_work( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, 
                                    nb0, jb, c_one,
                                    dlpanel,                ldpanel, 
                                    dlA(d, nb*j_local2, j), ldda,
                                    d_dinvA[d][0], d_x[d][0] );
#else
              magma_dtrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, 
                           nb0, jb, c_one,
                           dlpanel,                ldpanel, 
                           dlA(d, nb*j_local2, j), ldda);
#endif
              magma_event_record( event[d][4], stream[d][stream1] );
            } else if( nb2 > 0 ) { /* other gpus updating all the blocks on stream2 */
              /* update the entire column */
              magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for the cholesky factor
              magmablasSetKernelStream(stream[d][stream2]);
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
              magmablas_dtrsm_work( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, 
                                    nb2, jb, c_one,
                                    dlpanel,                ldpanel, 
                                    dlA(d, nb*j_local2, j), ldda,
                                    d_dinvA[d][1], d_x[d][1] );
#else
              magma_dtrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, 
                           nb2, jb, c_one,
                           dlpanel,                ldpanel, 
                           dlA(d, nb*j_local2, j), ldda);
#endif
            }
            d = (d+1)%num_gpus;
          } /* end for d */

/* ======================================================================================== */

          d = (j/nb+1)%num_gpus;
          /* next column */
          j_local2 = j_local+1;
          if( d > id ) j_local2--;
          nb0 = min(nb, n_local[d]-nb*j_local2 );

          /* even on 1 gpu, we copy off-diagonal to cpu (but don't synchronize).  */
          /* so we have the Cholesky factor on cpu at the end.                    */
          if( j+jb < n ) {
              int d2, id2, j2, buf2;
              magma_setdevice(d);
              /* make sure all the previous sets are done */
              if( h < num_gpus ) {
                  /* > offdiagonal */
                  for( d2=0; d2<num_gpus; d2++ ) {
                      j2 = j - (1+d2)*nb;
                      if( j2 < 0 ) break;
                      id2  = (j2/nb)%num_gpus;
                      magma_queue_wait_event( stream[d][stream3], event[id2][0] );
                  }

                  /* > diagonal */
                  for( d2=0; d2<num_gpus; d2++ ) {
                      j2 = j - d2*nb;
                      if( j2 < 0 ) break;
                      id2  = (j2/nb)%num_gpus;
                      magma_queue_wait_event( stream[d][stream3], event[id2][1] );
                  }
              }
              // lookahead done
              magma_queue_wait_event( stream[d][stream3], event[d][4] );

              magma_dgetmatrix_async( nb0, j+jb,
                                      dlA(d, nb*j_local2, 0), ldda,
                                      Alo(j+jb,0),            lda, 
                                      stream[d][stream3] );
              magma_event_record( event[d][3], stream[d][stream3] );
/* syn on rows on CPU, seem to be needed on Pluto */
magma_queue_sync( stream[d][stream3] );

              /* broadcast the rows to gpus */
              buf2 = ((j+jb)/nb)%num_gpus;
              for( d2=0; d2<num_gpus; d2++ ) {
                  if( d2 != d ) 
                  {
                      magma_setdevice(d2);
                      magma_queue_wait_event( stream[d2][stream3], event[d][3] ); // getmatrix done
                      magma_dsetmatrix_async( nb0, j+jb,
                                              Alo(j+jb,0),         lda,
                                              dlPT(d2,0,nb0,buf2), nb, 
                                              stream[d2][stream3] );
                      magma_event_record( event[d2][0], stream[d2][stream3] );
                  }
              }
          }

/* ======================================================================================== */
          /* gpu owing the next column updates remaining blocks on stream2 */
          if( j+nb < n ) { // no lookahead on the last block column
            d = (j/nb+1)%num_gpus;

            /* next column */
            j_local2 = j_local+1;
            if( d > id ) j_local2--;
            if( d == id ) {
              dlpanel = dlA(d, nb*j_local, j);
              ldpanel = ldda;
            } else {         
              dlpanel = dlPT(d,0,0,buf);
              ldpanel = nb;
            }
            nb0 = min(nb, n_local[d]-nb*j_local2 );
            nb2 = n_local[d] - j_local2*nb - nb0;

            if( nb2 > 0 ) {
                magma_setdevice(d);
                magmablasSetKernelStream(stream[d][stream2]);
                /* update the remaining blocks in the column */
                magma_queue_wait_event( stream[d][stream2], event[d][1] ); // panel received
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
                magmablas_dtrsm_work( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, 
                                      nb2, jb, c_one,
                                      dlpanel,                    ldpanel, 
                                      dlA(d, nb*j_local2+nb0, j), ldda,
                                      d_dinvA[d][1], d_x[d][1] );
#else
                magma_dtrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, 
                             nb2, jb, c_one,
                             dlpanel,                    ldpanel, 
                             dlA(d, nb*j_local2+nb0, j), ldda);
#endif
            }
          }
        }
      }
    } /* end of else not upper */

    /* == finalize the trace == */
    trace_finalize( "dpotrf.svg","trace.css" );
    for( d=0; d<num_gpus; d++ ) {
      magma_setdevice(d);
      magma_queue_sync( stream[d][0] ); 
      magma_queue_sync( stream[d][1] );
      magma_queue_sync( stream[d][2] );
      magmablasSetKernelStream(NULL);

      //magma_event_destroy( event0[d] ); 
      //magma_event_destroy( event1[d] );
      //magma_event_destroy( event2[d] );
      //magma_event_destroy( event3[d] );
      //magma_event_destroy( event4[d] );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
      for( j=0; j<2; j++ ) {
          magma_free( d_dinvA[d][j] );
          magma_free( d_x[d][j] );
      }
#endif
    }
    magma_setdevice(0);

    return *info;
} /* magma_dpotrf_mgpu */
예제 #7
0
/**
    Purpose
    -------
    SPOTRF computes the Cholesky factorization of a real symmetric
    positive definite matrix dA.
    Auxiliary subroutine for spotrf2_ooc. It is multiple gpu interface to compute
    Cholesky of a "rectangular" matrix.

    The factorization has the form
       dA = U**H * U,   if UPLO = MagmaUpper, or
       dA = L  * L**H,  if UPLO = MagmaLower,
    where U is an upper triangular matrix and L is lower triangular.

    This is the block version of the algorithm, calling Level 3 BLAS.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of dA is stored;
      -     = MagmaLower:  Lower triangle of dA is stored.

    @param[in]
    n       INTEGER
            The order of the matrix dA.  N >= 0.

    @param[in,out]
    dA      REAL array on the GPU, dimension (LDDA,N)
            On entry, the symmetric matrix dA.  If UPLO = MagmaUpper, the leading
            N-by-N upper triangular part of dA contains the upper
            triangular part of the matrix dA, and the strictly lower
            triangular part of dA is not referenced.  If UPLO = MagmaLower, the
            leading N-by-N lower triangular part of dA contains the lower
            triangular part of the matrix dA, and the strictly upper
            triangular part of dA is not referenced.
    \n
            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization dA = U**H * U or dA = L * L**H.

    @param[in]
    ldda     INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalescent memory accesses LDDA must be
            divisible by 16.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.

    @ingroup magma_sposv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_spotrf3_mgpu(
    magma_int_t ngpu,
    magma_uplo_t uplo, magma_int_t m, magma_int_t n,
    magma_int_t off_i, magma_int_t off_j, magma_int_t nb,
    magmaFloat_ptr d_lA[],  magma_int_t ldda,
    magmaFloat_ptr d_lP[],  magma_int_t lddp,
    float *A,          magma_int_t lda, magma_int_t h,
    magma_queue_t queues[][3], magma_event_t events[][5],
    magma_int_t *info )
{
#define Alo(i, j)  (A +             ((j)+off_j)*lda  + (nb*(((i)/nb)%h)+off_i))
#define Aup(i, j)  (A + (nb*(((j)/nb)%h)+off_j)*lda  +               (i+off_i))

#define dlA(id, i, j)     (d_lA[(id)] + (j)*ldda + (i))
#define dlP(id, i, j, k)  (d_lP[(id)] + (k)*nb*lddp + (j)*lddp + (i))
#define dlPT(id, i, j, k) (d_lP[(id)] + (k)*nb*lddp + (j)*nb   + (i))

    magma_int_t     j, jb, nb0, nb2, d, dd, id, j_local, j_local2, buf;
    float c_one     = MAGMA_S_ONE;
    float c_neg_one = MAGMA_S_NEG_ONE;
    float          d_one     =  1.0;
    float          d_neg_one = -1.0;
    int upper = (uplo == MagmaUpper);
    float *dlpanel;
    magma_int_t n_local[MagmaMaxGPUs], ldpanel;
    const magma_int_t stream1 = 0, stream2 = 1, stream3 = 2;
    
    *info = 0;
    if (! upper && uplo != MagmaLower) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (!upper && ngpu*ldda < max(1,n)) {
        *info = -4;
    } else if (upper && ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );
    
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK)
    /* used by strsm_work */
    float c_zero    = MAGMA_S_ZERO;
    int trsm_nb = 128;
    int trsm_n = trsm_nb*((nb+trsm_nb-1)/trsm_nb);
    float *d_dinvA[MagmaMaxGPUs];
    float *d_x[MagmaMaxGPUs];
    #define dinvA(d,j) &(d_dinvA[(d)][(j)*trsm_nb*trsm_n])
    #define dx(d,j) &(d_x[(d)][(j)*nb*m])
    /*
     * Allocate device memory for the inversed diagonal blocks, size=N*BLOCK_SIZE
     */
    // TODO free memory on failure.
    for( d=0; d < ngpu; d++ ) {
        magma_setdevice(d);
        if ( (MAGMA_SUCCESS != magma_smalloc( &d_dinvA[d], 2*trsm_nb*trsm_n )) ||
             (MAGMA_SUCCESS != magma_smalloc( &d_x[d],     2*nb*(upper ? n : m) )) ) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
    }
    magma_setdevice(0);
#endif
    
    /* initialization */
    for( d=0; d < ngpu; d++ ) {
        /* local-n and local-ld */
        if (upper) {
            n_local[d] = (n/(nb*ngpu))*nb;
            if (d < (n/nb)%ngpu)
                n_local[d] += nb;
            else if (d == (n/nb)%ngpu)
                n_local[d] += n%nb;
        } else {
            n_local[d] = (m/(nb*ngpu))*nb;
            if (d < (m/nb)%ngpu)
                n_local[d] += nb;
            else if (d == (m/nb)%ngpu)
                n_local[d] += m%nb;
        }
    }

    /* == initialize the trace */
    trace_init( 1, ngpu, 3, (CUstream_st**)queues );

    if (upper) {
        /* ---------------------------------------------- */
        /* Upper-triangular case                          */
        /* > Compute the Cholesky factorization A = U'*U. */
        /* ---------------------------------------------- */
        for (j=0; j < m; j += nb) {
            /* Set the GPU number that holds the current panel */
            id  = (j/nb)%ngpu;
            buf = (j/nb)%ngpu; // right now, we have ngpu buffers, so id and buf are the same..
            
            /* Set the local index where the current panel is */
            j_local = j/(nb*ngpu);
            jb = min(nb, (m-j));
 
            /* Update the current diagonal block on stream1 */
            magma_setdevice(id);
            if ( j > 0 ) {
                magmablasSetKernelStream( queues[id][stream1] );
                trace_gpu_start( id, stream1, "syrk", "syrk" );
                magma_ssyrk(MagmaUpper, MagmaConjTrans, jb, j,
                            d_neg_one, dlA(id, 0, nb*j_local), ldda,
                            d_one,     dlA(id, j, nb*j_local), ldda);
                trace_gpu_end( id, stream1 );
            }
            
            /* send the diagonal to cpu on stream1 */
            trace_gpu_start( id, stream1, "comm", "D to CPU" );
            magma_sgetmatrix_async( jb, jb,
                                    dlA(id, j, nb*j_local), ldda,
                                    Aup(j,j),               lda,
                                    queues[id][stream1] );
            trace_gpu_end( id, stream1 );

            /* update off-diagonal blocks in the panel */
            if ( j > 0 ) {
                d = (j/nb+1)%ngpu;
                for( dd=0; dd < ngpu; dd++ ) {
                    j_local2 = j_local+1;
                    if ( d > id ) j_local2 --;
                    nb0 = nb*j_local2; // number of local columns in the panel, while jb is panel-size (number of rows)
            
                    if ( n_local[d] > nb0 ) {
                        magma_setdevice(d);
                        magmablasSetKernelStream( queues[d][stream2] );
                        if ( d == id ) {
                            dlpanel = dlA(d,0,nb*j_local);
                            ldpanel = ldda;
                            // the GPU owns the row from start, and no need of synch.
                            //magma_queue_wait_event( queues[d][stream2], events[d][0] ); // rows arrived at gpu
                            magma_queue_wait_event( queues[d][stream2], events[d][4] ); // wait for look-ahead trsm to finish
                        } else {
                            dlpanel = dlP(d,nb,0,buf);
                            ldpanel = lddp;
                            magma_queue_wait_event( queues[d][stream2], events[d][0] ); // rows arrived at gpu
                        }
                        trace_gpu_start( d, stream2, "gemm", "gemm" );
                        magma_sgemm(MagmaConjTrans, MagmaNoTrans,
                                    jb, n_local[d]-nb0, j,
                                    c_neg_one, dlpanel,        ldpanel,
                                               dlA(d, 0, nb0), ldda,
                                    c_one,     dlA(d, j, nb0), ldda);
                        trace_gpu_end( d, stream2 );
                        magma_event_record( events[d][2], queues[d][stream2] );
                    }
                    d = (d+1)%ngpu;
                }
            }

            /* wait for panel and factorize it on cpu */
            magma_setdevice(id);
            magma_queue_sync( queues[id][stream1] );
            trace_cpu_start( 0, "getrf", "getrf" );
            lapackf77_spotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info);
            trace_cpu_end( 0 );
            if (*info != 0) {
                *info = *info + j;
                break;
            }
            
            /* send the diagonal to gpus on stream1 */
            if ( (j+jb) < n) {
                d = (j/nb+1)%ngpu;
                for( dd=0; dd < ngpu; dd++ ) {
                    if ( d == id ) {
                        dlpanel = dlA(d, j, nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlP(d,0,0,buf);
                        ldpanel = lddp;
                    }
                    magma_setdevice(d);
                    trace_gpu_start( d, stream1, "comm", "comm" );
                    magma_ssetmatrix_async( jb, jb,
                                            Aup(j,j), lda,
                                            dlpanel,  ldpanel,
                                            queues[d][stream1] );
                    trace_gpu_end( d, stream1 );
                    magma_event_record( events[d][1], queues[d][stream1] );
                    d = (d+1)%ngpu;
                }
            } else {
                magma_setdevice(id);
                trace_gpu_start( id, stream1, "comm", "comm" );
                magma_ssetmatrix_async( jb, jb,
                                        Aup(j,j),               lda,
                                        dlA(id, j, nb*j_local), ldda,
                                        queues[id][stream1] );
                trace_gpu_end( id, stream1 );
            }
            
            /* panel-factorize the off-diagonal */
            if ( (j+jb) < n) {
                d = (j/nb+1)%ngpu;
                for( dd=0; dd < ngpu; dd++ ) {
                    /* next column */
                    j_local2 = j_local+1;
                    if ( d > id ) j_local2--;
                    if ( d == id ) {
                        dlpanel = dlA(d,j,nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlP(d,0,0,buf);
                        ldpanel = lddp;
                    }
                    nb2 = n_local[d] - j_local2*nb;
                    
                    magma_setdevice(d);
                    if ( j+jb < m && d == (j/nb+1)%ngpu ) {
                        /* owns the next column, look-ahead next block on stream1 */
                        nb0 = min(nb, nb2);
                        magmablasSetKernelStream( queues[d][stream1] );
                        magma_queue_wait_event( queues[d][stream1], events[d][2] ); // wait for gemm update
                        trace_gpu_start( d, stream1, "trsm", "trsm" );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK)
                        magmablas_slaset( MagmaFull, trsm_nb, trsm_n, c_zero, c_zero, dinvA(d,0), trsm_nb );
                        magmablas_slaset( MagmaFull, nb0,     jb,     c_zero, c_zero, dx(d,0), nb0 );
                        magmablas_strsm_work( MagmaLeft, MagmaUpper,
                                              MagmaConjTrans, MagmaNonUnit,
                                              jb, nb0, c_one,
                                              dlpanel, ldpanel,
                                              dlA(d, j, nb*j_local2), ldda,
                                              1, dinvA(d,0), dx(d,0) );
#else
                        magma_strsm( MagmaLeft, MagmaUpper,
                                     MagmaConjTrans, MagmaNonUnit,
                                     jb, nb0, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, j, nb*j_local2), ldda);
#endif
                        magma_event_record( events[d][4], queues[d][stream1] );
                        trace_gpu_end( d, stream1 );
                    } else if ( nb2 > 0 ) {
                        /* update all the blocks on stream2 */
                        magma_queue_wait_event( queues[d][stream2], events[d][1] ); // wait for cholesky factor
                        trace_gpu_start( d, stream2, "trsm", "trsm" );
                        magmablasSetKernelStream( queues[d][stream2] );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK)
                        magmablas_slaset( MagmaFull, trsm_nb, trsm_n, c_zero, c_zero, dinvA(d,0), trsm_nb );
                        magmablas_slaset( MagmaFull, nb2,     jb,     c_zero, c_zero, dx(d,0), nb2 );
                        magmablas_strsm_work( MagmaLeft, MagmaUpper,
                                              MagmaConjTrans, MagmaNonUnit,
                                              jb, nb2, c_one,
                                              dlpanel, ldpanel,
                                              dlA(d, j, nb*j_local2), ldda,
                                              1, dinvA(d,0), dx(d,0) );
#else
                        magma_strsm( MagmaLeft, MagmaUpper,
                                     MagmaConjTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, j, nb*j_local2), ldda);
#endif
                        trace_gpu_end( d, stream2 );
                    }
                    d = (d+1)%ngpu;
                } /* end of for */

                /* ========================================================== */
                if ( j+jb < m ) {
                    d = (j/nb+1)%ngpu;
                    /* next column */
                    j_local2 = j_local+1;
                    if ( d > id ) j_local2--;
                    nb0 = min(nb, n_local[d]-nb*j_local2 );
                
                    /* even on 1 gpu, off-diagonals are copied to cpu (synchronize at the end).      *
                     * so we have the Cholesky factor, but only diagonal submatrix of the big panel, *
                     * on cpu at the end.                                                            */
                    int d2, buf2;
                    magma_setdevice(d);
                    /* lookahead done */
                    magma_queue_wait_event( queues[d][stream3], events[d][4] );
                
                    trace_gpu_start( d, stream3, "comm", "row to CPU" );
                    magma_sgetmatrix_async( (j+jb), nb0,
                                            dlA(d, 0, nb*j_local2), ldda,
                                            Aup(0,j+jb),            lda,
                                            queues[d][stream3] );
                    trace_gpu_end( d, stream3 );
                    magma_event_record( events[d][3], queues[d][stream3] );
                    /* needed on pluto */
                    //magma_queue_sync( queues[d][stream3] );
                
                    /* broadcast rows to gpus on stream2 */
                    buf2 = ((j+jb)/nb)%ngpu;
                    for( d2=0; d2 < ngpu; d2++ ) {
                        if ( d2 != d ) {
                            magma_setdevice(d2);
                            trace_gpu_start( d2, stream3, "comm", "row to GPUs" );
                            magma_queue_wait_event( queues[d2][stream3], events[d][3] ); // rows arrived at cpu on stream3
                            magma_ssetmatrix_async( j+jb, nb0,
                                                    Aup(0,j+jb),       lda,
                                                    dlP(d2,nb,0,buf2), lddp,
                                                    queues[d2][stream3] );
                            trace_gpu_end( d2, stream3 );
                            magma_event_record( events[d2][0], queues[d2][stream3] );
                        }
                    }

                    /* =========================== */
                    /* update the remaining blocks */
                    nb2 = n_local[d]-(nb*j_local2 + nb0);
                    if ( nb2 > 0 ) {
                        if ( d == id ) {
                            dlpanel = dlA(d, j, nb*j_local);
                            ldpanel = ldda;
                        } else {
                            dlpanel = dlP(d,0,0,buf);
                            ldpanel = lddp;
                        }
                        magma_setdevice(d);
                        magmablasSetKernelStream( queues[d][stream2] );
                        trace_gpu_start( d, stream2, "trsm", "trsm" );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK)
                        int flag = 0;
                        if (flag == 0) {
                            magma_queue_wait_event( queues[d][stream2], events[d][4] ); // lookahead -> diagonal inversion
                        } else {
                            magmablas_slaset( MagmaFull, trsm_nb, trsm_n, c_zero, c_zero, dinvA(d,flag), trsm_nb );
                            magma_queue_wait_event( queues[d][stream2], events[d][1] ); // panel received
                        }
                        magmablas_slaset( MagmaFull, nb2, jb, c_zero, c_zero, dx(d,1), nb2 );
                        magmablas_strsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                              jb, nb2, c_one,
                                              dlpanel, ldpanel,
                                              dlA(d, j, nb*j_local2+nb0), ldda,
                                              flag, dinvA(d,flag), dx(d,1) );
#else
                        magma_queue_wait_event( queues[d][stream2], events[d][1] ); // wait for cholesky factor
                        magma_strsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel, ldpanel,
                                     dlA(d, j, nb*j_local2+nb0), ldda);
#endif
                        trace_gpu_end( d, stream2 );
                    }
                }
            } /* end of strsm */
        } /* end of for j=1, .., n */
    } else {
        /* ---------------------------------------------- */
        /* Lower-triangular case                          */
        /* > Compute the Cholesky factorization A = L*L'. */
        /* ---------------------------------------------- */
        for (j=0; j < n; j += nb) {
        
            /* Set the GPU number that holds the current panel */
            id  = (j/nb)%ngpu;
            buf = (j/nb)%ngpu;
            
            /* Set the local index where the current panel is */
            j_local = j/(nb*ngpu);
            jb = min(nb, (n-j));

            /* Update the current diagonal block on stream1 */
            magma_setdevice(id);
            if ( j > 0 ) {
                magmablasSetKernelStream( queues[id][stream1] );
                magma_ssyrk(MagmaLower, MagmaNoTrans, jb, j,
                            d_neg_one, dlA(id, nb*j_local, 0), ldda,
                            d_one,     dlA(id, nb*j_local, j), ldda);
            }

            /* send the diagonal to cpu on stream1 */
            magma_sgetmatrix_async( jb, jb,
                                    dlA(id, nb*j_local, j), ldda,
                                    Alo(j,j),               lda,
                                    queues[id][stream1] );

            /* update off-diagonal blocks of the panel */
            if ( j > 0 ) {
                d = (j/nb+1)%ngpu;
                for( dd=0; dd < ngpu; dd++ ) {
                    j_local2 = j_local+1;
                    if ( d > id ) j_local2 --;
                    nb0 = nb*j_local2;
            
                    if ( nb0 < n_local[d] ) {
                        magma_setdevice(d);
                        magmablasSetKernelStream( queues[d][stream2] );
                        if ( d == id ) {
                            dlpanel = dlA(d, nb*j_local, 0);
                            ldpanel = ldda;
                            magma_queue_wait_event( queues[d][stream2], events[d][4] ); // wait for look-ahead trsm to finish
                        } else {
                            dlpanel = dlPT(d,0,nb,buf);
                            ldpanel = nb;
                            magma_queue_wait_event( queues[d][stream2], events[d][0] ); // rows arrived at gpu
                        }
                        magma_sgemm( MagmaNoTrans, MagmaConjTrans,
                                     n_local[d]-nb0, jb, j,
                                     c_neg_one, dlA(d, nb0, 0), ldda,
                                                dlpanel,        ldpanel,
                                     c_one,     dlA(d, nb0, j), ldda);
                        magma_event_record( events[d][2], queues[d][stream2] );
                    }
                    d = (d+1)%ngpu;
                }
            }

            /* wait for the panel and factorized it on cpu */
            magma_setdevice(id);
            magma_queue_sync( queues[id][stream1] );
            lapackf77_spotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info);
            if (*info != 0) {
                *info = *info + j;
                break;
            }

            /* send the diagonal to gpus on stream1 */
            if ( (j+jb) < m) {
                d = (j/nb+1)%ngpu;
                for( dd=0; dd < ngpu; dd++ ) {
                    if ( d == id ) {
                        dlpanel = dlA(d, nb*j_local, j);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlPT(d, 0, 0, buf);
                        ldpanel = nb;
                    }
                    magma_setdevice(d);
                    magma_ssetmatrix_async( jb, jb,
                                            Alo(j,j), lda,
                                            dlpanel,  ldpanel,
                                            queues[d][stream1] );
                    magma_event_record( events[d][1], queues[d][stream1] );
                    d = (d+1)%ngpu;
                }
            } else {
                magma_setdevice(id);
                magma_ssetmatrix_async( jb, jb,
                                        Alo(j,j),               lda,
                                        dlA(id, nb*j_local, j), ldda,
                                        queues[id][stream1] );
            }

            /* panel factorize the off-diagonal */
            if ( (j+jb) < m) {
                d = (j/nb+1)%ngpu;
                for( dd=0; dd < ngpu; dd++ ) {
                    /* next column */
                    j_local2 = j_local+1;
                    if ( d > id ) j_local2--;
                    if ( d == id ) {
                        dlpanel = dlA(d, nb*j_local, j);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlPT(d, 0, 0, buf);
                        ldpanel = nb;
                    }
                    nb2 = n_local[d] - j_local2*nb;
                    nb0 = min(nb, nb2);
                    
                    magma_setdevice(d);
                    if ( j+nb < n && d == (j/nb+1)%ngpu ) { /* owns next column, look-ahead next block on stream1 */
                        if ( j > 0 ) magma_queue_wait_event( queues[d][stream1], events[d][2] ); // wait for gemm update
                        magmablasSetKernelStream( queues[d][stream1] );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK)
                        magmablas_slaset( MagmaFull, trsm_nb, trsm_n, c_zero, c_zero, dinvA(d,0), trsm_nb );
                        magmablas_slaset( MagmaFull, nb0,     jb,     c_zero, c_zero, dx(d,0), nb0 );
                        magmablas_strsm_work( MagmaRight, MagmaLower,
                                              MagmaConjTrans, MagmaNonUnit,
                                              nb0, jb, c_one,
                                              dlpanel, ldpanel,
                                              dlA(d, nb*j_local2, j), ldda,
                                              1, dinvA(d,0), dx(d,0) );
#else
                        magma_strsm( MagmaRight, MagmaLower,
                                     MagmaConjTrans, MagmaNonUnit,
                                     nb0, jb, c_one,
                                     dlpanel, ldpanel,
                                     dlA(d, nb*j_local2, j), ldda);
#endif
                        magma_event_record( events[d][4], queues[d][stream1] );
                    } else if ( nb2 > 0 ) { /* other gpus updating all the blocks on stream2 */
                        /* update the entire column */
                        magma_queue_wait_event( queues[d][stream2], events[d][1] ); // wait for the cholesky factor
                        magmablasSetKernelStream( queues[d][stream2] );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK)
                        magmablas_slaset( MagmaFull, trsm_nb, trsm_n, c_zero, c_zero, dinvA(d,0), trsm_nb );
                        magmablas_slaset( MagmaFull, nb2,     jb,     c_zero, c_zero, dx(d,0), nb2 );
                        magmablas_strsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                              nb2, jb, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, nb*j_local2, j), ldda,
                                              1, dinvA(d,0), dx(d,0) );
#else
                        magma_strsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                     nb2, jb, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, nb*j_local2, j), ldda);
#endif
                    }
                    d = (d+1)%ngpu;
                } /* end for d */

                /* ========================================================== */
                if ( j+jb < n ) {
                    d = (j/nb+1)%ngpu;
                    /* next column */
                    j_local2 = j_local+1;
                    if ( d > id ) j_local2--;
                    nb0 = min(nb, n_local[d]-nb*j_local2 );
                
                    /* even on 1 gpu, we copy off-diagonal to cpu (but don't synchronize).  */
                    /* so we have the Cholesky factor on cpu at the end.                    */
                    int d2, buf2;
//#define SPOTRF_DEVICE_TO_DEVICE
#ifdef SPOTRF_DEVICE_TO_DEVICE
                    // lookahead done
                
                    /* broadcast the rows to gpus */
                    buf2 = ((j+jb)/nb)%ngpu;
                    for( d2=0; d2 < ngpu; d2++ ) {
                        magma_setdevice(d2);
                        magma_queue_wait_event( queues[d2][stream3], events[d][4] );
                        if ( d2 != d ) {
                            magma_scopymatrix_async( nb0, j+jb,
                                                     dlPT(d2,0,nb,buf2), nb, // first nbxnb reserved for diagonal block
                                                     dlA(d, nb*j_local2, 0), ldda,
                                                     queues[d2][stream3] );
                            magma_event_record( events[d2][0], queues[d2][stream3] );
                        } else {
                            magma_sgetmatrix_async( nb0, j+jb,
                                                    dlA(d, nb*j_local2, 0), ldda,
                                                    Alo(j+jb,0),            lda,
                                                    queues[d][stream3] );
                        }
                    }
#else
                    // lookahead done
                    magma_setdevice(d);
                    magma_queue_wait_event( queues[d][stream3], events[d][4] );
                    magma_sgetmatrix_async( nb0, j+jb,
                                            dlA(d, nb*j_local2, 0), ldda,
                                            Alo(j+jb,0),            lda,
                                            queues[d][stream3] );
                    magma_event_record( events[d][3], queues[d][stream3] );
                    /* syn on rows on CPU, seem to be needed on Pluto */
                    //magma_queue_sync( queues[d][stream3] );
                
                    /* broadcast the rows to gpus */
                    buf2 = ((j+jb)/nb)%ngpu;
                    for( d2=0; d2 < ngpu; d2++ ) {
                        if ( d2 != d ) {
                            magma_setdevice(d2);
                            magma_queue_wait_event( queues[d2][stream3], events[d][3] ); // getmatrix done
                            magma_ssetmatrix_async( nb0, j+jb,
                                                    Alo(j+jb,0),        lda,
                                                    dlPT(d2,0,nb,buf2), nb, // first nbxnb reserved for diagonal block
                                                    queues[d2][stream3] );
                            magma_event_record( events[d2][0], queues[d2][stream3] );
                        }
                    }
#endif
                    /* =================================== */
                    /* updates remaining blocks on stream2 */
                    nb2 = n_local[d] - (j_local2*nb + nb0);
                    if ( nb2 > 0 ) {
                        if ( d == id ) {
                            dlpanel = dlA(d, nb*j_local, j);
                            ldpanel = ldda;
                        } else {
                            dlpanel = dlPT(d,0,0,buf);
                            ldpanel = nb;
                        }
                        magma_setdevice(d);
                        magmablasSetKernelStream( queues[d][stream2] );
                        /* update the remaining blocks in the column */
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK)
                        int flag = 0;
                        if (flag == 0) {
                            magma_queue_wait_event( queues[d][stream2], events[d][4] ); // lookahead -> diagonal inversion
                        } else {
                            magmablas_slaset( MagmaFull, trsm_nb, trsm_n, c_zero, c_zero, dinvA(d,flag), trsm_nb );
                            magma_queue_wait_event( queues[d][stream2], events[d][1] ); // panel received
                        }
                        magmablas_slaset( MagmaFull, nb2, jb, c_zero, c_zero, dx(d,1), nb2 );
                        magmablas_strsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                              nb2, jb, c_one,
                                              dlpanel,                    ldpanel,
                                              dlA(d, nb*j_local2+nb0, j), ldda,
                                              flag, dinvA(d,flag), dx(d,1) );
#else
                        magma_queue_wait_event( queues[d][stream2], events[d][1] ); // panel received
                        magma_strsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                     nb2, jb, c_one,
                                     dlpanel,                    ldpanel,
                                     dlA(d, nb*j_local2+nb0, j), ldda);
#endif
                    }
                }
            }
        }
    } /* end of else not upper */

    /* == finalize the trace == */
    trace_finalize( "spotrf.svg", "trace.css" );
    for( d=0; d < ngpu; d++ ) {
        magma_setdevice(d);
        for( j=0; j < 3; j++ ) {
            magma_queue_sync( queues[d][j] );
        }
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK)
        magma_free( d_dinvA[d] );
        magma_free( d_x[d] );
#endif
    }
    magma_setdevice( orig_dev );
    magmablasSetKernelStream( orig_stream );

    return *info;
} /* magma_spotrf_mgpu */