Exemplo n.º 1
0
extern "C" magma_int_t
magma_zpotrf3_mgpu(magma_int_t num_gpus, char uplo, magma_int_t m, magma_int_t n,
                   magma_int_t off_i, magma_int_t off_j, magma_int_t nb,
                   magmaDoubleComplex *d_lA[],  magma_int_t ldda,
                   magmaDoubleComplex *d_lP[],  magma_int_t lddp,
                   magmaDoubleComplex *a,       magma_int_t lda, magma_int_t h,
                   magma_queue_t stream[][3], magma_event_t event[][5],
                   magma_int_t *info )
{
    /*  -- MAGMA (version 1.4.0) --
           Univ. of Tennessee, Knoxville
           Univ. of California, Berkeley
           Univ. of Colorado, Denver
           August 2013

        Purpose
        =======
        ZPOTRF computes the Cholesky factorization of a complex Hermitian
        positive definite matrix dA.
        Auxiliary subroutine for zpotrf2_ooc. It is multiple gpu interface to compute
        Cholesky of a "rectangular" matrix.

        The factorization has the form
           dA = U**H * U,  if UPLO = 'U', or
           dA = L  * L**H,  if UPLO = 'L',
        where U is an upper triangular matrix and L is lower triangular.

        This is the block version of the algorithm, calling Level 3 BLAS.

        Arguments
        =========
        UPLO    (input) CHARACTER*1
                = 'U':  Upper triangle of dA is stored;
                = 'L':  Lower triangle of dA is stored.

        N       (input) INTEGER
                The order of the matrix dA.  N >= 0.

        dA      (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N)
                On entry, the Hermitian matrix dA.  If UPLO = 'U', the leading
                N-by-N upper triangular part of dA contains the upper
                triangular part of the matrix dA, and the strictly lower
                triangular part of dA is not referenced.  If UPLO = 'L', the
                leading N-by-N lower triangular part of dA contains the lower
                triangular part of the matrix dA, and the strictly upper
                triangular part of dA is not referenced.

                On exit, if INFO = 0, the factor U or L from the Cholesky
                factorization dA = U**H * U or dA = L * L**H.

        LDDA     (input) INTEGER
                The leading dimension of the array dA.  LDDA >= max(1,N).
                To benefit from coalescent memory accesses LDDA must be
                dividable by 16.

        INFO    (output) INTEGER
                = 0:  successful exit
                < 0:  if INFO = -i, the i-th argument had an illegal value
                > 0:  if INFO = i, the leading minor of order i is not
                      positive definite, and the factorization could not be
                      completed.
        =====================================================================   */


    magma_int_t     j, jb, nb0, nb2, d, dd, id, j_local, j_local2, buf;
    char            uplo_[2] = {uplo, 0};
    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    double          d_one     =  1.0;
    double          d_neg_one = -1.0;
    int upper = lapackf77_lsame(uplo_, "U");
    magmaDoubleComplex *dlpanel;
    magma_int_t n_local[MagmaMaxGPUs], ldpanel;
    const magma_int_t stream1 = 0, stream2 = 1, stream3 = 2;
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK)
    /* used by ztrsm_work */
    int trsm_nb = 128;
    int trsm_n = trsm_nb*((nb+trsm_nb-1)/trsm_nb);
    magmaDoubleComplex *d_dinvA[MagmaMaxGPUs];
    magmaDoubleComplex *d_x[MagmaMaxGPUs];
#define dinvA(d,j) &(d_dinvA[(d)][(j)*trsm_nb*trsm_n])
#define dx(d,j) &(d_x[(d)][(j)*nb*m])
    /*
     * Allocate device memory for the inversed diagonal blocks, size=N*BLOCK_SIZE
     */
    for( d=0; d<num_gpus; d++ ) {
        magma_setdevice(d);
        if ( (MAGMA_SUCCESS != magma_zmalloc( &d_dinvA[d], 2*trsm_nb*trsm_n )) ||
                (MAGMA_SUCCESS != magma_zmalloc( &d_x[d],     2*nb*(upper ? n : m) )) ) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
    }
    magma_setdevice(0);
#endif

    *info = 0;
    if ( (! upper) && (! lapackf77_lsame(uplo_, "L")) ) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (!upper && num_gpus*ldda < max(1,n)) {
        *info = -4;
    } else if (upper && ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* initialization */
    for( d=0; d<num_gpus; d++ ) {
        /* local-n and local-ld */
        if (upper) {
            n_local[d] = (n/(nb*num_gpus))*nb;
            if (d < (n/nb)%num_gpus)
                n_local[d] += nb;
            else if (d == (n/nb)%num_gpus)
                n_local[d] += n%nb;
        } else {
            n_local[d] = (m/(nb*num_gpus))*nb;
            if (d < (m/nb)%num_gpus)
                n_local[d] += nb;
            else if (d == (m/nb)%num_gpus)
                n_local[d] += m%nb;
        }
    }

    /* == initialize the trace */
    trace_init( 1, num_gpus, 3, (CUstream_st**)stream );

    if (upper)
    {
        /* ---------------------------------------------- */
        /* Upper-triangular case                          */
        /* > Compute the Cholesky factorization A = U'*U. */
        /* ---------------------------------------------- */
        for (j=0; j<m; j+=nb) {

            /* Set the GPU number that holds the current panel */
            id  = (j/nb)%num_gpus;
            buf = (j/nb)%num_gpus; // right now, we have num_gpu buffers, so id and buf are the same..

            /* Set the local index where the current panel is */
            j_local = j/(nb*num_gpus);
            jb = min(nb, (m-j));

            /* Update the current diagonal block on stream1 */
            magma_setdevice(id);
            if( j > 0 ) {
                magmablasSetKernelStream(stream[id][stream1]);
                trace_gpu_start( id, stream1, "syrk", "syrk" );
                magma_zherk(MagmaUpper, MagmaConjTrans, jb, j,
                            d_neg_one, dlA(id, 0, nb*j_local), ldda,
                            d_one,     dlA(id, j, nb*j_local), ldda);
                trace_gpu_end( id, stream1 );
            }

            /* send the diagonal to cpu on stream1 */
            trace_gpu_start( id, stream1, "comm", "D to CPU" );
            magma_zgetmatrix_async( jb, jb,
                                    dlA(id, j, nb*j_local), ldda,
                                    Aup(j,j),               lda,
                                    stream[id][stream1] );
            trace_gpu_end( id, stream1 );

            /* update off-diagonal blocks in the panel */
            if( j > 0 ) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    j_local2 = j_local+1;
                    if( d > id ) j_local2 --;
                    nb0 = nb*j_local2; // number of local columns in the panel, while jb is panel-size (number of rows)

                    if( n_local[d] > nb0 ) {
                        magma_setdevice(d);
                        magmablasSetKernelStream(stream[d][stream2]);
                        if( d == id ) {
                            dlpanel = dlA(d,0,nb*j_local);
                            ldpanel = ldda;
                            // the GPU owns the row from start, and no need of synch.
                            //magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu
                        } else {
                            dlpanel = dlP(d,nb,0,buf);
                            ldpanel = lddp;
                            magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu
                        }
                        trace_gpu_start( d, stream2, "gemm", "gemm" );
                        magma_zgemm(MagmaConjTrans, MagmaNoTrans,
                                    jb, n_local[d]-nb0, j,
                                    c_neg_one, dlpanel,        ldpanel,
                                    dlA(d, 0, nb0), ldda,
                                    c_one,     dlA(d, j, nb0), ldda);
                        trace_gpu_end( d, stream2 );
                        magma_event_record( event[d][2], stream[d][stream2] );
                    }
                    d = (d+1)%num_gpus;
                }
            }

            /* wait for panel and factorize it on cpu */
            magma_setdevice(id);
            magma_queue_sync( stream[id][stream1] );
            trace_cpu_start( 0, "getrf", "getrf" );
            lapackf77_zpotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info);
            trace_cpu_end( 0 );
            if (*info != 0) {
                *info = *info + j;
                break;
            }

            /* send the diagonal to gpus on stream1 */
            if ( (j+jb) < n) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    if( d == id ) {
                        dlpanel = dlA(d, j, nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlP(d,0,0,buf);
                        ldpanel = lddp;
                    }
                    magma_setdevice(d);
                    trace_gpu_start( d, stream1, "comm", "comm" );
                    magma_zsetmatrix_async( jb, jb,
                                            Aup(j,j), lda,
                                            dlpanel,  ldpanel,
                                            stream[d][stream1] );
                    trace_gpu_end( d, stream1 );
                    magma_event_record( event[d][1], stream[d][stream1] );
                    d = (d+1)%num_gpus;
                }
            } else {
                magma_setdevice(id);
                trace_gpu_start( id, stream1, "comm", "comm" );
                magma_zsetmatrix_async( jb, jb,
                                        Aup(j,j),               lda,
                                        dlA(id, j, nb*j_local), ldda,
                                        stream[id][stream1] );
                trace_gpu_end( id, stream1 );
            }

            /* panel-factorize the off-diagonal */
            if ( (j+jb) < n) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    /* next column */
                    j_local2 = j_local+1;
                    if( d > id ) j_local2--;
                    if( d == id ) {
                        dlpanel = dlA(d,j,nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlP(d,0,0,buf);
                        ldpanel = lddp;
                    }
                    nb2 = n_local[d] - j_local2*nb;

                    magma_setdevice(d);
                    if( j+jb < m && d == (j/nb+1)%num_gpus ) {
                        /* owns the next column, look-ahead next block on stream1 */
                        nb0 = min(nb, nb2);
                        magmablasSetKernelStream(stream[d][stream1]);
                        magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for gemm update
                        trace_gpu_start( d, stream1, "trsm", "trsm" );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK)
                        magmablas_zlaset( MagmaUpperLower, trsm_nb, trsm_n, dinvA(d,0),trsm_nb );
                        magmablas_zlaset( MagmaUpperLower, nb0,jb, dx(d,0),nb0 );
                        magmablas_ztrsm_work( MagmaLeft, MagmaUpper,
                                              MagmaConjTrans, MagmaNonUnit,
                                              jb, nb0, c_one,
                                              dlpanel, ldpanel,
                                              dlA(d, j, nb*j_local2), ldda,
                                              1, dinvA(d,0), dx(d,0) );
#else
                        magma_ztrsm( MagmaLeft, MagmaUpper,
                                     MagmaConjTrans, MagmaNonUnit,
                                     jb, nb0, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, j, nb*j_local2), ldda);
#endif
                        magma_event_record( event[d][4], stream[d][stream1] );
                        trace_gpu_end( d, stream1 );
                    } else if( nb2 > 0 ) {
                        /* update all the blocks on stream2 */
                        magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for cholesky factor
                        trace_gpu_start( d, stream2, "trsm", "trsm" );
                        magmablasSetKernelStream(stream[d][stream2]);
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK)
                        magmablas_zlaset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,0),trsm_nb );
                        magmablas_zlaset( MagmaUpperLower, nb2,jb, dx(d,0),nb2 );
                        magmablas_ztrsm_work( MagmaLeft, MagmaUpper,
                                              MagmaConjTrans, MagmaNonUnit,
                                              jb, nb2, c_one,
                                              dlpanel, ldpanel,
                                              dlA(d, j, nb*j_local2), ldda,
                                              1, dinvA(d,0), dx(d,0) );
#else
                        magma_ztrsm( MagmaLeft, MagmaUpper,
                                     MagmaConjTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, j, nb*j_local2), ldda);
#endif
                        trace_gpu_end( d, stream2 );
                    }
                    d = (d+1)%num_gpus;
                } /* end of for */

                /* ========================================================== */
                if( j+jb < m ) {
                    d = (j/nb+1)%num_gpus;
                    /* next column */
                    j_local2 = j_local+1;
                    if( d > id ) j_local2--;
                    nb0 = min(nb, n_local[d]-nb*j_local2 );

                    /* even on 1 gpu, off-diagonals are copied to cpu (synchronize at the end).      *
                     * so we have the Cholesky factor, but only diagonal submatrix of the big panel, *
                     * on cpu at the end.                                                            */
                    int d2, buf2;
                    magma_setdevice(d);
                    /* lookahead done */
                    magma_queue_wait_event( stream[d][stream3], event[d][4] );

                    trace_gpu_start( d, stream3, "comm", "row to CPU" );
                    magma_zgetmatrix_async( (j+jb), nb0,
                                            dlA(d, 0, nb*j_local2), ldda,
                                            Aup(0,j+jb),            lda,
                                            stream[d][stream3] );
                    trace_gpu_end( d, stream3 );
                    magma_event_record( event[d][3], stream[d][stream3] );
                    /* needed on pluto */
                    //magma_queue_sync( stream[d][stream3] );

                    /* broadcast rows to gpus on stream2 */
                    buf2 = ((j+jb)/nb)%num_gpus;
                    for( d2=0; d2<num_gpus; d2++ ) {
                        if( d2 != d )
                        {
                            magma_setdevice(d2);
                            trace_gpu_start( d2, stream3, "comm", "row to GPUs" );
                            magma_queue_wait_event( stream[d2][stream3], event[d][3] ); // rows arrived at cpu on stream3
                            magma_zsetmatrix_async( j+jb, nb0,
                                                    Aup(0,j+jb),       lda,
                                                    dlP(d2,nb,0,buf2), lddp,
                                                    stream[d2][stream3] );
                            trace_gpu_end( d2, stream3 );
                            magma_event_record( event[d2][0], stream[d2][stream3] );
                        }
                    }

                    /* =========================== */
                    /* update the remaining blocks */
                    nb2 = n_local[d]-(nb*j_local2 + nb0);
                    if( nb2 > 0 ) {
                        if( d == id ) {
                            dlpanel = dlA(d, j, nb*j_local);
                            ldpanel = ldda;
                        } else {
                            dlpanel = dlP(d,0,0,buf);
                            ldpanel = lddp;
                        }
                        magma_setdevice(d);
                        magmablasSetKernelStream(stream[d][stream2]);
                        trace_gpu_start( d, stream2, "trsm", "trsm" );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK)
                        int flag = 0;
                        if (flag == 0) {
                            magma_queue_wait_event( stream[d][stream2], event[d][4] ); // lookahead -> diagonal inversion
                        } else {
                            magmablas_zlaset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,flag),trsm_nb );
                            magma_queue_wait_event( stream[d][stream2], event[d][1] ); // panel received
                        }
                        magmablas_zlaset( MagmaUpperLower, nb2,jb, dx(d,1),nb2 );
                        magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                              jb, nb2, c_one,
                                              dlpanel, ldpanel,
                                              dlA(d, j, nb*j_local2+nb0), ldda,
                                              flag, dinvA(d,flag), dx(d,1) );
#else
                        magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for cholesky factor
                        magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel, ldpanel,
                                     dlA(d, j, nb*j_local2+nb0), ldda);
#endif
                        trace_gpu_end( d, stream2 );
                    }
                }
            } /* end of ztrsm */
        } /* end of for j=1, .., n */
    } else {

        /* ---------------------------------------------- */
        /* Lower-triangular case                          */
        /* > Compute the Cholesky factorization A = L*L'. */
        /* ---------------------------------------------- */
        for (j=0; j<n; j+=nb) {

            /* Set the GPU number that holds the current panel */
            id  = (j/nb)%num_gpus;
            buf = (j/nb)%num_gpus;

            /* Set the local index where the current panel is */
            j_local = j/(nb*num_gpus);
            jb = min(nb, (n-j));

            /* Update the current diagonal block on stream1 */
            magma_setdevice(id);
            if( j > 0 ) {
                magmablasSetKernelStream(stream[id][stream1]);
                magma_zherk(MagmaLower, MagmaNoTrans, jb, j,
                            d_neg_one, dlA(id, nb*j_local, 0), ldda,
                            d_one,     dlA(id, nb*j_local, j), ldda);
            }

            /* send the diagonal to cpu on stream1 */
            magma_zgetmatrix_async( jb, jb,
                                    dlA(id, nb*j_local, j), ldda,
                                    Alo(j,j),               lda,
                                    stream[id][stream1] );

            /* update off-diagonal blocks of the panel */
            if( j > 0 ) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    j_local2 = j_local+1;
                    if( d > id ) j_local2 --;
                    nb0 = nb*j_local2;

                    if( nb0 < n_local[d] ) {
                        magma_setdevice(d);
                        magmablasSetKernelStream(stream[d][stream2]);
                        if( d == id ) {
                            dlpanel = dlA(d, nb*j_local, 0);
                            ldpanel = ldda;
                        } else {
                            dlpanel = dlPT(d,0,nb,buf);
                            ldpanel = nb;
                            magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu
                        }
                        magma_zgemm( MagmaNoTrans, MagmaConjTrans,
                                     n_local[d]-nb0, jb, j,
                                     c_neg_one, dlA(d, nb0, 0), ldda,
                                     dlpanel,        ldpanel,
                                     c_one,     dlA(d, nb0, j), ldda);
                        magma_event_record( event[d][2], stream[d][stream2] );
                    }
                    d = (d+1)%num_gpus;
                }
            }

            /* wait for the panel and factorized it on cpu */
            magma_setdevice(id);
            magma_queue_sync( stream[id][stream1] );
            lapackf77_zpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info);
            if (*info != 0) {
                *info = *info + j;
                break;
            }

            /* send the diagonal to gpus on stream1 */
            if ( (j+jb) < m) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    if( d == id ) {
                        dlpanel = dlA(d, nb*j_local, j);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlPT(d, 0, 0, buf);
                        ldpanel = nb;
                    }
                    magma_setdevice(d);
                    magma_zsetmatrix_async( jb, jb,
                                            Alo(j,j), lda,
                                            dlpanel,  ldpanel,
                                            stream[d][stream1] );
                    magma_event_record( event[d][1], stream[d][stream1] );
                    d = (d+1)%num_gpus;
                }
            } else {
                magma_setdevice(id);
                magma_zsetmatrix_async( jb, jb,
                                        Alo(j,j),               lda,
                                        dlA(id, nb*j_local, j), ldda,
                                        stream[id][stream1] );
            }

            /* panel factorize the off-diagonal */
            if ( (j+jb) < m) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    /* next column */
                    j_local2 = j_local+1;
                    if( d > id ) j_local2--;
                    if( d == id ) {
                        dlpanel = dlA(d, nb*j_local, j);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlPT(d, 0, 0, buf);
                        ldpanel = nb;
                    }
                    nb2 = n_local[d] - j_local2*nb;
                    nb0 = min(nb, nb2);

                    magma_setdevice(d);
                    if( j+nb < n && d == (j/nb+1)%num_gpus ) { /* owns next column, look-ahead next block on stream1 */
                        if ( j > 0 ) magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for gemm update
                        magmablasSetKernelStream(stream[d][stream1]);
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK)
                        magmablas_zlaset( MagmaUpperLower, trsm_nb, trsm_n, dinvA(d,0),trsm_nb );
                        magmablas_zlaset( MagmaUpperLower, nb0,jb, dx(d,0),nb0 );
                        magmablas_ztrsm_work( MagmaRight, MagmaLower,
                                              MagmaConjTrans, MagmaNonUnit,
                                              nb0, jb, c_one,
                                              dlpanel, ldpanel,
                                              dlA(d, nb*j_local2, j), ldda,
                                              1, dinvA(d,0), dx(d,0) );
#else
                        magma_ztrsm( MagmaRight, MagmaLower,
                                     MagmaConjTrans, MagmaNonUnit,
                                     nb0, jb, c_one,
                                     dlpanel, ldpanel,
                                     dlA(d, nb*j_local2, j), ldda);
#endif
                        magma_event_record( event[d][4], stream[d][stream1] );
                    } else if( nb2 > 0 ) { /* other gpus updating all the blocks on stream2 */
                        /* update the entire column */
                        magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for the cholesky factor
                        magmablasSetKernelStream(stream[d][stream2]);
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK)
                        magmablas_zlaset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,0),trsm_nb );
                        magmablas_zlaset( MagmaUpperLower, nb2,jb, dx(d,0),nb2 );
                        magmablas_ztrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                              nb2, jb, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, nb*j_local2, j), ldda,
                                              1, dinvA(d,0), dx(d,0) );
#else
                        magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                     nb2, jb, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, nb*j_local2, j), ldda);
#endif
                    }
                    d = (d+1)%num_gpus;
                } /* end for d */

                /* ========================================================== */
                if( j+jb < n ) {
                    d = (j/nb+1)%num_gpus;
                    /* next column */
                    j_local2 = j_local+1;
                    if( d > id ) j_local2--;
                    nb0 = min(nb, n_local[d]-nb*j_local2 );

                    /* even on 1 gpu, we copy off-diagonal to cpu (but don't synchronize).  */
                    /* so we have the Cholesky factor on cpu at the end.                    */
                    int d2, buf2;
//#define ZPOTRF_DEVICE_TO_DEVICE
#ifdef ZPOTRF_DEVICE_TO_DEVICE
                    // lookahead done

                    /* broadcast the rows to gpus */
                    buf2 = ((j+jb)/nb)%num_gpus;
                    for( d2=0; d2<num_gpus; d2++ ) {
                        magma_setdevice(d2);
                        magma_queue_wait_event( stream[d2][stream3], event[d][4] );
                        if( d2 != d ) {
                            magma_zcopymatrix_async( nb0, j+jb,
                                                     dlPT(d2,0,nb,buf2), nb, // first nbxnb reserved for diagonal block
                                                     dlA(d, nb*j_local2, 0), ldda,
                                                     stream[d2][stream3] );
                            magma_event_record( event[d2][0], stream[d2][stream3] );
                        } else {
                            magma_zgetmatrix_async( nb0, j+jb,
                                                    dlA(d, nb*j_local2, 0), ldda,
                                                    Alo(j+jb,0),            lda,
                                                    stream[d][stream3] );
                        }
                    }
#else
                    // lookahead done
                    magma_setdevice(d);
                    magma_queue_wait_event( stream[d][stream3], event[d][4] );
                    magma_zgetmatrix_async( nb0, j+jb,
                                            dlA(d, nb*j_local2, 0), ldda,
                                            Alo(j+jb,0),            lda,
                                            stream[d][stream3] );
                    magma_event_record( event[d][3], stream[d][stream3] );
                    /* syn on rows on CPU, seem to be needed on Pluto */
                    //magma_queue_sync( stream[d][stream3] );

                    /* broadcast the rows to gpus */
                    buf2 = ((j+jb)/nb)%num_gpus;
                    for( d2=0; d2<num_gpus; d2++ ) {
                        if( d2 != d )
                        {
                            magma_setdevice(d2);
                            magma_queue_wait_event( stream[d2][stream3], event[d][3] ); // getmatrix done
                            magma_zsetmatrix_async( nb0, j+jb,
                                                    Alo(j+jb,0),        lda,
                                                    dlPT(d2,0,nb,buf2), nb, // first nbxnb reserved for diagonal block
                                                    stream[d2][stream3] );
                            magma_event_record( event[d2][0], stream[d2][stream3] );
                        }
                    }
#endif
                    /* =================================== */
                    /* updates remaining blocks on stream2 */
                    nb2 = n_local[d] - (j_local2*nb + nb0);
                    if( nb2 > 0 ) {
                        if( d == id ) {
                            dlpanel = dlA(d, nb*j_local, j);
                            ldpanel = ldda;
                        } else {
                            dlpanel = dlPT(d,0,0,buf);
                            ldpanel = nb;
                        }
                        magma_setdevice(d);
                        magmablasSetKernelStream(stream[d][stream2]);
                        /* update the remaining blocks in the column */
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK)
                        int flag = 0;
                        if (flag == 0) {
                            magma_queue_wait_event( stream[d][stream2], event[d][4] ); // lookahead -> diagonal inversion
                        } else {
                            magmablas_zlaset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,flag),trsm_nb );
                            magma_queue_wait_event( stream[d][stream2], event[d][1] ); // panel received
                        }
                        magmablas_zlaset( MagmaUpperLower, nb2,jb, dx(d,1),nb2 );
                        magmablas_ztrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                              nb2, jb, c_one,
                                              dlpanel,                    ldpanel,
                                              dlA(d, nb*j_local2+nb0, j), ldda,
                                              flag, dinvA(d,flag), dx(d,1) );
#else
                        magma_queue_wait_event( stream[d][stream2], event[d][1] ); // panel received
                        magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                     nb2, jb, c_one,
                                     dlpanel,                    ldpanel,
                                     dlA(d, nb*j_local2+nb0, j), ldda);
#endif
                    }
                }
            }
        }
    } /* end of else not upper */

    /* == finalize the trace == */
    trace_finalize( "zpotrf.svg","trace.css" );
    for( d=0; d<num_gpus; d++ ) {
        magma_setdevice(d);
        for( j=0; j<3; j++ ) {
            magma_queue_sync( stream[d][j] );
        }
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK)
        magma_free( d_dinvA[d] );
        magma_free( d_x[d] );
#endif
        magmablasSetKernelStream(NULL);
    }
    magma_setdevice(0);

    return *info;
} /* magma_zpotrf_mgpu */
Exemplo n.º 2
0
/**
    Purpose
    -------
    SORGQR generates an M-by-N REAL matrix Q with orthonormal columns,
    which is defined as the first N columns of a product of K elementary
    reflectors of order M

        Q  =  H(1) H(2) . . . H(k)

    as returned by SGEQRF.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix Q. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix Q. M >= N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines the
            matrix Q. N >= K >= 0.

    @param[in,out]
    A       REAL array A, dimension (LDDA,N).
            On entry, the i-th column must contain the vector
            which defines the elementary reflector H(i), for
            i = 1,2,...,k, as returned by SGEQRF_GPU in the
            first k columns of its array argument A.
            On exit, the M-by-N matrix Q.

    @param[in]
    lda     INTEGER
            The first dimension of the array A. LDA >= max(1,M).

    @param[in]
    tau     REAL array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by SGEQRF_GPU.

    @param[in]
    T       REAL array, dimension (NB, min(M,N)).
            T contains the T matrices used in blocking the elementary
            reflectors H(i), e.g., this can be the 6th argument of
            magma_sgeqrf_gpu (except stored on the CPU, not the GPU).

    @param[in]
    nb      INTEGER
            This is the block size used in SGEQRF_GPU, and correspondingly
            the size of the T matrices, used in the factorization, and
            stored in T.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_sgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_sorgqr_m(
    magma_int_t m, magma_int_t n, magma_int_t k,
    float *A, magma_int_t lda,
    float *tau,
    float *T, magma_int_t nb,
    magma_int_t *info)
{
#define  A(i,j)   ( A    + (i) + (j)*lda )
#define dA(d,i,j) (dA[d] + (i) + (j)*ldda)
#define dT(d,i,j) (dT[d] + (i) + (j)*nb)

    float c_zero = MAGMA_S_ZERO;
    float c_one  = MAGMA_S_ONE;

    magma_int_t m_kk, n_kk, k_kk, mi;
    magma_int_t lwork, ldwork;
    magma_int_t d, i, ib, j, jb, ki, kk;
    float *work=NULL;

    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if ((n < 0) || (n > m)) {
        *info = -2;
    } else if ((k < 0) || (k > n)) {
        *info = -3;
    } else if (lda < max(1,m)) {
        *info = -5;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    if (n <= 0) {
        return *info;
    }
    
    magma_int_t di, dn;
    magma_int_t dpanel;

    magma_int_t ngpu = magma_num_gpus();
    
    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    
    // Allocate memory on GPUs for A and workspaces
    magma_int_t ldda    = magma_roundup( m, 32 );
    magma_int_t lddwork = magma_roundup( n, 32 );
    magma_int_t min_lblocks = (n / nb) / ngpu;  // min. blocks per gpu
    magma_int_t last_dev    = (n / nb) % ngpu;  // device with last block
    
    magma_int_t  nlocal[ MagmaMaxGPUs ] = { 0 };
    float *dA[ MagmaMaxGPUs ] = { NULL };
    float *dT[ MagmaMaxGPUs ] = { NULL };
    float *dV[ MagmaMaxGPUs ] = { NULL };
    float *dW[ MagmaMaxGPUs ] = { NULL };
    magma_queue_t queues[ MagmaMaxGPUs ] = { NULL };
    
    for( d = 0; d < ngpu; ++d ) {
        // example with n = 75, nb = 10, ngpu = 3
        // min_lblocks = 2
        // last_dev    = 1
        // gpu 0: 2  blocks, cols:  0- 9, 30-39, 60-69
        // gpu 1: 1+ blocks, cols: 10-19, 40-49, 70-74 (partial)
        // gpu 2: 1  block,  cols: 20-29, 50-59
        magma_setdevice( d );
        nlocal[d] = min_lblocks*nb;
        if ( d < last_dev ) {
            nlocal[d] += nb;
        }
        else if ( d == last_dev ) {
            nlocal[d] += (n % nb);
        }
        
        ldwork = nlocal[d]*ldda  // dA
               + nb*m            // dT
               + nb*ldda         // dV
               + nb*lddwork;     // dW
        if ( MAGMA_SUCCESS != magma_smalloc( &dA[d], ldwork )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            goto cleanup;
        }
        dT[d] = dA[d] + nlocal[d]*ldda;
        dV[d] = dT[d] + nb*m;
        dW[d] = dV[d] + nb*ldda;
        
        magma_queue_create( d, &queues[d] );
    }
    
    trace_init( 1, ngpu, 1, queues );
    
    // first kk columns are handled by blocked method.
    // ki is start of 2nd-to-last block
    if ((nb > 1) && (nb < k)) {
        ki = (k - nb - 1) / nb * nb;
        kk = min(k, ki + nb);
    } else {
        ki = 0;
        kk = 0;
    }

    // Allocate CPU work space
    // n*nb  for larfb work
    // m*nb  for V
    // nb*nb for T
    lwork = (n + m + nb) * nb;
    magma_smalloc_cpu( &work, lwork );
    if (work == NULL) {
        *info = MAGMA_ERR_HOST_ALLOC;
        goto cleanup;
    }
    float *work_T, *work_V;
    work_T = work + n*nb;
    work_V = work + n*nb + nb*nb;

    // Use unblocked code for the last or only block.
    if (kk < n) {
        trace_cpu_start( 0, "ungqr", "ungqr last block" );
        m_kk = m - kk;
        n_kk = n - kk;
        k_kk = k - kk;
        
        // sorgqr requires less workspace (n*nb), but is slow if k < sorgqr's block size.
        // replacing it with the 4 routines below is much faster (e.g., 60x).
        //magma_int_t iinfo;
        //lapackf77_sorgqr( &m_kk, &n_kk, &k_kk,
        //                  A(kk, kk), &lda,
        //                  &tau[kk], work, &lwork, &iinfo );
        
        lapackf77_slacpy( MagmaFullStr, &m_kk, &k_kk, A(kk,kk), &lda, work_V, &m_kk);
        lapackf77_slaset( MagmaFullStr, &m_kk, &n_kk, &c_zero, &c_one, A(kk, kk), &lda );
        
        lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr,
                          &m_kk, &k_kk,
                          work_V, &m_kk, &tau[kk], work_T, &k_kk);
        lapackf77_slarfb( MagmaLeftStr, MagmaNoTransStr, MagmaForwardStr, MagmaColumnwiseStr,
                          &m_kk, &n_kk, &k_kk,
                          work_V, &m_kk, work_T, &k_kk, A(kk, kk), &lda, work, &n_kk );
        
        if (kk > 0) {
            for( j=kk; j < n; j += nb ) {
                jb = min( n-j, nb );
                d  =  (j / nb) % ngpu;
                di = ((j / nb) / ngpu) * nb;
                magma_setdevice( d );
                magma_ssetmatrix( m_kk, jb,
                                  A(kk, j),  lda,
                                  dA(d, kk, di), ldda, queues[d] );
                
                // Set A(1:kk,kk+1:n) to zero.
                magmablas_slaset( MagmaFull, kk, jb, c_zero, c_zero, dA(d, 0, di), ldda, queues[d] );
            }
        }
        trace_cpu_end( 0 );
    }

    if (kk > 0) {
        // Use blocked code
        // send T to all GPUs
        for( d = 0; d < ngpu; ++d ) {
            magma_setdevice( d );
            trace_gpu_start( d, 0, "set", "set T" );
            magma_ssetmatrix_async( nb, min(m,n), T, nb, dT[d], nb, queues[d] );
            trace_gpu_end( d, 0 );
        }
        
        // queue: set Aii (V) --> laset --> laset --> larfb --> [next]
        // CPU has no computation
        for( i = ki; i >= 0; i -= nb ) {
            ib = min(nb, k - i);
            mi = m - i;
            dpanel =  (i / nb) % ngpu;
            di     = ((i / nb) / ngpu) * nb;

            // Send current panel to dV on the GPUs
            lapackf77_slaset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda );
            for( d = 0; d < ngpu; ++d ) {
                magma_setdevice( d );
                trace_gpu_start( d, 0, "set", "set V" );
                magma_ssetmatrix_async( mi, ib,
                                        A(i, i), lda,
                                        dV[d],   ldda, queues[d] );
                trace_gpu_end( d, 0 );
            }
            
            // set panel to identity
            magma_setdevice( dpanel );
            trace_gpu_start( dpanel, 0, "laset", "laset" );
            magmablas_slaset( MagmaFull, i,  ib, c_zero, c_zero, dA(dpanel, 0, di), ldda, queues[dpanel] );
            magmablas_slaset( MagmaFull, mi, ib, c_zero, c_one,  dA(dpanel, i, di), ldda, queues[dpanel] );
            trace_gpu_end( dpanel, 0 );
            
            if (i < n) {
                // Apply H to A(i:m,i:n) from the left
                for( d = 0; d < ngpu; ++d ) {
                    magma_setdevice( d );
                    magma_indices_1D_bcyclic( nb, ngpu, d, i, n, &di, &dn );
                    trace_gpu_start( d, 0, "larfb", "larfb" );
                    magma_slarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise,
                                      mi, dn-di, ib,
                                      dV[d],        ldda, dT(d,0,i), nb,
                                      dA(d, i, di), ldda, dW[d], lddwork, queues[d] );
                    trace_gpu_end( d, 0 );
                }
            }
        }
        
        // copy result back to CPU
        trace_cpu_start( 0, "get", "get A" );
        magma_sgetmatrix_1D_col_bcyclic( m, n, dA, ldda, A, lda, ngpu, nb, queues );
        trace_cpu_end( 0 );
    }
    
    #ifdef TRACING
    char name[80];
    snprintf( name, sizeof(name), "sorgqr-n%d-ngpu%d.svg", m, ngpu );
    trace_finalize( name, "trace.css" );
    #endif
    
cleanup:
    for( d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magma_free( dA[d] );
        magma_queue_destroy( queues[d] );
    }
    magma_free_cpu( work );
    magma_setdevice( orig_dev );
    
    return *info;
} /* magma_sorgqr */
Exemplo n.º 3
0
extern "C" magma_int_t
magma_cpotrf2_msub(
    magma_int_t num_subs, magma_int_t num_gpus, magma_uplo_t uplo, magma_int_t m, magma_int_t n, 
    magma_int_t off_i, magma_int_t off_j, magma_int_t nb,
    magmaFloatComplex_ptr *d_lA, size_t d_lA_offset, magma_int_t ldda, 
    magmaFloatComplex_ptr *d_lP, magma_int_t lddp, 
    magmaFloatComplex *a, magma_int_t lda, magma_int_t h,
    magma_queue_t *queues,
    magma_int_t *info )
{
/*  -- clMAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date November 2014

    Purpose   
    =======   
    CPOTRF computes the Cholesky factorization of a complex Hermitian   
    positive definite matrix dA.   

    The factorization has the form   
       dA = U**H * U,  if UPLO = 'U', or   
       dA = L  * L**H,  if UPLO = 'L',   
    where U is an upper triangular matrix and L is lower triangular.   

    This is the block version of the algorithm, calling Level 3 BLAS.   

    Arguments   
    =========   
    UPLO    (input) CHARACTER*1   
            = 'U':  Upper triangle of dA is stored;   
            = 'L':  Lower triangle of dA is stored.   

    N       (input) INTEGER   
            The order of the matrix dA.  N >= 0.   

    dA      (input/output) COMPLEX array on the GPU, dimension (LDDA,N)   
            On entry, the Hermitian matrix dA.  If UPLO = 'U', the leading   
            N-by-N upper triangular part of dA contains the upper   
            triangular part of the matrix dA, and the strictly lower   
            triangular part of dA is not referenced.  If UPLO = 'L', the   
            leading N-by-N lower triangular part of dA contains the lower   
            triangular part of the matrix dA, and the strictly upper   
            triangular part of dA is not referenced.   

            On exit, if INFO = 0, the factor U or L from the Cholesky   
            factorization dA = U**H * U or dA = L * L**H.   

    LDDA     (input) INTEGER   
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalescent memory accesses LDDA must be
            divisible by 16.

    INFO    (output) INTEGER   
            = 0:  successful exit   
            < 0:  if INFO = -i, the i-th argument had an illegal value   
            > 0:  if INFO = i, the leading minor of order i is not   
                  positive definite, and the factorization could not be   
                  completed.   
    =====================================================================   */

    int tot_subs = num_subs*num_gpus;
    magma_int_t     j, jb, nb0, nb2, dd, d, id, j_local, j_local2;
    magmaFloatComplex c_one     = MAGMA_C_ONE;
    magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;
    float          d_one     =  1.0;
    float          d_neg_one = -1.0;
    magmaFloatComplex_ptr dlpanel;
    size_t dlpanel_offset;
    magma_int_t n_local[MagmaMaxSubs * MagmaMaxGPUs], ldpanel;

    // initialize trace
    trace_init(1, num_gpus, 2, queues);

    *info = 0;
    if ( (uplo != MagmaUpper) && (uplo != MagmaLower) ) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if ((uplo != MagmaUpper) && tot_subs*ldda < max(1,n)) {
        *info = -4;
    } else if ((uplo == MagmaUpper) && ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    for (d=0; d<tot_subs; d++) {
        /* local-n and local-ld */
        if (uplo == MagmaUpper) {
            n_local[d] = ((n/nb)/tot_subs)*nb;
            if (d < (n/nb)%tot_subs)
               n_local[d] += nb;
            else if (d == (n/nb)%tot_subs)
              n_local[d] += n%nb;
        } else {
            n_local[d] = ((m/nb)/tot_subs)*nb;
            if (d < (m/nb)%tot_subs)
               n_local[d] += nb;
            else if (d == (m/nb)%tot_subs)
              n_local[d] += m%nb;
        }
    }

    /* Use blocked code. */
    if (uplo == MagmaUpper) {
        /* ---------------------------------------------- */
        /* Upper-triangular case                          */
        /* > Compute the Cholesky factorization A = U'*U. */
        /* ---------------------------------------------- */
        for (j=0; j<m; j+=nb) {
            /* Set the GPU number that holds the current panel */
            id  = (j/nb)%tot_subs;
            /* Set the local index where the current panel is */
            j_local = j/(nb*tot_subs);
            jb = min(nb, (m-j));
            if (j > 0) {
                // Wait for the column on CPU
                magma_queue_sync(queues[2*(id%num_gpus)]); 
                /* broadcast off-diagonal column to all gpus */
                d = (j/nb+1)%num_gpus;
                for (dd=0; dd<num_gpus; dd++) {
                    if (d != id%num_gpus) {
                        magma_csetmatrix_async( j, jb, 
                                                Aup(0,j),                lda, 
                                                dlP(d,jb,0,id%num_gpus), lddp, 
                                                queues[2*d], 
                                                trace_gpu_event(d, 0, "set", "set-col") );
                    }
                    d = (d+1)%num_gpus;
                }
                /* Update the current diagonal block */
                trace_gpu_start(id%num_gpus, 1, "herk", "herk");
                magma_cherk(MagmaUpper, MagmaConjTrans, jb, j, 
                            d_neg_one, dlA(id, 0, nb*j_local), ldda,
                            d_one,     dlA(id, j, nb*j_local), ldda,
                            queues[2*(id%num_gpus)+1]);
                magma_queue_sync(queues[2*(id%num_gpus)+1]); // Wait for syrk
            }
            /* Send the diagonal to cpu */
            magma_cgetmatrix_async( jb, jb, 
                                    dlA(id, j, nb*j_local), ldda,
                                    Aup(j,j),               lda,
                                    queues[2*(id%num_gpus)], 
                                    trace_gpu_event(id%num_gpus, 0, "get", "get-diag") );
            if (j > 0) {
                /* Compute the local block column of the panel. */
                d = (j/nb+1)%tot_subs;
                for (dd=0; dd<tot_subs; dd++) {
                    j_local2 = j_local+1;
                    if (d > id) j_local2 --;
                    nb0 = nb*j_local2;
                    if (n_local[d] > nb0) {
                        if (d%num_gpus != id%num_gpus) {
                            dlpanel = d_lP[d%num_gpus];
                            dlpanel_offset = dlP_offset(jb, 0, id%num_gpus);
                            ldpanel = lddp;
                            /* Wait for the offdiagonal column */
                            if (dd < num_gpus) magma_queue_sync(queues[2*(d%num_gpus)]);
                        } else {
                            dlpanel = d_lA[id];
                            dlpanel_offset = dlA_offset(0, nb*j_local);
                            ldpanel = ldda;
                        }
                        /* update the panel */
                        trace_gpu_start(d%num_gpus, 1, "gemm", "gemm");
                        magma_cgemm(MagmaConjTrans, MagmaNoTrans, 
                                    jb, n_local[d]-nb0, j, 
                                    c_neg_one, dlpanel, dlpanel_offset, ldpanel,
                                               dlA(d, 0, nb0), ldda, 
                                    c_one,     dlA(d, j, nb0), ldda,
                                    queues[2*(d%num_gpus)+1]);
                    }
                    d = (d+1)%tot_subs;
                }
            }
            /* factor the diagonal */
            magma_queue_sync( queues[2*(id%num_gpus)] ); // wait for the diagonal
            trace_cpu_start(0, "potrf", "potrf");
            lapackf77_cpotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info);
            trace_cpu_end(0);
            if (*info != 0) {
                *info = *info + j;
                break;
            }

            /* send the diagonal to gpus */
            if ((j+jb) < n) {
                d = (j/nb+1)%num_gpus;
                for (dd=0; dd<num_gpus; dd++) {
                    if (d == id%num_gpus) {
                        dlpanel = d_lA[id];
                        dlpanel_offset = dlA_offset(j, nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = d_lP[d];
                        dlpanel_offset = dlP_offset(0, 0, id%num_gpus);
                        ldpanel = lddp;
                    }
                    magma_csetmatrix_async( jb, jb, 
                                            Aup(j,j),                lda,
                                            dlpanel, dlpanel_offset, ldpanel, 
                                            queues[2*d], 
                                            trace_gpu_event(d, 0, "set", "set-diag"));
                    d = (d+1)%num_gpus;
                }
            } else {
                magma_csetmatrix_async( jb, jb, 
                                        Aup(j,j),               lda, 
                                        dlA(id, j, nb*j_local), ldda,
                                        queues[2*(id%num_gpus)], 
                                        trace_gpu_event(id%num_gpus, 0, "set", "set-diag") );
            }

            /* panel-factorize the off-diagonal */
            if ((j+jb) < n) {
                d = (j/nb+1)%tot_subs;
                for (dd=0; dd<tot_subs; dd++) {
                    /* next column */
                    j_local2 = j_local+1;
                    if (d > id) j_local2--;
                    if (d%num_gpus == id%num_gpus) {
                        dlpanel = d_lA[id];
                        dlpanel_offset = dlA_offset(j, nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = d_lP[d%num_gpus];
                        dlpanel_offset = dlP_offset(0, 0, id%num_gpus);
                        ldpanel = lddp;
                    }
                    nb2 = n_local[d]-nb*j_local2;
                    nb0 = min(nb, nb2);
                    if (dd < num_gpus) magma_queue_sync( queues[2*(d%num_gpus)] ); // wait for the diagonal
                    if (j+jb < m && d == (j/nb+1)%tot_subs) {
                        /* owns the next column, look-ahead the column */
                        trace_gpu_start(d%num_gpus, 1, "trsm", "trsm");
                        magma_ctrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                     jb, nb0, c_one,
                                     dlpanel, dlpanel_offset, ldpanel,
                                     dlA(d, j, nb*j_local2), ldda, 
                                     queues[2*(d%num_gpus)+1] );
                        /* send the column to cpu */
                        magma_queue_sync(queues[2*(d%num_gpus)+1]);  // wait for lookahead
                        magma_cgetmatrix_async( (j+jb), nb0, 
                                                dlA(d, 0, nb*j_local2), ldda, 
                                                Aup(0,j+jb),            lda,
                                                queues[2*(d%num_gpus)], 
                                                trace_gpu_event(d%num_gpus, 0, "get", "get-col") );
                        /* update the remaining blocks */
                        nb2 = nb2 - nb0;
                        trace_gpu_start(d%num_gpus, 1, "trsm", "trsm");
                        magma_ctrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                     jb, nb2, c_one, 
                                     dlpanel, dlpanel_offset, ldpanel,
                                     dlA(d, j, nb*j_local2+nb0), ldda, 
                                     queues[2*(d%num_gpus)+1] );
                    } else if (nb2 > 0) {
                        /* update the entire trailing matrix */
                        trace_gpu_start(d%num_gpus, 1, "trsm", "trsm");
                        magma_ctrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, 
                                     jb, nb2, c_one, 
                                     dlpanel, dlpanel_offset, ldpanel,
                                     dlA(d, j, nb*j_local2), ldda,
                                     queues[2*(d%num_gpus)+1] );
                    }
                    d = (d+1)%tot_subs;
                }
            }
        }
    } else { 
        /* -------------------------------------------- */
        /* Lower-triangular case                        */
        /* Compute the Cholesky factorization A = L*L'. */
        /* -------------------------------------------- */
        for (j=0; j<n; j+=nb) {
            /* Set the GPU number that holds the current panel */
            id  = (j/nb)%tot_subs;
            /* Set the local index where the current panel is */
            j_local = j/(nb*tot_subs);
            jb = min(nb, (n-j));
            if (j > 0) {
                if (num_gpus > 1) {
                    // Wait for the row on CPU to broadcast
                    magma_queue_sync(queues[2*(id%num_gpus)]); 
                }
                /* broadcast off-diagonal row to all the GPUs */
                d = (j/nb+1)%num_gpus;
                for (dd=0; dd<num_gpus; dd++) {
                    if (d != id%num_gpus) {
                        /* send it to GPU-d */
                        magma_csetmatrix_async( jb, j,
                                                Alo(j,0),                 lda,
                                                dlPT(d,0,jb,id%num_gpus), nb, 
                                                queues[2*d], 
                                                trace_gpu_event(d, 0, "set", "set-row") );
                    }
                    d = (d+1)%num_gpus;
                }
                /* Update the current diagonal block */
                trace_gpu_start(id%num_gpus, 1, "herk", "herk");
                magma_cherk(MagmaLower, MagmaNoTrans, jb, j,
                            d_neg_one, dlA(id, nb*j_local, 0), ldda,
                            d_one,     dlA(id, nb*j_local, j), ldda,
                            queues[2*(id%num_gpus)+1]);
                magma_queue_sync(queues[2*(id%num_gpus)+1]); // wait for syrk
            }
            /* send the diagonal to cpu */
            magma_cgetmatrix_async( jb, jb,
                                    dlA(id, nb*j_local, j), ldda,
                                    Alo(j,j),               lda, 
                                    queues[2*(id%num_gpus)], 
                                    trace_gpu_event(id%num_gpus, 0, "get", "get") );
            /* update the offdiagonal blocks */
            if (j > 0) {
                /* compute the block-rows of the panel */
                d = (j/nb+1)%tot_subs;
                for (dd=0; dd<tot_subs; dd++) {
                    j_local2 = j_local+1;
                    if (d > id) j_local2 --;
                    nb0 = nb*j_local2;
                    if (nb0 < n_local[d]) {
                        if (d%num_gpus != id%num_gpus) {
                            dlpanel = d_lP[d%num_gpus];
                            dlpanel_offset = dlPT_offset(0, jb, id%num_gpus);
                            ldpanel = nb;
                            /* Wait for offdiagonal row */
                            if (dd < num_gpus) magma_queue_sync(queues[2*(d%num_gpus)]);
                        } else {
                            dlpanel = d_lA[id];
                            dlpanel_offset = dlA_offset(nb*j_local, 0);
                            ldpanel = ldda;
                        }
                        /* Update the panel */
                        trace_gpu_start(d%num_gpus, 1, "gemm", "gemm");
                        magma_cgemm( MagmaNoTrans, MagmaConjTrans,
                                     n_local[d]-nb0, jb, j,
                                     c_neg_one, dlA(d, nb0, 0), ldda,
                                                dlpanel, dlpanel_offset, ldpanel,
                                     c_one,     dlA(d, nb0, j), ldda, 
                                     queues[2*(d%num_gpus)+1]);
                    }
                    d = (d+1)%tot_subs;
                }
            }

            /* factor the diagonal */
            magma_queue_sync( queues[2*(id%num_gpus)] );
            trace_cpu_start(0, "potrf", "potrf");
            lapackf77_cpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info);
            trace_cpu_end(0);
            if (*info != 0) {
                printf( " cpotrf returned %d (id=%d,j=%d,j_local=%d,jb=%d)\n",*info,id,j,j_local,jb );
                *info = *info + j;
                break;
            }

            /* send the diagonal to gpus */
            if ((j+jb) < m) {
                d = (j/nb+1)%num_gpus;
                for (dd=0; dd<num_gpus; dd++) {
                    if (d == id%num_gpus) {
                        dlpanel = d_lA[id];
                        dlpanel_offset = dlA_offset(nb*j_local, j);
                        ldpanel = ldda;
                    } else {
                        dlpanel = d_lP[d];
                        dlpanel_offset = dlPT_offset(0, 0, id%num_gpus);
                        ldpanel = nb;
                    }
                    magma_csetmatrix_async( jb, jb,
                                            Alo(j,j), lda,
                                            dlpanel,  dlpanel_offset, ldpanel, 
                                            queues[2*d], 
                                            trace_gpu_event(d, 0, "set", "set-diag") );
                    d = (d+1)%num_gpus;
                }
            } else {
                magma_csetmatrix_async( jb, jb,
                                        Alo(j,j),               lda,
                                        dlA(id, nb*j_local, j), ldda, 
                                        queues[2*(id%num_gpus)],
                                        trace_gpu_event(id%num_gpus, 0, "set", "set-diag") );
            }

            /* factorize off-diagonal blocks */
            if ((j+jb) < m) {
                d = (j/nb+1)%tot_subs;
                for (dd=0; dd<tot_subs; dd++) {
                    /* next column */
                    j_local2 = j_local+1;
                    if (d > id) j_local2--;
                    if (d%num_gpus == id%num_gpus) {
                        dlpanel = d_lA[id];
                        dlpanel_offset = dlA_offset(nb*j_local, j);
                        ldpanel = ldda;
                    } else {         
                        dlpanel = d_lP[d%num_gpus];
                        dlpanel_offset = dlPT_offset(0, 0, id%num_gpus);
                        ldpanel = nb;
                    }
                    nb2 = n_local[d] - j_local2*nb;
                    nb0 = min(nb, nb2 );
                    // wait for the diagonal
                    if (dd < num_gpus) magma_queue_sync(queues[2*(d%num_gpus)]);
                    if (j+jb < n && d == (j/nb+1)%tot_subs) {
                        /* owns the next column, look-ahead the column */
                        trace_gpu_start(d%num_gpus, 1, "trsm", "trsm");
                        magma_ctrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, 
                                     nb0, jb, c_one,
                                     dlpanel,  dlpanel_offset, ldpanel, 
                                     dlA(d, nb*j_local2, j), ldda,
                                     queues[2*(d%num_gpus)+1]);
                        /* send the column to cpu */
                        magma_queue_sync( queues[2*(d%num_gpus)+1] ); // wait for lookahead
                        magma_cgetmatrix_async( nb0, j+jb,
                                                dlA(d, nb*j_local2, 0), ldda,
                                                Alo(j+jb,0),            lda, 
                                                queues[2*(d%num_gpus)], 
                                                trace_gpu_event(d%num_gpus, 0, "get", "get") );
                        /* update the remaining blocks */
                        nb2 = nb2 - nb0;
                        trace_gpu_start(d%num_gpus, 1, "trsm", "trsm");
                        magma_ctrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, 
                                     nb2, jb, c_one,
                                     dlpanel, dlpanel_offset, ldpanel, 
                                     dlA(d, nb*j_local2+nb0, j), ldda, 
                                     queues[2*(d%num_gpus)+1]);
                    } else if (nb2 > 0) {
                        /* update the entire trailing matrix */
                        trace_gpu_start(d%num_gpus, 1, "trsm", "trsm");
                        magma_ctrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, 
                                     nb2, jb, c_one,
                                     dlpanel, dlpanel_offset, ldpanel, 
                                     dlA(d, nb*j_local2, j), ldda, 
                                     queues[2*(d%num_gpus)+1]);
                    }
                    d = (d+1)%tot_subs;
                }
            }
        }
    } /* end of else not upper */

    /* clean up */
    for( d=0; d<num_gpus; d++ ) {
        magma_queue_sync( queues[2*d] );
        magma_queue_sync( queues[2*d+1] );
    }

    trace_finalize("cpotrf_msub.svg", "trace.css");
    return *info;
} /* magma_cpotrf2_msub */
Exemplo n.º 4
0
extern "C" magma_int_t
magma_zpotrf2_mgpu(int num_gpus, char uplo, magma_int_t m, magma_int_t n,
                   magma_int_t off_i, magma_int_t off_j, magma_int_t nb,
                   magmaDoubleComplex **d_lA,  magma_int_t ldda,
                   magmaDoubleComplex **d_lP,  magma_int_t lddp,
                   magmaDoubleComplex *a,      magma_int_t lda,   magma_int_t h,
                   magma_queue_t stream[][3], magma_event_t event[][5],
                   magma_int_t *info )
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    ZPOTRF computes the Cholesky factorization of a complex Hermitian
    positive definite matrix dA.

    The factorization has the form
       dA = U**H * U,  if UPLO = 'U', or
       dA = L  * L**H,  if UPLO = 'L',
    where U is an upper triangular matrix and L is lower triangular.

    This is the block version of the algorithm, calling Level 3 BLAS.

    Arguments
    =========
    UPLO    (input) CHARACTER*1
            = 'U':  Upper triangle of dA is stored;
            = 'L':  Lower triangle of dA is stored.

    N       (input) INTEGER
            The order of the matrix dA.  N >= 0.

    dA      (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N)
            On entry, the Hermitian matrix dA.  If UPLO = 'U', the leading
            N-by-N upper triangular part of dA contains the upper
            triangular part of the matrix dA, and the strictly lower
            triangular part of dA is not referenced.  If UPLO = 'L', the
            leading N-by-N lower triangular part of dA contains the lower
            triangular part of the matrix dA, and the strictly upper
            triangular part of dA is not referenced.

            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization dA = U**H * U or dA = L * L**H.

    LDDA     (input) INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalescent memory accesses LDDA must be
            dividable by 16.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
            > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.
    =====================================================================   */


    magma_int_t     j, jb, nb0, nb2, dd, d, id, j_local, j_local2, buf;
    char            uplo_[2] = {uplo, 0};
    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    double          d_one     =  1.0;
    double          d_neg_one = -1.0;
    int upper = lapackf77_lsame(uplo_, "U");
    magmaDoubleComplex *dlpanel;
    //magma_event_t event0[MagmaMaxGPUs], // syrk
    //            event1[MagmaMaxGPUs], // send off-diagonal
    //            event2[MagmaMaxGPUs], // send diagonal
    //            event3[MagmaMaxGPUs]; // trsm
    magma_int_t n_local[MagmaMaxGPUs], ldpanel;
    int stream0 = 0, stream1 = 1;
    #ifdef ZTRSM_WORK
    magmaDoubleComplex *d_dinvA[MagmaMaxGPUs][2], *d_x[MagmaMaxGPUs][2]; /* used by ztrsm_work */
    #endif
    
    *info = 0;
    if ( (! upper) && (! lapackf77_lsame(uplo_, "L")) ) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (!upper && num_gpus*ldda < max(1,n)) {
        *info = -4;
    } else if (upper && ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    for( d=0; d<num_gpus; d++ ) {
        /* local-n and local-ld */
        if (upper) {
            n_local[d] = ((n/nb)/num_gpus)*nb;
            if (d < (n/nb)%num_gpus)
                n_local[d] += nb;
            else if (d == (n/nb)%num_gpus)
                n_local[d] += n%nb;
        } else {
            n_local[d] = ((m/nb)/num_gpus)*nb;
            if (d < (m/nb)%num_gpus)
                n_local[d] += nb;
            else if (d == (m/nb)%num_gpus)
                n_local[d] += m%nb;
        }
        //magma_setdevice(d);
        //magma_event_create( &event0[d] );
        //magma_event_create( &event1[d] );
        //magma_event_create( &event2[d] );
        //magma_event_create( &event3[d] );
    }
    magma_setdevice(0);

    /* == initialize the trace */
    trace_init( 1, num_gpus, 3, stream );

    /* Use blocked code. */
    if (upper) {
        /* ---------------------------------------------- */
        /* Upper-triangular case                          */
        /* > Compute the Cholesky factorization A = U'*U. */
        /* ---------------------------------------------- */
        
#if defined(PRECISION_d) && defined(ZTRSM_WORK)
        /* invert the diagonals
         * Allocate device memory for the inversed diagonal blocks, size=m*NB
         */
        for( d=0; d<num_gpus; d++ ) {
            magma_setdevice(d);
            for( j=0; j<2; j++ ) {
                magma_zmalloc( &d_dinvA[d][j], nb*nb );
                magma_zmalloc( &d_x[d][j],      n*nb );
                cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(magmaDoubleComplex));
                cudaMemset(d_x[d][j],     0,  n*nb*sizeof(magmaDoubleComplex));
            }
        }
        magma_setdevice(0);
#endif
        
        for (j=0; j<m; j+=nb) {

            /* Set the GPU number that holds the current panel */
            id  = (j/nb)%num_gpus;
            buf = (j/nb)%num_gpus;
            
            /* Set the local index where the current panel is */
            j_local = j/(nb*num_gpus);
            jb = min(nb, (m-j));
            
            if( j > 0 ) {
                /* needed on pluto... */
                magma_setdevice(id);
                magma_queue_sync( stream[id][stream0] ); // wait for the column on CPU

                /* broadcast off-diagonal column to all gpus */
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    if( d != id ) {
                        magma_setdevice(d);
                
                        /* wait for it on CPU */
                        magma_queue_wait_event( stream[d][stream0], event[id][1] );
                
                        /* send it to GPU */
                        trace_gpu_start( d, stream0, "comm", "rows to GPUs" );
                        magma_zsetmatrix_async( j, jb,
                                                Aup(0,j),        lda,
                                                dlP(d,jb,0,buf), lddp,
                                                stream[d][stream0] );
                        trace_gpu_end( d, stream0 );
                        magma_event_record( event[d][1], stream[d][stream0] );
                    }
                    d = (d+1)%num_gpus;
                }
            }
            
            /* Update the current diagonal block */
            magma_setdevice(id);
            if( j > 0 ) {
                magmablasSetKernelStream(stream[id][stream1]);
                trace_gpu_start( id, stream1, "syrk", "syrk" );
                magma_zherk(MagmaUpper, MagmaConjTrans, jb, j,
                            d_neg_one, dlA(id, 0, nb*j_local), ldda,
                            d_one,     dlA(id, j, nb*j_local), ldda);
                trace_gpu_end( id, stream1 );
                magma_event_record( event[id][0], stream[id][stream1] );
            }

            /* send the diagonal to cpu */
            magma_queue_wait_event( stream[id][stream0], event[id][0] ); // wait for syrk
            trace_gpu_start( id, stream0, "comm", "D to CPU" );
            magma_zgetmatrix_async( jb, jb,
                                    dlA(id, j, nb*j_local), ldda,
                                    Aup(j,j),               lda,
                                    stream[id][stream0] );
            trace_gpu_end( id, stream0 );

            if ( j > 0 ) {

                /* Compute the local block column of the panel. */
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    j_local2 = j_local+1;
                    if( d > id ) j_local2 --;
                    nb0 = nb*j_local2;
                
                    if( n_local[d] > nb0 ) {
                        /* wait for the off-diagonal */
                        if( d != id ) {
                            //magma_queue_sync( stream[id][3] );
                            dlpanel = dlP(d, jb, 0, buf);
                            ldpanel = lddp;
                
                            /* wait for the offdiagonal column */
                            magma_queue_wait_event( stream[d][stream1], event[d][1] );
                        } else {
                            dlpanel = dlA(d, 0, nb*j_local);
                            ldpanel = ldda;
                        }
                        
                        /* update the panel */
                        magma_setdevice(d);
                        magmablasSetKernelStream(stream[d][stream1]);
                        trace_gpu_start( d, stream1, "gemm", "gemm" );
                        magma_zgemm(MagmaConjTrans, MagmaNoTrans,
                                    jb, n_local[d]-nb0, j,
                                    c_neg_one, dlpanel,        ldpanel,
                                               dlA(d, 0, nb0), ldda,
                                    c_one,     dlA(d, j, nb0), ldda);
                        trace_gpu_end( d, stream1 );
                    }
                    d = (d+1)%num_gpus;
                }
            }
            
            /* factor the diagonal */
            magma_setdevice(id);
            magma_queue_sync( stream[id][stream0] ); // wait for the diagonal
            trace_cpu_start( 0, "getrf", "getrf" );
            lapackf77_zpotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info);
            trace_cpu_end( 0 );
            if (*info != 0) {
                *info = *info + j;
                break;
            }

            /* send the diagonal to gpus */
            if ( (j+jb) < n) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    magma_setdevice(d);
                    if( d == id ) {
                        dlpanel = dlA(d, j, nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlP(d, 0, 0, buf);
                        ldpanel = lddp;
                    }
                    
                    trace_gpu_start( d, stream0, "comm", "D to GPUs" );
                    magma_zsetmatrix_async( jb, jb,
                                            Aup(j,j), lda,
                                            dlpanel,  ldpanel,
                                            stream[d][stream0] );
                    trace_gpu_end( d, stream0 );
                    magma_event_record( event[d][2], stream[d][stream0] );
                    d = (d+1)%num_gpus;
                }
            } else {
                magma_setdevice(id);
                trace_gpu_start( id, stream0, "comm", "D to GPUs" );
                magma_zsetmatrix_async( jb, jb,
                                        Aup(j,j),               lda,
                                        dlA(id, j, nb*j_local), ldda,
                                        stream[id][stream0] );
                trace_gpu_end( id, stream0 );
            }
            
            /* panel-factorize the off-diagonal */
            if ( (j+jb) < n) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    /* next column */
                    j_local2 = j_local+1;
                    if( d > id ) j_local2--;
                    if( d == id ) {
                        dlpanel = dlA(d, j, nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlP(d, 0, 0, buf);
                        ldpanel = lddp;
                    }
                    nb2 = n_local[d]-nb*j_local2;
                    nb0 = min(nb, nb2 );
                    
                    magma_setdevice(d);
                    magmablasSetKernelStream(stream[d][stream1]);
                    magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for the diagonal
                    if( j+jb < m && d == (j/nb+1)%num_gpus ) {
                        /* owns the next column, look-ahead the column */
                        trace_gpu_start( d, stream1, "trsm", "trsm" );
#if defined(PRECISION_d) && defined(ZTRSM_WORK)
                        magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                              jb, nb0, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, j, nb*j_local2), ldda,
                                              d_dinvA[d][0], d_x[d][0] );
                        /*nb2 = n_local[d] - j_local2*nb;
                        magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                              jb, nb2, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, j, nb*j_local2), ldda,
                                              d_dinvA[d], d_x[d] );*/
#else
                        /*nb2 = n_local[d] - j_local2*nb;
                        magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel,                ldda,
                                     dlA(d, j, nb*j_local2), ldda);
                        */
                        magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                     jb, nb0, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, j, nb*j_local2), ldda);
#endif
                        trace_gpu_end( d, stream1 );
                        magma_event_record( event[d][3], stream[d][stream1] );
                        
                        /* send the column to cpu */
                        if( j+jb < m ) {
                            trace_gpu_start( d, stream0, "comm", "rows to CPU" );
                            magma_queue_wait_event( stream[d][stream0], event[d][3] ); // wait for lookahead
                            magma_zgetmatrix_async( (j+jb), nb0,
                                                    dlA(d, 0, nb*j_local2), ldda,
                                                    Aup(0,j+jb),            lda,
                                                    stream[d][stream0] );
                            trace_gpu_end( d, stream0 );
                            magma_event_record( event[d][1], stream[d][stream0] );
                        }
                        
                        /* update the remaining blocks */
                        nb2 = nb2 - nb0;
#if defined(PRECISION_d) && defined(ZTRSM_WORK)
                        magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                              jb, nb2, c_one,
                                              dlpanel,                    ldpanel,
                                              dlA(d, j, nb*j_local2+nb0), ldda,
                                              d_dinvA[d][1], d_x[d][1] );
#else
                        magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel,                    ldpanel,
                                     dlA(d, j, nb*j_local2+nb0), ldda);
#endif
                    } else if( nb2 > 0 ) {
                        /* update the entire trailing matrix */
                        trace_gpu_start( d, stream1, "trsm", "trsm" );
#if defined(PRECISION_d) && defined(ZTRSM_WORK)
                        magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                              jb, nb2, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, j, nb*j_local2), ldda,
                                    d_dinvA[d][1], d_x[d][1] );
#else
                        magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, j, nb*j_local2), ldda);
#endif
                        trace_gpu_end( d, stream1 );
                    }
                    d = (d+1)%num_gpus;
                }
            } /* end of ztrsm */
        } /* end of for j=1, .., n */
    } else {
        /* -------------------------------------------- */
        /* Lower-triangular case                        */
        /* Compute the Cholesky factorization A = L*L'. */
        /* -------------------------------------------- */
#if defined(PRECISION_d) && defined(ZTRSM_WORK)
        /*
         * Allocate device memory for the inversed diagonal blocks, size=N*BLOCK_SIZE
         */
        for( d=0; d<num_gpus; d++ ) {
            magma_setdevice(d);
            for( j=0; j<2; j++ ) {
                magma_zmalloc( &d_dinvA[d][j], nb*nb );
                magma_zmalloc( &d_x[d][j],     nb*m  );
                cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(magmaDoubleComplex));
                cudaMemset(d_x[d][j],     0, nb* m*sizeof(magmaDoubleComplex));
            }
        }
        magma_setdevice(0);
#endif

        for (j=0; j<n; j+=nb) {

            /* Set the GPU number that holds the current panel */
            id  = (j/nb)%num_gpus;
            buf = (j/nb)%num_gpus;
            
            /* Set the local index where the current panel is */
            j_local = j/(nb*num_gpus);
            jb = min(nb, (n-j));
            
            if( j > 0 ) {
                /* needed on pluto... */
                magma_setdevice(id);
                magma_queue_sync( stream[id][stream0] ); // wait for the column on CPU

                /* broadcast offdiagonal row to all gpus */
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    if( d != id ) {
                        magma_setdevice(d);
                        /* wait for it on CPU */
                        magma_queue_wait_event( stream[d][stream0], event[id][1] );
            
                        /* send it to GPU */
                        magma_zsetmatrix_async( jb, j,
                                                Alo(j,0),         lda,
                                                dlPT(d,0,jb,buf), nb,
                                                stream[d][stream0] );
                        magma_event_record( event[d][1], stream[d][stream0] );
                    }
                    d = (d+1)%num_gpus;
                }
            }

            /* Update the current diagonal block */
            magma_setdevice(id);
            if( j > 0 ) {
                magmablasSetKernelStream(stream[id][stream1]);
                magma_zherk(MagmaLower, MagmaNoTrans, jb, j,
                            d_neg_one, dlA(id, nb*j_local, 0), ldda,
                            d_one,     dlA(id, nb*j_local, j), ldda);
                magma_event_record( event[id][0], stream[id][stream1] );
            }
            
            /* send the diagonal to cpu */
            magma_queue_wait_event( stream[id][stream0], event[id][0] ); // wait for syrk
            magma_zgetmatrix_async( jb, jb,
                                    dlA(id, nb*j_local, j), ldda,
                                    Alo(j,j),               lda,
                                    stream[id][stream0] );

            /* update the offdiagonal blocks */
            if ( j > 0 ) {
                /* compute the block-rows of the panel */
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    j_local2 = j_local+1;
                    if( d > id ) j_local2 --;
                    nb0 = nb*j_local2;
            
                    if( nb0 < n_local[d] ) {
                        if( d != id ) {
                            dlpanel = dlPT(d, 0, jb, buf);
                            ldpanel = nb;
            
                            /* wait for offdiagonal row */
                            magma_queue_wait_event( stream[d][stream1], event[d][1] );
                        } else {
                            dlpanel = dlA(d, nb*j_local, 0);
                            ldpanel = ldda;
                        }
            
                        magma_setdevice(d);
                        magmablasSetKernelStream(stream[d][stream1]);
                        magma_zgemm( MagmaNoTrans, MagmaConjTrans,
                                     n_local[d]-nb0, jb, j,
                                     c_neg_one, dlA(d, nb0, 0), ldda,
                                                dlpanel,        ldpanel,
                                     c_one,     dlA(d, nb0, j), ldda);
                    }
                    d = (d+1)%num_gpus;
                }
            }

            /* factor the diagonal */
            magma_setdevice(id);
            magma_queue_sync( stream[id][stream0] );
            lapackf77_zpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info);
            if (*info != 0) {
                *info = *info + j;
                break;
            }

            /* send the diagonal to gpus */
            if ( (j+jb) < m ) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    magma_setdevice(d);
                    if( d == id ) {
                        dlpanel = dlA(d, nb*j_local, j);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlPT(d, 0, 0, buf);
                        ldpanel = nb;
                    }
                    magma_zsetmatrix_async( jb, jb,
                                            Alo(j,j), lda,
                                            dlpanel,  ldpanel,
                                            stream[d][stream0] );
                    magma_event_record( event[d][2], stream[d][stream0] );
                    d = (d+1)%num_gpus;
                }
            } else {
                magma_setdevice(id);
                magma_zsetmatrix_async( jb, jb,
                                        Alo(j,j),               lda,
                                        dlA(id, nb*j_local, j), ldda,
                                        stream[id][stream0] );
            }

            /* factorize off-diagonal blocks */
            if ( (j+jb) < m ) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                    /* next column */
                    j_local2 = j_local+1;
                    if( d > id ) j_local2--;
                    if( d == id ) {
                        dlpanel = dlA(d, nb*j_local, j);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlPT(d, 0, 0, buf);
                        ldpanel = nb;
                    }
                    nb2 = n_local[d] - j_local2*nb;
                    nb0 = min(nb, nb2 );
            
                    magma_setdevice(d);
                    magmablasSetKernelStream(stream[d][stream1]);
                    magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for the diagonal
                    if( j+jb < n && d == (j/nb+1)%num_gpus ) {
                        /* owns the next column, look-ahead the column */
#if defined(PRECISION_d) && defined(ZTRSM_WORK)
                        magmablas_ztrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                              nb0, jb, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, nb*j_local2, j), ldda,
                                              d_dinvA[d][0], d_x[d][0]);
#else
                        magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                     nb0, jb, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, nb*j_local2, j), ldda);
#endif
                        magma_event_record( event[d][3], stream[d][stream1] );

                        /* send the column to cpu */
                        if( j+jb < n ) {
                            magma_queue_wait_event( stream[d][stream0], event[d][3] ); // wait for lookahead
                            magma_zgetmatrix_async( nb0, j+jb,
                                                    dlA(d, nb*j_local2, 0), ldda,
                                                    Alo(j+jb,0),            lda,
                                                    stream[d][stream0] );
                            magma_event_record( event[d][1], stream[d][stream0] );
                        }

                        /* update the remaining blocks */
                        nb2 = nb2 - nb0;
#if defined(PRECISION_d) && defined(ZTRSM_WORK)
                        magmablas_ztrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                              nb2, jb, c_one,
                                              dlpanel,                    ldpanel,
                                              dlA(d, nb*j_local2+nb0, j), ldda,
                                              d_dinvA[d][1], d_x[d][1] );
#else
                        magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                     nb2, jb, c_one,
                                     dlpanel,                    ldpanel,
                                     dlA(d, nb*j_local2+nb0, j), ldda);
#endif
                    } else if( nb2 > 0 ) {
                        /* update the entire trailing matrix */
#if defined(PRECISION_d) && defined(ZTRSM_WORK)
                        magmablas_ztrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                              nb2, jb, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, nb*j_local2, j), ldda,
                                              d_dinvA[d][1], d_x[d][1] );
#else
                        magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                     nb2, jb, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, nb*j_local2, j), ldda);
#endif
                    }
                    d = (d+1)%num_gpus;
                }
            }
        }
    } /* end of else not upper */

    /* == finalize the trace == */
    trace_finalize( "zpotrf.svg","trace.css" );

    /* clean up */
    for( d=0; d<num_gpus; d++ ) {
        magma_setdevice(d);
        magma_queue_sync( stream[d][0] );
        magma_queue_sync( stream[d][1] );
        magmablasSetKernelStream(NULL);

        //magma_event_destroy( event0[d] );
        //magma_event_destroy( event1[d] );
        //magma_event_destroy( event2[d] );
        //magma_event_destroy( event3[d] );
    }
    magma_setdevice(0);

    return *info;
} /* magma_zpotrf_mgpu */
Exemplo n.º 5
0
/**
    Purpose   
    =======   

    SSYTRF_nopiv_gpu computes the LDLt factorization of a real symmetric   
    matrix A.

    The factorization has the form   
       A = U^H * D * U , if UPLO = 'U', or   
       A = L  * D * L^H, if UPLO = 'L',   
    where U is an upper triangular matrix, L is lower triangular, and
    D is a diagonal matrix.

    This is the block version of the algorithm, calling Level 3 BLAS.   

    Arguments
    ---------
    @param[in]
    UPLO    CHARACTER*1   
      -     = 'U':  Upper triangle of A is stored;   
      -     = 'L':  Lower triangle of A is stored.   

    @param[in]
    N       INTEGER   
            The order of the matrix A.  N >= 0.   

    @param[in,out]
    dA      REAL array on the GPU, dimension (LDA,N)   
            On entry, the symmetric matrix A.  If UPLO = 'U', the leading   
            N-by-N upper triangular part of A contains the upper   
            triangular part of the matrix A, and the strictly lower   
            triangular part of A is not referenced.  If UPLO = 'L', the   
            leading N-by-N lower triangular part of A contains the lower   
            triangular part of the matrix A, and the strictly upper   
            triangular part of A is not referenced.   
    \n
            On exit, if INFO = 0, the factor U or L from the Cholesky   
            factorization A = U^H D U or A = L D L^H.   
    \n 
            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using cudaMallocHost.

    @param[in]
    LDA     INTEGER   
            The leading dimension of the array A.  LDA >= max(1,N).   

    @param[out]
    INFO    INTEGER   
      -     = 0:  successful exit   
      -     < 0:  if INFO = -i, the i-th argument had an illegal value 
                  if INFO = -6, the GPU memory allocation failed 
      -     > 0:  if INFO = i, the leading minor of order i is not   
                  positive definite, and the factorization could not be   
                  completed.   
    
    @ingroup magma_ssytrf_comp
    ******************************************************************* */
extern "C" magma_int_t
magma_ssytrf_nopiv_gpu(
    magma_uplo_t uplo, magma_int_t n,
    magmaFloat_ptr dA, magma_int_t ldda,
    magma_int_t *info)
{
    #define  A(i, j)  (A)
    #define dA(i, j)  (dA +(j)*ldda + (i))
    #define dW(i, j)  (dW +(j)*ldda + (i))
    #define dWt(i, j) (dW +(j)*nb   + (i))

    /* Local variables */
    float zone  = MAGMA_S_ONE;
    float mzone = MAGMA_S_NEG_ONE;
    int                upper = (uplo == MagmaUpper);
    magma_int_t j, k, jb, nb, ib, iinfo;

    *info = 0;
    if (! upper && uplo != MagmaLower) {
      *info = -1;
    } else if (n < 0) {
      *info = -2;
    } else if (ldda < max(1,n)) {
      *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return MAGMA_ERR_ILLEGAL_VALUE;
    }

    /* Quick return */
    if ( n == 0 )
      return MAGMA_SUCCESS;

    nb = magma_get_ssytrf_nopiv_nb(n);
    ib = min(32, nb); // inner-block for diagonal factorization

    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );


    magma_queue_t stream[2];
    magma_event_t event;
    magma_queue_create(&stream[0]);
    magma_queue_create(&stream[1]);
    magma_event_create( &event );
    trace_init( 1, 1, 2, stream );

    // CPU workspace
    float *A;
    if (MAGMA_SUCCESS != magma_smalloc_pinned( &A, nb*nb )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }

    // GPU workspace
    magmaFloat_ptr dW;
    if (MAGMA_SUCCESS != magma_smalloc( &dW, (1+nb)*ldda )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    /* Use hybrid blocked code. */
    if (upper) {
        //=========================================================
        // Compute the LDLt factorization A = U'*D*U without pivoting.
        // main loop
        for (j=0; j<n; j += nb) {
            jb = min(nb, (n-j));
            
            // copy A(j,j) back to CPU
            trace_gpu_start( 0, 0, "get", "get" );
            //magma_queue_wait_event( stream[1], event );                                                                
            magma_event_sync(event);
            magma_sgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), nb, stream[1]);
            trace_gpu_end( 0, 0 );

            // factorize the diagonal block
            magma_queue_sync(stream[1]);
            trace_cpu_start( 0, "potrf", "potrf" );
            ssytrf_nopiv_cpu(MagmaUpper, jb, ib, A(j, j), nb, info);
            trace_cpu_end( 0 );
            if (*info != 0){
                *info = *info + j;
                break;
            }
            
            // copy A(j,j) back to GPU
            trace_gpu_start( 0, 0, "set", "set" );
            magma_ssetmatrix_async(jb, jb, A(j, j), nb, dA(j, j), ldda, stream[0]);
            trace_gpu_end( 0, 0 );
                
            if ( (j+jb) < n) {
                // compute the off-diagonal blocks of current block column
                magmablasSetKernelStream( stream[0] );
                trace_gpu_start( 0, 0, "trsm", "trsm" );
                magma_strsm(MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaUnit, 
                            jb, (n-j-jb), 
                            zone, dA(j, j),    ldda, 
                            dA(j, j+jb), ldda);
                magma_scopymatrix( jb, n-j-jb, dA( j, j+jb ), ldda, dWt( 0, j+jb ), nb );
                
                // update the trailing submatrix with D
                magmablas_slascl_diag(MagmaUpper, jb, n-j-jb,
                                      dA(j,    j), ldda,
                                      dA(j, j+jb), ldda,
                                      &iinfo);
                trace_gpu_end( 0, 0 );
                
                // update the trailing submatrix with U and W
                trace_gpu_start( 0, 0, "gemm", "gemm" );
                for (k=j+jb; k<n; k+=nb) {
                    magma_int_t kb = min(nb,n-k);
                    magma_sgemm(MagmaConjTrans, MagmaNoTrans, kb, n-k, jb,
                                mzone, dWt(0, k), nb, 
                                       dA(j, k), ldda,
                                zone,  dA(k, k), ldda);
                    if (k==j+jb)
                        magma_event_record( event, stream[0] );
                }
                trace_gpu_end( 0, 0 );
            }
        }
    } else {
        //=========================================================
        // Compute the LDLt factorization A = L*D*L' without pivoting.
        // main loop
        for (j=0; j<n; j+=nb) {
            jb = min(nb, (n-j));
            
            // copy A(j,j) back to CPU
            trace_gpu_start( 0, 0, "get", "get" );
            //magma_queue_wait_event( stream[0], event );                                                                
            magma_event_sync(event);
            magma_sgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), nb, stream[1]);
            trace_gpu_end( 0, 0 );
            
            // factorize the diagonal block
            magma_queue_sync(stream[1]);
            trace_cpu_start( 0, "potrf", "potrf" );
            ssytrf_nopiv_cpu(MagmaLower, jb, ib, A(j, j), nb, info);
            trace_cpu_end( 0 );
            if (*info != 0){
                *info = *info + j;
                break;
            }

            // copy A(j,j) back to GPU
            trace_gpu_start( 0, 0, "set", "set" );
            magma_ssetmatrix_async(jb, jb, A(j, j), nb, dA(j, j), ldda, stream[0]);
            trace_gpu_end( 0, 0 );
            
            if ( (j+jb) < n) {
                // compute the off-diagonal blocks of current block column
                magmablasSetKernelStream( stream[0] );
                trace_gpu_start( 0, 0, "trsm", "trsm" );
                magma_strsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaUnit, 
                            (n-j-jb), jb, 
                            zone, dA(j,    j), ldda, 
                            dA(j+jb, j), ldda);
                magma_scopymatrix( n-j-jb,jb, dA( j+jb, j ), ldda, dW( j+jb, 0 ), ldda );
                
                // update the trailing submatrix with D
                magmablas_slascl_diag(MagmaLower, n-j-jb, jb,
                                      dA(j,    j), ldda,
                                      dA(j+jb, j), ldda,
                                      &iinfo);
                trace_gpu_end( 0, 0 );
                
                // update the trailing submatrix with L and W
                trace_gpu_start( 0, 0, "gemm", "gemm" );
                for (k=j+jb; k<n; k+=nb) {
                    magma_int_t kb = min(nb,n-k);
                    magma_sgemm(MagmaNoTrans, MagmaConjTrans, n-k, kb, jb,
                                mzone, dA(k, j), ldda, 
                                       dW(k, 0), ldda,
                                zone,  dA(k, k), ldda);
                    if (k==j+jb)
                        magma_event_record( event, stream[0] );
                }
                trace_gpu_end( 0, 0 );
            }
        }
    }
    
    trace_finalize( "ssytrf.svg","trace.css" );
    magma_queue_destroy(stream[0]);
    magma_queue_destroy(stream[1]);
    magma_event_destroy( event );
    magma_free( dW );
    magma_free_pinned( A );
    
    magmablasSetKernelStream( orig_stream );
    return MAGMA_SUCCESS;
} /* magma_ssytrf_nopiv */
Exemplo n.º 6
0
/**
    Purpose
    -------
    SPOTRF computes the Cholesky factorization of a real symmetric
    positive definite matrix dA.
    Auxiliary subroutine for spotrf2_ooc. It is multiple gpu interface to compute
    Cholesky of a "rectangular" matrix.

    The factorization has the form
       dA = U**H * U,   if UPLO = MagmaUpper, or
       dA = L  * L**H,  if UPLO = MagmaLower,
    where U is an upper triangular matrix and L is lower triangular.

    This is the block version of the algorithm, calling Level 3 BLAS.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of dA is stored;
      -     = MagmaLower:  Lower triangle of dA is stored.

    @param[in]
    n       INTEGER
            The order of the matrix dA.  N >= 0.

    @param[in,out]
    dA      REAL array on the GPU, dimension (LDDA,N)
            On entry, the symmetric matrix dA.  If UPLO = MagmaUpper, the leading
            N-by-N upper triangular part of dA contains the upper
            triangular part of the matrix dA, and the strictly lower
            triangular part of dA is not referenced.  If UPLO = MagmaLower, the
            leading N-by-N lower triangular part of dA contains the lower
            triangular part of the matrix dA, and the strictly upper
            triangular part of dA is not referenced.
    \n
            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization dA = U**H * U or dA = L * L**H.

    @param[in]
    ldda     INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalescent memory accesses LDDA must be
            divisible by 16.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.

    @ingroup magma_sposv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_spotrf3_mgpu(
    magma_int_t ngpu,
    magma_uplo_t uplo, magma_int_t m, magma_int_t n,
    magma_int_t off_i, magma_int_t off_j, magma_int_t nb,
    magmaFloat_ptr d_lA[],  magma_int_t ldda,
    magmaFloat_ptr d_lP[],  magma_int_t lddp,
    float *A,          magma_int_t lda, magma_int_t h,
    magma_queue_t queues[][3], magma_event_t events[][5],
    magma_int_t *info )
{
#define Alo(i, j)  (A +             ((j)+off_j)*lda  + (nb*(((i)/nb)%h)+off_i))
#define Aup(i, j)  (A + (nb*(((j)/nb)%h)+off_j)*lda  +               (i+off_i))

#define dlA(id, i, j)     (d_lA[(id)] + (j)*ldda + (i))
#define dlP(id, i, j, k)  (d_lP[(id)] + (k)*nb*lddp + (j)*lddp + (i))
#define dlPT(id, i, j, k) (d_lP[(id)] + (k)*nb*lddp + (j)*nb   + (i))

    magma_int_t     j, jb, nb0, nb2, d, dd, id, j_local, j_local2, buf;
    float c_one     = MAGMA_S_ONE;
    float c_neg_one = MAGMA_S_NEG_ONE;
    float          d_one     =  1.0;
    float          d_neg_one = -1.0;
    int upper = (uplo == MagmaUpper);
    float *dlpanel;
    magma_int_t n_local[MagmaMaxGPUs], ldpanel;
    const magma_int_t stream1 = 0, stream2 = 1, stream3 = 2;
    
    *info = 0;
    if (! upper && uplo != MagmaLower) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (!upper && ngpu*ldda < max(1,n)) {
        *info = -4;
    } else if (upper && ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );
    
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK)
    /* used by strsm_work */
    float c_zero    = MAGMA_S_ZERO;
    int trsm_nb = 128;
    int trsm_n = trsm_nb*((nb+trsm_nb-1)/trsm_nb);
    float *d_dinvA[MagmaMaxGPUs];
    float *d_x[MagmaMaxGPUs];
    #define dinvA(d,j) &(d_dinvA[(d)][(j)*trsm_nb*trsm_n])
    #define dx(d,j) &(d_x[(d)][(j)*nb*m])
    /*
     * Allocate device memory for the inversed diagonal blocks, size=N*BLOCK_SIZE
     */
    // TODO free memory on failure.
    for( d=0; d < ngpu; d++ ) {
        magma_setdevice(d);
        if ( (MAGMA_SUCCESS != magma_smalloc( &d_dinvA[d], 2*trsm_nb*trsm_n )) ||
             (MAGMA_SUCCESS != magma_smalloc( &d_x[d],     2*nb*(upper ? n : m) )) ) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
    }
    magma_setdevice(0);
#endif
    
    /* initialization */
    for( d=0; d < ngpu; d++ ) {
        /* local-n and local-ld */
        if (upper) {
            n_local[d] = (n/(nb*ngpu))*nb;
            if (d < (n/nb)%ngpu)
                n_local[d] += nb;
            else if (d == (n/nb)%ngpu)
                n_local[d] += n%nb;
        } else {
            n_local[d] = (m/(nb*ngpu))*nb;
            if (d < (m/nb)%ngpu)
                n_local[d] += nb;
            else if (d == (m/nb)%ngpu)
                n_local[d] += m%nb;
        }
    }

    /* == initialize the trace */
    trace_init( 1, ngpu, 3, (CUstream_st**)queues );

    if (upper) {
        /* ---------------------------------------------- */
        /* Upper-triangular case                          */
        /* > Compute the Cholesky factorization A = U'*U. */
        /* ---------------------------------------------- */
        for (j=0; j < m; j += nb) {
            /* Set the GPU number that holds the current panel */
            id  = (j/nb)%ngpu;
            buf = (j/nb)%ngpu; // right now, we have ngpu buffers, so id and buf are the same..
            
            /* Set the local index where the current panel is */
            j_local = j/(nb*ngpu);
            jb = min(nb, (m-j));
 
            /* Update the current diagonal block on stream1 */
            magma_setdevice(id);
            if ( j > 0 ) {
                magmablasSetKernelStream( queues[id][stream1] );
                trace_gpu_start( id, stream1, "syrk", "syrk" );
                magma_ssyrk(MagmaUpper, MagmaConjTrans, jb, j,
                            d_neg_one, dlA(id, 0, nb*j_local), ldda,
                            d_one,     dlA(id, j, nb*j_local), ldda);
                trace_gpu_end( id, stream1 );
            }
            
            /* send the diagonal to cpu on stream1 */
            trace_gpu_start( id, stream1, "comm", "D to CPU" );
            magma_sgetmatrix_async( jb, jb,
                                    dlA(id, j, nb*j_local), ldda,
                                    Aup(j,j),               lda,
                                    queues[id][stream1] );
            trace_gpu_end( id, stream1 );

            /* update off-diagonal blocks in the panel */
            if ( j > 0 ) {
                d = (j/nb+1)%ngpu;
                for( dd=0; dd < ngpu; dd++ ) {
                    j_local2 = j_local+1;
                    if ( d > id ) j_local2 --;
                    nb0 = nb*j_local2; // number of local columns in the panel, while jb is panel-size (number of rows)
            
                    if ( n_local[d] > nb0 ) {
                        magma_setdevice(d);
                        magmablasSetKernelStream( queues[d][stream2] );
                        if ( d == id ) {
                            dlpanel = dlA(d,0,nb*j_local);
                            ldpanel = ldda;
                            // the GPU owns the row from start, and no need of synch.
                            //magma_queue_wait_event( queues[d][stream2], events[d][0] ); // rows arrived at gpu
                            magma_queue_wait_event( queues[d][stream2], events[d][4] ); // wait for look-ahead trsm to finish
                        } else {
                            dlpanel = dlP(d,nb,0,buf);
                            ldpanel = lddp;
                            magma_queue_wait_event( queues[d][stream2], events[d][0] ); // rows arrived at gpu
                        }
                        trace_gpu_start( d, stream2, "gemm", "gemm" );
                        magma_sgemm(MagmaConjTrans, MagmaNoTrans,
                                    jb, n_local[d]-nb0, j,
                                    c_neg_one, dlpanel,        ldpanel,
                                               dlA(d, 0, nb0), ldda,
                                    c_one,     dlA(d, j, nb0), ldda);
                        trace_gpu_end( d, stream2 );
                        magma_event_record( events[d][2], queues[d][stream2] );
                    }
                    d = (d+1)%ngpu;
                }
            }

            /* wait for panel and factorize it on cpu */
            magma_setdevice(id);
            magma_queue_sync( queues[id][stream1] );
            trace_cpu_start( 0, "getrf", "getrf" );
            lapackf77_spotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info);
            trace_cpu_end( 0 );
            if (*info != 0) {
                *info = *info + j;
                break;
            }
            
            /* send the diagonal to gpus on stream1 */
            if ( (j+jb) < n) {
                d = (j/nb+1)%ngpu;
                for( dd=0; dd < ngpu; dd++ ) {
                    if ( d == id ) {
                        dlpanel = dlA(d, j, nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlP(d,0,0,buf);
                        ldpanel = lddp;
                    }
                    magma_setdevice(d);
                    trace_gpu_start( d, stream1, "comm", "comm" );
                    magma_ssetmatrix_async( jb, jb,
                                            Aup(j,j), lda,
                                            dlpanel,  ldpanel,
                                            queues[d][stream1] );
                    trace_gpu_end( d, stream1 );
                    magma_event_record( events[d][1], queues[d][stream1] );
                    d = (d+1)%ngpu;
                }
            } else {
                magma_setdevice(id);
                trace_gpu_start( id, stream1, "comm", "comm" );
                magma_ssetmatrix_async( jb, jb,
                                        Aup(j,j),               lda,
                                        dlA(id, j, nb*j_local), ldda,
                                        queues[id][stream1] );
                trace_gpu_end( id, stream1 );
            }
            
            /* panel-factorize the off-diagonal */
            if ( (j+jb) < n) {
                d = (j/nb+1)%ngpu;
                for( dd=0; dd < ngpu; dd++ ) {
                    /* next column */
                    j_local2 = j_local+1;
                    if ( d > id ) j_local2--;
                    if ( d == id ) {
                        dlpanel = dlA(d,j,nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlP(d,0,0,buf);
                        ldpanel = lddp;
                    }
                    nb2 = n_local[d] - j_local2*nb;
                    
                    magma_setdevice(d);
                    if ( j+jb < m && d == (j/nb+1)%ngpu ) {
                        /* owns the next column, look-ahead next block on stream1 */
                        nb0 = min(nb, nb2);
                        magmablasSetKernelStream( queues[d][stream1] );
                        magma_queue_wait_event( queues[d][stream1], events[d][2] ); // wait for gemm update
                        trace_gpu_start( d, stream1, "trsm", "trsm" );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK)
                        magmablas_slaset( MagmaFull, trsm_nb, trsm_n, c_zero, c_zero, dinvA(d,0), trsm_nb );
                        magmablas_slaset( MagmaFull, nb0,     jb,     c_zero, c_zero, dx(d,0), nb0 );
                        magmablas_strsm_work( MagmaLeft, MagmaUpper,
                                              MagmaConjTrans, MagmaNonUnit,
                                              jb, nb0, c_one,
                                              dlpanel, ldpanel,
                                              dlA(d, j, nb*j_local2), ldda,
                                              1, dinvA(d,0), dx(d,0) );
#else
                        magma_strsm( MagmaLeft, MagmaUpper,
                                     MagmaConjTrans, MagmaNonUnit,
                                     jb, nb0, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, j, nb*j_local2), ldda);
#endif
                        magma_event_record( events[d][4], queues[d][stream1] );
                        trace_gpu_end( d, stream1 );
                    } else if ( nb2 > 0 ) {
                        /* update all the blocks on stream2 */
                        magma_queue_wait_event( queues[d][stream2], events[d][1] ); // wait for cholesky factor
                        trace_gpu_start( d, stream2, "trsm", "trsm" );
                        magmablasSetKernelStream( queues[d][stream2] );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK)
                        magmablas_slaset( MagmaFull, trsm_nb, trsm_n, c_zero, c_zero, dinvA(d,0), trsm_nb );
                        magmablas_slaset( MagmaFull, nb2,     jb,     c_zero, c_zero, dx(d,0), nb2 );
                        magmablas_strsm_work( MagmaLeft, MagmaUpper,
                                              MagmaConjTrans, MagmaNonUnit,
                                              jb, nb2, c_one,
                                              dlpanel, ldpanel,
                                              dlA(d, j, nb*j_local2), ldda,
                                              1, dinvA(d,0), dx(d,0) );
#else
                        magma_strsm( MagmaLeft, MagmaUpper,
                                     MagmaConjTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, j, nb*j_local2), ldda);
#endif
                        trace_gpu_end( d, stream2 );
                    }
                    d = (d+1)%ngpu;
                } /* end of for */

                /* ========================================================== */
                if ( j+jb < m ) {
                    d = (j/nb+1)%ngpu;
                    /* next column */
                    j_local2 = j_local+1;
                    if ( d > id ) j_local2--;
                    nb0 = min(nb, n_local[d]-nb*j_local2 );
                
                    /* even on 1 gpu, off-diagonals are copied to cpu (synchronize at the end).      *
                     * so we have the Cholesky factor, but only diagonal submatrix of the big panel, *
                     * on cpu at the end.                                                            */
                    int d2, buf2;
                    magma_setdevice(d);
                    /* lookahead done */
                    magma_queue_wait_event( queues[d][stream3], events[d][4] );
                
                    trace_gpu_start( d, stream3, "comm", "row to CPU" );
                    magma_sgetmatrix_async( (j+jb), nb0,
                                            dlA(d, 0, nb*j_local2), ldda,
                                            Aup(0,j+jb),            lda,
                                            queues[d][stream3] );
                    trace_gpu_end( d, stream3 );
                    magma_event_record( events[d][3], queues[d][stream3] );
                    /* needed on pluto */
                    //magma_queue_sync( queues[d][stream3] );
                
                    /* broadcast rows to gpus on stream2 */
                    buf2 = ((j+jb)/nb)%ngpu;
                    for( d2=0; d2 < ngpu; d2++ ) {
                        if ( d2 != d ) {
                            magma_setdevice(d2);
                            trace_gpu_start( d2, stream3, "comm", "row to GPUs" );
                            magma_queue_wait_event( queues[d2][stream3], events[d][3] ); // rows arrived at cpu on stream3
                            magma_ssetmatrix_async( j+jb, nb0,
                                                    Aup(0,j+jb),       lda,
                                                    dlP(d2,nb,0,buf2), lddp,
                                                    queues[d2][stream3] );
                            trace_gpu_end( d2, stream3 );
                            magma_event_record( events[d2][0], queues[d2][stream3] );
                        }
                    }

                    /* =========================== */
                    /* update the remaining blocks */
                    nb2 = n_local[d]-(nb*j_local2 + nb0);
                    if ( nb2 > 0 ) {
                        if ( d == id ) {
                            dlpanel = dlA(d, j, nb*j_local);
                            ldpanel = ldda;
                        } else {
                            dlpanel = dlP(d,0,0,buf);
                            ldpanel = lddp;
                        }
                        magma_setdevice(d);
                        magmablasSetKernelStream( queues[d][stream2] );
                        trace_gpu_start( d, stream2, "trsm", "trsm" );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK)
                        int flag = 0;
                        if (flag == 0) {
                            magma_queue_wait_event( queues[d][stream2], events[d][4] ); // lookahead -> diagonal inversion
                        } else {
                            magmablas_slaset( MagmaFull, trsm_nb, trsm_n, c_zero, c_zero, dinvA(d,flag), trsm_nb );
                            magma_queue_wait_event( queues[d][stream2], events[d][1] ); // panel received
                        }
                        magmablas_slaset( MagmaFull, nb2, jb, c_zero, c_zero, dx(d,1), nb2 );
                        magmablas_strsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                              jb, nb2, c_one,
                                              dlpanel, ldpanel,
                                              dlA(d, j, nb*j_local2+nb0), ldda,
                                              flag, dinvA(d,flag), dx(d,1) );
#else
                        magma_queue_wait_event( queues[d][stream2], events[d][1] ); // wait for cholesky factor
                        magma_strsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel, ldpanel,
                                     dlA(d, j, nb*j_local2+nb0), ldda);
#endif
                        trace_gpu_end( d, stream2 );
                    }
                }
            } /* end of strsm */
        } /* end of for j=1, .., n */
    } else {
        /* ---------------------------------------------- */
        /* Lower-triangular case                          */
        /* > Compute the Cholesky factorization A = L*L'. */
        /* ---------------------------------------------- */
        for (j=0; j < n; j += nb) {
        
            /* Set the GPU number that holds the current panel */
            id  = (j/nb)%ngpu;
            buf = (j/nb)%ngpu;
            
            /* Set the local index where the current panel is */
            j_local = j/(nb*ngpu);
            jb = min(nb, (n-j));

            /* Update the current diagonal block on stream1 */
            magma_setdevice(id);
            if ( j > 0 ) {
                magmablasSetKernelStream( queues[id][stream1] );
                magma_ssyrk(MagmaLower, MagmaNoTrans, jb, j,
                            d_neg_one, dlA(id, nb*j_local, 0), ldda,
                            d_one,     dlA(id, nb*j_local, j), ldda);
            }

            /* send the diagonal to cpu on stream1 */
            magma_sgetmatrix_async( jb, jb,
                                    dlA(id, nb*j_local, j), ldda,
                                    Alo(j,j),               lda,
                                    queues[id][stream1] );

            /* update off-diagonal blocks of the panel */
            if ( j > 0 ) {
                d = (j/nb+1)%ngpu;
                for( dd=0; dd < ngpu; dd++ ) {
                    j_local2 = j_local+1;
                    if ( d > id ) j_local2 --;
                    nb0 = nb*j_local2;
            
                    if ( nb0 < n_local[d] ) {
                        magma_setdevice(d);
                        magmablasSetKernelStream( queues[d][stream2] );
                        if ( d == id ) {
                            dlpanel = dlA(d, nb*j_local, 0);
                            ldpanel = ldda;
                            magma_queue_wait_event( queues[d][stream2], events[d][4] ); // wait for look-ahead trsm to finish
                        } else {
                            dlpanel = dlPT(d,0,nb,buf);
                            ldpanel = nb;
                            magma_queue_wait_event( queues[d][stream2], events[d][0] ); // rows arrived at gpu
                        }
                        magma_sgemm( MagmaNoTrans, MagmaConjTrans,
                                     n_local[d]-nb0, jb, j,
                                     c_neg_one, dlA(d, nb0, 0), ldda,
                                                dlpanel,        ldpanel,
                                     c_one,     dlA(d, nb0, j), ldda);
                        magma_event_record( events[d][2], queues[d][stream2] );
                    }
                    d = (d+1)%ngpu;
                }
            }

            /* wait for the panel and factorized it on cpu */
            magma_setdevice(id);
            magma_queue_sync( queues[id][stream1] );
            lapackf77_spotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info);
            if (*info != 0) {
                *info = *info + j;
                break;
            }

            /* send the diagonal to gpus on stream1 */
            if ( (j+jb) < m) {
                d = (j/nb+1)%ngpu;
                for( dd=0; dd < ngpu; dd++ ) {
                    if ( d == id ) {
                        dlpanel = dlA(d, nb*j_local, j);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlPT(d, 0, 0, buf);
                        ldpanel = nb;
                    }
                    magma_setdevice(d);
                    magma_ssetmatrix_async( jb, jb,
                                            Alo(j,j), lda,
                                            dlpanel,  ldpanel,
                                            queues[d][stream1] );
                    magma_event_record( events[d][1], queues[d][stream1] );
                    d = (d+1)%ngpu;
                }
            } else {
                magma_setdevice(id);
                magma_ssetmatrix_async( jb, jb,
                                        Alo(j,j),               lda,
                                        dlA(id, nb*j_local, j), ldda,
                                        queues[id][stream1] );
            }

            /* panel factorize the off-diagonal */
            if ( (j+jb) < m) {
                d = (j/nb+1)%ngpu;
                for( dd=0; dd < ngpu; dd++ ) {
                    /* next column */
                    j_local2 = j_local+1;
                    if ( d > id ) j_local2--;
                    if ( d == id ) {
                        dlpanel = dlA(d, nb*j_local, j);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlPT(d, 0, 0, buf);
                        ldpanel = nb;
                    }
                    nb2 = n_local[d] - j_local2*nb;
                    nb0 = min(nb, nb2);
                    
                    magma_setdevice(d);
                    if ( j+nb < n && d == (j/nb+1)%ngpu ) { /* owns next column, look-ahead next block on stream1 */
                        if ( j > 0 ) magma_queue_wait_event( queues[d][stream1], events[d][2] ); // wait for gemm update
                        magmablasSetKernelStream( queues[d][stream1] );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK)
                        magmablas_slaset( MagmaFull, trsm_nb, trsm_n, c_zero, c_zero, dinvA(d,0), trsm_nb );
                        magmablas_slaset( MagmaFull, nb0,     jb,     c_zero, c_zero, dx(d,0), nb0 );
                        magmablas_strsm_work( MagmaRight, MagmaLower,
                                              MagmaConjTrans, MagmaNonUnit,
                                              nb0, jb, c_one,
                                              dlpanel, ldpanel,
                                              dlA(d, nb*j_local2, j), ldda,
                                              1, dinvA(d,0), dx(d,0) );
#else
                        magma_strsm( MagmaRight, MagmaLower,
                                     MagmaConjTrans, MagmaNonUnit,
                                     nb0, jb, c_one,
                                     dlpanel, ldpanel,
                                     dlA(d, nb*j_local2, j), ldda);
#endif
                        magma_event_record( events[d][4], queues[d][stream1] );
                    } else if ( nb2 > 0 ) { /* other gpus updating all the blocks on stream2 */
                        /* update the entire column */
                        magma_queue_wait_event( queues[d][stream2], events[d][1] ); // wait for the cholesky factor
                        magmablasSetKernelStream( queues[d][stream2] );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK)
                        magmablas_slaset( MagmaFull, trsm_nb, trsm_n, c_zero, c_zero, dinvA(d,0), trsm_nb );
                        magmablas_slaset( MagmaFull, nb2,     jb,     c_zero, c_zero, dx(d,0), nb2 );
                        magmablas_strsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                              nb2, jb, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, nb*j_local2, j), ldda,
                                              1, dinvA(d,0), dx(d,0) );
#else
                        magma_strsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                     nb2, jb, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, nb*j_local2, j), ldda);
#endif
                    }
                    d = (d+1)%ngpu;
                } /* end for d */

                /* ========================================================== */
                if ( j+jb < n ) {
                    d = (j/nb+1)%ngpu;
                    /* next column */
                    j_local2 = j_local+1;
                    if ( d > id ) j_local2--;
                    nb0 = min(nb, n_local[d]-nb*j_local2 );
                
                    /* even on 1 gpu, we copy off-diagonal to cpu (but don't synchronize).  */
                    /* so we have the Cholesky factor on cpu at the end.                    */
                    int d2, buf2;
//#define SPOTRF_DEVICE_TO_DEVICE
#ifdef SPOTRF_DEVICE_TO_DEVICE
                    // lookahead done
                
                    /* broadcast the rows to gpus */
                    buf2 = ((j+jb)/nb)%ngpu;
                    for( d2=0; d2 < ngpu; d2++ ) {
                        magma_setdevice(d2);
                        magma_queue_wait_event( queues[d2][stream3], events[d][4] );
                        if ( d2 != d ) {
                            magma_scopymatrix_async( nb0, j+jb,
                                                     dlPT(d2,0,nb,buf2), nb, // first nbxnb reserved for diagonal block
                                                     dlA(d, nb*j_local2, 0), ldda,
                                                     queues[d2][stream3] );
                            magma_event_record( events[d2][0], queues[d2][stream3] );
                        } else {
                            magma_sgetmatrix_async( nb0, j+jb,
                                                    dlA(d, nb*j_local2, 0), ldda,
                                                    Alo(j+jb,0),            lda,
                                                    queues[d][stream3] );
                        }
                    }
#else
                    // lookahead done
                    magma_setdevice(d);
                    magma_queue_wait_event( queues[d][stream3], events[d][4] );
                    magma_sgetmatrix_async( nb0, j+jb,
                                            dlA(d, nb*j_local2, 0), ldda,
                                            Alo(j+jb,0),            lda,
                                            queues[d][stream3] );
                    magma_event_record( events[d][3], queues[d][stream3] );
                    /* syn on rows on CPU, seem to be needed on Pluto */
                    //magma_queue_sync( queues[d][stream3] );
                
                    /* broadcast the rows to gpus */
                    buf2 = ((j+jb)/nb)%ngpu;
                    for( d2=0; d2 < ngpu; d2++ ) {
                        if ( d2 != d ) {
                            magma_setdevice(d2);
                            magma_queue_wait_event( queues[d2][stream3], events[d][3] ); // getmatrix done
                            magma_ssetmatrix_async( nb0, j+jb,
                                                    Alo(j+jb,0),        lda,
                                                    dlPT(d2,0,nb,buf2), nb, // first nbxnb reserved for diagonal block
                                                    queues[d2][stream3] );
                            magma_event_record( events[d2][0], queues[d2][stream3] );
                        }
                    }
#endif
                    /* =================================== */
                    /* updates remaining blocks on stream2 */
                    nb2 = n_local[d] - (j_local2*nb + nb0);
                    if ( nb2 > 0 ) {
                        if ( d == id ) {
                            dlpanel = dlA(d, nb*j_local, j);
                            ldpanel = ldda;
                        } else {
                            dlpanel = dlPT(d,0,0,buf);
                            ldpanel = nb;
                        }
                        magma_setdevice(d);
                        magmablasSetKernelStream( queues[d][stream2] );
                        /* update the remaining blocks in the column */
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK)
                        int flag = 0;
                        if (flag == 0) {
                            magma_queue_wait_event( queues[d][stream2], events[d][4] ); // lookahead -> diagonal inversion
                        } else {
                            magmablas_slaset( MagmaFull, trsm_nb, trsm_n, c_zero, c_zero, dinvA(d,flag), trsm_nb );
                            magma_queue_wait_event( queues[d][stream2], events[d][1] ); // panel received
                        }
                        magmablas_slaset( MagmaFull, nb2, jb, c_zero, c_zero, dx(d,1), nb2 );
                        magmablas_strsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                              nb2, jb, c_one,
                                              dlpanel,                    ldpanel,
                                              dlA(d, nb*j_local2+nb0, j), ldda,
                                              flag, dinvA(d,flag), dx(d,1) );
#else
                        magma_queue_wait_event( queues[d][stream2], events[d][1] ); // panel received
                        magma_strsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                     nb2, jb, c_one,
                                     dlpanel,                    ldpanel,
                                     dlA(d, nb*j_local2+nb0, j), ldda);
#endif
                    }
                }
            }
        }
    } /* end of else not upper */

    /* == finalize the trace == */
    trace_finalize( "spotrf.svg", "trace.css" );
    for( d=0; d < ngpu; d++ ) {
        magma_setdevice(d);
        for( j=0; j < 3; j++ ) {
            magma_queue_sync( queues[d][j] );
        }
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK)
        magma_free( d_dinvA[d] );
        magma_free( d_x[d] );
#endif
    }
    magma_setdevice( orig_dev );
    magmablasSetKernelStream( orig_stream );

    return *info;
} /* magma_spotrf_mgpu */
Exemplo n.º 7
0
/**
    Purpose
    -------
    DLATRD reduces NB rows and columns of a real symmetric matrix A to
    symmetric tridiagonal form by an orthogonal similarity
    transformation Q' * A * Q, and returns the matrices V and W which are
    needed to apply the transformation to the unreduced part of A.

    If UPLO = MagmaUpper, DLATRD reduces the last NB rows and columns of a
    matrix, of which the upper triangle is supplied;
    if UPLO = MagmaLower, DLATRD reduces the first NB rows and columns of a
    matrix, of which the lower triangle is supplied.

    This is an auxiliary routine called by DSYTRD.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
            Specifies whether the upper or lower triangular part of the
            symmetric matrix A is stored:
      -     = MagmaUpper: Upper triangular
      -     = MagmaLower: Lower triangular

    @param[in]
    n       INTEGER
            The order of the matrix A.

    @param[in]
    nb      INTEGER
            The number of rows and columns to be reduced.

    @param[in,out]
    A       DOUBLE_PRECISION array, dimension (LDA,N)
            On entry, the symmetric matrix A.  If UPLO = MagmaUpper, the leading
            n-by-n upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = MagmaLower, the
            leading n-by-n lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
            On exit:
      -     if UPLO = MagmaUpper, the last NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements above the diagonal
              with the array TAU, represent the orthogonal matrix Q as a
              product of elementary reflectors;
      -     if UPLO = MagmaLower, the first NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements below the diagonal
              with the array TAU, represent the  orthogonal matrix Q as a
              product of elementary reflectors.
            See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= (1,N).

    @param[out]
    e       DOUBLE_PRECISION array, dimension (N-1)
            If UPLO = MagmaUpper, E(n-nb:n-1) contains the superdiagonal
            elements of the last NB columns of the reduced matrix;
            if UPLO = MagmaLower, E(1:nb) contains the subdiagonal elements of
            the first NB columns of the reduced matrix.

    @param[out]
    tau     DOUBLE_PRECISION array, dimension (N-1)
            The scalar factors of the elementary reflectors, stored in
            TAU(n-nb:n-1) if UPLO = MagmaUpper, and in TAU(1:nb) if UPLO = MagmaLower.
            See Further Details.

    @param[out]
    W       DOUBLE_PRECISION array, dimension (LDW,NB)
            The n-by-nb matrix W required to update the unreduced part
            of A.

    @param[in]
    ldw     INTEGER
            The leading dimension of the array W. LDW >= max(1,N).

    Further Details
    ---------------
    If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary
    reflectors

       Q = H(n) H(n-1) . . . H(n-nb+1).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i),
    and tau in TAU(i-1).

    If UPLO = MagmaLower, the matrix Q is represented as a product of elementary
    reflectors

       Q = H(1) H(2) . . . H(nb).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i),
    and tau in TAU(i).

    The elements of the vectors v together form the n-by-nb matrix V
    which is needed, with W, to apply the transformation to the unreduced
    part of the matrix, using a symmetric rank-2k update of the form:
    A := A - V*W' - W*V'.

    The contents of A on exit are illustrated by the following examples
    with n = 5 and nb = 2:

    if UPLO = MagmaUpper:                       if UPLO = MagmaLower:

      (  a   a   a   v4  v5 )              (  d                  )
      (      a   a   v4  v5 )              (  1   d              )
      (          a   1   v5 )              (  v1  1   a          )
      (              d   1  )              (  v1  v2  a   a      )
      (                  d  )              (  v1  v2  a   a   a  )

    where d denotes a diagonal element of the reduced matrix, a denotes
    an element of the original matrix that is unchanged, and vi denotes
    an element of the vector defining H(i).

    @ingroup magma_dsyev_aux
    ********************************************************************/
extern "C" double
magma_dlatrd_mgpu(magma_int_t num_gpus, magma_uplo_t uplo,
                  magma_int_t n0, magma_int_t n, magma_int_t nb, magma_int_t nb0,
                  double *A,  magma_int_t lda,
                  double *e, double *tau,
                  double *W,   magma_int_t ldw,
                  double **dA, magma_int_t ldda, magma_int_t offset,
                  double **dW, magma_int_t lddw,
                  double *dwork[MagmaMaxGPUs], magma_int_t ldwork,
                  magma_int_t k,
                  double *dx[MagmaMaxGPUs],
                  double *dy[MagmaMaxGPUs],
                  double *work,
                  magma_queue_t stream[][10],
                  double *times)
{
#define A(i, j) (A + (j)*lda + (i))
#define W(i, j) (W + (j)*ldw + (i))

#define dA(id, i, j)  (dA[(id)] + ((j)+loffset)*ldda + (i) + offset)
#define dW(id, i, j)  (dW[(id)] + (j)          *lddw + (i))
#define dW1(id, i, j) (dW[(id)] + ((j)+nb)     *lddw + (i))

    double mv_time = 0.0;
    magma_int_t i;
#ifndef MAGMABLAS_DSYMV_MGPU
    magma_int_t loffset = nb0*((offset/nb0)/num_gpus);
#endif

    double c_neg_one = MAGMA_D_NEG_ONE;
    double c_one     = MAGMA_D_ONE;
    double c_zero    = MAGMA_D_ZERO;
    double value     = MAGMA_D_ZERO;
    magma_int_t id, idw, i_one = 1;

    //magma_int_t kk;
    magma_int_t ione = 1;

    magma_int_t i_n, i_1, iw;

    double alpha;

    double *dx2[MagmaMaxGPUs];
    double *f;
    magma_dmalloc_cpu( &f, n );

    if (n <= 0) {
        return 0;
    }

//#define PROFILE_SYMV
#ifdef PROFILE_SYMV
    magma_event_t start, stop;
    float etime;
    magma_timestr_t cpu_start, cpu_end;
    magma_setdevice(0);
    magma_event_create( &start );
    magma_event_create( &stop  );
#endif

    if (uplo == MagmaUpper) {
        /* Reduce last NB columns of upper triangle */
        for (i = n-1; i >= n - nb ; --i) {
            i_1 = i + 1;
            i_n = n - i - 1;
            iw = i - n + nb;
            if (i < n-1) {
                /* Update A(1:i,i) */
                double wii = *W(i, iw+1);
                #if defined(PRECISION_z) || defined(PRECISION_c)
                    lapackf77_dlacgv(&i_one, &wii, &ldw);
                #endif
                wii = -wii;
                blasf77_daxpy(&i_1, &wii, A(0, i+1), &i_one, A(0, i), &ione);

                wii = *A(i, i+1);
                #if defined(PRECISION_z) || defined(PRECISION_c)
                    lapackf77_dlacgv(&i_one, &wii, &ldw);
                #endif
                wii = -wii;
                blasf77_daxpy(&i_1, &wii, W(0, iw+1), &i_one, A(0, i), &ione);
            }
            if (i > 0) {
                /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */
                alpha = *A(i-1, i);
                lapackf77_dlarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]);

                e[i-1] = MAGMA_D_REAL( alpha );
                *A(i-1,i) = MAGMA_D_MAKE( 1, 0 );
                for( id=0; id < num_gpus; id++ ) {
                    magma_setdevice(id);
                    dx2[id] = dW1(id, 0, iw);
                    magma_dsetvector_async( n, A(0,i), 1, dW1(id, 0, iw), 1, stream[id][0]);
#ifndef  MAGMABLAS_DSYMV_MGPU
                    magma_dsetvector_async( i, A(0,i), 1, dx[id], 1, stream[id][0] );
#endif
                }
                magmablas_dsymv_mgpu(num_gpus, k, MagmaUpper, i, nb0, c_one, dA, ldda, 0,
                                     dx2, ione, c_zero, dy, ione, dwork, ldwork,
                                     work, W(0, iw), stream );

                if (i < n-1) {
                    blasf77_dgemv(MagmaTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw,
                                  A(0, i), &ione, &c_zero, W(i+1, iw), &ione);
                }

                /* overlap update */
                if ( i < n-1 && i-1 >= n - nb ) {
                    magma_int_t im1_1 = i_1 - 1;
                    magma_int_t im1   = i-1;
                    /* Update A(1:i,i) */
                    #if defined(PRECISION_z) || defined(PRECISION_c)
                        magma_int_t im1_n = i_n + 1;
                        lapackf77_dlacgv(&im1_n, W(im1, iw+1), &ldw);
                    #endif
                    blasf77_dgemv("No transpose", &im1_1, &i_n, &c_neg_one, A(0, i+1), &lda,
                                  W(im1, iw+1), &ldw, &c_one, A(0, i-1), &ione);
                    #if defined(PRECISION_z) || defined(PRECISION_c)
                        lapackf77_dlacgv(&im1_n, W(im1, iw+1), &ldw);
                        lapackf77_dlacgv(&im1_n, A(im1, i +1), &lda);
                    #endif
                    blasf77_dgemv("No transpose", &im1_1, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                                  A(im1, i+1), &lda, &c_one, A(0, i-1), &ione);
                    #if defined(PRECISION_z) || defined(PRECISION_c)
                        lapackf77_dlacgv(&im1_n, A(im1, i+1), &lda);
                    #endif
                }

                // 3. Here is where we need it // TODO find the right place
                magmablas_dsymv_sync(num_gpus, k, i, work, W(0, iw), stream );

                if (i < n-1) {
                    blasf77_dgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda,
                                  W(i+1, iw), &ione, &c_one, W(0, iw), &ione);

                    blasf77_dgemv(MagmaTransStr, &i, &i_n, &c_one, A(0, i+1), &lda,
                                  A(0, i), &ione, &c_zero, W(i+1, iw), &ione);

                    blasf77_dgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                                  W(i+1, iw), &ione, &c_one, W(0, iw), &ione);
                }

                blasf77_dscal(&i, &tau[i - 1], W(0, iw), &ione);

                #if defined(PRECISION_z) || defined(PRECISION_c)
                cblas_ddot_sub( i, W(0,iw), ione, A(0,i), ione, &value );
                #else
                value = cblas_ddot( i, W(0,iw), ione, A(0,i), ione );
                #endif
                alpha = tau[i - 1] * -.5f * value;
                blasf77_daxpy(&i, &alpha, A(0, i), &ione, W(0, iw), &ione);

                for( id=0; id < num_gpus; id++ ) {
                    magma_setdevice(id);
                    if ( k > 1 ) {
                        magma_dsetvector_async( n, W(0,iw), 1, dW(id, 0, iw), 1, stream[id][1] );
                    } else {
                        magma_dsetvector_async( n, W(0,iw), 1, dW(id, 0, iw), 1, stream[id][0] );
                    }
                }
            }
        }
    } else {
        /*  Reduce first NB columns of lower triangle */
        for (i = 0; i < nb; ++i) {
            /* Update A(i:n,i) */
            i_n = n - i;
            idw = ((offset+i)/nb)%num_gpus;
            if ( i > 0 ) {
                trace_cpu_start( 0, "gemv", "gemv" );
                double wii = *W(i, i-1);
                #if defined(PRECISION_z) || defined(PRECISION_c)
                    lapackf77_dlacgv(&i_one, &wii, &ldw);
                #endif
                wii = -wii;
                blasf77_daxpy( &i_n, &wii, A(i, i-1), &ione, A(i, i), &ione);

                wii = *A(i, i-1);
                #if defined(PRECISION_z) || defined(PRECISION_c)
                    lapackf77_dlacgv(&i_one, &wii, &lda);
                #endif
                wii = -wii;
                blasf77_daxpy( &i_n, &wii, W(i, i-1), &ione, A(i, i), &ione);
            }

            if (i < n-1) {
                /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */
                i_n = n - i - 1;
                trace_cpu_start( 0, "larfg", "larfg" );
                alpha = *A(i+1, i);
#ifdef PROFILE_SYMV
                cpu_start = get_current_time();
#endif
                lapackf77_dlarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]);
#ifdef PROFILE_SYMV
                cpu_end = get_current_time();
                times[0] += GetTimerValue(cpu_start,cpu_end)/1000.0;
#endif
                e[i] = MAGMA_D_REAL( alpha );
                *A(i+1,i) = MAGMA_D_MAKE( 1, 0 );
                trace_cpu_end( 0 );

                /* Compute W(i+1:n,i) */
                // 1. Send the block reflector  A(i+1:n,i) to the GPU
                //trace_gpu_start(  idw, 0, "comm", "comm1" );
#ifndef  MAGMABLAS_DSYMV_MGPU
                magma_setdevice(idw);
                magma_dsetvector( i_n, A(i+1,i), 1, dA(idw, i+1, i), 1 );
#endif
                for( id=0; id < num_gpus; id++ ) {
                    magma_setdevice(id);
                    trace_gpu_start( id, 0, "comm", "comm" );
#ifdef MAGMABLAS_DSYMV_MGPU
                    dx2[id] = dW1(id, 0, i)-offset;
#else
                    dx2[id] = dx[id];
                    magma_dsetvector( i_n, A(i+1,i), 1, dx[id], 1 );
#endif
                    magma_dsetvector_async( n, A(0,i), 1, dW1(id, 0, i), 1, stream[id][0] );
                    trace_gpu_end( id, 0 );
                }
                /* mat-vec on multiple GPUs */
#ifdef PROFILE_SYMV
                magma_setdevice(0);
                magma_event_record(start, stream[0][0]);
#endif
                magmablas_dsymv_mgpu(num_gpus, k, MagmaLower, i_n, nb0, c_one, dA, ldda, offset+i+1,
                                       dx2, ione, c_zero, dy, ione, dwork, ldwork,
                                       work, W(i+1,i), stream );
#ifdef PROFILE_SYMV
                magma_setdevice(0);
                magma_event_record(stop, stream[0][0]);
#endif
                trace_cpu_start( 0, "gemv", "gemv" );
                blasf77_dgemv(MagmaTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw,
                              A(i+1, i), &ione, &c_zero, W(0, i), &ione);
                blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda,
                              W(0, i), &ione, &c_zero, f, &ione);
                blasf77_dgemv(MagmaTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda,
                              A(i+1, i), &ione, &c_zero, W(0, i), &ione);
                trace_cpu_end( 0 );

                /* overlap update */
                if ( i > 0 && i+1 < n ) {
                    magma_int_t ip1 = i+1;
                    trace_cpu_start( 0, "gemv", "gemv" );
                    #if defined(PRECISION_z) || defined(PRECISION_c)
                        lapackf77_dlacgv(&i, W(ip1, 0), &ldw);
                    #endif
                    blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, A(ip1, 0), &lda,
                                  W(ip1, 0), &ldw, &c_one, A(ip1, ip1), &ione);
                    #if defined(PRECISION_z) || defined(PRECISION_c)
                        lapackf77_dlacgv(&i, W(ip1, 0), &ldw);
                        lapackf77_dlacgv(&i, A(ip1, 0), &lda);
                    #endif
                    blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, W(ip1, 0), &ldw,
                                  A(ip1, 0), &lda, &c_one, A(ip1, ip1), &ione);
                    #if defined(PRECISION_z) || defined(PRECISION_c)
                        lapackf77_dlacgv(&i, A(ip1, 0), &lda);
                    #endif
                    trace_cpu_end( 0 );
                }

                /* synchronize */
                magmablas_dsymv_sync(num_gpus, k, i_n, work, W(i+1,i), stream );
#ifdef PROFILE_SYMV
                cudaEventElapsedTime(&etime, start, stop);
                mv_time += (etime/1000.0);
                times[1+(i_n/(n0/10))] += (etime/1000.0);
#endif
                trace_cpu_start( 0, "axpy", "axpy" );
                if (i != 0)
                    blasf77_daxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione);

                blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw,
                              W(0, i), &ione, &c_one, W(i+1, i), &ione);
                blasf77_dscal(&i_n, &tau[i], W(i+1,i), &ione);

                #if defined(PRECISION_z) || defined(PRECISION_c)
                    cblas_ddot_sub( i_n, W(i+1,i), ione, A(i+1,i), ione, &value );
                #else
                    value = cblas_ddot( i_n, W(i+1,i), ione, A(i+1,i), ione );
                #endif
                alpha = tau[i]* -.5f * value;
                blasf77_daxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione);
                trace_cpu_end( 0 );
                for( id=0; id < num_gpus; id++ ) {
                    magma_setdevice(id);
                    if ( k > 1 ) {
                        magma_dsetvector_async( n, W(0,i), 1, dW(id, 0, i), 1, stream[id][1] );
                    } else {
                        magma_dsetvector_async( n, W(0,i), 1, dW(id, 0, i), 1, stream[id][0] );
                    }
                }
            }
        }
    }

#ifdef PROFILE_SYMV
    magma_setdevice(0);
    magma_event_destory( start );
    magma_event_destory( stop  );
#endif
    for( id=0; id < num_gpus; id++ ) {
        magma_setdevice(id);
        if ( k > 1 )
            magma_queue_sync(stream[id][1]);
    }
    magma_free_cpu(f);

    return mv_time;
} /* magma_dlatrd_mgpu */
Exemplo n.º 8
0
extern "C" magma_int_t 
magma_dsytrf_nopiv(magma_uplo_t uplo, magma_int_t n, 
                   double *A, magma_int_t lda, 
                   magma_int_t *info)
{
/*  -- MAGMA (version 1.6.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2011

    Purpose   
    =======   

    DSYTRF_nopiv computes the LDLt factorization of a real symmetric   
    matrix A. This version does not require work space on the GPU passed 
    as input. GPU memory is allocated in the routine.

    The factorization has the form   
       A = U\*\*H * D * U,  if UPLO = 'U', or   
       A = L  * D * L\*\*H, if UPLO = 'L',   
    where U is an upper triangular matrix, L is lower triangular, and
    D is a diagonal matrix.

    This is the block version of the algorithm, calling Level 3 BLAS.   

    Arguments   
    =========   

    UPLO    (input) CHARACTER*1   
            = 'U':  Upper triangle of A is stored;   
            = 'L':  Lower triangle of A is stored.   

    N       (input) INTEGER   
            The order of the matrix A.  N >= 0.   

    A       (input/output) DOUBLE_PRECISION array, dimension (LDA,N)   
            On entry, the symmetric matrix A.  If UPLO = 'U', the leading   
            N-by-N upper triangular part of A contains the upper   
            triangular part of the matrix A, and the strictly lower   
            triangular part of A is not referenced.  If UPLO = 'L', the   
            leading N-by-N lower triangular part of A contains the lower   
            triangular part of the matrix A, and the strictly upper   
            triangular part of A is not referenced.   

            On exit, if INFO = 0, the factor U or L from the Cholesky   
            factorization A = U\*\*H*U or A = L*L\*\*H.   

            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using cudaMallocHost.

    LDA     (input) INTEGER   
            The leading dimension of the array A.  LDA >= max(1,N).   

    INFO    (output) INTEGER   
            = 0:  successful exit   
            < 0:  if INFO = -i, the i-th argument had an illegal value 
                  if INFO = -6, the GPU memory allocation failed 
            > 0:  if INFO = i, the leading minor of order i is not   
                  positive definite, and the factorization could not be   
                  completed.   

    =====================================================================    */


    /* Local variables */
    double zone  = MAGMA_D_ONE;
    double mzone = MAGMA_D_NEG_ONE;
    int                upper = (uplo == MagmaUpper);
    magma_int_t j, k, jb, ldda, nb, ib, iinfo;
    magmaDouble_ptr dA;
    magmaDouble_ptr dW;

    *info = 0;
    if (! upper && uplo != MagmaLower) {
      *info = -1;
    } else if (n < 0) {
      *info = -2;
    } else if (lda < max(1,n)) {
      *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return MAGMA_ERR_ILLEGAL_VALUE;
    }

    /* Quick return */
    if ( n == 0 )
      return MAGMA_SUCCESS;

    ldda = ((n+31)/32)*32;
    nb = magma_get_dsytrf_nopiv_nb(n);
    ib = min(32, nb); // inner-block for diagonal factorization

    if ((MAGMA_SUCCESS != magma_dmalloc(&dA, n *ldda)) ||
        (MAGMA_SUCCESS != magma_dmalloc(&dW, nb*ldda))) {
        /* alloc failed so call the non-GPU-resident version */
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    magma_queue_t stream[2];
    magma_event_t event;
    magma_queue_create(&stream[0]);
    magma_queue_create(&stream[1]);
    magma_event_create( &event );
    trace_init( 1, 1, 2, (CUstream_st**)stream );

    //if (nb <= 1 || nb >= n) 
    //{
    //    lapackf77_dpotrf(uplo_, &n, a, &lda, info);
    //} else 
    {
        /* Use hybrid blocked code. */
        if (upper) {
            //=========================================================
            // Compute the LDLt factorization A = U'*D*U without pivoting.
            // copy matrix to GPU
            for (j=0; j<n; j+=nb) {
                jb = min(nb, (n-j));
                trace_gpu_start( 0, 0, "set", "set" );
                magma_dsetmatrix_async(j+jb, jb, A(0, j), lda, dA(0, j), ldda, stream[0]);
                trace_gpu_end( 0, 0 );
            }

            // main loop
            for (j=0; j<n; j += nb) {
                jb = min(nb, (n-j));

                // copy A(j,j) back to CPU
                trace_gpu_start( 0, 0, "get", "get" );
                magma_dgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, stream[0]);
                trace_gpu_end( 0, 0 );

                // copy j-th column of U back to CPU
                magma_queue_wait_event( stream[1], event );
                trace_gpu_start( 0, 1, "get", "get" );
                magma_dgetmatrix_async(j, jb, dA(0, j), ldda, A(0, j), lda, stream[1]);
                trace_gpu_end( 0, 1 );

                // factorize the diagonal block
                magma_queue_sync(stream[0]);
                trace_cpu_start( 0, "potrf", "potrf" );
                dsytrf_nopiv_cpu(MagmaUpper, jb, ib, A(j, j), lda, info);
                trace_cpu_end( 0 );
                if (*info != 0){
                    *info = *info + j;
                    break;
                }

                // copy A(j,j) back to GPU
                trace_gpu_start( 0, 0, "set", "set" );
                magma_dsetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, stream[0]);
                trace_gpu_end( 0, 0 );
                
                if ( (j+jb) < n) {
                    // compute the off-diagonal blocks of current block column
                    magmablasSetKernelStream( stream[0] );
                    trace_gpu_start( 0, 0, "trsm", "trsm" );
                    magma_dtrsm(MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaUnit, 
                                jb, (n-j-jb), 
                                zone, dA(j, j),    ldda, 
                                      dA(j, j+jb), ldda);
                    magma_dcopymatrix( jb, n-j-jb, dA( j, j+jb ), ldda, dWt( 0, j+jb ), nb );

                    // update the trailing submatrix with D
                    magmablas_dlascl_diag(MagmaUpper, jb, n-j-jb,
                                          dA(j,    j), ldda,
                                          dA(j, j+jb), ldda,
                                          &iinfo);
                    magma_event_record( event, stream[0] );
                    trace_gpu_end( 0, 0 );

                    // update the trailing submatrix with U and W
                    trace_gpu_start( 0, 0, "gemm", "gemm" );
                    for (k=j+jb; k<n; k+=nb)
                    {
                        magma_int_t kb = min(nb,n-k);
                        magma_dgemm(MagmaConjTrans, MagmaNoTrans, kb, n-k, jb,
                                    mzone, dWt(0, k), nb, 
                                           dA(j, k), ldda,
                                    zone,  dA(k, k), ldda);
                    }
                    trace_gpu_end( 0, 0 );
                }
            }
        } else {
            //=========================================================
            // Compute the LDLt factorization A = L*D*L' without pivoting.
            // copy the matrix to GPU
            for (j=0; j<n; j+=nb) {
                jb = min(nb, (n-j));
                trace_gpu_start( 0, 0, "set", "set" );
                magma_dsetmatrix_async((n-j), jb, A(j, j), lda, dA(j, j), ldda, stream[0]);
                trace_gpu_end( 0, 0 );
            }

            // main loop
            for (j=0; j<n; j+=nb) {
                jb = min(nb, (n-j));

                // copy A(j,j) back to CPU
                trace_gpu_start( 0, 0, "get", "get" );
                magma_dgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, stream[0]);
                trace_gpu_end( 0, 0 );

                // copy j-th row of L back to CPU
                magma_queue_wait_event( stream[1], event );
                trace_gpu_start( 0, 1, "get", "get" );
                magma_dgetmatrix_async(jb, j, dA(j, 0), ldda, A(j, 0), lda, stream[1]);
                trace_gpu_end( 0, 1 );

                // factorize the diagonal block
                magma_queue_sync(stream[0]);
                trace_cpu_start( 0, "potrf", "potrf" );
                dsytrf_nopiv_cpu(MagmaLower, jb, ib, A(j, j), lda, info);
                trace_cpu_end( 0 );
                if (*info != 0){
                    *info = *info + j;
                    break;
                }
                // copy A(j,j) back to GPU
                trace_gpu_start( 0, 0, "set", "set" );
                magma_dsetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, stream[0]);
                trace_gpu_end( 0, 0 );
                
                if ( (j+jb) < n) {
                    // compute the off-diagonal blocks of current block column
                    magmablasSetKernelStream( stream[0] );
                    trace_gpu_start( 0, 0, "trsm", "trsm" );
                    magma_dtrsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaUnit, 
                                (n-j-jb), jb, 
                                zone, dA(j,    j), ldda, 
                                      dA(j+jb, j), ldda);
                    magma_dcopymatrix( n-j-jb,jb, dA( j+jb, j ), ldda, dW( j+jb, 0 ), ldda );

                    // update the trailing submatrix with D
                    magmablas_dlascl_diag(MagmaLower, n-j-jb, jb,
                                          dA(j,    j), ldda,
                                          dA(j+jb, j), ldda,
                                          &iinfo);
                    magma_event_record( event, stream[0] );
                    trace_gpu_end( 0, 0 );

                    // update the trailing submatrix with L and W
                    trace_gpu_start( 0, 0, "gemm", "gemm" );
                    for (k=j+jb; k<n; k+=nb)
                    {
                        magma_int_t kb = min(nb,n-k);
                        magma_dgemm(MagmaNoTrans, MagmaConjTrans, n-k, kb, jb,
                                    mzone, dA(k, j), ldda, 
                                           dW(k, 0), ldda,
                                    zone,  dA(k, k), ldda);
                    }
                    trace_gpu_end( 0, 0 );
                }
            }
        }
    }
    
    trace_finalize( "dsytrf.svg","trace.css" );
    magma_queue_destroy(stream[0]);
    magma_queue_destroy(stream[1]);
    magma_event_destroy( event );
    magma_free(dW);
    magma_free(dA);
    
    return MAGMA_SUCCESS;
} /* magma_dsytrf_nopiv */
Exemplo n.º 9
0
/**
    Purpose
    -------
    ZGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
       A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.
    Use two buffer to send panels.

    Arguments
    ---------
    @param[in]
    ngpu    INTEGER
            Number of GPUs to use. ngpu > 0.

    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    d_lAT   COMPLEX_16 array of pointers on the GPU, dimension (ngpu).
            On entry, the M-by-N matrix A distributed over GPUs
            (d_lAT[d] points to the local matrix on d-th GPU).
            It uses a 1D block column cyclic format (with the block size
            nb), and each local matrix is stored by row.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    @param[in]
    lddat   INTEGER
            The leading dimension of the array d_lAT[d]. LDDA >= max(1,M).

    @param[out]
    ipiv    INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    @param (workspace) on device
    d_lAP   COMPLEX_16 array of pointers on the GPU, dimension (ngpu).
            d_lAP[d] is the workspace on d-th GPU. Each local workspace
            must be of size (3+ngpu)*nb*maxm, where maxm is m rounded
            up to a multiple of 32 and nb is the block size.

    @param (workspace)
    W       COMPLEX_16 array, dimension (ngpu*nb*maxm).
            It is used to store panel on CPU.

    @param[in]
    ldw     INTEGER
            The leading dimension of the workspace w.

    @param[in]
    queues  magma_queue_t
            queues[d] points to the streams for the d-th GPU to execute
            in. Each GPU require two streams.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
      -     > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    @ingroup magma_zgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zgetrf2_mgpu(
    magma_int_t ngpu,
    magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t offset,
    magmaDoubleComplex_ptr d_lAT[], magma_int_t lddat, magma_int_t *ipiv,
    magmaDoubleComplex_ptr d_lAP[],
    magmaDoubleComplex *W, magma_int_t ldw,
    magma_queue_t queues[][2],
    magma_int_t *info)
{
#define dAT(id,i,j)  (d_lAT[(id)] + ((offset)+(i)*nb)*lddat + (j)*nb)
#define W(j) (W + ((j)%ngpu)*nb*ldw)

    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;

    magma_int_t block_size = 32;
    magma_int_t iinfo, n_local[MagmaMaxGPUs];
    magma_int_t maxm, mindim;
    magma_int_t i, j, d, dd, rows, cols, s, ldpan[MagmaMaxGPUs];
    magma_int_t id, j_local, j_local2, nb0, nb1, h = 2+ngpu;
    magmaDoubleComplex *d_panel[MagmaMaxGPUs], *panel_local[MagmaMaxGPUs];

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -2;
    else if (n < 0)
        *info = -3;
    else if (ngpu*lddat < max(1,n))
        *info = -5;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    if ( ngpu > ceil((double)n/nb) ) {
        *info = -1;
        return *info;
    }

    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );
    
    /* Use hybrid blocked code. */
    maxm  = ((m + block_size-1)/block_size)*block_size;

    /* some initializations */
    for (d=0; d < ngpu; d++) {
        magma_setdevice(d);
        
        n_local[d] = ((n/nb)/ngpu)*nb;
        if (d < (n/nb) % ngpu)
            n_local[d] += nb;
        else if (d == (n/nb) % ngpu)
            n_local[d] += n % nb;
        
        /* workspaces */
        d_panel[d] = &(d_lAP[d][h*nb*maxm]);   /* temporary panel storage */
    }
    trace_init( 1, ngpu, 2, (CUstream_st**)queues );

    /* start sending the panel to cpu */
    nb0 = min(mindim, nb);
    magma_setdevice(0);
    magmablasSetKernelStream(queues[0][1]);
    trace_gpu_start( 0, 1, "comm", "get" );
    magmablas_ztranspose( nb0, m, dAT(0,0,0), lddat, d_lAP[0], maxm );
    magma_zgetmatrix_async( m, nb0,
                            d_lAP[0], maxm,
                            W(0),     ldw, queues[0][1] );
    trace_gpu_end( 0, 1 );

    /* ------------------------------------------------------------------------------------- */
    magma_timer_t time=0;
    timer_start( time );

    s = mindim / nb;
    for( j=0; j < s; j++ ) {
        /* Set the GPU number that holds the current panel */
        id = j % ngpu;
        magma_setdevice(id);
        
        /* Set the local index where the current panel is */
        j_local = j/ngpu;
        cols  = maxm - j*nb;
        rows  = m - j*nb;
        
        /* synchronize j-th panel from id-th gpu into work */
        magma_queue_sync( queues[id][1] );
        
        /* j-th panel factorization */
        trace_cpu_start( 0, "getrf", "getrf" );
        lapackf77_zgetrf( &rows, &nb, W(j), &ldw, ipiv+j*nb, &iinfo);
        if ( (*info == 0) && (iinfo > 0) ) {
            *info = iinfo + j*nb;
        }
        trace_cpu_end( 0 );
        
        /* start sending the panel to all the gpus */
        d = (j+1) % ngpu;
        for( dd=0; dd < ngpu; dd++ ) {
            magma_setdevice(d);
            trace_gpu_start( 0, 1, "comm", "set" );
            magma_zsetmatrix_async( rows, nb,
                                    W(j),     ldw,
                                    &d_lAP[d][(j%h)*nb*maxm], cols,
                                    queues[d][1] );
            trace_gpu_end( 0, 1 );
            d = (d+1) % ngpu;
        }
        
        /* apply the pivoting */
        d = (j+1) % ngpu;
        for( dd=0; dd < ngpu; dd++ ) {
            magma_setdevice(d);
            
            trace_gpu_start( d, 1, "pivot", "pivot" );
            if ( dd == 0 ) {
                for( i=j*nb; i < j*nb + nb; ++i ) {
                    ipiv[i] += j*nb;
                }
            }
            magmablas_zlaswp_q( lddat, dAT(d,0,0), lddat, j*nb + 1, j*nb + nb, ipiv, 1, queues[d][0] );
            trace_gpu_end( d, 1 );
            d = (d+1) % ngpu;
        }
    
    
        /* update the trailing-matrix/look-ahead */
        d = (j+1) % ngpu;
        for( dd=0; dd < ngpu; dd++ ) {
            magma_setdevice(d);
            
            /* storage for panel */
            if ( d == id ) {
                /* the panel belond to this gpu */
                panel_local[d] = dAT(d,j,j_local);
                ldpan[d] = lddat;
                /* next column */
                j_local2 = j_local+1;
            } else {
                /* the panel belong to another gpu */
                panel_local[d] = d_panel[d];
                ldpan[d] = nb;
                /* next column */
                j_local2 = j_local;
                if ( d < id ) j_local2 ++;
            }
            /* the size of the next column */
            if ( s > (j+1) ) {
                nb0 = nb;
            } else {
                nb0 = n_local[d]-nb*(s/ngpu);
                if ( d < s % ngpu ) nb0 -= nb;
            }
            if ( d == (j+1) % ngpu) {
                /* owns the next column, look-ahead the column */
                nb1 = nb0;
                magmablasSetKernelStream(queues[d][1]);
                
                /* make sure all the pivoting has been applied */
                magma_queue_sync(queues[d][0]);
                trace_gpu_start( d, 1, "gemm", "gemm" );
                /* transpose panel on GPU */
                magmablas_ztranspose( rows, nb, &d_lAP[d][(j%h)*nb*maxm], cols, panel_local[d], ldpan[d] );
                /* synch for remaining update */
                magma_queue_sync(queues[d][1]);
            } else {
                /* update the entire trailing matrix */
                nb1 = n_local[d] - j_local2*nb;
                magmablasSetKernelStream(queues[d][0]);
                
                /* synchronization to make sure panel arrived on gpu */
                magma_queue_sync(queues[d][1]);
                trace_gpu_start( d, 0, "gemm", "gemm" );
                /* transpose panel on GPU */
                magmablas_ztranspose( rows, nb, &d_lAP[d][(j%h)*nb*maxm], cols, panel_local[d], ldpan[d] );
            }
            
            /* gpu updating the trailing matrix */
            magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                         nb1, nb, c_one,
                         panel_local[d],       ldpan[d],
                         dAT(d, j, j_local2), lddat);
            magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                         nb1, m-(j+1)*nb, nb,
                         c_neg_one, dAT(d, j,   j_local2),         lddat,
                                    &(panel_local[d][nb*ldpan[d]]), ldpan[d],
                         c_one,     dAT(d, j+1, j_local2),         lddat );
        
            if ( d == (j+1) % ngpu ) {
                /* Set the local index where the current panel is */
                int loff    = j+1;
                int j_local = (j+1)/ngpu;
                int ldda    = maxm - (j+1)*nb;
                int cols    = m - (j+1)*nb;
                nb0 = min(nb, mindim - (j+1)*nb); /* size of the diagonal block */
                trace_gpu_end( d, 1 );
                
                if ( nb0 > 0 ) {
                    /* transpose the panel for sending it to cpu */
                    trace_gpu_start( d, 1, "comm", "get" );
                    magmablas_ztranspose( nb0, m-(j+1)*nb, dAT(d,loff,j_local), lddat, &d_lAP[d][((j+1)%h)*nb*maxm], ldda );
             
                    /* send the panel to cpu */
                    magma_zgetmatrix_async( cols, nb0,
                                            &d_lAP[d][((j+1)%h)*nb*maxm], ldda,
                                            W(j+1), ldw, queues[d][1] );

                    trace_gpu_end( d, 1 );
                }
            } else {
                trace_gpu_end( d, 0 );
            }
            
            d = (d+1) % ngpu;
        }
    
        /* update the remaining matrix by gpu owning the next panel */
        if ( (j+1) < s ) {
            int j_local = (j+1)/ngpu;
            int rows  = m - (j+1)*nb;
            
            d = (j+1) % ngpu;
            magma_setdevice(d);
            magmablasSetKernelStream(queues[d][0]);
            trace_gpu_start( d, 0, "gemm", "gemm" );

            magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                         n_local[d] - (j_local+1)*nb, nb,
                         c_one, panel_local[d],       ldpan[d],
                                dAT(d,j,j_local+1),  lddat );
            magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                         n_local[d]-(j_local+1)*nb, rows, nb,
                         c_neg_one, dAT(d,j,j_local+1),            lddat,
                                    &(panel_local[d][nb*ldpan[d]]), ldpan[d],
                         c_one,     dAT(d,j+1,  j_local+1),        lddat );
            trace_gpu_end( d, 0 );
        }
    } /* end of for j=1..s */
    /* ------------------------------------------------------------------------------ */
    
    /* Set the GPU number that holds the last panel */
    id = s % ngpu;
    
    /* Set the local index where the last panel is */
    j_local = s/ngpu;
    
    /* size of the last diagonal-block */
    nb0 = min(m - s*nb, n - s*nb);
    rows = m    - s*nb;
    cols = maxm - s*nb;
    
    if ( nb0 > 0 ) {
        magma_setdevice(id);
        
        /* wait for the last panel on cpu */
        magma_queue_sync( queues[id][1] );
    
        /* factor on cpu */
        lapackf77_zgetrf( &rows, &nb0, W(s), &ldw, ipiv+s*nb, &iinfo);
        if ( (*info == 0) && (iinfo > 0) )
            *info = iinfo + s*nb;
        
        /* send the factor to gpus */
        for( d=0; d < ngpu; d++ ) {
            magma_setdevice(d);
            j_local2 = j_local;
            if ( d < id ) j_local2 ++;
            
            if ( d == id || n_local[d] > j_local2*nb ) {
                magma_zsetmatrix_async( rows, nb0,
                                        W(s),     ldw,
                                        &d_lAP[d][(s%h)*nb*maxm],
                                        cols, queues[d][1] );
            }
        }
        
        for( d=0; d < ngpu; d++ ) {
            magma_setdevice(d);
            if ( d == 0 ) {
                for( i=s*nb; i < s*nb + nb0; ++i ) {
                    ipiv[i] += s*nb;
                }
            }
            magmablas_zlaswp_q( lddat, dAT(d,0,0), lddat, s*nb + 1, s*nb + nb0, ipiv, 1, queues[d][0] );
        }
        
        for( d=0; d < ngpu; d++ ) {
            magma_setdevice(d);
            magmablasSetKernelStream(queues[d][1]);
            
            /* wait for the pivoting to be done */
            magma_queue_sync( queues[d][0] );
            
            j_local2 = j_local;
            if ( d < id ) j_local2++;
            if ( d == id ) {
                /* the panel belond to this gpu */
                panel_local[d] = dAT(d,s,j_local);
                
                /* next column */
                nb1 = n_local[d] - j_local*nb-nb0;
                
                magmablas_ztranspose( rows, nb0, &d_lAP[d][(s%h)*nb*maxm], cols, panel_local[d], lddat );
                
                if ( nb1 > 0 ) {
                    magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                                 nb1, nb0, c_one,
                                 panel_local[d],        lddat,
                                 dAT(d,s,j_local)+nb0, lddat);
                }
            } else if ( n_local[d] > j_local2*nb ) {
                /* the panel belong to another gpu */
                panel_local[d] = d_panel[d];
                
                /* next column */
                nb1 = n_local[d] - j_local2*nb;
                
                magmablas_ztranspose( rows, nb0, &d_lAP[d][(s%h)*nb*maxm], cols, panel_local[d], nb );
                magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             nb1, nb0, c_one,
                             panel_local[d],     nb,
                             dAT(d,s,j_local2), lddat);
            }
        }
    } /* if ( nb0 > 0 ) */
    
    /* clean up */
    trace_finalize( "zgetrf_mgpu.svg","trace.css" );
    for( d=0; d < ngpu; d++ ) {
        magma_setdevice(d);
        magma_queue_sync( queues[d][0] );
        magma_queue_sync( queues[d][1] );
    }
    magma_setdevice( orig_dev );
    magmablasSetKernelStream( orig_stream );
    
    timer_start( time );
    timer_printf("\n Performance %f GFlop/s\n", FLOPS_ZGETRF(m,n) / 1e9 / time );

    return *info;
} /* magma_zgetrf2_mgpu */
Exemplo n.º 10
0
/**
    Purpose
    -------
    ZLATRD reduces NB rows and columns of a complex Hermitian matrix A to
    Hermitian tridiagonal form by an orthogonal similarity
    transformation Q' * A * Q, and returns the matrices V and W which are
    needed to apply the transformation to the unreduced part of A.

    If UPLO = MagmaUpper, ZLATRD reduces the last NB rows and columns of a
    matrix, of which the upper triangle is supplied;
    if UPLO = MagmaLower, ZLATRD reduces the first NB rows and columns of a
    matrix, of which the lower triangle is supplied.

    This is an auxiliary routine called by ZHETRD.

    Arguments
    ---------
    @param[in]
    ngpu    INTEGER
            Number of GPUs to use. ngpu > 0.

    @param[in]
    uplo    magma_uplo_t
            Specifies whether the upper or lower triangular part of the
            Hermitian matrix A is stored:
      -     = MagmaUpper: Upper triangular
      -     = MagmaLower: Lower triangular

    @param[in]
    n       INTEGER
            The order of the matrix A.

    @param[in]
    nb      INTEGER
            The number of rows and columns to be reduced.

    @param[in]
    nb0     INTEGER
            The block size used for the matrix distribution.
            nb and nb0 can be different for the final step of zhetrd.

    @param[in,out]
    A       COMPLEX_16 array, dimension (LDA,N)
            On entry, the Hermitian matrix A.  If UPLO = MagmaUpper, the leading
            n-by-n upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = MagmaLower, the
            leading n-by-n lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
            On exit:
      -     if UPLO = MagmaUpper, the last NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements above the diagonal
              with the array TAU, represent the orthogonal matrix Q as a
              product of elementary reflectors;
      -     if UPLO = MagmaLower, the first NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements below the diagonal
              with the array TAU, represent the  orthogonal matrix Q as a
              product of elementary reflectors.
            See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= (1,N).

    @param[out]
    e       COMPLEX_16 array, dimension (N-1)
            If UPLO = MagmaUpper, E(n-nb:n-1) contains the superdiagonal
            elements of the last NB columns of the reduced matrix;
            if UPLO = MagmaLower, E(1:nb) contains the subdiagonal elements of
            the first NB columns of the reduced matrix.

    @param[out]
    tau     COMPLEX_16 array, dimension (N-1)
            The scalar factors of the elementary reflectors, stored in
            TAU(n-nb:n-1) if UPLO = MagmaUpper, and in TAU(1:nb) if UPLO = MagmaLower.
            See Further Details.

    @param[out]
    W       COMPLEX_16 array, dimension (LDW,NB)
            The n-by-nb matrix W required to update the unreduced part
            of A.

    @param[in]
    ldw     INTEGER
            The leading dimension of the array W. LDW >= max(1,N).

    @param
    dA

    @param[in]
    ldda

    @param[in]
    offset

    @param
    dW

    @param[in]
    lddw

    @param
    hwork

    @param[in]
    lhwork

    @param
    dwork

    @param[in]
    ldwork
             
    @param[in]
    queues  magma_queue_t array of dimension (ngpu).
            queues[dev] is an execution queue on GPU dev.
    
    Further Details
    ---------------
    If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary
    reflectors

       Q = H(n) H(n-1) . . . H(n-nb+1).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i),
    and tau in TAU(i-1).

    If UPLO = MagmaLower, the matrix Q is represented as a product of elementary
    reflectors

       Q = H(1) H(2) . . . H(nb).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i),
    and tau in TAU(i).

    The elements of the vectors v together form the n-by-nb matrix V
    which is needed, with W, to apply the transformation to the unreduced
    part of the matrix, using a Hermitian rank-2k update of the form:
    A := A - V*W' - W*V'.

    The contents of A on exit are illustrated by the following examples
    with n = 5 and nb = 2:

    if UPLO = MagmaUpper:                       if UPLO = MagmaLower:

      (  a   a   a   v4  v5 )              (  d                  )
      (      a   a   v4  v5 )              (  1   d              )
      (          a   1   v5 )              (  v1  1   a          )
      (              d   1  )              (  v1  v2  a   a      )
      (                  d  )              (  v1  v2  a   a   a  )

    where d denotes a diagonal element of the reduced matrix, a denotes
    an element of the original matrix that is unchanged, and vi denotes
    an element of the vector defining H(i).

    @ingroup magma_zheev_aux
    ********************************************************************/
extern "C" magma_int_t
magma_zlatrd_mgpu(
    magma_int_t ngpu,
    magma_uplo_t uplo,
    magma_int_t n, magma_int_t nb, magma_int_t nb0,
    magmaDoubleComplex *A,  magma_int_t lda,
    double *e, magmaDoubleComplex *tau,
    magmaDoubleComplex *W,          magma_int_t ldw,
    magmaDoubleComplex_ptr dA[],    magma_int_t ldda, magma_int_t offset,
    magmaDoubleComplex_ptr dW[],    magma_int_t lddw,
    magmaDoubleComplex    *hwork,   magma_int_t lhwork,
    magmaDoubleComplex_ptr dwork[], magma_int_t ldwork,
    magma_queue_t queues[] )
{
#define A(i, j) (A + (j)*lda + (i))
#define W(i, j) (W + (j)*ldw + (i))

#define dA(dev, i, j)  (dA[(dev)] + ((j)+loffset)*ldda + (i) + offset)
#define dW(dev, i, j)  (dW[(dev)] + (j)          *lddw + (i))
#define dW1(dev, i, j) (dW[(dev)] + ((j)+nb)     *lddw + (i))

    /* Constants */
    const magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    const magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    const magmaDoubleComplex c_zero    = MAGMA_Z_ZERO;
    const magma_int_t ione = 1;

    /* Local variables */
    magmaDoubleComplex alpha, value;
    magma_int_t dev;
    magma_int_t i, n_i, n_i_1, ip1, iw;

    // TODO check arguments
    magma_int_t info = 0;
    if (n <= 0) {
        return info;
    }
    
    // TODO allocate f in zhetrd and pass into zlatrd. (e.g., expand hwork a bit)
    magmaDoubleComplex *f;
    magma_zmalloc_cpu( &f, n );
    if ( f == NULL ) {
        info = MAGMA_ERR_HOST_ALLOC;
        return info;
    }

    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    
    if (uplo == MagmaUpper) {
        /* Reduce last NB columns of upper triangle */
        for (i = n-1; i >= n - nb; --i) {
            ip1 = i + 1;
            n_i_1 = n - i - 1;
            iw = i - n + nb;
            if (i < n-1) {
                /* Update A(1:i,i) */
                magmaDoubleComplex wii = -conj( *W(i, iw+1) );
                blasf77_zaxpy( &ip1, &wii, A(0, i+1), &ione, A(0, i), &ione );

                wii = -conj( *A(i, i+1) );
                blasf77_zaxpy( &ip1, &wii, W(0, iw+1), &ione, A(0, i), &ione );
            }
            if (i > 0) {
                /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */
                alpha = *A(i-1, i);
                lapackf77_zlarfg( &i, &alpha, A(0, i), &ione, &tau[i - 1] );

                e[i-1] = MAGMA_Z_REAL( alpha );
                *A(i-1,i) = MAGMA_Z_ONE;
                
                // TODO Previously, this set dx2[dev] = dW1(dev, 0, iw); and used dx2 in zhemv.
                // TODO Now zhemv handles broadcasting x to the GPUs, but data in dW1 is
                // TODO apparently still used in zhetrd_mgpu / zher2k_mgpu.
                for( dev=0; dev < ngpu; dev++ ) {
                    magma_setdevice( dev );
                    magma_zsetvector_async( n, A(0,i), 1, dW1(dev, 0, iw), 1, queues[dev] );
                }
                magmablas_zhemv_mgpu( 
                    MagmaUpper, i, c_one, dA, ldda, 0,
                    A(0,i), 1, c_zero, W(0, iw), 1,
                    hwork, lhwork, dwork, ldwork, ngpu, nb0, queues );

                if (i < n-1) {
                    blasf77_zgemv( MagmaConjTransStr, &i, &n_i_1, &c_one,
                                   W(0,   iw+1), &ldw,
                                   A(0,   i),    &ione, &c_zero,
                                   W(i+1, iw),   &ione );
                }

                /* overlap update */
                if ( i < n-1 && i-1 >= n - nb ) {
                    /* Update A(1:i,i) */
                    #ifdef COMPLEX
                    lapackf77_zlacgv( &n_i_1, W(i-1, iw+1), &ldw );
                    #endif
                    blasf77_zgemv( "No transpose", &i, &n_i_1, &c_neg_one,
                                   A(0,   i+1),  &lda,
                                   W(i-1, iw+1), &ldw, &c_one,
                                   A(0,   i-1),  &ione );
                    #ifdef COMPLEX
                    lapackf77_zlacgv( &n_i_1, W(i-1, iw+1), &ldw );
                    lapackf77_zlacgv( &n_i_1, A(i-1, i +1), &lda );
                    #endif
                    blasf77_zgemv( "No transpose", &i, &n_i_1, &c_neg_one,
                                   W(0,   iw+1), &ldw,
                                   A(i-1, i+1),  &lda, &c_one,
                                   A(0,   i-1),  &ione );
                    #ifdef COMPLEX
                    lapackf77_zlacgv( &n_i_1, A(i-1, i+1), &lda );
                    #endif
                }

                // synchronize to get zhemv result W(0, iw)
                magmablas_zhemv_mgpu_sync( 
                    MagmaUpper, i, c_one, dA, ldda, 0,
                    A(0,i), 1, c_zero, W(0, iw), 1,
                    hwork, lhwork, dwork, ldwork, ngpu, nb0, queues );

                if (i < n-1) {
                    blasf77_zgemv( "No transpose", &i, &n_i_1, &c_neg_one,
                                   A(0,   i+1), &lda,
                                   W(i+1, iw),  &ione, &c_one,
                                   W(0,   iw),  &ione );

                    blasf77_zgemv( MagmaConjTransStr, &i, &n_i_1, &c_one,
                                   A(0,   i+1), &lda,
                                   A(0,   i),   &ione, &c_zero,
                                   W(i+1, iw),  &ione );

                    blasf77_zgemv( "No transpose", &i, &n_i_1, &c_neg_one,
                                   W(0,   iw+1), &ldw,
                                   W(i+1, iw),   &ione, &c_one,
                                   W(0,   iw),   &ione );
                }

                blasf77_zscal( &i, &tau[i - 1], W(0, iw), &ione );

                value = magma_cblas_zdotc( i, W(0,iw), ione, A(0,i), ione );
                alpha = tau[i - 1] * -0.5f * value;
                blasf77_zaxpy( &i, &alpha, A(0, i), &ione, W(0, iw), &ione );

                for( dev=0; dev < ngpu; dev++ ) {
                    magma_setdevice( dev );
                    magma_zsetvector_async( n, W(0,iw), 1, dW(dev, 0, iw), 1, queues[dev] );
                }
            }
        }
    } else {
        /*  Reduce first NB columns of lower triangle */
        for (i = 0; i < nb; ++i) {
            /* Update A(i:n,i) */
            n_i = n - i;
            //idw = ((offset+i)/nb)%ngpu;
            if ( i > 0 ) {
                trace_cpu_start( 0, "gemv", "gemv" );
                magmaDoubleComplex wii = -conj( *W(i, i-1) );
                blasf77_zaxpy( &n_i, &wii, A(i, i-1), &ione, A(i, i), &ione );

                wii = -conj( *A(i, i-1) );
                blasf77_zaxpy( &n_i, &wii, W(i, i-1), &ione, A(i, i), &ione );
            }

            if (i < n-1) {
                /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */
                n_i_1 = n - i - 1;
                trace_cpu_start( 0, "larfg", "larfg" );
                alpha = *A(i+1, i);
                lapackf77_zlarfg( &n_i_1, &alpha, A(min(i+2,n-1), i), &ione, &tau[i] );
                e[i] = MAGMA_Z_REAL( alpha );
                *A(i+1,i) = MAGMA_Z_ONE;
                trace_cpu_end( 0 );

                /* Compute W(i+1:n,i) */
                // TODO Previously, this set dx2[id] = dW1(id, 0, i)-offset; and used dx2 in zhemv.
                // TODO Now zhemv handles broadcasting x to the GPUs, but data in dW1 is
                // TODO apparently still used in zhetrd_mgpu / zher2k_mgpu.
                for( dev=0; dev < ngpu; dev++ ) {
                    magma_setdevice( dev );
                    magma_zsetvector_async( n, A(0,i), 1, dW1(dev, 0, i), 1, queues[dev] );
                }
                
                magmablas_zhemv_mgpu( 
                    MagmaLower, n_i_1, c_one, dA, ldda, offset+i+1,
                    A(i+1, i), 1, c_zero, W(i+1, i), 1,
                    hwork, lhwork, dwork, ldwork, ngpu, nb0, queues );
                
                trace_cpu_start( 0, "gemv", "gemv" );
                blasf77_zgemv( MagmaConjTransStr, &n_i_1, &i, &c_one,
                               W(i+1, 0), &ldw,
                               A(i+1, i), &ione, &c_zero,
                               W(0,   i), &ione );
                
                blasf77_zgemv( "No transpose", &n_i_1, &i, &c_neg_one,
                               A(i+1, 0), &lda,
                               W(0,   i), &ione, &c_zero,
                               f,         &ione );
                
                blasf77_zgemv( MagmaConjTransStr, &n_i_1, &i, &c_one,
                               A(i+1, 0), &lda,
                               A(i+1, i), &ione, &c_zero,
                               W(0,   i), &ione );
                trace_cpu_end( 0 );

                /* overlap update */
                if ( i > 0 && i+1 < n ) {
                    trace_cpu_start( 0, "gemv", "gemv" );
                    #ifdef COMPLEX
                    lapackf77_zlacgv( &i, W(i+1, 0), &ldw );
                    #endif
                    blasf77_zgemv( "No transpose", &n_i_1, &i, &c_neg_one,
                                   A(i+1, 0),   &lda,
                                   W(i+1, 0),   &ldw, &c_one,
                                   A(i+1, i+1), &ione );
                    #ifdef COMPLEX
                    lapackf77_zlacgv( &i, W(i+1, 0), &ldw );
                    lapackf77_zlacgv( &i, A(i+1, 0), &lda );
                    #endif
                    blasf77_zgemv( "No transpose", &n_i_1, &i, &c_neg_one,
                                   W(i+1, 0),   &ldw,
                                   A(i+1, 0),   &lda, &c_one,
                                   A(i+1, i+1), &ione );
                    #ifdef COMPLEX
                    lapackf77_zlacgv( &i, A(i+1, 0), &lda );
                    #endif
                    trace_cpu_end( 0 );
                }

                // synchronize to get zhemv result W(i+1, i)
                magmablas_zhemv_mgpu_sync( 
                    MagmaLower, n_i_1, c_one, dA, ldda, offset+i+1,
                    A(i+1, i), 1, c_zero, W(i+1, i), 1,
                    hwork, lhwork, dwork, ldwork, ngpu, nb0, queues );
                
                trace_cpu_start( 0, "axpy", "axpy" );
                if (i != 0) {
                    blasf77_zaxpy( &n_i_1, &c_one, f, &ione, W(i+1, i), &ione );
                }

                blasf77_zgemv( "No transpose", &n_i_1, &i, &c_neg_one,
                               W(i+1, 0), &ldw,
                               W(0,   i), &ione, &c_one,
                               W(i+1, i), &ione );
                blasf77_zscal( &n_i_1, &tau[i], W(i+1,i), &ione );

                value = magma_cblas_zdotc( n_i_1, W(i+1,i), ione, A(i+1,i), ione );
                alpha = tau[i] * -0.5f * value;
                blasf77_zaxpy( &n_i_1, &alpha, A(i+1, i), &ione, W(i+1,i), &ione );
                trace_cpu_end( 0 );
                for( dev=0; dev < ngpu; dev++ ) {
                    magma_setdevice( dev );
                    magma_zsetvector_async( n, W(0,i), 1, dW(dev, 0, i), 1, queues[dev] );
                }
            }
        }
    }

    magma_free_cpu( f );

    magma_setdevice( orig_dev );
    
    return info;
} /* magma_zlatrd_mgpu */
Exemplo n.º 11
0
extern "C" magma_int_t
magma_dpotrf3_mgpu(int num_gpus, char uplo, magma_int_t m, magma_int_t n, 
                   magma_int_t off_i, magma_int_t off_j, magma_int_t nb,
                   double **d_lA,  magma_int_t ldda, 
                   double **d_lP,  magma_int_t lddp, 
                   double *a,      magma_int_t lda,   magma_int_t h,
                   cudaStream_t stream[][3], cudaEvent_t event[][5],
                   magma_int_t *info ) 
{
/*  -- MAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2012

    Purpose   
    =======   
    DPOTRF computes the Cholesky factorization of a real symmetric   
    positive definite matrix dA.   
    Auxiliary subroutine for dpotrf2_ooc. It is multiple gpu interface to compute 
    Cholesky of a "rectangular" matrix.

    The factorization has the form   
       dA = U**T * U,  if UPLO = 'U', or   
       dA = L  * L**T,  if UPLO = 'L',   
    where U is an upper triangular matrix and L is lower triangular.   

    This is the block version of the algorithm, calling Level 3 BLAS.   

    Arguments   
    =========   
    UPLO    (input) CHARACTER*1   
            = 'U':  Upper triangle of dA is stored;   
            = 'L':  Lower triangle of dA is stored.   

    N       (input) INTEGER   
            The order of the matrix dA.  N >= 0.   

    dA      (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N)   
            On entry, the symmetric matrix dA.  If UPLO = 'U', the leading   
            N-by-N upper triangular part of dA contains the upper   
            triangular part of the matrix dA, and the strictly lower   
            triangular part of dA is not referenced.  If UPLO = 'L', the   
            leading N-by-N lower triangular part of dA contains the lower   
            triangular part of the matrix dA, and the strictly upper   
            triangular part of dA is not referenced.   

            On exit, if INFO = 0, the factor U or L from the Cholesky   
            factorization dA = U**T * U or dA = L * L**T.   

    LDDA     (input) INTEGER   
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalescent memory accesses LDDA must be
            dividable by 16.

    INFO    (output) INTEGER   
            = 0:  successful exit   
            < 0:  if INFO = -i, the i-th argument had an illegal value   
            > 0:  if INFO = i, the leading minor of order i is not   
                  positive definite, and the factorization could not be   
                  completed.   
    =====================================================================   */


    magma_int_t     j, jb, nb0, nb2, d, dd, id, j_local, j_local2, buf;
    char            uplo_[2] = {uplo, 0};
    double c_one     = MAGMA_D_ONE;
    double c_neg_one = MAGMA_D_NEG_ONE;
    double          d_one     =  1.0;
    double          d_neg_one = -1.0;
    int upper = lapackf77_lsame(uplo_, "U");
    double *dlpanel;
    magma_int_t n_local[MagmaMaxGPUs], ldpanel;
    //cudaEvent_t event0[MagmaMaxGPUs],  /* send row to CPU    */
    //            event1[MagmaMaxGPUs],  /* send diag to GPU   */
    //            event2[MagmaMaxGPUs],  /* offdiagonal update */
    //            event3[MagmaMaxGPUs],  /* send row to GPU    */
    //            event4[MagmaMaxGPUs];  /* lookahead          */
    const magma_int_t stream1 = 0, stream2 = 1, stream3 = 2;
    double *d_dinvA[MagmaMaxGPUs][2], *d_x[MagmaMaxGPUs][2]; /* used by dtrsm_work */

    *info = 0;
    if ( (! upper) && (! lapackf77_lsame(uplo_, "L")) ) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (!upper && num_gpus*ldda < max(1,n)) {
        *info = -4;
    } else if (upper && ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* initialization */
    for( d=0; d<num_gpus; d++ ) {
      /* local-n and local-ld */
      if (upper) {
        n_local[d] = ((n/nb)/num_gpus)*nb;
        if (d < (n/nb)%num_gpus)
          n_local[d] += nb;
        else if (d == (n/nb)%num_gpus)
          n_local[d] += n%nb;
      } else {
        n_local[d] = ((m/nb)/num_gpus)*nb;
        if (d < (m/nb)%num_gpus)
          n_local[d] += nb;
        else if (d == (m/nb)%num_gpus)
          n_local[d] += m%nb;
      }
      //magma_setdevice(d);
      //magma_event_create( &event0[d] );
      //magma_event_create( &event1[d] );
      //magma_event_create( &event2[d] );
      //magma_event_create( &event3[d] );
      //magma_event_create( &event4[d] );
    }

    /* == initialize the trace */
    trace_init( 1, num_gpus, 3, (CUstream_st**)stream );

    if (upper) 
    {     
      /* ---------------------------------------------- */
      /* Upper-triangular case                          */
      /* > Compute the Cholesky factorization A = U'*U. */
      /* ---------------------------------------------- */
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
      /* invert the diagonals 
       * Allocate device memory for the inversed diagonal blocks, size=m*NB
       */
      for( d=0; d<num_gpus; d++ ) {
          magma_setdevice(d);
          for( j=0; j<2; j++ ) {
              cudaMalloc((void**)&d_dinvA[d][j], nb*nb*sizeof(double));
              cudaMalloc((void**)&d_x[d][j],      n*nb*sizeof(double));
              cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(double));
              cudaMemset(d_x[d][j],     0,  n*nb*sizeof(double));
          }
      }
      magma_setdevice(0);
#endif

      for (j=0; j<m; j+=nb) {

        /* Set the GPU number that holds the current panel */
        id  = (j/nb)%num_gpus;
        buf = (j/nb)%num_gpus;

        /* Set the local index where the current panel is */
        j_local = j/(nb*num_gpus);
        jb = min(nb, (m-j));

        /* Update the current diagonal block on stream1 */
        magma_setdevice(id);
        if( j > 0 ) {
            magmablasSetKernelStream(stream[id][stream1]);
            trace_gpu_start( id, stream1, "syrk", "syrk" );
            magma_dsyrk(MagmaUpper, MagmaTrans, jb, j, 
                        d_neg_one, dlA(id, 0, nb*j_local), ldda, 
                        d_one,     dlA(id, j, nb*j_local), ldda);
            trace_gpu_end( id, stream1 );
        }

        /* send the diagonal to cpu on stream1 */
        trace_gpu_start( id, stream1, "comm", "D to CPU" );
        magma_dgetmatrix_async( jb, jb,
                                dlA(id, j, nb*j_local), ldda,
                                Aup(j,j),               lda, 
                                stream[id][stream1] );
        trace_gpu_end( id, stream1 );

        /* update off-diagonal blocks in the panel */
        if( j > 0 ) {
            d = (j/nb+1)%num_gpus;
            for( dd=0; dd<num_gpus; dd++ ) {
                j_local2 = j_local+1;
                if( d > id ) j_local2 --;
                nb0 = nb*j_local2; 

                if( n_local[d] > nb0 ) {
                    magma_setdevice(d);
                    magmablasSetKernelStream(stream[d][stream2]);
                    if( d == id ) {
                        dlpanel = dlA(d, 0, nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlP(d, jb, 0, buf);
                        ldpanel = lddp;
                        magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu
                    }
                    trace_gpu_start( d, stream2, "gemm", "gemm" );
                    magma_dgemm(MagmaTrans, MagmaNoTrans, 
                                jb, n_local[d]-nb0, j, 
                                c_neg_one, dlpanel,        ldpanel, 
                                           dlA(d, 0, nb0), ldda,
                                c_one,     dlA(d, j, nb0), ldda);
                    trace_gpu_end( d, stream2 );
                    magma_event_record( event[d][2], stream[d][stream2] );
                }
                d = (d+1)%num_gpus;
            }
        }

        /* wait for panel and factorize it on cpu */
        magma_setdevice(id);
        magma_queue_sync( stream[id][stream1] );
        trace_cpu_start( 0, "getrf", "getrf" );
        lapackf77_dpotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info);
        trace_cpu_end( 0 );
        if (*info != 0) {
            *info = *info + j;
            break;
        }

        /* send the diagonal to gpus on stream1 */
        if ( (j+jb) < n) {
          d = (j/nb+1)%num_gpus;
          for( dd=0; dd<num_gpus; dd++ ) {
            if( d == id ) {
                dlpanel = dlA(d, j, nb*j_local);
                ldpanel = ldda;
            } else {
                dlpanel = dlP(d, 0, 0, buf);
                ldpanel = lddp;
            }
            magma_setdevice(d);
            trace_gpu_start( d, stream1, "comm", "comm" );
            magma_dsetmatrix_async( jb, jb,
                                    Aup(j,j), lda,
                                    dlpanel,  ldpanel, 
                                    stream[d][stream1] );
            trace_gpu_end( d, stream1 );
            magma_event_record( event[d][1], stream[d][stream1] );
            d = (d+1)%num_gpus;
          }
        } else {
          magma_setdevice(id);
          trace_gpu_start( id, stream1, "comm", "comm" );
          magma_dsetmatrix_async( jb, jb,
                                  Aup(j,j),               lda,
                                  dlA(id, j, nb*j_local), ldda, 
                                  stream[id][stream1] );
          trace_gpu_end( id, stream1 );
        }

        /* panel-factorize the off-diagonal */
        if ( (j+jb) < n) {
            d = (j/nb+1)%num_gpus;
            for( dd=0; dd<num_gpus; dd++ ) {
                /* next column */
                j_local2 = j_local+1;
                if( d > id ) j_local2--;
                if( d == id ) {
                    dlpanel = dlA(d,j,nb*j_local);
                    ldpanel = ldda;
                } else {
                    dlpanel = dlP(d, 0, 0, buf);
                    ldpanel = lddp;
                }
                nb2 = n_local[d] - j_local2*nb;
                nb0 = min(nb, nb2);
        
                magma_setdevice(d);
                //magma_queue_sync( stream[d][stream1] );  // synch on chol for remaining update
                //magma_queue_sync( stream[d][stream2] );
                if( j+jb < m && d == (j/nb+1)%num_gpus ) { 
                    /* owns the next column, look-ahead next block on stream1 */
                    magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for gemm update 
                    magmablasSetKernelStream(stream[d][stream1]);
                    trace_gpu_start( d, stream1, "trsm", "trsm" );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
                    magmablas_dtrsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, 
                                          jb, nb0, c_one,
                                          dlpanel,                ldpanel, 
                                          dlA(d, j, nb*j_local2), ldda,
                                          d_dinvA[d][0], d_x[d][0] );
#else
                    magma_dtrsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, 
                                 jb, nb0, c_one,
                                 dlpanel,                ldpanel, 
                                 dlA(d, j, nb*j_local2), ldda);
#endif
                    magma_event_record( event[d][4], stream[d][stream1] );
                    trace_gpu_end( d, stream1 );
                } else if( nb2 > 0 ) {
                    /* update all the blocks on stream2 */
                    magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for cholesky factor 
                    trace_gpu_start( d, stream2, "trsm", "trsm" );
                    magmablasSetKernelStream(stream[d][stream2]);
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
                    magmablas_dtrsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, 
                                          jb, nb2, c_one,
                                          dlpanel,                ldpanel, 
                                          dlA(d, j, nb*j_local2), ldda,
                                          d_dinvA[d][1], d_x[d][1] );
#else
                    magma_dtrsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, 
                                 jb, nb2, c_one,
                                 dlpanel,                ldpanel, 
                                 dlA(d, j, nb*j_local2), ldda);
#endif
                    trace_gpu_end( d, stream2 );
                }
                d = (d+1)%num_gpus;
            } /* end of for */

/* ======================================================================================== */

            d = (j/nb+1)%num_gpus;
            /* next column */
            j_local2 = j_local+1;
            if( d > id ) j_local2--;
            nb0 = min(nb, n_local[d]-nb*j_local2 );
        
            /* even on 1 gpu, off-diagonals are copied to cpu (synchronize at the end).      *
             * so we have the Cholesky factor, but only diagonal submatrix of the big panel, *
             * on cpu at the end.                                                            */
            if( j+jb < m ) { 
                int d2, id2, j2, buf2;
                magma_setdevice(d);
                /* make sure all the previous sets are done */
                if( h < num_gpus ) {
                    /* > offdiagonal */
                    for( d2=0; d2<num_gpus; d2++ ) {
                        j2 = j - (1+d2)*nb;
                        if( j2 < 0 ) break;
                        id2  = (j2/nb)%num_gpus;
                        magma_queue_wait_event( stream[d][stream3], event[id2][0] );
                    }

                    /* > diagonal */
                    for( d2=0; d2<num_gpus; d2++ ) {
                        j2 = j - d2*nb;
                        if( j2 < 0 ) break;
                        id2  = (j2/nb)%num_gpus;
                        magma_queue_wait_event( stream[d][stream3], event[id2][1] );
                    }
                }
                /* lookahead */
                magma_queue_wait_event( stream[d][stream3], event[d][4] );

                trace_gpu_start( d, stream3, "comm", "row to CPU" );
                magma_dgetmatrix_async( (j+jb), nb0,
                                        dlA(d, 0, nb*j_local2), ldda,
                                        Aup(0,j+jb),            lda, 
                                        stream[d][stream3] );
                trace_gpu_end( d, stream3 );
                magma_event_record( event[d][3], stream[d][stream3] );
/* needed on pluto */
magma_queue_sync( stream[d][stream3] );

                /* wait for the off-diagonal on cpu */
                //magma_setdevice(id);
                //magma_queue_sync( stream[id][stream3] );

                /* broadcast rows to gpus on stream2 */
                buf2 = ((j+jb)/nb)%num_gpus;
                for( d2=0; d2<num_gpus; d2++ ) {
                    if( d2 != d ) 
                    {
                        magma_setdevice(d2);
                        trace_gpu_start( d2, stream3, "comm", "row to GPUs" );
                        magma_queue_wait_event( stream[d2][stream3], event[d][3] ); // rows arrived at cpu on stream3
                        magma_dsetmatrix_async( j+jb, nb0,
                                                Aup(0,j+jb),        lda,
                                                dlP(d2,nb0,0,buf2), lddp, 
                                                stream[d2][stream3] );
                        trace_gpu_end( d2, stream3 );
                        magma_event_record( event[d2][0], stream[d2][stream3] );
                    }
                } 
            }

/* ======================================================================================== */
            /* gpu owning the next column                    */
            /* after look ahead, update the remaining blocks */
            if( j+jb < m ) /* no update on the last block column */ 
            {
                d = (j/nb+1)%num_gpus;
                /* next column */
                j_local2 = j_local+1;
                if( d > id ) j_local2--;
                if( d == id ) {
                    dlpanel = dlA(d, j, nb*j_local);
                    ldpanel = ldda;
                } else {
                    dlpanel = dlP(d, 0, 0, buf);
                    ldpanel = lddp;
                }
                nb0 = min(nb, n_local[d]-nb*j_local2 );
                nb2 =         n_local[d]-nb*j_local2 - nb0;
        
                /* update the remaining blocks */
                if( nb2 > 0 ) {
                    magma_setdevice(d);
                    magmablasSetKernelStream(stream[d][stream2]);  
                    magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for cholesky factor
                    trace_gpu_start( d, stream2, "trsm", "trsm" );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
                    magmablas_dtrsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, 
                                          jb, nb2, c_one,
                                          dlpanel,                    ldpanel, 
                                          dlA(d, j, nb*j_local2+nb0), ldda,
                                          d_dinvA[d][1], d_x[d][1] );
#else
                    magma_dtrsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, 
                                 jb, nb2, c_one,
                                 dlpanel,                    ldpanel, 
                                 dlA(d, j, nb*j_local2+nb0), ldda);
#endif
                    trace_gpu_end( d, stream2 );
                }
            }
        } /* end of dtrsm */
      } /* end of for j=1, .., n */
    } else { 

      /* ---------------------------------------------- */
      /* Lower-triangular case                          */
      /* > Compute the Cholesky factorization A = L*L'. */
      /* ---------------------------------------------- */
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
      /* 
       * Allocate device memory for the inversed diagonal blocks, size=N*BLOCK_SIZE
       */
      for( d=0; d<num_gpus; d++ ) {
         magma_setdevice(d);
         for( j=0; j<2; j++ ) {
             cudaMalloc((void**)&d_dinvA[d][j], nb*nb*sizeof(double));
             cudaMalloc((void**)&d_x[d][j],     nb*m *sizeof(double));
             cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(double));
             cudaMemset(d_x[d][j],     0, nb* m*sizeof(double));
         }
      }
      magma_setdevice(0);
#endif

      for (j=0; j<n; j+=nb) {

        /* Set the GPU number that holds the current panel */
        id  = (j/nb)%num_gpus;
        buf = (j/nb)%num_gpus;

        /* Set the local index where the current panel is */
        j_local = j/(nb*num_gpus);
        jb = min(nb, (n-j));

        /* Update the current diagonal block on stream1 */
        magma_setdevice(id);
        if( j > 0 ) {
            magmablasSetKernelStream(stream[id][stream1]);
            magma_dsyrk(MagmaLower, MagmaNoTrans, jb, j,
                        d_neg_one, dlA(id, nb*j_local, 0), ldda,
                        d_one,     dlA(id, nb*j_local, j), ldda);
        }

        /* send the diagonal to cpu on stream1 */
        magma_dgetmatrix_async( jb, jb,
                                dlA(id, nb*j_local, j), ldda,
                                Alo(j,j),               lda, 
                                stream[id][stream1] );

        /* update off-diagonal blocks of the panel */
        if( j > 0 ) {
            d = (j/nb+1)%num_gpus;
            for( dd=0; dd<num_gpus; dd++ ) {
                j_local2 = j_local+1;
                if( d > id ) j_local2 --;
                nb0 = nb*j_local2; 

                if( nb0 < n_local[d] ) {
                    magma_setdevice(d);
                    magmablasSetKernelStream(stream[d][stream2]);
                    if( d == id ) {
                        dlpanel = dlA(d, nb*j_local, 0);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlPT(d,0,jb,buf);
                        ldpanel = nb;
                        magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu
                    }
                    magma_dgemm( MagmaNoTrans, MagmaTrans,
                                 n_local[d]-nb0, jb, j,
                                 c_neg_one, dlA(d, nb0, 0), ldda,
                                            dlpanel,        ldpanel,
                                 c_one,     dlA(d, nb0, j), ldda);
                    magma_event_record( event[d][2], stream[d][stream2] );
                }
                d = (d+1)%num_gpus;
            }
        }

        /* wait for the panel and factorized it on cpu */
        magma_setdevice(id);
        magma_queue_sync( stream[id][stream1] );
        lapackf77_dpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info);
        if (*info != 0) {
           *info = *info + j;
           break;
        }

        /* send the diagonal to gpus on stream1 */
        if ( (j+jb) < m) {
          d = (j/nb+1)%num_gpus;
          for( dd=0; dd<num_gpus; dd++ ) {
            if( d == id ) {
                dlpanel = dlA(d, nb*j_local, j);
                ldpanel = ldda;
            } else {
                dlpanel = dlPT(d, 0, 0, buf);
                ldpanel = nb;
            }
            magma_setdevice(d);
            magma_dsetmatrix_async( jb, jb,
                                    Alo(j,j), lda,
                                    dlpanel,  ldpanel, 
                                    stream[d][stream1] );
            magma_event_record( event[d][1], stream[d][stream1] );
            d = (d+1)%num_gpus;
          }
        } else {
          magma_setdevice(id);
          magma_dsetmatrix_async( jb, jb,
                                  Alo(j,j),               lda,
                                  dlA(id, nb*j_local, j), ldda, 
                                  stream[id][stream1] );
        }

        /* panel factorize the off-diagonal */
        if ( (j+jb) < m) {
          d = (j/nb+1)%num_gpus;
          for( dd=0; dd<num_gpus; dd++ ) {
            /* next column */
            j_local2 = j_local+1;
            if( d > id ) j_local2--;
            if( d == id ) {
                dlpanel = dlA(d, nb*j_local, j);
                ldpanel = ldda;
            } else {         
                dlpanel = dlPT(d, 0, 0, buf);
                ldpanel = nb;
            }
            nb2 = n_local[d] - j_local2*nb;
            nb0 = min(nb, nb2 );

            magma_setdevice(d);
            if( j+nb < n && d == (j/nb+1)%num_gpus ) { /* owns next column, look-ahead next block on stream1 */
              magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for gemm update 
              magmablasSetKernelStream(stream[d][stream1]);
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
              magmablas_dtrsm_work( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, 
                                    nb0, jb, c_one,
                                    dlpanel,                ldpanel, 
                                    dlA(d, nb*j_local2, j), ldda,
                                    d_dinvA[d][0], d_x[d][0] );
#else
              magma_dtrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, 
                           nb0, jb, c_one,
                           dlpanel,                ldpanel, 
                           dlA(d, nb*j_local2, j), ldda);
#endif
              magma_event_record( event[d][4], stream[d][stream1] );
            } else if( nb2 > 0 ) { /* other gpus updating all the blocks on stream2 */
              /* update the entire column */
              magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for the cholesky factor
              magmablasSetKernelStream(stream[d][stream2]);
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
              magmablas_dtrsm_work( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, 
                                    nb2, jb, c_one,
                                    dlpanel,                ldpanel, 
                                    dlA(d, nb*j_local2, j), ldda,
                                    d_dinvA[d][1], d_x[d][1] );
#else
              magma_dtrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, 
                           nb2, jb, c_one,
                           dlpanel,                ldpanel, 
                           dlA(d, nb*j_local2, j), ldda);
#endif
            }
            d = (d+1)%num_gpus;
          } /* end for d */

/* ======================================================================================== */

          d = (j/nb+1)%num_gpus;
          /* next column */
          j_local2 = j_local+1;
          if( d > id ) j_local2--;
          nb0 = min(nb, n_local[d]-nb*j_local2 );

          /* even on 1 gpu, we copy off-diagonal to cpu (but don't synchronize).  */
          /* so we have the Cholesky factor on cpu at the end.                    */
          if( j+jb < n ) {
              int d2, id2, j2, buf2;
              magma_setdevice(d);
              /* make sure all the previous sets are done */
              if( h < num_gpus ) {
                  /* > offdiagonal */
                  for( d2=0; d2<num_gpus; d2++ ) {
                      j2 = j - (1+d2)*nb;
                      if( j2 < 0 ) break;
                      id2  = (j2/nb)%num_gpus;
                      magma_queue_wait_event( stream[d][stream3], event[id2][0] );
                  }

                  /* > diagonal */
                  for( d2=0; d2<num_gpus; d2++ ) {
                      j2 = j - d2*nb;
                      if( j2 < 0 ) break;
                      id2  = (j2/nb)%num_gpus;
                      magma_queue_wait_event( stream[d][stream3], event[id2][1] );
                  }
              }
              // lookahead done
              magma_queue_wait_event( stream[d][stream3], event[d][4] );

              magma_dgetmatrix_async( nb0, j+jb,
                                      dlA(d, nb*j_local2, 0), ldda,
                                      Alo(j+jb,0),            lda, 
                                      stream[d][stream3] );
              magma_event_record( event[d][3], stream[d][stream3] );
/* syn on rows on CPU, seem to be needed on Pluto */
magma_queue_sync( stream[d][stream3] );

              /* broadcast the rows to gpus */
              buf2 = ((j+jb)/nb)%num_gpus;
              for( d2=0; d2<num_gpus; d2++ ) {
                  if( d2 != d ) 
                  {
                      magma_setdevice(d2);
                      magma_queue_wait_event( stream[d2][stream3], event[d][3] ); // getmatrix done
                      magma_dsetmatrix_async( nb0, j+jb,
                                              Alo(j+jb,0),         lda,
                                              dlPT(d2,0,nb0,buf2), nb, 
                                              stream[d2][stream3] );
                      magma_event_record( event[d2][0], stream[d2][stream3] );
                  }
              }
          }

/* ======================================================================================== */
          /* gpu owing the next column updates remaining blocks on stream2 */
          if( j+nb < n ) { // no lookahead on the last block column
            d = (j/nb+1)%num_gpus;

            /* next column */
            j_local2 = j_local+1;
            if( d > id ) j_local2--;
            if( d == id ) {
              dlpanel = dlA(d, nb*j_local, j);
              ldpanel = ldda;
            } else {         
              dlpanel = dlPT(d,0,0,buf);
              ldpanel = nb;
            }
            nb0 = min(nb, n_local[d]-nb*j_local2 );
            nb2 = n_local[d] - j_local2*nb - nb0;

            if( nb2 > 0 ) {
                magma_setdevice(d);
                magmablasSetKernelStream(stream[d][stream2]);
                /* update the remaining blocks in the column */
                magma_queue_wait_event( stream[d][stream2], event[d][1] ); // panel received
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
                magmablas_dtrsm_work( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, 
                                      nb2, jb, c_one,
                                      dlpanel,                    ldpanel, 
                                      dlA(d, nb*j_local2+nb0, j), ldda,
                                      d_dinvA[d][1], d_x[d][1] );
#else
                magma_dtrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, 
                             nb2, jb, c_one,
                             dlpanel,                    ldpanel, 
                             dlA(d, nb*j_local2+nb0, j), ldda);
#endif
            }
          }
        }
      }
    } /* end of else not upper */

    /* == finalize the trace == */
    trace_finalize( "dpotrf.svg","trace.css" );
    for( d=0; d<num_gpus; d++ ) {
      magma_setdevice(d);
      magma_queue_sync( stream[d][0] ); 
      magma_queue_sync( stream[d][1] );
      magma_queue_sync( stream[d][2] );
      magmablasSetKernelStream(NULL);

      //magma_event_destroy( event0[d] ); 
      //magma_event_destroy( event1[d] );
      //magma_event_destroy( event2[d] );
      //magma_event_destroy( event3[d] );
      //magma_event_destroy( event4[d] );
#if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
      for( j=0; j<2; j++ ) {
          magma_free( d_dinvA[d][j] );
          magma_free( d_x[d][j] );
      }
#endif
    }
    magma_setdevice(0);

    return *info;
} /* magma_dpotrf_mgpu */
Exemplo n.º 12
0
extern "C" magma_int_t
magma_zpotrf2_mgpu(
    int num_gpus, magma_uplo_t uplo, magma_int_t m, magma_int_t n, 
    magma_int_t off_i, magma_int_t off_j, magma_int_t nb,
    magmaDoubleComplex_ptr *d_lA, size_t d_lA_offset, magma_int_t ldda, 
    magmaDoubleComplex_ptr *d_lP,  magma_int_t lddp, 
    magmaDoubleComplex *a,      magma_int_t lda,   magma_int_t h,
    magma_queue_t *queues,
    magma_int_t *info )
{
/*  -- clMAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date November 2014

    Purpose   
    =======   
    ZPOTRF computes the Cholesky factorization of a complex Hermitian   
    positive definite matrix dA.   

    The factorization has the form   
       dA = U**H * U,  if UPLO = 'U', or   
       dA = L  * L**H,  if UPLO = 'L',   
    where U is an upper triangular matrix and L is lower triangular.   

    This is the block version of the algorithm, calling Level 3 BLAS.   

    Arguments   
    =========   
    UPLO    (input) CHARACTER*1   
            = 'U':  Upper triangle of dA is stored;   
            = 'L':  Lower triangle of dA is stored.   

    N       (input) INTEGER   
            The order of the matrix dA.  N >= 0.   

    dA      (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N)   
            On entry, the Hermitian matrix dA.  If UPLO = 'U', the leading   
            N-by-N upper triangular part of dA contains the upper   
            triangular part of the matrix dA, and the strictly lower   
            triangular part of dA is not referenced.  If UPLO = 'L', the   
            leading N-by-N lower triangular part of dA contains the lower   
            triangular part of the matrix dA, and the strictly upper   
            triangular part of dA is not referenced.   

            On exit, if INFO = 0, the factor U or L from the Cholesky   
            factorization dA = U**H * U or dA = L * L**H.   

    LDDA     (input) INTEGER   
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalescent memory accesses LDDA must be
            divisible by 16.

    INFO    (output) INTEGER   
            = 0:  successful exit   
            < 0:  if INFO = -i, the i-th argument had an illegal value   
            > 0:  if INFO = i, the leading minor of order i is not   
                  positive definite, and the factorization could not be   
                  completed.   
    =====================================================================   */

    magma_int_t     j, jb, nb0, nb2, dd, d, id, j_local, j_local2, buf;
    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    double          d_one     =  1.0;
    double          d_neg_one = -1.0;
    magmaDoubleComplex_ptr dlpanel;
    size_t dlpanel_offset;
    magma_int_t n_local[MagmaMaxGPUs], ldpanel;
    magma_event_t* events[MagmaMaxGPUs][2];

    *info = 0;
    if ( (uplo != MagmaUpper) && (uplo != MagmaLower) ) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if ((uplo != MagmaUpper) && num_gpus*ldda < max(1,n)) {
        *info = -4;
    } else if ((uplo == MagmaUpper) && ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    {

      for( d=0; d<num_gpus; d++ ) {
        /* local-n and local-ld */
        if (uplo == MagmaUpper) {
            n_local[d] = ((n/nb)/num_gpus)*nb;
            if (d < (n/nb)%num_gpus)
               n_local[d] += nb;
            else if (d == (n/nb)%num_gpus)
              n_local[d] += n%nb;
        } else {
            n_local[d] = ((m/nb)/num_gpus)*nb;
            if (d < (m/nb)%num_gpus)
               n_local[d] += nb;
            else if (d == (m/nb)%num_gpus)
              n_local[d] += m%nb;
        }
      }

      /* Use blocked code. */
      if (uplo == MagmaUpper) 
        {
          /* ---------------------------------------------- */
          /* Upper-triangular case                          */
          /* > Compute the Cholesky factorization A = U'*U. */
          /* ---------------------------------------------- */
            printf("support Lower case only!\n");
        } else { 
            trace_init(1, num_gpus, 2, queues);
            /* -------------------------------------------- */
            /* Lower-triangular case                        */
            /* Compute the Cholesky factorization A = L*L'. */
            /* -------------------------------------------- */
            // try one queue per device firstly!
            for (j=0; j<n; j+=nb) {

              /* Set the GPU number that holds the current panel */
              id  = (j/nb)%num_gpus;
              buf = (j/nb)%num_gpus;

              /* Set the local index where the current panel is */
              j_local = j/(nb*num_gpus);
              jb = min(nb, (n-j));

              if( j > 0 ) {
                  /* needed on pluto... */
                  //magma_setdevice(id);
                  magma_queue_sync( queues[id*2] ); // wait for the column on CPU

                  /* broadcast offdiagonal row to all gpus */
                  d = (j/nb+1)%num_gpus;
                  for( dd=0; dd<num_gpus; dd++ ) {
                      if( d != id ) {
                          /* wait for it on CPU */
                          //magma_queue_sync( queues[d*2] );

                          /* send it to GPU */
                          trace_gpu_start(d, 0, "comm_set", "set");
                          events[d][0] = trace_gpu_end(d, 0);
                          magma_zsetmatrix_async( jb, j,
                                                  Alo(j,0), 0,      lda,
                                                  dlPT(d,0,jb,buf), nb, 
                                                  queues[d*2], events[d][0] );
                         clFlush(queues[d*2]);
                      }
                      d = (d+1)%num_gpus;
                  }
              }

              /* Update the current diagonal block */
              if( j > 0 ) {
                  trace_gpu_start(id, 1, "herk", "herk");
                  events[id][1] = trace_gpu_end(id, 1);
                  magma_zherk_trace(MagmaLower, MagmaNoTrans, jb, j,
                              d_neg_one, dlA(id, nb*j_local, 0), ldda,
                              d_one,     dlA(id, nb*j_local, j), ldda,
                              queues[id*2+1], events[id][1]);
                 // magma_queue_sync( queues[id*2+1] ); // wait for syrk
                  clWaitForEvents(1, events[id][1]);
              }

              /* send the diagonal to cpu */
              trace_gpu_start(id, 0, "comm_get", "get");
              events[id][0] = trace_gpu_end(id, 0);
              magma_zgetmatrix_async( jb, jb,
                                      dlA(id, nb*j_local, j), ldda,
                                      Alo(j,j), 0,            lda, 
                                      queues[id*2], events[id][0] );
              clFlush(queues[id*2]);


              /* update the offdiagonal blocks */
              if ( j > 0 ) {
                  /* compute the block-rows of the panel */
                  d = (j/nb+1)%num_gpus;
                  for( dd=0; dd<num_gpus; dd++ ) {
                      j_local2 = j_local+1;
                      if( d > id ) j_local2 --;
                      nb0 = nb*j_local2;

                      if( nb0 < n_local[d] ) {
                          if( d != id ) {
                              //dlpanel = dlPT(d);
                              dlpanel = d_lP[d];
                              dlpanel_offset = dlPT_offset(0, jb, buf);
                              ldpanel = nb;

                              /* wait for offdiagonal row */
                              magma_queue_sync(queues[d*2]);
                          } else {
                              dlpanel = d_lA[d];
                              dlpanel_offset = dlA_offset(nb*j_local, 0);
                              ldpanel = ldda;
                          }
                        
                          trace_gpu_start(d, 1, "gemm", "gemm");
                          events[d][1] = trace_gpu_end(d, 1);
                          magma_zgemm_trace( MagmaNoTrans, MagmaConjTrans,
                                       n_local[d]-nb0, jb, j,
                                       c_neg_one, dlA(d, nb0, 0), ldda,
                                                  dlpanel, dlpanel_offset, ldpanel,
                                       c_one,     dlA(d, nb0, j), ldda, 
                                       queues[d*2+1], events[d][1]);
                      }
                      d = (d+1)%num_gpus;
                  }
              }

              /* send the diagonal to cpu */
              /*
              trace_gpu_start(id, 0, "comm_get", "get");
              events[id][0] = trace_gpu_end(id, 0);
              magma_zgetmatrix_async( jb, jb,
                                      dlA(id, nb*j_local, j), ldda,
                                      Alo(j,j), 0,            lda, 
                                      queues[id*2], events[id][0] );
              clFlush(queues[id*2]);
              */
              /* factor the diagonal */
              magma_queue_sync( queues[id*2] );
              trace_cpu_start(0, "potrf", "potrf");
              lapackf77_zpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info);
              trace_cpu_end(0);
              if (*info != 0) {
                  printf("row number: %d\n", j);
                  *info = *info + j;
                  break;
              }

              /* send the diagonal to gpus */
              if ( (j+jb) < m ) {
                  d = (j/nb+1)%num_gpus;
                  for( dd=0; dd<num_gpus; dd++ ) {
                      if( d == id ) {
                          dlpanel = d_lA[d];
                          dlpanel_offset = dlA_offset(nb*j_local, j);
                          ldpanel = ldda;
                      } else {
                          //dlpanel = dlPT(d);
                          dlpanel = d_lP[d];
                          dlpanel_offset = dlPT_offset(0, 0, buf);
                          ldpanel = nb;
                      }
                      trace_gpu_start(d, 0, "comm_set", "set");
                      events[d][0] = trace_gpu_end(d, 0);
                      magma_zsetmatrix_async( jb, jb,
                                              Alo(j,j), 0, lda,
                                              dlpanel,  dlpanel_offset, ldpanel, 
                                              queues[d*2], events[d][0] );
                      clFlush(queues[d*2]);
                      d = (d+1)%num_gpus;
                  }
              } else {
                  trace_gpu_start(id, 0, "comm_set", "set");
                  events[id][0] = trace_gpu_end(id, 0);
                  magma_zsetmatrix_async( jb, jb,
                                          Alo(j,j),       0,      lda,
                                          dlA(id, nb*j_local, j), ldda, 
                                          queues[id*2], events[id][0] );
                  clFlush(queues[id*2]);
              }

              /* factorize off-diagonal blocks */
              if ( (j+jb) < m ) {
                  d = (j/nb+1)%num_gpus;
                  for( dd=0; dd<num_gpus; dd++ ) {
                      /* next column */
                      j_local2 = j_local+1;
                      if( d > id ) j_local2--;
                      if( d == id ) {
                          dlpanel = d_lA[d];
                          dlpanel_offset = dlA_offset(nb*j_local, j);
                          ldpanel = ldda;
                      } else {         
                          //dlpanel = dlPT(d);
                          dlpanel = d_lP[d];
                          dlpanel_offset = dlPT_offset(0, 0, buf);
                          ldpanel = nb;
                      }
                      nb2 = n_local[d] - j_local2*nb;
                      nb0 = min(nb, nb2 );
                        
                      magma_queue_sync(queues[d*2]);
                      // wait for the diagonal
                      if( j+jb < n && d == (j/nb+1)%num_gpus ) {
                          /* owns the next column, look-ahead the column */
                          trace_gpu_start(d, 1, "trsm", "trsm");
                          events[d][1] = trace_gpu_end(d, 1);
                          magma_ztrsm_trace( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, 
                                       nb0, jb, c_one,
                                       dlpanel,  dlpanel_offset, ldpanel, 
                                       dlA(d, nb*j_local2, j), ldda,
                                       queues[d*2+1], events[d][1]);
                          /* send the column to cpu */
                          if( j+jb < n ) {
                              magma_queue_sync( queues[d*2+1] ); // wait for lookahead
                              trace_gpu_start(d, 0, "comm_get", "get");
                              events[d][0] = trace_gpu_end(d, 0);
                              magma_zgetmatrix_async( nb0, j+jb,
                                                      dlA(d, nb*j_local2, 0), ldda,
                                                      Alo(j+jb,0), 0,           lda, 
                                                      queues[d*2], events[d][0]);
                              clFlush(queues[d*2]);
                          }
                          /* update the remaining blocks */
                          nb2 = nb2 - nb0;
                          trace_gpu_start(d, 1, "trsm", "trsm");
                          events[d][1] = trace_gpu_end(d, 1);
                          magma_ztrsm_trace( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, 
                                       nb2, jb, c_one,
                                       dlpanel, dlpanel_offset, ldpanel, 
                                       dlA(d, nb*j_local2+nb0, j), ldda, 
                                       queues[d*2+1], events[d][1]);
                      } else if( nb2 > 0 ) {
                          /* update the entire trailing matrix */
                          trace_gpu_start(d, 1, "trsm", "trsm");
                          events[d][1] = trace_gpu_end(d, 1);
                          magma_ztrsm_trace( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, 
                                       nb2, jb, c_one,
                                       dlpanel, dlpanel_offset, ldpanel, 
                                       dlA(d, nb*j_local2, j), ldda, 
                                       queues[d*2+1], events[d][1]);
                      }
                      d = (d+1)%num_gpus;
                  }
              }
            }
          } /* end of else not upper */

          /* clean up */
          for( d=0; d<num_gpus; d++ ) {
              magma_queue_sync( queues[d*2] );
              magma_queue_sync( queues[d*2+1] );
          }
          trace_finalize("dpotrf2_mgpu.svg", "trace.css");

    } /* end of not lapack */

    return *info;
} /* magma_zpotrf_mgpu */
Exemplo n.º 13
0
extern "C" magma_int_t
magma_ssytrd_sy2sb( char uplo, magma_int_t n, magma_int_t nb,
                    float *a, magma_int_t lda, 
                    float *tau,
                    float *work, magma_int_t lwork,
                    float *dT,
                    magma_int_t threads, magma_int_t *info)
{
/*  -- MAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2012

    Purpose   
    =======   
    SSYTRD_HE2HB reduces a real symmetric matrix A to real symmetric   
    band-diagonal form T by an orthogonal similarity transformation:   
    Q**T * A * Q = T.   
    This version stores the triangular matrices T used in the accumulated
    Householder transformations (I - V T V').

    Arguments   
    =========   
    UPLO    (input) CHARACTER*1   
            = 'U':  Upper triangle of A is stored;   
            = 'L':  Lower triangle of A is stored.   

    N       (input) INTEGER   
            The order of the matrix A.  N >= 0.   

    A       (input/output) REAL array, dimension (LDA,N)   
            On entry, the symmetric matrix A.  If UPLO = 'U', the leading   
            N-by-N upper triangular part of A contains the upper   
            triangular part of the matrix A, and the strictly lower   
            triangular part of A is not referenced.  If UPLO = 'L', the   
            leading N-by-N lower triangular part of A contains the lower   
            triangular part of the matrix A, and the strictly upper   
            triangular part of A is not referenced.   
            On exit, if UPLO = 'U', the Upper band-diagonal of A is 
            overwritten by the corresponding elements of the   
            band-diagonal matrix T, and the elements above the band   
            diagonal, with the array TAU, represent the orthogonal   
            matrix Q as a product of elementary reflectors; if UPLO   
            = 'L', the the Lower band-diagonal of A is overwritten by 
            the corresponding elements of the band-diagonal   
            matrix T, and the elements below the band-diagonal, with   
            the array TAU, represent the orthogonal matrix Q as a product   
            of elementary reflectors. See Further Details.   

    LDA     (input) INTEGER   
            The leading dimension of the array A.  LDA >= max(1,N).   

    TAU     (output) REAL array, dimension (N-1)   
            The scalar factors of the elementary reflectors (see Further   
            Details).   

    WORK    (workspace/output) REAL array, dimension (MAX(1,LWORK))   
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.   

    LWORK   (input) INTEGER   
            The dimension of the array WORK.  LWORK >= 1.   
            For optimum performance LWORK >= N*NB, where NB is the   
            optimal blocksize.   

            If LWORK = -1, then a workspace query is assumed; the routine   
            only calculates the optimal size of the WORK array, returns   
            this value as the first entry of the WORK array, and no error   
            message related to LWORK is issued by XERBLA.   

    dT      (output) REAL array on the GPU, dimension N*NB, 
            where NB is the optimal blocksize.
            On exit dT holds the upper triangular matrices T from the 
            accumulated Householder transformations (I - V T V') used
            in the factorization. The nb x nb matrices T are ordered 
            consecutively in memory one after another.

    INFO    (output) INTEGER   
            = 0:  successful exit   
            < 0:  if INFO = -i, the i-th argument had an illegal value   

    Further Details   
    ===============   
    If UPLO = 'U', the matrix Q is represented as a product of elementary   
    reflectors   

       Q = H(n-1) . . . H(2) H(1).   

    Each H(i) has the form   

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with   
    v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in   
    A(1:i-1,i+1), and tau in TAU(i).   

    If UPLO = 'L', the matrix Q is represented as a product of elementary   
    reflectors   

       Q = H(1) H(2) . . . H(n-1).   

    Each H(i) has the form   

       H(i) = I - tau * v * v'   

    where tau is a real scalar, and v is a real vector with   
    v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i),   
    and tau in TAU(i).

    The contents of A on exit are illustrated by the following examples   
    with n = 5:   

    if UPLO = 'U':                       if UPLO = 'L':   

      (  d   e   v2  v3  v4 )              (  d                  )   
      (      d   e   v3  v4 )              (  e   d              )   
      (          d   e   v4 )              (  v1  e   d          )   
      (              d   e  )              (  v1  v2  e   d      )   
      (                  d  )              (  v1  v2  v3  e   d  )   

    where d and e denote diagonal and off-diagonal elements of T, and vi   
    denotes an element of the vector defining H(i).   
    =====================================================================    */

    #define a_ref(a_1,a_2)  ( a  + ((a_2)-1)*( lda) + (a_1)-1)
    #define da_ref(a_1,a_2) (da  + ((a_2)-1)*(ldda) + (a_1)-1)
    #define tau_ref(a_1)    (tau + (a_1)-1)
    #define t_ref(a_1)      (dT  + ((a_1)-1)*(lddt))

    char uplo_[2] = {uplo, 0};

    int ldda = ((n+31)/32)*32;
    int lddt = nb;
   
    float c_neg_one  = MAGMA_S_NEG_ONE;
    float c_neg_half = MAGMA_S_NEG_HALF;
    float c_one  = MAGMA_S_ONE ;
    float c_zero = MAGMA_S_ZERO;
    float  d_one = MAGMA_D_ONE;

    magma_int_t pm, pn, indi, indj, pk;
    magma_int_t pm_old=0, pn_old=0, indi_old=0, indj_old=0;

    int i;
    int lwkopt;
    int lquery;

    *info = 0;
    int upper = lapackf77_lsame(uplo_, "U");
    lquery = lwork == -1;
    if (! upper && ! lapackf77_lsame(uplo_, "L")) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,n)) {
        *info = -4;
    } else if (lwork < 1 && ! lquery) {
        *info = -9;
    }

    if (*info == 0) {
      /* Determine the block size. */
      lwkopt = n * nb;
      MAGMA_S_SET2REAL( work[0], lwkopt );
    }

    if (*info != 0)
      return *info;
    else if (lquery)
      return *info;

    /* Quick return if possible */
    if (n == 0) {
        work[0] = c_one;
        return *info;
    }

    float *da;
    if (MAGMA_SUCCESS != magma_smalloc( &da, (n + 2*nb)*ldda )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    magma_int_t mklth = min(threads,12);
#if defined(USEMKL)
    mkl_set_num_threads(mklth);
#endif
#if defined(USEACML)
    omp_set_num_threads(mklth);
#endif


    /* Use the first panel of da as work space */
    float *dwork = da+n*ldda;
    float *dW    = dwork + nb*ldda;

    #ifdef TRACING
    char buf[80];
    #endif
    cudaStream_t stream[3];
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );
    stream[2] = 0;  // default stream
    
    trace_init( 1, 1, 3, stream );

    float *hT = work + lwork - nb*nb;
    lwork -= nb*nb;
    memset( hT, 0, nb*nb*sizeof(float));

    magmablasSetKernelStream( stream[0] );
    cudaEvent_t Pupdate_event;
    cudaEventCreateWithFlags(&Pupdate_event,cudaEventDisableTiming);
    //cudaEventCreate(&Pupdate_event);


    if (upper) {
      printf("SSYTRD_HE2HB is not yet implemented for upper matrix storage. Exit.\n");
      exit(1);

    }else {
        /* Copy the matrix to the GPU */
        if (1 <= n-nb){
            trace_gpu_start( 0, 0, "set", "set A" );
            magma_ssetmatrix_async( (n-nb), (n-nb),
                                    a_ref(nb+1, nb+1),  lda,
                                    da_ref(nb+1, nb+1), ldda, stream[0] );
            trace_gpu_end( 0, 0 );
        }

        /* Reduce the lower triangle of A */
        for (i = 1; i <= n-nb; i += nb) 
        {
             indi = i+nb;
             indj = i;
             pm   = n - i - nb + 1;
             //pn   = min(i+nb-1, n-nb) -i + 1;
             pn   = nb;
             
             /*   Get the current panel (no need for the 1st iteration) */
             if (i > 1 ){
                 // spanel_to_q copy the upper oof diagonal part of 
                 // the matrix to work to be restored later. acctually
                 //  the zero's and one's putted are not used this is only
                 //   because we don't have a function that copy only the
                 //    upper part of A to be restored after copying the 
                 //    lookahead panel that has been computted from GPU to CPU. 
                 spanel_to_q(MagmaUpper, pn-1, a_ref(i, i+1), lda, work);

                 trace_gpu_start( 0, 1, "get", "get panel" );
                 //magma_queue_sync( stream[0] );
                 cudaStreamWaitEvent(stream[1], Pupdate_event, 0);
                 magma_sgetmatrix_async( (pm+pn), pn,
                                         da_ref( i, i), ldda,
                                         a_ref ( i, i), lda, stream[1] );
                 trace_gpu_end( 0, 1 );

                 trace_gpu_start( 0, 2, "syr2k", "syr2k" );
                 magma_ssyr2k(MagmaLower, MagmaNoTrans, pm_old-pn_old, pn_old, c_neg_one,
                      da_ref(indi_old+pn_old, indj_old), ldda,
                      dW + pn_old           , pm_old, d_one,
                      da_ref(indi_old+pn_old, indi_old+pn_old), ldda);
                 trace_gpu_end( 0, 2 );

                 trace_cpu_start( 0, "sync", "sync on 1" );
                 magma_queue_sync( stream[1] );
                 trace_cpu_end( 0 );
                 sq_to_panel(MagmaUpper, pn-1, a_ref(i, i+1), lda, work);
             }

             /* ==========================================================
                QR factorization on a panel starting nb off of the diagonal.
                Prepare the V and T matrices. 
                ==========================================================  */
             #ifdef TRACING
             snprintf( buf, sizeof(buf), "panel %d", i );
             #endif
             trace_cpu_start( 0, "geqrf", buf );
             lapackf77_sgeqrf(&pm, &pn, a_ref(indi, indj), &lda, 
                        tau_ref(i), work, &lwork, info);
             
             /* Form the matrix T */
                         pk=min(pm,pn);
             lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr,
                           &pm, &pk, a_ref(indi, indj), &lda,
                           tau_ref(i), hT, &nb);

             /* Prepare V - put 0s in the upper triangular part of the panel
                (and 1s on the diagonal), temporaly storing the original in work */
             spanel_to_q(MagmaUpper, pk, a_ref(indi, indj), lda, work);
             trace_cpu_end( 0 );

             /* Send V from the CPU to the GPU */
             trace_gpu_start( 0, 0, "set", "set V and T" );
             magma_ssetmatrix_async( pm, pk,
                                     a_ref(indi, indj),  lda,
                                     da_ref(indi, indj), ldda, stream[0] );

             /* Send the triangular factor T to the GPU */
             magma_ssetmatrix_async( pk, pk,
                                     hT,       nb,
                                     t_ref(i), lddt, stream[0] );
             trace_gpu_end( 0, 0 );
             
             /* ==========================================================
                Compute W:
                1. X = A (V T)
                2. W = X - 0.5* V * (T' * (V' * X)) 
                ==========================================================  */
             /* dwork = V T */
             trace_cpu_start( 0, "sync", "sync on 0" );
             // this sync is done here to be sure that the copy has been finished
             // because below we made a restore sq_to_panel and this restore need
             // to ensure that the copy has been finished. we did it here to allow
             // overlapp of restore with next gemm and symm.
             magma_queue_sync( stream[0] );
             trace_cpu_end( 0 );
             
             trace_gpu_start( 0, 2, "gemm", "work = V*T" );
             magma_sgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk,
                         c_one, da_ref(indi, indj), ldda, 
                         t_ref(i), lddt,
                         c_zero, dwork, pm);
             trace_gpu_end( 0, 2 );
             
             /* dW = X = A*V*T. dW = A*dwork */ 
             trace_gpu_start( 0, 2, "symm", "X = A*work" );
             magma_ssymm(MagmaLeft, uplo, pm, pk,
                         c_one, da_ref(indi, indi), ldda,
                         dwork, pm,
                         c_zero, dW, pm);
             trace_gpu_end( 0, 2 );
             /* restore the panel */
             sq_to_panel(MagmaUpper, pk, a_ref(indi, indj), lda, work);
             
             /* dwork = V*T already ==> dwork' = T'*V'
              * compute T'*V'*X ==> dwork'*W ==>
              * dwork + pm*nb = ((T' * V') * X) = dwork' * X = dwork' * W */
             trace_gpu_start( 0, 2, "gemm", "work = T'*V'*X" );
             magma_sgemm(MagmaTrans, MagmaNoTrans, pk, pk, pm,
                         c_one, dwork, pm, 
                         dW, pm,
                         c_zero, dwork + pm*nb, nb);
             trace_gpu_end( 0, 2 );
             
             /* W = X - 0.5 * V * T'*V'*X
              *   = X - 0.5 * V * (dwork + pm*nb) = W - 0.5 * V * (dwork + pm*nb) */
             trace_gpu_start( 0, 2, "gemm", "W = X - 0.5*V*(T'*V'*X)" );
             magma_sgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk,
                         c_neg_half, da_ref(indi, indj), ldda,
                         dwork + pm*nb, nb, 
                         c_one,     dW, pm);
             trace_gpu_end( 0, 2 );

             /* ==========================================================
                Update the unreduced submatrix A(i+ib:n,i+ib:n), using   
                an update of the form:  A := A - V*W' - W*V' 
                ==========================================================  */
             if (i + nb <= n-nb){
                 /* There would be next iteration;
                    do lookahead - update the next panel */
                 trace_gpu_start( 0, 2, "gemm", "gemm 4 next panel left" );
                 magma_sgemm(MagmaNoTrans, MagmaTrans, pm, pn, pn, c_neg_one,
                             da_ref(indi, indj), ldda,
                             dW                , pm, c_one,
                             da_ref(indi, indi), ldda);
                 trace_gpu_end( 0, 2 );
             
                 trace_gpu_start( 0, 2, "gemm", "gemm 5 next panel right" );
                 magma_sgemm(MagmaNoTrans, MagmaTrans, pm, pn, pn, c_neg_one,
                             dW                , pm,
                             da_ref(indi, indj), ldda, c_one,
                             da_ref(indi, indi), ldda);
                 trace_gpu_end( 0, 2 );
                 cudaEventRecord(Pupdate_event, stream[0]);
             }
             else {
                 /* no look-ahead as this is last iteration */
                 trace_gpu_start( 0, 2, "syr2k", "syr2k last iteration" );
                 magma_ssyr2k(MagmaLower, MagmaNoTrans, pk, pk, c_neg_one,
                              da_ref(indi, indj), ldda,
                              dW                , pm, d_one,
                              da_ref(indi, indi), ldda);
                 trace_gpu_end( 0, 2 );
             }
             
             indi_old = indi;
             indj_old = indj;
             pm_old   = pm;
             pn_old   = pn;
        }  // end loop for(i)

        /* Send the last block to the CPU */
        pk = min(pm,pn);
        if (1 <= n-nb){
            spanel_to_q(MagmaUpper, pk-1, a_ref(n-pk+1, n-pk+2), lda, work);
            trace_gpu_start( 0, 2, "get", "get last block" );
            magma_sgetmatrix( pk, pk,
                              da_ref(n-pk+1, n-pk+1), ldda,
                              a_ref(n-pk+1, n-pk+1),  lda );
            trace_gpu_end( 0, 2 );
            sq_to_panel(MagmaUpper, pk-1, a_ref(n-pk+1, n-pk+2), lda, work);
        }
    }// end of LOWER
    
    trace_finalize( "ssytrd_sy2sb.svg", "trace.css" );

    cudaEventDestroy(Pupdate_event);
    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[1] );
    magma_free( da );
    MAGMA_S_SET2REAL( work[0], lwkopt );
    magmablasSetKernelStream( 0 );
    
#if defined(USEMKL)
    mkl_set_num_threads(1);
#endif
#if defined(USEACML)
    omp_set_num_threads(1);
#endif
    

    return *info;
} /* ssytrd_sy2sb_ */
Exemplo n.º 14
0
/**
    Purpose
    -------
    ZUNGQR generates an M-by-N COMPLEX_16 matrix Q with orthonormal columns,
    which is defined as the first N columns of a product of K elementary
    reflectors of order M

        Q  =  H(1) H(2) . . . H(k)

    as returned by ZGEQRF.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix Q. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix Q. M >= N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines the
            matrix Q. N >= K >= 0.

    @param[in,out]
    A       COMPLEX_16 array A, dimension (LDDA,N).
            On entry, the i-th column must contain the vector
            which defines the elementary reflector H(i), for
            i = 1,2,...,k, as returned by ZGEQRF_GPU in the
            first k columns of its array argument A.
            On exit, the M-by-N matrix Q.

    @param[in]
    lda     INTEGER
            The first dimension of the array A. LDA >= max(1,M).

    @param[in]
    tau     COMPLEX_16 array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by ZGEQRF_GPU.

    @param[in]
    T       COMPLEX_16 array, dimension (NB, min(M,N)).
            T contains the T matrices used in blocking the elementary
            reflectors H(i), e.g., this can be the 6th argument of
            magma_zgeqrf_gpu (except stored on the CPU, not the GPU).

    @param[in]
    nb      INTEGER
            This is the block size used in ZGEQRF_GPU, and correspondingly
            the size of the T matrices, used in the factorization, and
            stored in T.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument has an illegal value

    @ingroup magma_zgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zungqr_m(
    magma_int_t m, magma_int_t n, magma_int_t k,
    magmaDoubleComplex *A, magma_int_t lda,
    magmaDoubleComplex *tau,
    magmaDoubleComplex *T, magma_int_t nb,
    magma_int_t *info)
{
#define  A(i,j)   ( A    + (i) + (j)*lda )
#define dA(d,i,j) (dA[d] + (i) + (j)*ldda)
#define dT(d,i,j) (dT[d] + (i) + (j)*nb)

    magmaDoubleComplex c_zero = MAGMA_Z_ZERO;
    magmaDoubleComplex c_one  = MAGMA_Z_ONE;

    magma_int_t m_kk, n_kk, k_kk, mi;
    magma_int_t lwork, ldwork;
    magma_int_t i, ib, ki, kk, iinfo;
    magmaDoubleComplex *work;

    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if ((n < 0) || (n > m)) {
        *info = -2;
    } else if ((k < 0) || (k > n)) {
        *info = -3;
    } else if (lda < max(1,m)) {
        *info = -5;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    if (n <= 0) {
        return *info;
    }
    
    magma_int_t di, dn;
    magma_int_t dpanel;

    magma_int_t ngpu = magma_num_gpus();
    
    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );
    
    // Allocate memory on GPUs for A and workspaces
    magma_int_t ldda    = ((m + 31) / 32) * 32;
    magma_int_t lddwork = ((n + 31) / 32) * 32;
    magma_int_t min_lblocks = (n / nb) / ngpu;  // min. blocks per gpu
    magma_int_t last_dev    = (n / nb) % ngpu;  // device with last block
    
    magma_int_t  nlocal[ MagmaMaxGPUs ] = { 0 };
    magmaDoubleComplex *dA[ MagmaMaxGPUs ] = { NULL };
    magmaDoubleComplex *dT[ MagmaMaxGPUs ] = { NULL };
    magmaDoubleComplex *dV[ MagmaMaxGPUs ] = { NULL };
    magmaDoubleComplex *dW[ MagmaMaxGPUs ] = { NULL };
    magma_queue_t stream[ MagmaMaxGPUs ] = { NULL };
    
    for( int d = 0; d < ngpu; ++d ) {
        // example with n = 75, nb = 10, ngpu = 3
        // min_lblocks = 2
        // last_dev    = 1
        // gpu 0: 2  blocks, cols:  0- 9, 30-39, 60-69
        // gpu 1: 1+ blocks, cols: 10-19, 40-49, 70-74 (partial)
        // gpu 2: 1  block,  cols: 20-29, 50-59
        magma_setdevice( d );
        nlocal[d] = min_lblocks*nb;
        if ( d < last_dev ) {
            nlocal[d] += nb;
        }
        else if ( d == last_dev ) {
            nlocal[d] += (n % nb);
        }
        
        ldwork = nlocal[d]*ldda  // dA
               + nb*m            // dT
               + nb*ldda         // dV
               + nb*lddwork;     // dW
        if ( MAGMA_SUCCESS != magma_zmalloc( &dA[d], ldwork )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            goto CLEANUP;
        }
        dT[d] = dA[d] + nlocal[d]*ldda;
        dV[d] = dT[d] + nb*m;
        dW[d] = dV[d] + nb*ldda;
        
        magma_queue_create( &stream[d] );
    }
    
    trace_init( 1, ngpu, 1, stream );
    
    // first kk columns are handled by blocked method.
    // ki is start of 2nd-to-last block
    if ((nb > 1) && (nb < k)) {
        ki = (k - nb - 1) / nb * nb;
        kk = min(k, ki + nb);
    } else {
        ki = 0;
        kk = 0;
    }

    // Allocate CPU work space
    // n*nb for zungqr workspace
    lwork = n * nb;
    magma_zmalloc_cpu( &work, lwork );
    if (work == NULL) {
        *info = MAGMA_ERR_HOST_ALLOC;
        goto CLEANUP;
    }

    // Use unblocked code for the last or only block.
    if (kk < n) {
        trace_cpu_start( 0, "ungqr", "ungqr last block" );
        m_kk = m - kk;
        n_kk = n - kk;
        k_kk = k - kk;
        dpanel =  (kk / nb) % ngpu;
        di     = ((kk / nb) / ngpu) * nb;
        magma_setdevice( dpanel );
        
        lapackf77_zungqr( &m_kk, &n_kk, &k_kk,
                          A(kk, kk), &lda,
                          &tau[kk], work, &lwork, &iinfo );

        magma_zsetmatrix( m_kk, n_kk,
                          A(kk, kk),  lda,
                          dA(dpanel, kk, di), ldda );
        
        // Set A(1:kk,kk+1:n) to zero.
        magmablas_zlaset( MagmaFull, kk, n - kk, c_zero, c_zero, dA(dpanel, 0, di), ldda );
        trace_cpu_end( 0 );
    }

    if (kk > 0) {
        // Use blocked code
        // send T to all GPUs
        for( int d = 0; d < ngpu; ++d ) {
            magma_setdevice( d );
            trace_gpu_start( d, 0, "set", "set T" );
            magma_zsetmatrix_async( nb, min(m,n), T, nb, dT[d], nb, stream[d] );
            trace_gpu_end( d, 0 );
        }
        
        // stream: set Aii (V) --> laset --> laset --> larfb --> [next]
        // CPU has no computation
        for( i = ki; i >= 0; i -= nb ) {
            ib = min(nb, k - i);
            mi = m - i;
            dpanel =  (i / nb) % ngpu;
            di     = ((i / nb) / ngpu) * nb;

            // Send current panel to the GPUs
            lapackf77_zlaset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda );
            for( int d = 0; d < ngpu; ++d ) {
                magma_setdevice( d );
                trace_gpu_start( d, 0, "set", "set V" );
                magma_zsetmatrix_async( mi, ib,
                                        A(i, i), lda,
                                        dV[d],   ldda, stream[d] );
                trace_gpu_end( d, 0 );
            }
            
            // set panel to identity
            magma_setdevice( dpanel );
            magmablasSetKernelStream( stream[dpanel] );
            trace_gpu_start( dpanel, 0, "laset", "laset" );
            magmablas_zlaset( MagmaFull, i,  ib, c_zero, c_zero, dA(dpanel, 0, di), ldda );
            magmablas_zlaset( MagmaFull, mi, ib, c_zero, c_one,  dA(dpanel, i, di), ldda );
            trace_gpu_end( dpanel, 0 );
            
            if (i < n) {
                // Apply H to A(i:m,i:n) from the left
                for( int d = 0; d < ngpu; ++d ) {
                    magma_setdevice( d );
                    magmablasSetKernelStream( stream[d] );
                    magma_indices_1D_bcyclic( nb, ngpu, d, i, n, &di, &dn );
                    trace_gpu_start( d, 0, "larfb", "larfb" );
                    magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise,
                                      mi, dn-di, ib,
                                      dV[d],        ldda, dT(d,0,i), nb,
                                      dA(d, i, di), ldda, dW[d], lddwork );
                    trace_gpu_end( d, 0 );
                }
            }
        }
    }
    
    // copy result back to CPU
    trace_cpu_start( 0, "get", "get A" );
    magma_zgetmatrix_1D_col_bcyclic( m, n, dA, ldda, A, lda, ngpu, nb );
    trace_cpu_end( 0 );
    
    #ifdef TRACING
    char name[80];
    snprintf( name, sizeof(name), "zungqr-n%d-ngpu%d.svg", m, ngpu );
    trace_finalize( name, "trace.css" );
    #endif
    
CLEANUP:
    for( int d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magma_free( dA[d] );
        magma_queue_destroy( stream[d] );
    }
    magma_free_cpu( work );
    magma_setdevice( orig_dev );
    magmablasSetKernelStream( orig_stream );
    
    return *info;
} /* magma_zungqr */
Exemplo n.º 15
0
/**
    Purpose
    -------
    DSYTRD_HE2HB reduces a real symmetric matrix A to real symmetric
    band-diagonal form T by an orthogonal similarity transformation:
    Q**H * A * Q = T.
    This version stores the triangular matrices T used in the accumulated
    Householder transformations (I - V T V').

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of A is stored;
      -     = MagmaLower:  Lower triangle of A is stored.

    @param[in]
    n       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in,out]
    A       DOUBLE_PRECISION array, dimension (LDA,N)
            On entry, the symmetric matrix A.  If UPLO = MagmaUpper, the leading
            N-by-N upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = MagmaLower, the
            leading N-by-N lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
            On exit, if UPLO = MagmaUpper, the Upper band-diagonal of A is
            overwritten by the corresponding elements of the
            band-diagonal matrix T, and the elements above the band
            diagonal, with the array TAU, represent the orthogonal
            matrix Q as a product of elementary reflectors; if UPLO
            = MagmaLower, the the Lower band-diagonal of A is overwritten by
            the corresponding elements of the band-diagonal
            matrix T, and the elements below the band-diagonal, with
            the array TAU, represent the orthogonal matrix Q as a product
            of elementary reflectors. See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    tau     DOUBLE_PRECISION array, dimension (N-1)
            The scalar factors of the elementary reflectors (see Further
            Details).

    @param[out]
    work    (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.  LWORK >= 1.
            For optimum performance LWORK >= N*NB, where NB is the
            optimal blocksize.
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    @param[out]
    dT      DOUBLE_PRECISION array on the GPU, dimension N*NB,
            where NB is the optimal blocksize.
            On exit dT holds the upper triangular matrices T from the
            accumulated Householder transformations (I - V T V') used
            in the factorization. The nb x nb matrices T are ordered
            consecutively in memory one after another.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    Further Details
    ---------------
    If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary
    reflectors

       Q = H(n-1) . . . H(2) H(1).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in
    A(1:i-1,i+1), and tau in TAU(i).

    If UPLO = MagmaLower, the matrix Q is represented as a product of elementary
    reflectors

       Q = H(1) H(2) . . . H(n-1).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i),
    and tau in TAU(i).

    The contents of A on exit are illustrated by the following examples
    with n = 5:

    if UPLO = MagmaUpper:                if UPLO = MagmaLower:

      (  d   e   v2  v3  v4 )              (  d                  )
      (      d   e   v3  v4 )              (  e   d              )
      (          d   e   v4 )              (  v1  e   d          )
      (              d   e  )              (  v1  v2  e   d      )
      (                  d  )              (  v1  v2  v3  e   d  )

    where d and e denote diagonal and off-diagonal elements of T, and vi
    denotes an element of the vector defining H(i).

    @ingroup magma_dsyev_2stage
    ********************************************************************/
extern "C" magma_int_t
magma_dsytrd_sy2sb( magma_uplo_t uplo, magma_int_t n, magma_int_t nb,
                    double *A, magma_int_t lda,
                    double *tau,
                    double *work, magma_int_t lwork,
                    double *dT,
                    magma_int_t *info)
{
    #define  A(a_1,a_2)  ( A + ((a_2)-1)*( lda) + (a_1)-1)
    #define dA(a_1,a_2)  (dA + ((a_2)-1)*(ldda) + (a_1)-1)
    #define tau_ref(a_1) (tau + (a_1)-1)
    #define dT(a_1)      (dT + ((a_1)-1)*(lddt))

    int ldda = ((n+31)/32)*32;
    int lddt = nb;
   
    double c_neg_one  = MAGMA_D_NEG_ONE;
    double c_neg_half = MAGMA_D_NEG_HALF;
    double c_one  = MAGMA_D_ONE;
    double c_zero = MAGMA_D_ZERO;
    double  d_one = MAGMA_D_ONE;

    magma_int_t pm, pn, indi, indj, pk;
    magma_int_t pm_old=0, pn_old=0, indi_old=0, indj_old=0;

    int i;
    int lwkopt;
    int lquery;

    *info = 0;
    int upper = (uplo == MagmaUpper);
    lquery = (lwork == -1);
    if (! upper && uplo != MagmaLower) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,n)) {
        *info = -4;
    } else if (lwork < 1 && ! lquery) {
        *info = -9;
    }

    /* Determine the block size. */
    lwkopt = n * nb;
    if (*info == 0) {
        work[0] = MAGMA_D_MAKE( lwkopt, 0 );
    }

    if (*info != 0)
        return *info;
    else if (lquery)
        return *info;

    /* Quick return if possible */
    if (n == 0) {
        work[0] = c_one;
        return *info;
    }

    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );
    
    double *dA;
    if (MAGMA_SUCCESS != magma_dmalloc( &dA, (n + 2*nb)*ldda )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    // limit to 16 threads
    magma_int_t orig_threads = magma_get_lapack_numthreads();
    magma_set_lapack_numthreads( min(orig_threads,16) );

    /* Use the first panel of dA as work space */
    double *dwork = dA + n*ldda;
    double *dW    = dwork + nb*ldda;

    #ifdef TRACING
    char buf[80];
    #endif
    magma_queue_t stream[3];
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );
    stream[2] = 0;  // default stream
    
    trace_init( 1, 1, 3, stream );

    double *hT = work + lwork - nb*nb;
    lwork -= nb*nb;
    memset( hT, 0, nb*nb*sizeof(double));

    magmablasSetKernelStream( stream[0] );
    magma_event_t Pupdate_event;
    cudaEventCreateWithFlags(&Pupdate_event,cudaEventDisableTiming);
    //magma_event_create(&Pupdate_event);


    if (upper) {
        printf("DSYTRD_HE2HB is not yet implemented for upper matrix storage. Exit.\n");
        exit(1);
    } else {
        /* Copy the matrix to the GPU */
        if (1 <= n-nb) {
            trace_gpu_start( 0, 0, "set", "set A" );
            magma_dsetmatrix_async( (n-nb), (n-nb),
                                    A(nb+1, nb+1),  lda,
                                    dA(nb+1, nb+1), ldda, stream[0] );
            trace_gpu_end( 0, 0 );
        }

        /* Reduce the lower triangle of A */
        for (i = 1; i <= n-nb; i += nb) {
             indi = i+nb;
             indj = i;
             pm   = n - i - nb + 1;
             //pn   = min(i+nb-1, n-nb) -i + 1;
             pn   = nb;
             
             /*   Get the current panel (no need for the 1st iteration) */
             if (i > 1 ) {
                 // dpanel_to_q copy the upper oof diagonal part of
                 // the matrix to work to be restored later. acctually
                 //  the zero's and one's putted are not used this is only
                 //   because we don't have a function that copy only the
                 //    upper part of A to be restored after copying the
                 //    lookahead panel that has been computted from GPU to CPU.
                 dpanel_to_q(MagmaUpper, pn-1, A(i, i+1), lda, work);

                 trace_gpu_start( 0, 1, "get", "get panel" );
                 //magma_queue_sync( stream[0] );
                 magma_queue_wait_event(stream[1], Pupdate_event);  //, 0);
                 magma_dgetmatrix_async( (pm+pn), pn,
                                         dA( i, i), ldda,
                                         A ( i, i), lda, stream[1] );
                 trace_gpu_end( 0, 1 );

                 trace_gpu_start( 0, 2, "her2k", "her2k" );
                 magma_dsyr2k(MagmaLower, MagmaNoTrans, pm_old-pn_old, pn_old, c_neg_one,
                      dA(indi_old+pn_old, indj_old), ldda,
                      dW + pn_old,            pm_old, d_one,
                      dA(indi_old+pn_old, indi_old+pn_old), ldda);
                 trace_gpu_end( 0, 2 );

                 trace_cpu_start( 0, "sync", "sync on 1" );
                 magma_queue_sync( stream[1] );
                 trace_cpu_end( 0 );
                 dq_to_panel(MagmaUpper, pn-1, A(i, i+1), lda, work);
             }

             /* ==========================================================
                QR factorization on a panel starting nb off of the diagonal.
                Prepare the V and T matrices.
                ==========================================================  */
             #ifdef TRACING
             snprintf( buf, sizeof(buf), "panel %d", i );
             #endif
             trace_cpu_start( 0, "geqrf", buf );
             lapackf77_dgeqrf(&pm, &pn, A(indi, indj), &lda,
                        tau_ref(i), work, &lwork, info);
             
             /* Form the matrix T */
                         pk=min(pm,pn);
             lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr,
                           &pm, &pk, A(indi, indj), &lda,
                           tau_ref(i), hT, &nb);

             /* Prepare V - put 0s in the upper triangular part of the panel
                (and 1s on the diagonal), temporaly storing the original in work */
             dpanel_to_q(MagmaUpper, pk, A(indi, indj), lda, work);
             trace_cpu_end( 0 );

             /* Send V from the CPU to the GPU */
             trace_gpu_start( 0, 0, "set", "set V and T" );
             magma_dsetmatrix_async( pm, pk,
                                     A(indi, indj),  lda,
                                     dA(indi, indj), ldda, stream[0] );

             /* Send the triangular factor T to the GPU */
             magma_dsetmatrix_async( pk, pk,
                                     hT,       nb,
                                     dT(i), lddt, stream[0] );
             trace_gpu_end( 0, 0 );
             
             /* ==========================================================
                Compute W:
                1. X = A (V T)
                2. W = X - 0.5* V * (T' * (V' * X))
                ==========================================================  */
             /* dwork = V T */
             trace_cpu_start( 0, "sync", "sync on 0" );
             // this sync is done here to be sure that the copy has been finished
             // because below we made a restore dq_to_panel and this restore need
             // to ensure that the copy has been finished. we did it here to allow
             // overlapp of restore with next gemm and symm.
             magma_queue_sync( stream[0] );
             trace_cpu_end( 0 );
             
             trace_gpu_start( 0, 2, "gemm", "work = V*T" );
             magma_dgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk,
                         c_one, dA(indi, indj), ldda,
                         dT(i), lddt,
                         c_zero, dwork, pm);
             trace_gpu_end( 0, 2 );
             
             /* dW = X = A*V*T. dW = A*dwork */
             trace_gpu_start( 0, 2, "hemm", "X = A*work" );
             magma_dsymm(MagmaLeft, uplo, pm, pk,
                         c_one, dA(indi, indi), ldda,
                         dwork, pm,
                         c_zero, dW, pm);
             trace_gpu_end( 0, 2 );
             /* restore the panel */
             dq_to_panel(MagmaUpper, pk, A(indi, indj), lda, work);
             
             /* dwork = V*T already ==> dwork' = T'*V'
              * compute T'*V'*X ==> dwork'*W ==>
              * dwork + pm*nb = ((T' * V') * X) = dwork' * X = dwork' * W */
             trace_gpu_start( 0, 2, "gemm", "work = T'*V'*X" );
             magma_dgemm(MagmaConjTrans, MagmaNoTrans, pk, pk, pm,
                         c_one, dwork, pm,
                         dW, pm,
                         c_zero, dwork + pm*nb, nb);
             trace_gpu_end( 0, 2 );
             
             /* W = X - 0.5 * V * T'*V'*X
              *   = X - 0.5 * V * (dwork + pm*nb) = W - 0.5 * V * (dwork + pm*nb) */
             trace_gpu_start( 0, 2, "gemm", "W = X - 0.5*V*(T'*V'*X)" );
             magma_dgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk,
                         c_neg_half, dA(indi, indj), ldda,
                         dwork + pm*nb, nb,
                         c_one,     dW, pm);
             trace_gpu_end( 0, 2 );

             /* ==========================================================
                Update the unreduced submatrix A(i+ib:n,i+ib:n), using
                an update of the form:  A := A - V*W' - W*V'
                ==========================================================  */
             if (i + nb <= n-nb) {
                 /* There would be next iteration;
                    do lookahead - update the next panel */
                 trace_gpu_start( 0, 2, "gemm", "gemm 4 next panel left" );
                 magma_dgemm(MagmaNoTrans, MagmaConjTrans, pm, pn, pn, c_neg_one,
                             dA(indi, indj), ldda,
                             dW,                 pm, c_one,
                             dA(indi, indi), ldda);
                 trace_gpu_end( 0, 2 );
             
                 trace_gpu_start( 0, 2, "gemm", "gemm 5 next panel right" );
                 magma_dgemm(MagmaNoTrans, MagmaConjTrans, pm, pn, pn, c_neg_one,
                             dW,                 pm,
                             dA(indi, indj), ldda, c_one,
                             dA(indi, indi), ldda);
                 trace_gpu_end( 0, 2 );
                 magma_event_record(Pupdate_event, stream[0]);
             }
             else {
                 /* no look-ahead as this is last iteration */
                 trace_gpu_start( 0, 2, "her2k", "her2k last iteration" );
                 magma_dsyr2k(MagmaLower, MagmaNoTrans, pk, pk, c_neg_one,
                              dA(indi, indj), ldda,
                              dW,                 pm, d_one,
                              dA(indi, indi), ldda);
                 trace_gpu_end( 0, 2 );
             }
             
             indi_old = indi;
             indj_old = indj;
             pm_old   = pm;
             pn_old   = pn;
        }  // end loop for (i)

        /* Send the last block to the CPU */
        pk = min(pm,pn);
        if (1 <= n-nb) {
            dpanel_to_q(MagmaUpper, pk-1, A(n-pk+1, n-pk+2), lda, work);
            trace_gpu_start( 0, 2, "get", "get last block" );
            magma_dgetmatrix( pk, pk,
                              dA(n-pk+1, n-pk+1), ldda,
                              A(n-pk+1, n-pk+1),  lda );
            trace_gpu_end( 0, 2 );
            dq_to_panel(MagmaUpper, pk-1, A(n-pk+1, n-pk+2), lda, work);
        }
    }// end of LOWER
    
    trace_finalize( "dsytrd_sy2sb.svg", "trace.css" );

    magma_event_destroy( Pupdate_event );
    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[1] );
    magma_free( dA );
    work[0] = MAGMA_D_MAKE( lwkopt, 0 );

    magmablasSetKernelStream( orig_stream );    
    magma_set_lapack_numthreads( orig_threads );

    return *info;
} /* magma_dsytrd_sy2sb */
Exemplo n.º 16
0
/**
    Purpose
    =======

    SSYTRF_nopiv computes the LDLt factorization of a real symmetric
    matrix A. This version does not require work space on the GPU passed
    as input. GPU memory is allocated in the routine.

    The factorization has the form
       A = U^H * D * U,   if UPLO = MagmaUpper, or
       A = L   * D * L^H, if UPLO = MagmaLower,
    where U is an upper triangular matrix, L is lower triangular, and
    D is a diagonal matrix.

    This is the block version of the algorithm, calling Level 3 BLAS.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of A is stored;
      -     = MagmaLower:  Lower triangle of A is stored.

    @param[in]
    n       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in,out]
    A       REAL array, dimension (LDA,N)
            On entry, the symmetric matrix A.  If UPLO = MagmaUpper, the leading
            N-by-N upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = MagmaLower, the
            leading N-by-N lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
    \n
            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization A = U^H D U or A = L D L^H.
    \n
            Higher performance is achieved if A is in pinned memory.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  if INFO = -6, the GPU memory allocation failed
      -     > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.

    @ingroup magma_ssysv_comp
    ******************************************************************* */
extern "C" magma_int_t
magma_ssytrf_nopiv(
    magma_uplo_t uplo, magma_int_t n,
    float *A, magma_int_t lda,
    magma_int_t *info)
{
    #define  A(i, j)  ( A +(j)*lda  + (i))
    #define dA(i, j)  (dA +(j)*ldda + (i))
    #define dW(i, j)  (dW +(j)*ldda + (i))
    #define dWt(i, j) (dW +(j)*nb   + (i))

    /* Constants */
    const float c_one     = MAGMA_S_ONE;
    const float c_neg_one = MAGMA_S_NEG_ONE;
    
    /* Local variables */
    bool upper = (uplo == MagmaUpper);
    magma_int_t j, k, jb, ldda, nb, ib, iinfo;
    magmaFloat_ptr dA;
    magmaFloat_ptr dW;

    *info = 0;
    if (! upper && uplo != MagmaLower) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,n)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return MAGMA_ERR_ILLEGAL_VALUE;
    }

    /* Quick return */
    if ( n == 0 )
      return MAGMA_SUCCESS;

    ldda = magma_roundup( n, 32 );
    nb = magma_get_ssytrf_nopiv_nb(n);
    ib = min(32, nb); // inner-block for diagonal factorization

    if ((MAGMA_SUCCESS != magma_smalloc(&dA, n *ldda)) ||
        (MAGMA_SUCCESS != magma_smalloc(&dW, nb*ldda))) {
        /* alloc failed so call the non-GPU-resident version */
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    magma_device_t cdev;
    magma_queue_t queues[2];
    magma_event_t event;

    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queues[0] );
    magma_queue_create( cdev, &queues[1] );
    magma_event_create( &event );
    trace_init( 1, 1, 2, queues );

    /* Use hybrid blocked code. */
    if (upper) {
        //=========================================================
        // Compute the LDLt factorization A = U'*D*U without pivoting.
        // copy matrix to GPU
        for (j=0; j < n; j += nb) {
            jb = min(nb, (n-j));
            trace_gpu_start( 0, 0, "set", "set" );
            magma_ssetmatrix_async(j+jb, jb, A(0, j), lda, dA(0, j), ldda, queues[0]);
            trace_gpu_end( 0, 0 );
        }
        
        // main loop
        for (j=0; j < n; j += nb) {
            jb = min(nb, (n-j));
            
            // copy A(j,j) back to CPU
            trace_gpu_start( 0, 0, "get", "get" );
            if ( j != 0) {
                //magma_event_sync(event);
                magma_sgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, queues[1]);
            }
            trace_gpu_end( 0, 0 );
            
            // factorize the diagonal block
            magma_queue_sync(queues[1]);
            trace_cpu_start( 0, "potrf", "potrf" );
            magma_ssytrf_nopiv_cpu( MagmaUpper, jb, ib, A(j, j), lda, info );
            trace_cpu_end( 0 );
            if (*info != 0) {
                *info = *info + j;
                break;
            }
            
            // copy A(j,j) back to GPU
            trace_gpu_start( 0, 0, "set", "set" );
            magma_ssetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, queues[0]);
            trace_gpu_end( 0, 0 );
            
            // copy j-th column of U back to CPU
            trace_gpu_start( 0, 1, "get", "get" );
            magma_sgetmatrix_async(j, jb, dA(0, j), ldda, A(0, j), lda, queues[1]);
            trace_gpu_end( 0, 1 );

            if ( (j+jb) < n) {
                // compute the off-diagonal blocks of current block column
                trace_gpu_start( 0, 0, "trsm", "trsm" );
                magma_strsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaUnit,
                             jb, (n-j-jb),
                             c_one, dA(j, j), ldda,
                             dA(j, j+jb), ldda,
                             queues[0] );
                magma_scopymatrix( jb, n-j-jb, dA( j, j+jb ), ldda, dWt( 0, j+jb ), nb, queues[0] );
                
                // update the trailing submatrix with D
                magmablas_slascl_diag( MagmaUpper, jb, n-j-jb,
                                       dA(j,    j), ldda,
                                       dA(j, j+jb), ldda,
                                       queues[0], &iinfo);
                trace_gpu_end( 0, 0 );
                
                // update the trailing submatrix with U and W
                trace_gpu_start( 0, 0, "gemm", "gemm" );
                for (k=j+jb; k < n; k += nb) {
                    magma_int_t kb = min(nb,n-k);
                    magma_sgemm( MagmaConjTrans, MagmaNoTrans, kb, n-k, jb,
                                 c_neg_one, dWt(0, k), nb,
                                            dA(j, k),  ldda,
                                 c_one,     dA(k, k),  ldda,
                                 queues[0]);
                    if (k == j+jb) {
                        // magma_event_record( event, queues[0] );
                        magma_queue_sync( queues[0] );
                    }
                }
                trace_gpu_end( 0, 0 );
            }
        }
    } else {
        //=========================================================
        // Compute the LDLt factorization A = L*D*L' without pivoting.
        // copy the matrix to GPU
        for (j=0; j < n; j += nb) {
            jb = min(nb, (n-j));
            trace_gpu_start( 0, 0, "set", "set" );
            magma_ssetmatrix_async((n-j), jb, A(j, j), lda, dA(j, j), ldda, queues[0]);
            trace_gpu_end( 0, 0 );
        }
        
        // main loop
        for (j=0; j < n; j += nb) {
            jb = min(nb, (n-j));
            
            // copy A(j,j) back to CPU
            trace_gpu_start( 0, 0, "get", "get" );
            if (j != 0) {
                //magma_event_sync(event);
                magma_sgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, queues[1]);
            }
            trace_gpu_end( 0, 0 );
            
            // factorize the diagonal block
            magma_queue_sync(queues[1]);
            trace_cpu_start( 0, "potrf", "potrf" );
            magma_ssytrf_nopiv_cpu( MagmaLower, jb, ib, A(j, j), lda, info );
            trace_cpu_end( 0 );
            if (*info != 0) {
                *info = *info + j;
                break;
            }

            // copy A(j,j) back to GPU
            trace_gpu_start( 0, 0, "set", "set" );
            magma_ssetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, queues[0]);
            trace_gpu_end( 0, 0 );
            
            // copy j-th row of L back to CPU
            trace_gpu_start( 0, 1, "get", "get" );
            magma_sgetmatrix_async(jb, j, dA(j, 0), ldda, A(j, 0), lda, queues[1]);
            trace_gpu_end( 0, 1 );
            
            if ( (j+jb) < n) {
                // compute the off-diagonal blocks of current block column
                trace_gpu_start( 0, 0, "trsm", "trsm" );
                magma_strsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaUnit,
                             (n-j-jb), jb,
                             c_one, dA(j, j), ldda,
                             dA(j+jb, j), ldda,
                             queues[0] );
                magma_scopymatrix( n-j-jb,jb, dA( j+jb, j ), ldda, dW( j+jb, 0 ), ldda, queues[0] );
                
                // update the trailing submatrix with D
                magmablas_slascl_diag( MagmaLower, n-j-jb, jb,
                                       dA(j,    j), ldda,
                                       dA(j+jb, j), ldda,
                                       queues[0], &iinfo );
                trace_gpu_end( 0, 0 );
                
                // update the trailing submatrix with L and W
                trace_gpu_start( 0, 0, "gemm", "gemm" );
                for (k=j+jb; k < n; k += nb) {
                    magma_int_t kb = min(nb,n-k);
                    magma_sgemm( MagmaNoTrans, MagmaConjTrans, n-k, kb, jb,
                                 c_neg_one, dA(k, j), ldda,
                                            dW(k, 0), ldda,
                                 c_one,     dA(k, k), ldda,
                                 queues[0] );
                    if (k == j+jb) {
                        //magma_event_record( event, queues[0] );
                        magma_queue_sync(queues[0]);
                    }
                }
                trace_gpu_end( 0, 0 );
            }
        }
    }
    
    trace_finalize( "ssytrf.svg","trace.css" );
    magma_queue_destroy(queues[0]);
    magma_queue_destroy(queues[1]);
    magma_event_destroy( event );
    magma_free(dW);
    magma_free(dA);
    
    return MAGMA_SUCCESS;
} /* magma_ssytrf_nopiv */
Exemplo n.º 17
0
/**
    Purpose
    -------
    SGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
       A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.
    Use two buffer to send panels.

    Arguments
    ---------
    @param[in]
    num_gpus INTEGER
            The number of GPUs to be used for the factorization.

    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    A       REAL array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array A.  LDDA >= max(1,M).

    @param[out]
    ipiv    INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
      -     > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    @ingroup magma_sgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_sgetrf2_mgpu(magma_int_t num_gpus,
         magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t offset,
         float *d_lAT[], magma_int_t lddat, magma_int_t *ipiv,
         float *d_lAP[], float *w, magma_int_t ldw,
         magma_queue_t streaml[][2], magma_int_t *info)
{
#define dAT(id,i,j)  (d_lAT[(id)] + ((offset)+(i)*nb)*lddat + (j)*nb)
#define W(j) (w+((j)%num_gpus)*nb*ldw)

    float c_one     = MAGMA_S_ONE;
    float c_neg_one = MAGMA_S_NEG_ONE;

    magma_int_t block_size = 32;
    magma_int_t iinfo, n_local[MagmaMaxGPUs];
    magma_int_t maxm, mindim;
    magma_int_t i, d, dd, rows, cols, s, ldpan[MagmaMaxGPUs];
    magma_int_t id, i_local, i_local2, nb0, nb1, h = 2+num_gpus;
    float *d_panel[MagmaMaxGPUs], *panel_local[MagmaMaxGPUs];

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -2;
    else if (n < 0)
        *info = -3;
    else if (num_gpus*lddat < max(1,n))
        *info = -5;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    if ( num_gpus > ceil((float)n/nb) ) {
        *info = -1;
        return *info;
    }

    /* Use hybrid blocked code. */
    maxm  = ((m + block_size-1)/block_size)*block_size;

    /* some initializations */
    for (i=0; i < num_gpus; i++) {
        magma_setdevice(i);
        
        n_local[i] = ((n/nb)/num_gpus)*nb;
        if (i < (n/nb)%num_gpus)
            n_local[i] += nb;
        else if (i == (n/nb)%num_gpus)
            n_local[i] += n%nb;
        
        /* workspaces */
        d_panel[i] = &(d_lAP[i][h*nb*maxm]);   /* temporary panel storage */
    }
    trace_init( 1, num_gpus, 2, (CUstream_st**)streaml );

    /* start sending the panel to cpu */
    nb0 = min(mindim, nb);
    magma_setdevice(0);
    magmablasSetKernelStream(streaml[0][1]);
    trace_gpu_start( 0, 1, "comm", "get" );
    magmablas_stranspose( nb0, m, dAT(0,0,0), lddat, d_lAP[0], maxm );
    magma_sgetmatrix_async( m, nb0,
                            d_lAP[0], maxm,
                            W(0),     ldw, streaml[0][1] );
    trace_gpu_end( 0, 1 );

    /* ------------------------------------------------------------------------------------- */
    magma_timer_t time=0;
    timer_start( time );

    s = mindim / nb;
    for( i=0; i < s; i++ ) {
        /* Set the GPU number that holds the current panel */
        id = i%num_gpus;
        magma_setdevice(id);
        
        /* Set the local index where the current panel is */
        i_local = i/num_gpus;
        cols  = maxm - i*nb;
        rows  = m - i*nb;
        
        /* synchrnoize i-th panel from id-th gpu into work */
        magma_queue_sync( streaml[id][1] );
        
        /* i-th panel factorization */
        trace_cpu_start( 0, "getrf", "getrf" );
        lapackf77_sgetrf( &rows, &nb, W(i), &ldw, ipiv+i*nb, &iinfo);
        if ( (*info == 0) && (iinfo > 0) ) {
            *info = iinfo + i*nb;
        }
        trace_cpu_end( 0 );
        
        /* start sending the panel to all the gpus */
        d = (i+1)%num_gpus;
        for( dd=0; dd < num_gpus; dd++ ) {
            magma_setdevice(d);
            trace_gpu_start( 0, 1, "comm", "set" );
            magma_ssetmatrix_async( rows, nb,
                                    W(i),     ldw,
                                    &d_lAP[d][(i%h)*nb*maxm], cols,
                                    streaml[d][1] );
            trace_gpu_end( 0, 1 );
            d = (d+1)%num_gpus;
        }
        
        /* apply the pivoting */
        d = (i+1)%num_gpus;
        for( dd=0; dd < num_gpus; dd++ ) {
            magma_setdevice(d);
            magmablasSetKernelStream(streaml[d][0]);
            
            trace_gpu_start( d, 1, "pivot", "pivot" );
            if ( dd == 0 )
                magmablas_spermute_long2( lddat, dAT(d,0,0), lddat, ipiv, nb, i*nb );
            else
                magmablas_spermute_long3(        dAT(d,0,0), lddat, ipiv, nb, i*nb );
            trace_gpu_end( d, 1 );
            d = (d+1)%num_gpus;
        }
    
    
        /* update the trailing-matrix/look-ahead */
        d = (i+1)%num_gpus;
        for( dd=0; dd < num_gpus; dd++ ) {
            magma_setdevice(d);
            
            /* storage for panel */
            if ( d == id ) {
                /* the panel belond to this gpu */
                panel_local[d] = dAT(d,i,i_local);
                ldpan[d] = lddat;
                /* next column */
                i_local2 = i_local+1;
            } else {
                /* the panel belong to another gpu */
                panel_local[d] = d_panel[d];
                ldpan[d] = nb;
                /* next column */
                i_local2 = i_local;
                if ( d < id ) i_local2 ++;
            }
            /* the size of the next column */
            if ( s > (i+1) ) {
                nb0 = nb;
            } else {
                nb0 = n_local[d]-nb*(s/num_gpus);
                if ( d < s%num_gpus ) nb0 -= nb;
            }
            if ( d == (i+1)%num_gpus) {
                /* owns the next column, look-ahead the column */
                nb1 = nb0;
                magmablasSetKernelStream(streaml[d][1]);
                
                /* make sure all the pivoting has been applied */
                magma_queue_sync(streaml[d][0]);
                trace_gpu_start( d, 1, "gemm", "gemm" );
                /* transpose panel on GPU */
                magmablas_stranspose( rows, nb, &d_lAP[d][(i%h)*nb*maxm], cols, panel_local[d], ldpan[d] );
                /* synch for remaining update */
                magma_queue_sync(streaml[d][1]);
            } else {
                /* update the entire trailing matrix */
                nb1 = n_local[d] - i_local2*nb;
                magmablasSetKernelStream(streaml[d][0]);
                
                /* synchronization to make sure panel arrived on gpu */
                magma_queue_sync(streaml[d][1]);
                trace_gpu_start( d, 0, "gemm", "gemm" );
                /* transpose panel on GPU */
                magmablas_stranspose( rows, nb, &d_lAP[d][(i%h)*nb*maxm], cols, panel_local[d], ldpan[d] );
            }
            
            /* gpu updating the trailing matrix */
            magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                         nb1, nb, c_one,
                         panel_local[d],       ldpan[d],
                         dAT(d, i, i_local2), lddat);
            magma_sgemm( MagmaNoTrans, MagmaNoTrans,
                         nb1, m-(i+1)*nb, nb,
                         c_neg_one, dAT(d, i,   i_local2),         lddat,
                                    &(panel_local[d][nb*ldpan[d]]), ldpan[d],
                         c_one,     dAT(d, i+1, i_local2),         lddat );
        
            if ( d == (i+1)%num_gpus ) {
                /* Set the local index where the current panel is */
                int loff    = i+1;
                int i_local = (i+1)/num_gpus;
                int ldda    = maxm - (i+1)*nb;
                int cols    = m - (i+1)*nb;
                nb0 = min(nb, mindim - (i+1)*nb); /* size of the diagonal block */
                trace_gpu_end( d, 1 );
                
                if ( nb0 > 0 ) {
                    /* transpose the panel for sending it to cpu */
                    trace_gpu_start( d, 1, "comm", "get" );
                    magmablas_stranspose( nb0, m-(i+1)*nb, dAT(d,loff,i_local), lddat, &d_lAP[d][((i+1)%h)*nb*maxm], ldda );
             
                    /* send the panel to cpu */
                    magma_sgetmatrix_async( cols, nb0,
                                            &d_lAP[d][((i+1)%h)*nb*maxm], ldda,
                                            W(i+1), ldw, streaml[d][1] );

                    trace_gpu_end( d, 1 );
                }
            } else {
                trace_gpu_end( d, 0 );
            }
            
            d = (d+1)%num_gpus;
        }
    
        /* update the remaining matrix by gpu owning the next panel */
        if ( (i+1) < s ) {
            int i_local = (i+1)/num_gpus;
            int rows  = m - (i+1)*nb;
            
            d = (i+1)%num_gpus;
            magma_setdevice(d);
            magmablasSetKernelStream(streaml[d][0]);
            trace_gpu_start( d, 0, "gemm", "gemm" );

            magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                         n_local[d] - (i_local+1)*nb, nb,
                         c_one, panel_local[d],       ldpan[d],
                                dAT(d,i,i_local+1),  lddat );
            magma_sgemm( MagmaNoTrans, MagmaNoTrans,
                         n_local[d]-(i_local+1)*nb, rows, nb,
                         c_neg_one, dAT(d,i,i_local+1),            lddat,
                                    &(panel_local[d][nb*ldpan[d]]), ldpan[d],
                         c_one,     dAT(d,i+1,  i_local+1),        lddat );
            trace_gpu_end( d, 0 );
        }
    } /* end of for i=1..s */
    /* ------------------------------------------------------------------------------ */
    
    /* Set the GPU number that holds the last panel */
    id = s%num_gpus;
    
    /* Set the local index where the last panel is */
    i_local = s/num_gpus;
    
    /* size of the last diagonal-block */
    nb0 = min(m - s*nb, n - s*nb);
    rows = m    - s*nb;
    cols = maxm - s*nb;
    
    if ( nb0 > 0 ) {
        magma_setdevice(id);
        
        /* wait for the last panel on cpu */
        magma_queue_sync( streaml[id][1] );
    
        /* factor on cpu */
        lapackf77_sgetrf( &rows, &nb0, W(s), &ldw, ipiv+s*nb, &iinfo);
        if ( (*info == 0) && (iinfo > 0) )
            *info = iinfo + s*nb;
        
        /* send the factor to gpus */
        for( d=0; d < num_gpus; d++ ) {
            magma_setdevice(d);
            i_local2 = i_local;
            if ( d < id ) i_local2 ++;
            
            if ( d == id || n_local[d] > i_local2*nb ) {
                magma_ssetmatrix_async( rows, nb0,
                                        W(s),     ldw,
                                        &d_lAP[d][(s%h)*nb*maxm],
                                        cols, streaml[d][1] );
            }
        }
        
        for( d=0; d < num_gpus; d++ ) {
            magma_setdevice(d);
            magmablasSetKernelStream(streaml[d][0]);
            if ( d == 0 )
                magmablas_spermute_long2( lddat, dAT(d,0,0), lddat, ipiv, nb0, s*nb );
            else
                magmablas_spermute_long3(        dAT(d,0,0), lddat, ipiv, nb0, s*nb );
        }
        
        for( d=0; d < num_gpus; d++ ) {
            magma_setdevice(d);
            magmablasSetKernelStream(streaml[d][1]);
            
            /* wait for the pivoting to be done */
            magma_queue_sync( streaml[d][0] );
            
            i_local2 = i_local;
            if ( d < id ) i_local2++;
            if ( d == id ) {
                /* the panel belond to this gpu */
                panel_local[d] = dAT(d,s,i_local);
                
                /* next column */
                nb1 = n_local[d] - i_local*nb-nb0;
                
                magmablas_stranspose( rows, nb0, &d_lAP[d][(s%h)*nb*maxm], cols, panel_local[d], lddat );
                
                if ( nb1 > 0 ) {
                    magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                                 nb1, nb0, c_one,
                                 panel_local[d],        lddat,
                                 dAT(d,s,i_local)+nb0, lddat);
                }
            } else if ( n_local[d] > i_local2*nb ) {
                /* the panel belong to another gpu */
                panel_local[d] = d_panel[d];
                
                /* next column */
                nb1 = n_local[d] - i_local2*nb;
                
                magmablas_stranspose( rows, nb0, &d_lAP[d][(s%h)*nb*maxm], cols, panel_local[d], nb );
                magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             nb1, nb0, c_one,
                             panel_local[d],     nb,
                             dAT(d,s,i_local2), lddat);
            }
        }
    } /* if ( nb0 > 0 ) */
    
    /* clean up */
    trace_finalize( "sgetrf_mgpu.svg","trace.css" );
    for( d=0; d < num_gpus; d++ ) {
        magma_setdevice(d);
        magma_queue_sync( streaml[d][0] );
        magma_queue_sync( streaml[d][1] );
        magmablasSetKernelStream(NULL);
    }
    magma_setdevice(0);
    
    timer_start( time );
    timer_printf("\n Performance %f GFlop/s\n", FLOPS_SGETRF(m,n) / 1e9 / time );

    return *info;
} /* magma_sgetrf2_mgpu */
Exemplo n.º 18
0
extern "C" magma_int_t
magma_zgetrf2_mgpu(magma_int_t num_gpus, 
         magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t offset,
         cuDoubleComplex **d_lAT, magma_int_t lddat, magma_int_t *ipiv,
         cuDoubleComplex **d_lAP, cuDoubleComplex *w, magma_int_t ldw,
         cudaStream_t streaml[][2], magma_int_t *info)
#endif
{
/*  -- MAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2010

    Purpose
    =======

    ZGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
       A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.
    Use two buffer to send panels..

    Arguments
    =========

    NUM_GPUS 
            (input) INTEGER
            The number of GPUS to be used for the factorization.

    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    LDDA     (input) INTEGER
            The leading dimension of the array A.  LDDA >= max(1,M).

    IPIV    (output) INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  if INFO = -7, internal GPU memory allocation failed.
            > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.
    =====================================================================    */

#define inAT(id,i,j)  (d_lAT[(id)] + ((offset)+(i)*nb)*lddat + (j)*nb)
#define W(j) (w+((j)%num_gpus)*nb*ldw)

    cuDoubleComplex c_one     = MAGMA_Z_ONE;
    cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;

    magma_int_t block_size = 32;
    magma_int_t iinfo, n_local[4]; 
    magma_int_t maxm, mindim;
    magma_int_t i, ii, d, dd, rows, cols, s, ldpan[4];
    magma_int_t id, i_local, i_local2, nb0, nb1;
    cuDoubleComplex *d_panel[4], *panel_local[4];
    //cudaStream_t streaml[4][2];

    /* Check arguments */
    *info = 0;
    if (m < 0)
    *info = -2;
    else if (n < 0)
    *info = -3;
    else if (num_gpus*lddat < max(1,n))
    *info = -5;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    //nb     = magma_get_zgetrf_nb(m);
    if( num_gpus > ceil((double)n/nb) ) {
      *info = -1;
      return *info;
    }

    {
      /* Use hybrid blocked code. */
      maxm  = ((m + block_size-1)/block_size)*block_size;

      /* some initializations */
      for(i=0; i<num_gpus; i++){
        magmaSetDevice(i);

        n_local[i] = ((n/nb)/num_gpus)*nb;
        if (i < (n/nb)%num_gpus)
           n_local[i] += nb;
        else if (i == (n/nb)%num_gpus)
           n_local[i] += n%nb;

        /* workspaces */
        d_panel[i] = &(d_lAP[i][nb*maxm]);   /* temporary panel storage */

        /* create local streams */
        //magma_queue_create(&streaml[i][0]);
        //magma_queue_create(&streaml[i][1]);
      }
      trace_init( 1, num_gpus, 2, (CUstream_st**)streaml );

      /* start sending the panel to cpu */
      nb0 = min(mindim, nb);
      magmaSetDevice(0);
      magmablasSetKernelStream(streaml[0][1]);
      trace_gpu_start( 0, 1, "comm", "get" );
      if( nb0 == nb )
        magmablas_ztranspose(  d_lAP[0], maxm, inAT(0,0,0), lddat, nb0, maxm );
      else
        magmablas_ztranspose2( d_lAP[0], maxm, inAT(0,0,0), lddat, nb0, maxm );
      magma_zgetmatrix_async( m, nb0,
                              d_lAP[0], maxm,
                              W(0),     ldw, streaml[0][1] );
      trace_gpu_end( 0, 1 );

      /* ------------------------------------------------------------------------------------- */
#ifdef  PROFILE
      magma_timestr_t start_timer, end_timer;
      start_timer = get_current_time();
#endif
      s = mindim / nb;
      for( i=0; i<s; i++ )
            {
                /* Set the GPU number that holds the current panel */
                id = i%num_gpus;
                magmaSetDevice(id);

                /* Set the local index where the current panel is */
                i_local = i/num_gpus;
                cols  = maxm - i*nb;
                rows  = m - i*nb;

                /* synchrnoize i-th panel from id-th gpu into work */
                magma_queue_sync( streaml[id][1] );

                /* i-th panel factorization */
                trace_cpu_start( 0, "getrf", "getrf" );
#ifdef PANEL_FACT_MC
                cntxt->nb = 12;
                magma_zgetrf_mc(cntxt, &rows, &nb, W(i), &ldw, ipiv+i*nb, &iinfo);
#else
                lapackf77_zgetrf( &rows, &nb, W(i), &ldw, ipiv+i*nb, &iinfo);
#endif
                if ( (*info == 0) && (iinfo > 0) ) {
                    *info = iinfo + i*nb;
                    //break;
                }
                trace_cpu_end( 0 );

                /* start sending the panel to all the gpus */
                d = (i+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                  magmaSetDevice(d);
                  trace_gpu_start( 0, 1, "comm", "set" );
                  magma_zsetmatrix_async( rows, nb,
                                          W(i),     ldw,
                                          d_lAP[d], cols, streaml[d][1] );
                  trace_gpu_end( 0, 1 );
                  d = (d+1)%num_gpus;
                }

                /* apply the pivoting */
                d = (i+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                  magmaSetDevice(d);
                  magmablasSetKernelStream(streaml[d][0]);

                  trace_gpu_start( d, 1, "pivot", "pivot" );
                  if( dd == 0 )
                    magmablas_zpermute_long2( lddat, inAT(d,0,0), lddat, ipiv, nb, i*nb );
                  else
                    magmablas_zpermute_long3(        inAT(d,0,0), lddat, ipiv, nb, i*nb );
                  trace_gpu_end( d, 1 );
                  d = (d+1)%num_gpus;
                }


                /* update the trailing-matrix/look-ahead */
                d = (i+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                  magmaSetDevice(d);

                  /* storage for panel */
                  if( d == id ) {
                    /* the panel belond to this gpu */
                    panel_local[d] = inAT(d,i,i_local);
                    ldpan[d] = lddat;
                    /* next column */
                    i_local2 = i_local+1;
                  } else {
                    /* the panel belong to another gpu */
                    panel_local[d] = &d_panel[d][(i%2)*nb*maxm];
                    //panel_local[d] = d_panel[d];
                    ldpan[d] = nb;
                    /* next column */
                    i_local2 = i_local;
                    if( d < id ) i_local2 ++;
                  }
                  /* the size of the next column */
                  if ( s > (i+1) ) {
                    nb0 = nb;
                  } else {
                    nb0 = n_local[d]-nb*(s/num_gpus);
                    if( d < s%num_gpus ) nb0 -= nb;
                  }
                  if( d == (i+1)%num_gpus) {
                    /* owns the next column, look-ahead the column */
                    nb1 = nb0;
                    magmablasSetKernelStream(streaml[d][1]);

                    /* make sure all the pivoting has been applied */
                    magma_queue_sync(streaml[d][0]);
                    trace_gpu_start( d, 1, "gemm", "gemm" );
                  } else {
                    /* update the entire trailing matrix */
                    nb1 = n_local[d] - i_local2*nb;
                    magmablasSetKernelStream(streaml[d][0]);

                    /* synchronization to make sure panel arrived on gpu */
                    magma_queue_sync(streaml[d][1]);
                    trace_gpu_start( d, 0, "gemm", "gemm" );
                  }
                  magmablas_ztranspose(panel_local[d], ldpan[d], d_lAP[d], cols, cols, nb);

                  /* gpu updating the trailing matrix */
                  //magmablas_ztrsm(
                  magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                               nb1, nb, c_one,
                               panel_local[d],       ldpan[d],
                               inAT(d, i, i_local2), lddat);
                  //cublasZgemm
                  magma_zgemm( MagmaNoTrans, MagmaNoTrans, 
                               nb1, m-(i+1)*nb, nb, 
                               c_neg_one, inAT(d, i,   i_local2),         lddat,
                                          &(panel_local[d][nb*ldpan[d]]), ldpan[d], 
                               c_one,     inAT(d, i+1, i_local2),         lddat );

                  if( d == (i+1)%num_gpus ) 
                  {
                    /* Set the local index where the current panel is */
                    int loff    = i+1;
                    int i_local = (i+1)/num_gpus;
                    int ldda    = maxm - (i+1)*nb;
                    int cols    = m - (i+1)*nb;
                    nb0 = min(nb, mindim - (i+1)*nb); /* size of the diagonal block */
                    trace_gpu_end( d, 1 );

                    if( nb0 > 0 ) {
                        /* transpose the panel for sending it to cpu */
                        trace_gpu_start( d, 1, "comm", "get" );
                        if( i+1 < s ) 
                          magmablas_ztranspose(  d_lAP[d], ldda, inAT(d,loff,i_local), lddat, nb0, ldda );
                        else
                          magmablas_ztranspose2(  d_lAP[d], ldda, inAT(d,loff,i_local), lddat, nb0, ldda );

                        /* send the panel to cpu */
                        magma_zgetmatrix_async( cols, nb0,
                                                d_lAP[d], ldda,
                                                W(i+1),   ldw, streaml[d][1] );
                        trace_gpu_end( d, 1 );
                    }
                  } else {
                    trace_gpu_end( d, 0 );
                  }

                  d = (d+1)%num_gpus;
                }

                /* update the remaining matrix by gpu owning the next panel */
                if( (i+1) < s ) {
                  int i_local = (i+1)/num_gpus;
                  int rows  = m - (i+1)*nb;

                  d = (i+1)%num_gpus;
                  magmaSetDevice(d);
                  magmablasSetKernelStream(streaml[d][0]);
                  trace_gpu_start( d, 0, "gemm", "gemm" );
                  //magmablas_ztrsm
                  magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                               n_local[d] - (i_local+1)*nb, nb, 
                               c_one, panel_local[d],       ldpan[d], 
                                      inAT(d,i,i_local+1),  lddat );
                  //cublasZgemm
                  magma_zgemm( MagmaNoTrans, MagmaNoTrans, 
                               n_local[d]-(i_local+1)*nb, rows, nb, 
                               c_neg_one, inAT(d,i,i_local+1),            lddat, 
                                          &(panel_local[d][nb*ldpan[d]]), ldpan[d], 
                               c_one,     inAT(d,i+1,  i_local+1),        lddat );
                  trace_gpu_end( d, 0 );
                }
             } /* end of for i=1..s */
             /* ------------------------------------------------------------------------------ */

            /* Set the GPU number that holds the last panel */
            id = s%num_gpus;

            /* Set the local index where the last panel is */
            i_local = s/num_gpus;

            /* size of the last diagonal-block */
            nb0 = min(m - s*nb, n - s*nb);
            rows = m    - s*nb;
            cols = maxm - s*nb;

            if( nb0 > 0 ) {
              magmaSetDevice(id);

              /* wait for the last panel on cpu */
              magma_queue_sync( streaml[id][1] );

              /* factor on cpu */
              lapackf77_zgetrf( &rows, &nb0, W(s), &ldw, ipiv+s*nb, &iinfo);
              if ( (*info == 0) && (iinfo > 0) )
              *info = iinfo + s*nb;

              /* send the factor to gpus */
              for( d=0; d<num_gpus; d++ ) {
                magmaSetDevice(d);
                i_local2 = i_local;
                if( d < id ) i_local2 ++;

                if( d == id || n_local[d] > i_local2*nb ) {
                  magma_zsetmatrix_async( rows, nb0,
                                          W(s),     ldw,
                                          d_lAP[d], cols, streaml[d][1] );
                }
              }

              for( d=0; d<num_gpus; d++ ) {
                magmaSetDevice(d);
                magmablasSetKernelStream(streaml[d][0]);
                if( d == 0 )
                  magmablas_zpermute_long2( lddat, inAT(d,0,0), lddat, ipiv, nb0, s*nb );
                else
                  magmablas_zpermute_long3(        inAT(d,0,0), lddat, ipiv, nb0, s*nb );
              }

              for( d=0; d<num_gpus; d++ ) {
                magmaSetDevice(d);
                magmablasSetKernelStream(streaml[d][1]);

                /* wait for the pivoting to be done */
                magma_queue_sync( streaml[d][0] );

                i_local2 = i_local;
                if( d < id ) i_local2++;
                if( d == id ) {
                  /* the panel belond to this gpu */
                  panel_local[d] = inAT(d,s,i_local);

                  /* next column */
                  nb1 = n_local[d] - i_local*nb-nb0;

                  magmablas_ztranspose2( panel_local[d], lddat, d_lAP[d], cols, rows, nb0);

                  if( nb1 > 0 )
                  //cublasZtrsm
                  magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                               nb1, nb0, c_one,
                               panel_local[d],        lddat, 
                               inAT(d,s,i_local)+nb0, lddat);
                } else if( n_local[d] > i_local2*nb ) {
                  /* the panel belong to another gpu */
                  panel_local[d] = &d_panel[d][(s%2)*nb*maxm];
                  //panel_local[d] = d_panel[d];

                  /* next column */
                  nb1 = n_local[d] - i_local2*nb;

                  magmablas_ztranspose2( panel_local[d], nb, d_lAP[d], cols, rows, nb0);
                  //cublasZtrsm
                  magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                               nb1, nb0, c_one,
                               panel_local[d],     nb, 
                               inAT(d,s,i_local2), lddat);
                }
              }
            } /* if( nb0 > 0 ) */

            /* clean up */
            trace_finalize( "zgetrf_mgpu.svg","trace.css" );
            for( d=0; d<num_gpus; d++ ) {
              magmaSetDevice(d);
              magma_queue_sync( streaml[d][0] );
              magma_queue_sync( streaml[d][1] );
              //magma_queue_destroy(streaml[d][0]);
              //magma_queue_destroy(streaml[d][1]);
              magmablasSetKernelStream(NULL);
            } 
            magmaSetDevice(0);
#ifdef  PROFILE
            end_timer = get_current_time();
            printf("\n Performance %f GFlop/s\n", (2./3.*n*n*n /1000000.) / GetTimerValue(start_timer, end_timer));
#endif
    }

    return *info;
    /* End of MAGMA_ZGETRF2_MGPU */
}
Exemplo n.º 19
0
extern "C" magma_int_t
magma_cungqr_m(
    magma_int_t m, magma_int_t n, magma_int_t k,
    magmaFloatComplex *A, magma_int_t lda,
    magmaFloatComplex *tau,
    magmaFloatComplex *T, magma_int_t nb,
    magma_int_t *info)
{
/*  -- MAGMA (version 1.4.1) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       December 2013

    Purpose
    =======
    CUNGQR generates an M-by-N COMPLEX matrix Q with orthonormal columns,
    which is defined as the first N columns of a product of K elementary
    reflectors of order M

          Q  =  H(1) H(2) . . . H(k)

    as returned by CGEQRF.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix Q. M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix Q. M >= N >= 0.

    K       (input) INTEGER
            The number of elementary reflectors whose product defines the
            matrix Q. N >= K >= 0.

    A       (input/output) COMPLEX array A, dimension (LDDA,N).
            On entry, the i-th column must contain the vector
            which defines the elementary reflector H(i), for
            i = 1,2,...,k, as returned by CGEQRF_GPU in the
            first k columns of its array argument A.
            On exit, the M-by-N matrix Q.

    LDA     (input) INTEGER
            The first dimension of the array A. LDA >= max(1,M).

    TAU     (input) COMPLEX array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by CGEQRF_GPU.

    T       (input) COMPLEX array, dimension (NB, min(M,N)).
            T contains the T matrices used in blocking the elementary
            reflectors H(i), e.g., this can be the 6th argument of
            magma_cgeqrf_gpu (except stored on the CPU, not the GPU).

    NB      (input) INTEGER
            This is the block size used in CGEQRF_GPU, and correspondingly
            the size of the T matrices, used in the factorization, and
            stored in T.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument has an illegal value
    =====================================================================    */

#define  A(i,j)   ( A    + (i) + (j)*lda )
#define dA(d,i,j) (dA[d] + (i) + (j)*ldda)
#define dT(d,i,j) (dT[d] + (i) + (j)*nb)

    magmaFloatComplex c_zero = MAGMA_C_ZERO;
    magmaFloatComplex c_one  = MAGMA_C_ONE;

    magma_int_t m_kk, n_kk, k_kk, mi;
    magma_int_t lwork, ldwork;
    magma_int_t i, ib, ki, kk, iinfo;
    magmaFloatComplex *work;

    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if ((n < 0) || (n > m)) {
        *info = -2;
    } else if ((k < 0) || (k > n)) {
        *info = -3;
    } else if (lda < max(1,m)) {
        *info = -5;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    if (n <= 0) {
        return *info;
    }
    
    magma_int_t di, dn;
    int dpanel;

    int ngpu = magma_num_gpus();
    int doriginal;
    magma_getdevice( &doriginal );
    
    // Allocate memory on GPUs for A and workspaces
    magma_int_t ldda    = ((m + 31) / 32) * 32;
    magma_int_t lddwork = ((n + 31) / 32) * 32;
    magma_int_t min_lblocks = (n / nb) / ngpu;  // min. blocks per gpu
    magma_int_t last_dev    = (n / nb) % ngpu;  // device with last block
    
    magma_int_t  nlocal[ MagmaMaxGPUs ] = { 0 };
    magmaFloatComplex *dA[ MagmaMaxGPUs ] = { NULL };
    magmaFloatComplex *dT[ MagmaMaxGPUs ] = { NULL };
    magmaFloatComplex *dV[ MagmaMaxGPUs ] = { NULL };
    magmaFloatComplex *dW[ MagmaMaxGPUs ] = { NULL };
    magma_queue_t stream[ MagmaMaxGPUs ] = { NULL };
    
    for( int d = 0; d < ngpu; ++d ) {
        // example with n = 75, nb = 10, ngpu = 3
        // min_lblocks = 2
        // last_dev    = 1
        // gpu 0: 2  blocks, cols:  0- 9, 30-39, 60-69
        // gpu 1: 1+ blocks, cols: 10-19, 40-49, 70-74 (partial)
        // gpu 2: 1  block , cols: 20-29, 50-59
        magma_setdevice( d );
        nlocal[d] = min_lblocks*nb;
        if ( d < last_dev ) {
            nlocal[d] += nb;
        }
        else if ( d == last_dev ) {
            nlocal[d] += (n % nb);
        }
        
        ldwork = nlocal[d]*ldda  // dA
               + nb*m            // dT
               + nb*ldda         // dV
               + nb*lddwork;     // dW
        if ( MAGMA_SUCCESS != magma_cmalloc( &dA[d], ldwork )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            goto CLEANUP;
        }
        dT[d] = dA[d] + nlocal[d]*ldda;
        dV[d] = dT[d] + nb*m;
        dW[d] = dV[d] + nb*ldda;
        
        magma_queue_create( &stream[d] );
    }
    
    trace_init( 1, ngpu, 1, stream );
    
    // first kk columns are handled by blocked method.
    // ki is start of 2nd-to-last block
    if ((nb > 1) && (nb < k)) {
        ki = (k - nb - 1) / nb * nb;
        kk = min(k, ki + nb);
    } else {
        ki = 0;
        kk = 0;
    }

    // Allocate CPU work space
    // n*nb for cungqr workspace
    lwork = n * nb;
    magma_cmalloc_cpu( &work, lwork );
    if (work == NULL) {
        *info = MAGMA_ERR_HOST_ALLOC;
        goto CLEANUP;
    }

    // Use unblocked code for the last or only block.
    if (kk < n) {
        trace_cpu_start( 0, "ungqr", "ungqr last block" );
        m_kk = m - kk;
        n_kk = n - kk;
        k_kk = k - kk;
        dpanel =  (kk / nb) % ngpu;
        di     = ((kk / nb) / ngpu) * nb;
        magma_setdevice( dpanel );
        
        lapackf77_cungqr( &m_kk, &n_kk, &k_kk,
                          A(kk, kk), &lda,
                          &tau[kk], work, &lwork, &iinfo );

        magma_csetmatrix( m_kk, n_kk,
                          A(kk, kk),  lda,
                          dA(dpanel, kk, di), ldda );
        
        // Set A(1:kk,kk+1:n) to zero.
        magmablas_claset( MagmaUpperLower, kk, n - kk, dA(dpanel, 0, di), ldda );
        trace_cpu_end( 0 );
    }

    if (kk > 0) {
        // Use blocked code
        // send T to all GPUs
        for( int d = 0; d < ngpu; ++d ) {
            magma_setdevice( d );
            trace_gpu_start( d, 0, "set", "set T" );
            magma_csetmatrix_async( nb, min(m,n), T, nb, dT[d], nb, stream[d] );
            trace_gpu_end( d, 0 );
        }
        
        // stream: set Aii (V) --> laset --> laset --> larfb --> [next]
        // CPU has no computation
        for( i = ki; i >= 0; i -= nb ) {
            ib = min(nb, k - i);
            mi = m - i;
            dpanel =  (i / nb) % ngpu;
            di     = ((i / nb) / ngpu) * nb;

            // Send current panel to the GPUs
            lapackf77_claset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda );
            for( int d = 0; d < ngpu; ++d ) {
                magma_setdevice( d );
                trace_gpu_start( d, 0, "set", "set V" );
                magma_csetmatrix_async( mi, ib,
                                        A(i, i), lda,
                                        dV[d],   ldda, stream[d] );
                trace_gpu_end( d, 0 );
            }
            
            // set panel to identity
            magma_setdevice( dpanel );
            magmablasSetKernelStream( stream[dpanel] );
            trace_gpu_start( dpanel, 0, "laset", "laset" );
            magmablas_claset( MagmaUpperLower, i, ib, dA(dpanel, 0, di), ldda );
            magmablas_claset_identity( mi, ib, dA(dpanel, i, di), ldda );
            trace_gpu_end( dpanel, 0 );
            
            if (i < n) {
                // Apply H to A(i:m,i:n) from the left
                for( int d = 0; d < ngpu; ++d ) {
                    magma_setdevice( d );
                    magmablasSetKernelStream( stream[d] );
                    magma_indices_1D_bcyclic( nb, ngpu, d, i, n, &di, &dn );
                    trace_gpu_start( d, 0, "larfb", "larfb" );
                    magma_clarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise,
                                      mi, dn-di, ib,
                                      dV[d],        ldda, dT(d,0,i), nb,
                                      dA(d, i, di), ldda, dW[d], lddwork );
                    trace_gpu_end( d, 0 );
                }
            }
        }
    }
    
    // copy result back to CPU
    trace_cpu_start( 0, "get", "get A" );
    magma_cgetmatrix_1D_col_bcyclic( m, n, dA, ldda, A, lda, ngpu, nb );
    trace_cpu_end( 0 );
    
    #ifdef TRACING
    char name[80];
    snprintf( name, sizeof(name), "cungqr-n%d-ngpu%d.svg", m, ngpu );
    trace_finalize( name, "trace.css" );
    #endif
    
CLEANUP:
    for( int d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magmablasSetKernelStream( NULL );
        magma_free( dA[d] );
        dA[d] = NULL;
        if ( stream[d] != NULL ) {
            magma_queue_destroy( stream[d] );
        }
    }
    magma_free_cpu( work );
    magma_setdevice( doriginal );
    
    return *info;
} /* magma_cungqr */
Exemplo n.º 20
0
/**
    Purpose
    -------
    SPOTRF computes the Cholesky factorization of a real symmetric
    positive definite matrix dA.

    The factorization has the form
       dA = U**T * U,   if UPLO = MagmaUpper, or
       dA = L  * L**T,  if UPLO = MagmaLower,
    where U is an upper triangular matrix and L is lower triangular.

    This is the block version of the algorithm, calling Level 3 BLAS.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of dA is stored;
      -     = MagmaLower:  Lower triangle of dA is stored.

    @param[in]
    n       INTEGER
            The order of the matrix dA.  N >= 0.

    @param[in,out]
    dA      REAL array on the GPU, dimension (LDDA,N)
            On entry, the symmetric matrix dA.  If UPLO = MagmaUpper, the leading
            N-by-N upper triangular part of dA contains the upper
            triangular part of the matrix dA, and the strictly lower
            triangular part of dA is not referenced.  If UPLO = MagmaLower, the
            leading N-by-N lower triangular part of dA contains the lower
            triangular part of the matrix dA, and the strictly upper
            triangular part of dA is not referenced.
    \n
            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization dA = U**T * U or dA = L * L**T.

    @param[in]
    ldda     INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalescent memory accesses LDDA must be
            divisible by 16.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.

    @ingroup magma_sposv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_spotrf2_mgpu(int num_gpus, magma_uplo_t uplo, magma_int_t m, magma_int_t n,
                   magma_int_t off_i, magma_int_t off_j, magma_int_t nb,
                   float **d_lA,  magma_int_t ldda,
                   float **d_lP,  magma_int_t lddp,
                   float *A,      magma_int_t lda,   magma_int_t h,
                   magma_queue_t stream[][3], magma_event_t event[][5],
                   magma_int_t *info )
{
#define Alo(i, j)  (A +             ((j)+off_j)*lda  + (nb*(((i)/nb)%h)+off_i))
#define Aup(i, j)  (A + (nb*(((j)/nb)%h)+off_j)*lda  +               (i+off_i))

#define  dlA(id, i, j)    (d_lA[(id)] + (j)*ldda + (i))
#define  dlP(id, i, j, k) (d_lP[(id)] + (k)*nb*lddp + (j)*lddp + (i))
#define dlPT(id, i, j, k) (d_lP[(id)] + (k)*nb*lddp + (j)*nb   + (i))

    magma_int_t     j, jb, nb0, nb2, dd, d, id, j_local, j_local2, buf;
    float c_one     = MAGMA_S_ONE;
    float c_neg_one = MAGMA_S_NEG_ONE;
    float          d_one     =  1.0;
    float          d_neg_one = -1.0;
    int upper = (uplo == MagmaUpper);
    float *dlpanel;
    //magma_event_t event0[MagmaMaxGPUs], // syrk
    //            event1[MagmaMaxGPUs], // send off-diagonal
    //            event2[MagmaMaxGPUs], // send diagonal
    //            event3[MagmaMaxGPUs]; // trsm
    magma_int_t n_local[MagmaMaxGPUs], ldpanel;
    int stream0 = 0, stream1 = 1;
    #ifdef STRSM_WORK
    float *d_dinvA[MagmaMaxGPUs][2], *d_x[MagmaMaxGPUs][2]; /* used by strsm_work */
    #endif
    
    *info = 0;
    if (! upper && uplo != MagmaLower) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (!upper && num_gpus*ldda < max(1,n)) {
        *info = -4;
    } else if (upper && ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    for( d=0; d < num_gpus; d++ ) {
        /* local-n and local-ld */
        if (upper) {
            n_local[d] = ((n/nb)/num_gpus)*nb;
            if (d < (n/nb)%num_gpus)
                n_local[d] += nb;
            else if (d == (n/nb)%num_gpus)
                n_local[d] += n%nb;
        } else {
            n_local[d] = ((m/nb)/num_gpus)*nb;
            if (d < (m/nb)%num_gpus)
                n_local[d] += nb;
            else if (d == (m/nb)%num_gpus)
                n_local[d] += m%nb;
        }
        //magma_setdevice(d);
        //magma_event_create( &event0[d] );
        //magma_event_create( &event1[d] );
        //magma_event_create( &event2[d] );
        //magma_event_create( &event3[d] );
    }
    magma_setdevice(0);

    /* == initialize the trace */
    trace_init( 1, num_gpus, 3, (magma_queue_t*)stream );

    /* Use blocked code. */
    if (upper) {
        /* ---------------------------------------------- */
        /* Upper-triangular case                          */
        /* > Compute the Cholesky factorization A = U'*U. */
        /* ---------------------------------------------- */
        
#if defined(PRECISION_d) && defined(STRSM_WORK)
        /* invert the diagonals
         * Allocate device memory for the inversed diagonal blocks, size=m*NB
         */
        for( d=0; d < num_gpus; d++ ) {
            magma_setdevice(d);
            for( j=0; j < 2; j++ ) {
                magma_smalloc( &d_dinvA[d][j], nb*nb );
                magma_smalloc( &d_x[d][j],      n*nb );
                cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(float));
                cudaMemset(d_x[d][j],     0,  n*nb*sizeof(float));
            }
        }
        magma_setdevice(0);
#endif
        
        for (j=0; j < m; j += nb) {
            /* Set the GPU number that holds the current panel */
            id  = (j/nb)%num_gpus;
            buf = (j/nb)%num_gpus;
            
            /* Set the local index where the current panel is */
            j_local = j/(nb*num_gpus);
            jb = min(nb, (m-j));
            
            if ( j > 0 ) {
                /* needed on pluto... */
                magma_setdevice(id);
                magma_queue_sync( stream[id][stream0] ); // wait for the column on CPU

                /* broadcast off-diagonal column to all gpus */
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd < num_gpus; dd++ ) {
                    if ( d != id ) {
                        magma_setdevice(d);
                
                        /* wait for it on CPU */
                        magma_queue_wait_event( stream[d][stream0], event[id][1] );
                
                        /* send it to GPU */
                        trace_gpu_start( d, stream0, "comm", "rows to GPUs" );
                        magma_ssetmatrix_async( j, jb,
                                                Aup(0,j),        lda,
                                                dlP(d,jb,0,buf), lddp,
                                                stream[d][stream0] );
                        trace_gpu_end( d, stream0 );
                        magma_event_record( event[d][1], stream[d][stream0] );
                    }
                    d = (d+1)%num_gpus;
                }
            }
            
            /* Update the current diagonal block */
            magma_setdevice(id);
            if ( j > 0 ) {
                magmablasSetKernelStream(stream[id][stream1]);
                trace_gpu_start( id, stream1, "syrk", "syrk" );
                magma_ssyrk(MagmaUpper, MagmaTrans, jb, j,
                            d_neg_one, dlA(id, 0, nb*j_local), ldda,
                            d_one,     dlA(id, j, nb*j_local), ldda);
                trace_gpu_end( id, stream1 );
                magma_event_record( event[id][0], stream[id][stream1] );
            }

            /* send the diagonal to cpu */
            magma_queue_wait_event( stream[id][stream0], event[id][0] ); // wait for syrk
            trace_gpu_start( id, stream0, "comm", "D to CPU" );
            magma_sgetmatrix_async( jb, jb,
                                    dlA(id, j, nb*j_local), ldda,
                                    Aup(j,j),               lda,
                                    stream[id][stream0] );
            trace_gpu_end( id, stream0 );

            if ( j > 0 ) {
                /* Compute the local block column of the panel. */
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd < num_gpus; dd++ ) {
                    j_local2 = j_local+1;
                    if ( d > id ) j_local2 --;
                    nb0 = nb*j_local2;
                
                    if ( n_local[d] > nb0 ) {
                        /* wait for the off-diagonal */
                        if ( d != id ) {
                            //magma_queue_sync( stream[id][3] );
                            dlpanel = dlP(d, jb, 0, buf);
                            ldpanel = lddp;
                
                            /* wait for the offdiagonal column */
                            magma_queue_wait_event( stream[d][stream1], event[d][1] );
                        } else {
                            dlpanel = dlA(d, 0, nb*j_local);
                            ldpanel = ldda;
                        }
                        
                        /* update the panel */
                        magma_setdevice(d);
                        magmablasSetKernelStream(stream[d][stream1]);
                        trace_gpu_start( d, stream1, "gemm", "gemm" );
                        magma_sgemm(MagmaTrans, MagmaNoTrans,
                                    jb, n_local[d]-nb0, j,
                                    c_neg_one, dlpanel,        ldpanel,
                                               dlA(d, 0, nb0), ldda,
                                    c_one,     dlA(d, j, nb0), ldda);
                        trace_gpu_end( d, stream1 );
                    }
                    d = (d+1)%num_gpus;
                }
            }
            
            /* factor the diagonal */
            magma_setdevice(id);
            magma_queue_sync( stream[id][stream0] ); // wait for the diagonal
            trace_cpu_start( 0, "getrf", "getrf" );
            lapackf77_spotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info);
            trace_cpu_end( 0 );
            if (*info != 0) {
                *info = *info + j;
                break;
            }

            /* send the diagonal to gpus */
            if ( (j+jb) < n) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd < num_gpus; dd++ ) {
                    magma_setdevice(d);
                    if ( d == id ) {
                        dlpanel = dlA(d, j, nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlP(d, 0, 0, buf);
                        ldpanel = lddp;
                    }
                    
                    trace_gpu_start( d, stream0, "comm", "D to GPUs" );
                    magma_ssetmatrix_async( jb, jb,
                                            Aup(j,j), lda,
                                            dlpanel,  ldpanel,
                                            stream[d][stream0] );
                    trace_gpu_end( d, stream0 );
                    magma_event_record( event[d][2], stream[d][stream0] );
                    d = (d+1)%num_gpus;
                }
            } else {
                magma_setdevice(id);
                trace_gpu_start( id, stream0, "comm", "D to GPUs" );
                magma_ssetmatrix_async( jb, jb,
                                        Aup(j,j),               lda,
                                        dlA(id, j, nb*j_local), ldda,
                                        stream[id][stream0] );
                trace_gpu_end( id, stream0 );
            }
            
            /* panel-factorize the off-diagonal */
            if ( (j+jb) < n) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd < num_gpus; dd++ ) {
                    /* next column */
                    j_local2 = j_local+1;
                    if ( d > id ) j_local2--;
                    if ( d == id ) {
                        dlpanel = dlA(d, j, nb*j_local);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlP(d, 0, 0, buf);
                        ldpanel = lddp;
                    }
                    nb2 = n_local[d]-nb*j_local2;
                    nb0 = min(nb, nb2 );
                    
                    magma_setdevice(d);
                    magmablasSetKernelStream(stream[d][stream1]);
                    magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for the diagonal
                    if ( j+jb < m && d == (j/nb+1)%num_gpus ) {
                        /* owns the next column, look-ahead the column */
                        trace_gpu_start( d, stream1, "trsm", "trsm" );
#if defined(PRECISION_d) && defined(STRSM_WORK)
                        magmablas_strsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                                              jb, nb0, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, j, nb*j_local2), ldda,
                                              d_dinvA[d][0], d_x[d][0] );
                        /*nb2 = n_local[d] - j_local2*nb;
                        magmablas_strsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                                              jb, nb2, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, j, nb*j_local2), ldda,
                                              d_dinvA[d], d_x[d] ); */
#else
                        /*nb2 = n_local[d] - j_local2*nb;
                        magma_strsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel,                ldda,
                                     dlA(d, j, nb*j_local2), ldda);
                        */
                        magma_strsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                                     jb, nb0, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, j, nb*j_local2), ldda);
#endif
                        trace_gpu_end( d, stream1 );
                        magma_event_record( event[d][3], stream[d][stream1] );
                        
                        /* send the column to cpu */
                        if ( j+jb < m ) {
                            trace_gpu_start( d, stream0, "comm", "rows to CPU" );
                            magma_queue_wait_event( stream[d][stream0], event[d][3] ); // wait for lookahead
                            magma_sgetmatrix_async( (j+jb), nb0,
                                                    dlA(d, 0, nb*j_local2), ldda,
                                                    Aup(0,j+jb),            lda,
                                                    stream[d][stream0] );
                            trace_gpu_end( d, stream0 );
                            magma_event_record( event[d][1], stream[d][stream0] );
                        }
                        
                        /* update the remaining blocks */
                        nb2 = nb2 - nb0;
#if defined(PRECISION_d) && defined(STRSM_WORK)
                        magmablas_strsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                                              jb, nb2, c_one,
                                              dlpanel,                    ldpanel,
                                              dlA(d, j, nb*j_local2+nb0), ldda,
                                              d_dinvA[d][1], d_x[d][1] );
#else
                        magma_strsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel,                    ldpanel,
                                     dlA(d, j, nb*j_local2+nb0), ldda);
#endif
                    } else if ( nb2 > 0 ) {
                        /* update the entire trailing matrix */
                        trace_gpu_start( d, stream1, "trsm", "trsm" );
#if defined(PRECISION_d) && defined(STRSM_WORK)
                        magmablas_strsm_work( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                                              jb, nb2, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, j, nb*j_local2), ldda,
                                    d_dinvA[d][1], d_x[d][1] );
#else
                        magma_strsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                                     jb, nb2, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, j, nb*j_local2), ldda);
#endif
                        trace_gpu_end( d, stream1 );
                    }
                    d = (d+1)%num_gpus;
                }
            } /* end of strsm */
        } /* end of for j=1, .., n */
    } else {
        /* -------------------------------------------- */
        /* Lower-triangular case                        */
        /* Compute the Cholesky factorization A = L*L'. */
        /* -------------------------------------------- */
#if defined(PRECISION_d) && defined(STRSM_WORK)
        /*
         * Allocate device memory for the inversed diagonal blocks, size=N*BLOCK_SIZE
         */
        for( d=0; d < num_gpus; d++ ) {
            magma_setdevice(d);
            for( j=0; j < 2; j++ ) {
                magma_smalloc( &d_dinvA[d][j], nb*nb );
                magma_smalloc( &d_x[d][j],     nb*m  );
                cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(float));
                cudaMemset(d_x[d][j],     0, nb* m*sizeof(float));
            }
        }
        magma_setdevice(0);
#endif

        for (j=0; j < n; j += nb) {
            /* Set the GPU number that holds the current panel */
            id  = (j/nb)%num_gpus;
            buf = (j/nb)%num_gpus;
            
            /* Set the local index where the current panel is */
            j_local = j/(nb*num_gpus);
            jb = min(nb, (n-j));
            
            if ( j > 0 ) {
                /* needed on pluto... */
                magma_setdevice(id);
                magma_queue_sync( stream[id][stream0] ); // wait for the column on CPU

                /* broadcast offdiagonal row to all gpus */
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd < num_gpus; dd++ ) {
                    if ( d != id ) {
                        magma_setdevice(d);
                        /* wait for it on CPU */
                        magma_queue_wait_event( stream[d][stream0], event[id][1] );
            
                        /* send it to GPU */
                        magma_ssetmatrix_async( jb, j,
                                                Alo(j,0),         lda,
                                                dlPT(d,0,jb,buf), nb,
                                                stream[d][stream0] );
                        magma_event_record( event[d][1], stream[d][stream0] );
                    }
                    d = (d+1)%num_gpus;
                }
            }

            /* Update the current diagonal block */
            magma_setdevice(id);
            if ( j > 0 ) {
                magmablasSetKernelStream(stream[id][stream1]);
                magma_ssyrk(MagmaLower, MagmaNoTrans, jb, j,
                            d_neg_one, dlA(id, nb*j_local, 0), ldda,
                            d_one,     dlA(id, nb*j_local, j), ldda);
                magma_event_record( event[id][0], stream[id][stream1] );
            }
            
            /* send the diagonal to cpu */
            magma_queue_wait_event( stream[id][stream0], event[id][0] ); // wait for syrk
            magma_sgetmatrix_async( jb, jb,
                                    dlA(id, nb*j_local, j), ldda,
                                    Alo(j,j),               lda,
                                    stream[id][stream0] );

            /* update the offdiagonal blocks */
            if ( j > 0 ) {
                /* compute the block-rows of the panel */
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd < num_gpus; dd++ ) {
                    j_local2 = j_local+1;
                    if ( d > id ) j_local2 --;
                    nb0 = nb*j_local2;
            
                    if ( nb0 < n_local[d] ) {
                        if ( d != id ) {
                            dlpanel = dlPT(d, 0, jb, buf);
                            ldpanel = nb;
            
                            /* wait for offdiagonal row */
                            magma_queue_wait_event( stream[d][stream1], event[d][1] );
                        } else {
                            dlpanel = dlA(d, nb*j_local, 0);
                            ldpanel = ldda;
                        }
            
                        magma_setdevice(d);
                        magmablasSetKernelStream(stream[d][stream1]);
                        magma_sgemm( MagmaNoTrans, MagmaTrans,
                                     n_local[d]-nb0, jb, j,
                                     c_neg_one, dlA(d, nb0, 0), ldda,
                                                dlpanel,        ldpanel,
                                     c_one,     dlA(d, nb0, j), ldda);
                    }
                    d = (d+1)%num_gpus;
                }
            }

            /* factor the diagonal */
            magma_setdevice(id);
            magma_queue_sync( stream[id][stream0] );
            lapackf77_spotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info);
            if (*info != 0) {
                *info = *info + j;
                break;
            }

            /* send the diagonal to gpus */
            if ( (j+jb) < m ) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd < num_gpus; dd++ ) {
                    magma_setdevice(d);
                    if ( d == id ) {
                        dlpanel = dlA(d, nb*j_local, j);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlPT(d, 0, 0, buf);
                        ldpanel = nb;
                    }
                    magma_ssetmatrix_async( jb, jb,
                                            Alo(j,j), lda,
                                            dlpanel,  ldpanel,
                                            stream[d][stream0] );
                    magma_event_record( event[d][2], stream[d][stream0] );
                    d = (d+1)%num_gpus;
                }
            } else {
                magma_setdevice(id);
                magma_ssetmatrix_async( jb, jb,
                                        Alo(j,j),               lda,
                                        dlA(id, nb*j_local, j), ldda,
                                        stream[id][stream0] );
            }

            /* factorize off-diagonal blocks */
            if ( (j+jb) < m ) {
                d = (j/nb+1)%num_gpus;
                for( dd=0; dd < num_gpus; dd++ ) {
                    /* next column */
                    j_local2 = j_local+1;
                    if ( d > id ) j_local2--;
                    if ( d == id ) {
                        dlpanel = dlA(d, nb*j_local, j);
                        ldpanel = ldda;
                    } else {
                        dlpanel = dlPT(d, 0, 0, buf);
                        ldpanel = nb;
                    }
                    nb2 = n_local[d] - j_local2*nb;
                    nb0 = min(nb, nb2 );
            
                    magma_setdevice(d);
                    magmablasSetKernelStream(stream[d][stream1]);
                    magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for the diagonal
                    if ( j+jb < n && d == (j/nb+1)%num_gpus ) {
                        /* owns the next column, look-ahead the column */
#if defined(PRECISION_d) && defined(STRSM_WORK)
                        magmablas_strsm_work( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit,
                                              nb0, jb, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, nb*j_local2, j), ldda,
                                              d_dinvA[d][0], d_x[d][0]);
#else
                        magma_strsm( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit,
                                     nb0, jb, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, nb*j_local2, j), ldda);
#endif
                        magma_event_record( event[d][3], stream[d][stream1] );

                        /* send the column to cpu */
                        if ( j+jb < n ) {
                            magma_queue_wait_event( stream[d][stream0], event[d][3] ); // wait for lookahead
                            magma_sgetmatrix_async( nb0, j+jb,
                                                    dlA(d, nb*j_local2, 0), ldda,
                                                    Alo(j+jb,0),            lda,
                                                    stream[d][stream0] );
                            magma_event_record( event[d][1], stream[d][stream0] );
                        }

                        /* update the remaining blocks */
                        nb2 = nb2 - nb0;
#if defined(PRECISION_d) && defined(STRSM_WORK)
                        magmablas_strsm_work( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit,
                                              nb2, jb, c_one,
                                              dlpanel,                    ldpanel,
                                              dlA(d, nb*j_local2+nb0, j), ldda,
                                              d_dinvA[d][1], d_x[d][1] );
#else
                        magma_strsm( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit,
                                     nb2, jb, c_one,
                                     dlpanel,                    ldpanel,
                                     dlA(d, nb*j_local2+nb0, j), ldda);
#endif
                    } else if ( nb2 > 0 ) {
                        /* update the entire trailing matrix */
#if defined(PRECISION_d) && defined(STRSM_WORK)
                        magmablas_strsm_work( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit,
                                              nb2, jb, c_one,
                                              dlpanel,                ldpanel,
                                              dlA(d, nb*j_local2, j), ldda,
                                              d_dinvA[d][1], d_x[d][1] );
#else
                        magma_strsm( MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit,
                                     nb2, jb, c_one,
                                     dlpanel,                ldpanel,
                                     dlA(d, nb*j_local2, j), ldda);
#endif
                    }
                    d = (d+1)%num_gpus;
                }
            }
        }
    } /* end of else not upper */

    /* == finalize the trace == */
    trace_finalize( "spotrf.svg", "trace.css" );

    /* clean up */
    for( d=0; d < num_gpus; d++ ) {
        magma_setdevice(d);
        magma_queue_sync( stream[d][0] );
        magma_queue_sync( stream[d][1] );
        magmablasSetKernelStream(NULL);

        //magma_event_destroy( event0[d] );
        //magma_event_destroy( event1[d] );
        //magma_event_destroy( event2[d] );
        //magma_event_destroy( event3[d] );
    }
    magma_setdevice(0);

    return *info;
} /* magma_spotrf_mgpu */
Exemplo n.º 21
0
extern "C" double
magma_zlatrd_mgpu(magma_int_t num_gpus, char uplo,
                  magma_int_t n0, magma_int_t n, magma_int_t nb, magma_int_t nb0,
                  magmaDoubleComplex *a,  magma_int_t lda,
                  double *e, magmaDoubleComplex *tau,
                  magmaDoubleComplex *w,   magma_int_t ldw,
                  magmaDoubleComplex **da, magma_int_t ldda, magma_int_t offset,
                  magmaDoubleComplex **dw, magma_int_t lddw,
                  magmaDoubleComplex *dwork[MagmaMaxGPUs], magma_int_t ldwork,
                  magma_int_t k,
                  magmaDoubleComplex  *dx[MagmaMaxGPUs], magmaDoubleComplex *dy[MagmaMaxGPUs],
                  magmaDoubleComplex *work,
                  magma_queue_t stream[][10],
                  double *times)
{
/*  -- MAGMA (version 1.4.1) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       December 2013

    Purpose
    =======
    ZLATRD reduces NB rows and columns of a complex Hermitian matrix A to
    Hermitian tridiagonal form by an orthogonal similarity
    transformation Q' * A * Q, and returns the matrices V and W which are
    needed to apply the transformation to the unreduced part of A.

    If UPLO = 'U', ZLATRD reduces the last NB rows and columns of a
    matrix, of which the upper triangle is supplied;
    if UPLO = 'L', ZLATRD reduces the first NB rows and columns of a
    matrix, of which the lower triangle is supplied.

    This is an auxiliary routine called by ZHETRD.

    Arguments
    =========
    UPLO    (input) CHARACTER*1
            Specifies whether the upper or lower triangular part of the
            Hermitian matrix A is stored:
            = 'U': Upper triangular
            = 'L': Lower triangular

    N       (input) INTEGER
            The order of the matrix A.

    NB      (input) INTEGER
            The number of rows and columns to be reduced.

    A       (input/output) COMPLEX_16 array, dimension (LDA,N)
            On entry, the Hermitian matrix A.  If UPLO = 'U', the leading
            n-by-n upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = 'L', the
            leading n-by-n lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
            On exit:
            if UPLO = 'U', the last NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements above the diagonal
              with the array TAU, represent the orthogonal matrix Q as a
              product of elementary reflectors;
            if UPLO = 'L', the first NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements below the diagonal
              with the array TAU, represent the  orthogonal matrix Q as a
              product of elementary reflectors.
            See Further Details.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= (1,N).

    E       (output) COMPLEX_16 array, dimension (N-1)
            If UPLO = 'U', E(n-nb:n-1) contains the superdiagonal
            elements of the last NB columns of the reduced matrix;
            if UPLO = 'L', E(1:nb) contains the subdiagonal elements of
            the first NB columns of the reduced matrix.

    TAU     (output) COMPLEX_16 array, dimension (N-1)
            The scalar factors of the elementary reflectors, stored in
            TAU(n-nb:n-1) if UPLO = 'U', and in TAU(1:nb) if UPLO = 'L'.
            See Further Details.

    W       (output) COMPLEX_16 array, dimension (LDW,NB)
            The n-by-nb matrix W required to update the unreduced part
            of A.

    LDW     (input) INTEGER
            The leading dimension of the array W. LDW >= max(1,N).

    Further Details
    ===============
    If UPLO = 'U', the matrix Q is represented as a product of elementary
    reflectors

       Q = H(n) H(n-1) . . . H(n-nb+1).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i),
    and tau in TAU(i-1).

    If UPLO = 'L', the matrix Q is represented as a product of elementary
    reflectors

       Q = H(1) H(2) . . . H(nb).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i),
    and tau in TAU(i).

    The elements of the vectors v together form the n-by-nb matrix V
    which is needed, with W, to apply the transformation to the unreduced
    part of the matrix, using a Hermitian rank-2k update of the form:
    A := A - V*W' - W*V'.

    The contents of A on exit are illustrated by the following examples
    with n = 5 and nb = 2:

    if UPLO = 'U':                       if UPLO = 'L':

      (  a   a   a   v4  v5 )              (  d                  )
      (      a   a   v4  v5 )              (  1   d              )
      (          a   1   v5 )              (  v1  1   a          )
      (              d   1  )              (  v1  v2  a   a      )
      (                  d  )              (  v1  v2  a   a   a  )

    where d denotes a diagonal element of the reduced matrix, a denotes
    an element of the original matrix that is unchanged, and vi denotes
    an element of the vector defining H(i).
    =====================================================================    */

    char uplo_[2]  = {uplo, 0};

    double mv_time = 0.0;
    magma_int_t i;
#ifndef MAGMABLAS_ZHEMV_MGPU
    magma_int_t loffset = nb0*((offset/nb0)/num_gpus);
#endif

    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_zero    = MAGMA_Z_ZERO;
    magmaDoubleComplex value     = MAGMA_Z_ZERO;
    magma_int_t id, idw, i_one = 1;

    //magma_int_t kk;
    magma_int_t ione = 1;

    magma_int_t i_n, i_1, iw;

    magmaDoubleComplex alpha;

    magmaDoubleComplex *dx2[MagmaMaxGPUs];
    magmaDoubleComplex *f = (magmaDoubleComplex *)malloc(n*sizeof(magmaDoubleComplex ));

    if (n <= 0) {
        return 0;
    }

//#define PROFILE_SYMV
#ifdef PROFILE_SYMV
    magma_event_t start, stop;
    float etime;
    magma_timestr_t cpu_start, cpu_end;
    magma_setdevice(0);
    magma_event_create( &start );
    magma_event_create( &stop  );
#endif

    if (lapackf77_lsame(uplo_, "U")) {
        /* Reduce last NB columns of upper triangle */
        for (i = n-1; i >= n - nb ; --i) {
            i_1 = i + 1;
            i_n = n - i - 1;
            iw = i - n + nb;
            if (i < n-1) {
                /* Update A(1:i,i) */
                magmaDoubleComplex wii = *W(i, iw+1);
                #if defined(PRECISION_z) || defined(PRECISION_c)
                    lapackf77_zlacgv(&i_one, &wii, &ldw);
                #endif
                wii = -wii;
                blasf77_zaxpy(&i_1, &wii, A(0, i+1), &i_one, A(0, i), &ione);

                wii = *A(i, i+1);
                #if defined(PRECISION_z) || defined(PRECISION_c)
                    lapackf77_zlacgv(&i_one, &wii, &ldw);
                #endif
                wii = -wii;
                blasf77_zaxpy(&i_1, &wii, W(0, iw+1), &i_one, A(0, i), &ione);
            }
            if (i > 0) {
                /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */
                alpha = *A(i-1, i);
                lapackf77_zlarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]);

                e[i-1] = MAGMA_Z_REAL( alpha );
                *A(i-1,i) = MAGMA_Z_MAKE( 1, 0 );
                for( id=0; id<num_gpus; id++ ) {
                    magma_setdevice(id);
                    dx2[id] = dW1(id, 0, iw);
                    magma_zsetvector_async( n, A(0,i), 1, dW1(id, 0, iw), 1, stream[id][0]);
#ifndef  MAGMABLAS_ZHEMV_MGPU
                    magma_zsetvector_async( i, A(0,i), 1, dx[id], 1, stream[id][0] );
#endif
                }
                magmablas_zhemv_mgpu(num_gpus, k, 'U', i, nb0, c_one, da, ldda, 0,
                                     dx2, ione, c_zero, dy, ione, dwork, ldwork,
                                     work, W(0, iw), stream );

                if (i < n-1) {
                    blasf77_zgemv(MagmaConjTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw,
                                  A(0, i), &ione, &c_zero, W(i+1, iw), &ione);
                }

                /* overlap update */
                if( i < n-1 && i-1 >= n - nb )
                {
                    magma_int_t im1_1 = i_1 - 1;
                    magma_int_t im1   = i-1;
                    /* Update A(1:i,i) */
                    #if defined(PRECISION_z) || defined(PRECISION_c)
                        magma_int_t im1_n = i_n + 1;
                        lapackf77_zlacgv(&im1_n, W(im1, iw+1), &ldw);
                    #endif
                    blasf77_zgemv("No transpose", &im1_1, &i_n, &c_neg_one, A(0, i+1), &lda,
                                  W(im1, iw+1), &ldw, &c_one, A(0, i-1), &ione);
                    #if defined(PRECISION_z) || defined(PRECISION_c)
                        lapackf77_zlacgv(&im1_n, W(im1, iw+1), &ldw);
                        lapackf77_zlacgv(&im1_n, A(im1, i +1), &lda);
                    #endif
                    blasf77_zgemv("No transpose", &im1_1, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                                  A(im1, i+1), &lda, &c_one, A(0, i-1), &ione);
                    #if defined(PRECISION_z) || defined(PRECISION_c)
                        lapackf77_zlacgv(&im1_n, A(im1, i+1), &lda);
                    #endif
                }

                // 3. Here is where we need it // TODO find the right place
                magmablas_zhemv_sync(num_gpus, k, i, work, W(0, iw), stream );

                if (i < n-1) {
                    blasf77_zgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda,
                                  W(i+1, iw), &ione, &c_one, W(0, iw), &ione);

                    blasf77_zgemv(MagmaConjTransStr, &i, &i_n, &c_one, A(0, i+1), &lda,
                                  A(0, i), &ione, &c_zero, W(i+1, iw), &ione);

                    blasf77_zgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                                  W(i+1, iw), &ione, &c_one, W(0, iw), &ione);
                }

                blasf77_zscal(&i, &tau[i - 1], W(0, iw), &ione);

                #if defined(PRECISION_z) || defined(PRECISION_c)
                cblas_zdotc_sub( i, W(0,iw), ione, A(0,i), ione, &value );
                #else
                value = cblas_zdotc( i, W(0,iw), ione, A(0,i), ione );
                #endif
                alpha = tau[i - 1] * -.5f * value;
                blasf77_zaxpy(&i, &alpha, A(0, i), &ione, W(0, iw), &ione);

                for( id=0; id<num_gpus; id++ ) {
                    magma_setdevice(id);
                    if( k > 1 ) {
                        magma_zsetvector_async( n, W(0,iw), 1, dW(id, 0, iw), 1, stream[id][1] );
                    } else {
                        magma_zsetvector_async( n, W(0,iw), 1, dW(id, 0, iw), 1, stream[id][0] );
                    }
                }
            }
        }
    } else {
        /*  Reduce first NB columns of lower triangle */
        for (i = 0; i < nb; ++i) {
            /* Update A(i:n,i) */
            i_n = n - i;
            idw = ((offset+i)/nb)%num_gpus;
            if( i > 0 ) {
                trace_cpu_start( 0, "gemv", "gemv" );
                magmaDoubleComplex wii = *W(i, i-1);
                #if defined(PRECISION_z) || defined(PRECISION_c)
                    lapackf77_zlacgv(&i_one, &wii, &ldw);
                #endif
                wii = -wii;
                blasf77_zaxpy( &i_n, &wii, A(i, i-1), &ione, A(i, i), &ione);

                wii = *A(i, i-1);
                #if defined(PRECISION_z) || defined(PRECISION_c)
                    lapackf77_zlacgv(&i_one, &wii, &lda);
                #endif
                wii = -wii;
                blasf77_zaxpy( &i_n, &wii, W(i, i-1), &ione, A(i, i), &ione);
            }

            if (i < n-1) {
                /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */
                i_n = n - i - 1;
                trace_cpu_start( 0, "larfg", "larfg" );
                alpha = *A(i+1, i);
#ifdef PROFILE_SYMV
                cpu_start = get_current_time();
#endif
                lapackf77_zlarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]);
#ifdef PROFILE_SYMV
                cpu_end = get_current_time();
                times[0] += GetTimerValue(cpu_start,cpu_end)/1000.0;
#endif
                e[i] = MAGMA_Z_REAL( alpha );
                *A(i+1,i) = MAGMA_Z_MAKE( 1, 0 );
                trace_cpu_end( 0 );

                /* Compute W(i+1:n,i) */
                // 1. Send the block reflector  A(i+1:n,i) to the GPU
                //trace_gpu_start(  idw, 0, "comm", "comm1" );
#ifndef  MAGMABLAS_ZHEMV_MGPU
                magma_setdevice(idw);
                magma_zsetvector( i_n, A(i+1,i), 1, dA(idw, i+1, i), 1 );
#endif
                for( id=0; id<num_gpus; id++ ) {
                    magma_setdevice(id);
                    trace_gpu_start( id, 0, "comm", "comm" );
#ifdef MAGMABLAS_ZHEMV_MGPU
                    dx2[id] = dW1(id, 0, i)-offset;
#else
                    dx2[id] = dx[id];
                    magma_zsetvector( i_n, A(i+1,i), 1, dx[id], 1 );
#endif
                    magma_zsetvector_async( n, A(0,i), 1, dW1(id, 0, i), 1, stream[id][0] );
                    trace_gpu_end( id, 0 );
                }
                /* mat-vec on multiple GPUs */
#ifdef PROFILE_SYMV
                magma_setdevice(0);
                magma_event_record(start, stream[0][0]);
#endif
                magmablas_zhemv_mgpu(num_gpus, k, 'L', i_n, nb0, c_one, da, ldda, offset+i+1,
                                       dx2, ione, c_zero, dy, ione, dwork, ldwork,
                                       work, W(i+1,i), stream );
#ifdef PROFILE_SYMV
                magma_setdevice(0);
                magma_event_record(stop, stream[0][0]);
#endif
                trace_cpu_start( 0, "gemv", "gemv" );
                blasf77_zgemv(MagmaConjTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw,
                              A(i+1, i), &ione, &c_zero, W(0, i), &ione);
                blasf77_zgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda,
                              W(0, i), &ione, &c_zero, f, &ione);
                blasf77_zgemv(MagmaConjTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda,
                              A(i+1, i), &ione, &c_zero, W(0, i), &ione);
                trace_cpu_end( 0 );

                /* overlap update */
                if( i > 0 && i+1 < n )
                {
                    magma_int_t ip1 = i+1;
                    trace_cpu_start( 0, "gemv", "gemv" );
                    #if defined(PRECISION_z) || defined(PRECISION_c)
                        lapackf77_zlacgv(&i, W(ip1, 0), &ldw);
                    #endif
                    blasf77_zgemv("No transpose", &i_n, &i, &c_neg_one, A(ip1, 0), &lda,
                                  W(ip1, 0), &ldw, &c_one, A(ip1, ip1), &ione);
                    #if defined(PRECISION_z) || defined(PRECISION_c)
                        lapackf77_zlacgv(&i, W(ip1, 0), &ldw);
                        lapackf77_zlacgv(&i, A(ip1 ,0), &lda);
                    #endif
                    blasf77_zgemv("No transpose", &i_n, &i, &c_neg_one, W(ip1, 0), &ldw,
                                  A(ip1, 0), &lda, &c_one, A(ip1, ip1), &ione);
                    #if defined(PRECISION_z) || defined(PRECISION_c)
                        lapackf77_zlacgv(&i, A(ip1, 0), &lda);
                    #endif
                    trace_cpu_end( 0 );
                }

                /* synchronize */
                magmablas_zhemv_sync(num_gpus, k, i_n, work, W(i+1,i), stream );
#ifdef PROFILE_SYMV
                cudaEventElapsedTime(&etime, start, stop);
                mv_time += (etime/1000.0);
                times[1+(i_n/(n0/10))] += (etime/1000.0);
#endif
                trace_cpu_start( 0, "axpy", "axpy" );
                if (i!=0)
                    blasf77_zaxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione);

                blasf77_zgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw,
                              W(0, i), &ione, &c_one, W(i+1, i), &ione);
                blasf77_zscal(&i_n, &tau[i], W(i+1,i), &ione);

                #if defined(PRECISION_z) || defined(PRECISION_c)
                    cblas_zdotc_sub( i_n, W(i+1,i), ione, A(i+1,i), ione, &value );
                #else
                    value = cblas_zdotc( i_n, W(i+1,i), ione, A(i+1,i), ione );
                #endif
                alpha = tau[i]* -.5f * value;
                blasf77_zaxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione);
                trace_cpu_end( 0 );
                for( id=0; id<num_gpus; id++ ) {
                    magma_setdevice(id);
                    if( k > 1 ) {
                        magma_zsetvector_async( n, W(0,i), 1, dW(id, 0, i), 1, stream[id][1] );
                    } else {
                        magma_zsetvector_async( n, W(0,i), 1, dW(id, 0, i), 1, stream[id][0] );
                    }
                }
            }
        }
    }

#ifdef PROFILE_SYMV
    magma_setdevice(0);
    magma_event_destory( start );
    magma_event_destory( stop  );
#endif
    for( id=0; id<num_gpus; id++ ) {
        magma_setdevice(id);
        if( k > 1) magma_queue_sync(stream[id][1]);
    }
    free(f);

    return mv_time;
} /* zlatrd_ */