Exemple #1
0
/* Task execution code */
void SCHED_zgetrf(Quark* quark)
{
  int M;
  int N;
  cuDoubleComplex *A;
  int LDA;
  int *IPIV;

  int *iinfo;

  int info;

  quark_unpack_args_5(quark, M, N, A, LDA, IPIV);

  lapackf77_zgetrf(&M, &N, A, &LDA, IPIV, &info); 

  if (info > 0) {
    iinfo[1] = iinfo[0] + info;
  }

}
Exemple #2
0
/**
    Purpose
    -------
    ZGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
       A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.
    This version assumes the computation runs through the NULL stream
    and therefore is not overlapping computation with communication.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    dA      COMPLEX_16 array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array A.  LDDA >= max(1,M).

    @param[out]
    ipiv    INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
      -     > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    @ingroup magma_zgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zgetrf2_gpu(magma_int_t m, magma_int_t n,
                  magmaDoubleComplex *dA, magma_int_t ldda,
                  magma_int_t *ipiv, magma_int_t *info)
{
    #define dAT(i,j) (dAT + (i)*nb*lddat + (j)*nb)

    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;

    magma_int_t iinfo, nb;
    magma_int_t maxm, maxn, mindim;
    magma_int_t i, rows, cols, s, lddat, lddwork;
    magmaDoubleComplex *dAT, *dAP, *work;

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (ldda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    nb     = magma_get_zgetrf_nb(m);
    s      = mindim / nb;

    if (nb <= 1 || nb >= min(m,n)) {
        /* Use CPU code. */
        magma_zmalloc_cpu( &work, m * n );
        if ( work == NULL ) {
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        magma_zgetmatrix( m, n, dA, ldda, work, m );
        lapackf77_zgetrf(&m, &n, work, &m, ipiv, info);
        magma_zsetmatrix( m, n, work, m, dA, ldda );
        magma_free_cpu(work);
    }
    else {
        /* Use hybrid blocked code. */
        maxm = ((m + 31)/32)*32;
        maxn = ((n + 31)/32)*32;

        lddat   = maxn;
        lddwork = maxm;

        dAT = dA;

        if (MAGMA_SUCCESS != magma_zmalloc( &dAP, nb*maxm )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }

        if ( m == n ) {
            lddat = ldda;
            magmablas_ztranspose_inplace( m, dAT, ldda );
        }
        else {
            if (MAGMA_SUCCESS != magma_zmalloc( &dAT, maxm*maxn )) {
                magma_free( dAP );
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }
            magmablas_ztranspose( m, n, dA, ldda, dAT, lddat );
        }

        if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, maxm*nb )) {
            magma_free( dAP );
            if ( ! (m == n))
                magma_free( dAT );
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }

        for( i=0; i < s; i++ ) {
            // download i-th panel
            cols = maxm - i*nb;
            //magmablas_ztranspose( nb, cols, dAT(i,i), lddat, dAP, cols );
            magmablas_ztranspose( nb, m-i*nb, dAT(i,i), lddat, dAP, cols );
            magma_zgetmatrix( m-i*nb, nb, dAP, cols, work, lddwork );

            // make sure that gpu queue is empty
            magma_device_sync();

            if ( i > 0 ) {
                magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n - (i+1)*nb, nb,
                             c_one, dAT(i-1,i-1), lddat,
                                    dAT(i-1,i+1), lddat );
                magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(i+1)*nb, m-i*nb, nb,
                             c_neg_one, dAT(i-1,i+1), lddat,
                                        dAT(i,  i-1), lddat,
                             c_one,     dAT(i,  i+1), lddat );
            }

            // do the cpu part
            rows = m - i*nb;
            lapackf77_zgetrf( &rows, &nb, work, &lddwork, ipiv+i*nb, &iinfo);
            if ( (*info == 0) && (iinfo > 0) )
                *info = iinfo + i*nb;

            magmablas_zpermute_long2( n, dAT, lddat, ipiv, nb, i*nb );

            // upload i-th panel
            magma_zsetmatrix( m-i*nb, nb, work, lddwork, dAP, maxm );
            //magmablas_ztranspose( cols, nb, dAP, maxm, dAT(i,i), lddat );
            magmablas_ztranspose( m-i*nb, nb, dAP, maxm, dAT(i,i), lddat );

            // do the small non-parallel computations (next panel update)
            if ( s > (i+1) ) {
                magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             nb, nb,
                             c_one, dAT(i, i  ), lddat,
                                    dAT(i, i+1), lddat);
                magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                             nb, m-(i+1)*nb, nb,
                             c_neg_one, dAT(i,   i+1), lddat,
                                        dAT(i+1, i  ), lddat,
                             c_one,     dAT(i+1, i+1), lddat );
            }
            else {
                magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n-s*nb, nb,
                             c_one, dAT(i, i  ), lddat,
                                    dAT(i, i+1), lddat);
                magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(i+1)*nb, m-(i+1)*nb, nb,
                             c_neg_one, dAT(i,   i+1), lddat,
                                        dAT(i+1, i  ), lddat,
                             c_one,     dAT(i+1, i+1), lddat );
            }
        }

        magma_int_t nb0 = min(m - s*nb, n - s*nb);
        rows = m - s*nb;
        cols = maxm - s*nb;

        magmablas_ztranspose( nb0, rows, dAT(s,s), lddat, dAP, maxm );
        magma_zgetmatrix( rows, nb0, dAP, maxm, work, lddwork );

        // make sure that gpu queue is empty
        magma_device_sync();

        // do the cpu part
        lapackf77_zgetrf( &rows, &nb0, work, &lddwork, ipiv+s*nb, &iinfo);
        if ( (*info == 0) && (iinfo > 0) )
            *info = iinfo + s*nb;
        magmablas_zpermute_long2( n, dAT, lddat, ipiv, nb0, s*nb );

        // upload i-th panel
        magma_zsetmatrix( rows, nb0, work, lddwork, dAP, maxm );
        magmablas_ztranspose( rows, nb0, dAP, maxm, dAT(s,s), lddat );

        magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                     n-s*nb-nb0, nb0,
                     c_one, dAT(s,s),     lddat,
                            dAT(s,s)+nb0, lddat);

        if ( m == n ) {
            magmablas_ztranspose_inplace( m, dAT, lddat );
        }
        else {
            magmablas_ztranspose( n, m, dAT, lddat, dA, ldda );
            magma_free( dAT );
        }

        magma_free( dAP );
        magma_free_pinned( work );
    }
    return *info;
} /* magma_zgetrf2_gpu */
Exemple #3
0
magma_err_t
magma_zgetrf_gpu(magma_int_t m, magma_int_t n,
                 magmaDoubleComplex_ptr dA, size_t dA_offset, magma_int_t ldda,
                 magma_int_t *ipiv, magma_int_t *info,
                 magma_queue_t queue )
{
/*  -- clMAGMA (version 1.1.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date January 2014

    Purpose
    =======
    ZGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
       A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.

    Arguments
    =========

    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    LDDA     (input) INTEGER
            The leading dimension of the array A.  LDDA >= max(1,M).

    IPIV    (output) INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  if INFO = -7, internal GPU memory allocation failed.
            > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.
    =====================================================================    */

#define inAT(i,j) dAT, dAT_offset + (i)*nb*lddat + (j)*nb

    magmaDoubleComplex c_one     = MAGMA_Z_MAKE(  1.0, 0.0 );
    magmaDoubleComplex c_neg_one = MAGMA_Z_MAKE( -1.0, 0.0 );

    magma_int_t iinfo, nb;
    magma_int_t maxm, maxn, mindim;
    magma_int_t i, rows, cols, s, lddat, lddwork;
    
    magmaDoubleComplex_ptr dAT, dAP;
    magmaDoubleComplex *work;

    magma_err_t err;

    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (ldda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    if (m == 0 || n == 0)
        return MAGMA_SUCCESS;

    mindim = min(m, n);
    nb     = magma_get_zgetrf_nb(m);
    s      = mindim / nb;

    if (nb <= 1 || nb >= min(m,n))
      {
        // use CPU code
        err = magma_zmalloc_cpu(  &work, m*n );
        if ( err != MAGMA_SUCCESS ) {
          *info = MAGMA_ERR_HOST_ALLOC;
          return *info;
        }

        chk( magma_zgetmatrix( m, n, dA, dA_offset, ldda, work, 0, m, queue ));
        lapackf77_zgetrf(&m, &n, work, &m, ipiv, info);
        chk( magma_zsetmatrix( m, n, work, 0, m, dA, dA_offset, ldda, queue ));

        magma_free_cpu(work);
      }
    else
      {
        size_t dAT_offset;

        // use hybrid blocked code
        maxm = ((m + 31)/32)*32;
        maxn = ((n + 31)/32)*32;

        lddat   = maxn;
        lddwork = maxm;

        if ( MAGMA_SUCCESS != magma_zmalloc( &dAP, nb*maxm )) {
          *info = MAGMA_ERR_DEVICE_ALLOC;
          return *info;
        }

        if ((m == n) && (m % 32 == 0) && (ldda%32 == 0))
          {
            dAT = dA;
            dAT_offset = dA_offset;
            magma_ztranspose_inplace( dAT, dAT_offset, ldda, lddat, queue );
          }
        else
          {
            dAT_offset = 0;
            if ( MAGMA_SUCCESS != magma_zmalloc( &dAT, maxm*maxn )) {
              magma_free( dAP );
              *info = MAGMA_ERR_DEVICE_ALLOC;
              return *info;
            }

            magma_ztranspose2( dAT, dAT_offset, lddat, dA, dA_offset,  ldda, m, n, queue );
        }

        if ( MAGMA_SUCCESS != magma_zmalloc_cpu( &work, maxm*nb ) ) {
          magma_free( dAP );
          if (! ((m == n) && (m % 32 == 0) && (ldda%32 == 0)) )
            magma_free( dAT );

          *info = MAGMA_ERR_HOST_ALLOC;
          return *info;
        }

        for( i=0; i<s; i++ )
            {
                // download i-th panel
                cols = maxm - i*nb;
                magma_ztranspose( dAP, 0, cols, inAT(i,i), lddat, nb, cols, queue );
                magma_zgetmatrix(m-i*nb, nb, dAP, 0, cols, work, 0, lddwork, queue);

                if ( i>0 ){
                    magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                                 n - (i+1)*nb, nb,
                                 c_one, inAT(i-1,i-1), lddat,
                                 inAT(i-1,i+1), lddat, queue );
                    magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                                 n-(i+1)*nb, m-i*nb, nb,
                                 c_neg_one, inAT(i-1,i+1), lddat,
                                            inAT(i,  i-1), lddat,
                                 c_one,     inAT(i,  i+1), lddat, queue );
                }

                // do the cpu part
                rows = m - i*nb;
                lapackf77_zgetrf( &rows, &nb, work, &lddwork, ipiv+i*nb, &iinfo);
                if ( (*info == 0) && (iinfo > 0) )
                    *info = iinfo + i*nb;

                magma_zpermute_long2(n, dAT, dAT_offset, lddat, ipiv, nb, i*nb, queue );

                // upload i-th panel
                magma_zsetmatrix(m-i*nb, nb, work, 0, lddwork, dAP, 0, maxm, queue);
                magma_ztranspose(inAT(i,i), lddat, dAP, 0, maxm, cols, nb, queue );

                // do the small non-parallel computations
                if ( s > (i+1) ) {
                    magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                                 nb, nb,
                                 c_one, inAT(i, i  ), lddat,
                                 inAT(i, i+1), lddat, queue);
                    magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                                 nb, m-(i+1)*nb, nb,
                                 c_neg_one, inAT(i,   i+1), lddat,
                                            inAT(i+1, i  ), lddat,
                                 c_one,     inAT(i+1, i+1), lddat, queue );
                }
                else {
                    magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                                 n-s*nb, nb,
                                 c_one, inAT(i, i  ), lddat,
                                 inAT(i, i+1), lddat, queue);
                    magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                                 n-(i+1)*nb, m-(i+1)*nb, nb,
                                 c_neg_one, inAT(i,   i+1), lddat,
                                            inAT(i+1, i  ), lddat,
                                 c_one,     inAT(i+1, i+1), lddat, queue );
                }
            }

        magma_int_t nb0 = min(m - s*nb, n - s*nb);
        rows = m - s*nb;
        cols = maxm - s*nb;

        magma_ztranspose2( dAP, 0, maxm, inAT(s,s), lddat, nb0, rows, queue);
        magma_zgetmatrix(rows, nb0, dAP, 0, maxm, work, 0, lddwork, queue);

        // do the cpu part
        lapackf77_zgetrf( &rows, &nb0, work, &lddwork, ipiv+s*nb, &iinfo);
        if ( (*info == 0) && (iinfo > 0) )
            *info = iinfo + s*nb;
        magma_zpermute_long2(n, dAT, dAT_offset, lddat, ipiv, nb0, s*nb, queue );

        // upload i-th panel
        magma_zsetmatrix(rows, nb0, work, 0, lddwork, dAP, 0, maxm, queue);
        magma_ztranspose2( inAT(s,s), lddat, dAP, 0, maxm, rows, nb0, queue );

        magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                     n-s*nb-nb0, nb0,
                     c_one, inAT(s,s),     lddat,
                     inAT(s,s)+nb0, lddat, queue);

        if ((m == n) && (m % 32 == 0) && (ldda%32 == 0)) {
          magma_ztranspose_inplace( dAT, dAT_offset, lddat, ldda, queue );
        }
        else {
          magma_ztranspose2( dA, dA_offset, ldda, dAT, dAT_offset, lddat, n, m, queue );
          magma_free( dAT );
        }

        magma_free( dAP );
        magma_free_cpu( work );
    }

    return *info;
    /* End of MAGMA_ZGETRF_GPU */
}
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing ztrsm
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    real_Double_t   gflops, magma_perf, magma_time=0, cublas_perf, cublas_time, cpu_perf=0, cpu_time=0;
    double          magma_error, cublas_error, work[1];
    magma_int_t M, N, info;
    magma_int_t Ak;
    magma_int_t sizeA, sizeB;
    magma_int_t lda, ldb, ldda, lddb;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t *ipiv;

    magmaDoubleComplex *h_A, *h_B, *h_Bcublas, *h_Bmagma, *h_B1, *h_X1, *h_X2;
    magmaDoubleComplex *d_A, *d_B;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex c_one = MAGMA_Z_ONE;
    magmaDoubleComplex alpha = MAGMA_Z_MAKE(  0.29, -0.86 );
    magma_int_t status = 0;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    double tol = opts.tolerance * lapackf77_dlamch("E");

    printf("side = %s, uplo = %s, transA = %s, diag = %s \n",
           lapack_side_const(opts.side), lapack_uplo_const(opts.uplo),
           lapack_trans_const(opts.transA), lapack_diag_const(opts.diag) );
    printf("    M     N  MAGMA Gflop/s (ms)  CUBLAS Gflop/s (ms)   CPU Gflop/s (ms)  MAGMA error  CUBLAS error\n");
    printf("==================================================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[itest];
            N = opts.nsize[itest];
            gflops = FLOPS_ZTRSM(opts.side, M, N) / 1e9;

            if ( opts.side == MagmaLeft ) {
                lda = M;
                Ak = M;
            } else {
                lda = N;
                Ak = N;
            }
            
            ldb = M;
            
            ldda = ((lda+31)/32)*32;
            lddb = ((ldb+31)/32)*32;
            
            sizeA = lda*Ak;
            sizeB = ldb*N;
            
            TESTING_MALLOC_CPU( h_A,       magmaDoubleComplex, lda*Ak  );
            TESTING_MALLOC_CPU( h_B,       magmaDoubleComplex, ldb*N   );
            TESTING_MALLOC_CPU( h_B1,      magmaDoubleComplex, ldb*N   );
            TESTING_MALLOC_CPU( h_X1,      magmaDoubleComplex, ldb*N   );
            TESTING_MALLOC_CPU( h_X2,      magmaDoubleComplex, ldb*N   );
            TESTING_MALLOC_CPU( h_Bcublas, magmaDoubleComplex, ldb*N   );
            TESTING_MALLOC_CPU( h_Bmagma,  magmaDoubleComplex, ldb*N   );
            TESTING_MALLOC_CPU( ipiv,      magma_int_t,        Ak      );
            
            TESTING_MALLOC_DEV( d_A,       magmaDoubleComplex, ldda*Ak );
            TESTING_MALLOC_DEV( d_B,       magmaDoubleComplex, lddb*N  );
            
            /* Initialize the matrices */
            /* Factor A into LU to get well-conditioned triangular matrix.
             * Copy L to U, since L seems okay when used with non-unit diagonal
             * (i.e., from U), while U fails when used with unit diagonal. */
            lapackf77_zlarnv( &ione, ISEED, &sizeA, h_A );
            lapackf77_zgetrf( &Ak, &Ak, h_A, &lda, ipiv, &info );
            for( int j = 0; j < Ak; ++j ) {
                for( int i = 0; i < j; ++i ) {
                    *h_A(i,j) = *h_A(j,i);
                }
            }
            
            lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B );
            memcpy(h_B1, h_B, sizeB*sizeof(magmaDoubleComplex));
            
            /* =====================================================================
               Performs operation using MAGMABLAS
               =================================================================== */
            magma_zsetmatrix( Ak, Ak, h_A, lda, d_A, ldda );
            magma_zsetmatrix( M, N, h_B, ldb, d_B, lddb );
            
            magma_time = magma_sync_wtime( NULL );
            magmablas_ztrsm( opts.side, opts.uplo, opts.transA, opts.diag, 
                             M, N,
                             alpha, d_A, ldda,
                                    d_B, lddb );
            magma_time = magma_sync_wtime( NULL ) - magma_time;
            magma_perf = gflops / magma_time;
            
            magma_zgetmatrix( M, N, d_B, lddb, h_Bmagma, ldb );
            
            /* =====================================================================
               Performs operation using CUBLAS
               =================================================================== */
            magma_zsetmatrix( M, N, h_B, ldb, d_B, lddb );
            
            cublas_time = magma_sync_wtime( NULL );
            cublasZtrsm( handle, cublas_side_const(opts.side), cublas_uplo_const(opts.uplo),
                         cublas_trans_const(opts.transA), cublas_diag_const(opts.diag),
                         M, N, 
                         &alpha, d_A, ldda,
                                 d_B, lddb );
            cublas_time = magma_sync_wtime( NULL ) - cublas_time;
            cublas_perf = gflops / cublas_time;
            
            magma_zgetmatrix( M, N, d_B, lddb, h_Bcublas, ldb );
            
            /* =====================================================================
               Performs operation using CPU BLAS
               =================================================================== */
            if ( opts.lapack ) {
                cpu_time = magma_wtime();
                blasf77_ztrsm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), 
                               &M, &N,
                               &alpha, h_A, &lda,
                                       h_B, &ldb );
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
            }
            
            /* =====================================================================
               Check the result
               =================================================================== */
            // ||b - Ax|| / (||A||*||x||)
            memcpy(h_X1, h_Bmagma, sizeB*sizeof(magmaDoubleComplex));
            
            magmaDoubleComplex alpha2 = MAGMA_Z_DIV(  c_one, alpha );
            blasf77_ztrmm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), 
                            &M, &N,
                            &alpha2, h_A, &lda,
                            h_X1, &ldb );

            blasf77_zaxpy( &sizeB, &c_neg_one, h_B1, &ione, h_X1, &ione );
            double norm1 =  lapackf77_zlange( "M", &M, &N, h_X1, &ldb, work );
            double normx =  lapackf77_zlange( "M", &M, &N, h_Bmagma, &ldb, work );
            double normA =  lapackf77_zlange( "M", &Ak, &Ak, h_A, &lda, work );

            magma_error = norm1/(normx*normA);

            memcpy(h_X2, h_Bcublas, sizeB*sizeof(magmaDoubleComplex));
            blasf77_ztrmm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), 
                            &M, &N,
                            &alpha2, h_A, &lda,
                            h_X2, &ldb );

            blasf77_zaxpy( &sizeB, &c_neg_one, h_B1, &ione, h_X2, &ione );
            norm1 =  lapackf77_zlange( "M", &M, &N, h_X2, &ldb, work );
            normx =  lapackf77_zlange( "M", &M, &N, h_Bcublas, &ldb, work );
            normA =  lapackf77_zlange( "M", &Ak, &Ak, h_A, &lda, work );
            
            cublas_error = norm1/(normx*normA);
            
            if ( opts.lapack ) {
                printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)   %7.2f (%7.2f)   %8.2e     %8.2e   %s\n",
                        (int) M, (int) N,
                        magma_perf,  1000.*magma_time,
                        cublas_perf, 1000.*cublas_time,
                        cpu_perf,    1000.*cpu_time,
                        magma_error, cublas_error,
                        (magma_error < tol && cublas_error < tol? "ok" : "failed"));
                status += ! (magma_error < tol && cublas_error < tol);
            }
            else {
                printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)     ---   (  ---  )   %8.2e     %8.2e   %s\n",
                        (int) M, (int) N,
                        magma_perf,  1000.*magma_time,
                        cublas_perf, 1000.*cublas_time,
                        magma_error, cublas_error,
                        (magma_error < tol && cublas_error < tol? "ok" : "failed"));
                status += ! (magma_error < tol && cublas_error < tol);
            }
            
            TESTING_FREE_CPU( h_A  );
            TESTING_FREE_CPU( h_B  );
            TESTING_FREE_CPU( h_B1 );
            TESTING_FREE_CPU( h_X1 );
            TESTING_FREE_CPU( h_X2 );
            TESTING_FREE_CPU( h_Bcublas );
            TESTING_FREE_CPU( h_Bmagma  );
            
            TESTING_FREE_DEV( d_A );
            TESTING_FREE_DEV( d_B );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    TESTING_FINALIZE();
    return status;
}
Exemple #5
0
/**
    Purpose
    -------
    ZGETRF_m computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.  This version does not
    require work space on the GPU passed as input. GPU memory is allocated
    in the routine. The matrix may exceed the GPU memory.

    The factorization has the form
       A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.

    Note: The factorization of big panel is done calling multiple-gpu-interface.
    Pivots are applied on GPU within the big panel.

    Arguments
    ---------
    @param[in]
    ngpu    INTEGER
            Number of GPUs to use. ngpu > 0.

    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    A       COMPLEX_16 array, dimension (LDA,N)
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.
    \n
            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    @param[out]
    ipiv    INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
      -     > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    @ingroup magma_zgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zgetrf_m(
    magma_int_t ngpu,
    magma_int_t m, magma_int_t n,
    magmaDoubleComplex *A, magma_int_t lda, magma_int_t *ipiv,
    magma_int_t *info)
{
#define     A(i,j) (A      + (j)*lda + (i))
#define dAT(d,i,j) (dAT[d] + (i)*nb*ldn_local + (j)*nb)
#define dPT(d,i,j) (dPT[d] + (i)*nb*nb + (j)*nb*maxm)

    magma_timer_t time=0, time_total=0, time_alloc=0, time_set=0, time_get=0, time_comp=0;
    timer_start( time_total );
    real_Double_t flops;

    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex *dAT[MagmaMaxGPUs], *dA[MagmaMaxGPUs], *dPT[MagmaMaxGPUs];
    magma_int_t        iinfo = 0, nb, nbi, maxm, n_local[MagmaMaxGPUs], ldn_local;
    magma_int_t        N, M, NB, NBk, I, d, ngpu0 = ngpu;
    magma_int_t        ii, jj, h, offset, ib, rows;
    
    magma_queue_t stream[MagmaMaxGPUs][2];
    magma_event_t  event[MagmaMaxGPUs][2];

    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (lda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );
    
    /* initialize nb */
    nb = magma_get_zgetrf_nb(m);
    maxm = ((m  + 31)/32)*32;

    /* figure out NB */
    size_t freeMem, totalMem;
    cudaMemGetInfo( &freeMem, &totalMem );
    freeMem /= sizeof(magmaDoubleComplex);
    
    /* number of columns in the big panel */
    h = 1+(2+ngpu0);
    NB = (magma_int_t)(0.8*freeMem/maxm-h*nb);
    const char* ngr_nb_char = getenv("MAGMA_NGR_NB");
    if ( ngr_nb_char != NULL )
        NB = max( nb, min( NB, atoi(ngr_nb_char) ) );
    //NB = 5*max(nb,32);

    if ( ngpu0 > ceil((double)NB/nb) ) {
        ngpu = (int)ceil((double)NB/nb);
        h = 1+(2+ngpu);
        NB = (magma_int_t)(0.8*freeMem/maxm-h*nb);
    } else {
        ngpu = ngpu0;
    }
    if ( ngpu*NB >= n ) {
        #ifdef CHECK_ZGETRF_OOC
        printf( "      * still fit in GPU memory.\n" );
        #endif
        NB = n;
    } else {
        #ifdef CHECK_ZGETRF_OOC
        printf( "      * don't fit in GPU memory.\n" );
        #endif
        NB = ngpu*NB;
        NB = max( nb, (NB / nb) * nb); /* making sure it's devisable by nb (x64) */
    }

    #ifdef CHECK_ZGETRF_OOC
    if ( NB != n ) printf( "      * running in out-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n", n, NB, nb, (double)freeMem );
    else           printf( "      * running in in-core mode  (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n", n, NB, nb, (double)freeMem );
    #endif

    if ( (nb <= 1) || (nb >= min(m,n)) ) {
        /* Use CPU code for scalar of one tile. */
        lapackf77_zgetrf(&m, &n, A, &lda, ipiv, info);
    } else {
        /* Use hybrid blocked code. */

        /* allocate memory on GPU to store the big panel */
        timer_start( time_alloc );
        n_local[0] = (NB/nb)/ngpu;
        if ( NB%(nb*ngpu) != 0 )
            n_local[0]++;
        n_local[0] *= nb;
        ldn_local = ((n_local[0]+31)/32)*32;
    
        for( d=0; d < ngpu; d++ ) {
            magma_setdevice(d);
            if (MAGMA_SUCCESS != magma_zmalloc( &dA[d], (ldn_local+h*nb)*maxm )) {
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }
            dPT[d] = dA[d] + nb*maxm;      /* for storing the previous panel from CPU */
            dAT[d] = dA[d] + h*nb*maxm;    /* for storing the big panel               */
            magma_queue_create( &stream[d][0] );
            magma_queue_create( &stream[d][1] );
            magma_event_create( &event[d][0] );
            magma_event_create( &event[d][1] );
        }
        //magma_setdevice(0);
        timer_stop( time_alloc );
        
        for( I=0; I < n; I += NB ) {
            M = m;
            N = min( NB, n-I );       /* number of columns in this big panel             */
            //s = min( max(m-I,0), N )/nb; /* number of small block-columns in this big panel */
    
            maxm = ((M + 31)/32)*32;
            if ( ngpu0 > ceil((double)N/nb) ) {
                ngpu = (int)ceil((double)N/nb);
            } else {
                ngpu = ngpu0;
            }
    
            for( d=0; d < ngpu; d++ ) {
                n_local[d] = ((N/nb)/ngpu)*nb;
                if (d < (N/nb)%ngpu)
                    n_local[d] += nb;
                else if (d == (N/nb)%ngpu)
                    n_local[d] += N%nb;
            }
            ldn_local = ((n_local[0]+31)/32)*32;
            
            /* upload the next big panel into GPU, transpose (A->A'), and pivot it */
            timer_start( time );
            magmablas_zsetmatrix_transpose_mgpu(ngpu, stream, A(0,I), lda,
                                                dAT, ldn_local, dA, maxm, M, N, nb);
            for( d=0; d < ngpu; d++ ) {
                magma_setdevice(d);
                magma_queue_sync( stream[d][0] );
                magma_queue_sync( stream[d][1] );
                magmablasSetKernelStream(NULL);
            }
            time_set += timer_stop( time );
    
            timer_start( time );
            /* == --------------------------------------------------------------- == */
            /* == loop around the previous big-panels to update the new big-panel == */
            for( offset = 0; offset < min(m,I); offset += NB ) {
                NBk = min( m-offset, NB );
                /* start sending the first tile from the previous big-panels to gpus */
                for( d=0; d < ngpu; d++ ) {
                    magma_setdevice(d);
                    nbi  = min( nb, NBk );
                    magma_zsetmatrix_async( (M-offset), nbi,
                                            A(offset,offset), lda,
                                            dA[d],            (maxm-offset), stream[d][0] );
                    
                    /* make sure the previous update finished */
                    magmablasSetKernelStream(stream[d][0]);
                    //magma_queue_sync( stream[d][1] );
                    magma_queue_wait_event( stream[d][0], event[d][0] );
                    
                    /* transpose */
                    magmablas_ztranspose( M-offset, nbi, dA[d], maxm-offset, dPT(d,0,0), nb );
                }
                
                /* applying the pivot from the previous big-panel */
                for( d=0; d < ngpu; d++ ) {
                    magma_setdevice(d);
                    magmablas_zlaswp_q( ldn_local, dAT(d,0,0), ldn_local, offset+1, offset+NBk, ipiv, 1, stream[d][1] );
                }
                
                /* == going through each block-column of previous big-panels == */
                for( jj=0, ib=offset/nb; jj < NBk; jj += nb, ib++ ) {
                    ii   = offset+jj;
                    rows = maxm - ii;
                    nbi  = min( nb, NBk-jj );
                    for( d=0; d < ngpu; d++ ) {
                        magma_setdevice(d);
                        
                        /* wait for a block-column on GPU */
                        magma_queue_sync( stream[d][0] );
                        
                        /* start sending next column */
                        if ( jj+nb < NBk ) {
                            magma_zsetmatrix_async( (M-ii-nb), min(nb,NBk-jj-nb),
                                                    A(ii+nb,ii+nb), lda,
                                                    dA[d],          (rows-nb), stream[d][0] );
                            
                            /* make sure the previous update finished */
                            magmablasSetKernelStream(stream[d][0]);
                            //magma_queue_sync( stream[d][1] );
                            magma_queue_wait_event( stream[d][0], event[d][(1+jj/nb)%2] );
                            
                            /* transpose next column */
                            magmablas_ztranspose( M-ii-nb, nb, dA[d], rows-nb, dPT(d,0,(1+jj/nb)%2), nb );
                        }
                        
                        /* update with the block column */
                        magmablasSetKernelStream(stream[d][1]);
                        magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                                     n_local[d], nbi, c_one, dPT(d,0,(jj/nb)%2), nb, dAT(d,ib,0), ldn_local );
                        if ( M > ii+nb ) {
                            magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                                n_local[d], M-(ii+nb), nbi, c_neg_one, dAT(d,ib,0), ldn_local,
                                dPT(d,1,(jj/nb)%2), nb, c_one, dAT(d,ib+1,0), ldn_local );
                        }
                        magma_event_record( event[d][(jj/nb)%2], stream[d][1] );
                    
                    } /* end of for each block-columns in a big-panel */
                }
            } /* end of for each previous big-panels */
            for( d=0; d < ngpu; d++ ) {
                magma_setdevice(d);
                magma_queue_sync( stream[d][0] );
                magma_queue_sync( stream[d][1] );
            magmablasSetKernelStream(NULL);
            }
    
            /* calling magma-gpu interface to panel-factorize the big panel */
            if ( M > I ) {
                magma_zgetrf2_mgpu(ngpu, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, A(0,I), lda,
                                   stream, &iinfo);
                if ( iinfo < 0 ) {
                    *info = iinfo;
                    break;
                } else if ( iinfo != 0 ) {
                    *info = iinfo + I * NB;
                    //break;
                }
                /* adjust pivots */
                for( ii=I; ii < min(I+N,m); ii++ )
                    ipiv[ii] += I;
            }
            time_comp += timer_stop( time );
    
            /* download the current big panel to CPU */
            timer_start( time );
            magmablas_zgetmatrix_transpose_mgpu(ngpu, stream, dAT, ldn_local, A(0,I), lda, dA, maxm, M, N, nb);
            for( d=0; d < ngpu; d++ ) {
                magma_setdevice(d);
                magma_queue_sync( stream[d][0] );
                magma_queue_sync( stream[d][1] );
            magmablasSetKernelStream(NULL);
            }
            time_get += timer_stop( time );
        } /* end of for */
    
        timer_stop( time_total );
        flops = FLOPS_ZGETRF( m, n ) / 1e9;
        timer_printf(" memory-allocation time: %e\n", time_alloc );
        timer_printf(" NB=%d nb=%d\n", (int) NB, (int) nb );
        timer_printf(" memcopy and transpose %e seconds\n", time_set );
        timer_printf(" total time %e seconds\n", time_total );
        timer_printf(" Performance %f GFlop/s, %f seconds without htod and dtoh\n",     flops / (time_comp),               time_comp               );
        timer_printf(" Performance %f GFlop/s, %f seconds with    htod\n",              flops / (time_comp + time_set),    time_comp + time_set    );
        timer_printf(" Performance %f GFlop/s, %f seconds with    dtoh\n",              flops / (time_comp + time_get),    time_comp + time_get    );
        timer_printf(" Performance %f GFlop/s, %f seconds without memory-allocation\n", flops / (time_total - time_alloc), time_total - time_alloc );
    
        for( d=0; d < ngpu0; d++ ) {
            magma_setdevice(d);
            magma_free( dA[d] );
            magma_event_destroy( event[d][0] );
            magma_event_destroy( event[d][1] );
            magma_queue_destroy( stream[d][0] );
            magma_queue_destroy( stream[d][1] );
        }
        magma_setdevice( orig_dev );
        magmablasSetKernelStream( orig_stream );
    }
    if ( *info >= 0 )
        magma_zgetrf_piv(m, n, NB, A, lda, ipiv, info);
    return *info;
} /* magma_zgetrf_m */
Exemple #6
0
extern "C" magma_int_t
magma_zgetrf2_mgpu(magma_int_t num_gpus, 
         magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t offset,
         cuDoubleComplex **d_lAT, magma_int_t lddat, magma_int_t *ipiv,
         cuDoubleComplex **d_lAP, cuDoubleComplex *w, magma_int_t ldw,
         cudaStream_t streaml[][2], magma_int_t *info)
#endif
{
/*  -- MAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2010

    Purpose
    =======

    ZGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
       A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.
    Use two buffer to send panels..

    Arguments
    =========

    NUM_GPUS 
            (input) INTEGER
            The number of GPUS to be used for the factorization.

    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    LDDA     (input) INTEGER
            The leading dimension of the array A.  LDDA >= max(1,M).

    IPIV    (output) INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  if INFO = -7, internal GPU memory allocation failed.
            > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.
    =====================================================================    */

#define inAT(id,i,j)  (d_lAT[(id)] + ((offset)+(i)*nb)*lddat + (j)*nb)
#define W(j) (w+((j)%num_gpus)*nb*ldw)

    cuDoubleComplex c_one     = MAGMA_Z_ONE;
    cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;

    magma_int_t block_size = 32;
    magma_int_t iinfo, n_local[4]; 
    magma_int_t maxm, mindim;
    magma_int_t i, ii, d, dd, rows, cols, s, ldpan[4];
    magma_int_t id, i_local, i_local2, nb0, nb1;
    cuDoubleComplex *d_panel[4], *panel_local[4];
    //cudaStream_t streaml[4][2];

    /* Check arguments */
    *info = 0;
    if (m < 0)
    *info = -2;
    else if (n < 0)
    *info = -3;
    else if (num_gpus*lddat < max(1,n))
    *info = -5;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    //nb     = magma_get_zgetrf_nb(m);
    if( num_gpus > ceil((double)n/nb) ) {
      *info = -1;
      return *info;
    }

    {
      /* Use hybrid blocked code. */
      maxm  = ((m + block_size-1)/block_size)*block_size;

      /* some initializations */
      for(i=0; i<num_gpus; i++){
        magmaSetDevice(i);

        n_local[i] = ((n/nb)/num_gpus)*nb;
        if (i < (n/nb)%num_gpus)
           n_local[i] += nb;
        else if (i == (n/nb)%num_gpus)
           n_local[i] += n%nb;

        /* workspaces */
        d_panel[i] = &(d_lAP[i][nb*maxm]);   /* temporary panel storage */

        /* create local streams */
        //magma_queue_create(&streaml[i][0]);
        //magma_queue_create(&streaml[i][1]);
      }
      trace_init( 1, num_gpus, 2, (CUstream_st**)streaml );

      /* start sending the panel to cpu */
      nb0 = min(mindim, nb);
      magmaSetDevice(0);
      magmablasSetKernelStream(streaml[0][1]);
      trace_gpu_start( 0, 1, "comm", "get" );
      if( nb0 == nb )
        magmablas_ztranspose(  d_lAP[0], maxm, inAT(0,0,0), lddat, nb0, maxm );
      else
        magmablas_ztranspose2( d_lAP[0], maxm, inAT(0,0,0), lddat, nb0, maxm );
      magma_zgetmatrix_async( m, nb0,
                              d_lAP[0], maxm,
                              W(0),     ldw, streaml[0][1] );
      trace_gpu_end( 0, 1 );

      /* ------------------------------------------------------------------------------------- */
#ifdef  PROFILE
      magma_timestr_t start_timer, end_timer;
      start_timer = get_current_time();
#endif
      s = mindim / nb;
      for( i=0; i<s; i++ )
            {
                /* Set the GPU number that holds the current panel */
                id = i%num_gpus;
                magmaSetDevice(id);

                /* Set the local index where the current panel is */
                i_local = i/num_gpus;
                cols  = maxm - i*nb;
                rows  = m - i*nb;

                /* synchrnoize i-th panel from id-th gpu into work */
                magma_queue_sync( streaml[id][1] );

                /* i-th panel factorization */
                trace_cpu_start( 0, "getrf", "getrf" );
#ifdef PANEL_FACT_MC
                cntxt->nb = 12;
                magma_zgetrf_mc(cntxt, &rows, &nb, W(i), &ldw, ipiv+i*nb, &iinfo);
#else
                lapackf77_zgetrf( &rows, &nb, W(i), &ldw, ipiv+i*nb, &iinfo);
#endif
                if ( (*info == 0) && (iinfo > 0) ) {
                    *info = iinfo + i*nb;
                    //break;
                }
                trace_cpu_end( 0 );

                /* start sending the panel to all the gpus */
                d = (i+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                  magmaSetDevice(d);
                  trace_gpu_start( 0, 1, "comm", "set" );
                  magma_zsetmatrix_async( rows, nb,
                                          W(i),     ldw,
                                          d_lAP[d], cols, streaml[d][1] );
                  trace_gpu_end( 0, 1 );
                  d = (d+1)%num_gpus;
                }

                /* apply the pivoting */
                d = (i+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                  magmaSetDevice(d);
                  magmablasSetKernelStream(streaml[d][0]);

                  trace_gpu_start( d, 1, "pivot", "pivot" );
                  if( dd == 0 )
                    magmablas_zpermute_long2( lddat, inAT(d,0,0), lddat, ipiv, nb, i*nb );
                  else
                    magmablas_zpermute_long3(        inAT(d,0,0), lddat, ipiv, nb, i*nb );
                  trace_gpu_end( d, 1 );
                  d = (d+1)%num_gpus;
                }


                /* update the trailing-matrix/look-ahead */
                d = (i+1)%num_gpus;
                for( dd=0; dd<num_gpus; dd++ ) {
                  magmaSetDevice(d);

                  /* storage for panel */
                  if( d == id ) {
                    /* the panel belond to this gpu */
                    panel_local[d] = inAT(d,i,i_local);
                    ldpan[d] = lddat;
                    /* next column */
                    i_local2 = i_local+1;
                  } else {
                    /* the panel belong to another gpu */
                    panel_local[d] = &d_panel[d][(i%2)*nb*maxm];
                    //panel_local[d] = d_panel[d];
                    ldpan[d] = nb;
                    /* next column */
                    i_local2 = i_local;
                    if( d < id ) i_local2 ++;
                  }
                  /* the size of the next column */
                  if ( s > (i+1) ) {
                    nb0 = nb;
                  } else {
                    nb0 = n_local[d]-nb*(s/num_gpus);
                    if( d < s%num_gpus ) nb0 -= nb;
                  }
                  if( d == (i+1)%num_gpus) {
                    /* owns the next column, look-ahead the column */
                    nb1 = nb0;
                    magmablasSetKernelStream(streaml[d][1]);

                    /* make sure all the pivoting has been applied */
                    magma_queue_sync(streaml[d][0]);
                    trace_gpu_start( d, 1, "gemm", "gemm" );
                  } else {
                    /* update the entire trailing matrix */
                    nb1 = n_local[d] - i_local2*nb;
                    magmablasSetKernelStream(streaml[d][0]);

                    /* synchronization to make sure panel arrived on gpu */
                    magma_queue_sync(streaml[d][1]);
                    trace_gpu_start( d, 0, "gemm", "gemm" );
                  }
                  magmablas_ztranspose(panel_local[d], ldpan[d], d_lAP[d], cols, cols, nb);

                  /* gpu updating the trailing matrix */
                  //magmablas_ztrsm(
                  magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                               nb1, nb, c_one,
                               panel_local[d],       ldpan[d],
                               inAT(d, i, i_local2), lddat);
                  //cublasZgemm
                  magma_zgemm( MagmaNoTrans, MagmaNoTrans, 
                               nb1, m-(i+1)*nb, nb, 
                               c_neg_one, inAT(d, i,   i_local2),         lddat,
                                          &(panel_local[d][nb*ldpan[d]]), ldpan[d], 
                               c_one,     inAT(d, i+1, i_local2),         lddat );

                  if( d == (i+1)%num_gpus ) 
                  {
                    /* Set the local index where the current panel is */
                    int loff    = i+1;
                    int i_local = (i+1)/num_gpus;
                    int ldda    = maxm - (i+1)*nb;
                    int cols    = m - (i+1)*nb;
                    nb0 = min(nb, mindim - (i+1)*nb); /* size of the diagonal block */
                    trace_gpu_end( d, 1 );

                    if( nb0 > 0 ) {
                        /* transpose the panel for sending it to cpu */
                        trace_gpu_start( d, 1, "comm", "get" );
                        if( i+1 < s ) 
                          magmablas_ztranspose(  d_lAP[d], ldda, inAT(d,loff,i_local), lddat, nb0, ldda );
                        else
                          magmablas_ztranspose2(  d_lAP[d], ldda, inAT(d,loff,i_local), lddat, nb0, ldda );

                        /* send the panel to cpu */
                        magma_zgetmatrix_async( cols, nb0,
                                                d_lAP[d], ldda,
                                                W(i+1),   ldw, streaml[d][1] );
                        trace_gpu_end( d, 1 );
                    }
                  } else {
                    trace_gpu_end( d, 0 );
                  }

                  d = (d+1)%num_gpus;
                }

                /* update the remaining matrix by gpu owning the next panel */
                if( (i+1) < s ) {
                  int i_local = (i+1)/num_gpus;
                  int rows  = m - (i+1)*nb;

                  d = (i+1)%num_gpus;
                  magmaSetDevice(d);
                  magmablasSetKernelStream(streaml[d][0]);
                  trace_gpu_start( d, 0, "gemm", "gemm" );
                  //magmablas_ztrsm
                  magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                               n_local[d] - (i_local+1)*nb, nb, 
                               c_one, panel_local[d],       ldpan[d], 
                                      inAT(d,i,i_local+1),  lddat );
                  //cublasZgemm
                  magma_zgemm( MagmaNoTrans, MagmaNoTrans, 
                               n_local[d]-(i_local+1)*nb, rows, nb, 
                               c_neg_one, inAT(d,i,i_local+1),            lddat, 
                                          &(panel_local[d][nb*ldpan[d]]), ldpan[d], 
                               c_one,     inAT(d,i+1,  i_local+1),        lddat );
                  trace_gpu_end( d, 0 );
                }
             } /* end of for i=1..s */
             /* ------------------------------------------------------------------------------ */

            /* Set the GPU number that holds the last panel */
            id = s%num_gpus;

            /* Set the local index where the last panel is */
            i_local = s/num_gpus;

            /* size of the last diagonal-block */
            nb0 = min(m - s*nb, n - s*nb);
            rows = m    - s*nb;
            cols = maxm - s*nb;

            if( nb0 > 0 ) {
              magmaSetDevice(id);

              /* wait for the last panel on cpu */
              magma_queue_sync( streaml[id][1] );

              /* factor on cpu */
              lapackf77_zgetrf( &rows, &nb0, W(s), &ldw, ipiv+s*nb, &iinfo);
              if ( (*info == 0) && (iinfo > 0) )
              *info = iinfo + s*nb;

              /* send the factor to gpus */
              for( d=0; d<num_gpus; d++ ) {
                magmaSetDevice(d);
                i_local2 = i_local;
                if( d < id ) i_local2 ++;

                if( d == id || n_local[d] > i_local2*nb ) {
                  magma_zsetmatrix_async( rows, nb0,
                                          W(s),     ldw,
                                          d_lAP[d], cols, streaml[d][1] );
                }
              }

              for( d=0; d<num_gpus; d++ ) {
                magmaSetDevice(d);
                magmablasSetKernelStream(streaml[d][0]);
                if( d == 0 )
                  magmablas_zpermute_long2( lddat, inAT(d,0,0), lddat, ipiv, nb0, s*nb );
                else
                  magmablas_zpermute_long3(        inAT(d,0,0), lddat, ipiv, nb0, s*nb );
              }

              for( d=0; d<num_gpus; d++ ) {
                magmaSetDevice(d);
                magmablasSetKernelStream(streaml[d][1]);

                /* wait for the pivoting to be done */
                magma_queue_sync( streaml[d][0] );

                i_local2 = i_local;
                if( d < id ) i_local2++;
                if( d == id ) {
                  /* the panel belond to this gpu */
                  panel_local[d] = inAT(d,s,i_local);

                  /* next column */
                  nb1 = n_local[d] - i_local*nb-nb0;

                  magmablas_ztranspose2( panel_local[d], lddat, d_lAP[d], cols, rows, nb0);

                  if( nb1 > 0 )
                  //cublasZtrsm
                  magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                               nb1, nb0, c_one,
                               panel_local[d],        lddat, 
                               inAT(d,s,i_local)+nb0, lddat);
                } else if( n_local[d] > i_local2*nb ) {
                  /* the panel belong to another gpu */
                  panel_local[d] = &d_panel[d][(s%2)*nb*maxm];
                  //panel_local[d] = d_panel[d];

                  /* next column */
                  nb1 = n_local[d] - i_local2*nb;

                  magmablas_ztranspose2( panel_local[d], nb, d_lAP[d], cols, rows, nb0);
                  //cublasZtrsm
                  magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                               nb1, nb0, c_one,
                               panel_local[d],     nb, 
                               inAT(d,s,i_local2), lddat);
                }
              }
            } /* if( nb0 > 0 ) */

            /* clean up */
            trace_finalize( "zgetrf_mgpu.svg","trace.css" );
            for( d=0; d<num_gpus; d++ ) {
              magmaSetDevice(d);
              magma_queue_sync( streaml[d][0] );
              magma_queue_sync( streaml[d][1] );
              //magma_queue_destroy(streaml[d][0]);
              //magma_queue_destroy(streaml[d][1]);
              magmablasSetKernelStream(NULL);
            } 
            magmaSetDevice(0);
#ifdef  PROFILE
            end_timer = get_current_time();
            printf("\n Performance %f GFlop/s\n", (2./3.*n*n*n /1000000.) / GetTimerValue(start_timer, end_timer));
#endif
    }

    return *info;
    /* End of MAGMA_ZGETRF2_MGPU */
}
Exemple #7
0
int main( int argc, char** argv )
{
    TESTING_INIT();
    
    real_Double_t   gflops, t1, t2;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magma_int_t ione = 1;
    magma_trans_t trans[] = { MagmaNoTrans, MagmaConjTrans, MagmaTrans };
    magma_uplo_t  uplo [] = { MagmaLower, MagmaUpper };
    magma_diag_t  diag [] = { MagmaUnit, MagmaNonUnit };
    magma_side_t  side [] = { MagmaLeft, MagmaRight };
    
    magmaDoubleComplex  *A,  *B,  *C,   *C2, *LU;
    magmaDoubleComplex *dA, *dB, *dC1, *dC2;
    magmaDoubleComplex alpha = MAGMA_Z_MAKE( 0.5, 0.1 );
    magmaDoubleComplex beta  = MAGMA_Z_MAKE( 0.7, 0.2 );
    double dalpha = 0.6;
    double dbeta  = 0.8;
    double work[1], error, total_error;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t m, n, k, size, maxn, ld, info;
    magma_int_t *piv;
    magma_int_t err;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" );
    
    total_error = 0.;
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        m = opts.msize[itest];
        n = opts.nsize[itest];
        k = opts.ksize[itest];
        printf("=========================================================================\n");
        printf( "m=%d, n=%d, k=%d\n", (int) m, (int) n, (int) k );
        
        // allocate matrices
        // over-allocate so they can be any combination of {m,n,k} x {m,n,k}.
        maxn = max( max( m, n ), k );
        ld = max( 1, maxn );
        size = ld*maxn;
        err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) );  assert( err == 0 );
        err = magma_zmalloc_pinned( &A,  size );  assert( err == 0 );
        err = magma_zmalloc_pinned( &B,  size );  assert( err == 0 );
        err = magma_zmalloc_pinned( &C,  size );  assert( err == 0 );
        err = magma_zmalloc_pinned( &C2, size );  assert( err == 0 );
        err = magma_zmalloc_pinned( &LU, size );  assert( err == 0 );
        err = magma_zmalloc( &dA,  size );        assert( err == 0 );
        err = magma_zmalloc( &dB,  size );        assert( err == 0 );
        err = magma_zmalloc( &dC1, size );        assert( err == 0 );
        err = magma_zmalloc( &dC2, size );        assert( err == 0 );
        
        // initialize matrices
        size = maxn*maxn;
        lapackf77_zlarnv( &ione, ISEED, &size, A  );
        lapackf77_zlarnv( &ione, ISEED, &size, B  );
        lapackf77_zlarnv( &ione, ISEED, &size, C  );
        
        printf( "========== Level 1 BLAS ==========\n" );
        
        // ----- test ZSWAP
        // swap columns 2 and 3 of dA, then copy to C2 and compare with A
        if ( n >= 3 ) {
            magma_zsetmatrix( m, n, A, ld, dA, ld );
            magma_zsetmatrix( m, n, A, ld, dB, ld );
            magma_zswap( m, dA(0,1), 1, dA(0,2), 1 );
            magma_zswap( m, dB(0,1), 1, dB(0,2), 1 );
            
            // check results, storing diff between magma and cuda calls in C2
            cublasZaxpy( handle, ld*n, &c_neg_one, dA, 1, dB, 1 );
            magma_zgetmatrix( m, n, dB, ld, C2, ld );
            error = lapackf77_zlange( "F", &m, &k, C2, &ld, work );
            total_error += error;
            printf( "zswap             diff %.2g\n", error );
        }
        else {
            printf( "zswap skipped for n < 3\n" );
        }
        
        // ----- test IZAMAX
        // get argmax of column of A
        magma_zsetmatrix( m, k, A, ld, dA, ld );
        error = 0;
        for( int j = 0; j < k; ++j ) {
            magma_int_t i1 = magma_izamax( m, dA(0,j), 1 );
            int i2;  // NOT magma_int_t, for cublas
            cublasIzamax( handle, m, dA(0,j), 1, &i2 );
            // todo need sync here?
            assert( i1 == i2 );
            error += abs( i1 - i2 );
        }
        total_error += error;
        gflops = (double)m * k / 1e9;
        printf( "izamax            diff %.2g\n", error );
        printf( "\n" );
        
        printf( "========== Level 2 BLAS ==========\n" );
        
        // ----- test ZGEMV
        // c = alpha*A*b + beta*c,  with A m*n; b,c m or n-vectors
        // try no-trans/trans
        for( int ia = 0; ia < 3; ++ia ) {
            magma_zsetmatrix( m, n, A,  ld, dA,  ld );
            magma_zsetvector( maxn, B, 1, dB,  1 );
            magma_zsetvector( maxn, C, 1, dC1, 1 );
            magma_zsetvector( maxn, C, 1, dC2, 1 );
            
            t1 = magma_sync_wtime( 0 );
            magma_zgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 );
            t1 = magma_sync_wtime( 0 ) - t1;
            
            t2 = magma_sync_wtime( 0 );
            cublasZgemv( handle, cublas_trans_const(trans[ia]),
                         m, n, &alpha, dA, ld, dB, 1, &beta, dC2, 1 );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            size = (trans[ia] == MagmaNoTrans ? m : n);
            cublasZaxpy( handle, size, &c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetvector( size, dC2, 1, C2, 1 );
            error = lapackf77_zlange( "F", &size, &ione, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZGEMV( m, n ) / 1e9;
            printf( "zgemv( %c )        diff %.2g,  Gflop/s %7.2f, %7.2f\n",
                    lapacke_trans_const(trans[ia]), error, gflops/t1, gflops/t2 );
        }
        printf( "\n" );
        
        // ----- test ZHEMV
        // c = alpha*A*b + beta*c,  with A m*m symmetric; b,c m-vectors
        // try upper/lower
        for( int iu = 0; iu < 2; ++iu ) {
            magma_zsetmatrix( m, m, A, ld, dA, ld );
            magma_zsetvector( m, B, 1, dB,  1 );
            magma_zsetvector( m, C, 1, dC1, 1 );
            magma_zsetvector( m, C, 1, dC2, 1 );
            
            t1 = magma_sync_wtime( 0 );
            magma_zhemv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 );
            t1 = magma_sync_wtime( 0 ) - t1;
            
            t2 = magma_sync_wtime( 0 );
            cublasZhemv( handle, cublas_uplo_const(uplo[iu]),
                         m, &alpha, dA, ld, dB, 1, &beta, dC2, 1 );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( handle, m, &c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetvector( m, dC2, 1, C2, 1 );
            error = lapackf77_zlange( "F", &m, &ione, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZHEMV( m ) / 1e9;
            printf( "zhemv( %c )        diff %.2g,  Gflop/s %7.2f, %7.2f\n",
                    lapacke_uplo_const(uplo[iu]), error, gflops/t1, gflops/t2 );
        }
        printf( "\n" );
        
        // ----- test ZTRSV
        // solve A*c = c,  with A m*m triangular; c m-vector
        // try upper/lower, no-trans/trans, unit/non-unit diag
        // Factor A into LU to get well-conditioned triangles, else solve yields garbage.
        // Still can give garbage if solves aren't consistent with LU factors,
        // e.g., using unit diag for U, so copy lower triangle to upper triangle.
        // Also used for trsm later.
        lapackf77_zlacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld );
        lapackf77_zgetrf( &maxn, &maxn, LU, &ld, piv, &info );
        for( int j = 0; j < maxn; ++j ) {
            for( int i = 0; i < j; ++i ) {
                *LU(i,j) = *LU(j,i);
            }
        }
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
        for( int id = 0; id < 2; ++id ) {
            magma_zsetmatrix( m, m, LU, ld, dA, ld );
            magma_zsetvector( m, C, 1, dC1, 1 );
            magma_zsetvector( m, C, 1, dC2, 1 );
            
            t1 = magma_sync_wtime( 0 );
            magma_ztrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 );
            t1 = magma_sync_wtime( 0 ) - t1;
            
            t2 = magma_sync_wtime( 0 );
            cublasZtrsv( handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]),
                         cublas_diag_const(diag[id]), m, dA, ld, dC2, 1 );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( handle, m, &c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetvector( m, dC2, 1, C2, 1 );
            error = lapackf77_zlange( "F", &m, &ione, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZTRSM( MagmaLeft, m, 1 ) / 1e9;
            printf( "ztrsv( %c, %c, %c )  diff %.2g,  Gflop/s %7.2f, %7.2f\n",
                    lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), lapacke_diag_const(diag[id]),
                    error, gflops/t1, gflops/t2 );
        }}}
        printf( "\n" );
        
        printf( "========== Level 3 BLAS ==========\n" );
        
        // ----- test ZGEMM
        // C = alpha*A*B + beta*C,  with A m*k or k*m; B k*n or n*k; C m*n
        // try combinations of no-trans/trans
        for( int ia = 0; ia < 3; ++ia ) {
        for( int ib = 0; ib < 3; ++ib ) {
            bool nta = (trans[ia] == MagmaNoTrans);
            bool ntb = (trans[ib] == MagmaNoTrans);
            magma_zsetmatrix( (nta ? m : k), (nta ? m : k), A, ld, dA,  ld );
            magma_zsetmatrix( (ntb ? k : n), (ntb ? n : k), B, ld, dB,  ld );
            magma_zsetmatrix( m, n, C, ld, dC1, ld );
            magma_zsetmatrix( m, n, C, ld, dC2, ld );
            
            t1 = magma_sync_wtime( 0 );
            magma_zgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            
            t2 = magma_sync_wtime( 0 );
            cublasZgemm( handle, cublas_trans_const(trans[ia]), cublas_trans_const(trans[ib]),
                         m, n, k, &alpha, dA, ld, dB, ld, &beta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_zlange( "F", &m, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZGEMM( m, n, k ) / 1e9;
            printf( "zgemm( %c, %c )     diff %.2g,  Gflop/s %7.2f, %7.2f\n",
                    lapacke_trans_const(trans[ia]), lapacke_trans_const(trans[ib]),
                    error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test ZHEMM
        // C = alpha*A*B + beta*C  (left)  with A m*m symmetric; B,C m*n; or
        // C = alpha*B*A + beta*C  (right) with A n*n symmetric; B,C m*n
        // try left/right, upper/lower
        for( int is = 0; is < 2; ++is ) {
        for( int iu = 0; iu < 2; ++iu ) {
            magma_zsetmatrix( m, m, A, ld, dA,  ld );
            magma_zsetmatrix( m, n, B, ld, dB,  ld );
            magma_zsetmatrix( m, n, C, ld, dC1, ld );
            magma_zsetmatrix( m, n, C, ld, dC2, ld );
            
            t1 = magma_sync_wtime( 0 );
            magma_zhemm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            
            t2 = magma_sync_wtime( 0 );
            cublasZhemm( handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]),
                         m, n, &alpha, dA, ld, dB, ld, &beta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_zlange( "F", &m, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZHEMM( side[is], m, n ) / 1e9;
            printf( "zhemm( %c, %c )     diff %.2g,  Gflop/s %7.2f, %7.2f\n",
                    lapacke_side_const(side[is]), lapacke_uplo_const(uplo[iu]),
                    error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test ZHERK
        // C = alpha*A*A^H + beta*C  (no-trans) with A m*k and C m*m symmetric; or
        // C = alpha*A^H*A + beta*C  (trans)    with A k*m and C m*m symmetric
        // try upper/lower, no-trans/trans
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
            magma_zsetmatrix( n, k, A, ld, dA,  ld );
            magma_zsetmatrix( n, n, C, ld, dC1, ld );
            magma_zsetmatrix( n, n, C, ld, dC2, ld );
            
            t1 = magma_sync_wtime( 0 );
            magma_zherk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            
            t2 = magma_sync_wtime( 0 );
            cublasZherk( handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]),
                         n, k, &dalpha, dA, ld, &dbeta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetmatrix( n, n, dC2, ld, C2, ld );
            error = lapackf77_zlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZHERK( k, n ) / 1e9;
            printf( "zherk( %c, %c )     diff %.2g,  Gflop/s %7.2f, %7.2f\n",
                    lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]),
                    error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test ZHER2K
        // C = alpha*A*B^H + ^alpha*B*A^H + beta*C  (no-trans) with A,B n*k; C n*n symmetric; or
        // C = alpha*A^H*B + ^alpha*B^H*A + beta*C  (trans)    with A,B k*n; C n*n symmetric
        // try upper/lower, no-trans/trans
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
            bool nt = (trans[it] == MagmaNoTrans);
            magma_zsetmatrix( (nt ? n : k), (nt ? n : k), A, ld, dA,  ld );
            magma_zsetmatrix( n, n, C, ld, dC1, ld );
            magma_zsetmatrix( n, n, C, ld, dC2, ld );
            
            t1 = magma_sync_wtime( 0 );
            magma_zher2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            
            t2 = magma_sync_wtime( 0 );
            cublasZher2k( handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]),
                          n, k, &alpha, dA, ld, dB, ld, &dbeta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetmatrix( n, n, dC2, ld, C2, ld );
            error = lapackf77_zlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZHER2K( k, n ) / 1e9;
            printf( "zher2k( %c, %c )    diff %.2g,  Gflop/s %7.2f, %7.2f\n",
                    lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]),
                    error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test ZTRMM
        // C = alpha*A*C  (left)  with A m*m triangular; C m*n; or
        // C = alpha*C*A  (right) with A n*n triangular; C m*n
        // try left/right, upper/lower, no-trans/trans, unit/non-unit
        for( int is = 0; is < 2; ++is ) {
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
        for( int id = 0; id < 2; ++id ) {
            bool left = (side[is] == MagmaLeft);
            magma_zsetmatrix( (left ? m : n), (left ? m : n), A, ld, dA,  ld );
            magma_zsetmatrix( m, n, C, ld, dC1, ld );
            magma_zsetmatrix( m, n, C, ld, dC2, ld );
            
            t1 = magma_sync_wtime( 0 );
            magma_ztrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            
            // note cublas does trmm out-of-place (i.e., adds output matrix C),
            // but allows C=B to do in-place.
            t2 = magma_sync_wtime( 0 );
            cublasZtrmm( handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]),
                         cublas_trans_const(trans[it]), cublas_diag_const(diag[id]),
                         m, n, &alpha, dA, ld, dC2, ld, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_zlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZTRMM( side[is], m, n ) / 1e9;
            printf( "ztrmm( %c, %c )     diff %.2g,  Gflop/s %7.2f, %7.2f\n",
                    lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]),
                    error, gflops/t1, gflops/t2 );
        }}}}
        printf( "\n" );
        
        // ----- test ZTRSM
        // solve A*X = alpha*B  (left)  with A m*m triangular; B m*n; or
        // solve X*A = alpha*B  (right) with A n*n triangular; B m*n
        // try left/right, upper/lower, no-trans/trans, unit/non-unit
        for( int is = 0; is < 2; ++is ) {
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
        for( int id = 0; id < 2; ++id ) {
            bool left = (side[is] == MagmaLeft);
            magma_zsetmatrix( (left ? m : n), (left ? m : n), LU, ld, dA,  ld );
            magma_zsetmatrix( m, n, C, ld, dC1, ld );
            magma_zsetmatrix( m, n, C, ld, dC2, ld );
            
            t1 = magma_sync_wtime( 0 );
            magma_ztrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            
            t2 = magma_sync_wtime( 0 );
            cublasZtrsm( handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]),
                         cublas_trans_const(trans[it]), cublas_diag_const(diag[id]),
                         m, n, &alpha, dA, ld, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_zlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZTRSM( side[is], m, n ) / 1e9;
            printf( "ztrsm( %c, %c )     diff %.2g,  Gflop/s %7.2f, %7.2f\n",
                    lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]),
                    error, gflops/t1, gflops/t2 );
        }}}}
        printf( "\n" );
        
        // cleanup
        magma_free_cpu( piv );
        magma_free_pinned( A  );
        magma_free_pinned( B  );
        magma_free_pinned( C  );
        magma_free_pinned( C2 );
        magma_free_pinned( LU );
        magma_free( dA  );
        magma_free( dB  );
        magma_free( dC1 );
        magma_free( dC2 );
        fflush( stdout );
    }
    
    if ( total_error != 0. ) {
        printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n",
                total_error );
    }
    else {
        printf( "all tests passed\n" );
    }
    
    TESTING_FINALIZE();
    
    int status = (total_error != 0.);
    return status;
}
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing zgetrf_mgpu
*/
int main( int argc, char** argv )
{
    TESTING_INIT();

    real_Double_t    gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0;
    double           error;
    magmaDoubleComplex *h_A;
    magmaDoubleComplex_ptr d_lA[ MagmaMaxGPUs ];
    magma_int_t *ipiv;
    magma_int_t M, N, n2, lda, ldda, n_local, ngpu;
    magma_int_t info, min_mn, nb, ldn_local;
    magma_int_t status = 0;

    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    double tol = opts.tolerance * lapackf77_dlamch("E");

    printf("ngpu %d\n", (int) opts.ngpu );
    if ( opts.check == 2 ) {
        printf("    M     N   CPU GFlop/s (sec)   GPU GFlop/s (sec)   |Ax-b|/(N*|A|*|x|)\n");
    }
    else {
        printf("    M     N   CPU GFlop/s (sec)   GPU GFlop/s (sec)   |PA-LU|/(N*|A|)\n");
    }
    printf("=========================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[itest];
            N = opts.nsize[itest];
            min_mn = min(M, N);
            lda    = M;
            n2     = lda*N;
            ldda   = ((M+31)/32)*32;
            nb     = magma_get_zgetrf_nb( M );
            gflops = FLOPS_ZGETRF( M, N ) / 1e9;
            
            // ngpu must be at least the number of blocks
            ngpu = min( opts.ngpu, int((N+nb-1)/nb) );
            if ( ngpu < opts.ngpu ) {
                printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu );
            }
            
            // Allocate host memory for the matrix
            TESTING_MALLOC_CPU( ipiv, magma_int_t,        min_mn );
            TESTING_MALLOC_CPU( h_A,  magmaDoubleComplex, n2     );
            
            // Allocate device memory
            for( int dev=0; dev < ngpu; dev++ ) {
                n_local = ((N/nb)/ngpu)*nb;
                if (dev < (N/nb) % ngpu)
                    n_local += nb;
                else if (dev == (N/nb) % ngpu)
                    n_local += N % nb;
                ldn_local = ((n_local+31)/32)*32;  // TODO why?
                magma_setdevice( dev );
                TESTING_MALLOC_DEV( d_lA[dev], magmaDoubleComplex, ldda*ldn_local );
            }
    
            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            if ( opts.lapack ) {
                init_matrix( M, N, h_A, lda );
                
                cpu_time = magma_wtime();
                lapackf77_zgetrf( &M, &N, h_A, &lda, ipiv, &info );
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
                if (info != 0)
                    printf("lapackf77_zgetrf returned error %d: %s.\n",
                           (int) info, magma_strerror( info ));
            }
            
            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            init_matrix( M, N, h_A, lda );
            magma_zsetmatrix_1D_col_bcyclic( M, N, h_A, lda, d_lA, ldda, ngpu, nb );
    
            gpu_time = magma_wtime();
            magma_zgetrf_mgpu( ngpu, M, N, d_lA, ldda, ipiv, &info );
            gpu_time = magma_wtime() - gpu_time;
            gpu_perf = gflops / gpu_time;
            if (info != 0)
                printf("magma_zgetrf_mgpu returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
                       
            magma_zgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_A, lda, ngpu, nb );
    
            /* =====================================================================
               Check the factorization
               =================================================================== */
            if ( opts.lapack ) {
                printf("%5d %5d  %7.2f (%7.2f)   %7.2f (%7.2f)",
                       (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time );
            }
            else {
                printf("%5d %5d    ---   (  ---  )   %7.2f (%7.2f)",
                       (int) M, (int) N, gpu_perf, gpu_time );
            }
            if ( opts.check == 2 ) {
                error = get_residual( M, N, h_A, lda, ipiv );
                printf("   %8.2e   %s\n", error, (error < tol ? "ok" : "failed"));
                status += ! (error < tol);
            }
            else if ( opts.check ) {
                error = get_LU_error( M, N, h_A, lda, ipiv );
                printf("   %8.2e   %s\n", error, (error < tol ? "ok" : "failed"));
                status += ! (error < tol);
            }
            else {
                printf( "     ---\n" );
            }
            
            TESTING_FREE_CPU( ipiv );
            TESTING_FREE_CPU( h_A );
            for( int dev=0; dev < ngpu; dev++ ) {
                magma_setdevice( dev );
                TESTING_FREE_DEV( d_lA[dev] );
            }
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    TESTING_FINALIZE();
    return status;
}
Exemple #9
0
/**
    Purpose
    -------
    ZGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
        A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.

    Arguments
    ---------
    @param[in]
    ngpu    INTEGER
            Number of GPUs to use. ngpu > 0.

    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    d_lA    COMPLEX_16 array of pointers on the GPU, dimension (ngpu).
            On entry, the M-by-N matrix A distributed over GPUs
            (d_lA[d] points to the local matrix on d-th GPU).
            It uses 1D block column cyclic format with the block size of nb,
            and each local matrix is stored by column.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    @param[in]
    ldda     INTEGER
            The leading dimension of the array d_lA.  LDDA >= max(1,M).

    @param[out]
    ipiv    INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
      -     > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    @ingroup magma_zgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zgetrf_mgpu(
    magma_int_t ngpu,
    magma_int_t m, magma_int_t n,
    magmaDoubleComplex_ptr d_lA[], magma_int_t ldda, magma_int_t *ipiv,
    magma_int_t *info)
{
    magma_int_t nb, n_local[MagmaMaxGPUs];
    magma_int_t maxm;
    magma_int_t i, j, d, lddat, lddwork;
    magmaDoubleComplex *d_lAT[MagmaMaxGPUs];
    magmaDoubleComplex *d_panel[MagmaMaxGPUs], *work;
    magma_queue_t streaml[MagmaMaxGPUs][2];

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -2;
    else if (n < 0)
        *info = -3;
    else if (ldda < max(1,m))
        *info = -5;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    nb = magma_get_zgetrf_nb(m);

    if (nb <= 1 || nb >= n) {
        /* Use CPU code. */
        magma_zmalloc_cpu( &work, m * n );
        if ( work == NULL ) {
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        magma_zgetmatrix( m, n, d_lA[0], ldda, work, m );
        lapackf77_zgetrf(&m, &n, work, &m, ipiv, info);
        magma_zsetmatrix( m, n, work, m, d_lA[0], ldda );
        magma_free_cpu(work);
    } else {
        /* Use hybrid blocked code. */
        magma_device_t orig_dev;
        magma_getdevice( &orig_dev );
        magma_queue_t orig_stream;
        magmablasGetKernelStream( &orig_stream );
        
        maxm = ((m + 31)/32)*32;
        if ( ngpu > ceil((double)n/nb) ) {
            printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu );
            *info = -1;
            return *info;
        }

        /* allocate workspace for each GPU */
        lddat = ((((((n+nb-1)/nb)/ngpu)*nb)+31)/32)*32;
        lddat = (n+nb-1)/nb;                 /* number of block columns         */
        lddat = (lddat+ngpu-1)/ngpu;         /* number of block columns per GPU */
        lddat = nb*lddat;                    /* number of columns per GPU       */
        lddat = ((lddat+31)/32)*32;          /* make it a multiple of 32        */
        for (i=0; i < ngpu; i++) {
            magma_setdevice(i);
            
            /* local-n and local-ld */
            n_local[i] = ((n/nb)/ngpu)*nb;
            if (i < (n/nb)%ngpu)
                n_local[i] += nb;
            else if (i == (n/nb)%ngpu)
                n_local[i] += n%nb;
            
            /* workspaces */
            if (MAGMA_SUCCESS != magma_zmalloc( &d_panel[i], (3+ngpu)*nb*maxm )) {
                for( j=0; j <= i; j++ ) {
                    magma_setdevice(j);
                }
                for( j=0; j < i; j++ ) {
                    magma_setdevice(j);
                    magma_free( d_panel[j] );
                    magma_free( d_lAT[j]   );
                }
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }
            
            /* local-matrix storage */
            if (MAGMA_SUCCESS != magma_zmalloc( &d_lAT[i], lddat*maxm )) {
                for( j=0; j <= i; j++ ) {
                    magma_setdevice(j);
                    magma_free( d_panel[j] );
                }
                for( j=0; j < i; j++ ) {
                    magma_setdevice(j);
                    magma_free( d_lAT[j] );
                }
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }
            
            /* create the streams */
            magma_queue_create( &streaml[i][0] );
            magma_queue_create( &streaml[i][1] );
            
            magmablasSetKernelStream(streaml[i][1]);
            magmablas_ztranspose( m, n_local[i], d_lA[i], ldda, d_lAT[i], lddat );
        }
        for (i=0; i < ngpu; i++) {
            magma_setdevice(i);
            magma_queue_sync(streaml[i][0]);
            magmablasSetKernelStream(NULL);
        }
        magma_setdevice(0);

        /* cpu workspace */
        lddwork = maxm;
        if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, lddwork*nb*ngpu )) {
            for (i=0; i < ngpu; i++ ) {
                magma_setdevice(i);
                magma_free( d_panel[i] );
                magma_free( d_lAT[i]   );
            }
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }

        /* calling multi-gpu interface with allocated workspaces and streams */
        magma_zgetrf2_mgpu(ngpu, m, n, nb, 0, d_lAT, lddat, ipiv, d_panel, work, maxm,
                           streaml, info);

        /* clean up */
        for( d=0; d < ngpu; d++ ) {
            magma_setdevice(d);
            
            /* save on output */
            magmablas_ztranspose( n_local[d], m, d_lAT[d], lddat, d_lA[d], ldda );
            magma_device_sync();
            magma_free( d_lAT[d]   );
            magma_free( d_panel[d] );
            magma_queue_destroy( streaml[d][0] );
            magma_queue_destroy( streaml[d][1] );
        } /* end of for d=1,..,ngpu */
        magma_setdevice( orig_dev );
        magmablasSetKernelStream( orig_stream );
        magma_free_pinned( work );
    }
        
    return *info;
}
Exemple #10
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing ztrsm
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    real_Double_t   gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf=0, cpu_time=0;
    double          magma_error, cublas_error, work[1];
    magma_int_t M, N, info;
    magma_int_t Ak;
    magma_int_t sizeA, sizeB;
    magma_int_t lda, ldb, ldda, lddb;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
   
    magma_int_t *piv;
    magma_err_t err;

    magmaDoubleComplex *h_A, *h_B, *h_Bcublas, *h_Bmagma, *h_B1, *h_X1, *h_X2, *LU, *LUT;
    magmaDoubleComplex *d_A, *d_B;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex c_one = MAGMA_Z_ONE;
    magmaDoubleComplex alpha = MAGMA_Z_MAKE(  0.29, -0.86 );
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    printf("If running lapack (option --lapack), MAGMA and CUBLAS error are both computed\n"
           "relative to CPU BLAS result. Else, MAGMA error is computed relative to CUBLAS result.\n\n"
           "side = %c, uplo = %c, transA = %c, diag = %c \n", opts.side, opts.uplo, opts.transA, opts.diag );
    printf("    M     N  MAGMA Gflop/s (ms)  CUBLAS Gflop/s (ms)   CPU Gflop/s (ms)  MAGMA error  CUBLAS error\n");
    printf("==================================================================================================\n");
    for( int i = 0; i < opts.ntest; ++i ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[i];
            N = opts.nsize[i];
            gflops = FLOPS_ZTRSM(opts.side, M, N) / 1e9;

            if ( opts.side == MagmaLeft ) {
                lda = M;
                Ak = M;
            } else {
                lda = N;
                Ak = N;
            }
            
            ldb = M;
            
            ldda = ((lda+31)/32)*32;
            lddb = ((ldb+31)/32)*32;
            
            sizeA = lda*Ak;
            sizeB = ldb*N;
            
            TESTING_MALLOC( h_A,  magmaDoubleComplex, lda*Ak );
            TESTING_MALLOC( LU,      magmaDoubleComplex, lda*Ak );
            TESTING_MALLOC( LUT,  magmaDoubleComplex, lda*Ak );
            TESTING_MALLOC( h_B,  magmaDoubleComplex, ldb*N  );
            TESTING_MALLOC( h_B1,  magmaDoubleComplex, ldb*N );
            TESTING_MALLOC( h_X1,  magmaDoubleComplex, ldb*N );
            TESTING_MALLOC( h_X2,  magmaDoubleComplex, ldb*N );
            TESTING_MALLOC( h_Bcublas, magmaDoubleComplex, ldb*N  );
            TESTING_MALLOC( h_Bmagma, magmaDoubleComplex, ldb*N  );
            
            TESTING_DEVALLOC( d_A, magmaDoubleComplex, ldda*Ak );
            TESTING_DEVALLOC( d_B, magmaDoubleComplex, lddb*N  );
            
            /* Initialize the matrices */
            lapackf77_zlarnv( &ione, ISEED, &sizeA, LU );
            err = magma_malloc_cpu( (void**) &piv, Ak*sizeof(magma_int_t) );  assert( err == 0 );
            lapackf77_zgetrf( &Ak, &Ak, LU, &lda, piv, &info );
        
            int i, j;
            for(i=0;i<Ak;i++){
                for(j=0;j<Ak;j++){
                    LUT[j+i*lda] = LU[i+j*lda];
                }
            }

            lapackf77_zlacpy(MagmaUpperStr, &Ak, &Ak, LUT, &lda, LU, &lda);

            if(opts.uplo == MagmaLower){
                lapackf77_zlacpy(MagmaLowerStr, &Ak, &Ak, LU, &lda, h_A, &lda);
            }else{
                lapackf77_zlacpy(MagmaUpperStr, &Ak, &Ak, LU, &lda, h_A, &lda);
            }
            
            lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B );
            memcpy(h_B1, h_B, sizeB*sizeof(magmaDoubleComplex));
            /* =====================================================================
               Performs operation using MAGMA-BLAS
               =================================================================== */
            magma_zsetmatrix( Ak, Ak, h_A, lda, d_A, ldda );
            magma_zsetmatrix( M, N, h_B, ldb, d_B, lddb );
            
            magma_time = magma_sync_wtime( NULL );
            magmablas_ztrsm( opts.side, opts.uplo, opts.transA, opts.diag, 
                             M, N,
                             alpha, d_A, ldda,
                                    d_B, lddb );
            magma_time = magma_sync_wtime( NULL ) - magma_time;
            magma_perf = gflops / magma_time;
            
            magma_zgetmatrix( M, N, d_B, lddb, h_Bmagma, ldb );
            
            /* =====================================================================
               Performs operation using CUDA-BLAS
               =================================================================== */
            magma_zsetmatrix( M, N, h_B, ldb, d_B, lddb );
            
            cublas_time = magma_sync_wtime( NULL );
            cublasZtrsm( opts.side, opts.uplo, opts.transA, opts.diag,
                         M, N, 
                         alpha, d_A, ldda,
                                d_B, lddb );
            cublas_time = magma_sync_wtime( NULL ) - cublas_time;
            cublas_perf = gflops / cublas_time;
            
            magma_zgetmatrix( M, N, d_B, lddb, h_Bcublas, ldb );
            
            /* =====================================================================
               Performs operation using CPU BLAS
               =================================================================== */
            if ( opts.lapack ) {
                cpu_time = magma_wtime();
                blasf77_ztrsm( &opts.side, &opts.uplo, &opts.transA, &opts.diag, 
                               &M, &N,
                               &alpha, h_A, &lda,
                                       h_B, &ldb );
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
            }
            
            /* =====================================================================
               Check the result
               =================================================================== */
            // ||b - Ax|| / (||A||*||x||)
            memcpy(h_X1, h_Bmagma, sizeB*sizeof(magmaDoubleComplex));
            
            magmaDoubleComplex alpha2 = MAGMA_Z_DIV(  c_one, alpha );
            blasf77_ztrmm( &opts.side, &opts.uplo, &opts.transA, &opts.diag, 
                            &M, &N,
                            &alpha2, h_A, &lda,
                            h_X1, &ldb );

            blasf77_zaxpy( &sizeB, &c_neg_one, h_B1, &ione, h_X1, &ione );
            double norm1 =  lapackf77_zlange( "M", &M, &N, h_X1, &ldb, work );
            double normx =  lapackf77_zlange( "M", &M, &N, h_Bmagma, &ldb, work );
            double normA =  lapackf77_zlange( "M", &Ak, &Ak, h_A, &lda, work );


            magma_error = norm1/(normx*normA);

            memcpy(h_X2, h_Bcublas, sizeB*sizeof(magmaDoubleComplex));
            blasf77_ztrmm( &opts.side, &opts.uplo, &opts.transA, &opts.diag, 
                            &M, &N,
                            &alpha2, h_A, &lda,
                            h_X2, &ldb );

            blasf77_zaxpy( &sizeB, &c_neg_one, h_B1, &ione, h_X2, &ione );
            norm1 =  lapackf77_zlange( "M", &M, &N, h_X2, &ldb, work );
            normx =  lapackf77_zlange( "M", &M, &N, h_Bcublas, &ldb, work );
            normA =  lapackf77_zlange( "M", &Ak, &Ak, h_A, &lda, work );
            
            cublas_error = norm1/(normx*normA);
            
            if ( opts.lapack ) {
                printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)   %7.2f (%7.2f)   %8.2e     %8.2e\n",
                        (int) M, (int) N,
                        magma_perf,  1000.*magma_time,
                        cublas_perf, 1000.*cublas_time,
                        cpu_perf,    1000.*cpu_time,
                        magma_error, cublas_error );
            }
            else {
                printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)     ---   (  ---  )   %8.2e     %8.2e\n",
                        (int) M, (int) N,
                        magma_perf,  1000.*magma_time,
                        cublas_perf, 1000.*cublas_time,
                        magma_error, cublas_error );
            }
            
            TESTING_FREE( h_A  );
            TESTING_FREE( LU  );
            TESTING_FREE( LUT );
            TESTING_FREE( h_B  );
            TESTING_FREE( h_Bcublas );
            TESTING_FREE( h_Bmagma );
            TESTING_FREE( h_B1  );
            TESTING_FREE( h_X1 );
            TESTING_FREE( h_X2 );
            
            TESTING_DEVFREE( d_A );
            TESTING_DEVFREE( d_B );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    TESTING_FINALIZE();
    return 0;
}
/**
    Purpose
    -------
    ZGETRF_INCPIV computes an LU factorization of a general M-by-N tile A
    using partial pivoting with row interchanges.

    The factorization has the form

      A = P * L * U

    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 2.5 BLAS version of the algorithm.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in]
    ib      INTEGER
            The inner-blocking size.  IB >= 0.

    @param[in,out]
    hA      DOUBLE COMPLEX array, dimension(LDHA, N), on cpu.
            On entry, only the M-by-IB first panel needs to be identical to dA(1..M, 1..IB).
            On exit, the content is incomplete. Shouldn't be used.

    @param[in]
    ldha    INTEGER
            The leading dimension of the array hA.  LDHA >= max(1,M).

    @param[in,out]
    dA      DOUBLE COMPLEX array, dimension(LDDA, N), on gpu.
            On entry, the M-by-N tile to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).

    @param[out]
    hL      DOUBLE COMPLEX array, dimension(LDHL, min(M,N)), on vpu.
            On exit, contains in the upper part the IB-by-K lower triangular tile,
            and in the lower part IB-by-min(M,N) the inverse of the top part.

    @param[in]
    ldhl    INTEGER
            The leading dimension of the array hL.  LDHL >= max(1,2*IB).

    @param[out]
    dL      DOUBLE COMPLEX array, dimension(LDDL, K), on gpu.
            On exit, contains in the upper part the IB-by-min(M,N) lower triangular tile,
            and in the lower part IB-by-min(M,N) the inverse of the top part.

    @param[in]
    lddl    INTEGER
            The leading dimension of the array dL.  LDDL >= max(1,2*IB).

    @param[out]
    ipiv    INTEGER array, dimension min(M,N), on the cpu.
            The pivot indices array.

    @param[out]
    dWORK   DOUBLE COMPLEX array, dimension(LDDWORK, 2*IB), on gpu.
            Workspace.

    @param[in]
    lddwork INTEGER
            The leading dimension of the array dWORK.  LDDWORK >= max(NB, 1).

    @param[out]
    info    INTEGER
            - PLASMA_SUCCESS successful exit
            - < 0 if INFO = -k, the k-th argument had an illegal value
            - > 0 if INFO = k, U(k,k) is exactly zero. The factorization
                has been completed, but the factor U is exactly
                singular, and division by zero will occur if it is used
                to solve a system of equations.

    @ingroup magma_zgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zgetrf_incpiv_gpu( magma_order_t order, magma_int_t m, magma_int_t n, magma_int_t ib,
                         magmaDoubleComplex *hA, magma_int_t ldha, magmaDoubleComplex *dA, magma_int_t ldda,
                         magmaDoubleComplex *hL, magma_int_t ldhl, magmaDoubleComplex *dL, magma_int_t lddl,
                         magma_int_t *ipiv,
                         magmaDoubleComplex *dwork, magma_int_t lddwork,
                         magma_int_t *info)
{
#define AT(i,j) (dAT + (i)*ib*ldda + (j)*ib)
#define hA(i,j) (hA  + (i)*ib + (j)*ib*ldha)
#define hL(j)   (hL  + (j)*ib*ldhl         )
#define hL2(j)  (hL2 + (j)*ib*ldhl         )
#define dL(j)   (dL  + (j)*ib*lddl         )
#define dL2(j)  (dL2 + (j)*ib*lddl         )

    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;

    magma_int_t iinfo;
    magma_int_t maxm, mindim;
    magma_int_t i, rows, cols, s, ii, sb;
    magmaDoubleComplex *dAT;
#ifndef WITHOUTTRTRI
    magmaDoubleComplex *dL2 = dL + ib;
    magmaDoubleComplex *hL2 = hL + ib;
#endif

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (ldda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    s      = mindim / ib;

    if ( ib >= mindim ) {
        /* Use CPU code. */
        lapackf77_zgetrf(&m, &n, hA, &ldha, ipiv, info);

#ifndef WITHOUTTRTRI
        CORE_zlacpy(PlasmaUpperLower, mindim, mindim,
                    (PLASMA_Complex64_t*)hA, ldha,
                    (PLASMA_Complex64_t*)hL2, ldhl );

        CORE_ztrtri( PlasmaLower, PlasmaUnit, mindim,
                     (PLASMA_Complex64_t*)hL2, ldhl, info );
        if (*info != 0 ) {
            fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info);
        }

        magma_zsetmatrix( mindim, mindim, hL2, ldhl, dL2, lddl );
#endif

        if ( order == MagmaRowMajor ) {
            magma_zsetmatrix( m, n, hA, ldha, dwork, lddwork );
            magmablas_ztranspose( m, n, dwork, lddwork, dA, ldda );
        } else {
            magma_zsetmatrix( m, n, hA, ldha, dA, ldda );
        }
    }
    else {
        /* Use hybrid blocked code. */
        maxm = ((m + 31)/32)*32;

        if ( order == MagmaColMajor ) {
            magmablas_zgetmo_in( dA, dAT, ldda, m, n );
        } else {
            dAT = dA;
        }

        for( i=0; i < s; i++ ) {
            ii = i * ib;
            sb = min(ib, mindim-ii);
            cols = maxm - ii;

            if ( i > 0 ) {
                // download i-th panel
                magmablas_ztranspose( sb, m, AT(0,i), ldda, dwork, maxm );
                magma_zgetmatrix( m, sb, dwork, maxm, hA(0, i), ldha );

                // make sure that gpu queue is empty
                //magma_device_sync();
#ifndef WITHOUTTRTRI
                magma_ztrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             n - (ii+sb), ib,
                             c_one, dL2(i-1),    lddl,
                                    AT(i-1,i+1), ldda );
#else
                magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n - (ii+sb), ib,
                             c_one, AT(i-1,i-1), ldda,
                                    AT(i-1,i+1), ldda );
#endif
                magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(ii+sb), m-ii, ib,
                             c_neg_one, AT(i-1,i+1), ldda,
                                        AT(i,  i-1), ldda,
                             c_one,     AT(i,  i+1), ldda );
            }

            // do the cpu part
            rows = m - ii;
            lapackf77_zgetrf( &rows, &sb, hA(i, i), &ldha, ipiv+ii, &iinfo);
            if ( (*info == 0) && (iinfo > 0) )
                *info = iinfo + ii;

            {
                int j;
                int fin = ii + sb;
                for (j=ii; j < fin; j++) {
                    ipiv[j] = ii + ipiv[j];
                }
            }
            magmablas_zlaswp( n-ii, AT(0, i), ldda, ii+1, ii+sb, ipiv, 1 );

#ifndef WITHOUTTRTRI
            CORE_zlacpy(PlasmaLower, sb, sb,
                        (PLASMA_Complex64_t*)hA(i, i), ldha,
                        (PLASMA_Complex64_t*)hL2(i), ldhl );

            CORE_ztrtri( PlasmaLower, PlasmaUnit, sb,
                         (PLASMA_Complex64_t*)hL2(i), ldhl, info );
            if (*info != 0 ) {
                fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info);
            }
            magma_zsetmatrix( sb, sb, hL2(i), ldhl, dL2(i), lddl );
#endif
            // upload i-th panel
            magma_zsetmatrix( rows, sb, hA(i, i), ldha, dwork, cols );
            magmablas_ztranspose( rows, sb, dwork, cols, AT(i,i), ldda );

            // do the small non-parallel computations
            if ( s > (i+1) ) {
#ifndef WITHOUTTRTRI
                magma_ztrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             sb, sb,
                             c_one, dL2(i),     lddl,
                                    AT(i, i+1), ldda);
#else
                magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             sb, sb,
                             c_one, AT(i, i  ), ldda,
                                    AT(i, i+1), ldda);
#endif
                magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                             sb, m-(ii+sb), sb,
                             c_neg_one, AT(i,   i+1), ldda,
                                        AT(i+1, i  ), ldda,
                             c_one,     AT(i+1, i+1), ldda );
            }
            else {
                /* Update of the last panel */
#ifndef WITHOUTTRTRI
                magma_ztrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             n-mindim, sb,
                             c_one, dL2(i),     lddl,
                                    AT(i, i+1), ldda);
#else
                magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n-mindim, sb,
                             c_one, AT(i, i  ), ldda,
                                    AT(i, i+1), ldda);
#endif
                /* m-(ii+sb) should be always 0 */
                magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                             n-mindim, m-(ii+sb), sb,
                             c_neg_one, AT(i,   i+1), ldda,
                                        AT(i+1, i  ), ldda,
                             c_one,     AT(i+1, i+1), ldda );
            }
        }

        if ( order == MagmaColMajor ) {
            magmablas_zgetmo_out( dA, dAT, ldda, m, n );
        }
    }
    return *info;
}
Exemple #12
0
/**
    Purpose
    -------
    ZGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
       A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.
    Use two buffer to send panels.

    Arguments
    ---------
    @param[in]
    ngpu    INTEGER
            Number of GPUs to use. ngpu > 0.

    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    d_lAT   COMPLEX_16 array of pointers on the GPU, dimension (ngpu).
            On entry, the M-by-N matrix A distributed over GPUs
            (d_lAT[d] points to the local matrix on d-th GPU).
            It uses a 1D block column cyclic format (with the block size
            nb), and each local matrix is stored by row.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    @param[in]
    lddat   INTEGER
            The leading dimension of the array d_lAT[d]. LDDA >= max(1,M).

    @param[out]
    ipiv    INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    @param (workspace) on device
    d_lAP   COMPLEX_16 array of pointers on the GPU, dimension (ngpu).
            d_lAP[d] is the workspace on d-th GPU. Each local workspace
            must be of size (3+ngpu)*nb*maxm, where maxm is m rounded
            up to a multiple of 32 and nb is the block size.

    @param (workspace)
    W       COMPLEX_16 array, dimension (ngpu*nb*maxm).
            It is used to store panel on CPU.

    @param[in]
    ldw     INTEGER
            The leading dimension of the workspace w.

    @param[in]
    queues  magma_queue_t
            queues[d] points to the streams for the d-th GPU to execute
            in. Each GPU require two streams.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
      -     > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    @ingroup magma_zgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zgetrf2_mgpu(
    magma_int_t ngpu,
    magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t offset,
    magmaDoubleComplex_ptr d_lAT[], magma_int_t lddat, magma_int_t *ipiv,
    magmaDoubleComplex_ptr d_lAP[],
    magmaDoubleComplex *W, magma_int_t ldw,
    magma_queue_t queues[][2],
    magma_int_t *info)
{
#define dAT(id,i,j)  (d_lAT[(id)] + ((offset)+(i)*nb)*lddat + (j)*nb)
#define W(j) (W + ((j)%ngpu)*nb*ldw)

    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;

    magma_int_t block_size = 32;
    magma_int_t iinfo, n_local[MagmaMaxGPUs];
    magma_int_t maxm, mindim;
    magma_int_t i, j, d, dd, rows, cols, s, ldpan[MagmaMaxGPUs];
    magma_int_t id, j_local, j_local2, nb0, nb1, h = 2+ngpu;
    magmaDoubleComplex *d_panel[MagmaMaxGPUs], *panel_local[MagmaMaxGPUs];

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -2;
    else if (n < 0)
        *info = -3;
    else if (ngpu*lddat < max(1,n))
        *info = -5;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    if ( ngpu > ceil((double)n/nb) ) {
        *info = -1;
        return *info;
    }

    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );
    
    /* Use hybrid blocked code. */
    maxm  = ((m + block_size-1)/block_size)*block_size;

    /* some initializations */
    for (d=0; d < ngpu; d++) {
        magma_setdevice(d);
        
        n_local[d] = ((n/nb)/ngpu)*nb;
        if (d < (n/nb) % ngpu)
            n_local[d] += nb;
        else if (d == (n/nb) % ngpu)
            n_local[d] += n % nb;
        
        /* workspaces */
        d_panel[d] = &(d_lAP[d][h*nb*maxm]);   /* temporary panel storage */
    }
    trace_init( 1, ngpu, 2, (CUstream_st**)queues );

    /* start sending the panel to cpu */
    nb0 = min(mindim, nb);
    magma_setdevice(0);
    magmablasSetKernelStream(queues[0][1]);
    trace_gpu_start( 0, 1, "comm", "get" );
    magmablas_ztranspose( nb0, m, dAT(0,0,0), lddat, d_lAP[0], maxm );
    magma_zgetmatrix_async( m, nb0,
                            d_lAP[0], maxm,
                            W(0),     ldw, queues[0][1] );
    trace_gpu_end( 0, 1 );

    /* ------------------------------------------------------------------------------------- */
    magma_timer_t time=0;
    timer_start( time );

    s = mindim / nb;
    for( j=0; j < s; j++ ) {
        /* Set the GPU number that holds the current panel */
        id = j % ngpu;
        magma_setdevice(id);
        
        /* Set the local index where the current panel is */
        j_local = j/ngpu;
        cols  = maxm - j*nb;
        rows  = m - j*nb;
        
        /* synchronize j-th panel from id-th gpu into work */
        magma_queue_sync( queues[id][1] );
        
        /* j-th panel factorization */
        trace_cpu_start( 0, "getrf", "getrf" );
        lapackf77_zgetrf( &rows, &nb, W(j), &ldw, ipiv+j*nb, &iinfo);
        if ( (*info == 0) && (iinfo > 0) ) {
            *info = iinfo + j*nb;
        }
        trace_cpu_end( 0 );
        
        /* start sending the panel to all the gpus */
        d = (j+1) % ngpu;
        for( dd=0; dd < ngpu; dd++ ) {
            magma_setdevice(d);
            trace_gpu_start( 0, 1, "comm", "set" );
            magma_zsetmatrix_async( rows, nb,
                                    W(j),     ldw,
                                    &d_lAP[d][(j%h)*nb*maxm], cols,
                                    queues[d][1] );
            trace_gpu_end( 0, 1 );
            d = (d+1) % ngpu;
        }
        
        /* apply the pivoting */
        d = (j+1) % ngpu;
        for( dd=0; dd < ngpu; dd++ ) {
            magma_setdevice(d);
            
            trace_gpu_start( d, 1, "pivot", "pivot" );
            if ( dd == 0 ) {
                for( i=j*nb; i < j*nb + nb; ++i ) {
                    ipiv[i] += j*nb;
                }
            }
            magmablas_zlaswp_q( lddat, dAT(d,0,0), lddat, j*nb + 1, j*nb + nb, ipiv, 1, queues[d][0] );
            trace_gpu_end( d, 1 );
            d = (d+1) % ngpu;
        }
    
    
        /* update the trailing-matrix/look-ahead */
        d = (j+1) % ngpu;
        for( dd=0; dd < ngpu; dd++ ) {
            magma_setdevice(d);
            
            /* storage for panel */
            if ( d == id ) {
                /* the panel belond to this gpu */
                panel_local[d] = dAT(d,j,j_local);
                ldpan[d] = lddat;
                /* next column */
                j_local2 = j_local+1;
            } else {
                /* the panel belong to another gpu */
                panel_local[d] = d_panel[d];
                ldpan[d] = nb;
                /* next column */
                j_local2 = j_local;
                if ( d < id ) j_local2 ++;
            }
            /* the size of the next column */
            if ( s > (j+1) ) {
                nb0 = nb;
            } else {
                nb0 = n_local[d]-nb*(s/ngpu);
                if ( d < s % ngpu ) nb0 -= nb;
            }
            if ( d == (j+1) % ngpu) {
                /* owns the next column, look-ahead the column */
                nb1 = nb0;
                magmablasSetKernelStream(queues[d][1]);
                
                /* make sure all the pivoting has been applied */
                magma_queue_sync(queues[d][0]);
                trace_gpu_start( d, 1, "gemm", "gemm" );
                /* transpose panel on GPU */
                magmablas_ztranspose( rows, nb, &d_lAP[d][(j%h)*nb*maxm], cols, panel_local[d], ldpan[d] );
                /* synch for remaining update */
                magma_queue_sync(queues[d][1]);
            } else {
                /* update the entire trailing matrix */
                nb1 = n_local[d] - j_local2*nb;
                magmablasSetKernelStream(queues[d][0]);
                
                /* synchronization to make sure panel arrived on gpu */
                magma_queue_sync(queues[d][1]);
                trace_gpu_start( d, 0, "gemm", "gemm" );
                /* transpose panel on GPU */
                magmablas_ztranspose( rows, nb, &d_lAP[d][(j%h)*nb*maxm], cols, panel_local[d], ldpan[d] );
            }
            
            /* gpu updating the trailing matrix */
            magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                         nb1, nb, c_one,
                         panel_local[d],       ldpan[d],
                         dAT(d, j, j_local2), lddat);
            magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                         nb1, m-(j+1)*nb, nb,
                         c_neg_one, dAT(d, j,   j_local2),         lddat,
                                    &(panel_local[d][nb*ldpan[d]]), ldpan[d],
                         c_one,     dAT(d, j+1, j_local2),         lddat );
        
            if ( d == (j+1) % ngpu ) {
                /* Set the local index where the current panel is */
                int loff    = j+1;
                int j_local = (j+1)/ngpu;
                int ldda    = maxm - (j+1)*nb;
                int cols    = m - (j+1)*nb;
                nb0 = min(nb, mindim - (j+1)*nb); /* size of the diagonal block */
                trace_gpu_end( d, 1 );
                
                if ( nb0 > 0 ) {
                    /* transpose the panel for sending it to cpu */
                    trace_gpu_start( d, 1, "comm", "get" );
                    magmablas_ztranspose( nb0, m-(j+1)*nb, dAT(d,loff,j_local), lddat, &d_lAP[d][((j+1)%h)*nb*maxm], ldda );
             
                    /* send the panel to cpu */
                    magma_zgetmatrix_async( cols, nb0,
                                            &d_lAP[d][((j+1)%h)*nb*maxm], ldda,
                                            W(j+1), ldw, queues[d][1] );

                    trace_gpu_end( d, 1 );
                }
            } else {
                trace_gpu_end( d, 0 );
            }
            
            d = (d+1) % ngpu;
        }
    
        /* update the remaining matrix by gpu owning the next panel */
        if ( (j+1) < s ) {
            int j_local = (j+1)/ngpu;
            int rows  = m - (j+1)*nb;
            
            d = (j+1) % ngpu;
            magma_setdevice(d);
            magmablasSetKernelStream(queues[d][0]);
            trace_gpu_start( d, 0, "gemm", "gemm" );

            magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                         n_local[d] - (j_local+1)*nb, nb,
                         c_one, panel_local[d],       ldpan[d],
                                dAT(d,j,j_local+1),  lddat );
            magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                         n_local[d]-(j_local+1)*nb, rows, nb,
                         c_neg_one, dAT(d,j,j_local+1),            lddat,
                                    &(panel_local[d][nb*ldpan[d]]), ldpan[d],
                         c_one,     dAT(d,j+1,  j_local+1),        lddat );
            trace_gpu_end( d, 0 );
        }
    } /* end of for j=1..s */
    /* ------------------------------------------------------------------------------ */
    
    /* Set the GPU number that holds the last panel */
    id = s % ngpu;
    
    /* Set the local index where the last panel is */
    j_local = s/ngpu;
    
    /* size of the last diagonal-block */
    nb0 = min(m - s*nb, n - s*nb);
    rows = m    - s*nb;
    cols = maxm - s*nb;
    
    if ( nb0 > 0 ) {
        magma_setdevice(id);
        
        /* wait for the last panel on cpu */
        magma_queue_sync( queues[id][1] );
    
        /* factor on cpu */
        lapackf77_zgetrf( &rows, &nb0, W(s), &ldw, ipiv+s*nb, &iinfo);
        if ( (*info == 0) && (iinfo > 0) )
            *info = iinfo + s*nb;
        
        /* send the factor to gpus */
        for( d=0; d < ngpu; d++ ) {
            magma_setdevice(d);
            j_local2 = j_local;
            if ( d < id ) j_local2 ++;
            
            if ( d == id || n_local[d] > j_local2*nb ) {
                magma_zsetmatrix_async( rows, nb0,
                                        W(s),     ldw,
                                        &d_lAP[d][(s%h)*nb*maxm],
                                        cols, queues[d][1] );
            }
        }
        
        for( d=0; d < ngpu; d++ ) {
            magma_setdevice(d);
            if ( d == 0 ) {
                for( i=s*nb; i < s*nb + nb0; ++i ) {
                    ipiv[i] += s*nb;
                }
            }
            magmablas_zlaswp_q( lddat, dAT(d,0,0), lddat, s*nb + 1, s*nb + nb0, ipiv, 1, queues[d][0] );
        }
        
        for( d=0; d < ngpu; d++ ) {
            magma_setdevice(d);
            magmablasSetKernelStream(queues[d][1]);
            
            /* wait for the pivoting to be done */
            magma_queue_sync( queues[d][0] );
            
            j_local2 = j_local;
            if ( d < id ) j_local2++;
            if ( d == id ) {
                /* the panel belond to this gpu */
                panel_local[d] = dAT(d,s,j_local);
                
                /* next column */
                nb1 = n_local[d] - j_local*nb-nb0;
                
                magmablas_ztranspose( rows, nb0, &d_lAP[d][(s%h)*nb*maxm], cols, panel_local[d], lddat );
                
                if ( nb1 > 0 ) {
                    magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                                 nb1, nb0, c_one,
                                 panel_local[d],        lddat,
                                 dAT(d,s,j_local)+nb0, lddat);
                }
            } else if ( n_local[d] > j_local2*nb ) {
                /* the panel belong to another gpu */
                panel_local[d] = d_panel[d];
                
                /* next column */
                nb1 = n_local[d] - j_local2*nb;
                
                magmablas_ztranspose( rows, nb0, &d_lAP[d][(s%h)*nb*maxm], cols, panel_local[d], nb );
                magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             nb1, nb0, c_one,
                             panel_local[d],     nb,
                             dAT(d,s,j_local2), lddat);
            }
        }
    } /* if ( nb0 > 0 ) */
    
    /* clean up */
    trace_finalize( "zgetrf_mgpu.svg","trace.css" );
    for( d=0; d < ngpu; d++ ) {
        magma_setdevice(d);
        magma_queue_sync( queues[d][0] );
        magma_queue_sync( queues[d][1] );
    }
    magma_setdevice( orig_dev );
    magmablasSetKernelStream( orig_stream );
    
    timer_start( time );
    timer_printf("\n Performance %f GFlop/s\n", FLOPS_ZGETRF(m,n) / 1e9 / time );

    return *info;
} /* magma_zgetrf2_mgpu */
Exemple #13
0
extern "C" magma_int_t
magma_zgetrf_gpu(magma_int_t m, magma_int_t n, 
                 magmaDoubleComplex *dA, magma_int_t ldda,
                 magma_int_t *ipiv, magma_int_t *info)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    ZGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
       A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.
    If the current stream is NULL, this version replaces it with user defined
    stream to overlap computation with communication. 

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    LDDA     (input) INTEGER
            The leading dimension of the array A.  LDDA >= max(1,M).

    IPIV    (output) INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
            > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.
    =====================================================================    */

    #define dAT(i,j) (dAT + (i)*nb*lddat + (j)*nb)

    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;

    magma_int_t iinfo, nb;
    magma_int_t maxm, maxn, mindim;
    magma_int_t i, rows, cols, s, lddat, lddwork;
    magmaDoubleComplex *dAT, *dAP, *work;

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (ldda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    nb     = magma_get_zgetrf_nb(m);
    s      = mindim / nb;

    if (nb <= 1 || nb >= min(m,n)) {
        /* Use CPU code. */
        magma_zmalloc_cpu( &work, m * n );
        if ( work == NULL ) {
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        magma_zgetmatrix( m, n, dA, ldda, work, m );
        lapackf77_zgetrf(&m, &n, work, &m, ipiv, info);
        magma_zsetmatrix( m, n, work, m, dA, ldda );
        magma_free_cpu(work);
    }
    else {
        /* Use hybrid blocked code. */
        maxm = ((m + 31)/32)*32;
        maxn = ((n + 31)/32)*32;

        lddat   = maxn;
        lddwork = maxm;

        dAT = dA;

        if (MAGMA_SUCCESS != magma_zmalloc( &dAP, nb*maxm )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }

        if ( m == n ) {
            lddat = ldda;
            magmablas_ztranspose_inplace( m, dAT, ldda );
        }
        else {
            if (MAGMA_SUCCESS != magma_zmalloc( &dAT, maxm*maxn )) {
                magma_free( dAP );
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }
            magmablas_ztranspose2( dAT, lddat, dA, ldda, m, n );
        }

        if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, maxm*nb )) {
            magma_free( dAP );
            if ( ! (m == n))
                magma_free( dAT );
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }

        /* Define user stream if current stream is NULL */ 
        cudaStream_t stream[2], current_stream;
        magmablasGetKernelStream(&current_stream);

        magma_queue_create( &stream[0] );
        if (current_stream == NULL) {
           magma_queue_create( &stream[1] );
           magmablasSetKernelStream(stream[1]);
        }
        else
           stream[1] = current_stream;
  
        for( i=0; i<s; i++ )
            {
                // download i-th panel
                cols = maxm - i*nb;
                //magmablas_ztranspose( dAP, cols, dAT(i,i), lddat, nb, cols   );
                magmablas_ztranspose2( dAP, cols, dAT(i,i), lddat, nb, m-i*nb );

                // make sure that that the transpose has completed
                magma_queue_sync( stream[1] );
                magma_zgetmatrix_async( m-i*nb, nb, dAP, cols, work, lddwork,
                                        stream[0]);

                if ( i>0 ){
                    magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                                 n - (i+1)*nb, nb, 
                                 c_one, dAT(i-1,i-1), lddat, 
                                        dAT(i-1,i+1), lddat );
                    magma_zgemm( MagmaNoTrans, MagmaNoTrans, 
                                 n-(i+1)*nb, m-i*nb, nb, 
                                 c_neg_one, dAT(i-1,i+1), lddat, 
                                            dAT(i,  i-1), lddat, 
                                 c_one,     dAT(i,  i+1), lddat );
                }

                // do the cpu part
                rows = m - i*nb;
                magma_queue_sync( stream[0] );
                lapackf77_zgetrf( &rows, &nb, work, &lddwork, ipiv+i*nb, &iinfo);
                if ( (*info == 0) && (iinfo > 0) )
                    *info = iinfo + i*nb;

                // upload i-th panel
                magma_zsetmatrix_async( m-i*nb, nb, work, lddwork, dAP, maxm,
                                        stream[0]);

                magmablas_zpermute_long2( n, dAT, lddat, ipiv, nb, i*nb );

                magma_queue_sync( stream[0] );
                //magmablas_ztranspose(dAT(i,i), lddat, dAP, maxm, cols, nb);
                magmablas_ztranspose2(dAT(i,i), lddat, dAP, maxm, m-i*nb, nb);

                // do the small non-parallel computations (next panel update)
                if ( s > (i+1) ) {
                    magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                                 nb, nb, 
                                 c_one, dAT(i, i  ), lddat,
                                        dAT(i, i+1), lddat);
                    magma_zgemm( MagmaNoTrans, MagmaNoTrans, 
                                 nb, m-(i+1)*nb, nb, 
                                 c_neg_one, dAT(i,   i+1), lddat,
                                            dAT(i+1, i  ), lddat, 
                                 c_one,     dAT(i+1, i+1), lddat );
                }
                else {
                    magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                                 n-s*nb, nb, 
                                 c_one, dAT(i, i  ), lddat,
                                        dAT(i, i+1), lddat);
                    magma_zgemm( MagmaNoTrans, MagmaNoTrans, 
                                 n-(i+1)*nb, m-(i+1)*nb, nb,
                                 c_neg_one, dAT(i,   i+1), lddat,
                                            dAT(i+1, i  ), lddat, 
                                 c_one,     dAT(i+1, i+1), lddat );
                }
            }

        magma_int_t nb0 = min(m - s*nb, n - s*nb);
        rows = m - s*nb;
        cols = maxm - s*nb;

        magmablas_ztranspose2( dAP, maxm, dAT(s,s), lddat, nb0, rows);
        magma_zgetmatrix( rows, nb0, dAP, maxm, work, lddwork );

        // do the cpu part
        lapackf77_zgetrf( &rows, &nb0, work, &lddwork, ipiv+s*nb, &iinfo);
        if ( (*info == 0) && (iinfo > 0) )
            *info = iinfo + s*nb;
        magmablas_zpermute_long2( n, dAT, lddat, ipiv, nb0, s*nb );

        // upload i-th panel
        magma_zsetmatrix( rows, nb0, work, lddwork, dAP, maxm );
        magmablas_ztranspose2( dAT(s,s), lddat, dAP, maxm, rows, nb0);

        magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                     n-s*nb-nb0, nb0,
                     c_one, dAT(s,s),     lddat, 
                            dAT(s,s)+nb0, lddat);

        if ( m == n ) {
            magmablas_ztranspose_inplace( m, dAT, lddat );
        }
        else {
            magmablas_ztranspose2( dA, ldda, dAT, lddat, n, m );
            magma_free( dAT );
        }

        magma_free( dAP );
        magma_free_pinned( work );
    
        magma_queue_destroy( stream[0] );
        if (current_stream == NULL) {
            magma_queue_destroy( stream[1] );
            magmablasSetKernelStream(NULL);
        }
    }
    return *info;
}   /* End of MAGMA_ZGETRF_GPU */
Exemple #14
0
extern "C" magma_int_t
magma_zgetrf_mgpu(
    magma_int_t ngpu, 
    magma_int_t m, magma_int_t n, 
    magmaDoubleComplex_ptr *d_lA, size_t dlA_offset, magma_int_t ldda,
    magma_int_t *ipiv,
    magma_queue_t *queues,
    magma_int_t *info)
{
/*  -- clMAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date November 2014

    Purpose
    =======
    ZGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
        A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.

    Arguments
    =========
    NUM_GPUS 
            (input) INTEGER
            The number of GPUS to be used for the factorization.

    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    LDDA     (input) INTEGER
            The leading dimension of the array A.  LDDA >= max(1,M).

    IPIV    (output) INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
            > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.
    =====================================================================    */

    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;

    magma_int_t nb, n_local[MagmaMaxGPUs];
    magma_int_t maxm, mindim;
    magma_int_t d, d2, lddat, ldwork;
    magmaDoubleComplex_ptr d_lAT[MagmaMaxGPUs];
    magmaDoubleComplex_ptr d_panel[MagmaMaxGPUs];
    magmaDoubleComplex *work;

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -2;
    else if (n < 0)
        *info = -3;
    else if (ldda < max(1,m))
        *info = -5;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    nb     = magma_get_zgetrf_nb(m);

    if (nb <= 1 || nb >= n) {
        /* Use CPU code. */
        magma_zmalloc_cpu( &work, m * n );
        if ( work == NULL ) {
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        magma_zgetmatrix( m, n, d_lA[0], 0, ldda, work, m, queues[0] );
        lapackf77_zgetrf(&m, &n, work, &m, ipiv, info);
        magma_zsetmatrix( m, n, work, m, d_lA[0], 0, ldda, queues[0] );
        magma_free_cpu(work);
    } else {
        /* Use hybrid blocked code. */
        maxm = ((m + 31)/32)*32;
        if ( ngpu > ceil((double)n/nb) ) {
            printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu );
            *info = -1;
            return *info;
        }

        /* allocate workspace for each GPU */
        lddat = (n+nb-1)/nb;                 /* number of block columns         */
        lddat = (lddat+ngpu-1)/ngpu;         /* number of block columns per GPU */
        lddat = nb*lddat;                    /* number of columns per GPU       */
        lddat = ((lddat+31)/32)*32;          /* make it a multiple of 32        */
        for( d=0; d < ngpu; d++ ) {
            /* local-n and local-ld */
            n_local[d] = ((n/nb)/ngpu)*nb;
            if (d < (n/nb)%ngpu)
                n_local[d] += nb;
            else if (d == (n/nb)%ngpu)
                n_local[d] += n%nb;

            /* workspaces */
            if (MAGMA_SUCCESS != magma_zmalloc( &d_panel[d], 3*nb*maxm )) {
                for( d2=0; d2 < d; d2++ ) {
                    magma_free( d_panel[d2] );
                    magma_free( d_lAT[d2]   );
                }
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }

            /* local-matrix storage */
            if (MAGMA_SUCCESS != magma_zmalloc( &d_lAT[d], lddat*maxm )) {
                for( d2=0; d2 <= d; d2++ ) {
                    magma_free( d_panel[d2] );
                }
                for( d2=0; d2 < d; d2++ ) {
                    magma_free( d_lAT[d2] );
                }
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }

            magmablas_ztranspose( m, n_local[d], d_lA[d], 0, ldda, d_lAT[d], 0, lddat, queues[2*d+1] );
        }
        for( d=0; d < ngpu; d++ ) {
            magma_queue_sync(queues[2*d+1]);
        }

        /* cpu workspace */
        ldwork = maxm;
        if (MAGMA_SUCCESS != magma_zmalloc_cpu( &work, ldwork*nb*ngpu )) {
            for( d=0; d < ngpu; d++ ) {
                magma_free( d_panel[d] );
                magma_free( d_lAT[d]   );
            }
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }

        /* calling multi-gpu interface with allocated workspaces and queues */
        magma_zgetrf2_mgpu(ngpu, m, n, nb, 0, d_lAT, 0, lddat, ipiv, d_panel, 0, work, maxm,
                           queues, info);

        /* clean up */
        for( d=0; d < ngpu; d++ ) {
            /* save on output */
            magmablas_ztranspose( n_local[d], m, d_lAT[d], 0, lddat, d_lA[d], 0, ldda, queues[2*d+1] );
            magma_queue_sync(queues[2*d+1]);
            magma_free( d_lAT[d]   );
            magma_free( d_panel[d] );
        } /* end of for d=1,..,ngpu */
        magma_free_cpu( work );
    }
      
    return *info;       
}
Exemple #15
0
extern "C" magma_int_t
magma_zgetrf2_gpu(
    magma_int_t m, magma_int_t n,
    magmaDoubleComplex_ptr dA, size_t dA_offset, magma_int_t ldda,
    magma_int_t *ipiv,
    magma_queue_t queues[2],
    magma_int_t *info )
{
/*  -- clMAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date November 2014

    Purpose
    =======
    ZGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
        A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    LDDA     (input) INTEGER
            The leading dimension of the array A.  LDDA >= max(1,M).

    IPIV    (output) INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
            > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.
    =====================================================================    */

    #define  dA(i_, j_) dA,   dA_offset  + (i_)*nb       + (j_)*nb*ldda
    #define dAT(i_, j_) dAT,  dAT_offset + (i_)*nb*lddat + (j_)*nb
    #define dAP(i_, j_) dAP,               (i_)          + (j_)*maxm
    #define work(i_)   (work + (i_))

    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;

    magma_int_t iinfo, nb;
    magma_int_t maxm, maxn, mindim;
    magma_int_t i, j, rows, s, lddat, ldwork;
    magmaDoubleComplex_ptr dAT, dAP;
    magmaDoubleComplex *work;
    size_t dAT_offset;

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (ldda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    nb     = magma_get_zgetrf_nb(m);
    s      = mindim / nb;

    if (nb <= 1 || nb >= min(m,n)) {
        /* Use CPU code. */
        if ( MAGMA_SUCCESS != magma_zmalloc_cpu( &work, m*n )) {
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        magma_zgetmatrix( m, n, dA(0,0), ldda, work(0), m, queues[0] );
        lapackf77_zgetrf( &m, &n, work, &m, ipiv, info );
        magma_zsetmatrix( m, n, work(0), m, dA(0,0), ldda, queues[0] );
        magma_free_cpu( work );
    }
    else {
        /* Use hybrid blocked code. */
        maxm = ((m + 31)/32)*32;
        maxn = ((n + 31)/32)*32;

        if ( MAGMA_SUCCESS != magma_zmalloc( &dAP, nb*maxm )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }

        // square matrices can be done in place;
        // rectangular requires copy to transpose
        if ( m == n ) {
            dAT = dA;
            dAT_offset = dA_offset;
            lddat = ldda;
            magmablas_ztranspose_inplace( m, dAT(0,0), lddat, queues[0] );
        }
        else {
            lddat = maxn;  // N-by-M
            dAT_offset = 0;
            if ( MAGMA_SUCCESS != magma_zmalloc( &dAT, lddat*maxm )) {
                magma_free( dAP );
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }
            magmablas_ztranspose( m, n, dA(0,0), ldda, dAT(0,0), lddat, queues[0] );
        }

        ldwork = maxm;
        /*
        if ( MAGMA_SUCCESS != magma_zmalloc_cpu( &work, ldwork*nb ) ) {
            magma_free( dAP );
            if ( dA != dAT )
                magma_free( dAT );

            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        */
        cl_mem work_mapped = clCreateBuffer( gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, ldwork*nb * sizeof(magmaDoubleComplex), NULL, NULL );
        work = (magmaDoubleComplex*) clEnqueueMapBuffer( queues[0], work_mapped, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, ldwork*nb * sizeof(magmaDoubleComplex), 0, NULL, NULL, NULL );

        for( j=0; j < s; j++ ) {
            // download j-th panel
            magmablas_ztranspose( nb, m-j*nb, dAT(j,j), lddat, dAP(0,0), maxm, queues[0] );
            clFlush( queues[0] );
            magma_queue_sync( queues[0] );
            magma_zgetmatrix_async( m-j*nb, nb, dAP(0,0), maxm, work(0), ldwork, queues[1], NULL );
            clFlush( queues[1] );
            if ( j > 0 ) {
                magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n - (j+1)*nb, nb,
                             c_one, dAT(j-1,j-1), lddat,
                             dAT(j-1,j+1), lddat, queues[0] );
                magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(j+1)*nb, m-j*nb, nb,
                             c_neg_one, dAT(j-1,j+1), lddat,
                                        dAT(j,  j-1), lddat,
                             c_one,     dAT(j,  j+1), lddat, queues[0] );
            }

            magma_queue_sync( queues[1] );
            // do the cpu part
            rows = m - j*nb;
            lapackf77_zgetrf( &rows, &nb, work, &ldwork, ipiv+j*nb, &iinfo );
            if ( *info == 0 && iinfo > 0 )
                *info = iinfo + j*nb;

            for( i=j*nb; i < j*nb + nb; ++i ) {
                ipiv[i] += j*nb;
            }
            magmablas_zlaswp( n, dAT(0,0), lddat, j*nb + 1, j*nb + nb, ipiv, 1, queues[0] );
            clFlush( queues[0] );

            // upload j-th panel
            magma_zsetmatrix_async( m-j*nb, nb, work(0), ldwork, dAP(0,0), maxm, queues[1], NULL );
            magma_queue_sync( queues[1] );
            magmablas_ztranspose( m-j*nb, nb, dAP(0,0), maxm, dAT(j,j), lddat, queues[0] );
            clFlush( queues[0] );
            
            // do the small non-parallel computations (next panel update)
            if ( s > (j+1) ) {
                magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             nb, nb,
                             c_one, dAT(j, j  ), lddat,
                             dAT(j, j+1), lddat, queues[0] );
                magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                             nb, m-(j+1)*nb, nb,
                             c_neg_one, dAT(j,   j+1), lddat,
                                        dAT(j+1, j  ), lddat,
                             c_one,     dAT(j+1, j+1), lddat, queues[0] );
            }
            else {
                magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n-s*nb, nb,
                             c_one, dAT(j, j  ), lddat,
                             dAT(j, j+1), lddat, queues[0] );
                magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(j+1)*nb, m-(j+1)*nb, nb,
                             c_neg_one, dAT(j,   j+1), lddat,
                                        dAT(j+1, j  ), lddat,
                             c_one,     dAT(j+1, j+1), lddat, queues[0] );
            }
        }

        magma_int_t nb0 = min( m - s*nb, n - s*nb );
        if ( nb0 > 0 ) {
            rows = m - s*nb;
    
            magmablas_ztranspose( nb0, rows, dAT(s,s), lddat, dAP(0,0), maxm, queues[0] );
            clFlush( queues[0] );
            magma_queue_sync( queues[0] );
            magma_zgetmatrix_async( rows, nb0, dAP(0,0), maxm, work(0), ldwork, queues[1], NULL );
            magma_queue_sync( queues[1] );
            
            // do the cpu part
            lapackf77_zgetrf( &rows, &nb0, work, &ldwork, ipiv+s*nb, &iinfo );
            if ( (*info == 0) && (iinfo > 0) )
                *info = iinfo + s*nb;
            
            for( i=s*nb; i < s*nb + nb0; ++i ) {
                ipiv[i] += s*nb;
            }
            magmablas_zlaswp( n, dAT(0,0), lddat, s*nb + 1, s*nb + nb0, ipiv, 1, queues[0] );
            clFlush( queues[0] );
            
            // upload j-th panel
            magma_zsetmatrix_async( rows, nb0, work(0), ldwork, dAP(0,0), maxm, queues[1], NULL );
            magma_queue_sync( queues[1] );
            magmablas_ztranspose( rows, nb0, dAP(0,0), maxm, dAT(s,s), lddat, queues[0] );
            clFlush( queues[0] );
    
            magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                         n-s*nb-nb0, nb0,
                         c_one, dAT(s,s),     lddat,
                         dAT(s,s)+nb0, lddat, queues[0] );
        }

        // undo transpose
        if ( dA == dAT ) {
            magmablas_ztranspose_inplace( m, dAT(0,0), lddat, queues[0] );
        }
        else {
            magmablas_ztranspose( n, m, dAT(0,0), lddat, dA(0,0), ldda, queues[0] );
            magma_free( dAT );
        }
        
        magma_queue_sync( queues[0] );
        magma_queue_sync( queues[1] );
        magma_free( dAP );
        // magma_free_cpu( work );
        clEnqueueUnmapMemObject( queues[0], work_mapped, work, 0, NULL, NULL );
        clReleaseMemObject( work_mapped );
    }

    return *info;
} /* magma_zgetrf_gpu */
Exemple #16
0
extern "C" magma_err_t
magma_zgetrf_msub(magma_int_t trans, magma_int_t num_subs, magma_int_t num_gpus, 
                 magma_int_t m, magma_int_t n, 
                 magmaDoubleComplex_ptr *d_lA, size_t dlA_offset, magma_int_t ldda,
                 magma_int_t *ipiv, magma_int_t *info,
                 magma_queue_t *queues)
{
/*  -- clMAGMA (version 1.1.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date January 2014

    Purpose
    =======

    ZGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
       A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.

    Arguments
    =========

    NUM_GPUS 
            (input) INTEGER
            The number of GPUS to be used for the factorization.

    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    LDDA     (input) INTEGER
            The leading dimension of the array A.  LDDA >= max(1,M).

    IPIV    (output) INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
            > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.
    =====================================================================    */


    magma_int_t maxm, tot_subs = num_subs*num_gpus;
    magma_int_t i, j, d, lddat;
    /* submatrix info */
    magma_int_t nb, n_local[ MagmaMaxSubs * MagmaMaxGPUs ];
    magmaDoubleComplex_ptr d_lAT[ MagmaMaxSubs * MagmaMaxGPUs ];
    /* local workspace per GPU */
    magmaDoubleComplex_ptr d_panel[ MagmaMaxGPUs ];
    magmaDoubleComplex_ptr d_lAP[ MagmaMaxGPUs ];
    magmaDoubleComplex *work;

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -2;
    else if (n < 0)
        *info = -3;
    else if (trans == MagmaTrans && ldda < max(1,m))
        *info = -5;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    nb = magma_get_zgetrf_nb(m);

    if (nb <= 1 || nb >= n) {
        /* Use CPU code. */
        magma_zmalloc_cpu( &work, m * n );
        if (work == NULL) {
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        magma_zgetmatrix( m, n, d_lA[0], 0, ldda, work, 0, m, queues[0] );
        lapackf77_zgetrf(&m, &n, work, &m, ipiv, info);
        magma_zsetmatrix( m, n, work, 0, m, d_lA[0], 0, ldda, queues[0] );
        magma_free_cpu(work);
    } else {
        /* Use hybrid blocked code. */
        maxm = ((m + 31)/32)*32;
        if (tot_subs > ceil((double)n/nb)) {
            printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) tot_subs );
            *info = -1;
            return *info;
        }

        /* allocate workspace for each GPU */
        lddat = n/nb;           /* number of block columns         */
        lddat = lddat/tot_subs; /* number of block columns per GPU */
        lddat = nb*lddat;       /* number of columns per GPU       */
        if (lddat * tot_subs < n) {
            /* left over */
            if (n-lddat*tot_subs >= nb) {
                lddat += nb;
            } else {
                lddat += (n-lddat*tot_subs)%nb;
            }
        }
        lddat = ((lddat+31)/32)*32; /* make it a multiple of 32 */
        /* allocating workspace */
        for (d=0; d<num_gpus; d++) {
            //#define SINGLE_GPU_PER_CONTEXT
            #ifdef SINGLE_GPU_PER_CONTEXT
            if ((MAGMA_SUCCESS != magma_zmalloc_mgpu( d, &d_panel[d], (2+num_gpus)*nb*maxm ))  ||
                (MAGMA_SUCCESS != magma_zmalloc_mgpu( d, &d_lAP[d],   (2+num_gpus)*nb*maxm )) ) {
            #else
            if ((MAGMA_SUCCESS != magma_zmalloc( &d_panel[d], (2+num_gpus)*nb*maxm ))  ||
                (MAGMA_SUCCESS != magma_zmalloc( &d_lAP[d], (2+num_gpus)*nb*maxm )) ) {
            #endif
                for( i=0; i<d; i++ ) {
                    magma_free( d_panel[i] );
                    magma_free( d_lAP[i] );
                }
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }
        }
        /* transposing the local matrix */
        for (i=0; i<tot_subs; i++) {
            /* local-n and local-ld */
            n_local[i] = ((n/nb)/tot_subs)*nb;
            if (i < (n/nb)%tot_subs)
               n_local[i] += nb;
            else if (i == (n/nb)%tot_subs)
               n_local[i] += n%nb;

            /* local-matrix storage */
            if (trans == MagmaNoTrans) {
                d_lAT[i] = d_lA[i];
            } else {
                if ((m == n_local[i]) && (m%32 == 0) && (ldda%32 == 0) && (ldda == lddat)) {
                    d_lAT[i] = d_lA[i];
                    magma_ztranspose_inplace( d_lA[i], 0, ldda, ldda, queues[2*(i%num_gpus)+1] );
                } else {
                    #ifdef SINGLE_GPU_PER_CONTEXT
                    if (MAGMA_SUCCESS != magma_zmalloc_mgpu( i%num_gpus, &d_lAT[i], lddat*maxm )) {
                    #else
                    if (MAGMA_SUCCESS != magma_zmalloc( &d_lAT[i], lddat*maxm )) {
                    #endif
                        for (j=0; j<=i; j++) {
                            magma_free( d_panel[j] );
                            magma_free( d_lAP[j] );
                        }
                        for (j=0; j<i; j++) {
                            if (d_lAT[j] != d_lA[j]) magma_free( d_lAT[j] );
                        }
                        *info = MAGMA_ERR_DEVICE_ALLOC;
                        return *info;
                    }
                    magma_ztranspose2(d_lAT[i], 0, lddat, d_lA[i], 0, ldda, m, n_local[i], queues[2*(i%num_gpus)+1]);
                }
            }
        }
        if (trans == MagmaNoTrans) {
            for (d=0; d<num_gpus; d++){
                magma_queue_sync(queues[2*d+1]);
            }
        }

        /* cpu workspace */
        #ifdef USE_PINNED_CLMEMORY
        cl_mem buffer = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(magmaDoubleComplex)*maxm*nb*(1+num_gpus), NULL, NULL);
        for (d=0; d<num_gpus; d++) {
            work = (magmaDoubleComplex*)clEnqueueMapBuffer(queues[2*d], buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0,
                                                           sizeof(magmaDoubleComplex)*maxm*nb*(1+num_gpus), 0, NULL, NULL, NULL);
        }
        #else
        if (MAGMA_SUCCESS != magma_zmalloc_cpu( &work, maxm*nb*(1+num_gpus) )) {
            for(d=0; d<num_gpus; d++ ) magma_free( d_panel[d] );
            for(d=0; d<tot_subs; d++ ) {
                if( d_lAT[d] != d_lA[d] ) magma_free( d_lAT[d] );
            }
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        #endif

        /* calling multi-gpu interface with allocated workspaces and streams */
        magma_zgetrf2_msub(num_subs, num_gpus, m, n, nb, 0, d_lAT, 0, lddat, ipiv, d_lAP, d_panel, 0, work, maxm,
                           info, queues);

        /* save on output */
        for (d=0; d<tot_subs; d++) {
            if (trans == MagmaNoTrans) {
                //magma_zcopymatrix( n_local[d], m, d_lAT[d], 0, lddat, d_lA[d], 0, ldda, queues[2*d+1] );
            } else {
                if (d_lAT[d] == d_lA[d]) {
                    magma_ztranspose_inplace( d_lA[d], 0, ldda, ldda, queues[2*(d%num_gpus)+1] );
                } else {
                    magma_ztranspose2( d_lA[d], 0, ldda, d_lAT[d], 0, lddat, n_local[d], m, queues[2*(d%num_gpus)+1] );
                }
            }
        }
        /* clean up */
        for (d=0; d<num_gpus; d++) {
            magma_queue_sync(queues[2*d+1]);
            magma_free( d_panel[d] );
            magma_free( d_lAP[d] );
            d_panel[d] = d_lAP[d] = NULL;
        } 
        for (d=0; d<tot_subs; d++) {
            if (d_lAT[d] != d_lA[d]) {
                magma_free( d_lAT[d] ); 
                d_lAT[d] = NULL;
            }
        }
        #ifdef USE_PINNED_CLMEMORY
        for (d=0; d<num_gpus; d++) {
            clEnqueueUnmapMemObject(queues[2*d], buffer, work, 0, NULL, NULL);
        }
        clReleaseMemObject( buffer );
        #else
        magma_free_cpu( work );
        #endif
        work = NULL;
      }
      return *info;       
      /* End of MAGMA_ZGETRF_MSUB */
}
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing zgetrf_batched
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    real_Double_t   gflops, magma_perf, magma_time, cublas_perf=0., cublas_time=0., cpu_perf=0, cpu_time=0;
    double          error;
    magma_int_t cublas_enable = 0;
    magmaDoubleComplex *h_A, *h_R;
    magmaDoubleComplex *dA_magma;
    magmaDoubleComplex **dA_array = NULL;

    magma_int_t     **dipiv_array = NULL;
    magma_int_t     *ipiv, *cpu_info;
    magma_int_t     *dipiv_magma, *dinfo_magma;
    
    magma_int_t M, N, n2, lda, ldda, min_mn, info;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t batchCount;
    magma_int_t status = 0;

    magma_opts opts( MagmaOptsBatched );
    opts.parse_opts( argc, argv );
    //opts.lapack |= opts.check;

    batchCount = opts.batchcount;
    magma_int_t columns;
    
    double tol = opts.tolerance * lapackf77_dlamch("E");

    printf("%% BatchCount   M     N    CPU Gflop/s (ms)   MAGMA Gflop/s (ms)   CUBLAS Gflop/s (ms)   ||PA-LU||/(||A||*N)\n");
    printf("%%==========================================================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[itest];
            N = opts.nsize[itest];
            min_mn = min(M, N);
            lda    = M;
            n2     = lda*N * batchCount;
            ldda   = magma_roundup( M, opts.align );  // multiple of 32 by default
            gflops = FLOPS_ZGETRF( M, N ) / 1e9 * batchCount;
            
            TESTING_MALLOC_CPU( cpu_info, magma_int_t, batchCount );
            TESTING_MALLOC_CPU( ipiv, magma_int_t,     min_mn * batchCount );
            TESTING_MALLOC_CPU( h_A,  magmaDoubleComplex, n2 );
            TESTING_MALLOC_CPU( h_R,  magmaDoubleComplex, n2 );
            
            TESTING_MALLOC_DEV( dA_magma,  magmaDoubleComplex, ldda*N * batchCount );
            TESTING_MALLOC_DEV( dipiv_magma,  magma_int_t, min_mn * batchCount );
            TESTING_MALLOC_DEV( dinfo_magma,  magma_int_t, batchCount );

            TESTING_MALLOC_DEV( dA_array,    magmaDoubleComplex*, batchCount );
            TESTING_MALLOC_DEV( dipiv_array, magma_int_t*,        batchCount );

            /* Initialize the matrix */
            lapackf77_zlarnv( &ione, ISEED, &n2, h_A );
            // make A diagonally dominant, to not need pivoting
            for( int s=0; s < batchCount; ++s ) {
                for( int i=0; i < min_mn; ++i ) {
                    h_A[ i + i*lda + s*lda*N ] = MAGMA_Z_MAKE(
                        MAGMA_Z_REAL( h_A[ i + i*lda + s*lda*N ] ) + N,
                        MAGMA_Z_IMAG( h_A[ i + i*lda + s*lda*N ] ));
                }
            }
            columns = N * batchCount;
            lapackf77_zlacpy( MagmaFullStr, &M, &columns, h_A, &lda, h_R, &lda );
            magma_zsetmatrix( M, columns, h_R, lda, dA_magma, ldda );
            
            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            magma_zset_pointer( dA_array, dA_magma, ldda, 0, 0, ldda*N, batchCount, opts.queue );
            magma_time = magma_sync_wtime( opts.queue );
            info = magma_zgetrf_nopiv_batched( M, N, dA_array, ldda, dinfo_magma, batchCount, opts.queue);
            magma_time = magma_sync_wtime( opts.queue ) - magma_time;
            magma_perf = gflops / magma_time;
            // check correctness of results throught "dinfo_magma" and correctness of argument throught "info"
            magma_getvector( batchCount, sizeof(magma_int_t), dinfo_magma, 1, cpu_info, 1);
            for (int i=0; i < batchCount; i++)
            {
                if (cpu_info[i] != 0 ) {
                    printf("magma_zgetrf_batched matrix %d returned internal error %d\n", i, (int)cpu_info[i] );
                }
            }
            if (info != 0) {
                printf("magma_zgetrf_batched returned argument error %d: %s.\n",
                        (int) info, magma_strerror( info ));
            }

            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            if ( opts.lapack ) {
                cpu_time = magma_wtime();
                for (int i=0; i < batchCount; i++) {
                    lapackf77_zgetrf(&M, &N, h_A + i*lda*N, &lda, ipiv + i * min_mn, &info);
                    assert( info == 0 );
                }
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
                if (info != 0) {
                    printf("lapackf77_zgetrf returned error %d: %s.\n",
                           (int) info, magma_strerror( info ));
                }
            }
            
            /* =====================================================================
               Check the factorization
               =================================================================== */
            if ( opts.lapack ) {
                printf("%10d %5d %5d   %7.2f (%7.2f)    %7.2f (%7.2f)     %7.2f (%7.2f)",
                       (int) batchCount, (int) M, (int) N, cpu_perf, cpu_time*1000., magma_perf, magma_time*1000., cublas_perf*cublas_enable, cublas_time*1000.*cublas_enable  );
            }
            else {
                printf("%10d %5d %5d     ---   (  ---  )    %7.2f (%7.2f)     %7.2f (%7.2f)",
                       (int) batchCount, (int) M, (int) N, magma_perf, magma_time*1000., cublas_perf*cublas_enable, cublas_time*1000.*cublas_enable );
            }

            if ( opts.check ) {
                // initialize ipiv to 1, 2, 3, ...
                for (int i=0; i < batchCount; i++)
                {
                    for (int k=0; k < min_mn; k++) {
                        ipiv[i*min_mn+k] = k+1;
                    }
                }

                magma_zgetmatrix( M, N*batchCount, dA_magma, ldda, h_A, lda );
                error = 0;
                for (int i=0; i < batchCount; i++)
                {
                    double err;
                    err = get_LU_error( M, N, h_R + i * lda*N, lda, h_A + i * lda*N, ipiv + i * min_mn);
                    if ( isnan(err) || isinf(err) ) {
                        error = err;
                        break;
                    }
                    error = max( err, error );
                }
                bool okay = (error < tol);
                status += ! okay;
                printf("   %8.2e  %s\n", error, (okay ? "ok" : "failed") );
            }
            else {
                printf("     ---  \n");
            }
            
            TESTING_FREE_CPU( cpu_info );
            TESTING_FREE_CPU( ipiv );
            TESTING_FREE_CPU( h_A );
            TESTING_FREE_CPU( h_R );

            TESTING_FREE_DEV( dA_magma );
            TESTING_FREE_DEV( dinfo_magma );
            TESTING_FREE_DEV( dipiv_magma );
            TESTING_FREE_DEV( dipiv_array );
            TESTING_FREE_DEV( dA_array );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }
    
    opts.cleanup();
    TESTING_FINALIZE();
    return status;
}
Exemple #18
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing ztrsm
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    real_Double_t   gflops, cublas_perf, cublas_time, cpu_perf=0, cpu_time=0;
    double          cublas_error, normA, normx, normr, work[1];
    magma_int_t N, info;
    magma_int_t sizeA;
    magma_int_t lda, ldda;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t *ipiv;

    magmaDoubleComplex *h_A, *h_b, *h_x, *h_xcublas;
    magmaDoubleComplex_ptr d_A, d_x;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magma_int_t status = 0;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    double tol = opts.tolerance * lapackf77_dlamch("E");
    
    printf("uplo = %s, transA = %s, diag = %s\n",
           lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag) );
    printf("    N  CUBLAS Gflop/s (ms)   CPU Gflop/s (ms)   CUBLAS error\n");
    printf("============================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            N = opts.nsize[itest];
            gflops = FLOPS_ZTRSM(opts.side, N, 1) / 1e9;
            lda    = N;
            ldda   = ((lda+31)/32)*32;
            sizeA  = lda*N;
            
            TESTING_MALLOC_CPU( ipiv,      magma_int_t,        N     );
            TESTING_MALLOC_CPU( h_A,       magmaDoubleComplex, lda*N );
            TESTING_MALLOC_CPU( h_b,       magmaDoubleComplex, N     );
            TESTING_MALLOC_CPU( h_x,       magmaDoubleComplex, N     );
            TESTING_MALLOC_CPU( h_xcublas, magmaDoubleComplex, N     );
            
            TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N );
            TESTING_MALLOC_DEV( d_x, magmaDoubleComplex, N      );
            
            /* Initialize the matrices */
            /* Factor A into LU to get well-conditioned triangular matrix.
             * Copy L to U, since L seems okay when used with non-unit diagonal
             * (i.e., from U), while U fails when used with unit diagonal. */
            lapackf77_zlarnv( &ione, ISEED, &sizeA, h_A );
            lapackf77_zgetrf( &N, &N, h_A, &lda, ipiv, &info );
            for( int j = 0; j < N; ++j ) {
                for( int i = 0; i < j; ++i ) {
                    *h_A(i,j) = *h_A(j,i);
                }
            }
            
            lapackf77_zlarnv( &ione, ISEED, &N, h_b );
            blasf77_zcopy( &N, h_b, &ione, h_x, &ione );
            
            /* =====================================================================
               Performs operation using CUBLAS
               =================================================================== */
            magma_zsetmatrix( N, N, h_A, lda, d_A, ldda );
            magma_zsetvector( N, h_x, 1, d_x, 1 );
            
            cublas_time = magma_sync_wtime( NULL );
            cublasZtrsv( opts.handle, cublas_uplo_const(opts.uplo),
                         cublas_trans_const(opts.transA), cublas_diag_const(opts.diag),
                         N,
                         d_A, ldda,
                         d_x, 1 );
            cublas_time = magma_sync_wtime( NULL ) - cublas_time;
            cublas_perf = gflops / cublas_time;
            
            magma_zgetvector( N, d_x, 1, h_xcublas, 1 );
            
            /* =====================================================================
               Performs operation using CPU BLAS
               =================================================================== */
            if ( opts.lapack ) {
                cpu_time = magma_wtime();
                blasf77_ztrsv( lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag),
                               &N,
                               h_A, &lda,
                               h_x, &ione );
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
            }
            
            /* =====================================================================
               Check the result
               =================================================================== */
            // ||b - Ax|| / (||A||*||x||)
            // error for CUBLAS
            normA = lapackf77_zlange( "F", &N, &N, h_A, &lda, work );
            
            normx = lapackf77_zlange( "F", &N, &ione, h_xcublas, &ione, work );
            blasf77_ztrmv( lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag),
                           &N,
                           h_A, &lda,
                           h_xcublas, &ione );
            blasf77_zaxpy( &N, &c_neg_one, h_b, &ione, h_xcublas, &ione );
            normr = lapackf77_zlange( "F", &N, &ione, h_xcublas, &N, work );
            cublas_error = normr / (normA*normx);

            if ( opts.lapack ) {
                printf("%5d   %7.2f (%7.2f)   %7.2f (%7.2f)   %8.2e   %s\n",
                        (int) N,
                        cublas_perf, 1000.*cublas_time,
                        cpu_perf,    1000.*cpu_time,
                        cublas_error, (cublas_error < tol ? "ok" : "failed"));
                status += ! (cublas_error < tol);
            }
            else {
                printf("%5d   %7.2f (%7.2f)     ---  (  ---  )   %8.2e   %s\n",
                        (int) N,
                        cublas_perf, 1000.*cublas_time,
                        cublas_error, (cublas_error < tol ? "ok" : "failed"));
                status += ! (cublas_error < tol);
            }
            
            TESTING_FREE_CPU( ipiv );
            TESTING_FREE_CPU( h_A  );
            TESTING_FREE_CPU( h_b  );
            TESTING_FREE_CPU( h_x  );
            TESTING_FREE_CPU( h_xcublas );
            
            TESTING_FREE_DEV( d_A );
            TESTING_FREE_DEV( d_x );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    TESTING_FINALIZE();
    return status;
}
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing zgetrf_mgpu
*/
int main( int argc, char** argv)
{
    TESTING_INIT();
    
    real_Double_t    gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0;
    double error;
    magmaDoubleComplex *h_A, *h_P;
    magmaDoubleComplex_ptr d_lA[ MagmaMaxSubs * MagmaMaxGPUs ];
    magma_int_t     *ipiv;
    magma_int_t M, N, n2, lda, ldda, info, min_mn;
    magma_int_t dev, j, k, ngpu, nsub, n_local, nb, nk, ldn_local, maxm;
    magma_int_t status   = 0;

    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    double tol = opts.tolerance * lapackf77_dlamch("E");
    
    /* Initialize queues */
    magma_queue_t  queues[MagmaMaxGPUs * 2];
    magma_device_t devices[MagmaMaxGPUs];
    magma_int_t num = 0;
    magma_int_t err;
    err = magma_getdevices( devices, MagmaMaxGPUs, &num );
    if ( err != 0 || num < 1 ) {
        fprintf( stderr, "magma_getdevices failed: %d\n", (int) err );
        exit(-1);
    }
    for( dev=0; dev < opts.ngpu; dev++ ) {
        err = magma_queue_create( devices[dev], &queues[2*dev] );
        if ( err != 0 ) {
            fprintf( stderr, "magma_queue_create failed: %d (device %d)\n", (int) err, dev );
            exit(-1);
        }
        err = magma_queue_create( devices[dev], &queues[2*dev+1] );
        if ( err != 0 ) {
            fprintf( stderr, "magma_queue_create failed: %d (device %d)\n", (int) err, dev );
            exit(-1);
        }
    }
    
    printf("trans %s, ngpu %d, nsub %d\n",
           lapack_trans_const(opts.transA), (int) opts.ngpu, (int) opts.nsub );
    if ( opts.check == 2 ) {
        printf("    M     N   CPU GFlop/s (sec)   GPU GFlop/s (sec)   |Ax-b|/(N*|A|*|x|)\n");
    }
    else {
        printf("    M     N   CPU GFlop/s (sec)   GPU GFlop/s (sec)   |PA-LU|/(N*|A|)\n");
    }
    printf("=========================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[itest];
            N = opts.nsize[itest];
            min_mn = min(M, N);
            maxm   = 32*((M+31)/32);
            lda    = M;
            n2     = lda*N;
            nb     = magma_get_zgetrf_nb(M);
            gflops = FLOPS_ZGETRF( M, N ) / 1e9;
            
            // nsubs * ngpu must be at least the number of blocks
            ngpu = opts.ngpu;
            nsub = opts.nsub;
            if ( nsub*ngpu > N/nb ) {
                nsub = 1;
                ngpu = 1;
                printf( " * too many GPUs for the matrix size, using %d GPUs and %d submatrices\n", (int) ngpu, (int) nsub );
            }
    
            /* Allocate host memory for the matrix */
            TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn );
            TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 );
            TESTING_MALLOC_CPU( h_P, magmaDoubleComplex, lda*nb );
            
            /* Allocate device memory */
            if ( opts.transA == MagmaNoTrans ) {
                ldda = N/nb;                     /* number of block columns         */
                ldda = ldda/(ngpu*nsub);     /* number of block columns per GPU */
                ldda = nb*ldda;                  /* number of columns per GPU       */
                if ( ldda * ngpu*nsub < N ) {
                    /* left over */
                    if ( N-ldda*ngpu*nsub >= nb ) {
                        ldda += nb;
                    } else {
                        ldda += (N-ldda*ngpu*nsub)%nb;
                    }
                }
                ldda = ((ldda+31)/32)*32; /* make it a multiple of 32 */
                for( j=0; j < nsub * ngpu; j++ ) {
                    TESTING_MALLOC_DEV( d_lA[j], magmaDoubleComplex, ldda*maxm );
                }
            } else {
                ldda = ((M+31)/32)*32;
                for( j=0; j < nsub * ngpu; j++ ) {
                    n_local = ((N/nb)/(nsub*ngpu))*nb;
                    if ( j < (N/nb)%(nsub*ngpu) ) {
                        n_local += nb;
                    } else if ( j == (N/nb)%(nsub*ngpu) ) {
                        n_local += N%nb;
                    }
                    TESTING_MALLOC_DEV( d_lA[j], magmaDoubleComplex, ldda*n_local );
                }
            }

            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            if ( opts.lapack ) {
                init_matrix( M, N, h_A, lda );
                
                cpu_time = magma_wtime();
                lapackf77_zgetrf( &M, &N, h_A, &lda, ipiv, &info );
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
                if ( info != 0 )
                    printf("lapackf77_zgetrf returned error %d: %s.\n",
                           (int) info, magma_strerror( info ));
            }
    
            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            init_matrix( M, N, h_A, lda );
            if ( opts.transA == MagmaNoTrans ) {
                for( j=0; j < N; j += nb ) {
                    k = (j/nb)%(nsub*ngpu);
                    nk = min(nb, N-j);
    
                    /* transpose on CPU, then copy to GPU */
                    int ii,jj;
                    for( ii=0; ii < M; ii++ ) {
                        for( jj=0; jj < nk; jj++ ) {
                            h_P[jj+ii*nk] = h_A[j*lda + ii+jj*lda];
                        }
                    }
                    magma_zsetmatrix( nk, M,
                                      h_P, nk,
                                      d_lA[k], j/(nb*nsub*ngpu)*nb, ldda,
                                      queues[2*(k%ngpu)] );
                }
            } else {
                ldda = ((M+31)/32)*32;
                for( j=0; j < N; j += nb ) {
                    k = (j/nb)%(nsub*ngpu);
                    nk = min(nb, N-j);
                    magma_zsetmatrix( M, nk,
                                      h_A + j*lda, lda,
                                      d_lA[k], j/(nb*nsub*ngpu)*nb*ldda, ldda,
                                      queues[2*(k%ngpu)] );
                }
            }
            
            gpu_time = magma_wtime();
            magma_zgetrf_msub( opts.transA, nsub, ngpu, M, N, d_lA, 0, ldda, ipiv, queues, &info );
            gpu_time = magma_wtime() - gpu_time;
            gpu_perf = gflops / gpu_time;
            if (info != 0)
                printf("magma_zgetrf_mgpu returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
    
            /* get the matrix from GPUs */
            if ( opts.transA == MagmaNoTrans ) {
                for (j=0; j < N; j+=nb) {
                    k = (j/nb)%(nsub*ngpu);
                    nk = min(nb, N-j);
    
                    /* copy to CPU and then transpose */
                    magma_zgetmatrix( nk, M,
                                      d_lA[k], j/(nb*nsub*ngpu)*nb, ldda,
                                      h_P, nk, queues[2*(k%ngpu)] );
                    int ii, jj;
                    for( ii=0; ii < M; ii++ ) {
                        for( jj=0; jj < nk; jj++ ) {
                            h_A[j*lda + ii+jj*lda] = h_P[jj+ii*nk];
                        }
                    }
                }
            } else {
                for (j=0; j < N; j+=nb) {
                    k = (j/nb)%(nsub*ngpu);
                    nk = min(nb, N-j);
                    magma_zgetmatrix( M, nk,
                                      d_lA[k], j/(nb*nsub*ngpu)*nb*ldda, ldda,
                                      h_A + j*lda, lda, queues[2*(k%ngpu)] );
                }
            }
    
            /* =====================================================================
               Check the factorization
               =================================================================== */
            if ( opts.lapack ) {
                printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)",
                       (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time );
            }
            else {
                printf("%5d %5d     ---   (  ---  )   %7.2f (%7.2f)",
                       (int) M, (int) N, gpu_perf, gpu_time );
            }
            if ( opts.check == 2 ) {
                error = get_residual( M, N, h_A, lda, ipiv );
                printf("   %8.2e   %s\n", error, (error < tol ? "ok" : "failed"));
                status += ! (error < tol);
            }
            else if ( opts.check ) {
                error = get_LU_error( M, N, h_A, lda, ipiv );
                printf("   %8.2e   %s\n", error, (error < tol ? "ok" : "failed"));
                status += ! (error < tol);
            }
            else {
                printf("     ---  \n");
            }
            
            TESTING_FREE_CPU( ipiv );
            TESTING_FREE_CPU( h_A );
            TESTING_FREE_CPU( h_P );
            for( dev=0; dev < ngpu; dev++ ) {
                for( k=0; k < nsub; k++ ) {
                    TESTING_FREE_DEV( d_lA[dev*nsub + k] );
                }
            }
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }
    
    /* Free queues */
    for( dev=0; dev < opts.ngpu; dev++ ) {
        magma_queue_destroy( queues[2*dev] );
        magma_queue_destroy( queues[2*dev+1] );
    }

    TESTING_FINALIZE();
    return status;
}
Exemple #20
0
/**
    Purpose
    -------
    ZGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
        A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.
    
    If the current stream is NULL, this version replaces it with a new
    stream to overlap computation with communication.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    dA      COMPLEX_16 array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    @param[in]
    ldda     INTEGER
            The leading dimension of the array A.  LDDA >= max(1,M).

    @param[out]
    ipiv    INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
      -     > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    @ingroup magma_zgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zgetrf_gpu(
    magma_int_t m, magma_int_t n,
    magmaDoubleComplex_ptr dA, magma_int_t ldda, magma_int_t *ipiv,
    magma_int_t *info)
{
    #define dAT(i_, j_) (dAT + (i_)*nb*lddat + (j_)*nb)

    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;

    magma_int_t iinfo, nb;
    magma_int_t maxm, maxn, mindim;
    magma_int_t i, j, rows, cols, s, lddat, ldwork;
    magmaDoubleComplex *dAT, *dAP, *work;

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (ldda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    nb     = magma_get_zgetrf_nb(m);
    s      = mindim / nb;

    if (nb <= 1 || nb >= min(m,n)) {
        /* Use CPU code. */
        magma_zmalloc_cpu( &work, m * n );
        if ( work == NULL ) {
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        magma_zgetmatrix( m, n, dA, ldda, work, m );
        lapackf77_zgetrf(&m, &n, work, &m, ipiv, info);
        magma_zsetmatrix( m, n, work, m, dA, ldda );
        magma_free_cpu(work);
    }
    else {
        /* Use hybrid blocked code. */
        maxm = ((m + 31)/32)*32;
        maxn = ((n + 31)/32)*32;

        if (MAGMA_SUCCESS != magma_zmalloc( &dAP, nb*maxm )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }

        // square matrices can be done in place;
        // rectangular requires copy to transpose
        if ( m == n ) {
            dAT = dA;
            lddat = ldda;
            magmablas_ztranspose_inplace( m, dAT, ldda );
        }
        else {
            lddat = maxn;  // N-by-M
            if (MAGMA_SUCCESS != magma_zmalloc( &dAT, lddat*maxm )) {
                magma_free( dAP );
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }
            magmablas_ztranspose( m, n, dA, ldda, dAT, lddat );
        }

        ldwork = maxm;
        if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, ldwork*nb )) {
            magma_free( dAP );
            if ( ! (m == n))
                magma_free( dAT );
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }

        /* Define user stream if current stream is NULL */
        magma_queue_t stream[2];
        
        magma_queue_t orig_stream;
        magmablasGetKernelStream( &orig_stream );

        magma_queue_create( &stream[0] );
        if (orig_stream == NULL) {
            magma_queue_create( &stream[1] );
            magmablasSetKernelStream(stream[1]);
        }
        else {
            stream[1] = orig_stream;
        }
  
        for( j=0; j < s; j++ ) {
            // download j-th panel
            cols = maxm - j*nb;
            magmablas_ztranspose( nb, m-j*nb, dAT(j,j), lddat, dAP, cols );

            // make sure that the transpose has completed
            magma_queue_sync( stream[1] );
            magma_zgetmatrix_async( m-j*nb, nb, dAP, cols, work, ldwork,
                                    stream[0]);

            if ( j > 0 ) {
                magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n - (j+1)*nb, nb,
                             c_one, dAT(j-1,j-1), lddat,
                                    dAT(j-1,j+1), lddat );
                magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(j+1)*nb, m-j*nb, nb,
                             c_neg_one, dAT(j-1,j+1), lddat,
                                        dAT(j,  j-1), lddat,
                             c_one,     dAT(j,  j+1), lddat );
            }

            // do the cpu part
            rows = m - j*nb;
            magma_queue_sync( stream[0] );
            lapackf77_zgetrf( &rows, &nb, work, &ldwork, ipiv+j*nb, &iinfo);
            if ( *info == 0 && iinfo > 0 )
                *info = iinfo + j*nb;

            // upload j-th panel
            magma_zsetmatrix_async( m-j*nb, nb, work, ldwork, dAP, maxm,
                                    stream[0]);

            for( i=j*nb; i < j*nb + nb; ++i ) {
                ipiv[i] += j*nb;
            }
            magmablas_zlaswp( n, dAT, lddat, j*nb + 1, j*nb + nb, ipiv, 1 );

            magma_queue_sync( stream[0] );
            magmablas_ztranspose( m-j*nb, nb, dAP, maxm, dAT(j,j), lddat );

            // do the small non-parallel computations (next panel update)
            if ( s > (j+1) ) {
                magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             nb, nb,
                             c_one, dAT(j, j  ), lddat,
                                    dAT(j, j+1), lddat);
                magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                             nb, m-(j+1)*nb, nb,
                             c_neg_one, dAT(j,   j+1), lddat,
                                        dAT(j+1, j  ), lddat,
                             c_one,     dAT(j+1, j+1), lddat );
            }
            else {
                magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n-s*nb, nb,
                             c_one, dAT(j, j  ), lddat,
                                    dAT(j, j+1), lddat);
                magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(j+1)*nb, m-(j+1)*nb, nb,
                             c_neg_one, dAT(j,   j+1), lddat,
                                        dAT(j+1, j  ), lddat,
                             c_one,     dAT(j+1, j+1), lddat );
            }
        }

        magma_int_t nb0 = min(m - s*nb, n - s*nb);
        if ( nb0 > 0 ) {
            rows = m - s*nb;
            cols = maxm - s*nb;
    
            magmablas_ztranspose( nb0, rows, dAT(s,s), lddat, dAP, maxm );
            magma_zgetmatrix( rows, nb0, dAP, maxm, work, ldwork );
    
            // do the cpu part
            lapackf77_zgetrf( &rows, &nb0, work, &ldwork, ipiv+s*nb, &iinfo);
            if ( *info == 0 && iinfo > 0 )
                *info = iinfo + s*nb;
                
            for( i=s*nb; i < s*nb + nb0; ++i ) {
                ipiv[i] += s*nb;
            }
            magmablas_zlaswp( n, dAT, lddat, s*nb + 1, s*nb + nb0, ipiv, 1 );
    
            // upload j-th panel
            magma_zsetmatrix( rows, nb0, work, ldwork, dAP, maxm );
            magmablas_ztranspose( rows, nb0, dAP, maxm, dAT(s,s), lddat );
    
            magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                         n-s*nb-nb0, nb0,
                         c_one, dAT(s,s),     lddat,
                                dAT(s,s)+nb0, lddat);
        }
        
        // undo transpose
        if ( m == n ) {
            magmablas_ztranspose_inplace( m, dAT, lddat );
        }
        else {
            magmablas_ztranspose( n, m, dAT, lddat, dA, ldda );
            magma_free( dAT );
        }

        magma_free( dAP );
        magma_free_pinned( work );
        
        magma_queue_destroy( stream[0] );
        if (orig_stream == NULL) {
            magma_queue_destroy( stream[1] );
        }
        magmablasSetKernelStream( orig_stream );
    }
    
    return *info;
} /* magma_zgetrf_gpu */
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing zgetrf
*/
int main( int argc, char** argv)
{
    real_Double_t    gflops, gpu_perf, cpu_perf, gpu_time, cpu_time, error;
    magmaDoubleComplex *h_A, *h_R;
    magmaDoubleComplex_ptr d_A;
    magma_int_t     *ipiv;

    /* Matrix size */
    magma_int_t M = 0, N = 0, n2, lda, ldda;
#if defined (PRECISION_z)
    magma_int_t size[10] = {1024,2048,3072,4032,4992,5952,7000,7000,7000,7000};
#else
    magma_int_t size[10] = {1024,2048,3072,4032,4992,5952,7104,8064,9000,10000};
#endif
    magma_int_t i, info, min_mn;
    //magma_int_t nb, maxn, ret;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};

    if (argc != 1){
        for(i = 1; i<argc; i++){
            if (strcmp("-N", argv[i])==0)
                N = atoi(argv[++i]);
            else if (strcmp("-M", argv[i])==0)
                M = atoi(argv[++i]);
        }
        if (M>0 && N>0)
            printf("  testing_zgetrf -M %d -N %d\n\n", M, N);
        else
            {
                printf("\nUsage: \n");
                printf("  testing_zgetrf -M %d -N %d\n\n", 1024, 1024);
                exit(1);
            }
    }
    else {
        printf("\nUsage: \n");
        printf("  testing_zgetrf_gpu -M %d -N %d\n\n", 1024, 1024);
        M = N = size[9];
    }

    /* Initialize */
    magma_queue_t  queue;
    magma_device_t device[ MagmaMaxGPUs ];
    int num = 0;
    magma_err_t err;

    magma_init();
    err = magma_get_devices( device, MagmaMaxGPUs, &num );
    if ( err != 0 || num < 1 ) {
      fprintf( stderr, "magma_get_devices failed: %d\n", err );
      exit(-1);
    }
    err = magma_queue_create( device[0], &queue );
    if ( err != 0 ) {
      fprintf( stderr, "magma_queue_create failed: %d\n", err );
      exit(-1);
    }
    
    ldda   = ((M+31)/32)*32;
    //maxn   = ((N+31)/32)*32;
    n2     = M * N;
    min_mn = min(M, N);
    //nb     = magma_get_zgetrf_nb(min_mn);

    /* Allocate host memory for the matrix */
    TESTING_MALLOC_CPU( ipiv, magma_int_t,        min_mn );
    TESTING_MALLOC_CPU( h_A,  magmaDoubleComplex, n2     );
    TESTING_MALLOC_PIN( h_R,  magmaDoubleComplex, n2     );
    TESTING_MALLOC_DEV( d_A,  magmaDoubleComplex, ldda*N );

    printf("\n\n");
    printf("  M     N    CPU GFlop/ (sec)s   GPU GFlop/s (sec)   ||PA-LU||/(||A||*N)\n");
    printf("========================================================================\n");
    for(i=0; i<10; i++){
        if (argc == 1){
            M = N = size[i];
        }
        min_mn= min(M, N);
        lda   = M;
        n2    = lda*N;
        ldda  = ((M+31)/32)*32;
        gflops = FLOPS( (double)M, (double)N ) *1e-9;

        /* Initialize the matrix */
        lapackf77_zlarnv( &ione, ISEED, &n2, h_A );
        lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda );

        /* =====================================================================
           Performs operation using LAPACK
           =================================================================== */
        cpu_time = magma_wtime();
        lapackf77_zgetrf(&M, &N, h_A, &lda, ipiv, &info);
        cpu_time = magma_wtime() - cpu_time;
        if (info < 0)
            printf("Argument %d of zgetrf had an illegal value.\n", -info);

        cpu_perf = gflops / cpu_time;

        /* ====================================================================
           Performs operation using MAGMA
           =================================================================== */
        magma_zsetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue );
        magma_zgetrf_gpu( M, N, d_A, 0, ldda, ipiv, &info, queue );

        magma_zsetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue );
        gpu_time = magma_wtime();
        magma_zgetrf_gpu( M, N, d_A, 0, ldda, ipiv, &info, queue );
        gpu_time = magma_wtime() - gpu_time;
        if (info < 0)
            printf("Argument %d of zgetrf had an illegal value.\n", -info);

        gpu_perf = gflops / gpu_time;

        /* =====================================================================
           Check the factorization
           =================================================================== */
        magma_zgetmatrix( M, N, d_A, 0, ldda, h_A, 0, lda, queue );
        error = get_LU_error(M, N, h_R, lda, h_A, ipiv);
        
        printf("%5d %5d  %6.2f (%6.2f)     %6.2f (%6.2f)      %e\n",
               M, N, cpu_perf, cpu_time, gpu_perf, gpu_time, error);

        if (argc != 1)
            break;
    }

    /* clean up */
    TESTING_FREE_CPU( ipiv );
    TESTING_FREE_CPU( h_A );
    TESTING_FREE_PIN( h_R );
    TESTING_FREE_DEV( d_A );

    magma_queue_destroy( queue );
    magma_finalize();
}
Exemple #22
0
extern "C" magma_int_t
magma_zgetrf_mgpu(magma_int_t num_gpus, 
                 magma_int_t m, magma_int_t n, 
                 cuDoubleComplex **d_lA, magma_int_t ldda,
                 magma_int_t *ipiv, magma_int_t *info)
{
/*  -- MAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2012

    Purpose
    =======

    ZGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
       A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.

    Arguments
    =========

    NUM_GPUS 
            (input) INTEGER
            The number of GPUS to be used for the factorization.

    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    LDDA     (input) INTEGER
            The leading dimension of the array A.  LDDA >= max(1,M).

    IPIV    (output) INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
            > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.
    =====================================================================    */

#define inAT(id,i,j) (d_lAT[(id)] + (i)*nb*lddat + (j)*nb)

    cuDoubleComplex c_one     = MAGMA_Z_ONE;
    cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;

    magma_int_t iinfo, nb, n_local[MagmaMaxGPUs];
    magma_int_t maxm, mindim;
    magma_int_t i, j, d, rows, cols, s, lddat, lddwork;
    magma_int_t id, i_local, i_local2, nb0, nb1;
    cuDoubleComplex *d_lAT[MagmaMaxGPUs];
    cuDoubleComplex *d_panel[MagmaMaxGPUs], *work;
    cudaStream_t streaml[4][2];

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -2;
    else if (n < 0)
        *info = -3;
    else if (ldda < max(1,m))
        *info = -5;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    nb     = magma_get_zgetrf_nb(m);

    if (nb <= 1 || nb >= n) {
          /* Use CPU code. */
          magma_zmalloc_cpu( &work, m * n );
          if ( work == NULL ) {
              *info = MAGMA_ERR_HOST_ALLOC;
              return *info;
          }
          magma_zgetmatrix( m, n, d_lA[0], ldda, work, m );
          lapackf77_zgetrf(&m, &n, work, &m, ipiv, info);
          magma_zsetmatrix( m, n, work, m, d_lA[0], ldda );
          magma_free_cpu(work);
    } else {
          /* Use hybrid blocked code. */
          maxm = ((m + 31)/32)*32;
          if( num_gpus > ceil((double)n/nb) ) {
            printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) num_gpus );
            *info = -1;
            return *info;
          }

          /* allocate workspace for each GPU */
          lddat = ((((((n+nb-1)/nb)/num_gpus)*nb)+31)/32)*32;
          lddat = (n+nb-1)/nb;                 /* number of block columns         */
          lddat = (lddat+num_gpus-1)/num_gpus; /* number of block columns per GPU */
          lddat = nb*lddat;                    /* number of columns per GPU       */
          lddat = ((lddat+31)/32)*32;          /* make it a multiple of 32        */
          for(i=0; i<num_gpus; i++){
            magma_setdevice(i);

            /* local-n and local-ld */
            n_local[i] = ((n/nb)/num_gpus)*nb;
            if (i < (n/nb)%num_gpus)
               n_local[i] += nb;
            else if (i == (n/nb)%num_gpus)
               n_local[i] += n%nb;

            /* workspaces */
            if (MAGMA_SUCCESS != magma_zmalloc( &d_panel[i], 3*nb*maxm )) {
                for( j=0; j<=i; j++ ) {
                    magma_setdevice(j);
                }
                for( j=0; j<i; j++ ) {
                    magma_setdevice(j);
                    magma_free( d_panel[j] );
                    magma_free( d_lAT[j]   );
                }
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }

            /* local-matrix storage */
            if (MAGMA_SUCCESS != magma_zmalloc( &d_lAT[i], lddat*maxm )) {
                for( j=0; j<=i; j++ ) {
                    magma_setdevice(j);
                    magma_free( d_panel[j] );
                }
                for( j=0; j<i; j++ ) {
                    magma_setdevice(j);
                    magma_free( d_lAT[j] );
                }
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }

            /* create the streams */
            magma_queue_create( &streaml[i][0] );
            magma_queue_create( &streaml[i][1] );

            magmablasSetKernelStream(streaml[i][1]);
            magmablas_ztranspose2( d_lAT[i], lddat, d_lA[i], ldda, m, n_local[i] );
          }
          for(i=0; i<num_gpus; i++){
            magma_setdevice(i);
            cudaStreamSynchronize(streaml[i][0]);
            magmablasSetKernelStream(NULL);
          }
          magma_setdevice(0);

          /* cpu workspace */
          lddwork = maxm;
          if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, lddwork*nb*num_gpus )) {
              for(i=0; i<num_gpus; i++ ) {
                  magma_setdevice(i);
                  magma_free( d_panel[i] );
                  magma_free( d_lAT[i]   );
              }
              *info = MAGMA_ERR_HOST_ALLOC;
              return *info;
          }

          /* calling multi-gpu interface with allocated workspaces and streams */
          //magma_zgetrf1_mgpu( num_gpus, m, n, nb, 0, d_lAT, lddat, ipiv, d_panel, work, maxm,
          //                   (cudaStream_t **)streaml, info );
          magma_zgetrf2_mgpu(num_gpus, m, n, nb, 0, d_lAT, lddat, ipiv, d_panel, work, maxm,
                             streaml, info);

          /* clean up */
          for( d=0; d<num_gpus; d++ ) {
              magma_setdevice(d);
              
              /* save on output */
              magmablas_ztranspose2( d_lA[d], ldda, d_lAT[d], lddat, n_local[d], m );
              magma_device_sync();
              magma_free( d_lAT[d]   );
              magma_free( d_panel[d] );
              magma_queue_destroy( streaml[d][0] );
              magma_queue_destroy( streaml[d][1] );
              magmablasSetKernelStream(NULL);
          } /* end of for d=1,..,num_gpus */
          magma_setdevice(0);
          magma_free_pinned( work );
        }
        
        return *info;       
        /* End of MAGMA_ZGETRF_MGPU */
}
Exemple #23
0
extern "C" magma_int_t
magma_zgetrf(
    magma_int_t m, magma_int_t n,
    magmaDoubleComplex *A, magma_int_t lda, magma_int_t *ipiv,
    magma_queue_t queue[2],
    magma_int_t *info)
{
/*  -- clMAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date November 2014

    Purpose
    =======
    ZGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.  This version does not
    require work space on the GPU passed as input. GPU memory is allocated
    in the routine.

    The factorization has the form
       A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.
    If the current stream is NULL, this version replaces it with user defined
    stream to overlap computation with communication. 

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) COMPLEX_16 array, dimension (LDA,N)
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    IPIV    (output) INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
            > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    =====================================================================    */

#define dAT(i,j) dAT, dAT_offset + ((i)*nb*lddat + (j)*nb)

    magmaDoubleComplex *work;
    magmaDoubleComplex_ptr dAT, dA, dwork, dAP;
    size_t dA_offset, dAT_offset;
    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magma_int_t     iinfo, nb;

    *info = 0;

    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (lda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    nb = magma_get_zgetrf_nb(m);

    if ( (nb <= 1) || (nb >= min(m,n)) ) {
        /* Use CPU code. */
        lapackf77_zgetrf(&m, &n, A, &lda, ipiv, info);
    } else {
        /* Use hybrid blocked code. */
        magma_int_t maxm, maxn, ldda, maxdim, lddat;
        magma_int_t i, j, rows, cols, s = min(m, n)/nb;
        
        maxm = ((m + 31)/32)*32;
        maxn = ((n + 31)/32)*32;

        lddat   = maxn;
        ldda    = maxm;

        maxdim = max(maxm, maxn);

        /* set number of GPUs */
        magma_int_t num_gpus = magma_num_gpus();
        if ( num_gpus > 1 ) {
            /* call multi-GPU non-GPU-resident interface  */
            printf("multiple-GPU verison not implemented\n");
            return MAGMA_ERR_NOT_IMPLEMENTED;
            // magma_zgetrf_m(num_gpus, m, n, A, lda, ipiv, info);
            // return *info;
        }

        /* explicitly checking the memory requirement */
        magma_int_t totalMem = magma_queue_meminfo( queue[0] );
        totalMem /= sizeof(magmaDoubleComplex);

        int h = 1+(2+num_gpus), num_gpus2 = num_gpus;
        int NB = (magma_int_t)(0.8*totalMem/maxm-h*nb);
        const char* ngr_nb_char = getenv("MAGMA_NGR_NB");
        if( ngr_nb_char != NULL )
            NB = max( nb, min( NB, atoi(ngr_nb_char) ) );

        if( num_gpus > ceil((double)NB/nb) ) {
            num_gpus2 = (int)ceil((double)NB/nb);
            h = 1+(2+num_gpus2);
            NB = (magma_int_t)(0.8*totalMem/maxm-h*nb);
        } 
        if( num_gpus2*NB < n ) {
            /* require too much memory, so call non-GPU-resident version */
            printf("non-GPU-resident version not implemented\n");
            return MAGMA_ERR_NOT_IMPLEMENTED; 
            //magma_zgetrf_m(num_gpus, m, n, A, lda, ipiv, info);
            //return *info;
        }

        work = A;
        if (maxdim*maxdim < 2*maxm*maxn) {
            // if close to square, allocate square matrix and transpose in-place
            if (MAGMA_SUCCESS != 
                magma_zmalloc( &dwork, (nb*maxm + maxdim*maxdim) ) ) {
                /* alloc failed so call non-GPU-resident version */
                printf("non-GPU-resident version not implemented\n");
                return MAGMA_ERR_NOT_IMPLEMENTED;
                //magma_zgetrf_m(num_gpus, m, n, A, lda, ipiv, info);
                //return *info;
            }
            dAP = dwork;

            dA = dwork;
            dA_offset = nb*maxm;            

            ldda = lddat = maxdim;
            magma_zsetmatrix( m, n, A, lda, dA, dA_offset, ldda, queue[0] );
            
            dAT = dA;
            dAT_offset = dA_offset;
            magmablas_ztranspose_inplace( m, dAT, dAT_offset, ldda, queue[0] );
        }
        else {
            // if very rectangular, allocate dA and dAT and transpose out-of-place
            if (MAGMA_SUCCESS != 
                magma_zmalloc( &dwork, (nb + maxn)*maxm )) {
                /* alloc failed so call non-GPU-resident version */
                printf("non-GPU-resident version not implemented\n");
                return MAGMA_ERR_NOT_IMPLEMENTED;
                //magma_zgetrf_m(num_gpus, m, n, A, lda, ipiv, info);
                //return *info;
            }
            dAP = dwork;

            dA = dwork;
            dA_offset = nb*maxm;
            
            magma_zsetmatrix( m, n, A, lda, dA, dA_offset, ldda, queue[0] );
            
            if (MAGMA_SUCCESS != magma_zmalloc( &dAT, maxm*maxn )) {
                /* alloc failed so call non-GPU-resident version */
                magma_free( dwork );
                printf("non-GPU-resident version not implemented\n");
                return MAGMA_ERR_NOT_IMPLEMENTED;
                //magma_zgetrf_m(num_gpus, m, n, A, lda, ipiv, info);
                //return *info;
            }
            dAT_offset = 0;   
            magmablas_ztranspose( m, n, dA, dA_offset, ldda, dAT, dAT_offset, lddat, queue[0] );
        }
        
        lapackf77_zgetrf( &m, &nb, work, &lda, ipiv, &iinfo);

        for( j = 0; j < s; j++ )
        {
            // download j-th panel
            cols = maxm - j*nb;
            
            if (j>0){
                // download j-th panel 
                magmablas_ztranspose( nb, cols, dAT(j,j), lddat, dAP, 0, cols, queue[0] );

                magma_queue_sync(queue[0]);
                magma_zgetmatrix_async( m-j*nb, nb, dAP, 0, cols, work, lda, 
                                        queue[1], NULL);
                
                magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n - (j+1)*nb, nb,
                             c_one, dAT(j-1,j-1), lddat,
                                    dAT(j-1,j+1), lddat, queue[0] );
                magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(j+1)*nb, m-j*nb, nb,
                             c_neg_one, dAT(j-1,j+1), lddat,
                                        dAT(j,  j-1), lddat,
                             c_one,     dAT(j,  j+1), lddat, queue[0] );

                // do the cpu part
                rows = m - j*nb;
                magma_queue_sync( queue[1] );
                lapackf77_zgetrf( &rows, &nb, work, &lda, ipiv+j*nb, &iinfo);
            }
            if (*info == 0 && iinfo > 0)
                *info = iinfo + j*nb;

            for( i=j*nb; i < j*nb + nb; ++i ) {
                ipiv[i] += j*nb;
            }
            magmablas_zlaswp( n, dAT, dAT_offset, lddat, j*nb + 1, j*nb + nb, ipiv, 1, queue[0] );

            // upload j-th panel
            magma_zsetmatrix_async( m-j*nb, nb, work, lda, dAP, 0, maxm,
                                    queue[1], NULL);
            magma_queue_sync( queue[1] );

            magmablas_ztranspose( cols, nb, dAP, 0, maxm, dAT(j,j), lddat, queue[0] );

            // do the small non-parallel computations
            if (s > (j+1)){
                magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             nb, nb,
                             c_one, dAT(j, j  ), lddat,
                                    dAT(j, j+1), lddat, queue[0]);
                magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                             nb, m-(j+1)*nb, nb,
                             c_neg_one, dAT(j,   j+1), lddat,
                                        dAT(j+1, j  ), lddat,
                             c_one,     dAT(j+1, j+1), lddat, queue[0] );
            }
            else{
                magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n-s*nb, nb,
                             c_one, dAT(j, j  ), lddat,
                                    dAT(j, j+1), lddat, queue[0] );
                magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(j+1)*nb, m-(j+1)*nb, nb,
                             c_neg_one, dAT(j,   j+1), lddat,
                                        dAT(j+1, j  ), lddat,
                             c_one,     dAT(j+1, j+1), lddat, queue[0] );
            }
        }
        
        magma_int_t nb0 = min(m - s*nb, n - s*nb);
        if ( nb0 > 0 ) {
            rows = m - s*nb;
            cols = maxm - s*nb;
    
            magmablas_ztranspose( nb0, rows, dAT(s,s), lddat, dAP, 0, maxm, queue[0]);
            magma_queue_sync(queue[0]);
            magma_zgetmatrix_async( rows, nb0, dAP, 0, maxm, work, lda, queue[1], NULL );
            magma_queue_sync(queue[1]);

            // do the cpu part
            lapackf77_zgetrf( &rows, &nb0, work, &lda, ipiv+s*nb, &iinfo);
            if (*info == 0 && iinfo > 0)
                *info = iinfo + s*nb;
            
            for( i=s*nb; i < s*nb + nb0; ++i ) {
                ipiv[i] += s*nb;
            }
            magmablas_zlaswp( n, dAT, dAT_offset, lddat, s*nb + 1, s*nb + nb0, ipiv, 1, queue[0] );
    
            magma_zsetmatrix_async( rows, nb0, work, lda, dAP, 0, maxm, queue[1], NULL );
            magma_queue_sync(queue[1]);
            magmablas_ztranspose( rows, nb0, dAP, 0, maxm, dAT(s,s), lddat, queue[0]);
    
            magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                         n-s*nb-nb0, nb0,
                         c_one, dAT(s, s),     lddat,
                                dAT(s, s)+nb0, lddat, queue[0] );
        }
       
        if (maxdim*maxdim < 2*maxm*maxn) {
            magmablas_ztranspose_inplace( m, dAT, dAT_offset, lddat, queue[0] );
            magma_zgetmatrix( m, n, dA, dA_offset, ldda, A, lda, queue[0] );
        } else {
            magmablas_ztranspose( n, m, dAT, dAT_offset, lddat, dA, dA_offset, ldda, queue[0] );
            magma_zgetmatrix( m, n, dA, dA_offset, ldda, A, lda, queue[0] );
            magma_queue_sync(queue[0]);
            magma_free( dAT );
        }

        magma_queue_sync(queue[0]);
        magma_free( dwork );
    }
    
    return *info;
} /* magma_zgetrf */
Exemple #24
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing zgetri_batched
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    // constants
    const magmaDoubleComplex c_zero    = MAGMA_Z_ZERO;
    const magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    const magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    
    real_Double_t   gflops, gpu_perf, gpu_time, cpu_perf, cpu_time;
    magmaDoubleComplex *h_A, *h_Ainv, *h_R, *work;
    magmaDoubleComplex_ptr d_A, d_invA;
    magmaDoubleComplex_ptr *dA_array;
    magmaDoubleComplex_ptr *dinvA_array;
    magma_int_t **dipiv_array;
    magma_int_t *dinfo_array;
    magma_int_t *ipiv, *cpu_info;
    magma_int_t *d_ipiv, *d_info;
    magma_int_t N, n2, lda, ldda, info, info1, info2, lwork;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    magmaDoubleComplex tmp;
    double  error, rwork[1];
    magma_int_t columns;
    magma_int_t status = 0;
    
    magma_opts opts( MagmaOptsBatched );
    opts.parse_opts( argc, argv );
    
    magma_int_t batchCount = opts.batchcount;
    double tol = opts.tolerance * lapackf77_dlamch("E");

    printf("%% batchCount   N    CPU Gflop/s (ms)    GPU Gflop/s (ms)   ||I - A*A^{-1}||_1 / (N*cond(A))\n");
    printf("%%===============================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {    
        for( int iter = 0; iter < opts.niter; ++iter ) {
            N = opts.nsize[itest];
            lda    = N;
            n2     = lda*N * batchCount;
            ldda   = magma_roundup( N, opts.align );  // multiple of 32 by default
            // This is the correct flops but since this getri_batched is based on
            // 2 trsm = getrs and to know the real flops I am using the getrs one
            //gflops = (FLOPS_ZGETRF( N, N ) + FLOPS_ZGETRI( N ))/ 1e9 * batchCount;
            gflops = (FLOPS_ZGETRF( N, N ) + FLOPS_ZGETRS( N, N ))/ 1e9 * batchCount;

            // query for workspace size
            lwork = -1;
            lapackf77_zgetri( &N, NULL, &lda, NULL, &tmp, &lwork, &info );
            if (info != 0) {
                printf("lapackf77_zgetri returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            }
            lwork = magma_int_t( MAGMA_Z_REAL( tmp ));
            
            TESTING_MALLOC_CPU( cpu_info, magma_int_t,        batchCount );
            TESTING_MALLOC_CPU( ipiv,     magma_int_t,        N * batchCount );
            TESTING_MALLOC_CPU( work,     magmaDoubleComplex, lwork*batchCount );
            TESTING_MALLOC_CPU( h_A,      magmaDoubleComplex, n2     );
            TESTING_MALLOC_CPU( h_Ainv,   magmaDoubleComplex, n2     );
            TESTING_MALLOC_CPU( h_R,      magmaDoubleComplex, n2     );
            
            TESTING_MALLOC_DEV( d_A,      magmaDoubleComplex, ldda*N * batchCount );
            TESTING_MALLOC_DEV( d_invA,   magmaDoubleComplex, ldda*N * batchCount );
            TESTING_MALLOC_DEV( d_ipiv,   magma_int_t,        N * batchCount );
            TESTING_MALLOC_DEV( d_info,   magma_int_t,        batchCount );

            TESTING_MALLOC_DEV( dA_array,    magmaDoubleComplex*, batchCount );
            TESTING_MALLOC_DEV( dinvA_array, magmaDoubleComplex*, batchCount );
            TESTING_MALLOC_DEV( dinfo_array, magma_int_t,         batchCount );
            TESTING_MALLOC_DEV( dipiv_array, magma_int_t*,        batchCount );
            
            /* Initialize the matrix */
            lapackf77_zlarnv( &ione, ISEED, &n2, h_A );
            columns = N * batchCount;
            lapackf77_zlacpy( MagmaFullStr, &N, &columns, h_A, &lda, h_R,  &lda );
            lapackf77_zlacpy( MagmaFullStr, &N, &columns, h_A, &lda, h_Ainv, &lda );
            magma_zsetmatrix( N, columns, h_R, lda, d_A, ldda, opts.queue );

            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            magma_zset_pointer( dA_array, d_A, ldda, 0, 0, ldda * N, batchCount, opts.queue );
            magma_zset_pointer( dinvA_array, d_invA, ldda, 0, 0, ldda * N, batchCount, opts.queue );
            magma_iset_pointer( dipiv_array, d_ipiv, 1, 0, 0, N, batchCount, opts.queue );

            gpu_time = magma_sync_wtime( opts.queue );
            info1 = magma_zgetrf_batched( N, N, dA_array, ldda, dipiv_array, dinfo_array, batchCount, opts.queue);
            info2 = magma_zgetri_outofplace_batched( N, dA_array, ldda, dipiv_array, dinvA_array, ldda, dinfo_array, batchCount, opts.queue);
            gpu_time = magma_sync_wtime( opts.queue ) - gpu_time;
            gpu_perf = gflops / gpu_time;

            // check correctness of results throught "dinfo_magma" and correctness of argument throught "info"
            magma_getvector( batchCount, sizeof(magma_int_t), dinfo_array, 1, cpu_info, 1, opts.queue );
            for (magma_int_t i=0; i < batchCount; i++)
            {
                if (cpu_info[i] != 0 ) {
                    printf("magma_zgetrf_batched matrix %d returned error %d\n", (int) i, (int)cpu_info[i] );
                }
            }
            if (info1 != 0) printf("magma_zgetrf_batched returned argument error %d: %s.\n", (int) info1, magma_strerror( info1 ));
            if (info2 != 0) printf("magma_zgetri_batched returned argument error %d: %s.\n", (int) info2, magma_strerror( info2 ));
            
            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            if ( opts.lapack ) {
                cpu_time = magma_wtime();
                #if !defined (BATCHED_DISABLE_PARCPU) && defined(_OPENMP)
                magma_int_t nthreads = magma_get_lapack_numthreads();
                magma_set_lapack_numthreads(1);
                magma_set_omp_numthreads(nthreads);
                #pragma omp parallel for schedule(dynamic)
                #endif
                for (int i=0; i < batchCount; i++)
                {
                    magma_int_t locinfo;
                    lapackf77_zgetrf(&N, &N, h_Ainv + i*lda*N, &lda, ipiv + i*N, &locinfo);
                    if (locinfo != 0) {
                        printf("lapackf77_zgetrf returned error %d: %s.\n",
                               (int) locinfo, magma_strerror( locinfo ));
                    }
                    lapackf77_zgetri(&N, h_Ainv + i*lda*N, &lda, ipiv + i*N, work + i*lwork, &lwork, &locinfo );
                    if (locinfo != 0) {
                        printf("lapackf77_zgetri returned error %d: %s.\n",
                               (int) locinfo, magma_strerror( locinfo ));
                    }
                }
                #if !defined (BATCHED_DISABLE_PARCPU) && defined(_OPENMP)
                    magma_set_lapack_numthreads(nthreads);
                #endif
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
                
                printf("%10d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)",
                       (int) batchCount, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000. );
            }
            else {
                printf("%10d %5d     ---   (  ---  )   %7.2f (%7.2f)",
                       (int) batchCount, (int) N, gpu_perf, gpu_time*1000. );
            }
            
            /* =====================================================================
               Check the result
               =================================================================== */
            if ( opts.check ) {
                magma_igetvector( N*batchCount, d_ipiv, 1, ipiv, 1, opts.queue );
                magma_zgetmatrix( N, N*batchCount, d_invA, ldda, h_Ainv, lda, opts.queue );
                error = 0;
                for (magma_int_t i=0; i < batchCount; i++)
                {
                    for (magma_int_t k=0; k < N; k++) {
                        if (ipiv[i*N+k] < 1 || ipiv[i*N+k] > N )
                        {
                            printf("error for matrix %d ipiv @ %d = %d\n", (int) i, (int) k, (int) ipiv[i*N+k]);
                            error = -1;
                        }
                    }
                    if (error == -1) {
                        break;
                    }
                    
                    // compute 1-norm condition number estimate, following LAPACK's zget03
                    double normA, normAinv, rcond, err;
                    normA    = lapackf77_zlange( "1", &N, &N, h_A    + i*lda*N, &lda, rwork );
                    normAinv = lapackf77_zlange( "1", &N, &N, h_Ainv + i*lda*N, &lda, rwork );
                    if ( normA <= 0 || normAinv <= 0 ) {
                        rcond = 0;
                        err = 1 / (tol/opts.tolerance);  // == 1/eps
                    }
                    else {
                        rcond = (1 / normA) / normAinv;
                        // R = I
                        // R -= A*A^{-1}
                        // err = ||I - A*A^{-1}|| / ( N ||A||*||A^{-1}|| ) = ||R|| * rcond / N, using 1-norm
                        lapackf77_zlaset( "full", &N, &N, &c_zero, &c_one, h_R + i*lda*N, &lda );
                        blasf77_zgemm( "no", "no", &N, &N, &N, &c_neg_one,
                                       h_A    + i*lda*N, &lda,
                                       h_Ainv + i*lda*N, &lda, &c_one,
                                       h_R    + i*lda*N, &lda );
                        err = lapackf77_zlange( "1", &N, &N, h_R + i*lda*N, &lda, rwork );
                        err = err * rcond / N;
                    }
                    if ( isnan(err) || isinf(err) ) {
                        error = err;
                        break;
                    }
                    error = max( err, error );
                }
                bool okay = (error < tol);
                status += ! okay;
                printf("   %8.2e   %s\n", error, (okay ? "ok" : "failed") );
            }
            else {
                printf("\n");
            }

            TESTING_FREE_CPU( cpu_info );
            TESTING_FREE_CPU( ipiv   );
            TESTING_FREE_CPU( work   );
            TESTING_FREE_CPU( h_A    );
            TESTING_FREE_CPU( h_Ainv );
            TESTING_FREE_CPU( h_R    );
            
            TESTING_FREE_DEV( d_A );
            TESTING_FREE_DEV( d_invA );
            TESTING_FREE_DEV( d_ipiv );
            TESTING_FREE_DEV( d_info );
            
            TESTING_FREE_DEV( dA_array );
            TESTING_FREE_DEV( dinvA_array );
            TESTING_FREE_DEV( dinfo_array );
            TESTING_FREE_DEV( dipiv_array );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }
    
    opts.cleanup();
    TESTING_FINALIZE();
    return status;
}
Exemple #25
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing zgetrf
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    real_Double_t   gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0;
    double          error;
    magmaDoubleComplex *h_A;
    magma_int_t     *ipiv;
    magma_int_t     M, N, n2, lda, info, min_mn;
    magma_int_t     status = 0;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    double tol = opts.tolerance * lapackf77_dlamch("E");

    printf("ngpu %d\n", (int) opts.ngpu );
    if ( opts.check == 2 ) {
        printf("    M     N   CPU GFlop/s (sec)   GPU GFlop/s (sec)   |Ax-b|/(N*|A|*|x|)\n");
    }
    else {
        printf("    M     N   CPU GFlop/s (sec)   GPU GFlop/s (sec)   |PA-LU|/(N*|A|)\n");
    }
    printf("=========================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[itest];
            N = opts.nsize[itest];
            min_mn = min(M, N);
            lda    = M;
            n2     = lda*N;
            gflops = FLOPS_ZGETRF( M, N ) / 1e9;
            
            TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn );
            TESTING_MALLOC_PIN( h_A,  magmaDoubleComplex, n2 );
            
            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            if ( opts.lapack ) {
                init_matrix( M, N, h_A, lda );
                
                cpu_time = magma_wtime();
                lapackf77_zgetrf(&M, &N, h_A, &lda, ipiv, &info);
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
                if (info != 0)
                    printf("lapackf77_zgetrf returned error %d: %s.\n",
                           (int) info, magma_strerror( info ));
            }
            
            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            init_matrix( M, N, h_A, lda );
            
            gpu_time = magma_wtime();
            magma_zgetrf( M, N, h_A, lda, ipiv, &info);
            gpu_time = magma_wtime() - gpu_time;
            gpu_perf = gflops / gpu_time;
            if (info != 0)
                printf("magma_zgetrf returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            
            /* =====================================================================
               Check the factorization
               =================================================================== */
            if ( opts.lapack ) {
                printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)",
                       (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time );
            }
            else {
                printf("%5d %5d     ---   (  ---  )   %7.2f (%7.2f)",
                       (int) M, (int) N, gpu_perf, gpu_time );
            }
            if ( opts.check == 2 ) {
                error = get_residual( M, N, h_A, lda, ipiv );
                printf("   %8.2e   %s\n", error, (error < tol ? "ok" : "failed"));
                status += ! (error < tol);
            }
            else if ( opts.check ) {
                error = get_LU_error( M, N, h_A, lda, ipiv );
                printf("   %8.2e   %s\n", error, (error < tol ? "ok" : "failed"));
                status += ! (error < tol);
            }
            else {
                printf("     ---   \n");
            }
            
            TESTING_FREE_CPU( ipiv );
            TESTING_FREE_PIN( h_A  );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    TESTING_FINALIZE();
    return status;
}