Exemple #1
0
extern "C" magma_int_t
magma_cgetrf_gpu(magma_int_t m, magma_int_t n, 
                 magmaFloatComplex *dA, magma_int_t ldda,
                 magma_int_t *ipiv, magma_int_t *info)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    CGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
       A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.
    If the current stream is NULL, this version replaces it with user defined
    stream to overlap computation with communication. 

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) COMPLEX array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    LDDA     (input) INTEGER
            The leading dimension of the array A.  LDDA >= max(1,M).

    IPIV    (output) INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
            > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.
    =====================================================================    */

    #define dAT(i,j) (dAT + (i)*nb*lddat + (j)*nb)

    magmaFloatComplex c_one     = MAGMA_C_ONE;
    magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;

    magma_int_t iinfo, nb;
    magma_int_t maxm, maxn, mindim;
    magma_int_t i, rows, cols, s, lddat, lddwork;
    magmaFloatComplex *dAT, *dAP, *work;

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (ldda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    nb     = magma_get_cgetrf_nb(m);
    s      = mindim / nb;

    if (nb <= 1 || nb >= min(m,n)) {
        /* Use CPU code. */
        magma_cmalloc_cpu( &work, m * n );
        if ( work == NULL ) {
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        magma_cgetmatrix( m, n, dA, ldda, work, m );
        lapackf77_cgetrf(&m, &n, work, &m, ipiv, info);
        magma_csetmatrix( m, n, work, m, dA, ldda );
        magma_free_cpu(work);
    }
    else {
        /* Use hybrid blocked code. */
        maxm = ((m + 31)/32)*32;
        maxn = ((n + 31)/32)*32;

        lddat   = maxn;
        lddwork = maxm;

        dAT = dA;

        if (MAGMA_SUCCESS != magma_cmalloc( &dAP, nb*maxm )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }

        if ( m == n ) {
            lddat = ldda;
            magmablas_ctranspose_inplace( m, dAT, ldda );
        }
        else {
            if (MAGMA_SUCCESS != magma_cmalloc( &dAT, maxm*maxn )) {
                magma_free( dAP );
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }
            magmablas_ctranspose2( dAT, lddat, dA, ldda, m, n );
        }

        if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, maxm*nb )) {
            magma_free( dAP );
            if ( ! (m == n))
                magma_free( dAT );
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }

        /* Define user stream if current stream is NULL */ 
        cudaStream_t stream[2], current_stream;
        magmablasGetKernelStream(&current_stream);

        magma_queue_create( &stream[0] );
        if (current_stream == NULL) {
           magma_queue_create( &stream[1] );
           magmablasSetKernelStream(stream[1]);
        }
        else
           stream[1] = current_stream;
  
        for( i=0; i<s; i++ )
            {
                // download i-th panel
                cols = maxm - i*nb;
                //magmablas_ctranspose( dAP, cols, dAT(i,i), lddat, nb, cols   );
                magmablas_ctranspose2( dAP, cols, dAT(i,i), lddat, nb, m-i*nb );

                // make sure that that the transpose has completed
                magma_queue_sync( stream[1] );
                magma_cgetmatrix_async( m-i*nb, nb, dAP, cols, work, lddwork,
                                        stream[0]);

                if ( i>0 ){
                    magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                                 n - (i+1)*nb, nb, 
                                 c_one, dAT(i-1,i-1), lddat, 
                                        dAT(i-1,i+1), lddat );
                    magma_cgemm( MagmaNoTrans, MagmaNoTrans, 
                                 n-(i+1)*nb, m-i*nb, nb, 
                                 c_neg_one, dAT(i-1,i+1), lddat, 
                                            dAT(i,  i-1), lddat, 
                                 c_one,     dAT(i,  i+1), lddat );
                }

                // do the cpu part
                rows = m - i*nb;
                magma_queue_sync( stream[0] );
                lapackf77_cgetrf( &rows, &nb, work, &lddwork, ipiv+i*nb, &iinfo);
                if ( (*info == 0) && (iinfo > 0) )
                    *info = iinfo + i*nb;

                // upload i-th panel
                magma_csetmatrix_async( m-i*nb, nb, work, lddwork, dAP, maxm,
                                        stream[0]);

                magmablas_cpermute_long2( n, dAT, lddat, ipiv, nb, i*nb );

                magma_queue_sync( stream[0] );
                //magmablas_ctranspose(dAT(i,i), lddat, dAP, maxm, cols, nb);
                magmablas_ctranspose2(dAT(i,i), lddat, dAP, maxm, m-i*nb, nb);

                // do the small non-parallel computations (next panel update)
                if ( s > (i+1) ) {
                    magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                                 nb, nb, 
                                 c_one, dAT(i, i  ), lddat,
                                        dAT(i, i+1), lddat);
                    magma_cgemm( MagmaNoTrans, MagmaNoTrans, 
                                 nb, m-(i+1)*nb, nb, 
                                 c_neg_one, dAT(i,   i+1), lddat,
                                            dAT(i+1, i  ), lddat, 
                                 c_one,     dAT(i+1, i+1), lddat );
                }
                else {
                    magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                                 n-s*nb, nb, 
                                 c_one, dAT(i, i  ), lddat,
                                        dAT(i, i+1), lddat);
                    magma_cgemm( MagmaNoTrans, MagmaNoTrans, 
                                 n-(i+1)*nb, m-(i+1)*nb, nb,
                                 c_neg_one, dAT(i,   i+1), lddat,
                                            dAT(i+1, i  ), lddat, 
                                 c_one,     dAT(i+1, i+1), lddat );
                }
            }

        magma_int_t nb0 = min(m - s*nb, n - s*nb);
        rows = m - s*nb;
        cols = maxm - s*nb;

        magmablas_ctranspose2( dAP, maxm, dAT(s,s), lddat, nb0, rows);
        magma_cgetmatrix( rows, nb0, dAP, maxm, work, lddwork );

        // do the cpu part
        lapackf77_cgetrf( &rows, &nb0, work, &lddwork, ipiv+s*nb, &iinfo);
        if ( (*info == 0) && (iinfo > 0) )
            *info = iinfo + s*nb;
        magmablas_cpermute_long2( n, dAT, lddat, ipiv, nb0, s*nb );

        // upload i-th panel
        magma_csetmatrix( rows, nb0, work, lddwork, dAP, maxm );
        magmablas_ctranspose2( dAT(s,s), lddat, dAP, maxm, rows, nb0);

        magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                     n-s*nb-nb0, nb0,
                     c_one, dAT(s,s),     lddat, 
                            dAT(s,s)+nb0, lddat);

        if ( m == n ) {
            magmablas_ctranspose_inplace( m, dAT, lddat );
        }
        else {
            magmablas_ctranspose2( dA, ldda, dAT, lddat, n, m );
            magma_free( dAT );
        }

        magma_free( dAP );
        magma_free_pinned( work );
    
        magma_queue_destroy( stream[0] );
        if (current_stream == NULL) {
            magma_queue_destroy( stream[1] );
            magmablasSetKernelStream(NULL);
        }
    }
    return *info;
}   /* End of MAGMA_CGETRF_GPU */
Exemple #2
0
extern "C" magma_int_t
magma_cgetrf_mgpu(magma_int_t num_gpus,
                 magma_int_t m, magma_int_t n,
                 magmaFloatComplex **d_lA, magma_int_t ldda,
                 magma_int_t *ipiv, magma_int_t *info)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    CGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
       A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.

    Arguments
    =========
    NUM_GPUS
            (input) INTEGER
            The number of GPUS to be used for the factorization.

    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) COMPLEX array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    LDDA     (input) INTEGER
            The leading dimension of the array A.  LDDA >= max(1,M).

    IPIV    (output) INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
            > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.
    =====================================================================    */

#define inAT(id,i,j) (d_lAT[(id)] + (i)*nb*lddat + (j)*nb)

    magma_int_t nb, n_local[MagmaMaxGPUs];
    magma_int_t maxm, mindim;
    magma_int_t i, j, d, lddat, lddwork;
    magmaFloatComplex *d_lAT[MagmaMaxGPUs];
    magmaFloatComplex *d_panel[MagmaMaxGPUs], *work;
    magma_queue_t streaml[MagmaMaxGPUs][2];

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -2;
    else if (n < 0)
        *info = -3;
    else if (ldda < max(1,m))
        *info = -5;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    nb     = magma_get_cgetrf_nb(m);

    if (nb <= 1 || nb >= n) {
        /* Use CPU code. */
        magma_cmalloc_cpu( &work, m * n );
        if ( work == NULL ) {
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        magma_cgetmatrix( m, n, d_lA[0], ldda, work, m );
        lapackf77_cgetrf(&m, &n, work, &m, ipiv, info);
        magma_csetmatrix( m, n, work, m, d_lA[0], ldda );
        magma_free_cpu(work);
    } else {
        /* Use hybrid blocked code. */
        maxm = ((m + 31)/32)*32;
        if( num_gpus > ceil((float)n/nb) ) {
            printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) num_gpus );
            *info = -1;
            return *info;
        }

        /* allocate workspace for each GPU */
        lddat = ((((((n+nb-1)/nb)/num_gpus)*nb)+31)/32)*32;
        lddat = (n+nb-1)/nb;                 /* number of block columns         */
        lddat = (lddat+num_gpus-1)/num_gpus; /* number of block columns per GPU */
        lddat = nb*lddat;                    /* number of columns per GPU       */
        lddat = ((lddat+31)/32)*32;          /* make it a multiple of 32        */
        for(i=0; i<num_gpus; i++){
            magma_setdevice(i);
            
            /* local-n and local-ld */
            n_local[i] = ((n/nb)/num_gpus)*nb;
            if (i < (n/nb)%num_gpus)
               n_local[i] += nb;
            else if (i == (n/nb)%num_gpus)
               n_local[i] += n%nb;
            
            /* workspaces */
            if (MAGMA_SUCCESS != magma_cmalloc( &d_panel[i], (3+num_gpus)*nb*maxm )) {
                for( j=0; j<=i; j++ ) {
                    magma_setdevice(j);
                }
                for( j=0; j<i; j++ ) {
                    magma_setdevice(j);
                    magma_free( d_panel[j] );
                    magma_free( d_lAT[j]   );
                }
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }
            
            /* local-matrix storage */
            if (MAGMA_SUCCESS != magma_cmalloc( &d_lAT[i], lddat*maxm )) {
                for( j=0; j<=i; j++ ) {
                    magma_setdevice(j);
                    magma_free( d_panel[j] );
                }
                for( j=0; j<i; j++ ) {
                    magma_setdevice(j);
                    magma_free( d_lAT[j] );
                }
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }
            
            /* create the streams */
            magma_queue_create( &streaml[i][0] );
            magma_queue_create( &streaml[i][1] );
            
            magmablasSetKernelStream(streaml[i][1]);
            magmablas_ctranspose2( d_lAT[i], lddat, d_lA[i], ldda, m, n_local[i] );
        }
        for(i=0; i<num_gpus; i++){
            magma_setdevice(i);
            cudaStreamSynchronize(streaml[i][0]);
            magmablasSetKernelStream(NULL);
        }
        magma_setdevice(0);

        /* cpu workspace */
        lddwork = maxm;
        if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, lddwork*nb*num_gpus )) {
            for(i=0; i<num_gpus; i++ ) {
                magma_setdevice(i);
                magma_free( d_panel[i] );
                magma_free( d_lAT[i]   );
            }
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }

        /* calling multi-gpu interface with allocated workspaces and streams */
        //magma_cgetrf1_mgpu( num_gpus, m, n, nb, 0, d_lAT, lddat, ipiv, d_panel, work, maxm,
        //                   (magma_queue_t **)streaml, info );
        magma_cgetrf2_mgpu(num_gpus, m, n, nb, 0, d_lAT, lddat, ipiv, d_panel, work, maxm,
                           streaml, info);

        /* clean up */
        for( d=0; d<num_gpus; d++ ) {
            magma_setdevice(d);
            
            /* save on output */
            magmablas_ctranspose2( d_lA[d], ldda, d_lAT[d], lddat, n_local[d], m );
            magma_device_sync();
            magma_free( d_lAT[d]   );
            magma_free( d_panel[d] );
            magma_queue_destroy( streaml[d][0] );
            magma_queue_destroy( streaml[d][1] );
            magmablasSetKernelStream(NULL);
        } /* end of for d=1,..,num_gpus */
        magma_setdevice(0);
        magma_free_pinned( work );
    }
        
    return *info;
}
Exemple #3
0
extern "C" magma_int_t
magma_cgetrf(magma_int_t m, magma_int_t n, cuFloatComplex *a, magma_int_t lda, 
             magma_int_t *ipiv, magma_int_t *info)
{
/*  -- MAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2012

    Purpose
    =======
    CGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.  This version does not
    require work space on the GPU passed as input. GPU memory is allocated
    in the routine.

    The factorization has the form
       A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) COMPLEX array, dimension (LDA,N)
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    IPIV    (output) INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
            > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    =====================================================================    */

#define inAT(i,j) (dAT + (i)*nb*ldda + (j)*nb)

    cuFloatComplex *dAT, *dA, *da, *work;
    cuFloatComplex c_one     = MAGMA_C_ONE;
    cuFloatComplex c_neg_one = MAGMA_C_NEG_ONE;
    magma_int_t     iinfo, nb;

    *info = 0;

    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (lda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    nb = magma_get_cgetrf_nb(m);

    if ( (nb <= 1) || (nb >= min(m,n)) ) {
        /* Use CPU code. */
        lapackf77_cgetrf(&m, &n, a, &lda, ipiv, info);
    } else {
        /* Use hybrid blocked code. */
        magma_int_t maxm, maxn, ldda, maxdim;
        magma_int_t i, rows, cols, s = min(m, n)/nb;
        
        magma_int_t num_gpus = magma_num_gpus();
        if ( num_gpus > 1 ) {
          /* call multi-GPU non-GPU-resident interface  */
          magma_int_t rval = magma_cgetrf_m(num_gpus, m, n, a, lda, ipiv, info);
          if( *info >= 0 ) magma_cgetrf_piv(num_gpus, m, n, a, lda, ipiv, info);
          return *info;
        }

        maxm = ((m + 31)/32)*32;
        maxn = ((n + 31)/32)*32;
        maxdim = max(maxm, maxn);

        ldda = maxn;
        work = a;

        if (maxdim*maxdim < 2*maxm*maxn)
        {
            if (MAGMA_SUCCESS != magma_cmalloc( &dA, nb*maxm + maxdim*maxdim )) {
                        /* alloc failed so call non-GPU-resident version */ 
                        magma_int_t rval = magma_cgetrf_m(num_gpus, m, n, a, lda, ipiv, info);
                        if( *info >= 0 ) magma_cgetrf_piv(num_gpus, m, n, a, lda, ipiv, info);
                        return *info;
            }
            da = dA + nb*maxm;
            
            ldda = maxdim;
            magma_csetmatrix( m, n, a, lda, da, ldda );
            
            dAT = da;
            magmablas_cinplace_transpose( dAT, ldda, ldda );
        }
        else
        {
            if (MAGMA_SUCCESS != magma_cmalloc( &dA, (nb + maxn)*maxm )) {
                        /* alloc failed so call non-GPU-resident version */
                        magma_int_t rval = magma_cgetrf_m(num_gpus, m, n, a, lda, ipiv, info);
                        if( *info >= 0 ) magma_cgetrf_piv(num_gpus, m, n, a, lda, ipiv, info);
                        return *info;
            }
            da = dA + nb*maxm;
            
            magma_csetmatrix( m, n, a, lda, da, maxm );
            
            if (MAGMA_SUCCESS != magma_cmalloc( &dAT, maxm*maxn )) {
                        /* alloc failed so call non-GPU-resident version */
                        magma_free( dA );
                        magma_int_t rval = magma_cgetrf_m(num_gpus, m, n, a, lda, ipiv, info);
                        if( *info >= 0 ) magma_cgetrf_piv(num_gpus, m, n, a, lda, ipiv, info);
                        return *info;
            }

            magmablas_ctranspose2( dAT, ldda, da, maxm, m, n );
        }
        
        lapackf77_cgetrf( &m, &nb, work, &lda, ipiv, &iinfo);

        for( i = 0; i < s; i++ )
        {
            // download i-th panel
            cols = maxm - i*nb;
            
            if (i>0){
                magmablas_ctranspose( dA, cols, inAT(i,i), ldda, nb, cols );
                magma_cgetmatrix( m-i*nb, nb, dA, cols, work, lda );
                
                // make sure that gpu queue is empty
                magma_device_sync();
                
                magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                             n - (i+1)*nb, nb, 
                             c_one, inAT(i-1,i-1), ldda, 
                                    inAT(i-1,i+1), ldda );
                magma_cgemm( MagmaNoTrans, MagmaNoTrans, 
                             n-(i+1)*nb, m-i*nb, nb, 
                             c_neg_one, inAT(i-1,i+1), ldda, 
                                        inAT(i,  i-1), ldda, 
                             c_one,     inAT(i,  i+1), ldda );

                // do the cpu part
                rows = m - i*nb;
                lapackf77_cgetrf( &rows, &nb, work, &lda, ipiv+i*nb, &iinfo);
            }
            if (*info == 0 && iinfo > 0)
                *info = iinfo + i*nb;
            magmablas_cpermute_long2( ldda, dAT, ldda, ipiv, nb, i*nb );

            // upload i-th panel
            magma_csetmatrix( m-i*nb, nb, work, lda, dA, cols );
            magmablas_ctranspose( inAT(i,i), ldda, dA, cols, cols, nb);

            // do the small non-parallel computations
            if (s > (i+1)){
                magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                             nb, nb, 
                             c_one, inAT(i, i  ), ldda,
                                    inAT(i, i+1), ldda);
                magma_cgemm( MagmaNoTrans, MagmaNoTrans, 
                             nb, m-(i+1)*nb, nb, 
                             c_neg_one, inAT(i,   i+1), ldda,
                                        inAT(i+1, i  ), ldda, 
                             c_one,     inAT(i+1, i+1), ldda );
            }
            else{
                magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                             n-s*nb, nb,
                             c_one, inAT(i, i  ), ldda,
                                    inAT(i, i+1), ldda);
                magma_cgemm( MagmaNoTrans, MagmaNoTrans, 
                             n-(i+1)*nb, m-(i+1)*nb, nb,
                             c_neg_one, inAT(i,   i+1), ldda,
                                        inAT(i+1, i  ), ldda, 
                             c_one,     inAT(i+1, i+1), ldda );
            }
        }
        
        magma_int_t nb0 = min(m - s*nb, n - s*nb);
        if ( nb0 > 0 ) {
            rows = m - s*nb;
            cols = maxm - s*nb;
    
            magmablas_ctranspose2( dA, cols, inAT(s,s), ldda, nb0, rows);
            magma_cgetmatrix( rows, nb0, dA, cols, work, lda );
    
            // make sure that gpu queue is empty
            magma_device_sync();
    
            // do the cpu part
            lapackf77_cgetrf( &rows, &nb0, work, &lda, ipiv+s*nb, &iinfo);
            if (*info == 0 && iinfo > 0)
                *info = iinfo + s*nb;
            magmablas_cpermute_long2( ldda, dAT, ldda, ipiv, nb0, s*nb );
    
            magma_csetmatrix( rows, nb0, work, lda, dA, cols );
            magmablas_ctranspose2( inAT(s,s), ldda, dA, cols, rows, nb0);
    
            magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                         n-s*nb-nb0, nb0,
                         c_one, inAT(s, s),     ldda, 
                                inAT(s, s)+nb0, ldda);
        }
        
        if (maxdim*maxdim< 2*maxm*maxn){
            magmablas_cinplace_transpose( dAT, ldda, ldda );
            magma_cgetmatrix( m, n, da, ldda, a, lda );
        } else {
            magmablas_ctranspose2( da, maxm, dAT, ldda, n, m );
            magma_cgetmatrix( m, n, da, maxm, a, lda );
            magma_free( dAT );
        }

        magma_free( dA );
    }
    
    return *info;
} /* magma_cgetrf */
Exemple #4
0
extern "C" magma_int_t
magma_cgelqf( magma_int_t m, magma_int_t n,
              magmaFloatComplex *a,    magma_int_t lda,   magmaFloatComplex *tau,
              magmaFloatComplex *work, magma_int_t lwork, magma_int_t *info)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    CGELQF computes an LQ factorization of a COMPLEX M-by-N matrix A:
    A = L * Q.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) COMPLEX array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, the elements on and below the diagonal of the array
            contain the m-by-min(m,n) lower trapezoidal matrix L (L is
            lower triangular if m <= n); the elements above the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of elementary reflectors (see Further Details).

            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    TAU     (output) COMPLEX array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    WORK    (workspace/output) COMPLEX array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.

            Higher performance is achieved if WORK is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    LWORK   (input) INTEGER
            The dimension of the array WORK.  LWORK >= max(1,M).
            For optimum performance LWORK >= M*NB, where NB is the
            optimal blocksize.

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  if INFO = -10 internal GPU memory allocation failed.

    Further Details
    ===============
    The matrix Q is represented as a product of elementary reflectors

       Q = H(k) . . . H(2) H(1), where k = min(m,n).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:n) is stored on exit in A(i,i+1:n),
    and tau in TAU(i).
    =====================================================================    */

    #define  a_ref(a_1,a_2) ( a+(a_2)*(lda) + (a_1))

    magmaFloatComplex *dA, *dAT;
    magmaFloatComplex c_one = MAGMA_C_ONE;
    magma_int_t maxm, maxn, maxdim, nb;
    magma_int_t iinfo, ldda;
    int lquery;

    /* Function Body */
    *info = 0;
    nb = magma_get_cgelqf_nb(m);

    work[0] = MAGMA_C_MAKE( (float)(m*nb), 0 );
    lquery = (lwork == -1);
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,m)) {
        *info = -4;
    } else if (lwork < max(1,m) && ! lquery) {
        *info = -7;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /*  Quick return if possible */
    if (min(m, n) == 0) {
        work[0] = c_one;
        return *info;
    }

    maxm = ((m + 31)/32)*32;
    maxn = ((n + 31)/32)*32;
    maxdim = max(maxm, maxn);

    if (maxdim*maxdim < 2*maxm*maxn)
        {
            ldda = maxdim;

            if (MAGMA_SUCCESS != magma_cmalloc( &dA, maxdim*maxdim )) {
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }

            magma_csetmatrix( m, n, a, lda, dA, ldda );
            dAT = dA;
            magmablas_ctranspose_inplace( ldda, dAT, ldda );
        }
    else
        {
            ldda = maxn;

            if (MAGMA_SUCCESS != magma_cmalloc( &dA, 2*maxn*maxm )) {
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }

            magma_csetmatrix( m, n, a, lda, dA, maxm );

            dAT = dA + maxn * maxm;
            magmablas_ctranspose2( dAT, ldda, dA, maxm, m, n );
        }

    magma_cgeqrf2_gpu(n, m, dAT, ldda, tau, &iinfo);

    if (maxdim*maxdim < 2*maxm*maxn) {
        magmablas_ctranspose_inplace( ldda, dAT, ldda );
        magma_cgetmatrix( m, n, dA, ldda, a, lda );
    } else {
        magmablas_ctranspose2( dA, maxm, dAT, ldda, n, m );
        magma_cgetmatrix( m, n, dA, maxm, a, lda );
    }

    magma_free( dA );

    return *info;
} /* magma_cgelqf */
Exemple #5
0
extern "C" magma_int_t
magma_cgetrf_m(magma_int_t num_gpus0, magma_int_t m, magma_int_t n, magmaFloatComplex *a, magma_int_t lda,
               magma_int_t *ipiv, magma_int_t *info)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    CGETRF_m computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.  This version does not
    require work space on the GPU passed as input. GPU memory is allocated
    in the routine. The matrix may not fit entirely in the GPU memory.

    The factorization has the form
       A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.

    Note: The factorization of big panel is done calling multiple-gpu-interface.
    Pivots are applied on GPU within the big panel.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) COMPLEX array, dimension (LDA,N)
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    IPIV    (output) INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
            > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    =====================================================================    */

#define    A(i,j) (a   + (j)*lda + (i))
#define inAT(d,i,j) (dAT[d] + (i)*nb*ldn_local + (j)*nb)
#define inPT(d,i,j) (dPT[d] + (i)*nb*nb + (j)*nb*maxm)

//#define PROFILE
#ifdef PROFILE
    float flops, time_rmajor = 0, time_rmajor2 = 0, time_rmajor3 = 0, time_mem = 0;
    magma_timestr_t start, start1, start2, end1, end, start0 = get_current_time();
#endif
    magmaFloatComplex    c_one     = MAGMA_C_ONE;
    magmaFloatComplex    c_neg_one = MAGMA_C_NEG_ONE;
    magmaFloatComplex    *dAT[MagmaMaxGPUs], *dA[MagmaMaxGPUs], *dPT[MagmaMaxGPUs];
    magma_int_t        iinfo = 0, nb, nbi, maxm, n_local[MagmaMaxGPUs], ldn_local;
    magma_int_t        N, M, NB, NBk, I, d, num_gpus;
    magma_int_t        ii, jj, h, offset, ib, rows, s;
    
    magma_queue_t stream[MagmaMaxGPUs][2];
    magma_event_t  event[MagmaMaxGPUs][2];

    *info = 0;

    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (lda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* initialize nb */
    nb = magma_get_cgetrf_nb(m);
    maxm = ((m  + 31)/32)*32;

    /* figure out NB */
    size_t freeMem, totalMem;
    cudaMemGetInfo( &freeMem, &totalMem );
    freeMem /= sizeof(magmaFloatComplex);
    
    /* number of columns in the big panel */
    h = 1+(2+num_gpus0);
    NB = (magma_int_t)(0.8*freeMem/maxm-h*nb);
    char * ngr_nb_char = getenv("MAGMA_NGR_NB");
    if( ngr_nb_char != NULL ) NB = max( nb, min( NB, atoi(ngr_nb_char) ) );
    //NB = 5*max(nb,32);

    if( num_gpus0 > ceil((float)NB/nb) ) {
        num_gpus = (int)ceil((float)NB/nb);
        h = 1+(2+num_gpus);
        NB = (magma_int_t)(0.8*freeMem/maxm-h*nb);
    } else {
        num_gpus = num_gpus0;
    }
    if( num_gpus*NB >= n ) {
        #ifdef CHECK_CGETRF_OOC
        printf( "      * still fit in GPU memory.\n" );
        #endif
        NB = n;
    } else {
        #ifdef CHECK_CGETRF_OOC
        printf( "      * don't fit in GPU memory.\n" );
        #endif
        NB = num_gpus*NB;
        NB = max(nb,(NB / nb) * nb); /* making sure it's devisable by nb (x64) */
    }

    #ifdef CHECK_CGETRF_OOC
    if( NB != n ) printf( "      * running in out-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n",n,NB,nb,(float)freeMem );
    else          printf( "      * running in in-core mode  (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n",n,NB,nb,(float)freeMem );
    #endif

    if ( (nb <= 1) || (nb >= min(m,n)) ) {
        /* Use CPU code for scalar of one tile. */
        lapackf77_cgetrf(&m, &n, a, &lda, ipiv, info);
    } else {
        /* Use hybrid blocked code. */

    /* allocate memory on GPU to store the big panel */
#ifdef PROFILE
    start = get_current_time();
#endif
    n_local[0] = (NB/nb)/num_gpus;
    if( NB%(nb*num_gpus) != 0 ) n_local[0] ++;
    n_local[0] *= nb;
    ldn_local = ((n_local[0]+31)/32)*32;

    for( d=0; d<num_gpus; d++ ) {
        magma_setdevice(d);
        if (MAGMA_SUCCESS != magma_cmalloc( &dA[d], (ldn_local+h*nb)*maxm )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
        dPT[d] = dA[d] + nb*maxm;      /* for storing the previous panel from CPU */
        dAT[d] = dA[d] + h*nb*maxm;    /* for storing the big panel               */
        magma_queue_create( &stream[d][0] );
        magma_queue_create( &stream[d][1] );
        magma_event_create( &event[d][0] );
        magma_event_create( &event[d][1] );
    }
    //magma_setdevice(0);

#ifdef PROFILE
    end = get_current_time();
    printf( " memory-allocation time: %e\n",GetTimerValue(start, end)/1000.0 );
    start = get_current_time();
#endif
    for( I=0; I<n; I+=NB ) {
        M = m;
        N = min( NB, n-I );       /* number of columns in this big panel             */
        s = min(max(m-I,0),N)/nb; /* number of small block-columns in this big panel */

        maxm = ((M + 31)/32)*32;
        if( num_gpus0 > ceil((float)N/nb) ) {
            num_gpus = (int)ceil((float)N/nb);
        } else {
            num_gpus = num_gpus0;
        }

        for( d=0; d<num_gpus; d++ ) {
            n_local[d] = ((N/nb)/num_gpus)*nb;
            if (d < (N/nb)%num_gpus)
                n_local[d] += nb;
            else if (d == (N/nb)%num_gpus)
                n_local[d] += N%nb;
        }
        ldn_local = ((n_local[0]+31)/32)*32;
        
#ifdef PROFILE
        start2 = get_current_time();
#endif
        /* upload the next big panel into GPU, transpose (A->A'), and pivot it */
        magmablas_csetmatrix_transpose_mgpu(num_gpus, stream, A(0,I), lda,
                                            dAT, ldn_local, dA, maxm, M, N, nb);
        for( d=0; d<num_gpus; d++ ) {
            magma_setdevice(d);
            magma_queue_sync( stream[d][0] );
            magma_queue_sync( stream[d][1] );
            magmablasSetKernelStream(NULL);
        }

#ifdef PROFILE
        start1 = get_current_time();
#endif
        /* == --------------------------------------------------------------- == */
        /* == loop around the previous big-panels to update the new big-panel == */
        for( offset = 0; offset<min(m,I); offset+=NB )
        {
            NBk = min( m-offset, NB );
            /* start sending the first tile from the previous big-panels to gpus */
            for( d=0; d<num_gpus; d++ ) {
                magma_setdevice(d);
                nbi  = min( nb, NBk );
                magma_csetmatrix_async( (M-offset), nbi,
                                        A(offset,offset), lda,
                                        dA[d],            (maxm-offset), stream[d][0] );
                
                /* make sure the previous update finished */
                magmablasSetKernelStream(stream[d][0]);
                //magma_queue_sync( stream[d][1] );
                magma_queue_wait_event( stream[d][0], event[d][0] );
                
                /* transpose */
                magmablas_ctranspose2( inPT(d,0,0), nb, dA[d], maxm-offset, M-offset, nbi);
            }
            
            /* applying the pivot from the previous big-panel */
            for( d=0; d<num_gpus; d++ ) {
                magma_setdevice(d);
                magmablasSetKernelStream(stream[d][1]);
                magmablas_cpermute_long3( inAT(d,0,0), ldn_local, ipiv, NBk, offset );
            }
            
            /* == going through each block-column of previous big-panels == */
            for( jj=0, ib=offset/nb; jj<NBk; jj+=nb, ib++ )
            {
                ii   = offset+jj;
                rows = maxm - ii;
                nbi  = min( nb, NBk-jj );
                for( d=0; d<num_gpus; d++ ) {
                    magma_setdevice(d);
                    
                    /* wait for a block-column on GPU */
                    magma_queue_sync( stream[d][0] );
                    
                    /* start sending next column */
                    if( jj+nb < NBk ) {
                        magma_csetmatrix_async( (M-ii-nb), min(nb,NBk-jj-nb),
                                                A(ii+nb,ii+nb), lda,
                                                dA[d],          (rows-nb), stream[d][0] );
                        
                        /* make sure the previous update finished */
                        magmablasSetKernelStream(stream[d][0]);
                        //magma_queue_sync( stream[d][1] );
                        magma_queue_wait_event( stream[d][0], event[d][(1+jj/nb)%2] );
                        
                        /* transpose next column */
                        magmablas_ctranspose2( inPT(d,0,(1+jj/nb)%2), nb, dA[d], rows-nb, M-ii-nb, nb);
                    }
                    
                    /* update with the block column */
                    magmablasSetKernelStream(stream[d][1]);
                    magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                                 n_local[d], nbi, c_one, inPT(d,0,(jj/nb)%2), nb, inAT(d,ib,0), ldn_local );
                    if( M > ii+nb ) {
                        magma_cgemm( MagmaNoTrans, MagmaNoTrans,
                            n_local[d], M-(ii+nb), nbi, c_neg_one, inAT(d,ib,0), ldn_local,
                            inPT(d,1,(jj/nb)%2), nb, c_one, inAT(d,ib+1,0), ldn_local );
                    }
                    magma_event_record( event[d][(jj/nb)%2], stream[d][1] );
                
                } /* end of for each block-columns in a big-panel */
            }
        } /* end of for each previous big-panels */
        for( d=0; d<num_gpus; d++ ) {
            magma_setdevice(d);
            magma_queue_sync( stream[d][0] );
            magma_queue_sync( stream[d][1] );
            magmablasSetKernelStream(NULL);
        }

        /* calling magma-gpu interface to panel-factorize the big panel */
        if( M > I ) {
            //magma_cgetrf1_mgpu(num_gpus, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, &a[I*lda], lda,
            //                   (magma_queue_t **)stream, &iinfo);
            magma_cgetrf2_mgpu(num_gpus, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, A(0,I), lda,
                               stream, &iinfo);
            if( iinfo < 0 ) {
                *info = iinfo;
                break;
            } else if( iinfo != 0 ) {
                *info = iinfo + I * NB;
                //break;
            }
            /* adjust pivots */
            for( ii=I; ii<min(I+N,m); ii++ )
                ipiv[ii] += I;
        }
#ifdef PROFILE
        end1 = get_current_time();
        time_rmajor  += GetTimerValue(start1, end1);
        time_rmajor3 += GetTimerValue(start2, end1);
        time_mem += (GetTimerValue(start2, end1)-GetTimerValue(start1, end1))/1000.0;
#endif
        /* download the current big panel to CPU */
        magmablas_cgetmatrix_transpose_mgpu(num_gpus, stream, dAT, ldn_local, A(0,I), lda, dA, maxm, M, N, nb);
        for( d=0; d<num_gpus; d++ ) {
            magma_setdevice(d);
            magma_queue_sync( stream[d][0] );
            magma_queue_sync( stream[d][1] );
            magmablasSetKernelStream(NULL);
        }
#ifdef PROFILE
        end1 = get_current_time();
        time_rmajor2 += GetTimerValue(start1, end1);
#endif

    } /* end of for */

#ifdef PROFILE
    end = get_current_time();
    flops = FLOPS_CGETRF( m, n ) / 1000000;
    printf(" NB=%d nb=%d\n",NB,nb);
    printf(" memcopy and transpose %e seconds\n",time_mem );
    printf(" total time %e seconds\n",GetTimerValue(start0,end)/1000.0);
    printf(" Performance %f GFlop/s, %f seconds without htod and dtoh\n",     flops / time_rmajor,  time_rmajor /1000.0);
    printf(" Performance %f GFlop/s, %f seconds with    htod\n",              flops / time_rmajor3, time_rmajor3/1000.0);
    printf(" Performance %f GFlop/s, %f seconds with    dtoh\n",              flops / time_rmajor2, time_rmajor2/1000.0);
    printf(" Performance %f GFlop/s, %f seconds without memory-allocation\n", flops / GetTimerValue(start, end), GetTimerValue(start,end)/1000.0);
#endif

    for( d=0; d<num_gpus0; d++ ) {
        magma_setdevice(d);
        magma_free( dA[d] );
        magma_event_destroy( event[d][0] );
        magma_event_destroy( event[d][1] );
        magma_queue_destroy( stream[d][0] );
        magma_queue_destroy( stream[d][1] );
        magmablasSetKernelStream(NULL);
    }
    magma_setdevice(0);
    
    }
    if( *info >= 0 ) magma_cgetrf_piv(m, n, NB, a, lda, ipiv, info);
    return *info;
} /* magma_cgetrf_m */