Beispiel #1
0
extern "C" magma_int_t
magma_zgetrf2_gpu(
    magma_int_t m, magma_int_t n,
    magmaDoubleComplex_ptr dA, size_t dA_offset, magma_int_t ldda,
    magma_int_t *ipiv,
    magma_queue_t queues[2],
    magma_int_t *info )
{
/*  -- clMAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date November 2014

    Purpose
    =======
    ZGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
        A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    DA_OFFSET (input) size_t
            Offset added to every index computed into the buffer dA
            (see the dA(i,j) macro below).

    LDDA     (input) INTEGER
            The leading dimension of the array A.  LDDA >= max(1,M).

    IPIV    (output) INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    QUEUES  (input) array of two queues
            queues[0] receives the transposes, row swaps and BLAS updates;
            queues[1] receives the panel transfers between GPU and host.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as memory allocation failed.
            > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.
    =====================================================================    */

    // Each macro expands to "buffer, offset" (two arguments); i_ and j_ are
    // block indices in units of nb, except for dAP where they are elements.
    #define  dA(i_, j_) dA,   dA_offset  + (i_)*nb       + (j_)*nb*ldda
    #define dAT(i_, j_) dAT,  dAT_offset + (i_)*nb*lddat + (j_)*nb
    #define dAP(i_, j_) dAP,               (i_)          + (j_)*maxm
    #define work(i_)   (work + (i_))

    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;

    magma_int_t iinfo, nb;
    magma_int_t maxm, maxn, mindim;
    magma_int_t i, j, rows, s, lddat, ldwork;
    magmaDoubleComplex_ptr dAT, dAP;
    magmaDoubleComplex *work;
    size_t dAT_offset;

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (ldda < max(1,m))
        *info = -4;   // NOTE: position of LDA in LAPACK's ZGETRF; dA_offset is not counted

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    nb     = magma_get_zgetrf_nb(m);
    s      = mindim / nb;   // number of full nb-wide panels

    if (nb <= 1 || nb >= min(m,n)) {
        /* Use CPU code. */
        if ( MAGMA_SUCCESS != magma_zmalloc_cpu( &work, m*n )) {
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        magma_zgetmatrix( m, n, dA(0,0), ldda, work(0), m, queues[0] );
        lapackf77_zgetrf( &m, &n, work, &m, ipiv, info );
        magma_zsetmatrix( m, n, work(0), m, dA(0,0), ldda, queues[0] );
        magma_free_cpu( work );
    }
    else {
        /* Use hybrid blocked code. */
        maxm = ((m + 31)/32)*32;
        maxn = ((n + 31)/32)*32;
        ldwork = maxm;

        if ( MAGMA_SUCCESS != magma_zmalloc( &dAP, nb*maxm )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }

        // Host workspace: a host-allocated OpenCL buffer mapped for read/write.
        // It is acquired BEFORE dA is touched so that a failure here returns
        // with the input matrix unmodified (not transposed in place).
        cl_int err = CL_SUCCESS;
        size_t work_bytes = (size_t) ldwork * (size_t) nb * sizeof(magmaDoubleComplex);
        cl_mem work_mapped = clCreateBuffer( gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, work_bytes, NULL, &err );
        if ( err != CL_SUCCESS || work_mapped == NULL ) {
            magma_free( dAP );
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        work = (magmaDoubleComplex*) clEnqueueMapBuffer( queues[0], work_mapped, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, work_bytes, 0, NULL, NULL, &err );
        if ( err != CL_SUCCESS || work == NULL ) {
            clReleaseMemObject( work_mapped );
            magma_free( dAP );
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }

        // square matrices can be done in place;
        // rectangular requires copy to transpose
        if ( m == n ) {
            dAT = dA;
            dAT_offset = dA_offset;
            lddat = ldda;
            magmablas_ztranspose_inplace( m, dAT(0,0), lddat, queues[0] );
        }
        else {
            lddat = maxn;  // N-by-M
            dAT_offset = 0;
            if ( MAGMA_SUCCESS != magma_zmalloc( &dAT, lddat*maxm )) {
                clEnqueueUnmapMemObject( queues[0], work_mapped, work, 0, NULL, NULL );
                clReleaseMemObject( work_mapped );
                magma_free( dAP );
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }
            magmablas_ztranspose( m, n, dA(0,0), ldda, dAT(0,0), lddat, queues[0] );
        }

        for( j=0; j < s; j++ ) {
            // download j-th panel
            magmablas_ztranspose( nb, m-j*nb, dAT(j,j), lddat, dAP(0,0), maxm, queues[0] );
            clFlush( queues[0] );
            magma_queue_sync( queues[0] );
            magma_zgetmatrix_async( m-j*nb, nb, dAP(0,0), maxm, work(0), ldwork, queues[1], NULL );
            clFlush( queues[1] );
            if ( j > 0 ) {
                // finish the update from the previous panel on queues[0]
                // while the current panel is being transferred on queues[1]
                magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n - (j+1)*nb, nb,
                             c_one, dAT(j-1,j-1), lddat,
                             dAT(j-1,j+1), lddat, queues[0] );
                magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(j+1)*nb, m-j*nb, nb,
                             c_neg_one, dAT(j-1,j+1), lddat,
                                        dAT(j,  j-1), lddat,
                             c_one,     dAT(j,  j+1), lddat, queues[0] );
            }

            magma_queue_sync( queues[1] );
            // do the cpu part
            rows = m - j*nb;
            lapackf77_zgetrf( &rows, &nb, work, &ldwork, ipiv+j*nb, &iinfo );
            if ( *info == 0 && iinfo > 0 )
                *info = iinfo + j*nb;

            // panel-local pivot indices -> indices relative to the full matrix
            for( i=j*nb; i < j*nb + nb; ++i ) {
                ipiv[i] += j*nb;
            }
            magmablas_zlaswp( n, dAT(0,0), lddat, j*nb + 1, j*nb + nb, ipiv, 1, queues[0] );
            clFlush( queues[0] );

            // upload j-th panel
            magma_zsetmatrix_async( m-j*nb, nb, work(0), ldwork, dAP(0,0), maxm, queues[1], NULL );
            magma_queue_sync( queues[1] );
            magmablas_ztranspose( m-j*nb, nb, dAP(0,0), maxm, dAT(j,j), lddat, queues[0] );
            clFlush( queues[0] );
            
            // do the small non-parallel computations (next panel update)
            if ( s > (j+1) ) {
                magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             nb, nb,
                             c_one, dAT(j, j  ), lddat,
                             dAT(j, j+1), lddat, queues[0] );
                magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                             nb, m-(j+1)*nb, nb,
                             c_neg_one, dAT(j,   j+1), lddat,
                                        dAT(j+1, j  ), lddat,
                             c_one,     dAT(j+1, j+1), lddat, queues[0] );
            }
            else {
                // last full panel: update everything that remains to its right
                magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n-s*nb, nb,
                             c_one, dAT(j, j  ), lddat,
                             dAT(j, j+1), lddat, queues[0] );
                magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(j+1)*nb, m-(j+1)*nb, nb,
                             c_neg_one, dAT(j,   j+1), lddat,
                                        dAT(j+1, j  ), lddat,
                             c_one,     dAT(j+1, j+1), lddat, queues[0] );
            }
        }

        // trailing panel, narrower than nb
        magma_int_t nb0 = min( m - s*nb, n - s*nb );
        if ( nb0 > 0 ) {
            rows = m - s*nb;
    
            magmablas_ztranspose( nb0, rows, dAT(s,s), lddat, dAP(0,0), maxm, queues[0] );
            clFlush( queues[0] );
            magma_queue_sync( queues[0] );
            magma_zgetmatrix_async( rows, nb0, dAP(0,0), maxm, work(0), ldwork, queues[1], NULL );
            magma_queue_sync( queues[1] );
            
            // do the cpu part
            lapackf77_zgetrf( &rows, &nb0, work, &ldwork, ipiv+s*nb, &iinfo );
            if ( (*info == 0) && (iinfo > 0) )
                *info = iinfo + s*nb;
            
            for( i=s*nb; i < s*nb + nb0; ++i ) {
                ipiv[i] += s*nb;
            }
            magmablas_zlaswp( n, dAT(0,0), lddat, s*nb + 1, s*nb + nb0, ipiv, 1, queues[0] );
            clFlush( queues[0] );
            
            // upload the trailing panel
            magma_zsetmatrix_async( rows, nb0, work(0), ldwork, dAP(0,0), maxm, queues[1], NULL );
            magma_queue_sync( queues[1] );
            magmablas_ztranspose( rows, nb0, dAP(0,0), maxm, dAT(s,s), lddat, queues[0] );
            clFlush( queues[0] );
    
            // dAT(s,s)+nb0: the dAT macro ends in an unparenthesized offset
            // expression, so "+nb0" is added to the offset argument.
            magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                         n-s*nb-nb0, nb0,
                         c_one, dAT(s,s),     lddat,
                         dAT(s,s)+nb0, lddat, queues[0] );
        }

        // undo transpose
        if ( dA == dAT ) {
            magmablas_ztranspose_inplace( m, dAT(0,0), lddat, queues[0] );
        }
        else {
            magmablas_ztranspose( n, m, dAT(0,0), lddat, dA(0,0), ldda, queues[0] );
            magma_free( dAT );
        }
        
        magma_queue_sync( queues[0] );
        magma_queue_sync( queues[1] );
        magma_free( dAP );
        clEnqueueUnmapMemObject( queues[0], work_mapped, work, 0, NULL, NULL );
        clReleaseMemObject( work_mapped );
    }

    return *info;
} /* magma_zgetrf2_gpu */
Beispiel #2
0
/**
    Purpose
    -------
    CGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
        A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.
    
    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    dA      COMPLEX array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    @param[in]
    ldda     INTEGER
            The leading dimension of the array A.  LDDA >= max(1,M).

    @param[out]
    ipiv    INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as memory allocation failed.
      -     > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    @ingroup magma_cgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_cgetrf_gpu(
    magma_int_t m, magma_int_t n,
    magmaFloatComplex_ptr dA, magma_int_t ldda,
    magma_int_t *ipiv,
    magma_int_t *info )
{
    // Element (i_, j_) accessors. With clBLAS each expands to two arguments
    // (buffer, offset); otherwise to a single pointer.
    #ifdef HAVE_clBLAS
    #define  dA(i_, j_) dA,  (dA_offset  + (i_)       + (j_)*ldda)
    #define dAT(i_, j_) dAT, (dAT_offset + (i_)*lddat + (j_))
    #define dAP(i_, j_) dAP, (             (i_)          + (j_)*maxm)
    #else
    #define  dA(i_, j_) (dA  + (i_)       + (j_)*ldda)
    #define dAT(i_, j_) (dAT + (i_)*lddat + (j_))
    #define dAP(i_, j_) (dAP + (i_)       + (j_)*maxm)
    #endif

    magmaFloatComplex c_one     = MAGMA_C_ONE;
    magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;

    magma_int_t iinfo, nb;
    magma_int_t maxm, maxn, minmn;
    magma_int_t i, j, jb, rows, lddat, ldwork;
    magmaFloatComplex_ptr dAT=NULL, dAP=NULL;
    magmaFloatComplex *work=NULL;

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (ldda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    minmn = min( m, n );
    nb    = magma_get_cgetrf_nb( m, n );

    // Two queues on the current device: queues[1] runs the transposes,
    // row swaps and BLAS updates; queues[0] runs the panel transfers.
    magma_queue_t queues[2] = { NULL };
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queues[0] );
    magma_queue_create( cdev, &queues[1] );

    if (nb <= 1 || nb >= min(m,n)) {
        /* Use CPU code. */
        if ( MAGMA_SUCCESS != magma_cmalloc_cpu( &work, m*n )) {
            *info = MAGMA_ERR_HOST_ALLOC;
            goto cleanup;
        }
        magma_cgetmatrix( m, n, dA(0,0), ldda, work, m, queues[0] );
        lapackf77_cgetrf( &m, &n, work, &m, ipiv, info );
        magma_csetmatrix( m, n, work, m, dA(0,0), ldda, queues[0] );
        // work came from magma_cmalloc_cpu; free it here and clear the pointer
        // so that cleanup's magma_free_pinned() does not see it.
        magma_free_cpu( work );  work=NULL;
    }
    else {
        /* Use hybrid blocked code. */
        maxm = magma_roundup( m, 32 );
        maxn = magma_roundup( n, 32 );

        if (MAGMA_SUCCESS != magma_cmalloc( &dAP, nb*maxm )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            goto cleanup;
        }

        // Pinned host workspace for one panel. Allocated BEFORE dA is
        // transposed so that a failure returns with dA unmodified.
        ldwork = maxm;
        if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, ldwork*nb )) {
            *info = MAGMA_ERR_HOST_ALLOC;
            goto cleanup;
        }

        // square matrices can be done in place;
        // rectangular requires copy to transpose
        if ( m == n ) {
            dAT = dA;
            lddat = ldda;
            magmablas_ctranspose_inplace( m, dAT(0,0), lddat, queues[0] );
        }
        else {
            lddat = maxn;  // N-by-M
            if (MAGMA_SUCCESS != magma_cmalloc( &dAT, lddat*maxm )) {
                *info = MAGMA_ERR_DEVICE_ALLOC;
                goto cleanup;
            }
            magmablas_ctranspose( m, n, dA(0,0), ldda, dAT(0,0), lddat, queues[0] );
        }
        magma_queue_sync( queues[0] );  // finish transpose

        for( j=0; j < minmn-nb; j += nb ) {
            // get j-th panel from device
            magmablas_ctranspose( nb, m-j, dAT(j,j), lddat, dAP(0,0), maxm, queues[1] );
            magma_queue_sync( queues[1] );  // wait for transpose
            magma_cgetmatrix_async( m-j, nb, dAP(0,0), maxm, work, ldwork, queues[0] );

            if ( j > 0 ) {
                // finish the update from the previous panel on queues[1]
                // while the current panel is transferred on queues[0]
                magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n-(j+nb), nb,
                             c_one, dAT(j-nb, j-nb), lddat,
                                    dAT(j-nb, j+nb), lddat, queues[1] );
                magma_cgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(j+nb), m-j, nb,
                             c_neg_one, dAT(j-nb, j+nb), lddat,
                                        dAT(j,    j-nb), lddat,
                             c_one,     dAT(j,    j+nb), lddat, queues[1] );
            }

            // do the cpu part
            rows = m - j;
            magma_queue_sync( queues[0] );  // wait to get work
            lapackf77_cgetrf( &rows, &nb, work, &ldwork, ipiv+j, &iinfo );
            if ( *info == 0 && iinfo > 0 )
                *info = iinfo + j;

            // send j-th panel to device
            magma_csetmatrix_async( m-j, nb, work, ldwork, dAP(0,0), maxm, queues[0] );

            // panel-local pivot indices -> indices relative to the full matrix
            for( i=j; i < j + nb; ++i ) {
                ipiv[i] += j;
            }
            magmablas_claswp( n, dAT(0,0), lddat, j + 1, j + nb, ipiv, 1, queues[1] );

            magma_queue_sync( queues[0] );  // wait to set dAP
            magmablas_ctranspose( m-j, nb, dAP(0,0), maxm, dAT(j,j), lddat, queues[1] );

            // do the small non-parallel computations (next panel update)
            if ( j + nb < minmn - nb ) {
                magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             nb, nb,
                             c_one, dAT(j, j   ), lddat,
                                    dAT(j, j+nb), lddat, queues[1] );
                magma_cgemm( MagmaNoTrans, MagmaNoTrans,
                             nb, m-(j+nb), nb,
                             c_neg_one, dAT(j,    j+nb), lddat,
                                        dAT(j+nb, j   ), lddat,
                             c_one,     dAT(j+nb, j+nb), lddat, queues[1] );
            }
            else {
                // last pass of this loop: update everything that remains
                magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n-(j+nb), nb,
                             c_one, dAT(j, j   ), lddat,
                                    dAT(j, j+nb), lddat, queues[1] );
                magma_cgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(j+nb), m-(j+nb), nb,
                             c_neg_one, dAT(j,    j+nb), lddat,
                                        dAT(j+nb, j   ), lddat,
                             c_one,     dAT(j+nb, j+nb), lddat, queues[1] );
            }
        }

        // final panel (j carries over from the loop above)
        jb = min( m-j, n-j );
        if ( jb > 0 ) {
            rows = m - j;
            
            magmablas_ctranspose( jb, rows, dAT(j,j), lddat, dAP(0,0), maxm, queues[1] );
            magma_cgetmatrix( rows, jb, dAP(0,0), maxm, work, ldwork, queues[1] );
            
            // do the cpu part
            lapackf77_cgetrf( &rows, &jb, work, &ldwork, ipiv+j, &iinfo );
            if ( *info == 0 && iinfo > 0 )
                *info = iinfo + j;
            
            for( i=j; i < j + jb; ++i ) {
                ipiv[i] += j;
            }
            magmablas_claswp( n, dAT(0,0), lddat, j + 1, j + jb, ipiv, 1, queues[1] );
            
            // send j-th panel to device
            magma_csetmatrix( rows, jb, work, ldwork, dAP(0,0), maxm, queues[1] );
            magmablas_ctranspose( rows, jb, dAP(0,0), maxm, dAT(j,j), lddat, queues[1] );
            
            magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                         n-j-jb, jb,
                         c_one, dAT(j,j),    lddat,
                                dAT(j,j+jb), lddat, queues[1] );
        }
        
        // undo transpose
        if ( m == n ) {
            magmablas_ctranspose_inplace( m, dAT(0,0), lddat, queues[1] );
        }
        else {
            magmablas_ctranspose( n, m, dAT(0,0), lddat, dA(0,0), ldda, queues[1] );
        }
    }
    
cleanup:
    magma_queue_destroy( queues[0] );
    magma_queue_destroy( queues[1] );
    
    magma_free( dAP );
    // for square matrices dAT aliases the caller's dA and must not be freed
    if (m != n) {
        magma_free( dAT );
    }
    magma_free_pinned( work );
    
    return *info;
} /* magma_cgetrf_gpu */