示例#1
0
/**
    Purpose
    -------
    CGELQF computes an LQ factorization of a COMPLEX M-by-N matrix A:
    A = L * Q.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    A       COMPLEX array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, the elements on and below the diagonal of the array
            contain the m-by-min(m,n) lower trapezoidal matrix L (L is
            lower triangular if m <= n); the elements above the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of elementary reflectors (see Further Details).
    \n
            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    @param[out]
    tau     COMPLEX array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    @param[out]
    work    (workspace) COMPLEX array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.
    \n
            Higher performance is achieved if WORK is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.  LWORK >= max(1,M).
            For optimum performance LWORK >= M*NB, where NB is the
            optimal blocksize.
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  if INFO = -10 internal GPU memory allocation failed.

    Further Details
    ---------------
    The matrix Q is represented as a product of elementary reflectors

       Q = H(k) . . . H(2) H(1), where k = min(m,n).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:n) is stored on exit in A(i,i+1:n),
    and tau in TAU(i).

    @ingroup magma_cgelqf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_cgelqf( magma_int_t m, magma_int_t n,
              magmaFloatComplex *A,    magma_int_t lda,   magmaFloatComplex *tau,
              magmaFloatComplex *work, magma_int_t lwork, magma_int_t *info)
{
    magmaFloatComplex *dA, *dAT;
    magmaFloatComplex c_one = MAGMA_C_ONE;
    magma_int_t maxm, maxn, maxdim, nb;
    magma_int_t iinfo, ldda;
    int lquery;

    /* Function Body */
    *info = 0;
    nb = magma_get_cgelqf_nb(m);

    work[0] = MAGMA_C_MAKE( (float)(m*nb), 0 );
    lquery = (lwork == -1);
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,m)) {
        *info = -4;
    } else if (lwork < max(1,m) && ! lquery) {
        *info = -7;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /*  Quick return if possible */
    if (min(m, n) == 0) {
        work[0] = c_one;
        return *info;
    }

    maxm = ((m + 31)/32)*32;
    maxn = ((n + 31)/32)*32;
    maxdim = max(maxm, maxn);

    if (maxdim*maxdim < 2*maxm*maxn) {
        ldda = maxdim;

        if (MAGMA_SUCCESS != magma_cmalloc( &dA, maxdim*maxdim )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }

        magma_csetmatrix( m, n, A, lda, dA, ldda );
        dAT = dA;
        magmablas_ctranspose_inplace( ldda, dAT, ldda );
    }
    else {
        ldda = maxn;

        if (MAGMA_SUCCESS != magma_cmalloc( &dA, 2*maxn*maxm )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }

        magma_csetmatrix( m, n, A, lda, dA, maxm );

        dAT = dA + maxn * maxm;
        magmablas_ctranspose( m, n, dA, maxm, dAT, ldda );
    }

    magma_cgeqrf2_gpu(n, m, dAT, ldda, tau, &iinfo);

    if (maxdim*maxdim < 2*maxm*maxn) {
        magmablas_ctranspose_inplace( ldda, dAT, ldda );
        magma_cgetmatrix( m, n, dA, ldda, A, lda );
    } else {
        magmablas_ctranspose( n, m, dAT, ldda, dA, maxm );
        magma_cgetmatrix( m, n, dA, maxm, A, lda );
    }

    magma_free( dA );

    return *info;
} /* magma_cgelqf */
示例#2
0
/**
    Purpose
    -------
    CGETRF_m computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.  This version does not
    require work space on the GPU passed as input. GPU memory is allocated
    in the routine. The matrix may exceed the GPU memory.

    The factorization has the form
       A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.

    Note: The factorization of big panel is done calling multiple-gpu-interface.
    Pivots are applied on GPU within the big panel.

    Arguments
    ---------
    @param[in]
    ngpu    INTEGER
            Number of GPUs to use. ngpu > 0.

    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    A       COMPLEX array, dimension (LDA,N)
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.
    \n
            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    @param[out]
    ipiv    INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
      -     > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    @ingroup magma_cgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_cgetrf_m(
    magma_int_t ngpu,
    magma_int_t m, magma_int_t n,
    magmaFloatComplex *A, magma_int_t lda, magma_int_t *ipiv,
    magma_int_t *info)
{
#define     A(i,j) (A      + (j)*lda + (i))
#define dAT(d,i,j) (dAT[d] + (i)*nb*ldn_local + (j)*nb)
#define dPT(d,i,j) (dPT[d] + (i)*nb*nb + (j)*nb*maxm)

    magma_timer_t time=0, time_total=0, time_alloc=0, time_set=0, time_get=0, time_comp=0;
    timer_start( time_total );
    real_Double_t flops;

    magmaFloatComplex c_one     = MAGMA_C_ONE;
    magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;
    magmaFloatComplex *dAT[MagmaMaxGPUs], *dA[MagmaMaxGPUs], *dPT[MagmaMaxGPUs];
    magma_int_t        iinfo = 0, nb, nbi, maxm, n_local[MagmaMaxGPUs], ldn_local;
    magma_int_t        N, M, NB, NBk, I, d, ngpu0 = ngpu;
    magma_int_t        ii, jj, h, offset, ib, rows;
    
    magma_queue_t stream[MagmaMaxGPUs][2];
    magma_event_t  event[MagmaMaxGPUs][2];

    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (lda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );
    
    /* initialize nb */
    nb = magma_get_cgetrf_nb(m);
    maxm = ((m  + 31)/32)*32;

    /* figure out NB */
    size_t freeMem, totalMem;
    cudaMemGetInfo( &freeMem, &totalMem );
    freeMem /= sizeof(magmaFloatComplex);
    
    /* number of columns in the big panel */
    h = 1+(2+ngpu0);
    NB = (magma_int_t)(0.8*freeMem/maxm-h*nb);
    const char* ngr_nb_char = getenv("MAGMA_NGR_NB");
    if ( ngr_nb_char != NULL )
        NB = max( nb, min( NB, atoi(ngr_nb_char) ) );
    //NB = 5*max(nb,32);

    if ( ngpu0 > ceil((float)NB/nb) ) {
        ngpu = (int)ceil((float)NB/nb);
        h = 1+(2+ngpu);
        NB = (magma_int_t)(0.8*freeMem/maxm-h*nb);
    } else {
        ngpu = ngpu0;
    }
    if ( ngpu*NB >= n ) {
        #ifdef CHECK_CGETRF_OOC
        printf( "      * still fit in GPU memory.\n" );
        #endif
        NB = n;
    } else {
        #ifdef CHECK_CGETRF_OOC
        printf( "      * don't fit in GPU memory.\n" );
        #endif
        NB = ngpu*NB;
        NB = max( nb, (NB / nb) * nb); /* making sure it's devisable by nb (x64) */
    }

    #ifdef CHECK_CGETRF_OOC
    if ( NB != n ) printf( "      * running in out-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n", n, NB, nb, (float)freeMem );
    else           printf( "      * running in in-core mode  (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n", n, NB, nb, (float)freeMem );
    #endif

    if ( (nb <= 1) || (nb >= min(m,n)) ) {
        /* Use CPU code for scalar of one tile. */
        lapackf77_cgetrf(&m, &n, A, &lda, ipiv, info);
    } else {
        /* Use hybrid blocked code. */

        /* allocate memory on GPU to store the big panel */
        timer_start( time_alloc );
        n_local[0] = (NB/nb)/ngpu;
        if ( NB%(nb*ngpu) != 0 )
            n_local[0]++;
        n_local[0] *= nb;
        ldn_local = ((n_local[0]+31)/32)*32;
    
        for( d=0; d < ngpu; d++ ) {
            magma_setdevice(d);
            if (MAGMA_SUCCESS != magma_cmalloc( &dA[d], (ldn_local+h*nb)*maxm )) {
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }
            dPT[d] = dA[d] + nb*maxm;      /* for storing the previous panel from CPU */
            dAT[d] = dA[d] + h*nb*maxm;    /* for storing the big panel               */
            magma_queue_create( &stream[d][0] );
            magma_queue_create( &stream[d][1] );
            magma_event_create( &event[d][0] );
            magma_event_create( &event[d][1] );
        }
        //magma_setdevice(0);
        timer_stop( time_alloc );
        
        for( I=0; I < n; I += NB ) {
            M = m;
            N = min( NB, n-I );       /* number of columns in this big panel             */
            //s = min( max(m-I,0), N )/nb; /* number of small block-columns in this big panel */
    
            maxm = ((M + 31)/32)*32;
            if ( ngpu0 > ceil((float)N/nb) ) {
                ngpu = (int)ceil((float)N/nb);
            } else {
                ngpu = ngpu0;
            }
    
            for( d=0; d < ngpu; d++ ) {
                n_local[d] = ((N/nb)/ngpu)*nb;
                if (d < (N/nb)%ngpu)
                    n_local[d] += nb;
                else if (d == (N/nb)%ngpu)
                    n_local[d] += N%nb;
            }
            ldn_local = ((n_local[0]+31)/32)*32;
            
            /* upload the next big panel into GPU, transpose (A->A'), and pivot it */
            timer_start( time );
            magmablas_csetmatrix_transpose_mgpu(ngpu, stream, A(0,I), lda,
                                                dAT, ldn_local, dA, maxm, M, N, nb);
            for( d=0; d < ngpu; d++ ) {
                magma_setdevice(d);
                magma_queue_sync( stream[d][0] );
                magma_queue_sync( stream[d][1] );
                magmablasSetKernelStream(NULL);
            }
            time_set += timer_stop( time );
    
            timer_start( time );
            /* == --------------------------------------------------------------- == */
            /* == loop around the previous big-panels to update the new big-panel == */
            for( offset = 0; offset < min(m,I); offset += NB ) {
                NBk = min( m-offset, NB );
                /* start sending the first tile from the previous big-panels to gpus */
                for( d=0; d < ngpu; d++ ) {
                    magma_setdevice(d);
                    nbi  = min( nb, NBk );
                    magma_csetmatrix_async( (M-offset), nbi,
                                            A(offset,offset), lda,
                                            dA[d],            (maxm-offset), stream[d][0] );
                    
                    /* make sure the previous update finished */
                    magmablasSetKernelStream(stream[d][0]);
                    //magma_queue_sync( stream[d][1] );
                    magma_queue_wait_event( stream[d][0], event[d][0] );
                    
                    /* transpose */
                    magmablas_ctranspose( M-offset, nbi, dA[d], maxm-offset, dPT(d,0,0), nb );
                }
                
                /* applying the pivot from the previous big-panel */
                for( d=0; d < ngpu; d++ ) {
                    magma_setdevice(d);
                    magmablas_claswp_q( ldn_local, dAT(d,0,0), ldn_local, offset+1, offset+NBk, ipiv, 1, stream[d][1] );
                }
                
                /* == going through each block-column of previous big-panels == */
                for( jj=0, ib=offset/nb; jj < NBk; jj += nb, ib++ ) {
                    ii   = offset+jj;
                    rows = maxm - ii;
                    nbi  = min( nb, NBk-jj );
                    for( d=0; d < ngpu; d++ ) {
                        magma_setdevice(d);
                        
                        /* wait for a block-column on GPU */
                        magma_queue_sync( stream[d][0] );
                        
                        /* start sending next column */
                        if ( jj+nb < NBk ) {
                            magma_csetmatrix_async( (M-ii-nb), min(nb,NBk-jj-nb),
                                                    A(ii+nb,ii+nb), lda,
                                                    dA[d],          (rows-nb), stream[d][0] );
                            
                            /* make sure the previous update finished */
                            magmablasSetKernelStream(stream[d][0]);
                            //magma_queue_sync( stream[d][1] );
                            magma_queue_wait_event( stream[d][0], event[d][(1+jj/nb)%2] );
                            
                            /* transpose next column */
                            magmablas_ctranspose( M-ii-nb, nb, dA[d], rows-nb, dPT(d,0,(1+jj/nb)%2), nb );
                        }
                        
                        /* update with the block column */
                        magmablasSetKernelStream(stream[d][1]);
                        magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                                     n_local[d], nbi, c_one, dPT(d,0,(jj/nb)%2), nb, dAT(d,ib,0), ldn_local );
                        if ( M > ii+nb ) {
                            magma_cgemm( MagmaNoTrans, MagmaNoTrans,
                                n_local[d], M-(ii+nb), nbi, c_neg_one, dAT(d,ib,0), ldn_local,
                                dPT(d,1,(jj/nb)%2), nb, c_one, dAT(d,ib+1,0), ldn_local );
                        }
                        magma_event_record( event[d][(jj/nb)%2], stream[d][1] );
                    
                    } /* end of for each block-columns in a big-panel */
                }
            } /* end of for each previous big-panels */
            for( d=0; d < ngpu; d++ ) {
                magma_setdevice(d);
                magma_queue_sync( stream[d][0] );
                magma_queue_sync( stream[d][1] );
            magmablasSetKernelStream(NULL);
            }
    
            /* calling magma-gpu interface to panel-factorize the big panel */
            if ( M > I ) {
                magma_cgetrf2_mgpu(ngpu, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, A(0,I), lda,
                                   stream, &iinfo);
                if ( iinfo < 0 ) {
                    *info = iinfo;
                    break;
                } else if ( iinfo != 0 ) {
                    *info = iinfo + I * NB;
                    //break;
                }
                /* adjust pivots */
                for( ii=I; ii < min(I+N,m); ii++ )
                    ipiv[ii] += I;
            }
            time_comp += timer_stop( time );
    
            /* download the current big panel to CPU */
            timer_start( time );
            magmablas_cgetmatrix_transpose_mgpu(ngpu, stream, dAT, ldn_local, A(0,I), lda, dA, maxm, M, N, nb);
            for( d=0; d < ngpu; d++ ) {
                magma_setdevice(d);
                magma_queue_sync( stream[d][0] );
                magma_queue_sync( stream[d][1] );
            magmablasSetKernelStream(NULL);
            }
            time_get += timer_stop( time );
        } /* end of for */
    
        timer_stop( time_total );
        flops = FLOPS_CGETRF( m, n ) / 1e9;
        timer_printf(" memory-allocation time: %e\n", time_alloc );
        timer_printf(" NB=%d nb=%d\n", (int) NB, (int) nb );
        timer_printf(" memcopy and transpose %e seconds\n", time_set );
        timer_printf(" total time %e seconds\n", time_total );
        timer_printf(" Performance %f GFlop/s, %f seconds without htod and dtoh\n",     flops / (time_comp),               time_comp               );
        timer_printf(" Performance %f GFlop/s, %f seconds with    htod\n",              flops / (time_comp + time_set),    time_comp + time_set    );
        timer_printf(" Performance %f GFlop/s, %f seconds with    dtoh\n",              flops / (time_comp + time_get),    time_comp + time_get    );
        timer_printf(" Performance %f GFlop/s, %f seconds without memory-allocation\n", flops / (time_total - time_alloc), time_total - time_alloc );
    
        for( d=0; d < ngpu0; d++ ) {
            magma_setdevice(d);
            magma_free( dA[d] );
            magma_event_destroy( event[d][0] );
            magma_event_destroy( event[d][1] );
            magma_queue_destroy( stream[d][0] );
            magma_queue_destroy( stream[d][1] );
        }
        magma_setdevice( orig_dev );
        magmablasSetKernelStream( orig_stream );
    }
    if ( *info >= 0 )
        magma_cgetrf_piv(m, n, NB, A, lda, ipiv, info);
    return *info;
} /* magma_cgetrf_m */
示例#3
0
/**
    Purpose
    -------
    CGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.  This version does not
    require work space on the GPU passed as input. GPU memory is allocated
    in the routine.

    The factorization has the form
        A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.
    If the current stream is NULL, this version replaces it with user defined
    stream to overlap computation with communication.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    A       COMPLEX array, dimension (LDA,N)
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.
    \n
            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    @param[out]
    ipiv    INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
      -     > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    @ingroup magma_cgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_cgetrf(magma_int_t m, magma_int_t n, magmaFloatComplex *A, magma_int_t lda,
             magma_int_t *ipiv, magma_int_t *info)
{
#define dAT(i,j) (dAT + (i)*nb*ldda + (j)*nb)

    magmaFloatComplex *dAT, *dA, *da, *work;
    magmaFloatComplex c_one     = MAGMA_C_ONE;
    magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;
    magma_int_t     iinfo, nb;

    *info = 0;

    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (lda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    nb = magma_get_cgetrf_nb(m);

    if ( (nb <= 1) || (nb >= min(m,n)) ) {
        /* Use CPU code. */
        lapackf77_cgetrf(&m, &n, A, &lda, ipiv, info);
    } else {
        /* Use hybrid blocked code. */
        magma_int_t maxm, maxn, ldda, maxdim;
        magma_int_t i, rows, cols, s = min(m, n)/nb;
        
        maxm = ((m + 31)/32)*32;
        maxn = ((n + 31)/32)*32;
        maxdim = max(maxm, maxn);

        /* set number of GPUs */
        magma_int_t num_gpus = magma_num_gpus();
        if ( num_gpus > 1 ) {
            /* call multi-GPU non-GPU-resident interface  */
            magma_cgetrf_m(num_gpus, m, n, A, lda, ipiv, info);
            return *info;
        }

        /* explicitly checking the memory requirement */
        size_t freeMem, totalMem;
        cudaMemGetInfo( &freeMem, &totalMem );
        freeMem /= sizeof(magmaFloatComplex);

        int h = 1+(2+num_gpus), num_gpus2 = num_gpus;
        int NB = (magma_int_t)(0.8*freeMem/maxm-h*nb);
        const char* ngr_nb_char = getenv("MAGMA_NGR_NB");
        if ( ngr_nb_char != NULL )
            NB = max( nb, min( NB, atoi(ngr_nb_char) ) );

        if ( num_gpus > ceil((float)NB/nb) ) {
            num_gpus2 = (int)ceil((float)NB/nb);
            h = 1+(2+num_gpus2);
            NB = (magma_int_t)(0.8*freeMem/maxm-h*nb);
        }
        if ( num_gpus2*NB < n ) {
            /* require too much memory, so call non-GPU-resident version */
            magma_cgetrf_m(num_gpus, m, n, A, lda, ipiv, info);
            return *info;
        }

        ldda = maxn;
        work = A;
        if (maxdim*maxdim < 2*maxm*maxn) {
            // if close to square, allocate square matrix and transpose in-place
            if (MAGMA_SUCCESS != magma_cmalloc( &dA, nb*maxm + maxdim*maxdim )) {
                /* alloc failed so call non-GPU-resident version */
                magma_cgetrf_m(num_gpus, m, n, A, lda, ipiv, info);
                return *info;
            }
            da = dA + nb*maxm;
            
            ldda = maxdim;
            magma_csetmatrix( m, n, A, lda, da, ldda );
            
            dAT = da;
            magmablas_ctranspose_inplace( ldda, dAT, ldda );
        }
        else {
            // if very rectangular, allocate dA and dAT and transpose out-of-place
            if (MAGMA_SUCCESS != magma_cmalloc( &dA, (nb + maxn)*maxm )) {
                /* alloc failed so call non-GPU-resident version */
                magma_cgetrf_m(num_gpus, m, n, A, lda, ipiv, info);
                return *info;
            }
            da = dA + nb*maxm;
            
            magma_csetmatrix( m, n, A, lda, da, maxm );
            
            if (MAGMA_SUCCESS != magma_cmalloc( &dAT, maxm*maxn )) {
                /* alloc failed so call non-GPU-resident version */
                magma_free( dA );
                magma_cgetrf_m(num_gpus, m, n, A, lda, ipiv, info);
                return *info;
            }

            magmablas_ctranspose( m, n, da, maxm, dAT, ldda );
        }
        
        lapackf77_cgetrf( &m, &nb, work, &lda, ipiv, &iinfo);

        /* Define user stream if current stream is NULL */
        cudaStream_t stream[2], current_stream;
        magmablasGetKernelStream(&current_stream);

        magma_queue_create( &stream[0] );
        if (current_stream == NULL) {
            magma_queue_create( &stream[1] );
            magmablasSetKernelStream(stream[1]);
        }
        else
            stream[1] = current_stream;

        for( i = 0; i < s; i++ ) {
            // download i-th panel
            cols = maxm - i*nb;
            
            if (i > 0) {
                // download i-th panel
                magmablas_ctranspose( nb, cols, dAT(i,i), ldda, dA, cols );

                // make sure that gpu queue is empty
                magma_device_sync();

                magma_cgetmatrix_async( m-i*nb, nb, dA, cols, work, lda,
                                        stream[0]);
                
                magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n - (i+1)*nb, nb,
                             c_one, dAT(i-1,i-1), ldda,
                                    dAT(i-1,i+1), ldda );
                magma_cgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(i+1)*nb, m-i*nb, nb,
                             c_neg_one, dAT(i-1,i+1), ldda,
                                        dAT(i,  i-1), ldda,
                             c_one,     dAT(i,  i+1), ldda );

                // do the cpu part
                rows = m - i*nb;
                magma_queue_sync( stream[0] );
                lapackf77_cgetrf( &rows, &nb, work, &lda, ipiv+i*nb, &iinfo);
            }
            if (*info == 0 && iinfo > 0)
                *info = iinfo + i*nb;

            // upload i-th panel
            magma_csetmatrix_async( m-i*nb, nb, work, lda, dA, cols,
                                    stream[0]);

            magmablas_cpermute_long2( ldda, dAT, ldda, ipiv, nb, i*nb );

            magma_queue_sync( stream[0] );
            magmablas_ctranspose( cols, nb, dA, cols, dAT(i,i), ldda );

            // do the small non-parallel computations
            if (s > (i+1)) {
                magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             nb, nb,
                             c_one, dAT(i, i  ), ldda,
                                    dAT(i, i+1), ldda);
                magma_cgemm( MagmaNoTrans, MagmaNoTrans,
                             nb, m-(i+1)*nb, nb,
                             c_neg_one, dAT(i,   i+1), ldda,
                                        dAT(i+1, i  ), ldda,
                             c_one,     dAT(i+1, i+1), ldda );
            }
            else {
                magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n-s*nb, nb,
                             c_one, dAT(i, i  ), ldda,
                                    dAT(i, i+1), ldda);
                magma_cgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(i+1)*nb, m-(i+1)*nb, nb,
                             c_neg_one, dAT(i,   i+1), ldda,
                                        dAT(i+1, i  ), ldda,
                             c_one,     dAT(i+1, i+1), ldda );
            }
        }
        
        magma_int_t nb0 = min(m - s*nb, n - s*nb);
        if ( nb0 > 0 ) {
            rows = m - s*nb;
            cols = maxm - s*nb;
    
            magmablas_ctranspose( nb0, rows, dAT(s,s), ldda, dA, cols );
            magma_cgetmatrix( rows, nb0, dA, cols, work, lda );
    
            // make sure that gpu queue is empty
            magma_device_sync();
    
            // do the cpu part
            lapackf77_cgetrf( &rows, &nb0, work, &lda, ipiv+s*nb, &iinfo);
            if (*info == 0 && iinfo > 0)
                *info = iinfo + s*nb;
            magmablas_cpermute_long2( ldda, dAT, ldda, ipiv, nb0, s*nb );
    
            magma_csetmatrix( rows, nb0, work, lda, dA, cols );
            magmablas_ctranspose( rows, nb0, dA, cols, dAT(s,s), ldda );
    
            magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                         n-s*nb-nb0, nb0,
                         c_one, dAT(s, s),     ldda,
                                dAT(s, s)+nb0, ldda);
        }
       
        if (maxdim*maxdim < 2*maxm*maxn) {
            magmablas_ctranspose_inplace( ldda, dAT, ldda );
            magma_cgetmatrix( m, n, da, ldda, A, lda );
        } else {
            magmablas_ctranspose( n, m, dAT, ldda, da, maxm );
            magma_cgetmatrix( m, n, da, maxm, A, lda );
            magma_free( dAT );
        }

        magma_free( dA );
 
        magma_queue_destroy( stream[0] );
        if (current_stream == NULL) {
            magma_queue_destroy( stream[1] );
            magmablasSetKernelStream(NULL);
        }
    }
    
    return *info;
} /* magma_cgetrf */
示例#4
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing ctranspose
   Code is very similar to testing_csymmetrize.cpp
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    real_Double_t    gbytes, gpu_perf, gpu_time, gpu_perf2=0, gpu_time2=0, cpu_perf, cpu_time;
    float           error, error2, work[1];
    magmaFloatComplex  c_neg_one = MAGMA_C_NEG_ONE;
    magmaFloatComplex *h_A, *h_B, *h_R;
    magmaFloatComplex *d_A, *d_B;
    magma_int_t M, N, size, lda, ldda, ldb, lddb;
    magma_int_t ione     = 1;
    magma_int_t status = 0;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );

    printf("Inplace transpose requires M==N.\n");
    printf("    M     N   CPU GByte/s (ms)    GPU GByte/s (ms)  check   Inplace GB/s (ms)  check\n");
    printf("====================================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[itest];
            N = opts.nsize[itest];
            lda    = M;
            ldda   = ((M+31)/32)*32;
            ldb    = N;
            lddb   = ((N+31)/32)*32;
            // load entire matrix, save entire matrix
            gbytes = sizeof(magmaFloatComplex) * 2.*M*N / 1e9;
            
            TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*N  );  // input:  M x N
            TESTING_MALLOC_CPU( h_B, magmaFloatComplex, ldb*M  );  // output: N x M
            TESTING_MALLOC_CPU( h_R, magmaFloatComplex, ldb*M  );  // output: N x M
            
            TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N );  // input:  M x N
            TESTING_MALLOC_DEV( d_B, magmaFloatComplex, lddb*M );  // output: N x M
            
            /* Initialize the matrix */
            for( int j = 0; j < N; ++j ) {
                for( int i = 0; i < M; ++i ) {
                    h_A[i + j*lda] = MAGMA_C_MAKE( i + j/10000., j );
                }
            }
            for( int j = 0; j < M; ++j ) {
                for( int i = 0; i < N; ++i ) {
                    h_B[i + j*ldb] = MAGMA_C_MAKE( i + j/10000., j );
                }
            }
            magma_csetmatrix( N, M, h_B, ldb, d_B, lddb );
            
            /* =====================================================================
               Performs operation using naive out-of-place algorithm
               (LAPACK doesn't implement transpose)
               =================================================================== */
            cpu_time = magma_wtime();
            //for( int j = 1; j < N-1; ++j ) {      // inset by 1 row & col
            //    for( int i = 1; i < M-1; ++i ) {  // inset by 1 row & col
            for( int j = 0; j < N; ++j ) {
                for( int i = 0; i < M; ++i ) {
                    h_B[j + i*ldb] = h_A[i + j*lda];
                }
            }
            cpu_time = magma_wtime() - cpu_time;
            cpu_perf = gbytes / cpu_time;
            
            /* ====================================================================
               Performs operation using MAGMA, out-of-place
               =================================================================== */
            magma_csetmatrix( M, N, h_A, lda, d_A, ldda );
            magma_csetmatrix( N, M, h_B, ldb, d_B, lddb );
            
            gpu_time = magma_sync_wtime( 0 );
            //magmablas_ctranspose( M-2, N-2, d_A+1+ldda, ldda, d_B+1+lddb, lddb );  // inset by 1 row & col
            magmablas_ctranspose( M, N, d_A, ldda, d_B, lddb );
            gpu_time = magma_sync_wtime( 0 ) - gpu_time;
            gpu_perf = gbytes / gpu_time;
            
            /* ====================================================================
               Performs operation using MAGMA, in-place
               =================================================================== */
            if ( M == N ) {
                magma_csetmatrix( M, N, h_A, lda, d_A, ldda );
                
                gpu_time2 = magma_sync_wtime( 0 );
                //magmablas_ctranspose_inplace( N-2, d_A+1+ldda, ldda );  // inset by 1 row & col
                magmablas_ctranspose_inplace( N, d_A, ldda );
                gpu_time2 = magma_sync_wtime( 0 ) - gpu_time2;
                gpu_perf2 = gbytes / gpu_time2;
            }
            
            /* =====================================================================
               Check the result
               =================================================================== */
            // check out-of-place transpose (d_B)
            size = ldb*M;
            magma_cgetmatrix( N, M, d_B, lddb, h_R, ldb );
            blasf77_caxpy( &size, &c_neg_one, h_B, &ione, h_R, &ione );
            error = lapackf77_clange("f", &N, &M, h_R, &ldb, work );
            
            if ( M == N ) {
                // also check in-place tranpose (d_A)
                magma_cgetmatrix( N, M, d_A, ldda, h_R, ldb );
                blasf77_caxpy( &size, &c_neg_one, h_B, &ione, h_R, &ione );
                error2 = lapackf77_clange("f", &N, &M, h_R, &ldb, work );
    
                printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)  %6s  %7.2f (%7.2f)  %s\n",
                       (int) M, (int) N,
                       cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000.,
                       (error  == 0. ? "ok" : "failed"),
                       gpu_perf2, gpu_time2,
                       (error2 == 0. ? "ok" : "failed") );
                status += ! (error == 0. && error2 == 0.);
            }
            else {
                printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)  %6s    ---   (  ---  )\n",
                       (int) M, (int) N,
                       cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000.,
                       (error  == 0. ? "ok" : "failed") );
                status += ! (error == 0.);
            }
            
            TESTING_FREE_CPU( h_A );
            TESTING_FREE_CPU( h_B );
            TESTING_FREE_CPU( h_R );
            
            TESTING_FREE_DEV( d_A );
            TESTING_FREE_DEV( d_B );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    TESTING_FINALIZE();
    return status;
}
示例#5
0
/**
    Purpose
    -------
    CGETRF_INCPIV computes an LU factorization of a general M-by-N tile A
    using partial pivoting with row interchanges.

    The factorization has the form

      A = P * L * U

    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 2.5 BLAS version of the algorithm.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in]
    ib      INTEGER
            The inner-blocking size.  IB >= 0.

    @param[in,out]
    hA      COMPLEX array, dimension(LDHA, N), on cpu.
            On entry, only the M-by-IB first panel needs to be identical to dA(1..M, 1..IB).
            On exit, the content is incomplete. Shouldn't be used.

    @param[in]
    ldha    INTEGER
            The leading dimension of the array hA.  LDHA >= max(1,M).

    @param[in,out]
    dA      COMPLEX array, dimension(LDDA, N), on gpu.
            On entry, the M-by-N tile to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).

    @param[out]
    hL      COMPLEX array, dimension(LDHL, min(M,N)), on vpu.
            On exit, contains in the upper part the IB-by-K lower triangular tile,
            and in the lower part IB-by-min(M,N) the inverse of the top part.

    @param[in]
    ldhl    INTEGER
            The leading dimension of the array hL.  LDHL >= max(1,2*IB).

    @param[out]
    dL      COMPLEX array, dimension(LDDL, K), on gpu.
            On exit, contains in the upper part the IB-by-min(M,N) lower triangular tile,
            and in the lower part IB-by-min(M,N) the inverse of the top part.

    @param[in]
    lddl    INTEGER
            The leading dimension of the array dL.  LDDL >= max(1,2*IB).

    @param[out]
    ipiv    INTEGER array, dimension min(M,N), on the cpu.
            The pivot indices array.

    @param[out]
    dWORK   COMPLEX array, dimension(LDDWORK, 2*IB), on gpu.
            Workspace.

    @param[in]
    lddwork INTEGER
            The leading dimension of the array dWORK.  LDDWORK >= max(NB, 1).

    @param[out]
    info    INTEGER
            - PLASMA_SUCCESS successful exit
            - < 0 if INFO = -k, the k-th argument had an illegal value
            - > 0 if INFO = k, U(k,k) is exactly zero. The factorization
                has been completed, but the factor U is exactly
                singular, and division by zero will occur if it is used
                to solve a system of equations.

    @ingroup magma_cgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_cgetrf_incpiv_gpu( magma_order_t order, magma_int_t m, magma_int_t n, magma_int_t ib,
                         magmaFloatComplex *hA, magma_int_t ldha, magmaFloatComplex *dA, magma_int_t ldda,
                         magmaFloatComplex *hL, magma_int_t ldhl, magmaFloatComplex *dL, magma_int_t lddl,
                         magma_int_t *ipiv,
                         magmaFloatComplex *dwork, magma_int_t lddwork,
                         magma_int_t *info)
{
#define AT(i,j) (dAT + (i)*ib*ldda + (j)*ib)
#define hA(i,j) (hA  + (i)*ib + (j)*ib*ldha)
#define hL(j)   (hL  + (j)*ib*ldhl         )
#define hL2(j)  (hL2 + (j)*ib*ldhl         )
#define dL(j)   (dL  + (j)*ib*lddl         )
#define dL2(j)  (dL2 + (j)*ib*lddl         )

    magmaFloatComplex c_one     = MAGMA_C_ONE;
    magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;

    magma_int_t iinfo;
    magma_int_t maxm, mindim;
    magma_int_t i, rows, cols, s, ii, sb;
    magmaFloatComplex *dAT;
#ifndef WITHOUTTRTRI
    magmaFloatComplex *dL2 = dL + ib;
    magmaFloatComplex *hL2 = hL + ib;
#endif

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (ldda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    s      = mindim / ib;

    if ( ib >= mindim ) {
        /* Use CPU code. */
        lapackf77_cgetrf(&m, &n, hA, &ldha, ipiv, info);

#ifndef WITHOUTTRTRI
        CORE_clacpy(PlasmaUpperLower, mindim, mindim,
                    (PLASMA_Complex32_t*)hA, ldha,
                    (PLASMA_Complex32_t*)hL2, ldhl );

        CORE_ctrtri( PlasmaLower, PlasmaUnit, mindim,
                     (PLASMA_Complex32_t*)hL2, ldhl, info );
        if (*info != 0 ) {
            fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info);
        }

        magma_csetmatrix( mindim, mindim, hL2, ldhl, dL2, lddl );
#endif

        if ( order == MagmaRowMajor ) {
            magma_csetmatrix( m, n, hA, ldha, dwork, lddwork );
            magmablas_ctranspose( m, n, dwork, lddwork, dA, ldda );
        } else {
            magma_csetmatrix( m, n, hA, ldha, dA, ldda );
        }
    }
    else {
        /* Use hybrid blocked code. */
        maxm = ((m + 31)/32)*32;

        if ( order == MagmaColMajor ) {
            magmablas_cgetmo_in( dA, dAT, ldda, m, n );
        } else {
            dAT = dA;
        }

        for( i=0; i < s; i++ ) {
            ii = i * ib;
            sb = min(ib, mindim-ii);
            cols = maxm - ii;

            if ( i > 0 ) {
                // download i-th panel
                magmablas_ctranspose( sb, m, AT(0,i), ldda, dwork, maxm );
                magma_cgetmatrix( m, sb, dwork, maxm, hA(0, i), ldha );

                // make sure that gpu queue is empty
                //magma_device_sync();
#ifndef WITHOUTTRTRI
                magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             n - (ii+sb), ib,
                             c_one, dL2(i-1),    lddl,
                                    AT(i-1,i+1), ldda );
#else
                magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n - (ii+sb), ib,
                             c_one, AT(i-1,i-1), ldda,
                                    AT(i-1,i+1), ldda );
#endif
                magma_cgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(ii+sb), m-ii, ib,
                             c_neg_one, AT(i-1,i+1), ldda,
                                        AT(i,  i-1), ldda,
                             c_one,     AT(i,  i+1), ldda );
            }

            // do the cpu part
            rows = m - ii;
            lapackf77_cgetrf( &rows, &sb, hA(i, i), &ldha, ipiv+ii, &iinfo);
            if ( (*info == 0) && (iinfo > 0) )
                *info = iinfo + ii;

            {
                int j;
                int fin = ii + sb;
                for (j=ii; j < fin; j++) {
                    ipiv[j] = ii + ipiv[j];
                }
            }
            magmablas_claswp( n-ii, AT(0, i), ldda, ii+1, ii+sb, ipiv, 1 );

#ifndef WITHOUTTRTRI
            CORE_clacpy(PlasmaLower, sb, sb,
                        (PLASMA_Complex32_t*)hA(i, i), ldha,
                        (PLASMA_Complex32_t*)hL2(i), ldhl );

            CORE_ctrtri( PlasmaLower, PlasmaUnit, sb,
                         (PLASMA_Complex32_t*)hL2(i), ldhl, info );
            if (*info != 0 ) {
                fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info);
            }
            magma_csetmatrix( sb, sb, hL2(i), ldhl, dL2(i), lddl );
#endif
            // upload i-th panel
            magma_csetmatrix( rows, sb, hA(i, i), ldha, dwork, cols );
            magmablas_ctranspose( rows, sb, dwork, cols, AT(i,i), ldda );

            // do the small non-parallel computations
            if ( s > (i+1) ) {
#ifndef WITHOUTTRTRI
                magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             sb, sb,
                             c_one, dL2(i),     lddl,
                                    AT(i, i+1), ldda);
#else
                magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             sb, sb,
                             c_one, AT(i, i  ), ldda,
                                    AT(i, i+1), ldda);
#endif
                magma_cgemm( MagmaNoTrans, MagmaNoTrans,
                             sb, m-(ii+sb), sb,
                             c_neg_one, AT(i,   i+1), ldda,
                                        AT(i+1, i  ), ldda,
                             c_one,     AT(i+1, i+1), ldda );
            }
            else {
                /* Update of the last panel */
#ifndef WITHOUTTRTRI
                magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             n-mindim, sb,
                             c_one, dL2(i),     lddl,
                                    AT(i, i+1), ldda);
#else
                magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n-mindim, sb,
                             c_one, AT(i, i  ), ldda,
                                    AT(i, i+1), ldda);
#endif
                /* m-(ii+sb) should be always 0 */
                magma_cgemm( MagmaNoTrans, MagmaNoTrans,
                             n-mindim, m-(ii+sb), sb,
                             c_neg_one, AT(i,   i+1), ldda,
                                        AT(i+1, i  ), ldda,
                             c_one,     AT(i+1, i+1), ldda );
            }
        }

        if ( order == MagmaColMajor ) {
            magmablas_cgetmo_out( dA, dAT, ldda, m, n );
        }
    }
    return *info;
}
示例#6
0
extern "C" int calc_bounding_box(magmaFloatComplex *M, magma_int_t M_lead_dim, float *wReEig, float *wImEig)
{
	magma_int_t rslt = 0;

	//magmaFloatComplex *AT = nullptr;
	magmaFloatComplex *dA = nullptr, *dAT = nullptr,
		*dreA = nullptr, *dimA = nullptr;

	float *dreEig = nullptr;
	float *dimEig = nullptr;

	//magma_int_t *ipiv = NULL;
	magma_int_t lda = M_lead_dim;
	//magma_int_t ldx = lda;
	magma_int_t info = 0;

	magma_int_t nb = 0;

	//magma_vec_t jobvl;
	//magma_vec_t jobvr;

	magmaFloatComplex *work = nullptr;
	magma_int_t  lwork = 0;

	float *rwork = nullptr;
	magma_int_t lrwork = 0;

	magma_int_t *iwork = nullptr;
	magma_int_t liwork = 0;


	nb = magma_get_cgehrd_nb( M_lead_dim );


	lwork = 2 * (M_lead_dim + M_lead_dim*nb); // MagmaNoVec
	//lwork = 2 * max(M_lead_dim + M_lead_dim*nb, 2*M_lead_dim + M_lead_dim*M_lead_dim); // MagmaVec

	lrwork = M_lead_dim; // MagmaNoVec
	//lrwork = 1 + 5 * M_lead_dim + 2*M_lead_dim*M_lead_dim; // MagmaVec

	liwork = 1; // MagmaNoVec
	//liwork = 3 + 5*M_lead_dim; // MagmaVec

	magma_imalloc_cpu(&iwork, liwork);

	magma_smalloc_cpu(&rwork, lrwork);

	//magma_cmalloc_cpu(&A, lda*M_lead_dim);
	//magma_cmalloc_cpu(&AT, lda*M_lead_dim);

	//magma_smalloc_cpu(&reEig, M_lead_dim);
	//magma_smalloc_cpu(&imEig, M_lead_dim);


	magma_cmalloc_pinned(&dA, lda*M_lead_dim);
	magma_cmalloc_pinned(&dAT, lda*M_lead_dim);

	magma_cmalloc_pinned(&dreA, lda*M_lead_dim);
	magma_cmalloc_pinned(&dimA, lda*M_lead_dim);

	//magma_cmalloc_pinned(&VL, lda*M_lead_dim);
	//magma_cmalloc_pinned(&VR, lda*M_lead_dim);

	magma_cmalloc_pinned(&work, lwork);

	magma_smalloc_pinned(&dreEig, M_lead_dim);
	magma_smalloc_pinned(&dimEig, M_lead_dim);


	//matrix_fillzero(AT, M_lead_dim);

	//vector_fillzero(reEig, M_lead_dim);
	//vector_fillzero(imEig, M_lead_dim);

	//prepare_matrix_2(M);

	magma_csetmatrix(M_lead_dim, M_lead_dim, M, lda, dA, M_lead_dim, queue);
	//magma_csetmatrix(M_lead_dim, M_lead_dim, AT, lda, dAT, M_lead_dim, queue);

	//magma_ssetvector(M_lead_dim, wReEig, 1, dreEig, 1, queue);
	//magma_ssetvector(M_lead_dim, wImEig, 1, dimEig, 1, queue);

	//magma_cprint_gpu(M_lead_dim, M_lead_dim, dA, lda);

	// reA = ( (A + A')/2.0 )
	// A'
	magmablas_ctranspose(M_lead_dim, M_lead_dim, dA, M_lead_dim, dAT, M_lead_dim, queue);
	//magma_cprint_gpu(M_lead_dim, M_lead_dim, dAT, lda);

	// AT = A + A'
	magmablas_cgeadd(M_lead_dim, M_lead_dim, MAGMA_C_MAKE(1.0f, 0.0f), dA, M_lead_dim, dAT, M_lead_dim, queue);
	//magma_cprint_gpu(M_lead_dim, M_lead_dim, dAT, lda);
	// AT=AT*0.5
	magma_cscal(lda*M_lead_dim, MAGMA_C_MAKE(0.5f, 0.0f), dAT, 1, queue);
	//magma_cprint_gpu(M_lead_dim, M_lead_dim, dAT, lda);
	// reA = AT
	magma_ccopy(lda*M_lead_dim, dAT, 1, dreA, 1, queue);
	//magma_cprint_gpu(M_lead_dim, M_lead_dim, dreA, lda);
	magma_sync_wtime(queue);


	// imA = ( -1im*(A - A')/2.0 )
	// A'
	magmablas_ctranspose(M_lead_dim, M_lead_dim, dA, M_lead_dim, dAT, M_lead_dim, queue);
	//magma_cprint_gpu(M_lead_dim, M_lead_dim, dAT, lda);
	// AT = A + A'
	magmablas_cgeadd(M_lead_dim, M_lead_dim, MAGMA_C_MAKE(-1.0f, 0.0f), dAT, M_lead_dim, dA, M_lead_dim, queue);
	// A=A*-1j*0.5
	magma_cscal(lda*M_lead_dim, MAGMA_C_MAKE(0.0f, -0.5f), dA, 1, queue);
	// imA = A
	magma_ccopy(lda*M_lead_dim, dA, 1, dimA, 1, queue);
	magma_sync_wtime(queue);

	//magma_cprint_gpu(M_lead_dim, M_lead_dim, dreA, lda);
	//magma_cprint_gpu(M_lead_dim, M_lead_dim, dimA, lda);


	// reEig::Vector=eigvals(reA)
	rslt = magma_cheevd(MagmaNoVec, MagmaLower,
		M_lead_dim,
		dreA, lda,
		dreEig,
		work, lwork,
		rwork, lrwork,
		iwork, liwork,
		&info);

	// imEig::Vector=eigvals(imA)
	rslt = magma_cheevd(MagmaNoVec, MagmaLower,
		M_lead_dim,
		dimA, lda,
		dimEig,
		work, lwork,
		rwork, lrwork,
		iwork, liwork,
		&info);


	//magma_sprint_gpu(M_lead_dim, 1, dreEig, M_lead_dim);
	//magma_sprint_gpu(M_lead_dim, 1, dimEig, M_lead_dim);


	magma_sgetvector(M_lead_dim, dreEig, 1, wReEig, 1, queue);
	//magma_sync_wtime(queue);

	magma_sgetvector(M_lead_dim, dimEig, 1, wImEig, 1, queue);
	//magma_sync_wtime(queue);

	/*
	maxReIdx = magma_isamax(M_lead_dim, dreEig, 1, queue) - 1;
	minReIdx = magma_isamin(M_lead_dim, dreEig, 1, queue) - 1;

	maxImIdx = magma_isamax(M_lead_dim, dimEig, 1, queue) - 1;
	minImIdx = magma_isamin(M_lead_dim, dimEig, 1, queue) - 1;


	printf("max re idx = %d\nmin re idx = %d\n", maxReIdx, minReIdx);
	printf("%f %f\n", wReEig[maxReIdx], wReEig[minReIdx]);

	printf("max im idx = %d\nmin im idx = %d\n", maxImIdx, minImIdx);
	printf("%f %f\n", wImEig[maxImIdx], wImEig[minImIdx]);
	*/

	//printf("test wReEig: %f %f\n", wReEig[0], wReEig[1]);
	//printf("test wImEig: %f %f\n", wImEig[0], wImEig[1]);


	magma_free_cpu(iwork);
	magma_free_cpu(rwork);
	//magma_free_cpu(AT);

	magma_free_pinned(dA);
	magma_free_pinned(dAT);

	magma_free_pinned(dreA);
	magma_free_pinned(dimA);

	magma_free_pinned(work);

	magma_free_pinned(dreEig);
	magma_free_pinned(dimEig);

	return rslt;
}
示例#7
0
extern "C" magma_int_t
magma_cgetrf(magma_int_t m, magma_int_t n, cuFloatComplex *a, magma_int_t lda, 
             magma_int_t *ipiv, magma_int_t *info)
{
/*  -- MAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2012

    Purpose
    =======
    CGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.  This version does not
    require work space on the GPU passed as input. GPU memory is allocated
    in the routine.

    The factorization has the form
       A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) COMPLEX array, dimension (LDA,N)
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    IPIV    (output) INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
            > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    =====================================================================    */

#define inAT(i,j) (dAT + (i)*nb*ldda + (j)*nb)

    cuFloatComplex *dAT, *dA, *da, *work;
    cuFloatComplex c_one     = MAGMA_C_ONE;
    cuFloatComplex c_neg_one = MAGMA_C_NEG_ONE;
    magma_int_t     iinfo, nb;

    *info = 0;

    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (lda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    nb = magma_get_cgetrf_nb(m);

    if ( (nb <= 1) || (nb >= min(m,n)) ) {
        /* Use CPU code. */
        lapackf77_cgetrf(&m, &n, a, &lda, ipiv, info);
    } else {
        /* Use hybrid blocked code. */
        magma_int_t maxm, maxn, ldda, maxdim;
        magma_int_t i, rows, cols, s = min(m, n)/nb;
        
        magma_int_t num_gpus = magma_num_gpus();
        if ( num_gpus > 1 ) {
          /* call multi-GPU non-GPU-resident interface  */
          magma_int_t rval = magma_cgetrf_m(num_gpus, m, n, a, lda, ipiv, info);
          if( *info >= 0 ) magma_cgetrf_piv(num_gpus, m, n, a, lda, ipiv, info);
          return *info;
        }

        maxm = ((m + 31)/32)*32;
        maxn = ((n + 31)/32)*32;
        maxdim = max(maxm, maxn);

        ldda = maxn;
        work = a;

        if (maxdim*maxdim < 2*maxm*maxn)
        {
            if (MAGMA_SUCCESS != magma_cmalloc( &dA, nb*maxm + maxdim*maxdim )) {
                        /* alloc failed so call non-GPU-resident version */ 
                        magma_int_t rval = magma_cgetrf_m(num_gpus, m, n, a, lda, ipiv, info);
                        if( *info >= 0 ) magma_cgetrf_piv(num_gpus, m, n, a, lda, ipiv, info);
                        return *info;
            }
            da = dA + nb*maxm;
            
            ldda = maxdim;
            magma_csetmatrix( m, n, a, lda, da, ldda );
            
            dAT = da;
            magmablas_cinplace_transpose( dAT, ldda, ldda );
        }
        else
        {
            if (MAGMA_SUCCESS != magma_cmalloc( &dA, (nb + maxn)*maxm )) {
                        /* alloc failed so call non-GPU-resident version */
                        magma_int_t rval = magma_cgetrf_m(num_gpus, m, n, a, lda, ipiv, info);
                        if( *info >= 0 ) magma_cgetrf_piv(num_gpus, m, n, a, lda, ipiv, info);
                        return *info;
            }
            da = dA + nb*maxm;
            
            magma_csetmatrix( m, n, a, lda, da, maxm );
            
            if (MAGMA_SUCCESS != magma_cmalloc( &dAT, maxm*maxn )) {
                        /* alloc failed so call non-GPU-resident version */
                        magma_free( dA );
                        magma_int_t rval = magma_cgetrf_m(num_gpus, m, n, a, lda, ipiv, info);
                        if( *info >= 0 ) magma_cgetrf_piv(num_gpus, m, n, a, lda, ipiv, info);
                        return *info;
            }

            magmablas_ctranspose2( dAT, ldda, da, maxm, m, n );
        }
        
        lapackf77_cgetrf( &m, &nb, work, &lda, ipiv, &iinfo);

        for( i = 0; i < s; i++ )
        {
            // download i-th panel
            cols = maxm - i*nb;
            
            if (i>0){
                magmablas_ctranspose( dA, cols, inAT(i,i), ldda, nb, cols );
                magma_cgetmatrix( m-i*nb, nb, dA, cols, work, lda );
                
                // make sure that gpu queue is empty
                magma_device_sync();
                
                magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                             n - (i+1)*nb, nb, 
                             c_one, inAT(i-1,i-1), ldda, 
                                    inAT(i-1,i+1), ldda );
                magma_cgemm( MagmaNoTrans, MagmaNoTrans, 
                             n-(i+1)*nb, m-i*nb, nb, 
                             c_neg_one, inAT(i-1,i+1), ldda, 
                                        inAT(i,  i-1), ldda, 
                             c_one,     inAT(i,  i+1), ldda );

                // do the cpu part
                rows = m - i*nb;
                lapackf77_cgetrf( &rows, &nb, work, &lda, ipiv+i*nb, &iinfo);
            }
            if (*info == 0 && iinfo > 0)
                *info = iinfo + i*nb;
            magmablas_cpermute_long2( ldda, dAT, ldda, ipiv, nb, i*nb );

            // upload i-th panel
            magma_csetmatrix( m-i*nb, nb, work, lda, dA, cols );
            magmablas_ctranspose( inAT(i,i), ldda, dA, cols, cols, nb);

            // do the small non-parallel computations
            if (s > (i+1)){
                magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                             nb, nb, 
                             c_one, inAT(i, i  ), ldda,
                                    inAT(i, i+1), ldda);
                magma_cgemm( MagmaNoTrans, MagmaNoTrans, 
                             nb, m-(i+1)*nb, nb, 
                             c_neg_one, inAT(i,   i+1), ldda,
                                        inAT(i+1, i  ), ldda, 
                             c_one,     inAT(i+1, i+1), ldda );
            }
            else{
                magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                             n-s*nb, nb,
                             c_one, inAT(i, i  ), ldda,
                                    inAT(i, i+1), ldda);
                magma_cgemm( MagmaNoTrans, MagmaNoTrans, 
                             n-(i+1)*nb, m-(i+1)*nb, nb,
                             c_neg_one, inAT(i,   i+1), ldda,
                                        inAT(i+1, i  ), ldda, 
                             c_one,     inAT(i+1, i+1), ldda );
            }
        }
        
        magma_int_t nb0 = min(m - s*nb, n - s*nb);
        if ( nb0 > 0 ) {
            rows = m - s*nb;
            cols = maxm - s*nb;
    
            magmablas_ctranspose2( dA, cols, inAT(s,s), ldda, nb0, rows);
            magma_cgetmatrix( rows, nb0, dA, cols, work, lda );
    
            // make sure that gpu queue is empty
            magma_device_sync();
    
            // do the cpu part
            lapackf77_cgetrf( &rows, &nb0, work, &lda, ipiv+s*nb, &iinfo);
            if (*info == 0 && iinfo > 0)
                *info = iinfo + s*nb;
            magmablas_cpermute_long2( ldda, dAT, ldda, ipiv, nb0, s*nb );
    
            magma_csetmatrix( rows, nb0, work, lda, dA, cols );
            magmablas_ctranspose2( inAT(s,s), ldda, dA, cols, rows, nb0);
    
            magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                         n-s*nb-nb0, nb0,
                         c_one, inAT(s, s),     ldda, 
                                inAT(s, s)+nb0, ldda);
        }
        
        if (maxdim*maxdim< 2*maxm*maxn){
            magmablas_cinplace_transpose( dAT, ldda, ldda );
            magma_cgetmatrix( m, n, da, ldda, a, lda );
        } else {
            magmablas_ctranspose2( da, maxm, dAT, ldda, n, m );
            magma_cgetmatrix( m, n, da, maxm, a, lda );
            magma_free( dAT );
        }

        magma_free( dA );
    }
    
    return *info;
} /* magma_cgetrf */
示例#8
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing ctranspose
   Code is very similar to testing_csymmetrize.cpp
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    // OpenCL use:  cl_mem  , offset  (two arguments);
    // else   use:  pointer + offset  (one argument).
    #ifdef HAVE_clBLAS
        #define d_A(i_, j_)   d_A, ((i_) + (j_)*ldda)
        #define d_B(i_, j_)   d_B, ((i_) + (j_)*lddb)
    #else
        #define d_A(i_, j_)  (d_A + (i_) + (j_)*ldda)
        #define d_B(i_, j_)  (d_B + (i_) + (j_)*lddb)
    #endif
    
    real_Double_t    gbytes, gpu_perf, gpu_time, gpu_perf2=0, gpu_time2=0, cpu_perf, cpu_time;
    float           error, error2, work[1];
    magmaFloatComplex  c_neg_one = MAGMA_C_NEG_ONE;
    magmaFloatComplex *h_A, *h_B, *h_R;
    magmaFloatComplex_ptr d_A, d_B;
    magma_int_t M, N, size, lda, ldda, ldb, lddb;
    magma_int_t ione     = 1;
    magma_int_t status = 0;
    
    magma_opts opts;
    opts.parse_opts( argc, argv );
    
    #ifdef COMPLEX
    magma_int_t ntrans = 2;
    magma_trans_t trans[] = { Magma_ConjTrans, MagmaTrans };
    #else
    magma_int_t ntrans = 1;
    magma_trans_t trans[] = { MagmaTrans };
    #endif

    printf("%% Inplace transpose requires M == N.\n");
    printf("%% Trans     M     N   CPU GByte/s (ms)    GPU GByte/s (ms)  check   Inplace GB/s (ms)  check\n");
    printf("%%=========================================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
      for( int itran = 0; itran < ntrans; ++itran ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[itest];
            N = opts.nsize[itest];
            lda    = M;
            ldda   = magma_roundup( M, opts.align );  // multiple of 32 by default
            ldb    = N;
            lddb   = magma_roundup( N, opts.align );  // multiple of 32 by default
            // load entire matrix, save entire matrix
            gbytes = sizeof(magmaFloatComplex) * 2.*M*N / 1e9;
            
            TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*N  );  // input:  M x N
            TESTING_MALLOC_CPU( h_B, magmaFloatComplex, ldb*M  );  // output: N x M
            TESTING_MALLOC_CPU( h_R, magmaFloatComplex, ldb*M  );  // output: N x M
            
            TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N );  // input:  M x N
            TESTING_MALLOC_DEV( d_B, magmaFloatComplex, lddb*M );  // output: N x M
            
            /* Initialize the matrix */
            for( int j = 0; j < N; ++j ) {
                for( int i = 0; i < M; ++i ) {
                    h_A[i + j*lda] = MAGMA_C_MAKE( i + j/10000., j );
                }
            }
            for( int j = 0; j < M; ++j ) {
                for( int i = 0; i < N; ++i ) {
                    h_B[i + j*ldb] = MAGMA_C_MAKE( i + j/10000., j );
                }
            }
            magma_csetmatrix( N, M, h_B, ldb, d_B(0,0), lddb, opts.queue );
            
            /* =====================================================================
               Performs operation using naive out-of-place algorithm
               (LAPACK doesn't implement transpose)
               =================================================================== */
            cpu_time = magma_wtime();
            //for( int j = 1; j < N-1; ++j ) {      // inset by 1 row & col
            //    for( int i = 1; i < M-1; ++i ) {  // inset by 1 row & col
            if ( trans[itran] == MagmaTrans ) {
                for( int j = 0; j < N; ++j ) {
                    for( int i = 0; i < M; ++i ) {
                        h_B[j + i*ldb] = h_A[i + j*lda];
                    }
                }
            }
            else {
                for( int j = 0; j < N; ++j ) {
                    for( int i = 0; i < M; ++i ) {
                        h_B[j + i*ldb] = conj( h_A[i + j*lda] );
                    }
                }
            }
            cpu_time = magma_wtime() - cpu_time;
            cpu_perf = gbytes / cpu_time;
            
            /* ====================================================================
               Performs operation using MAGMA, out-of-place
               =================================================================== */
            magma_csetmatrix( M, N, h_A, lda, d_A(0,0), ldda, opts.queue );
            magma_csetmatrix( N, M, h_B, ldb, d_B(0,0), lddb, opts.queue );
            
            gpu_time = magma_sync_wtime( opts.queue );
            if ( trans[itran] == MagmaTrans ) {
                //magmablas_ctranspose( M-2, N-2, d_A(1,1), ldda, d_B(1,1), lddb, opts.queue );  // inset by 1 row & col
                magmablas_ctranspose( M, N, d_A(0,0), ldda, d_B(0,0), lddb, opts.queue );
            }
            #ifdef HAVE_CUBLAS
            else {
                //magmablas_ctranspose_conj( M-2, N-2, d_A(1,1), ldda, d_B(1,1), lddb, opts.queue );  // inset by 1 row & col
                magmablas_ctranspose_conj( M, N, d_A(0,0), ldda, d_B(0,0), lddb, opts.queue );
            }
            #endif
            gpu_time = magma_sync_wtime( opts.queue ) - gpu_time;
            gpu_perf = gbytes / gpu_time;
            
            /* ====================================================================
               Performs operation using MAGMA, in-place
               =================================================================== */
            if ( M == N ) {
                magma_csetmatrix( M, N, h_A, lda, d_A(0,0), ldda, opts.queue );
                
                gpu_time2 = magma_sync_wtime( opts.queue );
                if ( trans[itran] == MagmaTrans ) {
                    //magmablas_ctranspose_inplace( N-2, d_A(1,1), ldda, opts.queue );  // inset by 1 row & col
                    magmablas_ctranspose_inplace( N, d_A(0,0), ldda, opts.queue );
                }
                #ifdef HAVE_CUBLAS
                else {
                    //magmablas_ctranspose_conj_inplace( N-2, d_A(1,1), ldda, opts.queue );  // inset by 1 row & col
                    magmablas_ctranspose_conj_inplace( N, d_A(0,0), ldda, opts.queue );
                }
                #endif
                gpu_time2 = magma_sync_wtime( opts.queue ) - gpu_time2;
                gpu_perf2 = gbytes / gpu_time2;
            }
            
            /* =====================================================================
               Check the result
               =================================================================== */
            // check out-of-place transpose (d_B)
            size = ldb*M;
            magma_cgetmatrix( N, M, d_B(0,0), lddb, h_R, ldb, opts.queue );
            blasf77_caxpy( &size, &c_neg_one, h_B, &ione, h_R, &ione );
            error = lapackf77_clange("f", &N, &M, h_R, &ldb, work );
            
            if ( M == N ) {
                // also check in-place tranpose (d_A)
                magma_cgetmatrix( N, M, d_A(0,0), ldda, h_R, ldb, opts.queue );
                blasf77_caxpy( &size, &c_neg_one, h_B, &ione, h_R, &ione );
                error2 = lapackf77_clange("f", &N, &M, h_R, &ldb, work );
    
                printf("%5c %5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)  %6s  %7.2f (%7.2f)  %s\n",
                       lapacke_trans_const( trans[itran] ),
                       (int) M, (int) N,
                       cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000.,
                       (error  == 0. ? "ok" : "failed"),
                       gpu_perf2, gpu_time2,
                       (error2 == 0. ? "ok" : "failed") );
                status += ! (error == 0. && error2 == 0.);
            }
            else {
                printf("%5c %5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)  %6s    ---   (  ---  )\n",
                       lapacke_trans_const( trans[itran] ),
                       (int) M, (int) N,
                       cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000.,
                       (error  == 0. ? "ok" : "failed") );
                status += ! (error == 0.);
            }
            
            TESTING_FREE_CPU( h_A );
            TESTING_FREE_CPU( h_B );
            TESTING_FREE_CPU( h_R );
            
            TESTING_FREE_DEV( d_A );
            TESTING_FREE_DEV( d_B );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
      }
    }

    opts.cleanup();
    TESTING_FINALIZE();
    return status;
}
示例#9
0
extern "C" magma_int_t
magma_cgetrf2_msub(
    magma_int_t num_subs, magma_int_t ngpu, 
    magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t offset,
    magmaFloatComplex_ptr *d_lAT, size_t dlAT_offset, magma_int_t lddat, 
    magma_int_t *ipiv,
    magmaFloatComplex_ptr *d_panel, 
    magmaFloatComplex_ptr *d_lAP, size_t dlAP_offset, 
    magmaFloatComplex *w, magma_int_t ldw,
    magma_queue_t *queues,
    magma_int_t *info)
{
/*  -- clMAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date November 2014

    Purpose
    =======
    CGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
       A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.
    Use two buffer to send panels..

    Arguments
    =========
    NUM_GPUS 
            (input) INTEGER
            The number of GPUS to be used for the factorization.

    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) COMPLEX array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    LDDA     (input) INTEGER
            The leading dimension of the array A.  LDDA >= max(1,M).

    IPIV    (output) INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
            > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.
    =====================================================================    */

#define d_lAT(id,i,j)      d_lAT[(id)], (((offset)+(i)*nb)*lddat + (j)*nb)
#define d_lAT_offset(i, j)              (((offset)+(i)*nb)*lddat + (j)*nb)
#define W(j)     (w +((j)%(1+ngpu))*nb*ldw)

    magmaFloatComplex c_one     = MAGMA_C_ONE;
    magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;

    magma_int_t tot_subs = num_subs * ngpu;
    magma_int_t block_size = 32;
    magma_int_t iinfo, maxm, mindim;
    magma_int_t i, j, d, dd, rows, cols, s;
    magma_int_t id, j_local, j_local2, nb0, nb1;

    /* local submatrix info */
    magma_int_t ldpan[MagmaMaxSubs * MagmaMaxGPUs],
                n_local[MagmaMaxSubs * MagmaMaxGPUs]; 
    size_t dpanel_local_offset[MagmaMaxSubs * MagmaMaxGPUs];
    magmaFloatComplex_ptr dpanel_local[MagmaMaxSubs * MagmaMaxGPUs];

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -2;
    else if (n < 0)
        *info = -3;
    else if (tot_subs*lddat < max(1,n))
        *info = -5;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    if (tot_subs > ceil((float)n/nb)) {
        *info = -1;
        return *info;
    }
    
    else {
        /* Use hybrid blocked code. */
        maxm  = ((m + block_size-1)/block_size)*block_size;
        
        /* some initializations */
        for (i=0; i < tot_subs; i++) {
            n_local[i] = ((n/nb)/tot_subs)*nb;
            if (i < (n/nb)%tot_subs)
                n_local[i] += nb;
            else if (i == (n/nb)%tot_subs)
                n_local[i] += n%nb;
        }
        
        /* start sending the first panel to cpu */
        nb0 = min(mindim, nb);
        magmablas_ctranspose(  nb0, maxm, d_lAT(0,0,0), lddat, d_lAP[0], dlAP_offset, maxm, queues[2*0+1] );
        magma_cgetmatrix_async( m, nb0,
                                d_lAP[0], dlAP_offset, maxm,
                                W(0), ldw, queues[2*0+1], NULL );
        clFlush(queues[2*0+1]);
        /* ------------------------------------------------------------------------------------- */
        
        s = mindim / nb;
        for (j=0; j < s; j++) {
            /* Set the submatrix ID that holds the current panel */
            id = j%tot_subs;
        
            /* Set the local index where the current panel is */
            j_local = j/tot_subs;
            // cols for gpu panel
            cols  = maxm - j*nb;
            // rows for cpu panel
            rows  = m - j*nb;
        
            /* synchrnoize j-th panel from id-th gpu into work */
            magma_queue_sync( queues[2*(id%ngpu)+1] );
        
            /* j-th panel factorization */
            lapackf77_cgetrf( &rows, &nb, W(j), &ldw, ipiv+j*nb, &iinfo);
            if ((*info == 0) && (iinfo > 0)) {
                *info = iinfo + j*nb;
                //break;
            }
        
            /* start sending the panel to all the gpus */
            d = (j+1)%ngpu;
            for (dd=0; dd < ngpu; dd++) {
                magma_csetmatrix_async( rows, nb,
                                        W(j), ldw,
                                        d_lAP[d], dlAP_offset+(j%(2+ngpu))*nb*maxm, maxm, 
                                        queues[2*d+1], NULL );
                d = (d+1)%ngpu;
            }
            /* apply the pivoting */
            for( i=j*nb; i < j*nb + nb; ++i ) {
                ipiv[i] += j*nb;
            }
            d = (j+1)%tot_subs;
            for (dd=0; dd < tot_subs; dd++) {
                magmablas_claswp( lddat, d_lAT(d,0,0), lddat, j*nb + 1, j*nb + nb, ipiv, 1, queues[2*(d%ngpu)] );
                d = (d+1)%tot_subs;
            }
        
            /* update the trailing-matrix/look-ahead */
            d = (j+1)%tot_subs;
            for (dd=0; dd < tot_subs; dd++) {
                /* storage for panel */
                if (d%ngpu == id%ngpu) {
                    /* the panel belond to this gpu */
                    dpanel_local[d] = d_lAT[id];
                    dpanel_local_offset[d] = d_lAT_offset(j, j_local);
                    ldpan[d] = lddat;
                    /* next column */
                    j_local2 = j_local;
                    if ( d <= id )
                        j_local2++;
                } else {
                    /* the panel belong to another gpu */
                    dpanel_local[d] = d_panel[d%ngpu];  
                    dpanel_local_offset[d] = (j%(2+ngpu))*nb*maxm;
                    ldpan[d] = nb;
                    /* next column */
                    j_local2 = j_local;
                    if ( d < id )
                        j_local2++;
                }
                /* the size of the next column */
                if (s > (j+1)) {
                    nb0 = nb;
                } else {
                    nb0 = n_local[d]-nb*(s/tot_subs);
                    if (d < s%tot_subs)
                        nb0 -= nb;
                }
                if (d == (j+1)%tot_subs) {
                    /* owns the next column, look-ahead the column */
                    nb1 = nb0;
                } else {
                    /* update the entire trailing matrix */
                    nb1 = n_local[d] - j_local2*nb;
                }
                
                /* gpu updating the trailing matrix */
                if (d == (j+1)%tot_subs) { /* look-ahead, this is executed first (j.e., dd=0)  */
                    magma_queue_sync(queues[2*(d%ngpu)]);   /* pivoting done? (overwrite with panel) */
                    magmablas_ctranspose( cols, nb,
                                          d_lAP[d%ngpu], dlAP_offset+(j%(2+ngpu))*nb*maxm, maxm,
                                          dpanel_local[d], dpanel_local_offset[d], ldpan[d], 
                                          queues[2*(d%ngpu)+1] );
                    magma_queue_sync(queues[2*(d%ngpu)+1]); /* panel arrived and transposed for remaining update ? */
        
                    magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                                 nb1, nb, c_one,
                                 dpanel_local[d], dpanel_local_offset[d], ldpan[d],
                                 d_lAT(d, j, j_local2), lddat, queues[2*(d%ngpu)+1]);
        
                    magma_cgemm( MagmaNoTrans, MagmaNoTrans, 
                                 nb1, m-(j+1)*nb, nb, 
                                 c_neg_one, d_lAT(d, j,   j_local2),         lddat,
                                            dpanel_local[d], dpanel_local_offset[d]+nb*ldpan[d], ldpan[d], 
                                 c_one,     d_lAT(d, j+1, j_local2),         lddat,
                                 queues[2*(d%ngpu)+1]);
                } else { /* no look-ahead */
                    if (dd < ngpu) {
                        /* synch and transpose only the first time */
                        magma_queue_sync(queues[2*(d%ngpu)+1]); /* panel arrived? */
                        magmablas_ctranspose( cols, nb,
                                              d_lAP[d%ngpu], dlAP_offset+(j%(2+ngpu))*nb*maxm, maxm,
                                              dpanel_local[d], dpanel_local_offset[d], ldpan[d], 
                                              queues[2*(d%ngpu)] );
                    }
        
                    magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                                 nb1, nb, c_one,
                                 dpanel_local[d], dpanel_local_offset[d], ldpan[d],
                                 d_lAT(d, j, j_local2), lddat, queues[2*(d%ngpu)]);
                
                    magma_cgemm( MagmaNoTrans, MagmaNoTrans, 
                                 nb1, m-(j+1)*nb, nb, 
                                 c_neg_one, d_lAT(d, j,   j_local2),         lddat,
                                            dpanel_local[d], dpanel_local_offset[d]+nb*ldpan[d], ldpan[d], 
                                 c_one,     d_lAT(d, j+1, j_local2),         lddat,
                                 queues[2*(d%ngpu)]);    
                }
                if (d == (j+1)%tot_subs) {
                    /* Set the local index where the current panel is */
                    int loff    = j+1;
                    int j_local = (j+1)/tot_subs;
                    int ldda    = maxm - (j+1)*nb;
                    int cols    = m - (j+1)*nb;
                    nb0 = min(nb, mindim - (j+1)*nb); /* size of the diagonal block */
                    
                    if (nb0 > 0) {
                        /* transpose the panel for sending it to cpu */
                        magmablas_ctranspose( nb0, ldda,
                                              d_lAT(d,loff,j_local), lddat,
                                              d_lAP[d%ngpu], dlAP_offset + ((j+1)%(2+ngpu))*nb*maxm, ldda, 
                                              queues[2*(d%ngpu)+1] );
                  
                        /* send the panel to cpu */
                        magma_cgetmatrix_async( cols, nb0, 
                                                d_lAP[d%ngpu], dlAP_offset + ((j+1)%(2+ngpu))*nb*maxm, ldda, 
                                                W(j+1), ldw, queues[2*(d%ngpu)+1], NULL );
                    }
                } else {
                    //trace_gpu_end( d, 0 );
                }
                d = (d+1)%tot_subs;
            }
        
            /* update the remaining matrix by gpu owning the next panel */
            if ((j+1) < s) {
                d = (j+1)%tot_subs;
                int j_local = (j+1)/tot_subs;
                int rows  = m - (j+1)*nb;
                
                magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                             n_local[d] - (j_local+1)*nb, nb, 
                             c_one, dpanel_local[d], dpanel_local_offset[d], ldpan[d], 
                                    d_lAT(d,j,j_local+1), lddat, queues[2*(d%ngpu)] );
                    
                magma_cgemm( MagmaNoTrans, MagmaNoTrans, 
                             n_local[d]-(j_local+1)*nb, rows, nb, 
                             c_neg_one, d_lAT(d,j,j_local+1), lddat, 
                                        dpanel_local[d], dpanel_local_offset[d]+nb*ldpan[d], ldpan[d], 
                             c_one,     d_lAT(d,j+1,  j_local+1), lddat, queues[2*(d%ngpu)] );
            }
        } /* end of for j=1..s */
        /* ------------------------------------------------------------------------------ */
        
        /* Set the GPU number that holds the last panel */
        id = s%tot_subs;
        
        /* Set the local index where the last panel is */
        j_local = s/tot_subs;
        
        /* size of the last diagonal-block */
        nb0 = min(m - s*nb, n - s*nb);
        rows = m    - s*nb;
        cols = maxm - s*nb;
        
        if (nb0 > 0) {
        
            /* wait for the last panel on cpu */
            magma_queue_sync( queues[2*(id%ngpu)+1] );
            
            /* factor on cpu */
            lapackf77_cgetrf( &rows, &nb0, W(s), &ldw, ipiv+s*nb, &iinfo );
            if ( (*info == 0) && (iinfo > 0) )
                *info = iinfo + s*nb;
        
            /* send the factor to gpus */
            for (d=0; d < ngpu; d++) {
                magma_csetmatrix_async( rows, nb0, W(s), ldw,
                                        d_lAP[d], dlAP_offset+(s%(2+ngpu))*nb*maxm, cols, 
                                        queues[2*d+1], NULL );
            }
        
            for( i=s*nb; i < s*nb + nb0; ++i ) {
                ipiv[i] += s*nb;
            }
            for (d=0; d < tot_subs; d++) {
                magmablas_claswp( lddat, d_lAT(d,0,0), lddat, s*nb + 1, s*nb + nb0, ipiv, 1, queues[2*(d%ngpu)] );
            }
        
            d = id;
            for (dd=0; dd < tot_subs; dd++) {
                /* wait for the pivoting to be done */
                if (dd < ngpu) {
                    /* synch only the first time */
                    magma_queue_sync( queues[2*(d%ngpu)] );
                }
        
                j_local2 = j_local;
                if (d%ngpu == id%ngpu) {
                    /* the panel belond to this gpu */
                    dpanel_local[d] = d_lAT[id];
                    dpanel_local_offset[d] = d_lAT_offset(s, j_local);
                    if (dd < ngpu) {
                        magmablas_ctranspose( rows, nb0,
                                              d_lAP[d%ngpu], dlAP_offset+(s%(2+ngpu))*nb*maxm, cols, 
                                              dpanel_local[d], dpanel_local_offset[d], lddat, 
                                              queues[2*(d%ngpu)+1] );
                    }
                    /* size of the "extra" block */
                    if (d == id) { /* the last diagonal block belongs to this submatrix */
                        nb1 = nb0;
                    } else if (d < id) {
                        nb1 = nb;
                    } else {
                        nb1 = 0;
                    }
                    if (n_local[d] > j_local*nb+nb1) {
                        magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                                     n_local[d] - (j_local*nb+nb1), nb0, c_one,
                                     dpanel_local[d], dpanel_local_offset[d], lddat, 
                                     d_lAT(d, s, j_local)+nb1, lddat, queues[2*(d%ngpu)+1]);
                    }
                } else if (n_local[d] > j_local2*nb) {
                    /* the panel belong to another gpu */
                    dpanel_local[d] = d_panel[d%ngpu];
                    dpanel_local_offset[d] = (s%(2+ngpu))*nb*maxm;
        
                    /* next column */
                    if (d < ngpu) {
                        /* transpose only the first time */
                        magmablas_ctranspose( rows, nb0,
                                              d_lAP[d%ngpu], dlAP_offset+(s%(2+ngpu))*nb*maxm, cols, 
                                              dpanel_local[d], dpanel_local_offset[d], nb, 
                                              queues[2*(d%ngpu)+1] );
                    }
                    if (d < id)
                        j_local2++;
                    nb1 = n_local[d] - j_local2*nb;
                    magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                                 nb1, nb0, c_one,
                                 dpanel_local[d], dpanel_local_offset[d], nb, 
                                 d_lAT(d,s,j_local2), lddat, queues[2*(d%ngpu)+1]);
                }
                d = (d+1)%tot_subs;
            }
        } /* if( nb0 > 0 ) */

        /* clean up */
        for (d=0; d < ngpu; d++) {
            magma_queue_sync( queues[2*d] );
            magma_queue_sync( queues[2*d+1] );
        } 
    }
    return *info;
    /* End of MAGMA_CGETRF2_MSUB */
}
示例#10
0
/**
    Purpose
    -------
    CGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
        A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.

    Arguments
    ---------
    @param[in]
    ngpu    INTEGER
            Number of GPUs to use. ngpu > 0.

    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    d_lA    COMPLEX array of pointers on the GPU, dimension (ngpu).
            On entry, the M-by-N matrix A distributed over GPUs
            (d_lA[d] points to the local matrix on d-th GPU).
            It uses 1D block column cyclic format with the block size of nb,
            and each local matrix is stored by column.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    @param[in]
    ldda     INTEGER
            The leading dimension of the array d_lA.  LDDA >= max(1,M).

    @param[out]
    ipiv    INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
      -     > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    @ingroup magma_cgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_cgetrf_mgpu(
    magma_int_t ngpu,
    magma_int_t m, magma_int_t n,
    magmaFloatComplex_ptr d_lA[], magma_int_t ldda, magma_int_t *ipiv,
    magma_int_t *info)
{
    magma_int_t nb, n_local[MagmaMaxGPUs];
    magma_int_t maxm;
    magma_int_t i, j, d, lddat, lddwork;
    magmaFloatComplex *d_lAT[MagmaMaxGPUs];
    magmaFloatComplex *d_panel[MagmaMaxGPUs], *work;
    magma_queue_t queues[MagmaMaxGPUs][2];

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -2;
    else if (n < 0)
        *info = -3;
    else if (ldda < max(1,m))
        *info = -5;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* create the queues */
    for( d=0; d < ngpu; d++ ) {
        magma_queue_create( d, &queues[d][0] );
        magma_queue_create( d, &queues[d][1] );
    }

    /* Function Body */
    nb = magma_get_cgetrf_nb( m, n );

    if (nb <= 1 || nb >= n) {
        /* Use CPU code. */
        magma_cmalloc_cpu( &work, m * n );
        if ( work == NULL ) {
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        magma_cgetmatrix( m, n, d_lA[0], ldda, work, m, queues[0][0] );
        lapackf77_cgetrf(&m, &n, work, &m, ipiv, info);
        magma_csetmatrix( m, n, work, m, d_lA[0], ldda, queues[0][0] );
        magma_free_cpu(work);
    } else {
        /* Use hybrid blocked code. */
        magma_device_t orig_dev;
        magma_getdevice( &orig_dev );
        
        maxm = magma_roundup( m, 32 );
        if ( ngpu > ceil((float)n/nb) ) {
            printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu );
            *info = -1;
            return *info;
        }

        /* allocate workspace for each GPU */
        lddat = magma_roundup( ((magma_ceildiv( n, nb )/ngpu)*nb), 32 );
        lddat = magma_ceildiv( n, nb );        /* number of block columns         */
        lddat = magma_ceildiv( lddat, ngpu );  /* number of block columns per GPU */
        lddat = nb*lddat;                      /* number of columns per GPU       */
        lddat = magma_roundup( lddat, 32 );    /* make it a multiple of 32        */
        for (i=0; i < ngpu; i++) {
            magma_setdevice(i);
            
            /* local-n and local-ld */
            n_local[i] = ((n/nb)/ngpu)*nb;
            if (i < (n/nb)%ngpu)
                n_local[i] += nb;
            else if (i == (n/nb)%ngpu)
                n_local[i] += n%nb;
            
            /* workspaces */
            if (MAGMA_SUCCESS != magma_cmalloc( &d_panel[i], (3+ngpu)*nb*maxm )) {
                for( j=0; j <= i; j++ ) {
                    magma_setdevice(j);
                }
                for( j=0; j < i; j++ ) {
                    magma_setdevice(j);
                    magma_free( d_panel[j] );
                    magma_free( d_lAT[j]   );
                }
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }
            
            /* local-matrix storage */
            if (MAGMA_SUCCESS != magma_cmalloc( &d_lAT[i], lddat*maxm )) {
                for( j=0; j <= i; j++ ) {
                    magma_setdevice(j);
                    magma_free( d_panel[j] );
                }
                for( j=0; j < i; j++ ) {
                    magma_setdevice(j);
                    magma_free( d_lAT[j] );
                }
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }
            
            magmablas_ctranspose( m, n_local[i], d_lA[i], ldda, d_lAT[i], lddat, queues[i][1] );
        }
        for (i=0; i < ngpu; i++) {
            magma_setdevice(i);
            magma_queue_sync(queues[i][0]);
        }
        magma_setdevice(0);

        /* cpu workspace */
        lddwork = maxm;
        if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, lddwork*nb*ngpu )) {
            for (i=0; i < ngpu; i++ ) {
                magma_setdevice(i);
                magma_free( d_panel[i] );
                magma_free( d_lAT[i]   );
            }
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }

        /* calling multi-gpu interface with allocated workspaces and queues */
        magma_cgetrf2_mgpu(ngpu, m, n, nb, 0, d_lAT, lddat, ipiv, d_panel, work, maxm,
                           queues, info);

        /* clean up */
        for( d=0; d < ngpu; d++ ) {
            magma_setdevice(d);
            
            /* save on output */
            magmablas_ctranspose( n_local[d], m, d_lAT[d], lddat, d_lA[d], ldda, queues[d][0] );
            magma_queue_sync(queues[d][0]);
            magma_queue_sync(queues[d][1]);

            magma_free( d_lAT[d]   );
            magma_free( d_panel[d] );
        } /* end of for d=1,..,ngpu */
        magma_setdevice( orig_dev );
        magma_free_pinned( work );
    }

    /* clean up */
    for( d=0; d < ngpu; d++ ) {
        magma_setdevice(d);
        magma_queue_destroy( queues[d][0] );
        magma_queue_destroy( queues[d][1] );
    }

    return *info;
}
示例#11
0
extern "C" magma_int_t
magma_ctstrf_gpu( char storev, magma_int_t m, magma_int_t n, magma_int_t ib, magma_int_t nb,
                  magmaFloatComplex *hU, magma_int_t ldhu, magmaFloatComplex *dU, magma_int_t lddu,
                  magmaFloatComplex *hA, magma_int_t ldha, magmaFloatComplex *dA, magma_int_t ldda,
                  magmaFloatComplex *hL, magma_int_t ldhl, magmaFloatComplex *dL, magma_int_t lddl,
                  magma_int_t *ipiv,
                  magmaFloatComplex *hwork, magma_int_t ldhwork, magmaFloatComplex *dwork, magma_int_t lddwork,
                  magma_int_t *info)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    CSSSSM applies the LU factorization update from a complex
    matrix formed by a lower triangular IB-by-K tile L1 on top of a
    M2-by-K tile L2 to a second complex matrix formed by a M1-by-N1
    tile A1 on top of a M2-by-N2 tile A2 (N1 == N2).

    This is the right-looking Level 2.5 BLAS version of the algorithm.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    IB      (input) INTEGER
            The inner-blocking size.  IB >= 0.

    NB      (input) INTEGER
            The blocking size.  NB >= 0.

    hU      (input,output) COMPLEX array, dimension(LDHU, N), on cpu.
            On entry, the NB-by-N upper triangular tile hU.
            On exit, the content is incomplete. Shouldn't be used.

    LDHU    (input) INTEGER
            The leading dimension of the array hU.  LDHU >= max(1,NB).

    dU      (input,output) COMPLEX array, dimension(LDDU, N), on gpu.
            On entry, the NB-by-N upper triangular tile dU identical to hU.
            On exit, the new factor U from the factorization.

    LDDU    (input) INTEGER
            The leading dimension of the array dU.  LDDU >= max(1,NB).

    hA      (input,output) COMPLEX array, dimension(LDHA, N), on cpu.
            On entry, only the M-by-IB first panel needs to be identical to dA(1..M, 1..IB).
            On exit, the content is incomplete. Shouldn't be used.

    LDHA    (input) INTEGER
            The leading dimension of the array hA.  LDHA >= max(1,M).

    dA      (input,output) COMPLEX array, dimension(LDDA, N) , on gpu.
            On entry, the M-by-N tile to be factored.
            On exit, the factor L from the factorization

    LDDA    (input) INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).

    hL      (output) COMPLEX array, dimension(LDHL, K), on vpu.
            On exit, contains in the upper part the IB-by-K lower triangular tile,
            and in the lower part IB-by-K the inverse of the top part.

    LDHL    (input) INTEGER
            The leading dimension of the array hL.  LDHL >= max(1,2*IB).

    dL      (output) COMPLEX array, dimension(LDDL, K), on gpu.
            On exit, contains in the upper part the IB-by-K lower triangular tile,
            and in the lower part IB-by-K the inverse of the top part.

    LDDL    (input) INTEGER
            The leading dimension of the array dL.  LDDL >= max(1,2*IB).

    hWORK   (output) COMPLEX array, dimension(LDHWORK, 2*IB), on cpu.
            Workspace.

    LDHWORK (input) INTEGER
            The leading dimension of the array hWORK.  LDHWORK >= max(NB, 1).

    dWORK   (output) COMPLEX array, dimension(LDDWORK, 2*IB), on gpu.
            Workspace.

    LDDWORK (input) INTEGER
            The leading dimension of the array dWORK.  LDDWORK >= max(NB, 1).

    IPIV    (output) INTEGER array on the cpu.
            The pivot indices array of size K as returned by CTSTRF

    INFO    (output) INTEGER
            - PLASMA_SUCCESS successful exit
            - < 0 if INFO = -k, the k-th argument had an illegal value
            - > 0 if INFO = k, U(k,k) is exactly zero. The factorization
                has been completed, but the factor U is exactly
                singular, and division by zero will occur if it is used
                to solve a system of equations.

    =====================================================================    */

#define UT(i,j) (dUT + (i)*ib*lddu + (j)*ib )
#define AT(i,j) (dAT + (i)*ib*ldda + (j)*ib )
#define L(i)    (dL  + (i)*ib*lddl          )
#define L2(i)   (dL2 + (i)*ib*lddl          )
#define hU(i,j) (hU  + (j)*ib*ldhu + (i)*ib )
#define hA(i,j) (hA  + (j)*ib*ldha + (i)*ib )
#define hL(i)   (hL  + (i)*ib*ldhl          )
#define hL2(i)  (hL2 + (i)*ib*ldhl          )

    magmaFloatComplex c_one     = MAGMA_C_ONE;
    magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;

    int iinfo = 0;
    int maxm, mindim;
    int i, j, im, s, ip, ii, sb, p = 1;
    magmaFloatComplex *dAT, *dUT;
    magmaFloatComplex *dAp, *dUp;
#ifndef WITHOUTTRTRI
    magmaFloatComplex *dL2 = dL + ib;
    magmaFloatComplex *hL2 = hL + ib;
    p = 2;
#endif

    /* Check input arguments */
    *info = 0;
    if (m < 0) {
        *info = -1;
    }
    else if (n < 0) {
        *info = -2;
    }
    else if (ib < 0) {
        *info = -3;
    }
    else if ((lddu < max(1,m)) && (m > 0)) {
        *info = -6;
    }
    else if ((ldda < max(1,m)) && (m > 0)) {
        *info = -8;
    }
    else if ((lddl < max(1,ib)) && (ib > 0)) {
        *info = -10;
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* quick return */
    if ((m == 0) || (n == 0) || (ib == 0))
        return *info;

    ip = 0;

    /* Function Body */
    mindim = min(m, n);
    s      = mindim / ib;

    if ( ib >= mindim ) {
        /* Use CPU code. */
        CORE_ctstrf(m, n, ib, nb,
                    (PLASMA_Complex32_t*)hU, ldhu,
                    (PLASMA_Complex32_t*)hA, ldha,
                    (PLASMA_Complex32_t*)hL, ldhl,
                    ipiv,
                    (PLASMA_Complex32_t*)hwork, ldhwork,
                    info);

#ifndef WITHOUTTRTRI
        CORE_clacpy( PlasmaUpperLower, mindim, mindim,
                     (PLASMA_Complex32_t*)hL, ldhl,
                     (PLASMA_Complex32_t*)hL2, ldhl );
        CORE_ctrtri( PlasmaLower, PlasmaUnit, mindim,
                     (PLASMA_Complex32_t*)hL2, ldhl, info );
        if (*info != 0 ) {
            fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info);
        }
#endif

        if ( (storev == 'R') || (storev == 'r') ) {
            magma_csetmatrix( m, n, hU, ldhu, dwork, lddwork );
            magmablas_ctranspose( dU, lddu, dwork, lddwork, m, n );

            magma_csetmatrix( m, n, hA, ldha, dwork, lddwork );
            magmablas_ctranspose( dA, ldda, dwork, lddwork, m, n );
        } else {
            magma_csetmatrix( m, n, hU, ldhu, dU, lddu );
            magma_csetmatrix( m, n, hA, ldha, dA, ldda );
        }
        magma_csetmatrix( p*ib, n, hL, ldhl, dL, lddl );
            
    }
    else {
        /* Use hybrid blocked code. */
        maxm = ((m + 31)/32)*32;

        if ( (storev == 'C') || (storev == 'c') ) {
            magmablas_cgetmo_in( dU, dUT, lddu, m,  n );
            magmablas_cgetmo_in( dA, dAT, ldda, m,  n );
        } else {
            dUT = dU; dAT = dA;
        }
        dAp = dwork;
        dUp = dAp + ib*lddwork;

        ip = 0;
        for( i=0; i<s; i++ )
        {
            ii = i * ib;
            sb = min(mindim-ii, ib);
            
            if ( i>0 ){
                // download i-th panel
                magmablas_ctranspose( dUp, lddu, UT(0, i), lddu, sb, ii );
                magmablas_ctranspose( dAp, ldda, AT(0, i), ldda, sb, m  );
                
                magma_cgetmatrix( ii, sb, dUp, lddu, hU(0, i), ldhu );
                magma_cgetmatrix( m, sb, dAp, ldda, hA(0, i), ldha );
                
                // make sure that gpu queue is empty
                //magma_device_sync();
                
#ifndef WITHOUTTRTRI
                magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             n-(ii+sb), ib,
                             c_one, L2(i-1),      lddl,
                                    UT(i-1, i+1), lddu);
#else
                magma_ctrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             n-(ii+sb), ib,
                             c_one, L(i-1),       lddl,
                                    UT(i-1, i+1), lddu);
#endif
                magma_cgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(ii+sb), m, ib,
                             c_neg_one, UT(i-1, i+1), lddu,
                                        AT(0,   i-1), ldda,
                             c_one,     AT(0,   i+1), ldda );
            }

            // do the cpu part
            CORE_ctstrf(m, sb, ib, nb,
                        (PLASMA_Complex32_t*)hU(i, i), ldhu,
                        (PLASMA_Complex32_t*)hA(0, i), ldha,
                        (PLASMA_Complex32_t*)hL(i),    ldhl,
                        ipiv+ii,
                        (PLASMA_Complex32_t*)hwork, ldhwork,
                        info);

            if ( (*info == 0) && (iinfo > 0) )
                *info = iinfo + ii;
            
            // Need to swap betw U and A
#ifndef NOSWAPBLK
            magmablas_cswapblk( 'R', n-(ii+sb),
                                UT(i, i+1), lddu,
                                AT(0, i+1), ldda,
                                1, sb, ipiv+ii, 1, nb );

            for(j=0; j<ib; j++) {
                im = ipiv[ip]-1;
                if ( im == j ) {
                    ipiv[ip] += ii;
                }
                ip++;
            }
#else
            for(j=0; j<ib; j++) {
                im = ipiv[ip]-1;
                if ( im != (j) ) {
                    im = im - nb;
                    assert( (im>=0) && (im<m) );
                    magmablas_cswap( n-(ii+sb), UT(i, i+1)+j*lddu, 1, AT(0, i+1)+im*ldda, 1 );
                } else {
                    ipiv[ip] += ii;
                }
                ip++;
            }
#endif

#ifndef WITHOUTTRTRI
            CORE_clacpy( PlasmaUpperLower, sb, sb,
                         (PLASMA_Complex32_t*)hL(i), ldhl,
                         (PLASMA_Complex32_t*)hL2(i), ldhl );
            CORE_ctrtri( PlasmaLower, PlasmaUnit, sb,
                         (PLASMA_Complex32_t*)hL2(i), ldhl, info );
            if (*info != 0 ) {
                fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info);
            }
#endif
            // upload i-th panel
            magma_csetmatrix( sb, sb, hU(i, i), ldhu, dUp, lddu );
            magma_csetmatrix( m, sb, hA(0, i), ldha, dAp, ldda );
            magma_csetmatrix( p*ib, sb, hL(i), ldhl, L(i), lddl );
            magmablas_ctranspose( UT(i, i), lddu, dUp, lddu, sb, sb);
            magmablas_ctranspose( AT(0, i), ldda, dAp, ldda, m,  sb);
            
            // make sure that gpu queue is empty
            //magma_device_sync();
            
            // do the small non-parallel computations
            if ( s > (i+1) ) {
#ifndef WITHOUTTRTRI
                magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             sb, sb,
                             c_one, L2(i),      lddl,
                                    UT(i, i+1), lddu);
#else
                magma_ctrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             sb, sb,
                             c_one, L(i),      lddl,
                                    UT(i, i+1), lddu);
#endif
                magma_cgemm( MagmaNoTrans, MagmaNoTrans,
                             sb, m, sb,
                             c_neg_one, UT(i, i+1), lddu,
                                        AT(0, i  ), ldda,
                             c_one,     AT(0, i+1), ldda );
            }
            else {
#ifndef WITHOUTTRTRI
                magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             n-mindim, sb,
                             c_one, L2(i),      lddl,
                                    UT(i, i+1), lddu);
#else
                magma_ctrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             n-mindim, sb,
                             c_one, L(i),      lddl,
                                    UT(i, i+1), lddu);
#endif
                magma_cgemm( MagmaNoTrans, MagmaNoTrans,
                             n-mindim, m, sb,
                             c_neg_one, UT(i, i+1), lddu,
                                        AT(0, i  ), ldda,
                             c_one,     AT(0, i+1), ldda );
            }
        }

        if ( (storev == 'C') || (storev == 'c') ) {
            magmablas_cgetmo_out( dU, dUT, lddu, m,  n );
            magmablas_cgetmo_out( dA, dAT, ldda, m,  n );
        }
    }
    return *info;
}
示例#12
0
/**
    Purpose
    -------
    CGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
        A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.
    
    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    dA      COMPLEX array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    @param[in]
    ldda     INTEGER
            The leading dimension of the array A.  LDDA >= max(1,M).

    @param[out]
    ipiv    INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
      -     > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    @ingroup magma_cgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_cgetrf_gpu(
    magma_int_t m, magma_int_t n,
    magmaFloatComplex_ptr dA, magma_int_t ldda,
    magma_int_t *ipiv,
    magma_int_t *info )
{
    #ifdef HAVE_clBLAS
    #define  dA(i_, j_) dA,  (dA_offset  + (i_)       + (j_)*ldda)
    #define dAT(i_, j_) dAT, (dAT_offset + (i_)*lddat + (j_))
    #define dAP(i_, j_) dAP, (             (i_)          + (j_)*maxm)
    #else
    #define  dA(i_, j_) (dA  + (i_)       + (j_)*ldda)
    #define dAT(i_, j_) (dAT + (i_)*lddat + (j_))
    #define dAP(i_, j_) (dAP + (i_)       + (j_)*maxm)
    #endif

    magmaFloatComplex c_one     = MAGMA_C_ONE;
    magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;

    magma_int_t iinfo, nb;
    magma_int_t maxm, maxn, minmn;
    magma_int_t i, j, jb, rows, lddat, ldwork;
    magmaFloatComplex_ptr dAT=NULL, dAP=NULL;
    magmaFloatComplex *work=NULL;

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (ldda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    minmn = min( m, n );
    nb    = magma_get_cgetrf_nb( m, n );

    magma_queue_t queues[2] = { NULL };
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queues[0] );
    magma_queue_create( cdev, &queues[1] );

    if (nb <= 1 || nb >= min(m,n)) {
        /* Use CPU code. */
        if ( MAGMA_SUCCESS != magma_cmalloc_cpu( &work, m*n )) {
            *info = MAGMA_ERR_HOST_ALLOC;
            goto cleanup;
        }
        magma_cgetmatrix( m, n, dA(0,0), ldda, work, m, queues[0] );
        lapackf77_cgetrf( &m, &n, work, &m, ipiv, info );
        magma_csetmatrix( m, n, work, m, dA(0,0), ldda, queues[0] );
        magma_free_cpu( work );  work=NULL;
    }
    else {
        /* Use hybrid blocked code. */
        maxm = magma_roundup( m, 32 );
        maxn = magma_roundup( n, 32 );

        if (MAGMA_SUCCESS != magma_cmalloc( &dAP, nb*maxm )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            goto cleanup;
        }

        // square matrices can be done in place;
        // rectangular requires copy to transpose
        if ( m == n ) {
            dAT = dA;
            lddat = ldda;
            magmablas_ctranspose_inplace( m, dAT(0,0), lddat, queues[0] );
        }
        else {
            lddat = maxn;  // N-by-M
            if (MAGMA_SUCCESS != magma_cmalloc( &dAT, lddat*maxm )) {
                *info = MAGMA_ERR_DEVICE_ALLOC;
                goto cleanup;
            }
            magmablas_ctranspose( m, n, dA(0,0), ldda, dAT(0,0), lddat, queues[0] );
        }
        magma_queue_sync( queues[0] );  // finish transpose

        ldwork = maxm;
        if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, ldwork*nb )) {
            *info = MAGMA_ERR_HOST_ALLOC;
            goto cleanup;
        }

        for( j=0; j < minmn-nb; j += nb ) {
            // get j-th panel from device
            magmablas_ctranspose( nb, m-j, dAT(j,j), lddat, dAP(0,0), maxm, queues[1] );
            magma_queue_sync( queues[1] );  // wait for transpose
            magma_cgetmatrix_async( m-j, nb, dAP(0,0), maxm, work, ldwork, queues[0] );

            if ( j > 0 ) {
                magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n-(j+nb), nb,
                             c_one, dAT(j-nb, j-nb), lddat,
                                    dAT(j-nb, j+nb), lddat, queues[1] );
                magma_cgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(j+nb), m-j, nb,
                             c_neg_one, dAT(j-nb, j+nb), lddat,
                                        dAT(j,    j-nb), lddat,
                             c_one,     dAT(j,    j+nb), lddat, queues[1] );
            }

            // do the cpu part
            rows = m - j;
            magma_queue_sync( queues[0] );  // wait to get work
            lapackf77_cgetrf( &rows, &nb, work, &ldwork, ipiv+j, &iinfo );
            if ( *info == 0 && iinfo > 0 )
                *info = iinfo + j;

            // send j-th panel to device
            magma_csetmatrix_async( m-j, nb, work, ldwork, dAP, maxm, queues[0] );

            for( i=j; i < j + nb; ++i ) {
                ipiv[i] += j;
            }
            magmablas_claswp( n, dAT(0,0), lddat, j + 1, j + nb, ipiv, 1, queues[1] );

            magma_queue_sync( queues[0] );  // wait to set dAP
            magmablas_ctranspose( m-j, nb, dAP(0,0), maxm, dAT(j,j), lddat, queues[1] );

            // do the small non-parallel computations (next panel update)
            if ( j + nb < minmn - nb ) {
                magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             nb, nb,
                             c_one, dAT(j, j   ), lddat,
                                    dAT(j, j+nb), lddat, queues[1] );
                magma_cgemm( MagmaNoTrans, MagmaNoTrans,
                             nb, m-(j+nb), nb,
                             c_neg_one, dAT(j,    j+nb), lddat,
                                        dAT(j+nb, j   ), lddat,
                             c_one,     dAT(j+nb, j+nb), lddat, queues[1] );
            }
            else {
                magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n-(j+nb), nb,
                             c_one, dAT(j, j   ), lddat,
                                    dAT(j, j+nb), lddat, queues[1] );
                magma_cgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(j+nb), m-(j+nb), nb,
                             c_neg_one, dAT(j,    j+nb), lddat,
                                        dAT(j+nb, j   ), lddat,
                             c_one,     dAT(j+nb, j+nb), lddat, queues[1] );
            }
        }

        jb = min( m-j, n-j );
        if ( jb > 0 ) {
            rows = m - j;
            
            magmablas_ctranspose( jb, rows, dAT(j,j), lddat, dAP(0,0), maxm, queues[1] );
            magma_cgetmatrix( rows, jb, dAP(0,0), maxm, work, ldwork, queues[1] );
            
            // do the cpu part
            lapackf77_cgetrf( &rows, &jb, work, &ldwork, ipiv+j, &iinfo );
            if ( *info == 0 && iinfo > 0 )
                *info = iinfo + j;
            
            for( i=j; i < j + jb; ++i ) {
                ipiv[i] += j;
            }
            magmablas_claswp( n, dAT(0,0), lddat, j + 1, j + jb, ipiv, 1, queues[1] );
            
            // send j-th panel to device
            magma_csetmatrix( rows, jb, work, ldwork, dAP(0,0), maxm, queues[1] );
            magmablas_ctranspose( rows, jb, dAP(0,0), maxm, dAT(j,j), lddat, queues[1] );
            
            magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                         n-j-jb, jb,
                         c_one, dAT(j,j),    lddat,
                                dAT(j,j+jb), lddat, queues[1] );
        }
        
        // undo transpose
        if ( m == n ) {
            magmablas_ctranspose_inplace( m, dAT(0,0), lddat, queues[1] );
        }
        else {
            magmablas_ctranspose( n, m, dAT(0,0), lddat, dA(0,0), ldda, queues[1] );
        }
    }
    
cleanup:
    magma_queue_destroy( queues[0] );
    magma_queue_destroy( queues[1] );
    
    magma_free( dAP );
    if (m != n) {
        magma_free( dAT );
    }
    magma_free_pinned( work );
    
    return *info;
} /* magma_cgetrf_gpu */