Example #1
/***************************************************************************//**
    Purpose
    -------
    DGEQRF computes a QR factorization of a real M-by-N matrix A:
    A = Q * R.
    This batched version factorizes batchCount such matrices simultaneously.
    
    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    dA_array Array of pointers, dimension (batchCount).
             Each is a DOUBLE PRECISION array on the GPU, dimension (LDDA,N)
             On entry, the M-by-N matrix A.
             On exit, the elements on and above the diagonal of the array
             contain the min(M,N)-by-N upper trapezoidal matrix R (R is
             upper triangular if m >= n); the elements below the diagonal,
             with the array TAU, represent the orthogonal matrix Q as a
             product of min(m,n) elementary reflectors (see Further
             Details).

    @param[in]
    ldda     INTEGER
             The leading dimension of the array dA.  LDDA >= max(1,M).
             To benefit from coalesced memory accesses, LDDA must be
             divisible by 16.
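             For example (illustrative, not required by the API), a suitably
             padded leading dimension can be obtained with magma_roundup:
             @code
             magma_int_t ldda = magma_roundup( m, 16 );  // smallest multiple of 16 >= m
             @endcode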

    @param[in,out]
    dR_array Array of pointers, dimension (batchCount).
             Each is a DOUBLE PRECISION array on the GPU, of size (LDDR, N)
             when provide_RT > 0 and of size (LDDR, NB) otherwise, where NB
             is the local blocking size.
             On exit, the elements of R are stored in dR only when provide_RT > 0.

    @param[in]
    lddr     INTEGER
             The leading dimension of the array dR.
             LDDR >= min(M,N) when provide_RT == 1;
             otherwise LDDR >= min(NB, min(M,N)),
             where NB is the local blocking size.
             To benefit from coalesced memory accesses, LDDR must be
             divisible by 16.

    @param[in,out]
    dT_array Array of pointers, dimension (batchCount).
             Each is a DOUBLE PRECISION array on the GPU, of size (LDDT, N)
             when provide_RT > 0 and of size (LDDT, NB) otherwise, where NB
             is the local blocking size.
             On exit, the elements of T are stored in dT only when provide_RT > 0.

    @param[in]
    lddt     INTEGER
             The leading dimension of the array dT.
             LDDT >= min(NB, min(M,N)), where NB is the local blocking size.
             To benefit from coalesced memory accesses, LDDT must be
             divisible by 16.

    @param[out]
    dtau_array Array of pointers, dimension (batchCount).
             Each is a DOUBLE PRECISION array, dimension (min(M,N))
             The scalar factors of the elementary reflectors (see Further
             Details).

    @param[in]
    provide_RT INTEGER
               provide_RT = 0: neither R nor T is returned; dR and dT are used
               only as local workspace to hold the R and T of each step.
               provide_RT = 1: the whole R, of size (min(M,N), N), and the NB-by-NB
               diagonal blocks of T are returned.
               provide_RT = 2: only the NB-by-NB diagonal blocks of R and of T are returned.

    @param[out]
    info_array  Array of INTEGERs, dimension (batchCount), for corresponding matrices.
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as a failed memory allocation.

    @param[in]
    batchCount  INTEGER
                The number of matrices to operate on.

    @param[in]
    queue   magma_queue_t
            Queue to execute in.

    Further Details
    ---------------
    The matrix Q is represented as a product of elementary reflectors

        Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

        H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).
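
    As a brief illustration (not part of this routine's API), a reflector
    H(i) = I - tau * v * v' can be applied to a vector x of length m
    without ever forming H(i) explicitly:

    @code
    // x := H(i)*x = x - tau * (v'*x) * v
    double vtx = 0.0;
    for (magma_int_t j = 0; j < m; ++j)
        vtx += v[j] * x[j];
    for (magma_int_t j = 0; j < m; ++j)
        x[j] -= tau * vtx * v[j];
    @endcode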

    @ingroup magma_geqrf_batched
*******************************************************************************/
extern "C" magma_int_t
magma_dgeqrf_expert_batched(
    magma_int_t m, magma_int_t n, 
    double **dA_array, magma_int_t ldda, 
    double **dR_array, magma_int_t lddr,
    double **dT_array, magma_int_t lddt,
    double **dtau_array, magma_int_t provide_RT,
    magma_int_t *info_array, magma_int_t batchCount, magma_queue_t queue)
{
    #define dA(i, j)  (dA + (i) + (j)*ldda)
    
    /* Local Parameter */
    magma_int_t nb = magma_get_dgeqrf_batched_nb(m);
    
    magma_int_t nnb = 8;
    magma_int_t min_mn = min(m, n);

    /* Check arguments */
    cudaMemset(info_array, 0, batchCount*sizeof(magma_int_t));
    magma_int_t arginfo = 0;
    if (m < 0)
        arginfo = -1;
    else if (n < 0)
        arginfo = -2;
    else if (ldda < max(1,m))
        arginfo = -4;
    else if (lddr < min_mn && provide_RT == 1)
        arginfo = -6;
    else if (lddr < min(min_mn, nb))
        arginfo = -6;
    else if (lddt < min(min_mn, nb))
        arginfo = -8;

    if (arginfo != 0) {
        magma_xerbla( __func__, -(arginfo) );
        return arginfo;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return arginfo;

    if ( m > 2048 || n > 2048 ) {
        printf("=========================================================================================\n");
        printf("   WARNING: batched routines are designed for small sizes; for large sizes,\n   the Native/Hybrid classical routines are likely to perform better\n");
        printf("=========================================================================================\n");
    }


    magma_int_t i, k, ib=nb, jb=nnb, offset_RT=0, use_stream;
    magma_int_t ldw, offset; 

    double **dW0_displ = NULL;
    double **dW1_displ = NULL;
    double **dW2_displ = NULL;
    double **dW3_displ = NULL;
    double **dW4_displ = NULL;
    double **dW5_displ = NULL;
    double **dR_displ  = NULL;
    double **dT_displ  = NULL;

    double *dwork = NULL;
    double **cpuAarray = NULL;
    double **cpuTarray = NULL;

    magma_malloc((void**)&dW0_displ, batchCount * sizeof(*dW0_displ));
    magma_malloc((void**)&dW1_displ, batchCount * sizeof(*dW1_displ));
    magma_malloc((void**)&dW2_displ, batchCount * sizeof(*dW2_displ));
    magma_malloc((void**)&dW3_displ, batchCount * sizeof(*dW3_displ));
    magma_malloc((void**)&dW4_displ, batchCount * sizeof(*dW4_displ));
    magma_malloc((void**)&dW5_displ, batchCount * sizeof(*dW5_displ));
    magma_malloc((void**)&dR_displ,  batchCount * sizeof(*dR_displ));
    magma_malloc((void**)&dT_displ,  batchCount * sizeof(*dT_displ));

    magma_dmalloc(&dwork,  (2 * nb * n) * batchCount);
    magma_malloc_cpu((void**) &cpuAarray, batchCount*sizeof(double*));
    magma_malloc_cpu((void**) &cpuTarray, batchCount*sizeof(double*));

    /* check allocation */
    if ( dW0_displ == NULL || dW1_displ == NULL || dW2_displ == NULL || 
         dW3_displ == NULL || dW4_displ == NULL || dW5_displ == NULL || 
         dR_displ  == NULL || dT_displ  == NULL || dwork     == NULL ||
         cpuAarray == NULL || cpuTarray == NULL ) {
        magma_free(dW0_displ);
        magma_free(dW1_displ);
        magma_free(dW2_displ);
        magma_free(dW3_displ);
        magma_free(dW4_displ);
        magma_free(dW5_displ);
        magma_free(dR_displ);
        magma_free(dT_displ);
        magma_free(dwork);
        magma_free_cpu(cpuAarray);
        magma_free_cpu(cpuTarray);
        magma_int_t info = MAGMA_ERR_DEVICE_ALLOC;
        magma_xerbla( __func__, -(info) );
        return info;
    }

    magma_ddisplace_pointers(dR_displ, dR_array, lddr, 0, 0, batchCount, queue); 
    magma_ddisplace_pointers(dT_displ, dT_array, lddt, 0, 0, batchCount, queue); 
    // zero out dwork: the GEMM routine propagates NaN from C even when beta = 0 in C = beta*C + alpha*A*B
    magmablas_dlaset_q( MagmaFull, 2*nb, n*batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dwork, 2*nb, queue );

    // set dR and dT to zero; if provide_RT == 0, only an nb-by-nb tile is used and overwritten at each step
    magmablas_dlaset_batched( MagmaFull, lddr, (provide_RT > 0 ? n:min(min_mn,nb)), MAGMA_D_ZERO, MAGMA_D_ZERO, dR_displ, lddr, batchCount, queue ); 
    magmablas_dlaset_batched( MagmaFull, lddt, (provide_RT > 0 ? n:min(min_mn,nb)), MAGMA_D_ZERO, MAGMA_D_ZERO, dT_displ, lddt, batchCount, queue );
    /*
    if ( provide_RT > 0 )
    {
        magmablas_dlaset_q( MagmaFull, lddr, n*batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dR, lddr, queue );
        magmablas_dlaset_q( MagmaFull, lddt, n*batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dT, lddt, queue );
    }
    else
    {
        magmablas_dlaset_q( MagmaFull, lddr, nb*batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dR, lddr, queue );
        magmablas_dlaset_q( MagmaFull, lddt, nb*batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dT, lddt, queue );
    }
    */
    magma_int_t streamid;
    const magma_int_t nbstreams=10;
    magma_queue_t queues[nbstreams];
    for (i=0; i < nbstreams; i++) {
        magma_device_t cdev;
        magma_getdevice( &cdev );
        magma_queue_create( cdev, &queues[i] );
    }
    magma_getvector( batchCount, sizeof(double*), dA_array, 1, cpuAarray, 1, queue);
    magma_getvector( batchCount, sizeof(double*), dT_array, 1, cpuTarray, 1, queue);


    for (i=0; i < min_mn; i += nb)
    {
        ib = min(nb, min_mn-i);  
        //===============================================
        // panel factorization
        //===============================================

        magma_ddisplace_pointers(dW0_displ, dA_array, ldda, i, i, batchCount, queue); 
        magma_ddisplace_pointers(dW2_displ, dtau_array, 1, i, 0, batchCount, queue);
        if ( provide_RT > 0 )
        {
            offset_RT = i;
            magma_ddisplace_pointers(dR_displ, dR_array, lddr, (provide_RT == 1 ? offset_RT:0), offset_RT, batchCount, queue); 
            magma_ddisplace_pointers(dT_displ, dT_array, lddt, 0, offset_RT, batchCount, queue); 
        }

        //dwork is used in panel factorization and trailing matrix update
        //dW4_displ, dW5_displ are used as workspace and configured inside
        magma_dgeqrf_panel_batched(m-i, ib, jb, 
                                   dW0_displ, ldda, 
                                   dW2_displ, 
                                   dT_displ, lddt, 
                                   dR_displ, lddr,
                                   dW1_displ,
                                   dW3_displ,
                                   dwork, 
                                   dW4_displ, dW5_displ,
                                   info_array,
                                   batchCount, queue);
           
        //===============================================
        // end of panel
        //===============================================

        //===============================================
        // update trailing matrix
        //===============================================
        if ( (n-ib-i) > 0)
        {
            //dwork is used in panel factorization and trailing matrix update
            //reset dW4_displ
            ldw = nb;
            magma_dset_pointer( dW4_displ, dwork, 1, 0, 0,  ldw*n, batchCount, queue );
            offset = ldw*n*batchCount;
            magma_dset_pointer( dW5_displ, dwork + offset, 1, 0, 0,  ldw*n, batchCount, queue );    

            // the diagonal of V (ones) and its strictly upper triangular part (zeros) are already set inside geqrf_panel
            //magmablas_dlaset_batched( MagmaUpper, ib, ib, MAGMA_D_ZERO, MAGMA_D_ONE, dW0_displ, ldda, batchCount, queue ); 
            //magma_ddisplace_pointers(dW2_displ, dtau_array, 1, i, 0, batchCount, queue); 

            // this is faster than the LAPACK implementation since it uses BLAS-3 GEMM routines
            magma_dlarft_batched(m-i, ib, 0,
                             dW0_displ, ldda,
                             dW2_displ,
                             dT_displ, lddt, 
                             dW4_displ, nb*lddt,
                             batchCount, queue);

            
            // perform C = (I-V T^H V^H) * C, C is the trailing matrix
            //-------------------------------------------
            //          USE STREAM  GEMM
            //-------------------------------------------
            use_stream = magma_drecommend_cublas_gemm_stream(MagmaNoTrans, MagmaNoTrans, m-i-ib, n-i-ib, ib);
            if ( use_stream )   
            { 
                magma_queue_sync(queue); 
                for (k=0; k < batchCount; k++)
                {
                    streamid = k%nbstreams;                                       
                    // the streamed (non-batched) larfb call takes device pointers stored on the CPU
                    magma_dlarfb_gpu_gemm( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                m-i, n-i-ib, ib,
                                cpuAarray[k] + i + i * ldda, ldda, 
                                cpuTarray[k] + offset_RT*lddt, lddt,
                                cpuAarray[k] + i + (i+ib) * ldda, ldda,
                                dwork + nb * n * k, -1,
                                dwork + nb * n * batchCount + nb * n * k, -1, queues[streamid] );
                }

                // synchronize so that the next panel factorization does not start
                // before the trailing-matrix update (at least of the next panel) finishes;
                // if queue is NULL, no sync is needed
                if ( queue != NULL ) {
                    for (magma_int_t s=0; s < nbstreams; s++)
                        magma_queue_sync(queues[s]);
                }
            }
            //-------------------------------------------
            //          USE BATCHED GEMM
            //-------------------------------------------
            else
            {
                //direct trailing matrix in dW1_displ
                magma_ddisplace_pointers(dW1_displ, dA_array, ldda, i, i+ib, batchCount, queue); 

                magma_dlarfb_gemm_batched( 
                            MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, 
                            m-i, n-i-ib, ib,
                            (const double**)dW0_displ, ldda,
                            (const double**)dT_displ, lddt,
                            dW1_displ,  ldda,
                            dW4_displ,  ldw,
                            dW5_displ, ldw,
                            batchCount, queue );
            }
        }// update the trailing matrix 
        //===============================================

        // copy dR back to V after the trailing matrix update, but only when
        // provide_RT == 0; otherwise the nb-by-nb block of V keeps its unit-diagonal/zero form.
        // The upper portion of V could be set entirely to zero here.
        if ( provide_RT == 0 )
        {
            magmablas_dlacpy_batched( MagmaUpper, ib, ib, dR_displ, lddr, dW0_displ, ldda, batchCount, queue );
        }
    }

    magma_queue_sync(queue);
    for (k=0; k < nbstreams; k++) {
        magma_queue_destroy( queues[k] );
    }
    
    magma_free(dW0_displ);
    magma_free(dW1_displ);
    magma_free(dW2_displ);
    magma_free(dW3_displ);
    magma_free(dW4_displ);
    magma_free(dW5_displ);
    magma_free(dR_displ);
    magma_free(dT_displ);
    magma_free(dwork);
    magma_free_cpu(cpuAarray);
    magma_free_cpu(cpuTarray);

    return arginfo;
}
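
/*
    Illustrative usage sketch (not part of the library): factor a batch of
    m-by-n matrices with provide_RT = 0, so that dR and dT serve only as
    workspace. Matrix initialization and full error handling are omitted;
    the sizes follow the argument descriptions above, and this helper is an
    assumption of the sketch, not a canonical MAGMA driver.
*/
static magma_int_t example_dgeqrf_batched_usage(
    magma_int_t m, magma_int_t n, magma_int_t batchCount )
{
    magma_device_t cdev;
    magma_queue_t  queue;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );

    magma_int_t nb     = magma_get_dgeqrf_batched_nb( m );
    magma_int_t min_mn = min( m, n );
    magma_int_t ldda   = magma_roundup( m, 16 );   // multiple of 16, as the docs suggest
    magma_int_t lddr   = min( nb, min_mn );        // sizing for provide_RT = 0
    magma_int_t lddt   = min( nb, min_mn );

    double *dA, *dR, *dT, *dtau;
    double **dA_array, **dR_array, **dT_array, **dtau_array;
    magma_int_t *info_array;

    magma_dmalloc( &dA,   ldda * n  * batchCount );
    magma_dmalloc( &dR,   lddr * nb * batchCount );
    magma_dmalloc( &dT,   lddt * nb * batchCount );
    magma_dmalloc( &dtau, min_mn    * batchCount );
    magma_imalloc( &info_array, batchCount );
    magma_malloc( (void**)&dA_array,   batchCount * sizeof(double*) );
    magma_malloc( (void**)&dR_array,   batchCount * sizeof(double*) );
    magma_malloc( (void**)&dT_array,   batchCount * sizeof(double*) );
    magma_malloc( (void**)&dtau_array, batchCount * sizeof(double*) );

    // build the pointer arrays over the contiguous slabs
    magma_dset_pointer( dA_array,   dA,   ldda, 0, 0, ldda*n,  batchCount, queue );
    magma_dset_pointer( dR_array,   dR,   lddr, 0, 0, lddr*nb, batchCount, queue );
    magma_dset_pointer( dT_array,   dT,   lddt, 0, 0, lddt*nb, batchCount, queue );
    magma_dset_pointer( dtau_array, dtau, 1,    0, 0, min_mn,  batchCount, queue );

    // ... fill dA with the batch of matrices here ...

    magma_int_t arginfo = magma_dgeqrf_expert_batched(
        m, n, dA_array, ldda, dR_array, lddr, dT_array, lddt,
        dtau_array, 0 /* provide_RT */, info_array, batchCount, queue );

    magma_queue_sync( queue );
    // ... V and tau now describe Q; check info_array per matrix ...

    magma_free( dA );   magma_free( dR );   magma_free( dT );   magma_free( dtau );
    magma_free( dA_array );  magma_free( dR_array );
    magma_free( dT_array );  magma_free( dtau_array );
    magma_free( info_array );
    magma_queue_destroy( queue );
    return arginfo;
}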
Example #2
/**
    Purpose
    -------
    DPOTRS solves a system of linear equations A*X = B with a symmetric
    positive definite matrix A using the Cholesky factorization
    A = U**H*U or A = L*L**H computed by DPOTRF.
    This batched version solves batchCount such systems simultaneously.
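
    Solving with the factorization reduces to two triangular solves. As an
    illustrative plain-C sketch (single right-hand side, unblocked, not the
    batched code path below), the forward substitution for the lower factor is:

    @code
    // solve L*y = b, where L is n-by-n lower triangular, column-major, ld >= n
    for (int i = 0; i < n; ++i) {
        double s = b[i];
        for (int j = 0; j < i; ++j)
            s -= L[i + j*ld] * y[j];
        y[i] = s / L[i + i*ld];
    }
    // a backward substitution with L**H then yields x
    @endcode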

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of A is stored;
      -     = MagmaLower:  Lower triangle of A is stored.

    @param[in]
    n       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in]
    nrhs    INTEGER
            The number of right hand sides, i.e., the number of columns
            of the matrix B.  NRHS >= 0.

    @param[in]
    dA_array    Array of pointers, dimension (batchCount).
             Each is a DOUBLE PRECISION array on the GPU, dimension (LDDA,N)
             The triangular factor U or L from the Cholesky factorization
             A = U**H*U or A = L*L**H, as computed by DPOTRF.

    @param[in]
    ldda    INTEGER
            The leading dimension of each array A.  LDDA >= max(1,N).

    @param[in,out]
    dB_array      Array of pointers, dimension (batchCount).
             Each is a DOUBLE PRECISION array on the GPU, dimension (LDDB,NRHS)
             On entry, each pointer is a right hand side matrix B.
             On exit, the corresponding solution matrix X.

    @param[in]
    lddb    INTEGER
            The leading dimension of each array B.  LDDB >= max(1,N).

    @param[in]
    batchCount  INTEGER
                The number of matrices to operate on.

    @param[in]
    queue   magma_queue_t
            Queue to execute in.


    @ingroup magma_dposv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_dpotrs_batched(
                  magma_uplo_t uplo, magma_int_t n, magma_int_t nrhs,
                  double **dA_array, magma_int_t ldda,
                  double **dB_array, magma_int_t lddb,
                  magma_int_t batchCount, magma_queue_t queue)
{
    double c_one = MAGMA_D_ONE;
    magma_int_t info = 0;
    if ( uplo != MagmaUpper && uplo != MagmaLower )
        info = -1;
    if ( n < 0 )
        info = -2;
    if ( nrhs < 0)
        info = -3;
    if ( ldda < max(1, n) )
        info = -5;
    if ( lddb < max(1, n) )
        info = -7;
    if (info != 0) {
        magma_xerbla( __func__, -(info) );
        return info;
    }

    /* Quick return if possible */
    if ( (n == 0) || (nrhs == 0) ) {
        return info;
    }
    
    double **dW1_displ  = NULL;
    double **dW2_displ  = NULL;
    double **dW3_displ  = NULL;
    double **dW4_displ  = NULL;
    double **dinvA_array = NULL;
    double **dwork_array = NULL;

    magma_malloc((void**)&dW1_displ,  batchCount * sizeof(*dW1_displ));
    magma_malloc((void**)&dW2_displ,  batchCount * sizeof(*dW2_displ));
    magma_malloc((void**)&dW3_displ,  batchCount * sizeof(*dW3_displ));
    magma_malloc((void**)&dW4_displ,  batchCount * sizeof(*dW4_displ));
    magma_malloc((void**)&dinvA_array, batchCount * sizeof(*dinvA_array));
    magma_malloc((void**)&dwork_array, batchCount * sizeof(*dwork_array));

    magma_int_t invA_msize = magma_roundup( n, TRI_NB )*TRI_NB;
    magma_int_t dwork_msize = n*nrhs;
    double* dinvA      = NULL;
    double* dwork      = NULL; // dinvA and dwork are workspace in dtrsm
    magma_dmalloc( &dinvA, invA_msize * batchCount);
    magma_dmalloc( &dwork, dwork_msize * batchCount );
   /* check allocation */
    if ( dW1_displ == NULL || dW2_displ == NULL || dW3_displ   == NULL || dW4_displ   == NULL || 
         dinvA_array == NULL || dwork_array == NULL || dinvA     == NULL || dwork     == NULL ) {
        magma_free(dW1_displ);
        magma_free(dW2_displ);
        magma_free(dW3_displ);
        magma_free(dW4_displ);
        magma_free(dinvA_array);
        magma_free(dwork_array);
        magma_free( dinvA );
        magma_free( dwork );
        info = MAGMA_ERR_DEVICE_ALLOC;
        magma_xerbla( __func__, -(info) );
        return info;
    }

    magmablas_dlaset_q( MagmaFull, invA_msize, batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dinvA, invA_msize, queue );
    magmablas_dlaset_q( MagmaFull, dwork_msize, batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dwork, dwork_msize, queue );
    magma_dset_pointer( dwork_array, dwork, n, 0, 0, dwork_msize, batchCount, queue );
    magma_dset_pointer( dinvA_array, dinvA, TRI_NB, 0, 0, invA_msize, batchCount, queue );

    if ( uplo == MagmaUpper) {
        if (nrhs > 1)
        {
            // A = U^T U
            // solve U^{T} X = B  ==>  dwork = U^{-T} * B
            magmablas_dtrsm_outofplace_batched( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, 1,
                    n, nrhs,
                    c_one,
                    dA_array,       ldda, // dA
                    dB_array,      lddb, // dB
                    dwork_array,        n, // dX //output
                    dinvA_array,  invA_msize, 
                    dW1_displ,   dW2_displ, 
                    dW3_displ,   dW4_displ,
                    1, batchCount, queue );

            // solve U X = dwork ==> X = U^-1 * dwork
            magmablas_dtrsm_outofplace_batched( MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, 1,
                    n, nrhs,
                    c_one,
                    dA_array,       ldda, // dA
                    dwork_array,        n, // dB 
                    dB_array,   lddb, // dX //output
                    dinvA_array,  invA_msize, 
                    dW1_displ,   dW2_displ, 
                    dW3_displ,   dW4_displ,
                    1, batchCount, queue );
        }
        else
        {
            // A = U^T U
            // solve U^{T} X = B  ==>  dwork = U^{-T} * B
            magmablas_dtrsv_outofplace_batched( MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                    n, 
                    dA_array,       ldda, // dA
                    dB_array,      1, // dB
                    dwork_array,     // dX //output
                    batchCount, queue, 0 );

            // solve U X = dwork ==> X = U^-1 * dwork
            magmablas_dtrsv_outofplace_batched( MagmaUpper, MagmaNoTrans, MagmaNonUnit, 
                    n, 
                    dA_array,       ldda, // dA
                    dwork_array,        1, // dB 
                    dB_array,   // dX //output
                    batchCount, queue, 0 );
        }
    }
    else {
        if (nrhs > 1)
        {
            // A = L L^T
            // solve LX= B ==> dwork = L^{-1} B
            magmablas_dtrsm_outofplace_batched( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNonUnit, 1,
                    n, nrhs,
                    c_one,
                    dA_array,       ldda, // dA
                    dB_array,      lddb, // dB
                    dwork_array,        n, // dX //output
                    dinvA_array,  invA_msize, 
                    dW1_displ,   dW2_displ, 
                    dW3_displ,   dW4_displ,
                    1, batchCount, queue );

            // solve L^{T}X= dwork ==> X = L^{-T} dwork
            magmablas_dtrsm_outofplace_batched( MagmaLeft, MagmaLower, MagmaConjTrans, MagmaNonUnit, 1,
                    n, nrhs,
                    c_one,
                    dA_array,       ldda, // dA
                    dwork_array,        n, // dB 
                    dB_array,   lddb, // dX //output
                    dinvA_array,  invA_msize, 
                    dW1_displ,   dW2_displ, 
                    dW3_displ,   dW4_displ,
                    1, batchCount, queue );
        }
        else
        {
            // A = L L^T
            // solve LX= B ==> dwork = L^{-1} B
            magmablas_dtrsv_outofplace_batched( MagmaLower, MagmaNoTrans, MagmaNonUnit, 
                    n,
                    dA_array,       ldda, // dA
                    dB_array,      1, // dB
                    dwork_array,   // dX //output
                    batchCount, queue, 0 );

            // solve L^{T}X= dwork ==> X = L^{-T} dwork
            magmablas_dtrsv_outofplace_batched( MagmaLower, MagmaConjTrans, MagmaNonUnit,
                    n,
                    dA_array,       ldda, // dA
                    dwork_array,        1, // dB 
                    dB_array,     // dX //output
                    batchCount, queue, 0 );
        }
    }

    magma_queue_sync(queue);

    magma_free(dW1_displ);
    magma_free(dW2_displ);
    magma_free(dW3_displ);
    magma_free(dW4_displ);
    magma_free(dinvA_array);
    magma_free(dwork_array);
    magma_free( dinvA );
    magma_free( dwork );

    return info;
}
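
/*
    Illustrative usage sketch (not part of the library): factor and solve a
    batch of SPD systems by chaining magma_dpotrf_batched (assumed available
    with the usual batched signature) and magma_dpotrs_batched above. Matrix
    setup and error checking are omitted.
*/
static void example_dpotrs_batched_usage(
    magma_int_t n, magma_int_t nrhs, magma_int_t batchCount,
    double **dA_array, magma_int_t ldda,   // batch of SPD matrices; overwritten by L
    double **dB_array, magma_int_t lddb,   // right-hand sides; overwritten by X
    magma_int_t *info_array, magma_queue_t queue )
{
    // Cholesky factorization of each A_i:  A_i = L_i * L_i**T
    magma_dpotrf_batched( MagmaLower, n, dA_array, ldda,
                          info_array, batchCount, queue );

    // two triangular solves per system: each B_i is overwritten with X_i
    magma_dpotrs_batched( MagmaLower, n, nrhs, dA_array, ldda,
                          dB_array, lddb, batchCount, queue );

    magma_queue_sync( queue );
}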
Example #3
/**
    Purpose
    -------
    DORGQR generates an M-by-N DOUBLE PRECISION matrix Q with orthonormal columns,
    which is defined as the first N columns of a product of K elementary
    reflectors of order M

          Q  =  H(1) H(2) . . . H(k)

    as returned by DGEQRF_GPU.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix Q. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix Q. M >= N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines the
            matrix Q. N >= K >= 0.

    @param[in,out]
    dA      DOUBLE PRECISION array A on the GPU device,
            dimension (LDDA,N). On entry, the i-th column must contain
            the vector which defines the elementary reflector H(i), for
            i = 1,2,...,k, as returned by DGEQRF_GPU in the first k
            columns of its array argument A.
            On exit, the M-by-N matrix Q.

    @param[in]
    ldda    INTEGER
            The first dimension of the array A. LDDA >= max(1,M).

    @param[in]
    tau     DOUBLE PRECISION array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by DGEQRF_GPU.

    @param[in]
    dT      DOUBLE PRECISION work space array on the GPU device,
            dimension MIN(M,N)*NB.
            This must be the 6th argument of magma_dgeqrf_gpu
            [ note that if N here is bigger than N in magma_dgeqrf_gpu,
              the workspace requirement DT in magma_dgeqrf_gpu must be
              as specified in this routine ].

    @param[in]
    nb      INTEGER
            This is the block size used in DGEQRF_GPU, and correspondingly
            the size of the T matrices, used in the factorization, and
            stored in DT.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_dsyev_2stage
    ********************************************************************/
extern "C" magma_int_t
magma_dorgqr_2stage_gpu(
    magma_int_t m, magma_int_t n, magma_int_t k,
    magmaDouble_ptr dA, magma_int_t ldda,
    double *tau,
    magmaDouble_ptr dT, magma_int_t nb,
    magma_int_t *info)
{
    #define dA(a_1,a_2) (dA + (a_2)*(ldda) + (a_1))
    #define dT(a_1)     (dT + (a_1)*nb)

    double c_zero = MAGMA_D_ZERO;
    double c_one  = MAGMA_D_ONE;
    
    magma_int_t  i__1, i__2, i__3;
    //magma_int_t lwork;
    magma_int_t i, ib, ki, kk;  //, iinfo;
    //magma_int_t lddwork = min(m, n);
    //double *work, *panel;
    magmaDouble_ptr dwork;
    magma_int_t ldt=nb; // should be an input parameter
    magma_queue_t queues[2];
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queues[0] );
    //magma_queue_create( cdev, &queues[1] );

    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if ((n < 0) || (n > m)) {
        *info = -2;
    } else if ((k < 0) || (k > n)) {
        *info = -3;
    } else if (ldda < max(1,m)) {
        *info = -5;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    if (n <= 0)
        return *info;

    if (MAGMA_SUCCESS != magma_dmalloc( &dwork, n*nb )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        magma_xerbla( __func__, -(*info) );
        magma_queue_destroy( queues[0] );
        return *info;
    }

    if ( (nb > 1) && (nb < k) ) {
        /*  Use blocked code after the last block.
            The first kk columns are handled by the block method.
            ki is start of 2nd-to-last block. */
        ki = (k - nb - 1) / nb * nb;
        kk = min(k, ki + nb);

        /* Set A(1:kk,kk+1:n) to zero. */
        /* and A(kk+1:m, kk+1:n) = I */
        magmablas_dlaset_q( MagmaFull, kk,   n-kk, c_zero, c_zero, dA(0, kk), ldda, queues[0] );
        magmablas_dlaset_q( MagmaFull, m-kk, n-kk, c_zero, c_one,  dA(kk,kk), ldda, queues[0] );
    }
    else {
        ki = 0;
        kk = 0;
    }
    
    /* Allocate work space on CPU in pinned memory */
    //lwork = (n+m) * nb;
    //if (kk < n)
    //  lwork = max(lwork, n * nb + (m-kk)*(n-kk));

    //if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, (lwork) )) {
    //    *info = MAGMA_ERR_HOST_ALLOC;
    //    return *info;
    //}
    //panel = work + n * nb;

    /* Use unblocked code for the last or only block. */
    if (kk < n) {
        i__1 = m - kk;
        i__2 = n - kk;
        i__3 = k - kk;
        //magma_dgetmatrix( i__1, i__2, dA(kk, kk), ldda, panel, i__1, queues[0] );
        //lapackf77_dorgqr(&i__1, &i__2, &i__3, panel, &i__1, &tau[kk],
        //                 work, &lwork, &iinfo);
        //
        //magma_dsetmatrix( i__1, i__2, panel, i__1, dA(kk, kk), ldda, queues[0] );
        
        magma_dlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise,
                          i__1, i__2, i__3,
                          dA(kk, kk-nb), ldda, dT(kk-nb), ldt,
                          dA(kk, kk), ldda, dwork, i__2, queues[0] );
        
        //magmablas_dlaset_q( MagmaFull, kk-nb,     nb, c_zero, c_zero, dA(0,kk-nb),     ldda, queues[0] );
        //magmablas_dlaset_q( MagmaFull, m-(kk-nb), nb, c_zero, c_one,  dA(kk-nb,kk-nb), ldda, queues[0] );
    }

    if (kk > 0) {
        /* Use blocked code */
        for (i = ki; i >= nb; i -= nb) {
            ib = min(nb, k - i);
            /* Send current panel to the CPU for update */
            i__2 = m - i;
            //magma_dgetmatrix_async( i__2, ib, dA(i,i), ldda, panel, i__2, queues[0] );  // verify
            if (i + ib < n) {
                /* Apply H to A(i:m,i+ib:n) from the left */
                i__3 = n - i;

                magmablas_dlaset_q( MagmaFull, i,   ib, c_zero, c_zero, dA(0,i), ldda, queues[0] );
                magmablas_dlaset_q( MagmaFull, m-i, ib, c_zero, c_one,  dA(i,i), ldda, queues[0] );

                magma_dlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise,
                                  i__2, i__3, ib,
                                  dA(i, i-nb), ldda, dT(i-nb),             ldt,
                                  dA(i, i), ldda, dwork, i__3, queues[0] );
            }

            /* Apply H to rows i:m of current block on the CPU */
            //magma_queue_sync( queues[0] );
            //lapackf77_dorgqr(&i__2, &ib, &ib, panel, &i__2, &tau[i],
            //                 work, &lwork, &iinfo);
            //magma_dsetmatrix_async( i__2, ib, panel, i__2, dA(i,i), ldda, queues[1] );  // verify

            /* Set rows 1:i-1 of current block to zero */
            i__2 = i + ib;
            //magmablas_dlaset_q( MagmaFull, i-ib,     ib, c_zero, c_zero, dA(0,i-ib),    ldda, queues[0] );
            //magmablas_dlaset_q( MagmaFull, m-(i-ib), ib, c_zero, c_one,  dA(i-ib,i-ib), ldda, queues[0] );
        }
    }

    magmablas_dlaset_q( MagmaFull, m, nb, c_zero, c_one, dA(0,0), ldda, queues[0] );

    magma_queue_sync( queues[0] );
    //magma_queue_sync( queues[1] );
    magma_queue_destroy( queues[0] );
    //magma_queue_destroy( queues[1] );
    magma_free( dwork );
    //magma_free_pinned( work );

    return *info;
} /* magma_dorgqr_2stage_gpu */
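
/*
    Illustrative check (not part of the library): after magma_dorgqr_2stage_gpu,
    the columns of Q should be orthonormal, i.e. Q**T * Q = I. A minimal sketch
    of the residual max|Q**T*Q - I|, assuming a valid queue and the m-by-n
    matrix Q left in dA by the routine above:
*/
static double example_check_orthogonality(
    magma_int_t m, magma_int_t n,
    magmaDouble_ptr dA, magma_int_t ldda, magma_queue_t queue )
{
    magmaDouble_ptr dC;
    double *hC;
    magma_dmalloc( &dC, n*n );
    magma_dmalloc_cpu( &hC, n*n );

    // dC = I, then dC = Q**T * Q - I
    magmablas_dlaset_q( MagmaFull, n, n, MAGMA_D_ZERO, MAGMA_D_ONE, dC, n, queue );
    magma_dgemm( MagmaTrans, MagmaNoTrans, n, n, m,
                 MAGMA_D_ONE,     dA, ldda,
                                  dA, ldda,
                 MAGMA_D_NEG_ONE, dC, n, queue );

    // bring the residual back and take the max absolute entry
    magma_dgetmatrix( n, n, dC, n, hC, n, queue );
    double res = 0.0;
    for (magma_int_t j = 0; j < n*n; ++j)
        res = max( res, fabs( hC[j] ) );

    magma_free( dC );
    magma_free_cpu( hC );
    return res;
}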
Example #4
/**
    Purpose
    -------
    DGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
        A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm,
    applied here to a batch of batchCount matrices.
    
    If the current stream is NULL, this version replaces it with a new
    stream to overlap computation with communication.
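
    For reference, one step of the unblocked algorithm in plain C
    (illustrative sketch only; the batched code below uses blocked,
    GPU-resident kernels instead):

    @code
    // partial-pivoted LU of a column-major n-by-n matrix A, leading dimension lda
    for (int j = 0; j < n; ++j) {
        int p = j;                                    // find pivot row in column j
        for (int i = j+1; i < n; ++i)
            if (fabs(A[i + j*lda]) > fabs(A[p + j*lda])) p = i;
        ipiv[j] = p + 1;                              // 1-based, LAPACK convention
        if (p != j)                                   // swap rows j and p
            for (int k = 0; k < n; ++k) {
                double t = A[j + k*lda]; A[j + k*lda] = A[p + k*lda]; A[p + k*lda] = t;
            }
        for (int i = j+1; i < n; ++i) {               // scale column, rank-1 update
            A[i + j*lda] /= A[j + j*lda];
            for (int k = j+1; k < n; ++k)
                A[i + k*lda] -= A[i + j*lda] * A[j + k*lda];
        }
    }
    @endcode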

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    dA      DOUBLE_PRECISION array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    @param[in]
    ldda     INTEGER
            The leading dimension of the array A.  LDDA >= max(1,M).

    @param[out]
    ipiv    INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as a failed memory allocation.
      -     > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    @ingroup magma_dgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_dgetrf_batched(
        magma_int_t m, magma_int_t n,
        double **dA_array, 
        magma_int_t ldda,
        magma_int_t **ipiv_array, 
        magma_int_t *info_array, 
        magma_int_t batchCount, magma_queue_t queue)
{
#define A(i_, j_)  (A + (i_) + (j_)*ldda)   

    magma_int_t min_mn = min(m, n);
    cudaMemset(info_array, 0, batchCount*sizeof(magma_int_t));

    /* Check arguments */
    magma_int_t arginfo = 0;
    if (m < 0)
        arginfo = -1;
    else if (n < 0)
        arginfo = -2;
    else if (ldda < max(1,m))
        arginfo = -4;

    if (arginfo != 0) {
        magma_xerbla( __func__, -(arginfo) );
        return arginfo;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return arginfo;

    if ( m > 2048 || n > 2048 ) {
        printf("=========================================================================================\n");
        printf("   WARNING: batched routines are designed for small sizes; for large sizes,\n   the Native/Hybrid classical routines are likely to perform better\n");
        printf("=========================================================================================\n");
    }


//#define ENABLE_TIMER3

#if defined(ENABLE_TIMER3)
    real_Double_t   tall=0.0, tloop=0., talloc=0., tdalloc=0.;
    tall   = magma_sync_wtime(0);
    talloc = magma_sync_wtime(0);
#endif

    double neg_one = MAGMA_D_NEG_ONE;
    double one  = MAGMA_D_ONE;
    magma_int_t ib, i, k, pm;
    magma_int_t nb = BATRF_NB;
    magma_int_t gemm_crossover = nb > 32 ? 127 : 160;
    // magma_int_t gemm_crossover = n;// use only stream gemm

#if defined(USE_CUOPT)    
    cublasHandle_t myhandle;
    cublasCreate_v2(&myhandle);
#else
    cublasHandle_t myhandle=NULL;
#endif

    magma_int_t     **dipiv_displ   = NULL;
    double **dA_displ   = NULL;
    double **dW0_displ  = NULL;
    double **dW1_displ  = NULL;
    double **dW2_displ  = NULL;
    double **dW3_displ  = NULL;
    double **dW4_displ  = NULL;
    double **dinvA_array = NULL;
    double **dwork_array = NULL;


    magma_malloc((void**)&dipiv_displ,   batchCount * sizeof(*dipiv_displ));
    magma_malloc((void**)&dA_displ,   batchCount * sizeof(*dA_displ));
    magma_malloc((void**)&dW0_displ,  batchCount * sizeof(*dW0_displ));
    magma_malloc((void**)&dW1_displ,  batchCount * sizeof(*dW1_displ));
    magma_malloc((void**)&dW2_displ,  batchCount * sizeof(*dW2_displ));
    magma_malloc((void**)&dW3_displ,  batchCount * sizeof(*dW3_displ));
    magma_malloc((void**)&dW4_displ,  batchCount * sizeof(*dW4_displ));
    magma_malloc((void**)&dinvA_array, batchCount * sizeof(*dinvA_array));
    magma_malloc((void**)&dwork_array, batchCount * sizeof(*dwork_array));


    magma_int_t invA_msize = magma_roundup( n, TRI_NB )*TRI_NB;
    magma_int_t dwork_msize = n*nb;
    magma_int_t **pivinfo_array    = NULL;
    magma_int_t *pivinfo           = NULL; 
    double* dinvA      = NULL;
    double* dwork      = NULL;// dinvA and dwork are workspace in dtrsm
    double **cpuAarray = NULL;
    magma_dmalloc( &dinvA, invA_msize * batchCount);
    magma_dmalloc( &dwork, dwork_msize * batchCount );
    magma_malloc((void**)&pivinfo_array, batchCount * sizeof(*pivinfo_array));
    magma_malloc((void**)&pivinfo, batchCount * m * sizeof(magma_int_t));
    magma_malloc_cpu((void**) &cpuAarray, batchCount*sizeof(double*));

   /* check allocation */
    if ( dA_displ  == NULL || dW0_displ == NULL || dW1_displ   == NULL || dW2_displ   == NULL || 
         dW3_displ == NULL || dW4_displ == NULL || dinvA_array == NULL || dwork_array == NULL || 
         dinvA     == NULL || dwork     == NULL || cpuAarray   == NULL || 
         dipiv_displ == NULL || pivinfo_array == NULL || pivinfo == NULL) {
        magma_free(dA_displ);
        magma_free(dW0_displ);
        magma_free(dW1_displ);
        magma_free(dW2_displ);
        magma_free(dW3_displ);
        magma_free(dW4_displ);
        magma_free(dinvA_array);
        magma_free(dwork_array);
        magma_free( dinvA );
        magma_free( dwork );
        magma_free_cpu(cpuAarray);
        magma_free(dipiv_displ);
        magma_free(pivinfo_array);
        magma_free(pivinfo);
        magma_int_t info = MAGMA_ERR_DEVICE_ALLOC;
        magma_xerbla( __func__, -(info) );
        return info;
    }


    magmablas_dlaset_q(MagmaFull, invA_msize, batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dinvA, invA_msize, queue);
    magmablas_dlaset_q(MagmaFull, dwork_msize, batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dwork, dwork_msize, queue);
    dset_pointer(dwork_array, dwork, 1, 0, 0, dwork_msize, batchCount, queue);
    dset_pointer(dinvA_array, dinvA, TRI_NB, 0, 0, invA_msize, batchCount, queue);
    set_ipointer(pivinfo_array, pivinfo, 1, 0, 0, m, batchCount, queue);


    // printf(" I am in dgetrfbatched\n");
    magma_queue_t cstream;
    magmablasGetKernelStream(&cstream);
    magma_int_t streamid;
    const magma_int_t nbstreams=32;
    magma_queue_t stream[nbstreams];
    for(i=0; i<nbstreams; i++){
        magma_queue_create( &stream[i] );
    }
    magma_getvector( batchCount, sizeof(double*), dA_array, 1, cpuAarray, 1);



#if defined(ENABLE_TIMER3)
    printf(" I am after malloc\n");
    talloc = magma_sync_wtime(0) - talloc;
    tloop  = magma_sync_wtime(0);
#endif


    for(i = 0; i < min_mn; i+=nb) 
    {
        magmablasSetKernelStream(NULL);

        ib = min(nb, min_mn-i);
        pm = m-i;
        magma_idisplace_pointers(dipiv_displ, ipiv_array, ldda, i, 0, batchCount, queue);
        magma_ddisplace_pointers(dA_displ, dA_array, ldda, i, i, batchCount, queue);
        //===============================================
        //  panel factorization
        //===============================================
#if 0
        arginfo = magma_dgetf2_batched(
                pm, ib,
                dA_displ, ldda,
                dW1_displ, dW2_displ, dW3_displ,
                dipiv_displ, 
                info_array, i, batchCount, myhandle);   
#else
        arginfo = magma_dgetrf_recpanel_batched(
                pm, ib, 16,
                dA_displ, ldda,
                dipiv_displ, pivinfo_array,
                dwork_array, nb, 
                dinvA_array, invA_msize, 
                dW0_displ, dW1_displ, dW2_displ, 
                dW3_displ, dW4_displ,
                info_array, i, 
                batchCount, myhandle, queue);   
#endif
        if(arginfo != 0 ) goto fin;
        //===============================================
        // end of panel
        //===============================================

#define RUN_ALL
#ifdef RUN_ALL
        // setup pivinfo before adjusting ipiv
        setup_pivinfo_batched(pivinfo_array, dipiv_displ, pm, ib, batchCount, queue);
        adjust_ipiv_batched(dipiv_displ, ib, i, batchCount, queue);

        // stepinit_ipiv(pivinfo_array, pm, batchCount); // debug: creates an identity ipiv to check the swaps


#if 0
        dlaswp_batched( i, dA_displ, ldda,
                i, i+ib,
                dipiv_displ, pivinfo_array, batchCount);
#else
        magma_ddisplace_pointers(dA_displ, dA_array, ldda, i, 0, batchCount, queue);
        magma_ddisplace_pointers(dW0_displ, dA_array, ldda, i, 0, batchCount, queue);
        magma_dlaswp_rowparallel_batched( i, dA_displ, ldda,
                dW0_displ, ldda,
                i, i+ib,
                pivinfo_array, batchCount, queue);

#endif

        if( (i + ib) < n)
        {
            // swap right side and trsm     
            magma_ddisplace_pointers(dA_displ, dA_array, ldda, i, i+ib, batchCount, queue);
            dset_pointer(dwork_array, dwork, nb, 0, 0, dwork_msize, batchCount, queue); // reset dwork_array to the workspace base (likely redundant)
            magma_dlaswp_rowparallel_batched( n-(i+ib), dA_displ, ldda,
                    dwork_array, nb,
                    i, i+ib,
                    pivinfo_array, batchCount, queue);


            magma_ddisplace_pointers(dA_displ, dA_array, ldda, i, i, batchCount, queue);
            magma_ddisplace_pointers(dW0_displ, dA_array, ldda, i, i+ib, batchCount, queue);
            magmablas_dtrsm_outofplace_batched(MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, 1,
                    ib, n-i-ib,
                    MAGMA_D_ONE,
                    dA_displ,    ldda, // dA
                    dwork_array,  nb, // dB
                    dW0_displ,   ldda, // dX
                    dinvA_array,  invA_msize, 
                    dW1_displ,   dW2_displ, 
                    dW3_displ,   dW4_displ,
                    0, batchCount, queue);


            if( (i + ib) < m)
            {    
                // for GEMM sizes > 160, a streamed classical cuBLAS gemm is faster;
                // the batched gemm wins only when M = N <= 160 (measured on a K40c)
                //-------------------------------------------
                //          USE STREAM  GEMM
                //-------------------------------------------
                if( (m-i-ib) > gemm_crossover  && (n-i-ib) > gemm_crossover)   
                { 
                    //printf("caling streamed dgemm %d %d %d \n", m-i-ib, n-i-ib, ib);

                    // since this path uses separate streams, a wait on the TRSM and the
                    // swap would normally be required; it is unnecessary here because
                    // the rest of the code runs on the NULL stream, which synchronizes
                    // implicitly
                    //magma_queue_sync(NULL); 
                    for(k=0; k<batchCount; k++)
                    {
                        streamid = k%nbstreams;                                       
                        magmablasSetKernelStream(stream[streamid]);
                        magma_dgemm(MagmaNoTrans, MagmaNoTrans, 
                                m-i-ib, n-i-ib, ib,
                                neg_one, cpuAarray[k] + (i+ib)+i*ldda, ldda, 
                                         cpuAarray[k] + i+(i+ib)*ldda, ldda,
                                one,     cpuAarray[k] + (i+ib)+(i+ib)*ldda, ldda);
                    }
                    // a sync would be needed here so that dgetf2 does not start before
                    // the update of at least the next panel finishes; it is unnecessary
                    // as long as the rest of the code uses the NULL stream, which
                    // synchronizes implicitly
                    //magma_device_sync(); 
                }
                //-------------------------------------------
                //          USE BATCHED GEMM
                //-------------------------------------------
                else
                {
                    magma_ddisplace_pointers(dA_displ, dA_array,  ldda, i+ib,    i, batchCount, queue);
                    magma_ddisplace_pointers(dW1_displ, dA_array, ldda,    i, i+ib, batchCount, queue);
                    magma_ddisplace_pointers(dW2_displ, dA_array, ldda, i+ib, i+ib, batchCount, queue);
                    //printf("caling batched dgemm %d %d %d \n", m-i-ib, n-i-ib, ib);
                    magmablas_dgemm_batched( MagmaNoTrans, MagmaNoTrans, m-i-ib, n-i-ib, ib, 
                            neg_one, dA_displ, ldda, 
                            dW1_displ, ldda, 
                            one,  dW2_displ, ldda, 
                            batchCount, queue);
                } // end of batched/stream gemm
            } // end of  if( (i + ib) < m) 
        } // end of if( (i + ib) < n)
#endif
    }// end of for

fin:
    magma_queue_sync(NULL);

#if defined(ENABLE_TIMER3)
    tloop   = magma_sync_wtime(0) - tloop;
    tdalloc = magma_sync_wtime(0);

#endif

    for(i=0; i<nbstreams; i++){
        magma_queue_destroy( stream[i] );
    }
    magmablasSetKernelStream(cstream);


#if defined(USE_CUOPT)    
    cublasDestroy_v2(myhandle);
#endif

    magma_free(dA_displ);
    magma_free(dW0_displ);
    magma_free(dW1_displ);
    magma_free(dW2_displ);
    magma_free(dW3_displ);
    magma_free(dW4_displ);
    magma_free(dinvA_array);
    magma_free(dwork_array);
    magma_free( dinvA );
    magma_free( dwork );
    magma_free_cpu(cpuAarray);
    magma_free(dipiv_displ);
    magma_free(pivinfo_array);
    magma_free(pivinfo);

#if defined(ENABLE_TIMER3)
    tdalloc = magma_sync_wtime(0) - tdalloc;
    tall = magma_sync_wtime(0) - tall;
    printf("here is the timing from inside dgetrf_batched talloc: %10.5f  tloop: %10.5f tdalloc: %10.5f tall: %10.5f sum: %10.5f\n", talloc, tloop, tdalloc, tall, talloc+tloop+tdalloc );
#endif
    
    return arginfo;

}
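
/*
    Illustrative usage sketch (not part of the library): drive
    magma_dgetrf_batched on a batch of m-by-n matrices stored in one
    contiguous slab. The pointer-array setup mirrors the dset_pointer /
    set_ipointer calls above, and queue creation follows the one-argument
    form used in this file; matrix initialization and error handling are
    omitted, so treat this as a sketch rather than a canonical driver.
*/
static magma_int_t example_dgetrf_batched_usage(
    magma_int_t m, magma_int_t n, magma_int_t batchCount )
{
    magma_queue_t queue;
    magma_queue_create( &queue );

    magma_int_t min_mn = min( m, n );
    magma_int_t ldda   = m;

    double       *dA;
    magma_int_t  *dipiv, *dinfo;
    double      **dA_array;
    magma_int_t **ipiv_array;

    magma_dmalloc( &dA, ldda * n * batchCount );
    magma_imalloc( &dipiv, min_mn * batchCount );
    magma_imalloc( &dinfo, batchCount );
    magma_malloc( (void**)&dA_array,   batchCount * sizeof(double*) );
    magma_malloc( (void**)&ipiv_array, batchCount * sizeof(magma_int_t*) );

    dset_pointer( dA_array, dA, ldda, 0, 0, ldda*n, batchCount, queue );
    set_ipointer( ipiv_array, dipiv, 1, 0, 0, min_mn, batchCount, queue );

    // ... fill dA with the batch of matrices here ...

    magma_int_t arginfo = magma_dgetrf_batched(
        m, n, dA_array, ldda, ipiv_array, dinfo, batchCount, queue );

    magma_queue_sync( queue );
    // ... L and U are now in dA, pivots in dipiv, per-matrix status in dinfo ...

    magma_free( dA );
    magma_free( dipiv );
    magma_free( dinfo );
    magma_free( dA_array );
    magma_free( ipiv_array );
    magma_queue_destroy( queue );
    return arginfo;
}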