Example #1
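The routine below copies a block-cyclically distributed symmetric panel from multiple GPUs back to the host. It relies on two indexing macros, A and dA, that the excerpt does not show; the definitions below are an assumption, chosen to match MAGMA's usual column-major layout and the leading dimensions (lda, ldda) used in the calls:

/* assumed helper macros (not part of the excerpt): column-major indexing
   into the host matrix a and the per-GPU device buffers dwork[] */
#define  A(i, j)     (a        + (j)*lda  + (i))
#define dA(d, i, j)  (dwork[d] + (j)*ldda + (i))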
extern "C" magma_int_t
magma_ddtohpo(magma_int_t num_gpus, char *uplo, magma_int_t m, magma_int_t n, 
              magma_int_t off_i, magma_int_t off_j, magma_int_t nb, magma_int_t NB,
              double *a,      magma_int_t lda, 
              double **dwork, magma_int_t ldda, 
              cudaStream_t stream[][3], magma_int_t *info) {

      magma_int_t k, stream_id = 1;
      if( lapackf77_lsame(uplo, "U") ) {
        magma_int_t j, jj, jb, mj;

        /* go through each column */
        for (j=off_j+NB; j<n; j+=nb) {
          jj = (j-off_j)/(nb*num_gpus);
          k  = ((j-off_j)/nb)%num_gpus;
          magma_setdevice(k);

          jb = min(nb, (n-j));
          if(j+jb < off_j+m) mj = (j-off_i)+jb;
          else mj = m;
          magma_dgetmatrix_async( mj, jb,
                                  dA(k, 0, jj*nb), ldda,
                                  A(off_i, j),     lda, 
                                  stream[k][stream_id] );
        }
      } else {
        magma_int_t i, ii, ib, ni;

        /* go through each row */
        for(i=off_i+NB; i<m; i+=nb){
          ii = (i-off_i)/(nb*num_gpus);
          k  = ((i-off_i)/nb)%num_gpus;
          magma_setdevice(k);

          ib = min(nb, (m-i));
          if(i+ib < off_i+n) ni = (i-off_i)+ib;
          else ni = n;

          magma_dgetmatrix_async( ib, ni,
                                  dA(k, ii*nb, 0), ldda,
                                  A(i, off_j),     lda, 
                                  stream[k][stream_id] );
        }
      }
      for( k=0; k<num_gpus; k++ ) {
        magma_setdevice(k);
        magma_queue_sync( stream[k][stream_id] );
      }
      magma_setdevice(0);

      return *info;
}
Example #2
void magma_task_dev_dgetmatrix_async_transpose(Schedule* sched_obj)
{
    magma_int_t deviceID;
    magma_int_t m;
    magma_int_t nb;
    double *dA_src;
    magma_int_t dA_LD;
    double *A_dst;
    magma_int_t LDA;
    magma_queue_t stream1;
    double *dwork;
    magma_int_t dwork_LD;
    
    void *dep_ptr;

#if (dbglevel >=1)
    ca_trace_start();
#endif

//    printf("Matrix_get_transpose\n");
    //schedule_unpack_args_8(sched_obj, m,  nb, dA_src,  dA_LD, A_dst,  LDA, dwork, dwork_LD);
    schedule_unpack_args_11(sched_obj,  deviceID, m,  nb, dA_src,  dA_LD, A_dst,  LDA, stream1, dwork, dwork_LD, dep_ptr);

    magma_setdevice(deviceID);
//    printf("Matrix_get_transpose m:%d, nb:%d, dep_ptr:%p\n",m,nb,dep_ptr);

    //magma_dgetmatrix_transpose( m,  nb, dA_src,  dA_LD, A_dst,  LDA);
     // task_getpanel(gpu_nrows, ncols, dAT(K,K+A_N), dAT_LD, A(K,K+A_N), A_LD);
#if (dbglevel==10)
            ca_dbg_printMat_transpose_gpu(nb, m, dA_src, dA_LD, "dA before getMatrix");
#endif
//            pthread_mutex_lock(&mutex_dAP);
    /*1.transpose dA to dwork*/
    magmablas_dtranspose2s(dwork, dwork_LD, dA_src, dA_LD,  nb, m, compute_stream[deviceID]);
//    printf("Matrix_get_transpose m:%d, nb:%d 1:done\n",m,nb);
#if (dbglevel==10)
//            ca_dbg_printMat_gpu(m, nb, dwork, dwork_LD, "dwork after dA transpose");
#endif

    magma_queue_sync(compute_stream[deviceID]);

    /*2. copy dwork to A: send the panel to the host*/
    magma_dgetmatrix_async(m, nb, dwork, dwork_LD, A_dst, LDA, stream1);
//    printf("Matrix_get_transpose m:%d, nb:%d 2:done\n",m,nb);
//    pthread_mutex_unlock(&mutex_dAP);

    magma_queue_sync(stream1); // make sure the copy out of dwork completes before dwork is reused

#if (dbglevel==10)
            ca_dbg_printMat(m, nb, A_dst, LDA, "A after getMatrix");
#endif 
#if (dbglevel >=1)
    ca_trace_end_gpu('G');
    ca_trace_end_cpu('C');
#endif

//    printf("End Matrix_get_transpose m:%d, nb:%d\n",m,nb);
}
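Stripped of the scheduler unpacking, tracing, and debug prints, the task body above is a two-step device-to-host transfer of a transposed panel: transpose into a device workspace on the compute stream, then copy the workspace to the host on a separate stream. A minimal sketch of that core pattern, reusing the same calls and names as above:

    /* sketch: the core pattern of the task above, without the plumbing */
    magmablas_dtranspose2s(dwork, dwork_LD, dA_src, dA_LD, nb, m,
                           compute_stream[deviceID]);  /* dwork = dA_src^T */
    magma_queue_sync(compute_stream[deviceID]);        /* transpose finished */
    magma_dgetmatrix_async(m, nb, dwork, dwork_LD, A_dst, LDA, stream1);
    magma_queue_sync(stream1);                         /* panel now on host */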
Example #3
extern "C" magma_int_t 
magma_dsytrf_nopiv(magma_uplo_t uplo, magma_int_t n, 
                   double *A, magma_int_t lda, 
                   magma_int_t *info)
{
/*  -- MAGMA (version 1.6.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2011

    Purpose   
    =======   

    DSYTRF_nopiv computes the LDLt factorization of a real symmetric   
    matrix A. This version does not require work space on the GPU passed 
    as input. GPU memory is allocated in the routine.

    The factorization has the form   
       A = U**H * D * U,  if UPLO = 'U', or
       A = L * D * L**H,  if UPLO = 'L',
    where U is an upper triangular matrix, L is lower triangular, and
    D is a diagonal matrix.

    This is the block version of the algorithm, calling Level 3 BLAS.   

    Arguments   
    =========   

    UPLO    (input) CHARACTER*1   
            = 'U':  Upper triangle of A is stored;   
            = 'L':  Lower triangle of A is stored.   

    N       (input) INTEGER   
            The order of the matrix A.  N >= 0.   

    A       (input/output) DOUBLE_PRECISION array, dimension (LDA,N)   
            On entry, the symmetric matrix A.  If UPLO = 'U', the leading   
            N-by-N upper triangular part of A contains the upper   
            triangular part of the matrix A, and the strictly lower   
            triangular part of A is not referenced.  If UPLO = 'L', the   
            leading N-by-N lower triangular part of A contains the lower   
            triangular part of the matrix A, and the strictly upper   
            triangular part of A is not referenced.   

            On exit, if INFO = 0, the factor U or L from the
            factorization A = U**H * D * U or A = L * D * L**H.

            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using cudaMallocHost.

    LDA     (input) INTEGER   
            The leading dimension of the array A.  LDA >= max(1,N).   

    INFO    (output) INTEGER   
            = 0:  successful exit   
            < 0:  if INFO = -i, the i-th argument had an illegal value 
                  if INFO = -6, the GPU memory allocation failed 
            > 0:  if INFO = i, the leading minor of order i is not   
                  positive definite, and the factorization could not be   
                  completed.   

    =====================================================================    */
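    /* The body below uses A, dA, dW, and dWt indexing macros that are not
       part of this excerpt. The definitions here are an assumption, chosen
       to be consistent with the leading dimensions passed in the calls
       (dW is the nb*ldda workspace; dWt views it with leading dimension nb). */
    #define  A(i, j)    ( A + (j)*lda  + (i))
    #define dA(i, j)    (dA + (j)*ldda + (i))
    #define dW(i, j)    (dW + (j)*ldda + (i))
    #define dWt(i, j)   (dW + (j)*nb   + (i))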


    /* Local variables */
    double zone  = MAGMA_D_ONE;
    double mzone = MAGMA_D_NEG_ONE;
    int                upper = (uplo == MagmaUpper);
    magma_int_t j, k, jb, ldda, nb, ib, iinfo;
    magmaDouble_ptr dA;
    magmaDouble_ptr dW;

    *info = 0;
    if (! upper && uplo != MagmaLower) {
      *info = -1;
    } else if (n < 0) {
      *info = -2;
    } else if (lda < max(1,n)) {
      *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return MAGMA_ERR_ILLEGAL_VALUE;
    }

    /* Quick return */
    if ( n == 0 )
      return MAGMA_SUCCESS;

    ldda = ((n+31)/32)*32;
    nb = magma_get_dsytrf_nopiv_nb(n);
    ib = min(32, nb); // inner-block for diagonal factorization

    if ((MAGMA_SUCCESS != magma_dmalloc(&dA, n *ldda)) ||
        (MAGMA_SUCCESS != magma_dmalloc(&dW, nb*ldda))) {
        /* allocation failed; report the error */
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    magma_queue_t stream[2];
    magma_event_t event;
    magma_queue_create(&stream[0]);
    magma_queue_create(&stream[1]);
    magma_event_create( &event );
    trace_init( 1, 1, 2, (CUstream_st**)stream );

    //if (nb <= 1 || nb >= n) 
    //{
    //    lapackf77_dpotrf(uplo_, &n, a, &lda, info);
    //} else 
    {
        /* Use hybrid blocked code. */
        if (upper) {
            //=========================================================
            // Compute the LDLt factorization A = U'*D*U without pivoting.
            // copy matrix to GPU
            for (j=0; j<n; j+=nb) {
                jb = min(nb, (n-j));
                trace_gpu_start( 0, 0, "set", "set" );
                magma_dsetmatrix_async(j+jb, jb, A(0, j), lda, dA(0, j), ldda, stream[0]);
                trace_gpu_end( 0, 0 );
            }

            // main loop
            for (j=0; j<n; j += nb) {
                jb = min(nb, (n-j));

                // copy A(j,j) back to CPU
                trace_gpu_start( 0, 0, "get", "get" );
                magma_dgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, stream[0]);
                trace_gpu_end( 0, 0 );

                // copy j-th column of U back to CPU
                magma_queue_wait_event( stream[1], event );
                trace_gpu_start( 0, 1, "get", "get" );
                magma_dgetmatrix_async(j, jb, dA(0, j), ldda, A(0, j), lda, stream[1]);
                trace_gpu_end( 0, 1 );

                // factorize the diagonal block
                magma_queue_sync(stream[0]);
                trace_cpu_start( 0, "potrf", "potrf" );
                dsytrf_nopiv_cpu(MagmaUpper, jb, ib, A(j, j), lda, info);
                trace_cpu_end( 0 );
                if (*info != 0){
                    *info = *info + j;
                    break;
                }

                // copy A(j,j) back to GPU
                trace_gpu_start( 0, 0, "set", "set" );
                magma_dsetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, stream[0]);
                trace_gpu_end( 0, 0 );
                
                if ( (j+jb) < n) {
                    // compute the off-diagonal blocks of current block column
                    magmablasSetKernelStream( stream[0] );
                    trace_gpu_start( 0, 0, "trsm", "trsm" );
                    magma_dtrsm(MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaUnit, 
                                jb, (n-j-jb), 
                                zone, dA(j, j),    ldda, 
                                      dA(j, j+jb), ldda);
                    magma_dcopymatrix( jb, n-j-jb, dA( j, j+jb ), ldda, dWt( 0, j+jb ), nb );

                    // update the trailing submatrix with D
                    magmablas_dlascl_diag(MagmaUpper, jb, n-j-jb,
                                          dA(j,    j), ldda,
                                          dA(j, j+jb), ldda,
                                          &iinfo);
                    magma_event_record( event, stream[0] );
                    trace_gpu_end( 0, 0 );

                    // update the trailing submatrix with U and W
                    trace_gpu_start( 0, 0, "gemm", "gemm" );
                    for (k=j+jb; k<n; k+=nb)
                    {
                        magma_int_t kb = min(nb,n-k);
                        magma_dgemm(MagmaConjTrans, MagmaNoTrans, kb, n-k, jb,
                                    mzone, dWt(0, k), nb, 
                                           dA(j, k), ldda,
                                    zone,  dA(k, k), ldda);
                    }
                    trace_gpu_end( 0, 0 );
                }
            }
        } else {
            //=========================================================
            // Compute the LDLt factorization A = L*D*L' without pivoting.
            // copy the matrix to GPU
            for (j=0; j<n; j+=nb) {
                jb = min(nb, (n-j));
                trace_gpu_start( 0, 0, "set", "set" );
                magma_dsetmatrix_async((n-j), jb, A(j, j), lda, dA(j, j), ldda, stream[0]);
                trace_gpu_end( 0, 0 );
            }

            // main loop
            for (j=0; j<n; j+=nb) {
                jb = min(nb, (n-j));

                // copy A(j,j) back to CPU
                trace_gpu_start( 0, 0, "get", "get" );
                magma_dgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, stream[0]);
                trace_gpu_end( 0, 0 );

                // copy j-th row of L back to CPU
                magma_queue_wait_event( stream[1], event );
                trace_gpu_start( 0, 1, "get", "get" );
                magma_dgetmatrix_async(jb, j, dA(j, 0), ldda, A(j, 0), lda, stream[1]);
                trace_gpu_end( 0, 1 );

                // factorize the diagonal block
                magma_queue_sync(stream[0]);
                trace_cpu_start( 0, "potrf", "potrf" );
                dsytrf_nopiv_cpu(MagmaLower, jb, ib, A(j, j), lda, info);
                trace_cpu_end( 0 );
                if (*info != 0){
                    *info = *info + j;
                    break;
                }
                // copy A(j,j) back to GPU
                trace_gpu_start( 0, 0, "set", "set" );
                magma_dsetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, stream[0]);
                trace_gpu_end( 0, 0 );
                
                if ( (j+jb) < n) {
                    // compute the off-diagonal blocks of current block column
                    magmablasSetKernelStream( stream[0] );
                    trace_gpu_start( 0, 0, "trsm", "trsm" );
                    magma_dtrsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaUnit, 
                                (n-j-jb), jb, 
                                zone, dA(j,    j), ldda, 
                                      dA(j+jb, j), ldda);
                    magma_dcopymatrix( n-j-jb,jb, dA( j+jb, j ), ldda, dW( j+jb, 0 ), ldda );

                    // update the trailing submatrix with D
                    magmablas_dlascl_diag(MagmaLower, n-j-jb, jb,
                                          dA(j,    j), ldda,
                                          dA(j+jb, j), ldda,
                                          &iinfo);
                    magma_event_record( event, stream[0] );
                    trace_gpu_end( 0, 0 );

                    // update the trailing submatrix with L and W
                    trace_gpu_start( 0, 0, "gemm", "gemm" );
                    for (k=j+jb; k<n; k+=nb)
                    {
                        magma_int_t kb = min(nb,n-k);
                        magma_dgemm(MagmaNoTrans, MagmaConjTrans, n-k, kb, jb,
                                    mzone, dA(k, j), ldda, 
                                           dW(k, 0), ldda,
                                    zone,  dA(k, k), ldda);
                    }
                    trace_gpu_end( 0, 0 );
                }
            }
        }
    }
    
    trace_finalize( "dsytrf.svg","trace.css" );
    magma_queue_destroy(stream[0]);
    magma_queue_destroy(stream[1]);
    magma_event_destroy( event );
    magma_free(dW);
    magma_free(dA);
    
    return MAGMA_SUCCESS;
} /* magma_dsytrf_nopiv */
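A minimal calling sketch for magma_dsytrf_nopiv, assuming MAGMA is already initialized; pinned host memory is used because the documentation above recommends it for performance:

magma_int_t ldlt_example( magma_int_t n )
{
    double *A;
    magma_int_t info, lda = n;
    magma_dmalloc_pinned( &A, lda*n );   /* pinned host memory, per the docs */
    /* ... fill A with a symmetric matrix (lower triangle referenced) ... */
    magma_dsytrf_nopiv( MagmaLower, n, A, lda, &info );
    /* info = 0 on success; info > 0 reports the order of the leading
       minor at which the factorization failed */
    magma_free_pinned( A );
    return info;
}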
Example #4
/**
    Purpose
    -------
    DLAHR2 reduces the first NB columns of a real general n-by-(n-k+1)
    matrix A so that elements below the k-th subdiagonal are zero. The
    reduction is performed by an orthogonal similarity transformation
    Q' * A * Q. The routine returns the matrices V and T which determine
    Q as a block reflector I - V*T*V', and also the matrix Y = A * V.
    (Note this is different from LAPACK, which computes Y = A * V * T.)

    This is an auxiliary routine called by DGEHRD.

    Arguments
    ---------
    @param[in]
    n       INTEGER
            The order of the matrix A.

    @param[in]
    k       INTEGER
            The offset for the reduction. Elements below the k-th
            subdiagonal in the first NB columns are reduced to zero.
            K < N.

    @param[in]
    nb      INTEGER
            The number of columns to be reduced.

    @param[in,out]
    A       DOUBLE_PRECISION array, dimension (LDA,N-K+1)
            On entry, the n-by-(n-k+1) general matrix A.
            On exit, the elements on and above the k-th subdiagonal in
            the first NB columns are overwritten with the corresponding
            elements of the reduced matrix; the elements below the k-th
            subdiagonal, with the array TAU, represent the matrix Q as a
            product of elementary reflectors. The other columns of A are
            unchanged. See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    tau     DOUBLE_PRECISION array, dimension (NB)
            The scalar factors of the elementary reflectors. See Further
            Details.

    @param[out]
    T       DOUBLE_PRECISION array, dimension (LDT,NB)
            The upper triangular matrix T.

    @param[in]
    ldt     INTEGER
            The leading dimension of the array T.  LDT >= NB.

    @param[out]
    Y       DOUBLE_PRECISION array, dimension (LDY,NB)
            The n-by-nb matrix Y.

    @param[in]
    ldy     INTEGER
            The leading dimension of the array Y. LDY >= N.

    @param[in,out]
    data    Structure with pointers to dA, dT, dV, dW, dY
            which are distributed across multiple GPUs.

    Further Details
    ---------------
    The matrix Q is represented as a product of nb elementary reflectors

       Q = H(1) H(2) . . . H(nb).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in
    A(i+k+1:n,i), and tau in TAU(i).

    The elements of the vectors v together form the (n-k+1)-by-nb matrix
    V which is needed, with T and Y, to apply the transformation to the
    unreduced part of the matrix, using an update of the form:
    A := (I - V*T*V') * (A - Y*T*V').

    The contents of A on exit are illustrated by the following example
    with n = 7, k = 3 and nb = 2:

    @verbatim
       ( a   a   a   a   a )
       ( a   a   a   a   a )
       ( a   a   a   a   a )
       ( h   h   a   a   a )
       ( v1  h   a   a   a )
       ( v1  v2  a   a   a )
       ( v1  v2  a   a   a )
    @endverbatim

    where "a" denotes an element of the original matrix A, h denotes a
    modified element of the upper Hessenberg matrix H, and vi denotes an
    element of the vector defining H(i).

    This implementation follows the hybrid algorithm and notations described in

    S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg
    form through hybrid GPU-based computing," University of Tennessee Computer
    Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219),
    May 24, 2009.

    @ingroup magma_dgeev_aux
    ********************************************************************/
extern "C" magma_int_t
magma_dlahr2_m(
    magma_int_t n, magma_int_t k, magma_int_t nb,
    double *A, magma_int_t lda,
    double *tau,
    double *T, magma_int_t ldt,
    double *Y, magma_int_t ldy,
    struct dgehrd_data* data )
{
    #define  A(  i, j ) ( A + (i) + (j)*lda)
    #define  Y(  i, j ) ( Y + (i) + (j)*ldy)
    #define  T(  i, j ) ( T + (i) + (j)*ldt)
    #define dA(  d, i, j ) (data->A [d] + (i) + (j)*ldda)
    #define dTi( d       ) (data->Ti[d])
    #define dV(  d, i, j ) (data->V [d] + (i) + (j)*ldv )
    #define dVd( d, i, j ) (data->Vd[d] + (i) + (j)*ldvd)
    #define dY(  d, i, j ) (data->Y [d] + (i) + (j)*ldda)

    double c_zero    = MAGMA_D_ZERO;
    double c_one     = MAGMA_D_ONE;
    double c_neg_one = MAGMA_D_NEG_ONE;
    double tmp;

    magma_int_t ngpu = data->ngpu;
    magma_int_t ldda = data->ldda;
    magma_int_t ldv  = data->ldv;
    magma_int_t ldvd = data->ldvd;
    
    magma_int_t ione = 1;
    
    magma_int_t d, dki1, dn, nblocks, gblock, lblock, lgid;
    magma_int_t n_k_i_1, n_k;
    double scale;

    magma_int_t i;
    double ei = MAGMA_D_ZERO;

    magma_int_t info_data = 0;
    magma_int_t *info = &info_data;
    if (n < 0) {
        *info = -1;
    } else if (k < 0 || k >= n) {
        *info = -2;
    } else if (nb < 1 || nb > n) {
        *info = -3;
    } else if (lda < max(1,n)) {
        *info = -5;
    } else if (ldt < nb) {
        *info = -8;
    } else if (ldy < max(1,n)) {
        *info = -10;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    
    // adjust from 1-based indexing
    k -= 1;

    // Function Body
    if (n <= 1)
        return 0;
    
    // zero out current top block of V on all GPUs
    for( d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magmablasSetKernelStream( data->streams[d] );
        magmablas_dlaset( MagmaFull, nb, nb, c_zero, c_zero, dV(d,k,0), ldv );
    }
    
    // set all Y=0
    lapackf77_dlaset( "Full", &n, &nb, &c_zero, &c_zero, Y, &ldy );
    
    for (i = 0; i < nb; ++i) {
        n_k_i_1 = n - k - i - 1;
        n_k     = n - k;
        
        if (i > 0) {
            // Finish applying I - V * T * V' on right
            tmp = MAGMA_D_NEGATE( tau[i-1] );
            blasf77_daxpy( &n_k, &tmp, Y(k,i-1), &ione, A(k,i), &ione );
            
            // Apply I - V * T' * V' to this column (call it b) from the
            // left, using the last column of T as workspace, w.
            //
            // Let  V = ( V1 )   and   b = ( b1 )   (first i-1 rows)
            //          ( V2 )             ( b2 )
            // where V1 is unit lower triangular
            
            // w := b1 = A(k+1:k+i, i)
            blasf77_dcopy( &i,
                           A(k+1,i), &ione,
                           T(0,nb-1), &ione );
            
            // w := V1' * b1 = VA(k+1:k+i, 0:i-1)' * w
            blasf77_dtrmv( "Lower", "Conj", "Unit", &i,
                           A(k+1,0), &lda,
                           T(0,nb-1), &ione );
            
            // w := w + V2'*b2 = w + VA(k+i+1:n-1, 0:i-1)' * A(k+i+1:n-1, i)
            blasf77_dgemv( "Conj", &n_k_i_1, &i,
                           &c_one, A(k+i+1,0), &lda,
                                   A(k+i+1,i), &ione,
                           &c_one, T(0,nb-1), &ione );
            
            // w := T'*w = T(0:i-1, 0:i-1)' * w
            blasf77_dtrmv( "Upper", "Conj", "Non-unit", &i,
                           T(0,0), &ldt,
                           T(0,nb-1), &ione );
            
            // b2 := b2 - V2*w = A(k+i+1:n-1, i) - VA(k+i+1:n-1, 0:i-1) * w
            blasf77_dgemv( "No trans", &n_k_i_1, &i,
                           &c_neg_one, A(k+i+1,0), &lda,
                                       T(0,nb-1), &ione,
                           &c_one,     A(k+i+1,i), &ione );
            
            // w := V1*w = VA(k+1:k+i, 0:i-1) * w
            blasf77_dtrmv( "Lower", "No trans", "Unit", &i,
                           A(k+1,0), &lda,
                           T(0,nb-1), &ione );
            
            // b1 := b1 - w = A(k+1:k+i-1, i) - w
            blasf77_daxpy( &i,
                           &c_neg_one, T(0,nb-1), &ione,
                                       A(k+1,i), &ione );
            
            // Restore diagonal element, saved below during previous iteration
            *A(k+i,i-1) = ei;
        }
        
        // Generate the elementary reflector H(i) to annihilate A(k+i+1:n-1,i)
        lapackf77_dlarfg( &n_k_i_1,
                          A(k+i+1,i),
                          A(k+i+2,i), &ione, &tau[i] );
        // Save diagonal element and set to one, to simplify multiplying by V
        ei = *A(k+i+1,i);
        *A(k+i+1,i) = c_one;

        // compute yi = A vi = sum_g A{d} vi{d}
        nblocks = (n-1) / nb / ngpu + 1;
        for( d = 0; d < ngpu; ++d ) {
            magma_setdevice( d );
            magmablasSetKernelStream( data->streams[d] );
            
            // dV(k+i+1:n-1, i) = VA(k+i:n, i)
            magma_dsetvector_async( n_k_i_1,
                                    A(k+i+1,i), 1,
                                    dV(d, k+i+1, i), 1, data->streams[d] );
            
            // copy column of dV -> dVd, using block cyclic distribution.
            // This assumes V and Vd have been padded so that
            // a 2D matrix copy doesn't access them out-of-bounds
            gblock = k / nb;
            lblock = gblock / ngpu;
            lgid   = gblock % ngpu;
            if ( d < lgid ) {
                lblock += 1;
            }
            // treat V as (nb*ngpu) x nblock matrix, and Vd as nb x nblock matrix
            magmablas_dlacpy( MagmaFull, nb, nblocks-lblock,
                              dV (d, d*nb + lblock*nb*ngpu, i), nb*ngpu,
                              dVd(d, 0    + lblock*nb,      i), nb );
            
            // convert global indices (k) to local indices (dk)
            magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn );
            
            // dY(k:n, i) = dA(k:n, k+i+1:n) * dV(k+i+1:n, i)
            // skip if matrix is empty
            // each GPU copies to different temporary vector in Y,
            // which are summed in separate loop below
            if ( dn-dki1 > 0 ) {
                magma_dgemv( MagmaNoTrans, n-k, dn-dki1,
                             c_one,  dA (d, k,    dki1), ldda,
                                     dVd(d, dki1,    i), 1,
                             c_zero, dY (d, k,       i), 1 );
                
                // copy vector to host, storing in column nb+d of Y
                // as temporary space (Y has >= nb+ngpu columns)
                magma_dgetvector_async( n-k,
                                        dY(d, k, i), 1,
                                        Y(k, nb+d),  1, data->streams[d] );
            }
        }
        
        // while GPU is doing above Ag*v...
        // Compute T(0:i,i) = [ -tau T V' vi ]
        //                    [  tau         ]
        // T(0:i-1, i) = -tau VA(k+i+1:n-1, 0:i-1)' VA(k+i+1:n-1, i)
        scale = MAGMA_D_NEGATE( tau[i] );
        blasf77_dgemv( "Conj", &n_k_i_1, &i,
                       &scale,  A(k+i+1,0), &lda,
                                A(k+i+1,i), &ione,
                       &c_zero, T(0,i), &ione );
        // T(0:i-1, i) = T(0:i-1, 0:i-1) * T(0:i-1, i)
        blasf77_dtrmv( "Upper", "No trans", "Non-unit", &i,
                       T(0,0), &ldt,
                       T(0,i), &ione );
        *T(i,i) = tau[i];
        
        // apply reflectors to next column, A(i+1), on right only.
        // one axpy will be required to finish this, in the next iteration above
        if ( i > 0 && i+1 < nb ) {
            // Update next column, A(k:n,i+1), applying Q on right.
            // One axpy will be required to finish this, in the next iteration
            // above, after yi is computed.
            // This updates one more row than LAPACK does (row k),
            // making block above panel an even multiple of nb.
            // Use last column of T as workspace, w.
            magma_int_t i1 = i+1;
            
            // If complex, conjugate row of V, and undo afterwards
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv( &i1,  A(k+i1,0), &lda );
            #endif
            // w = T(0:i, 0:i+1) * VA(k+i+1, 0:i+1)'
            // T is now rectangular, so we use gemv instead of trmv as in lapack.
            blasf77_dgemv( "No trans", &i, &i1,
                           &c_one,  T(0,0), &ldt,
                                    A(k+i1,0), &lda,
                           &c_zero, T(0,nb-1), &ione );
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv( &i1,  A(k+i1,0), &lda );
            #endif
            
            // A(k:n, i+1) -= Y(k:n, 0:i) * w
            blasf77_dgemv( "No trans", &n_k, &i,
                           &c_neg_one, Y(k,0), &ldy,
                                       T(0,nb-1), &ione,
                           &c_one,     A(k,i1), &ione );
        }
        
        // yi = sum_g yi{d}
        for( d = 0; d < ngpu; ++d ) {
            magma_setdevice( d );
            magma_queue_sync( data->streams[d] );
            magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn );
            if ( dn-dki1 > 0 ) {
                // yi = yi + yi{d}
                blasf77_daxpy( &n_k, &c_one, Y(k,nb+d), &ione, Y(k,i), &ione );
            }
        }
    }
    // Restore diagonal element
    *A(k+nb,nb-1) = ei;
    
    // compute Y = Am V = sum_g Am{d} V{d} --- top part, Y(0:k-1,:)
    for( d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magmablasSetKernelStream( data->streams[d] );
        
        // convert global indices (k) to local indices (dk)
        magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn );
        
        // dY(0:k, :) = dA(0:k, k+i+1:n-1) * dV(k+i+1:n-1, :)
        // skip if matrix is empty
        // each GPU copies to different temporary block in Y,
        // which are summed in separate loop below
        if ( dn-dki1 > 0 ) {
            magma_dgemm( MagmaNoTrans, MagmaNoTrans, k, nb, dn-dki1,
                         c_one,  dA (d, 0,    dki1), ldda,
                                 dVd(d, dki1,    0), ldvd,
                         c_zero, dY (d, 0,       0), ldda );
            
            // copy result to host, storing in columns [nb + nb*d : nb + nb*(d+1)] of Y
            // as temporary space (Y has nb + nb*ngpu columns)
            magma_dgetmatrix_async( k, nb,
                                    dY(d, 0, 0),  ldda,
                                    Y(0,nb+nb*d), ldy, data->streams[d] );
        }
    }
    
    // Y = sum_g Y{d}
    for( d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magma_queue_sync( data->streams[d] );  // wait for this GPU's async copy, as in the yi loop above
        magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn );
        if ( dn-dki1 > 0 ) {
            // Y = Y + Am V
            for( i = 0; i < nb; ++i ) {
                blasf77_daxpy( &k, &c_one, Y(0,nb+nb*d+i), &ione, Y(0,i), &ione );
            }
        }
    }
    
    // copy Y and T matrices to GPUs
    for( d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magma_dsetmatrix_async( n, nb, Y, ldy, dY(d, 0, 0), ldda, data->streams[d] );
        magma_dsetmatrix_async( nb, nb, T, nb, dTi(d),      nb,   data->streams[d] );
    }

    return 0;
} /* magma_dlahr2 */
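For context, magma_dlahr2_m is driven one panel at a time from magma_dgehrd_m (Example #5 below); the call there passes the current panel, the matching nb-wide slice of T, and the shared multi-GPU data structure:

    // from the main loop of magma_dgehrd_m (Example #5); i+1 converts to
    // the 1-based index this routine expects
    magma_dlahr2_m( ihi, i+1, nb, A(0,i), lda,
                    &tau[i], &T[i*nb], nb, work, n, &data );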
Example #5
/**
    Purpose
    -------
    DGEHRD reduces a DOUBLE_PRECISION general matrix A to upper Hessenberg form H by
    an orthogonal similarity transformation:  Q' * A * Q = H. This version
    stores the triangular matrices used in the factorization so that they can
    be applied directly (i.e., without being recomputed) later. As a result,
    the application of Q is much faster.

    Arguments
    ---------
    @param[in]
    n       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in]
    ilo     INTEGER
    @param[in]
    ihi     INTEGER
            It is assumed that A is already upper triangular in rows
            and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally
            set by a previous call to DGEBAL; otherwise they should be
            set to 1 and N respectively. See Further Details.
            1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0.

    @param[in,out]
    A       DOUBLE_PRECISION array, dimension (LDA,N)
            On entry, the N-by-N general matrix to be reduced.
            On exit, the upper triangle and the first subdiagonal of A
            are overwritten with the upper Hessenberg matrix H, and the
            elements below the first subdiagonal, with the array TAU,
            represent the orthogonal matrix Q as a product of elementary
            reflectors. See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    tau     DOUBLE_PRECISION array, dimension (N-1)
            The scalar factors of the elementary reflectors (see Further
            Details). Elements 1:ILO-1 and IHI:N-1 of TAU are set to
            zero.

    @param[out]
    work    (workspace) DOUBLE_PRECISION array, dimension (LWORK)
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The length of the array WORK.  LWORK >= max(1,N).
            For optimum performance LWORK >= N*NB, where NB is the
            optimal blocksize.
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    @param[out]
    T       DOUBLE_PRECISION array, dimension NB*N,
            where NB is the optimal blocksize. It stores the NB*NB blocks
            of the triangular T matrices used in the reduction.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value.

    Further Details
    ---------------
    The matrix Q is represented as a product of (ihi-ilo) elementary
    reflectors

        Q = H(ilo) H(ilo+1) . . . H(ihi-1).

    Each H(i) has the form

        H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on
    exit in A(i+2:ihi,i), and tau in TAU(i).

    The contents of A are illustrated by the following example, with
    n = 7, ilo = 2 and ihi = 6:

    @verbatim
    on entry,                        on exit,

    ( a   a   a   a   a   a   a )    (  a   a   h   h   h   h   a )
    (     a   a   a   a   a   a )    (      a   h   h   h   h   a )
    (     a   a   a   a   a   a )    (      h   h   h   h   h   h )
    (     a   a   a   a   a   a )    (      v2  h   h   h   h   h )
    (     a   a   a   a   a   a )    (      v2  v3  h   h   h   h )
    (     a   a   a   a   a   a )    (      v2  v3  v4  h   h   h )
    (                         a )    (                          a )
    @endverbatim

    where a denotes an element of the original matrix A, h denotes a
    modified element of the upper Hessenberg matrix H, and vi denotes an
    element of the vector defining H(i).

    This implementation follows the hybrid algorithm and notations described in

    S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg
    form through hybrid GPU-based computing," University of Tennessee Computer
    Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219),
    May 24, 2009.

    This version stores the T matrices, for later use in magma_dorghr.

    @ingroup magma_dgeev_comp
    ********************************************************************/
extern "C" magma_int_t
magma_dgehrd_m(
    magma_int_t n, magma_int_t ilo, magma_int_t ihi,
    double *A, magma_int_t lda,
    double *tau,
    double *work, magma_int_t lwork,
    double *T,
    magma_int_t *info)
{
    #define  A( i, j )    (A + (i) + (j)*lda)
    #define dA( d, i, j ) (data.A[d] + (i) + (j)*ldda)

    double c_one  = MAGMA_D_ONE;
    double c_zero = MAGMA_D_ZERO;

    magma_int_t nb = magma_get_dgehrd_nb(n);

    magma_int_t nh, iws, ldda, min_lblocks, max_lblocks, last_dev, d;
    magma_int_t dpanel, di, nlocal, i, i2, ib, ldwork;
    magma_int_t iinfo;
    magma_int_t lquery;
    struct dgehrd_data data;

    magma_int_t ngpu = magma_num_gpus();
    
    *info = 0;
    iws = n*(nb + nb*ngpu);
    work[0] = MAGMA_D_MAKE( iws, 0 );

    lquery = (lwork == -1);
    if (n < 0) {
        *info = -1;
    } else if (ilo < 1 || ilo > max(1,n)) {
        *info = -2;
    } else if (ihi < min(ilo,n) || ihi > n) {
        *info = -3;
    } else if (lda < max(1,n)) {
        *info = -5;
    } else if (lwork < max(1,n) && ! lquery) {
        *info = -8;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;

    // Adjust from 1-based indexing
    ilo -= 1;
    
    // Quick return if possible
    nh = ihi - ilo;
    if (nh <= 1) {
        work[0] = c_one;
        return *info;
    }
    
    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );

    // Set elements 0:ILO-1 and IHI-1:N-2 of TAU to zero
    for (i = 0; i < ilo; ++i)
        tau[i] = c_zero;

    for (i = max(0,ihi-1); i < n-1; ++i)
        tau[i] = c_zero;

    // set T to zero
    lapackf77_dlaset( "Full", &nb, &n, &c_zero, &c_zero, T, &nb );

    // set to null, to simplify cleanup code
    for( d = 0; d < ngpu; ++d ) {
        data.A[d]       = NULL;
        data.streams[d] = NULL;
    }
    
    // If not enough workspace, use unblocked code
    if ( lwork < iws ) {
        nb = 1;
    }

    if (nb == 1 || nb >= nh) {
        // Use unblocked code below
        i = ilo;
    }
    else {
        // Use blocked code
        // allocate memory on GPUs for A and workspaces
        ldda = ((n+31)/32)*32;
        min_lblocks = (n     / nb) / ngpu;
        max_lblocks = ((n-1) / nb) / ngpu + 1;
        last_dev    = (n     / nb) % ngpu;
        
        // V and Vd need to be padded for copying in mdlahr2
        data.ngpu = ngpu;
        data.ldda = ldda;
        data.ldv  = nb*max_lblocks*ngpu;
        data.ldvd = nb*max_lblocks;
        
        for( d = 0; d < ngpu; ++d ) {
            magma_setdevice( d );
            nlocal = min_lblocks*nb;
            if ( d < last_dev ) {
                nlocal += nb;
            }
            else if ( d == last_dev ) {
                nlocal += (n % nb);
            }
            
            ldwork = nlocal*ldda   // A
                   + nb*data.ldv   // V
                   + nb*data.ldvd  // Vd
                   + nb*ldda       // Y
                   + nb*ldda       // W
                   + nb*nb;        // Ti
            if ( MAGMA_SUCCESS != magma_dmalloc( &data.A[d], ldwork )) {
                *info = MAGMA_ERR_DEVICE_ALLOC;
                goto CLEANUP;
            }
            data.V [d] = data.A [d] + nlocal*ldda;
            data.Vd[d] = data.V [d] + nb*data.ldv;
            data.Y [d] = data.Vd[d] + nb*data.ldvd;
            data.W [d] = data.Y [d] + nb*ldda;
            data.Ti[d] = data.W [d] + nb*ldda;
            
            magma_queue_create( &data.streams[d] );
        }
        
        // Copy the matrix to GPUs
        magma_dsetmatrix_1D_col_bcyclic( n, n, A, lda, data.A, ldda, ngpu, nb );
        
        // round ilo down to block boundary
        ilo = (ilo/nb)*nb;
        for (i = ilo; i < ihi - 1 - nb; i += nb) {
            //   Reduce columns i:i+nb-1 to Hessenberg form, returning the
            //   matrices V and T of the block reflector H = I - V*T*V'
            //   which performs the reduction, and also the matrix Y = A*V*T
            
            //   Get the current panel (no need for the 1st iteration)
            dpanel =  (i / nb) % ngpu;
            di     = ((i / nb) / ngpu) * nb;
            if ( i > ilo ) {
                magma_setdevice( dpanel );
                magma_dgetmatrix( ihi-i, nb,
                                  dA(dpanel, i, di), ldda,
                                  A(i,i),            lda );
            }
            
            // add 1 to i for 1-based index
            magma_dlahr2_m( ihi, i+1, nb, A(0,i), lda,
                            &tau[i], &T[i*nb], nb, work, n, &data );
            
            magma_dlahru_m( n, ihi, i, nb, A, lda, &data );
            
            // copy first i rows above panel to host
            magma_setdevice( dpanel );
            magma_dgetmatrix_async( i, nb,
                                    dA(dpanel, 0, di), ldda,
                                    A(0,i),            lda, data.streams[dpanel] );
        }
        
        // Copy remainder to host, block-by-block
        for( i2 = i; i2 < n; i2 += nb ) {
            ib = min( nb, n-i2 );
            d  = (i2 / nb) % ngpu;
            di = (i2 / nb) / ngpu * nb;
            magma_setdevice( d );
            magma_dgetmatrix( n, ib,
                              dA(d, 0, di), ldda,
                              A(0,i2),      lda );
        }
    }

    // Use unblocked code to reduce the rest of the matrix
    // add 1 to i for 1-based index
    i += 1;
    lapackf77_dgehd2(&n, &i, &ihi, A, &lda, tau, work, &iinfo);
    work[0] = MAGMA_D_MAKE( iws, 0 );
    
CLEANUP:
    for( d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magma_free( data.A[d] );
        magma_queue_destroy( data.streams[d] );
    }
    magma_setdevice( orig_dev );
    
    return *info;
} /* magma_dgehrd */
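A minimal calling sketch for magma_dgehrd_m, assuming MAGMA is initialized and A is an n-by-n column-major host array; it uses the LAPACK-style workspace query (lwork = -1) described in the argument list above:

magma_int_t hessenberg_example( magma_int_t n, double *A, magma_int_t lda )
{
    magma_int_t info, lwork;
    magma_int_t nb = magma_get_dgehrd_nb( n );
    double *tau, *T, *work, query;
    magma_dmalloc_cpu( &tau, n-1 );
    magma_dmalloc_cpu( &T,   nb*n );   /* stores the T matrices, per the docs */
    /* workspace query: only the optimal lwork is returned, in work[0] */
    magma_dgehrd_m( n, 1, n, A, lda, tau, &query, -1, T, &info );
    lwork = (magma_int_t) MAGMA_D_REAL( query );
    magma_dmalloc_cpu( &work, lwork );
    magma_dgehrd_m( n, 1, n, A, lda, tau, work, lwork, T, &info );
    magma_free_cpu( work );  magma_free_cpu( T );  magma_free_cpu( tau );
    return info;
}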
Example #6
/**
    Purpose
    -------
    DLABRD reduces the first NB rows and columns of a real general
    m by n matrix A to upper or lower bidiagonal form by an orthogonal
    transformation Q' * A * P, and returns the matrices X and Y which
    are needed to apply the transformation to the unreduced part of A.

    If m >= n, A is reduced to upper bidiagonal form; if m < n, to lower
    bidiagonal form.

    This is an auxiliary routine called by DGEBRD.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows in the matrix A.

    @param[in]
    n       INTEGER
            The number of columns in the matrix A.

    @param[in]
    nb      INTEGER
            The number of leading rows and columns of A to be reduced.

    @param[in,out]
    A       DOUBLE_PRECISION array, dimension (LDA,N)
            On entry, the m by n general matrix to be reduced.
            On exit, the first NB rows and columns of the matrix are
            overwritten; the rest of the array is unchanged.
            If m >= n, elements on and below the diagonal in the first NB
              columns, with the array TAUQ, represent the orthogonal
              matrix Q as a product of elementary reflectors; and
              elements above the diagonal in the first NB rows, with the
              array TAUP, represent the orthogonal matrix P as a product
              of elementary reflectors.
    \n
            If m < n, elements below the diagonal in the first NB
              columns, with the array TAUQ, represent the orthogonal
              matrix Q as a product of elementary reflectors, and
              elements on and above the diagonal in the first NB rows,
              with the array TAUP, represent the orthogonal matrix P as
              a product of elementary reflectors.
            See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    @param[in,out]
    dA      DOUBLE_PRECISION array, dimension (LDDA,N)
            Copy of A on GPU.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).

    @param[out]
    d       DOUBLE_PRECISION array, dimension (NB)
            The diagonal elements of the first NB rows and columns of
            the reduced matrix.  D(i) = A(i,i).

    @param[out]
    e       DOUBLE_PRECISION array, dimension (NB)
            The off-diagonal elements of the first NB rows and columns of
            the reduced matrix.

    @param[out]
    tauq    DOUBLE_PRECISION array dimension (NB)
            The scalar factors of the elementary reflectors which
            represent the orthogonal matrix Q. See Further Details.

    @param[out]
    taup    DOUBLE_PRECISION array, dimension (NB)
            The scalar factors of the elementary reflectors which
            represent the orthogonal matrix P. See Further Details.

    @param[out]
    X       DOUBLE_PRECISION array, dimension (LDX,NB)
            The m-by-nb matrix X required to update the unreduced part
            of A.

    @param[in]
    ldx     INTEGER
            The leading dimension of the array X. LDX >= M.

    @param[out]
    dX      DOUBLE_PRECISION array, dimension (LDDX,NB)
            Copy of X on GPU.

    @param[in]
    lddx    INTEGER
            The leading dimension of the array dX. LDDX >= M.

    @param[out]
    Y       DOUBLE_PRECISION array, dimension (LDY,NB)
            The n-by-nb matrix Y required to update the unreduced part
            of A.

    @param[in]
    ldy     INTEGER
            The leading dimension of the array Y. LDY >= N.

    @param[out]
    dY      DOUBLE_PRECISION array, dimension (LDDY,NB)
            Copy of Y on GPU.

    @param[in]
    lddy    INTEGER
            The leading dimension of the array dY. LDDY >= N.

    Further Details
    ---------------
    The matrices Q and P are represented as products of elementary
    reflectors:

       Q = H(1) H(2) . . . H(nb)  and  P = G(1) G(2) . . . G(nb)

    Each H(i) and G(i) has the form:

       H(i) = I - tauq * v * v'  and G(i) = I - taup * u * u'

    where tauq and taup are real scalars, and v and u are real vectors.

    If m >= n, v(1:i-1) = 0, v(i) = 1, and v(i:m) is stored on exit in
    A(i:m,i); u(1:i) = 0, u(i+1) = 1, and u(i+1:n) is stored on exit in
    A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i).

    If m < n, v(1:i) = 0, v(i+1) = 1, and v(i+1:m) is stored on exit in
    A(i+2:m,i); u(1:i-1) = 0, u(i) = 1, and u(i:n) is stored on exit in
    A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i).

    The elements of the vectors v and u together form the m-by-nb matrix
    V and the nb-by-n matrix U' which are needed, with X and Y, to apply
    the transformation to the unreduced part of the matrix, using a block
    update of the form:  A := A - V*Y' - X*U'.

    The contents of A on exit are illustrated by the following examples
    with nb = 2:

    @verbatim
    m = 6 and n = 5 (m > n):          m = 5 and n = 6 (m < n):

      (  1   1   u1  u1  u1 )           (  1   u1  u1  u1  u1  u1 )
      (  v1  1   1   u2  u2 )           (  1   1   u2  u2  u2  u2 )
      (  v1  v2  a   a   a  )           (  v1  1   a   a   a   a  )
      (  v1  v2  a   a   a  )           (  v1  v2  a   a   a   a  )
      (  v1  v2  a   a   a  )           (  v1  v2  a   a   a   a  )
      (  v1  v2  a   a   a  )
    @endverbatim

    where a denotes an element of the original matrix which is unchanged,
    vi denotes an element of the vector defining H(i), and ui an element
    of the vector defining G(i).

    @ingroup magma_dgesvd_aux
    ********************************************************************/
extern "C" magma_int_t
magma_dlabrd_gpu( magma_int_t m, magma_int_t n, magma_int_t nb,
                  double *A,  magma_int_t lda,
                  double *dA, magma_int_t ldda,
                  double *d, double *e, double *tauq, double *taup,
                  double *X,  magma_int_t ldx,
                  double *dX, magma_int_t lddx,
                  double *Y,  magma_int_t ldy,
                  double *dY, magma_int_t lddy)
{
    #define A(i_,j_) (A + (i_) + (j_)*lda)
    #define X(i_,j_) (X + (i_) + (j_)*ldx)
    #define Y(i_,j_) (Y + (i_) + (j_)*ldy)
    #define dA(i_,j_) (dA + (i_) + (j_)*ldda)
    #define dY(i_,j_) (dY + (i_) + (j_)*lddy)
    #define dX(i_,j_) (dX + (i_) + (j_)*lddx)
    
    double c_neg_one = MAGMA_D_NEG_ONE;
    double c_one     = MAGMA_D_ONE;
    double c_zero    = MAGMA_D_ZERO;
    magma_int_t ione = 1;
    
    magma_int_t i__2, i__3;
    magma_int_t i;
    double alpha;

    A  -= 1 + lda;
    X  -= 1 + ldx;
    dX -= 1 + lddx;
    Y  -= 1 + ldy;
    dY -= 1 + lddy;
    --d;
    --e;
    --tauq;
    --taup;

    /* Quick return if possible */
    magma_int_t info = 0;
    if (m <= 0 || n <= 0) {
        return info;
    }

    double *f;
    magma_queue_t stream;
    magma_queue_create( &stream );
    magma_dmalloc_cpu( &f, max(n,m) );
    if ( f == NULL ) {
        info = MAGMA_ERR_HOST_ALLOC;
        magma_queue_destroy( stream );
        return info;
    }
    
    if (m >= n) {
        /* Reduce to upper bidiagonal form */
        for (i = 1; i <= nb; ++i) {
            /*  Update A(i:m,i) */
            i__2 = m - i + 1;
            i__3 = i - 1;
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv( &i__3, Y(i,1), &ldy );
            #endif
            blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                           A(i,1), &lda,
                           Y(i,1), &ldy, &c_one,
                           A(i,i), &ione );
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv( &i__3, Y(i,1), &ldy );
            #endif
            blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                           X(i,1), &ldx,
                           A(1,i), &ione, &c_one,
                           A(i,i), &ione );
            
            /* Generate reflection Q(i) to annihilate A(i+1:m,i) */
            alpha = *A(i,i);
            i__2 = m - i + 1;
            i__3 = i + 1;
            lapackf77_dlarfg( &i__2, &alpha, A(min(i__3,m),i), &ione, &tauq[i] );
            d[i] = MAGMA_D_REAL( alpha );
            if (i < n) {
                *A(i,i) = c_one;

                /* Compute Y(i+1:n,i) */
                i__2 = m - i + 1;
                i__3 = n - i;

                // 1. Send the block reflector  A(i:m,i) to the GPU ------
                magma_dsetvector( i__2,
                                  A(i,i), 1,
                                  dA(i-1,i-1), 1 );
                // 2. Multiply ---------------------------------------------
                magma_dgemv( MagmaConjTrans, i__2, i__3, c_one,
                             dA(i-1,i),   ldda,
                             dA(i-1,i-1), ione, c_zero,
                             dY(i+1,i),   ione );
                
                // 3. Put the result back ----------------------------------
                magma_dgetmatrix_async( i__3, 1,
                                        dY(i+1,i), lddy,
                                        Y(i+1,i),  ldy, stream );
                i__2 = m - i + 1;
                i__3 = i - 1;
                blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_one,
                               A(i,1), &lda,
                               A(i,i), &ione, &c_zero,
                               Y(1,i), &ione );

                i__2 = n - i;
                i__3 = i - 1;
                blasf77_dgemv( "N", &i__2, &i__3, &c_neg_one,
                               Y(i+1,1), &ldy,
                               Y(1,i),   &ione, &c_zero,
                               f,        &ione );
                i__2 = m - i + 1;
                i__3 = i - 1;
                blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_one,
                               X(i,1), &ldx,
                               A(i,i), &ione, &c_zero,
                               Y(1,i), &ione );
                
                // 4. Sync to make sure the result is back ----------------
                magma_queue_sync( stream );

                if (i__3 != 0) {
                    i__2 = n - i;
                    blasf77_daxpy( &i__2, &c_one, f, &ione, Y(i+1,i), &ione );
                }

                i__2 = i - 1;
                i__3 = n - i;
                blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_neg_one,
                               A(1,i+1), &lda,
                               Y(1,i),   &ione, &c_one,
                               Y(i+1,i), &ione );
                i__2 = n - i;
                blasf77_dscal( &i__2, &tauq[i], Y(i+1,i), &ione );

                /* Update A(i,i+1:n) */
                i__2 = n - i;
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv( &i__2, A(i,i+1), &lda );
                lapackf77_dlacgv( &i,  A(i,1), &lda );
                #endif
                blasf77_dgemv( "No transpose", &i__2, &i, &c_neg_one,
                               Y(i+1,1), &ldy,
                               A(i,1),   &lda, &c_one,
                               A(i,i+1), &lda );
                i__2 = i - 1;
                i__3 = n - i;
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv( &i,  A(i,1), &lda );
                lapackf77_dlacgv( &i__2, X(i,1), &ldx );
                #endif
                blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_neg_one,
                               A(1,i+1), &lda,
                               X(i,1),   &ldx, &c_one,
                               A(i,i+1), &lda );
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv( &i__2, X(i,1), &ldx );
                #endif

                /* Generate reflection P(i) to annihilate A(i,i+2:n) */
                i__2 = n - i;
                i__3 = i + 2;
                alpha = *A(i,i+1);
                lapackf77_dlarfg( &i__2, &alpha, A(i,min(i__3,n)), &lda, &taup[i] );
                e[i] = MAGMA_D_REAL( alpha );
                *A(i,i+1) = c_one;

                /* Compute X(i+1:m,i) */
                i__2 = m - i;
                i__3 = n - i;
                // 1. Send the row vector  A(i,i+1:n) to the GPU ------
                magma_dsetvector( i__3,
                                  A(i,i+1), lda,
                                  dA(i-1,i), ldda );
                // 2. Multiply ---------------------------------------------
                //magma_dcopy( i__3, dA(i-1,i), ldda, dY(1,1), 1 );
                magma_dgemv( MagmaNoTrans, i__2, i__3, c_one,
                             dA(i,i), ldda,
                             dA(i-1,i), ldda,
                             //dY(1,1), 1,
                             c_zero,
                             dX(i+1,i), ione );

                // 3. Put the result back ----------------------------------
                magma_dgetmatrix_async( i__2, 1,
                                        dX(i+1,i), lddx,
                                        X(i+1,i),  ldx, stream );

                i__2 = n - i;
                blasf77_dgemv( MagmaConjTransStr, &i__2, &i, &c_one,
                               Y(i+1,1), &ldy,
                               A(i,i+1), &lda, &c_zero,
                               X(1,i),   &ione );

                i__2 = m - i;
                blasf77_dgemv( "N", &i__2, &i, &c_neg_one,
                               A(i+1,1), &lda,
                               X(1,i),   &ione, &c_zero,
                               f,        &ione );
                i__2 = i - 1;
                i__3 = n - i;
                blasf77_dgemv( "N", &i__2, &i__3, &c_one,
                               A(1,i+1), &lda,
                               A(i,i+1), &lda, &c_zero,
                               X(1,i),   &ione );

                // 4. Sync to make sure the result is back ----------------
                magma_queue_sync( stream );
                if (i != 0) {
                    i__2 = m - i;
                    blasf77_daxpy( &i__2, &c_one, f, &ione, X(i+1,i), &ione );
                }


                i__2 = m - i;
                i__3 = i - 1;
                blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                               X(i+1,1), &ldx,
                               X(1,i),   &ione, &c_one,
                               X(i+1,i), &ione );
                i__2 = m - i;
                blasf77_dscal( &i__2, &taup[i], X(i+1,i), &ione );

                #if defined(PRECISION_z) || defined(PRECISION_c)
                i__2 = n - i;
                lapackf77_dlacgv( &i__2,  A(i,i+1), &lda );
                // 4. Send the row  A(i,i+1:n) back to the GPU after DLACGV()
                magma_dsetvector( i__2,
                                  A(i,i+1),  lda,
                                  dA(i-1,i), ldda );
                #endif
            }
        }
    }
    else {
        /* Reduce to lower bidiagonal form */
        for (i = 1; i <= nb; ++i) {
        
            /* Update A(i,i:n) */
            i__2 = n - i + 1;
            i__3 = i - 1;
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv( &i__2, A(i,i), &lda );
            lapackf77_dlacgv( &i__3, A(i,1), &lda );
            #endif
            blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                           Y(i,1), &ldy,
                           A(i,1), &lda, &c_one,
                           A(i,i), &lda );
            i__2 = i - 1;
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv( &i__3, A(i,1), &lda );
            lapackf77_dlacgv( &i__3, X(i,1), &ldx );
            #endif
            i__3 = n - i + 1;
            blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_neg_one,
                           A(1,i), &lda,
                           X(i,1), &ldx, &c_one,
                           A(i,i), &lda );
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv( &i__2, X(i,1), &ldx );
            #endif
            
            /* Generate reflection P(i) to annihilate A(i,i+1:n) */
            i__2 = n - i + 1;
            i__3 = i + 1;
            alpha = *A(i,i);
            lapackf77_dlarfg( &i__2, &alpha, A(i,min(i__3,n)), &lda, &taup[i] );
            d[i] = MAGMA_D_REAL( alpha );
            if (i < m) {
                *A(i,i) = c_one;
                
                /* Compute X(i+1:m,i) */
                i__2 = m - i;
                i__3 = n - i + 1;
                
                // 1. Send the row vector  A(i,i:n) to the GPU ------
                magma_dsetvector( i__3,
                                  A(i,i), lda,
                                  dA(i-1,i-1), ldda );
                
                // 2. Multiply ---------------------------------------------
                //magma_dcopy( i__3, dA(i-1,i-1), ldda, dY(1,1), 1 );
                magma_dgemv( MagmaNoTrans, i__2, i__3, c_one,
                             dA(i,i-1), ldda,
                             dA(i-1,i-1), ldda,
                             //dY(1,1), 1,
                             c_zero,
                             dX(i+1,i), ione );
                
                // 3. Put the result back ----------------------------------
                magma_dgetmatrix_async( i__2, 1,
                                        dX(i+1,i), lddx,
                                        X(i+1,i),  ldx, stream );
                
                i__2 = n - i + 1;
                i__3 = i - 1;
                blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_one,
                               Y(i,1), &ldy,
                               A(i,i), &lda, &c_zero,
                               X(1,i), &ione );
                i__2 = m - i;
                i__3 = i - 1;
                blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                               A(i+1,1), &lda,
                               X(1,i),   &ione, &c_zero,
                               f,        &ione );
                
                i__2 = i - 1;
                i__3 = n - i + 1;
                blasf77_dgemv( "No transpose", &i__2, &i__3, &c_one,
                               A(1,i), &lda,
                               A(i,i), &lda, &c_zero,
                               X(1,i), &ione );
                
                // 4. Sync to make sure the result is back ----------------
                magma_queue_sync( stream );
                if (i__2 != 0) {
                    i__3 = m - i;
                    blasf77_daxpy( &i__3, &c_one, f, &ione, X(i+1,i), &ione );
                }
                
                i__2 = m - i;
                i__3 = i - 1;
                blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                               X(i+1,1), &ldx,
                               X(1,i),   &ione, &c_one,
                               X(i+1,i), &ione );
                i__2 = m - i;
                blasf77_dscal( &i__2, &taup[i], X(i+1,i), &ione );
                i__2 = n - i + 1;
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv( &i__2, A(i,i), &lda );
                magma_dsetvector( i__2,
                                  A(i,i), lda,
                                  dA(i-1,i-1), ldda );
                #endif
                
                /* Update A(i+1:m,i) */
                i__2 = m - i;
                i__3 = i - 1;
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv( &i__3, Y(i,1), &ldy );
                #endif
                blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                               A(i+1,1), &lda,
                               Y(i,1),   &ldy, &c_one,
                               A(i+1,i), &ione );
                i__2 = m - i;
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv( &i__3, Y(i,1), &ldy );
                #endif
                blasf77_dgemv( "No transpose", &i__2, &i, &c_neg_one,
                               X(i+1,1), &ldx,
                               A(1,i),   &ione, &c_one,
                               A(i+1,i), &ione );
                
                /* Generate reflection Q(i) to annihilate A(i+2:m,i) */
                i__2 = m - i;
                i__3 = i + 2;
                alpha = *A(i+1,i);
                lapackf77_dlarfg( &i__2, &alpha, A(min(i__3,m),i), &ione, &tauq[i] );
                e[i] = MAGMA_D_REAL( alpha );
                *A(i+1,i) = c_one;
                
                /* Compute Y(i+1:n,i) */
                i__2 = m - i;
                i__3 = n - i;
                
                // 1. Send the block reflector  A(i+1:m,i) to the GPU ------
                magma_dsetvector( i__2,
                                  A(i+1,i), 1,
                                  dA(i,i-1), 1 );
                // 2. Multiply ---------------------------------------------
                magma_dgemv( MagmaConjTrans, i__2, i__3, c_one,
                             dA(i,i),   ldda,
                             dA(i,i-1), ione, c_zero,
                             dY(i+1,i), ione );
                
                // 3. Put the result back ----------------------------------
                magma_dgetmatrix_async( i__3, 1,
                                        dY(i+1,i), lddy,
                                        Y(i+1,i),  ldy, stream );
                
                i__2 = m - i;
                i__3 = i - 1;
                blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_one,
                               A(i+1,1), &lda,
                               A(i+1,i), &ione, &c_zero,
                               Y(1,i),   &ione );
                i__2 = n - i;
                i__3 = i - 1;
                blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                               Y(i+1,1), &ldy,
                               Y(1,i),   &ione, &c_zero,
                               f,        &ione );
                
                i__2 = m - i;
                blasf77_dgemv( MagmaConjTransStr, &i__2, &i, &c_one,
                               X(i+1,1), &ldx,
                               A(i+1,i), &ione, &c_zero,
                               Y(1,i),   &ione );
                
                // 4. Sync to make sure the result is back ----------------
                magma_queue_sync( stream );
                if (i__3 != 0) {
                    i__2 = n - i;
                    blasf77_daxpy( &i__2, &c_one, f, &ione, Y(i+1,i), &ione );
                }
                
                i__2 = n - i;
                blasf77_dgemv( MagmaConjTransStr, &i, &i__2, &c_neg_one,
                               A(1,i+1), &lda,
                               Y(1,i),   &ione, &c_one,
                               Y(i+1,i), &ione );
                i__2 = n - i;
                blasf77_dscal( &i__2, &tauq[i], Y(i+1,i), &ione );
            }
            #if defined(PRECISION_z) || defined(PRECISION_c)
            else {
                i__2 = n - i + 1;
                lapackf77_dlacgv( &i__2, A(i,i), &lda );
                magma_dsetvector( i__2,
                                  A(i,i), lda,
                                  dA(i-1,i-1), ldda );
            }
            #endif
        }
    }
    
    magma_queue_destroy( stream );
    magma_free_cpu( f );
    
    return info;
} /* magma_dlabrd_gpu */
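
After one pass of this panel routine, a blocked bidiagonalization driver still has to apply the returned block vectors to the trailing submatrix. A minimal sketch of that rank-2*nb update, assuming the dA/dX/dY macros, leading dimensions, loop index i, and constants follow the conventions used above (MAGMA 1.x-style calls without an explicit queue argument):

// Illustrative sketch: apply the rank-2*nb update produced by magma_dlabrd_gpu,
//   A(i+nb:m, i+nb:n) -= V*Y^H + X*U^H,
// where V and U live in the factored panel of dA, and X, Y were returned.
magma_dgemm( MagmaNoTrans, MagmaConjTrans,
             m-i-nb, n-i-nb, nb,
             c_neg_one, dA(i+nb, i),    ldda,   // V (Householder vectors)
                        dY(i+nb, 0),    lddy,   // Y from dlabrd
             c_one,     dA(i+nb, i+nb), ldda );
magma_dgemm( MagmaNoTrans, MagmaNoTrans,
             m-i-nb, n-i-nb, nb,
             c_neg_one, dX(i+nb, 0),    lddx,   // X from dlabrd
                        dA(i, i+nb),    ldda,   // U^H (rows of the panel)
             c_one,     dA(i+nb, i+nb), ldda );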
Beispiel #7
0
/**
    Purpose
    -------
    DGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.  This version does not
    require work space on the GPU passed as input. GPU memory is allocated
    in the routine.

    The factorization has the form
        A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.

    It uses 2 queues to overlap communication and computation.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    A       DOUBLE PRECISION array, dimension (LDA,N)
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.
    \n
            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    @param[out]
    ipiv    INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as a failed memory allocation.
      -     > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    @ingroup magma_dgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_dgetrf(
    magma_int_t m, magma_int_t n,
    double *A, magma_int_t lda,
    magma_int_t *ipiv,
    magma_int_t *info)
{
    #ifdef HAVE_clBLAS
    #define  dA(i_, j_)     dA, ((i_)*nb  + (j_)*nb*ldda + dA_offset)
    #define dAT(i_, j_)    dAT, ((i_)*nb*lddat + (j_)*nb + dAT_offset)
    #define dwork(i_)    dwork, (i_)
    #else
    #define  dA(i_, j_) (   dA + (i_)*nb  + (j_)*nb*ldda)
    #define dAT(i_, j_) (  dAT + (i_)*nb*lddat + (j_)*nb)
    #define dwork(i_)   (dwork + (i_))
    #endif
    
    // Constants
    const double c_one     = MAGMA_D_ONE;
    const double c_neg_one = MAGMA_D_NEG_ONE;
    
    // Local variables
    double *work;
    magmaDouble_ptr dA, dAT, dwork;
    magma_int_t iinfo, nb;

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (lda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    nb = magma_get_dgetrf_nb( m, n );

    if ( (nb <= 1) || (nb >= min(m,n)) ) {
        /* Use CPU code. */
        lapackf77_dgetrf( &m, &n, A, &lda, ipiv, info );
    }
    else {
        /* Use hybrid blocked code. */
        magma_int_t maxm, maxn, ldda, lddat, maxdim;
        magma_int_t i, j, rows, cols, s = min(m, n)/nb;
        
        maxm = magma_roundup( m, 32 );
        maxn = magma_roundup( n, 32 );
        maxdim = max( maxm, maxn );
        
        lddat = maxn;
        ldda  = maxm;
        
        /* set number of GPUs */
        magma_int_t ngpu = magma_num_gpus();
        if ( ngpu > 1 ) {
            /* call multi-GPU non-GPU-resident interface  */
            magma_dgetrf_m( ngpu, m, n, A, lda, ipiv, info );
            return *info;
        }
        
        magma_queue_t queues[2] = { NULL, NULL };
        magma_device_t cdev;
        magma_getdevice( &cdev );
        magma_queue_create( cdev, &queues[0] );
        magma_queue_create( cdev, &queues[1] );
        
        /* check the memory requirement */
        size_t mem_size = magma_queue_mem_size( queues[0] );
        mem_size /= sizeof(double);
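        // h counts the nb-wide panels of device workspace reserved below; NB is
        // then roughly how many columns fit in 80% of device memory after that
        // reservation (overridable via the MAGMA_NGR_NB environment variable).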

        magma_int_t h = 1+(2+ngpu);
        magma_int_t ngpu2 = ngpu;
        magma_int_t NB = (magma_int_t)(0.8*mem_size/maxm - h*nb);
        const char* ngr_nb_char = getenv("MAGMA_NGR_NB");
        if ( ngr_nb_char != NULL )
            NB = max( nb, min( NB, atoi(ngr_nb_char) ) );

        if ( ngpu > ceil((double)NB/nb) ) {
            ngpu2 = (magma_int_t)ceil((double)NB/nb);
            h = 1+(2+ngpu2);
            NB = (magma_int_t)(0.8*mem_size/maxm - h*nb);
        }
        if ( ngpu2*NB < n ) {
            /* require too much memory, so call non-GPU-resident version */
            magma_dgetrf_m( ngpu, m, n, A, lda, ipiv, info );
            return *info;
        }

        work = A;
        if (maxdim*maxdim < 2*maxm*maxn) {
            // if close to square, allocate square matrix and transpose in-place
            // dwork is nb*maxm for panel, and maxdim*maxdim for A
            if (MAGMA_SUCCESS != magma_dmalloc( &dwork, nb*maxm + maxdim*maxdim )) {
                /* alloc failed so call non-GPU-resident version */
                magma_dgetrf_m( ngpu, m, n, A, lda, ipiv, info );
                return *info;
            }
            dA = dwork + nb*maxm;
            
            ldda = lddat = maxdim;
            magma_dsetmatrix( m, n, A, lda, dA(0,0), ldda, queues[0] );
            
            dAT = dA;
            magmablas_dtranspose_inplace( maxdim, dAT(0,0), lddat, queues[0] );
        }
        else {
            // if very rectangular, allocate dA and dAT and transpose out-of-place
            // dwork is nb*maxm for panel, and maxm*maxn for A
            if (MAGMA_SUCCESS != magma_dmalloc( &dwork, (nb + maxn)*maxm )) {
                /* alloc failed so call non-GPU-resident version */
                magma_dgetrf_m( ngpu, m, n, A, lda, ipiv, info );
                return *info;
            }
            dA = dwork + nb*maxm;
            
            magma_dsetmatrix( m, n, A, lda, dA(0,0), ldda, queues[0] );
            
            if (MAGMA_SUCCESS != magma_dmalloc( &dAT, maxm*maxn )) {
                /* alloc failed so call non-GPU-resident version */
                magma_free( dwork );
                magma_dgetrf_m( ngpu, m, n, A, lda, ipiv, info );
                return *info;
            }
            
            magmablas_dtranspose( m, n, dA(0,0), ldda, dAT(0,0), lddat, queues[0] );
        }
        
        lapackf77_dgetrf( &m, &nb, work, &lda, ipiv, &iinfo );

        for( j = 0; j < s; j++ ) {
            // get j-th panel from device
            cols = maxm - j*nb;
            
            if (j > 0) {
                magmablas_dtranspose( nb, cols, dAT(j,j), lddat, dwork(0), cols, queues[0] );
                magma_queue_sync( queues[0] );
                
                magma_dgetmatrix_async( m-j*nb, nb, dwork(0), cols, work, lda, queues[1] );
                
                magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n - (j+1)*nb, nb,
                             c_one, dAT(j-1,j-1), lddat,
                                    dAT(j-1,j+1), lddat, queues[0] );
                magma_dgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(j+1)*nb, m-j*nb, nb,
                             c_neg_one, dAT(j-1,j+1), lddat,
                                        dAT(j,  j-1), lddat,
                             c_one,     dAT(j,  j+1), lddat, queues[0] );
                
                // do the cpu part
                rows = m - j*nb;
                magma_queue_sync( queues[1] );
                lapackf77_dgetrf( &rows, &nb, work, &lda, ipiv+j*nb, &iinfo );
            }
            if (*info == 0 && iinfo > 0)
                *info = iinfo + j*nb;

            // put j-th panel onto device
            magma_dsetmatrix_async( m-j*nb, nb, work, lda, dwork(0), cols, queues[1] );
            
            for( i=j*nb; i < j*nb + nb; ++i ) {
                ipiv[i] += j*nb;
            }
            magmablas_dlaswp( n, dAT(0,0), lddat, j*nb + 1, j*nb + nb, ipiv, 1, queues[0] );

            magma_queue_sync( queues[1] );
            
            magmablas_dtranspose( cols, nb, dwork(0), cols, dAT(j,j), lddat, queues[0] );

            // do the small non-parallel computations (next panel update)
            if (s > (j+1)) {
                magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             nb, nb,
                             c_one, dAT(j, j  ), lddat,
                                    dAT(j, j+1), lddat, queues[0] );
                magma_dgemm( MagmaNoTrans, MagmaNoTrans,
                             nb, m-(j+1)*nb, nb,
                             c_neg_one, dAT(j,   j+1), lddat,
                                        dAT(j+1, j  ), lddat,
                             c_one,     dAT(j+1, j+1), lddat, queues[0] );
            }
            else {
                magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n-s*nb, nb,
                             c_one, dAT(j, j  ), lddat,
                                    dAT(j, j+1), lddat, queues[0] );
                magma_dgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(j+1)*nb, m-(j+1)*nb, nb,
                             c_neg_one, dAT(j,   j+1), lddat,
                                        dAT(j+1, j  ), lddat,
                             c_one,     dAT(j+1, j+1), lddat, queues[0] );
            }
        }
        
        magma_int_t nb0 = min( m - s*nb, n - s*nb );
        if ( nb0 > 0 ) {
            rows = m - s*nb;
            cols = maxm - s*nb;
            
            magmablas_dtranspose( nb0, rows, dAT(s,s), lddat, dwork(0), cols, queues[0] );
            magma_dgetmatrix_async( rows, nb0, dwork(0), cols, work, lda, queues[0] );
            magma_queue_sync( queues[0] );
            
            // do the cpu part
            lapackf77_dgetrf( &rows, &nb0, work, &lda, ipiv+s*nb, &iinfo );
            if (*info == 0 && iinfo > 0)
                *info = iinfo + s*nb;
            
            for( i=s*nb; i < s*nb + nb0; ++i ) {
                ipiv[i] += s*nb;
            }
            magmablas_dlaswp( n, dAT(0,0), lddat, s*nb + 1, s*nb + nb0, ipiv, 1, queues[0] );
            
            // put last panel back onto device
            magma_dsetmatrix_async( rows, nb0, work, lda, dwork(0), cols, queues[0] );
            magmablas_dtranspose( rows, nb0, dwork(0), cols, dAT(s,s), lddat, queues[0] );
    
            magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                         n-s*nb-nb0, nb0,
                         c_one, dAT(s, s),     lddat,
                                dAT(s, s)+nb0, lddat, queues[0] );
        }
        
        // undo transpose
        if (maxdim*maxdim < 2*maxm*maxn) {
            magmablas_dtranspose_inplace( maxdim, dAT(0,0), lddat, queues[0] );
            magma_dgetmatrix( m, n, dAT(0,0), lddat, A, lda, queues[0] );
        }
        else {
            magmablas_dtranspose( n, m, dAT(0,0), lddat, dA(0,0), ldda, queues[0] );
            magma_dgetmatrix( m, n, dA(0,0), ldda, A, lda, queues[0] );
            magma_free( dAT );
        }
        magma_free( dwork );
 
        magma_queue_destroy( queues[0] );
        magma_queue_destroy( queues[1] );
    }
    
    return *info;
} /* magma_dgetrf */
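
A minimal host-side usage sketch for this routine; the size n, the fill step, and the function name are illustrative, and error handling is abbreviated:

// Illustrative sketch: LU-factor an n-by-n matrix held in pinned host
// memory, as the documentation above recommends for best transfer speed.
#include "magma_v2.h"

void factor_example( magma_int_t n )
{
    magma_init();

    double *A;
    magma_int_t *ipiv, info;
    magma_dmalloc_pinned( &A, n*n );   // pinned => faster host<->device copies
    magma_imalloc_cpu( &ipiv, n );

    // ... fill A in column-major order with lda = n ...

    magma_dgetrf( n, n, A, n, ipiv, &info );   // A overwritten by L and U
    if (info > 0) { /* U(info,info) == 0: matrix is singular */ }

    magma_free_pinned( A );
    magma_free_cpu( ipiv );
    magma_finalize();
}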
Beispiel #8
0
/***************************************************************************//**
    Purpose
    -------
    DLAUUM computes the product U * U^H or L^H * L, where the triangular
    factor U or L is stored in the upper or lower triangular part of
    the array A.

    If UPLO = MagmaUpper then the upper triangle of the result is stored,
    overwriting the factor U in A.
    If UPLO = MagmaLower then the lower triangle of the result is stored,
    overwriting the factor L in A.
    This is the blocked form of the algorithm, calling Level 3 BLAS.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
            Specifies whether the triangular factor stored in the array A
            is upper or lower triangular:
      -     = MagmaUpper:  Upper triangular
      -     = MagmaLower:  Lower triangular

    @param[in]
    n       INTEGER
            The order of the triangular factor U or L.  N >= 0.

    @param[in,out]
    A       DOUBLE PRECISION array, dimension (LDA,N)
            On entry, the triangular factor U or L.
            On exit, if UPLO = MagmaUpper, the upper triangle of A is
            overwritten with the upper triangle of the product U * U^H;
            if UPLO = MagmaLower, the lower triangle of A is overwritten with
            the lower triangle of the product L^H * L.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    info    INTEGER
      -     = 0: successful exit
      -     < 0: if INFO = -k, the k-th argument had an illegal value

    @ingroup magma_lauum
*******************************************************************************/
extern "C" magma_int_t
magma_dlauum(
    magma_uplo_t uplo, magma_int_t n,
    double *A, magma_int_t lda,
    magma_int_t *info)
{
    #define  A(i_, j_) ( A + (i_) + (j_)*lda )
    
    #ifdef HAVE_clBLAS
    #define dA(i_, j_)  dA, ((i_) + (j_)*ldda)
    #else
    #define dA(i_, j_) (dA + (i_) + (j_)*ldda)
    #endif

    /* Constants */
    const double c_one = MAGMA_D_ONE;
    const double             d_one = MAGMA_D_ONE;
    const char* uplo_ = lapack_uplo_const( uplo );
    
    /* Local variables */
    magma_int_t i, ib, ldda, nb;
    magmaDouble_ptr dA;
    bool upper = (uplo == MagmaUpper);

    *info = 0;
    if (! upper && uplo != MagmaLower)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (lda < max(1,n))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return */
    if (n == 0)
        return *info;

    nb = magma_get_dpotrf_nb( n );
    ldda = magma_roundup( n, 32 );

    if (MAGMA_SUCCESS != magma_dmalloc( &dA, n*ldda )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    magma_queue_t queues[2];
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queues[0] );
    magma_queue_create( cdev, &queues[1] );

    if (nb <= 1 || nb >= n) {
        lapackf77_dlauum( uplo_, &n, A, &lda, info );
    }
    else if (upper) {
        /* Compute the product U * U^H. */
        // Computing 2nd block column (diagonal & above):
        // [ u11  u12  u13 ]   [ u11^H               ]   [ ...  u12*u22^H + u13*u23^H  ... ]  
        // [      u22  u23 ] * [ u12^H  u22^H        ] = [ ...  u22*u22^H + u23*u23^H  ... ]
        // [           u33 ]   [ u13^H  u23^H  u33^H ]   [ ...  ...                    ... ]
        for (i=0; i < n; i += nb) {
            ib = min( nb, n-i );

            // Send diagonal block, u22
            // This must finish before lauum below
            magma_dsetmatrix( ib, ib,
                              A(i,i),  lda,
                              dA(i,i), ldda, queues[0] );

            // Send block right of diagonal, u23
            magma_dsetmatrix_async( ib, n-i-ib,
                                    A(i,i+ib),  lda,
                                    dA(i,i+ib), ldda, queues[1] );

            // u12 = u12 * u22^H
            magma_dtrmm( MagmaRight, MagmaUpper,
                         MagmaConjTrans, MagmaNonUnit, i, ib, c_one,
                         dA(i,i), ldda,
                         dA(0,i), ldda, queues[0] );

            // u22 = u22 * u22^H
            lapackf77_dlauum( MagmaUpperStr, &ib, A(i,i), &lda, info );
            
            magma_dsetmatrix_async( ib, ib,
                                    A(i,i),  lda,
                                    dA(i,i), ldda, queues[0] );
            
            if (i+ib < n) {
                // wait for u23
                magma_queue_sync( queues[1] );
                
                // u12 += u13 * u23^H
                magma_dgemm( MagmaNoTrans, MagmaConjTrans,
                             i, ib, n-i-ib,
                             c_one, dA(0,i+ib), ldda,
                                    dA(i,i+ib), ldda,
                             c_one, dA(0,i),    ldda, queues[0] );
                
                // u22 += u23 * u23^H
                magma_dsyrk( MagmaUpper, MagmaNoTrans, ib, n-i-ib,
                             d_one, dA(i,i+ib), ldda,
                             d_one, dA(i,i),    ldda, queues[0] );
            }

            // Get diagonal block & above of current column from device
            // This could be on a different queue -- not needed until return
            magma_dgetmatrix_async( i+ib, ib,
                                    dA(0,i), ldda,
                                    A(0,i),  lda, queues[0] );
        }
    }
    else {
        /* Compute the product L^H * L. */
        for (i=0; i < n; i += nb) {
            ib = min( nb, n-i );
            magma_dsetmatrix( ib, ib,
                              A(i,i),  lda,
                              dA(i,i), ldda, queues[0] );

            magma_dsetmatrix_async( n-i-ib, ib,
                                    A(i+ib,i),  lda,
                                    dA(i+ib,i), ldda, queues[1] );

            magma_dtrmm( MagmaLeft, MagmaLower,
                         MagmaConjTrans, MagmaNonUnit, ib, i, c_one,
                         dA(i,i), ldda,
                         dA(i,0), ldda, queues[0] );


            lapackf77_dlauum( MagmaLowerStr, &ib, A(i,i), &lda, info );

            magma_dsetmatrix_async( ib, ib,
                                    A(i,i),  lda,
                                    dA(i,i), ldda, queues[0] );

            if (i+ib < n) {
                magma_queue_sync( queues[1] );
                
                magma_dgemm( MagmaConjTrans, MagmaNoTrans,
                             ib, i, n-i-ib,
                             c_one, dA(i+ib,i), ldda,
                                    dA(i+ib,0), ldda,
                             c_one, dA(i,0),    ldda, queues[0] );

                magma_dsyrk( MagmaLower, MagmaConjTrans, ib, n-i-ib,
                             d_one, dA(i+ib,i), ldda,
                             d_one, dA(i,i),    ldda, queues[0] );
            }
            
            magma_dgetmatrix_async( ib, i+ib,
                                    dA(i,0), ldda,
                                    A(i,0),  lda, queues[0] );
        }
    }
    
    magma_queue_destroy( queues[0] );
    magma_queue_destroy( queues[1] );

    magma_free( dA );

    return *info;
}
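
For context, LAUUM is the last stage of the classical SPD inversion pipeline (what LAPACK's POTRI does): Cholesky-factor, invert the triangular factor, then form the product computed above. A hedged sketch of that pipeline using the routine above, with error propagation simplified and the function name illustrative:

// Illustrative sketch: in-place inverse of an SPD matrix via Cholesky,
// CPU triangular inversion, then the U*U^H product formed by magma_dlauum.
magma_int_t spd_inverse( magma_int_t n, double *A, magma_int_t lda )
{
    magma_int_t info = 0;
    magma_dpotrf( MagmaUpper, n, A, lda, &info );        // A = U^H * U
    if (info == 0)
        lapackf77_dtrtri( MagmaUpperStr, MagmaNonUnitStr,
                          &n, A, &lda, &info );          // U := U^{-1}
    if (info == 0)
        magma_dlauum( MagmaUpper, n, A, lda, &info );    // upper triangle of
                                                         // U^{-1} * U^{-H} = A^{-1}
    return info;
}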
Beispiel #9
0
extern "C" magma_err_t
magma_dgetrf2_mgpu(magma_int_t num_gpus, 
         magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t offset,
         magmaDouble_ptr *d_lAT, size_t dlAT_offset, magma_int_t lddat, 
         magma_int_t *ipiv,
         magmaDouble_ptr *d_lAP, size_t dlAP_offset, 
         double *w, magma_int_t ldw,
         magma_int_t *info, magma_queue_t *queues)
{
/*  -- clMAGMA (version 1.1.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date January 2014

    Purpose
    =======

    DGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
       A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.
    It uses two buffers to send panels.

    Arguments
    =========

    NUM_GPUS 
            (input) INTEGER
            The number of GPUS to be used for the factorization.

    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    LDDA     (input) INTEGER
            The leading dimension of the array A.  LDDA >= max(1,M).

    IPIV    (output) INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  if INFO = -7, internal GPU memory allocation failed.
            > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.
    =====================================================================    */

#define inAT(id,i,j)  d_lAT[(id)], (((offset)+(i)*nb)*lddat + (j)*nb)
#define inAT_offset(i, j) (((offset)+(i)*nb)*lddat + (j)*nb)
#define W(j) (w+((j)%num_gpus)*nb*ldw)

    double c_one     = MAGMA_D_ONE;
    double c_neg_one = MAGMA_D_NEG_ONE;

    magma_int_t block_size = 32;
    magma_int_t iinfo, n_local[4]; 
    magma_int_t maxm, mindim;
    magma_int_t i, ii, d, dd, rows, cols, s, ldpan[4];
    magma_int_t id, i_local, i_local2, nb0, nb1;
    magmaDouble_ptr d_panel[4], panel_local[4];
    size_t d_panel_offset[4];
    size_t panel_local_offset[4];
    //cudaStream_t streaml[4][2];

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -2;
    else if (n < 0)
        *info = -3;
    else if (num_gpus*lddat < max(1,n))
        *info = -5;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    //nb     = magma_get_dgetrf_nb(m);
    if ( num_gpus > ceil((double)n/nb) ) {
      *info = -1;
      return *info;
    }
    else {
      printf("dgetrf2_mgpu num_gpu: %d\n", num_gpus);
      /* Use hybrid blocked code. */
      maxm  = ((m + block_size-1)/block_size)*block_size;

      /* some initializations */
      for(i=0; i<num_gpus; i++){
        n_local[i] = ((n/nb)/num_gpus)*nb;
        if (i < (n/nb)%num_gpus)
           n_local[i] += nb;
        else if (i == (n/nb)%num_gpus)
           n_local[i] += n%nb;

        /* workspaces */
        //d_panel[i] = &(d_lAP[i][nb*maxm]);   /* temporary panel storage */
        d_panel[i] = d_lAP[i];
        d_panel_offset[i] = nb*maxm;

      }
      
      /* start sending the panel to cpu */
      nb0 = min(mindim, nb);
      if( nb0 == nb )
        magma_dtranspose(  d_lAP[0], 0, maxm, inAT(0,0,0), lddat, nb0, maxm, queues[2*0+1] );
      else
        magma_dtranspose2( d_lAP[0], 0, maxm, inAT(0,0,0), lddat, nb0, maxm, queues[2*0+1] );
      magma_dgetmatrix_async( m, nb0,
                              d_lAP[0], 0, maxm,
                              W(0), 0, ldw, queues[2*0+1], NULL );
      clFlush(queues[2*0+1]);
      /* ------------------------------------------------------------------------------------- */
      s = mindim / nb;
      for( i=0; i<s; i++ )
      {
          /* Set the GPU number that holds the current panel */
          id = i%num_gpus;

          /* Set the local index where the current panel is */
          i_local = i/num_gpus;
          // cols for gpu panel
          cols  = maxm - i*nb;
          // rows for cpu panel
          rows  = m - i*nb;

          /* wait for the i-th panel from the id-th gpu to arrive in work */
          magma_queue_sync( queues[2*id+1] );

          /* i-th panel factorization */
          lapackf77_dgetrf( &rows, &nb, W(i), &ldw, ipiv+i*nb, &iinfo);
          
          if ( (*info == 0) && (iinfo > 0) ) {
              *info = iinfo + i*nb;
              //break;
          }

          /* start sending the panel to all the gpus */
          d = (i+1)%num_gpus;
          for( dd=0; dd<num_gpus; dd++ ) {
              magma_dsetmatrix_async( rows, nb,
                                      W(i), 0, ldw,
                                      d_lAP[d], dlAP_offset, cols, queues[2*d+1], NULL );
              d = (d+1)%num_gpus;
          }

          /* apply the pivoting */
          d = (i+1)%num_gpus;
          for( dd=0; dd<num_gpus; dd++ ) {
              if(dd==0){
                // row offset will be added to ipiv in long2  
                magma_dpermute_long2( lddat, inAT(d,0,0), lddat, ipiv, nb, i*nb, queues[2*d] );
              }else{
                // row offset has already been added to ipiv, so call long3
                //magma_dpermute_long2( lddat, inAT(d,0,0), lddat, ipiv, nb, i*nb, queues[2*d] );
                magma_dpermute_long3( lddat, inAT(d,0,0), lddat, ipiv, nb, i*nb, queues[2*d] );
              }
              d = (d+1)%num_gpus;
          }


          /* update the trailing-matrix/look-ahead */
          d = (i+1)%num_gpus;
          for( dd=0; dd<num_gpus; dd++ ) {
              /* storage for panel */
              if( d == id ) {
                  /* the panel belongs to this gpu */
                  //panel_local[d] = inAT(d,i,i_local);
                  panel_local[d] = d_lAT[d];
                  panel_local_offset[d] = inAT_offset(i, i_local);
                  ldpan[d] = lddat;
                  /* next column */
                  i_local2 = i_local+1;
              } else {
                  /* the panel belongs to another gpu */
                  //panel_local[d] = &d_panel[d][(i%2)*nb*maxm];
                  panel_local[d] = d_panel[d];  
                  panel_local_offset[d] = d_panel_offset[d] + (i%2)*nb*maxm;
                  //panel_local[d] = d_panel[d];
                  ldpan[d] = nb;
                  /* next column */
                  i_local2 = i_local;
                  if( d < id ) i_local2 ++;
              }
              /* the size of the next column */
              if ( s > (i+1) ) {
                  nb0 = nb;
              } else {
                  nb0 = n_local[d]-nb*(s/num_gpus);
                  if( d < s%num_gpus ) nb0 -= nb;
              }
              if( d == (i+1)%num_gpus) {
                  /* owns the next column, look-ahead the column */
                  nb1 = nb0;
                  /* make sure all the pivoting has been applied */
                  //magma_queue_sync(queues[2*d]);
              } else {
                  /* update the entire trailing matrix */
                  nb1 = n_local[d] - i_local2*nb;

                  /* synchronization to make sure panel arrived on gpu */
                  //magma_queue_sync(queues[2*d+1]);
              }
              
             /* 
              magma_queue_sync(queues[2*d]);
              magma_queue_sync(queues[2*d+1]);
             */

              //magma_dtranspose(panel_local[d], panel_local_offset[d], ldpan[d], d_lAP[d], 0, cols, cols, nb, queues[2*d]);
              
              /* gpu updating the trailing matrix */
              if(d == (i+1)%num_gpus){
              magma_queue_sync(queues[2*d]);
              magma_dtranspose(panel_local[d], panel_local_offset[d], ldpan[d], d_lAP[d], 0, cols, cols, nb, queues[2*d+1]);
              magma_queue_sync(queues[2*d+1]);
              magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                           nb1, nb, c_one,
                           panel_local[d], panel_local_offset[d], ldpan[d],
                           inAT(d, i, i_local2), lddat, queues[2*d+1]);
              
              magma_dgemm( MagmaNoTrans, MagmaNoTrans, 
                           nb1, m-(i+1)*nb, nb, 
                           c_neg_one, inAT(d, i,   i_local2),         lddat,
                            panel_local[d], panel_local_offset[d]+nb*ldpan[d], ldpan[d], 
                            c_one,     inAT(d, i+1, i_local2),         lddat,
                            queues[2*d+1]);
              }else{
              magma_queue_sync(queues[2*d+1]);
              magma_dtranspose(panel_local[d], panel_local_offset[d], ldpan[d], d_lAP[d], 0, cols, cols, nb, queues[2*d]);
              magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                           nb1, nb, c_one,
                           panel_local[d], panel_local_offset[d], ldpan[d],
                           inAT(d, i, i_local2), lddat, queues[2*d]);
              
              magma_dgemm( MagmaNoTrans, MagmaNoTrans, 
                           nb1, m-(i+1)*nb, nb, 
                           c_neg_one, inAT(d, i,   i_local2),         lddat,
                            panel_local[d], panel_local_offset[d]+nb*ldpan[d], ldpan[d], 
                            c_one,     inAT(d, i+1, i_local2),         lddat,
                            queues[2*d]);    
              }
              
              if( d == (i+1)%num_gpus ) 
              {
                  /* Set the local index where the current panel is */
                  int loff    = i+1;
                  int i_local = (i+1)/num_gpus;
                  int ldda    = maxm - (i+1)*nb;
                  int cols    = m - (i+1)*nb;
                  nb0 = min(nb, mindim - (i+1)*nb); /* size of the diagonal block */
                  
                  if( nb0 > 0 ) {
                      /* transpose the panel for sending it to cpu */
                      if( i+1 < s ) 
                          magma_dtranspose(  d_lAP[d], 0, ldda, inAT(d,loff,i_local), lddat, nb0, ldda, queues[2*d+1] );
                      else
                          magma_dtranspose2(  d_lAP[d], 0, ldda, inAT(d,loff,i_local), lddat, nb0, ldda, queues[2*d+1] );
                
                      //clFinish(queues[2*d+1]);
                      /* send the panel to cpu */
                      magma_dgetmatrix_async( cols, nb0, 
                                              d_lAP[d], 0, ldda, 
                                              W(i+1), 0,  ldw, queues[2*d+1], NULL );
                  }
              } else {
                    //trace_gpu_end( d, 0 );
              }
              d = (d+1)%num_gpus;
          }

          /* update the remaining matrix by gpu owning the next panel */
          if( (i+1) < s ) {
              int i_local = (i+1)/num_gpus;
              int rows  = m - (i+1)*nb;
              
              d = (i+1)%num_gpus;
              
              magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                               n_local[d] - (i_local+1)*nb, nb, 
                               c_one, panel_local[d], panel_local_offset[d], ldpan[d], 
                               inAT(d,i,i_local+1),  lddat, queues[2*d] );
                  
              magma_dgemm( MagmaNoTrans, MagmaNoTrans, 
                            n_local[d]-(i_local+1)*nb, rows, nb, 
                            c_neg_one, inAT(d,i,i_local+1),            lddat, 
                            panel_local[d], panel_local_offset[d]+nb*ldpan[d], ldpan[d], 
                            c_one,     inAT(d,i+1,  i_local+1),        lddat, queues[2*d] );
          }
      } /* end of for i=1..s */
      /* ------------------------------------------------------------------------------ */

      /* Set the GPU number that holds the last panel */
      id = s%num_gpus;

      /* Set the local index where the last panel is */
      i_local = s/num_gpus;

      /* size of the last diagonal-block */
      nb0 = min(m - s*nb, n - s*nb);
      rows = m    - s*nb;
      cols = maxm - s*nb;

      if( nb0 > 0 ) {

          /* wait for the last panel on cpu */
          magma_queue_sync( queues[2*id+1]);
          
          /* factor on cpu */
          lapackf77_dgetrf( &rows, &nb0, W(s), &ldw, ipiv+s*nb, &iinfo);
          if ( (*info == 0) && (iinfo > 0) )
              *info = iinfo + s*nb;

          /* send the factor to gpus */
          for( d=0; d<num_gpus; d++ ) {
              i_local2 = i_local;
              if( d < id ) i_local2 ++;

              if( d == id || n_local[d] > i_local2*nb ) {
                  magma_dsetmatrix_async( rows, nb0,
                                          W(s), 0,    ldw,
                                          d_lAP[d], 0, cols, queues[2*d+1], NULL );
              }
          }

          for( d=0; d<num_gpus; d++ ) {
              if(d==0){
                  magma_dpermute_long2( lddat, inAT(d,0,0), lddat, ipiv, nb0, s*nb, queues[2*d] );
              }else{
                  //magma_dpermute_long2( lddat, inAT(d,0,0), lddat, ipiv, nb0, s*nb, queues[2*d] );
                  magma_dpermute_long3( lddat, inAT(d,0,0), lddat, ipiv, nb0, s*nb, queues[2*d] );
              }
          }

          for( d=0; d<num_gpus; d++ ) {
              //magma_queue_sync( queues[2*d+1] );
              /* wait for the pivoting to be done */
              magma_queue_sync( queues[2*d] );

              i_local2 = i_local;
              if( d < id ) i_local2++;
              if( d == id ) {
                  /* the panel belongs to this gpu */
                  //panel_local[d] = inAT(d,s,i_local);
                  panel_local[d] = d_lAT[d];
                  panel_local_offset[d] = inAT_offset(s, i_local);

                  /* next column */
                  nb1 = n_local[d] - i_local*nb-nb0;

                  magma_dtranspose2( panel_local[d], panel_local_offset[d], lddat, d_lAP[d], 0, cols, rows, nb0, queues[2*d+1]);

                  if( nb1 > 0 ){
                      magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                                   nb1, nb0, c_one,
                                   panel_local[d], panel_local_offset[d], lddat, 
                                   d_lAT[d], inAT_offset(s,i_local)+nb0, lddat, queues[2*d+1]);
                  }
                } else if( n_local[d] > i_local2*nb ) {
                    /* the panel belongs to another gpu */
                    //panel_local[d] = &d_panel[d][(s%2)*nb*maxm];
                    panel_local[d] = d_panel[d];
                    panel_local_offset[d] = d_panel_offset[d] + (s%2)*nb*maxm;
                    //panel_local[d] = d_panel[d];

                  /* next column */
                  nb1 = n_local[d] - i_local2*nb;

                  magma_dtranspose2( panel_local[d], panel_local_offset[d], nb, d_lAP[d], 0, cols, rows, nb0, queues[2*d+1]);
                  //cublasDtrsm
                  magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                               nb1, nb0, c_one,
                               panel_local[d], panel_local_offset[d],    nb, 
                               inAT(d,s,i_local2), lddat, queues[2*d+1]);
                }
          }
      } /* if( nb0 > 0 ) */

      /* clean up */
      for( d=0; d<num_gpus; d++ ) {
              magma_queue_sync( queues[2*d] );
              magma_queue_sync( queues[2*d+1] );
              //magma_queue_destroy(streaml[d][0]);
              //magma_queue_destroy(streaml[d][1]);
      } 
    }
    return *info;
    /* End of MAGMA_DGETRF2_MGPU */
}
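
The n_local[] setup near the top of this routine implements a 1-D block-cyclic column distribution over the GPUs. A hedged standalone helper that computes the same ownership count (the function name is illustrative):

// Illustrative sketch: number of columns GPU d owns under the 1-D
// block-cyclic layout used above (block width nb, GPUs 0..num_gpus-1).
// Block column j lives on GPU j % num_gpus at local block index j / num_gpus.
static magma_int_t n_local_of( magma_int_t n, magma_int_t nb,
                               magma_int_t num_gpus, magma_int_t d )
{
    magma_int_t nblk = n / nb;                   // number of full block columns
    magma_int_t nl   = (nblk / num_gpus) * nb;   // evenly distributed part
    if      (d <  nblk % num_gpus) nl += nb;     // one extra full block
    else if (d == nblk % num_gpus) nl += n % nb; // trailing partial block
    return nl;
}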
Beispiel #10
0
/**
    Purpose   
    =======   

    DSYTRF_nopiv computes the LDLt factorization of a real symmetric   
    matrix A. This version does not require work space on the GPU passed 
    as input. GPU memory is allocated in the routine.

    The factorization has the form   
       A = U^H * D * U,   if UPLO = 'U', or
       A = L   * D * L^H, if UPLO = 'L',
    where U is an upper triangular matrix, L is lower triangular, and
    D is a diagonal matrix.

    This is the block version of the algorithm, calling Level 3 BLAS.   

    Arguments   
    ---------
    @param[in]
    UPLO    CHARACTER*1   
      -     = 'U':  Upper triangle of A is stored;   
      -     = 'L':  Lower triangle of A is stored.   

    @param[in]
    N       INTEGER   
            The order of the matrix A.  N >= 0.   

    @param[in,out]
    A       DOUBLE_PRECISION array, dimension (LDA,N)   
            On entry, the symmetric matrix A.  If UPLO = 'U', the leading   
            N-by-N upper triangular part of A contains the upper   
            triangular part of the matrix A, and the strictly lower   
            triangular part of A is not referenced.  If UPLO = 'L', the   
            leading N-by-N lower triangular part of A contains the lower   
            triangular part of the matrix A, and the strictly upper   
            triangular part of A is not referenced.   
    \n
            On exit, if INFO = 0, the factor U or L from the LDL^t
            factorization A = U^H * D * U or A = L * D * L^H.
    \n
            Higher performance is achieved if A is in pinned memory.

    @param[in]
    LDA     INTEGER   
            The leading dimension of the array A.  LDA >= max(1,N).   

    @param[out]
    INFO    INTEGER   
      -     = 0:  successful exit   
      -     < 0:  if INFO = -i, the i-th argument had an illegal value 
                  if INFO = -6, the GPU memory allocation failed 
      -     > 0:  if INFO = i, the leading minor of order i is not   
                  positive definite, and the factorization could not be   
                  completed.

    @ingroup magma_dsysv_comp
    ******************************************************************* */
extern "C" magma_int_t
magma_dsytrf_nopiv(
    magma_uplo_t uplo, magma_int_t n,
    double *A, magma_int_t lda,
    magma_int_t *info)
{
    #define  A(i, j)  ( A +(j)*lda  + (i))
    #define dA(i, j)  (dA +(j)*ldda + (i))
    #define dW(i, j)  (dW +(j)*ldda + (i))
    #define dWt(i, j) (dW +(j)*nb   + (i))

    double zone  = MAGMA_D_ONE;
    double mzone = MAGMA_D_NEG_ONE;
    int                upper = (uplo == MagmaUpper);
    magma_int_t j, k, jb, ldda, nb, ib, iinfo;
    magmaDouble_ptr dA;
    magmaDouble_ptr dW;

    *info = 0;
    if (! upper && uplo != MagmaLower) {
      *info = -1;
    } else if (n < 0) {
      *info = -2;
    } else if (lda < max(1,n)) {
      *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return MAGMA_ERR_ILLEGAL_VALUE;
    }

    /* Quick return */
    if ( n == 0 )
      return MAGMA_SUCCESS;

    ldda = ((n+31)/32)*32;
    nb = magma_get_dsytrf_nopiv_nb(n);
    ib = min(32, nb); // inner-block for diagonal factorization

    if ((MAGMA_SUCCESS != magma_dmalloc(&dA, n *ldda)) ||
        (MAGMA_SUCCESS != magma_dmalloc(&dW, nb*ldda))) {
        /* allocation failed; return the error */
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    magma_queue_t stream[2];
    magma_event_t event;
    magma_queue_create(&stream[0]);
    magma_queue_create(&stream[1]);
    magma_event_create( &event );
    trace_init( 1, 1, 2, stream );

    /* Use hybrid blocked code. */
    if (upper) {
        //=========================================================
        // Compute the LDLt factorization A = U'*D*U without pivoting.
        // copy matrix to GPU
        for (j=0; j<n; j+=nb) {
            jb = min(nb, (n-j));
            trace_gpu_start( 0, 0, "set", "set" );
            magma_dsetmatrix_async(j+jb, jb, A(0, j), lda, dA(0, j), ldda, stream[0]);
            trace_gpu_end( 0, 0 );
        }
        
        // main loop
        for (j=0; j<n; j += nb) {
            jb = min(nb, (n-j));
            
            // copy A(j,j) back to CPU
            trace_gpu_start( 0, 0, "get", "get" );
            if ( j!=0) {
                //magma_event_sync(event);
                magma_dgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, stream[1]);
            }
            trace_gpu_end( 0, 0 );
            
            // factorize the diagonal block
            magma_queue_sync(stream[1]);
            trace_cpu_start( 0, "potrf", "potrf" );
            dsytrf_nopiv_cpu(MagmaUpper, jb, ib, A(j, j), lda, info);
            trace_cpu_end( 0 );
            if (*info != 0){
                *info = *info + j;
                break;
            }
            
            // copy A(j,j) back to GPU
            trace_gpu_start( 0, 0, "set", "set" );
            magma_dsetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, stream[0]);
            trace_gpu_end( 0, 0 );
            
            // copy j-th column of U back to CPU
            trace_gpu_start( 0, 1, "get", "get" );                                                                         
            magma_dgetmatrix_async(j, jb, dA(0, j), ldda, A(0, j), lda, stream[1]);                                        
            trace_gpu_end( 0, 1 );

            if ( (j+jb) < n) {
                // compute the off-diagonal blocks of current block column
                magmablasSetKernelStream( stream[0] );
                trace_gpu_start( 0, 0, "trsm", "trsm" );
                magma_dtrsm(MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaUnit, 
                            jb, (n-j-jb), 
                            zone, dA(j, j),    ldda, 
                            dA(j, j+jb), ldda);
                magma_dcopymatrix( jb, n-j-jb, dA( j, j+jb ), ldda, dWt( 0, j+jb ), nb );
                
                // update the trailing submatrix with D
                magmablas_dlascl_diag(MagmaUpper, jb, n-j-jb,
                                      dA(j,    j), ldda,
                                      dA(j, j+jb), ldda,
                                      &iinfo);
                trace_gpu_end( 0, 0 );
                
                // update the trailing submatrix with U and W
                trace_gpu_start( 0, 0, "gemm", "gemm" );
                for (k=j+jb; k<n; k+=nb) {
                    magma_int_t kb = min(nb,n-k);
                    magma_dgemm(MagmaConjTrans, MagmaNoTrans, kb, n-k, jb,
                                mzone, dWt(0, k), nb, 
                                       dA(j, k), ldda,
                                zone,  dA(k, k), ldda);
                    if (k==j+jb) {
                        // magma_event_record( event, stream[0] );
                        magma_queue_sync( stream[0] );
                    }
                }
                trace_gpu_end( 0, 0 );
            }
        }
    } else {
        //=========================================================
        // Compute the LDLt factorization A = L*D*L' without pivoting.
        // copy the matrix to GPU
        for (j=0; j<n; j+=nb) {
            jb = min(nb, (n-j));
            trace_gpu_start( 0, 0, "set", "set" );
            magma_dsetmatrix_async((n-j), jb, A(j, j), lda, dA(j, j), ldda, stream[0]);
            trace_gpu_end( 0, 0 );
        }
        
        // main loop
        for (j=0; j<n; j+=nb) {
            jb = min(nb, (n-j));
            
            // copy A(j,j) back to CPU
            trace_gpu_start( 0, 0, "get", "get" );
            if (j!=0) {
                //magma_event_sync(event);
                magma_dgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, stream[1]);
            }
            trace_gpu_end( 0, 0 );
            
            // factorize the diagonal block
            magma_queue_sync(stream[1]);
            trace_cpu_start( 0, "potrf", "potrf" );
            dsytrf_nopiv_cpu(MagmaLower, jb, ib, A(j, j), lda, info);
            trace_cpu_end( 0 );
            if (*info != 0){
                *info = *info + j;
                break;
            }

            // copy A(j,j) back to GPU
            trace_gpu_start( 0, 0, "set", "set" );
            magma_dsetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, stream[0]);
            trace_gpu_end( 0, 0 );
            
            // copy j-th row of L back to CPU
            trace_gpu_start( 0, 1, "get", "get" );
            magma_dgetmatrix_async(jb, j, dA(j, 0), ldda, A(j, 0), lda, stream[1]);
            trace_gpu_end( 0, 1 ); 
            
            if ( (j+jb) < n) {
                // compute the off-diagonal blocks of current block column
                magmablasSetKernelStream( stream[0] );
                trace_gpu_start( 0, 0, "trsm", "trsm" );
                magma_dtrsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaUnit, 
                            (n-j-jb), jb, 
                            zone, dA(j,    j), ldda, 
                            dA(j+jb, j), ldda);
                magma_dcopymatrix( n-j-jb,jb, dA( j+jb, j ), ldda, dW( j+jb, 0 ), ldda );
                
                // update the trailing submatrix with D
                magmablas_dlascl_diag(MagmaLower, n-j-jb, jb,
                                      dA(j,    j), ldda,
                                      dA(j+jb, j), ldda,
                                      &iinfo);
                trace_gpu_end( 0, 0 );
                
                // update the trailing submatrix with L and W
                trace_gpu_start( 0, 0, "gemm", "gemm" );
                for (k=j+jb; k<n; k+=nb) {
                    magma_int_t kb = min(nb,n-k);
                    magma_dgemm(MagmaNoTrans, MagmaConjTrans, n-k, kb, jb,
                                mzone, dA(k, j), ldda, 
                                       dW(k, 0), ldda,
                                zone,  dA(k, k), ldda);
                    if (k==j+jb) {
                        //magma_event_record( event, stream[0] );
                        magma_queue_sync(stream[0]);
                    }
                }
                trace_gpu_end( 0, 0 );
            }
        }
    }
    
    trace_finalize( "dsytrf.svg","trace.css" );
    magma_queue_destroy(stream[0]);
    magma_queue_destroy(stream[1]);
    magma_event_destroy( event );
    magma_free(dW);
    magma_free(dA);
    
    return MAGMA_SUCCESS;
} /* magma_dsytrf_nopiv */
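
A minimal usage sketch for this routine; note that factoring without pivoting is only numerically safe for special matrices (e.g. symmetric positive definite or diagonally dominant ones). Sizes and the fill step are illustrative:

// Illustrative sketch: LDL^t factorization without pivoting of a
// symmetric matrix stored in the lower triangle, lda = n.
magma_int_t n = 1024, info;
double *A;
magma_dmalloc_pinned( &A, n*n );   // pinned memory, as the docs recommend
// ... fill the lower triangle of A, column-major ...
magma_dsytrf_nopiv( MagmaLower, n, A, n, &info );
if (info != 0) { /* a leading minor was not factorizable */ }
magma_free_pinned( A );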
Beispiel #11
0
/**
    Purpose
    -------
    DLATRD2 reduces NB rows and columns of a real symmetric matrix A to
    symmetric tridiagonal form by an orthogonal similarity
    transformation Q' * A * Q, and returns the matrices V and W which are
    needed to apply the transformation to the unreduced part of A.

    If UPLO = MagmaUpper, DLATRD reduces the last NB rows and columns of a
    matrix, of which the upper triangle is supplied;
    if UPLO = MagmaLower, DLATRD reduces the first NB rows and columns of a
    matrix, of which the lower triangle is supplied.

    This is an auxiliary routine called by DSYTRD2_GPU. It uses an
    accelerated HEMV that needs extra memory.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
            Specifies whether the upper or lower triangular part of the
            symmetric matrix A is stored:
      -     = MagmaUpper: Upper triangular
      -     = MagmaLower: Lower triangular

    @param[in]
    n       INTEGER
            The order of the matrix A.

    @param[in]
    nb      INTEGER
            The number of rows and columns to be reduced.

    @param[in,out]
    A       DOUBLE_PRECISION array, dimension (LDA,N)
            On entry, the symmetric matrix A.  If UPLO = MagmaUpper, the leading
            n-by-n upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = MagmaLower, the
            leading n-by-n lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
            On exit:
      -     if UPLO = MagmaUpper, the last NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements above the diagonal
              with the array TAU, represent the orthogonal matrix Q as a
              product of elementary reflectors;
      -     if UPLO = MagmaLower, the first NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements below the diagonal
              with the array TAU, represent the  orthogonal matrix Q as a
              product of elementary reflectors.
            See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    e       DOUBLE_PRECISION array, dimension (N-1)
            If UPLO = MagmaUpper, E(n-nb:n-1) contains the superdiagonal
            elements of the last NB columns of the reduced matrix;
            if UPLO = MagmaLower, E(1:nb) contains the subdiagonal elements of
            the first NB columns of the reduced matrix.

    @param[out]
    tau     DOUBLE_PRECISION array, dimension (N-1)
            The scalar factors of the elementary reflectors, stored in
            TAU(n-nb:n-1) if UPLO = MagmaUpper, and in TAU(1:nb) if UPLO = MagmaLower.
            See Further Details.

    @param[out]
    W       DOUBLE_PRECISION array, dimension (LDW,NB)
            The n-by-nb matrix W required to update the unreduced part
            of A.

    @param[in]
    ldw     INTEGER
            The leading dimension of the array W. LDW >= max(1,N).
    
    @param[in,out]
    dA      DOUBLE_PRECISION array on the GPU, dimension (LDDA,N)
            The symmetric matrix A on the GPU, kept in sync with A.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).

    @param[out]
    dW      DOUBLE_PRECISION array on the GPU, dimension (LDDW,2*NB)
            Workspace on the GPU holding the matrix W.

    @param[in]
    lddw    INTEGER
            The leading dimension of the array dW.  LDDW >= max(1,N).

    @param
    dwork   (workspace) DOUBLE_PRECISION array on the GPU, dimension (LDWORK)

    @param[in]
    ldwork  INTEGER
            The size of the array dwork.  LDWORK >= LDDA*ceil(N/64).

    Further Details
    ---------------
    If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary
    reflectors

        Q = H(n) H(n-1) . . . H(n-nb+1).

    Each H(i) has the form

        H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i),
    and tau in TAU(i-1).

    If UPLO = MagmaLower, the matrix Q is represented as a product of elementary
    reflectors

        Q = H(1) H(2) . . . H(nb).

    Each H(i) has the form

        H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i),
    and tau in TAU(i).

    The elements of the vectors v together form the n-by-nb matrix V
    which is needed, with W, to apply the transformation to the unreduced
    part of the matrix, using a symmetric rank-2k update of the form:
    A := A - V*W' - W*V'.

    The contents of A on exit are illustrated by the following examples
    with n = 5 and nb = 2:

    if UPLO = MagmaUpper:                       if UPLO = MagmaLower:

        (  a   a   a   v4  v5 )              (  d                  )
        (      a   a   v4  v5 )              (  1   d              )
        (          a   1   v5 )              (  v1  1   a          )
        (              d   1  )              (  v1  v2  a   a      )
        (                  d  )              (  v1  v2  a   a   a  )

    where d denotes a diagonal element of the reduced matrix, a denotes
    an element of the original matrix that is unchanged, and vi denotes
    an element of the vector defining H(i).

    @ingroup magma_dsyev_aux
    ********************************************************************/
extern "C" magma_int_t
magma_dlatrd2(
    magma_uplo_t uplo, magma_int_t n, magma_int_t nb,
    double *A,  magma_int_t lda,
    double *e, double *tau,
    double *W,  magma_int_t ldw,
    magmaDouble_ptr dA, magma_int_t ldda,
    magmaDouble_ptr dW, magma_int_t lddw,
    magmaDouble_ptr dwork, magma_int_t ldwork)
{
    #define A(i_, j_) (A + (i_) + (j_)*lda)
    #define W(i_, j_) (W + (i_) + (j_)*ldw)
    
    #define dA(i_, j_) (dA + (i_) + (j_)*ldda)
    #define dW(i_, j_) (dW + (i_) + (j_)*lddw)

    const double c_neg_one = MAGMA_D_NEG_ONE;
    const double c_one     = MAGMA_D_ONE;
    const double c_zero    = MAGMA_D_ZERO;
    const magma_int_t ione = 1;

    double alpha, value;
    magma_int_t i, i_n, i_1, iw;

    /* Check arguments */
    magma_int_t info = 0;
    if ( uplo != MagmaLower && uplo != MagmaUpper ) {
        info = -1;
    } else if ( n < 0 ) {
        info = -2;
    } else if ( nb < 1 ) {
        info = -3;
    } else if ( lda < max(1,n) ) {
        info = -5;
    } else if ( ldw < max(1,n) ) {
        info = -9;
    } else if ( ldda < max(1,n) ) {
        info = -11;
    } else if ( lddw < max(1,n) ) {
        info = -13;
    } else if ( ldwork < ldda*ceildiv(n,64) ) {
        info = -15;
    }
    
    if (info != 0) {
        magma_xerbla( __func__, -(info) );
        return info;
    }
    
    /* Quick return if possible */
    if (n == 0) {
        return info;
    }

    magma_queue_t stream;
    magma_queue_create( &stream );
    
    double *f;
    magma_dmalloc_cpu( &f, n );
    if ( f == NULL ) {
        info = MAGMA_ERR_HOST_ALLOC;
        return info;
    }
    
    if (uplo == MagmaUpper) {
        /* Reduce last NB columns of upper triangle */
        for (i = n-1; i >= n - nb; --i) {
            i_1 = i + 1;
            i_n = n - i - 1;
            
            iw = i - n + nb;
            if (i < n-1) {
                /* Update A(1:i,i) */
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv( &i_n, W(i, iw+1), &ldw );
                #endif
                blasf77_dgemv( "No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda,
                               W(i, iw+1), &ldw, &c_one, A(0, i), &ione );
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv( &i_n, W(i, iw+1), &ldw );
                lapackf77_dlacgv( &i_n, A(i, i+1),  &lda );
                #endif
                blasf77_dgemv( "No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                               A(i, i+1), &lda, &c_one, A(0, i), &ione );
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv( &i_n, A(i, i+1), &lda );
                #endif
            }
            if (i > 0) {
                /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */
                alpha = *A(i-1, i);
                
                lapackf77_dlarfg( &i, &alpha, A(0, i), &ione, &tau[i - 1] );
                
                e[i-1] = MAGMA_D_REAL( alpha );
                *A(i-1,i) = MAGMA_D_ONE;
                
                /* Compute W(1:i-1,i) */
                // 1. Send the block reflector A(0:i-1, i) to the GPU
                magma_dsetvector_async( i, A(0, i), 1, dA(0, i), 1, stream );
                
                magmablas_dsymv_work( MagmaUpper, i, c_one, dA(0, 0), ldda,
                                      dA(0, i), ione, c_zero, dW(0, iw), ione,
                                      dwork, ldwork, stream );
                
                // 2. Start getting the result back (asynchronously)
                magma_dgetmatrix_async( i, 1,
                                        dW(0, iw), lddw,
                                        W(0, iw),  ldw, stream );
                
                if (i < n-1) {
                    blasf77_dgemv( MagmaConjTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw,
                                   A(0, i), &ione, &c_zero, W(i+1, iw), &ione );
                }
                
                // 3. Here we need dsymv result W(0, iw)
                magma_queue_sync( stream );
                
                if (i < n-1) {
                    blasf77_dgemv( "No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda,
                                   W(i+1, iw), &ione, &c_one, W(0, iw), &ione );
                    
                    blasf77_dgemv( MagmaConjTransStr, &i, &i_n, &c_one, A(0, i+1), &lda,
                                   A(0, i), &ione, &c_zero, W(i+1, iw), &ione );
                    
                    blasf77_dgemv( "No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                                   W(i+1, iw), &ione, &c_one, W(0, iw), &ione );
                }
                
                blasf77_dscal( &i, &tau[i - 1], W(0, iw), &ione );
                
                value = magma_cblas_ddot( i, W(0,iw), ione, A(0,i), ione );
                alpha = tau[i - 1] * -0.5f * value;
                blasf77_daxpy( &i, &alpha, A(0, i), &ione,
                               W(0, iw), &ione );
            }
        }
    }
    else {
        /*  Reduce first NB columns of lower triangle */
        for (i = 0; i < nb; ++i) {
            /* Update A(i:n,i) */
            i_n = n - i;
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv( &i, W(i, 0), &ldw );
            #endif
            blasf77_dgemv( "No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda,
                           W(i, 0), &ldw, &c_one, A(i, i), &ione );
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv( &i, W(i, 0), &ldw );
            lapackf77_dlacgv( &i, A(i, 0), &lda );
            #endif
            blasf77_dgemv( "No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw,
                           A(i, 0), &lda, &c_one, A(i, i), &ione );
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv( &i, A(i, 0), &lda );
            #endif
            
            if (i < n-1) {
                /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */
                i_n = n - i - 1;
                alpha = *A(i+1, i);
                lapackf77_dlarfg( &i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i] );
                e[i] = MAGMA_D_REAL( alpha );
                *A(i+1,i) = MAGMA_D_ONE;
                
                /* Compute W(i+1:n,i) */
                // 1. Send the block reflector  A(i+1:n,i) to the GPU
                magma_dsetvector_async( i_n, A(i+1, i), 1, dA(i+1, i), 1, stream );
                
                magmablas_dsymv_work( MagmaLower, i_n, c_one, dA(i+1, i+1), ldda,
                                      dA(i+1, i), ione, c_zero, dW(i+1, i), ione,
                                      dwork, ldwork, stream );
                
                // 2. Start getting the result back (asynchronously)
                magma_dgetmatrix_async( i_n, 1,
                                        dW(i+1, i), lddw,
                                        W(i+1, i),  ldw, stream );
                
                blasf77_dgemv( MagmaConjTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw,
                               A(i+1, i), &ione, &c_zero, W(0, i), &ione );
                
                blasf77_dgemv( "No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda,
                               W(0, i), &ione, &c_zero, f, &ione );
                
                blasf77_dgemv( MagmaConjTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda,
                               A(i+1, i), &ione, &c_zero, W(0, i), &ione );
                
                // 3. Here we need dsymv result W(i+1, i)
                magma_queue_sync( stream );
                
                if (i != 0)
                    blasf77_daxpy( &i_n, &c_one, f, &ione, W(i+1, i), &ione );
                
                blasf77_dgemv( "No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw,
                               W(0, i), &ione, &c_one, W(i+1, i), &ione );
                blasf77_dscal( &i_n, &tau[i], W(i+1,i), &ione );
                
                value = magma_cblas_ddot( i_n, W(i+1,i), ione, A(i+1,i), ione );
                alpha = tau[i] * -0.5f * value;
                blasf77_daxpy( &i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione );
            }
        }
    }

    magma_free_cpu( f );
    magma_queue_destroy( stream );

    return info;
} /* magma_dlatrd2 */
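/*
   Hedged usage sketch (not part of the original file): how a DSYTRD-style
   caller typically consumes the V and W panels that magma_dlatrd2 returns --
   the symmetric rank-2 update A := A - V*W' - W*V' from Further Details,
   applied to the trailing submatrix on the GPU. The names and the dA/dW
   macros mirror the routine above but are illustrative, not from this file.
*/
#include "magma.h"

static void apply_latrd2_update(
    magma_int_t i, magma_int_t nb, magma_int_t n,
    magmaDouble_ptr dA, magma_int_t ldda,
    magmaDouble_ptr dW, magma_int_t lddw )
{
    #define dA(i_, j_) (dA + (i_) + (j_)*ldda)
    #define dW(i_, j_) (dW + (i_) + (j_)*lddw)

    /* V is stored below the panel in dA(i+nb, i); W comes from dlatrd2 */
    magma_dsyr2k( MagmaLower, MagmaNoTrans, n-i-nb, nb,
                  MAGMA_D_NEG_ONE, dA(i+nb, i),    ldda,
                                   dW(i+nb, 0),    lddw,
                  MAGMA_D_ONE,     dA(i+nb, i+nb), ldda );

    #undef dA
    #undef dW
}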
Beispiel #12
extern "C" magma_int_t
magma_dsygvd(magma_int_t itype, char jobz, char uplo, magma_int_t n,
             double *a, magma_int_t lda, double *b, magma_int_t ldb,
             double *w, double *work, magma_int_t lwork,
             magma_int_t *iwork, magma_int_t liwork, magma_int_t *info)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    DSYGVD computes all the eigenvalues, and optionally, the eigenvectors
    of a real generalized symmetric-definite eigenproblem, of the form
    A*x=(lambda)*B*x,  A*B*x=(lambda)*x,  or B*A*x=(lambda)*x.  Here A and
    B are assumed to be symmetric and B is also positive definite.
    If eigenvectors are desired, it uses a divide and conquer algorithm.

    The divide and conquer algorithm makes very mild assumptions about
    floating point arithmetic. It will work on machines with a guard
    digit in add/subtract, or on those binary machines without guard
    digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
    Cray-2. It could conceivably fail on hexadecimal or decimal machines
    without guard digits, but we know of none.

    Arguments
    =========
    ITYPE   (input) INTEGER
            Specifies the problem type to be solved:
            = 1:  A*x = (lambda)*B*x
            = 2:  A*B*x = (lambda)*x
            = 3:  B*A*x = (lambda)*x

    JOBZ    (input) CHARACTER*1
            = 'N':  Compute eigenvalues only;
            = 'V':  Compute eigenvalues and eigenvectors.

    UPLO    (input) CHARACTER*1
            = 'U':  Upper triangles of A and B are stored;
            = 'L':  Lower triangles of A and B are stored.

    N       (input) INTEGER
            The order of the matrices A and B.  N >= 0.

    A       (input/output) DOUBLE PRECISION array, dimension (LDA, N)
            On entry, the symmetric matrix A.  If UPLO = 'U', the
            leading N-by-N upper triangular part of A contains the
            upper triangular part of the matrix A.  If UPLO = 'L',
            the leading N-by-N lower triangular part of A contains
            the lower triangular part of the matrix A.

            On exit, if JOBZ = 'V', then if INFO = 0, A contains the
            matrix Z of eigenvectors.  The eigenvectors are normalized
            as follows:
            if ITYPE = 1 or 2, Z**T *   B    * Z = I;
            if ITYPE = 3,      Z**T * inv(B) * Z = I.
            If JOBZ = 'N', then on exit the upper triangle (if UPLO='U')
            or the lower triangle (if UPLO='L') of A, including the
            diagonal, is destroyed.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    B       (input/output) DOUBLE PRECISION array, dimension (LDB, N)
            On entry, the symmetric matrix B.  If UPLO = 'U', the
            leading N-by-N upper triangular part of B contains the
            upper triangular part of the matrix B.  If UPLO = 'L',
            the leading N-by-N lower triangular part of B contains
            the lower triangular part of the matrix B.

            On exit, if INFO <= N, the part of B containing the matrix is
            overwritten by the triangular factor U or L from the Cholesky
            factorization B = U**T * U or B = L * L**T.

    LDB     (input) INTEGER
            The leading dimension of the array B.  LDB >= max(1,N).

    W       (output) DOUBLE PRECISION array, dimension (N)
            If INFO = 0, the eigenvalues in ascending order.

    WORK    (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    LWORK   (input) INTEGER
            The length of the array WORK.
            If N <= 1,                LWORK >= 1.
            If JOBZ  = 'N' and N > 1, LWORK >= 2*N + N*NB.
            If JOBZ  = 'V' and N > 1, LWORK >= max( 2*N + N*NB, 1 + 6*N + 2*N**2 ).
            NB can be obtained through magma_get_dsytrd_nb(N).

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal sizes of the WORK and IWORK
            arrays, returns these values as the first entries of the WORK
            and IWORK arrays, and no error message related to LWORK or
            LIWORK is issued by XERBLA.

    IWORK   (workspace/output) INTEGER array, dimension (MAX(1,LIWORK))
            On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK.

    LIWORK  (input) INTEGER
            The dimension of the array IWORK.
            If N <= 1,                LIWORK >= 1.
            If JOBZ  = 'N' and N > 1, LIWORK >= 1.
            If JOBZ  = 'V' and N > 1, LIWORK >= 3 + 5*N.

            If LIWORK = -1, then a workspace query is assumed; the
            routine only calculates the optimal sizes of the WORK and
            IWORK arrays, returns these values as the first entries of
            the WORK and IWORK arrays, and no error message related to
            LWORK or LIWORK is issued by XERBLA.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
            > 0:  DPOTRF or DSYEVD returned an error code:
               <= N:  if INFO = i and JOBZ = 'N', then the algorithm
                      failed to converge; i off-diagonal elements of an
                      intermediate tridiagonal form did not converge to
                      zero;
                      if INFO = i and JOBZ = 'V', then the algorithm
                      failed to compute an eigenvalue while working on
                      the submatrix lying in rows and columns INFO/(N+1)
                      through mod(INFO,N+1);
               > N:   if INFO = N + i, for 1 <= i <= N, then the leading
                      minor of order i of B is not positive definite.
                      The factorization of B could not be completed and
                      no eigenvalues or eigenvectors were computed.

    Further Details
    ===============
    Based on contributions by
       Mark Fahey, Department of Mathematics, Univ. of Kentucky, USA

    Modified so that no backsubstitution is performed if DSYEVD fails to
    converge (NEIG in old code could be greater than N causing out of
    bounds reference to A - reported by Ralf Meyer).  Also corrected the
    description of INFO and the test on ITYPE. Sven, 16 Feb 05.
    =====================================================================  */

    char uplo_[2] = {uplo, 0};
    char jobz_[2] = {jobz, 0};

    double d_one = MAGMA_D_ONE;

    double *da;
    double *db;
    magma_int_t ldda = n;
    magma_int_t lddb = n;

    magma_int_t lower;
    char trans[1];
    magma_int_t wantz, lquery;

    magma_int_t lwmin, liwmin;

    magma_queue_t stream;
    magma_queue_create( &stream );

    wantz = lapackf77_lsame(jobz_, MagmaVecStr);
    lower = lapackf77_lsame(uplo_, MagmaLowerStr);
    lquery = lwork == -1 || liwork == -1;

    *info = 0;
    if (itype < 1 || itype > 3) {
        *info = -1;
    } else if (! (wantz || lapackf77_lsame(jobz_, MagmaNoVecStr))) {
        *info = -2;
    } else if (! (lower || lapackf77_lsame(uplo_, MagmaUpperStr))) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (lda < max(1,n)) {
        *info = -6;
    } else if (ldb < max(1,n)) {
        *info = -8;
    }

    magma_int_t nb = magma_get_dsytrd_nb( n );
    if ( n <= 1 ) {
        lwmin  = 1;
        liwmin = 1;
    }
    else if ( wantz ) {
        lwmin  = max( 2*n + n*nb, 1 + 6*n + 2*n*n );
        liwmin = 3 + 5*n;
    }
    else {
        lwmin  = 2*n + n*nb;
        liwmin = 1;
    }
    // multiply by 1+eps to ensure length gets rounded up,
    // if it cannot be exactly represented in floating point.
    work[0]  = lwmin * (1. + lapackf77_dlamch("Epsilon"));
    iwork[0] = liwmin;

    if (lwork < lwmin && ! lquery) {
        *info = -11;
    } else if (liwork < liwmin && ! lquery) {
        *info = -13;
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return MAGMA_ERR_ILLEGAL_VALUE;
    }
    else if (lquery) {
        return MAGMA_SUCCESS;
    }

    /*  Quick return if possible */
    if (n == 0) {
        return 0;
    }
    /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */
    if (n <= 128){
        #ifdef ENABLE_DEBUG
        printf("--------------------------------------------------------------\n");
        printf("  warning matrix too small N=%d NB=%d, calling lapack on CPU  \n", (int) n, (int) nb);
        printf("--------------------------------------------------------------\n");
        #endif
        lapackf77_dsygvd(&itype, jobz_, uplo_,
                         &n, a, &lda, b, &ldb,
                         w, work, &lwork,
                         iwork, &liwork, info);
        return *info;
    }

    if (MAGMA_SUCCESS != magma_dmalloc( &da, n*ldda ) ||
        MAGMA_SUCCESS != magma_dmalloc( &db, n*lddb )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    /* Form a Cholesky factorization of B. */
    magma_dsetmatrix( n, n, b, ldb, db, lddb );
    magma_dsetmatrix_async( n, n,
                            a,  lda,
                            da, ldda, stream );

#ifdef ENABLE_TIMER
    magma_timestr_t start, end;
    start = get_current_time();
#endif
    magma_dpotrf_gpu(uplo, n, db, lddb, info);
    if (*info != 0) {
        *info = n + *info;
        return *info;
    }
#ifdef ENABLE_TIMER
    end = get_current_time();
    printf("time dpotrf_gpu = %6.2f\n", GetTimerValue(start,end)/1000.);
#endif

    magma_queue_sync( stream );
    magma_dgetmatrix_async( n, n,
                            db, lddb,
                            b,  ldb, stream );

#ifdef ENABLE_TIMER
    start = get_current_time();
#endif
    /*  Transform problem to standard eigenvalue problem and solve. */
    magma_dsygst_gpu(itype, uplo, n, da, ldda, db, lddb, info); 
#ifdef ENABLE_TIMER
    end = get_current_time();
    printf("time dsygst_gpu = %6.2f\n", GetTimerValue(start,end)/1000.);
#endif

    /* Simple fix to be able to run larger sizes: free db for now.
     * A proper fix would allocate a dwork here that is used as db
     * and then passed to dsyevd. */
    if(n > 5000){
        magma_queue_sync( stream );
        magma_free( db );
    }

#ifdef ENABLE_TIMER
    start = get_current_time();
#endif
    magma_dsyevd_gpu(jobz, uplo, n, da, ldda, w, a, lda,
                     work, lwork, iwork, liwork, info);
#ifdef ENABLE_TIMER
    end = get_current_time();
    printf("time dsyevd_gpu = %6.2f\n", GetTimerValue(start,end)/1000.);
#endif

    if (wantz && *info == 0) {
#ifdef ENABLE_TIMER
        start = get_current_time();
#endif
        /* allocate and copy db back */
        if(n > 5000){
            if (MAGMA_SUCCESS != magma_dmalloc( &db, n*lddb ) ){
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }
            magma_dsetmatrix( n, n, b, ldb, db, lddb );
        }
        /* Backtransform eigenvectors to the original problem. */
        if (itype == 1 || itype == 2) {
            /* For A*x=(lambda)*B*x and A*B*x=(lambda)*x;
               backtransform eigenvectors: x = inv(L)'*y or inv(U)*y */
            if (lower) {
                *(unsigned char *)trans = MagmaTrans;
            } else {
                *(unsigned char *)trans = MagmaNoTrans;
            }
            magma_dtrsm(MagmaLeft, uplo, *trans, MagmaNonUnit,
                        n, n, d_one, db, lddb, da, ldda);
        }
        else if (itype == 3) {
            /*  For B*A*x=(lambda)*x;
                backtransform eigenvectors: x = L*y or U'*y */
            if (lower) {
                *(unsigned char *)trans = MagmaNoTrans;
            } else {
                *(unsigned char *)trans = MagmaTrans;
            }

            magma_dtrmm(MagmaLeft, uplo, *trans, MagmaNonUnit,
                        n, n, d_one, db, lddb, da, ldda);
        }
        magma_dgetmatrix( n, n, da, ldda, a, lda );
#ifdef ENABLE_TIMER
        end = get_current_time();
        printf("time dtrsm/mm + getmatrix = %6.2f\n", GetTimerValue(start,end)/1000.);
#endif
        /* free db */
        if(n > 5000){        
            magma_free( db );
        }
    }

    magma_queue_sync( stream );
    magma_queue_destroy( stream );

    work[0]  = lwmin * (1. + lapackf77_dlamch("Epsilon"));  // round up
    iwork[0] = liwmin;

    magma_free( da );
    if(n <= 5000){
        magma_free( db );
    }

    return MAGMA_SUCCESS;
} /* magma_dsygvd */
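/*
   Hedged usage sketch (not part of the original file): the two-phase
   workspace query described in the LWORK/LIWORK documentation above.
   The first call, with lwork = liwork = -1, only fills work[0] and
   iwork[0] with the optimal sizes. All names here are illustrative.
*/
#include <stdlib.h>
#include "magma.h"

static magma_int_t solve_dsygvd( magma_int_t n, double *A, double *B, double *w )
{
    magma_int_t info, liwork = -1, iwk;
    double wk;
    /* phase 1: workspace query */
    magma_dsygvd( 1, 'V', 'L', n, A, n, B, n, w, &wk, -1, &iwk, liwork, &info );
    magma_int_t lwork = (magma_int_t) wk;
    liwork = iwk;
    double      *work  = (double*)      malloc( lwork  * sizeof(double) );
    magma_int_t *iwork = (magma_int_t*) malloc( liwork * sizeof(magma_int_t) );
    /* phase 2: solve A*x = (lambda)*B*x for all eigenpairs */
    magma_dsygvd( 1, 'V', 'L', n, A, n, B, n, w, work, lwork, iwork, liwork, &info );
    free( work );
    free( iwork );
    return info;
}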
Beispiel #13
/**
    Purpose
    -------
    DPOTRF computes the Cholesky factorization of a real symmetric
    positive definite matrix A. This version does not require work
    space on the GPU passed as input. GPU memory is allocated in the
    routine.

    The factorization has the form
        A = U**H * U,  if uplo = MagmaUpper, or
        A = L  * L**H, if uplo = MagmaLower,
    where U is an upper triangular matrix and L is lower triangular.

    This is the block version of the algorithm, calling Level 3 BLAS.

    This uses multiple queues to overlap communication and computation.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of A is stored;
      -     = MagmaLower:  Lower triangle of A is stored.

    @param[in]
    n       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in,out]
    A       DOUBLE PRECISION array, dimension (LDA,N)
            On entry, the symmetric matrix A.  If uplo = MagmaUpper, the leading
            N-by-N upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If uplo = MagmaLower, the
            leading N-by-N lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
    \n
            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization A = U**H * U or A = L * L**H.
    \n
            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as a failed memory allocation.
      -     > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.

    @ingroup magma_dposv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_dpotrf(
    magma_uplo_t uplo, magma_int_t n,
    double *A, magma_int_t lda,
    magma_int_t *info )
{
    #define  A(i_, j_)  (A + (i_) + (j_)*lda)
    
    #ifdef HAVE_clBLAS
    #define dA(i_, j_)  dA, ((i_) + (j_)*ldda)
    #else
    #define dA(i_, j_) (dA + (i_) + (j_)*ldda)
    #endif
    
    /* Constants */
    const double c_one     = MAGMA_D_ONE;
    const double c_neg_one = MAGMA_D_NEG_ONE;
    const double d_one     =  1.0;
    const double d_neg_one = -1.0;
    
    /* Local variables */
    const char* uplo_ = lapack_uplo_const( uplo );
    bool upper = (uplo == MagmaUpper);
    
    magma_int_t j, jb, ldda, nb;
    magmaDouble_ptr dA = NULL;
    
    /* Check arguments */
    *info = 0;
    if (! upper && uplo != MagmaLower) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,n)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    
    /* Quick return */
    if ( n == 0 )
        return *info;
    
    nb = magma_get_dpotrf_nb( n );
    
    if (nb <= 1 || nb >= n) {
        lapackf77_dpotrf( uplo_, &n, A, &lda, info );
    }
    else {
        /* Use hybrid blocked code. */
        ldda = magma_roundup( n, 32 );
        
        magma_int_t ngpu = magma_num_gpus();
        if ( ngpu > 1 ) {
            /* call multi-GPU non-GPU-resident interface */
            return magma_dpotrf_m( ngpu, uplo, n, A, lda, info );
        }
        
        if (MAGMA_SUCCESS != magma_dmalloc( &dA, n*ldda )) {
            /* alloc failed so call the non-GPU-resident version */
            return magma_dpotrf_m( ngpu, uplo, n, A, lda, info );
        }
        
        magma_queue_t queues[2] = { NULL, NULL };
        magma_device_t cdev;
        magma_getdevice( &cdev );
        magma_queue_create( cdev, &queues[0] );
        magma_queue_create( cdev, &queues[1] );
        
        if (upper) {
            /* Compute the Cholesky factorization A = U'*U. */
            for (j=0; j < n; j += nb) {
                /* Update and factorize the current diagonal block and test
                   for non-positive-definiteness. */
                jb = min( nb, n-j );
                magma_dsetmatrix_async( jb, n-j,
                                         A(j, j), lda,
                                        dA(j, j), ldda, queues[1] );
                
                magma_dsyrk( MagmaUpper, MagmaConjTrans, jb, j,
                             d_neg_one, dA(0, j), ldda,
                             d_one,     dA(j, j), ldda, queues[1] );
                magma_queue_sync( queues[1] );
                
                magma_dgetmatrix_async( jb, jb,
                                        dA(j, j), ldda,
                                         A(j, j), lda, queues[0] );
                
                if (j+jb < n) {
                    magma_dgemm( MagmaConjTrans, MagmaNoTrans,
                                 jb, n-j-jb, j,
                                 c_neg_one, dA(0, j   ), ldda,
                                            dA(0, j+jb), ldda,
                                 c_one,     dA(j, j+jb), ldda, queues[1] );
                }
                
                magma_queue_sync( queues[0] );
                
                // this could be on any queue; it isn't needed until exit.
                magma_dgetmatrix_async( j, jb,
                                        dA(0, j), ldda,
                                         A(0, j), lda, queues[0] );
                
                lapackf77_dpotrf( MagmaUpperStr, &jb, A(j, j), &lda, info );
                if (*info != 0) {
                    *info = *info + j;
                    break;
                }
                magma_dsetmatrix_async( jb, jb,
                                         A(j, j), lda,
                                        dA(j, j), ldda, queues[0] );
                magma_queue_sync( queues[0] );
                
                if (j+jb < n) {
                    magma_dtrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                 jb, n-j-jb,
                                 c_one, dA(j, j   ), ldda,
                                        dA(j, j+jb), ldda, queues[1] );
                }
            }
        }
        else {


            //used for timing CPU and GPU
            int iter = 0;
            float cpu_time = 0.0;
            float gpu_time = 0.0;

            double gpu_iter1_low = 2103.143311;
            double gpu_iter1_high = 754.506104;
            double cpu_iter1_low = 794.636108;
            double cpu_iter1_high = 600.295227;

            double gpu_pred_high = gpu_iter1_high;
            double gpu_pred_low = gpu_iter1_low;
            double cpu_pred_high = cpu_iter1_high;
            double cpu_pred_low = cpu_iter1_low;

            double ratio_split_freq = 0;
            double time_until_interrupt = 0;

            cudaEvent_t start_cpu, stop_cpu;
            cudaEvent_t start_gpu, stop_gpu;

            // switches for different modes
            bool timing = false; // for initial tuning only; greatly impacts performance
            bool dvfs = false;   // turn on DVFS energy saving
            bool relax = false;  // turn on the relaxed scheme
            bool r2h = false;    // turn on race-to-halt

            // these parameters need to be tuned in future work
            double dvfs_coverage = 0.5;
            double prediction_offset_gpu = 0.65;
            double prediction_offset_cpu = 0.65;

            //=========================================================
            // Compute the Cholesky factorization A = L*L'.
            for (j=0; j < n; j += nb) {
                //  Update and factorize the current diagonal block and test
                //  for non-positive-definiteness.
                jb = min( nb, n-j );
                magma_dsetmatrix_async( n-j, jb,
                                         A(j, j), lda,
                                        dA(j, j), ldda, queues[1] );
                
                magma_dsyrk( MagmaLower, MagmaNoTrans, jb, j,
                             d_neg_one, dA(j, 0), ldda,
                             d_one,     dA(j, j), ldda, queues[1] );
                magma_queue_sync( queues[1] );
                
                magma_dgetmatrix_async( jb, jb,
                                        dA(j,j), ldda,
                                         A(j,j), lda, queues[0] );
                
                if (timing) {
                    //start gpu timing
                    cudaEventCreate(&start_gpu);
                    cudaEventCreate(&stop_gpu);
                    cudaEventRecord(start_gpu, 0);
                }
                if (j+jb < n) {
                    magma_dgemm( MagmaNoTrans, MagmaConjTrans,
                                 n-j-jb, jb, j,
                                 c_neg_one, dA(j+jb, 0), ldda,
                                            dA(j,    0), ldda,
                                 c_one,     dA(j+jb, j), ldda, queues[1] );
                }


                // slack prediction: per iteration the CPU panel loses one
                // block (time scales by the ratio), while the GPU trailing
                // update shrinks in both dimensions (time scales by its square)
                double ratio_slack_pred = 1.0 - (double)nb/(n-iter*nb);
                cpu_pred_high = cpu_pred_high * ratio_slack_pred;
                cpu_pred_low = cpu_pred_low * ratio_slack_pred;
                gpu_pred_high = gpu_pred_high * ratio_slack_pred * ratio_slack_pred;
                gpu_pred_low = gpu_pred_low * ratio_slack_pred * ratio_slack_pred;

                if (timing) {
                    printf("iter:%d GPU time pred:%f\n", iter, gpu_pred_high);
                    printf("iter:%d CPU time pred:%f\n", iter, cpu_pred_high);
                }

                if (iter < dvfs_coverage*(n/nb)) {
                    if (cpu_pred_high > gpu_pred_high) { //slack on GPU
                        ratio_split_freq = (cpu_pred_high - gpu_pred_high) / (gpu_pred_high * ((gpu_iter1_low / gpu_iter1_high) - 1));
                        time_until_interrupt = gpu_pred_low * ratio_split_freq;
                         //printf("iter:%d time_until_interrupt:%f\n", iter, time_until_interrupt);
                        // printf("iter:%d ratio_split_freq:%f\n", iter, ratio_split_freq);
                        if (dvfs) {
                            if ((!relax) || (relax && ratio_split_freq > 0.05)) {
                                if (ratio_split_freq < 1)
                                    dvfs_adjust(time_until_interrupt*prediction_offset_gpu, 'g');
                                else
                                    dvfs_adjust(cpu_pred_high, 'g');
                            }
                        } else if (r2h) {
                            r2h_adjust(gpu_pred_high, cpu_pred_high - gpu_pred_high, 'g');
                        }
                    } else { //slack on CPU
                        ratio_split_freq = (gpu_pred_high - cpu_pred_high) / (cpu_pred_high * ((cpu_iter1_low / cpu_iter1_high) - 1));
                        time_until_interrupt = cpu_pred_low * ratio_split_freq;
                        if (dvfs) {
                            if ((!relax) || (relax && ratio_split_freq > 0.05)) {
                                if (ratio_split_freq < 1)
                                    dvfs_adjust(time_until_interrupt*prediction_offset_cpu, 'c');
                                else
                                    dvfs_adjust(gpu_pred_high, 'c');
                            }
                        } else if (r2h) {
                            r2h_adjust(cpu_pred_high, gpu_pred_high - cpu_pred_high, 'c');
                        }
                    }
                }


                if (timing) {
                    //end gpu timing
                    cudaEventRecord(stop_gpu, 0);
                    cudaEventSynchronize(stop_gpu);
                    cudaEventElapsedTime(&gpu_time, start_gpu, stop_gpu);
                    cudaEventDestroy(start_gpu);
                    cudaEventDestroy(stop_gpu);

                    //printf("iter:%d GPU time:%f\n", iter, gpu_time);
                }
                magma_queue_sync( queues[0] );
                
                // this could be on any queue; it isn't needed until exit.
                magma_dgetmatrix_async( jb, j,
                                        dA(j, 0), ldda,
                                         A(j, 0), lda, queues[0] );
                
                if (timing) {
                    //start cpu timing
                    cudaEventCreate(&start_cpu);
                    cudaEventCreate(&stop_cpu);
                    cudaEventRecord(start_cpu, 0);
                }

                lapackf77_dpotrf( MagmaLowerStr, &jb, A(j, j), &lda, info );

                if (timing) {
                    //end cpu timing
                    cudaEventRecord(stop_cpu, 0);
                    cudaEventSynchronize(stop_cpu);
                    cudaEventElapsedTime(&cpu_time, start_cpu, stop_cpu);
                    cudaEventDestroy(start_cpu);
                    cudaEventDestroy(stop_cpu);
                    // printf("iter:%d CPU time:%f\n", iter, cpu_time);
                    // if (gpu_time < cpu_time) {
                    //     printf("slack: +\n");
                    // } else {
                    //     printf("slack: -\n");
                    // }
                }

                
                if (*info != 0) {
                    *info = *info + j;
                    break;
                }
                magma_dsetmatrix_async( jb, jb,
                                         A(j, j), lda,
                                        dA(j, j), ldda, queues[0] );
                magma_queue_sync( queues[0] );
                
                if (j+jb < n) {
                    magma_dtrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                 n-j-jb, jb,
                                 c_one, dA(j,    j), ldda,
                                        dA(j+jb, j), ldda, queues[1] );
                }

                // advance the prediction counter, as the dgeqrf version below
                // does; without this the slack predictions never move past
                // their first-iteration values
                iter++;
            }
        }
        magma_queue_destroy( queues[0] );
        magma_queue_destroy( queues[1] );
        
        magma_free( dA );
    }
    
    return *info;
} /* magma_dpotrf */
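/*
   Hedged sketch (not part of the original file) of the slack-prediction
   arithmetic used in the DVFS branch above. Per blocked iteration the CPU
   panel factorization loses one block (its time scales by the ratio), while
   the GPU trailing update loses a block in both dimensions (its time scales
   by the square); the remaining gap decides what fraction of the faster
   side's work can run at low frequency. The constants are the iter-1
   timings quoted in the routine above; the low-frequency predictions and
   everything else here are simplified and illustrative.
*/
#include <stdio.h>

int main( void )
{
    double n = 10240, nb = 128;
    /* iter-1 reference timings from the routine above */
    const double gpu1_high = 754.506104, gpu1_low = 2103.143311;
    const double cpu1_high = 600.295227, cpu1_low = 794.636108;
    double gpu_high = gpu1_high, cpu_high = cpu1_high;

    for (int iter = 0; iter < 4; ++iter) {
        double r = 1.0 - nb/(n - iter*nb);
        cpu_high *= r;       /* CPU panel: one dimension shrinks   */
        gpu_high *= r*r;     /* GPU update: both dimensions shrink */
        double split;
        if (cpu_high > gpu_high)   /* slack on the GPU side */
            split = (cpu_high - gpu_high)
                  / (gpu_high * (gpu1_low/gpu1_high - 1.0));
        else                       /* slack on the CPU side */
            split = (gpu_high - cpu_high)
                  / (cpu_high * (cpu1_low/cpu1_high - 1.0));
        printf( "iter %d: low-frequency fraction %.3f\n", iter, split );
    }
    return 0;
}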
magma_err_t
magma_dgeqrf2_2q_gpu( magma_int_t m, magma_int_t n,
        magmaDouble_ptr dA, size_t dA_offset, magma_int_t ldda,
        double *tau, magma_err_t *info,
        magma_queue_t* queues)
{
    /*  -- clMAGMA (version 1.1.0) --
        Univ. of Tennessee, Knoxville
        Univ. of California, Berkeley
        Univ. of Colorado, Denver
        @date January 2014

        Purpose
        =======
        DGEQRF computes a QR factorization of a real M-by-N matrix A:
        A = Q * R.

        Arguments
        =========
        M       (input) INTEGER
        The number of rows of the matrix A.  M >= 0.

        N       (input) INTEGER
        The number of columns of the matrix A.  N >= 0.

        dA      (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N)
        On entry, the M-by-N matrix dA.
        On exit, the elements on and above the diagonal of the array
        contain the min(M,N)-by-N upper trapezoidal matrix R (R is
        upper triangular if m >= n); the elements below the diagonal,
        with the array TAU, represent the orthogonal matrix Q as a
        product of min(m,n) elementary reflectors (see Further
        Details).

        LDDA    (input) INTEGER
        The leading dimension of the array dA.  LDDA >= max(1,M).
        To benefit from coalesced memory accesses, LDDA must be
        divisible by 16.

        TAU     (output) DOUBLE_PRECISION array, dimension (min(M,N))
        The scalar factors of the elementary reflectors (see Further
        Details).

        INFO    (output) INTEGER
        = 0:  successful exit
        < 0:  if INFO = -i, the i-th argument had an illegal value
        if INFO = -9, internal GPU memory allocation failed.

        Further Details
        ===============

        The matrix Q is represented as a product of elementary reflectors

        Q = H(1) H(2) . . . H(k), where k = min(m,n).

        Each H(i) has the form

        H(i) = I - tau * v * v'

        where tau is a real scalar, and v is a real vector with
        v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
        and tau in TAU(i).
        =====================================================================    */

#define dA(a_1,a_2)    dA, (dA_offset + (a_1) + (a_2)*(ldda))
#define work_ref(a_1)  ( work + (a_1))
#define hwork          ( work + (nb)*(m))

    magmaDouble_ptr dwork;
    double  *work;

    magma_int_t i, k, ldwork, lddwork, old_i, old_ib, rows;
    magma_int_t nbmin, nx, ib, nb;
    magma_int_t lhwork, lwork;

    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    k = min(m,n);
    if (k == 0)
        return MAGMA_SUCCESS;

    nb = magma_get_dgeqrf_nb(m);

    lwork  = (m+n) * nb;
    lhwork = lwork - (m)*nb;


    if ( MAGMA_SUCCESS != magma_dmalloc( &dwork, n*nb )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    /*
       if ( MAGMA_SUCCESS != magma_dmalloc_cpu( &work, lwork ) ) {
     *info = MAGMA_ERR_HOST_ALLOC;
     magma_free( dwork );
     return *info;
     }
     */

    cl_mem buffer = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(double)*lwork, NULL, NULL);
    work = (double*)clEnqueueMapBuffer(queues[0], buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, lwork*sizeof(double), 0, NULL, NULL, NULL);
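    // work now points at pinned (page-locked) host memory backing buffer;
    // it is unmapped and released again at the end of this routine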

    nbmin = 2;
    nx    = 2*nb;
    ldwork = m;
    lddwork= n;

    if (nb >= nbmin && nb < k && nx < k) {
        /* Use blocked code initially */
        old_i = 0; old_ib = nb;
        for (i = 0; i < k-nx; i += nb) {
            ib = min(k-i, nb);
            rows = m - i;
            magma_dgetmatrix_async(rows, ib, dA(i, i), ldda, work_ref(i), 0, ldwork, queues[0], NULL);
            clFlush(queues[0]);  
            if (i>0){
                /* Apply H' to A(i:m,i+2*ib:n) from the left */
                magma_dlarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise,
                        m-old_i, n-old_i-2*old_ib, old_ib,
                        dA(old_i, old_i         ), ldda, dwork,0,      lddwork,
                        dA(old_i, old_i+2*old_ib), ldda, dwork,old_ib, lddwork, queues[1]);

                magma_dsetmatrix_async( old_ib, old_ib, work_ref(old_i), 0, ldwork,
                        dA(old_i, old_i), ldda, queues[1], NULL); 
                clFlush(queues[1]);
            }

            magma_queue_sync(queues[0]);
            lapackf77_dgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info);


            /* Form the triangular factor of the block reflector
               H = H(i) H(i+1) . . . H(i+ib-1) */
            lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, 
                    &rows, &ib, 
                    work_ref(i), &ldwork, tau+i, hwork, &ib);

            dpanel_to_q( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib );
            magma_dsetmatrix(rows, ib, work_ref(i), 0, ldwork, dA(i,i), ldda, queues[0]);
            dq_to_panel( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib );

            if (i + ib < n) 
            {
                magma_dsetmatrix(ib, ib, hwork, 0, ib, dwork, 0, lddwork, queues[1]);

                if (i+nb < k-nx){
                    /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */
                    magma_dlarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise,
                            rows, ib, ib, 
                            dA(i, i   ), ldda, dwork,0,  lddwork, 
                            dA(i, i+ib), ldda, dwork,ib, lddwork, queues[1]);
                    magma_queue_sync(queues[1]);
                }else {
                    magma_dlarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise,
                            rows, n-i-ib, ib, 
                            dA(i, i   ), ldda, dwork,0,  lddwork, 
                            dA(i, i+ib), ldda, dwork,ib, lddwork, queues[1]);
                    magma_dsetmatrix(ib, ib, work_ref(i), 0, ldwork, dA(i,i), ldda, queues[1]);
                    clFlush(queues[1]);
                }
                old_i  = i;
                old_ib = ib;
            }
        }
    } else {
        i = 0;
    }

    magma_free(dwork);

    /* Use unblocked code to factor the last or only block. */
    if (i < k) {
        ib   = n-i;
        rows = m-i;
        magma_dgetmatrix(rows, ib, dA(i, i), ldda, work, 0, rows, queues[0]);

        lhwork = lwork - rows*ib;
        lapackf77_dgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info);

        magma_dsetmatrix(rows, ib, work, 0, rows, dA(i, i), ldda, queues[0]);
    }

    clEnqueueUnmapMemObject(queues[0], buffer, work, 0, NULL, NULL);
    clReleaseMemObject(buffer);
    //    magma_free_cpu(work);

    return *info;
} /* magma_dgeqrf2_gpu */
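/*
   Hedged sketch (not part of the original file) of the pinned-host-buffer
   pattern used in magma_dgeqrf2_2q_gpu above: instead of a plain malloc,
   the panel workspace is a CL_MEM_ALLOC_HOST_PTR buffer mapped into the
   host address space, so host<->device panel transfers go through
   page-locked memory. The context/queue handles are illustrative.
*/
#include <CL/cl.h>

static double* map_pinned_workspace( cl_context ctx, cl_command_queue queue,
                                     size_t count, cl_mem *buffer_out )
{
    cl_mem buf = clCreateBuffer( ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                                 count * sizeof(double), NULL, NULL );
    /* blocking map: returns a host pointer backed by the pinned allocation */
    double *ptr = (double*) clEnqueueMapBuffer( queue, buf, CL_TRUE,
                                                CL_MAP_READ | CL_MAP_WRITE,
                                                0, count * sizeof(double),
                                                0, NULL, NULL, NULL );
    *buffer_out = buf;
    return ptr;
}

/* release with the mirror calls used at the end of the routine above:
   clEnqueueUnmapMemObject( queue, buf, ptr, 0, NULL, NULL );
   clReleaseMemObject( buf );                                          */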
Beispiel #15
/**
    Purpose
    -------
    DGEQRF computes a QR factorization of a DOUBLE PRECISION M-by-N matrix A:
    A = Q * R. This version does not require work space on the GPU
    passed as input. GPU memory is allocated in the routine.

    This uses 2 queues to overlap communication and computation.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    A       DOUBLE PRECISION array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further
            Details).
    \n
            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    @param[out]
    tau     DOUBLE PRECISION array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    @param[out]
    work    (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.
    \n
            Higher performance is achieved if WORK is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.  LWORK >= max( N*NB, 2*NB*NB ),
            where NB can be obtained through magma_get_dgeqrf_nb( M, N ).
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as a failed memory allocation.

    Further Details
    ---------------
    The matrix Q is represented as a product of elementary reflectors

        Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

        H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).

    @ingroup magma_dgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_dgeqrf(
    magma_int_t m, magma_int_t n,
    double *A,    magma_int_t lda,
    double *tau,
    double *work, magma_int_t lwork,
    magma_int_t *info )
{
    #define  A(i_,j_)  (A + (i_) + (j_)*lda)
    
    #ifdef HAVE_clBLAS
    #define dA(i_,j_)  dA,    ((i_) + (j_)*ldda + dA_offset)
    #define dT(i_,j_)  dT,    ((i_) + (j_)*nb   + dT_offset)
    #define dwork(i_)  dwork, ((i_)             + dwork_offset)
    #else
    #define dA(i_,j_) (dA    + (i_) + (j_)*ldda)
    #define dT(i_,j_) (dT    + (i_) + (j_)*nb)
    #define dwork(i_) (dwork + (i_))
    #endif
    
    /* Constants */
    const double c_one = MAGMA_D_ONE;
    
    /* Local variables */
    magmaDouble_ptr dA, dT, dwork;
    magma_int_t i, ib, min_mn, ldda, lddwork, old_i, old_ib;
    
    /* Function Body */
    *info = 0;
    magma_int_t nb = magma_get_dgeqrf_nb( m, n );
    
    // need 2*nb*nb to store T and upper triangle of V simultaneously
    magma_int_t lwkopt = max( n*nb, 2*nb*nb );
    work[0] = magma_dmake_lwork( lwkopt );
    bool lquery = (lwork == -1);
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,m)) {
        *info = -4;
    } else if (lwork < max(1, lwkopt) && ! lquery) {
        *info = -7;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;
    
    min_mn = min( m, n );
    if (min_mn == 0) {
        work[0] = c_one;
        return *info;
    }
    
    // largest N for larfb is n-nb (trailing matrix lacks 1st panel)
    lddwork = magma_roundup( n, 32 ) - nb;
    ldda    = magma_roundup( m, 32 );
    
    magma_int_t ngpu = magma_num_gpus();
    if ( ngpu > 1 ) {
        /* call multiple-GPU interface  */
        return magma_dgeqrf_m( ngpu, m, n, A, lda, tau, work, lwork, info );
    }
    
    // allocate space for dA, dwork, and dT
    if (MAGMA_SUCCESS != magma_dmalloc( &dA, n*ldda + nb*lddwork + nb*nb )) {
        /* alloc failed so call non-GPU-resident version */
        return magma_dgeqrf_ooc( m, n, A, lda, tau, work, lwork, info );
    }
    
    dwork = dA + n*ldda;
    dT    = dA + n*ldda + nb*lddwork;
    
    magma_queue_t queues[2];
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queues[0] );
    magma_queue_create( cdev, &queues[1] );
    

    //used for timing CPU and GPU
    int iter = 0;
    float cpu_time = 0.0;
    float gpu_time = 0.0;

    int affinity = map_cpu(0);
    if(affinity != 0)
    {
        printf("affinity failed\n");
        return -1;
    }


    magma_set_lapack_numthreads(1);

    // initial frequency setting; ideally this would be chosen automatically in the future
    // SetGPUFreq(324, 324);
    // system("echo 1200000 > /sys/devices/system/cpu/cpu0/cpufreq/scaling_setspeed");
    SetGPUFreq(2600, 705);
    system("echo 2500000 > /sys/devices/system/cpu/cpu0/cpufreq/scaling_setspeed");

    double gpu_iter1_low = 2096.544434;
    double gpu_iter1_high = 478.825226;
    double cpu_iter1_low = 1792.011230;
    double cpu_iter1_high = 1413.732788;

    double gpu_pred_high = gpu_iter1_high;
    double gpu_pred_low = gpu_iter1_low;
    double cpu_pred_high = cpu_iter1_high;
    double cpu_pred_low = cpu_iter1_low;

    double ratio_split_freq = 0;
    double time_until_interrupt = 0;

    cudaEvent_t start_cpu, stop_cpu;
    cudaEvent_t start_gpu, stop_gpu;

    // switches for different modes
    bool timing = false; // for initial tuning only; greatly impacts performance
    bool dvfs = false;   // turn on DVFS energy saving
    bool relax = false;  // turn on the relaxed scheme
    bool r2h = false;    // turn on race-to-halt

    // these parameters need to be tuned in future work
    double dvfs_coverage = 0.5;
    double prediction_offset_gpu = 0.65;
    double prediction_offset_cpu = 0.65;


    //for nvprof profiler, brings slight constant performance overhead
    //cudaProfilerStart();

    if ( (nb > 1) && (nb < min_mn) ) {
        /* Use blocked code initially.
           Asynchronously send the matrix to the GPU except the first panel. */
        magma_dsetmatrix_async( m, n-nb,
                                 A(0,nb), lda,
                                dA(0,nb), ldda, queues[0] );
        
        old_i = 0;
        old_ib = nb;
        for (i = 0; i < min_mn-nb; i += nb) {
            ib = min( min_mn-i, nb );
            if (i > 0) {
                /* get i-th panel from device */
                magma_queue_sync( queues[1] );
                magma_dgetmatrix_async( m-i, ib,
                                        dA(i,i), ldda,
                                         A(i,i), lda, queues[0] );
                
                if (timing) {
                    //start gpu timing
                    cudaEventCreate(&start_gpu);
                    cudaEventCreate(&stop_gpu);
                    cudaEventRecord(start_gpu, 0);
                }

                /* Apply H' to A(i:m,i+2*ib:n) from the left */
                magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                  m-old_i, n-old_i-2*old_ib, old_ib,
                                  dA(old_i, old_i),          ldda, dT(0,0),  nb,
                                  dA(old_i, old_i+2*old_ib), ldda, dwork(0), lddwork, queues[1] );
                


                // slack prediction, as in the dpotrf routine above: CPU panel
                // time scales by the ratio, GPU update time by its square
                double ratio_slack_pred = 1.0 - (double)nb/(m-iter*nb);
                cpu_pred_high = cpu_pred_high * ratio_slack_pred;
                cpu_pred_low = cpu_pred_low * ratio_slack_pred;
                gpu_pred_high = gpu_pred_high * ratio_slack_pred * ratio_slack_pred;
                gpu_pred_low = gpu_pred_low * ratio_slack_pred * ratio_slack_pred;

                if (timing) {
                    printf("iter:%d GPU time pred:%f\n", iter, gpu_pred_high);
                    printf("iter:%d CPU time pred:%f\n", iter, cpu_pred_high);
                }


                if (iter < dvfs_coverage*(min_mn-nb)/nb) {
                    if (cpu_pred_high > gpu_pred_high) { //slack on GPU
                        ratio_split_freq = (cpu_pred_high - gpu_pred_high) / (gpu_pred_high * ((gpu_iter1_low / gpu_iter1_high) - 1));
                        time_until_interrupt = gpu_pred_low * ratio_split_freq;
                         //printf("iter:%d time_until_interrupt:%f\n", iter, time_until_interrupt);
                        // printf("iter:%d ratio_split_freq:%f\n", iter, ratio_split_freq);
                        if (dvfs) {
                            if ((!relax) || (relax && ratio_split_freq > 0.05)) {
                                if (ratio_split_freq < 1)
                                    dvfs_adjust(time_until_interrupt*prediction_offset_gpu, 'g');
                                else
                                    dvfs_adjust(cpu_pred_high, 'g');
                            }
                        } else if (r2h) {
                            r2h_adjust(gpu_pred_high, cpu_pred_high - gpu_pred_high, 'g');
                        }
                    } else { //slack on CPU
                        ratio_split_freq = (gpu_pred_high - cpu_pred_high) / (cpu_pred_high * ((cpu_iter1_low / cpu_iter1_high) - 1));
                        time_until_interrupt = cpu_pred_low * ratio_split_freq;
                        if (dvfs) {
                            if ((!relax) || (relax && ratio_split_freq > 0.05)) {
                                if (ratio_split_freq < 1)
                                    dvfs_adjust(time_until_interrupt*prediction_offset_cpu, 'c');
                                else
                                    dvfs_adjust(gpu_pred_high, 'c');
                            }
                        } else if (r2h) {
                            r2h_adjust(cpu_pred_high, gpu_pred_high - cpu_pred_high, 'c');
                        }
                    }
                }

                if (timing) {
                    //end gpu timing
                    cudaEventRecord(stop_gpu, 0);
                    cudaEventSynchronize(stop_gpu);
                    cudaEventElapsedTime(&gpu_time, start_gpu, stop_gpu);
                    cudaEventDestroy(start_gpu);
                    cudaEventDestroy(stop_gpu);

                    printf("iter:%d GPU time:%f\n", iter, gpu_time);
                }



                magma_dgetmatrix_async( i, ib,
                                        dA(0,i), ldda,
                                         A(0,i), lda, queues[1] );
                magma_queue_sync( queues[0] );
            }
            
            magma_int_t rows = m-i;

            if (timing) {
                //start cpu timing
                cudaEventCreate(&start_cpu);
                cudaEventCreate(&stop_cpu);
                cudaEventRecord(start_cpu, 0);
            }

            lapackf77_dgeqrf( &rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info );
            
            /* Form the triangular factor of the block reflector
               H = H(i) H(i+1) . . . H(i+ib-1) */
            lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr,
                              &rows, &ib, A(i,i), &lda, tau+i, work, &ib );
            

            if (timing) {
                //end cpu timing
                cudaEventRecord(stop_cpu, 0);
                cudaEventSynchronize(stop_cpu);
                cudaEventElapsedTime(&cpu_time, start_cpu, stop_cpu);
                cudaEventDestroy(start_cpu);
                cudaEventDestroy(stop_cpu);
                printf("iter:%d CPU time:%f\n", iter, cpu_time);
                if (gpu_time < cpu_time) {
                    printf("slack: +\n");
                } else {
                    printf("slack: -\n");
                }
            }

            magma_dpanel_to_q( MagmaUpper, ib, A(i,i), lda, work+ib*ib );
            
            /* put i-th V matrix onto device */
            magma_dsetmatrix_async( rows, ib, A(i,i), lda, dA(i,i), ldda, queues[0] );
            
            /* put T matrix onto device */
            magma_queue_sync( queues[1] );
            magma_dsetmatrix_async( ib, ib, work, ib, dT(0,0), nb, queues[0] );
            magma_queue_sync( queues[0] );
            
            if (i + ib < n) {
                if (i+ib < min_mn-nb) {
                    /* Apply H' to A(i:m,i+ib:i+2*ib) from the left (look-ahead) */
                    magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                      rows, ib, ib,
                                      dA(i, i   ), ldda, dT(0,0),  nb,
                                      dA(i, i+ib), ldda, dwork(0), lddwork, queues[1] );
                    magma_dq_to_panel( MagmaUpper, ib, A(i,i), lda, work+ib*ib );
                }
                else {
                    /* After last panel, update whole trailing matrix. */
                    /* Apply H' to A(i:m,i+ib:n) from the left */
                    magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                      rows, n-i-ib, ib,
                                      dA(i, i   ), ldda, dT(0,0),  nb,
                                      dA(i, i+ib), ldda, dwork(0), lddwork, queues[1] );
                    magma_dq_to_panel( MagmaUpper, ib, A(i,i), lda, work+ib*ib );
                }
                
                old_i  = i;
                old_ib = ib;
                
            }
            iter++;

        }
        //for nvprof profiler.
        //cudaProfilerStop();
    } else {
        i = 0;
    }
    
    /* Use unblocked code to factor the last or only block. */
    if (i < min_mn) {
        ib = n-i;
        if (i != 0) {
            magma_dgetmatrix( m, ib, dA(0,i), ldda, A(0,i), lda, queues[1] );
        }
        magma_int_t rows = m-i;
        lapackf77_dgeqrf( &rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info );
    }
    
    magma_queue_destroy( queues[0] );
    magma_queue_destroy( queues[1] );
    
    magma_free( dA );
    
    return *info;
} /* magma_dgeqrf */
Example #16
extern "C" magma_int_t
magma_dpotrf_gpu(char uplo, magma_int_t n,
                 double *dA, magma_int_t ldda, magma_int_t *info)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    DPOTRF computes the Cholesky factorization of a real symmetric
    positive definite matrix dA.

    The factorization has the form
       dA = U**T * U,  if UPLO = 'U', or
       dA = L  * L**T,  if UPLO = 'L',
    where U is an upper triangular matrix and L is lower triangular.

    This is the block version of the algorithm, calling Level 3 BLAS.
    If the current stream is NULL, this version replaces it with a
    user-defined stream to overlap computation with communication.

    Arguments
    =========
    UPLO    (input) CHARACTER*1
            = 'U':  Upper triangle of dA is stored;
            = 'L':  Lower triangle of dA is stored.

    N       (input) INTEGER
            The order of the matrix dA.  N >= 0.

    dA      (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N)
            On entry, the symmetric matrix dA.  If UPLO = 'U', the leading
            N-by-N upper triangular part of dA contains the upper
            triangular part of the matrix dA, and the strictly lower
            triangular part of dA is not referenced.  If UPLO = 'L', the
            leading N-by-N lower triangular part of dA contains the lower
            triangular part of the matrix dA, and the strictly upper
            triangular part of dA is not referenced.

            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization dA = U**T * U or dA = L * L**T.

    LDDA     (input) INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalescent memory accesses LDDA must be
            divisible by 16.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
            > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.
    =====================================================================   */


    magma_int_t     j, jb, nb;
    char            uplo_[2] = {uplo, 0};
    double c_one     = MAGMA_D_ONE;
    double c_neg_one = MAGMA_D_NEG_ONE;
    double *work;
    double          d_one     =  1.0;
    double          d_neg_one = -1.0;
    int upper = lapackf77_lsame(uplo_, "U");

    *info = 0;
    if ( (! upper) && (! lapackf77_lsame(uplo_, "L")) ) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,n)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    nb = magma_get_dpotrf_nb(n);

    if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, nb*nb )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }

    /* Define user stream if current stream is NULL */
    cudaStream_t stream[2], current_stream;
    magmablasGetKernelStream(&current_stream);

    magma_queue_create( &stream[0] );
    if (current_stream == NULL) {
      magma_queue_create( &stream[1] );
      magmablasSetKernelStream(stream[1]);
    }
    else
      stream[1] = current_stream;

    if ((nb <= 1) || (nb >= n)) {
        /*  Use unblocked code. */
        magma_dgetmatrix_async( n, n, dA, ldda, work, n, stream[1] );
        magma_queue_sync( stream[1] );
        lapackf77_dpotrf(uplo_, &n, work, &n, info);
        magma_dsetmatrix_async( n, n, work, n, dA, ldda, stream[1] );
    }
    else {

        /* Use blocked code. */
        if (upper) {
            
            /* Compute the Cholesky factorization A = U'*U. */
            for (j=0; j<n; j+=nb) {
                
                /* Update and factorize the current diagonal block and test
                   for non-positive-definiteness. Computing MIN */
                jb = min(nb, (n-j));
                
                magma_dsyrk(MagmaUpper, MagmaTrans, jb, j,
                            d_neg_one, dA(0, j), ldda,
                            d_one,     dA(j, j), ldda);

                magma_queue_sync( stream[1] );
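                /* Download the freshly updated diagonal block on stream[0];
                   the gemm below is queued on stream[1], so it overlaps both
                   this transfer and the CPU dpotrf, which waits only on
                   stream[0]. */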
                magma_dgetmatrix_async( jb, jb,
                                        dA(j, j), ldda,
                                        work,     jb, stream[0] );
                
                if ( (j+jb) < n) {
                    /* Compute the current block row. */
                    magma_dgemm(MagmaTrans, MagmaNoTrans,
                                jb, (n-j-jb), j,
                                c_neg_one, dA(0, j   ), ldda,
                                           dA(0, j+jb), ldda,
                                c_one,     dA(j, j+jb), ldda);
                }
                
                magma_queue_sync( stream[0] );
                lapackf77_dpotrf(MagmaUpperStr, &jb, work, &jb, info);
                magma_dsetmatrix_async( jb, jb,
                                        work,     jb,
                                        dA(j, j), ldda, stream[1] );
                if (*info != 0) {
                    *info = *info + j;
                    break;
                }

                if ( (j+jb) < n) {
                    magma_dtrsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit,
                                 jb, (n-j-jb),
                                 c_one, dA(j, j   ), ldda,
                                        dA(j, j+jb), ldda);
                }
            }
        }
        else {
            //=========================================================
            // Compute the Cholesky factorization A = L*L'.
            for (j=0; j<n; j+=nb) {

                //  Update and factorize the current diagonal block and test
                //  for non-positive-definiteness. Computing MIN
                jb = min(nb, (n-j));

                magma_dsyrk(MagmaLower, MagmaNoTrans, jb, j,
                            d_neg_one, dA(j, 0), ldda,
                            d_one,     dA(j, j), ldda);
                
                magma_queue_sync( stream[1] );
                magma_dgetmatrix_async( jb, jb,
                                        dA(j, j), ldda,
                                        work,     jb, stream[0] );
                
                if ( (j+jb) < n) {
                    magma_dgemm( MagmaNoTrans, MagmaTrans,
                                 (n-j-jb), jb, j,
                                 c_neg_one, dA(j+jb, 0), ldda,
                                            dA(j,    0), ldda,
                                 c_one,     dA(j+jb, j), ldda);
                }

                magma_queue_sync( stream[0] );
                lapackf77_dpotrf(MagmaLowerStr, &jb, work, &jb, info);
                magma_dsetmatrix_async( jb, jb,
                                        work,     jb,
                                        dA(j, j), ldda, stream[1] );
                if (*info != 0) {
                    *info = *info + j;
                    break;
                }
                
                if ( (j+jb) < n) {
                    magma_dtrsm(MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit,
                                (n-j-jb), jb,
                                c_one, dA(j,    j), ldda,
                                       dA(j+jb, j), ldda);
                }
            }
        }
    }

    magma_free_pinned( work );

    magma_queue_destroy( stream[0] );
    if (current_stream == NULL) {
      magma_queue_destroy( stream[1] );
      magmablasSetKernelStream(NULL);
    }

    return *info;
} /* magma_dpotrf_gpu */
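A minimal calling sketch for the routine above, assuming the MAGMA 1.x-style
API used in this example (the wrapper name example_dpotrf_gpu and the host
buffer hA are hypothetical placeholders, not part of the library):

#include "magma.h"

/* Sketch of a driver: copy an n-by-n SPD matrix to the GPU, factor it in
   place with magma_dpotrf_gpu, and copy the Cholesky factor back. */
magma_int_t example_dpotrf_gpu( magma_int_t n, double *hA /* n-by-n, SPD */ )
{
    magma_int_t info = 0;
    magma_int_t ldda = ((n+31)/32)*32;   /* pad so LDDA is divisible by 16 */
    double *dA;
    if (MAGMA_SUCCESS != magma_dmalloc( &dA, n*ldda ))
        return MAGMA_ERR_DEVICE_ALLOC;
    magma_dsetmatrix( n, n, hA, n, dA, ldda );     /* host -> device */
    magma_dpotrf_gpu( 'L', n, dA, ldda, &info );   /* factor in place */
    if (info == 0)
        magma_dgetmatrix( n, n, dA, ldda, hA, n ); /* L back to the host */
    magma_free( dA );
    return info;
}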
Example #17
/**
    Purpose
    -------
    DSYGVD computes all the eigenvalues, and optionally, the eigenvectors
    of a real generalized symmetric-definite eigenproblem, of the form
    A*x=(lambda)*B*x,  A*B*x=(lambda)*x,  or B*A*x=(lambda)*x.  Here A and
    B are assumed to be symmetric and B is also positive definite.
    If eigenvectors are desired, it uses a divide and conquer algorithm.

    The divide and conquer algorithm makes very mild assumptions about
    floating point arithmetic. It will work on machines with a guard
    digit in add/subtract, or on those binary machines without guard
    digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
    Cray-2. It could conceivably fail on hexadecimal or decimal machines
    without guard digits, but we know of none.

    Arguments
    ---------
    @param[in]
    itype   INTEGER
            Specifies the problem type to be solved:
            = 1:  A*x = (lambda)*B*x
            = 2:  A*B*x = (lambda)*x
            = 3:  B*A*x = (lambda)*x

    @param[in]
    jobz    magma_vec_t
      -     = MagmaNoVec:  Compute eigenvalues only;
      -     = MagmaVec:    Compute eigenvalues and eigenvectors.

    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangles of A and B are stored;
      -     = MagmaLower:  Lower triangles of A and B are stored.

    @param[in]
    n       INTEGER
            The order of the matrices A and B.  N >= 0.

    @param[in,out]
    A       DOUBLE PRECISION array, dimension (LDA, N)
            On entry, the symmetric matrix A.  If UPLO = MagmaUpper, the
            leading N-by-N upper triangular part of A contains the
            upper triangular part of the matrix A.  If UPLO = MagmaLower,
            the leading N-by-N lower triangular part of A contains
            the lower triangular part of the matrix A.
    \n
            On exit, if JOBZ = MagmaVec, then if INFO = 0, A contains the
            matrix Z of eigenvectors.  The eigenvectors are normalized
            as follows:
            if ITYPE = 1 or 2, Z**T *   B    * Z = I;
            if ITYPE = 3,      Z**T * inv(B) * Z = I.
            If JOBZ = MagmaNoVec, then on exit the upper triangle (if UPLO=MagmaUpper)
            or the lower triangle (if UPLO=MagmaLower) of A, including the
            diagonal, is destroyed.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[in,out]
    B       DOUBLE PRECISION array, dimension (LDB, N)
            On entry, the symmetric matrix B.  If UPLO = MagmaUpper, the
            leading N-by-N upper triangular part of B contains the
            upper triangular part of the matrix B.  If UPLO = MagmaLower,
            the leading N-by-N lower triangular part of B contains
            the lower triangular part of the matrix B.
    \n
            On exit, if INFO <= N, the part of B containing the matrix is
            overwritten by the triangular factor U or L from the Cholesky
            factorization B = U**T * U or B = L * L**T.

    @param[in]
    ldb     INTEGER
            The leading dimension of the array B.  LDB >= max(1,N).

    @param[out]
    w       DOUBLE PRECISION array, dimension (N)
            If INFO = 0, the eigenvalues in ascending order.

    @param[out]
    work    (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The length of the array WORK.
            If N <= 1,                      LWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LWORK >= 2*N + N*NB.
            If JOBZ = MagmaVec   and N > 1, LWORK >= max( 2*N + N*NB, 1 + 6*N + 2*N**2 ).
            NB can be obtained through magma_get_dsytrd_nb(N).
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal sizes of the WORK and IWORK
            arrays, returns these values as the first entries of the WORK
            and IWORK arrays, and no error message related to LWORK or
            LIWORK is issued by XERBLA.

    @param[out]
    iwork   (workspace) INTEGER array, dimension (MAX(1,LIWORK))
            On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK.

    @param[in]
    liwork  INTEGER
            The dimension of the array IWORK.
            If N <= 1,                      LIWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LIWORK >= 1.
            If JOBZ = MagmaVec   and N > 1, LIWORK >= 3 + 5*N.
    \n
            If LIWORK = -1, then a workspace query is assumed; the
            routine only calculates the optimal sizes of the WORK and
            IWORK arrays, returns these values as the first entries of
            the WORK and IWORK arrays, and no error message related to
            LWORK or LIWORK is issued by XERBLA.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  DPOTRF or DSYEVD returned an error code:
               <= N:  if INFO = i and JOBZ = MagmaNoVec, then the algorithm
                      failed to converge; i off-diagonal elements of an
                      intermediate tridiagonal form did not converge to
                      zero;
                      if INFO = i and JOBZ = MagmaVec, then the algorithm
                      failed to compute an eigenvalue while working on
                      the submatrix lying in rows and columns INFO/(N+1)
                      through mod(INFO,N+1);
               > N:   if INFO = N + i, for 1 <= i <= N, then the leading
                      minor of order i of B is not positive definite.
                      The factorization of B could not be completed and
                      no eigenvalues or eigenvectors were computed.

    Further Details
    ---------------
    Based on contributions by
       Mark Fahey, Department of Mathematics, Univ. of Kentucky, USA

    Modified so that no backsubstitution is performed if DSYEVD fails to
    converge (NEIG in old code could be greater than N causing out of
    bounds reference to A - reported by Ralf Meyer).  Also corrected the
    description of INFO and the test on ITYPE. Sven, 16 Feb 05.

    @ingroup magma_dsygv_driver
    ********************************************************************/
extern "C" magma_int_t
magma_dsygvd(
    magma_int_t itype, magma_vec_t jobz, magma_uplo_t uplo, magma_int_t n,
    double *A, magma_int_t lda,
    double *B, magma_int_t ldb,
    double *w,
    double *work, magma_int_t lwork,
    #ifdef COMPLEX
    double *rwork, magma_int_t lrwork,
    #endif
    magma_int_t *iwork, magma_int_t liwork,
    magma_int_t *info)
{
    const char* uplo_ = lapack_uplo_const( uplo );
    const char* jobz_ = lapack_vec_const( jobz );

    double d_one = MAGMA_D_ONE;

    double *dA=NULL, *dB=NULL;
    magma_int_t ldda = n;
    magma_int_t lddb = n;

    magma_int_t lower;
    magma_trans_t trans;
    magma_int_t wantz, lquery;

    magma_int_t lwmin, liwmin;

    magma_queue_t stream;
    magma_queue_create( &stream );

    wantz = (jobz == MagmaVec);
    lower = (uplo == MagmaLower);
    lquery = (lwork == -1 || liwork == -1);

    *info = 0;
    if (itype < 1 || itype > 3) {
        *info = -1;
    } else if (! (wantz || (jobz == MagmaNoVec))) {
        *info = -2;
    } else if (! (lower || (uplo == MagmaUpper))) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (lda < max(1,n)) {
        *info = -6;
    } else if (ldb < max(1,n)) {
        *info = -8;
    }

    magma_int_t nb = magma_get_dsytrd_nb( n );
    if ( n <= 1 ) {
        lwmin  = 1;
        liwmin = 1;
    }
    else if ( wantz ) {
        lwmin  = max( 2*n + n*nb, 1 + 6*n + 2*n*n );
        liwmin = 3 + 5*n;
    }
    else {
        lwmin  = 2*n + n*nb;
        liwmin = 1;
    }
    
    // multiply by 1+eps (in Double!) to ensure length gets rounded up,
    // if it cannot be exactly represented in floating point.
    real_Double_t one_eps = 1. + lapackf77_dlamch("Epsilon");
    work[0]  = lwmin * one_eps;
    iwork[0] = liwmin;

    if (lwork < lwmin && ! lquery) {
        *info = -11;
    } else if (liwork < liwmin && ! lquery) {
        *info = -13;
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (n == 0) {
        return *info;
    }
    
    /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */
    if (n <= 128) {
        #ifdef ENABLE_DEBUG
        printf("--------------------------------------------------------------\n");
        printf("  warning matrix too small N=%d NB=%d, calling lapack on CPU  \n", (int) n, (int) nb);
        printf("--------------------------------------------------------------\n");
        #endif
        lapackf77_dsygvd(&itype, jobz_, uplo_,
                         &n, A, &lda, B, &ldb,
                         w, work, &lwork,
                         iwork, &liwork, info);
        return *info;
    }

    if (MAGMA_SUCCESS != magma_dmalloc( &dA, n*ldda ) ||
        MAGMA_SUCCESS != magma_dmalloc( &dB, n*lddb )) {
        magma_free( dA );
        magma_free( dB );
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    /* Form a Cholesky factorization of B. */
    magma_dsetmatrix( n, n, B, ldb, dB, lddb );
    magma_dsetmatrix_async( n, n,
                            A,  lda,
                            dA, ldda, stream );

    magma_timer_t time=0;
    timer_start( time );
    magma_dpotrf_gpu(uplo, n, dB, lddb, info);
    if (*info != 0) {
        /* clean up before the early return */
        magma_queue_sync( stream );
        magma_queue_destroy( stream );
        magma_free( dA );
        magma_free( dB );
        *info = n + *info;
        return *info;
    }
    timer_stop( time );
    timer_printf( "time dpotrf_gpu = %6.2f\n", time );

    magma_queue_sync( stream );
    magma_dgetmatrix_async( n, n,
                            dB, lddb,
                            B,  ldb, stream );

    timer_start( time );
    /* Transform problem to standard eigenvalue problem and solve. */
    magma_dsygst_gpu(itype, uplo, n, dA, ldda, dB, lddb, info);
    timer_stop( time );
    timer_printf( "time dsygst_gpu = %6.2f\n", time );

    /* Simple fix to handle larger sizes: free dB here and reallocate it
     * after dsyevd. A cleaner solution would be a dwork buffer that serves
     * as dB and is passed down to dsyevd. */
    if (n > 5000) {
        magma_queue_sync( stream );
        magma_free( dB );
    }

    timer_start( time );
    magma_dsyevd_gpu(jobz, uplo, n, dA, ldda, w, A, lda,
                     work, lwork, iwork, liwork, info);
    timer_stop( time );
    timer_printf( "time dsyevd_gpu = %6.2f\n", time );

    if (wantz && *info == 0) {
        timer_start( time );
        
        /* allocate and copy dB back */
        if (n > 5000) {
            if (MAGMA_SUCCESS != magma_dmalloc( &dB, n*lddb ) ) {
                magma_free( dA );
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }
            magma_dsetmatrix( n, n, B, ldb, dB, lddb );
        }
        /* Backtransform eigenvectors to the original problem. */
        if (itype == 1 || itype == 2) {
            /* For A*x=(lambda)*B*x and A*B*x=(lambda)*x;
               backtransform eigenvectors: x = inv(L)'*y or inv(U)*y */
            if (lower) {
                trans = MagmaTrans;
            } else {
                trans = MagmaNoTrans;
            }
            magma_dtrsm(MagmaLeft, uplo, trans, MagmaNonUnit,
                        n, n, d_one, dB, lddb, dA, ldda);
        }
        else if (itype == 3) {
            /* For B*A*x=(lambda)*x;
               backtransform eigenvectors: x = L*y or U'*y */
            if (lower) {
                trans = MagmaNoTrans;
            } else {
                trans = MagmaTrans;
            }

            magma_dtrmm(MagmaLeft, uplo, trans, MagmaNonUnit,
                        n, n, d_one, dB, lddb, dA, ldda);
        }
        magma_dgetmatrix( n, n, dA, ldda, A, lda );
        
        /* free dB */
        if (n > 5000) {
            magma_free( dB );
        }
        
        timer_stop( time );
        timer_printf( "time dtrsm/mm + getmatrix = %6.2f\n", time );
    }

    magma_queue_sync( stream );
    magma_queue_destroy( stream );

    work[0]  = lwmin * one_eps;  // round up
    iwork[0] = liwmin;

    magma_free( dA );
    if (n <= 5000) {
        magma_free( dB );
    }

    return *info;
} /* magma_dsygvd */
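The LWORK = -1 / LIWORK = -1 convention documented above supports a two-phase
call: query the optimal workspace sizes first, then allocate and solve. A
sketch under that assumption (the driver example_dsygvd and its buffers are
hypothetical; error checks on the allocations are omitted for brevity):

#include <stdlib.h>
#include "magma.h"

/* Solve A*x = lambda*B*x (itype = 1) for all eigenpairs of an n-by-n pair. */
magma_int_t example_dsygvd( magma_int_t n, double *A, double *B, double *w )
{
    magma_int_t info, lwork = -1, liwork = -1;
    double      work_query;
    magma_int_t iwork_query;

    /* Phase 1: workspace query; the optimal sizes are returned in the
       first entries of the (length-1) work arrays. */
    magma_dsygvd( 1, MagmaVec, MagmaLower, n, A, n, B, n, w,
                  &work_query, lwork, &iwork_query, liwork, &info );

    lwork  = (magma_int_t) work_query;
    liwork = iwork_query;
    double      *work  = (double*)      malloc( lwork  * sizeof(double) );
    magma_int_t *iwork = (magma_int_t*) malloc( liwork * sizeof(magma_int_t) );

    /* Phase 2: the actual solve; eigenvalues land in w, vectors in A. */
    magma_dsygvd( 1, MagmaVec, MagmaLower, n, A, n, B, n, w,
                  work, lwork, iwork, liwork, &info );

    free( work );
    free( iwork );
    return info;
}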
Example #18
extern "C" magma_int_t
magma_dgeqrf(magma_int_t m, magma_int_t n,
             double *A,    magma_int_t lda, double *tau,
             double *work, magma_int_t lwork,
             magma_int_t *info )
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    DGEQRF computes a QR factorization of a DOUBLE_PRECISION M-by-N matrix A:
    A = Q * R. This version does not require GPU workspace to be passed as
    input; GPU memory is allocated inside the routine.

    If the current stream is NULL, this version replaces it with a
    user-defined stream to overlap computation with communication.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) DOUBLE_PRECISION array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further
            Details).

            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    TAU     (output) DOUBLE_PRECISION array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    WORK    (workspace/output) DOUBLE_PRECISION array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.

            Higher performance is achieved if WORK is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    LWORK   (input) INTEGER
            The dimension of the array WORK.  LWORK >= max( N*NB, 2*NB*NB ),
            where NB can be obtained through magma_get_dgeqrf_nb(M).

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as a failed memory allocation.

    Further Details
    ===============
    The matrix Q is represented as a product of elementary reflectors

       Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).
    =====================================================================    */

    #define  A(i,j) ( A + (i) + (j)*lda )
    #define dA(i,j) (dA + (i) + (j)*ldda)

    double *dA, *dwork, *dT;
    double c_one = MAGMA_D_ONE;

    magma_int_t i, k, lddwork, old_i, old_ib;
    magma_int_t ib, ldda;

    /* Function Body */
    *info = 0;
    magma_int_t nb = magma_get_dgeqrf_nb(min(m, n));

    // need 2*nb*nb to store T and upper triangle of V simultaneously
    magma_int_t lwkopt = max(n*nb, 2*nb*nb);
    work[0] = MAGMA_D_MAKE( (double)lwkopt, 0 );
    int lquery = (lwork == -1);
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,m)) {
        *info = -4;
    } else if (lwork < max(1, lwkopt) && ! lquery) {
        *info = -7;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;

    k = min(m,n);
    if (k == 0) {
        work[0] = c_one;
        return *info;
    }

    // largest N for larfb is n-nb (trailing matrix lacks 1st panel)
    lddwork = ((n+31)/32)*32 - nb;
    ldda    = ((m+31)/32)*32;

    magma_int_t num_gpus = magma_num_gpus();
    if( num_gpus > 1 ) {
        /* call multiple-GPU interface  */
        return magma_dgeqrf4(num_gpus, m, n, A, lda, tau, work, lwork, info);
    }

    // allocate space for dA, dwork, and dT
    if (MAGMA_SUCCESS != magma_dmalloc( &dA, n*ldda + nb*lddwork + nb*nb )) {
        /* Switch to the "out-of-core" (out of GPU-memory) version */
        return magma_dgeqrf_ooc(m, n, A, lda, tau, work, lwork, info);
    }

    /* Define user stream if current stream is NULL */
    magma_queue_t stream[3], current_stream;
    magmablasGetKernelStream(&current_stream);

    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[2] );
    if (current_stream == NULL) {
      magma_queue_create( &stream[1] );
      magmablasSetKernelStream(stream[1]);
    }
    else
      stream[1] = current_stream;

    dwork = dA + n*ldda;
    dT    = dA + n*ldda + nb*lddwork;

    if ( (nb > 1) && (nb < k) ) {
        /* Use blocked code initially.
           Asynchronously send the matrix to the GPU except the first panel. */
        magma_dsetmatrix_async( m, n-nb,
                                A(0,nb),  lda,
                                dA(0,nb), ldda, stream[2] );

        old_i = 0;
        old_ib = nb;
        for (i = 0; i < k-nb; i += nb) {
            ib = min(k-i, nb);
            if (i>0) {
                /* download i-th panel */
                magma_queue_sync( stream[1] ); 
                magma_dgetmatrix_async( m-i, ib,
                                        dA(i,i), ldda,
                                        A(i,i),  lda, stream[0] );

                /* Apply H' to A(i:m,i+2*ib:n) from the left */
                magma_dlarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise,
                                  m-old_i, n-old_i-2*old_ib, old_ib,
                                  dA(old_i, old_i),          ldda, dT,    nb,
                                  dA(old_i, old_i+2*old_ib), ldda, dwork, lddwork);

                magma_dgetmatrix_async( i, ib,
                                        dA(0,i), ldda,
                                        A(0,i),  lda, stream[2] );
                magma_queue_sync( stream[0] );
            }

            magma_int_t rows = m-i;
            lapackf77_dgeqrf(&rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info);
            /* Form the triangular factor of the block reflector
               H = H(i) H(i+1) . . . H(i+ib-1) */
            lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr,
                              &rows, &ib, A(i,i), &lda, tau+i, work, &ib);

            dpanel_to_q(MagmaUpper, ib, A(i,i), lda, work+ib*ib);

            /* download the i-th V matrix */
            magma_dsetmatrix_async( rows, ib, A(i,i), lda, dA(i,i), ldda, stream[0] );

            /* download the T matrix */
            magma_dsetmatrix_async( ib, ib, work, ib, dT, nb, stream[0] );
            magma_queue_sync( stream[0] );

            if (i + ib < n) {

                if (i+ib < k-nb) {
                    /* Apply H' to A(i:m,i+ib:i+2*ib) from the left (look-ahead) */
                    magma_dlarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise,
                                      rows, ib, ib,
                                      dA(i, i   ), ldda, dT,    nb,
                                      dA(i, i+ib), ldda, dwork, lddwork);
                    dq_to_panel(MagmaUpper, ib, A(i,i), lda, work+ib*ib);
                }
                else {
                    /* After last panel, update whole trailing matrix. */
                    /* Apply H' to A(i:m,i+ib:n) from the left */
                    magma_dlarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise,
                                      rows, n-i-ib, ib,
                                      dA(i, i   ), ldda, dT,    nb,
                                      dA(i, i+ib), ldda, dwork, lddwork);
                    dq_to_panel(MagmaUpper, ib, A(i,i), lda, work+ib*ib);
                }

                old_i  = i;
                old_ib = ib;
            }
        }
    } else {
        i = 0;
    }
    
    /* Use unblocked code to factor the last or only block. */
    if (i < k) {
        ib = n-i;
        if (i != 0) {
            magma_dgetmatrix( m, ib, dA(0,i), ldda, A(0,i), lda );
        }
        magma_int_t rows = m-i;
        lapackf77_dgeqrf(&rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info);
    }

    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[2] );
    if (current_stream == NULL) {
      magma_queue_destroy( stream[1] );
      magmablasSetKernelStream(NULL);
    }

    magma_free( dA );
    
    return *info;
} /* magma_dgeqrf */
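As the comments above note, the routine supports an LWORK = -1 workspace
query, and pinned workspace improves transfer performance. A sketch combining
the two (example_dgeqrf is a hypothetical wrapper; tau must have room for
min(m,n) entries):

#include "magma.h"

magma_int_t example_dgeqrf( magma_int_t m, magma_int_t n,
                            double *A, magma_int_t lda, double *tau )
{
    magma_int_t info, lwork = -1;
    double work_query;

    /* Workspace query: the optimal LWORK is returned in work_query. */
    magma_dgeqrf( m, n, A, lda, tau, &work_query, lwork, &info );
    lwork = (magma_int_t) work_query;

    double *work;
    if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, lwork ))
        return MAGMA_ERR_HOST_ALLOC;

    magma_dgeqrf( m, n, A, lda, tau, work, lwork, &info );  /* factor */

    magma_free_pinned( work );
    return info;
}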
Example #19
extern "C" magma_int_t
magma_dlabrd_gpu( magma_int_t m, magma_int_t n, magma_int_t nb,
                  double *a, magma_int_t lda, double *da, magma_int_t ldda,
                  double *d, double *e, double *tauq, double *taup,
                  double *x, magma_int_t ldx, double *dx, magma_int_t lddx,
                  double *y, magma_int_t ldy, double *dy, magma_int_t lddy)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    DLABRD reduces the first NB rows and columns of a real general
    m by n matrix A to upper or lower bidiagonal form by an orthogonal
    transformation Q' * A * P, and returns the matrices X and Y which
    are needed to apply the transformation to the unreduced part of A.

    If m >= n, A is reduced to upper bidiagonal form; if m < n, to lower
    bidiagonal form.

    This is an auxiliary routine called by DGEBRD.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows in the matrix A.

    N       (input) INTEGER
            The number of columns in the matrix A.

    NB      (input) INTEGER
            The number of leading rows and columns of A to be reduced.

    A       (input/output) DOUBLE_PRECISION array, dimension (LDA,N)
            On entry, the m by n general matrix to be reduced.
            On exit, the first NB rows and columns of the matrix are
            overwritten; the rest of the array is unchanged.
            If m >= n, elements on and below the diagonal in the first NB
              columns, with the array TAUQ, represent the orthogonal
              matrix Q as a product of elementary reflectors; and
              elements above the diagonal in the first NB rows, with the
              array TAUP, represent the orthogonal matrix P as a product
              of elementary reflectors.
            If m < n, elements below the diagonal in the first NB
              columns, with the array TAUQ, represent the orthogonal
              matrix Q as a product of elementary reflectors, and
              elements on and above the diagonal in the first NB rows,
              with the array TAUP, represent the orthogonal matrix P as
              a product of elementary reflectors.
            See Further Details.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    D       (output) DOUBLE_PRECISION array, dimension (NB)
            The diagonal elements of the first NB rows and columns of
            the reduced matrix.  D(i) = A(i,i).

    E       (output) DOUBLE_PRECISION array, dimension (NB)
            The off-diagonal elements of the first NB rows and columns of
            the reduced matrix.

    TAUQ    (output) DOUBLE_PRECISION array dimension (NB)
            The scalar factors of the elementary reflectors which
            represent the orthogonal matrix Q. See Further Details.

    TAUP    (output) DOUBLE_PRECISION array, dimension (NB)
            The scalar factors of the elementary reflectors which
            represent the orthogonal matrix P. See Further Details.

    X       (output) DOUBLE_PRECISION array, dimension (LDX,NB)
            The m-by-nb matrix X required to update the unreduced part
            of A.

    LDX     (input) INTEGER
            The leading dimension of the array X. LDX >= M.

    Y       (output) DOUBLE_PRECISION array, dimension (LDY,NB)
            The n-by-nb matrix Y required to update the unreduced part
            of A.

    LDY     (input) INTEGER
            The leading dimension of the array Y. LDY >= N.

    Further Details
    ===============
    The matrices Q and P are represented as products of elementary
    reflectors:

       Q = H(1) H(2) . . . H(nb)  and  P = G(1) G(2) . . . G(nb)

    Each H(i) and G(i) has the form:

       H(i) = I - tauq * v * v'  and G(i) = I - taup * u * u'

    where tauq and taup are real scalars, and v and u are real vectors.

    If m >= n, v(1:i-1) = 0, v(i) = 1, and v(i:m) is stored on exit in
    A(i:m,i); u(1:i) = 0, u(i+1) = 1, and u(i+1:n) is stored on exit in
    A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i).

    If m < n, v(1:i) = 0, v(i+1) = 1, and v(i+1:m) is stored on exit in
    A(i+2:m,i); u(1:i-1) = 0, u(i) = 1, and u(i:n) is stored on exit in
    A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i).

    The elements of the vectors v and u together form the m-by-nb matrix
    V and the nb-by-n matrix U' which are needed, with X and Y, to apply
    the transformation to the unreduced part of the matrix, using a block
    update of the form:  A := A - V*Y' - X*U'.

    The contents of A on exit are illustrated by the following examples
    with nb = 2:

    m = 6 and n = 5 (m > n):          m = 5 and n = 6 (m < n):

      (  1   1   u1  u1  u1 )           (  1   u1  u1  u1  u1  u1 )
      (  v1  1   1   u2  u2 )           (  1   1   u2  u2  u2  u2 )
      (  v1  v2  a   a   a  )           (  v1  1   a   a   a   a  )
      (  v1  v2  a   a   a  )           (  v1  v2  a   a   a   a  )
      (  v1  v2  a   a   a  )           (  v1  v2  a   a   a   a  )
      (  v1  v2  a   a   a  )

    where a denotes an element of the original matrix which is unchanged,
    vi denotes an element of the vector defining H(i), and ui an element
    of the vector defining G(i).

    =====================================================================    */


    /* Table of constant values */
    double c_neg_one = MAGMA_D_NEG_ONE;
    double c_one = MAGMA_D_ONE;
    double c_zero = MAGMA_D_ZERO;
    magma_int_t c__1 = 1;
    
    /* System generated locals */
    magma_int_t a_dim1, a_offset, x_dim1, x_offset, y_dim1, y_offset, i__2, i__3;
    /* Local variables */
    magma_int_t i__;
    double alpha;

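    /* Shift the base pointers so the code below can use 1-based,
       Fortran-style indexing: after these adjustments a[r + c*a_dim1]
       addresses the original A(r,c), and similarly for x/dx and y/dy. */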
    a_dim1 = lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    --d;
    --e;
    --tauq;
    --taup;

    x_dim1 = ldx;
    x_offset = 1 + x_dim1;
    x -= x_offset;
    dx -= 1 + lddx;

    y_dim1 = ldy;
    y_offset = 1 + y_dim1;
    y -= y_offset;
    dy -= 1 + lddy;

    /* Function Body */
    if (m <= 0 || n <= 0) {
        return 0;
    }

    double *f;
    magma_queue_t stream;
    magma_queue_create( &stream );
    magma_dmalloc_cpu( &f, max(n,m) );
    assert( f != NULL );  // TODO return error, or allocate outside dlabrd
    
    if (m >= n) {

        /* Reduce to upper bidiagonal form */

        for (i__ = 1; i__ <= nb; ++i__) {

            /*  Update A(i:m,i) */
            i__2 = m - i__ + 1;
            i__3 = i__ - 1;
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv( &i__3, &y[i__+y_dim1], &ldy );
            #endif
            blasf77_dgemv("No transpose", &i__2, &i__3, &c_neg_one, &a[i__ + a_dim1], &lda,
                   &y[i__+y_dim1], &ldy, &c_one, &a[i__ + i__ * a_dim1], &c__1);
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv( &i__3, &y[i__+y_dim1], &ldy );
            #endif
            blasf77_dgemv("No transpose", &i__2, &i__3, &c_neg_one, &x[i__ + x_dim1], &ldx,
                   &a[i__*a_dim1+1], &c__1, &c_one, &a[i__+i__*a_dim1], &c__1);
            
            /* Generate reflection Q(i) to annihilate A(i+1:m,i) */

            alpha = a[i__ + i__ * a_dim1];
            i__2 = m - i__ + 1;
            i__3 = i__ + 1;
            lapackf77_dlarfg(&i__2, &alpha,
                    &a[min(i__3,m) + i__ * a_dim1], &c__1, &tauq[i__]);
            d[i__] = MAGMA_D_REAL( alpha );
            if (i__ < n) {
                a[i__ + i__ * a_dim1] = c_one;

                /* Compute Y(i+1:n,i) */
                i__2 = m - i__ + 1;
                i__3 = n - i__;

                // 1. Send the block reflector  A(i+1:m,i) to the GPU ------
                magma_dsetvector( i__2,
                                  a + i__   + i__   * a_dim1, 1,
                                  da+(i__-1)+(i__-1)* (ldda), 1 );
                // 2. Multiply ---------------------------------------------
                magma_dgemv(MagmaTrans, i__2, i__3, c_one,
                            da + (i__-1) + ((i__-1) + 1) * (ldda), ldda,
                            da + (i__-1) + (i__-1) * (ldda), c__1, c_zero,
                            dy + i__ + 1 + i__ * y_dim1, c__1);
                
                // 3. Put the result back ----------------------------------
                magma_dgetmatrix_async( i__3, 1,
                                        dy+i__+1+i__*y_dim1, y_dim1,
                                        y+i__+1+i__*y_dim1,  y_dim1, stream );
                i__2 = m - i__ + 1;
                i__3 = i__ - 1;
                blasf77_dgemv(MagmaTransStr, &i__2, &i__3, &c_one, &a[i__ + a_dim1],
                        &lda, &a[i__ + i__ * a_dim1], &c__1, &c_zero,
                       &y[i__ * y_dim1 + 1], &c__1);

                i__2 = n - i__;
                i__3 = i__ - 1;
                blasf77_dgemv("N", &i__2, &i__3, &c_neg_one, &y[i__ + 1 +y_dim1], &ldy,
                       &y[i__ * y_dim1 + 1], &c__1,
                       &c_zero, f, &c__1);
                i__2 = m - i__ + 1;
                i__3 = i__ - 1;
                blasf77_dgemv(MagmaTransStr, &i__2, &i__3, &c_one, &x[i__ + x_dim1],
                       &ldx, &a[i__ + i__ * a_dim1], &c__1, &c_zero,
                       &y[i__ * y_dim1 + 1], &c__1);
                
                // 4. Synch to make sure the result is back ----------------
                magma_queue_sync( stream );

                if (i__3!=0){
                    i__2 = n - i__;
                    blasf77_daxpy(&i__2, &c_one, f,&c__1, &y[i__+1+i__*y_dim1],&c__1);
                }

                i__2 = i__ - 1;
                i__3 = n - i__;
                blasf77_dgemv(MagmaTransStr, &i__2, &i__3, &c_neg_one, &a[(i__ + 1) *
                        a_dim1 + 1], &lda, &y[i__ * y_dim1 + 1], &c__1, &c_one,
                        &y[i__ + 1 + i__ * y_dim1], &c__1);
                i__2 = n - i__;
                blasf77_dscal(&i__2, &tauq[i__], &y[i__ + 1 + i__ * y_dim1], &c__1);

                /* Update A(i,i+1:n) */
                i__2 = n - i__;
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv( &i__2, &a[i__+(i__+1)*a_dim1], &lda );
                lapackf77_dlacgv( &i__,  &a[i__+a_dim1], &lda );
                #endif
                blasf77_dgemv("No transpose", &i__2, &i__, &c_neg_one, &y[i__ + 1 +
                        y_dim1], &ldy, &a[i__ + a_dim1], &lda, &c_one, &a[i__ + (
                        i__ + 1) * a_dim1], &lda);
                i__2 = i__ - 1;
                i__3 = n - i__;
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv( &i__,  &a[i__+a_dim1], &lda );
                lapackf77_dlacgv( &i__2, &x[i__+x_dim1], &ldx );
                #endif
                blasf77_dgemv(MagmaTransStr, &i__2, &i__3, &c_neg_one, &a[(i__ + 1) *
                        a_dim1 + 1], &lda, &x[i__ + x_dim1], &ldx, &c_one, &a[
                        i__ + (i__ + 1) * a_dim1], &lda);
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv( &i__2, &x[i__+x_dim1], &ldx );
                #endif

                /* Generate reflection P(i) to annihilate A(i,i+2:n) */
                i__2 = n - i__;
                /* Computing MIN */
                i__3 = i__ + 2;
                alpha = a[i__ + (i__ + 1) * a_dim1];
                lapackf77_dlarfg(&i__2, &alpha, &a[i__ + min(
                        i__3,n) * a_dim1], &lda, &taup[i__]);
                e[i__] = MAGMA_D_REAL( alpha );
                a[i__ + (i__ + 1) * a_dim1] = c_one;

                /* Compute X(i+1:m,i) */
                i__2 = m - i__;
                i__3 = n - i__;
                // 1. Send the block reflector  A(i+1:m,i) to the GPU ------
                magma_dsetvector( i__3,
                                  a + i__   + (i__   +1)* a_dim1, lda,
                                  da+(i__-1)+((i__-1)+1)*(ldda),  ldda );
                // 2. Multiply ---------------------------------------------
                //magma_dcopy(i__3, da+(i__-1)+((i__-1)+1)*(ldda), ldda,
                //            dy + 1 + lddy, 1);
                magma_dgemv(MagmaNoTrans, i__2, i__3, c_one,
                            da + (i__-1)+1+ ((i__-1)+1) * (ldda), ldda,
                            da + (i__-1) +  ((i__-1)+1) * (ldda), ldda,
                            //dy + 1 + lddy, 1,
                            c_zero, dx + i__ + 1 + i__ * x_dim1, c__1);

                // 3. Put the result back ----------------------------------
                magma_dgetmatrix_async( i__2, 1,
                                        dx+i__+1+i__*x_dim1, x_dim1,
                                        x+i__+1+i__*x_dim1,  x_dim1, stream );

                i__2 = n - i__;
                blasf77_dgemv(MagmaTransStr, &i__2, &i__, &c_one, &y[i__ + 1 + y_dim1],
                        &ldy, &a[i__ + (i__ + 1) * a_dim1], &lda, &c_zero, &x[
                        i__ * x_dim1 + 1], &c__1);

                i__2 = m - i__;
                blasf77_dgemv("N", &i__2, &i__, &c_neg_one, &a[i__ + 1 + a_dim1], &lda,
                       &x[i__ * x_dim1 + 1], &c__1, &c_zero, f, &c__1);
                i__2 = i__ - 1;
                i__3 = n - i__;
                blasf77_dgemv("N", &i__2, &i__3, &c_one, &a[(i__ + 1) * a_dim1 + 1],
                       &lda, &a[i__ + (i__ + 1) * a_dim1], &lda,
                       &c_zero, &x[i__ * x_dim1 + 1], &c__1);

                // 4. Synch to make sure the result is back ----------------
                magma_queue_sync( stream );
                if (i__!=0){
                    i__2 = m - i__;
                    blasf77_daxpy(&i__2, &c_one, f,&c__1, &x[i__+1+i__*x_dim1],&c__1);
                }


                i__2 = m - i__;
                i__3 = i__ - 1;
                blasf77_dgemv("No transpose", &i__2, &i__3, &c_neg_one, &x[i__ + 1 +
                        x_dim1], &ldx, &x[i__ * x_dim1 + 1], &c__1, &c_one, &x[
                        i__ + 1 + i__ * x_dim1], &c__1);
                i__2 = m - i__;
                blasf77_dscal(&i__2, &taup[i__], &x[i__ + 1 + i__ * x_dim1], &c__1);

                #if defined(PRECISION_z) || defined(PRECISION_c)
                i__2 = n - i__;
                lapackf77_dlacgv( &i__2,  &a[i__+(i__+1)*a_dim1], &lda );
                // 4. Send the block reflector  A(i+1:m,i) to the GPU after DLACGV()
                magma_dsetvector( i__2,
                                  a + i__   + (i__   +1)* a_dim1, lda,
                                  da+(i__-1)+((i__-1)+1)*(ldda),  ldda );
                #endif
            }
        }
    }
    else {

        /* Reduce to lower bidiagonal form */
        
        for (i__ = 1; i__ <= nb; ++i__) {
        
            /* Update A(i,i:n) */
            i__2 = n - i__ + 1;
            i__3 = i__ - 1;
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv(&i__2, &a[i__ + i__ * a_dim1], &lda);
            lapackf77_dlacgv(&i__3, &a[i__ + a_dim1], &lda);
            #endif
            blasf77_dgemv("No transpose", &i__2, &i__3, &c_neg_one, &y[i__ + y_dim1], &ldy,
                   &a[i__ + a_dim1], &lda, &c_one, &a[i__ + i__ * a_dim1], &lda);
            i__2 = i__ - 1;
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv(&i__3, &a[i__ + a_dim1], &lda);
            lapackf77_dlacgv(&i__3, &x[i__ + x_dim1], &ldx);
            #endif
            i__3 = n - i__ + 1;
            blasf77_dgemv(MagmaTransStr, &i__2, &i__3, &c_neg_one, &a[i__ * a_dim1 + 1],
                   &lda, &x[i__ + x_dim1], &ldx, &c_one, &a[i__ + i__ * a_dim1], &lda);
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv(&i__2, &x[i__ + x_dim1], &ldx);
            #endif
            
            /* Generate reflection P(i) to annihilate A(i,i+1:n) */
            i__2 = n - i__ + 1;
            /* Computing MIN */
            i__3 = i__ + 1;
            alpha = a[i__ + i__ * a_dim1];
            lapackf77_dlarfg(&i__2, &alpha,
                    &a[i__ + min(i__3,n) * a_dim1], &lda, &taup[i__]);
            d[i__] = MAGMA_D_REAL( alpha );
            if (i__ < m) {
                a[i__ + i__ * a_dim1] = c_one;
                
                /* Compute X(i+1:m,i) */
                i__2 = m - i__;
                i__3 = n - i__ + 1;
                
                // 1. Send the block reflector  A(i,i+1:n) to the GPU ------
                magma_dsetvector( i__3,
                                  a + i__   + i__   * a_dim1, lda,
                                  da+(i__-1)+(i__-1)* (ldda), ldda );
                
                // 2. Multiply ---------------------------------------------
                //magma_dcopy(i__3, da+(i__-1)+(i__-1)*(ldda), ldda,
                //            dy + 1 + lddy, 1);
                magma_dgemv(MagmaNoTrans, i__2, i__3, c_one,
                            da + (i__-1)+1 + (i__-1) * ldda, ldda,
                            da + (i__-1)   + (i__-1) * ldda, ldda,
                            // dy + 1 + lddy, 1,
                            c_zero,
                            dx + i__ + 1 + i__ * x_dim1, c__1);
                
                // 3. Put the result back ----------------------------------
                magma_dgetmatrix_async( i__2, 1,
                                        dx+i__+1+i__*x_dim1, x_dim1,
                                        x+i__+1+i__*x_dim1,  x_dim1, stream );
                
                i__2 = n - i__ + 1;
                i__3 = i__ - 1;
                blasf77_dgemv(MagmaTransStr, &i__2, &i__3, &c_one, &y[i__ + y_dim1],
                       &ldy, &a[i__ + i__ * a_dim1], &lda, &c_zero,
                       &x[i__ *  x_dim1 + 1], &c__1);
                i__2 = m - i__;
                i__3 = i__ - 1;
                blasf77_dgemv("No transpose", &i__2, &i__3, &c_neg_one,
                              &a[i__ + 1 + a_dim1], &lda, &x[i__ * x_dim1 + 1], &c__1, &c_zero,
                              f, &c__1);
                
                i__2 = i__ - 1;
                i__3 = n - i__ + 1;
                blasf77_dgemv("No transpose", &i__2, &i__3, &c_one,
                       &a[i__ * a_dim1 + 1], &lda, &a[i__ + i__ * a_dim1], &lda, &c_zero,
                       &x[i__ * x_dim1 + 1], &c__1);
                
                // 4. Synch to make sure the result is back ----------------
                magma_queue_sync( stream );
                if (i__2!=0){
                    i__3 = m - i__;
                    blasf77_daxpy(&i__3, &c_one, f,&c__1, &x[i__+1+i__*x_dim1],&c__1);
                }
                
                i__2 = m - i__;
                i__3 = i__ - 1;
                blasf77_dgemv("No transpose", &i__2, &i__3, &c_neg_one,
                       &x[i__ + 1 + x_dim1], &ldx, &x[i__ * x_dim1 + 1], &c__1, &c_one,
                       &x[i__ + 1 + i__ * x_dim1], &c__1);
                i__2 = m - i__;
                blasf77_dscal(&i__2, &taup[i__], &x[i__ + 1 + i__ * x_dim1], &c__1);
                i__2 = n - i__ + 1;
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv(&i__2, &a[i__ + i__ * a_dim1], &lda);
                magma_dsetvector( i__2,
                                  a + i__   + (i__  )* a_dim1, lda,
                                  da+(i__-1)+ (i__-1)*(ldda),  ldda );
                #endif
                
                /* Update A(i+1:m,i) */
                i__2 = m - i__;
                i__3 = i__ - 1;
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv(&i__3, &y[i__ + y_dim1], &ldy);
                #endif
                blasf77_dgemv("No transpose", &i__2, &i__3, &c_neg_one,
                       &a[i__ + 1 + a_dim1], &lda, &y[i__ + y_dim1], &ldy, &c_one,
                       &a[i__ + 1 + i__ * a_dim1], &c__1);
                i__2 = m - i__;
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv(&i__3, &y[i__ + y_dim1], &ldy);
                #endif
                blasf77_dgemv("No transpose", &i__2, &i__, &c_neg_one,
                       &x[i__ + 1 + x_dim1], &ldx, &a[i__ * a_dim1 + 1], &c__1, &c_one,
                       &a[i__ + 1 + i__ * a_dim1], &c__1);
                
                /* Generate reflection Q(i) to annihilate A(i+2:m,i) */
                i__2 = m - i__;
                i__3 = i__ + 2;
                alpha = a[i__ + 1 + i__ * a_dim1];
                lapackf77_dlarfg(&i__2, &alpha,
                        &a[min(i__3,m) + i__ * a_dim1], &c__1, &tauq[i__]);
                e[i__] = MAGMA_D_REAL( alpha );
                a[i__ + 1 + i__ * a_dim1] = c_one;
                
                /* Compute Y(i+1:n,i) */
                i__2 = m - i__;
                i__3 = n - i__;
                
                // 1. Send the block reflector  A(i+1:m,i) to the GPU ------
                magma_dsetvector( i__2,
                                  a + i__   +1+  i__   * a_dim1, 1,
                                  da+(i__-1)+1+ (i__-1)*(ldda),  1 );
                // 2. Multiply ---------------------------------------------
                magma_dgemv(MagmaTrans, i__2, i__3, c_one,
                            da + (i__-1)+1+ ((i__-1)+1) * ldda, ldda,
                            da + (i__-1)+1+  (i__-1)    * ldda, c__1,
                            c_zero, dy + i__ + 1 + i__ * y_dim1, c__1);
                
                // 3. Put the result back ----------------------------------
                magma_dgetmatrix_async( i__3, 1,
                                        dy+i__+1+i__*y_dim1, y_dim1,
                                        y+i__+1+i__*y_dim1,  y_dim1, stream );
                
                i__2 = m - i__;
                i__3 = i__ - 1;
                blasf77_dgemv(MagmaTransStr, &i__2, &i__3, &c_one, &a[i__ + 1 + a_dim1],
                       &lda, &a[i__ + 1 + i__ * a_dim1], &c__1, &c_zero,
                       &y[ i__ * y_dim1 + 1], &c__1);
                i__2 = n - i__;
                i__3 = i__ - 1;
                blasf77_dgemv("No transpose", &i__2, &i__3, &c_neg_one,
                       &y[i__ + 1 + y_dim1], &ldy, &y[i__ * y_dim1 + 1], &c__1,
                       &c_zero, f, &c__1);
                
                i__2 = m - i__;
                blasf77_dgemv(MagmaTransStr, &i__2, &i__, &c_one, &x[i__ + 1 + x_dim1],
                       &ldx, &a[i__ + 1 + i__ * a_dim1], &c__1, &c_zero,
                       &y[i__ * y_dim1 + 1], &c__1);
                
                // 4. Synch to make sure the result is back ----------------
                magma_queue_sync( stream );
                if (i__3!=0){
                    i__2 = n - i__;
                    blasf77_daxpy(&i__2, &c_one, f,&c__1, &y[i__+1+i__*y_dim1],&c__1);
                }
                
                i__2 = n - i__;
                blasf77_dgemv(MagmaTransStr, &i__, &i__2, &c_neg_one,
                       &a[(i__ + 1) * a_dim1 + 1], &lda, &y[i__ * y_dim1 + 1],
                       &c__1, &c_one, &y[i__ + 1 + i__ * y_dim1], &c__1);
                i__2 = n - i__;
                blasf77_dscal(&i__2, &tauq[i__], &y[i__ + 1 + i__ * y_dim1], &c__1);
            }
            #if defined(PRECISION_z) || defined(PRECISION_c)
            else {
                i__2 = n - i__ + 1;
                lapackf77_dlacgv(&i__2, &a[i__ + i__ * a_dim1], &lda);
                magma_dsetvector( i__2,
                                  a + i__   + (i__  )* a_dim1, lda,
                                  da+(i__-1)+ (i__-1)*(ldda),  ldda );
            }
            #endif
        }
    }
    
    magma_queue_destroy( stream );
    magma_free_cpu(f);
    
    return MAGMA_SUCCESS;
} /* dlabrd */
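The Further Details section above describes the block update
A := A - V*Y' - X*U' that the caller applies to the unreduced part after each
call. In LAPACK's DGEBRD this is a pair of DGEMMs; a sketch of the
corresponding step in a blocked driver, reusing the 1-based indexing and the
constants of the routine above (i is the 1-based panel offset, so the
trailing block starts at A(i+nb, i+nb); the surrounding driver is
hypothetical):

                /* Trailing-matrix update after the panel reduction:
                   A(i+nb:m, i+nb:n) -= V*Y'  and then  -= X*U'. */
                magma_int_t m_i = m - i - nb + 1;  /* trailing rows */
                magma_int_t n_i = n - i - nb + 1;  /* trailing cols */
                blasf77_dgemm( "No transpose", "Transpose", &m_i, &n_i, &nb,
                               &c_neg_one, &a[i+nb + i*a_dim1],      &lda,
                                           &y[i+nb + y_dim1],        &ldy,
                               &c_one,     &a[i+nb + (i+nb)*a_dim1], &lda );
                blasf77_dgemm( "No transpose", "No transpose", &m_i, &n_i, &nb,
                               &c_neg_one, &x[i+nb + x_dim1],        &ldx,
                                           &a[i + (i+nb)*a_dim1],    &lda,
                               &c_one,     &a[i+nb + (i+nb)*a_dim1], &lda );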
Example #20
extern "C" magma_int_t
magma_dpotrf2_mgpu(
    magma_int_t num_gpus, magma_uplo_t uplo, magma_int_t m, magma_int_t n, 
    magma_int_t off_i, magma_int_t off_j, magma_int_t nb,
    magmaDouble_ptr *d_lA, size_t d_lA_offset, magma_int_t ldda, 
    magmaDouble_ptr *d_lP,  magma_int_t lddp, 
    double *a,      magma_int_t lda,   magma_int_t h,
    magma_queue_t *queues,
    magma_int_t *info )
{
/*  -- clMAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date November 2014

    Purpose   
    =======   
    DPOTRF computes the Cholesky factorization of a real symmetric   
    positive definite matrix dA.   

    The factorization has the form   
       dA = U**H * U,  if UPLO = 'U', or   
       dA = L  * L**H,  if UPLO = 'L',   
    where U is an upper triangular matrix and L is lower triangular.   

    This is the block version of the algorithm, calling Level 3 BLAS.   

    Arguments   
    =========   
    UPLO    (input) CHARACTER*1   
            = 'U':  Upper triangle of dA is stored;   
            = 'L':  Lower triangle of dA is stored.   

    N       (input) INTEGER   
            The order of the matrix dA.  N >= 0.   

    dA      (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N)   
            On entry, the symmetric matrix dA.  If UPLO = 'U', the leading   
            N-by-N upper triangular part of dA contains the upper   
            triangular part of the matrix dA, and the strictly lower   
            triangular part of dA is not referenced.  If UPLO = 'L', the   
            leading N-by-N lower triangular part of dA contains the lower   
            triangular part of the matrix dA, and the strictly upper   
            triangular part of dA is not referenced.   

            On exit, if INFO = 0, the factor U or L from the Cholesky   
            factorization dA = U**H * U or dA = L * L**H.   

    LDDA     (input) INTEGER   
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalesced memory accesses, LDDA must be
            divisible by 16.

    INFO    (output) INTEGER   
            = 0:  successful exit   
            < 0:  if INFO = -i, the i-th argument had an illegal value   
            > 0:  if INFO = i, the leading minor of order i is not   
                  positive definite, and the factorization could not be   
                  completed.   
    =====================================================================   */

    magma_int_t     j, jb, nb0, nb2, dd, d, id, j_local, j_local2, buf;
    double c_one     = MAGMA_D_ONE;
    double c_neg_one = MAGMA_D_NEG_ONE;
    double          d_one     =  1.0;
    double          d_neg_one = -1.0;
    magmaDouble_ptr dlpanel;
    size_t dlpanel_offset;
    magma_int_t n_local[MagmaMaxGPUs], ldpanel;
    magma_event_t events[MagmaMaxGPUs];

    *info = 0;
    if ( (uplo != MagmaUpper) && (uplo != MagmaLower) ) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if ((uplo != MagmaUpper) && num_gpus*ldda < max(1,n)) {
        *info = -4;
    } else if ((uplo == MagmaUpper) && ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    {

      for( d=0; d<num_gpus; d++ ) {
        /* local-n and local-ld */
        if (uplo == MagmaUpper) {
            n_local[d] = ((n/nb)/num_gpus)*nb;
            if (d < (n/nb)%num_gpus)
               n_local[d] += nb;
            else if (d == (n/nb)%num_gpus)
              n_local[d] += n%nb;
        } else {
            n_local[d] = ((m/nb)/num_gpus)*nb;
            if (d < (m/nb)%num_gpus)
               n_local[d] += nb;
            else if (d == (m/nb)%num_gpus)
              n_local[d] += m%nb;
        }
      }

      /* Use blocked code. */
      if (uplo == MagmaUpper) 
        {
          /* ---------------------------------------------- */
          /* Upper-triangular case                          */
          /* > Compute the Cholesky factorization A = U'*U. */
          /* ---------------------------------------------- */
            for(j=0;j<m;j+=nb){
                /* Set the GPU number that holds the current panel */
                id  = (j/nb)%num_gpus;
                buf = (j/nb)%num_gpus;
                
                /* Set the local index where the current panel is */
                j_local = j/(nb*num_gpus);
                jb = min(nb, (m-j));

                if(j>0){
                    magma_queue_sync(queues[id*2]); // wait for the column on CPU
                    /* broadcast off-diagonal column to all gpus */
                    d = (j/nb+1)%num_gpus;
                    for(dd=0;dd<num_gpus;dd++){
                        if(d != id){
                            //magma_queue_sync(queues[2*d]);
                            magma_dsetmatrix_async( j, jb, 
                                                    Aup(0,j), lda, 
                                                    dlP(d,jb,0,buf), lddp, 
                                                    queues[d*2], NULL );
                        }
                        d = (d+1)%num_gpus;
                    }
                }
                /* Update the current diagonal block */
                if( j > 0 ) {
                    magma_dsyrk(MagmaUpper, MagmaConjTrans, jb, j, 
                                d_neg_one, dlA(id, 0, nb*j_local), ldda,
                                d_one,     dlA(id, j, nb*j_local), ldda,
                                queues[2*id+1]);
                }
                /* send the diagonal to cpu */
                magma_queue_sync(queues[2*id+1]);// wait for syrk
                magma_dgetmatrix_async( jb, jb, 
                                        dlA(id, j, nb*j_local), ldda,
                                        Aup(j,j), lda,
                                        queues[2*id], NULL);
                if(j>0){
                    /* Compute the local block column of the panel. */
                    d = (j/nb+1)%num_gpus;
                    for(dd=0;dd<num_gpus;dd++){
                        j_local2 = j_local+1;
                        if(d>id) j_local2 --;
                        nb0 = nb*j_local2;

                        if(n_local[d]>nb0){
                            /* wait for the off-diagonal */
                            if(d!=id){
                                dlpanel = d_lP[d];
                                dlpanel_offset = dlP_offset(jb, 0, buf);
                                ldpanel = lddp;
                        
                                /* wait for the offdiagonal column */
                                magma_queue_sync(queues[d*2]);
                            }else{
                                dlpanel = d_lA[d];
                                dlpanel_offset = dlA_offset(0, nb*j_local);
                                ldpanel = ldda;
                            }

                            /* update the panel */
                            magma_dgemm(MagmaConjTrans, MagmaNoTrans, 
                                        jb, n_local[d]-nb0, j, 
                                        c_neg_one, dlpanel, dlpanel_offset, ldpanel,
                                        dlA(d, 0, nb0), ldda, 
                                        c_one, dlA(d, j, nb0), ldda,
                                        queues[2*d+1]);

                        }
                        d = (d+1)%num_gpus;
                    }
                }

                /* factor the diagonal */
                magma_queue_sync( queues[id*2] ); // wait for the diagonal
                lapackf77_dpotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info);
                if (*info != 0) {
                    *info = *info + j;
                    break;
                }

                /* send the diagonal to gpus */
                if ( (j+jb) < n) {
                    d = (j/nb+1)%num_gpus;
                    for( dd=0; dd<num_gpus; dd++ ) {
                        if( d == id ) {
                            dlpanel = d_lA[d];
                            dlpanel_offset = dlA_offset(j, nb*j_local);
                            ldpanel = ldda;
                        } else {
                            dlpanel = d_lP[d];
                            dlpanel_offset = dlP_offset(0, 0, buf);                                            
                            ldpanel = lddp;
                        }
                        magma_dsetmatrix_async( jb, jb, 
                                                Aup(j,j), lda,
                                                dlpanel, dlpanel_offset,  ldpanel, 
                                                queues[d*2], NULL);
                        d = (d+1)%num_gpus;
                    }
                } else {
                    magma_dsetmatrix_async( jb, jb, 
                                            Aup(j,j), lda, 
                                            dlA(id, j, nb*j_local), ldda,
                                            queues[id*2], NULL );
                }

                /* panel-factorize the off-diagonal */
                if((j+jb)<n){
                    d = (j/nb+1)%num_gpus;
                    for(dd=0;dd<num_gpus;dd++){
                        /* next column */
                        j_local2 = j_local+1;
                        if(d>id) j_local2--;
                        if( d == id ) {
                            dlpanel = d_lA[d];
                            dlpanel_offset = dlA_offset(j, nb*j_local);
                            ldpanel = ldda;
                        } else {
                            dlpanel = d_lP[d];
                            dlpanel_offset = dlP_offset(0, 0, buf);
                            ldpanel = lddp;
                        }
                        nb2 = n_local[d]-nb*j_local2;
                        nb0 = min(nb, nb2 );
                        magma_queue_sync( queues[2*d]); // wait for the diagonal
                        if(j+jb < m && d == (j/nb+1)%num_gpus){
                            /* owns the next column, look-ahead the column */
                            magma_dtrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                         jb, nb0, c_one,
                                         dlpanel, dlpanel_offset, ldpanel,
                                         dlA(d, j, nb*j_local2), ldda, 
                                         queues[2*d+1]);
                            /* send the column to cpu */
                            if(j+jb < m){
                                magma_queue_sync(queues[2*d+1]);  // wait for lookahead
                                magma_dgetmatrix_async( (j+jb), nb0,
                                                        dlA(d, 0, nb*j_local2), ldda,
                                                        Aup(0,j+jb), lda,
                                                        queues[2*d], NULL);
                            }

                            /* update the remaining blocks */
                            nb2 = nb2 - nb0;

                            magma_dtrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                         jb, nb2, c_one, 
                                         dlpanel, dlpanel_offset, ldpanel,
                                         dlA(d, j, nb*j_local2+nb0), ldda, 
                                         queues[2*d+1]);
                        }else if(nb2 > 0){
                            /* update the entire trailing matrix */
                            magma_dtrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, 
                                         jb, nb2, c_one, 
                                         dlpanel, dlpanel_offset, ldpanel,
                                         dlA(d, j, nb*j_local2), ldda,
                                         queues[d*2+1]);
                        }
                        d = (d+1)%num_gpus;
                    }
                }
            }            
        } else { 
            /* -------------------------------------------- */
            /* Lower-triangular case                        */
            /* Compute the Cholesky factorization A = L*L'. */
            /* -------------------------------------------- */
            for (j=0; j<n; j+=nb) {

              /* Set the GPU number that holds the current panel */
              id  = (j/nb)%num_gpus;
              buf = (j/nb)%num_gpus;

              /* Set the local index where the current panel is */
              j_local = j/(nb*num_gpus);
              jb = min(nb, (n-j));

              if( j > 0 ) {
/* needed on pluto... */
//magma_setdevice(id);
                   magma_queue_sync( queues[id*2] ); // wait for the column on CPU

                  /* broadcast offdiagonal row to all gpus */
                  d = (j/nb+1)%num_gpus;
                  for( dd=0; dd<num_gpus; dd++ ) {
                      if( d != id ) {
                          /* wait for it on CPU */
                          //magma_queue_sync( queues[d*2] );

                          /* send it to GPU */
                          magma_dsetmatrix_async( jb, j,
                                                  Alo(j,0), lda,
                                                  dlPT(d,0,jb,buf), nb, 
                                                  queues[d*2], NULL );
                         clFlush(queues[d*2]);
                      }
                      d = (d+1)%num_gpus;
                  }
              }

              /* Update the current diagonal block */
              if( j > 0 ) {
                  magma_dsyrk(MagmaLower, MagmaNoTrans, jb, j,
                              d_neg_one, dlA(id, nb*j_local, 0), ldda,
                              d_one,     dlA(id, nb*j_local, j), ldda,
                              queues[id*2+1]);
                magma_queue_sync( queues[id*2+1] ); // wait for syrk
              }

              /* update the offdiagonal blocks */
              if ( j > 0 ) {
                  /* compute the block-rows of the panel */
                  d = (j/nb+1)%num_gpus;
                  for( dd=0; dd<num_gpus; dd++ ) {
                      j_local2 = j_local+1;
                      if( d > id ) j_local2 --;
                      nb0 = nb*j_local2;

                      if( nb0 < n_local[d] ) {
                          if( d != id ) {
                              //dlpanel = dlPT(d);
                              dlpanel = d_lP[d];
                              dlpanel_offset = dlPT_offset(0, jb, buf);
                              ldpanel = nb;

                              /* wait for offdiagonal row */
                              magma_queue_sync(queues[d*2]);
                          } else {
                              dlpanel = d_lA[d];
                              dlpanel_offset = dlA_offset(nb*j_local, 0);
                              ldpanel = ldda;
                          }

                          magma_dgemm( MagmaNoTrans, MagmaConjTrans,
                                       n_local[d]-nb0, jb, j,
                                       c_neg_one, dlA(d, nb0, 0), ldda,
                                                  dlpanel, dlpanel_offset, ldpanel,
                                       c_one,     dlA(d, nb0, j), ldda, 
                                       queues[d*2+1]);
                      }
                      d = (d+1)%num_gpus;
                  }
              }

              /* send the diagonal to cpu */
              magma_dgetmatrix_async( jb, jb,
                                      dlA(id, nb*j_local, j), ldda,
                                      Alo(j,j),               lda, 
                                      queues[id*2], &events[id] );
              clFlush(queues[id*2]);
              /* factor the diagonal */
              magma_queue_sync( queues[id*2] );
              lapackf77_dpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info);
              if (*info != 0) {
                  printf("row number: %d\n", (int) j);
                  *info = *info + j;
                  break;
              }

              /* send the diagonal to gpus */
              if ( (j+jb) < m ) {
                  d = (j/nb+1)%num_gpus;
                  for( dd=0; dd<num_gpus; dd++ ) {
                      if( d == id ) {
                          dlpanel = d_lA[d];
                          dlpanel_offset = dlA_offset(nb*j_local, j);
                          ldpanel = ldda;
                      } else {
                          //dlpanel = dlPT(d);
                          dlpanel = d_lP[d];
                          dlpanel_offset = dlPT_offset(0, 0, buf);
                          ldpanel = nb;
                      }
                      magma_dsetmatrix_async( jb, jb,
                                              Alo(j,j), lda,
                                              dlpanel,  dlpanel_offset, ldpanel, 
                                              queues[d*2], NULL );
                      clFlush(queues[d*2]);
                      d = (d+1)%num_gpus;
                  }
              } else {
                  magma_dsetmatrix_async( jb, jb,
                                          Alo(j,j),       lda,
                                          dlA(id, nb*j_local, j), ldda, 
                                          queues[id*2], NULL );
                  clFlush(queues[id*2]);
              }

              /* factorize off-diagonal blocks */
              if ( (j+jb) < m ) {
                  d = (j/nb+1)%num_gpus;
                  for( dd=0; dd<num_gpus; dd++ ) {
                      /* next column */
                      j_local2 = j_local+1;
                      if( d > id ) j_local2--;
                      if( d == id ) {
                          dlpanel = d_lA[d];
                          dlpanel_offset = dlA_offset(nb*j_local, j);
                          ldpanel = ldda;
                      } else {         
                          //dlpanel = dlPT(d);
                          dlpanel = d_lP[d];
                          dlpanel_offset = dlPT_offset(0, 0, buf);
                          ldpanel = nb;
                      }
                      nb2 = n_local[d] - j_local2*nb;
                      nb0 = min(nb, nb2 );
                        
                      magma_queue_sync(queues[d*2]);
                      // wait for the diagonal
                      if( j+jb < n && d == (j/nb+1)%num_gpus ) {
                          /* owns the next column, look-ahead the column */
                          magma_dtrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, 
                                       nb0, jb, c_one,
                                       dlpanel,  dlpanel_offset, ldpanel, 
                                       dlA(d, nb*j_local2, j), ldda,
                                       queues[d*2+1]);
                          /* send the column to cpu */
                          if( j+jb < n ) {
                              magma_queue_sync( queues[d*2+1] ); // wait for lookahead
                              magma_dgetmatrix_async( nb0, j+jb,
                                                      dlA(d, nb*j_local2, 0), ldda,
                                                      Alo(j+jb,0),            lda, 
                                                      queues[d*2], NULL);
                              clFlush(queues[d*2]);
                          }
                          /* update the remaining blocks */
                          nb2 = nb2 - nb0;
                          magma_dtrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, 
                                       nb2, jb, c_one,
                                       dlpanel, dlpanel_offset, ldpanel, 
                                       dlA(d, nb*j_local2+nb0, j), ldda, 
                                       queues[d*2+1]);
                      } else if( nb2 > 0 ) {
                          /* update the entire trailing matrix */
                          magma_dtrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, 
                                       nb2, jb, c_one,
                                       dlpanel, dlpanel_offset, ldpanel, 
                                       dlA(d, nb*j_local2, j), ldda, 
                                       queues[d*2+1]);
                      }
                      d = (d+1)%num_gpus;
                  }
              }
            }
          } /* end of else not upper */

          /* clean up */
          for( d=0; d<num_gpus; d++ ) {
              magma_queue_sync( queues[d*2] );
              magma_queue_sync( queues[d*2+1] );
          }

    } /* end of not lapack */

    return *info;
} /* magma_dpotrf_mgpu */
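magma_dpotrf2_mgpu above distributes block columns over the GPUs in a 1-D block-cyclic layout: block column j/nb lives on GPU id = (j/nb) % num_gpus at local block index j_local = j/(nb*num_gpus). A small standalone sketch of that mapping; the matrix and block sizes are arbitrary illustration values.

#include <stdio.h>

int main(void) {
    const int n = 1000, nb = 128, num_gpus = 3;
    for (int j = 0; j < n; j += nb) {
        int id      = (j / nb) % num_gpus;        // owning GPU
        int j_local = j / (nb * num_gpus);        // local block index on that GPU
        int jb      = (n - j < nb) ? n - j : nb;  // width of this block column
        printf("cols %4d..%4d -> GPU %d, local block %d\n",
               j, j + jb - 1, id, j_local);
    }
    return 0;
}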
Example #21
/**
    Purpose
    -------
    DPOTRF computes the Cholesky factorization of a real symmetric
    positive definite matrix dA.

    The factorization has the form
        dA = U**H * U,   if UPLO = MagmaUpper, or
        dA = L  * L**H,  if UPLO = MagmaLower,
    where U is an upper triangular matrix and L is lower triangular.

    This is the block version of the algorithm, calling Level 3 BLAS.
    If the current stream is NULL, this version replaces it with a new
    stream to overlap computation with communication.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of dA is stored;
      -     = MagmaLower:  Lower triangle of dA is stored.

    @param[in]
    n       INTEGER
            The order of the matrix dA.  N >= 0.

    @param[in,out]
    dA      DOUBLE_PRECISION array on the GPU, dimension (LDDA,N)
            On entry, the symmetric matrix dA.  If UPLO = MagmaUpper, the leading
            N-by-N upper triangular part of dA contains the upper
            triangular part of the matrix dA, and the strictly lower
            triangular part of dA is not referenced.  If UPLO = MagmaLower, the
            leading N-by-N lower triangular part of dA contains the lower
            triangular part of the matrix dA, and the strictly upper
            triangular part of dA is not referenced.
    \n
            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization dA = U**H * U or dA = L * L**H.

    @param[in]
    ldda     INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalesced memory accesses, LDDA must be
            divisible by 16.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.

    @ingroup magma_dposv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_dpotrf_gpu(magma_uplo_t uplo, magma_int_t n,
                 double *dA, magma_int_t ldda, magma_int_t *info)
{
#define dA(i, j) (dA + (j)*ldda + (i))

    magma_int_t     j, jb, nb;
    const char* uplo_ = lapack_uplo_const( uplo );
    double c_one     = MAGMA_D_ONE;
    double c_neg_one = MAGMA_D_NEG_ONE;
    double *work;
    double          d_one     =  1.0;
    double          d_neg_one = -1.0;
    int upper = (uplo == MagmaUpper);

    *info = 0;
    if (! upper && uplo != MagmaLower) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,n)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    nb = magma_get_dpotrf_nb(n);

    if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, nb*nb )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }

    /* Define user stream if current stream is NULL */
    magma_queue_t stream[2];
    
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );
    
    magma_queue_create( &stream[0] );
    if (orig_stream == NULL) {
        magma_queue_create( &stream[1] );
        magmablasSetKernelStream(stream[1]);
    }
    else {
        stream[1] = orig_stream;
    }
    
    if ((nb <= 1) || (nb >= n)) {
        /* Use unblocked code. */
        magma_dgetmatrix_async( n, n, dA, ldda, work, n, stream[1] );
        magma_queue_sync( stream[1] );
        lapackf77_dpotrf(uplo_, &n, work, &n, info);
        magma_dsetmatrix_async( n, n, work, n, dA, ldda, stream[1] );
    }
    else {
        /* Use blocked code. */
        if (upper) {
            
            /* Compute the Cholesky factorization A = U'*U. */
            for (j=0; j < n; j += nb) {
                
                /* Update and factorize the current diagonal block and test
                   for non-positive-definiteness. Computing MIN */
                jb = min(nb, (n-j));
                
                magma_dsyrk(MagmaUpper, MagmaConjTrans, jb, j,
                            d_neg_one, dA(0, j), ldda,
                            d_one,     dA(j, j), ldda);

                magma_queue_sync( stream[1] );
                magma_dgetmatrix_async( jb, jb,
                                        dA(j, j), ldda,
                                        work,     jb, stream[0] );
                
                if ( (j+jb) < n) {
                    /* Compute the current block row. */
                    magma_dgemm(MagmaConjTrans, MagmaNoTrans,
                                jb, (n-j-jb), j,
                                c_neg_one, dA(0, j   ), ldda,
                                           dA(0, j+jb), ldda,
                                c_one,     dA(j, j+jb), ldda);
                }
                
                magma_queue_sync( stream[0] );
                lapackf77_dpotrf(MagmaUpperStr, &jb, work, &jb, info);
                magma_dsetmatrix_async( jb, jb,
                                        work,     jb,
                                        dA(j, j), ldda, stream[1] );
                if (*info != 0) {
                    *info = *info + j;
                    break;
                }

                if ( (j+jb) < n) {
                    magma_dtrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                 jb, (n-j-jb),
                                 c_one, dA(j, j   ), ldda,
                                        dA(j, j+jb), ldda);
                }
            }
        }
        else {
            //=========================================================
            // Compute the Cholesky factorization A = L*L'.
            for (j=0; j < n; j += nb) {
                //  Update and factorize the current diagonal block and test
                //  for non-positive-definiteness. Computing MIN
                jb = min(nb, (n-j));

                magma_dsyrk(MagmaLower, MagmaNoTrans, jb, j,
                            d_neg_one, dA(j, 0), ldda,
                            d_one,     dA(j, j), ldda);
                
                magma_queue_sync( stream[1] );
                magma_dgetmatrix_async( jb, jb,
                                        dA(j, j), ldda,
                                        work,     jb, stream[0] );
                
                if ( (j+jb) < n) {
                    magma_dgemm( MagmaNoTrans, MagmaConjTrans,
                                 (n-j-jb), jb, j,
                                 c_neg_one, dA(j+jb, 0), ldda,
                                            dA(j,    0), ldda,
                                 c_one,     dA(j+jb, j), ldda);
                }

                magma_queue_sync( stream[0] );
                lapackf77_dpotrf(MagmaLowerStr, &jb, work, &jb, info);
                magma_dsetmatrix_async( jb, jb,
                                        work,     jb,
                                        dA(j, j), ldda, stream[1] );
                if (*info != 0) {
                    *info = *info + j;
                    break;
                }
                
                if ( (j+jb) < n) {
                    magma_dtrsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                (n-j-jb), jb,
                                c_one, dA(j,    j), ldda,
                                       dA(j+jb, j), ldda);
                }
            }
        }
    }

    magma_free_pinned( work );

    magma_queue_destroy( stream[0] );
    if (orig_stream == NULL) {
        magma_queue_destroy( stream[1] );
    }
    magmablasSetKernelStream( orig_stream );

    return *info;
} /* magma_dpotrf_gpu */
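A minimal host-side driver for magma_dpotrf_gpu, assuming the stream-era MAGMA 1.x interface these snippets are written against; the diagonally dominant test matrix is only an illustration.

#include <stdio.h>
#include "magma.h"

int main(void) {
    magma_init();

    magma_int_t n = 512, ldda = ((n + 31)/32)*32, info;  // padded leading dimension
    double *hA, *dA;
    magma_dmalloc_pinned(&hA, n * n);
    magma_dmalloc(&dA, n * ldda);

    // symmetric positive definite test matrix: n on the diagonal, 1 elsewhere
    for (magma_int_t j = 0; j < n; ++j)
        for (magma_int_t i = 0; i < n; ++i)
            hA[i + j*n] = (i == j) ? (double)n : 1.0;

    magma_dsetmatrix(n, n, hA, n, dA, ldda);           // upload A
    magma_dpotrf_gpu(MagmaLower, n, dA, ldda, &info);  // A = L * L^T
    if (info != 0)
        printf("magma_dpotrf_gpu: info = %d\n", (int)info);
    magma_dgetmatrix(n, n, dA, ldda, hA, n);           // fetch the factor

    magma_free(dA);
    magma_free_pinned(hA);
    magma_finalize();
    return 0;
}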
Example #22
extern "C" magma_int_t
magma_dlatrd2(char uplo, magma_int_t n, magma_int_t nb,
              double *a,  magma_int_t lda,
              double *e, double *tau,
              double *w,  magma_int_t ldw,
              double *da, magma_int_t ldda,
              double *dw, magma_int_t lddw,
              double *dwork, magma_int_t ldwork)
{
/*  -- MAGMA (version 1.4.1) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       December 2013

    Purpose
    =======
    DLATRD2 reduces NB rows and columns of a real symmetric matrix A to
    symmetric tridiagonal form by an orthogonal similarity
    transformation Q' * A * Q, and returns the matrices V and W which are
    needed to apply the transformation to the unreduced part of A.

    If UPLO = 'U', DLATRD reduces the last NB rows and columns of a
    matrix, of which the upper triangle is supplied;
    if UPLO = 'L', DLATRD reduces the first NB rows and columns of a
    matrix, of which the lower triangle is supplied.

    This is an auxiliary routine called by DSYTRD2_GPU. It uses an
    accelerated HEMV that needs extra memory.

    Arguments
    =========
    UPLO    (input) CHARACTER*1
            Specifies whether the upper or lower triangular part of the
            symmetric matrix A is stored:
            = 'U': Upper triangular
            = 'L': Lower triangular

    N       (input) INTEGER
            The order of the matrix A.

    NB      (input) INTEGER
            The number of rows and columns to be reduced.

    A       (input/output) DOUBLE_PRECISION array, dimension (LDA,N)
            On entry, the symmetric matrix A.  If UPLO = 'U', the leading
            n-by-n upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = 'L', the
            leading n-by-n lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
            On exit:
            if UPLO = 'U', the last NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements above the diagonal
              with the array TAU, represent the orthogonal matrix Q as a
              product of elementary reflectors;
            if UPLO = 'L', the first NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements below the diagonal
              with the array TAU, represent the  orthogonal matrix Q as a
              product of elementary reflectors.
            See Further Details.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    E       (output) DOUBLE_PRECISION array, dimension (N-1)
            If UPLO = 'U', E(n-nb:n-1) contains the superdiagonal
            elements of the last NB columns of the reduced matrix;
            if UPLO = 'L', E(1:nb) contains the subdiagonal elements of
            the first NB columns of the reduced matrix.

    TAU     (output) DOUBLE_PRECISION array, dimension (N-1)
            The scalar factors of the elementary reflectors, stored in
            TAU(n-nb:n-1) if UPLO = 'U', and in TAU(1:nb) if UPLO = 'L'.
            See Further Details.

    W       (output) DOUBLE_PRECISION array, dimension (LDW,NB)
            The n-by-nb matrix W required to update the unreduced part
            of A.

    LDW     (input) INTEGER
            The leading dimension of the array W. LDW >= max(1,N).

    Further Details
    ===============
    If UPLO = 'U', the matrix Q is represented as a product of elementary
    reflectors

       Q = H(n) H(n-1) . . . H(n-nb+1).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i),
    and tau in TAU(i-1).

    If UPLO = 'L', the matrix Q is represented as a product of elementary
    reflectors

       Q = H(1) H(2) . . . H(nb).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i),
    and tau in TAU(i).

    The elements of the vectors v together form the n-by-nb matrix V
    which is needed, with W, to apply the transformation to the unreduced
    part of the matrix, using a symmetric rank-2k update of the form:
    A := A - V*W' - W*V'.

    The contents of A on exit are illustrated by the following examples
    with n = 5 and nb = 2:

    if UPLO = 'U':                       if UPLO = 'L':

      (  a   a   a   v4  v5 )              (  d                  )
      (      a   a   v4  v5 )              (  1   d              )
      (          a   1   v5 )              (  v1  1   a          )
      (              d   1  )              (  v1  v2  a   a      )
      (                  d  )              (  v1  v2  a   a   a  )

    where d denotes a diagonal element of the reduced matrix, a denotes
    an element of the original matrix that is unchanged, and vi denotes
    an element of the vector defining H(i).
    =====================================================================    */
    
    char uplo_[2]  = {uplo, 0};

    magma_int_t i;
    
    double c_neg_one = MAGMA_D_NEG_ONE;
    double c_one     = MAGMA_D_ONE;
    double c_zero    = MAGMA_D_ZERO;

    double value = MAGMA_D_ZERO;
    
    magma_int_t ione = 1;

    magma_int_t i_n, i_1, iw;
    
    double alpha;
    double *f;

    if (n <= 0) {
        return 0;
    }

    magma_queue_t stream;
    magma_queue_create( &stream );
    magma_dmalloc_cpu( &f, n );
    assert( f != NULL );  // TODO return error, or allocate outside dlatrd
    
    if (lapackf77_lsame(uplo_, "U")) {

        /* Reduce last NB columns of upper triangle */
        for (i = n-1; i >= n - nb ; --i) {
            i_1 = i + 1;
            i_n = n - i - 1;
            
            iw = i - n + nb;
            if (i < n-1) {
                /* Update A(1:i,i) */
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv(&i_n, W(i, iw+1), &ldw);
                #endif
                blasf77_dgemv("No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda,
                              W(i, iw+1), &ldw, &c_one, A(0, i), &ione);
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv(&i_n, W(i, iw+1), &ldw);
                lapackf77_dlacgv(&i_n, A(i, i+1), &ldw);
                #endif
                blasf77_dgemv("No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                              A(i, i+1), &lda, &c_one, A(0, i), &ione);
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv(&i_n, A(i, i+1), &ldw);
                #endif
            }
            if (i > 0) {
                /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */
                
                alpha = *A(i-1, i);
                
                lapackf77_dlarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]);
                
                e[i-1] = MAGMA_D_REAL( alpha );
                *A(i-1,i) = MAGMA_D_MAKE( 1, 0 );
                
                /* Compute W(1:i-1,i) */
                // 1. Send the block reflector  A(0:n-i-1,i) to the GPU
                magma_dsetvector( i, A(0, i), 1, dA(0, i), 1 );
                
                //#if (GPUSHMEM < 200)
                //magma_dsymv(MagmaUpper, i, c_one, dA(0, 0), ldda,
                //            dA(0, i), ione, c_zero, dW(0, iw), ione);
                //#else
                magmablas_dsymv_work(MagmaUpper, i, c_one, dA(0, 0), ldda,
                                     dA(0, i), ione, c_zero, dW(0, iw), ione,
                                     dwork, ldwork);
                //#endif
                
                // 2. Start putting the result back (asynchronously)
                magma_dgetmatrix_async( i, 1,
                                        dW(0, iw),         lddw,
                                        W(0, iw) /*test*/, ldw, stream );
                
                if (i < n-1) {
                    blasf77_dgemv(MagmaTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw,
                                  A(0, i), &ione, &c_zero, W(i+1, iw), &ione);
                }
                
                // 3. Here is where we need it // TODO find the right place
                magma_queue_sync( stream );
                
                if (i < n-1) {
                    blasf77_dgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda,
                                  W(i+1, iw), &ione, &c_one, W(0, iw), &ione);
                    
                    blasf77_dgemv(MagmaTransStr, &i, &i_n, &c_one, A(0, i+1), &lda,
                                  A(0, i), &ione, &c_zero, W(i+1, iw), &ione);
                    
                    blasf77_dgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                                  W(i+1, iw), &ione, &c_one, W(0, iw), &ione);
                }
                
                blasf77_dscal(&i, &tau[i - 1], W(0, iw), &ione);
                
                #if defined(PRECISION_z) || defined(PRECISION_c)
                cblas_ddot_sub( i, W(0,iw), ione, A(0,i), ione, &value );
                #else
                value = cblas_ddot( i, W(0,iw), ione, A(0,i), ione );
                #endif
                alpha = tau[i - 1] * -0.5f * value;
                blasf77_daxpy(&i, &alpha, A(0, i), &ione,
                              W(0, iw), &ione);
            }
        }
    }
    else {
        /*  Reduce first NB columns of lower triangle */
        for (i = 0; i < nb; ++i) {
            
            /* Update A(i:n,i) */
            i_n = n - i;
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv(&i, W(i, 0), &ldw);
            #endif
            blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda,
                          W(i, 0), &ldw, &c_one, A(i, i), &ione);
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv(&i, W(i, 0), &ldw);
            lapackf77_dlacgv(&i, A(i ,0), &lda);
            #endif
            blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw,
                          A(i, 0), &lda, &c_one, A(i, i), &ione);
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv(&i, A(i, 0), &lda);
            #endif
        
            if (i < n-1) {
                /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */
                i_n = n - i - 1;
                alpha = *A(i+1, i);
                lapackf77_dlarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]);
                e[i] = MAGMA_D_REAL( alpha );
                *A(i+1,i) = MAGMA_D_MAKE( 1, 0 );
        
                /* Compute W(i+1:n,i) */
                // 1. Send the block reflector  A(i+1:n,i) to the GPU
                magma_dsetvector( i_n, A(i+1, i), 1, dA(i+1, i), 1 );
            
                //#if (GPUSHMEM < 200)
                //magma_dsymv(MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero,
                //            dW(i+1, i), ione);
                //#else
                magmablas_dsymv_work('L', i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero,
                                     dW(i+1, i), ione,
                                     dwork, ldwork);
                //#endif
        
                // 2. Start putting the result back (asynchronously)
                magma_dgetmatrix_async( i_n, 1,
                                        dW(i+1, i), lddw,
                                        W(i+1, i),  ldw, stream );
        
                blasf77_dgemv(MagmaTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw,
                              A(i+1, i), &ione, &c_zero, W(0, i), &ione);
            
                blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda,
                              W(0, i), &ione, &c_zero, f, &ione);
                
                blasf77_dgemv(MagmaTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda,
                              A(i+1, i), &ione, &c_zero, W(0, i), &ione);
        
                // 3. Here is where we need it
                magma_queue_sync( stream );
        
                if (i!=0)
                  blasf77_daxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione);
        
                blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw,
                              W(0, i), &ione, &c_one, W(i+1, i), &ione);
                blasf77_dscal(&i_n, &tau[i], W(i+1,i), &ione);
                #if defined(PRECISION_z) || defined(PRECISION_c)
                cblas_ddot_sub( i_n, W(i+1,i), ione, A(i+1,i), ione, &value );
                #else
                value = cblas_ddot( i_n, W(i+1,i), ione, A(i+1,i), ione );
                #endif
                alpha = tau[i] * -0.5f * value;
                blasf77_daxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione);
            }
        }
    }

    magma_free_cpu(f);
    magma_queue_destroy( stream );

    return 0;
} /* dlatrd */
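The header comment above describes reflectors of the form H(i) = I - tau*v*v'. Below is a standalone check (plain C, no MAGMA) that builds tau and v for one vector, much as dlarfg does, and verifies that applying H zeroes everything below the first entry; the numbers are arbitrary.

#include <math.h>
#include <stdio.h>

int main(void) {
    double x[4] = {3.0, 1.0, 5.0, 1.0};   // assumed not already a multiple of e1
    const int n = 4;

    // generate the reflector: beta = -sign(x0)*||x|| avoids cancellation
    double norm = 0.0;
    for (int i = 0; i < n; ++i) norm += x[i] * x[i];
    norm = sqrt(norm);
    double beta = (x[0] >= 0.0) ? -norm : norm;
    double tau  = (beta - x[0]) / beta;
    double v[4] = {1.0, 0.0, 0.0, 0.0};    // v(0) = 1 by convention
    for (int i = 1; i < n; ++i) v[i] = x[i] / (x[0] - beta);

    // apply H*x = x - tau*(v'*x)*v
    double vtx = 0.0;
    for (int i = 0; i < n; ++i) vtx += v[i] * x[i];
    for (int i = 0; i < n; ++i) x[i] -= tau * vtx * v[i];

    // expect H*x = (beta, 0, 0, 0); here beta = -6
    printf("H*x = [% .3e % .3e % .3e % .3e], beta = % .3e\n",
           x[0], x[1], x[2], x[3], beta);
    return 0;
}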
Example #23
extern "C" magma_err_t
magma_dgeqrf_gpu( magma_int_t m, magma_int_t n,
                  magmaDouble_ptr dA, size_t dA_offset,  magma_int_t ldda,
                  double *tau, magmaDouble_ptr dT, size_t dT_offset,
                  magma_int_t *info, magma_queue_t queue)
{
/*  -- clMAGMA (version 1.1.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date January 2014

    Purpose
    =======
    DGEQRF computes a QR factorization of a DOUBLE_PRECISION M-by-N matrix A:
    A = Q * R. This version stores the triangular matrices used in
    the factorization so that they can be applied directly (i.e.,
    without being recomputed) later. As a result, the application
    of Q is much faster.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N)
            On entry, the M-by-N matrix A.
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further
            Details).

    LDDA     (input) INTEGER
            The leading dimension of the array A.  LDDA >= max(1,M).
            To benefit from coalesced memory accesses, LDDA must be
            divisible by 16.

    TAU     (output) DOUBLE_PRECISION array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    dT      (workspace/output)  DOUBLE_PRECISION array on the GPU,
            dimension (2*MIN(M, N) + (N+31)/32*32 )*NB,
            where NB can be obtained through magma_get_dgeqrf_nb(M).
            It starts with a MIN(M,N)*NB block that stores the triangular T
            matrices, followed by a MIN(M,N)*NB block of the diagonal
            inverses for the R matrix. The rest of the array is used as workspace.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.

    Further Details
    ===============
    The matrix Q is represented as a product of elementary reflectors

       Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).
    =====================================================================    */

    #define a_ref(a_1,a_2) dA, (dA_offset + (a_1) + (a_2)*(ldda))
    #define t_ref(a_1)     dT, (dT_offset + (a_1)*nb)
    #define d_ref(a_1)     dT, (dT_offset + (minmn + (a_1))*nb)
    #define dd_ref(a_1)    dT, (dT_offset + (2*minmn+(a_1))*nb)
    #define work_ref(a_1)  ( work + (a_1))
    #define hwork          ( work + (nb)*(m))

    magma_int_t i, k, minmn, old_i, old_ib, rows, cols;
    magma_int_t ib, nb;
    magma_int_t ldwork, lddwork, lwork, lhwork;
    double *work, *ut;

    /* check arguments */
    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    k = minmn = min(m,n);
    if (k == 0)
        return *info;

    nb = magma_get_dgeqrf_nb(m);

    lwork  = (m + n + nb)*nb;
    lhwork = lwork - m*nb;

    if (MAGMA_SUCCESS != magma_dmalloc_cpu( &work, lwork )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }
    
    ut = hwork+nb*(n);
    memset( ut, 0, nb*nb*sizeof(double));

    magma_event_t event[2] = {NULL, NULL};

    ldwork = m;
    lddwork= n;

    if ( (nb > 1) && (nb < k) ) {
        /* Use blocked code initially */
        old_i = 0; old_ib = nb;
        for (i = 0; i < k-nb; i += nb) {
            ib = min(k-i, nb);
            rows = m -i;
            magma_dgetmatrix_async( rows, ib,
                                    a_ref(i,i),  ldda,
                                    work_ref(i), 0, ldwork, queue, &event[1] );
            if (i>0){
                /* Apply H' to A(i:m,i+2*ib:n) from the left */
                cols = n-old_i-2*old_ib;
                magma_dlarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise,
                                  m-old_i, cols, old_ib,
                                  a_ref(old_i, old_i         ), ldda, t_ref(old_i), nb,
                                  a_ref(old_i, old_i+2*old_ib), ldda, dd_ref(0),    lddwork, queue);
                
                /* store the diagonal */
                magma_dsetmatrix_async( old_ib, old_ib,
                                        ut, 0, old_ib,
                                        d_ref(old_i), old_ib, queue, &event[0] );
            }
            
            magma_event_sync(event[1]);
            lapackf77_dgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info);
            /* Form the triangular factor of the block reflector
               H = H(i) H(i+1) . . . H(i+ib-1) */
            lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr,
                              &rows, &ib,
                              work_ref(i), &ldwork, tau+i, hwork, &ib);

            /* Put 0s in the upper triangular part of a panel (and 1s on the
               diagonal); copy the upper triangular in ut and invert it     */
            magma_event_sync(event[0]);
            dsplit_diag_block(ib, work_ref(i), ldwork, ut);
            magma_dsetmatrix( rows, ib, work_ref(i), 0, ldwork, a_ref(i,i), ldda, queue);
            
            if (i + ib < n) {
                /* Send the triangular factor T to the GPU */
                magma_dsetmatrix( ib, ib, hwork, 0, ib, t_ref(i), nb, queue );

                if (i+nb < k-nb){
                    /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */
                    magma_dlarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise,
                                      rows, ib, ib,
                                      a_ref(i, i   ), ldda, t_ref(i),  nb,
                                      a_ref(i, i+ib), ldda, dd_ref(0), lddwork, queue);
                }
                else {
                    cols = n-i-ib;
                    magma_dlarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise,
                                      rows, cols, ib,
                                      a_ref(i, i   ), ldda, t_ref(i),  nb,
                                      a_ref(i, i+ib), ldda, dd_ref(0), lddwork, queue);
                    /* Fix the diagonal block */
                    magma_dsetmatrix( ib, ib, ut, 0, ib, d_ref(i), ib , queue);
                }
                old_i  = i;
                old_ib = ib;
            }
        }
    } else {
        i = 0;
    }

    /* Use unblocked code to factor the last or only block. */
    if (i < k) {
        ib   = n-i;
        rows = m-i;
        magma_dgetmatrix( rows, ib, a_ref(i, i), ldda, work, 0, rows, queue );
        lhwork = lwork - rows*ib;
        lapackf77_dgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info);
        
        magma_dsetmatrix( rows, ib, work, 0, rows, a_ref(i, i), ldda, queue );
    }

    magma_free_cpu( work );
    return *info;
} /* magma_dgeqrf */
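The dT workspace layout documented above (and addressed through the t_ref/d_ref/dd_ref macros) consists of MIN(M,N)*NB elements for the triangular T factors, another MIN(M,N)*NB for the inverted diagonal blocks, and a padded N*NB tail used as scratch. A small sketch of the size computation; nb = 128 is a stand-in for what magma_get_dgeqrf_nb(m) would return.

#include <stdio.h>

int main(void) {
    int m = 3000, n = 2000;
    int nb = 128;                      // stand-in for magma_get_dgeqrf_nb(m)
    int minmn = (m < n) ? m : n;
    size_t dT_elems = (size_t)(2 * minmn + ((n + 31)/32)*32) * nb;
    printf("dT needs %zu doubles (%.1f MiB)\n",
           dT_elems, dT_elems * sizeof(double) / (1024.0 * 1024.0));
    return 0;
}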
Example #24
/**
    Purpose
    -------
    DSYGST reduces a real symmetric-definite generalized
    eigenproblem to standard form.
    
    If ITYPE = 1, the problem is A*x = lambda*B*x,
    and A is overwritten by inv(U^H)*A*inv(U) or inv(L)*A*inv(L^H)
    
    If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or
    B*A*x = lambda*x, and A is overwritten by U*A*U^H or L^H*A*L.
    
    B must have been previously factorized as U^H*U or L*L^H by DPOTRF.
    
    Arguments
    ---------
    @param[in]
    itype   INTEGER
            = 1: compute inv(U^H)*A*inv(U) or inv(L)*A*inv(L^H);
            = 2 or 3: compute U*A*U^H or L^H*A*L.
    
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of A is stored and B is factored as U^H*U;
      -     = MagmaLower:  Lower triangle of A is stored and B is factored as L*L^H.
    
    @param[in]
    n       INTEGER
            The order of the matrices A and B.  N >= 0.
    
    @param[in,out]
    A       DOUBLE PRECISION array, dimension (LDA,N)
            On entry, the symmetric matrix A.  If UPLO = MagmaUpper, the leading
            N-by-N upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = MagmaLower, the
            leading N-by-N lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
    \n
            On exit, if INFO = 0, the transformed matrix, stored in the
            same format as A.
    
    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).
    
    @param[in,out]
    B       DOUBLE PRECISION array, dimension (LDB,N)
            The triangular factor from the Cholesky factorization of B,
            as returned by DPOTRF.
            
            B is modified by the routine but restored on exit (in lapack dsygst/dsygs2).
    
    @param[in]
    ldb     INTEGER
            The leading dimension of the array B.  LDB >= max(1,N).
    
    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_dsyev_comp
    ********************************************************************/
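/* Illustration only, not part of the original routine: a minimal usage
   sketch for magma_dsygst as defined below. Per the documentation, B is
   factored by DPOTRF first, then the itype = 1 transform is applied to A
   in place. The helper name is hypothetical. */
extern "C" magma_int_t
magma_dsygst( magma_int_t itype, magma_uplo_t uplo, magma_int_t n,
              double *A, magma_int_t lda,
              double *B, magma_int_t ldb,
              magma_int_t *info );

static magma_int_t example_standardize( magma_int_t n, double *A, double *B )
{
    magma_int_t info;
    lapackf77_dpotrf( MagmaLowerStr, &n, B, &n, &info );      /* B = L*L^H */
    if (info == 0)
        magma_dsygst( 1, MagmaLower, n, A, n, B, n, &info );  /* A <- inv(L)*A*inv(L^H) */
    return info;
}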
extern "C" magma_int_t
magma_dsygst(
    magma_int_t itype, magma_uplo_t uplo, magma_int_t n,
    double *A, magma_int_t lda,
    double *B, magma_int_t ldb,
    magma_int_t *info)
{
    #define A(i_, j_) (A + (i_) + (j_)*lda)
    #define B(i_, j_) (B + (i_) + (j_)*ldb)
    
    #define dA(i_, j_) (dwork + (i_) + (j_)*ldda         )
    #define dB(i_, j_) (dwork + (i_) + (j_)*lddb + n*ldda)

    /* Constants */
    const double c_one      = MAGMA_D_ONE;
    const double c_neg_one  = MAGMA_D_NEG_ONE;
    const double c_half     = MAGMA_D_HALF;
    const double c_neg_half = MAGMA_D_NEG_HALF;
    const double             d_one      = 1.0;
    
    /* Local variables */
    const char* uplo_ = lapack_uplo_const( uplo );
    magma_int_t k, kb, kb2, nb;
    magma_int_t ldda = n;
    magma_int_t lddb = n;
    magmaDouble_ptr dwork;
    bool upper = (uplo == MagmaUpper);
    
    /* Test the input parameters. */
    *info = 0;
    if (itype < 1 || itype > 3) {
        *info = -1;
    } else if (! upper && uplo != MagmaLower) {
        *info = -2;
    } else if (n < 0) {
        *info = -3;
    } else if (lda < max(1,n)) {
        *info = -5;
    } else if (ldb < max(1,n)) {
        *info = -7;
    }
    
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    
    /* Quick return */
    if ( n == 0 )
        return *info;
    
    if (MAGMA_SUCCESS != magma_dmalloc( &dwork, 2*n*n )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
    
    nb = magma_get_dsygst_nb( n );
    
    magma_queue_t queues[2];
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queues[0] );
    magma_queue_create( cdev, &queues[1] );
    
    magma_dsetmatrix( n, n, A(0, 0), lda, dA(0, 0), ldda, queues[1] );
    magma_dsetmatrix( n, n, B(0, 0), ldb, dB(0, 0), lddb, queues[1] );
    
    /* Use hybrid blocked code */
    if (itype == 1) {
        if (upper) {
            /* Compute inv(U^H)*A*inv(U) */
            for (k = 0; k < n; k += nb) {
                kb  = min( n-k,    nb );
                kb2 = min( n-k-nb, nb );
                
                /* Update the upper triangle of A(k:n,k:n) */
                lapackf77_dsygst( &itype, uplo_, &kb, A(k,k), &lda, B(k,k), &ldb, info );
                
                magma_dsetmatrix_async( kb, kb,
                                         A(k, k), lda,
                                        dA(k, k), ldda, queues[0] );
                
                if (k+kb < n) {
                    magma_dtrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                 kb, n-k-kb,
                                 c_one, dB(k,k),    lddb,
                                        dA(k,k+kb), ldda, queues[1] );
                    
                    magma_queue_sync( queues[0] );  // finish set dA(k,k)
                    
                    magma_dsymm( MagmaLeft, MagmaUpper,
                                 kb, n-k-kb,
                                 c_neg_half, dA(k,k),    ldda,
                                             dB(k,k+kb), lddb,
                                 c_one,      dA(k,k+kb), ldda, queues[1] );
                    
                    magma_dsyr2k( MagmaUpper, MagmaConjTrans,
                                  n-k-kb, kb,
                                  c_neg_one, dA(k,k+kb),    ldda,
                                             dB(k,k+kb),    lddb,
                                  d_one,     dA(k+kb,k+kb), ldda, queues[1] );
                    
                    // Start copying next A block
                    magma_queue_sync( queues[1] );
                    magma_dgetmatrix_async( kb2, kb2,
                                            dA(k+kb, k+kb), ldda,
                                             A(k+kb, k+kb),  lda, queues[0] );
                    
                    magma_dsymm( MagmaLeft, MagmaUpper,
                                 kb, n-k-kb,
                                 c_neg_half, dA(k,k),    ldda,
                                             dB(k,k+kb), lddb,
                                 c_one,      dA(k,k+kb), ldda, queues[1] );
                    
                    magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit,
                                 kb, n-k-kb,
                                 c_one, dB(k+kb,k+kb), lddb,
                                        dA(k,k+kb),    ldda, queues[1] );
                    
                    magma_queue_sync( queues[0] );  // finish get A(k+kb,k+kb)
                }
            }
        }
        else {
            /* Compute inv(L)*A*inv(L^H) */
            for (k = 0; k < n; k += nb) {
                kb  = min( n-k,    nb );
                kb2 = min( n-k-nb, nb );
                
                /* Update the lower triangle of A(k:n,k:n) */
                lapackf77_dsygst( &itype, uplo_, &kb, A(k,k), &lda, B(k,k), &ldb, info );
                
                magma_dsetmatrix_async( kb, kb,
                                         A(k, k), lda,
                                        dA(k, k), ldda, queues[0] );
                
                if (k+kb < n) {
                    magma_dtrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                 n-k-kb, kb,
                                 c_one, dB(k,k),    lddb,
                                        dA(k+kb,k), ldda, queues[1] );
                    
                    magma_queue_sync( queues[0] );  // finish set dA(k,k)
                    
                    magma_dsymm( MagmaRight, MagmaLower,
                                 n-k-kb, kb,
                                 c_neg_half, dA(k,k),     ldda,
                                             dB(k+kb,k),  lddb,
                                 c_one,      dA(k+kb, k), ldda, queues[1] );
                    
                    magma_dsyr2k( MagmaLower, MagmaNoTrans,
                                  n-k-kb, kb,
                                  c_neg_one, dA(k+kb,k),    ldda,
                                             dB(k+kb,k),    lddb,
                                  d_one,     dA(k+kb,k+kb), ldda, queues[1] );
                    
                    // Start copying next A block
                    magma_queue_sync( queues[1] );
                    magma_dgetmatrix_async( kb2, kb2,
                                            dA(k+kb, k+kb), ldda,
                                             A(k+kb, k+kb), lda, queues[0] );
                    
                    magma_dsymm( MagmaRight, MagmaLower,
                                 n-k-kb, kb,
                                 c_neg_half, dA(k,k),     ldda,
                                             dB(k+kb,k),  lddb,
                                 c_one,      dA(k+kb, k), ldda, queues[1] );
                    
                    magma_dtrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNonUnit,
                                 n-k-kb, kb,
                                 c_one, dB(k+kb,k+kb), lddb,
                                        dA(k+kb,k),    ldda, queues[1] );
                    
                    magma_queue_sync( queues[0] );  // finish get A(k+kb,k+kb)
                }
            }
        }
    }
    else {  // itype == 2 or 3
        if (upper) {
            /* Compute U*A*U^H */
            for (k = 0; k < n; k += nb) {
                kb = min( n-k, nb );
                
                magma_dgetmatrix_async( kb, kb,
                                        dA(k, k), ldda,
                                         A(k, k),  lda, queues[0] );
                
                /* Update the upper triangle of A(1:k+kb-1,1:k+kb-1) */
                if (k > 0) {
                    magma_dtrmm( MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit,
                                 k, kb,
                                 c_one, dB(0,0), lddb,
                                        dA(0,k), ldda, queues[1] );
                    
                    magma_dsymm( MagmaRight, MagmaUpper,
                                 k, kb,
                                 c_half, dA(k,k), ldda,
                                         dB(0,k), lddb,
                                 c_one,  dA(0,k), ldda, queues[1] );
                    
                    magma_dsyr2k( MagmaUpper, MagmaNoTrans,
                                  k, kb,
                                  c_one, dA(0,k), ldda,
                                         dB(0,k), lddb,
                                  d_one, dA(0,0), ldda, queues[1] );
                    
                    magma_dsymm( MagmaRight, MagmaUpper,
                                 k, kb,
                                 c_half, dA(k,k), ldda,
                                         dB(0,k), lddb,
                                 c_one,  dA(0,k), ldda, queues[1] );
                    
                    magma_dtrmm( MagmaRight, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                 k, kb,
                                 c_one, dB(k,k), lddb,
                                        dA(0,k), ldda, queues[1] );
                }
                
                magma_queue_sync( queues[0] );  // finish get A(k,k)
                
                lapackf77_dsygst( &itype, uplo_, &kb, A(k, k), &lda, B(k, k), &ldb, info );

                // this could be done on a 3rd queue
                magma_dsetmatrix_async( kb, kb,
                                         A(k, k), lda,
                                        dA(k, k), ldda, queues[1] );
            }
        }
        else {
            /* Compute L^H*A*L */
            for (k = 0; k < n; k += nb) {
                kb = min( n-k, nb );
                
                magma_dgetmatrix_async( kb, kb,
                                        dA(k, k), ldda,
                                         A(k, k),  lda, queues[0] );
                
                /* Update the lower triangle of A(1:k+kb-1,1:k+kb-1) */
                if (k > 0) {
                    magma_dtrmm( MagmaRight, MagmaLower, MagmaNoTrans, MagmaNonUnit,
                                 kb, k,
                                 c_one, dB(0,0), lddb,
                                        dA(k,0), ldda, queues[1] );
                    
                    magma_dsymm( MagmaLeft, MagmaLower,
                                 kb, k,
                                 c_half, dA(k,k),  ldda,
                                         dB(k,0),  lddb,
                                 c_one,  dA(k, 0), ldda, queues[1] );
                    
                    magma_dsyr2k( MagmaLower, MagmaConjTrans,
                                  k, kb,
                                  c_one, dA(k,0), ldda,
                                         dB(k,0), lddb,
                                  d_one, dA(0,0), ldda, queues[1] );
                    
                    magma_dsymm( MagmaLeft, MagmaLower,
                                 kb, k,
                                 c_half, dA(k,k),  ldda,
                                         dB(k,0),  lddb,
                                 c_one,  dA(k, 0), ldda, queues[1] );
                    
                    magma_dtrmm( MagmaLeft, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                 kb, k,
                                 c_one, dB(k,k), lddb,
                                        dA(k,0), ldda, queues[1] );
                }
                
                magma_queue_sync( queues[0] );  // finish get A(k,k)
                
                lapackf77_dsygst( &itype, uplo_, &kb, A(k,k), &lda, B(k,k), &ldb, info );

                // this could be done on a 3rd queue
                magma_dsetmatrix_async( kb, kb,
                                         A(k, k), lda,
                                        dA(k, k), ldda, queues[1] );
            }
        }
    }
    
    magma_queue_sync( queues[0] );  // finish set dA(k,k) for itype 1
    magma_dgetmatrix( n, n, dA(0, 0), ldda, A(0, 0), lda, queues[1] );
    
    magma_queue_destroy( queues[0] );
    magma_queue_destroy( queues[1] );
    
    magma_free( dwork );
    
    return *info;
} /* magma_dsygst_gpu */
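The routine above keeps the GPU busy by dedicating one queue to asynchronous transfers and the other to compute, synchronizing only where a block is actually consumed. The following is a minimal sketch of that two-queue pattern, built only from MAGMA calls that appear in this listing; the names and sizes (hA, dA_blk, kb) are illustrative placeholders, and the usual MAGMA headers are assumed.

// Sketch: overlap an async host-to-device copy with independent GPU work.
magma_int_t kb = 128, lda = 128, ldda = 128;   // placeholder sizes
double *hA, *dA_blk;
magma_dmalloc_cpu( &hA, lda*kb );     // host block, filled elsewhere
magma_dmalloc( &dA_blk, ldda*kb );    // device destination

magma_queue_t q_xfer, q_comp;
magma_device_t dev;
magma_getdevice( &dev );
magma_queue_create( dev, &q_xfer );   // queue reserved for copies
magma_queue_create( dev, &q_comp );   // queue reserved for BLAS updates

// Start uploading the kb-by-kb block without blocking the host.
magma_dsetmatrix_async( kb, kb, hA, lda, dA_blk, ldda, q_xfer );
/* ... enqueue updates that do not touch dA_blk on q_comp here ... */
magma_queue_sync( q_xfer );           // block only once dA_blk is needed

magma_queue_destroy( q_xfer );
magma_queue_destroy( q_comp );
magma_free( dA_blk );
magma_free_cpu( hA );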
/**
    Purpose
    -------
    DGETRF_NOPIV_GPU computes an LU factorization of a general M-by-N
    matrix A without any pivoting.

    The factorization has the form
       A = L * U
    where L is lower triangular with unit diagonal elements (lower
    trapezoidal if m > n), and U is upper triangular (upper
    trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    dA      DOUBLE_PRECISION array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = L*U; the unit diagonal elements of L are not stored.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as a failed memory allocation.
      -     > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    @ingroup magma_dgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_dgetrf_nopiv_gpu(magma_int_t m, magma_int_t n,
                       double *dA, magma_int_t ldda,
                       magma_int_t *info)
{
#define dA(i,j) (dA + (i)*nb + (j)*nb*ldda)

    double c_one     = MAGMA_D_ONE;
    double c_neg_one = MAGMA_D_NEG_ONE;

    magma_int_t iinfo, nb;
    magma_int_t maxm, maxn, mindim;
    magma_int_t i, rows, cols, s, lddwork;
    double *work;

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (ldda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    nb     = magma_get_dgetrf_nb(m);
    s      = mindim / nb;

    if (nb <= 1 || nb >= min(m,n)) {
        /* Use CPU code. */
        magma_dmalloc_cpu( &work, m * n );
        if ( work == NULL ) {
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        magma_dgetmatrix( m, n, dA, ldda, work, m );
        magma_dgetrf_nopiv( m, n, work, m, info);
        magma_dsetmatrix( m, n, work, m, dA, ldda );
        magma_free_cpu(work);
    }
    else {
        /* Use hybrid blocked code. */
        maxm = ((m + 31)/32)*32;
        maxn = ((n + 31)/32)*32;

        lddwork = maxm;

        if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, maxm*nb )) {
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }

        /* Define user stream if current stream is NULL */
        cudaStream_t stream[2], current_stream;
        magmablasGetKernelStream(&current_stream);

        magma_queue_create( &stream[0] );
        if (current_stream == NULL) {
            magma_queue_create( &stream[1] );
            magmablasSetKernelStream(stream[1]);
        }
        else {
            stream[1] = current_stream;
        }

        for( i=0; i < s; i++ ) {
            // download i-th panel
            cols = maxm - i*nb;

            magma_queue_sync( stream[1] );
            magma_dgetmatrix_async( m-i*nb, nb, dA(i,i), ldda, work, lddwork, stream[0] );
            
            if ( i > 0 ) {
                magma_dtrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit,
                             nb, n - (i+1)*nb,
                             c_one, dA(i-1,i-1), ldda,
                             dA(i-1,i+1), ldda );
                magma_dgemm( MagmaNoTrans, MagmaNoTrans,
                             m-i*nb, n-(i+1)*nb, nb,
                             c_neg_one, dA(i,  i-1), ldda, dA(i-1,i+1), ldda,
                             c_one,     dA(i,  i+1), ldda );
            }

            // do the cpu part
            rows = m - i*nb;
            magma_queue_sync( stream[0] );
            magma_dgetrf_nopiv( rows, nb, work, lddwork, &iinfo );
            if ( (*info == 0) && (iinfo > 0) )
                *info = iinfo + i*nb;

            // upload i-th panel
            magma_dsetmatrix_async( m-i*nb, nb, work, lddwork, dA(i, i), ldda, stream[0] );
            magma_queue_sync( stream[0] );

            // do the small non-parallel computations
            if ( s > (i+1) ) {
                magma_dtrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit,
                             nb, nb,
                             c_one, dA(i, i  ), ldda,
                             dA(i, i+1), ldda);
                magma_dgemm( MagmaNoTrans, MagmaNoTrans,
                             m-(i+1)*nb, nb, nb,
                             c_neg_one, dA(i+1, i  ), ldda, dA(i,   i+1), ldda,
                             c_one,     dA(i+1, i+1), ldda );
            }
            else {
                magma_dtrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit,
                             nb, n-s*nb,
                             c_one, dA(i, i  ), ldda,
                             dA(i, i+1), ldda);
                magma_dgemm( MagmaNoTrans, MagmaNoTrans,
                             m-(i+1)*nb, n-(i+1)*nb, nb,
                             c_neg_one, dA(i+1, i  ), ldda, dA(i,   i+1), ldda,
                             c_one,     dA(i+1, i+1), ldda );
            }
        }

        magma_int_t nb0 = min(m - s*nb, n - s*nb);
        rows = m - s*nb;
        cols = maxm - s*nb;
        magma_dgetmatrix( rows, nb0, dA(s,s), ldda, work, lddwork );

        // make sure that gpu queue is empty
        magma_device_sync();

        // do the cpu part
        magma_dgetrf_nopiv( rows, nb0, work, lddwork, &iinfo );
        if ( (*info == 0) && (iinfo > 0) )
            *info = iinfo + s*nb;

        // upload i-th panel
        magma_dsetmatrix( rows, nb0, work, lddwork, dA(s,s), ldda );

        magma_dtrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit,
                     nb0, n-s*nb-nb0,
                     c_one, dA(s,s),     ldda,
                            dA(s,s)+nb0, ldda);

        magma_free_pinned( work );

        magma_queue_destroy( stream[0] );
        if (current_stream == NULL) {
            magma_queue_destroy( stream[1] );
            magmablasSetKernelStream(NULL);
        }
    }

    return *info;
} /* magma_dgetrf_nopiv_gpu */
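A hedged usage sketch for magma_dgetrf_nopiv_gpu follows; the sizes are placeholders, error handling is abbreviated, and the usual MAGMA headers are assumed. Because no pivoting is performed, the caller must ensure the matrix is safe to factor this way (e.g., diagonally dominant).

// Sketch: factor a GPU-resident matrix with the no-pivoting LU above.
magma_int_t m = 1000, n = 1000, info = 0;
magma_int_t lda  = m;
magma_int_t ldda = ((m + 31)/32)*32;       // padded leading dimension
double *hA, *dA;

magma_dmalloc_cpu( &hA, lda*n );           // host matrix, filled elsewhere
magma_dmalloc( &dA, ldda*n );              // device copy
magma_dsetmatrix( m, n, hA, lda, dA, ldda );

magma_dgetrf_nopiv_gpu( m, n, dA, ldda, &info );
if (info > 0)
    printf( "U(%d,%d) is exactly zero\n", (int) info, (int) info );

magma_dgetmatrix( m, n, dA, ldda, hA, lda );  // retrieve L and U
magma_free( dA );
magma_free_cpu( hA );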
Beispiel #26
0
/**
    Purpose
    -------
    DGEQRF computes a QR factorization of a real M-by-N matrix A:
    A = Q * R.
    
    This version has LAPACK-compliant arguments.
    
    If the current stream is NULL, this version replaces it with a new
    stream to overlap computation with communication.

    Other versions (magma_dgeqrf_gpu and magma_dgeqrf3_gpu) store the
    intermediate T matrices.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    dA      DOUBLE_PRECISION array on the GPU, dimension (LDDA,N)
            On entry, the M-by-N matrix A.
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further
            Details).

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).
            To benefit from coalesced memory accesses, LDDA must be
            divisible by 16.

    @param[out]
    tau     DOUBLE_PRECISION array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as a failed memory allocation.

    Further Details
    ---------------
    The matrix Q is represented as a product of elementary reflectors

        Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

        H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).

    @ingroup magma_dgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_dgeqrf2_gpu( magma_int_t m, magma_int_t n,
                   double *dA, magma_int_t ldda,
                   double *tau,
                   magma_int_t *info )
{
    #define dA(a_1,a_2)    ( dA+(a_2)*(ldda) + (a_1))
    #define work_ref(a_1)  ( work + (a_1))
    #define hwork          ( work + (nb)*(m))

    double *dwork;
    double *work;
    magma_int_t i, k, ldwork, lddwork, old_i, old_ib, rows;
    magma_int_t nbmin, nx, ib, nb;
    magma_int_t lhwork, lwork;

    /* Function Body */
    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    k = min(m,n);
    if (k == 0)
        return *info;

    nb = magma_get_dgeqrf_nb(m);

    lwork  = (m+n) * nb;
    lhwork = lwork - (m)*nb;

    if (MAGMA_SUCCESS != magma_dmalloc( &dwork, n*nb )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, lwork )) {
        magma_free( dwork );
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }

    /* Define user stream if current stream is NULL */
    magma_queue_t stream[2];
    
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );

    magma_queue_create( &stream[0] );
    if (orig_stream == NULL) {
        magma_queue_create( &stream[1] );
        magmablasSetKernelStream(stream[1]);
    }
    else {
        stream[1] = orig_stream;
    }

    nbmin = 2;
    nx    = nb;
    ldwork = m;
    lddwork= n;

    if (nb >= nbmin && nb < k && nx < k) {
        /* Use blocked code initially */
        old_i = 0; old_ib = nb;
        for (i = 0; i < k-nx; i += nb) {
            ib = min(k-i, nb);
            rows = m - i;

            /* download i-th panel */
            magma_queue_sync( stream[1] );
            magma_dgetmatrix_async( rows, ib,
                                    dA(i,i),       ldda,
                                    work_ref(i), ldwork, stream[0] );
            if (i > 0) {
                /* Apply H' to A(i:m,i+2*ib:n) from the left */
                magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                  m-old_i, n-old_i-2*old_ib, old_ib,
                                  dA(old_i, old_i         ), ldda, dwork,        lddwork,
                                  dA(old_i, old_i+2*old_ib), ldda, dwork+old_ib, lddwork);

                magma_dsetmatrix_async( old_ib, old_ib,
                                        work_ref(old_i),  ldwork,
                                        dA(old_i, old_i), ldda, stream[1] );
            }

            magma_queue_sync( stream[0] );
            lapackf77_dgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info);
            /* Form the triangular factor of the block reflector
               H = H(i) H(i+1) . . . H(i+ib-1) */
            lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr,
                              &rows, &ib,
                              work_ref(i), &ldwork, tau+i, hwork, &ib);

            dpanel_to_q( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib );

            /* upload the i-th V matrix */
            magma_dsetmatrix_async( rows, ib, work_ref(i), ldwork, dA(i,i), ldda, stream[0] );

            /* upload the T matrix */
            magma_queue_sync( stream[1] );
            magma_dsetmatrix_async( ib, ib, hwork, ib, dwork, lddwork, stream[0] );
            magma_queue_sync( stream[0] );

            if (i + ib < n) {
                if (i+nb < k-nx) {
                    /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */
                    magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                      rows, ib, ib,
                                      dA(i, i   ), ldda, dwork,    lddwork,
                                      dA(i, i+ib), ldda, dwork+ib, lddwork);
                    dq_to_panel( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib );
                }
                else {
                    magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                      rows, n-i-ib, ib,
                                      dA(i, i   ), ldda, dwork,    lddwork,
                                      dA(i, i+ib), ldda, dwork+ib, lddwork);
                    dq_to_panel( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib );
                    magma_dsetmatrix_async( ib, ib,
                                            work_ref(i), ldwork,
                                            dA(i,i),     ldda, stream[1] );
                }
                old_i  = i;
                old_ib = ib;
            }
        }
    } else {
        i = 0;
    }
    magma_free( dwork );

    /* Use unblocked code to factor the last or only block. */
    if (i < k) {
        ib   = n-i;
        rows = m-i;
        magma_dgetmatrix_async( rows, ib, dA(i, i), ldda, work, rows, stream[1] );
        magma_queue_sync( stream[1] );
        lhwork = lwork - rows*ib;
        lapackf77_dgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info);
        
        magma_dsetmatrix_async( rows, ib, work, rows, dA(i, i), ldda, stream[1] );
    }

    magma_free_pinned( work );

    magma_queue_destroy( stream[0] );
    if (orig_stream == NULL) {
        magma_queue_destroy( stream[1] );
    }
    magmablasSetKernelStream( orig_stream );

    return *info;
} /* magma_dgeqrf2_gpu */
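As with the LU example above, here is a brief, hedged usage sketch for magma_dgeqrf2_gpu; the sizes are placeholders and the usual MAGMA headers are assumed.

// Sketch: QR-factor a GPU-resident matrix.
magma_int_t m = 2048, n = 1024, info = 0;
magma_int_t ldda = ((m + 31)/32)*32;   // keep LDDA divisible by 16 (see docs)
double *dA, *tau;

magma_dmalloc( &dA, ldda*n );          // device matrix, filled elsewhere
magma_dmalloc_cpu( &tau, min(m,n) );   // host scalar factors

magma_dgeqrf2_gpu( m, n, dA, ldda, tau, &info );
// On exit, R sits in the upper triangle of dA, and the reflectors
// H(i) = I - tau[i] * v * v' are stored below the diagonal.

magma_free_cpu( tau );
magma_free( dA );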
Beispiel #27
0
extern "C" magma_int_t
magma_dbulge_applyQ_v2_m(
    magma_int_t ngpu,
    magma_side_t side,
    magma_int_t NE, magma_int_t N,
    magma_int_t NB, magma_int_t Vblksiz,
    double *E, magma_int_t lde,
    double *V, magma_int_t ldv,
    double *T, magma_int_t ldt,
    magma_int_t *info)
{
    //%===========================
    //%   local variables
    //%===========================
    magma_int_t Vm, Vn, mt, nt;
    magma_int_t myrow, mycol, blkj, blki;
    magma_int_t blkid,vpos,tpos;
    magma_int_t firstrow, nbcolinvolvd;
    magma_int_t versionL  = 113;
    magma_int_t versionR  = 92;
    magma_int_t Vchunksiz = 10;
    *info=0;

    /* Quick return */
    if ( NE == 0 ) {
        return *info;
    }
    if ( N == 0 ) {
        return *info;
    }
    if ( NB == 0 ) {
        return *info;
    }
    /* ==========================================
     * Some notes for developers
     * Initialization and checking the number of cores
     * ==========================================*/
    /* We have two algorithms for the left side (113, 114) and two for the
     * right side (91, 92), selected by versionL and versionR.
     * They are very similar (the details are explained in the tech report
     * and the MATLAB code); versions 114 and 92 improve locality,
     * while version 113 is used in the case WANTZ=1 (construct Q2), which
     * allows the construction to be done in an optimized way, taking into
     * account that the matrix is the identity and thus doing fewer flops.
     */

    // Initialize streaming and events
    //magma_device_sync();
    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );

    magma_int_t  nevents =2, nstream=2;
    magma_queue_t queues[MagmaMaxGPUs][20];
    magma_event_t  myevent[MagmaMaxGPUs][20];
    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        for( magma_int_t i = 0; i < nstream; ++i ) {
            magma_queue_create( dev, &queues[dev][i] );
        }
        for( magma_int_t i = 0; i < nevents; ++i ) {
            cudaEventCreateWithFlags(&myevent[dev][i],cudaEventDisableTiming);
            //magma_event_create(&myevent[dev][i]);
        }
    }



    // Azzam 21/11/2012
    // NOTE: dwork used to be of size 2*NE*Vblksiz+...
    // One might consider reducing it to NE*Vblksiz+...,
    // but the factor of 2 is needed because two queues are active
    // and may be using dwork in parallel.
    double *dE[MagmaMaxGPUs];
    double *dwork[MagmaMaxGPUs], *dwork0[MagmaMaxGPUs], *dwork1[MagmaMaxGPUs];
    //double *dwvt[MagmaMaxGPUs];
    double *dwvt0[MagmaMaxGPUs], *dwvt1[MagmaMaxGPUs];
    double *dT0[MagmaMaxGPUs], *dV0[MagmaMaxGPUs], *dT1[MagmaMaxGPUs], *dV1[MagmaMaxGPUs];
    magma_int_t dev;
    magma_int_t ldde = N;
    magma_int_t lddv = ldv;
    magma_int_t lddt = ldt;
    magma_int_t ne_loc = magma_ceildiv(NE, ngpu);
    if (ne_loc < 256)
       ne_loc=256;
    magma_int_t dwVTsiz  = lddv*Vblksiz;  // lddv*lddv + lddv*NE; // lddv*Vblksiz;
    magma_int_t dworksiz = ne_loc*Vblksiz;  // lddv*Vblksiz;      // NE*Vblksiz;

    ngpu = min(ngpu, magma_ceildiv(NE,ne_loc)); // Don't use GPU that will not have data.
    // copy dE to GPUs
    for (dev=0; dev < ngpu; ++dev) {
        magma_setdevice( dev );
        if (MAGMA_SUCCESS != magma_dmalloc( &dE[dev], ldde * ne_loc)) {
            printf ("!!!!  magma_dbulge_applyQ magma_alloc failed for: dE\n" );
            exit(-1);
        }
        if (MAGMA_SUCCESS != magma_dmalloc( &dwork[dev], 2*dworksiz + 2*dwVTsiz + 2*Vchunksiz* (Vblksiz* (lddv+lddt)) )) {
            printf ("!!!!  magma_dbulge_applyQ magma_alloc failed for: dwork\n" );
            exit(-1);
        }

        dwork0[dev] = dwork[dev];               // size = dworksiz;
        dwork1[dev] = dwork0[dev] + dworksiz;   // size = dworksiz;
        dwvt0[dev]  = dwork[dev] + 2*dworksiz;  // size = dwVTsiz;
        dwvt1[dev]  = dwvt0[dev] + dwVTsiz;     // size = dwVTsiz;
        dV0[dev]    = dwork[dev] + 2*dworksiz + 2*dwVTsiz;
        dT0[dev]    = dV0[dev]   + Vchunksiz*Vblksiz*lddv;
        dV1[dev]    = dT0[dev]   + Vchunksiz*Vblksiz*lddt;
        dT1[dev]    = dV1[dev]   + Vchunksiz*Vblksiz*lddv;

        magma_int_t ie_loc = min(ne_loc, NE - ne_loc*dev);
        magma_dsetmatrix_async( N, ie_loc, E+lde*ne_loc*dev, lde, dE(dev, 0, 0), ldde, queues[dev][1] );
    }


    // make overlapped copy
    magma_int_t ncpy = 0;
    magma_int_t copyed=0, copyst=0;
    magma_int_t blkcnt,nothing, mysiz, flip, vld,tld, locpos;
    findVTsiz(N, NB, Vblksiz, &blkcnt, &nothing);
    
    flip = 0;


    /* SIDE LEFT  means apply E = Q*E = (q_1*q_2*.....*q_n) * E ==> traverse the Vs in reverse order, from q_n to q_1.
     *            E is also split by row, so each application works on a block of rows (horizontal block). */
    /* SIDE RIGHT means apply E = E*Q = E * (q_1*q_2*.....*q_n) ==> traverse the Vs in normal order, from q_1 to q_n.
     *            E is also split by column, so each application works on a block of columns (vertical block). */

    #ifdef ENABLE_DEBUG
    printf("  APPLY Q_v22_m GPU with NGPU %d  N %d, NE %d,  NB %d, Vblksiz %d, versionL %d versionR %d  SIDE %c \n",
          ngpu, N, NE, NB, Vblksiz, versionL, versionR, side);
    #endif


    /*
     * MagmaLeft
     */
    if (side == MagmaLeft) {
        /*
         * Version 113:
         * loop over the block_col (nt) and for each find the
         * number of tiles (mt) in this block_col. then loop over mt, find
         * the size of the V's(Vm,Vn) and apply it to the corresponding
         * portion of E.
         */
        if ( versionL == 113 ) {
            nt  = magma_ceildiv((N-1),Vblksiz);
            for (blkj=nt-1; blkj >= 0; blkj--) {
                /* the index of the first row at the top of block (blkj) */
                firstrow = blkj * Vblksiz + 1;
                /*find the number of tiles for this block */
                if ( blkj == nt-1 )
                    mt = magma_ceildiv( N -  firstrow,    NB);
                else
                    mt = magma_ceildiv( N - (firstrow+1), NB);
                /*loop over the tiles, find the size of the Vs, and apply them */
                for (blki=mt; blki > 0; blki--) {
                    /*calculate the size of each lozenge of Vs = (Vm,Vn)*/
                    myrow     = firstrow + (mt-blki)*NB;
                    mycol     = blkj*Vblksiz;
                    Vm = min( NB+Vblksiz-1, N-myrow);
                    if ( ( blkj == nt-1 ) && ( blki == mt ) ) {
                        Vn = min (Vblksiz, Vm);
                    } else {
                        Vn = min (Vblksiz, Vm-1);
                    }
                    /*calculate the pointers to the Vs and the Ts.
                     * Note that the Vs and Ts have a special storage layout
                     * produced by the bulge-chasing function */
                    //printf("voici blkj %d blki %d  Vm %d  Vn %d mycol %d vpos %d \n",blkj,blki,Vm, Vn,mycol,vpos);
                    magma_bulge_findpos113(N, NB, Vblksiz, mycol, myrow, &blkid);
               
                    // Copy Vchunksiz Vs and Vchunksiz Ts to the GPU and store them in dV0/dV1 and dT0/dT1
                    if (ncpy == 0) {
                        // flip = 1 for this.
                        copyst = 0;                             // copying starts at blkid = copyst
                        copyed = min(copyst+Vchunksiz, blkcnt); // copying ends at blkid = copyed-1, so the next copy must start at copyed
                        mysiz  = copyed-copyst;                 // the size of the chunk to be copied
                        if (mysiz > 0) {
                            ncpy = 1;
                            flip = 1;
                            vpos = copyst*Vblksiz*ldv;
                            tpos = copyst*Vblksiz*ldt;
                            vld  = mysiz * ldv;
                            tld  = mysiz * ldt;
                            for( dev = 0; dev < ngpu; ++dev ) {
                                magma_setdevice( dev );
                                magma_dsetmatrix_async(vld, Vblksiz, V(vpos), vld, dV1[dev], vld, queues[dev][1]);
                                magma_dsetmatrix_async(tld, Vblksiz, T(tpos), tld, dT1[dev], tld, queues[dev][1]);
                            }
                            //printf("doing the first copy   of mysiz %2d copyst %2d copyed %2d vpos %8d tpos %8d into dV1 dT1\n",mysiz,copyst,copyed,vpos,tpos);
                        }
                    }
                   
                    if (blkid == copyst) {
                        flip   = ncpy % 2;
                        copyst = copyed;                        // copying starts at blkid = copyst
                        copyed = min(copyst+Vchunksiz, blkcnt); // copying ends at blkid = copyed-1, so the next copy must start at copyed
                        mysiz  = copyed-copyst;                 // the size of the chunk to be copied
                        //printf(" get to copy blkid %d blkid+(2*Vchunksiz) %d copyst %d copyed %d\n",blkid,blkid+(Vchunksiz),copyst,copyed);
                        if (mysiz > 0) {
                            ncpy = ncpy + 1;
                            vpos = copyst*Vblksiz*ldv;
                            tpos = copyst*Vblksiz*ldt;
                            vld  = mysiz * ldv;
                            tld  = mysiz * ldt;
                            if (flip == 0) { // now I am working on dV0 so copy the next and put it on dV1
                                //printf("doing overlapping copy of mysiz %2d copyst %2d copyed %2d vpos %8d tpos %8d into dV1 dT1\n",mysiz,copyst,copyed,vpos,tpos);
                                for( dev = 0; dev < ngpu; ++dev ) {
                                    magma_setdevice( dev );
                                    magma_dsetmatrix_async(vld, Vblksiz, V(vpos), vld, dV1[dev], vld, queues[dev][1]);
                                    magma_dsetmatrix_async(tld, Vblksiz, T(tpos), tld, dT1[dev], tld, queues[dev][1]);
                                }
                            } else { // now I am working on dV1 so copy the next and put it on dV0
                                //printf("doing overlapping copy of mysiz %2d copyst %2d copyed %2d vpos %8d tpos %8d into dV0 dT0\n",mysiz,copyst,copyed,vpos,tpos);
                                for( dev = 0; dev < ngpu; ++dev ) {
                                    magma_setdevice( dev );
                                    magma_dsetmatrix_async(vld, Vblksiz, V(vpos), vld, dV0[dev], vld, queues[dev][0]);
                                    magma_dsetmatrix_async(tld, Vblksiz, T(tpos), tld, dT0[dev], tld, queues[dev][0]);
                                }
                            }
                        }
                    }

                    if ((Vm > 0) && (Vn > 0)) {
                        locpos = blkid%Vchunksiz;
                        magma_int_t lcvpos   = locpos*Vblksiz*lddv;
                        magma_int_t lctpos   = locpos*Vblksiz*lddt;
                        //printf("voici blkj %d blki %d  Vm %d  Vn %d mycol %d locvpos %5d loctpos %5d  blkid %2d  using data in dV%1d dT%1d \n",blkj,blki,Vm, Vn,mycol,lcvpos,lctpos, blkid,flip,flip);
                        if (flip == 0) {
                            for( dev = 0; dev < ngpu; ++dev ) {
                                magma_int_t ie_loc   = min(ne_loc, NE - ne_loc*dev);
                                magma_int_t nr_bl = magma_ceildiv(ie_loc,10000);       //nr of blocks
                                magma_int_t sz_bl = magma_ceildiv(ie_loc,nr_bl*64)*64; //maximum size of blocks (to have blocks of around the same size and multiple of 64)
                                magma_int_t ib;                                        //size of current block
                                magma_setdevice( dev );
                                magma_queue_wait_event( queues[dev][0], myevent[dev][1] );
                                for (magma_int_t i=0; i < ie_loc; i += sz_bl) {
                                    ib = min(sz_bl, ie_loc-i);
                                    //magma_dlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV0[dev]+lcvpos, lddv, dT0[dev]+lctpos, lddt, dE(dev,myrow,i), ldde, dwork0[dev], ib, queues[dev][0] );
                                    magma_dlarfb_gpu_gemm( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV0[dev]+lcvpos, lddv, dT0[dev]+lctpos, lddt, dE(dev,myrow,i), ldde, dwork0[dev], ib, dwvt0[dev], Vm, queues[dev][0] );
                                }
                                magma_event_record( myevent[dev][0], queues[dev][0] );
                            }
                        } else {
                            for( dev = 0; dev < ngpu; ++dev ) {
                                magma_int_t ie_loc   = min(ne_loc, NE - ne_loc*dev);
                                magma_int_t nr_bl = magma_ceildiv(ie_loc,10000);       //nr of blocks
                                magma_int_t sz_bl = magma_ceildiv(ie_loc,nr_bl*64)*64; //maximum size of blocks (to have blocks of around the same size and multiple of 64)
                                magma_int_t ib;                                        //size of current block
                                magma_setdevice( dev );
                                magma_queue_wait_event( queues[dev][1], myevent[dev][0] );
                                for (magma_int_t i=0; i < ie_loc; i += sz_bl) {
                                    ib = min(sz_bl, ie_loc-i);
                                    //magma_dlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV1[dev]+lcvpos, lddv, dT1[dev]+lctpos, lddt, dE(dev,myrow,i), ldde, dwork1[dev], ib, queues[dev][1] );
                                    magma_dlarfb_gpu_gemm( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV1[dev]+lcvpos, lddv, dT1[dev]+lctpos, lddt, dE(dev,myrow,i), ldde, dwork1[dev], ib, dwvt1[dev], Vm, queues[dev][1] );
                                }
                                magma_event_record( myevent[dev][1], queues[dev][1] );
                            }
                        }
                    }  // end if (Vm > 0) && (Vn > 0)
                } // end for blki
            } // end for blkj
        } // end if version=113
        /*
         * Version 114:
         * loop over the block_row (mt) and for each find diagonally the
         * number of tiles (nt) in this block_row. then loop over nt, find
         * the size of the V's(Vm,Vn) and apply it to the corresponding
         * portion of E.
         */
        else {
            printf("versionL 114 not implemented in dbulge_applyQ_v2_m\n");
            exit(-1);
            mt    = magma_ceildiv((N-1),NB);
            for (blki = mt; blki > 0; blki--) {
                /* nbcolinvolvd = number of columns corresponding to this block_row (blki) */
                nbcolinvolvd = min(N-1, blki*NB);
                /*find the number of tiles for this block (diagonal row of tiles) */
                nt = magma_ceildiv(nbcolinvolvd,Vblksiz);
                /*loop over the tiles, find the size of the Vs, and apply them */
                for (blkj = nt-1; blkj >= 0; blkj--) {
                    /* the index of the first row of the first column, i.e.,
                     * the block at the top left (blki) */
                    firstrow = (mt-blki)*NB+1;
                    /*calculate the size of each lozenge of Vs = (Vm,Vn)*/
                    myrow    = firstrow + blkj*Vblksiz;
                    mycol    = blkj*Vblksiz;
                    Vm = min( NB+Vblksiz-1, N-myrow);
                    if ( ( blkj == nt-1 ) && ( blki == mt ) ) {
                        Vn = min (Vblksiz, Vm);
                    } else {
                        Vn = min (Vblksiz, Vm-1);
                    }

                    if ((Vm > 0) && (Vn > 0)) {
                        /*calculate the pointers to the Vs and the Ts.
                         * Note that the Vs and Ts have a special storage layout
                         * produced by the bulge-chasing function */
                        /*
                        magma_bulge_findVTpos(N, NB, Vblksiz, mycol, myrow, ldv, ldt, &vpos, &tpos);
                        magma_dsetmatrix_async(Vm, Vn, V(vpos), ldv, dV0, lddv, queues[dev][0]);
                        magma_dsetmatrix_async(Vn,  Vn, T(tpos), ldt, dT0, lddt, queues[dev][0]);
                        //printf("voici blki %d  rownbm %d mycol %d  coled %d  blkid %d vpos %d  tpos %d\n", blki, rownbm, mycol, coled, blkid, vpos, tpos);
                        for (magma_int_t i=0; i < NE; i += sz_bl) {
                            ib = min(sz_bl, NE-i);
                            magma_dlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV0, lddv, dT0, lddt, dE(myrow,i), ldde, dwork, NE, queues[dev][0] );
                        }
                        */
                    } // end if (Vm > 0) && (Vn > 0)
                } // end for blkj
            } // end for blki
        } // end version 114
    } // end LEFT
    /*
     * MagmaRight
     */
    else {
        printf("Side 'R' not implemented in dbulge_applyQ_v2_m\n");
        exit(-1);
        /*
         * Version 91:
         */
        if ( versionR == 91 ) {
            nt  = magma_ceildiv((N-1),Vblksiz);
            for (blkj=0; blkj < nt; blkj++) {
                /* the index of the first row at the top of block (blkj) */
                firstrow = blkj * Vblksiz + 1;
                /*find the number of tiles for this block */
                if ( blkj == nt-1 )
                    mt = magma_ceildiv( N -  firstrow,    NB);
                else
                    mt = magma_ceildiv( N - (firstrow+1), NB);
                /*loop over the tiles, find the size of the Vs, and apply them */
                for (blki=1; blki <= mt; blki++) {
                    /*calculate the size of each lozenge of Vs = (Vm,Vn)*/
                    myrow  = firstrow + (mt-blki)*NB;
                    Vm = min( NB+Vblksiz-1, N-myrow);
                    if ( (blkj == nt-1) && (blki == mt) ) {
                        Vn = min (Vblksiz, Vm);
                    } else {
                        Vn = min (Vblksiz, Vm-1);
                    }
                    mycol     = blkj*Vblksiz;
                    if ((Vm > 0) && (Vn > 0)) {
                        /*calculate the pointers to the Vs and the Ts.
                         * Note that the Vs and Ts have a special storage layout
                         * produced by the bulge-chasing function */
                        /*
                        magma_bulge_findVTpos(N, NB, Vblksiz, mycol, myrow, ldv, ldt, &vpos, &tpos);
                        magma_dsetmatrix_async(Vm, Vn, V(vpos), ldv, dV0, lddv, queues[dev][0]);
                        magma_dsetmatrix_async(Vn,  Vn, T(tpos), ldt, dT0, lddt, queues[dev][0]);
                        magma_dlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, Vm, Vn, dV0, lddv, dT0, lddt, dE(0, myrow), ldde, dwork, NE, queues[dev][0] );
                        */
                    } // end if (Vm > 0) && (Vn > 0)
                } // end for blki
            } // end for blkj
        } // end of version 91
        /*
         * Version 92:
         */
        else {
            mt    = magma_ceildiv((N-1),NB);
            for (blki = 1; blki <= mt; blki++) {
                /* nbcolinvolvd = number of columns corresponding to this block_row (blki) */
                nbcolinvolvd = min(N-1, blki*NB);
                /*find the number of tiles for this block (diagonal row of tiles) */
                nt = magma_ceildiv(nbcolinvolvd,Vblksiz);
                /*loop over the tiles, find the size of the Vs, and apply them */
                for (blkj = 0; blkj < nt; blkj++) {
                    /* the index of the first row of the first column, i.e.,
                     * the block at the top left (blki) */
                    firstrow = (mt-blki)*NB+1;
                    /*calculate the size of each lozenge of Vs = (Vm,Vn)*/
                    myrow    = firstrow + blkj*Vblksiz;
                    mycol    = blkj*Vblksiz;
                    Vm = min( NB+Vblksiz-1, N-myrow);
                    if ( ( blkj == nt-1 ) && ( blki == mt ) ) {
                        Vn = min (Vblksiz, Vm);
                    } else {
                        Vn = min (Vblksiz, Vm-1);
                    }
                    if ((Vm > 0) && (Vn > 0)) {
                        /*calculate the pointers to the Vs and the Ts.
                         * Note that the Vs and Ts have a special storage layout
                         * produced by the bulge-chasing function */
                        /*
                        magma_bulge_findVTpos(N, NB, Vblksiz, mycol, myrow, ldv, ldt, &vpos, &tpos);
                        magma_dsetmatrix_async(Vm, Vn, V(vpos), ldv, dV0, lddv, queues[dev][0]);
                        magma_dsetmatrix_async(Vn,  Vn, T(tpos), ldt, dT0, lddt, queues[dev][0]);
                        magma_dlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, Vm, Vn, dV0, lddv, dT0, lddt, dE(0, myrow), ldde, dwork, NE, queues[dev][0] );
                        */
                    } // end if (Vm > 0) && (Vn > 0)
                } //end for blkj
            } // end for blki
        } //end of version 92
    } // end RIGHT

    // copy dE back from each GPU
    for( dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        magma_queue_wait_event( queues[dev][0], myevent[dev][1] );
        magma_queue_wait_event( queues[dev][0], myevent[dev][0] );
        magma_int_t ie_loc   = min(ne_loc, NE - ne_loc*dev);
        magma_dgetmatrix_async( N, ie_loc, dE(dev, 0, 0), ldde, E+lde*ne_loc*dev, lde, queues[dev][0] );
        magma_event_record( myevent[dev][0], queues[dev][0] );
    }


    for( dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        magma_queue_wait_event( queues[dev][0], myevent[dev][0] );
        magma_queue_sync(queues[dev][0]);
        magma_queue_sync(queues[dev][1]);
        magma_free(dwork[dev]);
        magma_free(dE[dev]);
        for( magma_int_t i = 0; i < nevents; ++i ) {
            magma_event_destroy( myevent[dev][i] );
        }
        for( magma_int_t i = 0; i < nstream; ++i ) {
            magma_queue_destroy( queues[dev][i] );
        }
    }

    magma_setdevice( orig_dev );
    return *info;
}
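The routine above hides the V/T transfers behind compute by ping-ponging between two buffer pairs (dV0/dT0 driven by queues[dev][0], dV1/dT1 by queues[dev][1]) and ordering the two queues with events. Below is a condensed, single-device sketch of that double-buffering pattern; the names (hV, dbuf) and chunk sizes are illustrative placeholders, and the compute step is elided.

// Double-buffered chunk pipeline: while one queue computes on dbuf[cur],
// the other queue uploads the next chunk into dbuf[1-cur].
magma_int_t nchunks = 8, mchunk = 512, nchunk = 64;     // placeholder sizes
magma_int_t ldv = mchunk, lddv = mchunk;
double *hV, *dbuf[2];
magma_dmalloc_cpu( &hV, ldv*nchunk*nchunks );   // all chunks, filled elsewhere
magma_dmalloc( &dbuf[0], lddv*nchunk );
magma_dmalloc( &dbuf[1], lddv*nchunk );

magma_device_t dev;
magma_getdevice( &dev );
magma_queue_t queues[2];
magma_queue_create( dev, &queues[0] );
magma_queue_create( dev, &queues[1] );

magma_event_t ev[2];
cudaEventCreateWithFlags( &ev[0], cudaEventDisableTiming );
cudaEventCreateWithFlags( &ev[1], cudaEventDisableTiming );
magma_event_record( ev[0], queues[0] );   // both events start out "done"
magma_event_record( ev[1], queues[1] );

magma_int_t cur = 0;
magma_dsetmatrix_async( mchunk, nchunk, hV, ldv, dbuf[0], lddv, queues[0] );
for (magma_int_t c = 0; c < nchunks; ++c) {
    if (c+1 < nchunks)   // prefetch the next chunk on the other queue
        magma_dsetmatrix_async( mchunk, nchunk, hV + (c+1)*nchunk*ldv, ldv,
                                dbuf[1-cur], lddv, queues[1-cur] );
    // Both queues update the same output, so order them with events.
    magma_queue_wait_event( queues[cur], ev[1-cur] );
    /* ... apply the reflectors held in dbuf[cur] on queues[cur] ... */
    magma_event_record( ev[cur], queues[cur] );
    cur = 1 - cur;
}
magma_queue_sync( queues[0] );
magma_queue_sync( queues[1] );

magma_event_destroy( ev[0] );
magma_event_destroy( ev[1] );
magma_queue_destroy( queues[0] );
magma_queue_destroy( queues[1] );
magma_free( dbuf[0] );
magma_free( dbuf[1] );
magma_free_cpu( hV );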
Beispiel #28
0
/**
    Purpose
    -------
    DSYTRD_SY2SB reduces a real symmetric matrix A to real symmetric
    band-diagonal form T by an orthogonal similarity transformation:
    Q**H * A * Q = T.
    This version stores the triangular matrices T used in the accumulated
    Householder transformations (I - V T V').

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of A is stored;
      -     = MagmaLower:  Lower triangle of A is stored.

    @param[in]
    n       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in,out]
    A       DOUBLE_PRECISION array, dimension (LDA,N)
            On entry, the symmetric matrix A.  If UPLO = MagmaUpper, the leading
            N-by-N upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = MagmaLower, the
            leading N-by-N lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
            On exit, if UPLO = MagmaUpper, the Upper band-diagonal of A is
            overwritten by the corresponding elements of the
            band-diagonal matrix T, and the elements above the band
            diagonal, with the array TAU, represent the orthogonal
            matrix Q as a product of elementary reflectors; if UPLO
            = MagmaLower, the lower band-diagonal of A is overwritten by
            the corresponding elements of the band-diagonal
            matrix T, and the elements below the band-diagonal, with
            the array TAU, represent the orthogonal matrix Q as a product
            of elementary reflectors. See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    tau     DOUBLE_PRECISION array, dimension (N-1)
            The scalar factors of the elementary reflectors (see Further
            Details).

    @param[out]
    work    (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.  LWORK >= 1.
            For optimum performance LWORK >= N*NB, where NB is the
            optimal blocksize.
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    @param[out]
    dT      DOUBLE_PRECISION array on the GPU, dimension N*NB,
            where NB is the optimal blocksize.
            On exit dT holds the upper triangular matrices T from the
            accumulated Householder transformations (I - V T V') used
            in the factorization. The nb x nb matrices T are ordered
            consecutively in memory one after another.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    Further Details
    ---------------
    If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary
    reflectors

       Q = H(n-1) . . . H(2) H(1).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in
    A(1:i-1,i+1), and tau in TAU(i).

    If UPLO = MagmaLower, the matrix Q is represented as a product of elementary
    reflectors

       Q = H(1) H(2) . . . H(n-1).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i),
    and tau in TAU(i).

    The contents of A on exit are illustrated by the following examples
    with n = 5:

    if UPLO = MagmaUpper:                if UPLO = MagmaLower:

      (  d   e   v2  v3  v4 )              (  d                  )
      (      d   e   v3  v4 )              (  e   d              )
      (          d   e   v4 )              (  v1  e   d          )
      (              d   e  )              (  v1  v2  e   d      )
      (                  d  )              (  v1  v2  v3  e   d  )

    where d and e denote diagonal and off-diagonal elements of T, and vi
    denotes an element of the vector defining H(i).

    @ingroup magma_dsyev_2stage
    ********************************************************************/
extern "C" magma_int_t
magma_dsytrd_sy2sb( magma_uplo_t uplo, magma_int_t n, magma_int_t nb,
                    double *A, magma_int_t lda,
                    double *tau,
                    double *work, magma_int_t lwork,
                    double *dT,
                    magma_int_t *info)
{
    #define  A(a_1,a_2)  ( A + ((a_2)-1)*( lda) + (a_1)-1)
    #define dA(a_1,a_2)  (dA + ((a_2)-1)*(ldda) + (a_1)-1)
    #define tau_ref(a_1) (tau + (a_1)-1)
    #define dT(a_1)      (dT + ((a_1)-1)*(lddt))

    int ldda = ((n+31)/32)*32;
    int lddt = nb;
   
    double c_neg_one  = MAGMA_D_NEG_ONE;
    double c_neg_half = MAGMA_D_NEG_HALF;
    double c_one  = MAGMA_D_ONE;
    double c_zero = MAGMA_D_ZERO;
    double  d_one = MAGMA_D_ONE;

    magma_int_t pm, pn, indi, indj, pk;
    magma_int_t pm_old=0, pn_old=0, indi_old=0, indj_old=0;

    int i;
    int lwkopt;
    int lquery;

    *info = 0;
    int upper = (uplo == MagmaUpper);
    lquery = (lwork == -1);
    if (! upper && uplo != MagmaLower) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,n)) {
        *info = -4;
    } else if (lwork < 1 && ! lquery) {
        *info = -9;
    }

    /* Determine the block size. */
    lwkopt = n * nb;
    if (*info == 0) {
        work[0] = MAGMA_D_MAKE( lwkopt, 0 );
    }

    if (*info != 0)
        return *info;
    else if (lquery)
        return *info;

    /* Quick return if possible */
    if (n == 0) {
        work[0] = c_one;
        return *info;
    }

    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );
    
    double *dA;
    if (MAGMA_SUCCESS != magma_dmalloc( &dA, (n + 2*nb)*ldda )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    // limit to 16 threads
    magma_int_t orig_threads = magma_get_lapack_numthreads();
    magma_set_lapack_numthreads( min(orig_threads,16) );

    /* Use the first panel of dA as work space */
    double *dwork = dA + n*ldda;
    double *dW    = dwork + nb*ldda;

    #ifdef TRACING
    char buf[80];
    #endif
    magma_queue_t stream[3];
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );
    stream[2] = 0;  // default stream
    
    trace_init( 1, 1, 3, stream );

    double *hT = work + lwork - nb*nb;
    lwork -= nb*nb;
    memset( hT, 0, nb*nb*sizeof(double));

    magmablasSetKernelStream( stream[0] );
    magma_event_t Pupdate_event;
    cudaEventCreateWithFlags(&Pupdate_event,cudaEventDisableTiming);
    //magma_event_create(&Pupdate_event);


    if (upper) {
        printf("DSYTRD_HE2HB is not yet implemented for upper matrix storage. Exit.\n");
        exit(1);
    } else {
        /* Copy the matrix to the GPU */
        if (1 <= n-nb) {
            trace_gpu_start( 0, 0, "set", "set A" );
            magma_dsetmatrix_async( (n-nb), (n-nb),
                                    A(nb+1, nb+1),  lda,
                                    dA(nb+1, nb+1), ldda, stream[0] );
            trace_gpu_end( 0, 0 );
        }

        /* Reduce the lower triangle of A */
        for (i = 1; i <= n-nb; i += nb) {
             indi = i+nb;
             indj = i;
             pm   = n - i - nb + 1;
             //pn   = min(i+nb-1, n-nb) -i + 1;
             pn   = nb;
             
             /*   Get the current panel (no need for the 1st iteration) */
             if (i > 1 ) {
                 // dpanel_to_q copies the upper off-diagonal part of
                 // the matrix to work so it can be restored later. The
                 // zeros and ones that are inserted are actually unused;
                 // this is only because we don't have a function that copies
                 // just the upper part of A, to be restored after copying the
                 // lookahead panel that has been computed from GPU to CPU.
                 dpanel_to_q(MagmaUpper, pn-1, A(i, i+1), lda, work);

                 trace_gpu_start( 0, 1, "get", "get panel" );
                 //magma_queue_sync( stream[0] );
                 magma_queue_wait_event(stream[1], Pupdate_event);  //, 0);
                 magma_dgetmatrix_async( (pm+pn), pn,
                                         dA( i, i), ldda,
                                         A ( i, i), lda, stream[1] );
                 trace_gpu_end( 0, 1 );

                 trace_gpu_start( 0, 2, "her2k", "her2k" );
                 magma_dsyr2k(MagmaLower, MagmaNoTrans, pm_old-pn_old, pn_old, c_neg_one,
                      dA(indi_old+pn_old, indj_old), ldda,
                      dW + pn_old,            pm_old, d_one,
                      dA(indi_old+pn_old, indi_old+pn_old), ldda);
                 trace_gpu_end( 0, 2 );

                 trace_cpu_start( 0, "sync", "sync on 1" );
                 magma_queue_sync( stream[1] );
                 trace_cpu_end( 0 );
                 dq_to_panel(MagmaUpper, pn-1, A(i, i+1), lda, work);
             }

             /* ==========================================================
                QR factorization on a panel starting nb off of the diagonal.
                Prepare the V and T matrices.
                ==========================================================  */
             #ifdef TRACING
             snprintf( buf, sizeof(buf), "panel %d", i );
             #endif
             trace_cpu_start( 0, "geqrf", buf );
             lapackf77_dgeqrf(&pm, &pn, A(indi, indj), &lda,
                        tau_ref(i), work, &lwork, info);
             
             /* Form the matrix T */
             pk = min(pm,pn);
             lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr,
                           &pm, &pk, A(indi, indj), &lda,
                           tau_ref(i), hT, &nb);

             /* Prepare V - put 0s in the upper triangular part of the panel
                (and 1s on the diagonal), temporarily storing the original in work */
             dpanel_to_q(MagmaUpper, pk, A(indi, indj), lda, work);
             trace_cpu_end( 0 );

             /* Send V from the CPU to the GPU */
             trace_gpu_start( 0, 0, "set", "set V and T" );
             magma_dsetmatrix_async( pm, pk,
                                     A(indi, indj),  lda,
                                     dA(indi, indj), ldda, stream[0] );

             /* Send the triangular factor T to the GPU */
             magma_dsetmatrix_async( pk, pk,
                                     hT,       nb,
                                     dT(i), lddt, stream[0] );
             trace_gpu_end( 0, 0 );
             
             /* ==========================================================
                Compute W:
                1. X = A (V T)
                2. W = X - 0.5* V * (T' * (V' * X))
                ==========================================================  */
             /* dwork = V T */
             trace_cpu_start( 0, "sync", "sync on 0" );
             // This sync is done here to make sure the copy has finished,
             // because the dq_to_panel restore below requires the copy to be
             // complete. Doing the sync here allows the restore to overlap
             // with the next gemm and symm.
             magma_queue_sync( stream[0] );
             trace_cpu_end( 0 );
             
             trace_gpu_start( 0, 2, "gemm", "work = V*T" );
             magma_dgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk,
                         c_one, dA(indi, indj), ldda,
                         dT(i), lddt,
                         c_zero, dwork, pm);
             trace_gpu_end( 0, 2 );
             
             /* dW = X = A*V*T. dW = A*dwork */
             trace_gpu_start( 0, 2, "hemm", "X = A*work" );
             magma_dsymm(MagmaLeft, uplo, pm, pk,
                         c_one, dA(indi, indi), ldda,
                         dwork, pm,
                         c_zero, dW, pm);
             trace_gpu_end( 0, 2 );
             /* restore the panel */
             dq_to_panel(MagmaUpper, pk, A(indi, indj), lda, work);
             
             /* dwork = V*T already, so dwork' = T'*V'.
              * Compute T'*V'*X as dwork'*W, storing the result at
              * dwork + pm*nb = (T'*V')*X = dwork'*X = dwork'*W */
             trace_gpu_start( 0, 2, "gemm", "work = T'*V'*X" );
             magma_dgemm(MagmaConjTrans, MagmaNoTrans, pk, pk, pm,
                         c_one, dwork, pm,
                         dW, pm,
                         c_zero, dwork + pm*nb, nb);
             trace_gpu_end( 0, 2 );
             
             /* W = X - 0.5 * V * T'*V'*X
              *   = X - 0.5 * V * (dwork + pm*nb) = W - 0.5 * V * (dwork + pm*nb) */
             trace_gpu_start( 0, 2, "gemm", "W = X - 0.5*V*(T'*V'*X)" );
             magma_dgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk,
                         c_neg_half, dA(indi, indj), ldda,
                         dwork + pm*nb, nb,
                         c_one,     dW, pm);
             trace_gpu_end( 0, 2 );

             /* ==========================================================
                Update the unreduced submatrix A(i+ib:n,i+ib:n), using
                an update of the form:  A := A - V*W' - W*V'
                ==========================================================  */
             if (i + nb <= n-nb) {
                 /* There will be a next iteration;
                    do lookahead - update the next panel */
                 trace_gpu_start( 0, 2, "gemm", "gemm 4 next panel left" );
                 magma_dgemm(MagmaNoTrans, MagmaConjTrans, pm, pn, pn, c_neg_one,
                             dA(indi, indj), ldda,
                             dW,                 pm, c_one,
                             dA(indi, indi), ldda);
                 trace_gpu_end( 0, 2 );
             
                 trace_gpu_start( 0, 2, "gemm", "gemm 5 next panel right" );
                 magma_dgemm(MagmaNoTrans, MagmaConjTrans, pm, pn, pn, c_neg_one,
                             dW,                 pm,
                             dA(indi, indj), ldda, c_one,
                             dA(indi, indi), ldda);
                 trace_gpu_end( 0, 2 );
                 magma_event_record(Pupdate_event, stream[0]);
             }
             else {
                 /* no lookahead, as this is the last iteration */
                 trace_gpu_start( 0, 2, "her2k", "her2k last iteration" );
                 magma_dsyr2k(MagmaLower, MagmaNoTrans, pk, pk, c_neg_one,
                              dA(indi, indj), ldda,
                              dW,                 pm, d_one,
                              dA(indi, indi), ldda);
                 trace_gpu_end( 0, 2 );
             }
             
             indi_old = indi;
             indj_old = indj;
             pm_old   = pm;
             pn_old   = pn;
        }  // end loop for (i)

        /* Send the last block to the CPU */
        pk = min(pm,pn);
        if (1 <= n-nb) {
            dpanel_to_q(MagmaUpper, pk-1, A(n-pk+1, n-pk+2), lda, work);
            trace_gpu_start( 0, 2, "get", "get last block" );
            magma_dgetmatrix( pk, pk,
                              dA(n-pk+1, n-pk+1), ldda,
                              A(n-pk+1, n-pk+1),  lda );
            trace_gpu_end( 0, 2 );
            dq_to_panel(MagmaUpper, pk-1, A(n-pk+1, n-pk+2), lda, work);
        }
    }// end of LOWER
    
    trace_finalize( "dsytrd_sy2sb.svg", "trace.css" );

    magma_event_destroy( Pupdate_event );
    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[1] );
    magma_free( dA );
    work[0] = MAGMA_D_MAKE( lwkopt, 0 );

    magmablasSetKernelStream( orig_stream );    
    magma_set_lapack_numthreads( orig_threads );

    return *info;
} /* magma_dsytrd_sy2sb */
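For reference, the update that each iteration of the loop above applies (restating the in-code comments; V and T come from the QR factorization of the panel) can be written as

\[
X = A\,V\,T, \qquad
W = X - \tfrac{1}{2}\,V\,\left(T^{T} V^{T} X\right), \qquad
A \leftarrow A - V\,W^{T} - W\,V^{T} .
\]

In the lookahead branch, the two gemm calls apply the V*W' and W*V' terms to the next panel right away, while the dsyr2k at the top of the next iteration applies the same rank-2k update to the rest of the trailing matrix.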
Example #29
/**
    Purpose
    -------
    DGEQLF computes a QL factorization of a DOUBLE_PRECISION M-by-N matrix A:
    A = Q * L.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    A       DOUBLE_PRECISION array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, if m >= n, the lower triangle of the subarray
            A(m-n+1:m,1:n) contains the N-by-N lower triangular matrix L;
            if m <= n, the elements on and below the (n-m)-th
            superdiagonal contain the M-by-N lower trapezoidal matrix L;
            the remaining elements, with the array TAU, represent the
            orthogonal matrix Q as a product of elementary reflectors
            (see Further Details).
    \n
            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    @param[out]
    tau     DOUBLE_PRECISION array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    @param[out]
    work    (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.
    \n
            Higher performance is achieved if WORK is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.  LWORK >= max(1,N,2*NB^2).
            For optimum performance LWORK >= max(N*NB, 2*NB^2) where NB can be obtained
            through magma_get_dgeqlf_nb(M).
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as a memory allocation failure.

    Further Details
    ---------------
    The matrix Q is represented as a product of elementary reflectors

       Q = H(k) . . . H(2) H(1), where k = min(m,n).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(m-k+i+1:m) = 0 and v(m-k+i) = 1; v(1:m-k+i-1) is stored on exit in
    A(1:m-k+i-1,n-k+i), and tau in TAU(i).

    @ingroup magma_dgeqlf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_dgeqlf(
    magma_int_t m, magma_int_t n,
    double *A,    magma_int_t lda, double *tau,
    double *work, magma_int_t lwork,
    magma_int_t *info)
{
    #define  A(i_,j_) ( A + (i_) + (j_)*lda)
    #define dA(i_,j_) (dA + (i_) + (j_)*ldda)
    #define dwork(i_) (dwork + (i_))

    magmaDouble_ptr dA, dwork;
    double c_one = MAGMA_D_ONE;
    magma_int_t i, k, lddwork, old_i, old_ib, nb;
    magma_int_t rows, cols;
    magma_int_t ib, ki, kk, mu, nu, iinfo, ldda;
    int lquery;

    nb = magma_get_dgeqlf_nb(m);
    *info = 0;
    lquery = (lwork == -1);

    // silence "uninitialized" warnings
    old_ib = nb;
    old_i  = 0;
    
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,m)) {
        *info = -4;
    }

    k = min(m,n);
    if (*info == 0) {
        if (k == 0)
            work[0] = c_one;
        else {
            work[0] = MAGMA_D_MAKE( max(n*nb, 2*nb*nb), 0 );
        }

        if (lwork < max(max(1,n), 2*nb*nb) && ! lquery)
            *info = -7;
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;

    /* Quick return if possible */
    if (k == 0)
        return *info;

    lddwork = ((n+31)/32)*32;
    ldda    = ((m+31)/32)*32;

    if (MAGMA_SUCCESS != magma_dmalloc( &dA, (n)*ldda + nb*lddwork )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
    dwork = dA + ldda*n;

    magma_queue_t queues[2];
    magma_queue_create( &queues[0] );
    magma_queue_create( &queues[1] );

    if ( (nb > 1) && (nb < k) ) {
        /*  Use blocked code initially.
            The last kk columns are handled by the block method.
            First, copy the matrix on the GPU except the last kk columns */
        magma_dsetmatrix_async( m, n-nb,
                                A(0, 0),  lda,
                                dA(0, 0), ldda, queues[0] );

        ki = ((k - nb - 1) / nb) * nb;
        kk = min(k, ki + nb);
        for (i = k - kk + ki; i >= k - kk; i -= nb) {
            ib = min(k-i,nb);

            if (i < k - kk + ki) {
                /* 1. Copy asynchronously the current panel to the CPU.
                   2. Copy asynchronously the submatrix below the panel
                      to the CPU.                                       */
                rows = m - k + i + ib;
                magma_dgetmatrix_async( rows, ib,
                                        dA(0, n-k+i), ldda,
                                        A(0, n-k+i),  lda, queues[1] );

                magma_dgetmatrix_async( m-rows, ib,
                                        dA(rows, n-k+i), ldda,
                                        A(rows, n-k+i),  lda, queues[0] );

                /* Apply H' to A(1:m-k+i+ib-1,1:n-k+i-1) from the left in
                   two steps, implementing the lookahead technique.
                   This is the main lookahead update. */
                rows = m - k + old_i + old_ib;
                cols = n - k + old_i - old_ib;
                magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaBackward, MagmaColumnwise,
                                  rows, cols, old_ib,
                                  dA(0, cols+old_ib), ldda, dwork(0),      lddwork,
                                  dA(0, 0          ), ldda, dwork(old_ib), lddwork);
            }

            magma_queue_sync( queues[1] );
            /* Compute the QL factorization of the current block
               A(1:m-k+i+ib-1,n-k+i:n-k+i+ib-1) */
            rows = m - k + i + ib;
            cols = n - k + i;
            lapackf77_dgeqlf( &rows, &ib, A(0,cols), &lda, tau+i, work, &lwork, &iinfo );

            if (cols > 0) {
                /* Form the triangular factor of the block reflector
                   H = H(i+ib-1) . . . H(i+1) H(i) */
                lapackf77_dlarft( MagmaBackwardStr, MagmaColumnwiseStr,
                                  &rows, &ib,
                                  A(0, cols), &lda, tau + i, work, &ib);

                dpanel_to_q( MagmaLower, ib, A(rows-ib,cols), lda, work+ib*ib);
                magma_dsetmatrix( rows, ib,
                                  A(0,cols),  lda,
                                  dA(0,cols), ldda );
                dq_to_panel( MagmaLower, ib, A(rows-ib,cols), lda, work+ib*ib);

                // Send the triangular part on the GPU
                magma_dsetmatrix( ib, ib, work, ib, dwork(0), lddwork );

                /* Apply H' to A(1:m-k+i+ib-1,1:n-k+i-1) from the left in
                   two steps, implementing the lookahead technique.
                   This is the update of the first ib columns.          */
                if (i-ib >= k - kk)
                    magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaBackward, MagmaColumnwise,
                                      rows, ib, ib,
                                      dA(0, cols),   ldda, dwork(0),  lddwork,
                                      dA(0,cols-ib), ldda, dwork(ib), lddwork);
                else {
                    magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaBackward, MagmaColumnwise,
                                      rows, cols, ib,
                                      dA(0, cols), ldda, dwork(0),  lddwork,
                                      dA(0, 0   ), ldda, dwork(ib), lddwork);
                }

                old_i  = i;
                old_ib = ib;
            }
        }
        mu = m - k + i + nb;
        nu = n - k + i + nb;

        magma_dgetmatrix( m, nu, dA(0,0), ldda, A(0,0), lda );
    } else {
        mu = m;
        nu = n;
    }

    /* Use unblocked code to factor the last or only block */
    if (mu > 0 && nu > 0)
        lapackf77_dgeqlf(&mu, &nu, A(0,0), &lda, tau, work, &lwork, &iinfo);

    magma_queue_destroy( queues[0] );
    magma_queue_destroy( queues[1] );
    magma_free( dA );
    
    return *info;
} /* magma_dgeqlf */
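What follows is a minimal, hypothetical host-side driver for magma_dgeqlf (a sketch, not from the MAGMA sources). It assumes a MAGMA 1.x build whose "magma.h" declares the prototypes used here, and it illustrates the LWORK = -1 workspace-query convention and the pinned-memory recommendation from the documentation above; error checking and matrix initialization are elided.

#include "magma.h"

int main( void )
{
    magma_init();

    magma_int_t m = 1000, n = 500, lda = m, info = 0;
    magma_int_t minmn = (m < n ? m : n);
    double *A, *tau, *work, query;

    /* Pinned host memory gives faster CPU<->GPU transfers, per the docs above. */
    magma_dmalloc_pinned( &A, lda*n );
    magma_dmalloc_cpu( &tau, minmn );
    /* ... fill A (column-major) with the matrix to factor ... */

    /* Workspace query: lwork = -1 returns the optimal size in work[0]. */
    magma_dgeqlf( m, n, A, lda, tau, &query, -1, &info );
    magma_int_t lwork = (magma_int_t) MAGMA_D_REAL( query );
    magma_dmalloc_pinned( &work, lwork );

    /* QL factorization A = Q*L; since m >= n, L ends up in the last n rows of A. */
    magma_dgeqlf( m, n, A, lda, tau, work, lwork, &info );

    magma_free_pinned( work );
    magma_free_cpu( tau );
    magma_free_pinned( A );
    magma_finalize();
    return 0;
}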
Example #30
/**
    Purpose
    -------
    DSYTRD_HE2HB reduces a real symmetric matrix A to real symmetric
    band-diagonal form T by an orthogonal similarity transformation:
    Q**H * A * Q = T.
    This version stores the triangular matrices T used in the accumulated
    Householder transformations (I - V T V').

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of A is stored;
      -     = MagmaLower:  Lower triangle of A is stored.

    @param[in]
    n       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in,out]
    A       DOUBLE_PRECISION array, dimension (LDA,N)
            On entry, the symmetric matrix A.  If UPLO = MagmaUpper, the leading
            N-by-N upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = MagmaLower, the
            leading N-by-N lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
            On exit, if UPLO = MagmaUpper, the upper band-diagonal of A is
            overwritten by the corresponding elements of the
            band-diagonal matrix T, and the elements above the band
            diagonal, with the array TAU, represent the orthogonal
            matrix Q as a product of elementary reflectors; if UPLO
            = MagmaLower, the lower band-diagonal of A is overwritten by
            the corresponding elements of the band-diagonal
            matrix T, and the elements below the band-diagonal, with
            the array TAU, represent the orthogonal matrix Q as a product
            of elementary reflectors. See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    tau     DOUBLE_PRECISION array, dimension (N-1)
            The scalar factors of the elementary reflectors (see Further
            Details).

    @param[out]
    work    (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.  LWORK >= 1.
            For optimum performance LWORK >= N*NB, where NB is the
            optimal blocksize.
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    @param[out]
    dT      DOUBLE_PRECISION array on the GPU, dimension N*NB,
            where NB is the optimal blocksize.
            On exit dT holds the upper triangular matrices T from the
            accumulated Householder transformations (I - V T V') used
            in the factorization. The nb x nb matrices T are ordered
            consecutively in memory one after another.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    Further Details
    ---------------
    If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary
    reflectors

       Q = H(n-1) . . . H(2) H(1).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in
    A(1:i-1,i+1), and tau in TAU(i).

    If UPLO = MagmaLower, the matrix Q is represented as a product of elementary
    reflectors

       Q = H(1) H(2) . . . H(n-1).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i),
    and tau in TAU(i).

    The contents of A on exit are illustrated by the following examples
    with n = 5:

    if UPLO = MagmaUpper:                if UPLO = MagmaLower:

      (  d   e   v2  v3  v4 )              (  d                  )
      (      d   e   v3  v4 )              (  e   d              )
      (          d   e   v4 )              (  v1  e   d          )
      (              d   e  )              (  v1  v2  e   d      )
      (                  d  )              (  v1  v2  v3  e   d  )

    where d and e denote diagonal and off-diagonal elements of T, and vi
    denotes an element of the vector defining H(i).

    @ingroup magma_dsyev_2stage
    ********************************************************************/
extern "C" magma_int_t
magma_dsytrd_sy2sb_mgpu(
    magma_uplo_t uplo, magma_int_t n, magma_int_t nb,
    double *A, magma_int_t lda,
    double *tau,
    double *work, magma_int_t lwork,
    magmaDouble_ptr dAmgpu[], magma_int_t ldda,
    magmaDouble_ptr dTmgpu[], magma_int_t lddt,
    magma_int_t ngpu, magma_int_t distblk,
    magma_queue_t queues[][20], magma_int_t nqueue,
    magma_int_t *info)
{
    #define A(a_1,a_2)        ( A  + ((a_2)-1)*( lda) + (a_1)-1)
    #define tau_ref(a_1)      (tau + (a_1)-1)
    #define dT(a_0, a_1, a_2) (dTmgpu[a_0] + ((a_2)-1)*(lddt) + (a_1)-1)
    #define dA(a_0, a_1, a_2) (dAmgpu[a_0] + ((a_2)-1)*(ldda) + (a_1)-1)

    double c_neg_one  = MAGMA_D_NEG_ONE;
    double c_neg_half = MAGMA_D_NEG_HALF;
    double c_one  = MAGMA_D_ONE;
    double c_zero = MAGMA_D_ZERO;
    double  d_one = MAGMA_D_ONE;

    magma_int_t pm, pn, indi, indj, pk;
    magma_int_t pm_old=0, pn_old=0, indi_old=0, flipV=-1;
    magma_int_t iblock, idev, di;
    int i;
    int lwkopt;
    int lquery;

    assert (nqueue >= 3);
    assert (nqueue >= (ngpu+1));


    *info = 0;
    int upper = (uplo == MagmaUpper);
    lquery = (lwork == -1);
    if (! upper && uplo != MagmaLower) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,n)) {
        *info = -4;
    } else if (lwork < 1 && ! lquery) {
        *info = -9;
    }

    /* Determine the block size. */
    lwkopt = n * nb;
    if (*info == 0) {
        work[0] = MAGMA_D_MAKE( lwkopt, 0 );
    }


    if (*info != 0)
        return *info;
    else if (lquery)
        return *info;

    /* Quick return if possible */
    if (n == 0) {
        work[0] = c_one;
        return *info;
    }

    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );

    // limit to 16 threads
    magma_int_t orig_threads = magma_get_lapack_numthreads();
    magma_set_lapack_numthreads( min(orig_threads,16) );

    magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2];
    magma_int_t nbcmplx=0;
    magma_buildconnection_mgpu(gnode, &nbcmplx,  ngpu);
    #ifdef ENABLE_DEBUG
    printf(" Initializing communication pattern.... GPU-ncmplx %d\n\n", nbcmplx);
    #endif

    double *dspace[MagmaMaxGPUs];
    double *dwork[MagmaMaxGPUs], *dworkbis[MagmaMaxGPUs];
    double *dvall[MagmaMaxGPUs], *dv[MagmaMaxGPUs], *dw[MagmaMaxGPUs];
    double *workngpu[MagmaMaxGPUs+1];
    magma_event_t     redevents[MagmaMaxGPUs][MagmaMaxGPUs*MagmaMaxGPUs+10];
    magma_int_t nbevents = MagmaMaxGPUs*MagmaMaxGPUs;

    magma_int_t lddv        = ldda;
    magma_int_t lddw        = lddv;
    magma_int_t dwrk2siz    = ldda*nb*(ngpu+1);
    magma_int_t worksiz     = n*nb;
    magma_int_t devworksiz  = 2*nb*lddv + nb*lddw + nb*ldda + dwrk2siz; // 2*dv (dv0+dv1) + dw + dwork + dworkbis

    // local allocation and stream creation
    // TODO check malloc
    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        magma_dmalloc( &dspace[dev], devworksiz );
        magma_dmalloc_pinned ( &workngpu[dev], worksiz);
        dvall[dev]    = dspace[dev];
        dw[dev]       = dvall[dev]   + 2*nb*lddv;
        dwork[dev]    = dw[dev]      + nb*lddw;
        dworkbis[dev] = dwork[dev]   + nb*ldda;
        magmablasSetKernelStream( queues[ dev ][ 0 ] );
        for( magma_int_t i = 0; i < nbevents; ++i ) {
            cudaEventCreateWithFlags(&redevents[dev][i],cudaEventDisableTiming);
        }
    }
    magma_dmalloc_pinned ( &workngpu[ngpu], worksiz);
    double *worktest = NULL;
    //magma_dmalloc_cpu( &worktest, n*nb ); // not used
    // ======================
  

    double *hT = work + lwork - nb*nb;
    lwork -= nb*nb;
    memset( hT, 0, nb*nb*sizeof(double));

    if (upper) {
        printf("DSYTRD_HE2HB is not yet implemented for upper matrix storage. Exit.\n");
        exit(1);
    } else {
        /* Reduce the lower triangle of A */
        for (i = 1; i <= n-nb; i += nb) {
             indi = i+nb;
             indj = i;
             pm   = n - i - nb + 1;
             //pn   = min(i+nb-1, n-nb) -i + 1;
             pn   = nb;
             
             /*   Get the current panel (no need for the 1st iteration) */
             if (i > 1 ) {
                 // dpanel_to_q copies the upper off-diagonal part of
                 // the matrix to work so it can be restored later. The
                 // zeros and ones it writes are actually unused; this is
                 // done only because we don't have a function that copies
                 // just the upper part of A, to be restored after the
                 // lookahead panel computed on the GPU is copied to the CPU.
                 dpanel_to_q(MagmaUpper, pn-1, A(i, i+1), lda, work);

                 // find the device that owns the panel, then send the panel to the CPU.
                 // below, -1 is applied to i (and folded into di) to account for the Fortran 1-based indexing
                 iblock = ((i-1) / distblk) / ngpu;          // local block id
                 di     = iblock*distblk + (i-1)%distblk;     // local index in parent matrix
                 idev   = ((i-1) / distblk) % ngpu;          // device with this block


                 //printf("Receiving panel ofsize %d %d from idev %d A(%d,%d) \n",(pm+pn), pn,idev,i-1,di);
                 magma_setdevice( idev );

                 //magma_device_sync();
                 magma_dgetmatrix_async( (pm+pn), pn,
                                         dA(idev, i, di+1), ldda,
                                         A( i, i), lda, queues[ idev ][ nqueue-1 ] );
               
                 //magma_setdevice( 0 );
                 //printf("updating dsyr2k on A(%d,%d) of size %d %d \n",indi_old+pn_old-1,indi_old+pn_old-1,pm_old-pn_old,pn_old);
                 // compute DSYR2K_MGPU
                 magmablas_dsyr2k_mgpu2(
                      MagmaLower, MagmaNoTrans, pm_old-pn_old, pn_old,
                      c_neg_one, dv, pm_old, pn_old,
                                 dw, pm_old, pn_old,
                      d_one,     dAmgpu, ldda, indi_old+pn_old-1,
                      ngpu, distblk, queues, 2 );
                 //magma_setdevice( 0 );

                 magma_setdevice( idev );
                 magma_queue_sync( queues[idev][ nqueue-1 ] );
                 //magma_setdevice( 0 );
                 dq_to_panel(MagmaUpper, pn-1, A(i, i+1), lda, work);
             }

             /* ==========================================================
                QR factorization on a panel starting nb off the diagonal.
                Prepare the V and T matrices.
                ==========================================================  */
             lapackf77_dgeqrf(&pm, &pn, A(indi, indj), &lda,
                        tau_ref(i), work, &lwork, info);
             
             /* Form the matrix T */
             pk=min(pm,pn);
             lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr,
                           &pm, &pk, A(indi, indj), &lda,
                           tau_ref(i), hT, &nb);

             /* Prepare V - put 0s in the upper triangular part of the panel
                (and 1s on the diagonal), temporarily storing the original in work */
             dpanel_to_q(MagmaUpper, pk, A(indi, indj), lda, work);



             /* Send V and T from the CPU to the GPU */
             // To be able to overlap the GET with the DSYR2K,
             // the send should be done on the last stream.
             // To avoid a bug where the old V, still in use by
             // dsyr2k at this moment, would be overwritten by the
             // new V sent now, we use a flip-flop pair of V buffers:
             // if step%2 == 0 use V[0], else use V[nb*lddv].
             flipV = ((i-1)/nb)%2;
             for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
                 dv[dev] = dvall[dev] + flipV*nb*lddv;
             }

             for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
                 magma_setdevice( dev );
                 // send V
                 magma_dsetmatrix_async( pm, pk,
                                         A(indi, indj),  lda,
                                         dv[dev], pm, queues[dev][nqueue-1] );

                 // Send the triangular factor T to the GPU
                 magma_dsetmatrix_async( pk, pk,
                                         hT,       nb,
                                         dT(dev, 1, i), lddt, queues[dev][nqueue-1] );
             }

             /* ==========================================================
                Compute W:
                1. X = A (V T)
                2. W = X - 0.5* V * (T' * (V' * X))
                ==========================================================  */
             for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
                 // dwork = V T
                 magma_setdevice( dev );
                 magmablasSetKernelStream( queues[ dev ][ nqueue-1 ] );
                 magma_queue_sync( queues[dev][nqueue-1] );
                 magma_dgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk,
                         c_one, dv[dev], pm,
                         dT(dev, 1, i), lddt,
                         c_zero, dwork[dev], pm);
             }

             // ===============================================
             //   SYNC TO BE SURE THAT BOTH V AND T WERE
             //   RECEIVED, V*T IS COMPUTED, AND SYR2K IS DONE
             // ===============================================
             for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
                 magma_setdevice( dev );
                 for( magma_int_t s = 0; s < nqueue; ++s )
                     magma_queue_sync( queues[dev][s] );
             }


              // compute DSYMM_MGPU
              // The broadcast of the result done inside this function
              // should happen on stream[0], because the GEMMs below
              // assume this; otherwise we would have to sync on the
              // broadcasting stream.
              if (ngpu == 1) {
                 magmablasSetKernelStream( queues[ 0 ][ 0 ] );
                 magma_dsymm(MagmaLeft, uplo, pm, pk,
                         c_one, dAmgpu[0]+(indi-1)*ldda+(indi-1), ldda,
                         dwork[0], pm,
                         c_zero, dw[0], pm);
              } else {
                 magmablas_dsymm_mgpu_com(
                       MagmaLeft, uplo, pm, pk,
                       c_one, dAmgpu, ldda, indi-1,
                                   dwork, pm,
                       c_zero,     dw, pm, dworkbis, dwrk2siz, worktest, pm, workngpu, worksiz,
                       ngpu, distblk, queues, nqueue-1, redevents, nbevents, gnode, nbcmplx);
             }

             
             /* dwork = V*T already, so dwork' = T'*V'.
              * Compute T'*V'*X as dwork'*W, storing the result in
              * dworkbis = (T'*V')*X = dwork'*X = dwork'*W */
             for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
                 // Here we have to wait until the broadcast of DSYMM is done.
                 // Note that the broadcast is done on stream[0], so we can
                 // continue on the same stream here and avoid a sync.
                 magma_setdevice( dev );
                 magmablasSetKernelStream( queues[ dev ][ 0 ] );
                 // magma_queue_sync( queues[dev][0] );
                 magma_dgemm(MagmaConjTrans, MagmaNoTrans, pk, pk, pm,
                             c_one, dwork[dev], pm,
                             dw[dev], pm,
                             c_zero, dworkbis[dev], nb);
                 
                 /* W = X - 0.5 * V * T'*V'*X
                  *   = X - 0.5 * V * dworkbis = W - 0.5 * V * dworkbis */
                 magma_dgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk,
                             c_neg_half, dv[dev], pm,
                             dworkbis[dev], nb,
                             c_one,     dw[dev], pm);
             }
             /* restore the panel; it is placed here to overlap with the previous GEMM */
             dq_to_panel(MagmaUpper, pk, A(indi, indj), lda, work);
             // ===============================================
             //   SYNC TO BE SURE THAT BOTH V AND W ARE DONE
             // ===============================================
             // Synchronize to be sure that W has been computed,
             // because the next DSYR2K uses streaming and might
             // launch a gemm on stream 2 while stream 0, which
             // computes the two GEMMs above, has not finished.
             // The same sync is needed for the panel update
             // below and for the last HER2K.
             for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
                 magma_setdevice( dev );
                 magma_queue_sync( queues[dev][0] );
             }

             /* ==========================================================
                Update the unreduced submatrix A(i+ib:n,i+ib:n), using
                an update of the form:  A := A - V*W' - W*V'
                ==========================================================  */
             if (i + nb <= n-nb) {
                 /* There will be a next iteration;
                    do lookahead - update the next panel */
                 // below, -1 is applied to indi (and folded into di) to account for the Fortran 1-based indexing
                 iblock = ((indi-1) / distblk) / ngpu;          // local block id
                 di     = iblock*distblk + (indi-1)%distblk;     // local index in parent matrix
                 idev   = ((indi-1) / distblk) % ngpu;          // device with this block
                 magma_setdevice( idev );
                 magmablasSetKernelStream( queues[ idev ][ nqueue-1 ] );
                 //magma_queue_sync( queues[idev][0] ); removed because the sync has been done in the loop above
                 magma_dgemm(MagmaNoTrans, MagmaConjTrans, pm, pn, pn, c_neg_one,
                             dv[idev], pm,
                             dw[idev], pm, c_one,
                             dA(idev, indi, di+1), ldda);
             
                 magma_dgemm(MagmaNoTrans, MagmaConjTrans, pm, pn, pn, c_neg_one,
                             dw[idev], pm,
                             dv[idev], pm, c_one,
                             dA(idev, indi, di+1), ldda);
                 //printf("updating next panel distblk %d  idev %d  on A(%d,%d) of size %d %d %d \n",distblk,idev,indi-1,di,pm,pn,pn);
             }
             else {
                 /* no lookahead, as this is the last iteration */
                 // below, -1 is applied to indi (and folded into di) to account for the Fortran 1-based indexing
                 iblock = ((indi-1) / distblk) / ngpu;          // local block id
                 di     = iblock*distblk + (indi-1)%distblk;     // local index in parent matrix
                 idev   = ((indi-1) / distblk) % ngpu;          // device with this block
                 magma_setdevice( idev );
                 magmablasSetKernelStream( queues[ idev ][ 0 ] );
                 //printf("LAST DSYR2K idev %d on A(%d,%d) of size %d \n",idev, indi-1,di,pk);
                 magma_dsyr2k(MagmaLower, MagmaNoTrans, pk, pk, c_neg_one,
                              dv[idev], pm,
                              dw[idev], pm, d_one,
                              dA(idev, indi, di+1), ldda);


                 /* Send the last block to the CPU */
                 dpanel_to_q(MagmaUpper, pk-1, A(n-pk+1, n-pk+2), lda, work);
                 magma_dgetmatrix( pk, pk,
                                   dA(idev, indi, di+1), ldda,
                                   A(n-pk+1, n-pk+1),  lda );
                 dq_to_panel(MagmaUpper, pk-1, A(n-pk+1, n-pk+2), lda, work);
             }

             indi_old = indi;
             //indj_old = indj;
             pm_old   = pm;
             pn_old   = pn;
        }  // end loop for (i)
    }// end of LOWER
    //magma_setdevice( 0 );

    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        magma_free( dspace[dev]);
        magma_free_pinned(workngpu[dev]);
        for( magma_int_t e = 0; e < nbevents; ++e ) {
            magma_event_destroy( redevents[dev][e] );
        }
    }
    magma_free_pinned(workngpu[ngpu]);
    magma_free_cpu(worktest);

    magma_setdevice( orig_dev );
    magmablasSetKernelStream( orig_stream );
    magma_set_lapack_numthreads( orig_threads );

    work[0] = MAGMA_D_MAKE( lwkopt, 0 );
    return *info;
} /* magma_dsytrd_sy2sb_mgpu */
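The 1-D block-cyclic owner computation that appears three times in the routine above (the iblock / di / idev arithmetic) can be read as a small helper. The sketch below is hypothetical (no such helper exists in MAGMA) and takes a 0-based global index, which is why the routine subtracts 1 from its Fortran-style 1-based indices before this computation and adds 1 back when indexing dA.

/* Hypothetical helper: maps a 0-based global row/column index of a matrix
   distributed 1-D block-cyclically over ngpu devices (block size distblk)
   to the owning device and the local index inside that device's storage. */
static void blockcyclic_owner( magma_int_t gidx, magma_int_t distblk,
                               magma_int_t ngpu,
                               magma_int_t *idev, magma_int_t *dlocal )
{
    magma_int_t iblock = (gidx / distblk) / ngpu;   /* local block id on the owner */
    *idev   = (gidx / distblk) % ngpu;              /* device owning this block    */
    *dlocal = iblock*distblk + gidx % distblk;      /* local index on that device  */
}

With the 1-based panel index i, the routine's computation is equivalent to blockcyclic_owner(i-1, distblk, ngpu, &idev, &di), after which di+1 is passed back to the 1-based dA macro.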