Example #1
SEXP magQR(SEXP a)
{
   SEXP gpu = GET_SLOT(a, install("gpu")),
        b = PROTECT(NEW_OBJECT(MAKE_CLASS("magmaQR")));
   int *DIMA = INTEGER(GET_DIM(a)), M = DIMA[0], N = DIMA[1],
       MIN_MN = (M < N ? M : N), NB = magma_get_dgeqrf_nb(M), *pivot, info;
   double *A, *tau;

   A = REAL(SET_VECTOR_ELT(b, 0, AS_NUMERIC(duplicate(a))));
   SET_VECTOR_ELT(b, 1, ScalarInteger(MIN_MN));
   tau = REAL(SET_VECTOR_ELT(b, 2, NEW_NUMERIC(MIN_MN)));
   pivot = INTEGER(SET_VECTOR_ELT(b, 3, NEW_INTEGER(N)));

   int i;
   for(i = 1; i <= N; i++) *pivot++ = i;

   if(LOGICAL_VALUE(gpu)) {
      int LENT = (2*MIN_MN + (N+31)/32*32)*NB;
      double *dA, *dT, *work;

      SET_SLOT(b, install("work"), NEW_NUMERIC(LENT));
      work = REAL(GET_SLOT(b, install("work")));

      magma_malloc((void**)&dA, (M*N)*sizeof(double));
      magma_malloc((void**)&dT, LENT*sizeof(double));

      magma_dsetmatrix(M, N, A, M, dA, M);
      magma_dgeqrf3_gpu(M, N, dA, M, tau, dT, &info);
      magma_dgetmatrix(M, N, dA, M, A, M);
      magma_dgetvector(LENT, dT, 1, work, 1);

      magma_free(dA);
      magma_free(dT);
   } else {
      int LWORK = N * NB;
      double *hA, *hwork;

      magma_malloc_pinned((void**)&hA, (M*N)*sizeof(double));
      magma_malloc_pinned((void**)&hwork, LWORK*sizeof(double));

      lapackf77_dlacpy(MagmaUpperLowerStr, &M, &N, A, &M, hA, &M);
      magma_dgeqrf_ooc(M, N, hA, M, tau, hwork, LWORK, &info);
      lapackf77_dlacpy(MagmaUpperLowerStr, &M, &N, hA, &M, A, &M);

      magma_free_pinned(hA);
      magma_free_pinned(hwork);
   }

   if(info < 0) error("illegal argument %d in 'magQR'", -1 * info);

   UNPROTECT(1);

   return b;
}
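For context, a minimal sketch (not part of the original sources) of how a .Call() entry point such as magQR is typically registered with R. Only standard R C-API registration routines are used; the initializer name R_init_magma and the shared-library name it implies are assumptions.

#include <R.h>
#include <Rdefines.h>
#include <R_ext/Rdynload.h>

SEXP magQR(SEXP a);   /* defined in the listing above */

static const R_CallMethodDef CallEntries[] = {
   {"magQR", (DL_FUNC) &magQR, 1},
   {NULL, NULL, 0}
};

/* Hypothetical initializer; the name must match the shared-library name. */
void R_init_magma(DllInfo *dll)
{
   R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
   R_useDynamicSymbols(dll, FALSE);
}

From R, a wrapper method could then invoke the factorization as .Call("magQR", x).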
Example #2
extern "C" double
magmablas_dlansy(
    magma_norm_t norm, magma_uplo_t uplo, magma_int_t n,
    magmaDouble_const_ptr dA, size_t dA_offset, magma_int_t ldda,
    magmaDouble_ptr dwork, size_t dwork_offset,
    magma_queue_t queue )
{
    magma_int_t info = 0;
    // 1-norm == inf-norm since A is symmetric
    bool inf_norm = (norm == MagmaInfNorm || norm == MagmaOneNorm);
    bool max_norm = (norm == MagmaMaxNorm);
    
    // inf_norm Double-Complex requires > 16 KB shared data (arch >= 200)
    const bool inf_implemented = true;
    
    if ( ! (max_norm || (inf_norm && inf_implemented)) )
        info = -1;
    else if ( uplo != MagmaUpper && uplo != MagmaLower )
        info = -2;
    else if ( n < 0 )
        info = -3;
    else if ( ldda < n )
        info = -5;
    
    if ( info != 0 ) {
        magma_xerbla( __func__, -(info) );
        return info;
    }
    
    /* Quick return */
    if ( n == 0 )
        return 0;
        
    double res = 0;
    if ( inf_norm ) {
        dlansy_inf( uplo, n, dA,dA_offset, ldda, dwork,dwork_offset, queue );
    }
    else {
        dlansy_max( uplo, n, dA,dA_offset, ldda, dwork,dwork_offset, queue );
    }
    int i = magma_idamax( n, dwork,dwork_offset, 1, queue ) - 1;
    magma_dgetvector(1, dwork,dwork_offset+i, 1, &res, 1, queue );    

    return res;
}
Example #3
SEXP magMultmv(SEXP a, SEXP transa, SEXP x, SEXP right)
{
   SEXP gpu = magGetGPU(a, x),
        y = PROTECT(NEW_OBJECT(MAKE_CLASS("magma")));
   int RHS = LOGICAL_VALUE(right), TA = (LOGICAL_VALUE(transa) ^ !RHS),
       *DIMA = INTEGER(GET_DIM(a)),       
       M = DIMA[0], N = DIMA[1], LENX = LENGTH(x), LENY = DIMA[TA], LDA=M;
   char TRANSA = (TA ? 'T' : 'N');
   double *A = REAL(PROTECT(AS_NUMERIC(a))), *X = REAL(PROTECT(AS_NUMERIC(x))),
          *dA, *dX, *dY;

   if(DIMA[!TA] != LENX) error("non-conformable matrices");

   y = SET_SLOT(y, install(".Data"),
                allocMatrix(REALSXP, (RHS ? LENY : 1), (RHS ? 1 : LENY)));
   SET_SLOT(y, install("gpu"), duplicate(gpu));

   magma_malloc((void**)&dA, (M*N)*sizeof(double));
   magma_malloc((void**)&dX, LENX*sizeof(double));
   magma_malloc((void**)&dY, LENY*sizeof(double));

   magma_dsetmatrix(M, N, A, LDA, dA, LDA);
   magma_dsetvector(LENX, X, 1, dX, 1);

   if(LOGICAL_VALUE(gpu)) {
      magmablas_dgemv(TRANSA, M, N, 1.0, dA, LDA, dX, 1, 0.0, dY, 1);
   } else {
      cublasDgemv(TRANSA, M, N, 1.0, dA, LDA, dX, 1, 0.0, dY, 1);
   }

   magma_dgetvector(LENY, dY, 1, REAL(y), 1);

   magma_free(dA);
   magma_free(dX);
   magma_free(dY);

   UNPROTECT(3);

   return y;
}
Example #4
int main(int argc, char **argv)
{
    TESTING_INIT();

    real_Double_t   gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf, cpu_time;
    double          magma_error, cublas_error, work[1];
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t M, N, Xm, Ym, lda, sizeA, sizeX, sizeY;
    magma_int_t incx = 1;
    magma_int_t incy = 1;
    double c_neg_one = MAGMA_D_NEG_ONE;
    double alpha = MAGMA_D_MAKE(  1.5, -2.3 );
    double beta  = MAGMA_D_MAKE( -0.6,  0.8 );
    double *A, *X, *Y, *Ycublas, *Ymagma;
    double *dA, *dX, *dY;
    magma_int_t status = 0;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    double tol = opts.tolerance * lapackf77_dlamch("E");

    printf("trans = %s\n", lapack_trans_const(opts.transA) );
    printf("    M     N   MAGMA Gflop/s (ms)  CUBLAS Gflop/s (ms)   CPU Gflop/s (ms)  MAGMA error  CUBLAS error\n");
    printf("===================================================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[itest];
            N = opts.nsize[itest];
            lda    = ((M+31)/32)*32;
            gflops = FLOPS_DGEMV( M, N ) / 1e9;

            if ( opts.transA == MagmaNoTrans ) {
                Xm = N;
                Ym = M;
            } else {
                Xm = M;
                Ym = N;
            }

            sizeA = lda*N;
            sizeX = incx*Xm;
            sizeY = incy*Ym;
            
            TESTING_MALLOC_CPU( A,       double, sizeA );
            TESTING_MALLOC_CPU( X,       double, sizeX );
            TESTING_MALLOC_CPU( Y,       double, sizeY );
            TESTING_MALLOC_CPU( Ycublas, double, sizeY );
            TESTING_MALLOC_CPU( Ymagma,  double, sizeY );
            
            TESTING_MALLOC_DEV( dA, double, sizeA );
            TESTING_MALLOC_DEV( dX, double, sizeX );
            TESTING_MALLOC_DEV( dY, double, sizeY );
            
            /* Initialize the matrix */
            lapackf77_dlarnv( &ione, ISEED, &sizeA, A );
            lapackf77_dlarnv( &ione, ISEED, &sizeX, X );
            lapackf77_dlarnv( &ione, ISEED, &sizeY, Y );
            
            /* =====================================================================
               Performs operation using CUBLAS
               =================================================================== */
            magma_dsetmatrix( M, N, A, lda, dA, lda );
            magma_dsetvector( Xm, X, incx, dX, incx );
            magma_dsetvector( Ym, Y, incy, dY, incy );
            
            cublas_time = magma_sync_wtime( 0 );
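            // Note: `handle` is not declared in this listing; it is assumed to be a
            // cuBLAS handle created during test setup (the exact mechanism depends
            // on the MAGMA testing harness version).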
            cublasDgemv( handle, cublas_trans_const(opts.transA),
                         M, N, &alpha, dA, lda, dX, incx, &beta, dY, incy );
            cublas_time = magma_sync_wtime( 0 ) - cublas_time;
            cublas_perf = gflops / cublas_time;
            
            magma_dgetvector( Ym, dY, incy, Ycublas, incy );
            
            /* =====================================================================
               Performs operation using MAGMABLAS
               =================================================================== */
            magma_dsetvector( Ym, Y, incy, dY, incy );
            
            magma_time = magma_sync_wtime( 0 );
            magmablas_dgemv( opts.transA, M, N, alpha, dA, lda, dX, incx, beta, dY, incy );
            magma_time = magma_sync_wtime( 0 ) - magma_time;
            magma_perf = gflops / magma_time;
            
            magma_dgetvector( Ym, dY, incy, Ymagma, incy );
            
            /* =====================================================================
               Performs operation using CPU BLAS
               =================================================================== */
            cpu_time = magma_wtime();
            blasf77_dgemv( lapack_trans_const(opts.transA), &M, &N,
                           &alpha, A, &lda,
                                   X, &incx,
                           &beta,  Y, &incy );
            cpu_time = magma_wtime() - cpu_time;
            cpu_perf = gflops / cpu_time;
            
            /* =====================================================================
               Check the result
               =================================================================== */
            blasf77_daxpy( &Ym, &c_neg_one, Y, &incy, Ymagma, &incy );
            magma_error = lapackf77_dlange( "M", &Ym, &ione, Ymagma, &Ym, work ) / Ym;
            
            blasf77_daxpy( &Ym, &c_neg_one, Y, &incy, Ycublas, &incy );
            cublas_error = lapackf77_dlange( "M", &Ym, &ione, Ycublas, &Ym, work ) / Ym;
            
            printf("%5d %5d   %7.2f (%7.2f)    %7.2f (%7.2f)   %7.2f (%7.2f)    %8.2e     %8.2e   %s\n",
                   (int) M, (int) N,
                   magma_perf,  1000.*magma_time,
                   cublas_perf, 1000.*cublas_time,
                   cpu_perf,    1000.*cpu_time,
                   magma_error, cublas_error,
                   (magma_error < tol && cublas_error < tol ? "ok" : "failed"));
            status += ! (magma_error < tol && cublas_error < tol);
            
            TESTING_FREE_CPU( A );
            TESTING_FREE_CPU( X );
            TESTING_FREE_CPU( Y );
            TESTING_FREE_CPU( Ycublas );
            TESTING_FREE_CPU( Ymagma  );
            
            TESTING_FREE_DEV( dA );
            TESTING_FREE_DEV( dX );
            TESTING_FREE_DEV( dY );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }
    
    TESTING_FINALIZE();
    return status;
}
Example #5
/**
    Purpose
    -------
    DLAHR2 reduces the first NB columns of a real general n-BY-(n-k+1)
    matrix A so that elements below the k-th subdiagonal are zero. The
    reduction is performed by an orthogonal similarity transformation
    Q' * A * Q. The routine returns the matrices V and T which determine
    Q as a block reflector I - V*T*V', and also the matrix Y = A * V.
    (Note this differs from LAPACK, which computes Y = A * V * T.)

    This is an auxiliary routine called by DGEHRD.

    Arguments
    ---------
    @param[in]
    n       INTEGER
            The order of the matrix A.

    @param[in]
    k       INTEGER
            The offset for the reduction. Elements below the k-th
            subdiagonal in the first NB columns are reduced to zero.
            K < N.

    @param[in]
    nb      INTEGER
            The number of columns to be reduced.

    @param[in,out]
    dA      DOUBLE PRECISION array on the GPU, dimension (LDDA,N-K+1)
            On entry, the n-by-(n-k+1) general matrix A.
            On exit, the elements in rows K:N of the first NB columns are
            overwritten with the matrix Y.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).

    @param[out]
    dV      DOUBLE PRECISION array on the GPU, dimension (LDDV, NB)
            On exit this n-by-nb array contains the Householder vectors of the transformation.

    @param[in]
    lddv    INTEGER
            The leading dimension of the array dV.  LDDV >= max(1,N).

    @param[in,out]
    A       DOUBLE PRECISION array, dimension (LDA,N-K+1)
            On entry, the n-by-(n-k+1) general matrix A.
            On exit, the elements on and above the k-th subdiagonal in
            the first NB columns are overwritten with the corresponding
            elements of the reduced matrix; the elements below the k-th
            subdiagonal, with the array TAU, represent the matrix Q as a
            product of elementary reflectors. The other columns of A are
            unchanged. See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    tau     DOUBLE PRECISION array, dimension (NB)
            The scalar factors of the elementary reflectors. See Further
            Details.

    @param[out]
    T       DOUBLE PRECISION array, dimension (LDT,NB)
            The upper triangular matrix T.

    @param[in]
    ldt     INTEGER
            The leading dimension of the array T.  LDT >= NB.

    @param[out]
    Y       DOUBLE PRECISION array, dimension (LDY,NB)
            The n-by-nb matrix Y.

    @param[in]
    ldy     INTEGER
            The leading dimension of the array Y. LDY >= N.

    @param[in]
    queue   magma_queue_t
            Queue to execute in.

    Further Details
    ---------------
    The matrix Q is represented as a product of nb elementary reflectors

       Q = H(1) H(2) . . . H(nb).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in
    A(i+k+1:n,i), and tau in TAU(i).

    The elements of the vectors v together form the (n-k+1)-by-nb matrix
    V which is needed, with T and Y, to apply the transformation to the
    unreduced part of the matrix, using an update of the form:
    A := (I - V*T*V') * (A - Y*T*V').

    The contents of A on exit are illustrated by the following example
    with n = 7, k = 3 and nb = 2:

    @verbatim
       ( a   a   a   a   a )
       ( a   a   a   a   a )
       ( a   a   a   a   a )
       ( h   h   a   a   a )
       ( v1  h   a   a   a )
       ( v1  v2  a   a   a )
       ( v1  v2  a   a   a )
    @endverbatim

    where "a" denotes an element of the original matrix A, h denotes a
    modified element of the upper Hessenberg matrix H, and vi denotes an
    element of the vector defining H(i).

    This implementation follows the hybrid algorithm and notations described in

    S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg
    form through hybrid GPU-based computing," University of Tennessee Computer
    Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219),
    May 24, 2009.

    @ingroup magma_dgeev_aux
    ********************************************************************/
extern "C" magma_int_t
magma_dlahr2(
    magma_int_t n, magma_int_t k, magma_int_t nb,
    magmaDouble_ptr dA, magma_int_t ldda,
    magmaDouble_ptr dV, magma_int_t lddv,
    double *A,     magma_int_t lda,
    double *tau,
    double *T,     magma_int_t ldt,
    double *Y,     magma_int_t ldy,
    magma_queue_t queue )
{
    #define  A(i_,j_) ( A + (i_) + (j_)*lda)
    #define  Y(i_,j_) ( Y + (i_) + (j_)*ldy)
    #define  T(i_,j_) ( T + (i_) + (j_)*ldt)
    #define dA(i_,j_) (dA + (i_) + (j_)*ldda)
    #define dV(i_,j_) (dV + (i_) + (j_)*lddv)
    
    double c_zero    = MAGMA_D_ZERO;
    double c_one     = MAGMA_D_ONE;
    double c_neg_one = MAGMA_D_NEG_ONE;

    magma_int_t ione = 1;
    
    magma_int_t n_k_i_1, n_k;
    double scale;

    magma_int_t i;
    double ei = MAGMA_D_ZERO;

    magma_int_t info = 0;
    if (n < 0) {
        info = -1;
    } else if (k < 0 || k > n) {
        info = -2;
    } else if (nb < 1 || nb > n) {
        info = -3;
    } else if (ldda < max(1,n)) {
        info = -5;
    } else if (lddv < max(1,n)) {
        info = -7;
    } else if (lda < max(1,n)) {
        info = -9;
    } else if (ldt < max(1,nb)) {
        info = -12;
    } else if (ldy < max(1,n)) {
        info = -13;
    }
    if (info != 0) {
        magma_xerbla( __func__, -(info) );
        return info;
    }

    // adjust from 1-based indexing
    k -= 1;

    if (n <= 1)
        return info;
    
    for (i = 0; i < nb; ++i) {
        n_k_i_1 = n - k - i - 1;
        n_k     = n - k;
        
        if (i > 0) {
            // Update A(k:n-1,i); Update i-th column of A - Y * T * V'
            // This updates one more row than LAPACK does (row k),
            // making the block above the panel an even multiple of nb.
            // Use last column of T as workspace, w.
            // w(0:i-1, nb-1) = VA(k+i, 0:i-1)'
            blasf77_dcopy( &i,
                           A(k+i,0),  &lda,
                           T(0,nb-1), &ione );
            #ifdef COMPLEX
            // For complex precisions, conjugate the row of V (no-op in this real version).
            lapackf77_dlacgv(&i, T(0,nb-1), &ione);
            #endif
            
            // w = T(0:i-1, 0:i-1) * w
            blasf77_dtrmv( "Upper", "No trans", "No trans", &i,
                           T(0,0),    &ldt,
                           T(0,nb-1), &ione );
            
            // A(k:n-1, i) -= Y(k:n-1, 0:i-1) * w
            blasf77_dgemv( "No trans", &n_k, &i,
                           &c_neg_one, Y(k,0),    &ldy,
                                       T(0,nb-1), &ione,
                           &c_one,     A(k,i),    &ione );
            
            // Apply I - V * T' * V' to this column (call it b) from the
            // left, using the last column of T as workspace, w.
            //
            // Let  V = ( V1 )   and   b = ( b1 )   (first i-1 rows)
            //          ( V2 )             ( b2 )
            // where V1 is unit lower triangular
            
            // w := b1 = A(k+1:k+i, i)
            blasf77_dcopy( &i,
                           A(k+1,i),  &ione,
                           T(0,nb-1), &ione );
            
            // w := V1' * b1 = VA(k+1:k+i, 0:i-1)' * w
            blasf77_dtrmv( "Lower", "Conj", "Unit", &i,
                           A(k+1,0), &lda,
                           T(0,nb-1), &ione );
            
            // w := w + V2'*b2 = w + VA(k+i+1:n-1, 0:i-1)' * A(k+i+1:n-1, i)
            blasf77_dgemv( "Conj", &n_k_i_1, &i,
                           &c_one, A(k+i+1,0), &lda,
                                   A(k+i+1,i), &ione,
                           &c_one, T(0,nb-1),  &ione );
            
            // w := T'*w = T(0:i-1, 0:i-1)' * w
            blasf77_dtrmv( "Upper", "Conj", "Non-unit", &i,
                           T(0,0), &ldt,
                           T(0,nb-1), &ione );
            
            // b2 := b2 - V2*w = A(k+i+1:n-1, i) - VA(k+i+1:n-1, 0:i-1) * w
            blasf77_dgemv( "No trans", &n_k_i_1, &i,
                           &c_neg_one, A(k+i+1,0), &lda,
                                       T(0,nb-1),  &ione,
                           &c_one,     A(k+i+1,i), &ione );
            
            // w := V1*w = VA(k+1:k+i, 0:i-1) * w
            blasf77_dtrmv( "Lower", "No trans", "Unit", &i,
                           A(k+1,0), &lda,
                           T(0,nb-1), &ione );
            
            // b1 := b1 - w = A(k+1:k+i-1, i) - w
            blasf77_daxpy( &i,
                           &c_neg_one, T(0,nb-1), &ione,
                                       A(k+1,i),  &ione );
            
            // Restore diagonal element, saved below during previous iteration
            *A(k+i,i-1) = ei;
        }
        
        // Generate the elementary reflector H(i) to annihilate A(k+i+1:n-1,i)
        lapackf77_dlarfg( &n_k_i_1,
                          A(k+i+1,i),
                          A(k+i+2,i), &ione, &tau[i] );
        // Save diagonal element and set to one, to simplify multiplying by V
        ei = *A(k+i+1,i);
        *A(k+i+1,i) = c_one;

        // dV(i+1:n-k-1, i) = VA(k+i+1:n-1, i)
        magma_dsetvector( n_k_i_1,
                          A(k+i+1,i), 1,
                          dV(i+1,i),  1, queue );
        
        // Compute Y(k+1:n,i) = A vi
        // dA(k:n-1, i) = dA(k:n-1, i+1:n-k-1) * dV(i+1:n-k-1, i)
        magma_dgemv( MagmaNoTrans, n_k, n_k_i_1,
                     c_one,  dA(k,i+1), ldda,
                             dV(i+1,i), ione,
                     c_zero, dA(k,i),   ione, queue );
        
        // Compute T(0:i,i) = [ -tau T V' vi ]
        //                    [  tau         ]
        // T(0:i-1, i) = -tau VA(k+i+1:n-1, 0:i-1)' VA(k+i+1:n-1, i)
        scale = MAGMA_D_NEGATE( tau[i]);
        blasf77_dgemv( "Conj", &n_k_i_1, &i,
                       &scale,  A(k+i+1,0), &lda,
                                A(k+i+1,i), &ione,
                       &c_zero, T(0,i),     &ione );
        // T(0:i-1, i) = T(0:i-1, 0:i-1) * T(0:i-1, i)
        blasf77_dtrmv( "Upper", "No trans", "Non-unit", &i,
                       T(0,0), &ldt,
                       T(0,i), &ione );
        *T(i,i) = tau[i];

        // Y(k:n-1, i) = dA(k:n-1, i)
        magma_dgetvector( n-k,
                          dA(k,i), 1,
                          Y(k,i),  1, queue );
    }
    // Restore diagonal element
    *A(k+nb,nb-1) = ei;

    return info;
} /* magma_dlahr2 */
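For reference, the relations spelled out in Further Details above can be collected into one compact-WY statement (a restatement of the documentation, not additional behavior):

\[
  Q = H(1)\,H(2)\cdots H(nb) = I - V\,T\,V^{\mathsf T},
  \qquad
  H(i) = I - \tau_i\, v_i v_i^{\mathsf T},
\]
\[
  A \leftarrow \bigl(I - V\,T\,V^{\mathsf T}\bigr)\bigl(A - Y\,T\,V^{\mathsf T}\bigr),
  \qquad Y = A\,V .
\]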
Example #6
extern "C" magma_int_t
magma_zlaqps_gpu(magma_int_t m, magma_int_t n, magma_int_t offset,
                 magma_int_t nb, magma_int_t *kb,
                 magmaDoubleComplex *A,  magma_int_t lda,
                 magma_int_t *jpvt, magmaDoubleComplex *tau,
                 double *vn1, double *vn2,
                 magmaDoubleComplex *auxv,
                 magmaDoubleComplex *F,  magma_int_t ldf)
{
    /*  -- MAGMA (version 1.4.0) --
           Univ. of Tennessee, Knoxville
           Univ. of California, Berkeley
           Univ. of Colorado, Denver
           August 2013

        Purpose
        =======
        ZLAQPS computes a step of QR factorization with column pivoting
        of a complex M-by-N matrix A by using Blas-3.  It tries to factorize
        NB columns from A starting from the row OFFSET+1, and updates all
        of the matrix with Blas-3 xGEMM.

        In some cases, due to catastrophic cancellations, it cannot
        factorize NB columns.  Hence, the actual number of factorized
        columns is returned in KB.

        Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized.

        Arguments
        =========
        M       (input) INTEGER
                The number of rows of the matrix A. M >= 0.

        N       (input) INTEGER
                The number of columns of the matrix A. N >= 0

        OFFSET  (input) INTEGER
                The number of rows of A that have been factorized in
                previous steps.

        NB      (input) INTEGER
                The number of columns to factorize.

        KB      (output) INTEGER
                The number of columns actually factorized.

        A       (input/output) COMPLEX*16 array, dimension (LDA,N)
                On entry, the M-by-N matrix A.
                On exit, block A(OFFSET+1:M,1:KB) is the triangular
                factor obtained and block A(1:OFFSET,1:N) has been
                accordingly pivoted, but not factorized.
                The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has
                been updated.

        LDA     (input) INTEGER
                The leading dimension of the array A. LDA >= max(1,M).

        JPVT    (input/output) INTEGER array, dimension (N)
                JPVT(I) = K <==> Column K of the full matrix A has been
                permuted into position I in AP.

        TAU     (output) COMPLEX*16 array, dimension (KB)
                The scalar factors of the elementary reflectors.

        VN1     (input/output) DOUBLE PRECISION array, dimension (N)
                The vector with the partial column norms.

        VN2     (input/output) DOUBLE PRECISION array, dimension (N)
                The vector with the exact column norms.

        AUXV    (input/output) COMPLEX*16 array, dimension (NB)
                Auxiliary vector.

        F       (input/output) COMPLEX*16 array, dimension (LDF,NB)
                Matrix F' = L*Y'*A.

        LDF     (input) INTEGER
                The leading dimension of the array F. LDF >= max(1,N).

        =====================================================================    */

#define  A(i, j) (A  + (i) + (j)*(lda ))
#define  F(i, j) (F  + (i) + (j)*(ldf ))

    magmaDoubleComplex c_zero    = MAGMA_Z_MAKE( 0.,0.);
    magmaDoubleComplex c_one     = MAGMA_Z_MAKE( 1.,0.);
    magmaDoubleComplex c_neg_one = MAGMA_Z_MAKE(-1.,0.);
    magma_int_t ione = 1;

    magma_int_t i__1, i__2;
    //double d__1;
    magmaDoubleComplex z__1;

    //magma_int_t j;
    magma_int_t k, rk;
    //magmaDoubleComplex Akk;
    magmaDoubleComplex *Aks;
    magmaDoubleComplex tauk;
    magma_int_t pvt;
    //double temp, temp2;
    double tol3z;
    magma_int_t itemp;

    double lsticc, *lsticcs;
    magma_int_t lastrk;
    magma_dmalloc( &lsticcs, 1+256*(n+255)/256 );

    lastrk = min( m, n + offset );
    tol3z = magma_dsqrt( lapackf77_dlamch("Epsilon"));

    lsticc = 0;
    k = 0;
    magma_zmalloc( &Aks, nb );

    while( k < nb && lsticc == 0 ) {
        rk = offset + k;

        /* Determine ith pivot column and swap if necessary */
        // magma_idamax returns a 1-based index (BLAS convention), so subtract 1
        // to obtain the 0-based pivot position.
        pvt = k - 1 + magma_idamax( n-k, &vn1[k], ione );

        if (pvt != k) {

            /*if (pvt >= nb) {
                // 1. Start copy from GPU
                magma_zgetmatrix_async( m - offset - nb, 1,
                                        dA(offset + nb, pvt), ldda,
                                        A (offset + nb, pvt), lda, stream );
            }*/

            /* F gets swapped so F must be sent at the end to GPU   */
            i__1 = k;
            /*if (pvt < nb){
                // no need of transfer if pivot is within the panel
                blasf77_zswap( &m, A(0, pvt), &ione, A(0, k), &ione );
            }
            else {
                // 1. Finish copy from GPU
                magma_queue_sync( stream );

                // 2. Swap as usual on CPU
                blasf77_zswap(&m, A(0, pvt), &ione, A(0, k), &ione);

                // 3. Restore the GPU
                magma_zsetmatrix_async( m - offset - nb, 1,
                                        A (offset + nb, pvt), lda,
                                        dA(offset + nb, pvt), ldda, stream);
            }*/
            magmablas_zswap( m, A(0, pvt), ione, A(0, k), ione );

            //blasf77_zswap( &i__1, F(pvt,0), &ldf, F(k,0), &ldf );
            magmablas_zswap( i__1, F(pvt, 0), ldf, F(k, 0), ldf);
            itemp     = jpvt[pvt];
            jpvt[pvt] = jpvt[k];
            jpvt[k]   = itemp;
            //vn1[pvt] = vn1[k];
            //vn2[pvt] = vn2[k];
#if defined(PRECISION_d) || defined(PRECISION_z)
            //magma_dswap( 1, &vn1[pvt], 1, &vn1[k], 1 );
            //magma_dswap( 1, &vn2[pvt], 1, &vn2[k], 1 );
            magma_dswap( 2, &vn1[pvt], n+offset, &vn1[k], n+offset );
#else
            //magma_sswap( 1, &vn1[pvt], 1, &vn1[k], 1 );
            //magma_sswap( 1, &vn2[pvt], 1, &vn2[k], 1 );
            magma_sswap(2, &vn1[pvt], n+offset, &vn1[k], n+offset);
#endif

        }

        /* Apply previous Householder reflectors to column K:
           A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'.
           Optimization: multiply with beta=0; wait for vector and subtract */
        if (k > 0) {
            /*#if (defined(PRECISION_c) || defined(PRECISION_z))
            for (j = 0; j < k; ++j){
                *F(k,j) = MAGMA_Z_CNJG( *F(k,j) );
            }
            #endif*/

//#define RIGHT_UPDATE
#ifdef RIGHT_UPDATE
            i__1 = m - offset - nb;
            i__2 = k;
            magma_zgemv( MagmaNoTrans, i__1, i__2,
                         c_neg_one, A(offset+nb, 0), lda,
                         F(k,         0), ldf,
                         c_one,     A(offset+nb, k), ione );
#else
            i__1 = m - rk;
            i__2 = k;
            /*blasf77_zgemv( MagmaNoTransStr, &i__1, &i__2,
                           &c_neg_one, A(rk, 0), &lda,
                                       F(k,  0), &ldf,
                           &c_one,     A(rk, k), &ione );*/
            magma_zgemv( MagmaNoTrans, i__1, i__2,
                         c_neg_one, A(rk, 0), lda,
                         F(k,  0), ldf,
                         c_one,     A(rk, k), ione );
#endif

            /*#if (defined(PRECISION_c) || defined(PRECISION_z))
            for (j = 0; j < k; ++j) {
                *F(k,j) = MAGMA_Z_CNJG( *F(k,j) );
            }
            #endif*/
        }

        /*  Generate elementary reflector H(k). */
        magma_zlarfg_gpu(m-rk, A(rk, k), A(rk + 1, k), &tau[k], &vn1[k], &Aks[k]);

        //Akk = *A(rk, k);
        //*A(rk, k) = c_one;
        //magma_zgetvector( 1, &Aks[k],  1, &Akk,     1 );

        /* needed to avoid the race condition */
        if (k == 0) magma_zsetvector(  1,    &c_one,       1, A(rk, k), 1 );
        else        magma_zcopymatrix( 1, 1, A(offset, 0), 1, A(rk, k), 1 );

        /* Compute Kth column of F:
           Compute  F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */
        if (k < n-1 || k > 0) magma_zgetvector( 1, &tau[k], 1, &tauk, 1 );
        if (k < n-1) {
            i__1 = m - rk;
            i__2 = n - k - 1;

            /* Send the vector to the GPU */
            //magma_zsetmatrix( i__1, 1, A(rk, k), lda, dA(rk,k), ldda );

            /* Multiply on GPU */
            // was CALL ZGEMV( 'Conjugate transpose', M-RK+1, N-K,
            //                 TAU( K ), A( RK,  K+1 ), LDA,
            //                           A( RK,  K   ), 1,
            //                 CZERO,    F( K+1, K   ), 1 )
            //magma_zgetvector( 1, &tau[k], 1, &tauk, 1 );
            magma_zgemv( MagmaConjTrans, m-rk, n-k-1,
                         tauk,   A( rk,  k+1 ), lda,
                         A( rk,  k   ), 1,
                         c_zero, F( k+1, k   ), 1 );
            //magma_zscal( m-rk, tau[k], F( k+1, k), 1 );
            //magma_int_t i__3 = nb-k-1;
            //magma_int_t i__4 = i__2 - i__3;
            //magma_int_t i__5 = nb-k;
            //magma_zgemv( MagmaConjTrans, i__1 - i__5, i__2 - i__3,
            //             tau[k], dA(rk +i__5, k+1+i__3), ldda,
            //                     dA(rk +i__5, k       ), ione,
            //             c_zero, dF(k+1+i__3, k       ), ione );

            //magma_zgetmatrix_async( i__2-i__3, 1,
            //                        dF(k + 1 +i__3, k), i__2,
            //                        F (k + 1 +i__3, k), i__2, stream );

            //blasf77_zgemv( MagmaConjTransStr, &i__1, &i__3,
            //               &tau[k], A(rk,  k+1), &lda,
            //                        A(rk,  k  ), &ione,
            //               &c_zero, F(k+1, k  ), &ione );

            //magma_queue_sync( stream );
            //blasf77_zgemv( MagmaConjTransStr, &i__5, &i__4,
            //               &tau[k], A(rk, k+1+i__3), &lda,
            //                        A(rk, k       ), &ione,
            //               &c_one,  F(k+1+i__3, k ), &ione );
        }

        /* Padding F(1:K,K) with zeros.
        for (j = 0; j <= k; ++j) {
            magma_zsetvector( 1, &c_zero, 1, F(j, k), 1 );
        }*/

        /* Incremental updating of F:
           F(1:N,K) := F(1:N,K)                        - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K).
           F(1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K)
                    := tau(K)(A(RK:M,K+1:N)' - F(1:N,1:K-1)*A(RK:M,1:K-1)') A(RK:M,K)
           so, F is (updated A)*V */
        //if (k > 0 && k<n-1) {
        if (k > 0) {
            //magma_zgetvector( 1, &tau[k], 1, &tauk, 1 );
            z__1 = MAGMA_Z_NEGATE( tauk );
#ifdef RIGHT_UPDATE
            i__1 = m - offset - nb;
            i__2 = k;
            magma_zgemv( MagmaConjTrans, i__1, i__2,
                         z__1,   A(offset+nb, 0), lda,
                         A(offset+nb, k), ione,
                         c_zero, auxv, ione );

            i__1 = k;
            magma_zgemv( MagmaNoTrans, n-k-1, i__1,
                         c_one, F(k+1,0), ldf,
                         auxv,     ione,
                         c_one, F(k+1,k), ione );
#else
            i__1 = m - rk;
            i__2 = k;
            //blasf77_zgemv( MagmaConjTransStr, &i__1, &i__2,
            //               &z__1,   A(rk, 0), &lda,
            //                        A(rk, k), &ione,
            //               &c_zero, auxv, &ione );

            magma_zgemv( MagmaConjTrans, i__1, i__2,
                         z__1,   A(rk, 0), lda,
                         A(rk, k), ione,
                         c_zero, auxv, ione );

            //i__1 = k;
            //blasf77_zgemv( MagmaNoTransStr, &n, &i__1,
            //               &c_one, F(0,0), &ldf,
            //                       auxv,   &ione,
            //               &c_one, F(0,k), &ione );
            /*magma_zgemv( MagmaNoTrans, n, i__1,
                           c_one, F(0,0), ldf,
                                  auxv,   ione,
                           c_one, F(0,k), ione );*/
            /* I think we only need the strictly lower-triangular part :) */
            magma_zgemv( MagmaNoTrans, n-k-1, i__2,
                         c_one, F(k+1,0), ldf,
                         auxv,     ione,
                         c_one, F(k+1,k), ione );
#endif
        }

        /* Optimization: On the last iteration start sending F back to the GPU */

        /* Update the current row of A:
           A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'.               */
        if (k < n-1) {
            i__1 = n - k - 1;
            i__2 = k + 1;
            //blasf77_zgemm( MagmaNoTransStr, MagmaConjTransStr, &ione, &i__1, &i__2,
            //               &c_neg_one, A(rk, 0  ), &lda,
            //                           F(k+1,0  ), &ldf,
            //               &c_one,     A(rk, k+1), &lda );
#ifdef RIGHT_UPDATE
            /* right-looking update of rows,                     */
            magma_zgemm( MagmaNoTrans, MagmaConjTrans, nb-k, i__1, ione,
                         c_neg_one, A(rk,  k  ), lda,
                         F(k+1, k  ), ldf,
                         c_one,     A(rk,  k+1), lda );
#else
            /* left-looking update of rows,                     *
             * since F=A'v with original A, so no right-looking */
            magma_zgemm( MagmaNoTrans, MagmaConjTrans, ione, i__1, i__2,
                         c_neg_one, A(rk, 0  ), lda,
                         F(k+1,0  ), ldf,
                         c_one,     A(rk, k+1), lda );
#endif
        }

        /* Update partial column norms. */
        if (rk < min(m, n+offset)-1 ) {
            magmablas_dznrm2_row_check_adjust(n-k-1, tol3z, &vn1[k+1], &vn2[k+1], A(rk,k+1), lda, lsticcs);

            magma_device_sync();
#if defined(PRECISION_d) || defined(PRECISION_z)
            magma_dgetvector( 1, &lsticcs[0], 1, &lsticc, 1 );
#else
            magma_sgetvector( 1, &lsticcs[0], 1, &lsticc, 1 );
#endif
        }


        /*if (rk < lastrk) {
            for (j = k + 1; j < n; ++j) {
                if (vn1[j] != 0.) {
                    // NOTE: The following 4 lines follow from the analysis in
                    //   Lapack Working Note 176.
                    temp = MAGMA_Z_ABS( *A(rk,j) ) / vn1[j];
                    temp = max( 0., ((1. + temp) * (1. - temp)) );

                    d__1 = vn1[j] / vn2[j];
                    temp2 = temp * (d__1 * d__1);

                    if (temp2 <= tol3z) {
                        vn2[j] = (double) lsticc;
                        lsticc = j;
                    } else {
                        vn1[j] *= magma_dsqrt(temp);
                    }
                }
            }
        }*/

        //*A(rk, k) = Akk;
        //magma_zsetvector( 1, &Akk, 1, A(rk, k), 1 );
        //magma_zswap( 1, &Aks[k], 1, A(rk, k), 1 );

        ++k;
    }
    magma_zcopymatrix( 1, k, Aks, 1, A(offset, 0), lda+1 );

    // leave k as the last column done
    --k;
    *kb = k + 1;
    rk = offset + *kb - 1;

    /* Apply the block reflector to the rest of the matrix:
       A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) - A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)'  */
    if (*kb < min(n, m - offset)) {
        i__1 = m - rk - 1;
        i__2 = n - *kb;

        /* Send F to the GPU
        magma_zsetmatrix( i__2, *kb,
                          F (*kb, 0), ldf,
                          dF(*kb, 0), i__2 );*/

        magma_zgemm( MagmaNoTrans, MagmaConjTrans, i__1, i__2, *kb,
                     c_neg_one, A(rk+1, 0  ), lda,
                     F(*kb,  0  ), ldf,
                     c_one,     A(rk+1, *kb), lda );
    }
    /* Recomputation of difficult columns. */
    if( lsticc > 0 ) {
        printf( " -- recompute dnorms --\n" );
        magmablas_dznrm2_check(m-rk-1, n-*kb, A(rk+1,*kb), lda,
                               &vn1[*kb], lsticcs);
#if defined(PRECISION_d) || defined(PRECISION_z)
        magma_dcopymatrix( n-*kb, 1, &vn1[*kb], *kb, &vn2[*kb], *kb);
#else
        magma_scopymatrix( n-*kb, 1, &vn1[*kb], *kb, &vn2[*kb], *kb);
#endif
        /*while( lsticc > 0 ) {
            itemp = (magma_int_t)(vn2[lsticc] >= 0. ? floor(vn2[lsticc] + .5) : -floor(.5 - vn2[lsticc]));
            i__1 = m - rk - 1;
            if (lsticc <= nb)
                vn1[lsticc] = cblas_dznrm2(i__1, A(rk + 1, lsticc), ione);
            else {
                // Where is the data, CPU or GPU ?
                double r1, r2;

                r1 = cblas_dznrm2(nb-k, A(rk + 1, lsticc), ione);
                r2 = magma_dznrm2(m-offset-nb, dA(offset + nb + 1, lsticc), ione);

                vn1[lsticc] = magma_dsqrt(r1*r1+r2*r2);
            }

            // NOTE: The computation of VN1( LSTICC ) relies on the fact that
            //   SNRM2 does not fail on vectors with norm below the value of SQRT(DLAMCH('S'))
            vn2[lsticc] = vn1[lsticc];
            lsticc = itemp;*/
    }
    magma_free(Aks);
    magma_free(lsticcs);

    return MAGMA_SUCCESS;
} /* magma_zlaqps */
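For reference, the partial-column-norm update that magmablas_dznrm2_row_check_adjust performs on the GPU (and that the commented-out CPU loop above implements, following LAPACK Working Note 176) can be written as follows, where v_j = vn1[j] is the running partial norm and w_j = vn2[j] is the norm at the last recomputation:

\[
  v_j \leftarrow v_j\,\sqrt{\max\Bigl(0,\; 1 - \bigl(|A_{rk,j}|/v_j\bigr)^{2}\Bigr)},
  \qquad
  \text{recompute column } j \text{ when }
  \Bigl(1 - \bigl(|A_{rk,j}|/v_j\bigr)^{2}\Bigr)\Bigl(\frac{v_j}{w_j}\Bigr)^{2} \le \mathrm{tol3z} = \sqrt{\varepsilon}.
\]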
Example #7
extern "C" magma_int_t
magma_dcg_merge(
    magma_d_matrix A, magma_d_matrix b, magma_d_matrix *x,
    magma_d_solver_par *solver_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;
    
    // prepare solver feedback
    solver_par->solver = Magma_CGMERGE;
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;
    
    // solver variables
    double alpha, beta, gamma, rho, tmp1, *skp_h=NULL;
    double nom, nom0, betanom, den, nomb;

    // some useful variables
    double c_zero = MAGMA_D_ZERO, c_one = MAGMA_D_ONE;
    magma_int_t dofs = A.num_rows*b.num_cols;

    magma_d_matrix r={Magma_CSR}, d={Magma_CSR}, z={Magma_CSR}, B={Magma_CSR}, C={Magma_CSR};
    double *d1=NULL, *d2=NULL, *skp=NULL;

    // GPU workspace
    CHECK( magma_dvinit( &r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_dvinit( &d, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_dvinit( &z, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    
    CHECK( magma_dmalloc( &d1, dofs*(1) ));
    CHECK( magma_dmalloc( &d2, dofs*(1) ));
    // array for the parameters
    CHECK( magma_dmalloc( &skp, 6 ));
    // skp = [alpha|beta|gamma|rho|tmp1|tmp2]
    
    // solver setup
    magma_dscal( dofs, c_zero, x->dval, 1, queue );                      // x = 0
    //CHECK(  magma_dresidualvec( A, b, *x, &r, nom0, queue));
    magma_dcopy( dofs, b.dval, 1, r.dval, 1, queue );                    // r = b
    magma_dcopy( dofs, r.dval, 1, d.dval, 1, queue );                    // d = r
    nom0 = betanom = magma_dnrm2( dofs, r.dval, 1, queue );
    nom = nom0 * nom0;                                           // nom = r' * r
    CHECK( magma_d_spmv( c_one, A, d, c_zero, z, queue ));              // z = A d
    den = MAGMA_D_ABS( magma_ddot( dofs, d.dval, 1, z.dval, 1, queue ) ); // den = d'* z
    solver_par->init_res = nom0;
    
    nomb = magma_dnrm2( dofs, b.dval, 1, queue );
    if ( nomb == 0.0 ){
        nomb=1.0;
    }       
    
    // array on host for the parameters
    CHECK( magma_dmalloc_cpu( &skp_h, 6 ));
    
    alpha = rho = gamma = tmp1 = c_one;
    beta =  magma_ddot( dofs, r.dval, 1, r.dval, 1, queue );
    skp_h[0]=alpha;
    skp_h[1]=beta;
    skp_h[2]=gamma;
    skp_h[3]=rho;
    skp_h[4]=tmp1;
    skp_h[5]=MAGMA_D_MAKE(nom, 0.0);

    magma_dsetvector( 6, skp_h, 1, skp, 1, queue );

    if( nom0 < solver_par->atol ||
        nom0/nomb < solver_par->rtol ){
        info = MAGMA_SUCCESS;
        goto cleanup;
    }
    solver_par->final_res = solver_par->init_res;
    solver_par->iter_res = solver_par->init_res;
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = (real_Double_t) nom0;
        solver_par->timing[0] = 0.0;
    }
    // check positive definite
    if (den <= 0.0) {
        info = MAGMA_NONSPD; 
        goto cleanup;
    }
    
    //Chronometry
    real_Double_t tempo1, tempo2;
    tempo1 = magma_sync_wtime( queue );

    solver_par->numiter = 0;
    solver_par->spmv_count = 0;
    // start iteration
    do
    {
        solver_par->numiter++;

        // computes SpMV and dot product
        CHECK( magma_dcgmerge_spmv1(  A, d1, d2, d.dval, z.dval, skp, queue ));
        solver_par->spmv_count++;
        // updates x, r, computes scalars and updates d
        CHECK( magma_dcgmerge_xrbeta( dofs, d1, d2, x->dval, r.dval, d.dval, z.dval, skp, queue ));

        // check stopping criterion (asynchronous copy)
        magma_dgetvector( 1 , skp+1, 1, skp_h+1, 1, queue );
        betanom = sqrt(MAGMA_D_ABS(skp_h[1]));

        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        if (  betanom  < solver_par->atol || 
              betanom/nomb < solver_par->rtol ) {
            break;
        }
    }
    while ( solver_par->numiter+1 <= solver_par->maxiter );
    
    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    double residual;
    CHECK(  magma_dresidualvec( A, b, *x, &r, &residual, queue));
    solver_par->iter_res = betanom;
    solver_par->final_res = residual;

    if ( solver_par->numiter < solver_par->maxiter ) {
        info = MAGMA_SUCCESS;
    } else if ( solver_par->init_res > solver_par->final_res ) {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_SLOW_CONVERGENCE;
        if( solver_par->iter_res < solver_par->atol ||
            solver_par->iter_res/solver_par->init_res < solver_par->rtol ){
            info = MAGMA_SUCCESS;
        }
    }
    else {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_DIVERGENCE;
    }
    
cleanup:
    magma_dmfree(&r, queue );
    magma_dmfree(&z, queue );
    magma_dmfree(&d, queue );
    magma_dmfree(&B, queue );
    magma_dmfree(&C, queue );

    magma_free( d1 );
    magma_free( d2 );
    magma_free( skp );
    magma_free_cpu( skp_h );

    solver_par->info = info;
    return info;
}   /* magma_dcg_merge */
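A minimal usage sketch for magma_dcg_merge (not part of the original source). It relies on routines visible elsewhere in these listings (magma_dvinit, magma_dmtransfer, magma_dmfree) plus a few standard MAGMA-sparse helpers (magma_init, magma_queue_create, magma_finalize, and the MatrixMarket reader magma_d_csr_mtx) whose exact signatures can differ between MAGMA versions; the matrix file name and tolerances are placeholders.

#include "magmasparse.h"

int solve_with_cg_merge( const char *mtx_file )
{
    magma_init();
    magma_queue_t queue = NULL;
    magma_queue_create( 0, &queue );

    magma_d_matrix hA = {Magma_CSR}, dA = {Magma_CSR};
    magma_d_matrix b  = {Magma_CSR}, x  = {Magma_CSR};

    // read a CSR matrix from a MatrixMarket file (placeholder file name)
    magma_d_csr_mtx( &hA, mtx_file, queue );
    // move it to the device, as in Example #9
    magma_dmtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue );

    // right-hand side b = 1, initial guess x = 0
    magma_dvinit( &b, Magma_DEV, hA.num_rows, 1, MAGMA_D_ONE,  queue );
    magma_dvinit( &x, Magma_DEV, hA.num_rows, 1, MAGMA_D_ZERO, queue );

    magma_d_solver_par solver_par;
    solver_par.maxiter = 1000;
    solver_par.rtol    = 1e-10;
    solver_par.atol    = 0.0;
    solver_par.verbose = 0;   // keep 0 so res_vec/timing need not be allocated

    magma_int_t info = magma_dcg_merge( dA, b, &x, &solver_par, queue );

    magma_dmfree( &hA, queue );  magma_dmfree( &dA, queue );
    magma_dmfree( &b,  queue );  magma_dmfree( &x,  queue );
    magma_queue_destroy( queue );
    magma_finalize();
    return (int) info;
}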
Example #8
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing dgeqrf
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    real_Double_t    gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0;
    double           error, work[1];
    double  c_neg_one = MAGMA_D_NEG_ONE;
    double *h_A, *d_A, *h_R, *tau, *dT, *h_work, tmp[1];
    magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn, nb, size;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1}, ISEED2[4];
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
 
    magma_int_t status = 0;
    double tol;
    opts.lapack |= (opts.version == 2 && opts.check == 2);  // check (-c2) implies lapack (-l)

    if ( opts.version != 2 && opts.check == 1 ) {
        printf( "  ===================================================================\n"
                "  NOTE: -c check for this version will be wrong\n"
                "  because tester ignores the special structure of MAGMA dgeqrf resuls.\n"
                "  We reset it to -c2.\n" 
                "  ===================================================================\n\n");
        opts.check = 2;
    }
    if ( opts.version == 2 ) {
        if ( opts.check == 1 ) {
            printf("  M     N     CPU GFlop/s (sec)   GPU GFlop/s (sec)   ||R-Q'A||_1 / (M*||A||_1*eps) ||I-Q'Q||_1 / (M*eps)\n");
            printf("=========================================================================================================\n");
        } else {
            printf("  M     N     CPU GFlop/s (sec)   GPU GFlop/s (sec)   ||R||_F / ||A||_F\n");
            printf("=======================================================================\n");
        }
        tol = 1.0;
    } else {
        printf("  M     N     CPU GFlop/s (sec)   GPU GFlop/s (sec)   ||Ax-b||_F/(N*||A||_F*||x||_F)\n");
        printf("====================================================================================\n");
        tol = opts.tolerance * lapackf77_dlamch("E");
    }
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[itest];
            N = opts.nsize[itest];
            min_mn = min(M, N);
            lda    = M;
            n2     = lda*N;
            ldda   = ((M+31)/32)*32;
            gflops = FLOPS_DGEQRF( M, N ) / 1e9;
            
            // query for workspace size
            lwork = -1;
            lapackf77_dgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info);
            lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] );
            
            TESTING_MALLOC_CPU( tau,    double, min_mn );
            TESTING_MALLOC_CPU( h_A,    double, n2     );
            TESTING_MALLOC_CPU( h_work, double, lwork  );
            
            TESTING_MALLOC_PIN( h_R,    double, n2     );
            
            TESTING_MALLOC_DEV( d_A,    double, ldda*N );
            
            /* Initialize the matrix */
            for ( int j=0; j<4; j++ ) ISEED2[j] = ISEED[j]; // saving seeds
            lapackf77_dlarnv( &ione, ISEED, &n2, h_A );
            lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda );
            magma_dsetmatrix( M, N, h_R, lda, d_A, ldda );
            
            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            gpu_time = magma_wtime();
            if ( opts.version == 2 ) {
                magma_dgeqrf2_gpu( M, N, d_A, ldda, tau, &info);
            }
            else {
                nb = magma_get_dgeqrf_nb( M );
                size = (2*min(M, N) + (N+31)/32*32 )*nb;
                TESTING_MALLOC_DEV( dT, double, size );
                if ( opts.version == 3 ) {
                    magma_dgeqrf3_gpu( M, N, d_A, ldda, tau, dT, &info);
                }
                else {
                    magma_dgeqrf_gpu( M, N, d_A, ldda, tau, dT, &info);
                }
            }
            gpu_time = magma_wtime() - gpu_time;
            gpu_perf = gflops / gpu_time;
            if (info != 0)
                printf("magma_dgeqrf returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            
            if ( opts.lapack ) {
                /* =====================================================================
                   Performs operation using LAPACK
                   =================================================================== */
                double *tau2;
                TESTING_MALLOC_CPU( tau2, double, min_mn );
                cpu_time = magma_wtime();
                lapackf77_dgeqrf(&M, &N, h_A, &lda, tau2, h_work, &lwork, &info);
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
                if (info != 0)
                    printf("lapackf77_dgeqrf returned error %d: %s.\n",
                           (int) info, magma_strerror( info ));
                TESTING_FREE_CPU( tau2 );
            }

            if ( opts.check == 1 ) {
                /* =====================================================================
                   Check the result 
                   =================================================================== */
                magma_int_t lwork = n2+N;
                double *h_W1, *h_W2, *h_W3;
                double *h_RW, results[2];
                magma_dgetmatrix( M, N, d_A, ldda, h_R, M );

                TESTING_MALLOC_CPU( h_W1, double, n2    ); // Q
                TESTING_MALLOC_CPU( h_W2, double, n2    ); // R
                TESTING_MALLOC_CPU( h_W3, double, lwork ); // WORK
                TESTING_MALLOC_CPU( h_RW, double, M );  // RWORK
                lapackf77_dlarnv( &ione, ISEED2, &n2, h_A );
                lapackf77_dqrt02( &M, &N, &min_mn, h_A, h_R, h_W1, h_W2, &lda, tau, h_W3, &lwork,
                                  h_RW, results );

                if ( opts.lapack ) {
                    printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)   %8.2e                      %8.2e",
                           (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, results[0], results[1] );
                } else {
                    printf("%5d %5d     ---   (  ---  )   %7.2f (%7.2f)    %8.2e                      %8.2e",
                           (int) M, (int) N, gpu_perf, gpu_time, results[0], results[1] );
                } 
                // todo also check results[1] < tol?
                printf("   %s\n", (results[0] < tol ? "ok" : "failed"));
                status += ! (results[0] < tol);
            
                TESTING_FREE_CPU( h_W1 );
                TESTING_FREE_CPU( h_W2 );
                TESTING_FREE_CPU( h_W3 );
                TESTING_FREE_CPU( h_RW );
            }
            else if ( opts.check == 2 ) {
                if ( opts.version == 2 ) {
                    /* =====================================================================
                       Check the result compared to LAPACK
                       =================================================================== */
                    magma_dgetmatrix( M, N, d_A, ldda, h_R, M );
                    error = lapackf77_dlange("f", &M, &N, h_A, &lda, work);
                    blasf77_daxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione);
                    error = lapackf77_dlange("f", &M, &N, h_R, &lda, work) / error;

                    if ( opts.lapack ) {
                        printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)   %8.2e",
                               (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error );
                    } else {
                        printf("%5d %5d     ---   (  ---  )   %7.2f (%7.2f)   %8.2e",
                               (int) M, (int) N, gpu_perf, gpu_time, error );
                    }
                    printf("   %s\n", (error < tol ? "ok" : "failed"));
                    status += ! (error < tol);
                }
                else if ( M >= N ) {
                    magma_int_t lwork;
                    double *x, *b, *d_B, *hwork;
                    const double c_zero    = MAGMA_D_ZERO;
                    const double c_one     = MAGMA_D_ONE;
                    const double c_neg_one = MAGMA_D_NEG_ONE;
                    const magma_int_t ione = 1;

                    // initialize RHS, b = A*random
                    TESTING_MALLOC_CPU( x, double, N );
                    TESTING_MALLOC_CPU( b, double, M );
                    lapackf77_dlarnv( &ione, ISEED, &N, x );
                    blasf77_dgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_zero, b, &ione );
                    // copy to GPU
                    TESTING_MALLOC_DEV( d_B, double, M );
                    magma_dsetvector( M, b, 1, d_B, 1 );

                    if ( opts.version == 1 ) {
                        // allocate hwork
                        magma_dgeqrs_gpu( M, N, 1,
                                          d_A, ldda, tau, dT,
                                          d_B, M, tmp, -1, &info );
                        lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] );
                        TESTING_MALLOC_CPU( hwork, double, lwork );

                        // solve linear system
                        magma_dgeqrs_gpu( M, N, 1,
                                          d_A, ldda, tau, dT,
                                          d_B, M, hwork, lwork, &info );
                       if (info != 0)
                           printf("magma_dgeqrs returned error %d: %s.\n",
                                  (int) info, magma_strerror( info ));
                        TESTING_FREE_CPU( hwork );
                    }
                    else {
                        // allocate hwork
                        magma_dgeqrs3_gpu( M, N, 1,
                                           d_A, ldda, tau, dT,
                                           d_B, M, tmp, -1, &info );
                        lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] );
                        TESTING_MALLOC_CPU( hwork, double, lwork );

                        // solve linear system
                        magma_dgeqrs3_gpu( M, N, 1,
                                           d_A, ldda, tau, dT,
                                           d_B, M, hwork, lwork, &info );
                       if (info != 0)
                           printf("magma_dgeqrs3 returned error %d: %s.\n",
                                  (int) info, magma_strerror( info ));
                        TESTING_FREE_CPU( hwork );
                    }
                    magma_dgetvector( N, d_B, 1, x, 1 );

                    // compute r = Ax - b, saved in b
                    lapackf77_dlarnv( &ione, ISEED2, &n2, h_A );
                    blasf77_dgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_neg_one, b, &ione );

                    // compute residual |Ax - b| / (n*|A|*|x|)
                    double norm_x, norm_A, norm_r, work[1];
                    norm_A = lapackf77_dlange( "F", &M, &N, h_A, &lda, work );
                    norm_r = lapackf77_dlange( "F", &M, &ione, b, &M, work );
                    norm_x = lapackf77_dlange( "F", &N, &ione, x, &N, work );

                    TESTING_FREE_CPU( x );
                    TESTING_FREE_CPU( b );
                    TESTING_FREE_DEV( d_B );

                    error = norm_r / (N * norm_A * norm_x);
                    if ( opts.lapack ) {
                        printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)   %8.2e",
                               (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error );
                    } else {
                        printf("%5d %5d     ---   (  ---  )   %7.2f (%7.2f)   %8.2e",
                               (int) M, (int) N, gpu_perf, gpu_time, error );
                    }
                    printf("   %s\n", (error < tol ? "ok" : "failed"));
                    status += ! (error < tol);
                }
                else {
                    if ( opts.lapack ) {
                        printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)   --- ",
                               (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time );
                    } else {
                        printf("%5d %5d     ---   (  ---  )   %7.2f (%7.2f)     --- ",
                               (int) M, (int) N, gpu_perf, gpu_time);
                    }
                    printf("%s\n", (opts.check != 0 ? "  (error check only for M >= N)" : ""));
                }
            }
            else {
                if ( opts.lapack ) {
                    printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)   ---\n",
                           (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time );
                } else {
                    printf("%5d %5d     ---   (  ---  )   %7.2f (%7.2f)     ---  \n",
                           (int) M, (int) N, gpu_perf, gpu_time);
                }

            }
            
            TESTING_FREE_CPU( tau    );
            TESTING_FREE_CPU( h_A    );
            TESTING_FREE_CPU( h_work );
            
            TESTING_FREE_PIN( h_R );
            
            TESTING_FREE_DEV( d_A );
            
            if ( opts.version != 2 )
                TESTING_FREE_DEV( dT );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }
    
    TESTING_FINALIZE();
    return status;
}
Example #9
extern "C" magma_int_t
magma_dmtransfer(
    magma_d_matrix A,
    magma_d_matrix *B,
    magma_location_t src,
    magma_location_t dst,
    magma_queue_t queue )
{
    magma_int_t info = 0;
    
    B->val = NULL;
    B->diag = NULL;
    B->row = NULL;
    B->rowidx = NULL;
    B->col = NULL;
    B->blockinfo = NULL;
    B->dval = NULL;
    B->ddiag = NULL;
    B->drow = NULL;
    B->drowidx = NULL;
    B->dcol = NULL;
    B->list = NULL;
    B->dlist = NULL;

    // first case: copy matrix from host to device
    if ( src == Magma_CPU && dst == Magma_DEV ) {
        //CSR-type
        if ( A.storage_type == Magma_CSR || A.storage_type == Magma_CUCSR
                                        || A.storage_type == Magma_CSC
                                        || A.storage_type == Magma_CSRD
                                        || A.storage_type == Magma_CSRL
                                        || A.storage_type == Magma_CSRU ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, A.nnz ));
            CHECK( magma_index_malloc( &B->drow, A.num_rows + 1 ));
            CHECK( magma_index_malloc( &B->dcol, A.nnz ));
            // data transfer
            magma_dsetvector( A.nnz, A.val, 1, B->dval, 1, queue );
            magma_index_setvector( A.num_rows + 1, A.row, 1, B->drow, 1, queue );
            magma_index_setvector( A.nnz, A.col, 1, B->dcol, 1, queue );
        }
        //COO-type
        else if ( A.storage_type == Magma_COO ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, A.nnz ));
            CHECK( magma_index_malloc( &B->dcol, A.nnz ));
            CHECK( magma_index_malloc( &B->drowidx, A.nnz ));
            // data transfer
            magma_dsetvector( A.nnz, A.val, 1, B->dval, 1, queue );
            magma_index_setvector( A.nnz, A.col, 1, B->dcol, 1, queue );
            magma_index_setvector( A.nnz, A.rowidx, 1, B->drowidx, 1, queue );
        }
        //CSRCOO-type
        else if ( A.storage_type == Magma_CSRCOO ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, A.nnz ));
            CHECK( magma_index_malloc( &B->drow, A.num_rows + 1 ));
            CHECK( magma_index_malloc( &B->dcol, A.nnz ));
            CHECK( magma_index_malloc( &B->drowidx, A.nnz ));
            // data transfer
            magma_dsetvector( A.nnz, A.val, 1, B->dval, 1, queue );
            magma_index_setvector( A.num_rows + 1, A.row, 1, B->drow, 1, queue );
            magma_index_setvector( A.nnz, A.col, 1, B->dcol, 1, queue );
            magma_index_setvector( A.nnz, A.rowidx, 1, B->drowidx, 1, queue );
        }
        //ELL/ELLPACKT-type
        else if ( A.storage_type == Magma_ELLPACKT || A.storage_type == Magma_ELL ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, A.num_rows * A.max_nnz_row ));
            CHECK( magma_index_malloc( &B->dcol, A.num_rows * A.max_nnz_row ));
            // data transfer
            magma_dsetvector( A.num_rows * A.max_nnz_row, A.val, 1, B->dval, 1, queue );
            magma_index_setvector( A.num_rows * A.max_nnz_row, A.col, 1, B->dcol, 1, queue );
        }
        //ELLD-type
        else if ( A.storage_type == Magma_ELLD ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, A.num_rows * A.max_nnz_row ));
            CHECK( magma_index_malloc( &B->dcol, A.num_rows * A.max_nnz_row ));
            // data transfer
            magma_dsetvector( A.num_rows * A.max_nnz_row, A.val, 1, B->dval, 1, queue );
            magma_index_setvector( A.num_rows * A.max_nnz_row, A.col, 1, B->dcol, 1, queue );
        }
        //ELLRT-type
        else if ( A.storage_type == Magma_ELLRT ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            B->blocksize = A.blocksize;
            B->alignment = A.alignment;
            magma_int_t rowlength = magma_roundup( A.max_nnz_row, A.alignment );
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, A.num_rows * rowlength ));
            CHECK( magma_index_malloc( &B->dcol, A.num_rows * rowlength ));
            CHECK( magma_index_malloc( &B->drow, A.num_rows ));
            // data transfer
            magma_dsetvector( A.num_rows * rowlength, A.val, 1, B->dval, 1, queue );
            magma_index_setvector( A.num_rows * rowlength, A.col, 1, B->dcol, 1, queue );
            magma_index_setvector( A.num_rows, A.row, 1, B->drow, 1, queue );
        }
        //SELLP-type
        else if ( A.storage_type == Magma_SELLP ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            B->blocksize = A.blocksize;
            B->numblocks = A.numblocks;
            B->alignment = A.alignment;
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, A.nnz ));
            CHECK( magma_index_malloc( &B->dcol, A.nnz ));
            CHECK( magma_index_malloc( &B->drow, A.numblocks + 1 ));
            // data transfer
            magma_dsetvector( A.nnz, A.val, 1, B->dval, 1, queue );
            magma_index_setvector( A.nnz, A.col, 1, B->dcol, 1, queue );
            magma_index_setvector( A.numblocks + 1, A.row, 1, B->drow, 1, queue );
        }
        //BCSR-type
        else if ( A.storage_type == Magma_BCSR ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            B->blocksize = A.blocksize;
            B->numblocks = A.numblocks;
            B->alignment = A.alignment;
            magma_int_t size_b = A.blocksize;
            //magma_int_t c_blocks = ceil( (float)A.num_cols / (float)size_b );
                    // max number of blocks per row
            //magma_int_t r_blocks = ceil( (float)A.num_rows / (float)size_b );
            magma_int_t r_blocks = magma_ceildiv( A.num_rows, size_b );
                    // max number of blocks per column
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, size_b * size_b * A.numblocks ));
            CHECK( magma_index_malloc( &B->drow, r_blocks + 1 ));
            CHECK( magma_index_malloc( &B->dcol, A.numblocks ));
            // data transfer
            magma_dsetvector( size_b * size_b * A.numblocks, A.val, 1, B->dval, 1, queue );
            magma_index_setvector( r_blocks + 1, A.row, 1, B->drow, 1, queue );
            magma_index_setvector( A.numblocks, A.col, 1, B->dcol, 1, queue );
        }
        //DENSE-type
        else if ( A.storage_type == Magma_DENSE ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            B->major = A.major;
            B->ld = A.ld;
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, A.num_rows * A.num_cols ));
            // data transfer
            magma_dsetvector( A.num_rows * A.num_cols, A.val, 1, B->dval, 1, queue );
        }
    }

    // second case: copy matrix from host to host
    else if ( src == Magma_CPU && dst == Magma_CPU ) {
        //CSR-type
        if ( A.storage_type == Magma_CSR || A.storage_type == Magma_CUCSR
                                        || A.storage_type == Magma_CSC
                                        || A.storage_type == Magma_CSRD
                                        || A.storage_type == Magma_CSRL
                                        || A.storage_type == Magma_CSRU ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, A.nnz ));
            CHECK( magma_index_malloc_cpu( &B->row, A.num_rows + 1 ));
            CHECK( magma_index_malloc_cpu( &B->col, A.nnz ));
            // data transfer
            for( magma_int_t i=0; i<A.nnz; i++ ) {
                B->val[i] = A.val[i];
                B->col[i] = A.col[i];
            }
            for( magma_int_t i=0; i<A.num_rows+1; i++ ) {
                B->row[i] = A.row[i];
            }
        }
        //COO-type
        else if ( A.storage_type == Magma_COO ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, A.nnz ));
            CHECK( magma_index_malloc_cpu( &B->col, A.nnz ));
            CHECK( magma_index_malloc_cpu( &B->rowidx, A.nnz ));
            // data transfer
            for( magma_int_t i=0; i<A.nnz; i++ ) {
                B->val[i] = A.val[i];
                B->col[i] = A.col[i];
                B->rowidx[i] = A.rowidx[i];
            }
        }
        //CSRCOO-type
        else if ( A.storage_type == Magma_CSRCOO ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, A.nnz ));
            CHECK( magma_index_malloc_cpu( &B->row, A.num_rows + 1 ));
            CHECK( magma_index_malloc_cpu( &B->col, A.nnz ));
            CHECK( magma_index_malloc_cpu( &B->rowidx, A.nnz ));
            // data transfer
            for( magma_int_t i=0; i<A.nnz; i++ ) {
                B->val[i] = A.val[i];
                B->col[i] = A.col[i];
                B->rowidx[i] = A.rowidx[i];
            }
            for( magma_int_t i=0; i<A.num_rows+1; i++ ) {
                B->row[i] = A.row[i];
            }
        }
        //ELL/ELLPACKT-type
        else if ( A.storage_type == Magma_ELLPACKT || A.storage_type == Magma_ELL ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, A.num_rows * A.max_nnz_row ));
            CHECK( magma_index_malloc_cpu( &B->col, A.num_rows * A.max_nnz_row ));
            // data transfer
            for( magma_int_t i=0; i<A.num_rows*A.max_nnz_row; i++ ) {
                B->val[i] = A.val[i];
                B->col[i] = A.col[i];
            }
        }
        //ELLD-type
        else if ( A.storage_type == Magma_ELLD ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, A.num_rows * A.max_nnz_row ));
            CHECK( magma_index_malloc_cpu( &B->col, A.num_rows * A.max_nnz_row ));
            // data transfer
            for( magma_int_t i=0; i<A.num_rows*A.max_nnz_row; i++ ) {
                B->val[i] = A.val[i];
                B->col[i] = A.col[i];
            }
        }
        //ELLRT-type
        else if ( A.storage_type == Magma_ELLRT ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            B->blocksize = A.blocksize;
            B->alignment = A.alignment;
            //int threads_per_row = A.alignment;
            //int rowlength = magma_roundup( A.max_nnz_row, threads_per_row );
            magma_int_t rowlength = magma_roundup( A.max_nnz_row, A.alignment );
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, rowlength * A.num_rows ));
            CHECK( magma_index_malloc_cpu( &B->row, A.num_rows ));
            CHECK( magma_index_malloc_cpu( &B->col, rowlength * A.num_rows ));
            // data transfer
            for( magma_int_t i=0; i<A.num_rows*rowlength; i++ ) {
                B->val[i] = A.val[i];
                B->col[i] = A.col[i];
            }
            for( magma_int_t i=0; i<A.num_rows; i++ ) {
                B->row[i] = A.row[i];
            }
        }
        //SELLP-type
        else if (  A.storage_type == Magma_SELLP ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            B->blocksize = A.blocksize;
            B->alignment = A.alignment;
            B->numblocks = A.numblocks;
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, A.nnz ));
            CHECK( magma_index_malloc_cpu( &B->col, A.nnz ));
            CHECK( magma_index_malloc_cpu( &B->row, A.numblocks + 1 ));
            // data transfer
            for( magma_int_t i=0; i<A.nnz; i++ ) {
                B->val[i] = A.val[i];
                B->col[i] = A.col[i];
            }
            for( magma_int_t i=0; i<A.numblocks+1; i++ ) {
                B->row[i] = A.row[i];
            }
        }
        //BCSR-type
        else if ( A.storage_type == Magma_BCSR ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            B->blocksize = A.blocksize;
            B->numblocks = A.numblocks;
            B->alignment = A.alignment;
            magma_int_t size_b = A.blocksize;
            //magma_int_t c_blocks = ceil( (float)A.num_cols / (float)size_b );
                    // max number of blocks per row
            //magma_int_t r_blocks = ceil( (float)A.num_rows / (float)size_b );
            magma_int_t r_blocks = magma_ceildiv( A.num_rows, size_b );
                    // max number of blocks per column
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, size_b * size_b * A.numblocks ));
            CHECK( magma_index_malloc_cpu( &B->row, r_blocks + 1 ));
            CHECK( magma_index_malloc_cpu( &B->col, A.numblocks ));
            // data transfer (host copy: use the CPU-side val/row/col arrays)
            for( magma_int_t i=0; i<size_b*size_b*A.numblocks; i++ ) {
                B->val[i] = A.val[i];
            }
            for( magma_int_t i=0; i<r_blocks+1; i++ ) {
                B->row[i] = A.row[i];
            }
            for( magma_int_t i=0; i<A.numblocks; i++ ) {
                B->col[i] = A.col[i];
            }
        }
        //DENSE-type
        else if ( A.storage_type == Magma_DENSE ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            B->major = A.major;
            B->ld = A.ld;
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, A.num_rows * A.num_cols ));
            // data transfer
            for( magma_int_t i=0; i<A.num_rows*A.num_cols; i++ ) {
                B->val[i] = A.val[i];
            }
        }
    }

    // third case: copy matrix from device to host
    else if ( src == Magma_DEV && dst == Magma_CPU ) {
        //CSR-type
        if ( A.storage_type == Magma_CSR || A.storage_type == Magma_CUCSR
                                        || A.storage_type == Magma_CSC
                                        || A.storage_type == Magma_CSRD
                                        || A.storage_type == Magma_CSRL
                                        || A.storage_type == Magma_CSRU ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, A.nnz ));
            CHECK( magma_index_malloc_cpu( &B->row, A.num_rows + 1 ));
            CHECK( magma_index_malloc_cpu( &B->col, A.nnz ));
            // data transfer
            magma_dgetvector( A.nnz, A.dval, 1, B->val, 1, queue );
            magma_index_getvector( A.num_rows + 1, A.drow, 1, B->row, 1, queue );
            magma_index_getvector( A.nnz, A.dcol, 1, B->col, 1, queue );
        }
        //COO-type
        else if ( A.storage_type == Magma_COO ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, A.nnz ));
            CHECK( magma_index_malloc_cpu( &B->col, A.nnz ));
            CHECK( magma_index_malloc_cpu( &B->rowidx, A.nnz ));
            // data transfer
            magma_dgetvector( A.nnz, A.dval, 1, B->val, 1, queue );
            magma_index_getvector( A.nnz, A.dcol, 1, B->col, 1, queue );
            magma_index_getvector( A.nnz, A.drowidx, 1, B->rowidx, 1, queue );
        }
        //CSRCOO-type
        else if ( A.storage_type == Magma_CSRCOO ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, A.nnz ));
            CHECK( magma_index_malloc_cpu( &B->row, A.num_rows + 1 ));
            CHECK( magma_index_malloc_cpu( &B->col, A.nnz ));
            CHECK( magma_index_malloc_cpu( &B->rowidx, A.nnz ));
            // data transfer
            magma_dgetvector( A.nnz, A.dval, 1, B->val, 1, queue );
            magma_index_getvector( A.num_rows + 1, A.drow, 1, B->row, 1, queue );
            magma_index_getvector( A.nnz, A.dcol, 1, B->col, 1, queue );
            magma_index_getvector( A.nnz, A.drowidx, 1, B->rowidx, 1, queue );
        }
        //ELL/ELLPACKT-type
        else if ( A.storage_type == Magma_ELLPACKT || A.storage_type == Magma_ELL ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, A.num_rows * A.max_nnz_row ));
            CHECK( magma_index_malloc_cpu( &B->col, A.num_rows * A.max_nnz_row ));
            // data transfer
            magma_dgetvector( A.num_rows * A.max_nnz_row, A.dval, 1, B->val, 1, queue );
            magma_index_getvector( A.num_rows * A.max_nnz_row, A.dcol, 1, B->col, 1, queue );
        }
        //ELLD-type
        else if (  A.storage_type == Magma_ELLD ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, A.num_rows * A.max_nnz_row ));
            CHECK( magma_index_malloc_cpu( &B->col, A.num_rows * A.max_nnz_row ));
            // data transfer
            magma_dgetvector( A.num_rows * A.max_nnz_row, A.dval, 1, B->val, 1, queue );
            magma_index_getvector( A.num_rows * A.max_nnz_row, A.dcol, 1, B->col, 1, queue );
        }
        //ELLRT-type
        else if ( A.storage_type == Magma_ELLRT ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            B->blocksize = A.blocksize;
            B->alignment = A.alignment;
            //int threads_per_row = A.alignment;
            //int rowlength = magma_roundup( A.max_nnz_row, threads_per_row );
            magma_int_t rowlength = magma_roundup( A.max_nnz_row, A.alignment );
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, rowlength * A.num_rows ));
            CHECK( magma_index_malloc_cpu( &B->row, A.num_rows ));
            CHECK( magma_index_malloc_cpu( &B->col, rowlength * A.num_rows ));
            // data transfer
            magma_dgetvector( A.num_rows * rowlength, A.dval, 1, B->val, 1, queue );
            magma_index_getvector( A.num_rows * rowlength, A.dcol, 1, B->col, 1, queue );
            magma_index_getvector( A.num_rows, A.drow, 1, B->row, 1, queue );
        }
        //SELLP-type
        else if ( A.storage_type == Magma_SELLP ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            B->blocksize = A.blocksize;
            B->numblocks = A.numblocks;
            B->alignment = A.alignment;
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, A.nnz ));
            CHECK( magma_index_malloc_cpu( &B->col, A.nnz ));
            CHECK( magma_index_malloc_cpu( &B->row, A.numblocks + 1 ));
            // data transfer
            magma_dgetvector( A.nnz, A.dval, 1, B->val, 1, queue );
            magma_index_getvector( A.nnz, A.dcol, 1, B->col, 1, queue );
            magma_index_getvector( A.numblocks + 1, A.drow, 1, B->row, 1, queue );
        }
        //BCSR-type
        else if ( A.storage_type == Magma_BCSR ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            B->blocksize = A.blocksize;
            B->numblocks = A.numblocks;
            B->alignment = A.alignment;
            magma_int_t size_b = A.blocksize;
            //magma_int_t c_blocks = ceil( (float)A.num_cols / (float)size_b );
                    // max number of blocks per row
            //magma_int_t r_blocks = ceil( (float)A.num_rows / (float)size_b );
            magma_int_t r_blocks = magma_ceildiv( A.num_rows, size_b );
                    // max number of blocks per column
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, size_b * size_b * A.numblocks ));
            CHECK( magma_index_malloc_cpu( &B->row, r_blocks + 1 ));
            CHECK( magma_index_malloc_cpu( &B->col, A.numblocks ));
            // data transfer
            magma_dgetvector( size_b * size_b * A.numblocks, A.dval, 1, B->val, 1, queue );
            magma_index_getvector( r_blocks + 1, A.drow, 1, B->row, 1, queue );
            magma_index_getvector( A.numblocks, A.dcol, 1, B->col, 1, queue );
        }
        //DENSE-type
        else if ( A.storage_type == Magma_DENSE ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            B->major = A.major;
            B->ld = A.ld;
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, A.num_rows * A.num_cols ));
            // data transfer
            magma_dgetvector( A.num_rows * A.num_cols, A.dval, 1, B->val, 1, queue );
        }
    }

    // fourth case: copy matrix from device to device
    else if ( src == Magma_DEV && dst == Magma_DEV ) {
        //CSR-type
        if ( A.storage_type == Magma_CSR || A.storage_type == Magma_CUCSR
                                        || A.storage_type == Magma_CSC
                                        || A.storage_type == Magma_CSRD
                                        || A.storage_type == Magma_CSRL
                                        || A.storage_type == Magma_CSRU ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, A.nnz ));
            CHECK( magma_index_malloc( &B->drow, A.num_rows + 1 ));
            CHECK( magma_index_malloc( &B->dcol, A.nnz ));
            // data transfer
            magma_dcopyvector( A.nnz, A.dval, 1, B->dval, 1, queue );
            magma_index_copyvector( A.num_rows + 1, A.drow, 1, B->drow, 1, queue );
            magma_index_copyvector( A.nnz, A.dcol, 1, B->dcol, 1, queue );
        }
        //COO-type
        else if ( A.storage_type == Magma_COO ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, A.nnz ));
            CHECK( magma_index_malloc( &B->dcol, A.nnz ));
            CHECK( magma_index_malloc( &B->drowidx, A.nnz ));
            // data transfer
            magma_dcopyvector( A.nnz, A.dval, 1, B->dval, 1, queue );
            magma_index_copyvector( A.nnz, A.dcol, 1, B->dcol, 1, queue );
            magma_index_copyvector( A.nnz, A.drowidx, 1, B->drowidx, 1, queue );
        }
        //CSRCOO-type
        else if ( A.storage_type == Magma_CSRCOO ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, A.nnz ));
            CHECK( magma_index_malloc( &B->drow, A.num_rows + 1 ));
            CHECK( magma_index_malloc( &B->dcol, A.nnz ));
            CHECK( magma_index_malloc( &B->drowidx, A.nnz ));
            // data transfer
            magma_dcopyvector( A.nnz, A.dval, 1, B->dval, 1, queue );
            magma_index_copyvector( A.num_rows + 1, A.drow, 1, B->drow, 1, queue );
            magma_index_copyvector( A.nnz, A.dcol, 1, B->dcol, 1, queue );
            magma_index_copyvector( A.nnz, A.drowidx, 1, B->drowidx, 1, queue );
        }
        //ELL/ELLPACKT-type
        else if ( A.storage_type == Magma_ELLPACKT || A.storage_type == Magma_ELL ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, A.num_rows * A.max_nnz_row ));
            CHECK( magma_index_malloc( &B->dcol, A.num_rows * A.max_nnz_row ));
            // data transfer
            magma_dcopyvector( A.num_rows * A.max_nnz_row, A.dval, 1, B->dval, 1, queue );
            magma_index_copyvector( A.num_rows * A.max_nnz_row, A.dcol, 1, B->dcol, 1, queue );
        }
        //ELLD-type
        else if ( A.storage_type == Magma_ELLD ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, A.num_rows * A.max_nnz_row ));
            CHECK( magma_index_malloc( &B->dcol, A.num_rows * A.max_nnz_row ));
            // data transfer
            magma_dcopyvector( A.num_rows * A.max_nnz_row, A.dval, 1, B->dval, 1, queue );
            magma_index_copyvector( A.num_rows * A.max_nnz_row, A.dcol, 1, B->dcol, 1, queue );
        }
        //ELLRT-type
        else if ( A.storage_type == Magma_ELLRT ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            B->blocksize = A.blocksize;
            B->alignment = A.alignment;
            //int threads_per_row = A.alignment;
            //int rowlength = magma_roundup( A.max_nnz_row, threads_per_row );
            magma_int_t rowlength = magma_roundup( A.max_nnz_row, A.alignment );
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, A.num_rows * rowlength ));
            CHECK( magma_index_malloc( &B->dcol, A.num_rows * rowlength ));
            CHECK( magma_index_malloc( &B->drow, A.num_rows ));
            // data transfer
            magma_dcopyvector( A.num_rows * rowlength, A.dval, 1, B->dval, 1, queue );
            magma_index_copyvector( A.num_rows * rowlength, A.dcol, 1, B->dcol, 1, queue );
            magma_index_copyvector( A.num_rows, A.drow, 1, B->drow, 1, queue );
        }
        //SELLP-type
        else if ( A.storage_type == Magma_SELLP ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            B->blocksize = A.blocksize;
            B->numblocks = A.numblocks;
            B->alignment = A.alignment;
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, A.nnz ));
            CHECK( magma_index_malloc( &B->dcol, A.nnz ));
            CHECK( magma_index_malloc( &B->drow, A.numblocks + 1 ));
            // data transfer
            magma_dcopyvector( A.nnz, A.dval, 1, B->dval, 1, queue );
            magma_index_copyvector( A.nnz, A.dcol, 1, B->dcol, 1, queue );
            magma_index_copyvector( A.numblocks + 1, A.drow, 1, B->drow, 1, queue );
        }
        //BCSR-type
        else if ( A.storage_type == Magma_BCSR ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            B->blocksize = A.blocksize;
            B->numblocks = A.numblocks;
            B->alignment = A.alignment;
            magma_int_t size_b = A.blocksize;
            //magma_int_t c_blocks = ceil( (float)A.num_cols / (float)size_b );
                    // max number of blocks per row
            //magma_int_t r_blocks = ceil( (float)A.num_rows / (float)size_b );
            magma_int_t r_blocks = magma_ceildiv( A.num_rows, size_b );
                    // max number of blocks per column
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, size_b * size_b * A.numblocks ));
            CHECK( magma_index_malloc( &B->drow, r_blocks + 1 ));
            CHECK( magma_index_malloc( &B->dcol, A.numblocks ));
            // data transfer
            magma_dcopyvector( size_b * size_b * A.numblocks, A.dval, 1, B->dval, 1, queue );
            magma_index_copyvector( r_blocks + 1, A.drow, 1, B->drow, 1, queue );
            magma_index_copyvector( A.numblocks, A.dcol, 1, B->dcol, 1, queue );
        }
        //DENSE-type
        else if ( A.storage_type == Magma_DENSE ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            B->major = A.major;
            B->ld = A.ld;
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, A.num_rows * A.num_cols ));
            // data transfer
            magma_dcopyvector( A.num_rows * A.num_cols, A.dval, 1, B->dval, 1, queue );
        }
    }
    
    
cleanup:
    if( info != 0 ){
        magma_dmfree( B, queue );
    }
    return info;
}
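The switch above only touches the fields that the chosen storage format defines, so a caller simply fills a host-side struct and picks the source and destination locations. Below is a minimal, illustrative sketch of a CSR host-to-device round trip; it assumes a valid magma_queue_t named queue already exists (queue creation differs between MAGMA versions) and omits error handling, since the CHECK macro used above expects a cleanup label. The variable names hA, dA, and hB are purely illustrative.

// Sketch: build a tiny 2x2 CSR matrix [1 2; 0 3] on the host, push it to
// the device with magma_dmtransfer, and copy it back to the host.
magma_d_matrix hA = {Magma_CSR}, dA = {Magma_CSR}, hB = {Magma_CSR};

hA.storage_type    = Magma_CSR;
hA.memory_location = Magma_CPU;
hA.num_rows = 2;  hA.num_cols = 2;  hA.nnz = 3;

magma_dmalloc_cpu( &hA.val, hA.nnz );
magma_index_malloc_cpu( &hA.row, hA.num_rows + 1 );
magma_index_malloc_cpu( &hA.col, hA.nnz );

hA.val[0] = 1.0;  hA.val[1] = 2.0;  hA.val[2] = 3.0;   // nonzero values
hA.row[0] = 0;    hA.row[1] = 2;    hA.row[2] = 3;     // row pointers
hA.col[0] = 0;    hA.col[1] = 1;    hA.col[2] = 1;     // column indices

magma_dmtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue );   // host -> device
magma_dmtransfer( dA, &hB, Magma_DEV, Magma_CPU, queue );   // device -> host

magma_dmfree( &hB, queue );
magma_dmfree( &dA, queue );
magma_dmfree( &hA, queue );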
Example #10
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing dgeqrf
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    const double             d_neg_one = MAGMA_D_NEG_ONE;
    const double             d_one     = MAGMA_D_ONE;
    const double c_neg_one = MAGMA_D_NEG_ONE;
    const double c_one     = MAGMA_D_ONE;
    const double c_zero    = MAGMA_D_ZERO;
    const magma_int_t        ione      = 1;
    
    real_Double_t    gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0;
    double           Anorm, error=0, error2=0;
    double *h_A, *h_R, *tau, *h_work, tmp[1];
    magmaDouble_ptr d_A, dT;
    magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn, nb, size;
    magma_int_t ISEED[4] = {0,0,0,1};
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    magma_int_t status = 0;
    double tol = opts.tolerance * lapackf77_dlamch("E");
    
    printf( "version %d\n", (int) opts.version );
    if ( opts.version == 2 ) {
        printf("    M     N   CPU GFlop/s (sec)   GPU GFlop/s (sec)   |R - Q^H*A|   |I - Q^H*Q|\n");
        printf("===============================================================================\n");
    }
    else {
        printf("    M     N   CPU GFlop/s (sec)   GPU GFlop/s (sec)    |b - A*x|\n");
        printf("================================================================\n");
    }
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[itest];
            N = opts.nsize[itest];
            min_mn = min(M, N);
            lda    = M;
            n2     = lda*N;
            ldda   = ((M+31)/32)*32;
            gflops = FLOPS_DGEQRF( M, N ) / 1e9;
            
            // query for workspace size
            lwork = -1;
            lapackf77_dgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info);
            lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] );
            
            TESTING_MALLOC_CPU( tau,    double, min_mn );
            TESTING_MALLOC_CPU( h_A,    double, n2     );
            TESTING_MALLOC_CPU( h_work, double, lwork  );
            
            TESTING_MALLOC_PIN( h_R,    double, n2     );
            
            TESTING_MALLOC_DEV( d_A,    double, ldda*N );
            
            /* Initialize the matrix */
            lapackf77_dlarnv( &ione, ISEED, &n2, h_A );
            lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda );
            magma_dsetmatrix( M, N, h_R, lda, d_A, ldda );
            
            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            gpu_time = magma_wtime();
            if ( opts.version == 2 ) {
                // LAPACK-compliant arguments
                magma_dgeqrf2_gpu( M, N, d_A, ldda, tau, &info );
            }
            else {
                nb = magma_get_dgeqrf_nb( M );
                size = (2*min(M, N) + (N+31)/32*32 )*nb;
                TESTING_MALLOC_DEV( dT, double, size );
                if ( opts.version == 1 ) {
                    // stores dT, V blocks have zeros, R blocks inverted & stored in dT
                    magma_dgeqrf_gpu( M, N, d_A, ldda, tau, dT, &info );
                }
                #ifdef HAVE_CUBLAS
                else if ( opts.version == 3 ) {
                    // stores dT, V blocks have zeros, R blocks stored in dT
                    magma_dgeqrf3_gpu( M, N, d_A, ldda, tau, dT, &info );
                }
                #endif
                else {
                    printf( "Unknown version %d\n", (int) opts.version );
                    exit(1);
                }
            }
            gpu_time = magma_wtime() - gpu_time;
            gpu_perf = gflops / gpu_time;
            if (info != 0)
                printf("magma_dgeqrf returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            
            if ( opts.check && opts.version == 2 ) {
                /* =====================================================================
                   Check the result, following zqrt01 except using the reduced Q.
                   This works for any M,N (square, tall, wide).
                   Only for version 2, which has LAPACK-compliant output.
                   =================================================================== */
                magma_dgetmatrix( M, N, d_A, ldda, h_R, lda );
                
                magma_int_t ldq = M;
                magma_int_t ldr = min_mn;
                double *Q, *R;
                double *work;
                TESTING_MALLOC_CPU( Q,    double, ldq*min_mn );  // M by K
                TESTING_MALLOC_CPU( R,    double, ldr*N );       // K by N
                TESTING_MALLOC_CPU( work, double,             min_mn );
                
                // generate M by K matrix Q, where K = min(M,N)
                lapackf77_dlacpy( "Lower", &M, &min_mn, h_R, &lda, Q, &ldq );
                lapackf77_dorgqr( &M, &min_mn, &min_mn, Q, &ldq, tau, h_work, &lwork, &info );
                assert( info == 0 );
                
                // copy K by N matrix R
                lapackf77_dlaset( "Lower", &min_mn, &N, &c_zero, &c_zero, R, &ldr );
                lapackf77_dlacpy( "Upper", &min_mn, &N, h_R, &lda,        R, &ldr );
                
                // error = || R - Q^H*A || / (N * ||A||)
                blasf77_dgemm( "Conj", "NoTrans", &min_mn, &N, &M,
                               &c_neg_one, Q, &ldq, h_A, &lda, &c_one, R, &ldr );
                Anorm = lapackf77_dlange( "1", &M,      &N, h_A, &lda, work );
                error = lapackf77_dlange( "1", &min_mn, &N, R,   &ldr, work );
                if ( N > 0 && Anorm > 0 )
                    error /= (N*Anorm);
                
                // set R = I (K by K identity), then R = I - Q^H*Q
                // error = || I - Q^H*Q || / N
                lapackf77_dlaset( "Upper", &min_mn, &min_mn, &c_zero, &c_one, R, &ldr );
                blasf77_dsyrk( "Upper", "Conj", &min_mn, &M, &d_neg_one, Q, &ldq, &d_one, R, &ldr );
                error2 = lapackf77_dlansy( "1", "Upper", &min_mn, R, &ldr, work );
                if ( N > 0 )
                    error2 /= N;
                
                TESTING_FREE_CPU( Q    );  Q    = NULL;
                TESTING_FREE_CPU( R    );  R    = NULL;
                TESTING_FREE_CPU( work );  work = NULL;
            }
            else if ( opts.check && M >= N ) {
                /* =====================================================================
                   Check the result by solving consistent linear system, A*x = b.
                   Only for versions 1 & 3 with M >= N.
                   =================================================================== */
                magma_int_t lwork;
                double *x, *b, *hwork;
                magmaDouble_ptr d_B;
                const double c_zero    = MAGMA_D_ZERO;
                const double c_one     = MAGMA_D_ONE;
                const double c_neg_one = MAGMA_D_NEG_ONE;
                const magma_int_t ione = 1;

                // initialize RHS, b = A*random
                TESTING_MALLOC_CPU( x, double, N );
                TESTING_MALLOC_CPU( b, double, M );
                lapackf77_dlarnv( &ione, ISEED, &N, x );
                blasf77_dgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_zero, b, &ione );
                // copy to GPU
                TESTING_MALLOC_DEV( d_B, double, M );
                magma_dsetvector( M, b, 1, d_B, 1 );

                if ( opts.version == 1 ) {
                    // query workspace size, then allocate hwork
                    magma_dgeqrs_gpu( M, N, 1,
                                      d_A, ldda, tau, dT,
                                      d_B, M, tmp, -1, &info );
                    lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] );
                    TESTING_MALLOC_CPU( hwork, double, lwork );

                    // solve linear system
                    magma_dgeqrs_gpu( M, N, 1,
                                      d_A, ldda, tau, dT,
                                      d_B, M, hwork, lwork, &info );
                    if (info != 0)
                        printf("magma_dgeqrs returned error %d: %s.\n",
                               (int) info, magma_strerror( info ));
                    TESTING_FREE_CPU( hwork );
                }
                #ifdef HAVE_CUBLAS
                else if ( opts.version == 3 ) {
                    // query workspace size, then allocate hwork
                    magma_dgeqrs3_gpu( M, N, 1,
                                       d_A, ldda, tau, dT,
                                       d_B, M, tmp, -1, &info );
                    lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] );
                    TESTING_MALLOC_CPU( hwork, double, lwork );

                    // solve linear system
                    magma_dgeqrs3_gpu( M, N, 1,
                                       d_A, ldda, tau, dT,
                                       d_B, M, hwork, lwork, &info );
                    if (info != 0)
                        printf("magma_dgeqrs3 returned error %d: %s.\n",
                               (int) info, magma_strerror( info ));
                    TESTING_FREE_CPU( hwork );
                }
                #endif
                else {
                    printf( "Unknown version %d\n", (int) opts.version );
                    exit(1);
                }
                magma_dgetvector( N, d_B, 1, x, 1 );

                // compute r = Ax - b, saved in b
                blasf77_dgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_neg_one, b, &ione );

                // compute residual |Ax - b| / (n*|A|*|x|)
                double norm_x, norm_A, norm_r, work[1];
                norm_A = lapackf77_dlange( "F", &M, &N, h_A, &lda, work );
                norm_r = lapackf77_dlange( "F", &M, &ione, b, &M, work );
                norm_x = lapackf77_dlange( "F", &N, &ione, x, &N, work );

                TESTING_FREE_CPU( x );
                TESTING_FREE_CPU( b );
                TESTING_FREE_DEV( d_B );

                error = norm_r / (N * norm_A * norm_x);
            }
            
            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            if ( opts.lapack ) {
                cpu_time = magma_wtime();
                lapackf77_dgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info);
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
                if (info != 0)
                    printf("lapackf77_dgeqrf returned error %d: %s.\n",
                           (int) info, magma_strerror( info ));
            }
            
            /* =====================================================================
               Print performance and error.
               =================================================================== */
            printf("%5d %5d   ", (int) M, (int) N );
            if ( opts.lapack ) {
                printf( "%7.2f (%7.2f)", cpu_perf, cpu_time );
            }
            else {
                printf("  ---   (  ---  )" );
            }
            printf( "   %7.2f (%7.2f)   ", gpu_perf, gpu_time );
            if ( opts.check ) {
                if ( opts.version == 2 ) {
                    bool okay = (error < tol && error2 < tol);
                    status += ! okay;
                    printf( "%11.2e   %11.2e   %s\n", error, error2, (okay ? "ok" : "failed") );
                }
                else if ( M >= N ) {
                    bool okay = (error < tol);
                    status += ! okay;
                    printf( "%10.2e   %s\n", error, (okay ? "ok" : "failed") );
                }
                else {
                    printf( "(error check only for M >= N)\n" );
                }
            }
            else {
                printf( "    ---\n" );
            }
            
            TESTING_FREE_CPU( tau    );
            TESTING_FREE_CPU( h_A    );
            TESTING_FREE_CPU( h_work );
            
            TESTING_FREE_PIN( h_R );
            
            TESTING_FREE_DEV( d_A );
            
            if ( opts.version != 2 )
                TESTING_FREE_DEV( dT );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }
    
    TESTING_FINALIZE();
    return status;
}
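For reference, the error columns printed by this tester are the scaled residuals described in the code comments above; written out explicitly (with Q the M-by-min(M,N) reduced factor, the conjugate transpose reducing to a plain transpose in this real double-precision case):

\[
  \mathrm{error}  = \frac{\lVert R - Q^{H} A \rVert_{1}}{N\,\lVert A \rVert_{1}},
  \qquad
  \mathrm{error2} = \frac{\lVert I - Q^{H} Q \rVert_{1}}{N}
\]
for version 2, and
\[
  \mathrm{error} = \frac{\lVert A x - b \rVert_{F}}{N\,\lVert A \rVert_{F}\,\lVert x \rVert_{F}}
\]
for versions 1 and 3 (checked only when M >= N). A run is reported "ok" when the relevant residuals fall below tol = opts.tolerance * lapackf77_dlamch("E").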
Example #11
/**
    @deprecated
    
    Purpose
    -------
    ZLAQPS computes a step of QR factorization with column pivoting
    of a complex M-by-N matrix A by using Blas-3.  It tries to factorize
    NB columns from A starting from the row OFFSET+1, and updates all
    of the matrix with Blas-3 xGEMM.

    In some cases, due to catastrophic cancellations, it cannot
    factorize NB columns.  Hence, the actual number of factorized
    columns is returned in KB.

    Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A. N >= 0.

    @param[in]
    offset  INTEGER
            The number of rows of A that have been factorized in
            previous steps.

    @param[in]
    nb      INTEGER
            The number of columns to factorize.

    @param[out]
    kb      INTEGER
            The number of columns actually factorized.

    @param[in,out]
    dA      COMPLEX_16 array, dimension (LDDA,N), on the GPU.
            On entry, the M-by-N matrix A.
            On exit, block A(OFFSET+1:M,1:KB) is the triangular
            factor obtained and block A(1:OFFSET,1:N) has been
            accordingly pivoted, but not factorized.
            The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has
            been updated.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array A. LDDA >= max(1,M).

    @param[in,out]
    jpvt    INTEGER array, dimension (N)
            JPVT(I) = K <==> Column K of the full matrix A has been
            permuted into position I in AP.

    @param[out]
    tau     COMPLEX_16 array, dimension (KB)
            The scalar factors of the elementary reflectors.

    @param[in,out]
    vn1     DOUBLE PRECISION array, dimension (N)
            The vector with the partial column norms.

    @param[in,out]
    vn2     DOUBLE PRECISION array, dimension (N)
            The vector with the exact column norms.

    @param[in,out]
    dauxv   COMPLEX_16 array, dimension (NB), on the GPU
            Auxiliary vector.

    @param[in,out]
    dF      COMPLEX_16 array, dimension (LDDF,NB), on the GPU
            Matrix F' = L*Y'*A.

    @param[in]
    lddf    INTEGER
            The leading dimension of the array F. LDDF >= max(1,N).

    @ingroup magma_zgeqp3_aux
    ********************************************************************/
extern "C" magma_int_t
magma_zlaqps_gpu(
    magma_int_t m, magma_int_t n, magma_int_t offset,
    magma_int_t nb, magma_int_t *kb,
    magmaDoubleComplex_ptr dA,  magma_int_t ldda,
    magma_int_t *jpvt, magmaDoubleComplex *tau,
    double *vn1, double *vn2,
    magmaDoubleComplex_ptr dauxv,
    magmaDoubleComplex_ptr dF,  magma_int_t lddf)
{
#define  dA(i, j) (dA  + (i) + (j)*(ldda))
#define  dF(i, j) (dF  + (i) + (j)*(lddf))

    magmaDoubleComplex c_zero    = MAGMA_Z_MAKE( 0.,0.);
    magmaDoubleComplex c_one     = MAGMA_Z_MAKE( 1.,0.);
    magmaDoubleComplex c_neg_one = MAGMA_Z_MAKE(-1.,0.);
    magma_int_t ione = 1;
    
    magma_int_t i__1, i__2;
    //double d__1;
    magmaDoubleComplex z__1;
    
    //magma_int_t j;
    magma_int_t k, rk;
    //magmaDoubleComplex Akk;
    magmaDoubleComplex_ptr dAks;
    magmaDoubleComplex tauk = MAGMA_Z_ZERO;
    magma_int_t pvt;
    //double temp, temp2;
    double tol3z;
    magma_int_t itemp;

    double lsticc;
    magmaDouble_ptr dlsticcs;
    magma_dmalloc( &dlsticcs, 1+256*(n+255)/256 );

    //lastrk = min( m, n + offset );
    tol3z = magma_dsqrt( lapackf77_dlamch("Epsilon"));

    lsticc = 0;
    k = 0;
    magma_zmalloc( &dAks, nb );

    while( k < nb && lsticc == 0 ) {
        rk = offset + k;
        
        /* Determine ith pivot column and swap if necessary */
        // subtract 1 from Fortran/CUBLAS idamax; pvt, k are 0-based.
        pvt = k + magma_idamax( n-k, &vn1[k], ione ) - 1;
        
        if (pvt != k) {
            /*if (pvt >= nb) {
                // 1. Start copy from GPU
                magma_zgetmatrix_async( m - offset - nb, 1,
                                        dA(offset + nb, pvt), ldda,
                                        A (offset + nb, pvt), lda, stream );
            }*/

            /* F gets swapped so F must be sent at the end to GPU   */
            i__1 = k;
            /*if (pvt < nb) {
                // no need of transfer if pivot is within the panel
                blasf77_zswap( &m, A(0, pvt), &ione, A(0, k), &ione );
            }
            else {
                // 1. Finish copy from GPU
                magma_queue_sync( stream );

                // 2. Swap as usual on CPU
                blasf77_zswap(&m, A(0, pvt), &ione, A(0, k), &ione);

                // 3. Restore the GPU
                magma_zsetmatrix_async( m - offset - nb, 1,
                                        A (offset + nb, pvt), lda,
                                        dA(offset + nb, pvt), ldda, stream);
            }*/
            magmablas_zswap( m, dA(0, pvt), ione, dA(0, k), ione );

            //blasf77_zswap( &i__1, F(pvt,0), &ldf, F(k,0), &ldf );
            magmablas_zswap( i__1, dF(pvt, 0), lddf, dF(k, 0), lddf);
            itemp     = jpvt[pvt];
            jpvt[pvt] = jpvt[k];
            jpvt[k]   = itemp;
            //vn1[pvt] = vn1[k];
            //vn2[pvt] = vn2[k];
            #if defined(PRECISION_d) || defined(PRECISION_z)
                //magma_dswap( 1, &vn1[pvt], 1, &vn1[k], 1 );
                //magma_dswap( 1, &vn2[pvt], 1, &vn2[k], 1 );
                magma_dswap( 2, &vn1[pvt], n+offset, &vn1[k], n+offset );
            #else
                //magma_sswap( 1, &vn1[pvt], 1, &vn1[k], 1 );
                //magma_sswap( 1, &vn2[pvt], 1, &vn2[k], 1 );
                magma_sswap(2, &vn1[pvt], n+offset, &vn1[k], n+offset);
            #endif
        }

        /* Apply previous Householder reflectors to column K:
           A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'.
           Optimization: multiply with beta=0; wait for vector and subtract */
        if (k > 0) {
            /*#if (defined(PRECISION_c) || defined(PRECISION_z))
            for (j = 0; j < k; ++j) {
                *F(k,j) = MAGMA_Z_CNJG( *F(k,j) );
            }
            #endif*/

//#define RIGHT_UPDATE
#ifdef RIGHT_UPDATE
            i__1 = m - offset - nb;
            i__2 = k;
            magma_zgemv( MagmaNoTrans, i__1, i__2,
                         c_neg_one, A(offset+nb, 0), lda,
                                    F(k,         0), ldf,
                         c_one,     A(offset+nb, k), ione );
#else
            i__1 = m - rk;
            i__2 = k;
            /*blasf77_zgemv( MagmaNoTransStr, &i__1, &i__2,
                           &c_neg_one, A(rk, 0), &lda,
                                       F(k,  0), &ldf,
                           &c_one,     A(rk, k), &ione ); */
            magma_zgemv( MagmaNoTrans, i__1, i__2,
                         c_neg_one, dA(rk, 0), ldda,
                                    dF(k,  0), lddf,
                         c_one,     dA(rk, k), ione );
#endif

            /*#if (defined(PRECISION_c) || defined(PRECISION_z))
            for (j = 0; j < k; ++j) {
                *F(k,j) = MAGMA_Z_CNJG( *F(k,j) );
            }
            #endif*/
        }
        
        /*  Generate elementary reflector H(k). */
        magma_zlarfg_gpu( m-rk, dA(rk, k), dA(rk + 1, k), &tau[k], &vn1[k], &dAks[k]);

        //Akk = *A(rk, k);
        //*A(rk, k) = c_one;
        //magma_zgetvector( 1, &dAks[k],  1, &Akk,     1 );

        /* needed to avoid the race condition */
        if (k == 0) magma_zsetvector(  1,    &c_one,        1, dA(rk, k), 1 );
        else        magma_zcopymatrix( 1, 1, dA(offset, 0), 1, dA(rk, k), 1 );

        /* Compute Kth column of F:
           Compute  F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */
        if (k < n-1 || k > 0) magma_zgetvector( 1, &tau[k], 1, &tauk, 1 );
        if (k < n-1) {
            i__1 = m - rk;
            i__2 = n - k - 1;

            /* Send the vector to the GPU */
            //magma_zsetmatrix( i__1, 1, A(rk, k), lda, dA(rk,k), ldda );

            /* Multiply on GPU */
            // was CALL ZGEMV( 'Conjugate transpose', M-RK+1, N-K,
            //                 TAU( K ), A( RK,  K+1 ), LDA,
            //                           A( RK,  K   ), 1,
            //                 CZERO,    F( K+1, K   ), 1 )
            //magma_zgetvector( 1, &tau[k], 1, &tauk, 1 );
            magma_zgemv( MagmaConjTrans, m-rk, n-k-1,
                         tauk,   dA( rk,  k+1 ), ldda,
                                 dA( rk,  k   ), 1,
                         c_zero, dF( k+1, k   ), 1 );
            //magma_zscal( m-rk, tau[k], F( k+1, k), 1 );
            //magma_int_t i__3 = nb-k-1;
            //magma_int_t i__4 = i__2 - i__3;
            //magma_int_t i__5 = nb-k;
            //magma_zgemv( MagmaConjTrans, i__1 - i__5, i__2 - i__3,
            //             tau[k], dA(rk +i__5, k+1+i__3), ldda,
            //                     dA(rk +i__5, k       ), ione,
            //             c_zero, dF(k+1+i__3, k       ), ione );
            
            //magma_zgetmatrix_async( i__2-i__3, 1,
            //                        dF(k + 1 +i__3, k), i__2,
            //                        F (k + 1 +i__3, k), i__2, stream );
            
            //blasf77_zgemv( MagmaConjTransStr, &i__1, &i__3,
            //               &tau[k], A(rk,  k+1), &lda,
            //                        A(rk,  k  ), &ione,
            //               &c_zero, F(k+1, k  ), &ione );
            
            //magma_queue_sync( stream );
            //blasf77_zgemv( MagmaConjTransStr, &i__5, &i__4,
            //               &tau[k], A(rk, k+1+i__3), &lda,
            //                        A(rk, k       ), &ione,
            //               &c_one,  F(k+1+i__3, k ), &ione );
        }
        
        /* Padding F(1:K,K) with zeros.
        for (j = 0; j <= k; ++j) {
            magma_zsetvector( 1, &c_zero, 1, F(j, k), 1 );
        }*/
        
        /* Incremental updating of F:
           F(1:N,K) := F(1:N,K)                        - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K).
           F(1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K)
                    := tau(K)(A(RK:M,K+1:N)' - F(1:N,1:K-1)*A(RK:M,1:K-1)') A(RK:M,K)
           so, F is (updated A)*V */
        //if (k > 0 && k < n-1) {
        if (k > 0) {
            //magma_zgetvector( 1, &tau[k], 1, &tauk, 1 );
            z__1 = MAGMA_Z_NEGATE( tauk );
#ifdef RIGHT_UPDATE
            i__1 = m - offset - nb;
            i__2 = k;
            magma_zgemv( MagmaConjTrans, i__1, i__2,
                         z__1,   dA(offset+nb, 0), lda,
                                 dA(offset+nb, k), ione,
                         c_zero, dauxv, ione );
            
            i__1 = k;
            magma_zgemv( MagmaNoTrans, n-k-1, i__1,
                         c_one, F(k+1,0), ldf,
                                dauxv,     ione,
                         c_one, F(k+1,k), ione );
#else
            i__1 = m - rk;
            i__2 = k;
            //blasf77_zgemv( MagmaConjTransStr, &i__1, &i__2,
            //               &z__1,   A(rk, 0), &lda,
            //                        A(rk, k), &ione,
            //               &c_zero, auxv, &ione );

            magma_zgemv( MagmaConjTrans, i__1, i__2,
                         z__1,   dA(rk, 0), ldda,
                                 dA(rk, k), ione,
                         c_zero, dauxv, ione );
            
            //i__1 = k;
            //blasf77_zgemv( MagmaNoTransStr, &n, &i__1,
            //               &c_one, F(0,0), &ldf,
            //                       auxv,   &ione,
            //               &c_one, F(0,k), &ione );
            /*magma_zgemv( MagmaNoTrans, n, i__1,
                           c_one, F(0,0), ldf,
                                  auxv,   ione,
                           c_one, F(0,k), ione ); */
            /* I think we only need strictly lower-triangular part :) */
            magma_zgemv( MagmaNoTrans, n-k-1, i__2,
                         c_one, dF(k+1,0), lddf,
                                dauxv,     ione,
                         c_one, dF(k+1,k), ione );
#endif
        }
        
        /* Optimization: On the last iteration start sending F back to the GPU */
        
        /* Update the current row of A:
           A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'.               */
        if (k < n-1) {
            i__1 = n - k - 1;
            i__2 = k + 1;
            //blasf77_zgemm( MagmaNoTransStr, MagmaConjTransStr, &ione, &i__1, &i__2,
            //               &c_neg_one, A(rk, 0  ), &lda,
            //                           F(k+1,0  ), &ldf,
            //               &c_one,     A(rk, k+1), &lda );
#ifdef RIGHT_UPDATE
            /* right-looking update of rows,                     */
            magma_zgemm( MagmaNoTrans, MagmaConjTrans, nb-k, i__1, ione,
                         c_neg_one, dA(rk,  k  ), ldda,
                                    dF(k+1, k  ), lddf,
                         c_one,     dA(rk,  k+1), ldda );
#else
            /* left-looking update of rows: F = A'*v was formed with the *
             * original A, so no right-looking update is needed          */
            magma_zgemm( MagmaNoTrans, MagmaConjTrans, ione, i__1, i__2,
                         c_neg_one, dA(rk, 0  ), ldda,
                                    dF(k+1,0  ), lddf,
                         c_one,     dA(rk, k+1), ldda );
#endif
        }
        
        /* Update partial column norms. */
        if (rk < min(m, n+offset)-1 ) {
            magmablas_dznrm2_row_check_adjust(n-k-1, tol3z, &vn1[k+1], &vn2[k+1], dA(rk,k+1), ldda, dlsticcs);

            magma_device_sync();
            #if defined(PRECISION_d) || defined(PRECISION_z)
            magma_dgetvector( 1, &dlsticcs[0], 1, &lsticc, 1 );
            #else
            magma_sgetvector( 1, &dlsticcs[0], 1, &lsticc, 1 );
            #endif
        }


        /*if (rk < lastrk) {
            for (j = k + 1; j < n; ++j) {
                if (vn1[j] != 0.) {
                    // NOTE: The following 4 lines follow from the analysis in
                    //   Lapack Working Note 176.
                    temp = MAGMA_Z_ABS( *A(rk,j) ) / vn1[j];
                    temp = max( 0., ((1. + temp) * (1. - temp)) );

                    d__1 = vn1[j] / vn2[j];
                    temp2 = temp * (d__1 * d__1);

                    if (temp2 <= tol3z) {
                        vn2[j] = (double) lsticc;
                        lsticc = j;
                    } else {
                        vn1[j] *= magma_dsqrt(temp);
                    }
                }
            }
        }*/
        
        //*A(rk, k) = Akk;
        //magma_zsetvector( 1, &Akk, 1, A(rk, k), 1 );
        //magma_zswap( 1, &dAks[k], 1, A(rk, k), 1 );
        
        ++k;
    }
    magma_zcopymatrix( 1, k, dAks, 1, dA(offset, 0), ldda+1 );

    // leave k as the last column done
    --k;
    *kb = k + 1;
    rk = offset + *kb - 1;

    /* Apply the block reflector to the rest of the matrix:
       A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) - A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)'  */
    if (*kb < min(n, m - offset)) {
        i__1 = m - rk - 1;
        i__2 = n - *kb;
        
        /* Send F to the GPU
        magma_zsetmatrix( i__2, *kb,
                          F (*kb, 0), ldf,
                          dF(*kb, 0), i__2 ); */

        magma_zgemm( MagmaNoTrans, MagmaConjTrans, i__1, i__2, *kb,
                     c_neg_one, dA(rk+1, 0  ), ldda,
                                dF(*kb,  0  ), lddf,
                     c_one,     dA(rk+1, *kb), ldda );
    }
    /* Recomputation of difficult columns. */
    if ( lsticc > 0 ) {
        // printf( " -- recompute dnorms --\n" );
        magmablas_dznrm2_check( m-rk-1, n-*kb, dA(rk+1,*kb), ldda,
                                &vn1[*kb], dlsticcs );
        magma_dcopymatrix( n-*kb, 1, &vn1[*kb], *kb, &vn2[*kb], *kb );
    /*while( lsticc > 0 ) {
        itemp = (magma_int_t)(vn2[lsticc] >= 0. ? floor(vn2[lsticc] + .5) : -floor(.5 - vn2[lsticc]));
        i__1 = m - rk - 1;
        if (lsticc <= nb)
            vn1[lsticc] = magma_cblas_dznrm2( i__1, A(rk+1,lsticc), ione );
        else {
            // Where is the data, CPU or GPU ?
            double r1, r2;
            
            r1 = magma_cblas_dznrm2( nb-k, A(rk+1,lsticc), ione );
            r2 = magma_dznrm2(m-offset-nb, dA(offset + nb + 1, lsticc), ione);
            
            vn1[lsticc] = magma_dsqrt(r1*r1+r2*r2);
        }
        
        // NOTE: The computation of VN1( LSTICC ) relies on the fact that
        //   SNRM2 does not fail on vectors with norm below the value of SQRT(DLAMCH('S'))
        vn2[lsticc] = vn1[lsticc];
        lsticc = itemp; */
    }
    magma_free(dAks);
    magma_free(dlsticcs);

    return MAGMA_SUCCESS;
} /* magma_zlaqps */
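A rough, hypothetical sketch of how this panel routine might be called (not part of the source above): all buffers are assumed to be allocated by the caller, with vn1/vn2 holding the column norms of A and jpvt holding the initial column indices.

/* Sketch only: factorize up to nb leading columns of an m-by-n GPU matrix dA,
 * with offset = 0.  dauxv (length nb) and dF (lddf-by-nb, lddf >= n) are GPU
 * workspaces; tau has length at least nb.                                   */
magma_int_t kb;   /* number of columns actually factorized, kb <= nb */
magma_zlaqps_gpu( m, n, /*offset=*/ 0, nb, &kb,
                  dA, ldda,
                  jpvt, tau,
                  vn1, vn2,
                  dauxv, dF, lddf );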
Example #12
/**
    @deprecated
    
    Purpose
    -------
    ZLAQPS computes a step of QR factorization with column pivoting
    of a complex M-by-N matrix A by using Blas-3.  It tries to factorize
    NB columns from A starting from the row OFFSET+1, and updates all
    of the matrix with Blas-3 xGEMM.

    In some cases, due to catastrophic cancellations, it cannot
    factorize NB columns.  Hence, the actual number of factorized
    columns is returned in KB.

    Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A. N >= 0.

    @param[in]
    offset  INTEGER
            The number of rows of A that have been factorized in
            previous steps.

    @param[in]
    nb      INTEGER
            The number of columns to factorize.

    @param[out]
    kb      INTEGER
            The number of columns actually factorized.

    @param[in,out]
    dA      COMPLEX_16 array, dimension (LDDA,N), on the GPU.
            On entry, the M-by-N matrix A.
            On exit, block A(OFFSET+1:M,1:KB) is the triangular
            factor obtained and block A(1:OFFSET,1:N) has been
            accordingly pivoted, but not factorized.
            The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has
            been updated.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array A. LDDA >= max(1,M).

    @param[in,out]
    jpvt    INTEGER array, dimension (N)
            JPVT(I) = K <==> Column K of the full matrix A has been
            permuted into position I in AP.

    @param[out]
    tau     COMPLEX_16 array, dimension (KB)
            The scalar factors of the elementary reflectors.

    @param[in,out]
    vn1     DOUBLE PRECISION array, dimension (N)
            The vector with the partial column norms.

    @param[in,out]
    vn2     DOUBLE PRECISION array, dimension (N)
            The vector with the exact column norms.

    @param[in,out]
    dauxv   COMPLEX_16 array, dimension (NB), on the GPU
            Auxiliary vector.

    @param[in,out]
    dF      COMPLEX_16 array, dimension (LDDF,NB), on the GPU
            Matrix F' = L*Y'*A.

    @param[in]
    lddf    INTEGER
            The leading dimension of the array F. LDDF >= max(1,N).

    @ingroup magma_zgeqp3_aux
    ********************************************************************/
extern "C" magma_int_t
magma_zlaqps_gpu(
    magma_int_t m, magma_int_t n, magma_int_t offset,
    magma_int_t nb, magma_int_t *kb,
    magmaDoubleComplex_ptr dA,  magma_int_t ldda,
    magma_int_t *jpvt, magmaDoubleComplex *tau,
    double *vn1, double *vn2,
    magmaDoubleComplex_ptr dauxv,
    magmaDoubleComplex_ptr dF,  magma_int_t lddf)
{
#define  dA(i, j) (dA  + (i) + (j)*(ldda))
#define  dF(i, j) (dF  + (i) + (j)*(lddf))

    magmaDoubleComplex c_zero    = MAGMA_Z_MAKE( 0.,0.);
    magmaDoubleComplex c_one     = MAGMA_Z_MAKE( 1.,0.);
    magmaDoubleComplex c_neg_one = MAGMA_Z_MAKE(-1.,0.);
    magma_int_t ione = 1;
    
    magma_int_t i__1, i__2;
    magmaDoubleComplex z__1;
    
    magma_int_t k, rk;
    magmaDoubleComplex_ptr dAks;
    magmaDoubleComplex tauk = MAGMA_Z_ZERO;
    magma_int_t pvt;
    double tol3z;
    magma_int_t itemp;

    double lsticc;
    magmaDouble_ptr dlsticcs;
    magma_dmalloc( &dlsticcs, 1+256*(n+255)/256 );

    tol3z = magma_dsqrt( lapackf77_dlamch("Epsilon"));

    lsticc = 0;
    k = 0;
    magma_zmalloc( &dAks, nb );

    magma_queue_t queue;
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );

    while( k < nb && lsticc == 0 ) {
        rk = offset + k;
        
        /* Determine ith pivot column and swap if necessary */
        // subtract 1 from Fortran/CUBLAS idamax; pvt, k are 0-based.
        pvt = k + magma_idamax( n-k, &vn1[k], ione, queue ) - 1;
        
        if (pvt != k) {
            /* F gets swapped so F must be sent at the end to GPU   */
            i__1 = k;
            magmablas_zswap( m, dA(0, pvt), ione, dA(0, k), ione, queue );

            magmablas_zswap( i__1, dF(pvt, 0), lddf, dF(k, 0), lddf, queue );
            itemp     = jpvt[pvt];
            jpvt[pvt] = jpvt[k];
            jpvt[k]   = itemp;
            magma_dswap( 2, &vn1[pvt], n+offset, &vn1[k], n+offset, queue );
        }

        /* Apply previous Householder reflectors to column K:
           A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'.
           Optimization: multiply with beta=0; wait for vector and subtract */
        if (k > 0) {
            //#define RIGHT_UPDATE
            #ifdef RIGHT_UPDATE
                i__1 = m - offset - nb;
                i__2 = k;
                magma_zgemv( MagmaNoTrans, i__1, i__2,
                             c_neg_one, A(offset+nb, 0), lda,
                                        F(k,         0), ldf,
                             c_one,     A(offset+nb, k), ione, queue );
            #else
                i__1 = m - rk;
                i__2 = k;
                magma_zgemv( MagmaNoTrans, i__1, i__2,
                             c_neg_one, dA(rk, 0), ldda,
                                        dF(k,  0), lddf,
                             c_one,     dA(rk, k), ione, queue );
            #endif
        }
        
        /*  Generate elementary reflector H(k). */
        magma_zlarfg_gpu( m-rk, dA(rk, k), dA(rk + 1, k), &tau[k], &vn1[k], &dAks[k], queue );

        /* needed to avoid the race condition */
        if (k == 0) magma_zsetvector(  1,    &c_one,        1, dA(rk, k), 1, queue );
        else        magma_zcopymatrix( 1, 1, dA(offset, 0), 1, dA(rk, k), 1, queue );

        /* Compute Kth column of F:
           Compute  F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */
        if (k < n-1 || k > 0) magma_zgetvector( 1, &tau[k], 1, &tauk, 1, queue );
        if (k < n-1) {
            i__1 = m - rk;
            i__2 = n - k - 1;

            /* Multiply on GPU */
            magma_zgemv( MagmaConjTrans, m-rk, n-k-1,
                         tauk,   dA( rk,  k+1 ), ldda,
                                 dA( rk,  k   ), 1,
                         c_zero, dF( k+1, k   ), 1, queue );
        }
        
        /* Incremental updating of F:
           F(1:N,K) := F(1:N,K)                        - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K).
           F(1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K)
                    := tau(K)(A(RK:M,K+1:N)' - F(1:N,1:K-1)*A(RK:M,1:K-1)') A(RK:M,K)
           so, F is (updated A)*V */
        if (k > 0) {
            z__1 = MAGMA_Z_NEGATE( tauk );
            #ifdef RIGHT_UPDATE
                i__1 = m - offset - nb;
                i__2 = k;
                magma_zgemv( MagmaConjTrans, i__1, i__2,
                             z__1,   dA(offset+nb, 0), lda,
                                     dA(offset+nb, k), ione,
                             c_zero, dauxv, ione, queue );
                
                i__1 = k;
                magma_zgemv( MagmaNoTrans, n-k-1, i__1,
                             c_one, F(k+1,0), ldf,
                                    dauxv,     ione,
                             c_one, F(k+1,k), ione, queue );
            #else
                i__1 = m - rk;
                i__2 = k;
                magma_zgemv( MagmaConjTrans, i__1, i__2,
                             z__1,   dA(rk, 0), ldda,
                                     dA(rk, k), ione,
                             c_zero, dauxv, ione, queue );
                
                /* I think we only need strictly lower-triangular part :) */
                magma_zgemv( MagmaNoTrans, n-k-1, i__2,
                             c_one, dF(k+1,0), lddf,
                                    dauxv,     ione,
                             c_one, dF(k+1,k), ione, queue );
            #endif
        }
        
        /* Optimization: On the last iteration start sending F back to the GPU */
        
        /* Update the current row of A:
           A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'.               */
        if (k < n-1) {
            i__1 = n - k - 1;
            i__2 = k + 1;
            #ifdef RIGHT_UPDATE
                /* right-looking update of rows,                     */
                magma_zgemm( MagmaNoTrans, MagmaConjTrans, nb-k, i__1, ione,
                             c_neg_one, dA(rk,  k  ), ldda,
                                        dF(k+1, k  ), lddf,
                             c_one,     dA(rk,  k+1), ldda, queue );
            #else
                /* left-looking update of rows: F = A'*v was formed with the *
                 * original A, so no right-looking update is needed          */
                magma_zgemm( MagmaNoTrans, MagmaConjTrans, ione, i__1, i__2,
                             c_neg_one, dA(rk, 0  ), ldda,
                                        dF(k+1,0  ), lddf,
                             c_one,     dA(rk, k+1), ldda, queue );
            #endif
        }
        
        /* Update partial column norms. */
        if (rk < min(m, n+offset)-1 ) {
            magmablas_dznrm2_row_check_adjust( n-k-1, tol3z, &vn1[k+1], &vn2[k+1], 
                                               dA(rk,k+1), ldda, dlsticcs, queue );

            //magma_device_sync();
            magma_dgetvector( 1, &dlsticcs[0], 1, &lsticc, 1, queue );
        }
        
        ++k;
    }
    magma_zcopymatrix( 1, k, dAks, 1, dA(offset, 0), ldda+1, queue );

    // leave k as the last column done
    --k;
    *kb = k + 1;
    rk = offset + *kb - 1;

    /* Apply the block reflector to the rest of the matrix:
       A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) - A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)'  */
    if (*kb < min(n, m - offset)) {
        i__1 = m - rk - 1;
        i__2 = n - *kb;
        
        magma_zgemm( MagmaNoTrans, MagmaConjTrans, i__1, i__2, *kb,
                     c_neg_one, dA(rk+1, 0  ), ldda,
                                dF(*kb,  0  ), lddf,
                     c_one,     dA(rk+1, *kb), ldda, queue );
    }
    /* Recomputation of difficult columns. */
    if ( lsticc > 0 ) {
        // printf( " -- recompute dnorms --\n" );
        magmablas_dznrm2_check( m-rk-1, n-*kb, dA(rk+1,*kb), ldda,
                                &vn1[*kb], dlsticcs, queue );
        magma_dcopymatrix( n-*kb, 1, &vn1[*kb], *kb, &vn2[*kb], *kb, queue );
    }
    magma_free( dAks );
    magma_free( dlsticcs );

    magma_queue_destroy( queue );

    return MAGMA_SUCCESS;
} /* magma_zlaqps */
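Functionally this is the same panel kernel as in Example #11; the difference is that every MAGMA BLAS and transfer call now takes an explicit magma_queue_t rather than relying on the default stream and magma_device_sync(). A condensed sketch of the queue lifecycle it uses:

magma_queue_t  queue;
magma_device_t cdev;
magma_getdevice( &cdev );
magma_queue_create( cdev, &queue );

/* ... queued calls, e.g. magma_zgemv( ..., queue ), magma_dgetvector( ..., queue ) ... */

magma_queue_sync( queue );      /* ensure queued work is done before using results on the CPU */
magma_queue_destroy( queue );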
Example #13
extern "C" magma_int_t 
magma_dlahr2(magma_int_t n, magma_int_t k, magma_int_t nb,
             double *da, double *dv, 
             double *a, magma_int_t lda,
             double *tau, double *t, magma_int_t ldt, 
             double *y, magma_int_t ldy)
{
/*  -- MAGMA auxiliary routine (version 1.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2012

    Purpose   
    =======   

    DLAHR2 reduces the first NB columns of a real general n-BY-(n-k+1)   
    matrix A so that elements below the k-th subdiagonal are zero. The   
    reduction is performed by an orthogonal similarity transformation   
    Q' * A * Q. The routine returns the matrices V and T which determine   
    Q as a block reflector I - V*T*V', and also the matrix Y = A * V.   

    This is an auxiliary routine called by DGEHRD.   

    Arguments   
    =========   

    N       (input) INTEGER   
            The order of the matrix A.   

    K       (input) INTEGER   
            The offset for the reduction. Elements below the k-th   
            subdiagonal in the first NB columns are reduced to zero.   
            K < N.   

    NB      (input) INTEGER   
            The number of columns to be reduced.

    DA      (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDA,N-K+1)   
            On entry, the n-by-(n-k+1) general matrix A.   
            On exit, the elements on and above the k-th subdiagonal in   
            the first NB columns are overwritten with the corresponding   
            elements of the reduced matrix; the elements below the k-th   
            subdiagonal, with the array TAU, represent the matrix Q as a   
            product of elementary reflectors. The other columns of A are   
            unchanged. See Further Details.   

    DV      (output) DOUBLE_PRECISION array on the GPU, dimension (N, NB)
            On exit this contains the Householder vectors of the transformation.

    LDA     (input) INTEGER   
            The leading dimension of the array A.  LDA >= max(1,N).   

    TAU     (output) DOUBLE_PRECISION array, dimension (NB)   
            The scalar factors of the elementary reflectors. See Further   
            Details.   

    T       (output) DOUBLE_PRECISION array, dimension (LDT,NB)   
            The upper triangular matrix T.   

    LDT     (input) INTEGER   
            The leading dimension of the array T.  LDT >= NB.   

    Y       (output) DOUBLE_PRECISION array, dimension (LDY,NB)   
            The n-by-nb matrix Y.   

    LDY     (input) INTEGER   
            The leading dimension of the array Y. LDY >= N.   

    Further Details   
    ===============   
    The matrix Q is represented as a product of nb elementary reflectors   

       Q = H(1) H(2) . . . H(nb).   

    Each H(i) has the form   

       H(i) = I - tau * v * v'   

    where tau is a real scalar, and v is a real vector with   
    v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in   
    A(i+k+1:n,i), and tau in TAU(i).   

    The elements of the vectors v together form the (n-k+1)-by-nb matrix   
    V which is needed, with T and Y, to apply the transformation to the   
    unreduced part of the matrix, using an update of the form:   
    A := (I - V*T*V') * (A - Y*T*V').   

    The contents of A on exit are illustrated by the following example   
    with n = 7, k = 3 and nb = 2:   

       ( a   a   a   a   a )   
       ( a   a   a   a   a )   
       ( a   a   a   a   a )   
       ( h   h   a   a   a )   
       ( v1  h   a   a   a )   
       ( v1  v2  a   a   a )   
       ( v1  v2  a   a   a )   

    where a denotes an element of the original matrix A, h denotes a   
    modified element of the upper Hessenberg matrix H, and vi denotes an   
    element of the vector defining H(i).

    This implementation follows the hybrid algorithm and notations described in

    S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg
    form through hybrid GPU-based computing," University of Tennessee Computer
    Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219),
    May 24, 2009.
    =====================================================================    */


    double c_zero    = MAGMA_D_ZERO;
    double c_one     = MAGMA_D_ONE;
    double c_neg_one = MAGMA_D_NEG_ONE;

    magma_int_t ldda = lda;
    magma_int_t c__1 = 1;
    
    magma_int_t a_dim1, a_offset, t_dim1, t_offset, y_dim1, y_offset, i__2, i__3;
    double d__1;

    magma_int_t i__;
    double ei;

    --tau;
    a_dim1 = lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    t_dim1 = ldt;
    t_offset = 1 + t_dim1;
    t -= t_offset;
    y_dim1 = ldy;
    y_offset = 1 + y_dim1;
    y -= y_offset;

    /* Function Body */
    if (n <= 1)
      return 0;
    
    for (i__ = 1; i__ <= nb; ++i__) {
        if (i__ > 1) {

          /* Update A(K+1:N,I); Update I-th column of A - Y * V' */
          i__2 = n - k + 1;
          i__3 = i__ - 1;
          #if defined(PRECISION_z) || defined(PRECISION_c)
             lapackf77_dlacgv(&i__3, &a[k+i__-1+a_dim1], &lda);
          #endif
          blasf77_dcopy(&i__3, &a[k+i__-1+a_dim1], &lda, &t[nb*t_dim1+1], &c__1);
          blasf77_dtrmv("u","n","n",&i__3,&t[t_offset], &ldt, &t[nb*t_dim1+1], &c__1);

          blasf77_dgemv("NO TRANSPOSE", &i__2, &i__3, &c_neg_one, &y[k + y_dim1],
                        &ldy, &t[nb*t_dim1+1], &c__1, &c_one, &a[k+i__*a_dim1],&c__1);

          #if defined(PRECISION_z) || defined(PRECISION_c)
             lapackf77_dlacgv(&i__3, &a[k+i__-1+a_dim1], &lda);
          #endif

          /* Apply I - V * T' * V' to this column (call it b) from the   
             left, using the last column of T as workspace   

             Let  V = ( V1 )   and   b = ( b1 )   (first I-1 rows)   
                      ( V2 )             ( b2 )   
             where V1 is unit lower triangular   
             w := V1' * b1                                                 */
          
          i__2 = i__ - 1;
          blasf77_dcopy(&i__2, &a[k+1+i__*a_dim1], &c__1, &t[nb*t_dim1+1], &c__1);
          blasf77_dtrmv("Lower", MagmaTransStr, "UNIT", &i__2, 
                        &a[k + 1 + a_dim1], &lda, &t[nb * t_dim1 + 1], &c__1);

          /* w := w + V2'*b2 */
          i__2 = n - k - i__ + 1;
          i__3 = i__ - 1;
          blasf77_dgemv(MagmaTransStr, &i__2, &i__3, &c_one, 
                        &a[k + i__ + a_dim1], &lda, &a[k+i__+i__*a_dim1], &c__1, 
                        &c_one, &t[nb*t_dim1+1], &c__1);

          /* w := T'*w */
          i__2 = i__ - 1;
          blasf77_dtrmv("U", MagmaTransStr, "N", &i__2, &t[t_offset], &ldt, 
                        &t[nb*t_dim1+1], &c__1);
          
          /* b2 := b2 - V2*w */
          i__2 = n - k - i__ + 1;
          i__3 = i__ - 1;
          blasf77_dgemv("N", &i__2, &i__3, &c_neg_one, &a[k + i__ + a_dim1], &lda, 
                 &t[nb*t_dim1+1], &c__1, &c_one, &a[k+i__+i__*a_dim1], &c__1);

          /* b1 := b1 - V1*w */
          i__2 = i__ - 1;
          blasf77_dtrmv("L","N","U",&i__2,&a[k+1+a_dim1],&lda,&t[nb*t_dim1+1],&c__1);
          blasf77_daxpy(&i__2, &c_neg_one, &t[nb * t_dim1 + 1], &c__1, 
                 &a[k + 1 + i__ * a_dim1], &c__1);
          
          a[k + i__ - 1 + (i__ - 1) * a_dim1] = ei;
        }
        
        /* Generate the elementary reflector H(I) to annihilate A(K+I+1:N,I) */
        i__2 = n - k - i__ + 1;
        i__3 = k + i__ + 1;
        lapackf77_dlarfg(&i__2, &a[k + i__ + i__ * a_dim1], 
                         &a[min(i__3,n) + i__ * a_dim1], &c__1, &tau[i__]);
        ei = a[k + i__ + i__ * a_dim1];
        a[k + i__ + i__ * a_dim1] = c_one;

        /* Compute  Y(K+1:N,I) */
        i__2 = n - k;
        i__3 = n - k - i__ + 1;
        magma_dsetvector( i__3,
                          &a[k + i__ + i__*a_dim1], 1,
                          dv+(i__-1)*(ldda+1),      1 );

        magma_dgemv(MagmaNoTrans, i__2+1, i__3, c_one, 
                    da -1 + k + i__ * ldda, ldda, 
                    dv+(i__-1)*(ldda+1), c__1, c_zero, 
                    da-1 + k + (i__-1)*ldda, c__1);     
        
        i__2 = n - k - i__ + 1;
        i__3 = i__ - 1;
        blasf77_dgemv(MagmaTransStr, &i__2, &i__3, &c_one, 
                      &a[k + i__ + a_dim1], &lda, &a[k+i__+i__*a_dim1], &c__1, 
                      &c_zero, &t[i__*t_dim1+1], &c__1);

        /* Compute T(1:I,I) */
        i__2 = i__ - 1;
        d__1 = MAGMA_D_NEGATE( tau[i__] );
        blasf77_dscal(&i__2, &d__1, &t[i__ * t_dim1 + 1], &c__1);
        blasf77_dtrmv("U","N","N", &i__2, &t[t_offset], &ldt, &t[i__*t_dim1+1], &c__1);
        t[i__ + i__ * t_dim1] = tau[i__];

        magma_dgetvector( n - k + 1,
                          da-1+ k+(i__-1)*ldda, 1,
                          y+ k + i__*y_dim1,    1 );
    }
    a[k + nb + nb * a_dim1] = ei;

    return 0;
} /* magma_dlahr2 */
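The "Further Details" section above describes Q as a product of elementary reflectors H(i) = I - tau*v*v'. As a small stand-alone illustration (plain C, independent of MAGMA), applying one such reflector to a vector needs only a dot product and a rank-1 update:

/* Apply H = I - tau*v*v' to a vector w of length n, in place:
 *   w := w - tau * (v'*w) * v                                  */
static void apply_reflector( int n, double tau, const double *v, double *w )
{
    double dot = 0.0;
    for (int i = 0; i < n; ++i)
        dot += v[i] * w[i];          /* v' * w */
    for (int i = 0; i < n; ++i)
        w[i] -= tau * dot * v[i];    /* rank-1 correction */
}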
Example #14
extern "C" magma_int_t
magma_didr(
    magma_d_matrix A, magma_d_matrix b, magma_d_matrix *x,
    magma_d_solver_par *solver_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;

    // prepare solver feedback
    solver_par->solver = Magma_IDR;
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;
    solver_par->init_res = 0.0;
    solver_par->final_res = 0.0;
    solver_par->iter_res = 0.0;
    solver_par->runtime = 0.0;

    // constants
    const double c_zero = MAGMA_D_ZERO;
    const double c_one = MAGMA_D_ONE;
    const double c_n_one = MAGMA_D_NEG_ONE;

    // internal user parameters
    const magma_int_t smoothing = 1;   // 0 = disable, 1 = enable
    const double angle = 0.7;          // [0-1]

    // local variables
    magma_int_t iseed[4] = {0, 0, 0, 1};
    magma_int_t dof;
    magma_int_t s;
    magma_int_t distr;
    magma_int_t k, i, sk;
    magma_int_t innerflag;
    double residual;
    double nrm;
    double nrmb;
    double nrmr;
    double nrmt;
    double rho;
    double om;
    double tt;
    double tr;
    double gamma;
    double alpha;
    double mkk;
    double fk;

    // matrices and vectors
    magma_d_matrix dxs = {Magma_CSR};
    magma_d_matrix dr = {Magma_CSR}, drs = {Magma_CSR};
    magma_d_matrix dP = {Magma_CSR}, dP1 = {Magma_CSR};
    magma_d_matrix dG = {Magma_CSR};
    magma_d_matrix dU = {Magma_CSR};
    magma_d_matrix dM = {Magma_CSR};
    magma_d_matrix df = {Magma_CSR};
    magma_d_matrix dt = {Magma_CSR};
    magma_d_matrix dc = {Magma_CSR};
    magma_d_matrix dv = {Magma_CSR};
    magma_d_matrix dbeta = {Magma_CSR}, hbeta = {Magma_CSR};

    // chronometry
    real_Double_t tempo1, tempo2;

    // initial s space
    // TODO: add option for 's' (shadow space number)
    // Hack: uses the '--restart' option as the shadow space number 's'.
    //       This is not ideal because the default value of the restart option is used to detect
    //       whether the user provided a custom restart. As a consequence, if the default restart
    //       value is ever changed, the code will think the user set it (unless the default value
    //       is also updated in the 'if' statement below).
    s = 1;
    if ( solver_par->restart != 50 ) {
        if ( solver_par->restart > A.num_cols ) {
            s = A.num_cols;
        } else {
            s = solver_par->restart;
        }
    }
    solver_par->restart = s;

    // set max iterations
    solver_par->maxiter = min( 2 * A.num_cols, solver_par->maxiter );

    // check if matrix A is square
    if ( A.num_rows != A.num_cols ) {
        //printf("Matrix A is not square.\n");
        info = MAGMA_ERR_NOT_SUPPORTED;
        goto cleanup;
    }

    // |b|
    nrmb = magma_dnrm2( b.num_rows, b.dval, 1, queue );
    if ( nrmb == 0.0 ) {
        magma_dscal( x->num_rows, MAGMA_D_ZERO, x->dval, 1, queue );
        info = MAGMA_SUCCESS;
        goto cleanup;
    }

    // r = b - A x
    CHECK( magma_dvinit( &dr, Magma_DEV, b.num_rows, 1, c_zero, queue ));
    CHECK( magma_dresidualvec( A, b, *x, &dr, &nrmr, queue ));
    
    // |r|
    solver_par->init_res = nrmr;
    solver_par->final_res = solver_par->init_res;
    solver_par->iter_res = solver_par->init_res;
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = (real_Double_t)nrmr;
    }

    // check if initial guess is good enough
    if ( nrmr <= solver_par->atol ||
        nrmr/nrmb <= solver_par->rtol ) {
        info = MAGMA_SUCCESS;
        goto cleanup;
    }

    // P = randn(n, s)
    // P = ortho(P)
//---------------------------------------
    // P = 0.0
    CHECK( magma_dvinit( &dP, Magma_CPU, A.num_cols, s, c_zero, queue ));

    // P = randn(n, s)
    distr = 3;        // 1 = unif (0,1), 2 = unif (-1,1), 3 = normal (0,1) 
    dof = dP.num_rows * dP.num_cols;
    lapackf77_dlarnv( &distr, iseed, &dof, dP.val );

    // transfer P to device
    CHECK( magma_dmtransfer( dP, &dP1, Magma_CPU, Magma_DEV, queue ));
    magma_dmfree( &dP, queue );

    // P = ortho(P1)
    if ( dP1.num_cols > 1 ) {
        // P = magma_dqr(P1), QR factorization
        CHECK( magma_dqr( dP1.num_rows, dP1.num_cols, dP1, dP1.ld, &dP, NULL, queue ));
    } else {
        // P = P1 / |P1|
        nrm = magma_dnrm2( dof, dP1.dval, 1, queue );
        nrm = 1.0 / nrm;
        magma_dscal( dof, nrm, dP1.dval, 1, queue );
        CHECK( magma_dmtransfer( dP1, &dP, Magma_DEV, Magma_DEV, queue ));
    }
    magma_dmfree( &dP1, queue );
//---------------------------------------

    // allocate memory for the scalar products
    CHECK( magma_dvinit( &hbeta, Magma_CPU, s, 1, c_zero, queue ));
    CHECK( magma_dvinit( &dbeta, Magma_DEV, s, 1, c_zero, queue ));

    // smoothing enabled
    if ( smoothing > 0 ) {
        // set smoothing solution vector
        CHECK( magma_dmtransfer( *x, &dxs, Magma_DEV, Magma_DEV, queue ));

        // set smoothing residual vector
        CHECK( magma_dmtransfer( dr, &drs, Magma_DEV, Magma_DEV, queue ));
    }

    // G(n,s) = 0
    CHECK( magma_dvinit( &dG, Magma_DEV, A.num_cols, s, c_zero, queue ));

    // U(n,s) = 0
    CHECK( magma_dvinit( &dU, Magma_DEV, A.num_cols, s, c_zero, queue ));

    // M(s,s) = I
    CHECK( magma_dvinit( &dM, Magma_DEV, s, s, c_zero, queue ));
    magmablas_dlaset( MagmaFull, s, s, c_zero, c_one, dM.dval, s, queue );

    // f = 0
    CHECK( magma_dvinit( &df, Magma_DEV, dP.num_cols, 1, c_zero, queue ));

    // t = 0
    CHECK( magma_dvinit( &dt, Magma_DEV, dr.num_rows, 1, c_zero, queue ));

    // c = 0
    CHECK( magma_dvinit( &dc, Magma_DEV, dM.num_cols, 1, c_zero, queue ));

    // v = 0
    CHECK( magma_dvinit( &dv, Magma_DEV, dr.num_rows, 1, c_zero, queue ));

    //--------------START TIME---------------
    // chronometry
    tempo1 = magma_sync_wtime( queue );
    if ( solver_par->verbose > 0 ) {
        solver_par->timing[0] = 0.0;
    }

    om = MAGMA_D_ONE;
    innerflag = 0;

    // start iteration
    do
    {
        solver_par->numiter++;
    
        // new RHS for small systems
        // f = P' r
        magmablas_dgemv( MagmaConjTrans, dP.num_rows, dP.num_cols, c_one, dP.dval, dP.ld, dr.dval, 1, c_zero, df.dval, 1, queue );

        // shadow space loop
        for ( k = 0; k < s; ++k ) {
            sk = s - k;
    
            // solve the small triangular system: c(k:s) = M(k:s,k:s) \ f(k:s)
            magma_dcopyvector( sk, &df.dval[k], 1, &dc.dval[k], 1, queue );
            magma_dtrsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, sk, &dM.dval[k*dM.ld+k], dM.ld, &dc.dval[k], 1, queue );

            // v = r - G(:,k:s) c(k:s)
            magma_dcopyvector( dr.num_rows, dr.dval, 1, dv.dval, 1, queue );
            magmablas_dgemv( MagmaNoTrans, dG.num_rows, sk, c_n_one, &dG.dval[k*dG.ld], dG.ld, &dc.dval[k], 1, c_one, dv.dval, 1, queue );

            // U(:,k) = om * v + U(:,k:s) c(k:s)
            magmablas_dgemv( MagmaNoTrans, dU.num_rows, sk, c_one, &dU.dval[k*dU.ld], dU.ld, &dc.dval[k], 1, om, dv.dval, 1, queue );
            magma_dcopyvector( dU.num_rows, dv.dval, 1, &dU.dval[k*dU.ld], 1, queue );

            // G(:,k) = A U(:,k)
            CHECK( magma_d_spmv( c_one, A, dv, c_zero, dv, queue ));
            solver_par->spmv_count++;
            magma_dcopyvector( dG.num_rows, dv.dval, 1, &dG.dval[k*dG.ld], 1, queue );

            // bi-orthogonalize the new basis vectors
            for ( i = 0; i < k; ++i ) {
                // alpha = P(:,i)' G(:,k)
                alpha = magma_ddot( dP.num_rows, &dP.dval[i*dP.ld], 1, &dG.dval[k*dG.ld], 1, queue );

                // alpha = alpha / M(i,i)
                magma_dgetvector( 1, &dM.dval[i*dM.ld+i], 1, &mkk, 1, queue );
                alpha = alpha / mkk;

                // G(:,k) = G(:,k) - alpha * G(:,i)
                magma_daxpy( dG.num_rows, -alpha, &dG.dval[i*dG.ld], 1, &dG.dval[k*dG.ld], 1, queue );

                // U(:,k) = U(:,k) - alpha * U(:,i)
                magma_daxpy( dU.num_rows, -alpha, &dU.dval[i*dU.ld], 1, &dU.dval[k*dU.ld], 1, queue );
            }

            // new column of M = P'G, first k-1 entries are zero
            // M(k:s,k) = P(:,k:s)' G(:,k)
            magmablas_dgemv( MagmaConjTrans, dP.num_rows, sk, c_one, &dP.dval[k*dP.ld], dP.ld, &dG.dval[k*dG.ld], 1, c_zero, &dM.dval[k*dM.ld+k], 1, queue );

            // check M(k,k) == 0
            magma_dgetvector( 1, &dM.dval[k*dM.ld+k], 1, &mkk, 1, queue );
            if ( MAGMA_D_EQUAL(mkk, MAGMA_D_ZERO) ) {
                innerflag = 1;
                info = MAGMA_DIVERGENCE;
                break;
            }

            // beta = f(k) / M(k,k)
            magma_dgetvector( 1, &df.dval[k], 1, &fk, 1, queue );
            hbeta.val[k] = fk / mkk;

            // check for nan
            if ( magma_d_isnan( hbeta.val[k] ) || magma_d_isinf( hbeta.val[k] )) {
                innerflag = 1;
                info = MAGMA_DIVERGENCE;
                break;
            }

            // r = r - beta * G(:,k)
            magma_daxpy( dr.num_rows, -hbeta.val[k], &dG.dval[k*dG.ld], 1, dr.dval, 1, queue );

            // smoothing disabled
            if ( smoothing <= 0 ) {
                // |r|
                nrmr = magma_dnrm2( dr.num_rows, dr.dval, 1, queue );

            // smoothing enabled
            } else {
                // x = x + beta * U(:,k)
                magma_daxpy( x->num_rows, hbeta.val[k], &dU.dval[k*dU.ld], 1, x->dval, 1, queue );

                // smoothing operation
//---------------------------------------
                // t = rs - r
                magma_dcopyvector( drs.num_rows, drs.dval, 1, dt.dval, 1, queue );
                magma_daxpy( dt.num_rows, c_n_one, dr.dval, 1, dt.dval, 1, queue );

                // t't
                // t'rs 
                tt = magma_ddot( dt.num_rows, dt.dval, 1, dt.dval, 1, queue );
                tr = magma_ddot( dt.num_rows, dt.dval, 1, drs.dval, 1, queue );

                // gamma = (t' * rs) / (t' * t)
                gamma = tr / tt;

                // rs = rs - gamma * (rs - r) 
                magma_daxpy( drs.num_rows, -gamma, dt.dval, 1, drs.dval, 1, queue );

                // xs = xs - gamma * (xs - x) 
                magma_dcopyvector( dxs.num_rows, dxs.dval, 1, dt.dval, 1, queue );
                magma_daxpy( dt.num_rows, c_n_one, x->dval, 1, dt.dval, 1, queue );
                magma_daxpy( dxs.num_rows, -gamma, dt.dval, 1, dxs.dval, 1, queue );

                // |rs|
                nrmr = magma_dnrm2( drs.num_rows, drs.dval, 1, queue );           
//---------------------------------------
            }

            // store current timing and residual
            if ( solver_par->verbose > 0 ) {
                tempo2 = magma_sync_wtime( queue );
                if ( (solver_par->numiter) % solver_par->verbose == 0 ) {
                    solver_par->res_vec[(solver_par->numiter) / solver_par->verbose]
                            = (real_Double_t)nrmr;
                    solver_par->timing[(solver_par->numiter) / solver_par->verbose]
                            = (real_Double_t)tempo2 - tempo1;
                }
            }

            // check convergence
            if ( nrmr <= solver_par->atol ||
                nrmr/nrmb <= solver_par->rtol ) {
                s = k + 1; // for the x-update outside the loop
                innerflag = 2;
                info = MAGMA_SUCCESS;
                break;
            }

            // non-last s iteration
            if ( (k + 1) < s ) {
                // f(k+1:s) = f(k+1:s) - beta * M(k+1:s,k)
                magma_daxpy( sk-1, -hbeta.val[k], &dM.dval[k*dM.ld+(k+1)], 1, &df.dval[k+1], 1, queue );
            }

        }

        // smoothing disabled
        if ( smoothing <= 0 && innerflag != 1 ) {
            // update solution approximation x
            // x = x + U(:,1:s) * beta(1:s)
            magma_dsetvector( s, hbeta.val, 1, dbeta.dval, 1, queue );
            magmablas_dgemv( MagmaNoTrans, dU.num_rows, s, c_one, dU.dval, dU.ld, dbeta.dval, 1, c_one, x->dval, 1, queue );
        }

        // check convergence or iteration limit or invalid result of inner loop
        if ( innerflag > 0 ) {
            break;
        }

        // t = A v
        // t = A r
        CHECK( magma_d_spmv( c_one, A, dr, c_zero, dt, queue ));
        solver_par->spmv_count++;

        // computation of a new omega
//---------------------------------------
        // |t|
        nrmt = magma_dnrm2( dt.num_rows, dt.dval, 1, queue );

        // t'r 
        tr = magma_ddot( dt.num_rows, dt.dval, 1, dr.dval, 1, queue );

        // rho = abs(t' * r) / (|t| * |r|)
        rho = MAGMA_D_ABS( MAGMA_D_REAL(tr) / (nrmt * nrmr) );

        // om = (t' * r) / (|t| * |t|)
        om = tr / (nrmt * nrmt);
        if ( rho < angle ) {
            om = (om * angle) / rho;
        }
//---------------------------------------
        if ( MAGMA_D_EQUAL(om, MAGMA_D_ZERO) ) {
            info = MAGMA_DIVERGENCE;
            break;
        }

        // update approximation vector
        // x = x + om * v
        // x = x + om * r
        magma_daxpy( x->num_rows, om, dr.dval, 1, x->dval, 1, queue );

        // update residual vector
        // r = r - om * t
        magma_daxpy( dr.num_rows, -om, dt.dval, 1, dr.dval, 1, queue );

        // smoothing disabled
        if ( smoothing <= 0 ) {
            // residual norm
            nrmr = magma_dnrm2( b.num_rows, dr.dval, 1, queue );

        // smoothing enabled
        } else {
            // smoothing operation
//---------------------------------------
            // t = rs - r
            magma_dcopyvector( drs.num_rows, drs.dval, 1, dt.dval, 1, queue );
            magma_daxpy( dt.num_rows, c_n_one, dr.dval, 1, dt.dval, 1, queue );

            // t't
            // t'rs
            tt = magma_ddot( dt.num_rows, dt.dval, 1, dt.dval, 1, queue );
            tr = magma_ddot( dt.num_rows, dt.dval, 1, drs.dval, 1, queue );

            // gamma = (t' * rs) / (|t| * |t|)
            gamma = tr / tt;

            // rs = rs - gamma * (rs - r) 
            magma_daxpy( drs.num_rows, -gamma, dt.dval, 1, drs.dval, 1, queue );

            // xs = xs - gamma * (xs - x) 
            magma_dcopyvector( dxs.num_rows, dxs.dval, 1, dt.dval, 1, queue );
            magma_daxpy( dt.num_rows, c_n_one, x->dval, 1, dt.dval, 1, queue );
            magma_daxpy( dxs.num_rows, -gamma, dt.dval, 1, dxs.dval, 1, queue );

            // |rs|
            nrmr = magma_dnrm2( b.num_rows, drs.dval, 1, queue );           
//---------------------------------------
        }

        // store current timing and residual
        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter) % solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter) / solver_par->verbose]
                        = (real_Double_t)nrmr;
                solver_par->timing[(solver_par->numiter) / solver_par->verbose]
                        = (real_Double_t)tempo2 - tempo1;
            }
        }

        // check convergence
        if ( nrmr <= solver_par->atol ||
            nrmr/nrmb <= solver_par->rtol ) { 
            info = MAGMA_SUCCESS;
            break;
        }
    }
    while ( solver_par->numiter + 1 <= solver_par->maxiter );

    // smoothing enabled
    if ( smoothing > 0 ) {
        // x = xs
        magma_dcopyvector( x->num_rows, dxs.dval, 1, x->dval, 1, queue );

        // r = rs
        magma_dcopyvector( dr.num_rows, drs.dval, 1, dr.dval, 1, queue );
    }

    // get last iteration timing
    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t)tempo2 - tempo1;
//--------------STOP TIME----------------

    // get final stats
    solver_par->iter_res = nrmr;
    CHECK( magma_dresidualvec( A, b, *x, &dr, &residual, queue ));
    solver_par->final_res = residual;

    // set solver conclusion
    if ( info != MAGMA_SUCCESS && info != MAGMA_DIVERGENCE ) {
        if ( solver_par->init_res > solver_par->final_res ) {
            info = MAGMA_SLOW_CONVERGENCE;
        }
    }


cleanup:
    // free resources
    // smoothing enabled
    if ( smoothing > 0 ) {
        magma_dmfree( &dxs, queue );
        magma_dmfree( &drs, queue );
    }
    magma_dmfree( &dr, queue );
    magma_dmfree( &dP, queue );
    magma_dmfree( &dP1, queue );
    magma_dmfree( &dG, queue );
    magma_dmfree( &dU, queue );
    magma_dmfree( &dM, queue );
    magma_dmfree( &df, queue );
    magma_dmfree( &dt, queue );
    magma_dmfree( &dc, queue );
    magma_dmfree( &dv, queue );
    magma_dmfree( &dbeta, queue );
    magma_dmfree( &hbeta, queue );

    solver_par->info = info;
    return info;
}
/* magma_didr */
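A hypothetical call sequence for this solver, as a sketch only: the system matrix A and right-hand side b are assumed to already live on the device as magma_d_matrix structures and the queue to have been created; the field names match those read by the routine above.

magma_d_solver_par solver_par;
solver_par.atol    = 1e-10;   /* absolute residual tolerance */
solver_par.rtol    = 1e-6;    /* relative residual tolerance */
solver_par.maxiter = 1000;
solver_par.restart = 4;       /* reused as the shadow-space dimension s (see the hack note above) */
solver_par.verbose = 0;       /* > 0 would require res_vec/timing arrays to be set up */

magma_d_matrix x = {Magma_CSR};
magma_dvinit( &x, Magma_DEV, A.num_cols, 1, MAGMA_D_ZERO, queue );   /* x0 = 0 */

magma_int_t info = magma_didr( A, b, &x, &solver_par, queue );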
Example #15
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing dtrtri
*/
int main( int argc, char** argv )
{
    TESTING_INIT();

    real_Double_t   gflops, magma_perf, magma_time=0;  //, cpu_perf=0, cpu_time=0;
    double          magma_error, norm_invA, work[1];
    magma_int_t N, lda, ldda, info;
    magma_int_t jb, nb, nblock, sizeA, size_inv;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t *ipiv;

    double *h_A, *h_dinvA;
    double *d_A, *d_dinvA;
    double c_neg_one = MAGMA_D_NEG_ONE;
    magma_int_t status = 0;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    opts.lapack |= opts.check;  // check (-c) implies lapack (-l)
    
    double tol = opts.tolerance * lapackf77_dlamch("E");
    const char *uplo_ = lapack_uplo_const(opts.uplo);

    // this is the NB hard coded into dtrtri_diag.
    nb = 128;

    // Indexing macros assumed for the element accesses in the check below;
    // the check also assumes a small helper dgeadd(m,n,alpha,A,lda,B,ldb)
    // performing B := alpha*A + B (not shown here).
    #define h_A(i_, j_)     (h_A     + (i_) + (j_)*lda)
    #define h_dinvA(i_, j_) (h_dinvA + (i_) + (j_)*nb)
    
    printf("uplo = %s, diag = %s\n",
           lapack_uplo_const(opts.uplo), lapack_diag_const(opts.diag) );
    printf("    N  MAGMA Gflop/s (ms)   MAGMA error\n");
    printf("=======================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            N = opts.nsize[itest];
            lda = N;
            ldda = ((lda+31)/32)*32;
            nblock = (N+nb-1)/nb;
            gflops = nblock * FLOPS_DTRTRI( nb ) / 1e9;
            
            TESTING_MALLOC_CPU( h_A,    double, lda*N );
            TESTING_MALLOC_CPU( ipiv,   magma_int_t,        N     );
            
            size_inv = nblock*nb*nb;
            TESTING_MALLOC_DEV( d_A,    double, ldda*N );
            TESTING_MALLOC_DEV( d_dinvA, double, size_inv );
            TESTING_MALLOC_CPU( h_dinvA, double, size_inv );
            
            /* Initialize the matrices */
            /* Factor A into LU to get well-conditioned triangular matrix.
             * Copy L to U, since L seems okay when used with non-unit diagonal
             * (i.e., from U), while U fails when used with unit diagonal. */
            sizeA = lda*N;            
            lapackf77_dlarnv( &ione, ISEED, &sizeA, h_A );
            lapackf77_dgetrf( &N, &N, h_A, &lda, ipiv, &info );
            for( int j = 0; j < N; ++j ) {
                for( int i = 0; i < j; ++i ) {
                    *h_A(i,j) = *h_A(j,i);
                }
            }
            
            /* =====================================================================
               Performs operation using MAGMABLAS
               =================================================================== */
            magma_dsetmatrix( N, N, h_A, lda, d_A, ldda );
            
            magma_time = magma_sync_wtime( NULL );
            magmablas_dtrtri_diag( opts.uplo, opts.diag, N, d_A, ldda, d_dinvA );
            magma_time = magma_sync_wtime( NULL ) - magma_time;
            magma_perf = gflops / magma_time;
            
            magma_dgetvector( size_inv, d_dinvA, 1, h_dinvA, 1 );
            
            if ( opts.verbose ) {
                printf( "A%d=", (int) N );
                magma_dprint( N, N, h_A, lda );
                printf( "d_dinvA%d=", (int) N );
                magma_dprint( min(N+4, nb), min(N+4, nblock*nb), h_dinvA, nb );
            }
            
            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            if ( opts.lapack ) {
                //cpu_time = magma_wtime();
                lapackf77_dtrtri(
                    lapack_uplo_const(opts.uplo), lapack_diag_const(opts.diag),
                    &N, h_A, &lda, &info );
                //cpu_time = magma_wtime() - cpu_time;
                //cpu_perf = gflops / cpu_time;
            }
            
            /* =====================================================================
               Check the result
               =================================================================== */
            if ( opts.check ) {
                // |invA - invA_magma| / |invA|, accumulated over all diagonal blocks
                magma_error = 0;
                norm_invA   = 0;
                for( int i=0; i < N; i += nb ) {
                    jb = min( nb, N-i );
                    dgeadd( jb, jb, c_neg_one, h_A(i, i), lda, h_dinvA(0, i), nb );
                    magma_error = max( magma_error, lapackf77_dlantr( "M", uplo_, MagmaNonUnitStr, &jb, &jb, h_dinvA(0, i), &nb,  work ));
                    norm_invA   = max( norm_invA,   lapackf77_dlantr( "M", uplo_, MagmaNonUnitStr, &jb, &jb, h_A(i, i),     &lda, work ));
                }
                magma_error /= norm_invA;
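                // The reported figure is thus a relative blockwise max-norm:
                //   max_i |invA_i - invA_magma_i|_max / max_i |invA_i|_max,
                // where invA_i is the i-th nb-by-nb diagonal block of the LAPACK inverse.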
                
                // CPU is doing N-by-N inverse, while GPU is doing (N/NB) NB-by-NB inverses.
                // So don't compare performance.
                printf("%5d   %7.2f (%7.2f)   %8.2e   %s\n",
                        (int) N,
                        magma_perf,  1000.*magma_time,
                        //cpu_perf,    1000.*cpu_time,
                        magma_error,
                        (magma_error < tol ? "ok" : "failed"));
                status += ! (magma_error < tol);
            }
            else {
                printf("%5d   %7.2f (%7.2f)      ---\n",
                        (int) N,
                        magma_perf,  1000.*magma_time );
            }
            
            TESTING_FREE_CPU( h_A     );
            TESTING_FREE_CPU( ipiv    );
            
            TESTING_FREE_DEV( d_A     );
            TESTING_FREE_DEV( d_dinvA );
            TESTING_FREE_CPU( h_dinvA );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    TESTING_FINALIZE();
    return status;
}
Example #16
0
int main( int argc, char** argv )
{
    TESTING_INIT();
    
    real_Double_t   gflops, t1, t2;
    double c_neg_one = MAGMA_D_NEG_ONE;
    magma_int_t ione = 1;
    const char trans[] = { 'N', 'C', 'T' };
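    // For real (double) data, 'C' (conjugate transpose) is equivalent to 'T',
    // so the transpose path is effectively exercised twice in the loops below.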
    const char uplo[]  = { 'L', 'U' };
    const char diag[]  = { 'U', 'N' };
    const char side[]  = { 'L', 'R' };
    
    double  *A,  *B,  *C,   *C2, *LU;
    double *dA, *dB, *dC1, *dC2;
    double alpha = MAGMA_D_MAKE( 0.5, 0.1 );
    double beta  = MAGMA_D_MAKE( 0.7, 0.2 );
    double dalpha = 0.6;
    double dbeta  = 0.8;
    double work[1], error, total_error;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t m, n, k, size, maxn, ld, info;
    magma_int_t *piv;
    magma_err_t err;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" );
    
    total_error = 0.;
    for( int i = 0; i < opts.ntest; ++i ) {
        m = opts.msize[i];
        n = opts.nsize[i];
        k = opts.ksize[i];
        printf("=========================================================================\n");
        printf( "M %d, N %d, K %d\n", (int) m, (int) n, (int) k );
        
        // allocate matrices
        // over-allocate so they can be any combination of {m,n,k} x {m,n,k}.
        maxn = max( max( m, n ), k );
        ld = maxn;
        size = maxn*maxn;
        err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) );  assert( err == 0 );
        err = magma_dmalloc_pinned( &A,  size );  assert( err == 0 );
        err = magma_dmalloc_pinned( &B,  size );  assert( err == 0 );
        err = magma_dmalloc_pinned( &C,  size );  assert( err == 0 );
        err = magma_dmalloc_pinned( &C2, size );  assert( err == 0 );
        err = magma_dmalloc_pinned( &LU, size );  assert( err == 0 );
        err = magma_dmalloc( &dA,  size );        assert( err == 0 );
        err = magma_dmalloc( &dB,  size );        assert( err == 0 );
        err = magma_dmalloc( &dC1, size );        assert( err == 0 );
        err = magma_dmalloc( &dC2, size );        assert( err == 0 );
        
        // initialize matrices
        size = maxn*maxn;
        lapackf77_dlarnv( &ione, ISEED, &size, A  );
        lapackf77_dlarnv( &ione, ISEED, &size, B  );
        lapackf77_dlarnv( &ione, ISEED, &size, C  );
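        /* dA(i,j), dB(i,j) and LU(i,j) are used below as indexing macros; in the full
         * source they are presumably defined column-major with leading dimension ld,
         * e.g.  #define dA(i_, j_)  (dA + (i_) + (j_)*ld)   and likewise for dB and LU. */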
        
        printf( "========== Level 1 BLAS ==========\n" );
        
        // ----- test DSWAP
        // swap 2nd and 3rd columns of dA, then copy to C2 and compare with A
        assert( n >= 4 );
        magma_dsetmatrix( m, n, A, ld, dA, ld );
        magma_dsetmatrix( m, n, A, ld, dB, ld );
        magma_dswap( m, dA(0,1), 1, dA(0,2), 1 );
        magma_dswap( m, dB(0,1), 1, dB(0,2), 1 );
        
        // check results, storing diff between magma and cuda calls in C2
        cublasDaxpy( ld*n, c_neg_one, dA, 1, dB, 1 );
        magma_dgetmatrix( m, n, dB, ld, C2, ld );
        error = lapackf77_dlange( "F", &m, &n, C2, &ld, work );
        total_error += error;
        printf( "dswap             diff %.2g\n", error );
        
        // ----- test IDAMAX
        // get argmax of column of A
        magma_dsetmatrix( m, k, A, ld, dA, ld );
        error = 0;
        for( int j = 0; j < k; ++j ) {
            magma_int_t i1 = magma_idamax( m, dA(0,j), 1 );
            magma_int_t i2 = cublasIdamax( m, dA(0,j), 1 );
            assert( i1 == i2 );
            error += abs( i1 - i2 );
        }
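        // Both magma_idamax and cublasIdamax follow the BLAS/Fortran convention and
        // return a 1-based index, so the two results are directly comparable.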
        total_error += error;
        gflops = (double)m * k / 1e9;
        printf( "idamax            diff %.2g\n", error );
        printf( "\n" );
        
        printf( "========== Level 2 BLAS ==========\n" );
        
        // ----- test DGEMV
        // c = alpha*A*b + beta*c,  with A m*n; b,c m or n-vectors
        // try no-trans/trans
        for( int ia = 0; ia < 3; ++ia ) {
            magma_dsetmatrix( m, n, A,  ld, dA,  ld );
            magma_dsetvector( maxn, B, 1, dB,  1 );
            magma_dsetvector( maxn, C, 1, dC1, 1 );
            magma_dsetvector( maxn, C, 1, dC2, 1 );
            t1 = magma_sync_wtime( 0 );
            magma_dgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC2, 1 );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            size = (trans[ia] == 'N' ? m : n);
            cublasDaxpy( size, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetvector( size, dC2, 1, C2, 1 );
            error = lapackf77_dlange( "F", &size, &ione, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DGEMV( m, n ) / 1e9;
            printf( "dgemv( %c )        diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    trans[ia], error, gflops/t1, gflops/t2 );
        }
        printf( "\n" );
        
        // ----- test DSYMV
        // c = alpha*A*b + beta*c,  with A m*m symmetric; b,c m-vectors
        // try upper/lower
        for( int iu = 0; iu < 2; ++iu ) {
            magma_dsetmatrix( m, m, A, ld, dA, ld );
            magma_dsetvector( m, B, 1, dB,  1 );
            magma_dsetvector( m, C, 1, dC1, 1 );
            magma_dsetvector( m, C, 1, dC2, 1 );
            t1 = magma_sync_wtime( 0 );
            magma_dsymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDsymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC2, 1 );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( m, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetvector( m, dC2, 1, C2, 1 );
            error = lapackf77_dlange( "F", &m, &ione, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DSYMV( m ) / 1e9;
            printf( "dsymv( %c )        diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], error, gflops/t1, gflops/t2 );
        }
        printf( "\n" );
        
        // ----- test DTRSV
        // solve A*c = c,  with A m*m triangular; c m-vector
        // try upper/lower, no-trans/trans, unit/non-unit diag
        // Factor A into LU to get well-conditioned triangles, else solve yields garbage.
        // Still can give garbage if solves aren't consistent with LU factors,
        // e.g., using unit diag for U, so copy lower triangle to upper triangle.
        // Also used for trsm later.
        lapackf77_dlacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld );
        lapackf77_dgetrf( &maxn, &maxn, LU, &ld, piv, &info );
        for( int j = 0; j < maxn; ++j ) {
            for( int i = 0; i < j; ++i ) {
                *LU(i,j) = *LU(j,i);
            }
        }
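        // After dgetrf, the stored diagonal belongs to U (L has an implicit unit diagonal).
        // Mirroring the strictly lower triangle into the upper one gives both triangles
        // well-conditioned off-diagonal entries that work with unit and non-unit solves.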
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
        for( int id = 0; id < 2; ++id ) {
            magma_dsetmatrix( m, m, LU, ld, dA, ld );
            magma_dsetvector( m, C, 1, dC1, 1 );
            magma_dsetvector( m, C, 1, dC2, 1 );
            t1 = magma_sync_wtime( 0 );
            magma_dtrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDtrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC2, 1 );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( m, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetvector( m, dC2, 1, C2, 1 );
            error = lapackf77_dlange( "F", &m, &ione, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DTRSM( MagmaLeft, m, 1 ) / 1e9;
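            // A triangular solve with a single right-hand side is counted as an m-by-1 DTRSM.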
            printf( "dtrsv( %c, %c, %c )  diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], diag[id], error, gflops/t1, gflops/t2 );
        }}}
        printf( "\n" );
        
        printf( "========== Level 3 BLAS ==========\n" );
        
        // ----- test DGEMM
        // C = alpha*A*B + beta*C,  with A m*k or k*m; B k*n or n*k; C m*n
        // try combinations of no-trans/trans
        for( int ia = 0; ia < 3; ++ia ) {
        for( int ib = 0; ib < 3; ++ib ) {
            bool nta = (trans[ia] == 'N');
            bool ntb = (trans[ib] == 'N');
            magma_dsetmatrix( (nta ? m : k), (nta ? m : k), A, ld, dA,  ld );
            magma_dsetmatrix( (ntb ? k : n), (ntb ? n : k), B, ld, dB,  ld );
            magma_dsetmatrix( m, n, C, ld, dC1, ld );
            magma_dsetmatrix( m, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_dgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_dlange( "F", &m, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DGEMM( m, n, k ) / 1e9;
            printf( "dgemm( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    trans[ia], trans[ib], error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test DSYMM
        // C = alpha*A*B + beta*C  (left)  with A m*m symmetric; B,C m*n; or
        // C = alpha*B*A + beta*C  (right) with A n*n symmetric; B,C m*n
        // try left/right, upper/lower
        for( int is = 0; is < 2; ++is ) {
        for( int iu = 0; iu < 2; ++iu ) {
            magma_dsetmatrix( m, m, A, ld, dA,  ld );
            magma_dsetmatrix( m, n, B, ld, dB,  ld );
            magma_dsetmatrix( m, n, C, ld, dC1, ld );
            magma_dsetmatrix( m, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_dsymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDsymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_dlange( "F", &m, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DSYMM( side[is], m, n ) / 1e9;
            printf( "dsymm( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    side[is], uplo[iu], error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test DSYRK
        // C = alpha*A*A^T + beta*C  (no-trans) with A n*k and C n*n symmetric; or
        // C = alpha*A^T*A + beta*C  (trans)    with A k*n and C n*n symmetric
        // try upper/lower, no-trans/trans
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
            magma_dsetmatrix( n, k, A, ld, dA,  ld );
            magma_dsetmatrix( n, n, C, ld, dC1, ld );
            magma_dsetmatrix( n, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_dsyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDsyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetmatrix( n, n, dC2, ld, C2, ld );
            error = lapackf77_dlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DSYRK( k, n ) / 1e9;
            printf( "dsyrk( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test DSYR2K
        // C = alpha*A*B^T + alpha*B*A^T + beta*C  (no-trans) with A,B n*k; C n*n symmetric; or
        // C = alpha*A^T*B + alpha*B^T*A + beta*C  (trans)    with A,B k*n; C n*n symmetric
        // try upper/lower, no-trans/trans
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
            bool nt = (trans[it] == 'N');
            magma_dsetmatrix( (nt ? n : k), (nt ? n : k), A, ld, dA,  ld );
            magma_dsetmatrix( n, n, C, ld, dC1, ld );
            magma_dsetmatrix( n, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_dsyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDsyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetmatrix( n, n, dC2, ld, C2, ld );
            error = lapackf77_dlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DSYR2K( k, n ) / 1e9;
            printf( "dsyr2k( %c, %c )    diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test DTRMM
        // C = alpha*A*C  (left)  with A m*m triangular; C m*n; or
        // C = alpha*C*A  (right) with A n*n triangular; C m*n
        // try left/right, upper/lower, no-trans/trans, unit/non-unit
        for( int is = 0; is < 2; ++is ) {
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
        for( int id = 0; id < 2; ++id ) {
            bool left = (side[is] == 'L');
            magma_dsetmatrix( (left ? m : n), (left ? m : n), A, ld, dA,  ld );
            magma_dsetmatrix( m, n, C, ld, dC1, ld );
            magma_dsetmatrix( m, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_dtrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDtrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_dlange( "F", &m, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DTRMM( side[is], m, n ) / 1e9;
            printf( "dtrmm( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], error, gflops/t1, gflops/t2 );
        }}}}
        printf( "\n" );
        
        // ----- test DTRSM
        // solve A*X = alpha*B  (left)  with A m*m triangular; B m*n; or
        // solve X*A = alpha*B  (right) with A n*n triangular; B m*n
        // try left/right, upper/lower, no-trans/trans, unit/non-unit
        for( int is = 0; is < 2; ++is ) {
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
        for( int id = 0; id < 2; ++id ) {
            bool left = (side[is] == 'L');
            magma_dsetmatrix( (left ? m : n), (left ? m : n), LU, ld, dA,  ld );
            magma_dsetmatrix( m, n, C, ld, dC1, ld );
            magma_dsetmatrix( m, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_dtrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDtrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_dlange( "F", &m, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DTRSM( side[is], m, n ) / 1e9;
            printf( "dtrsm( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], error, gflops/t1, gflops/t2 );
        }}}}
        printf( "\n" );
        
        // cleanup
        magma_free_cpu( piv );
        magma_free_pinned( A  );
        magma_free_pinned( B  );
        magma_free_pinned( C  );
        magma_free_pinned( C2 );
        magma_free_pinned( LU );
        magma_free( dA  );
        magma_free( dB  );
        magma_free( dC1 );
        magma_free( dC2 );
    }
    
    if ( total_error != 0. ) {
        printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n",
                total_error );
    }
    else {
        printf( "all tests passed\n" );
    }
    
    TESTING_FINALIZE();
    return 0;
}
Example #17
0
int main( int argc, char** argv)
{
    TESTING_INIT();

    real_Double_t   gflops, gpu_perf, gpu_time, cpu_perf, cpu_time;
    double *h_x, *h_x2, *h_tau, *h_tau2;
    double *d_x, *d_tau;
    double c_neg_one = MAGMA_D_NEG_ONE;
    double      error, error2, work[1];
    magma_int_t N, nb, lda, ldda, size;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t status = 0;


    magma_opts opts;
    parse_opts( argc, argv, &opts );

    double tol = opts.tolerance * lapackf77_dlamch("E");
    
    // does larfg on nb columns, one after another
    nb = (opts.nb > 0 ? opts.nb : 64);
    
    magma_queue_t queue = 0;

    printf("    N    nb    CPU GFLop/s (ms)    GPU GFlop/s (ms)   error      tau error\n");
    printf("==========================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            N = opts.nsize[itest];
            lda  = N;
            ldda = ((N+31)/32)*32;
            gflops = FLOPS_DLARFG( N ) / 1e9 * nb;
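            // The timed region generates nb independent reflectors on N-vectors,
            // hence nb * FLOPS_DLARFG(N).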
    
            TESTING_MALLOC_CPU( h_x,    double, N*nb );
            TESTING_MALLOC_CPU( h_x2,   double, N*nb );
            TESTING_MALLOC_CPU( h_tau,  double, nb   );
            TESTING_MALLOC_CPU( h_tau2, double, nb   );
        
            TESTING_MALLOC_DEV( d_x,   double, ldda*nb );
            TESTING_MALLOC_DEV( d_tau, double, nb      );
            
            /* Initialize the vectors */
            size = N*nb;
            lapackf77_dlarnv( &ione, ISEED, &size, h_x );
            
            /* =====================================================================
               Performs operation using MAGMABLAS
               =================================================================== */
            magma_dsetmatrix( N, nb, h_x, N, d_x, ldda );
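            // Each dlarfg call takes column j, x = [alpha; v] with alpha at d_x[0+j*ldda]
            // and v starting at d_x[1+j*ldda], and produces tau plus an updated column such
            // that (I - tau*w*w^T) * x = [beta; 0] with w = [1; v_out]  (standard LAPACK
            // dlarfg semantics).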
    
            gpu_time = magma_sync_wtime( queue );
            for( int j = 0; j < nb; ++j ) {
                magmablas_dlarfg( N, &d_x[0+j*ldda], &d_x[1+j*ldda], ione, &d_tau[j] );
            }
            gpu_time = magma_sync_wtime( queue ) - gpu_time;
            gpu_perf = gflops / gpu_time;
            
            magma_dgetmatrix( N, nb, d_x, ldda, h_x2, N );
            magma_dgetvector( nb, d_tau, 1, h_tau2, 1 );
            
            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            cpu_time = magma_wtime();
            for( int j = 0; j < nb; ++j ) {
                lapackf77_dlarfg( &N, &h_x[0+j*lda], &h_x[1+j*lda], &ione, &h_tau[j] );
            }
            cpu_time = magma_wtime() - cpu_time;
            cpu_perf = gflops / cpu_time;
            
            /* =====================================================================
               Error Computation and Performance Comparison
               =================================================================== */
            blasf77_daxpy( &size, &c_neg_one, h_x, &ione, h_x2, &ione );
            error = lapackf77_dlange( "F", &N, &nb, h_x2, &N, work )
                  / lapackf77_dlange( "F", &N, &nb, h_x,  &N, work );
            
            // tau can be 0
            blasf77_daxpy( &nb, &c_neg_one, h_tau, &ione, h_tau2, &ione );
            error2 = lapackf77_dlange( "F", &nb, &ione, h_tau,  &nb, work );
            if ( error2 != 0 ) {
                error2 = lapackf77_dlange( "F", &nb, &ione, h_tau2, &nb, work ) / error2;
            }
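            // error2 is the relative difference |tau_gpu - tau_cpu| / |tau_cpu|, guarded
            // against division by zero since tau can legitimately be 0 (e.g., when the
            // trailing part of a column is already zero).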

            printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)   %8.2e   %8.2e   %s\n",
                   (int) N, (int) nb, cpu_perf, 1000.*cpu_time, gpu_perf, 1000.*gpu_time,
                   error, error2,
                   (error < tol && error2 < tol ? "ok" : "failed") );
            status += ! (error < tol && error2 < tol);
            
            TESTING_FREE_CPU( h_x    );
            TESTING_FREE_CPU( h_x2   );
            TESTING_FREE_CPU( h_tau  );
            TESTING_FREE_CPU( h_tau2 );
        
            TESTING_FREE_DEV( d_x   );
            TESTING_FREE_DEV( d_tau );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    TESTING_FINALIZE();
    return status;
}