Пример #1
0
/**
    Purpose
    -------
    ZGEHRD reduces a COMPLEX_16 general matrix A to upper Hessenberg form H by
    an orthogonal similarity transformation:  Q' * A * Q = H . This version
    stores the triangular matrices used in the factorization so that they can
    be applied directly (i.e., without being recomputed) later. As a result,
    the application of Q is much faster.

    Arguments
    ---------
    @param[in]
    n       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in]
    ilo     INTEGER
    @param[in]
    ihi     INTEGER
            It is assumed that A is already upper triangular in rows
            and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally
            set by a previous call to ZGEBAL; otherwise they should be
            set to 1 and N respectively. See Further Details.
            1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0.

    @param[in,out]
    A       COMPLEX_16 array, dimension (LDA,N)
            On entry, the N-by-N general matrix to be reduced.
            On exit, the upper triangle and the first subdiagonal of A
            are overwritten with the upper Hessenberg matrix H, and the
            elements below the first subdiagonal, with the array TAU,
            represent the orthogonal matrix Q as a product of elementary
            reflectors. See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    tau     COMPLEX_16 array, dimension (N-1)
            The scalar factors of the elementary reflectors (see Further
            Details). Elements 1:ILO-1 and IHI:N-1 of TAU are set to
            zero.

    @param[out]
    work    (workspace) COMPLEX_16 array, dimension (LWORK)
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The length of the array WORK.  LWORK >= max(1,N).
            For optimum performance LWORK >= N*NB, where NB is the
            optimal blocksize.
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    @param[out]
    T       COMPLEX_16 array, dimension NB*N,
            where NB is the optimal blocksize. It stores the NB*NB blocks
            of the triangular T matrices used in the reduction.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value.

    Further Details
    ---------------
    The matrix Q is represented as a product of (ihi-ilo) elementary
    reflectors

        Q = H(ilo) H(ilo+1) . . . H(ihi-1).

    Each H(i) has the form

        H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on
    exit in A(i+2:ihi,i), and tau in TAU(i).

    The contents of A are illustrated by the following example, with
    n = 7, ilo = 2 and ihi = 6:

    @verbatim
    on entry,                        on exit,

    ( a   a   a   a   a   a   a )    (  a   a   h   h   h   h   a )
    (     a   a   a   a   a   a )    (      a   h   h   h   h   a )
    (     a   a   a   a   a   a )    (      h   h   h   h   h   h )
    (     a   a   a   a   a   a )    (      v2  h   h   h   h   h )
    (     a   a   a   a   a   a )    (      v2  v3  h   h   h   h )
    (     a   a   a   a   a   a )    (      v2  v3  v4  h   h   h )
    (                         a )    (                          a )
    @endverbatim

    where a denotes an element of the original matrix A, h denotes a
    modified element of the upper Hessenberg matrix H, and vi denotes an
    element of the vector defining H(i).

    This implementation follows the hybrid algorithm and notations described in

    S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg
    form through hybrid GPU-based computing," University of Tennessee Computer
    Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219),
    May 24, 2009.

    This version stores the T matrices, for later use in magma_zunghr.

    @ingroup magma_zgeev_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zgehrd_m(
    magma_int_t n, magma_int_t ilo, magma_int_t ihi,
    magmaDoubleComplex *A, magma_int_t lda,
    magmaDoubleComplex *tau,
    magmaDoubleComplex *work, magma_int_t lwork,
    magmaDoubleComplex *T,
    magma_int_t *info)
{
    #define  A( i, j )    (A + (i) + (j)*lda)
    #define dA( d, i, j ) (data.A[d] + (i) + (j)*ldda)

    magmaDoubleComplex c_one  = MAGMA_Z_ONE;
    magmaDoubleComplex c_zero = MAGMA_Z_ZERO;

    magma_int_t nb = magma_get_zgehrd_nb(n);

    magma_int_t nh, iws, ldda, min_lblocks, max_lblocks, last_dev, d;
    magma_int_t dpanel, di, nlocal, i, i2, ib, ldwork;
    magma_int_t iinfo;
    magma_int_t lquery;
    struct zgehrd_data data;

    int ngpu = magma_num_gpus();
    
    *info = 0;
    iws = n*(nb + nb*ngpu);
    work[0] = MAGMA_Z_MAKE( iws, 0 );

    lquery = (lwork == -1);
    if (n < 0) {
        *info = -1;
    } else if (ilo < 1 || ilo > max(1,n)) {
        *info = -2;
    } else if (ihi < min(ilo,n) || ihi > n) {
        *info = -3;
    } else if (lda < max(1,n)) {
        *info = -5;
    } else if (lwork < max(1,n) && ! lquery) {
        *info = -8;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;

    // Adjust from 1-based indexing
    ilo -= 1;
    
    // Quick return if possible
    nh = ihi - ilo;
    if (nh <= 1) {
        work[0] = c_one;
        return *info;
    }
    
    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );

    // Set elements 0:ILO-1 and IHI-1:N-2 of TAU to zero
    for (i = 0; i < ilo; ++i)
        tau[i] = c_zero;

    for (i = max(0,ihi-1); i < n-1; ++i)
        tau[i] = c_zero;

    // set T to zero
    lapackf77_zlaset( "Full", &nb, &n, &c_zero, &c_zero, T, &nb );

    // set to null, to simplify cleanup code
    for( d = 0; d < ngpu; ++d ) {
        data.A[d]       = NULL;
        data.streams[d] = NULL;
    }
    
    // If not enough workspace, use unblocked code
    if ( lwork < iws ) {
        nb = 1;
    }

    if (nb == 1 || nb >= nh) {
        // Use unblocked code below
        i = ilo;
    }
    else {
        // Use blocked code
        // allocate memory on GPUs for A and workspaces
        ldda = ((n+31)/32)*32;
        min_lblocks = (n     / nb) / ngpu;
        max_lblocks = ((n-1) / nb) / ngpu + 1;
        last_dev    = (n     / nb) % ngpu;
        
        // V and Vd need to be padded for copying in mzlahr2
        data.ngpu = ngpu;
        data.ldda = ldda;
        data.ldv  = nb*max_lblocks*ngpu;
        data.ldvd = nb*max_lblocks;
        
        for( d = 0; d < ngpu; ++d ) {
            magma_setdevice( d );
            nlocal = min_lblocks*nb;
            if ( d < last_dev ) {
                nlocal += nb;
            }
            else if ( d == last_dev ) {
                nlocal += (n % nb);
            }
            
            ldwork = nlocal*ldda   // A
                   + nb*data.ldv   // V
                   + nb*data.ldvd  // Vd
                   + nb*ldda       // Y
                   + nb*ldda       // W
                   + nb*nb;        // Ti
            if ( MAGMA_SUCCESS != magma_zmalloc( &data.A[d], ldwork )) {
                *info = MAGMA_ERR_DEVICE_ALLOC;
                goto CLEANUP;
            }
            data.V [d] = data.A [d] + nlocal*ldda;
            data.Vd[d] = data.V [d] + nb*data.ldv;
            data.Y [d] = data.Vd[d] + nb*data.ldvd;
            data.W [d] = data.Y [d] + nb*ldda;
            data.Ti[d] = data.W [d] + nb*ldda;
            
            magma_queue_create( &data.streams[d] );
        }
        
        // Copy the matrix to GPUs
        magma_zsetmatrix_1D_col_bcyclic( n, n, A, lda, data.A, ldda, ngpu, nb );
        
        // round ilo down to block boundary
        ilo = (ilo/nb)*nb;
        for (i = ilo; i < ihi - 1 - nb; i += nb) {
            //   Reduce columns i:i+nb-1 to Hessenberg form, returning the
            //   matrices V and T of the block reflector H = I - V*T*V'
            //   which performs the reduction, and also the matrix Y = A*V*T
            
            //   Get the current panel (no need for the 1st iteration)
            dpanel =  (i / nb) % ngpu;
            di     = ((i / nb) / ngpu) * nb;
            if ( i > ilo ) {
                magma_setdevice( dpanel );
                magma_zgetmatrix( ihi-i, nb,
                                  dA(dpanel, i, di), ldda,
                                  A(i,i),            lda );
            }
            
            // add 1 to i for 1-based index
            magma_zlahr2_m( ihi, i+1, nb, A(0,i), lda,
                            &tau[i], &T[i*nb], nb, work, n, &data );
            
            magma_zlahru_m( n, ihi, i, nb, A, lda, &data );
            
            // copy first i rows above panel to host
            magma_setdevice( dpanel );
            magma_zgetmatrix_async( i, nb,
                                    dA(dpanel, 0, di), ldda,
                                    A(0,i),            lda, data.streams[dpanel] );
        }
        
        // Copy remainder to host, block-by-block
        for( i2 = i; i2 < n; i2 += nb ) {
            ib = min( nb, n-i2 );
            d  = (i2 / nb) % ngpu;
            di = (i2 / nb) / ngpu * nb;
            magma_setdevice( d );
            magma_zgetmatrix( n, ib,
                              dA(d, 0, di), ldda,
                              A(0,i2),      lda );
        }
    }

    // Use unblocked code to reduce the rest of the matrix
    // add 1 to i for 1-based index
    i += 1;
    lapackf77_zgehd2(&n, &i, &ihi, A, &lda, tau, work, &iinfo);
    work[0] = MAGMA_Z_MAKE( iws, 0 );
    
CLEANUP:
    for( d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magma_free( data.A[d] );
        magma_queue_destroy( data.streams[d] );
    }
    magma_setdevice( orig_dev );
    
    return *info;
} /* magma_zgehrd */
Пример #2
0
extern "C" magma_int_t
magma_zbicgstab_merge2(
    magma_z_matrix A, magma_z_matrix b,
    magma_z_matrix *x, magma_z_solver_par *solver_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;
    
    // prepare solver feedback
    solver_par->solver = Magma_BICGSTABMERGE2;
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;
    
    // solver variables
    magmaDoubleComplex alpha, beta, omega, rho_old, rho_new, *skp_h={0};
    double nom, nom0, betanom, nomb;
    //double den;

    // some useful variables
    magmaDoubleComplex c_zero = MAGMA_Z_ZERO, c_one = MAGMA_Z_ONE;
    
    magma_int_t dofs = A.num_rows;


    // workspace
    magma_z_matrix q={Magma_CSR}, r={Magma_CSR}, rr={Magma_CSR}, p={Magma_CSR}, v={Magma_CSR}, s={Magma_CSR}, t={Magma_CSR};
    magmaDoubleComplex *d1=NULL, *d2=NULL, *skp=NULL;
    d1 = NULL;
    d2 = NULL;
    skp = NULL;
    
    CHECK( magma_zmalloc( &d1, dofs*(2) ));
    CHECK( magma_zmalloc( &d2, dofs*(2) ));

    // array for the parameters
    CHECK( magma_zmalloc( &skp, 8 ));
    // skp = [alpha|beta|omega|rho_old|rho|nom|tmp1|tmp2]
    CHECK( magma_zvinit( &q, Magma_DEV, dofs*6, 1, c_zero, queue ));

    // q = rr|r|p|v|s|t
    rr.memory_location = Magma_DEV; rr.dval = NULL; rr.num_rows = rr.nnz = dofs;
    r.memory_location = Magma_DEV; r.dval = NULL; r.num_rows = r.nnz = dofs;
    p.memory_location = Magma_DEV; p.dval = NULL; p.num_rows = p.nnz = dofs;
    v.memory_location = Magma_DEV; v.dval = NULL; v.num_rows = v.nnz = dofs;
    s.memory_location = Magma_DEV; s.dval = NULL; s.num_rows = s.nnz = dofs;
    t.memory_location = Magma_DEV; t.dval = NULL; t.num_rows = t.nnz = dofs;

    rr.dval = q(0);
    r.dval = q(1);
    p.dval = q(2);
    v.dval = q(3);
    s.dval = q(4);
    t.dval = q(5);

    // solver setup
    magma_zscal( dofs, c_zero, x->dval, 1, queue );                             // x = 0
    CHECK(  magma_zresidualvec( A, b, *x, &r, &nom0, queue));
    magma_zcopy( dofs, r.dval, 1, q(0), 1, queue );                            // rr = r
    magma_zcopy( dofs, r.dval, 1, q(1), 1, queue );                            // q = r
    betanom = nom0;
    nom = nom0*nom0;
    rho_new = magma_zdotc( dofs, r.dval, 1, r.dval, 1, queue );             // rho=<rr,r>
    rho_old = omega = alpha = MAGMA_Z_MAKE( 1.0, 0. );
    beta = rho_new;
    solver_par->init_res = nom0;
    // array on host for the parameters
    CHECK( magma_zmalloc_cpu( &skp_h, 8 ));
    
    skp_h[0]=alpha;
    skp_h[1]=beta;
    skp_h[2]=omega;
    skp_h[3]=rho_old;
    skp_h[4]=rho_new;
    skp_h[5]=MAGMA_Z_MAKE(nom, 0.0);
    magma_zsetvector( 8, skp_h, 1, skp, 1, queue );

    solver_par->final_res = solver_par->init_res;
    solver_par->iter_res = solver_par->init_res;
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = nom0;
        solver_par->timing[0] = 0.0;
    }
    nomb = magma_dznrm2( dofs, b.dval, 1, queue );
    if( nom0 < solver_par->atol ||
        nom0/nomb < solver_par->rtol ){
        info = MAGMA_SUCCESS;
        goto cleanup;
    }
    
    CHECK( magma_z_spmv( c_one, A, r, c_zero, v, queue ));             // z = A r

    //Chronometry
    real_Double_t tempo1, tempo2;
    tempo1 = magma_sync_wtime( queue );


    solver_par->numiter = 0;
    solver_par->spmv_count = 0;
    // start iteration
    do
    {
        solver_par->numiter++;

        // computes p=r+beta*(p-omega*v)
        CHECK( magma_zbicgmerge1( dofs, skp, v.dval, r.dval, p.dval, queue ));
        CHECK( magma_zbicgmerge_spmv1(  A, d1, d2, q(2), q(0), q(3), skp, queue ));
        solver_par->spmv_count++;
        CHECK( magma_zbicgmerge2( dofs, skp, r.dval, v.dval, s.dval, queue )); // s=r-alpha*v
        CHECK( magma_zbicgmerge_spmv2( A, d1, d2, q(4), q(5), skp, queue ));
        solver_par->spmv_count++;
        CHECK( magma_zbicgmerge_xrbeta( dofs, d1, d2, q(0), q(1), q(2),
                                                    q(4), q(5), x->dval, skp, queue ));

        // check stopping criterion (asynchronous copy)
        magma_zgetvector( 1 , skp+5, 1, skp_h+5, 1, queue );

        betanom = sqrt(MAGMA_Z_REAL(skp_h[5]));

        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        if (  betanom  < solver_par->atol || 
              betanom/nomb < solver_par->rtol ) {
            break;
        }
    }
    while ( solver_par->numiter+1 <= solver_par->maxiter );
    
    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    double residual;
    CHECK( magma_zresidual( A, b, *x, &residual, NULL ));
    solver_par->iter_res = betanom;
    solver_par->final_res = residual;

    if ( solver_par->numiter < solver_par->maxiter ) {
        info = MAGMA_SUCCESS;
    } else if ( solver_par->init_res > solver_par->final_res ) {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_SLOW_CONVERGENCE;
        if( solver_par->iter_res < solver_par->atol ||
            solver_par->iter_res/solver_par->init_res < solver_par->rtol ){
            info = MAGMA_SUCCESS;
        }
    }
    else {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_DIVERGENCE;
    }
    
cleanup:
    magma_zmfree(&q, queue );  // frees all vectors
    magma_free(d1);
    magma_free(d2);
    magma_free( skp );
    magma_free_cpu( skp_h );

    solver_par->info = info;
    return info;
}   /* zbicgstab_merge2 */
Пример #3
0
/**
    Purpose
    -------
    ZGEGQR orthogonalizes the N vectors given by a complex M-by-N matrix A:
           
        A = Q * R.

    On exit, if successful, the orthogonal vectors Q overwrite A
    and R is given in work (on the CPU memory).
    The routine is designed for tall-and-skinny matrices: M >> N, N <= 128.
    
    This version uses normal equations and SVD in an iterative process that
    makes the computation numerically accurate.
    
    Arguments
    ---------
    @param[in]
    ikind   INTEGER
            Several versions are implemented indiceted by the ikind value:
            1:  This version uses normal equations and SVD in an iterative process
                that makes the computation numerically accurate.
            2:  This version uses a standard LAPACK-based orthogonalization through
                MAGMA's QR panel factorization (magma_zgeqr2x3_gpu) and magma_zungqr
            3:  Modified Gram-Schmidt (MGS)
            4.  Cholesky QR [ Note: this method uses the normal equations which
                                    squares the condition number of A, therefore
                                    ||I - Q'Q|| < O(eps cond(A)^2)               ]

    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  m >= n >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A. 128 >= n >= 0.

    @param[in,out]
    dA      COMPLEX_16 array on the GPU, dimension (ldda,n)
            On entry, the m-by-n matrix A.
            On exit, the m-by-n matrix Q with orthogonal columns.

    @param[in]
    ldda     INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,m).
            To benefit from coalescent memory accesses LDDA must be
            divisible by 16.

    @param
    dwork   (GPU workspace) COMPLEX_16 array, dimension:
            n^2                    for ikind = 1
            3 n^2 + min(m, n) + 2  for ikind = 2
            0 (not used)           for ikind = 3
            n^2                    for ikind = 4

    @param[out]
    work    (CPU workspace) COMPLEX_16 array, dimension 3 n^2.
            On exit, work(1:n^2) holds the rectangular matrix R.
            Preferably, for higher performance, work should be in pinned memory.
 
    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
      -     > 0:  for ikind = 4, the normal equations were not
                  positive definite, so the factorization could not be
                  completed, and the solution has not been computed.

    @ingroup magma_zgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zgegqr_gpu(
    magma_int_t ikind, magma_int_t m, magma_int_t n,
    magmaDoubleComplex_ptr dA,   magma_int_t ldda,
    magmaDoubleComplex_ptr dwork, magmaDoubleComplex *work,
    magma_int_t *info )
{
    #define work(i_,j_) (work + (i_) + (j_)*n)
    #define dA(i_,j_)   (dA   + (i_) + (j_)*ldda)
    
    magma_int_t i = 0, j, k, n2 = n*n;
    magma_int_t ione = 1;
    magmaDoubleComplex c_zero = MAGMA_Z_ZERO;
    magmaDoubleComplex c_one  = MAGMA_Z_ONE;
    double cn = 200., mins, maxs;

    /* check arguments */
    *info = 0;
    if (ikind < 1 || ikind > 4) {
        *info = -1;
    } else if (m < 0 || m < n) {
        *info = -2;
    } else if (n < 0 || n > 128) {
        *info = -3;
    } else if (ldda < max(1,m)) {
        *info = -5;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    magma_queue_t queue;
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );

    if (ikind == 1) {
        // === Iterative, based on SVD ============================================================
        magmaDoubleComplex *U, *VT, *vt, *R, *G, *hwork, *tau;
        double *S;

        R    = work;             // Size n * n
        G    = R    + n*n;       // Size n * n
        VT   = G    + n*n;       // Size n * n
        
        magma_zmalloc_cpu( &hwork, 32 + 2*n*n + 2*n );
        if ( hwork == NULL ) {
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        
        magma_int_t lwork=n*n+32; // First part f hwork; used as workspace in svd
        
        U    = hwork + n*n + 32;   // Size n*n
        S    = (double*)(U + n*n); // Size n
        tau  = U + n*n + n;        // Size n
        
        #ifdef COMPLEX
        double *rwork;
        magma_dmalloc_cpu( &rwork, 5*n );
        if ( rwork == NULL ) {
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        #endif
        
        do {
            i++;
            
            magma_zgemm( MagmaConjTrans, MagmaNoTrans, n, n, m, c_one,
                         dA, ldda, dA, ldda, c_zero, dwork, n, queue );
            magma_zgetmatrix( n, n, dwork, n, G, n, queue );
            
            lapackf77_zgesvd( "n", "a", &n, &n, G, &n, S, U, &n, VT, &n,
                              hwork, &lwork,
                              #ifdef COMPLEX
                              rwork,
                              #endif
                              info );
            
            mins = 100.f, maxs = 0.f;
            for (k=0; k < n; k++) {
                S[k] = magma_dsqrt( S[k] );
                
                if (S[k] < mins)  mins = S[k];
                if (S[k] > maxs)  maxs = S[k];
            }
            
            for (k=0; k < n; k++) {
                vt = VT + k*n;
                for (j=0; j < n; j++)
                    vt[j] *= S[j];
            }
            lapackf77_zgeqrf( &n, &n, VT, &n, tau, hwork, &lwork, info );
            
            if (i == 1)
                blasf77_zcopy( &n2, VT, &ione, R, &ione );
            else
                blasf77_ztrmm( "l", "u", "n", "n", &n, &n, &c_one, VT, &n, R, &n );
            
            magma_zsetmatrix( n, n, VT, n, dwork, n, queue );
            magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit,
                         m, n, c_one, dwork, n, dA, ldda, queue );
            if (mins > 0.00001f)
                cn = maxs/mins;
            
            //fprintf( stderr, "Iteration %d, cond num = %f \n", i, cn );
        } while (cn > 10.f);
        
        magma_free_cpu( hwork );
        #ifdef COMPLEX
        magma_free_cpu( rwork );
        #endif
        // ================== end of ikind == 1 ===================================================
    }
    else if (ikind == 2) {
        // ================== LAPACK based      ===================================================
        magma_int_t min_mn = min(m, n);
        magma_int_t nb = n;

        magmaDoubleComplex_ptr dtau = dwork + 2*n*n;
        magmaDoubleComplex_ptr d_T  = dwork;
        magmaDoubleComplex_ptr ddA  = dwork + n*n;
        magmaDoubleComplex *tau  = work+n*n;

        magmablas_zlaset( MagmaFull, n, n, c_zero, c_zero, d_T, n, queue );
        magma_zgeqr2x3_gpu( m, n, dA, ldda, dtau, d_T, ddA,
                            (double*)(dwork+min_mn+2*n*n), info );
        magma_zgetmatrix( min_mn, 1, dtau, min_mn, tau, min_mn, queue );
        magma_zgetmatrix( n, n, ddA, n, work, n, queue );
        magma_zungqr_gpu( m, n, n, dA, ldda, tau, d_T, nb, info );
        // ================== end of ikind == 2 ===================================================
    }
    else if (ikind == 3) {
        // ================== MGS               ===================================================
        for (j = 0; j < n; j++) {
            for (i = 0; i < j; i++) {
                *work(i, j) = magma_zdotc( m, dA(0,i), 1, dA(0,j), 1, queue );
                magma_zaxpy( m, -(*work(i,j)),  dA(0,i), 1, dA(0,j), 1, queue );
            }
            for (i = j; i < n; i++) {
                *work(i, j) = MAGMA_Z_ZERO;
            }
            //*work(j,j) = MAGMA_Z_MAKE( magma_dznrm2( m, dA(0,j), 1), 0., queue );
            *work(j,j) = magma_zdotc( m, dA(0,j), 1, dA(0,j), 1, queue );
            *work(j,j) = MAGMA_Z_MAKE( sqrt(MAGMA_Z_REAL( *work(j,j) )), 0. );
            magma_zscal( m, 1./ *work(j,j), dA(0,j), 1, queue );
        }
        // ================== end of ikind == 3 ===================================================
    }
    else if (ikind == 4) {
        // ================== Cholesky QR       ===================================================
        magma_zgemm( MagmaConjTrans, MagmaNoTrans, n, n, m, c_one,
                     dA, ldda, dA, ldda, c_zero, dwork, n, queue );
        magma_zgetmatrix( n, n, dwork, n, work, n, queue );
        lapackf77_zpotrf( "u", &n, work, &n, info );
        magma_zsetmatrix( n, n, work, n, dwork, n, queue );
        magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit,
                     m, n, c_one, dwork, n, dA, ldda, queue );
        // ================== end of ikind == 4 ===================================================
    }
             
    magma_queue_destroy( queue );

    return *info;
} /* magma_zgegqr_gpu */
Пример #4
0
extern "C" void
magma_zherk_mgpu2(
    magma_int_t num_gpus, magma_uplo_t uplo, magma_trans_t trans, magma_int_t nb, magma_int_t n, magma_int_t k,
    double alpha,
    magmaDoubleComplex **db, magma_int_t lddb, magma_int_t offset_b,
    double beta,
    magmaDoubleComplex **dc, magma_int_t lddc, magma_int_t offset,
    magma_int_t num_streams, magma_queue_t stream[][10])
{
#define dB(id, i, j)  (db[(id)]+(j)*lddb + (i)+offset_b)
#define dC(id, i, j)  (dc[(id)]+(j)*lddc + (i))

    const char* uplo_  = lapack_uplo_const( uplo  );
    magma_int_t i, id, ib, ii, kk, n1;
    magmaDoubleComplex z_alpha = MAGMA_Z_MAKE(alpha,0.0);
    magmaDoubleComplex z_beta  = MAGMA_Z_MAKE(beta, 0.0);

    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );
    
    /* diagonal update */
    for( i=0; i < n; i += nb ) {
        id = ((i+offset)/nb)%num_gpus;
        kk = STREAM_ID( i+offset );

        ib = min(nb, n-i);
        ii = nb*((i+offset)/(nb*num_gpus));
    }

    if (uplo == MagmaUpper) {
        for( i=0; i < n; i += nb ) {
            id = ((i+offset)/nb)%num_gpus;
            kk = STREAM_ID( i+offset );

            ib = min(nb, n-i);
            ii = nb*((i+offset)/(nb*num_gpus));
            n1 = i+ib;

            magma_setdevice(id);
            magmablasSetKernelStream(stream[id][kk]);

            /* zgemm on diag and off-diagonal blocks */
            magma_zgemm(MagmaNoTrans, MagmaConjTrans, n1, ib, k,
                        z_alpha, dB(id, 0, 0 ), lddb,
                                 dB(id, i, 0 ), lddb,
                        z_beta,  dC(id, 0, ii), lddc);
        }
    }
    else {
        for( i=0; i < n; i += nb ) {
            id = ((i+offset)/nb)%num_gpus;
            kk = STREAM_ID( i+offset );

            ib = min(nb, n-i);
            ii = nb*((i+offset)/(nb*num_gpus));
            n1 = n-i;

            magma_setdevice(id);
            magmablasSetKernelStream(stream[id][kk]);
            trace_gpu_start( id, kk, "gemm_up", "gemm_up" );
            /* zgemm on diag and off-diagonal blocks */
            magma_zgemm(MagmaNoTrans, MagmaConjTrans, n1, ib, k,
                        z_alpha, dB(id, i,         0), lddb,
                                 dB(id, i,         0), lddb,
                        z_beta,  dC(id, i+offset, ii), lddc);
            trace_gpu_end( id, kk );
        }
    }

    // TODO: why not sync?
    //for( id=0; id < num_gpus; id++ ) {
    //    magma_setdevice(id);
    //    //for( kk=0; kk < num_streams; kk++ )
    //    //    magma_queue_sync(stream[id][kk]);
    //}
    magma_setdevice( orig_dev );
    magmablasSetKernelStream( orig_stream );
}
Пример #5
0
/**
    Purpose
    -------
    ZHEGVDX computes selected eigenvalues and, optionally, eigenvectors
    of a complex generalized Hermitian-definite eigenproblem, of the form
    A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.  Here A and
    B are assumed to be Hermitian and B is also positive definite.
    Eigenvalues and eigenvectors can be selected by specifying either a
    range of values or a range of indices for the desired eigenvalues.
    If eigenvectors are desired, it uses a divide and conquer algorithm.

    The divide and conquer algorithm makes very mild assumptions about
    floating point arithmetic. It will work on machines with a guard
    digit in add/subtract, or on those binary machines without guard
    digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
    Cray-2. It could conceivably fail on hexadecimal or decimal machines
    without guard digits, but we know of none.

    Arguments
    ---------
    @param[in]
    itype   INTEGER
            Specifies the problem type to be solved:
            = 1:  A*x = (lambda)*B*x
            = 2:  A*B*x = (lambda)*x
            = 3:  B*A*x = (lambda)*x

    @param[in]
    jobz    magma_vec_t
      -     = MagmaNoVec:  Compute eigenvalues only;
      -     = MagmaVec:    Compute eigenvalues and eigenvectors.

    @param[in]
    range   magma_range_t
      -     = MagmaRangeAll: all eigenvalues will be found.
      -     = MagmaRangeV:   all eigenvalues in the half-open interval (VL,VU]
                   will be found.
      -     = MagmaRangeI:   the IL-th through IU-th eigenvalues will be found.

    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangles of A and B are stored;
      -     = MagmaLower:  Lower triangles of A and B are stored.

    @param[in]
    n       INTEGER
            The order of the matrices A and B.  N >= 0.

    @param[in,out]
    A       COMPLEX_16 array, dimension (LDA, N)
            On entry, the Hermitian matrix A.  If UPLO = MagmaUpper, the
            leading N-by-N upper triangular part of A contains the
            upper triangular part of the matrix A.  If UPLO = MagmaLower,
            the leading N-by-N lower triangular part of A contains
            the lower triangular part of the matrix A.
    \n
            On exit, if JOBZ = MagmaVec, then if INFO = 0, A contains the
            matrix Z of eigenvectors.  The eigenvectors are normalized
            as follows:
            if ITYPE = 1 or 2, Z**H*B*Z = I;
            if ITYPE = 3, Z**H*inv(B)*Z = I.
            If JOBZ = MagmaNoVec, then on exit the upper triangle (if UPLO=MagmaUpper)
            or the lower triangle (if UPLO=MagmaLower) of A, including the
            diagonal, is destroyed.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[in,out]
    B       COMPLEX_16 array, dimension (LDB, N)
            On entry, the Hermitian matrix B.  If UPLO = MagmaUpper, the
            leading N-by-N upper triangular part of B contains the
            upper triangular part of the matrix B.  If UPLO = MagmaLower,
            the leading N-by-N lower triangular part of B contains
            the lower triangular part of the matrix B.
    \n
            On exit, if INFO <= N, the part of B containing the matrix is
            overwritten by the triangular factor U or L from the Cholesky
            factorization B = U**H*U or B = L*L**H.

    @param[in]
    ldb     INTEGER
            The leading dimension of the array B.  LDB >= max(1,N).

    @param[in]
    vl      DOUBLE PRECISION
    @param[in]
    vu      DOUBLE PRECISION
            If RANGE=MagmaRangeV, the lower and upper bounds of the interval to
            be searched for eigenvalues. VL < VU.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeI.

    @param[in]
    il      INTEGER
    @param[in]
    iu      INTEGER
            If RANGE=MagmaRangeI, the indices (in ascending order) of the
            smallest and largest eigenvalues to be returned.
            1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeV.

    @param[out]
    m       INTEGER
            The total number of eigenvalues found.  0 <= M <= N.
            If RANGE = MagmaRangeAll, M = N, and if RANGE = MagmaRangeI, M = IU-IL+1.

    @param[out]
    w       DOUBLE PRECISION array, dimension (N)
            If INFO = 0, the eigenvalues in ascending order.

    @param[out]
    work    (workspace) COMPLEX_16 array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The length of the array WORK.
            If N <= 1,                      LWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LWORK >= N + N*NB.
            If JOBZ = MagmaVec   and N > 1, LWORK >= max( N + N*NB, 2*N + N**2 ).
            NB can be obtained through magma_get_zhetrd_nb(N).
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal sizes of the WORK, RWORK and
            IWORK arrays, returns these values as the first entries of
            the WORK, RWORK and IWORK arrays, and no error message
            related to LWORK or LRWORK or LIWORK is issued by XERBLA.

    @param[out]
    rwork   (workspace) DOUBLE PRECISION array, dimension (MAX(1,LRWORK))
            On exit, if INFO = 0, RWORK[0] returns the optimal LRWORK.

    @param[in]
    lrwork  INTEGER
            The dimension of the array RWORK.
            If N <= 1,                      LRWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LRWORK >= N.
            If JOBZ = MagmaVec   and N > 1, LRWORK >= 1 + 5*N + 2*N**2.
    \n
            If LRWORK = -1, then a workspace query is assumed; the
            routine only calculates the optimal sizes of the WORK, RWORK
            and IWORK arrays, returns these values as the first entries
            of the WORK, RWORK and IWORK arrays, and no error message
            related to LWORK or LRWORK or LIWORK is issued by XERBLA.

    @param[out]
    iwork   (workspace) INTEGER array, dimension (MAX(1,LIWORK))
            On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK.

    @param[in]
    liwork  INTEGER
            The dimension of the array IWORK.
            If N <= 1,                      LIWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LIWORK >= 1.
            If JOBZ = MagmaVec   and N > 1, LIWORK >= 3 + 5*N.
    \n
            If LIWORK = -1, then a workspace query is assumed; the
            routine only calculates the optimal sizes of the WORK, RWORK
            and IWORK arrays, returns these values as the first entries
            of the WORK, RWORK and IWORK arrays, and no error message
            related to LWORK or LRWORK or LIWORK is issued by XERBLA.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  ZPOTRF or ZHEEVD returned an error code:
               <= N:  if INFO = i and JOBZ = MagmaNoVec, then the algorithm
                      failed to converge; i off-diagonal elements of an
                      intermediate tridiagonal form did not converge to
                      zero;
                      if INFO = i and JOBZ = MagmaVec, then the algorithm
                      failed to compute an eigenvalue while working on
                      the submatrix lying in rows and columns INFO/(N+1)
                      through mod(INFO,N+1);
               > N:   if INFO = N + i, for 1 <= i <= N, then the leading
                      minor of order i of B is not positive definite.
                      The factorization of B could not be completed and
                      no eigenvalues or eigenvectors were computed.

    Further Details
    ---------------
    Based on contributions by
       Mark Fahey, Department of Mathematics, Univ. of Kentucky, USA

    Modified so that no backsubstitution is performed if ZHEEVD fails to
    converge (NEIG in old code could be greater than N causing out of
    bounds reference to A - reported by Ralf Meyer).  Also corrected the
    description of INFO and the test on ITYPE. Sven, 16 Feb 05.

    @ingroup magma_zhegv_driver
    ********************************************************************/
extern "C" magma_int_t
magma_zhegvdx(magma_int_t itype, magma_vec_t jobz, magma_range_t range, magma_uplo_t uplo, magma_int_t n,
              magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *B, magma_int_t ldb,
              double vl, double vu, magma_int_t il, magma_int_t iu,
              magma_int_t *m, double *w, magmaDoubleComplex *work, magma_int_t lwork, double *rwork,
              magma_int_t lrwork, magma_int_t *iwork, magma_int_t liwork, magma_int_t *info)
{
    const char* uplo_  = lapack_uplo_const( uplo  );
    const char* jobz_  = lapack_vec_const( jobz  );

    magmaDoubleComplex c_one = MAGMA_Z_ONE;

    magmaDoubleComplex *dA=NULL, *dB=NULL;
    magma_int_t ldda = n;
    magma_int_t lddb = n;

    magma_int_t lower;
    magma_trans_t trans;
    magma_int_t wantz;
    magma_int_t lquery;
    magma_int_t alleig, valeig, indeig;

    magma_int_t lwmin;
    magma_int_t liwmin;
    magma_int_t lrwmin;

    magma_queue_t stream;
    magma_queue_create( &stream );

    wantz  = (jobz == MagmaVec);
    lower  = (uplo == MagmaLower);
    alleig = (range == MagmaRangeAll);
    valeig = (range == MagmaRangeV);
    indeig = (range == MagmaRangeI);
    lquery = (lwork == -1 || lrwork == -1 || liwork == -1);

    *info = 0;
    if (itype < 1 || itype > 3) {
        *info = -1;
    } else if (! (alleig || valeig || indeig)) {
        *info = -2;
    } else if (! (wantz || (jobz == MagmaNoVec))) {
        *info = -3;
    } else if (! (lower || (uplo == MagmaUpper))) {
        *info = -4;
    } else if (n < 0) {
        *info = -5;
    } else if (lda < max(1,n)) {
        *info = -7;
    } else if (ldb < max(1,n)) {
        *info = -9;
    } else {
        if (valeig) {
            if (n > 0 && vu <= vl) {
                *info = -11;
            }
        } else if (indeig) {
            if (il < 1 || il > max(1,n)) {
                *info = -12;
            } else if (iu < min(n,il) || iu > n) {
                *info = -13;
            }
        }
    }

    magma_int_t nb = magma_get_zhetrd_nb( n );
    if ( n <= 1 ) {
        lwmin  = 1;
        lrwmin = 1;
        liwmin = 1;
    }
    else if ( wantz ) {
        lwmin  = max( n + n*nb, 2*n + n*n );
        lrwmin = 1 + 5*n + 2*n*n;
        liwmin = 3 + 5*n;
    }
    else {
        lwmin  = n + n*nb;
        lrwmin = n;
        liwmin = 1;
    }

    // multiply by 1+eps (in Double!) to ensure length gets rounded up,
    // if it cannot be exactly represented in floating point.
    real_Double_t one_eps = 1. + lapackf77_dlamch("Epsilon");
    work[0]  = MAGMA_Z_MAKE( lwmin * one_eps, 0.);  // round up
    rwork[0] = lrwmin * one_eps;
    iwork[0] = liwmin;

    if (lwork < lwmin && ! lquery) {
        *info = -17;
    } else if (lrwork < lrwmin && ! lquery) {
        *info = -19;
    } else if (liwork < liwmin && ! lquery) {
        *info = -21;
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info));
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (n == 0) {
        return *info;
    }

    /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */
    if (n <= 128) {
        #ifdef ENABLE_DEBUG
        printf("--------------------------------------------------------------\n");
        printf("  warning matrix too small N=%d NB=%d, calling lapack on CPU  \n", (int) n, (int) nb);
        printf("--------------------------------------------------------------\n");
        #endif
        lapackf77_zhegvd(&itype, jobz_, uplo_,
                         &n, A, &lda, B, &ldb,
                         w, work, &lwork,
                         #if defined(PRECISION_z) || defined(PRECISION_c)
                         rwork, &lrwork,
                         #endif
                         iwork, &liwork, info);
        *m = n;
        return *info;
    }

    if (MAGMA_SUCCESS != magma_zmalloc( &dA, n*ldda ) ||
        MAGMA_SUCCESS != magma_zmalloc( &dB, n*lddb )) {
        magma_free( dA );
        magma_free( dB );
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    /* Form a Cholesky factorization of B. */
    magma_zsetmatrix( n, n, B, ldb, dB, lddb );

    magma_zsetmatrix_async( n, n,
                            A,  lda,
                            dA, ldda, stream );

    magma_timer_t time=0;
    timer_start( time );
    magma_zpotrf_gpu(uplo, n, dB, lddb, info);
    if (*info != 0) {
        *info = n + *info;
        return *info;
    }
    timer_stop( time );
    timer_printf( "time zpotrf_gpu = %6.2f\n", time );

    magma_queue_sync( stream );
    magma_zgetmatrix_async( n, n,
                            dB, lddb,
                            B,  ldb, stream );

    timer_start( time );
    /* Transform problem to standard eigenvalue problem and solve. */
    magma_zhegst_gpu(itype, uplo, n, dA, ldda, dB, lddb, info);
    timer_stop( time );
    timer_printf( "time zhegst_gpu = %6.2f\n", time );

    /* simple fix to be able to run bigger size.
     * need to have a dwork here that will be used
     * a dB and then passed to  dsyevd.
     * */
    if (n > 5000) {
        magma_queue_sync( stream );
        magma_free( dB );
    }

    timer_start( time );
    magma_zheevdx_gpu(jobz, range, uplo, n, dA, ldda, vl, vu, il, iu, m, w, A, lda,
                      work, lwork, rwork, lrwork, iwork, liwork, info);
    timer_stop( time );
    timer_printf( "time zheevdx_gpu = %6.2f\n", time );

    if (wantz && *info == 0) {
        timer_start( time );
        
        /* allocate and copy dB back */
        if (n > 5000) {
            if (MAGMA_SUCCESS != magma_zmalloc( &dB, n*lddb ) ) {
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }
            magma_zsetmatrix( n, n, B, ldb, dB, lddb );
        }
        /* Backtransform eigenvectors to the original problem. */
        if (itype == 1 || itype == 2) {
            /* For A*x=(lambda)*B*x and A*B*x=(lambda)*x;
               backtransform eigenvectors: x = inv(L)'*y or inv(U)*y */
            if (lower) {
                trans = MagmaConjTrans;
            } else {
                trans = MagmaNoTrans;
            }

            magma_ztrsm(MagmaLeft, uplo, trans, MagmaNonUnit,
                        n, *m, c_one, dB, lddb, dA, ldda);
        }
        else if (itype == 3) {
            /* For B*A*x=(lambda)*x;
               backtransform eigenvectors: x = L*y or U'*y */
            if (lower) {
                trans = MagmaNoTrans;
            } else {
                trans = MagmaConjTrans;
            }

            magma_ztrmm(MagmaLeft, uplo, trans, MagmaNonUnit,
                        n, *m, c_one, dB, lddb, dA, ldda);
        }

        magma_zgetmatrix( n, *m, dA, ldda, A, lda );
        
        /* free dB */
        if (n > 5000) {
            magma_free( dB );
        }
        
        timer_stop( time );
        timer_printf( "time ztrsm/mm + getmatrix = %6.2f\n", time );
    }

    magma_queue_sync( stream );
    magma_queue_destroy( stream );

    work[0]  = MAGMA_Z_MAKE( lwmin * one_eps, 0.);  // round up
    rwork[0] = lrwmin * one_eps;
    iwork[0] = liwmin;

    magma_free( dA );
    if (n <= 5000) {
        magma_free( dB );
    }

    return *info;
} /* magma_zhegvdx */
Пример #6
0
extern "C" magma_int_t
magma_zpbicgstab(
    magma_z_matrix A, magma_z_matrix b, magma_z_matrix *x,
    magma_z_solver_par *solver_par,
    magma_z_preconditioner *precond_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;
    
    // prepare solver feedback
    solver_par->solver = Magma_PBICGSTAB;
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;

    // some useful variables
    magmaDoubleComplex c_zero = MAGMA_Z_ZERO;
    magmaDoubleComplex c_one  = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    
    magma_int_t dofs = A.num_rows*b.num_cols;

    // workspace
    magma_z_matrix r={Magma_CSR}, rr={Magma_CSR}, p={Magma_CSR}, v={Magma_CSR}, s={Magma_CSR}, t={Magma_CSR}, ms={Magma_CSR}, mt={Magma_CSR}, y={Magma_CSR}, z={Magma_CSR};
    CHECK( magma_zvinit( &r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &rr,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &p, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &v, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &s, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &t, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &ms,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &mt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &y, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_zvinit( &z, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));

    
    // solver variables
    magmaDoubleComplex alpha, beta, omega, rho_old, rho_new;
    double nom, betanom, nom0, r0, res, nomb;
    res=0;
    //double den;

    // solver setup
    CHECK(  magma_zresidualvec( A, b, *x, &r, &nom0, queue));
    magma_zcopy( dofs, r.dval, 1, rr.dval, 1, queue );                  // rr = r
    betanom = nom0;
    nom = nom0*nom0;
    rho_new = omega = alpha = MAGMA_Z_MAKE( 1.0, 0. );
    solver_par->init_res = nom0;

    nomb = magma_dznrm2( dofs, b.dval, 1, queue );
    if ( nomb == 0.0 ){
        nomb=1.0;
    }       
    if ( (r0 = nomb * solver_par->rtol) < ATOLERANCE ){
        r0 = ATOLERANCE;
    }
    
    solver_par->final_res = solver_par->init_res;
    solver_par->iter_res = solver_par->init_res;
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = nom0;
        solver_par->timing[0] = 0.0;
    }
    if ( nom < r0 ) {
        info = MAGMA_SUCCESS;
        goto cleanup;
    }

    //Chronometry
    real_Double_t tempo1, tempo2, tempop1, tempop2;
    tempo1 = magma_sync_wtime( queue );

    solver_par->numiter = 0;
    solver_par->spmv_count = 0;
    // start iteration
    do
    {
        solver_par->numiter++;
        rho_old = rho_new;                                    // rho_old=rho

        rho_new = magma_zdotc( dofs, rr.dval, 1, r.dval, 1, queue );  // rho=<rr,r>
        beta = rho_new/rho_old * alpha/omega;   // beta=rho/rho_old *alpha/omega
        if( magma_z_isnan_inf( beta ) ){
            info = MAGMA_DIVERGENCE;
            break;
        }
        magma_zscal( dofs, beta, p.dval, 1, queue );                 // p = beta*p
        magma_zaxpy( dofs, c_neg_one * omega * beta, v.dval, 1 , p.dval, 1, queue );
                                                        // p = p-omega*beta*v
        magma_zaxpy( dofs, c_one, r.dval, 1, p.dval, 1, queue );      // p = p+r

        // preconditioner
        tempop1 = magma_sync_wtime( queue );
        CHECK( magma_z_applyprecond_left( MagmaNoTrans, A, p, &mt, precond_par, queue ));
        CHECK( magma_z_applyprecond_right( MagmaNoTrans, A, mt, &y, precond_par, queue ));
        tempop2 = magma_sync_wtime( queue );
        precond_par->runtime += tempop2-tempop1;
        
        CHECK( magma_z_spmv( c_one, A, y, c_zero, v, queue ));      // v = Ap
        solver_par->spmv_count++;
        alpha = rho_new / magma_zdotc( dofs, rr.dval, 1, v.dval, 1, queue );
        if( magma_z_isnan_inf( alpha ) ){
            info = MAGMA_DIVERGENCE;
            break;
        }
        magma_zcopy( dofs, r.dval, 1 , s.dval, 1, queue );            // s=r
        magma_zaxpy( dofs, c_neg_one * alpha, v.dval, 1 , s.dval, 1, queue ); // s=s-alpha*v

        // preconditioner
        tempop1 = magma_sync_wtime( queue );
        CHECK( magma_z_applyprecond_left( MagmaNoTrans, A, s, &ms, precond_par, queue ));
        CHECK( magma_z_applyprecond_right( MagmaNoTrans, A, ms, &z, precond_par, queue ));
        tempop2 = magma_sync_wtime( queue );
        precond_par->runtime += tempop2-tempop1;
        
        CHECK( magma_z_spmv( c_one, A, z, c_zero, t, queue ));       // t=As
        solver_par->spmv_count++;                  
       // omega = <s,t>/<t,t>
        omega = magma_zdotc( dofs, t.dval, 1, s.dval, 1, queue )
                   / magma_zdotc( dofs, t.dval, 1, t.dval, 1, queue );

        magma_zaxpy( dofs, alpha, y.dval, 1 , x->dval, 1, queue );     // x=x+alpha*p
        magma_zaxpy( dofs, omega, z.dval, 1 , x->dval, 1, queue );     // x=x+omega*s

        magma_zcopy( dofs, s.dval, 1 , r.dval, 1, queue );             // r=s
        magma_zaxpy( dofs, c_neg_one * omega, t.dval, 1 , r.dval, 1, queue ); // r=r-omega*t
        res = betanom = magma_dznrm2( dofs, r.dval, 1, queue );

        nom = betanom*betanom;

        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        if ( res/nomb <= solver_par->rtol || res <= solver_par->atol ){
            break;
        }
    }
    while ( solver_par->numiter+1 <= solver_par->maxiter );
    
    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    double residual;
    CHECK(  magma_zresidualvec( A, b, *x, &r, &residual, queue));
    solver_par->final_res = residual;
    solver_par->iter_res = res;

    if ( solver_par->numiter < solver_par->maxiter && info == MAGMA_SUCCESS ) {
        info = MAGMA_SUCCESS;
    } else if ( solver_par->init_res > solver_par->final_res ) {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_SLOW_CONVERGENCE;
        if( solver_par->iter_res < solver_par->rtol*solver_par->init_res ||
            solver_par->iter_res < solver_par->atol ) {
            info = MAGMA_SUCCESS;
        }
    }
    else {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_DIVERGENCE;
    }
    
cleanup:
    magma_zmfree(&r, queue );
    magma_zmfree(&rr, queue );
    magma_zmfree(&p, queue );
    magma_zmfree(&v, queue );
    magma_zmfree(&s, queue );
    magma_zmfree(&t, queue );
    magma_zmfree(&ms, queue );
    magma_zmfree(&mt, queue );
    magma_zmfree(&y, queue );
    magma_zmfree(&z, queue );

    solver_par->info = info;
    return info;
}   /* magma_zbicgstab */
Пример #7
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing dgeev
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    real_Double_t   gpu_time, cpu_time;
    double *h_A, *h_R, *VL, *VR, *h_work, *w1, *w2;
    double *w1i, *w2i;
    magmaDoubleComplex *w1copy, *w2copy;
    magmaDoubleComplex  c_neg_one = MAGMA_Z_NEG_ONE;
    double tnrm, result[9];
    magma_int_t N, n2, lda, nb, lwork, info;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    double ulp, ulpinv, error;
    magma_int_t status = 0;

    ulp = lapackf77_dlamch( "P" );
    ulpinv = 1./ulp;

    magma_opts opts;
    parse_opts( argc, argv, &opts );

    // need slightly looser bound (60*eps instead of 30*eps) for some tests
    opts.tolerance = max( 60., opts.tolerance );
    double tol    = opts.tolerance * lapackf77_dlamch("E");
    double tolulp = opts.tolerance * lapackf77_dlamch("P");

    // enable at least some minimal checks, if requested
    if ( opts.check && !opts.lapack && opts.jobvl == MagmaNoVec && opts.jobvr == MagmaNoVec ) {
        fprintf( stderr, "NOTE: Some checks require vectors to be computed;\n"
                 "      set jobvl=V (option -LV), or jobvr=V (option -RV), or both.\n"
                 "      Some checks require running lapack (-l); setting lapack.\n\n");
        opts.lapack = true;
    }

    printf("    N   CPU Time (sec)   GPU Time (sec)   |W_magma - W_lapack| / |W_lapack|\n");
    printf("===========================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            N = opts.nsize[itest];
            lda   = N;
            n2    = lda*N;
            nb    = magma_get_dgehrd_nb(N);
            lwork = N*(2 + nb);
            // generous workspace - required by dget22
            lwork = max( lwork, N*(5 + 2*N) );

            TESTING_MALLOC_CPU( w1copy, magmaDoubleComplex, N );
            TESTING_MALLOC_CPU( w2copy, magmaDoubleComplex, N );
            TESTING_MALLOC_CPU( w1,  double, N  );
            TESTING_MALLOC_CPU( w2,  double, N  );
            TESTING_MALLOC_CPU( w1i, double, N  );
            TESTING_MALLOC_CPU( w2i, double, N  );
            TESTING_MALLOC_CPU( h_A, double, n2 );

            TESTING_MALLOC_PIN( h_R, double, n2 );
            TESTING_MALLOC_PIN( VL,  double, n2 );
            TESTING_MALLOC_PIN( VR,  double, n2 );
            TESTING_MALLOC_PIN( h_work, double, lwork );

            /* Initialize the matrix */
            lapackf77_dlarnv( &ione, ISEED, &n2, h_A );
            lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda );

            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            gpu_time = magma_wtime();
            magma_dgeev( opts.jobvl, opts.jobvr,
                         N, h_R, lda, w1, w1i,
                         VL, lda, VR, lda,
                         h_work, lwork, opts.queue, &info );
            gpu_time = magma_wtime() - gpu_time;
            if (info != 0)
                printf("magma_dgeev returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));

            /* =====================================================================
               Check the result
               =================================================================== */
            if ( opts.check ) {
                /* ===================================================================
                 * Check the result following LAPACK's [zcds]drvev routine.
                 * The following tests are performed:
                 * (1)   | A * VR - VR * W | / ( n |A| )
                 *
                 *       Here VR is the matrix of unit right eigenvectors.
                 *       W is a diagonal matrix with diagonal entries W(j).
                 *
                 * (2)   | |VR(i)| - 1 |   and whether largest component real
                 *
                 *       VR(i) denotes the i-th column of VR.
                 *
                 * (3)   | A**T * VL - VL * W**T | / ( n |A| )
                 *
                 *       Here VL is the matrix of unit left eigenvectors, A**T is the
                 *       transpose of A, and W is as above.
                 *
                 * (4)   | |VL(i)| - 1 |   and whether largest component real
                 *
                 *       VL(i) denotes the i-th column of VL.
                 *
                 * (5)   W(full) = W(partial, W only) -- currently skipped
                 * (6)   W(full) = W(partial, W and VR)
                 * (7)   W(full) = W(partial, W and VL)
                 *
                 *       W(full) denotes the eigenvalues computed when both VR and VL
                 *       are also computed, and W(partial) denotes the eigenvalues
                 *       computed when only W, only W and VR, or only W and VL are
                 *       computed.
                 *
                 * (8)   VR(full) = VR(partial, W and VR)
                 *
                 *       VR(full) denotes the right eigenvectors computed when both VR
                 *       and VL are computed, and VR(partial) denotes the result
                 *       when only VR is computed.
                 *
                 * (9)   VL(full) = VL(partial, W and VL)
                 *
                 *       VL(full) denotes the left eigenvectors computed when both VR
                 *       and VL are also computed, and VL(partial) denotes the result
                 *       when only VL is computed.
                 *
                 * (1, 2) only if jobvr = V
                 * (3, 4) only if jobvl = V
                 * (5-9)  only if check = 2 (option -c2)
                 ================================================================= */
                double vmx, vrmx, vtst;

                // Initialize result. -1 indicates test was not run.
                for( int j = 0; j < 9; ++j )
                    result[j] = -1.;

                if ( opts.jobvr == MagmaVec ) {
                    // Do test 1: | A * VR - VR * W | / ( n |A| )
                    // Note this writes result[1] also
                    lapackf77_dget22( MagmaNoTransStr, MagmaNoTransStr, MagmaNoTransStr,
                                      &N, h_A, &lda, VR, &lda, w1, w1i,
                                      h_work, &result[0] );
                    result[0] *= ulp;

                    // Do test 2: | |VR(i)| - 1 |   and whether largest component real
                    result[1] = -1.;
                    for( int j = 0; j < N; ++j ) {
                        tnrm = 1.;
                        if (w1i[j] == 0.)
                            tnrm = magma_cblas_dnrm2( N, &VR[j*lda], ione );
                        else if (w1i[j] > 0.)
                            tnrm = magma_dlapy2( magma_cblas_dnrm2( N, &VR[j*lda],     ione ),
                                                 magma_cblas_dnrm2( N, &VR[(j+1)*lda], ione ));

                        result[1] = max( result[1], min( ulpinv, MAGMA_D_ABS(tnrm-1.)/ulp ));

                        if (w1i[j] > 0.) {
                            vmx  = vrmx = 0.;
                            for( int jj = 0; jj < N; ++jj ) {
                                vtst = magma_dlapy2( VR[jj+j*lda], VR[jj+(j+1)*lda]);
                                if (vtst > vmx)
                                    vmx = vtst;

                                if ( (VR[jj + (j+1)*lda])==0. &&
                                        MAGMA_D_ABS( VR[jj+j*lda] ) > vrmx)
                                {
                                    vrmx = MAGMA_D_ABS( VR[jj+j*lda] );
                                }
                            }
                            if (vrmx / vmx < 1. - ulp*2.)
                                result[1] = ulpinv;
                        }
                    }
                    result[1] *= ulp;
                }

                if ( opts.jobvl == MagmaVec ) {
                    // Do test 3: | A**T * VL - VL * W**T | / ( n |A| )
                    // Note this writes result[3] also
                    lapackf77_dget22( MagmaTransStr, MagmaNoTransStr, MagmaTransStr,
                                      &N, h_A, &lda, VL, &lda, w1, w1i,
                                      h_work, &result[2] );
                    result[2] *= ulp;

                    // Do test 4: | |VL(i)| - 1 |   and whether largest component real
                    result[3] = -1.;
                    for( int j = 0; j < N; ++j ) {
                        tnrm = 1.;
                        if (w1i[j] == 0.)
                            tnrm = magma_cblas_dnrm2( N, &VL[j*lda], ione );
                        else if (w1i[j] > 0.)
                            tnrm = magma_dlapy2( magma_cblas_dnrm2( N, &VL[j*lda],     ione ),
                                                 magma_cblas_dnrm2( N, &VL[(j+1)*lda], ione ));

                        result[3] = max( result[3], min( ulpinv, MAGMA_D_ABS(tnrm-1.)/ulp ));

                        if (w1i[j] > 0.) {
                            vmx  = vrmx = 0.;
                            for( int jj = 0; jj < N; ++jj ) {
                                vtst = magma_dlapy2( VL[jj+j*lda], VL[jj+(j+1)*lda]);
                                if (vtst > vmx)
                                    vmx = vtst;

                                if ( (VL[jj + (j+1)*lda])==0. &&
                                        MAGMA_D_ABS( VL[jj+j*lda]) > vrmx)
                                {
                                    vrmx = MAGMA_D_ABS( VL[jj+j*lda] );
                                }
                            }
                            if (vrmx / vmx < 1. - ulp*2.)
                                result[3] = ulpinv;
                        }
                    }
                    result[3] *= ulp;
                }
            }
            if ( opts.check == 2 ) {
                // more extensive tests
                // this is really slow because it calls magma_zgeev multiple times
                double *LRE, DUM;
                TESTING_MALLOC_PIN( LRE, double, n2 );

                lapackf77_dlarnv( &ione, ISEED, &n2, h_A );
                lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda );

                // ----------
                // Compute eigenvalues, left and right eigenvectors
                magma_dgeev( MagmaVec, MagmaVec,
                             N, h_R, lda, w1, w1i,
                             VL, lda, VR, lda,
                             h_work, lwork, opts.queue, &info );
                if (info != 0)
                    printf("magma_zgeev (case V, V) returned error %d: %s.\n",
                           (int) info, magma_strerror( info ));

                // ----------
                // Compute eigenvalues only
                // These are not exactly equal, and not in the same order, so skip for now.
                //lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda );
                //magma_dgeev( MagmaNoVec, MagmaNoVec,
                //             N, h_R, lda, w2, w2i,
                //             &DUM, 1, &DUM, 1,
                //             h_work, lwork, opts.queue, &info );
                //if (info != 0)
                //    printf("magma_dgeev (case N, N) returned error %d: %s.\n",
                //           (int) info, magma_strerror( info ));
                //
                //// Do test 5: W(full) = W(partial, W only)
                //result[4] = 1;
                //for( int j = 0; j < N; ++j )
                //    if ( w1[j] != w2[j] || w1i[j] != w2i[j] )
                //        result[4] = 0;

                // ----------
                // Compute eigenvalues and right eigenvectors
                lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda );
                magma_dgeev( MagmaNoVec, MagmaVec,
                             N, h_R, lda, w2, w2i,
                             &DUM, 1, LRE, lda,
                             h_work, lwork, opts.queue, &info );
                if (info != 0)
                    printf("magma_dgeev (case N, V) returned error %d: %s.\n",
                           (int) info, magma_strerror( info ));

                // Do test 6: W(full) = W(partial, W and VR)
                result[5] = 1;
                for( int j = 0; j < N; ++j )
                    if ( w1[j] != w2[j] || w1i[j] != w2i[j] )
                        result[5] = 0;

                // Do test 8: VR(full) = VR(partial, W and VR)
                result[7] = 1;
                for( int j = 0; j < N; ++j )
                    for( int jj = 0; jj < N; ++jj )
                        if ( ! MAGMA_D_EQUAL( VR[j+jj*lda], LRE[j+jj*lda] ))
                            result[7] = 0;

                // ----------
                // Compute eigenvalues and left eigenvectors
                lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda );
                magma_dgeev( MagmaVec, MagmaNoVec,
                             N, h_R, lda, w2, w2i,
                             LRE, lda, &DUM, 1,
                             h_work, lwork, opts.queue, &info );
                if (info != 0)
                    printf("magma_dgeev (case V, N) returned error %d: %s.\n",
                           (int) info, magma_strerror( info ));

                // Do test 7: W(full) = W(partial, W and VL)
                result[6] = 1;
                for( int j = 0; j < N; ++j )
                    if ( w1[j] != w2[j] || w1i[j] != w2i[j] )
                        result[6] = 0;

                // Do test 9: VL(full) = VL(partial, W and VL)
                result[8] = 1;
                for( int j = 0; j < N; ++j )
                    for( int jj = 0; jj < N; ++jj )
                        if ( ! MAGMA_D_EQUAL( VL[j+jj*lda], LRE[j+jj*lda] ))
                            result[8] = 0;

                TESTING_FREE_PIN( LRE );
            }

            /* =====================================================================
               Performs operation using LAPACK
               Do this after checks, because it overwrites VL and VR.
               =================================================================== */
            if ( opts.lapack ) {
                cpu_time = magma_wtime();
                lapackf77_dgeev( lapack_vec_const(opts.jobvl), lapack_vec_const(opts.jobvr),
                                 &N, h_A, &lda, w2, w2i,
                                 VL, &lda, VR, &lda,
                                 h_work, &lwork, &info );
                cpu_time = magma_wtime() - cpu_time;
                if (info != 0)
                    printf("lapackf77_dgeev returned error %d: %s.\n",
                           (int) info, magma_strerror( info ));

                // check | W_magma - W_lapack | / | W |
                // need to sort eigenvalues first
                // copy them into complex vectors for ease
                for( int j=0; j < N; ++j ) {
                    w1copy[j] = MAGMA_Z_MAKE( w1[j], w1i[j] );
                    w2copy[j] = MAGMA_Z_MAKE( w2[j], w2i[j] );
                }
                std::sort( w1copy, &w1copy[N], lessthan );
                std::sort( w2copy, &w2copy[N], lessthan );

                // adjust sorting to deal with numerical inaccuracy
                // search down w2 for eigenvalue that matches w1's eigenvalue
                for( int j=0; j < N; ++j ) {
                    for( int j2=j; j2 < N; ++j2 ) {
                        magmaDoubleComplex diff = MAGMA_Z_SUB( w1copy[j], w2copy[j2] );
                        double diff2 = magma_dzlapy2( diff ) / max( magma_dzlapy2( w1copy[j] ), tol );
                        if ( diff2 < 100*tol ) {
                            if ( j != j2 ) {
                                std::swap( w2copy[j], w2copy[j2] );
                            }
                            break;
                        }
                    }
                }

                blasf77_zaxpy( &N, &c_neg_one, w2copy, &ione, w1copy, &ione );
                error  = magma_cblas_dznrm2( N, w1copy, 1 );
                error /= magma_cblas_dznrm2( N, w2copy, 1 );

                printf("%5d   %7.2f          %7.2f          %8.2e   %s\n",
                       (int) N, cpu_time, gpu_time,
                       error, (error < tolulp ? "ok" : "failed"));
                status += ! (error < tolulp);
            }
            else {
                printf("%5d     ---            %7.2f\n",
                       (int) N, gpu_time);
            }
            if ( opts.check ) {
                // -1 indicates test was not run
                if ( result[0] != -1 ) {
                    printf("        | A * VR - VR * W | / ( n |A| ) = %8.2e   %s\n", result[0], (result[0] < tol ? "ok" : "failed"));
                }
                if ( result[1] != -1 ) {
                    printf("        |  |VR(i)| - 1    |             = %8.2e   %s\n", result[1], (result[1] < tol ? "ok" : "failed"));
                }
                if ( result[2] != -1 ) {
                    printf("        | A'* VL - VL * W'| / ( n |A| ) = %8.2e   %s\n", result[2], (result[2] < tol ? "ok" : "failed"));
                }
                if ( result[3] != -1 ) {
                    printf("        |  |VL(i)| - 1    |             = %8.2e   %s\n", result[3], (result[3] < tol ? "ok" : "failed"));
                }
                if ( result[4] != -1 ) {
                    printf("        W  (full) == W  (partial, W only)           %s\n",         (result[4] == 1. ? "ok" : "failed"));
                }
                if ( result[5] != -1 ) {
                    printf("        W  (full) == W  (partial, W and VR)         %s\n",         (result[5] == 1. ? "ok" : "failed"));
                }
                if ( result[6] != -1 ) {
                    printf("        W  (full) == W  (partial, W and VL)         %s\n",         (result[6] == 1. ? "ok" : "failed"));
                }
                if ( result[7] != -1 ) {
                    printf("        VR (full) == VR (partial, W and VR)         %s\n",         (result[7] == 1. ? "ok" : "failed"));
                }
                if ( result[8] != -1 ) {
                    printf("        VL (full) == VL (partial, W and VL)         %s\n",         (result[8] == 1. ? "ok" : "failed"));
                }

                int newline = 0;
                if ( result[0] != -1 ) {
                    status += ! (result[0] < tol);
                    newline = 1;
                }
                if ( result[1] != -1 ) {
                    status += ! (result[1] < tol);
                    newline = 1;
                }
                if ( result[2] != -1 ) {
                    status += ! (result[2] < tol);
                    newline = 1;
                }
                if ( result[3] != -1 ) {
                    status += ! (result[3] < tol);
                    newline = 1;
                }
                if ( result[4] != -1 ) {
                    status += ! (result[4] == 1.);
                    newline = 1;
                }
                if ( result[5] != -1 ) {
                    status += ! (result[5] == 1.);
                    newline = 1;
                }
                if ( result[6] != -1 ) {
                    status += ! (result[6] == 1.);
                    newline = 1;
                }
                if ( result[7] != -1 ) {
                    status += ! (result[7] == 1.);
                    newline = 1;
                }
                if ( result[8] != -1 ) {
                    status += ! (result[8] == 1.);
                    newline = 1;
                }
                if ( newline ) {
                    printf( "\n" );
                }
            }

            TESTING_FREE_CPU( w1copy );
            TESTING_FREE_CPU( w2copy );
            TESTING_FREE_CPU( w1  );
            TESTING_FREE_CPU( w2  );
            TESTING_FREE_CPU( w1i );
            TESTING_FREE_CPU( w2i );
            TESTING_FREE_CPU( h_A );

            TESTING_FREE_PIN( h_R );
            TESTING_FREE_PIN( VL  );
            TESTING_FREE_PIN( VR  );
            TESTING_FREE_PIN( h_work );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    TESTING_FINALIZE();
    return status;
}
Пример #8
0
int main(int argc, char **argv)
{
    TESTING_INIT();

    real_Double_t   gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf, cpu_time;
    double          magma_error, cublas_error, work[1];
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t N, lda, ldda, sizeA, sizeX, sizeY, blocks, ldwork;
    magma_int_t incx = 1;
    magma_int_t incy = 1;
    magma_int_t nb   = 64;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex alpha = MAGMA_Z_MAKE(  1.5, -2.3 );
    magmaDoubleComplex beta  = MAGMA_Z_MAKE( -0.6,  0.8 );
    magmaDoubleComplex *A, *X, *Y, *Ycublas, *Ymagma;
    magmaDoubleComplex *dA, *dX, *dY, *dwork;
    magma_int_t status = 0;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    double tol = opts.tolerance * lapackf77_dlamch("E");

    printf("uplo = %s\n", lapack_uplo_const(opts.uplo) );
    if ( opts.uplo == MagmaUpper ) {
        printf("** for uplo=MagmaUpper, magmablas_zhemv simply calls cublas_zhemv.\n");
    }
    printf("    N   MAGMA Gflop/s (ms)  CUBLAS Gflop/s (ms)   CPU Gflop/s (ms)  MAGMA error  CUBLAS error\n");
    printf("=============================================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            N = opts.nsize[itest];
            lda    = N;
            ldda   = ((N + 31)/32)*32;
            sizeA  = N*lda;
            sizeX  = N*incx;
            sizeY  = N*incy;
            gflops = FLOPS_ZHEMV( N ) / 1e9;
            
            TESTING_MALLOC_CPU( A,       magmaDoubleComplex, sizeA );
            TESTING_MALLOC_CPU( X,       magmaDoubleComplex, sizeX );
            TESTING_MALLOC_CPU( Y,       magmaDoubleComplex, sizeY );
            TESTING_MALLOC_CPU( Ycublas, magmaDoubleComplex, sizeY );
            TESTING_MALLOC_CPU( Ymagma,  magmaDoubleComplex, sizeY );
            
            TESTING_MALLOC_DEV( dA, magmaDoubleComplex, ldda*N );
            TESTING_MALLOC_DEV( dX, magmaDoubleComplex, sizeX );
            TESTING_MALLOC_DEV( dY, magmaDoubleComplex, sizeY );
            
            blocks = (N + nb - 1) / nb;
            ldwork = ldda*blocks;
            TESTING_MALLOC_DEV( dwork, magmaDoubleComplex, ldwork );
            
            magmablas_zlaset( MagmaFull, ldwork, 1, MAGMA_Z_NAN, MAGMA_Z_NAN, dwork, ldwork );
            magmablas_zlaset( MagmaFull, ldda,   N, MAGMA_Z_NAN, MAGMA_Z_NAN, dA,    ldda   );
            
            /* Initialize the matrix */
            lapackf77_zlarnv( &ione, ISEED, &sizeA, A );
            magma_zmake_hermitian( N, A, lda );
            lapackf77_zlarnv( &ione, ISEED, &sizeX, X );
            lapackf77_zlarnv( &ione, ISEED, &sizeY, Y );
            
            /* =====================================================================
               Performs operation using CUBLAS
               =================================================================== */
            magma_zsetmatrix( N, N, A, lda, dA, ldda );
            magma_zsetvector( N, X, incx, dX, incx );
            magma_zsetvector( N, Y, incy, dY, incy );
            
            cublas_time = magma_sync_wtime( 0 );
            cublasZhemv( handle, cublas_uplo_const(opts.uplo),
                         N, &alpha, dA, ldda, dX, incx, &beta, dY, incy );
            cublas_time = magma_sync_wtime( 0 ) - cublas_time;
            cublas_perf = gflops / cublas_time;
            
            magma_zgetvector( N, dY, incy, Ycublas, incy );
            
            /* =====================================================================
               Performs operation using MAGMABLAS
               =================================================================== */
            magma_zsetvector( N, Y, incy, dY, incy );
            
            //magma_zprint_gpu( ldda, blocks, dwork, ldda );
            
            magma_time = magma_sync_wtime( 0 );
            magmablas_zhemv_work( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy, dwork, ldwork );
            // TODO provide option to test non-work interface
            //magmablas_zhemv( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy );
            magma_time = magma_sync_wtime( 0 ) - magma_time;
            magma_perf = gflops / magma_time;
            
            magma_zgetvector( N, dY, incy, Ymagma, incy );
            
            //magma_zprint_gpu( ldda, blocks, dwork, ldda );
            
            /* =====================================================================
               Performs operation using CPU BLAS
               =================================================================== */
            cpu_time = magma_wtime();
            blasf77_zhemv( lapack_uplo_const(opts.uplo), &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy );
            cpu_time = magma_wtime() - cpu_time;
            cpu_perf = gflops / cpu_time;
            
            /* =====================================================================
               Check the result
               =================================================================== */
            blasf77_zaxpy( &N, &c_neg_one, Y, &incy, Ymagma, &incy );
            magma_error = lapackf77_zlange( "M", &N, &ione, Ymagma, &N, work ) / N;
            
            blasf77_zaxpy( &N, &c_neg_one, Y, &incy, Ycublas, &incy );
            cublas_error = lapackf77_zlange( "M", &N, &ione, Ycublas, &N, work ) / N;
            
            printf("%5d   %7.2f (%7.2f)    %7.2f (%7.2f)   %7.2f (%7.2f)    %8.2e     %8.2e   %s\n",
                   (int) N,
                   magma_perf,  1000.*magma_time,
                   cublas_perf, 1000.*cublas_time,
                   cpu_perf,    1000.*cpu_time,
                   magma_error, cublas_error,
                   (magma_error < tol && cublas_error < tol ? "ok" : "failed"));
            status += ! (magma_error < tol && cublas_error < tol);
            
            TESTING_FREE_CPU( A );
            TESTING_FREE_CPU( X );
            TESTING_FREE_CPU( Y );
            TESTING_FREE_CPU( Ycublas );
            TESTING_FREE_CPU( Ymagma  );
            
            TESTING_FREE_DEV( dA );
            TESTING_FREE_DEV( dX );
            TESTING_FREE_DEV( dY );
            TESTING_FREE_DEV( dwork );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    TESTING_FINALIZE();
    return status;
}
Пример #9
0
extern "C" magma_int_t
magma_zhetrd_gpu(char uplo, magma_int_t n,
                 magmaDoubleComplex *da, magma_int_t ldda,
                 double *d, double *e, magmaDoubleComplex *tau,
                 magmaDoubleComplex *wa,  magma_int_t ldwa,
                 magmaDoubleComplex *work, magma_int_t lwork,
                 magma_int_t *info)
{
/*  -- MAGMA (version 1.4.1) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       December 2013

    Purpose
    =======
    ZHETRD_GPU reduces a complex Hermitian matrix A to real symmetric
    tridiagonal form T by an orthogonal similarity transformation:
    Q**H * A * Q = T.

    Arguments
    =========
    UPLO    (input) CHARACTER*1
            = 'U':  Upper triangle of A is stored;
            = 'L':  Lower triangle of A is stored.

    N       (input) INTEGER
            The order of the matrix A.  N >= 0.

    DA      (device input/output) COMPLEX_16 array on the GPU, dimension (LDA,N)
            On entry, the Hermitian matrix A.  If UPLO = 'U', the leading
            N-by-N upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = 'L', the
            leading N-by-N lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
            On exit, if UPLO = 'U', the diagonal and first superdiagonal
            of A are overwritten by the corresponding elements of the
            tridiagonal matrix T, and the elements above the first
            superdiagonal, with the array TAU, represent the orthogonal
            matrix Q as a product of elementary reflectors; if UPLO
            = 'L', the diagonal and first subdiagonal of A are over-
            written by the corresponding elements of the tridiagonal
            matrix T, and the elements below the first subdiagonal, with
            the array TAU, represent the orthogonal matrix Q as a product
            of elementary reflectors. See Further Details.

    LDDA    (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    D       (output) COMPLEX_16 array, dimension (N)
            The diagonal elements of the tridiagonal matrix T:
            D(i) = A(i,i).

    E       (output) COMPLEX_16 array, dimension (N-1)
            The off-diagonal elements of the tridiagonal matrix T:
            E(i) = A(i,i+1) if UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'.

    TAU     (output) COMPLEX_16 array, dimension (N-1)
            The scalar factors of the elementary reflectors (see Further
            Details).

    WA      (workspace/output) COMPLEX_16 array, dimension (LDA,N)
            On exit the diagonal, the  upper part (UPLO='U')
            or the lower part (UPLO='L') are copies of DA

    LDWA    (input) INTEGER
            The leading dimension of the array WA.  LDWA >= max(1,N).

    WORK    (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.

    LWORK   (input) INTEGER
            The dimension of the array WORK.  LWORK >= N*NB, where NB is the
            optimal blocksize given by magma_get_zhetrd_nb().

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value

    Further Details
    ===============
    If UPLO = 'U', the matrix Q is represented as a product of elementary
    reflectors

       Q = H(n-1) . . . H(2) H(1).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in
    A(1:i-1,i+1), and tau in TAU(i).

    If UPLO = 'L', the matrix Q is represented as a product of elementary
    reflectors

       Q = H(1) H(2) . . . H(n-1).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i),
    and tau in TAU(i).

    The contents of A on exit are illustrated by the following examples
    with n = 5:

    if UPLO = 'U':                       if UPLO = 'L':

      (  d   e   v2  v3  v4 )              (  d                  )
      (      d   e   v3  v4 )              (  e   d              )
      (          d   e   v4 )              (  v1  e   d          )
      (              d   e  )              (  v1  v2  e   d      )
      (                  d  )              (  v1  v2  v3  e   d  )

    where d and e denote diagonal and off-diagonal elements of T, and vi
    denotes an element of the vector defining H(i).
    =====================================================================    */
    
    char uplo_[2] = {uplo, 0};

    magma_int_t nb = magma_get_zhetrd_nb(n);

    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    double          d_one     = MAGMA_D_ONE;
    
    magma_int_t kk, nx;
    magma_int_t i, j, i_n;
    magma_int_t iinfo;
    magma_int_t ldw, lddw, lwkopt;
    magma_int_t lquery;

    *info = 0;
    int upper = lapackf77_lsame(uplo_, "U");
    lquery = lwork == -1;
    if (! upper && ! lapackf77_lsame(uplo_, "L")) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,n)) {
        *info = -4;
    } else if (ldwa < max(1,n)) {
        *info = -9;
    } else if (lwork < nb*n && ! lquery) {
        *info = -11;
    }

    /* Determine the block size. */
    ldw = lddw = n;
    lwkopt = n * nb;
    if (*info == 0) {
        work[0] = MAGMA_Z_MAKE( lwkopt, 0 );
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;

    /* Quick return if possible */
    if (n == 0) {
        work[0] = c_one;
        return *info;
    }

    magmaDoubleComplex *dwork;
    
    if (n < 2048)
        nx = n;
    else
        nx = 512;
    
    if (MAGMA_SUCCESS != magma_zmalloc( &dwork, (ldw*nb) )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    if (upper) {
        /*  Reduce the upper triangle of A.
         Columns 1:kk are handled by the unblocked method. */
        kk = n - (n - nx + nb - 1) / nb * nb;
        
        for (i = n - nb; i >= kk; i -= nb)
        {
            /* Reduce columns i:i+nb-1 to tridiagonal form and form the
             matrix W which is needed to update the unreduced part of
             the matrix */
            
            /*   Get the current panel */
            magma_zgetmatrix( i+nb, nb, dA(0, i), ldda, A(0, i), ldwa );
            
            magma_zlatrd(uplo, i+nb, nb, A(0, 0), ldwa, e, tau,
                         work, ldw, dA(0, 0), ldda, dwork, lddw);
            
            /* Update the unreduced submatrix A(0:i-2,0:i-2), using an
               update of the form:  A := A - V*W' - W*V' */
            
            magma_zsetmatrix( i + nb, nb, work, ldw, dwork, lddw );
            
            magma_zher2k(uplo, MagmaNoTrans, i, nb, c_neg_one,
                         dA(0, i), ldda, dwork,
                         lddw, d_one, dA(0, 0), ldda);
            
            /* Copy superdiagonal elements back into A, and diagonal
               elements into D */
            for (j = i; j < i+nb; ++j) {
                *A(j-1,j) = MAGMA_Z_MAKE( e[j - 1], 0 );
                d[j] = MAGMA_Z_REAL( *A(j, j) );
            }
        }
        
        magma_zgetmatrix( kk, kk, dA(0, 0), ldda, A(0, 0), ldwa );
        
        /*  Use CPU code to reduce the last or only block */
        lapackf77_zhetrd(uplo_, &kk, A(0, 0), &ldwa, d, e, tau, work, &lwork, &iinfo);
        
        magma_zsetmatrix( kk, kk, A(0, 0), ldwa, dA(0, 0), ldda );
    }
    else
    {
        /* Reduce the lower triangle of A */
        for (i = 0; i < n-nx; i += nb)
        {
            /* Reduce columns i:i+nb-1 to tridiagonal form and form the
             matrix W which is needed to update the unreduced part of
             the matrix */
            
            /*   Get the current panel */
            magma_zgetmatrix( n-i, nb, dA(i, i), ldda, A(i, i), ldwa );
            
            magma_zlatrd(uplo, n-i, nb, A(i, i), ldwa, &e[i],
                         &tau[i], work, ldw,
                         dA(i, i), ldda,
                         dwork, lddw);
            
            /* Update the unreduced submatrix A(i+ib:n,i+ib:n), using
             an update of the form:  A := A - V*W' - W*V' */
            
            magma_zsetmatrix( n-i, nb, work, ldw, dwork, lddw );
            
            magma_zher2k(MagmaLower, MagmaNoTrans, n-i-nb, nb, c_neg_one,
                         dA(i+nb, i), ldda,
                         &dwork[nb], lddw, d_one,
                         dA(i+nb, i+nb), ldda);
            
            /* Copy subdiagonal elements back into A, and diagonal
               elements into D */
            for (j = i; j < i+nb; ++j) {
                *A(j+1,j) = MAGMA_Z_MAKE( e[j], 0 );
                d[j] = MAGMA_Z_REAL( *A(j, j) );
            }
        }
        /* Use unblocked code to reduce the last or only block */
        
        magma_zgetmatrix( n-i, n-i, dA(i, i), ldda, A(i, i), ldwa );
        
        i_n = n-i;
        lapackf77_zhetrd(uplo_, &i_n, A(i, i), &ldwa, &d[i], &e[i],
                         &tau[i], work, &lwork, &iinfo);
        
        magma_zsetmatrix( n-i, n-i, A(i, i), ldwa, dA(i, i), ldda );
    }
    
    magma_free( dwork );
    work[0] = MAGMA_Z_MAKE( lwkopt, 0 );
    return *info;
} /* zhetrd_gpu */
Пример #10
0
extern "C" magma_int_t
magma_zgeqrf3(magma_context *cntxt, magma_int_t m, magma_int_t n, 
          cuDoubleComplex *a,    magma_int_t lda, cuDoubleComplex *tau, 
          cuDoubleComplex *work, magma_int_t lwork,
          magma_int_t *info )
{
/*  -- MAGMA (version 1.6.1) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date January 2015

    Purpose   
    =======
    ZGEQRF computes a QR factorization of a COMPLEX_16 M-by-N matrix A:   
    A = Q * R.   

    Arguments   
    =========   
    M       (input) INTEGER   
            The number of rows of the matrix A.  M >= 0.   

    N       (input) INTEGER   
            The number of columns of the matrix A.  N >= 0.   

    A       (input/output) COMPLEX_16 array, dimension (LDA,N)   
            On entry, the M-by-N matrix A.   
            On exit, the elements on and above the diagonal of the array   
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is   
            upper triangular if m >= n); the elements below the diagonal,   
            with the array TAU, represent the orthogonal matrix Q as a   
            product of min(m,n) elementary reflectors (see Further   
            Details).   

    LDA     (input) INTEGER   
            The leading dimension of the array A.  LDA >= max(1,M).   

    TAU     (output) COMPLEX_16 array, dimension (min(M,N))   
            The scalar factors of the elementary reflectors (see Further   
            Details).   

    WORK    (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK))   
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.   

    LWORK   (input) INTEGER   
            The dimension of the array WORK.  LWORK >= N*NB. 

            If LWORK = -1, then a workspace query is assumed; the routine   
            only calculates the optimal size of the WORK array, returns   
            this value as the first entry of the WORK array, and no error   
            message related to LWORK is issued.

    INFO    (output) INTEGER   
            = 0:  successful exit   
            < 0:  if INFO = -i, the i-th argument had an illegal value   

    Further Details   
    ===============   
    The matrix Q is represented as a product of elementary reflectors   

       Q = H(1) H(2) . . . H(k), where k = min(m,n).   

    Each H(i) has the form   

       H(i) = I - tau * v * v'   

    where tau is a complex scalar, and v is a complex vector with   
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),   
    and tau in TAU(i).
    ====================================================================    */

    cuDoubleComplex c_one = MAGMA_Z_ONE;
    int k, ib;
    magma_qr_params *qr_params = (magma_qr_params *)cntxt->params;

    *info = 0;

    int lwkopt = n * qr_params->nb;
    work[0] = MAGMA_Z_MAKE( (double)lwkopt, 0 );
    long int lquery = (lwork == -1);
    if (m < 0) {
      *info = -1;
    } else if (n < 0) {
      *info = -2;
    } else if (lda < max(1,m)) {
      *info = -4;
    } else if (lwork < max(1,n) && ! lquery) {
      *info = -7;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return MAGMA_ERR_ILLEGAL_VALUE;
    }
    else if (lquery)
      return MAGMA_SUCCESS;

    k = min(m,n);
    if (k == 0) {
      work[0] = c_one;
      return MAGMA_SUCCESS;
    }
    
    int M=qr_params->nthreads*qr_params->ob;
    int N=qr_params->nthreads*qr_params->ob;
    
    if (qr_params->m > qr_params->n) 
      M = qr_params->m - (qr_params->n-qr_params->nthreads*qr_params->ob);
    
    /* Use MAGMA code to factor left portion of matrix, waking up threads 
       along the way to perform updates on the right portion of matrix */
    magma_zgeqrf2(cntxt, m, n - qr_params->nthreads * qr_params->ob, 
          a, lda, tau, work, lwork, info);

    /* Wait for all update threads to finish */
    for (k = 0; k < qr_params->nthreads; k++) {
      while (qr_params->sync1[k] == 0) {
        sched_yield();
      }
    }
    
    /* Unzero upper part of each panel */
    for (k = 0; k < qr_params->np_gpu-1; k++){
      ib = min(qr_params->nb,(n-qr_params->nthreads*qr_params->ob)-qr_params->nb*k);
      zq_to_panel(MagmaUpper, ib, a + k*qr_params->nb*lda + k*qr_params->nb, lda, 
          qr_params->w+qr_params->nb*qr_params->nb*k);
    }

    /* Use final blocking size */
    qr_params->nb = qr_params->fb;

    /* Flag MAGMA code to internally unzero upper part of each panel */
    qr_params->flag = 1;
    
    /* Use MAGMA code to perform final factorization if necessary */
    if (qr_params->m > (qr_params->n - (qr_params->nthreads*qr_params->ob)))

      if (M > (qr_params->m-(qr_params->n-(qr_params->ob*qr_params->nthreads))))
        M = qr_params->m-(qr_params->n-(qr_params->ob*qr_params->nthreads));

      magma_zgeqrf2(cntxt, M, N,
            a + (n-qr_params->nthreads*qr_params->ob)*m+
            (n-qr_params->nthreads*qr_params->ob), lda, 
            &tau[n-qr_params->nthreads*qr_params->ob],
            work, lwork, info);

    /* Prepare for next run */
    for (k = 0; k < qr_params->np_gpu; k++) {
      qr_params->p[k] = NULL;
    }

    for (k = 0; k < qr_params->nthreads; k++) {
      qr_params->sync1[k] = 0;
    }

    /* Infrastructure for next run is not in place yet */
    qr_params->sync0 = 0;

    /* Signal update threads to get in position for next run */
    qr_params->sync2 = 1;
}
Пример #11
0
/**
    Purpose
    -------
    ZUNMQR overwrites the general complex M-by-N matrix C with

    @verbatim
                    SIDE = MagmaLeft     SIDE = MagmaRight
    TRANS = MagmaNoTrans:      Q * C          C * Q
    TRANS = MagmaTrans:      Q**H * C       C * Q**H
    @endverbatim

    where Q is a complex unitary matrix defined as the product of k
    elementary reflectors

          Q = H(1) H(2) . . . H(k)

    as returned by ZGEQRF. Q is of order M if SIDE = MagmaLeft and of order N
    if SIDE = MagmaRight.

    Arguments
    ---------
    @param[in]
    nrgpu   INTEGER
            Number of GPUs to use.

    @param[in]
    side    magma_side_t
      -     = MagmaLeft:      apply Q or Q**H from the Left;
      -     = MagmaRight:     apply Q or Q**H from the Right.

    @param[in]
    trans   magma_trans_t
      -     = MagmaNoTrans:    No transpose, apply Q;
      -     = MagmaTrans:      Transpose, apply Q**H.

    @param[in]
    m       INTEGER
            The number of rows of the matrix C. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix C. N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines
            the matrix Q.
            If SIDE = MagmaLeft,  M >= K >= 0;
            if SIDE = MagmaRight, N >= K >= 0.

    @param[in]
    A       COMPLEX_16 array, dimension (LDA,K)
            The i-th column must contain the vector which defines the
            elementary reflector H(i), for i = 1,2,...,k, as returned by
            ZGEQRF in the first k columns of its array argument A.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.
            If SIDE = MagmaLeft,  LDA >= max(1,M);
            if SIDE = MagmaRight, LDA >= max(1,N).

    @param[in]
    tau     COMPLEX_16 array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by ZGEQRF.

    @param[in,out]
    C       COMPLEX_16 array, dimension (LDC,N)
            On entry, the M-by-N matrix C.
            On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q.

    @param[in]
    ldc     INTEGER
            The leading dimension of the array C. LDC >= max(1,M).

    @param[out]
    work    (workspace) COMPLEX_16 array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK(0) returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.
            If SIDE = MagmaLeft,  LWORK >= max(1,N);
            if SIDE = MagmaRight, LWORK >= max(1,M).
            For optimum performance LWORK >= N*NB if SIDE = MagmaLeft, and
            LWORK >= M*NB if SIDE = MagmaRight, where NB is the optimal
            blocksize.
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_zgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zunmqr_m(magma_int_t nrgpu, magma_side_t side, magma_trans_t trans,
               magma_int_t m, magma_int_t n, magma_int_t k,
               magmaDoubleComplex *A,    magma_int_t lda,
               magmaDoubleComplex *tau,
               magmaDoubleComplex *C,    magma_int_t ldc,
               magmaDoubleComplex *work, magma_int_t lwork,
               magma_int_t *info)
{
#define  A(i, j) (A + (j)*lda  + (i))
#define  C(i, j) (C + (j)*ldc  + (i))

#define    dC(gpui,      i, j) (dw[gpui] + (j)*lddc + (i))
#define  dA_c(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddac)
#define  dA_r(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddar)
#define    dT(gpui, ind)       (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + (ind)*((nb+1)*nb))
#define dwork(gpui, ind)       (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + 2*((nb+1)*nb) + (ind)*(lddwork*nb))

    magmaDoubleComplex c_zero = MAGMA_Z_ZERO;
    magmaDoubleComplex c_one  = MAGMA_Z_ONE;

    const char* side_  = lapack_side_const( side );
    const char* trans_ = lapack_trans_const( trans );

    magma_int_t nb = 128;
    magmaDoubleComplex *T;
    magma_zmalloc_pinned(&T, nb*nb);
    //printf("calling zunmqr_m with nb=%d\n", (int) nb);

    magmaDoubleComplex* dw[MagmaMaxGPUs];
    magma_queue_t stream [MagmaMaxGPUs][2];
    magma_event_t  event [MagmaMaxGPUs][2];

    magma_int_t ind_c;

    magma_int_t igpu = 0;
    int gpu_b;
    magma_getdevice(&gpu_b);

    *info = 0;

    magma_int_t left   = (side == MagmaLeft);
    magma_int_t notran = (trans == MagmaNoTrans);
    magma_int_t lquery = (lwork == -1);

    /* NQ is the order of Q and NW is the minimum dimension of WORK */
    magma_int_t nq, nw;
    if (left) {
        nq = m;
        nw = n;
    } else {
        nq = n;
        nw = m;
    }


    if (! left && side != MagmaRight) {
        *info = -1;
    } else if (! notran && trans != MagmaTrans) {
        *info = -2;
    } else if (m < 0) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (k < 0 || k > nq) {
        *info = -5;
    } else if (lda < max(1,nq)) {
        *info = -7;
    } else if (ldc < max(1,m)) {
        *info = -10;
    } else if (lwork < max(1,nw) && ! lquery) {
        *info = -12;
    }

    magma_int_t lwkopt = max(1,nw) * nb;
    if (*info == 0) {
        work[0] = MAGMA_Z_MAKE( lwkopt, 0 );
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0 || k == 0) {
        work[0] = c_one;
        return *info;
    }

    if (nb >= k) {
        /* Use CPU code */
        lapackf77_zunmqr(side_, trans_, &m, &n, &k, A, &lda, tau,
                         C, &ldc, work, &lwork, info);
        return *info;
    }

    magma_int_t lddc = (m+63)/64*64;
    magma_int_t lddac = nq;
    magma_int_t lddar = nb;
    magma_int_t lddwork = nw;

    magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 };

    magma_int_t nb_l=256;
    magma_int_t nbl = (n-1)/nb_l+1; // number of blocks
    magma_int_t maxnlocal = (nbl+nrgpu-1)/nrgpu*nb_l;

    nrgpu = min(nrgpu, (n+nb_l-1)/nb_l); // Don't use GPU that will not have data.

    magma_int_t ldw = maxnlocal*lddc // dC
                    + 2*lddac*lddar // 2*dA
                    + 2*(nb + 1 + lddwork)*nb; // 2*(dT and dwork)

    for (igpu = 0; igpu < nrgpu; ++igpu) {
        magma_setdevice(igpu);
        if (MAGMA_SUCCESS != magma_zmalloc( &dw[igpu], ldw )) {
            magma_xerbla( __func__, -(*info) );
            *info = MAGMA_ERR_DEVICE_ALLOC;

            return *info;
        }
        magma_queue_create( &stream[igpu][0] );
        magma_queue_create( &stream[igpu][1] );
        magma_event_create( &event[igpu][0] );
        magma_event_create( &event[igpu][1] );
    }

    /* Use hybrid CPU-MGPU code */
    if (left) {
        //copy C to mgpus
        for (magma_int_t i = 0; i < nbl; ++i) {
            magma_int_t igpu = i%nrgpu;
            magma_setdevice(igpu);
            magma_int_t kb = min(nb_l, n-i*nb_l);
            magma_zsetmatrix_async( m, kb,
                                   C(0, i*nb_l), ldc,
                                   dC(igpu, 0, i/nrgpu*nb_l), lddc, stream[igpu][0] );
            nlocal[igpu] += kb;
        }

        magma_int_t i1, i2, i3;
        if ( !notran ) {
            i1 = 0;
            i2 = k;
            i3 = nb;
        } else {
            i1 = (k - 1) / nb * nb;
            i2 = 0;
            i3 = -nb;
        }

        ind_c = 0;

        for (magma_int_t i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) {
            // start the copy of A panel
            magma_int_t kb = min(nb, k - i);
            for (igpu = 0; igpu < nrgpu; ++igpu) {
                magma_setdevice(igpu);
                magma_event_sync(event[igpu][ind_c]); // check if the new data can be copied
                magma_zsetmatrix_async(nq-i, kb,
                                       A(i, i),                 lda,
                                       dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0] );
                // set upper triangular part of dA to identity
                magmablas_zlaset_band_stream( MagmaUpper, kb, kb, kb, c_zero, c_one, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0] );
            }

            /* Form the triangular factor of the block reflector
             H = H(i) H(i+1) . . . H(i+ib-1) */
            magma_int_t nqi = nq - i;
            lapackf77_zlarft("F", "C", &nqi, &kb, A(i, i), &lda,
                             &tau[i], T, &kb);

            /* H or H' is applied to C(1:m,i:n) */

            /* Apply H or H'; First copy T to the GPU */
            for (igpu = 0; igpu < nrgpu; ++igpu) {
                magma_setdevice(igpu);
                magma_zsetmatrix_async(kb, kb,
                                       T,               kb,
                                       dT(igpu, ind_c), kb, stream[igpu][0] );
            }

            for (igpu = 0; igpu < nrgpu; ++igpu) {
                magma_setdevice(igpu);
                magma_queue_sync( stream[igpu][0] ); // check if the data was copied
                magmablasSetKernelStream(stream[igpu][1]);
                magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
                                 m-i, nlocal[igpu], kb,
                                 dA_c(igpu, ind_c, i, 0), lddac, dT(igpu, ind_c), kb,
                                 dC(igpu, i, 0), lddc,
                                 dwork(igpu, ind_c), lddwork);
                magma_event_record(event[igpu][ind_c], stream[igpu][1] );
            }

            ind_c = (ind_c+1)%2;
        }

        for (igpu = 0; igpu < nrgpu; ++igpu) {
            magma_setdevice(igpu);
            magma_queue_sync( stream[igpu][1] );
        }

        //copy C from mgpus
        for (magma_int_t i = 0; i < nbl; ++i) {
            magma_int_t igpu = i%nrgpu;
            magma_setdevice(igpu);
            magma_int_t kb = min(nb_l, n-i*nb_l);
            magma_zgetmatrix( m, kb,
                              dC(igpu, 0, i/nrgpu*nb_l), lddc,
                              C(0, i*nb_l), ldc );
//            magma_zgetmatrix_async( m, kb,
//                                   dC(igpu, 0, i/nrgpu*nb_l), lddc,
//                                   C(0, i*nb_l), ldc, stream[igpu][0] );
        }
    } else {
        fprintf(stderr, "The case (side == right) is not implemented\n");
        *info = MAGMA_ERR_NOT_IMPLEMENTED;
        magma_xerbla( __func__, -(*info) );
        return *info;
        /*
        if ( notran ) {
            i1 = 0;
            i2 = k;
            i3 = nb;
        } else {
            i1 = (k - 1) / nb * nb;
            i2 = 0;
            i3 = -nb;
        }

        mi = m;
        ic = 0;

        for (i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) {
            ib = min(nb, k - i);
            
            // Form the triangular factor of the block reflector
            // H = H(i) H(i+1) . . . H(i+ib-1)
            i__4 = nq - i;
            lapackf77_zlarft("F", "C", &i__4, &ib, A(i, i), &lda,
            &tau[i], T, &ib);
            
            // 1) copy the panel from A to the GPU, and
            // 2) set upper triangular part of dA to identity
            magma_zsetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda );
            magmablas_zlaset_band( MagmaUpper, ib, ib, ib, c_zero, c_one, dA(i, 0), ldda );
            
            // H or H' is applied to C(1:m,i:n)
            ni = n - i;
            jc = i;
            
            // Apply H or H'; First copy T to the GPU
            magma_zsetmatrix( ib, ib, T, ib, dT, ib );
            magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
            mi, ni, ib,
            dA(i, 0), ldda, dT, ib,
            dC(ic, jc), lddc,
            dwork, lddwork);
        }
        */
    }

    work[0] = MAGMA_Z_MAKE( lwkopt, 0 );

    for (igpu = 0; igpu < nrgpu; ++igpu) {
        magma_setdevice(igpu);
        magmablasSetKernelStream(NULL);
        magma_event_destroy( event[igpu][0] );
        magma_event_destroy( event[igpu][1] );
        magma_queue_destroy( stream[igpu][0] );
        magma_queue_destroy( stream[igpu][1] );
        magma_free( dw[igpu] );
    }

    magma_setdevice(gpu_b);

    return *info;
} /* magma_zunmqr */
Пример #12
0
extern "C" magma_int_t
magma_zcg(
    magma_z_sparse_matrix A, magma_z_vector b, magma_z_vector *x,  
    magma_z_solver_par *solver_par,
    magma_queue_t queue )
{
    // set queue for old dense routines
    magma_queue_t orig_queue;
    magmablasGetKernelStream( &orig_queue );

    // prepare solver feedback
    solver_par->solver = Magma_CG;
    solver_par->numiter = 0;
    solver_par->info = MAGMA_SUCCESS; 

    // local variables
    magmaDoubleComplex c_zero = MAGMA_Z_ZERO, c_one = MAGMA_Z_ONE;
    
    magma_int_t dofs = A.num_rows;

    // GPU workspace
    magma_z_vector r, p, q;
    magma_z_vinit( &r, Magma_DEV, dofs, c_zero, queue );
    magma_z_vinit( &p, Magma_DEV, dofs, c_zero, queue );
    magma_z_vinit( &q, Magma_DEV, dofs, c_zero, queue );
    
    // solver variables
    magmaDoubleComplex alpha, beta;
    double nom, nom0, r0, betanom, betanomsq, den;

    // solver setup
    magma_zscal( dofs, c_zero, x->dval, 1) ;                     // x = 0
    magma_zcopy( dofs, b.dval, 1, r.dval, 1 );                    // r = b
    magma_zcopy( dofs, b.dval, 1, p.dval, 1 );                    // p = b
    nom0 = betanom = magma_dznrm2( dofs, r.dval, 1 );           
    nom  = nom0 * nom0;                                // nom = r' * r
    magma_z_spmv( c_one, A, p, c_zero, q, queue );                     // q = A p
    den = MAGMA_Z_REAL( magma_zdotc(dofs, p.dval, 1, q.dval, 1) );// den = p dot q
    solver_par->init_res = nom0;
    
    if ( (r0 = nom * solver_par->epsilon) < ATOLERANCE ) 
        r0 = ATOLERANCE;
    if ( nom < r0 ) {
        magmablasSetKernelStream( orig_queue );
        return MAGMA_SUCCESS;
    }
    // check positive definite
    if (den <= 0.0) {
        printf("Operator A is not postive definite. (Ar,r) = %f\n", den);
        magmablasSetKernelStream( orig_queue );
        return MAGMA_NONSPD;
        solver_par->info = MAGMA_NONSPD;
    }

    //Chronometry
    real_Double_t tempo1, tempo2;
    tempo1 = magma_sync_wtime( queue );
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = (real_Double_t)nom0;
        solver_par->timing[0] = 0.0;
    }
    
    // start iteration
    for( solver_par->numiter= 1; solver_par->numiter<solver_par->maxiter; 
                                                    solver_par->numiter++ ) {
        alpha = MAGMA_Z_MAKE(nom/den, 0.);
        magma_zaxpy(dofs,  alpha, p.dval, 1, x->dval, 1);     // x = x + alpha p
        magma_zaxpy(dofs, -alpha, q.dval, 1, r.dval, 1);      // r = r - alpha q
        betanom = magma_dznrm2(dofs, r.dval, 1);             // betanom = || r ||
        betanomsq = betanom * betanom;                      // betanoms = r' * r

        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] 
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose] 
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        if (  betanom  < r0 ) {
            break;
        }

        beta = MAGMA_Z_MAKE(betanomsq/nom, 0.);           // beta = betanoms/nom
        magma_zscal(dofs, beta, p.dval, 1);                // p = beta*p
        magma_zaxpy(dofs, c_one, r.dval, 1, p.dval, 1);     // p = p + r 
        magma_z_spmv( c_one, A, p, c_zero, q, queue );           // q = A p
        den = MAGMA_Z_REAL(magma_zdotc(dofs, p.dval, 1, q.dval, 1));    
                // den = p dot q
        nom = betanomsq;
    } 
    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    double residual;
    magma_zresidual( A, b, *x, &residual, queue );
    solver_par->final_res = residual;

    if ( solver_par->numiter < solver_par->maxiter) {
        solver_par->info = MAGMA_SUCCESS;
    } else if ( solver_par->init_res > solver_par->final_res ) {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] 
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose] 
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        solver_par->info = MAGMA_SLOW_CONVERGENCE;
    }
    else {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] 
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose] 
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        solver_par->info = MAGMA_DIVERGENCE;
    }
    magma_z_vfree(&r, queue );
    magma_z_vfree(&p, queue );
    magma_z_vfree(&q, queue );

    magmablasSetKernelStream( orig_queue );
    return MAGMA_SUCCESS;
}   /* magma_zcg */
Пример #13
0
extern "C" magma_int_t
magma_zgelqf( magma_int_t m, magma_int_t n,
              magmaDoubleComplex *a,    magma_int_t lda,   magmaDoubleComplex *tau,
              magmaDoubleComplex *work, magma_int_t lwork, magma_int_t *info)
{
/*  -- MAGMA (version 1.4.1) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       December 2013

    Purpose
    =======
    ZGELQF computes an LQ factorization of a COMPLEX_16 M-by-N matrix A:
    A = L * Q.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) COMPLEX_16 array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, the elements on and below the diagonal of the array
            contain the m-by-min(m,n) lower trapezoidal matrix L (L is
            lower triangular if m <= n); the elements above the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of elementary reflectors (see Further Details).

            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    TAU     (output) COMPLEX_16 array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    WORK    (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.

            Higher performance is achieved if WORK is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    LWORK   (input) INTEGER
            The dimension of the array WORK.  LWORK >= max(1,M).
            For optimum performance LWORK >= M*NB, where NB is the
            optimal blocksize.

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  if INFO = -10 internal GPU memory allocation failed.

    Further Details
    ===============
    The matrix Q is represented as a product of elementary reflectors

       Q = H(k) . . . H(2) H(1), where k = min(m,n).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:n) is stored on exit in A(i,i+1:n),
    and tau in TAU(i).
    =====================================================================    */

    #define  a_ref(a_1,a_2) ( a+(a_2)*(lda) + (a_1))

    magmaDoubleComplex *dA, *dAT;
    magmaDoubleComplex c_one = MAGMA_Z_ONE;
    magma_int_t maxm, maxn, maxdim, nb;
    magma_int_t iinfo, ldda;
    int lquery;

    /* Function Body */
    *info = 0;
    nb = magma_get_zgelqf_nb(m);

    work[0] = MAGMA_Z_MAKE( (double)(m*nb), 0 );
    lquery = (lwork == -1);
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,m)) {
        *info = -4;
    } else if (lwork < max(1,m) && ! lquery) {
        *info = -7;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /*  Quick return if possible */
    if (min(m, n) == 0) {
        work[0] = c_one;
        return *info;
    }

    maxm = ((m + 31)/32)*32;
    maxn = ((n + 31)/32)*32;
    maxdim = max(maxm, maxn);

    if (maxdim*maxdim < 2*maxm*maxn)
        {
            ldda = maxdim;

            if (MAGMA_SUCCESS != magma_zmalloc( &dA, maxdim*maxdim )) {
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }

            magma_zsetmatrix( m, n, a, lda, dA, ldda );
            dAT = dA;
            magmablas_ztranspose_inplace( ldda, dAT, ldda );
        }
    else
        {
            ldda = maxn;

            if (MAGMA_SUCCESS != magma_zmalloc( &dA, 2*maxn*maxm )) {
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }

            magma_zsetmatrix( m, n, a, lda, dA, maxm );

            dAT = dA + maxn * maxm;
            magmablas_ztranspose2( dAT, ldda, dA, maxm, m, n );
        }

    magma_zgeqrf2_gpu(n, m, dAT, ldda, tau, &iinfo);

    if (maxdim*maxdim < 2*maxm*maxn) {
        magmablas_ztranspose_inplace( ldda, dAT, ldda );
        magma_zgetmatrix( m, n, dA, ldda, a, lda );
    } else {
        magmablas_ztranspose2( dA, maxm, dAT, ldda, n, m );
        magma_zgetmatrix( m, n, dA, maxm, a, lda );
    }

    magma_free( dA );

    return *info;
} /* magma_zgelqf */
Пример #14
0
/**
    Purpose
    -------
    ZHEEVD_2STAGE computes all eigenvalues and, optionally, eigenvectors of a
    complex Hermitian matrix A. It uses a two-stage algorithm for the tridiagonalization.
    If eigenvectors are desired, it uses a divide and conquer algorithm.

    The divide and conquer algorithm makes very mild assumptions about
    floating point arithmetic. It will work on machines with a guard
    digit in add/subtract, or on those binary machines without guard
    digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
    Cray-2. It could conceivably fail on hexadecimal or decimal machines
    without guard digits, but we know of none.

    Arguments
    ---------
    @param[in]
    nrgpu   INTEGER
            Number of GPUs to use.

    @param[in]
    jobz    magma_vec_t
      -     = MagmaNoVec:  Compute eigenvalues only;
      -     = MagmaVec:    Compute eigenvalues and eigenvectors.

    @param[in]
    range   magma_range_t
      -     = MagmaRangeAll: all eigenvalues will be found.
      -     = MagmaRangeV:   all eigenvalues in the half-open interval (VL,VU]
                   will be found.
      -     = MagmaRangeI:   the IL-th through IU-th eigenvalues will be found.

    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of A is stored;
      -     = MagmaLower:  Lower triangle of A is stored.

    @param[in]
    n       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in,out]
    A       COMPLEX_16 array, dimension (LDA, N)
            On entry, the Hermitian matrix A.  If UPLO = MagmaUpper, the
            leading N-by-N upper triangular part of A contains the
            upper triangular part of the matrix A.  If UPLO = MagmaLower,
            the leading N-by-N lower triangular part of A contains
            the lower triangular part of the matrix A.
            On exit, if JOBZ = MagmaVec, then if INFO = 0, the first m columns
            of A contains the required
            orthonormal eigenvectors of the matrix A.
            If JOBZ = MagmaNoVec, then on exit the lower triangle (if UPLO=MagmaLower)
            or the upper triangle (if UPLO=MagmaUpper) of A, including the
            diagonal, is destroyed.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[in]
    vl      DOUBLE PRECISION
    @param[in]
    vu      DOUBLE PRECISION
            If RANGE=MagmaRangeV, the lower and upper bounds of the interval to
            be searched for eigenvalues. VL < VU.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeI.

    @param[in]
    il      INTEGER
    @param[in]
    iu      INTEGER
            If RANGE=MagmaRangeI, the indices (in ascending order) of the
            smallest and largest eigenvalues to be returned.
            1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeV.

    @param[out]
    m       INTEGER
            The total number of eigenvalues found.  0 <= M <= N.
            If RANGE = MagmaRangeAll, M = N, and if RANGE = MagmaRangeI, M = IU-IL+1.

    @param[out]
    w       DOUBLE PRECISION array, dimension (N)
            If INFO = 0, the required m eigenvalues in ascending order.

    @param[out]
    work    (workspace) COMPLEX_16 array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The length of the array WORK.
            If N <= 1,                      LWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LWORK >= LQ2 + N + N*NB.
            If JOBZ = MagmaVec   and N > 1, LWORK >= LQ2 + 2*N + N**2.
            where LQ2 is the size needed to store the Q2 matrix
            and is returned by magma_bulge_get_lq2.
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal sizes of the WORK, RWORK and
            IWORK arrays, returns these values as the first entries of
            the WORK, RWORK and IWORK arrays, and no error message
            related to LWORK or LRWORK or LIWORK is issued by XERBLA.

    @param[out]
    rwork   (workspace) DOUBLE PRECISION array,
                                           dimension (LRWORK)
            On exit, if INFO = 0, RWORK[0] returns the optimal LRWORK.

    @param[in]
    lrwork  INTEGER
            The dimension of the array RWORK.
            If N <= 1,                      LRWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LRWORK >= N.
            If JOBZ = MagmaVec   and N > 1, LRWORK >= 1 + 5*N + 2*N**2.
    \n
            If LRWORK = -1, then a workspace query is assumed; the
            routine only calculates the optimal sizes of the WORK, RWORK
            and IWORK arrays, returns these values as the first entries
            of the WORK, RWORK and IWORK arrays, and no error message
            related to LWORK or LRWORK or LIWORK is issued by XERBLA.

    @param[out]
    iwork   (workspace) INTEGER array, dimension (MAX(1,LIWORK))
            On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK.

    @param[in]
    liwork  INTEGER
            The dimension of the array IWORK.
            If N <= 1,                      LIWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LIWORK >= 1.
            If JOBZ = MagmaVec   and N > 1, LIWORK >= 3 + 5*N.
    \n
            If LIWORK = -1, then a workspace query is assumed; the
            routine only calculates the optimal sizes of the WORK, RWORK
            and IWORK arrays, returns these values as the first entries
            of the WORK, RWORK and IWORK arrays, and no error message
            related to LWORK or LRWORK or LIWORK is issued by XERBLA.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  if INFO = i and JOBZ = MagmaNoVec, then the algorithm failed
                  to converge; i off-diagonal elements of an intermediate
                  tridiagonal form did not converge to zero;
                  if INFO = i and JOBZ = MagmaVec, then the algorithm failed
                  to compute an eigenvalue while working on the submatrix
                  lying in rows and columns INFO/(N+1) through
                  mod(INFO,N+1).

    Further Details
    ---------------
    Based on contributions by
       Jeff Rutter, Computer Science Division, University of California
       at Berkeley, USA

    Modified description of INFO. Sven, 16 Feb 05.

    @ingroup magma_zheev_driver
    ********************************************************************/
extern "C" magma_int_t
magma_zheevdx_2stage_m(magma_int_t nrgpu, magma_vec_t jobz, magma_range_t range, magma_uplo_t uplo,
                       magma_int_t n,
                       magmaDoubleComplex *A, magma_int_t lda,
                       double vl, double vu, magma_int_t il, magma_int_t iu,
                       magma_int_t *m, double *w,
                       magmaDoubleComplex *work, magma_int_t lwork,
                       double *rwork, magma_int_t lrwork,
                       magma_int_t *iwork, magma_int_t liwork,
                       magma_int_t *info)
{
    #define A( i_,j_) (A  + (i_) + (j_)*lda)
    #define A2(i_,j_) (A2 + (i_) + (j_)*lda2)
    
    const char* uplo_  = lapack_uplo_const( uplo  );
    const char* jobz_  = lapack_vec_const( jobz  );
    magmaDoubleComplex c_one  = MAGMA_Z_ONE;
    double d_one = 1.;
    magma_int_t ione = 1;
    magma_int_t izero = 0;

    double d__1;

    double eps;
    double anrm;
    magma_int_t imax;
    double rmin, rmax;
    double sigma;
    //magma_int_t iinfo;
    magma_int_t lwmin, lrwmin, liwmin;
    magma_int_t lower;
    magma_int_t wantz;
    magma_int_t iscale;
    double safmin;
    double bignum;
    double smlnum;
    magma_int_t lquery;
    magma_int_t alleig, valeig, indeig;
    magma_int_t len;

    /* determine the number of threads */
    magma_int_t parallel_threads = magma_get_parallel_numthreads();

    wantz = (jobz == MagmaVec);
    lower = (uplo == MagmaLower);

    alleig = (range == MagmaRangeAll);
    valeig = (range == MagmaRangeV);
    indeig = (range == MagmaRangeI);

    lquery = (lwork == -1 || lrwork == -1 || liwork == -1);

    *info = 0;
    if (! (wantz || (jobz == MagmaNoVec))) {
        *info = -1;
    } else if (! (alleig || valeig || indeig)) {
        *info = -2;
    } else if (! (lower || (uplo == MagmaUpper))) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (lda < max(1,n)) {
        *info = -6;
    } else {
        if (valeig) {
            if (n > 0 && vu <= vl) {
                *info = -8;
            }
        } else if (indeig) {
            if (il < 1 || il > max(1,n)) {
                *info = -9;
            } else if (iu < min(n,il) || iu > n) {
                *info = -10;
            }
        }
    }

    magma_int_t nb = magma_get_zbulge_nb(n, parallel_threads);
    magma_int_t Vblksiz = magma_zbulge_get_Vblksiz(n, nb, parallel_threads);

    magma_int_t ldt = Vblksiz;
    magma_int_t ldv = nb + Vblksiz;
    magma_int_t blkcnt = magma_bulge_get_blkcnt(n, nb, Vblksiz);
    magma_int_t lq2 = magma_zbulge_get_lq2(n, parallel_threads);

    if (wantz) {
        lwmin  = lq2 + 2*n + n*n;
        lrwmin = 1 + 5*n + 2*n*n;
        liwmin = 5*n + 3;
    } else {
        lwmin  = lq2 + n + n*nb;
        lrwmin = n;
        liwmin = 1;
    }

    // multiply by 1+eps (in Double!) to ensure length gets rounded up,
    // if it cannot be exactly represented in floating point.
    real_Double_t one_eps = 1. + lapackf77_dlamch("Epsilon");
    work[0]  = MAGMA_Z_MAKE( lwmin * one_eps, 0.);  // round up
    rwork[0] = lrwmin * one_eps;
    iwork[0] = liwmin;

    if ((lwork < lwmin) && !lquery) {
        *info = -14;
    } else if ((lrwork < lrwmin) && ! lquery) {
        *info = -16;
    } else if ((liwork < liwmin) && ! lquery) {
        *info = -18;
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (n == 0) {
        return *info;
    }

    if (n == 1) {
        w[0] = MAGMA_Z_REAL(A[0]);
        if (wantz) {
            A[0] = MAGMA_Z_ONE;
        }
        return *info;
    }

    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    
    timer_printf("using %d parallel_threads\n", (int) parallel_threads);

    /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */
    magma_int_t ntiles = n/nb;
    if ( ( ntiles < 2 ) || ( n <= 128 ) ) {
        #ifdef ENABLE_DEBUG
        printf("--------------------------------------------------------------\n");
        printf("  warning matrix too small N=%d NB=%d, calling lapack on CPU  \n", (int) n, (int) nb);
        printf("--------------------------------------------------------------\n");
        #endif
        lapackf77_zheevd(jobz_, uplo_, &n,
                         A, &lda, w,
                         work, &lwork,
                         #if defined(PRECISION_z) || defined(PRECISION_c)
                         rwork, &lrwork,
                         #endif
                         iwork, &liwork,
                         info);
        *m = n;
        return *info;
    }
    
    /* Get machine constants. */
    safmin = lapackf77_dlamch("Safe minimum");
    eps = lapackf77_dlamch("Precision");
    smlnum = safmin / eps;
    bignum = 1. / smlnum;
    rmin = magma_dsqrt(smlnum);
    rmax = magma_dsqrt(bignum);

    /* Scale matrix to allowable range, if necessary. */
    anrm = lapackf77_zlanhe("M", uplo_, &n, A, &lda, rwork);
    iscale = 0;
    if (anrm > 0. && anrm < rmin) {
        iscale = 1;
        sigma = rmin / anrm;
    } else if (anrm > rmax) {
        iscale = 1;
        sigma = rmax / anrm;
    }
    if (iscale == 1) {
        lapackf77_zlascl(uplo_, &izero, &izero, &d_one, &sigma, &n, &n, A,
                         &lda, info);
    }

    magma_int_t indT2   = 0;
    magma_int_t indTAU2 = indT2  + blkcnt*ldt*Vblksiz;
    magma_int_t indV2   = indTAU2+ blkcnt*Vblksiz;
    magma_int_t indtau1 = indV2  + blkcnt*ldv*Vblksiz;
    magma_int_t indwrk  = indtau1+ n;
    magma_int_t indwk2  = indwrk + n*n;
    magma_int_t llwork = lwork - indwrk;
    magma_int_t llwrk2 = lwork - indwk2;
    magma_int_t inde = 0;
    magma_int_t indrwk = inde + n;
    magma_int_t llrwk = lrwork - indrwk;

    magma_timer_t time=0, time_total=0, time_alloc=0, time_dist=0, time_band=0;
    timer_start( time_total );

#ifdef HE2HB_SINGLEGPU
    magmaDoubleComplex *dT1;
    if (MAGMA_SUCCESS != magma_zmalloc( &dT1, n*nb)) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
    timer_start( time_band );
    magma_zhetrd_he2hb(uplo, n, nb, A, lda, &work[indtau1], &work[indwrk], llwork, dT1, info);
    timer_stop( time_band );
    timer_printf( "    1 GPU seq code time zhetrd_he2hb only = %7.4f\n", time_band );
    magma_free(dT1);
#else
    magma_int_t nstream = max(3,nrgpu+2);
    magma_queue_t streams[MagmaMaxGPUs][20];
    magmaDoubleComplex *da[MagmaMaxGPUs], *dT1[MagmaMaxGPUs];
    magma_int_t ldda = ((n+31)/32)*32;

    magma_int_t ver = 0;
    magma_int_t distblk = max(256, 4*nb);

    #ifdef ENABLE_DEBUG
    printf("voici ngpu %d distblk %d NB %d nstream %d version %d \n ", nrgpu, distblk, nb, nstream, ver);
    #endif

    timer_start( time_alloc );
    for( magma_int_t dev = 0; dev < nrgpu; ++dev ) {
        magma_int_t mlocal = ((n / distblk) / nrgpu + 1) * distblk;
        magma_setdevice( dev );
        // TODO check malloc
        magma_zmalloc(&da[dev], ldda*mlocal );
        magma_zmalloc(&dT1[dev], (n*nb) );
        for( int i = 0; i < nstream; ++i ) {
            magma_queue_create( &streams[dev][i] );
        }
    }
    timer_stop( time_alloc );
    
    timer_start( time_dist );
    magma_zsetmatrix_1D_col_bcyclic( n, n, A, lda, da, ldda, nrgpu, distblk );
    magma_setdevice(0);
    timer_stop( time_dist );

    timer_start( time_band );
    if (ver == 30) {
        magma_zhetrd_he2hb_mgpu_spec(uplo, n, nb, A, lda, &work[indtau1], &work[indwrk], llwork, da, ldda, dT1, nb, nrgpu, distblk, streams, nstream, info);
    } else {
        magma_zhetrd_he2hb_mgpu(uplo, n, nb, A, lda, &work[indtau1], &work[indwrk], llwork, da, ldda, dT1, nb, nrgpu, distblk, streams, nstream, info);
    }
    timer_stop( time_band );
    timer_printf("    time alloc %7.4f, ditribution %7.4f, zhetrd_he2hb only = %7.4f\n", time_alloc, time_dist, time_band );

    for( magma_int_t dev = 0; dev < nrgpu; ++dev ) {
        magma_setdevice( dev );
        magma_free( da[dev] );
        magma_free( dT1[dev] );
        for( int i = 0; i < nstream; ++i ) {
            magma_queue_destroy( streams[dev][i] );
        }
    }
#endif // not HE2HB_SINGLEGPU

    timer_stop( time_total );
    timer_printf( "    time zhetrd_he2hb_mgpu = %6.2f\n", time_total );
    timer_start( time_total );
    timer_start( time );

    /* copy the input matrix into WORK(INDWRK) with band storage */
    /* PAY ATTENTION THAT work[indwrk] should be able to be of size lda2*n which it should be checked in any future modification of lwork.*/
    magma_int_t lda2 = 2*nb; //nb+1+(nb-1);
    magmaDoubleComplex* A2 = &work[indwrk];
    memset(A2, 0, n*lda2*sizeof(magmaDoubleComplex));

    for (magma_int_t j = 0; j < n-nb; j++) {
        len = nb+1;
        blasf77_zcopy( &len, A(j,j), &ione, A2(0,j), &ione );
        memset(A(j,j), 0, (nb+1)*sizeof(magmaDoubleComplex));
        *A(nb+j,j) = c_one;
    }
    for (magma_int_t j = 0; j < nb; j++) {
        len = nb-j;
        blasf77_zcopy( &len, A(j+n-nb,j+n-nb), &ione, A2(0,j+n-nb), &ione );
        memset(A(j+n-nb,j+n-nb), 0, (nb-j)*sizeof(magmaDoubleComplex));
    }

    timer_stop( time );
    timer_printf( "    time zhetrd_convert = %6.2f\n", time );
    timer_start( time );

    magma_zhetrd_hb2st(uplo, n, nb, Vblksiz, A2, lda2, w, &rwork[inde], &work[indV2], ldv, &work[indTAU2], wantz, &work[indT2], ldt);

    timer_stop( time );
    timer_stop( time_total );
    timer_printf( "    time zhetrd_hb2st = %6.2f\n", time );
    timer_printf( "  time zhetrd = %6.2f\n", time_total );

    /* For eigenvalues only, call DSTERF.  For eigenvectors, first call
       ZSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the
       tridiagonal matrix, then call ZUNMTR to multiply it to the Householder
       transformations represented as Householder vectors in A. */
    if (! wantz) {
        timer_start( time );

        lapackf77_dsterf(&n, w, &rwork[inde], info);
        magma_dmove_eig(range, n, w, &il, &iu, vl, vu, m);

        timer_stop( time );
        timer_printf( "  time dstedc = %6.2f\n", time );
    }
    else {
        timer_start( time_total );
        timer_start( time );

        magma_zstedx_m(nrgpu, range, n, vl, vu, il, iu, w, &rwork[inde],
                       &work[indwrk], n, &rwork[indrwk],
                       llrwk, iwork, liwork, info);

        timer_stop( time );
        timer_printf( "  time zstedx_m = %6.2f\n", time );
        timer_start( time );

        magma_dmove_eig(range, n, w, &il, &iu, vl, vu, m);
/*
        magmaDoubleComplex *dZ;
        magma_int_t lddz = n;

        if (MAGMA_SUCCESS != magma_zmalloc( &dZ, *m*lddz)) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }

        magma_zbulge_back(uplo, n, nb, *m, Vblksiz, &work[indwrk + n * (il-1)], n, dZ, lddz,
                          &work[indV2], ldv, &work[indTAU2], &work[indT2], ldt, info);

        magma_zgetmatrix( n, *m, dZ, lddz, &work[indwrk], n);

        magma_free(dZ);

*/

        magma_zbulge_back_m(nrgpu, uplo, n, nb, *m, Vblksiz, &work[indwrk + n * (il-1)], n,
                            &work[indV2], ldv, &work[indTAU2], &work[indT2], ldt, info);

        timer_stop( time );
        timer_printf( "    time zbulge_back_m = %6.2f\n", time );
        timer_start( time );

        magma_zunmqr_m(nrgpu, MagmaLeft, MagmaNoTrans, n-nb, *m, n-nb, A+nb, lda, &work[indtau1],
                       &work[indwrk + n * (il-1) + nb], n, &work[indwk2], llwrk2, info);

        lapackf77_zlacpy("A", &n, m, &work[indwrk  + n * (il-1)], &n, A, &lda);

        timer_stop( time );
        timer_stop( time_total );
        timer_printf( "    time zunmqr_m + copy = %6.2f\n", time );
        timer_printf( "  time eigenvectors backtransf. = %6.2f\n", time_total );
    }

    /* If matrix was scaled, then rescale eigenvalues appropriately. */
    if (iscale == 1) {
        if (*info == 0) {
            imax = n;
        } else {
            imax = *info - 1;
        }
        d__1 = 1. / sigma;
        blasf77_dscal(&imax, &d__1, w, &ione);
    }

    work[0]  = MAGMA_Z_MAKE( lwmin * one_eps, 0.);  // round up
    rwork[0] = lrwmin * one_eps;
    iwork[0] = liwmin;

    magma_setdevice( orig_dev );
    
    return *info;
} /* magma_zheevdx_2stage_m */
Пример #15
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing zlaset
   Code is very similar to testing_zlacpy.cpp
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    real_Double_t    gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time;
    double           error, work[1];
    magmaDoubleComplex  c_neg_one = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex *h_A, *h_R;
    magmaDoubleComplex *d_A;
    magmaDoubleComplex offdiag = MAGMA_Z_MAKE( 1.2000, 6.7000 );
    magmaDoubleComplex diag    = MAGMA_Z_MAKE( 3.1415, 2.7183 );
    magma_int_t M, N, size, lda, ldb, ldda;
    magma_int_t ione     = 1;
    magma_int_t status = 0;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );

    magma_uplo_t uplo[] = { MagmaLower, MagmaUpper, MagmaFull };
    
    printf("uplo       M     N   CPU GByte/s (ms)    GPU GByte/s (ms)    check\n");
    printf("==================================================================\n");
    for( int iuplo = 0; iuplo < 3; ++iuplo ) {
      for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[itest];
            N = opts.nsize[itest];
            //M += 2;  // space for insets
            //N += 2;
            lda    = M;
            ldb    = lda;
            ldda   = ((M+31)/32)*32;
            size   = lda*N;
            if ( uplo[iuplo] == MagmaLower || uplo[iuplo] == MagmaUpper ) {
                // save triangle (with diagonal)
                // TODO wrong for trapezoid
                gbytes = sizeof(magmaDoubleComplex) * 0.5*N*(N+1) / 1e9;
            }
            else {
                // save entire matrix
                gbytes = sizeof(magmaDoubleComplex) * 1.*M*N / 1e9;
            }
    
            TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, size   );
            TESTING_MALLOC_CPU( h_R, magmaDoubleComplex, size   );
            
            TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N );
            
            /* Initialize the matrix */
            for( int j = 0; j < N; ++j ) {
                for( int i = 0; i < M; ++i ) {
                    h_A[i + j*lda] = MAGMA_Z_MAKE( i + j/10000., j );
                }
            }
            
            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            magma_zsetmatrix( M, N, h_A, lda, d_A, ldda );
            
            gpu_time = magma_sync_wtime( 0 );
            //magmablas_zlaset( uplo[iuplo], M-2, N-2, offdiag, diag, d_A+1+ldda, ldda );  // inset by 1 row & col
            magmablas_zlaset( uplo[iuplo], M, N, offdiag, diag, d_A, ldda );
            gpu_time = magma_sync_wtime( 0 ) - gpu_time;
            gpu_perf = gbytes / gpu_time;
            
            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            cpu_time = magma_wtime();
            //magma_int_t M2 = M-2;  // inset by 1 row & col
            //magma_int_t N2 = N-2;
            //lapackf77_zlaset( lapack_uplo_const( uplo[iuplo] ), &M2, &N2, &offdiag, &diag, h_A+1+lda, &lda );
            lapackf77_zlaset( lapack_uplo_const( uplo[iuplo] ), &M, &N, &offdiag, &diag, h_A, &lda );
            cpu_time = magma_wtime() - cpu_time;
            cpu_perf = gbytes / cpu_time;
            
            /* =====================================================================
               Check the result
               =================================================================== */
            magma_zgetmatrix( M, N, d_A, ldda, h_R, lda );
            
            blasf77_zaxpy(&size, &c_neg_one, h_A, &ione, h_R, &ione);
            error = lapackf77_zlange("f", &M, &N, h_R, &lda, work);

            printf("%4c   %5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)   %s\n",
                   lapacke_uplo_const( uplo[iuplo] ), (int) M, (int) N,
                   cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000.,
                   (error == 0. ? "ok" : "failed") );
            status += ! (error == 0.);
            
            TESTING_FREE_CPU( h_A );
            TESTING_FREE_CPU( h_R );
            
            TESTING_FREE_DEV( d_A );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
      }
      printf( "\n" );
    }

    TESTING_FINALIZE();
    return status;
}
Пример #16
0
int main( int argc, char** argv) 
{
    real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time;
    magmaDoubleComplex *hA, *hR;
    magmaDoubleComplex_ptr dA;
    magma_int_t N = 0, n2, lda, ldda;
    magma_int_t size[10] =
        { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 8160, 8192 };
    
    magma_int_t i, info;
    magmaDoubleComplex mz_one = MAGMA_Z_NEG_ONE;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    double      work[1], matnorm, diffnorm;
    
    if (argc != 1){
        for(i = 1; i<argc; i++){        
            if (strcmp("-N", argv[i])==0)
                N = atoi(argv[++i]);
        }
        if (N>0) size[0] = size[9] = N;
        else exit(1);
    }
    else {
        printf("\nUsage: \n");
        printf("  testing_zpotrf2_gpu -N %d\n\n", 1024);
    }

    /* Initialize */
    magma_queue_t  queue1, queue2;
    magma_device_t device;
    magma_int_t num = 0;
    magma_int_t err;
    magma_init();
    err = magma_getdevices( &device, 2, &num );
    if ( err != 0 or num < 1 ) {
        fprintf( stderr, "magma_getdevices failed: %d\n", (int) err );
        exit(-1);
    }
    err = magma_queue_create( device, &queue1 );
    if ( err != 0 ) {
        fprintf( stderr, "magma_queue_create failed: %d\n", (int) err );
        exit(-1);
    }
    err = magma_queue_create( device, &queue2 );
    if ( err != 0 ) {
        fprintf( stderr, "magma_queue_create failed: %d\n", (int) err );
        exit(-1);
    }

    magma_queue_t queues[2] = {queue1, queue2};

    /* Allocate memory for the largest matrix */
    N    = size[9];
    n2   = N * N;
    ldda = ((N+31)/32) * 32;
    TESTING_MALLOC_CPU( hA, magmaDoubleComplex, n2 );
    TESTING_MALLOC_PIN( hR, magmaDoubleComplex, n2 );
    TESTING_MALLOC_DEV( dA, magmaDoubleComplex, ldda*N );
    
    printf("\n\n");
    printf("  N    CPU GFlop/s (sec)    GPU GFlop/s (sec)    ||R_magma-R_lapack||_F / ||R_lapack||_F\n");
    printf("========================================================================================\n");
    for(i=0; i<10; i++){
        N   = size[i];
        lda = N; 
        n2  = lda*N;
        ldda = ((N+31)/32)*32;
        gflops = FLOPS( (double)N ) * 1e-9;
        
        /* Initialize the matrix */
        lapackf77_zlarnv( &ione, ISEED, &n2, hA );
        /* Symmetrize and increase the diagonal */
        for( int i = 0; i < N; ++i ) {
            hA(i,i) = MAGMA_Z_MAKE( MAGMA_Z_REAL(hA(i,i)) + N, 0 );
            for( int j = 0; j < i; ++j ) {
          hA(i, j) = MAGMA_Z_CNJG( hA(j,i) );
            }
        }
        lapackf77_zlacpy( MagmaFullStr, &N, &N, hA, &lda, hR, &lda );

        /* Warm up to measure the performance */
        magma_zsetmatrix( N, N, hA, lda, dA, 0, ldda, queue1);
        clFinish(queue1);
        magma_zpotrf2_gpu( MagmaLower, N, dA, 0, ldda, queues, &info );
        /* ====================================================================
           Performs operation using MAGMA 
           =================================================================== */
        magma_zsetmatrix( N, N, hA, lda, dA, 0, ldda, queue1 );
        clFinish(queue1);
        gpu_time = magma_wtime();
        magma_zpotrf2_gpu( MagmaLower, N, dA, 0, ldda, queues, &info );
        gpu_time = magma_wtime() - gpu_time;
        if (info != 0)
            printf( "magma_zpotrf2 had error %d.\n", info );

        gpu_perf = gflops / gpu_time;
        
        /* =====================================================================
           Performs operation using LAPACK 
           =================================================================== */
        cpu_time = magma_wtime();
        lapackf77_zpotrf( MagmaLowerStr, &N, hA, &lda, &info );
        cpu_time = magma_wtime() - cpu_time;
        if (info != 0)
            printf( "lapackf77_zpotrf had error %d.\n", info );
        
        cpu_perf = gflops / cpu_time;
        
        /* =====================================================================
           Check the result compared to LAPACK
           |R_magma - R_lapack| / |R_lapack|
           =================================================================== */
        magma_zgetmatrix( N, N, dA, 0, ldda, hR, lda, queue1 );
        matnorm = lapackf77_zlange("f", &N, &N, hA, &lda, work);
        blasf77_zaxpy(&n2, &mz_one, hA, &ione, hR, &ione);
        diffnorm = lapackf77_zlange("f", &N, &N, hR, &lda, work);
        printf( "%5d     %6.2f (%6.2f)     %6.2f (%6.2f)         %e\n", 
                N, cpu_perf, cpu_time, gpu_perf, gpu_time, diffnorm / matnorm );
        
        if (argc != 1)
            break;
    }

    /* clean up */
    TESTING_FREE_CPU( hA );
    TESTING_FREE_PIN( hR );
    TESTING_FREE_DEV( dA );
    magma_queue_destroy( queue1 );
    magma_queue_destroy( queue2 );
    magma_finalize();
}
Пример #17
0
/* ////////////////////////////////////////////////////////////////////////////
   -- testing sparse matrix vector product
*/
int main(  int argc, char** argv )
{
    TESTING_INIT();
    magma_queue_t queue;
    magma_queue_create( /*devices[ opts->device ],*/ &queue );

    magma_z_sparse_matrix hA, hA_SELLP, hA_ELL, dA, dA_SELLP, dA_ELL;
    hA_SELLP.blocksize = 8;
    hA_SELLP.alignment = 8;
    real_Double_t start, end, res;
    magma_int_t *pntre;

    magmaDoubleComplex c_one  = MAGMA_Z_MAKE(1.0, 0.0);
    magmaDoubleComplex c_zero = MAGMA_Z_MAKE(0.0, 0.0);
    
    magma_int_t i, j;
    for( i = 1; i < argc; ++i ) {
        if ( strcmp("--blocksize", argv[i]) == 0 ) {
            hA_SELLP.blocksize = atoi( argv[++i] );
        } else if ( strcmp("--alignment", argv[i]) == 0 ) {
            hA_SELLP.alignment = atoi( argv[++i] );
        } else
            break;
    }
    printf( "\n#    usage: ./run_zspmv"
        " [ --blocksize %d --alignment %d (for SELLP) ]"
        " matrices \n\n", (int) hA_SELLP.blocksize, (int) hA_SELLP.alignment );

    while(  i < argc ) {

        if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) {   // Laplace test
            i++;
            magma_int_t laplace_size = atoi( argv[i] );
            magma_zm_5stencil(  laplace_size, &hA, queue );
        } else {                        // file-matrix test
            magma_z_csr_mtx( &hA,  argv[i], queue );
        }

        printf( "\n# matrix info: %d-by-%d with %d nonzeros\n\n",
                            (int) hA.num_rows,(int) hA.num_cols,(int) hA.nnz );

        real_Double_t FLOPS = 2.0*hA.nnz/1e9;

        magma_z_vector hx, hy, dx, dy, hrefvec, hcheck;

        // init CPU vectors
        magma_z_vinit( &hx, Magma_CPU, hA.num_rows, c_zero, queue );
        magma_z_vinit( &hy, Magma_CPU, hA.num_rows, c_zero, queue );

        // init DEV vectors
        magma_z_vinit( &dx, Magma_DEV, hA.num_rows, c_one, queue );
        magma_z_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue );

        #ifdef MAGMA_WITH_MKL
            // calling MKL with CSR
            pntre = (magma_int_t*)malloc( (hA.num_rows+1)*sizeof(magma_int_t) );
            pntre[0] = 0;
            for (j=0; j<hA.num_rows; j++ ) {
                pntre[j] = hA.row[j+1];
            }
             MKL_INT num_rows = hA.num_rows;
             MKL_INT num_cols = hA.num_cols;
             MKL_INT nnz = hA.nnz;

            MKL_INT *col;
            TESTING_MALLOC_CPU( col, MKL_INT, nnz );
            for( magma_int_t t=0; t < hA.nnz; ++t ) {
                col[ t ] = hA.col[ t ];
            }
            MKL_INT *row;
            TESTING_MALLOC_CPU( row, MKL_INT, num_rows );
            for( magma_int_t t=0; t < hA.num_rows; ++t ) {
                row[ t ] = hA.col[ t ];
            }
    
            start = magma_wtime();
            for (j=0; j<10; j++ ) {
                mkl_zcsrmv( "N", &num_rows, &num_cols, 
                            MKL_ADDR(&c_one), "GFNC", MKL_ADDR(hA.val), 
                            col, row, pntre, 
                                                    MKL_ADDR(hx.val), 
                            MKL_ADDR(&c_zero),        MKL_ADDR(hy.val) );
            }
            end = magma_wtime();
            printf( "\n > MKL  : %.2e seconds %.2e GFLOP/s    (CSR).\n",
                                            (end-start)/10, FLOPS*10/(end-start) );

            TESTING_FREE_CPU( row );
            TESTING_FREE_CPU( col );
            free(pntre);
        #endif // MAGMA_WITH_MKL

        // copy matrix to GPU
        magma_z_mtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue );        
        // SpMV on GPU (CSR) -- this is the reference!
        start = magma_sync_wtime( queue );
        for (j=0; j<10; j++)
            magma_z_spmv( c_one, dA, dx, c_zero, dy, queue );
        end = magma_sync_wtime( queue );
        printf( " > MAGMA: %.2e seconds %.2e GFLOP/s    (standard CSR).\n",
                                        (end-start)/10, FLOPS*10/(end-start) );
        magma_z_mfree(&dA, queue );
        magma_z_vtransfer( dy, &hrefvec , Magma_DEV, Magma_CPU, queue );

        // convert to ELL and copy to GPU
        magma_z_mconvert(  hA, &hA_ELL, Magma_CSR, Magma_ELL, queue );
        magma_z_mtransfer( hA_ELL, &dA_ELL, Magma_CPU, Magma_DEV, queue );
        magma_z_mfree(&hA_ELL, queue );
        magma_z_vfree( &dy, queue );
        magma_z_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue );
        // SpMV on GPU (ELL)
        start = magma_sync_wtime( queue );
        for (j=0; j<10; j++)
            magma_z_spmv( c_one, dA_ELL, dx, c_zero, dy, queue );
        end = magma_sync_wtime( queue );
        printf( " > MAGMA: %.2e seconds %.2e GFLOP/s    (standard ELL).\n",
                                        (end-start)/10, FLOPS*10/(end-start) );
        magma_z_mfree(&dA_ELL, queue );
        magma_z_vtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue );
        res = 0.0;
        for(magma_int_t k=0; k<hA.num_rows; k++ )
            res=res + MAGMA_Z_REAL(hcheck.val[k]) - MAGMA_Z_REAL(hrefvec.val[k]);
        if ( res < .000001 )
            printf("# tester spmv ELL:  ok\n");
        else
            printf("# tester spmv ELL:  failed\n");
        magma_z_vfree( &hcheck, queue );

        // convert to SELLP and copy to GPU
        magma_z_mconvert(  hA, &hA_SELLP, Magma_CSR, Magma_SELLP, queue );
        magma_z_mtransfer( hA_SELLP, &dA_SELLP, Magma_CPU, Magma_DEV, queue );
        magma_z_mfree(&hA_SELLP, queue );
        magma_z_vfree( &dy, queue );
        magma_z_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue );
        // SpMV on GPU (SELLP)
        start = magma_sync_wtime( queue );
        for (j=0; j<10; j++)
            magma_z_spmv( c_one, dA_SELLP, dx, c_zero, dy, queue );
        end = magma_sync_wtime( queue );
        printf( " > MAGMA: %.2e seconds %.2e GFLOP/s    (SELLP).\n",
                                        (end-start)/10, FLOPS*10/(end-start) );

        magma_z_vtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue );
        res = 0.0;
        for(magma_int_t k=0; k<hA.num_rows; k++ )
            res=res + MAGMA_Z_REAL(hcheck.val[k]) - MAGMA_Z_REAL(hrefvec.val[k]);
        printf("# |x-y|_F = %8.2e\n", res);
        if ( res < .000001 )
            printf("# tester spmv SELL-P:  ok\n");
        else
            printf("# tester spmv SELL-P:  failed\n");
        magma_z_vfree( &hcheck, queue );

        magma_z_mfree(&dA_SELLP, queue );


        // SpMV on GPU (CUSPARSE - CSR)
        // CUSPARSE context //

        cusparseHandle_t cusparseHandle = 0;
        cusparseStatus_t cusparseStatus;
        cusparseStatus = cusparseCreate(&cusparseHandle);
        cusparseSetStream( cusparseHandle, queue );

        cusparseMatDescr_t descr = 0;
        cusparseStatus = cusparseCreateMatDescr(&descr);

        cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
        cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO);
        magmaDoubleComplex alpha = c_one;
        magmaDoubleComplex beta = c_zero;
        magma_z_vfree( &dy, queue );
        magma_z_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue );

        // copy matrix to GPU
        magma_z_mtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue );

        start = magma_sync_wtime( queue );
        for (j=0; j<10; j++)
            cusparseStatus =
            cusparseZcsrmv(cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, 
                        hA.num_rows, hA.num_cols, hA.nnz, &alpha, descr, 
                        dA.dval, dA.drow, dA.dcol, dx.dval, &beta, dy.dval);
        end = magma_sync_wtime( queue );
        if (cusparseStatus != 0)    printf("error in cuSPARSE CSR\n");
        printf( " > CUSPARSE: %.2e seconds %.2e GFLOP/s    (CSR).\n",
                                        (end-start)/10, FLOPS*10/(end-start) );
        cusparseMatDescr_t descrA;
        cusparseStatus = cusparseCreateMatDescr(&descrA);
         if (cusparseStatus != 0)    printf("error\n");
        cusparseHybMat_t hybA;
        cusparseStatus = cusparseCreateHybMat( &hybA );
         if (cusparseStatus != 0)    printf("error\n");

        magma_z_vtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue );
        res = 0.0;
        for(magma_int_t k=0; k<hA.num_rows; k++ )
            res=res + MAGMA_Z_REAL(hcheck.val[k]) - MAGMA_Z_REAL(hrefvec.val[k]);
        printf("# |x-y|_F = %8.2e\n", res);
        if ( res < .000001 )
            printf("# tester spmv cuSPARSE CSR:  ok\n");
        else
            printf("# tester spmv cuSPARSE CSR:  failed\n");
        magma_z_vfree( &hcheck, queue );
        magma_z_vfree( &dy, queue );
        magma_z_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue );
       
        cusparseZcsr2hyb(cusparseHandle,  hA.num_rows, hA.num_cols,
                        descrA, dA.dval, dA.drow, dA.dcol,
                        hybA, 0, CUSPARSE_HYB_PARTITION_AUTO);

        start = magma_sync_wtime( queue );
        for (j=0; j<10; j++)
            cusparseStatus =
            cusparseZhybmv( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
               &alpha, descrA, hybA,
               dx.dval, &beta, dy.dval);
        end = magma_sync_wtime( queue );
        if (cusparseStatus != 0)    printf("error in cuSPARSE HYB\n");
        printf( " > CUSPARSE: %.2e seconds %.2e GFLOP/s    (HYB).\n",
                                        (end-start)/10, FLOPS*10/(end-start) );

        magma_z_vtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue );
        res = 0.0;
        for(magma_int_t k=0; k<hA.num_rows; k++ )
            res=res + MAGMA_Z_REAL(hcheck.val[k]) - MAGMA_Z_REAL(hrefvec.val[k]);
        printf("# |x-y|_F = %8.2e\n", res);
        if ( res < .000001 )
            printf("# tester spmv cuSPARSE HYB:  ok\n");
        else
            printf("# tester spmv cuSPARSE HYB:  failed\n");
        magma_z_vfree( &hcheck, queue );

        cusparseDestroyMatDescr( descrA );
        cusparseDestroyHybMat( hybA );
        cusparseDestroy( cusparseHandle );

        magma_z_mfree(&dA, queue );



        printf("\n\n");


        // free CPU memory
        magma_z_mfree(&hA, queue );
        magma_z_vfree(&hx, queue );
        magma_z_vfree(&hy, queue );
        magma_z_vfree(&hrefvec, queue );
        // free GPU memory
        magma_z_vfree(&dx, queue );
        magma_z_vfree(&dy, queue );

        i++;

    }
    
    magma_queue_destroy( queue );
    TESTING_FINALIZE();
    return 0;
}
Пример #18
0
int main( int argc, char** argv )
{
    TESTING_INIT();
    
    real_Double_t   gflops, t1, t2;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magma_int_t ione = 1;
    const char trans[] = { 'N', 'C', 'T' };
    const char uplo[]  = { 'L', 'U' };
    const char diag[]  = { 'U', 'N' };
    const char side[]  = { 'L', 'R' };
    
    magmaDoubleComplex  *A,  *B,  *C,   *C2, *LU;
    magmaDoubleComplex *dA, *dB, *dC1, *dC2;
    magmaDoubleComplex alpha = MAGMA_Z_MAKE( 0.5, 0.1 );
    magmaDoubleComplex beta  = MAGMA_Z_MAKE( 0.7, 0.2 );
    double dalpha = 0.6;
    double dbeta  = 0.8;
    double work[1], error, total_error;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t m, n, k, size, maxn, ld, info;
    magma_int_t *piv;
    magma_err_t err;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" );
    
    total_error = 0.;
    for( int i = 0; i < opts.ntest; ++i ) {
        m = opts.msize[i];
        n = opts.nsize[i];
        k = opts.ksize[i];
        printf("=========================================================================\n");
        printf( "m=%d, n=%d, k=%d\n", (int) m, (int) n, (int) k );
        
        // allocate matrices
        // over-allocate so they can be any combination of {m,n,k} x {m,n,k}.
        maxn = max( max( m, n ), k );
        ld = maxn;
        size = maxn*maxn;
        err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) );  assert( err == 0 );
        err = magma_zmalloc_pinned( &A,  size );  assert( err == 0 );
        err = magma_zmalloc_pinned( &B,  size );  assert( err == 0 );
        err = magma_zmalloc_pinned( &C,  size );  assert( err == 0 );
        err = magma_zmalloc_pinned( &C2, size );  assert( err == 0 );
        err = magma_zmalloc_pinned( &LU, size );  assert( err == 0 );
        err = magma_zmalloc( &dA,  size );        assert( err == 0 );
        err = magma_zmalloc( &dB,  size );        assert( err == 0 );
        err = magma_zmalloc( &dC1, size );        assert( err == 0 );
        err = magma_zmalloc( &dC2, size );        assert( err == 0 );
        
        // initialize matrices
        size = maxn*maxn;
        lapackf77_zlarnv( &ione, ISEED, &size, A  );
        lapackf77_zlarnv( &ione, ISEED, &size, B  );
        lapackf77_zlarnv( &ione, ISEED, &size, C  );
        
        printf( "========== Level 1 BLAS ==========\n" );
        
        // ----- test ZSWAP
        // swap columns 2 and 3 of dA, then copy to C2 and compare with A
        if ( n >= 3 ) {
            magma_zsetmatrix( m, n, A, ld, dA, ld );
            magma_zsetmatrix( m, n, A, ld, dB, ld );
            magma_zswap( m, dA(0,1), 1, dA(0,2), 1 );
            magma_zswap( m, dB(0,1), 1, dB(0,2), 1 );
            
            // check results, storing diff between magma and cuda calls in C2
            cublasZaxpy( ld*n, c_neg_one, dA, 1, dB, 1 );
            magma_zgetmatrix( m, n, dB, ld, C2, ld );
            error = lapackf77_zlange( "F", &m, &k, C2, &ld, work );
            total_error += error;
            printf( "zswap             diff %.2g\n", error );
        }
        else {
            printf( "zswap skipped for n < 3\n" );
        }
        
        // ----- test IZAMAX
        // get argmax of column of A
        magma_zsetmatrix( m, k, A, ld, dA, ld );
        error = 0;
        for( int j = 0; j < k; ++j ) {
            magma_int_t i1 = magma_izamax( m, dA(0,j), 1 );
            magma_int_t i2 = cublasIzamax( m, dA(0,j), 1 );
            assert( i1 == i2 );
            error += abs( i1 - i2 );
        }
        total_error += error;
        gflops = (double)m * k / 1e9;
        printf( "izamax            diff %.2g\n", error );
        printf( "\n" );
        
        printf( "========== Level 2 BLAS ==========\n" );
        
        // ----- test ZGEMV
        // c = alpha*A*b + beta*c,  with A m*n; b,c m or n-vectors
        // try no-trans/trans
        for( int ia = 0; ia < 3; ++ia ) {
            magma_zsetmatrix( m, n, A,  ld, dA,  ld );
            magma_zsetvector( maxn, B, 1, dB,  1 );
            magma_zsetvector( maxn, C, 1, dC1, 1 );
            magma_zsetvector( maxn, C, 1, dC2, 1 );
            t1 = magma_sync_wtime( 0 );
            magma_zgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasZgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC2, 1 );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            size = (trans[ia] == 'N' ? m : n);
            cublasZaxpy( size, c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetvector( size, dC2, 1, C2, 1 );
            error = lapackf77_zlange( "F", &size, &ione, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZGEMV( m, n ) / 1e9;
            printf( "zgemv( %c )        diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    trans[ia], error, gflops/t1, gflops/t2 );
        }
        printf( "\n" );
        
        // ----- test ZHEMV
        // c = alpha*A*b + beta*c,  with A m*m symmetric; b,c m-vectors
        // try upper/lower
        for( int iu = 0; iu < 2; ++iu ) {
            magma_zsetmatrix( m, m, A, ld, dA, ld );
            magma_zsetvector( m, B, 1, dB,  1 );
            magma_zsetvector( m, C, 1, dC1, 1 );
            magma_zsetvector( m, C, 1, dC2, 1 );
            t1 = magma_sync_wtime( 0 );
            magma_zhemv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasZhemv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC2, 1 );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( m, c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetvector( m, dC2, 1, C2, 1 );
            error = lapackf77_zlange( "F", &m, &ione, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZHEMV( m ) / 1e9;
            printf( "zhemv( %c )        diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], error, gflops/t1, gflops/t2 );
        }
        printf( "\n" );
        
        // ----- test ZTRSV
        // solve A*c = c,  with A m*m triangular; c m-vector
        // try upper/lower, no-trans/trans, unit/non-unit diag
        // Factor A into LU to get well-conditioned triangles, else solve yields garbage.
        // Still can give garbage if solves aren't consistent with LU factors,
        // e.g., using unit diag for U, so copy lower triangle to upper triangle.
        // Also used for trsm later.
        lapackf77_zlacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld );
        lapackf77_zgetrf( &maxn, &maxn, LU, &ld, piv, &info );
        for( int j = 0; j < maxn; ++j ) {
            for( int i = 0; i < j; ++i ) {
                *LU(i,j) = *LU(j,i);
            }
        }
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
        for( int id = 0; id < 2; ++id ) {
            magma_zsetmatrix( m, m, LU, ld, dA, ld );
            magma_zsetvector( m, C, 1, dC1, 1 );
            magma_zsetvector( m, C, 1, dC2, 1 );
            t1 = magma_sync_wtime( 0 );
            magma_ztrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasZtrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC2, 1 );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( m, c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetvector( m, dC2, 1, C2, 1 );
            error = lapackf77_zlange( "F", &m, &ione, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZTRSM( MagmaLeft, m, 1 ) / 1e9;
            printf( "ztrsv( %c, %c, %c )  diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], diag[id], error, gflops/t1, gflops/t2 );
        }}}
        printf( "\n" );
        
        printf( "========== Level 3 BLAS ==========\n" );
        
        // ----- test ZGEMM
        // C = alpha*A*B + beta*C,  with A m*k or k*m; B k*n or n*k; C m*n
        // try combinations of no-trans/trans
        for( int ia = 0; ia < 3; ++ia ) {
        for( int ib = 0; ib < 3; ++ib ) {
            bool nta = (trans[ia] == 'N');
            bool ntb = (trans[ib] == 'N');
            magma_zsetmatrix( (nta ? m : k), (nta ? m : k), A, ld, dA,  ld );
            magma_zsetmatrix( (ntb ? k : n), (ntb ? n : k), B, ld, dB,  ld );
            magma_zsetmatrix( m, n, C, ld, dC1, ld );
            magma_zsetmatrix( m, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_zgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasZgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_zlange( "F", &m, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZGEMM( m, n, k ) / 1e9;
            printf( "zgemm( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    trans[ia], trans[ib], error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test ZHEMM
        // C = alpha*A*B + beta*C  (left)  with A m*m symmetric; B,C m*n; or
        // C = alpha*B*A + beta*C  (right) with A n*n symmetric; B,C m*n
        // try left/right, upper/lower
        for( int is = 0; is < 2; ++is ) {
        for( int iu = 0; iu < 2; ++iu ) {
            magma_zsetmatrix( m, m, A, ld, dA,  ld );
            magma_zsetmatrix( m, n, B, ld, dB,  ld );
            magma_zsetmatrix( m, n, C, ld, dC1, ld );
            magma_zsetmatrix( m, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_zhemm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasZhemm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_zlange( "F", &m, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZHEMM( side[is], m, n ) / 1e9;
            printf( "zhemm( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    side[is], uplo[iu], error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test ZHERK
        // C = alpha*A*A^H + beta*C  (no-trans) with A m*k and C m*m symmetric; or
        // C = alpha*A^H*A + beta*C  (trans)    with A k*m and C m*m symmetric
        // try upper/lower, no-trans/trans
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
            magma_zsetmatrix( n, k, A, ld, dA,  ld );
            magma_zsetmatrix( n, n, C, ld, dC1, ld );
            magma_zsetmatrix( n, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_zherk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasZherk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetmatrix( n, n, dC2, ld, C2, ld );
            error = lapackf77_zlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZHERK( k, n ) / 1e9;
            printf( "zherk( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test ZHER2K
        // C = alpha*A*B^H + ^alpha*B*A^H + beta*C  (no-trans) with A,B n*k; C n*n symmetric; or
        // C = alpha*A^H*B + ^alpha*B^H*A + beta*C  (trans)    with A,B k*n; C n*n symmetric
        // try upper/lower, no-trans/trans
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
            bool nt = (trans[it] == 'N');
            magma_zsetmatrix( (nt ? n : k), (nt ? n : k), A, ld, dA,  ld );
            magma_zsetmatrix( n, n, C, ld, dC1, ld );
            magma_zsetmatrix( n, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_zher2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasZher2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetmatrix( n, n, dC2, ld, C2, ld );
            error = lapackf77_zlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZHER2K( k, n ) / 1e9;
            printf( "zher2k( %c, %c )    diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test ZTRMM
        // C = alpha*A*C  (left)  with A m*m triangular; C m*n; or
        // C = alpha*C*A  (right) with A n*n triangular; C m*n
        // try left/right, upper/lower, no-trans/trans, unit/non-unit
        for( int is = 0; is < 2; ++is ) {
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
        for( int id = 0; id < 2; ++id ) {
            bool left = (side[is] == 'L');
            magma_zsetmatrix( (left ? m : n), (left ? m : n), A, ld, dA,  ld );
            magma_zsetmatrix( m, n, C, ld, dC1, ld );
            magma_zsetmatrix( m, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_ztrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasZtrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_zlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZTRMM( side[is], m, n ) / 1e9;
            printf( "ztrmm( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], error, gflops/t1, gflops/t2 );
        }}}}
        printf( "\n" );
        
        // ----- test ZTRSM
        // solve A*X = alpha*B  (left)  with A m*m triangular; B m*n; or
        // solve X*A = alpha*B  (right) with A n*n triangular; B m*n
        // try left/right, upper/lower, no-trans/trans, unit/non-unit
        for( int is = 0; is < 2; ++is ) {
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
        for( int id = 0; id < 2; ++id ) {
            bool left = (side[is] == 'L');
            magma_zsetmatrix( (left ? m : n), (left ? m : n), LU, ld, dA,  ld );
            magma_zsetmatrix( m, n, C, ld, dC1, ld );
            magma_zsetmatrix( m, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_ztrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasZtrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_zlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZTRSM( side[is], m, n ) / 1e9;
            printf( "ztrsm( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], error, gflops/t1, gflops/t2 );
        }}}}
        printf( "\n" );
        
        // cleanup
        magma_free_cpu( piv );
        magma_free_pinned( A  );
        magma_free_pinned( B  );
        magma_free_pinned( C  );
        magma_free_pinned( C2 );
        magma_free_pinned( LU );
        magma_free( dA  );
        magma_free( dB  );
        magma_free( dC1 );
        magma_free( dC2 );
    }
    
    if ( total_error != 0. ) {
        printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n",
                total_error );
    }
    else {
        printf( "all tests passed\n" );
    }
    
    TESTING_FINALIZE();
    return 0;
}
Пример #19
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing zgemm_batched
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    real_Double_t   gflops, magma_perf, magma_time, cpu_perf, cpu_time;
    double          magma_error, magma_err, Ynorm, work[1];
    magma_int_t M, N, Xm, Ym, lda, ldda;
    magma_int_t sizeA, sizeX, sizeY;
    magma_int_t incx = 1;
    magma_int_t incy = 1;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t status = 0;
    magma_int_t batchCount;

    magmaDoubleComplex *h_A, *h_X, *h_Y, *h_Ymagma;
    magmaDoubleComplex *d_A, *d_X, *d_Y;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex alpha = MAGMA_Z_MAKE(  0.29, -0.86 );
    magmaDoubleComplex beta  = MAGMA_Z_MAKE( -0.48,  0.38 );
    magmaDoubleComplex **A_array = NULL;
    magmaDoubleComplex **X_array = NULL;
    magmaDoubleComplex **Y_array = NULL;


    magma_opts opts;
    parse_opts( argc, argv, &opts );
    batchCount = opts.batchcount;
    opts.lapack |= opts.check;

    //double tol = opts.tolerance * lapackf77_dlamch("E");

    printf("trans = %s\n", lapack_trans_const(opts.transA) );

    printf("BatchCount    M     N     MAGMA Gflop/s (ms)  CPU Gflop/s (ms)  MAGMA error\n");

    printf("===================================================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[itest];
            N = opts.nsize[itest];
            lda    = ((M+31)/32)*32;
            gflops = FLOPS_ZGEMV( M, N ) / 1e9 * batchCount;

            if ( opts.transA == MagmaNoTrans ) {
                Xm = N;
                Ym = M;
            } else {
                Xm = M;
                Ym = N;
            }

            sizeA = lda*N*batchCount;
            sizeX = incx*Xm*batchCount;
            sizeY = incy*Ym*batchCount;

            ldda = ((lda+31)/32)*32;

            TESTING_MALLOC_CPU( h_A,  magmaDoubleComplex, sizeA );
            TESTING_MALLOC_CPU( h_X,  magmaDoubleComplex, sizeX );
            TESTING_MALLOC_CPU( h_Y,  magmaDoubleComplex, sizeY  );
            TESTING_MALLOC_CPU( h_Ymagma,  magmaDoubleComplex, sizeY  );


            TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N*batchCount );
            TESTING_MALLOC_DEV( d_X, magmaDoubleComplex, sizeX );
            TESTING_MALLOC_DEV( d_Y, magmaDoubleComplex, sizeY );

            magma_malloc((void**)&A_array, batchCount * sizeof(*A_array));
            magma_malloc((void**)&X_array, batchCount * sizeof(*X_array));
            magma_malloc((void**)&Y_array, batchCount * sizeof(*Y_array));

            /* Initialize the matrices */
            lapackf77_zlarnv( &ione, ISEED, &sizeA, h_A );
            lapackf77_zlarnv( &ione, ISEED, &sizeX, h_X );
            lapackf77_zlarnv( &ione, ISEED, &sizeY, h_Y );

            /* =====================================================================
               Performs operation using MAGMABLAS
               =================================================================== */
            magma_zsetmatrix( M, N*batchCount, h_A, lda, d_A, ldda );
            magma_zsetvector( Xm*batchCount, h_X, incx, d_X, incx );
            magma_zsetvector( Ym*batchCount, h_Y, incy, d_Y, incy );

            zset_pointer(A_array, d_A, ldda, 0, 0, ldda*N, batchCount, magma_stream);
            zset_pointer(X_array, d_X, 1, 0, 0, incx*Xm, batchCount, magma_stream);
            zset_pointer(Y_array, d_Y, 1, 0, 0, incy*Ym, batchCount, magma_stream);

            magma_time = magma_sync_wtime( NULL );
            magmablas_zgemv_batched(opts.transA, M, N,
                                    alpha, A_array, ldda,
                                    X_array, incx,
                                    beta,  Y_array, incy, batchCount, magma_stream);
            magma_time = magma_sync_wtime( NULL ) - magma_time;
            magma_perf = gflops / magma_time;
            magma_zgetvector( Ym*batchCount, d_Y, incy, h_Ymagma, incy );

            /* =====================================================================
               Performs operation using CPU BLAS
               =================================================================== */
            if ( opts.lapack ) {
                cpu_time = magma_wtime();
                for(int i=0; i<batchCount; i++)
                {
                    blasf77_zgemv(
                        lapack_trans_const(opts.transA),
                        &M, &N,
                        &alpha, h_A + i*lda*N, &lda,
                        h_X + i*Xm, &incx,
                        &beta,  h_Y + i*Ym, &incy );
                }
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
            }

            /* =====================================================================
               Check the result
               =================================================================== */
            if ( opts.lapack ) {
                // compute relative error for both magma  relative to lapack,
                // |C_magma - C_lapack| / |C_lapack|
                magma_error = 0.0;

                for(int s=0; s<batchCount; s++)
                {

                    Ynorm = lapackf77_zlange( "M", &M, &ione, h_Y + s*Ym, &incy, work );

                    blasf77_zaxpy( &Ym, &c_neg_one, h_Y + s*Ym, &ione, h_Ymagma + s*Ym, &ione );
                    magma_err = lapackf77_zlange( "M", &M, &ione, h_Ymagma + s*Ym, &incy, work ) / Ynorm;

                    if ( isnan(magma_err) || isinf(magma_err) ) {
                        magma_error = magma_err;
                        break;
                    }
                    magma_error = max(fabs(magma_err), magma_error);

                }

                printf("%10d %5d %5d  %7.2f (%7.2f)    %7.2f (%7.2f)   %8.2e  \n",
                       (int) batchCount, (int) M, (int) N,
                       magma_perf,  1000.*magma_time,
                       cpu_perf,    1000.*cpu_time,
                       magma_error);
            }
            else {

                printf("%10d %5d %5d  %7.2f (%7.2f)    ---   (  ---  )    ---\n",
                       (int) batchCount, (int) M, (int) N,
                       magma_perf,  1000.*magma_time);
            }

            TESTING_FREE_CPU( h_A  );
            TESTING_FREE_CPU( h_X  );
            TESTING_FREE_CPU( h_Y  );
            TESTING_FREE_CPU( h_Ymagma  );


            TESTING_FREE_DEV( d_A );
            TESTING_FREE_DEV( d_X );
            TESTING_FREE_DEV( d_Y );
            TESTING_FREE_DEV( A_array );
            TESTING_FREE_DEV( X_array );
            TESTING_FREE_DEV( Y_array );


            fflush( stdout);

        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    TESTING_FINALIZE();
    return status;
}
Пример #20
0
/**
    Purpose
    -------
    ZHEEVX computes selected eigenvalues and, optionally, eigenvectors
    of a complex Hermitian matrix A.  Eigenvalues and eigenvectors can
    be selected by specifying either a range of values or a range of
    indices for the desired eigenvalues.

    Arguments
    ---------
    @param[in]
    jobz    magma_vec_t
      -     = MagmaNoVec:  Compute eigenvalues only;
      -     = MagmaVec:    Compute eigenvalues and eigenvectors.

    @param[in]
    range   magma_range_t
      -     = MagmaRangeAll: all eigenvalues will be found.
      -     = MagmaRangeV:   all eigenvalues in the half-open interval (VL,VU]
                   will be found.
      -     = MagmaRangeI:   the IL-th through IU-th eigenvalues will be found.

    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of A is stored;
      -     = MagmaLower:  Lower triangle of A is stored.

    @param[in]
    n       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in,out]
    dA      COMPLEX_16 array, dimension (LDDA, N)
            On entry, the Hermitian matrix A.  If UPLO = MagmaUpper, the
            leading N-by-N upper triangular part of A contains the
            upper triangular part of the matrix A.  If UPLO = MagmaLower,
            the leading N-by-N lower triangular part of A contains
            the lower triangular part of the matrix A.
            On exit, the lower triangle (if UPLO=MagmaLower) or the upper
            triangle (if UPLO=MagmaUpper) of A, including the diagonal, is
            destroyed.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array DA.  LDDA >= max(1,N).

    @param[in]
    vl      DOUBLE PRECISION
    @param[in]
    vu      DOUBLE PRECISION
            If RANGE=MagmaRangeV, the lower and upper bounds of the interval to
            be searched for eigenvalues. VL < VU.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeI.

    @param[in]
    il      INTEGER
    @param[in]
    iu      INTEGER
            If RANGE=MagmaRangeI, the indices (in ascending order) of the
            smallest and largest eigenvalues to be returned.
            1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeV.

    @param[in]
    abstol  DOUBLE PRECISION
            The absolute error tolerance for the eigenvalues.
            An approximate eigenvalue is accepted as converged
            when it is determined to lie in an interval [a,b]
            of width less than or equal to

                    ABSTOL + EPS * max( |a|,|b| ),
    \n
            where EPS is the machine precision.  If ABSTOL is less than
            or equal to zero, then  EPS*|T|  will be used in its place,
            where |T| is the 1-norm of the tridiagonal matrix obtained
            by reducing A to tridiagonal form.
    \n
            Eigenvalues will be computed most accurately when ABSTOL is
            set to twice the underflow threshold 2*DLAMCH('S'), not zero.
            If this routine returns with INFO > 0, indicating that some
            eigenvectors did not converge, try setting ABSTOL to
            2*DLAMCH('S').
    \n
            See "Computing Small Singular Values of Bidiagonal Matrices
            with Guaranteed High Relative Accuracy," by Demmel and
            Kahan, LAPACK Working Note #3.

    @param[out]
    m       INTEGER
            The total number of eigenvalues found.  0 <= M <= N.
            If RANGE = MagmaRangeAll, M = N, and if RANGE = MagmaRangeI, M = IU-IL+1.

    @param[out]
    w       DOUBLE PRECISION array, dimension (N)
            On normal exit, the first M elements contain the selected
            eigenvalues in ascending order.

    @param[out]
    dZ      COMPLEX_16 array, dimension (LDDZ, max(1,M))
            If JOBZ = MagmaVec, then if INFO = 0, the first M columns of Z
            contain the orthonormal eigenvectors of the matrix A
            corresponding to the selected eigenvalues, with the i-th
            column of Z holding the eigenvector associated with W(i).
            If an eigenvector fails to converge, then that column of Z
            contains the latest approximation to the eigenvector, and the
            index of the eigenvector is returned in IFAIL.
            If JOBZ = MagmaNoVec, then Z is not referenced.
            Note: the user must ensure that at least max(1,M) columns are
            supplied in the array Z; if RANGE = MagmaRangeV, the exact value of M
            is not known in advance and an upper bound must be used.
*********   (workspace) If FAST_HEMV is defined DZ should be (LDDZ, max(1,N)) in both cases.

    @param[in]
    lddz    INTEGER
            The leading dimension of the array DZ.  LDDZ >= 1, and if
            JOBZ = MagmaVec, LDDZ >= max(1,N).

    @param
    wA      (workspace) COMPLEX_16 array, dimension (LDWA, N)

    @param[in]
    ldwa    INTEGER
            The leading dimension of the array wA.  LDWA >= max(1,N).

    @param
    wZ      (workspace) COMPLEX_16 array, dimension (LDWZ, max(1,M))

    @param[in]
    ldwz    INTEGER
            The leading dimension of the array wZ.  LDWZ >= 1, and if
            JOBZ = MagmaVec, LDWZ >= max(1,N).

    @param[out]
    work    (workspace) COMPLEX_16 array, dimension (LWORK)
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The length of the array WORK.  LWORK >= (NB+1)*N,
            where NB is the max of the blocksize for ZHETRD.
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    @param
    rwork   (workspace) DOUBLE PRECISION array, dimension (7*N)

    @param
    iwork   (workspace) INTEGER array, dimension (5*N)

    @param[out]
    ifail   INTEGER array, dimension (N)
            If JOBZ = MagmaVec, then if INFO = 0, the first M elements of
            IFAIL are zero.  If INFO > 0, then IFAIL contains the
            indices of the eigenvectors that failed to converge.
            If JOBZ = MagmaNoVec, then IFAIL is not referenced.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  if INFO = i, then i eigenvectors failed to converge.
                  Their indices are stored in array IFAIL.

    @ingroup magma_zheev_driver
    ********************************************************************/
extern "C" magma_int_t
magma_zheevx_gpu(magma_vec_t jobz, magma_range_t range, magma_uplo_t uplo, magma_int_t n,
                 magmaDoubleComplex *dA, magma_int_t ldda, double vl, double vu,
                 magma_int_t il, magma_int_t iu, double abstol, magma_int_t *m,
                 double *w, magmaDoubleComplex *dZ, magma_int_t lddz,
                 magmaDoubleComplex *wA, magma_int_t ldwa,
                 magmaDoubleComplex *wZ, magma_int_t ldwz,
                 magmaDoubleComplex *work, magma_int_t lwork,
                 double *rwork, magma_int_t *iwork, magma_int_t *ifail, magma_int_t *info)
{
    const char* uplo_  = lapack_uplo_const( uplo  );
    const char* jobz_  = lapack_vec_const( jobz  );
    const char* range_ = lapack_range_const( range );
    
    magma_int_t ione = 1;
    
    const char* order_;
    magma_int_t indd, inde;
    magma_int_t imax;
    magma_int_t lopt, itmp1, indee;
    magma_int_t lower, wantz;
    magma_int_t i, j, jj, i__1;
    magma_int_t alleig, valeig, indeig;
    magma_int_t iscale, indibl;
    magma_int_t indiwk, indisp, indtau;
    magma_int_t indrwk, indwrk;
    magma_int_t llwork, nsplit;
    magma_int_t lquery;
    magma_int_t iinfo;
    double safmin;
    double bignum;
    double smlnum;
    double eps, tmp1;
    double anrm;
    double sigma, d__1;
    double rmin, rmax;
    
    double *dwork;
    
    /* Function Body */
    lower  = (uplo  == MagmaLower);
    wantz  = (jobz  == MagmaVec);
    alleig = (range == MagmaRangeAll);
    valeig = (range == MagmaRangeV);
    indeig = (range == MagmaRangeI);
    lquery = (lwork == -1);
    
    *info = 0;
    if (! (wantz || (jobz == MagmaNoVec))) {
        *info = -1;
    } else if (! (alleig || valeig || indeig)) {
        *info = -2;
    } else if (! (lower || (uplo == MagmaUpper))) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (ldda < max(1,n)) {
        *info = -6;
    } else if (lddz < 1 || (wantz && lddz < n)) {
        *info = -15;
    } else if (ldwa < max(1,n)) {
        *info = -17;
    } else if (ldwz < 1 || (wantz && ldwz < n)) {
        *info = -19;
    } else {
        if (valeig) {
            if (n > 0 && vu <= vl) {
                *info = -8;
            }
        } else if (indeig) {
            if (il < 1 || il > max(1,n)) {
                *info = -9;
            } else if (iu < min(n,il) || iu > n) {
                *info = -10;
            }
        }
    }
    
    magma_int_t nb = magma_get_zhetrd_nb(n);
    
    lopt = n * (nb + 1);
    
    work[0] = MAGMA_Z_MAKE( lopt, 0 );
    
    if (lwork < lopt && ! lquery) {
        *info = -21;
    }
    
    if (*info != 0) {
        magma_xerbla( __func__, -(*info));
        return *info;
    } else if (lquery) {
        return *info;
    }
    
    *m = 0;
    /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */
    if (n <= 128) {
        #ifdef ENABLE_DEBUG
        printf("--------------------------------------------------------------\n");
        printf("  warning matrix too small N=%d NB=%d, calling lapack on CPU  \n", (int) n, (int) nb);
        printf("--------------------------------------------------------------\n");
        #endif
        magmaDoubleComplex *a;
        magma_zmalloc_cpu( &a, n*n );
        magma_zgetmatrix(n, n, dA, ldda, a, n);
        lapackf77_zheevx(jobz_, range_, uplo_,
                         &n, a, &n, &vl, &vu, &il, &iu, &abstol, m,
                         w, wZ, &ldwz, work, &lwork,
                         rwork, iwork, ifail, info);
        magma_zsetmatrix( n,  n,  a,    n, dA, ldda);
        magma_zsetmatrix( n, *m, wZ, ldwz, dZ, lddz);
        magma_free_cpu(a);
        return *info;
    }

    if (MAGMA_SUCCESS != magma_dmalloc( &dwork, n )) {
        fprintf (stderr, "!!!! device memory allocation error (magma_zheevx_gpu)\n");
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
    
    --w;
    --work;
    --rwork;
    --iwork;
    --ifail;
    
    /* Get machine constants. */
    safmin = lapackf77_dlamch("Safe minimum");
    eps = lapackf77_dlamch("Precision");
    smlnum = safmin / eps;
    bignum = 1. / smlnum;
    rmin = magma_dsqrt(smlnum);
    rmax = magma_dsqrt(bignum);
    
    /* Scale matrix to allowable range, if necessary. */
    anrm = magmablas_zlanhe(MagmaMaxNorm, uplo, n, dA, ldda, dwork);
    iscale = 0;
    sigma  = 1;
    if (anrm > 0. && anrm < rmin) {
        iscale = 1;
        sigma = rmin / anrm;
    } else if (anrm > rmax) {
        iscale = 1;
        sigma = rmax / anrm;
    }
    if (iscale == 1) {
        d__1 = 1.;
        magmablas_zlascl(uplo, 0, 0, 1., sigma, n, n, dA,
                         ldda, info);
        
        if (abstol > 0.) {
            abstol *= sigma;
        }
        if (valeig) {
            vl *= sigma;
            vu *= sigma;
        }
    }
    
    /* Call ZHETRD to reduce Hermitian matrix to tridiagonal form. */
    indd = 1;
    inde = indd + n;
    indrwk = inde + n;
    indtau = 1;
    indwrk = indtau + n;
    llwork = lwork - indwrk + 1;
    
#ifdef FAST_HEMV
    magma_zhetrd2_gpu(uplo, n, dA, ldda, &rwork[indd], &rwork[inde],
                      &work[indtau], wA, ldwa, &work[indwrk], llwork, dZ, lddz*n, &iinfo);
#else
    magma_zhetrd_gpu (uplo, n, dA, ldda, &rwork[indd], &rwork[inde],
                      &work[indtau], wA, ldwa, &work[indwrk], llwork, &iinfo);
#endif

    lopt = n + (magma_int_t)MAGMA_Z_REAL(work[indwrk]);
    
    /* If all eigenvalues are desired and ABSTOL is less than or equal to
       zero, then call DSTERF or ZUNGTR and ZSTEQR.  If this fails for
       some eigenvalue, then try DSTEBZ. */
    if ((alleig || (indeig && il == 1 && iu == n)) && abstol <= 0.) {
        blasf77_dcopy(&n, &rwork[indd], &ione, &w[1], &ione);
        indee = indrwk + 2*n;
        if (! wantz) {
            i__1 = n - 1;
            blasf77_dcopy(&i__1, &rwork[inde], &ione, &rwork[indee], &ione);
            lapackf77_dsterf(&n, &w[1], &rwork[indee], info);
        }
        else {
            lapackf77_zlacpy("A", &n, &n, wA, &ldwa, wZ, &ldwz);
            lapackf77_zungtr(uplo_, &n, wZ, &ldwz, &work[indtau], &work[indwrk], &llwork, &iinfo);
            i__1 = n - 1;
            blasf77_dcopy(&i__1, &rwork[inde], &ione, &rwork[indee], &ione);
            lapackf77_zsteqr(jobz_, &n, &w[1], &rwork[indee], wZ, &ldwz, &rwork[indrwk], info);
            if (*info == 0) {
                for (i = 1; i <= n; ++i) {
                    ifail[i] = 0;
                }
                magma_zsetmatrix( n, n, wZ, ldwz, dZ, lddz );
            }
        }
        if (*info == 0) {
            *m = n;
        }
    }
    
    /* Otherwise, call DSTEBZ and, if eigenvectors are desired, ZSTEIN. */
    if (*m == 0) {
        *info = 0;
        if (wantz) {
            order_ = "B";
        } else {
            order_ = "E";
        }
        indibl = 1;
        indisp = indibl + n;
        indiwk = indisp + n;

        lapackf77_dstebz(range_, order_, &n, &vl, &vu, &il, &iu, &abstol, &rwork[indd], &rwork[inde], m,
                         &nsplit, &w[1], &iwork[indibl], &iwork[indisp], &rwork[indrwk], &iwork[indiwk], info);
        
        if (wantz) {
            
            lapackf77_zstein(&n, &rwork[indd], &rwork[inde], m, &w[1], &iwork[indibl], &iwork[indisp],
                             wZ, &ldwz, &rwork[indrwk], &iwork[indiwk], &ifail[1], info);
            
            magma_zsetmatrix( n, *m, wZ, ldwz, dZ, lddz );
            
            /* Apply unitary matrix used in reduction to tridiagonal
               form to eigenvectors returned by ZSTEIN. */
            magma_zunmtr_gpu(MagmaLeft, uplo, MagmaNoTrans, n, *m, dA, ldda, &work[indtau],
                             dZ, lddz, wA, ldwa, &iinfo);
        }
    }
    /* If matrix was scaled, then rescale eigenvalues appropriately. */
    if (iscale == 1) {
        if (*info == 0) {
            imax = *m;
        } else {
            imax = *info - 1;
        }
        d__1 = 1. / sigma;
        blasf77_dscal(&imax, &d__1, &w[1], &ione);
    }
    
    /* If eigenvalues are not in order, then sort them, along with
       eigenvectors. */
    if (wantz) {
        for (j = 1; j <= *m-1; ++j) {
            i = 0;
            tmp1 = w[j];
            for (jj = j + 1; jj <= *m; ++jj) {
                if (w[jj] < tmp1) {
                    i = jj;
                    tmp1 = w[jj];
                }
            }
            
            if (i != 0) {
                itmp1 = iwork[indibl + i - 1];
                w[i] = w[j];
                iwork[indibl + i - 1] = iwork[indibl + j - 1];
                w[j] = tmp1;
                iwork[indibl + j - 1] = itmp1;
                magma_zswap(n, dZ + (i-1)*lddz, ione, dZ + (j-1)*lddz, ione);
                if (*info != 0) {
                    itmp1 = ifail[i];
                    ifail[i] = ifail[j];
                    ifail[j] = itmp1;
                }
            }
        }
    }
    
    /* Set WORK[0] to optimal complex workspace size. */
    work[1] = MAGMA_Z_MAKE( lopt, 0 );
    
    return *info;
    
} /* magma_zheevx_gpu */
Пример #21
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing ztrsm
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    real_Double_t   gflops, magma_perf, magma_time=0, cublas_perf, cublas_time, cpu_perf=0, cpu_time=0;
    double          magma_error, cublas_error, work[1];
    magma_int_t M, N, info;
    magma_int_t Ak;
    magma_int_t sizeA, sizeB;
    magma_int_t lda, ldb, ldda, lddb;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t *ipiv;

    magmaDoubleComplex *h_A, *h_B, *h_Bcublas, *h_Bmagma, *h_B1, *h_X1, *h_X2;
    magmaDoubleComplex *d_A, *d_B;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex c_one = MAGMA_Z_ONE;
    magmaDoubleComplex alpha = MAGMA_Z_MAKE(  0.29, -0.86 );
    magma_int_t status = 0;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    double tol = opts.tolerance * lapackf77_dlamch("E");

    printf("side = %s, uplo = %s, transA = %s, diag = %s \n",
           lapack_side_const(opts.side), lapack_uplo_const(opts.uplo),
           lapack_trans_const(opts.transA), lapack_diag_const(opts.diag) );
    printf("    M     N  MAGMA Gflop/s (ms)  CUBLAS Gflop/s (ms)   CPU Gflop/s (ms)  MAGMA error  CUBLAS error\n");
    printf("==================================================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[itest];
            N = opts.nsize[itest];
            gflops = FLOPS_ZTRSM(opts.side, M, N) / 1e9;

            if ( opts.side == MagmaLeft ) {
                lda = M;
                Ak = M;
            } else {
                lda = N;
                Ak = N;
            }
            
            ldb = M;
            
            ldda = ((lda+31)/32)*32;
            lddb = ((ldb+31)/32)*32;
            
            sizeA = lda*Ak;
            sizeB = ldb*N;
            
            TESTING_MALLOC_CPU( h_A,       magmaDoubleComplex, lda*Ak  );
            TESTING_MALLOC_CPU( h_B,       magmaDoubleComplex, ldb*N   );
            TESTING_MALLOC_CPU( h_B1,      magmaDoubleComplex, ldb*N   );
            TESTING_MALLOC_CPU( h_X1,      magmaDoubleComplex, ldb*N   );
            TESTING_MALLOC_CPU( h_X2,      magmaDoubleComplex, ldb*N   );
            TESTING_MALLOC_CPU( h_Bcublas, magmaDoubleComplex, ldb*N   );
            TESTING_MALLOC_CPU( h_Bmagma,  magmaDoubleComplex, ldb*N   );
            TESTING_MALLOC_CPU( ipiv,      magma_int_t,        Ak      );
            
            TESTING_MALLOC_DEV( d_A,       magmaDoubleComplex, ldda*Ak );
            TESTING_MALLOC_DEV( d_B,       magmaDoubleComplex, lddb*N  );
            
            /* Initialize the matrices */
            /* Factor A into LU to get well-conditioned triangular matrix.
             * Copy L to U, since L seems okay when used with non-unit diagonal
             * (i.e., from U), while U fails when used with unit diagonal. */
            lapackf77_zlarnv( &ione, ISEED, &sizeA, h_A );
            lapackf77_zgetrf( &Ak, &Ak, h_A, &lda, ipiv, &info );
            for( int j = 0; j < Ak; ++j ) {
                for( int i = 0; i < j; ++i ) {
                    *h_A(i,j) = *h_A(j,i);
                }
            }
            
            lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B );
            memcpy(h_B1, h_B, sizeB*sizeof(magmaDoubleComplex));
            
            /* =====================================================================
               Performs operation using MAGMABLAS
               =================================================================== */
            magma_zsetmatrix( Ak, Ak, h_A, lda, d_A, ldda );
            magma_zsetmatrix( M, N, h_B, ldb, d_B, lddb );
            
            magma_time = magma_sync_wtime( NULL );
            magmablas_ztrsm( opts.side, opts.uplo, opts.transA, opts.diag, 
                             M, N,
                             alpha, d_A, ldda,
                                    d_B, lddb );
            magma_time = magma_sync_wtime( NULL ) - magma_time;
            magma_perf = gflops / magma_time;
            
            magma_zgetmatrix( M, N, d_B, lddb, h_Bmagma, ldb );
            
            /* =====================================================================
               Performs operation using CUBLAS
               =================================================================== */
            magma_zsetmatrix( M, N, h_B, ldb, d_B, lddb );
            
            cublas_time = magma_sync_wtime( NULL );
            cublasZtrsm( handle, cublas_side_const(opts.side), cublas_uplo_const(opts.uplo),
                         cublas_trans_const(opts.transA), cublas_diag_const(opts.diag),
                         M, N, 
                         &alpha, d_A, ldda,
                                 d_B, lddb );
            cublas_time = magma_sync_wtime( NULL ) - cublas_time;
            cublas_perf = gflops / cublas_time;
            
            magma_zgetmatrix( M, N, d_B, lddb, h_Bcublas, ldb );
            
            /* =====================================================================
               Performs operation using CPU BLAS
               =================================================================== */
            if ( opts.lapack ) {
                cpu_time = magma_wtime();
                blasf77_ztrsm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), 
                               &M, &N,
                               &alpha, h_A, &lda,
                                       h_B, &ldb );
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
            }
            
            /* =====================================================================
               Check the result
               =================================================================== */
            // ||b - Ax|| / (||A||*||x||)
            memcpy(h_X1, h_Bmagma, sizeB*sizeof(magmaDoubleComplex));
            
            magmaDoubleComplex alpha2 = MAGMA_Z_DIV(  c_one, alpha );
            blasf77_ztrmm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), 
                            &M, &N,
                            &alpha2, h_A, &lda,
                            h_X1, &ldb );

            blasf77_zaxpy( &sizeB, &c_neg_one, h_B1, &ione, h_X1, &ione );
            double norm1 =  lapackf77_zlange( "M", &M, &N, h_X1, &ldb, work );
            double normx =  lapackf77_zlange( "M", &M, &N, h_Bmagma, &ldb, work );
            double normA =  lapackf77_zlange( "M", &Ak, &Ak, h_A, &lda, work );

            magma_error = norm1/(normx*normA);

            memcpy(h_X2, h_Bcublas, sizeB*sizeof(magmaDoubleComplex));
            blasf77_ztrmm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), 
                            &M, &N,
                            &alpha2, h_A, &lda,
                            h_X2, &ldb );

            blasf77_zaxpy( &sizeB, &c_neg_one, h_B1, &ione, h_X2, &ione );
            norm1 =  lapackf77_zlange( "M", &M, &N, h_X2, &ldb, work );
            normx =  lapackf77_zlange( "M", &M, &N, h_Bcublas, &ldb, work );
            normA =  lapackf77_zlange( "M", &Ak, &Ak, h_A, &lda, work );
            
            cublas_error = norm1/(normx*normA);
            
            if ( opts.lapack ) {
                printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)   %7.2f (%7.2f)   %8.2e     %8.2e   %s\n",
                        (int) M, (int) N,
                        magma_perf,  1000.*magma_time,
                        cublas_perf, 1000.*cublas_time,
                        cpu_perf,    1000.*cpu_time,
                        magma_error, cublas_error,
                        (magma_error < tol && cublas_error < tol? "ok" : "failed"));
                status += ! (magma_error < tol && cublas_error < tol);
            }
            else {
                printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)     ---   (  ---  )   %8.2e     %8.2e   %s\n",
                        (int) M, (int) N,
                        magma_perf,  1000.*magma_time,
                        cublas_perf, 1000.*cublas_time,
                        magma_error, cublas_error,
                        (magma_error < tol && cublas_error < tol? "ok" : "failed"));
                status += ! (magma_error < tol && cublas_error < tol);
            }
            
            TESTING_FREE_CPU( h_A  );
            TESTING_FREE_CPU( h_B  );
            TESTING_FREE_CPU( h_B1 );
            TESTING_FREE_CPU( h_X1 );
            TESTING_FREE_CPU( h_X2 );
            TESTING_FREE_CPU( h_Bcublas );
            TESTING_FREE_CPU( h_Bmagma  );
            
            TESTING_FREE_DEV( d_A );
            TESTING_FREE_DEV( d_B );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    TESTING_FINALIZE();
    return status;
}
Пример #22
0
/**
    Purpose
    -------
    ZHEEVD computes all eigenvalues and, optionally, eigenvectors of a
    complex Hermitian matrix A.  If eigenvectors are desired, it uses a
    divide and conquer algorithm.

    The divide and conquer algorithm makes very mild assumptions about
    floating point arithmetic. It will work on machines with a guard
    digit in add/subtract, or on those binary machines without guard
    digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
    Cray-2. It could conceivably fail on hexadecimal or decimal machines
    without guard digits, but we know of none.

    Arguments
    ---------
    @param[in]
    jobz    magma_vec_t
      -     = MagmaNoVec:  Compute eigenvalues only;
      -     = MagmaVec:    Compute eigenvalues and eigenvectors.

    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of A is stored;
      -     = MagmaLower:  Lower triangle of A is stored.

    @param[in]
    n       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in,out]
    A       COMPLEX_16 array, dimension (LDA, N)
            On entry, the Hermitian matrix A.  If UPLO = MagmaUpper, the
            leading N-by-N upper triangular part of A contains the
            upper triangular part of the matrix A.  If UPLO = MagmaLower,
            the leading N-by-N lower triangular part of A contains
            the lower triangular part of the matrix A.
            On exit, if JOBZ = MagmaVec, then if INFO = 0, A contains the
            orthonormal eigenvectors of the matrix A.
            If JOBZ = MagmaNoVec, then on exit the lower triangle (if UPLO=MagmaLower)
            or the upper triangle (if UPLO=MagmaUpper) of A, including the
            diagonal, is destroyed.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    w       DOUBLE PRECISION array, dimension (N)
            If INFO = 0, the eigenvalues in ascending order.

    @param[out]
    work    (workspace) COMPLEX_16 array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The length of the array WORK.
            If N <= 1,                      LWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LWORK >= N + N*NB.
            If JOBZ = MagmaVec   and N > 1, LWORK >= max( N + N*NB, 2*N + N**2 ).
            NB can be obtained through magma_get_zhetrd_nb(N).
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal sizes of the WORK, RWORK and
            IWORK arrays, returns these values as the first entries of
            the WORK, RWORK and IWORK arrays, and no error message
            related to LWORK or LRWORK or LIWORK is issued by XERBLA.

    @param[out]
    rwork   (workspace) DOUBLE PRECISION array, dimension (LRWORK)
            On exit, if INFO = 0, RWORK[0] returns the optimal LRWORK.

    @param[in]
    lrwork  INTEGER
            The dimension of the array RWORK.
            If N <= 1,                      LRWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LRWORK >= N.
            If JOBZ = MagmaVec   and N > 1, LRWORK >= 1 + 5*N + 2*N**2.
    \n
            If LRWORK = -1, then a workspace query is assumed; the
            routine only calculates the optimal sizes of the WORK, RWORK
            and IWORK arrays, returns these values as the first entries
            of the WORK, RWORK and IWORK arrays, and no error message
            related to LWORK or LRWORK or LIWORK is issued by XERBLA.

    @param[out]
    iwork   (workspace) INTEGER array, dimension (MAX(1,LIWORK))
            On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK.

    @param[in]
    liwork  INTEGER
            The dimension of the array IWORK.
            If N <= 1,                      LIWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LIWORK >= 1.
            If JOBZ = MagmaVec   and N > 1, LIWORK >= 3 + 5*N.
    \n
            If LIWORK = -1, then a workspace query is assumed; the
            routine only calculates the optimal sizes of the WORK, RWORK
            and IWORK arrays, returns these values as the first entries
            of the WORK, RWORK and IWORK arrays, and no error message
            related to LWORK or LRWORK or LIWORK is issued by XERBLA.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  if INFO = i and JOBZ = MagmaNoVec, then the algorithm failed
                  to converge; i off-diagonal elements of an intermediate
                  tridiagonal form did not converge to zero;
                  if INFO = i and JOBZ = MagmaVec, then the algorithm failed
                  to compute an eigenvalue while working on the submatrix
                  lying in rows and columns INFO/(N+1) through
                  mod(INFO,N+1).

    Further Details
    ---------------
    Based on contributions by
       Jeff Rutter, Computer Science Division, University of California
       at Berkeley, USA

    Modified description of INFO. Sven, 16 Feb 05.

    @ingroup magma_zheev_driver
    ********************************************************************/
extern "C" magma_int_t
magma_zheevd(
    magma_vec_t jobz, magma_uplo_t uplo,
    magma_int_t n,
    magmaDoubleComplex *A, magma_int_t lda,
    double *w,
    magmaDoubleComplex *work, magma_int_t lwork,
    #ifdef COMPLEX
    double *rwork, magma_int_t lrwork,
    #endif
    magma_int_t *iwork, magma_int_t liwork,
    magma_int_t *info)
{
    const char* uplo_ = lapack_uplo_const( uplo );
    const char* jobz_ = lapack_vec_const( jobz );
    magma_int_t ione = 1;
    magma_int_t izero = 0;
    double d_one = 1.;

    double d__1;

    double eps;
    magma_int_t inde;
    double anrm;
    magma_int_t imax;
    double rmin, rmax;
    double sigma;
    magma_int_t iinfo, lwmin;
    magma_int_t lower;
    magma_int_t llrwk;
    magma_int_t wantz;
    magma_int_t indwk2, llwrk2;
    magma_int_t iscale;
    double safmin;
    double bignum;
    magma_int_t indtau;
    magma_int_t indrwk, indwrk, liwmin;
    magma_int_t lrwmin, llwork;
    double smlnum;
    magma_int_t lquery;

    double* dwork;

    wantz = (jobz == MagmaVec);
    lower = (uplo == MagmaLower);
    lquery = (lwork == -1 || lrwork == -1 || liwork == -1);

    *info = 0;

    if (! (wantz || (jobz == MagmaNoVec))) {
        *info = -1;
    } else if (! (lower || (uplo == MagmaUpper))) {
        *info = -2;
    } else if (n < 0) {
        *info = -3;
    } else if (lda < max(1,n)) {
        *info = -5;
    }

    magma_int_t nb = magma_get_zhetrd_nb( n );
    if ( n <= 1 ) {
        lwmin  = 1;
        lrwmin = 1;
        liwmin = 1;
    }
    else if ( wantz ) {
        lwmin  = max( n + n*nb, 2*n + n*n );
        lrwmin = 1 + 5*n + 2*n*n;
        liwmin = 3 + 5*n;
    }
    else {
        lwmin  = n + n*nb;
        lrwmin = n;
        liwmin = 1;
    }
    
    // multiply by 1+eps (in Double!) to ensure length gets rounded up,
    // if it cannot be exactly represented in floating point.
    real_Double_t one_eps = 1. + lapackf77_dlamch("Epsilon");
    work[0]  = MAGMA_Z_MAKE( lwmin * one_eps, 0 );
    rwork[0] = lrwmin * one_eps;
    iwork[0] = liwmin;

    if ((lwork < lwmin) && !lquery) {
        *info = -8;
    } else if ((lrwork < lrwmin) && ! lquery) {
        *info = -10;
    } else if ((liwork < liwmin) && ! lquery) {
        *info = -12;
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (n == 0) {
        return *info;
    }

    if (n == 1) {
        w[0] = MAGMA_Z_REAL( A[0] );
        if (wantz) {
            A[0] = MAGMA_Z_ONE;
        }
        return *info;
    }

    /* If matrix is very small, then just call LAPACK on CPU, no need for GPU */
    if (n <= 128) {
        lapackf77_zheevd( jobz_, uplo_,
                          &n, A, &lda,
                          w, work, &lwork,
                          #ifdef COMPLEX
                          rwork, &lrwork,
                          #endif
                          iwork, &liwork, info );
        return *info;
    }

    /* Get machine constants. */
    safmin = lapackf77_dlamch("Safe minimum");
    eps    = lapackf77_dlamch("Precision");
    smlnum = safmin / eps;
    bignum = 1. / smlnum;
    rmin = magma_dsqrt( smlnum );
    rmax = magma_dsqrt( bignum );

    /* Scale matrix to allowable range, if necessary. */
    anrm = lapackf77_zlanhe( "M", uplo_, &n, A, &lda, rwork );
    iscale = 0;
    if (anrm > 0. && anrm < rmin) {
        iscale = 1;
        sigma = rmin / anrm;
    } else if (anrm > rmax) {
        iscale = 1;
        sigma = rmax / anrm;
    }
    if (iscale == 1) {
        lapackf77_zlascl( uplo_, &izero, &izero, &d_one, &sigma, &n, &n, A, &lda, info );
    }

    /* Call ZHETRD to reduce Hermitian matrix to tridiagonal form. */
    // zhetrd rwork: e (n)
    // zstedx rwork: e (n) + llrwk (1 + 4*N + 2*N**2)  ==>  1 + 5n + 2n^2
    inde   = 0;
    indrwk = inde + n;
    llrwk  = lrwork - indrwk;

    // zhetrd work: tau (n) + llwork (n*nb)  ==>  n + n*nb
    // zstedx work: tau (n) + z (n^2)
    // zunmtr work: tau (n) + z (n^2) + llwrk2 (n or n*nb)  ==>  2n + n^2, or n + n*nb + n^2
    indtau = 0;
    indwrk = indtau + n;
    indwk2 = indwrk + n*n;
    llwork = lwork - indwrk;
    llwrk2 = lwork - indwk2;

    magma_timer_t time=0;
    timer_start( time );

    magma_zhetrd( uplo, n, A, lda, w, &rwork[inde],
                  &work[indtau], &work[indwrk], llwork, &iinfo );

    timer_stop( time );
    timer_printf( "time zhetrd = %6.2f\n", time );

    /* For eigenvalues only, call DSTERF.  For eigenvectors, first call
     * ZSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the
     * tridiagonal matrix, then call ZUNMTR to multiply it to the Householder
     * transformations represented as Householder vectors in A. */
    if (! wantz) {
        lapackf77_dsterf( &n, w, &rwork[inde], info );
    }
    else {
        timer_start( time );

        if (MAGMA_SUCCESS != magma_dmalloc( &dwork, 3*n*(n/2 + 1) )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }

        magma_zstedx( MagmaRangeAll, n, 0., 0., 0, 0, w, &rwork[inde],
                      &work[indwrk], n, &rwork[indrwk], llrwk,
                      iwork, liwork, dwork, info );

        magma_free( dwork );

        timer_stop( time );
        timer_printf( "time zstedx = %6.2f\n", time );
        timer_start( time );

        magma_zunmtr( MagmaLeft, uplo, MagmaNoTrans, n, n, A, lda, &work[indtau],
                      &work[indwrk], n, &work[indwk2], llwrk2, &iinfo );

        lapackf77_zlacpy( "A", &n, &n, &work[indwrk], &n, A, &lda );

        timer_stop( time );
        timer_printf( "time zunmtr + copy = %6.2f\n", time );
    }

    /* If matrix was scaled, then rescale eigenvalues appropriately. */
    if (iscale == 1) {
        if (*info == 0) {
            imax = n;
        } else {
            imax = *info - 1;
        }
        d__1 = 1. / sigma;
        blasf77_dscal( &imax, &d__1, w, &ione );
    }

    work[0]  = MAGMA_Z_MAKE( lwmin * one_eps, 0 );  // round up
    rwork[0] = lrwmin * one_eps;
    iwork[0] = liwmin;

    return *info;
} /* magma_zheevd */
Пример #23
0
/**
    Purpose
    -------
    ZPOTRF computes the Cholesky factorization of a complex Hermitian
    positive definite matrix dA.

    The factorization has the form
       dA = U**H * U,   if UPLO = MagmaUpper, or
       dA = L  * L**H,  if UPLO = MagmaLower,
    where U is an upper triangular matrix and L is lower triangular.

    This is the block version of the algorithm, calling Level 3 BLAS.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of dA is stored;
      -     = MagmaLower:  Lower triangle of dA is stored.

    @param[in]
    n       INTEGER
            The order of the matrix dA.  N >= 0.

    @param[in,out]
    d_lA    COMPLEX_16 array of pointers on the GPU, dimension (num_gpus)
            On entry, the Hermitian matrix dA distributed over GPUs
            (dl_A[d] points to the local matrix on the d-th GPU).  
            It is distributed in 1D block column or row cyclic (with the
            block size of nb) if UPLO = MagmaUpper or MagmaLower, respectively.
            If UPLO = MagmaUpper, the leading N-by-N upper triangular 
            part of dA contains the upper triangular part of the matrix dA, 
            and the strictly lower triangular part of dA is not referenced.  
            If UPLO = MagmaLower, the leading N-by-N lower triangular part 
            of dA contains the lower triangular part of the matrix dA, and 
            the strictly upper triangular part of dA is not referenced.
    \n
            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization dA = U**H * U or dA = L * L**H.

    @param[in]
    ldda     INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalescent memory accesses LDDA must be
            divisible by 16.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.

    @ingroup magma_zposv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zpotrf_mgpu_right(magma_int_t num_gpus, magma_uplo_t uplo, magma_int_t n,
                        magmaDoubleComplex **d_lA, magma_int_t ldda, magma_int_t *info )
{
    #define dlA(id, i, j)  (d_lA[(id)] + (j) * ldda + (i))
    #define dlP(id, i, j)  (d_lP[(id)] + (j) * ldda + (i))

    #define panel(j)  (panel + (j))
    #define tmppanel(j)  (tmppanel + (j))
    #define tmpprevpanel(j)  (tmpprevpanel + (j))
    #define STREAM_ID(i) (num_streams > 1 ? 1+((i)/nb)%(num_streams-1) : 0)

    magmaDoubleComplex z_one = MAGMA_Z_MAKE(  1.0, 0.0 );
    magmaDoubleComplex mz_one = MAGMA_Z_MAKE( -1.0, 0.0 );
    double             one =  1.0;
    double             m_one = -1.0;
    const char* uplo_ = lapack_uplo_const( uplo );

    magma_int_t j, nb, d, id, j_local, blkid, crosspoint, prevj, prevtrsmrows, num_streams = 5;
    magmaDoubleComplex *panel, *tmppanel0, *tmppanel1, *tmppanel, *tmpprevpanel;
    magmaDoubleComplex *d_lP[MagmaMaxGPUs], *dlpanel, *dlpanels[MagmaMaxGPUs];
    magma_int_t rows, trsmrows, ngpu, n_local[MagmaMaxGPUs], ldpanel;
    magma_queue_t stream[MagmaMaxGPUs][10];

    *info = 0;
    if ( uplo != MagmaUpper && uplo != MagmaLower ) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,n)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );

    nb = magma_get_zpotrf_nb(n);

    ldpanel = ldda;
    magma_setdevice(0);
    if (MAGMA_SUCCESS != magma_zmalloc_pinned( &panel, 2 * nb * ldpanel )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }

    tmppanel0 = panel;
    tmppanel1 = tmppanel0 + nb * ldpanel;

    if ((nb <= 1) || (nb >= n)) {
        // Use unblocked code.
        magma_zgetmatrix( n, n, dlA(0, 0, 0), ldda, panel, ldpanel);
        lapackf77_zpotrf( uplo_, &n, panel, &ldpanel, info);
        magma_zsetmatrix( n, n, panel, ldpanel, dlA(0, 0, 0), ldda );
    } else {
        for( d = 0; d < num_gpus; d++ ) {
            // local-n and local-ld
            n_local[d] = ((n / nb) / num_gpus) * nb;
            if (d < (n / nb) % num_gpus)
                n_local[d] += nb;
            else if (d == (n / nb) % num_gpus)
                n_local[d] += n % nb;

            magma_setdevice(d);
            magma_device_sync();
            if (MAGMA_SUCCESS != magma_zmalloc( &d_lP[d], nb * ldda )) {
                for( j = 0; j < d; j++ ) {
                    magma_setdevice(j);
                    magma_free( d_lP[d] );
                }
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }
            for( j=0; j < num_streams; j++ ) {
                magma_queue_create( &stream[d][j] );
            }
        }

        //#define ENABLE_TIMER
        #if defined (ENABLE_TIMER)
        real_Double_t therk[4], tmtc, tcchol, tctrsm, tctm, tmnp, tcnp;
        real_Double_t ttot_herk[4] = {0,0,0,0}, ttot_mtc = 0, ttot_cchol = 0, ttot_ctrsm = 0, ttot_ctm = 0, ttot_mnp = 0, ttot_cnp = 0;
        printf("\n\n %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n",
                "j", "nb", "row", "mtc", "CPU_np", "panel", "ctrsm", "CH+TRSM", "CPU", "dsyrk[0]", "dsyrk[1]", "dsyrk[2]", "dsyrk[3]", "ctm P", "gpu_np");
        printf("     ====================================================================================================\n");
        #endif

        // Use blocked code.
        if (uplo == MagmaUpper) {
            printf( " === not supported, yet ===\n" );
        } else {
            blkid = -1;
            if (num_gpus == 4)
                crosspoint = n;
            else if (num_gpus == 3)
                crosspoint = n;
            else if (num_gpus == 2)
                crosspoint = 20160;
            else
                crosspoint = 0;
            crosspoint = 0; //n; //n -- > gpu always does next panel, 0 --> cpu always does next panel
            crosspoint = n;

            #if defined (ENABLE_TIMER)
            real_Double_t tget = get_time(), tset = 0.0, ttot = 0.0;
            #endif
            if ( n > nb ) {
                // send first panel to cpu
                magma_setdevice(0);
                tmppanel = tmppanel0;
                magma_zgetmatrix_async(n, nb,
                        dlA(0, 0, 0), ldda,
                        tmppanel(0),  ldpanel,
                        stream[0][0] );
            }
            #if defined (ENABLE_TIMER)
            for( d=0; d < num_gpus; d++ ) {
                magma_setdevice(d);
                magma_device_sync();
            }
            tget = get_time()-tget;
            #endif

            // Compute the Cholesky factorization A = L*L'
            for (j = 0; (j + nb) < n; j += nb) {
                #if defined (ENABLE_TIMER)
                therk[0] = therk[1] = therk[2] = therk[3] = tmtc = tcchol = tctrsm = tctm = tmnp = tcnp = 0.0;
                #endif

                blkid += 1;
                tmppanel = (blkid % 2 == 0) ? tmppanel0 : tmppanel1;
                // Set the gpu number that holds the current panel
                id = (j / nb) % num_gpus;
                magma_setdevice(id);

                // Set the local index where the current panel is
                j_local = j / (nb * num_gpus) * nb;
                
                rows = n - j;
                // Wait for the panel on cpu
                magma_queue_sync( stream[id][0] );
                if (j > 0 && prevtrsmrows > crosspoint) {
                    #if defined (ENABLE_TIMER)
                    tcnp = get_time();
                    #endif

                    tmpprevpanel = ((blkid - 1) % 2) == 0 ? tmppanel0 : tmppanel1;

                    blasf77_zgemm( MagmaNoTransStr, MagmaConjTransStr,
                            &rows, &nb, &nb,
                            &mz_one, tmpprevpanel(j), &ldpanel,
                                     tmpprevpanel(j), &ldpanel,
                            &z_one,      tmppanel(j), &ldpanel );

                    #if defined (ENABLE_TIMER)
                    tcnp = get_time() - tcnp;
                    ttot_cnp += tcnp;
                    #endif
                }

                #if defined (ENABLE_TIMER)
                tcchol = get_time();
                #endif
                lapackf77_zpotrf(MagmaLowerStr, &nb, tmppanel(j), &ldpanel, info);
                if (*info != 0) {
                    *info = *info + j;
                    break;
                }

                #if defined (ENABLE_TIMER)
                tcchol = get_time() - tcchol;
                ttot_cchol += tcchol;
                tctrsm = get_time();
                #endif

                trsmrows = rows - nb;

                if (trsmrows > 0) {
                    blasf77_ztrsm(MagmaRightStr, MagmaLowerStr, MagmaConjTransStr, MagmaNonUnitStr,
                                  &trsmrows, &nb,
                                  &z_one, tmppanel(j), &ldpanel,
                                          tmppanel(j + nb), &ldpanel);
                }

                #if defined (ENABLE_TIMER)
                tctrsm = get_time() - tctrsm;
                ttot_ctrsm += tctrsm;
                tctm = get_time();
                #endif

                d = (id + 1) % num_gpus;
                // send current panel to gpus
                for (ngpu = 0; ngpu < num_gpus; ngpu++, d = (d + 1) % num_gpus ) {
                    magma_int_t myrows = 0;
                    magma_int_t row_offset = 0;
                    if ( d == id ) {
                        dlpanel = dlA(d, j, j_local);
                        myrows = rows;
                        row_offset = 0;
                    } else {
                        dlpanel = dlP(d, 0, 0);
                        myrows = trsmrows;
                        row_offset = nb;
                    }

                    if (myrows > 0) {
                        magma_setdevice(d);
                        magma_zsetmatrix_async(myrows, nb,
                                tmppanel(j + row_offset),    ldpanel,
                                dlpanel, ldda, stream[d][0] );
                    }
                }
                /* make sure panel is on GPUs */
                d = (id + 1) % num_gpus;
                for (ngpu = 0; ngpu < num_gpus; ngpu++, d = (d + 1) % num_gpus ) {
                    magma_setdevice(d);
                    magma_queue_sync( stream[d][0] );
                }

                #if defined (ENABLE_TIMER)
                tctm = get_time() - tctm;
                ttot_ctm += tctm;
                #endif

                if ( (j + nb) < n) {
                    magma_int_t offset = 0;
                    magma_int_t row_offset = 0;
                    if (j + nb + nb < n) {
                        d = (id + 1) % num_gpus;
                        magma_setdevice(d);
                        magma_int_t j_local2 = (j + nb) / (nb * num_gpus) * nb;
                        if (trsmrows <= crosspoint) {
                            #if defined (ENABLE_TIMER)
                            tmnp = get_time();
                            #endif

                            // do gemm on look ahead panel
                            if ( d == id ) {
                                dlpanel = dlA(d, j + nb, j_local);
                            } else {
                                dlpanel = dlP(d, 0, 0);
                            }

                            magmablasSetKernelStream(stream[d][STREAM_ID(j_local2)]);
                            #define ZHERK_ON_DIAG
                            #ifdef  ZHERK_ON_DIAG
                            magma_zherk( MagmaLower, MagmaNoTrans,
                                    nb, nb,
                                    m_one, dlpanel, ldda,
                                     one,  dlA(d, j + nb, j_local2), ldda);
                            magma_zgemm( MagmaNoTrans, MagmaConjTrans,
                                    trsmrows-nb, nb, nb,
                                    mz_one, dlpanel+nb, ldda,
                                            dlpanel,    ldda,
                                     z_one, dlA(d, j + nb +nb, j_local2), ldda);
                            #else
                            magma_zgemm( MagmaNoTrans, MagmaConjTrans,
                                    trsmrows, nb, nb,
                                    mz_one, dlpanel, ldda,
                                            dlpanel, ldda,
                                     z_one, dlA(d, j + nb, j_local2), ldda);
                            #endif

                            #if defined (ENABLE_TIMER)
                            magma_device_sync();
                            tmnp = get_time() - tmnp;
                            ttot_mnp += tmnp;
                            #endif
                        }
                        // send next panel to cpu
                        magma_queue_sync( stream[d][STREAM_ID(j_local2)] ); // make sure lookahead is done
                        tmppanel = ((blkid+1) % 2 == 0) ? tmppanel0 : tmppanel1;
                        magma_zgetmatrix_async(rows-nb, nb,
                                dlA(d, j+nb, j_local2), ldda,
                                tmppanel(j+nb),  ldpanel,
                                stream[d][0] );
                        tmppanel = (blkid % 2 == 0) ? tmppanel0 : tmppanel1;

                        offset = j + nb + nb;
                        row_offset = nb;
                    } else {
                        offset = j + nb;
                        row_offset = 0;
                    }

                    if (n - offset > 0) {
                        // syrk on multiple gpu
                        for (d = 0; d < num_gpus; d++ ) {
                            if ( d == id ) {
                                dlpanels[d] = dlA(d, j + nb + row_offset, j_local);
                            } else {
                                dlpanels[d] = dlP(d, row_offset, 0);
                            }
                        }

                        #if defined (ENABLE_TIMER)
                        for( d=0; d < num_gpus; d++ ) therk[d] = get_time();
                        #endif

                        //magmablasSetKernelStream(stream[d]);
                        //magma_zherk(MagmaLower, MagmaNoTrans, n - offset, nb,
                        //        m_one, dlpanel, ldda,
                        //        one, &d_lA[d][offset + offset*ldda], ldda );
                        #ifdef  ZHERK_ON_DIAG
                        magma_zherk_mgpu
                        #else
                        magma_zherk_mgpu2
                        #endif
                                        (num_gpus, MagmaLower, MagmaNoTrans,
                                         nb, n - offset, nb,
                                         m_one, dlpanels, ldda, 0,
                                         one,   d_lA,     ldda, offset,
                                         num_streams, stream );
                        #if defined (ENABLE_TIMER)
                        for( d=0; d < num_gpus; d++ ) {
                            magma_setdevice(d);
                            magma_device_sync();
                            therk[d] = get_time() - therk[d];
                            ttot_herk[d] += therk[d];
                        }
                        #endif
                    }

                    prevtrsmrows = trsmrows;
                    prevj = j;

                    #if defined (ENABLE_TIMER)
                    ttot += (tcnp+tcchol+tctrsm+therk[0]+therk[1]+therk[2]+tctm+tmnp);
                    printf("%10d %10d %10d %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf(%d) %10.3lf\n",
                            j, nb, rows, tmtc,
                            tcnp,     // gemm
                            tcchol,   // potrf
                            tctrsm,   // trsm
                            (tcchol + tctrsm),
                            (tmtc+tcnp+tcchol+tctrsm),
                            therk[0], therk[1], therk[2], therk[3], // syrk
                            tctm, // copy panel to GPU
                            tmnp, // lookahead on GPU
                            (id + 1) % num_gpus,
                            (tcnp+tcchol+tctrsm+therk[0]+therk[1]+therk[2]+tctm+tmnp));
                    fflush(0);
                    #endif
                }
            }
            for( d = 0; d < num_gpus; d++ ) {
                magma_setdevice(d);
                for( id=0; id < num_streams; id++ ) {
                    magma_queue_sync( stream[d][id] );
                }
            }
            #if defined (ENABLE_TIMER)
            printf("\n%10d %10d %10d %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf(-) %10.3lf\n",
                    n, n, 0, ttot_mtc,
                    ttot_cnp,     // gemm
                    ttot_cchol,   // potrf
                    ttot_ctrsm,   // trsm
                    (ttot_cchol + ttot_ctrsm),
                    (ttot_mtc+ttot_cnp+ttot_cchol+ttot_ctrsm),
                    ttot_herk[0], ttot_herk[1], ttot_herk[2], ttot_herk[3], // syrk
                    ttot_ctm, // copy panel to GPU
                    ttot_mnp, // lookahead on GPU
                    (ttot_cnp+ttot_cchol+ttot_ctrsm+ttot_herk[0]+ttot_herk[1]+ttot_herk[2]+ttot_ctm+ttot_mnp));
            printf("%10d %10d %10d %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf(-) %10.3lf (ratio)\n",
                    n, n, 0, ttot_mtc/ttot,
                    ttot_cnp/ttot,     // gemm
                    ttot_cchol/ttot,   // potrf
                    ttot_ctrsm/ttot,   // trsm
                    (ttot_cchol + ttot_ctrsm)/ttot,
                    (ttot_mtc+ttot_cnp+ttot_cchol+ttot_ctrsm)/ttot,
                    ttot_herk[0]/ttot, ttot_herk[1]/ttot, ttot_herk[2]/ttot, ttot_herk[3]/ttot, // syrk
                    ttot_ctm/ttot, // copy panel to GPU
                    ttot_mnp/ttot, // lookahead on GPU
                    (ttot_cnp+ttot_cchol+ttot_ctrsm+ttot_herk[0]+ttot_herk[1]+ttot_herk[2]+ttot_ctm+ttot_mnp)/ttot);
            #endif

            // cholesky for the last block
            if (j < n && *info == 0) {
                rows = n - j;
                id = (j / nb) % num_gpus;

                // Set the local index where the current panel is
                j_local = j / (nb * num_gpus) * nb;
                
                magma_setdevice(id);
                #if defined (ENABLE_TIMER)
                tset = get_time();
                #endif
                magma_zgetmatrix(rows, rows, dlA(id, j, j_local), ldda, panel(j), ldpanel);
                lapackf77_zpotrf(MagmaLowerStr, &rows, panel(j), &ldpanel, info);
                magma_zsetmatrix(rows, rows, panel(j), ldpanel, dlA(id, j, j_local), ldda);
                #if defined (ENABLE_TIMER)
                tset = get_time() - tset;
                #endif
            }
            #if defined (ENABLE_TIMER)
            printf( " matrix_get,set: %10.3lf %10.3lf -> %10.3lf\n",tget,tset,ttot+tget+tset );
            #endif
        } // end of else not upper

        // clean up
        for( d = 0; d < num_gpus; d++ ) {
            magma_setdevice(d);
            for( j=0; j < num_streams; j++ ) {
                magma_queue_destroy( stream[d][j] );
            }
            magma_free( d_lP[d] );
        }
    } // end of not lapack

    // free workspace
    magma_free_pinned( panel );
    magma_setdevice( orig_dev );
    magmablasSetKernelStream( orig_stream );

    return *info;
} /* magma_zpotrf_mgpu_right */
Пример #24
0
/**
    Purpose
    -------
    Solves the least squares problem
           min || A*X - C ||
    using the QR factorization A = Q*R computed by ZGEQRF_GPU.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A. M >= N >= 0.

    @param[in]
    nrhs    INTEGER
            The number of columns of the matrix C. NRHS >= 0.

    @param[in]
    dA      COMPLEX_16 array on the GPU, dimension (LDDA,N)
            The i-th column must contain the vector which defines the
            elementary reflector H(i), for i = 1,2,...,n, as returned by
            ZGEQRF_GPU in the first n columns of its array argument A.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array A, LDDA >= M.

    @param[in]
    tau     COMPLEX_16 array, dimension (N)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by MAGMA_ZGEQRF_GPU.

    @param[in,out]
    dB      COMPLEX_16 array on the GPU, dimension (LDDB,NRHS)
            On entry, the M-by-NRHS matrix C.
            On exit, the N-by-NRHS solution matrix X.

    @param[in]
    dT      COMPLEX_16 array that is the output (the 6th argument)
            of magma_zgeqrf_gpu of size
            2*MIN(M, N)*NB + ((N+31)/32*32 )* MAX(NB, NRHS).
            The array starts with a block of size MIN(M,N)*NB that stores
            the triangular T matrices used in the QR factorization,
            followed by MIN(M,N)*NB block storing the diagonal block
            inverses for the R matrix, followed by work space of size
            ((N+31)/32*32 )* MAX(NB, NRHS).

    @param[in]
    lddb    INTEGER
            The leading dimension of the array dB. LDDB >= M.

    @param[out]
    hwork   (workspace) COMPLEX_16 array, dimension (LWORK)
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK,
            LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB,
            where NB is the blocksize given by magma_get_zgeqrf_nb( M ).
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the HWORK array, returns
            this value as the first entry of the WORK array.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_zgels_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zgeqrs_gpu(
    magma_int_t m, magma_int_t n, magma_int_t nrhs,
    magmaDoubleComplex_ptr dA,    magma_int_t ldda,
    magmaDoubleComplex *tau,
    magmaDoubleComplex_ptr dT,
    magmaDoubleComplex_ptr dB,    magma_int_t lddb,
    magmaDoubleComplex *hwork, magma_int_t lwork,
    magma_int_t *info)
{
    #define dA(a_1,a_2) (dA + (a_2)*(ldda) + (a_1))
    #define dT(a_1)     (dT + (lddwork+(a_1))*nb)

    magmaDoubleComplex c_zero    = MAGMA_Z_ZERO;
    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex_ptr dwork;
    magma_int_t i, k, lddwork, rows, ib;
    magma_int_t ione = 1;

    magma_int_t nb     = magma_get_zgeqrf_nb(m);
    magma_int_t lwkopt = (m - n + nb)*(nrhs + nb) + nrhs*nb;
    int lquery = (lwork == -1);

    hwork[0] = MAGMA_Z_MAKE( (double)lwkopt, 0. );

    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0 || m < n)
        *info = -2;
    else if (nrhs < 0)
        *info = -3;
    else if (ldda < max(1,m))
        *info = -5;
    else if (lddb < max(1,m))
        *info = -9;
    else if (lwork < lwkopt && ! lquery)
        *info = -11;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;

    k = min(m,n);
    if (k == 0) {
        hwork[0] = c_one;
        return *info;
    }

    /* B := Q' * B */
    magma_zunmqr_gpu( MagmaLeft, Magma_ConjTrans,
                      m, nrhs, n,
                      dA(0,0), ldda, tau,
                      dB, lddb, hwork, lwork, dT, nb, info );
    if ( *info != 0 ) {
        return *info;
    }

    /* Solve R*X = B(1:n,:) */
    lddwork= k;
    if (nb < k)
        dwork = dT+2*lddwork*nb;
    else
        dwork = dT;
    // To do: Why did we have this line originally; seems to be a bug (Stan)?
    // dwork = dT;

    i    = (k-1)/nb * nb;
    ib   = n-i;
    rows = m-i;

    // TODO: this assumes that, on exit from magma_zunmqr_gpu, hwork contains
    // the last block of A and B (i.e., C in zunmqr). This should be fixed.
    // Seems this data should already be on the GPU, so could switch to
    // magma_ztrsm and drop the zsetmatrix.
    if ( nrhs == 1 ) {
        blasf77_ztrsv( MagmaUpperStr, MagmaNoTransStr, MagmaNonUnitStr,
                       &ib, hwork,         &rows,
                            hwork+rows*ib, &ione);
    } else {
        blasf77_ztrsm( MagmaLeftStr, MagmaUpperStr, MagmaNoTransStr, MagmaNonUnitStr,
                       &ib, &nrhs,
                       &c_one, hwork,         &rows,
                               hwork+rows*ib, &rows);
    }
    
    // update the solution vector
    magma_zsetmatrix( ib, nrhs, hwork+rows*ib, rows, dwork+i, lddwork );

    // update c
    if (nrhs == 1)
        magma_zgemv( MagmaNoTrans, i, ib,
                     c_neg_one, dA(0, i), ldda,
                                dwork + i,   1,
                     c_one,     dB,           1);
    else
        magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                     i, nrhs, ib,
                     c_neg_one, dA(0, i), ldda,
                                dwork + i,   lddwork,
                     c_one,     dB,           lddb);

    int start = i-nb;
    if (nb < k) {
        for (i = start; i >= 0; i -= nb) {
            ib = min(k-i, nb);
            rows = m -i;

            if (i + ib < n) {
                if (nrhs == 1) {
                    magma_zgemv( MagmaNoTrans, ib, ib,
                                 c_one,  dT(i), ib,
                                         dB+i,      1,
                                 c_zero, dwork+i,  1);
                    magma_zgemv( MagmaNoTrans, i, ib,
                                 c_neg_one, dA(0, i), ldda,
                                            dwork + i,   1,
                                 c_one,     dB,           1);
                }
                else {
                    magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                                 ib, nrhs, ib,
                                 c_one,  dT(i), ib,
                                         dB+i,      lddb,
                                 c_zero, dwork+i,  lddwork);
                    magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                                 i, nrhs, ib,
                                 c_neg_one, dA(0, i), ldda,
                                            dwork + i,   lddwork,
                                 c_one,     dB,          lddb);
                }
            }
        }
    }

    magma_zcopymatrix( (n), nrhs,
                       dwork, lddwork,
                       dB,    lddb );
    
    return *info;
}
Пример #25
0
int main(int argc, char **argv)
{
    TESTING_INIT();

    real_Double_t   gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf, cpu_time;
    double          magma_error, cublas_error, work[1];
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t M, N, Xm, Ym, lda, sizeA, sizeX, sizeY;
    magma_int_t incx = 1;
    magma_int_t incy = 1;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex alpha = MAGMA_Z_MAKE(  1.5, -2.3 );
    magmaDoubleComplex beta  = MAGMA_Z_MAKE( -0.6,  0.8 );
    magmaDoubleComplex *A, *X, *Y, *Ycublas, *Ymagma;
    magmaDoubleComplex *dA, *dX, *dY;
    magma_int_t status = 0;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    double tol = opts.tolerance * lapackf77_dlamch("E");

    printf("trans = %s\n", lapack_trans_const(opts.transA) );
    printf("    M     N   MAGMA Gflop/s (ms)  CUBLAS Gflop/s (ms)   CPU Gflop/s (ms)  MAGMA error  CUBLAS error\n");
    printf("===================================================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[itest];
            N = opts.nsize[itest];
            lda    = ((M+31)/32)*32;
            gflops = FLOPS_ZGEMV( M, N ) / 1e9;

            if ( opts.transA == MagmaNoTrans ) {
                Xm = N;
                Ym = M;
            } else {
                Xm = M;
                Ym = N;
            }

            sizeA = lda*N;
            sizeX = incx*Xm;
            sizeY = incy*Ym;
            
            TESTING_MALLOC_CPU( A,       magmaDoubleComplex, sizeA );
            TESTING_MALLOC_CPU( X,       magmaDoubleComplex, sizeX );
            TESTING_MALLOC_CPU( Y,       magmaDoubleComplex, sizeY );
            TESTING_MALLOC_CPU( Ycublas, magmaDoubleComplex, sizeY );
            TESTING_MALLOC_CPU( Ymagma,  magmaDoubleComplex, sizeY );
            
            TESTING_MALLOC_DEV( dA, magmaDoubleComplex, sizeA );
            TESTING_MALLOC_DEV( dX, magmaDoubleComplex, sizeX );
            TESTING_MALLOC_DEV( dY, magmaDoubleComplex, sizeY );
            
            /* Initialize the matrix */
            lapackf77_zlarnv( &ione, ISEED, &sizeA, A );
            lapackf77_zlarnv( &ione, ISEED, &sizeX, X );
            lapackf77_zlarnv( &ione, ISEED, &sizeY, Y );
            
            /* =====================================================================
               Performs operation using CUBLAS
               =================================================================== */
            magma_zsetmatrix( M, N, A, lda, dA, lda );
            magma_zsetvector( Xm, X, incx, dX, incx );
            magma_zsetvector( Ym, Y, incy, dY, incy );
            
            cublas_time = magma_sync_wtime( 0 );
            cublasZgemv( handle, cublas_trans_const(opts.transA),
                         M, N, &alpha, dA, lda, dX, incx, &beta, dY, incy );
            cublas_time = magma_sync_wtime( 0 ) - cublas_time;
            cublas_perf = gflops / cublas_time;
            
            magma_zgetvector( Ym, dY, incy, Ycublas, incy );
            
            /* =====================================================================
               Performs operation using MAGMABLAS
               =================================================================== */
            magma_zsetvector( Ym, Y, incy, dY, incy );
            
            magma_time = magma_sync_wtime( 0 );
            magmablas_zgemv( opts.transA, M, N, alpha, dA, lda, dX, incx, beta, dY, incy );
            magma_time = magma_sync_wtime( 0 ) - magma_time;
            magma_perf = gflops / magma_time;
            
            magma_zgetvector( Ym, dY, incy, Ymagma, incy );
            
            /* =====================================================================
               Performs operation using CPU BLAS
               =================================================================== */
            cpu_time = magma_wtime();
            blasf77_zgemv( lapack_trans_const(opts.transA), &M, &N,
                           &alpha, A, &lda,
                                   X, &incx,
                           &beta,  Y, &incy );
            cpu_time = magma_wtime() - cpu_time;
            cpu_perf = gflops / cpu_time;
            
            /* =====================================================================
               Check the result
               =================================================================== */
            blasf77_zaxpy( &Ym, &c_neg_one, Y, &incy, Ymagma, &incy );
            magma_error = lapackf77_zlange( "M", &Ym, &ione, Ymagma, &Ym, work ) / Ym;
            
            blasf77_zaxpy( &Ym, &c_neg_one, Y, &incy, Ycublas, &incy );
            cublas_error = lapackf77_zlange( "M", &Ym, &ione, Ycublas, &Ym, work ) / Ym;
            
            printf("%5d %5d   %7.2f (%7.2f)    %7.2f (%7.2f)   %7.2f (%7.2f)    %8.2e     %8.2e   %s\n",
                   (int) M, (int) N,
                   magma_perf,  1000.*magma_time,
                   cublas_perf, 1000.*cublas_time,
                   cpu_perf,    1000.*cpu_time,
                   magma_error, cublas_error,
                   (magma_error < tol && cublas_error < tol ? "ok" : "failed"));
            status += ! (magma_error < tol && cublas_error < tol);
            
            TESTING_FREE_CPU( A );
            TESTING_FREE_CPU( X );
            TESTING_FREE_CPU( Y );
            TESTING_FREE_CPU( Ycublas );
            TESTING_FREE_CPU( Ymagma  );
            
            TESTING_FREE_DEV( dA );
            TESTING_FREE_DEV( dX );
            TESTING_FREE_DEV( dY );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }
    
    TESTING_FINALIZE();
    return status;
}
Пример #26
0
/**
    Purpose
    -------
    ZUNMLQ overwrites the general complex M-by-N matrix C with

    @verbatim
                             SIDE = MagmaLeft     SIDE = MagmaRight
    TRANS = MagmaNoTrans:    Q * C                C * Q
    TRANS = Magma_ConjTrans: Q**H * C             C * Q**H
    @endverbatim

    where Q is a complexunitary matrix defined as the product of k
    elementary reflectors

          Q = H(k)**H . . . H(2)**H H(1)**H

    as returned by ZGELQF. Q is of order M if SIDE = MagmaLeft and of order N
    if SIDE = MagmaRight.

    Arguments
    ---------
    @param[in]
    side    magma_side_t
      -     = MagmaLeft:      apply Q or Q**H from the Left;
      -     = MagmaRight:     apply Q or Q**H from the Right.

    @param[in]
    trans   magma_trans_t
      -     = MagmaNoTrans:    No transpose, apply Q;
      -     = Magma_ConjTrans: Conjugate transpose, apply Q**H.

    @param[in]
    m       INTEGER
            The number of rows of the matrix C. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix C. N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines
            the matrix Q.
            If SIDE = MagmaLeft,  M >= K >= 0;
            if SIDE = MagmaRight, N >= K >= 0.

    @param[in]
    A       COMPLEX_16 array, dimension
                (LDA,M) if SIDE = MagmaLeft,
                (LDA,N) if SIDE = MagmaRight.
            The i-th row must contain the vector which defines the
            elementary reflector H(i), for i = 1,2,...,k, as returned by
            ZGELQF in the first k rows of its array argument A.
            A is modified by the routine but restored on exit.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A. LDA >= max(1,K).

    @param[in]
    tau     COMPLEX_16 array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by ZGELQF.

    @param[in,out]
    C       COMPLEX_16 array, dimension (LDC,N)
            On entry, the M-by-N matrix C.
            On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q.

    @param[in]
    ldc     INTEGER
            The leading dimension of the array C. LDC >= max(1,M).

    @param[out]
    work    (workspace) COMPLEX_16 array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.
            If SIDE = MagmaLeft,  LWORK >= max(1,N);
            if SIDE = MagmaRight, LWORK >= max(1,M).
            For optimum performance
            if SIDE = MagmaLeft,  LWORK >= N*NB;
            if SIDE = MagmaRight, LWORK >= M*NB,
            where NB is the optimal blocksize.
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_zgelqf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zunmlq(
    magma_side_t side, magma_trans_t trans,
    magma_int_t m, magma_int_t n, magma_int_t k,
    magmaDoubleComplex *A, magma_int_t lda,
    magmaDoubleComplex *tau,
    magmaDoubleComplex *C, magma_int_t ldc,
    magmaDoubleComplex *work, magma_int_t lwork,
    magma_int_t *info)
{
    #define  A(i_,j_) ( A + (i_) + (j_)*lda)
    #define dC(i_,j_) (dC + (i_) + (j_)*lddc)
    #define dV(i_,j_) (dV + (i_) + (j_)*ib)
    #define dT(i_,j_) (dT + (i_) + (j_)*ib)
    #define dwork(i_) (dwork + (i_))

    magmaDoubleComplex *T, *T2;
    magma_int_t i, i1, i2, ib, ic, jc, nb, mi, ni, nq, nq_i, nw, step;
    magma_int_t iinfo, ldwork, lwkopt;
    magma_int_t left, notran, lquery;
    magma_trans_t transt;

    *info = 0;
    left   = (side  == MagmaLeft);
    notran = (trans == MagmaNoTrans);
    lquery = (lwork == -1);

    /* NQ is the order of Q and NW is the minimum dimension of WORK */
    if (left) {
        nq = m;
        nw = n;
    } else {
        nq = n;
        nw = m;
    }
    
    /* Test the input arguments */
    if (! left && side != MagmaRight) {
        *info = -1;
    } else if (! notran && trans != Magma_ConjTrans) {
        *info = -2;
    } else if (m < 0) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (k < 0 || k > nq) {
        *info = -5;
    } else if (lda < max(1,k)) {
        *info = -7;
    } else if (ldc < max(1,m)) {
        *info = -10;
    } else if (lwork < max(1,nw) && ! lquery) {
        *info = -12;
    }

    if (*info == 0) {
        nb = magma_get_zgelqf_nb( min( m, n ));
        lwkopt = max(1,nw)*nb;
        work[0] = MAGMA_Z_MAKE( lwkopt, 0 );
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0 || k == 0) {
        work[0] = MAGMA_Z_ONE;
        return *info;
    }

    ldwork = nw;
    
    if (nb >= k) {
        /* Use CPU code */
        lapackf77_zunmlq( lapack_side_const(side), lapack_trans_const(trans),
            &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, &iinfo);
    }
    else {
        /* Use hybrid CPU-GPU code */
        /* Allocate work space on the GPU.
         * nw*nb  for dwork (m or n) by nb
         * nq*nb  for dV    (n or m) by nb
         * nb*nb  for dT
         * lddc*n for dC.
         */
        magma_int_t lddc = ((m+31)/32)*32;
        magmaDoubleComplex_ptr dwork, dV, dT, dC;
        magma_zmalloc( &dwork, (nw + nq + nb)*nb + lddc*n );
        if ( dwork == NULL ) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
        dV = dwork + nw*nb;
        dT = dV    + nq*nb;
        dC = dT    + nb*nb;
        
        /* work space on CPU.
         * nb*nb for T
         * nb*nb for T2, used to save and restore diagonal block of panel  */
        magma_zmalloc_cpu( &T, 2*nb*nb );
        if ( T == NULL ) {
            magma_free( dwork );
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        T2 = T + nb*nb;
        
        /* Copy matrix C from the CPU to the GPU */
        magma_zsetmatrix( m, n, C, ldc, dC(0,0), lddc );
        
        if ( (left && notran) || (! left && ! notran) ) {
            i1 = 0;
            i2 = k;
            step = nb;
        } else {
            i1 = ((k - 1) / nb)*nb;
            i2 = 0;
            step = -nb;
        }

        // silence "uninitialized" warnings
        mi = 0;
        ni = 0;
        
        if (left) {
            ni = n;
            jc = 0;
        } else {
            mi = m;
            ic = 0;
        }

        if (notran) {
            transt = Magma_ConjTrans;
        } else {
            transt = MagmaNoTrans;
        }

        for (i = i1; (step < 0 ? i >= i2 : i < i2); i += step) {
            ib = min(nb, k - i);
            
            /* Form the triangular factor of the block reflector
               H = H(i) H(i + 1) . . . H(i + ib-1) */
            nq_i = nq - i;
            lapackf77_zlarft("Forward", "Rowwise", &nq_i, &ib,
                             A(i,i), &lda, &tau[i], T, &ib);

            /* 1) set upper triangle of panel in A to identity,
               2) copy the panel from A to the GPU, and
               3) restore A                                      */
            zpanel_to_q( MagmaLower, ib, A(i,i), lda, T2 );
            magma_zsetmatrix( ib, nq_i,  A(i,i), lda, dV(0,0), ib );
            zq_to_panel( MagmaLower, ib, A(i,i), lda, T2 );
            
            if (left) {
                /* H or H**H is applied to C(i:m,1:n) */
                mi = m - i;
                ic = i;
            }
            else {
                /* H or H**H is applied to C(1:m,i:n) */
                ni = n - i;
                jc = i;
            }
            
            /* Apply H or H**H; First copy T to the GPU */
            magma_zsetmatrix( ib, ib, T, ib, dT(0,0), ib );
            magma_zlarfb_gpu( side, transt, MagmaForward, MagmaRowwise,
                              mi, ni, ib,
                              dV(0,0), ib,
                              dT(0,0), ib,
                              dC(ic,jc), lddc,
                              dwork(0), ldwork );
        }
        magma_zgetmatrix( m, n, dC(0,0), lddc, C, ldc );
        
        magma_free( dwork );
        magma_free_cpu( T );
    }
    work[0] = MAGMA_Z_MAKE( lwkopt, 0 );
    
    return *info;
} /* magma_zunmlq */
Пример #27
0
/**
    Purpose
    -------
    Solves the overdetermined, least squares problem
           min || A*X - C ||
    using the QR factorization A.
    The underdetermined problem (m < n) is not currently handled.


    Arguments
    ---------
    @param[in]
    trans   magma_trans_t
      -     = MagmaNoTrans:   the linear system involves A.
            Only TRANS=MagmaNoTrans is currently handled.

    @param[in]
    m       INTEGER
            The number of rows of the matrix A. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A. M >= N >= 0.

    @param[in]
    nrhs    INTEGER
            The number of columns of the matrix C. NRHS >= 0.

    @param[in,out]
    dA       COMPLEX_16 array on the GPU, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, A is overwritten by details of its QR
            factorization as returned by ZGEQRF.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array A, LDDA >= M.

    @param[in,out]
    dB      COMPLEX_16 array on the GPU, dimension (LDDB,NRHS)
            On entry, the M-by-NRHS matrix C.
            On exit, the N-by-NRHS solution matrix X.

    @param[in]
    lddb    INTEGER
            The leading dimension of the array dB. LDDB >= M.

    @param[out]
    hwork   (workspace) COMPLEX_16 array, dimension MAX(1,LWORK).
            On exit, if INFO = 0, HWORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The dimension of the array HWORK,
            LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB,
            where NB is the blocksize given by magma_get_zgeqrf_nb( M ).
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the HWORK array, returns
            this value as the first entry of the HWORK array.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_zgels_driver
    ********************************************************************/
extern "C" magma_int_t
magma_zgels_gpu(
    magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t nrhs,
    magmaDoubleComplex_ptr dA, magma_int_t ldda,
    magmaDoubleComplex_ptr dB, magma_int_t lddb,
    magmaDoubleComplex *hwork, magma_int_t lwork,
    magma_int_t *info)
{
    magmaDoubleComplex_ptr dT;
    magmaDoubleComplex *tau;
    magma_int_t k;

    magma_int_t nb     = magma_get_zgeqrf_nb(m);
    magma_int_t lwkopt = (m - n + nb)*(nrhs + nb) + nrhs*nb;
    int lquery = (lwork == -1);

    hwork[0] = MAGMA_Z_MAKE( (double)lwkopt, 0. );

    *info = 0;
    /* For now, N is the only case working */
    if ( trans != MagmaNoTrans )
        *info = -1;
    else if (m < 0)
        *info = -2;
    else if (n < 0 || m < n) /* LQ is not handle for now*/
        *info = -3;
    else if (nrhs < 0)
        *info = -4;
    else if (ldda < max(1,m))
        *info = -6;
    else if (lddb < max(1,m))
        *info = -8;
    else if (lwork < lwkopt && ! lquery)
        *info = -10;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;

    k = min(m,n);
    if (k == 0) {
        hwork[0] = MAGMA_Z_ONE;
        return *info;
    }

    /*
     * Allocate temporary buffers
     */
    int ldtwork = ( 2*k + ((n+31)/32)*32 )*nb;
    if (nb < nrhs)
        ldtwork = ( 2*k + ((n+31)/32)*32 )*nrhs;
    if (MAGMA_SUCCESS != magma_zmalloc( &dT, ldtwork )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
    
    magma_zmalloc_cpu( &tau, k );
    if ( tau == NULL ) {
        magma_free( dT );
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }

    magma_zgeqrf_gpu( m, n, dA, ldda, tau, dT, info );

    if ( *info == 0 ) {
        magma_zgeqrs_gpu( m, n, nrhs,
                          dA, ldda, tau, dT,
                          dB, lddb, hwork, lwork, info );
    }
    
    magma_free( dT );
    magma_free_cpu(tau);
    return *info;
}
Пример #28
0
/**
    Purpose
    -------
    ZLATRD2 reduces NB rows and columns of a complex Hermitian matrix A to
    Hermitian tridiagonal form by an orthogonal similarity
    transformation Q' * A * Q, and returns the matrices V and W which are
    needed to apply the transformation to the unreduced part of A.

    If UPLO = MagmaUpper, ZLATRD reduces the last NB rows and columns of a
    matrix, of which the upper triangle is supplied;
    if UPLO = MagmaLower, ZLATRD reduces the first NB rows and columns of a
    matrix, of which the lower triangle is supplied.

    This is an auxiliary routine called by ZHETRD2_GPU. It uses an
    accelerated HEMV that needs extra memory.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
            Specifies whether the upper or lower triangular part of the
            Hermitian matrix A is stored:
      -     = MagmaUpper: Upper triangular
      -     = MagmaLower: Lower triangular

    @param[in]
    n       INTEGER
            The order of the matrix A.

    @param[in]
    nb      INTEGER
            The number of rows and columns to be reduced.

    @param[in,out]
    A       COMPLEX_16 array, dimension (LDA,N)
            On entry, the Hermitian matrix A.  If UPLO = MagmaUpper, the leading
            n-by-n upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = MagmaLower, the
            leading n-by-n lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
            On exit:
      -     if UPLO = MagmaUpper, the last NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements above the diagonal
              with the array TAU, represent the orthogonal matrix Q as a
              product of elementary reflectors;
      -     if UPLO = MagmaLower, the first NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements below the diagonal
              with the array TAU, represent the  orthogonal matrix Q as a
              product of elementary reflectors.
            See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= (1,N).

    @param[out]
    e       COMPLEX_16 array, dimension (N-1)
            If UPLO = MagmaUpper, E(n-nb:n-1) contains the superdiagonal
            elements of the last NB columns of the reduced matrix;
            if UPLO = MagmaLower, E(1:nb) contains the subdiagonal elements of
            the first NB columns of the reduced matrix.

    @param[out]
    tau     COMPLEX_16 array, dimension (N-1)
            The scalar factors of the elementary reflectors, stored in
            TAU(n-nb:n-1) if UPLO = MagmaUpper, and in TAU(1:nb) if UPLO = MagmaLower.
            See Further Details.

    @param[out]
    W       COMPLEX_16 array, dimension (LDW,NB)
            The n-by-nb matrix W required to update the unreduced part
            of A.

    @param[in]
    ldw     INTEGER
            The leading dimension of the array W. LDW >= max(1,N).

    Further Details
    ---------------
    If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary
    reflectors

        Q = H(n) H(n-1) . . . H(n-nb+1).

    Each H(i) has the form

        H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i),
    and tau in TAU(i-1).

    If UPLO = MagmaLower, the matrix Q is represented as a product of elementary
    reflectors

        Q = H(1) H(2) . . . H(nb).

    Each H(i) has the form

        H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i),
    and tau in TAU(i).

    The elements of the vectors v together form the n-by-nb matrix V
    which is needed, with W, to apply the transformation to the unreduced
    part of the matrix, using a Hermitian rank-2k update of the form:
    A := A - V*W' - W*V'.

    The contents of A on exit are illustrated by the following examples
    with n = 5 and nb = 2:

    if UPLO = MagmaUpper:                       if UPLO = MagmaLower:

        (  a   a   a   v4  v5 )              (  d                  )
        (      a   a   v4  v5 )              (  1   d              )
        (          a   1   v5 )              (  v1  1   a          )
        (              d   1  )              (  v1  v2  a   a      )
        (                  d  )              (  v1  v2  a   a   a  )

    where d denotes a diagonal element of the reduced matrix, a denotes
    an element of the original matrix that is unchanged, and vi denotes
    an element of the vector defining H(i).

    @ingroup magma_zheev_aux
    ********************************************************************/
extern "C" magma_int_t
magma_zlatrd2(magma_uplo_t uplo, magma_int_t n, magma_int_t nb,
              magmaDoubleComplex *A,  magma_int_t lda,
              double *e, magmaDoubleComplex *tau,
              magmaDoubleComplex *W,  magma_int_t ldw,
              magmaDoubleComplex *dA, magma_int_t ldda,
              magmaDoubleComplex *dW, magma_int_t lddw,
              magmaDoubleComplex *dwork, magma_int_t ldwork)
{
#define A(i, j) (A + (j)*lda + (i))
#define W(i, j) (W + (j)*ldw + (i))

#define dA(i, j) (dA + (j)*ldda + (i))
#define dW(i, j) (dW + (j)*lddw + (i))

    magma_int_t i;
    
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_zero    = MAGMA_Z_ZERO;

    magmaDoubleComplex value = MAGMA_Z_ZERO;
    
    magma_int_t ione = 1;

    magma_int_t i_n, i_1, iw;
    
    magmaDoubleComplex alpha;
    magmaDoubleComplex *f;

    if (n <= 0) {
        return 0;
    }

    magma_queue_t stream;
    magma_queue_create( &stream );
    magma_zmalloc_cpu( &f, n );
    assert( f != NULL );  // TODO return error, or allocate outside zlatrd
    
    if (uplo == MagmaUpper) {
        /* Reduce last NB columns of upper triangle */
        for (i = n-1; i >= n - nb; --i) {
            i_1 = i + 1;
            i_n = n - i - 1;
            
            iw = i - n + nb;
            if (i < n-1) {
                /* Update A(1:i,i) */
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_zlacgv(&i_n, W(i, iw+1), &ldw);
                #endif
                blasf77_zgemv("No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda,
                              W(i, iw+1), &ldw, &c_one, A(0, i), &ione);
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_zlacgv(&i_n, W(i, iw+1), &ldw);
                lapackf77_zlacgv(&i_n, A(i, i+1), &ldw);
                #endif
                blasf77_zgemv("No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                              A(i, i+1), &lda, &c_one, A(0, i), &ione);
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_zlacgv(&i_n, A(i, i+1), &ldw);
                #endif
            }
            if (i > 0) {
                /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */
                
                alpha = *A(i-1, i);
                
                lapackf77_zlarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]);
                
                e[i-1] = MAGMA_Z_REAL( alpha );
                *A(i-1,i) = MAGMA_Z_MAKE( 1, 0 );
                
                /* Compute W(1:i-1,i) */
                // 1. Send the block reflector  A(0:n-i-1,i) to the GPU
                magma_zsetvector( i, A(0, i), 1, dA(0, i), 1 );
                
                //#if (GPUSHMEM < 200)
                //magma_zhemv(MagmaUpper, i, c_one, dA(0, 0), ldda,
                //            dA(0, i), ione, c_zero, dW(0, iw), ione);
                //#else
                magmablas_zhemv_work(MagmaUpper, i, c_one, dA(0, 0), ldda,
                                     dA(0, i), ione, c_zero, dW(0, iw), ione,
                                     dwork, ldwork);
                //#endif
                
                // 2. Start putting the result back (asynchronously)
                magma_zgetmatrix_async( i, 1,
                                        dW(0, iw),         lddw,
                                        W(0, iw) /*test*/, ldw, stream );
                
                if (i < n-1) {
                    blasf77_zgemv(MagmaConjTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw,
                                  A(0, i), &ione, &c_zero, W(i+1, iw), &ione);
                }
                
                // 3. Here is where we need it // TODO find the right place
                magma_queue_sync( stream );
                
                if (i < n-1) {
                    blasf77_zgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda,
                                  W(i+1, iw), &ione, &c_one, W(0, iw), &ione);
                    
                    blasf77_zgemv(MagmaConjTransStr, &i, &i_n, &c_one, A(0, i+1), &lda,
                                  A(0, i), &ione, &c_zero, W(i+1, iw), &ione);
                    
                    blasf77_zgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                                  W(i+1, iw), &ione, &c_one, W(0, iw), &ione);
                }
                
                blasf77_zscal(&i, &tau[i - 1], W(0, iw), &ione);
                
                #if defined(PRECISION_z) || defined(PRECISION_c)
                cblas_zdotc_sub( i, W(0,iw), ione, A(0,i), ione, &value );
                #else
                value = cblas_zdotc( i, W(0,iw), ione, A(0,i), ione );
                #endif
                alpha = tau[i - 1] * -0.5f * value;
                blasf77_zaxpy(&i, &alpha, A(0, i), &ione,
                              W(0, iw), &ione);
            }
        }
    }
    else {
        /*  Reduce first NB columns of lower triangle */
        for (i = 0; i < nb; ++i) {
            
            /* Update A(i:n,i) */
            i_n = n - i;
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_zlacgv(&i, W(i, 0), &ldw);
            #endif
            blasf77_zgemv("No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda,
                          W(i, 0), &ldw, &c_one, A(i, i), &ione);
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_zlacgv(&i, W(i, 0), &ldw);
            lapackf77_zlacgv(&i, A(i, 0), &lda);
            #endif
            blasf77_zgemv("No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw,
                          A(i, 0), &lda, &c_one, A(i, i), &ione);
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_zlacgv(&i, A(i, 0), &lda);
            #endif
        
            if (i < n-1) {
                /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */
                i_n = n - i - 1;
                alpha = *A(i+1, i);
                lapackf77_zlarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]);
                e[i] = MAGMA_Z_REAL( alpha );
                *A(i+1,i) = MAGMA_Z_MAKE( 1, 0 );
        
                /* Compute W(i+1:n,i) */
                // 1. Send the block reflector  A(i+1:n,i) to the GPU
                magma_zsetvector( i_n, A(i+1, i), 1, dA(i+1, i), 1 );
            
                //#if (GPUSHMEM < 200)
                //magma_zhemv(MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero,
                //            dW(i+1, i), ione);
                //#else
                magmablas_zhemv_work(MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero,
                                     dW(i+1, i), ione,
                                     dwork, ldwork);
                //#endif
        
                // 2. Start putting the result back (asynchronously)
                magma_zgetmatrix_async( i_n, 1,
                                        dW(i+1, i), lddw,
                                        W(i+1, i),  ldw, stream );
        
                blasf77_zgemv(MagmaConjTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw,
                              A(i+1, i), &ione, &c_zero, W(0, i), &ione);
            
                blasf77_zgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda,
                              W(0, i), &ione, &c_zero, f, &ione);
                
                blasf77_zgemv(MagmaConjTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda,
                              A(i+1, i), &ione, &c_zero, W(0, i), &ione);
        
                // 3. Here is where we need it
                magma_queue_sync( stream );
        
                if (i != 0)
                    blasf77_zaxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione);
        
                blasf77_zgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw,
                              W(0, i), &ione, &c_one, W(i+1, i), &ione);
                blasf77_zscal(&i_n, &tau[i], W(i+1,i), &ione);
                #if defined(PRECISION_z) || defined(PRECISION_c)
                cblas_zdotc_sub( i_n, W(i+1,i), ione, A(i+1,i), ione, &value );
                #else
                value = cblas_zdotc( i_n, W(i+1,i), ione, A(i+1,i), ione );
                #endif
                alpha = tau[i] * -0.5f * value;
                blasf77_zaxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione);
            }
        }
    }

    magma_free_cpu(f);
    magma_queue_destroy( stream );

    return 0;
} /* magma_zlatrd */
Пример #29
0
int main(int argc, char **argv)
{
    TESTING_INIT();

    const magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    const magma_int_t        ione      = 1;
    
    real_Double_t   atomics_perf=0, atomics_time=0;
    real_Double_t   gflops, magma_perf=0, magma_time=0, cublas_perf, cublas_time, cpu_perf, cpu_time;
    double          magma_error=0, atomics_error=0, cublas_error, work[1];
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t N, lda, ldda, sizeA, sizeX, sizeY, blocks, ldwork;
    magma_int_t incx = 1;
    magma_int_t incy = 1;
    magma_int_t nb   = 64;
    magmaDoubleComplex alpha = MAGMA_Z_MAKE(  1.5, -2.3 );
    magmaDoubleComplex beta  = MAGMA_Z_MAKE( -0.6,  0.8 );
    magmaDoubleComplex *A, *X, *Y, *Yatomics, *Ycublas, *Ymagma;
    magmaDoubleComplex_ptr dA, dX, dY, dwork;
    magma_int_t status = 0;
    
    magma_opts opts;
    opts.parse_opts( argc, argv );
    
    double tol = opts.tolerance * lapackf77_dlamch("E");

    printf("%% uplo = %s\n", lapack_uplo_const(opts.uplo) );
    printf("%%   N   MAGMA Gflop/s (ms)    Atomics Gflop/s      CUBLAS Gflop/s       CPU Gflop/s   MAGMA error  Atomics    CUBLAS\n");
    printf("%%=====================================================================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            N = opts.nsize[itest];
            lda    = N;
            ldda   = magma_roundup( N, opts.align );  // multiple of 32 by default
            sizeA  = N*lda;
            sizeX  = N*incx;
            sizeY  = N*incy;
            gflops = FLOPS_ZHEMV( N ) / 1e9;
            
            TESTING_MALLOC_CPU( A,        magmaDoubleComplex, sizeA );
            TESTING_MALLOC_CPU( X,        magmaDoubleComplex, sizeX );
            TESTING_MALLOC_CPU( Y,        magmaDoubleComplex, sizeY );
            TESTING_MALLOC_CPU( Yatomics, magmaDoubleComplex, sizeY );
            TESTING_MALLOC_CPU( Ycublas,  magmaDoubleComplex, sizeY );
            TESTING_MALLOC_CPU( Ymagma,   magmaDoubleComplex, sizeY );
            
            TESTING_MALLOC_DEV( dA, magmaDoubleComplex, ldda*N );
            TESTING_MALLOC_DEV( dX, magmaDoubleComplex, sizeX );
            TESTING_MALLOC_DEV( dY, magmaDoubleComplex, sizeY );
            
            blocks = magma_ceildiv( N, nb );
            ldwork = ldda*blocks;
            TESTING_MALLOC_DEV( dwork, magmaDoubleComplex, ldwork );
            
            magmablas_zlaset( MagmaFull, ldwork, 1, MAGMA_Z_NAN, MAGMA_Z_NAN, dwork, ldwork );
            magmablas_zlaset( MagmaFull, ldda,   N, MAGMA_Z_NAN, MAGMA_Z_NAN, dA,    ldda   );
            
            /* Initialize the matrix */
            lapackf77_zlarnv( &ione, ISEED, &sizeA, A );
            magma_zmake_hermitian( N, A, lda );
            
            // should not use data from the opposite triangle -- fill with NAN to check
            magma_int_t N1 = N-1;
            if ( opts.uplo == MagmaUpper ) {
                lapackf77_zlaset( "Lower", &N1, &N1, &MAGMA_Z_NAN, &MAGMA_Z_NAN, &A[1], &lda );
            }
            else {
                lapackf77_zlaset( "Upper", &N1, &N1, &MAGMA_Z_NAN, &MAGMA_Z_NAN, &A[lda], &lda );
            }
            
            lapackf77_zlarnv( &ione, ISEED, &sizeX, X );
            lapackf77_zlarnv( &ione, ISEED, &sizeY, Y );
            
            /* =====================================================================
               Performs operation using CUBLAS
               =================================================================== */
            magma_zsetmatrix( N, N, A, lda, dA, ldda );
            magma_zsetvector( N, X, incx, dX, incx );
            magma_zsetvector( N, Y, incy, dY, incy );
            
            magmablasSetKernelStream( opts.queue );  // opts.handle also uses opts.queue
            cublas_time = magma_sync_wtime( opts.queue );
            #ifdef HAVE_CUBLAS
                cublasZhemv( opts.handle, cublas_uplo_const(opts.uplo),
                             N, &alpha, dA, ldda, dX, incx, &beta, dY, incy );
            #else
                magma_zhemv( opts.uplo, N, alpha, dA, 0, ldda, dX, 0, incx, beta, dY, 0, incy, opts.queue );
            #endif
            cublas_time = magma_sync_wtime( opts.queue ) - cublas_time;
            cublas_perf = gflops / cublas_time;
            
            magma_zgetvector( N, dY, incy, Ycublas, incy );
            
            /* =====================================================================
               Performs operation using CUBLAS - using atomics
               =================================================================== */
            #ifdef HAVE_CUBLAS
                cublasSetAtomicsMode( opts.handle, CUBLAS_ATOMICS_ALLOWED );
                magma_zsetvector( N, Y, incy, dY, incy );
                
                // sync on queue doesn't work -- need device sync or use NULL stream -- bug in CUBLAS?
                atomics_time = magma_sync_wtime( NULL /*opts.queue*/ );
                cublasZhemv( opts.handle, cublas_uplo_const(opts.uplo),
                             N, &alpha, dA, ldda, dX, incx, &beta, dY, incy );
                atomics_time = magma_sync_wtime( NULL /*opts.queue*/ ) - atomics_time;
                atomics_perf = gflops / atomics_time;
                
                magma_zgetvector( N, dY, incy, Yatomics, incy );
                cublasSetAtomicsMode( opts.handle, CUBLAS_ATOMICS_NOT_ALLOWED );
            #endif
            
            /* =====================================================================
               Performs operation using MAGMABLAS
               =================================================================== */
            #ifdef HAVE_CUBLAS
                magma_zsetvector( N, Y, incy, dY, incy );
                
                magma_time = magma_sync_wtime( opts.queue );
                if ( opts.version == 1 ) {
                    magmablas_zhemv_work( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy, dwork, ldwork, opts.queue );
                }
                else {
                    // non-work interface (has added overhead)
                    magmablas_zhemv( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy );
                }
                magma_time = magma_sync_wtime( opts.queue ) - magma_time;
                magma_perf = gflops / magma_time;
                
                magma_zgetvector( N, dY, incy, Ymagma, incy );
            #endif
            
            /* =====================================================================
               Performs operation using CPU BLAS
               =================================================================== */
            cpu_time = magma_wtime();
            blasf77_zhemv( lapack_uplo_const(opts.uplo), &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy );
            cpu_time = magma_wtime() - cpu_time;
            cpu_perf = gflops / cpu_time;
            
            /* =====================================================================
               Check the result
               =================================================================== */
            blasf77_zaxpy( &N, &c_neg_one, Y, &incy, Ycublas, &incy );
            cublas_error = lapackf77_zlange( "M", &N, &ione, Ycublas, &N, work ) / N;
            
            #ifdef HAVE_CUBLAS
                blasf77_zaxpy( &N, &c_neg_one, Y, &incy, Yatomics, &incy );
                atomics_error = lapackf77_zlange( "M", &N, &ione, Yatomics, &N, work ) / N;
                
                blasf77_zaxpy( &N, &c_neg_one, Y, &incy, Ymagma, &incy );
                magma_error = lapackf77_zlange( "M", &N, &ione, Ymagma, &N, work ) / N;
            #endif
            
            bool okay = (magma_error < tol && cublas_error < tol && atomics_error < tol);
            status += ! okay;
            printf("%5d   %7.2f (%7.2f)   %7.2f (%7.2f)   %7.2f (%7.2f)   %7.2f (%7.2f)   %8.2e   %8.2e   %8.2e   %s\n",
                   (int) N,
                   magma_perf,   1000.*magma_time,
                   atomics_perf, 1000.*atomics_time,
                   cublas_perf,  1000.*cublas_time,
                   cpu_perf,     1000.*cpu_time,
                   magma_error, cublas_error, atomics_error,
                   (okay ? "ok" : "failed"));
            
            TESTING_FREE_CPU( A );
            TESTING_FREE_CPU( X );
            TESTING_FREE_CPU( Y );
            TESTING_FREE_CPU( Ycublas  );
            TESTING_FREE_CPU( Yatomics );
            TESTING_FREE_CPU( Ymagma   );
            
            TESTING_FREE_DEV( dA );
            TESTING_FREE_DEV( dX );
            TESTING_FREE_DEV( dY );
            TESTING_FREE_DEV( dwork );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    opts.cleanup();
    TESTING_FINALIZE();
    return status;
}
Пример #30
0
/* ////////////////////////////////////////////////////////////////////////////
   -- running magma_zjacobi
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    magma_z_solver_par solver_par;
    magma_z_preconditioner precond_par;
    solver_par.maxiter = 1000;
    solver_par.verbose = 0;
    solver_par.num_eigenvalues = 0;
    int format = 0;
    
    magma_z_sparse_matrix A, B, B_d;
    magma_z_vector x, b;
    B.blocksize = 8;
    B.alignment = 8;
    int scale = 0;
    magma_scale_t scaling = Magma_NOSCALE;

    magmaDoubleComplex one = MAGMA_Z_MAKE(1.0, 0.0);
    magmaDoubleComplex zero = MAGMA_Z_MAKE(0.0, 0.0);

    B.storage_type = Magma_CSR;
    int i;
    for( i = 1; i < argc; ++i ) {
     if ( strcmp("--format", argv[i]) == 0 ) {
            format = atoi( argv[++i] );
            switch( format ) {
                case 0: B.storage_type = Magma_CSR; break;
                case 1: B.storage_type = Magma_ELL; break;
                case 2: B.storage_type = Magma_ELLRT; break;
                case 3: B.storage_type = Magma_SELLP; break;
            }
        }else if ( strcmp("--mscale", argv[i]) == 0 ) {
            scale = atoi( argv[++i] );
            switch( scale ) {
                case 0: scaling = Magma_NOSCALE; break;
                case 1: scaling = Magma_UNITDIAG; break;
                case 2: scaling = Magma_UNITROW; break;
            }

        }else if ( strcmp("--blocksize", argv[i]) == 0 ) {
            B.blocksize = atoi( argv[++i] );
        }else if ( strcmp("--alignment", argv[i]) == 0 ) {
            B.alignment = atoi( argv[++i] );
        } else if ( strcmp("--maxiter", argv[i]) == 0 ){ // && i+1< argc ) {
            solver_par.maxiter = atoi( argv[++i] );
        }
        else
            break;
    }
    printf( "\n#    usage: ./run_zjacobi"
        " [ --format %d (0=CSR, 1=ELL 2=ELLRT, 3=SELLP)"
        " [ --blocksize %d --alignment %d ]"
        " --mscale %d (0=no, 1=unitdiag, 2=unitrownrm)"
        " --maxiter %d ]"
        " matrices \n\n", format, (int) B.blocksize, (int) B.alignment,
        (int) scale,
        (int) solver_par.maxiter);

    magma_zsolverinfo_init( &solver_par, &precond_par );

    while(  i < argc ){

        magma_z_csr_mtx( &A,  argv[i]  ); 

        printf( "\n# matrix info: %d-by-%d with %d nonzeros\n\n",
                            (int) A.num_rows,(int) A.num_cols,(int) A.nnz );

        // scale matrix
        magma_zmscale( &A, scaling );

        magma_z_mconvert( A, &B, Magma_CSR, B.storage_type );
        magma_z_mtransfer( B, &B_d, Magma_CPU, Magma_DEV );

        // vectors and initial guess
        magma_z_vinit( &b, Magma_DEV, A.num_cols, one );
        magma_z_vinit( &x, Magma_DEV, A.num_cols, one );
        magma_z_spmv( one, B_d, x, zero, b );                 //  b = A x
        magma_z_vfree(&x);
        magma_z_vinit( &x, Magma_DEV, A.num_cols, zero );

        magma_zjacobi( B_d, b, &x, &solver_par );

        magma_zsolverinfo( &solver_par, &precond_par );

        magma_z_mfree(&B_d);
        magma_z_mfree(&B);
        magma_z_mfree(&A); 
        magma_z_vfree(&x);
        magma_z_vfree(&b);
        
        i++;
    }

    magma_zsolverinfo_free( &solver_par, &precond_par );

    TESTING_FINALIZE();

    return 0;
}