Beispiel #1
0
int main( int argc, char** argv )
{
    TESTING_INIT();
    
    real_Double_t   gflops, t1, t2;
    double c_neg_one = MAGMA_D_NEG_ONE;
    magma_int_t ione = 1;
    const char trans[] = { 'N', 'C', 'T' };
    const char uplo[]  = { 'L', 'U' };
    const char diag[]  = { 'U', 'N' };
    const char side[]  = { 'L', 'R' };
    
    double  *A,  *B,  *C,   *C2, *LU;
    double *dA, *dB, *dC1, *dC2;
    double alpha = MAGMA_D_MAKE( 0.5, 0.1 );
    double beta  = MAGMA_D_MAKE( 0.7, 0.2 );
    double dalpha = 0.6;
    double dbeta  = 0.8;
    double work[1], error, total_error;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t m, n, k, size, maxn, ld, info;
    magma_int_t *piv;
    magma_err_t err;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" );
    
    total_error = 0.;
    for( int i = 0; i < opts.ntest; ++i ) {
        m = opts.msize[i];
        n = opts.nsize[i];
        k = opts.ksize[i];
        printf("=========================================================================\n");
        printf( "M %d, N %d, K %d\n", (int) m, (int) n, (int) k );
        
        // allocate matrices
        // over-allocate so they can be any combination of {m,n,k} x {m,n,k}.
        maxn = max( max( m, n ), k );
        ld = maxn;
        size = maxn*maxn;
        err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) );  assert( err == 0 );
        err = magma_dmalloc_pinned( &A,  size );  assert( err == 0 );
        err = magma_dmalloc_pinned( &B,  size );  assert( err == 0 );
        err = magma_dmalloc_pinned( &C,  size );  assert( err == 0 );
        err = magma_dmalloc_pinned( &C2, size );  assert( err == 0 );
        err = magma_dmalloc_pinned( &LU, size );  assert( err == 0 );
        err = magma_dmalloc( &dA,  size );        assert( err == 0 );
        err = magma_dmalloc( &dB,  size );        assert( err == 0 );
        err = magma_dmalloc( &dC1, size );        assert( err == 0 );
        err = magma_dmalloc( &dC2, size );        assert( err == 0 );
        
        // initialize matrices
        size = maxn*maxn;
        lapackf77_dlarnv( &ione, ISEED, &size, A  );
        lapackf77_dlarnv( &ione, ISEED, &size, B  );
        lapackf77_dlarnv( &ione, ISEED, &size, C  );
        
        printf( "========== Level 1 BLAS ==========\n" );
        
        // ----- test DSWAP
        // swap 2nd and 3rd columns of dA, then copy to C2 and compare with A
        assert( n >= 4 );
        magma_dsetmatrix( m, n, A, ld, dA, ld );
        magma_dsetmatrix( m, n, A, ld, dB, ld );
        magma_dswap( m, dA(0,1), 1, dA(0,2), 1 );
        magma_dswap( m, dB(0,1), 1, dB(0,2), 1 );
        
        // check results, storing diff between magma and cuda calls in C2
        cublasDaxpy( ld*n, c_neg_one, dA, 1, dB, 1 );
        magma_dgetmatrix( m, n, dB, ld, C2, ld );
        error = lapackf77_dlange( "F", &m, &k, C2, &ld, work );
        total_error += error;
        printf( "dswap             diff %.2g\n", error );
        
        // ----- test IDAMAX
        // get argmax of column of A
        magma_dsetmatrix( m, k, A, ld, dA, ld );
        error = 0;
        for( int j = 0; j < k; ++j ) {
            magma_int_t i1 = magma_idamax( m, dA(0,j), 1 );
            magma_int_t i2 = cublasIdamax( m, dA(0,j), 1 );
            assert( i1 == i2 );
            error += abs( i1 - i2 );
        }
        total_error += error;
        gflops = (double)m * k / 1e9;
        printf( "idamax            diff %.2g\n", error );
        printf( "\n" );
        
        printf( "========== Level 2 BLAS ==========\n" );
        
        // ----- test DGEMV
        // c = alpha*A*b + beta*c,  with A m*n; b,c m or n-vectors
        // try no-trans/trans
        for( int ia = 0; ia < 3; ++ia ) {
            magma_dsetmatrix( m, n, A,  ld, dA,  ld );
            magma_dsetvector( maxn, B, 1, dB,  1 );
            magma_dsetvector( maxn, C, 1, dC1, 1 );
            magma_dsetvector( maxn, C, 1, dC2, 1 );
            t1 = magma_sync_wtime( 0 );
            magma_dgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC2, 1 );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            size = (trans[ia] == 'N' ? m : n);
            cublasDaxpy( size, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetvector( size, dC2, 1, C2, 1 );
            error = lapackf77_dlange( "F", &size, &ione, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DGEMV( m, n ) / 1e9;
            printf( "dgemv( %c )        diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    trans[ia], error, gflops/t1, gflops/t2 );
        }
        printf( "\n" );
        
        // ----- test DSYMV
        // c = alpha*A*b + beta*c,  with A m*m symmetric; b,c m-vectors
        // try upper/lower
        for( int iu = 0; iu < 2; ++iu ) {
            magma_dsetmatrix( m, m, A, ld, dA, ld );
            magma_dsetvector( m, B, 1, dB,  1 );
            magma_dsetvector( m, C, 1, dC1, 1 );
            magma_dsetvector( m, C, 1, dC2, 1 );
            t1 = magma_sync_wtime( 0 );
            magma_dsymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDsymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC2, 1 );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( m, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetvector( m, dC2, 1, C2, 1 );
            error = lapackf77_dlange( "F", &m, &ione, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DSYMV( m ) / 1e9;
            printf( "dsymv( %c )        diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], error, gflops/t1, gflops/t2 );
        }
        printf( "\n" );
        
        // ----- test DTRSV
        // solve A*c = c,  with A m*m triangular; c m-vector
        // try upper/lower, no-trans/trans, unit/non-unit diag
        // Factor A into LU to get well-conditioned triangles, else solve yields garbage.
        // Still can give garbage if solves aren't consistent with LU factors,
        // e.g., using unit diag for U, so copy lower triangle to upper triangle.
        // Also used for trsm later.
        lapackf77_dlacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld );
        lapackf77_dgetrf( &maxn, &maxn, LU, &ld, piv, &info );
        for( int j = 0; j < maxn; ++j ) {
            for( int i = 0; i < j; ++i ) {
                *LU(i,j) = *LU(j,i);
            }
        }
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
        for( int id = 0; id < 2; ++id ) {
            magma_dsetmatrix( m, m, LU, ld, dA, ld );
            magma_dsetvector( m, C, 1, dC1, 1 );
            magma_dsetvector( m, C, 1, dC2, 1 );
            t1 = magma_sync_wtime( 0 );
            magma_dtrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDtrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC2, 1 );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( m, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetvector( m, dC2, 1, C2, 1 );
            error = lapackf77_dlange( "F", &m, &ione, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DTRSM( MagmaLeft, m, 1 ) / 1e9;
            printf( "dtrsv( %c, %c, %c )  diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], diag[id], error, gflops/t1, gflops/t2 );
        }}}
        printf( "\n" );
        
        printf( "========== Level 3 BLAS ==========\n" );
        
        // ----- test DGEMM
        // C = alpha*A*B + beta*C,  with A m*k or k*m; B k*n or n*k; C m*n
        // try combinations of no-trans/trans
        for( int ia = 0; ia < 3; ++ia ) {
        for( int ib = 0; ib < 3; ++ib ) {
            bool nta = (trans[ia] == 'N');
            bool ntb = (trans[ib] == 'N');
            magma_dsetmatrix( (nta ? m : k), (nta ? m : k), A, ld, dA,  ld );
            magma_dsetmatrix( (ntb ? k : n), (ntb ? n : k), B, ld, dB,  ld );
            magma_dsetmatrix( m, n, C, ld, dC1, ld );
            magma_dsetmatrix( m, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_dgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_dlange( "F", &m, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DGEMM( m, n, k ) / 1e9;
            printf( "dgemm( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    trans[ia], trans[ib], error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test DSYMM
        // C = alpha*A*B + beta*C  (left)  with A m*m symmetric; B,C m*n; or
        // C = alpha*B*A + beta*C  (right) with A n*n symmetric; B,C m*n
        // try left/right, upper/lower
        for( int is = 0; is < 2; ++is ) {
        for( int iu = 0; iu < 2; ++iu ) {
            magma_dsetmatrix( m, m, A, ld, dA,  ld );
            magma_dsetmatrix( m, n, B, ld, dB,  ld );
            magma_dsetmatrix( m, n, C, ld, dC1, ld );
            magma_dsetmatrix( m, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_dsymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDsymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_dlange( "F", &m, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DSYMM( side[is], m, n ) / 1e9;
            printf( "dsymm( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    side[is], uplo[iu], error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test DSYRK
        // C = alpha*A*A^H + beta*C  (no-trans) with A m*k and C m*m symmetric; or
        // C = alpha*A^H*A + beta*C  (trans)    with A k*m and C m*m symmetric
        // try upper/lower, no-trans/trans
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
            magma_dsetmatrix( n, k, A, ld, dA,  ld );
            magma_dsetmatrix( n, n, C, ld, dC1, ld );
            magma_dsetmatrix( n, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_dsyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDsyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetmatrix( n, n, dC2, ld, C2, ld );
            error = lapackf77_dlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DSYRK( k, n ) / 1e9;
            printf( "dsyrk( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test DSYR2K
        // C = alpha*A*B^H + ^alpha*B*A^H + beta*C  (no-trans) with A,B n*k; C n*n symmetric; or
        // C = alpha*A^H*B + ^alpha*B^H*A + beta*C  (trans)    with A,B k*n; C n*n symmetric
        // try upper/lower, no-trans/trans
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
            bool nt = (trans[it] == 'N');
            magma_dsetmatrix( (nt ? n : k), (nt ? n : k), A, ld, dA,  ld );
            magma_dsetmatrix( n, n, C, ld, dC1, ld );
            magma_dsetmatrix( n, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_dsyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDsyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetmatrix( n, n, dC2, ld, C2, ld );
            error = lapackf77_dlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DSYR2K( k, n ) / 1e9;
            printf( "dsyr2k( %c, %c )    diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test DTRMM
        // C = alpha*A*C  (left)  with A m*m triangular; C m*n; or
        // C = alpha*C*A  (right) with A n*n triangular; C m*n
        // try left/right, upper/lower, no-trans/trans, unit/non-unit
        for( int is = 0; is < 2; ++is ) {
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
        for( int id = 0; id < 2; ++id ) {
            bool left = (side[is] == 'L');
            magma_dsetmatrix( (left ? m : n), (left ? m : n), A, ld, dA,  ld );
            magma_dsetmatrix( m, n, C, ld, dC1, ld );
            magma_dsetmatrix( m, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_dtrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDtrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_dlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DTRMM( side[is], m, n ) / 1e9;
            printf( "dtrmm( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], error, gflops/t1, gflops/t2 );
        }}}}
        printf( "\n" );
        
        // ----- test DTRSM
        // solve A*X = alpha*B  (left)  with A m*m triangular; B m*n; or
        // solve X*A = alpha*B  (right) with A n*n triangular; B m*n
        // try left/right, upper/lower, no-trans/trans, unit/non-unit
        for( int is = 0; is < 2; ++is ) {
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
        for( int id = 0; id < 2; ++id ) {
            bool left = (side[is] == 'L');
            magma_dsetmatrix( (left ? m : n), (left ? m : n), LU, ld, dA,  ld );
            magma_dsetmatrix( m, n, C, ld, dC1, ld );
            magma_dsetmatrix( m, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_dtrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDtrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_dlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DTRSM( side[is], m, n ) / 1e9;
            printf( "dtrsm( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], error, gflops/t1, gflops/t2 );
        }}}}
        printf( "\n" );
        
        // cleanup
        magma_free_cpu( piv );
        magma_free_pinned( A  );
        magma_free_pinned( B  );
        magma_free_pinned( C  );
        magma_free_pinned( C2 );
        magma_free_pinned( LU );
        magma_free( dA  );
        magma_free( dB  );
        magma_free( dC1 );
        magma_free( dC2 );
    }
    
    if ( total_error != 0. ) {
        printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n",
                total_error );
    }
    else {
        printf( "all tests passed\n" );
    }
    
    TESTING_FINALIZE();
    return 0;
}
Beispiel #2
0
/**
    Purpose
    -------
    DSPOSV computes the solution to a real system of linear equations
        A * X = B,
    where A is an N-by-N symmetric positive definite matrix and X and B
    are N-by-NRHS matrices.

    DSPOSV first attempts to factorize the matrix in real SINGLE PRECISION
    and use this factorization within an iterative refinement procedure
    to produce a solution with real DOUBLE PRECISION norm-wise backward error
    quality (see below). If the approach fails the method switches to a
    real DOUBLE PRECISION factorization and solve.

    The iterative refinement is not going to be a winning strategy if
    the ratio real SINGLE PRECISION performance over real DOUBLE PRECISION
    performance is too small. A reasonable strategy should take the
    number of right-hand sides and the size of the matrix into account.
    This might be done with a call to ILAENV in the future. Up to now, we
    always try iterative refinement.

    The iterative refinement process is stopped if
        ITER > ITERMAX
    or for all the RHS we have:
        RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX
    where
        o ITER is the number of the current iteration in the iterative
          refinement process
        o RNRM is the infinity-norm of the residual
        o XNRM is the infinity-norm of the solution
        o ANRM is the infinity-operator-norm of the matrix A
        o EPS is the machine epsilon returned by DLAMCH('Epsilon')
    The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of A is stored;
      -     = MagmaLower:  Lower triangle of A is stored.

    @param[in]
    n       INTEGER
            The number of linear equations, i.e., the order of the
            matrix A.  N >= 0.

    @param[in]
    nrhs    INTEGER
            The number of right hand sides, i.e., the number of columns
            of the matrix B.  NRHS >= 0.

    @param[in,out]
    dA      DOUBLE PRECISION array on the GPU, dimension (LDDA,N)
            On entry, the symmetric matrix A.  If UPLO = MagmaUpper, the leading
            N-by-N upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = MagmaLower, the
            leading N-by-N lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
            On exit, if iterative refinement has been successfully used
            (INFO.EQ.0 and ITER.GE.0, see description below), then A is
            unchanged, if double factorization has been used
            (INFO.EQ.0 and ITER.LT.0, see description below), then the
            array dA contains the factor U or L from the Cholesky
            factorization A = U**T*U or A = L*L**T.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).

    @param[in]
    dB      DOUBLE PRECISION array on the GPU, dimension (LDDB,NRHS)
            The N-by-NRHS right hand side matrix B.

    @param[in]
    lddb    INTEGER
            The leading dimension of the array dB.  LDDB >= max(1,N).

    @param[out]
    dX      DOUBLE PRECISION array on the GPU, dimension (LDDX,NRHS)
            If INFO = 0, the N-by-NRHS solution matrix X.

    @param[in]
    lddx    INTEGER
            The leading dimension of the array dX.  LDDX >= max(1,N).

    @param
    dworkd  (workspace) DOUBLE PRECISION array on the GPU, dimension (N*NRHS)
            This array is used to hold the residual vectors.

    @param
    dworks  (workspace) SINGLE PRECISION array on the GPU, dimension (N*(N+NRHS))
            This array is used to store the real single precision matrix
            and the right-hand sides or solutions in single precision.

    @param[out]
    iter    INTEGER
      -     < 0: iterative refinement has failed, double precision
                 factorization has been performed
        +        -1 : the routine fell back to full precision for
                      implementation- or machine-specific reasons
        +        -2 : narrowing the precision induced an overflow,
                      the routine fell back to full precision
        +        -3 : failure of SPOTRF
        +        -31: stop the iterative refinement after the 30th iteration
      -     > 0: iterative refinement has been successfully used.
                 Returns the number of iterations

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  if INFO = i, the leading minor of order i of (DOUBLE
                  PRECISION) A is not positive definite, so the
                  factorization could not be completed, and the solution
                  has not been computed.

    @ingroup magma_dposv_driver
    ********************************************************************/
extern "C" magma_int_t
magma_dsposv_gpu(
    magma_uplo_t uplo, magma_int_t n, magma_int_t nrhs,
    magmaDouble_ptr dA, magma_int_t ldda,
    magmaDouble_ptr dB, magma_int_t lddb,
    magmaDouble_ptr dX, magma_int_t lddx,
    magmaDouble_ptr dworkd, magmaFloat_ptr dworks,
    magma_int_t *iter,
    magma_int_t *info)
{
    #define dB(i,j)     (dB + (i) + (j)*lddb)
    #define dX(i,j)     (dX + (i) + (j)*lddx)
    #define dR(i,j)     (dR + (i) + (j)*lddr)
    #define dSX(i,j)    (dSX + (i) + (j)*lddsx)

    // Constants
    const double      BWDMAX  = 1.0;
    const magma_int_t ITERMAX = 30;
    const double c_neg_one = MAGMA_D_NEG_ONE;
    const double c_one     = MAGMA_D_ONE;
    const magma_int_t ione  = 1;
    
    // Local variables
    magmaDouble_ptr dR;
    magmaFloat_ptr dSA, dSX;
    double Xnrmv, Rnrmv;
    double          Anrm, Xnrm, Rnrm, cte, eps;
    magma_int_t     i, j, iiter, lddsa, lddsx, lddr;

    /* Check arguments */
    *iter = 0;
    *info = 0;
    if ( n < 0 )
        *info = -1;
    else if ( nrhs < 0 )
        *info = -2;
    else if ( ldda < max(1,n))
        *info = -4;
    else if ( lddb < max(1,n))
        *info = -7;
    else if ( lddx < max(1,n))
        *info = -9;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    if ( n == 0 || nrhs == 0 )
        return *info;

    lddsa = n;
    lddsx = n;
    lddr  = n;
    
    dSA = dworks;
    dSX = dSA + lddsa*n;
    dR  = dworkd;

    magma_queue_t queue;
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );
    
    eps  = lapackf77_dlamch("Epsilon");
    Anrm = magmablas_dlansy( MagmaInfNorm, uplo, n, dA, ldda, (double*)dworkd, n*nrhs, queue );
    cte  = Anrm * eps * magma_dsqrt( n ) * BWDMAX;

    /*
     * Convert to single precision
     */
    magmablas_dlag2s( n, nrhs, dB, lddb, dSX, lddsx, queue, info );
    if (*info != 0) {
        *iter = -2;
        goto fallback;
    }

    magmablas_dlat2s( uplo, n, dA, ldda, dSA, lddsa, queue, info );
    if (*info != 0) {
        *iter = -2;
        goto fallback;
    }
    
    // factor dSA in single precision
    magma_spotrf_gpu( uplo, n, dSA, lddsa, info );
    if (*info != 0) {
        *iter = -3;
        goto fallback;
    }
    
    // solve dSA*dSX = dB in single precision
    magma_spotrs_gpu( uplo, n, nrhs, dSA, lddsa, dSX, lddsx, info );

    // residual dR = dB - dA*dX in double precision
    magmablas_slag2d( n, nrhs, dSX, lddsx, dX, lddx, queue, info );
    magmablas_dlacpy( MagmaFull, n, nrhs, dB, lddb, dR, lddr, queue );
    if ( nrhs == 1 ) {
        magma_dsymv( uplo, n,
                     c_neg_one, dA, ldda,
                                dX, 1,
                     c_one,     dR, 1, queue );
    }
    else {
        magma_dsymm( MagmaLeft, uplo, n, nrhs,
                     c_neg_one, dA, ldda,
                                dX, lddx,
                     c_one,     dR, lddr, queue );
    }

    // TODO: use MAGMA_D_ABS( dX(i,j) ) instead of dlange?
    for( j=0; j < nrhs; j++ ) {
        i = magma_idamax( n, dX(0,j), 1, queue ) - 1;
        magma_dgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1, queue );
        Xnrm = lapackf77_dlange( "F", &ione, &ione, &Xnrmv, &ione, NULL );

        i = magma_idamax( n, dR(0,j), 1, queue ) - 1;
        magma_dgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1, queue );
        Rnrm = lapackf77_dlange( "F", &ione, &ione, &Rnrmv, &ione, NULL );

        if ( Rnrm >  Xnrm*cte ) {
            goto refinement;
        }
    }
    
    *iter = 0;
    goto cleanup;
    //return *info;

refinement:
    for( iiter=1; iiter < ITERMAX; ) {
        *info = 0;
        // convert residual dR to single precision dSX
        magmablas_dlag2s( n, nrhs, dR, lddr, dSX, lddsx, queue, info );
        if (*info != 0) {
            *iter = -2;
            goto fallback;
        }
        // solve dSA*dSX = R in single precision
        magma_spotrs_gpu( uplo, n, nrhs, dSA, lddsa, dSX, lddsx, info );

        // Add correction and setup residual
        // dX += dSX [including conversion]  --and--
        // dR = dB
        for( j=0; j < nrhs; j++ ) {
            magmablas_dsaxpycp( n, dSX(0,j), dX(0,j), dB(0,j), dR(0,j), queue );
        }

        // residual dR = dB - dA*dX in double precision
        if ( nrhs == 1 ) {
            magma_dsymv( uplo, n,
                         c_neg_one, dA, ldda,
                                    dX, 1,
                         c_one,     dR, 1, queue );
        }
        else {
            magma_dsymm( MagmaLeft, uplo, n, nrhs,
                         c_neg_one, dA, ldda,
                                    dX, lddx,
                         c_one,     dR, lddr, queue );
        }

        // TODO: use MAGMA_D_ABS( dX(i,j) ) instead of dlange?
        /*  Check whether the nrhs normwise backward errors satisfy the
         *  stopping criterion. If yes, set ITER=IITER > 0 and return. */
        for( j=0; j < nrhs; j++ ) {
            i = magma_idamax( n, dX(0,j), 1, queue ) - 1;
            magma_dgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1, queue );
            Xnrm = lapackf77_dlange( "F", &ione, &ione, &Xnrmv, &ione, NULL );

            i = magma_idamax( n, dR(0,j), 1, queue ) - 1;
            magma_dgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1, queue );
            Rnrm = lapackf77_dlange( "F", &ione, &ione, &Rnrmv, &ione, NULL );

            if ( Rnrm >  Xnrm*cte ) {
                goto L20;
            }
        }

        /*  If we are here, the nrhs normwise backward errors satisfy
         *  the stopping criterion, we are good to exit. */
        *iter = iiter;
        goto cleanup;
        //return *info;
        
      L20:
        iiter++;
    }
    
    /* If we are at this place of the code, this is because we have
     * performed ITER=ITERMAX iterations and never satisified the
     * stopping criterion. Set up the ITER flag accordingly and follow
     * up on double precision routine. */
    *iter = -ITERMAX - 1;

fallback:
    /* Single-precision iterative refinement failed to converge to a
     * satisfactory solution, so we resort to double precision. */
    magma_dpotrf_gpu( uplo, n, dA, ldda, info );
    if (*info == 0) {
        magmablas_dlacpy( MagmaFull, n, nrhs, dB, lddb, dX, lddx, queue );
        magma_dpotrs_gpu( uplo, n, nrhs, dA, ldda, dX, lddx, info );
    }
    
cleanup:
    magma_queue_destroy( queue );
    return *info;
}
Beispiel #3
0
extern "C" magma_err_t 
magma_dlatrd(char uplo, magma_int_t n, magma_int_t nb, 
             double *a,  magma_int_t lda, 
             double *e, double *tau, 
             double *w,  magma_int_t ldw,
             magmaDouble_ptr da, size_t da_offset, magma_int_t ldda, 
             magmaDouble_ptr dw, size_t dw_offset, magma_int_t lddw, magma_queue_t queue)
{
/*  -- clMAGMA (version 1.0.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       April 2012

    Purpose   
    =======   
    DLATRD reduces NB rows and columns of a real symmetric matrix A to   
    symmetric tridiagonal form by an orthogonal similarity   
    transformation Q' * A * Q, and returns the matrices V and W which are   
    needed to apply the transformation to the unreduced part of A.   

    If UPLO = 'U', DLATRD reduces the last NB rows and columns of a   
    matrix, of which the upper triangle is supplied;   
    if UPLO = 'L', DLATRD reduces the first NB rows and columns of a   
    matrix, of which the lower triangle is supplied.   

    This is an auxiliary routine called by DSYTRD.   

    Arguments   
    =========   
    UPLO    (input) CHARACTER*1   
            Specifies whether the upper or lower triangular part of the   
            symmetric matrix A is stored:   
            = 'U': Upper triangular   
            = 'L': Lower triangular   

    N       (input) INTEGER   
            The order of the matrix A.   

    NB      (input) INTEGER   
            The number of rows and columns to be reduced.   

    A       (input/output) DOUBLE_PRECISION array, dimension (LDA,N)   
            On entry, the symmetric matrix A.  If UPLO = 'U', the leading   
            n-by-n upper triangular part of A contains the upper   
            triangular part of the matrix A, and the strictly lower   
            triangular part of A is not referenced.  If UPLO = 'L', the   
            leading n-by-n lower triangular part of A contains the lower   
            triangular part of the matrix A, and the strictly upper   
            triangular part of A is not referenced.   
            On exit:   
            if UPLO = 'U', the last NB columns have been reduced to   
              tridiagonal form, with the diagonal elements overwriting   
              the diagonal elements of A; the elements above the diagonal   
              with the array TAU, represent the orthogonal matrix Q as a   
              product of elementary reflectors;   
            if UPLO = 'L', the first NB columns have been reduced to   
              tridiagonal form, with the diagonal elements overwriting   
              the diagonal elements of A; the elements below the diagonal   
              with the array TAU, represent the  orthogonal matrix Q as a   
              product of elementary reflectors.   
            See Further Details.   

    LDA     (input) INTEGER   
            The leading dimension of the array A.  LDA >= (1,N).   

    E       (output) DOUBLE_PRECISION array, dimension (N-1)   
            If UPLO = 'U', E(n-nb:n-1) contains the superdiagonal   
            elements of the last NB columns of the reduced matrix;   
            if UPLO = 'L', E(1:nb) contains the subdiagonal elements of   
            the first NB columns of the reduced matrix.   

    TAU     (output) DOUBLE_PRECISION array, dimension (N-1)   
            The scalar factors of the elementary reflectors, stored in   
            TAU(n-nb:n-1) if UPLO = 'U', and in TAU(1:nb) if UPLO = 'L'.   
            See Further Details.   

    W       (output) DOUBLE_PRECISION array, dimension (LDW,NB)   
            The n-by-nb matrix W required to update the unreduced part   
            of A.   

    LDW     (input) INTEGER   
            The leading dimension of the array W. LDW >= max(1,N).   

    Further Details   
    ===============   
    If UPLO = 'U', the matrix Q is represented as a product of elementary   
    reflectors   

       Q = H(n) H(n-1) . . . H(n-nb+1).   

    Each H(i) has the form   

       H(i) = I - tau * v * v'   

    where tau is a real scalar, and v is a real vector with   
    v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i),   
    and tau in TAU(i-1).   

    If UPLO = 'L', the matrix Q is represented as a product of elementary   
    reflectors   

       Q = H(1) H(2) . . . H(nb).   

    Each H(i) has the form   

       H(i) = I - tau * v * v'   

    where tau is a real scalar, and v is a real vector with   
    v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i),   
    and tau in TAU(i).   

    The elements of the vectors v together form the n-by-nb matrix V   
    which is needed, with W, to apply the transformation to the unreduced   
    part of the matrix, using a symmetric rank-2k update of the form:   
    A := A - V*W' - W*V'.   

    The contents of A on exit are illustrated by the following examples   
    with n = 5 and nb = 2:   

    if UPLO = 'U':                       if UPLO = 'L':   

      (  a   a   a   v4  v5 )              (  d                  )   
      (      a   a   v4  v5 )              (  1   d              )   
      (          a   1   v5 )              (  v1  1   a          )   
      (              d   1  )              (  v1  v2  a   a      )   
      (                  d  )              (  v1  v2  a   a   a  )   

    where d denotes a diagonal element of the reduced matrix, a denotes   
    an element of the original matrix that is unchanged, and vi denotes   
    an element of the vector defining H(i).   
    =====================================================================    */
  
    char uplo_[2]  = {uplo, 0};

    magma_int_t i;
  
    double c_neg_one = MAGMA_D_NEG_ONE;
    double c_one     = MAGMA_D_ONE;
    double c_zero    = MAGMA_D_ZERO;

    double value = MAGMA_D_ZERO;
    
    magma_int_t ione = 1;

    magma_int_t i_n, i_1, iw;
  
    double alpha;

// ACD:
    double *f = (double *)malloc(n*sizeof(double ));
    //double *f = (double *) _aligned_malloc( (size_t)(n*sizeof(double )), (size_t) 32);

    if (n <= 0) {
      return 0;
    }

	magma_event_t event = NULL;

    if (lapackf77_lsame(uplo_, "U")) {

      /* Reduce last NB columns of upper triangle */
      for (i = n-1; i >= n - nb ; --i) {
        i_1 = i + 1;
        i_n = n - i - 1;
        
        iw = i - n + nb;
        if (i < n-1) {
          /* Update A(1:i,i) */
          #if defined(PRECISION_z) || defined(PRECISION_c)
              lapackf77_dlacgv(&i_n, W(i, iw+1), &ldw);
          #endif
          blasf77_dgemv("No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda,
                        W(i, iw+1), &ldw, &c_one, A(0, i), &ione);
          #if defined(PRECISION_z) || defined(PRECISION_c)
              lapackf77_dlacgv(&i_n, W(i, iw+1), &ldw);
              lapackf77_dlacgv(&i_n, A(i, i+1), &lda);
          #endif
          blasf77_dgemv("No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                        A(i, i+1), &lda, &c_one, A(0, i), &ione);
          #if defined(PRECISION_z) || defined(PRECISION_c)
              lapackf77_dlacgv(&i_n, A(i, i+1), &lda);
          #endif
        }
        if (i > 0) {
          /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */
          
          alpha = *A(i-1, i);
          
          lapackf77_dlarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]);
          
          e[i-1] = MAGMA_D_REAL( alpha );
          MAGMA_D_SET2REAL(*A(i-1, i), 1.);
          
          /* Compute W(1:i-1,i) */
          // 1. Send the block reflector  A(0:n-i-1,i) to the GPU
          magma_dsetvector( i, A(0, i), 0, 1, dA(0, i), 1, queue );
          
          magma_dsymv(MagmaUpper, i, c_one, dA(0, 0), ldda,
                      dA(0, i), ione, c_zero, dW(0, iw), ione, queue);
          
          // 2. Start putting the result back (asynchronously)
          magma_dgetmatrix_async( i, 1,
                                  dW(0, iw),         lddw,
                                  W(0, iw)/*test*/, 0, ldw, queue, &event );
          
          if (i < n-1) {

            blasf77_dgemv(MagmaTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw,
                          A(0, i), &ione, &c_zero, W(i+1, iw), &ione);
          }
          
            // 3. Here is where we need it // TODO find the right place
			magma_event_sync(event);

          if (i < n-1) {
          
            blasf77_dgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda,
                          W(i+1, iw), &ione, &c_one, W(0, iw), &ione);

            blasf77_dgemv(MagmaTransStr, &i, &i_n, &c_one, A(0, i+1), &lda,
                          A(0, i), &ione, &c_zero, W(i+1, iw), &ione);

            blasf77_dgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                          W(i+1, iw), &ione, &c_one, W(0, iw), &ione);
          }

          blasf77_dscal(&i, &tau[i - 1], W(0, iw), &ione);

          #if defined(PRECISION_z) || defined(PRECISION_c)
          cblas_ddot_sub( i, W(0,iw), ione, A(0,i), ione, &value );
          #else
          value = cblas_ddot( i, W(0,iw), ione, A(0,i), ione );
          #endif
          alpha = tau[i - 1] * -0.5f * value;
          blasf77_daxpy(&i, &alpha, A(0, i), &ione,
                        W(0, iw), &ione);
        }
      }

    } else {

      /*  Reduce first NB columns of lower triangle */
      for (i = 0; i < nb; ++i)
        {

          /* Update A(i:n,i) */
          i_n = n - i;
          #if defined(PRECISION_z) || defined(PRECISION_c)
              lapackf77_dlacgv(&i, W(i, 0), &ldw);
          #endif
          blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda, 
                        W(i, 0), &ldw, &c_one, A(i, i), &ione);
          #if defined(PRECISION_z) || defined(PRECISION_c)
              lapackf77_dlacgv(&i, W(i, 0), &ldw);
              lapackf77_dlacgv(&i, A(i ,0), &lda);
          #endif
          blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw, 
                        A(i, 0), &lda, &c_one, A(i, i), &ione);
          #if defined(PRECISION_z) || defined(PRECISION_c)
              lapackf77_dlacgv(&i, A(i, 0), &lda);
          #endif

          if (i < n-1) 
            {
              /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */
              i_n = n - i - 1;
              alpha = *A(i+1, i);
              lapackf77_dlarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]);
              e[i] = MAGMA_D_REAL( alpha );
              MAGMA_D_SET2REAL(*A(i+1, i), 1.);

              /* Compute W(i+1:n,i) */ 
              // 1. Send the block reflector  A(i+1:n,i) to the GPU
              magma_dsetvector( i_n, A(i+1, i), 0, 1, dA(i+1, i), 1, queue );          
          
              magma_dsymv(MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero,
                          dW(i+1, i), ione, queue);
          
              // 2. Start putting the result back (asynchronously)
              magma_dgetmatrix_async( i_n, 1,
                                      dW(i+1, i), lddw,
                                      W(i+1, i), 0, ldw, queue, &event );

              blasf77_dgemv(MagmaTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw, 
                            A(i+1, i), &ione, &c_zero, W(0, i), &ione);

              blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda, 
                            W(0, i), &ione, &c_zero, f, &ione);
              
              blasf77_dgemv(MagmaTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda, 
                            A(i+1, i), &ione, &c_zero, W(0, i), &ione);

              // 3. Here is where we need it
			  magma_event_sync(event);

              if (i!=0)
                blasf77_daxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione);
     
              blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw, 
                            W(0, i), &ione, &c_one, W(i+1, i), &ione);
              blasf77_dscal(&i_n, &tau[i], W(i+1,i), &ione);
              
              #if defined(PRECISION_z) || defined(PRECISION_c)
              cblas_ddot_sub( i_n, W(i+1,i), ione, A(i+1,i), ione, &value );
              #else
              value = cblas_ddot( i_n, W(i+1,i), ione, A(i+1,i), ione );
              #endif
              alpha = tau[i] * -0.5f * value;
              blasf77_daxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione);
            }
        }
    }

// ACD:
    free(f);
    //_aligned_free(f);

    return 0;
} /* dlatrd_ */
Beispiel #4
0
extern "C" magma_int_t
magmablas_dsymv_mgpu( magma_int_t num_gpus, magma_int_t k, magma_uplo_t uplo,
                      magma_int_t n, magma_int_t nb,
                      double alpha,
                      double **dA, magma_int_t ldda, magma_int_t offset,
                      double **dx, magma_int_t incx,
                      double beta,
                      double **dy, magma_int_t incy,
                      double **dwork, magma_int_t ldwork,
                      double *work, double *W,
                      magma_queue_t stream[][10] )
{
#define dX(id, i)    (dx[(id)]+incx*(i))
#define dY(id, i, j) (dy[(id)]+incy*(i)+n*(j))

    magma_int_t id;

#ifdef MAGMABLAS_DSYMV_MGPU
    for( id=0; id < num_gpus; id++ ) {
        magma_setdevice(id);
        magmablasSetKernelStream(stream[id][0]);
        trace_gpu_start( id, 0, "memset", "memset" );
        cudaMemset( dwork[id], 0, ldwork*sizeof(double) );
        trace_gpu_end( id, 0 );
        trace_gpu_start( id, 0, "symv", "symv" );
    }

    if ( nb == 32 ) {
        magmablas_dsymv_mgpu_32_offset( uplo, offset+n, alpha, dA, ldda,
                                        dx, incx,
                                        beta,
                                        dy, incy,
                                        dwork, ldwork,
                                        num_gpus, nb, offset,
                                        stream );
    } else {
        magmablas_dsymv_mgpu_offset( uplo, offset+n, alpha, dA, ldda,
                                     dx, incx,
                                     beta,
                                     dy, incy,
                                     dwork, ldwork,
                                     num_gpus, nb, offset,
                                     stream );
    }
    for( id=0; id < num_gpus; id++ ) {
        magma_setdevice(id);
        trace_gpu_end( id, 0 );
        magmablasSetKernelStream(NULL);
    }
    //magma_setdevice(0);
    //magmablasSetKernelStream(stream[0][0]);
    //magma_dsymv(MagmaLower, n, alpha, &dA[0][offset+offset*ldda], ldda, &dx[0][offset], incx, beta, &dy[0][offset], incy );
    //magmablasSetKernelStream(NULL);

    /* send to CPU */
    magma_setdevice(0);
    trace_gpu_start( 0, 0, "comm", "comm" );
    magma_dgetvector_async( n, dY(0, offset, 0), 1, W, 1, stream[0][0] );
    trace_gpu_end( 0, 0 );
    magmablasSetKernelStream(NULL);

    for( id=1; id < num_gpus; id++ ) {
        magma_setdevice(id);
        trace_gpu_start(  id, 0, "comm", "comm" );
        magma_dgetvector_async( n, dY(id, offset, 0), 1, &work[id*n], 1, stream[id][0] );
        trace_gpu_end( id, 0 );
        magmablasSetKernelStream(NULL);
    }
#else
    double c_one = MAGMA_D_ONE;
    const char* uplo_  = lapack_uplo_const( uplo  );
    magma_int_t i, ii, j, kk, ib, ib0, i_1, i_local, idw;
    magma_int_t i_0=n;
    magma_int_t loffset0 = nb*(offset/(nb*num_gpus));
    magma_int_t loffset1 = offset%nb;
    magma_int_t loffset;
    
    //magma_dsymv(uplo, n, alpha, dA, ldda, dx, incx, beta, dy, incy );

    idw = (offset/nb)%num_gpus;

    for( id=0; id < num_gpus; id++ ) {
        magma_setdevice(id);
        magmablasSetKernelStream(stream[id][0]);
        cudaMemset( dy[id], 0, n*k*sizeof(double) );
    }

    if (uplo == MagmaLower) {
        /* the first block */
        if ( loffset1 > 0 ) {
            id = idw;
            kk = 0;

            magma_setdevice(id);
            magmablasSetKernelStream(stream[id][kk]);

            loffset = loffset0+loffset1;
            ib0 = min(nb-loffset1,n);
            // diagonal
            magma_dsymv(MagmaLower, ib0, c_one, dA(id, 0, 0 ), ldda,
                        dX(id, 0), incx, c_one, dY(id, 0, kk), incy);
            // off-diagonl
            if ( ib0 < n ) {
                for( j=ib0; j < n; j += i_0 ) {
                    i_1 = min(i_0, n-j);
                    magma_dgemv(MagmaNoTrans, i_1, ib0, c_one, dA(id, j, 0), ldda,
                                dX(id, 0), incx, c_one, dY(id, j, kk), incy);
                    magma_dgemv(MagmaTrans, i_1, ib0, c_one, dA(id, j, 0), ldda,
                                dX(id, j), incx, c_one, dY(id, 0, kk), incy);
                }
            }
        }
        else {
            ib0 = 0;
        }

        /* diagonal */
        for( i=ib0; i < n; i += nb ) {
            id = ((i+offset)/nb)%num_gpus;
            kk = ((i+loffset1)/(nb*num_gpus))%k;

            magma_setdevice(id);
            magmablasSetKernelStream(stream[id][kk]);

            i_local = (i+loffset1)/(nb*num_gpus);
            ib = min(nb,n-i);

            ii = nb*i_local;

            loffset = loffset0;
            if ( id < idw )
                loffset += nb;
            magma_dsymv(MagmaLower,  ib, c_one, dA(id, i, ii), ldda,
                        dX(id, i), incx, c_one, dY(id, i, kk), incy);
        }

        /* off-diagonal */
        for( i=ib0; i < n-nb; i += nb ) {
            id = ((i+offset)/nb)%num_gpus;
            kk = ((i+loffset1)/(nb*num_gpus))%k;
            magma_setdevice(id);
            magmablasSetKernelStream(stream[id][kk]);

            i_local = ((i+loffset1)/nb)/num_gpus;
            ii = nb*i_local;
            ib = min(nb,n-i);
            loffset = loffset0;
            if ( id < idw )
                loffset += nb;

            for( j=i+ib; j < n; j += i_0 ) {
                i_1 = min(i_0, n-j);
                magma_dgemv(MagmaNoTrans, i_1, ib, c_one, dA(id, j, ii), ldda,
                            dX(id, i), incx, c_one, dY(id, j, kk), incy);
                magma_dgemv(MagmaTrans, i_1, ib, c_one, dA(id, j, ii), ldda,
                            dX(id, j), incx, c_one, dY(id, i, kk), incy);
            }
        }
    } else { /* upper-triangular storage */
        loffset = 0;
        /* diagonal */
        for( i=0; i < n; i += nb ) {
            id = (i/nb)%num_gpus;
            kk = (i/(nb*num_gpus))%k;
            ib = min(nb,n-i);

            magma_setdevice(id);
            magmablasSetKernelStream(stream[id][kk]);

            i_local = i/(nb*num_gpus);
            ii = nb*i_local;

            magma_dsymv(MagmaUpper, ib, c_one, dA(id, i, ii), ldda,
                        dX(id, i), incx, c_one, dY(id, i, kk), incy);
        }

        /* off-diagonal */
        for( i=nb; i < n; i += nb ) {
            id = (i/nb)%num_gpus;
            kk = (i/(nb*num_gpus))%k;
            magma_setdevice(id);
            magmablasSetKernelStream(stream[id][kk]);

            i_local = (i/nb)/num_gpus;
            ii = nb*i_local;
            ib = min(nb,n-i);

            magma_dgemv(MagmaNoTrans, i, ib, c_one, dA(id, 0, ii), ldda,
                        dX(id, i), incx, c_one, dY(id, 0, kk), incy);
            magma_dgemv(MagmaTrans, i, ib, c_one, dA(id, 0, ii), ldda,
                        dX(id, 0), incx, c_one, dY(id, i, kk), incy);
        }
    }
    /* send to CPU */
    magma_setdevice(0);
    magma_dgetvector_async( n, dY(0, 0, 0), 1, W, 1, stream[0][0] );
    for( kk=1; kk < k; kk++ ) {
        magma_dgetvector_async( n, dY(0, 0, kk), 1, &work[kk*n], 1, stream[0][kk] );
    }
    magmablasSetKernelStream(NULL);

    for( id=1; id < num_gpus; id++ ) {
        magma_setdevice(id);
        for( kk=0; kk < k; kk++ ) {
            magma_dgetvector_async( n, dY(id, 0, kk), 1, &work[id*k*n + kk*n], 1, stream[id][kk] );
        }
        magmablasSetKernelStream(NULL);
    }
#endif
    return 0;
}