Exemplo n.º 1
0
/** Solve triangular matrix-matrix system (multiple right-hand sides).
        \f$ op(A) X = \alpha B \f$ (side == MagmaLeft), or \n
        \f$ X op(A) = \alpha B \f$ (side == MagmaRight),   \n
        where \f$ A \f$ is triangular.

    @param[in]
    side    Whether A is on the left or right.

    @param[in]
    uplo    Whether A is upper or lower triangular.

    @param[in]
    trans   Operation to perform on A.

    @param[in]
    diag    Whether the diagonal of A is assumed to be unit or non-unit.

    @param[in]
    m       Number of rows of B. m >= 0.

    @param[in]
    n       Number of columns of B. n >= 0.

    @param[in]
    alpha   Scalar \f$ \alpha \f$

    @param[in]
    dA      COMPLEX_16 array on GPU device.
            If side == MagmaLeft, the m-by-m triangular matrix A of dimension (ldda,m), ldda >= max(1,m); \n
            otherwise,            the n-by-n triangular matrix A of dimension (ldda,n), ldda >= max(1,n).

    @param[in]
    ldda    Leading dimension of dA.

    @param[in,out]
    dB      COMPLEX_16 array on GPU device.
            On entry, m-by-n matrix B of dimension (lddb,n), lddb >= max(1,m).
            On exit, overwritten with the solution matrix X.

    @param[in]
    lddb    Leading dimension of dB.

    @ingroup magma_zblas3
*/
extern "C" void
magma_ztrsm(
    magma_side_t side, magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag,
    magma_int_t m, magma_int_t n,
    magmaDoubleComplex alpha,
    magmaDoubleComplex_const_ptr dA, magma_int_t ldda,
    magmaDoubleComplex_ptr       dB, magma_int_t lddb )
{
    cublasZtrsm(
        cublas_side_const( side ),
        cublas_uplo_const( uplo ),
        cublas_trans_const( trans ),
        cublas_diag_const( diag ),
        m, n,
        alpha, dA, ldda,
        dB, lddb );
}
Exemplo n.º 2
0
int main( int argc, char** argv )
{
    TESTING_INIT();
    
    real_Double_t   gflops, t1, t2;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magma_int_t ione = 1;
    magma_trans_t trans[] = { MagmaNoTrans, MagmaConjTrans, MagmaTrans };
    magma_uplo_t  uplo [] = { MagmaLower, MagmaUpper };
    magma_diag_t  diag [] = { MagmaUnit, MagmaNonUnit };
    magma_side_t  side [] = { MagmaLeft, MagmaRight };
    
    magmaDoubleComplex  *A,  *B,  *C,   *C2, *LU;
    magmaDoubleComplex *dA, *dB, *dC1, *dC2;
    magmaDoubleComplex alpha = MAGMA_Z_MAKE( 0.5, 0.1 );
    magmaDoubleComplex beta  = MAGMA_Z_MAKE( 0.7, 0.2 );
    double dalpha = 0.6;
    double dbeta  = 0.8;
    double work[1], error, total_error;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t m, n, k, size, maxn, ld, info;
    magma_int_t *piv;
    magma_int_t err;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" );
    
    total_error = 0.;
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        m = opts.msize[itest];
        n = opts.nsize[itest];
        k = opts.ksize[itest];
        printf("=========================================================================\n");
        printf( "m=%d, n=%d, k=%d\n", (int) m, (int) n, (int) k );
        
        // allocate matrices
        // over-allocate so they can be any combination of {m,n,k} x {m,n,k}.
        maxn = max( max( m, n ), k );
        ld = max( 1, maxn );
        size = ld*maxn;
        err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) );  assert( err == 0 );
        err = magma_zmalloc_pinned( &A,  size );  assert( err == 0 );
        err = magma_zmalloc_pinned( &B,  size );  assert( err == 0 );
        err = magma_zmalloc_pinned( &C,  size );  assert( err == 0 );
        err = magma_zmalloc_pinned( &C2, size );  assert( err == 0 );
        err = magma_zmalloc_pinned( &LU, size );  assert( err == 0 );
        err = magma_zmalloc( &dA,  size );        assert( err == 0 );
        err = magma_zmalloc( &dB,  size );        assert( err == 0 );
        err = magma_zmalloc( &dC1, size );        assert( err == 0 );
        err = magma_zmalloc( &dC2, size );        assert( err == 0 );
        
        // initialize matrices
        size = maxn*maxn;
        lapackf77_zlarnv( &ione, ISEED, &size, A  );
        lapackf77_zlarnv( &ione, ISEED, &size, B  );
        lapackf77_zlarnv( &ione, ISEED, &size, C  );
        
        printf( "========== Level 1 BLAS ==========\n" );
        
        // ----- test ZSWAP
        // swap columns 2 and 3 of dA, then copy to C2 and compare with A
        if ( n >= 3 ) {
            magma_zsetmatrix( m, n, A, ld, dA, ld );
            magma_zsetmatrix( m, n, A, ld, dB, ld );
            magma_zswap( m, dA(0,1), 1, dA(0,2), 1 );
            magma_zswap( m, dB(0,1), 1, dB(0,2), 1 );
            
            // check results, storing diff between magma and cuda calls in C2
            cublasZaxpy( handle, ld*n, &c_neg_one, dA, 1, dB, 1 );
            magma_zgetmatrix( m, n, dB, ld, C2, ld );
            error = lapackf77_zlange( "F", &m, &k, C2, &ld, work );
            total_error += error;
            printf( "zswap             diff %.2g\n", error );
        }
        else {
            printf( "zswap skipped for n < 3\n" );
        }
        
        // ----- test IZAMAX
        // get argmax of column of A
        magma_zsetmatrix( m, k, A, ld, dA, ld );
        error = 0;
        for( int j = 0; j < k; ++j ) {
            magma_int_t i1 = magma_izamax( m, dA(0,j), 1 );
            int i2;  // NOT magma_int_t, for cublas
            cublasIzamax( handle, m, dA(0,j), 1, &i2 );
            // todo need sync here?
            assert( i1 == i2 );
            error += abs( i1 - i2 );
        }
        total_error += error;
        gflops = (double)m * k / 1e9;
        printf( "izamax            diff %.2g\n", error );
        printf( "\n" );
        
        printf( "========== Level 2 BLAS ==========\n" );
        
        // ----- test ZGEMV
        // c = alpha*A*b + beta*c,  with A m*n; b,c m or n-vectors
        // try no-trans/trans
        for( int ia = 0; ia < 3; ++ia ) {
            magma_zsetmatrix( m, n, A,  ld, dA,  ld );
            magma_zsetvector( maxn, B, 1, dB,  1 );
            magma_zsetvector( maxn, C, 1, dC1, 1 );
            magma_zsetvector( maxn, C, 1, dC2, 1 );
            
            t1 = magma_sync_wtime( 0 );
            magma_zgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 );
            t1 = magma_sync_wtime( 0 ) - t1;
            
            t2 = magma_sync_wtime( 0 );
            cublasZgemv( handle, cublas_trans_const(trans[ia]),
                         m, n, &alpha, dA, ld, dB, 1, &beta, dC2, 1 );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            size = (trans[ia] == MagmaNoTrans ? m : n);
            cublasZaxpy( handle, size, &c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetvector( size, dC2, 1, C2, 1 );
            error = lapackf77_zlange( "F", &size, &ione, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZGEMV( m, n ) / 1e9;
            printf( "zgemv( %c )        diff %.2g,  Gflop/s %7.2f, %7.2f\n",
                    lapacke_trans_const(trans[ia]), error, gflops/t1, gflops/t2 );
        }
        printf( "\n" );
        
        // ----- test ZHEMV
        // c = alpha*A*b + beta*c,  with A m*m symmetric; b,c m-vectors
        // try upper/lower
        for( int iu = 0; iu < 2; ++iu ) {
            magma_zsetmatrix( m, m, A, ld, dA, ld );
            magma_zsetvector( m, B, 1, dB,  1 );
            magma_zsetvector( m, C, 1, dC1, 1 );
            magma_zsetvector( m, C, 1, dC2, 1 );
            
            t1 = magma_sync_wtime( 0 );
            magma_zhemv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 );
            t1 = magma_sync_wtime( 0 ) - t1;
            
            t2 = magma_sync_wtime( 0 );
            cublasZhemv( handle, cublas_uplo_const(uplo[iu]),
                         m, &alpha, dA, ld, dB, 1, &beta, dC2, 1 );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( handle, m, &c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetvector( m, dC2, 1, C2, 1 );
            error = lapackf77_zlange( "F", &m, &ione, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZHEMV( m ) / 1e9;
            printf( "zhemv( %c )        diff %.2g,  Gflop/s %7.2f, %7.2f\n",
                    lapacke_uplo_const(uplo[iu]), error, gflops/t1, gflops/t2 );
        }
        printf( "\n" );
        
        // ----- test ZTRSV
        // solve A*c = c,  with A m*m triangular; c m-vector
        // try upper/lower, no-trans/trans, unit/non-unit diag
        // Factor A into LU to get well-conditioned triangles, else solve yields garbage.
        // Still can give garbage if solves aren't consistent with LU factors,
        // e.g., using unit diag for U, so copy lower triangle to upper triangle.
        // Also used for trsm later.
        lapackf77_zlacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld );
        lapackf77_zgetrf( &maxn, &maxn, LU, &ld, piv, &info );
        for( int j = 0; j < maxn; ++j ) {
            for( int i = 0; i < j; ++i ) {
                *LU(i,j) = *LU(j,i);
            }
        }
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
        for( int id = 0; id < 2; ++id ) {
            magma_zsetmatrix( m, m, LU, ld, dA, ld );
            magma_zsetvector( m, C, 1, dC1, 1 );
            magma_zsetvector( m, C, 1, dC2, 1 );
            
            t1 = magma_sync_wtime( 0 );
            magma_ztrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 );
            t1 = magma_sync_wtime( 0 ) - t1;
            
            t2 = magma_sync_wtime( 0 );
            cublasZtrsv( handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]),
                         cublas_diag_const(diag[id]), m, dA, ld, dC2, 1 );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( handle, m, &c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetvector( m, dC2, 1, C2, 1 );
            error = lapackf77_zlange( "F", &m, &ione, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZTRSM( MagmaLeft, m, 1 ) / 1e9;
            printf( "ztrsv( %c, %c, %c )  diff %.2g,  Gflop/s %7.2f, %7.2f\n",
                    lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), lapacke_diag_const(diag[id]),
                    error, gflops/t1, gflops/t2 );
        }}}
        printf( "\n" );
        
        printf( "========== Level 3 BLAS ==========\n" );
        
        // ----- test ZGEMM
        // C = alpha*A*B + beta*C,  with A m*k or k*m; B k*n or n*k; C m*n
        // try combinations of no-trans/trans
        for( int ia = 0; ia < 3; ++ia ) {
        for( int ib = 0; ib < 3; ++ib ) {
            bool nta = (trans[ia] == MagmaNoTrans);
            bool ntb = (trans[ib] == MagmaNoTrans);
            magma_zsetmatrix( (nta ? m : k), (nta ? m : k), A, ld, dA,  ld );
            magma_zsetmatrix( (ntb ? k : n), (ntb ? n : k), B, ld, dB,  ld );
            magma_zsetmatrix( m, n, C, ld, dC1, ld );
            magma_zsetmatrix( m, n, C, ld, dC2, ld );
            
            t1 = magma_sync_wtime( 0 );
            magma_zgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            
            t2 = magma_sync_wtime( 0 );
            cublasZgemm( handle, cublas_trans_const(trans[ia]), cublas_trans_const(trans[ib]),
                         m, n, k, &alpha, dA, ld, dB, ld, &beta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_zlange( "F", &m, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZGEMM( m, n, k ) / 1e9;
            printf( "zgemm( %c, %c )     diff %.2g,  Gflop/s %7.2f, %7.2f\n",
                    lapacke_trans_const(trans[ia]), lapacke_trans_const(trans[ib]),
                    error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test ZHEMM
        // C = alpha*A*B + beta*C  (left)  with A m*m symmetric; B,C m*n; or
        // C = alpha*B*A + beta*C  (right) with A n*n symmetric; B,C m*n
        // try left/right, upper/lower
        for( int is = 0; is < 2; ++is ) {
        for( int iu = 0; iu < 2; ++iu ) {
            magma_zsetmatrix( m, m, A, ld, dA,  ld );
            magma_zsetmatrix( m, n, B, ld, dB,  ld );
            magma_zsetmatrix( m, n, C, ld, dC1, ld );
            magma_zsetmatrix( m, n, C, ld, dC2, ld );
            
            t1 = magma_sync_wtime( 0 );
            magma_zhemm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            
            t2 = magma_sync_wtime( 0 );
            cublasZhemm( handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]),
                         m, n, &alpha, dA, ld, dB, ld, &beta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_zlange( "F", &m, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZHEMM( side[is], m, n ) / 1e9;
            printf( "zhemm( %c, %c )     diff %.2g,  Gflop/s %7.2f, %7.2f\n",
                    lapacke_side_const(side[is]), lapacke_uplo_const(uplo[iu]),
                    error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test ZHERK
        // C = alpha*A*A^H + beta*C  (no-trans) with A m*k and C m*m symmetric; or
        // C = alpha*A^H*A + beta*C  (trans)    with A k*m and C m*m symmetric
        // try upper/lower, no-trans/trans
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
            magma_zsetmatrix( n, k, A, ld, dA,  ld );
            magma_zsetmatrix( n, n, C, ld, dC1, ld );
            magma_zsetmatrix( n, n, C, ld, dC2, ld );
            
            t1 = magma_sync_wtime( 0 );
            magma_zherk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            
            t2 = magma_sync_wtime( 0 );
            cublasZherk( handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]),
                         n, k, &dalpha, dA, ld, &dbeta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetmatrix( n, n, dC2, ld, C2, ld );
            error = lapackf77_zlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZHERK( k, n ) / 1e9;
            printf( "zherk( %c, %c )     diff %.2g,  Gflop/s %7.2f, %7.2f\n",
                    lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]),
                    error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test ZHER2K
        // C = alpha*A*B^H + ^alpha*B*A^H + beta*C  (no-trans) with A,B n*k; C n*n symmetric; or
        // C = alpha*A^H*B + ^alpha*B^H*A + beta*C  (trans)    with A,B k*n; C n*n symmetric
        // try upper/lower, no-trans/trans
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
            bool nt = (trans[it] == MagmaNoTrans);
            magma_zsetmatrix( (nt ? n : k), (nt ? n : k), A, ld, dA,  ld );
            magma_zsetmatrix( n, n, C, ld, dC1, ld );
            magma_zsetmatrix( n, n, C, ld, dC2, ld );
            
            t1 = magma_sync_wtime( 0 );
            magma_zher2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            
            t2 = magma_sync_wtime( 0 );
            cublasZher2k( handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]),
                          n, k, &alpha, dA, ld, dB, ld, &dbeta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetmatrix( n, n, dC2, ld, C2, ld );
            error = lapackf77_zlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZHER2K( k, n ) / 1e9;
            printf( "zher2k( %c, %c )    diff %.2g,  Gflop/s %7.2f, %7.2f\n",
                    lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]),
                    error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test ZTRMM
        // C = alpha*A*C  (left)  with A m*m triangular; C m*n; or
        // C = alpha*C*A  (right) with A n*n triangular; C m*n
        // try left/right, upper/lower, no-trans/trans, unit/non-unit
        for( int is = 0; is < 2; ++is ) {
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
        for( int id = 0; id < 2; ++id ) {
            bool left = (side[is] == MagmaLeft);
            magma_zsetmatrix( (left ? m : n), (left ? m : n), A, ld, dA,  ld );
            magma_zsetmatrix( m, n, C, ld, dC1, ld );
            magma_zsetmatrix( m, n, C, ld, dC2, ld );
            
            t1 = magma_sync_wtime( 0 );
            magma_ztrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            
            // note cublas does trmm out-of-place (i.e., adds output matrix C),
            // but allows C=B to do in-place.
            t2 = magma_sync_wtime( 0 );
            cublasZtrmm( handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]),
                         cublas_trans_const(trans[it]), cublas_diag_const(diag[id]),
                         m, n, &alpha, dA, ld, dC2, ld, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_zlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZTRMM( side[is], m, n ) / 1e9;
            printf( "ztrmm( %c, %c )     diff %.2g,  Gflop/s %7.2f, %7.2f\n",
                    lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]),
                    error, gflops/t1, gflops/t2 );
        }}}}
        printf( "\n" );
        
        // ----- test ZTRSM
        // solve A*X = alpha*B  (left)  with A m*m triangular; B m*n; or
        // solve X*A = alpha*B  (right) with A n*n triangular; B m*n
        // try left/right, upper/lower, no-trans/trans, unit/non-unit
        for( int is = 0; is < 2; ++is ) {
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
        for( int id = 0; id < 2; ++id ) {
            bool left = (side[is] == MagmaLeft);
            magma_zsetmatrix( (left ? m : n), (left ? m : n), LU, ld, dA,  ld );
            magma_zsetmatrix( m, n, C, ld, dC1, ld );
            magma_zsetmatrix( m, n, C, ld, dC2, ld );
            
            t1 = magma_sync_wtime( 0 );
            magma_ztrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            
            t2 = magma_sync_wtime( 0 );
            cublasZtrsm( handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]),
                         cublas_trans_const(trans[it]), cublas_diag_const(diag[id]),
                         m, n, &alpha, dA, ld, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_zlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZTRSM( side[is], m, n ) / 1e9;
            printf( "ztrsm( %c, %c )     diff %.2g,  Gflop/s %7.2f, %7.2f\n",
                    lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]),
                    error, gflops/t1, gflops/t2 );
        }}}}
        printf( "\n" );
        
        // cleanup
        magma_free_cpu( piv );
        magma_free_pinned( A  );
        magma_free_pinned( B  );
        magma_free_pinned( C  );
        magma_free_pinned( C2 );
        magma_free_pinned( LU );
        magma_free( dA  );
        magma_free( dB  );
        magma_free( dC1 );
        magma_free( dC2 );
        fflush( stdout );
    }
    
    if ( total_error != 0. ) {
        printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n",
                total_error );
    }
    else {
        printf( "all tests passed\n" );
    }
    
    TESTING_FINALIZE();
    
    int status = (total_error != 0.);
    return status;
}
Exemplo n.º 3
0
int TEMPLATE2 (CHOLMOD (gpu_lower_potrf))
(
    Int nscol2,     /* S is nscol2-by-nscol2 */
    Int nsrow,      /* leading dimension of S */
    Int psx,        /* S is located at Lx + L_ENTRY*psx */
    double *Lx,     /* contains S; overwritten with Cholesky factor */
    Int *info,      /* BLAS info return value */
    cholmod_common *Common,
    cholmod_gpu_pointers *gpu_p
)
{
    double *devPtrA, *devPtrB, *A ;
    double alpha, beta ;
    cudaError_t cudaStat ;
    cublasStatus_t cublasStatus ;
    Int j, nsrow2, nb, n, gpu_lda, lda, gpu_ldb ;
    int ilda, ijb, iinfo ;
#ifndef NTIMER
    double tstart ;
#endif

    if (nscol2 * L_ENTRY < CHOLMOD_POTRF_LIMIT)
    {
        /* too small for the CUDA BLAS; use the CPU instead */
        return (0) ;
    }

#ifndef NTIMER
    tstart = SuiteSparse_time ( ) ;
    Common->CHOLMOD_GPU_POTRF_CALLS++ ;
#endif

    nsrow2 = nsrow - nscol2 ;

    /* ---------------------------------------------------------------------- */
    /* heuristic to get the block size depending of the problem size */
    /* ---------------------------------------------------------------------- */

    nb = 128 ;
    if (nscol2 > 4096) nb = 256 ;
    if (nscol2 > 8192) nb = 384 ;
    n  = nscol2 ;
    gpu_lda = ((nscol2+31)/32)*32 ;
    lda = nsrow ;

    A = gpu_p->h_Lx[(Common->ibuffer+CHOLMOD_HOST_SUPERNODE_BUFFERS-1)%
                    CHOLMOD_HOST_SUPERNODE_BUFFERS];

    /* ---------------------------------------------------------------------- */
    /* determine the GPU leading dimension of B */
    /* ---------------------------------------------------------------------- */

    gpu_ldb = 0 ;
    if (nsrow2 > 0)
    {
        gpu_ldb = ((nsrow2+31)/32)*32 ;
    }

    /* ---------------------------------------------------------------------- */
    /* remember where device memory is, to be used by triangular solve later */
    /* ---------------------------------------------------------------------- */

    devPtrA = gpu_p->d_Lx[0];
    devPtrB = gpu_p->d_Lx[1];

    /* ---------------------------------------------------------------------- */
    /* copy A from device to device */
    /* ---------------------------------------------------------------------- */

    cudaStat = cudaMemcpy2DAsync ( devPtrA,
       gpu_lda * L_ENTRY * sizeof (devPtrA[0]),
       gpu_p->d_A[1],
       nsrow * L_ENTRY * sizeof (Lx[0]),
       nscol2 * L_ENTRY * sizeof (devPtrA[0]),
       nscol2,
       cudaMemcpyDeviceToDevice,
       Common->gpuStream[0] );

    if ( cudaStat ) {
        ERROR ( CHOLMOD_GPU_PROBLEM, "GPU memcopy device to device");
    }

    /* ---------------------------------------------------------------------- */
    /* copy B in advance, for gpu_triangular_solve */
    /* ---------------------------------------------------------------------- */

    if (nsrow2 > 0)
    {
        cudaStat = cudaMemcpy2DAsync (devPtrB,
            gpu_ldb * L_ENTRY * sizeof (devPtrB [0]),
            gpu_p->d_A[1] + L_ENTRY*nscol2,
            nsrow * L_ENTRY * sizeof (Lx [0]),
            nsrow2 * L_ENTRY * sizeof (devPtrB [0]),
            nscol2,
            cudaMemcpyDeviceToDevice,
            Common->gpuStream[0]) ;
        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy to device") ;
        }
    }

    /* ------------------------------------------------------------------ */
    /* define the dpotrf stream */
    /* ------------------------------------------------------------------ */

    cublasStatus = cublasSetStream (Common->cublasHandle,
                                    Common->gpuStream [0]) ;
    if (cublasStatus != CUBLAS_STATUS_SUCCESS) {
        ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS stream") ;
    }

    /* ---------------------------------------------------------------------- */
    /* block Cholesky factorization of S */
    /* ---------------------------------------------------------------------- */

    for (j = 0 ; j < n ; j += nb)
    {
        Int jb = nb < (n-j) ? nb : (n-j) ;

        /* ------------------------------------------------------------------ */
        /* do the CUDA BLAS dsyrk */
        /* ------------------------------------------------------------------ */

        alpha = -1.0 ;
        beta  = 1.0 ;
#ifdef REAL
        cublasStatus = cublasDsyrk (Common->cublasHandle,
            CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, jb, j,
            &alpha, devPtrA + j, gpu_lda,
            &beta,  devPtrA + j + j*gpu_lda, gpu_lda) ;

#else
        cublasStatus = cublasZherk (Common->cublasHandle,
            CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, jb, j,
            &alpha, (cuDoubleComplex*)devPtrA + j,
            gpu_lda,
            &beta,
            (cuDoubleComplex*)devPtrA + j + j*gpu_lda,
            gpu_lda) ;
#endif

        if (cublasStatus != CUBLAS_STATUS_SUCCESS)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ;
        }

        /* ------------------------------------------------------------------ */

        cudaStat = cudaEventRecord (Common->cublasEventPotrf [0],
                                    Common->gpuStream [0]) ;
        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event failure") ;
        }

        cudaStat = cudaStreamWaitEvent (Common->gpuStream [1],
                                        Common->cublasEventPotrf [0], 0) ;
        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event failure") ;
        }

        /* ------------------------------------------------------------------ */
        /* copy back the jb columns on two different streams */
        /* ------------------------------------------------------------------ */

        cudaStat = cudaMemcpy2DAsync (A + L_ENTRY*(j + j*lda),
            lda * L_ENTRY * sizeof (double),
            devPtrA + L_ENTRY*(j + j*gpu_lda),
            gpu_lda * L_ENTRY * sizeof (double),
            L_ENTRY * sizeof (double)*jb,
            jb,
            cudaMemcpyDeviceToHost,
            Common->gpuStream [1]) ;

        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy from device") ;
        }

        /* ------------------------------------------------------------------ */
        /* do the CUDA BLAS dgemm */
        /* ------------------------------------------------------------------ */

        if ((j+jb) < n)
        {

#ifdef REAL
            alpha = -1.0 ;
            beta  = 1.0 ;
            cublasStatus = cublasDgemm (Common->cublasHandle,
                CUBLAS_OP_N, CUBLAS_OP_T,
                (n-j-jb), jb, j,
                &alpha,
                devPtrA + (j+jb), gpu_lda,
                devPtrA + (j)  , gpu_lda,
                &beta,
                devPtrA + (j+jb + j*gpu_lda), gpu_lda) ;

#else
            cuDoubleComplex calpha = {-1.0,0.0} ;
            cuDoubleComplex cbeta  = { 1.0,0.0} ;
            cublasStatus = cublasZgemm (Common->cublasHandle,
                CUBLAS_OP_N, CUBLAS_OP_C,
                (n-j-jb), jb, j,
                &calpha,
                (cuDoubleComplex*)devPtrA + (j+jb),
                gpu_lda,
                (cuDoubleComplex*)devPtrA + (j),
                gpu_lda,
                &cbeta,
                (cuDoubleComplex*)devPtrA +
                (j+jb + j*gpu_lda),
                gpu_lda ) ;
#endif

            if (cublasStatus != CUBLAS_STATUS_SUCCESS)
            {
                ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ;
            }
        }

        cudaStat = cudaStreamSynchronize (Common->gpuStream [1]) ;
        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy to device") ;
        }

        /* ------------------------------------------------------------------ */
        /* compute the Cholesky factorization of the jbxjb block on the CPU */
        /* ------------------------------------------------------------------ */

        ilda = (int) lda ;
        ijb  = jb ;
#ifdef REAL
        LAPACK_DPOTRF ("L", &ijb, A + L_ENTRY * (j + j*lda), &ilda, &iinfo) ;
#else
        LAPACK_ZPOTRF ("L", &ijb, A + L_ENTRY * (j + j*lda), &ilda, &iinfo) ;
#endif
        *info = iinfo ;

        if (*info != 0)
        {
            *info = *info + j ;
            break ;
        }

        /* ------------------------------------------------------------------ */
        /* copy the result back to the GPU */
        /* ------------------------------------------------------------------ */

        cudaStat = cudaMemcpy2DAsync (devPtrA + L_ENTRY*(j + j*gpu_lda),
            gpu_lda * L_ENTRY * sizeof (double),
            A + L_ENTRY * (j + j*lda),
            lda * L_ENTRY * sizeof (double),
            L_ENTRY * sizeof (double) * jb,
            jb,
            cudaMemcpyHostToDevice,
            Common->gpuStream [0]) ;

        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy to device") ;
        }

        /* ------------------------------------------------------------------ */
        /* do the CUDA BLAS dtrsm */
        /* ------------------------------------------------------------------ */

        if ((j+jb) < n)
        {

#ifdef REAL
            alpha  = 1.0 ;
            cublasStatus = cublasDtrsm (Common->cublasHandle,
                CUBLAS_SIDE_RIGHT,
                CUBLAS_FILL_MODE_LOWER,
                CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT,
                (n-j-jb), jb,
                &alpha,
                devPtrA + (j + j*gpu_lda), gpu_lda,
                devPtrA + (j+jb + j*gpu_lda), gpu_lda) ;
#else
            cuDoubleComplex calpha  = {1.0,0.0};
            cublasStatus = cublasZtrsm (Common->cublasHandle,
                CUBLAS_SIDE_RIGHT,
                CUBLAS_FILL_MODE_LOWER,
                CUBLAS_OP_C, CUBLAS_DIAG_NON_UNIT,
                (n-j-jb), jb,
                &calpha,
                (cuDoubleComplex *)devPtrA +
                (j + j*gpu_lda),
                gpu_lda,
                (cuDoubleComplex *)devPtrA +
                (j+jb + j*gpu_lda),
                gpu_lda) ;
#endif

            if (cublasStatus != CUBLAS_STATUS_SUCCESS)
            {
                ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ;
            }

            /* -------------------------------------------------------------- */
            /* Copy factored column back to host.                             */
            /* -------------------------------------------------------------- */

            cudaStat = cudaEventRecord (Common->cublasEventPotrf[2],
                                        Common->gpuStream[0]) ;
            if (cudaStat)
            {
                ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event failure") ;
            }

            cudaStat = cudaStreamWaitEvent (Common->gpuStream[1],
                                            Common->cublasEventPotrf[2], 0) ;
            if (cudaStat)
            {
                ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event failure") ;
            }

            cudaStat = cudaMemcpy2DAsync (A + L_ENTRY*(j + jb + j * lda),
                  lda * L_ENTRY * sizeof (double),
                  devPtrA + L_ENTRY*
                  (j + jb + j * gpu_lda),
                  gpu_lda * L_ENTRY * sizeof (double),
                  L_ENTRY * sizeof (double)*
                  (n - j - jb), jb,
                  cudaMemcpyDeviceToHost,
                  Common->gpuStream[1]) ;

            if (cudaStat)
            {
                ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy to device") ;
            }
        }
    }

#ifndef NTIMER
    Common->CHOLMOD_GPU_POTRF_TIME += SuiteSparse_time ( ) - tstart ;
#endif

    return (1) ;
}
Exemplo n.º 4
0
int TEMPLATE2 (CHOLMOD (gpu_triangular_solve))
(
    Int nsrow2,     /* L1 and S2 are nsrow2-by-nscol2 */
    Int nscol2,     /* L1 is nscol2-by-nscol2 */
    Int nsrow,      /* leading dimension of L1, L2, and S2 */
    Int psx,        /* L1 is at Lx+L_ENTRY*psx;
                     * L2 at Lx+L_ENTRY*(psx+nscol2)*/
    double *Lx,     /* holds L1, L2, and S2 */
    cholmod_common *Common,
    cholmod_gpu_pointers *gpu_p
)
{
    double *devPtrA, *devPtrB ;
    cudaError_t cudaStat ;
    cublasStatus_t cublasStatus ;
    Int gpu_lda, gpu_ldb, gpu_rowstep ;

    Int gpu_row_start = 0 ;
    Int gpu_row_max_chunk, gpu_row_chunk;
    int ibuf = 0;
    int iblock = 0;
    int iHostBuff = (Common->ibuffer+CHOLMOD_HOST_SUPERNODE_BUFFERS-1) %
        CHOLMOD_HOST_SUPERNODE_BUFFERS;
    int i, j;
    Int iidx;
    int iwrap;

#ifndef NTIMER
    double tstart ;
#endif

#ifdef REAL
    double alpha  = 1.0 ;
    gpu_row_max_chunk = 768;
#else
    cuDoubleComplex calpha  = {1.0,0.0} ;
    gpu_row_max_chunk = 256;
#endif

    if ( nsrow2 <= 0 )
    {
        return (0) ;
    }

#ifndef NTIMER
    tstart = SuiteSparse_time ( ) ;
    Common->CHOLMOD_GPU_TRSM_CALLS++ ;
#endif

    gpu_lda = ((nscol2+31)/32)*32 ;
    gpu_ldb = ((nsrow2+31)/32)*32 ;

    devPtrA = gpu_p->d_Lx[0];
    devPtrB = gpu_p->d_Lx[1];

    /* make sure the copy of B has completed */
    cudaStreamSynchronize( Common->gpuStream[0] );

    /* ---------------------------------------------------------------------- */
    /* do the CUDA BLAS dtrsm */
    /* ---------------------------------------------------------------------- */

    while ( gpu_row_start < nsrow2 )
    {

        gpu_row_chunk = nsrow2 - gpu_row_start;
        if ( gpu_row_chunk  > gpu_row_max_chunk ) {
            gpu_row_chunk = gpu_row_max_chunk;
        }

        cublasStatus = cublasSetStream ( Common->cublasHandle,
                                         Common->gpuStream[ibuf] );

        if ( cublasStatus != CUBLAS_STATUS_SUCCESS )
        {
            ERROR ( CHOLMOD_GPU_PROBLEM, "GPU CUBLAS stream");
        }

#ifdef REAL
        cublasStatus = cublasDtrsm (Common->cublasHandle,
                                    CUBLAS_SIDE_RIGHT,
                                    CUBLAS_FILL_MODE_LOWER,
                                    CUBLAS_OP_T,
                                    CUBLAS_DIAG_NON_UNIT,
                                    gpu_row_chunk,
                                    nscol2,
                                    &alpha,
                                    devPtrA,
                                    gpu_lda,
                                    devPtrB + gpu_row_start,
                                    gpu_ldb) ;
#else
        cublasStatus = cublasZtrsm (Common->cublasHandle,
                                    CUBLAS_SIDE_RIGHT,
                                    CUBLAS_FILL_MODE_LOWER,
                                    CUBLAS_OP_C,
                                    CUBLAS_DIAG_NON_UNIT,
                                    gpu_row_chunk,
                                    nscol2,
                                    &calpha,
                                    (const cuDoubleComplex *) devPtrA,
                                    gpu_lda,
                                    (cuDoubleComplex *)devPtrB + gpu_row_start ,
                                    gpu_ldb) ;
#endif
        if (cublasStatus != CUBLAS_STATUS_SUCCESS)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ;
        }

        /* ------------------------------------------------------------------ */
        /* copy result back to the CPU */
        /* ------------------------------------------------------------------ */

        cudaStat = cudaMemcpy2DAsync (
            gpu_p->h_Lx[iHostBuff] +
            L_ENTRY*(nscol2+gpu_row_start),
            nsrow * L_ENTRY * sizeof (Lx [0]),
            devPtrB + L_ENTRY*gpu_row_start,
            gpu_ldb * L_ENTRY * sizeof (devPtrB [0]),
            gpu_row_chunk * L_ENTRY *
            sizeof (devPtrB [0]),
            nscol2,
            cudaMemcpyDeviceToHost,
            Common->gpuStream[ibuf]);

        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy from device") ;
        }

        cudaEventRecord ( Common->updateCBuffersFree[ibuf],
                          Common->gpuStream[ibuf] );

        gpu_row_start += gpu_row_chunk;
        ibuf++;
        ibuf = ibuf % CHOLMOD_HOST_SUPERNODE_BUFFERS;

        iblock ++;

        if ( iblock >= CHOLMOD_HOST_SUPERNODE_BUFFERS )
        {
            Int gpu_row_start2 ;
            Int gpu_row_end ;

            /* then CHOLMOD_HOST_SUPERNODE_BUFFERS worth of work has been
             *  scheduled, so check for completed events and copy result into
             *  Lx before continuing. */
            cudaEventSynchronize ( Common->updateCBuffersFree
                                   [iblock%CHOLMOD_HOST_SUPERNODE_BUFFERS] );

            /* copy into Lx */
            gpu_row_start2 = nscol2 +
                (iblock-CHOLMOD_HOST_SUPERNODE_BUFFERS)
                *gpu_row_max_chunk;
            gpu_row_end = gpu_row_start2+gpu_row_max_chunk;

            if ( gpu_row_end > nsrow ) gpu_row_end = nsrow;

#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS)   \
    private(iidx) if ( nscol2 > 32 )

            for ( j=0; j<nscol2; j++ ) {
                for ( i=gpu_row_start2*L_ENTRY; i<gpu_row_end*L_ENTRY; i++ ) {
                    iidx = j*nsrow*L_ENTRY+i;
                    Lx[psx*L_ENTRY+iidx] = gpu_p->h_Lx[iHostBuff][iidx];
                }
            }
        }
    }

    /* Convenient to copy the L1 block here */

#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS)   \
private ( iidx ) if ( nscol2 > 32 )

    for ( j=0; j<nscol2; j++ ) {
        for ( i=j*L_ENTRY; i<nscol2*L_ENTRY; i++ ) {
            iidx = j*nsrow*L_ENTRY + i;
            Lx[psx*L_ENTRY+iidx] = gpu_p->h_Lx[iHostBuff][iidx];
        }
    }

    /* now account for the last HSTREAMS buffers */
    for ( iwrap=0; iwrap<CHOLMOD_HOST_SUPERNODE_BUFFERS; iwrap++ )
    {
        int i, j;
        Int gpu_row_start2 = nscol2 + (iblock-CHOLMOD_HOST_SUPERNODE_BUFFERS)
            *gpu_row_max_chunk;
        if (iblock-CHOLMOD_HOST_SUPERNODE_BUFFERS >= 0 &&
            gpu_row_start2 < nsrow )
        {
            Int iidx;
            Int gpu_row_end = gpu_row_start2+gpu_row_max_chunk;
            if ( gpu_row_end > nsrow ) gpu_row_end = nsrow;
            cudaEventSynchronize ( Common->updateCBuffersFree
                                   [iblock%CHOLMOD_HOST_SUPERNODE_BUFFERS] );
            /* copy into Lx */

#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS)   \
    private(iidx) if ( nscol2 > 32 )

            for ( j=0; j<nscol2; j++ ) {
                for ( i=gpu_row_start2*L_ENTRY; i<gpu_row_end*L_ENTRY; i++ ) {
                    iidx = j*nsrow*L_ENTRY+i;
                    Lx[psx*L_ENTRY+iidx] = gpu_p->h_Lx[iHostBuff][iidx];
                }
            }
        }
        iblock++;
    }

    /* ---------------------------------------------------------------------- */
    /* return */
    /* ---------------------------------------------------------------------- */

#ifndef NTIMER
    Common->CHOLMOD_GPU_TRSM_TIME += SuiteSparse_time ( ) - tstart ;
#endif

    return (1) ;
}
Exemplo n.º 5
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing ztrsm
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    real_Double_t   gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf=0, cpu_time=0;
    double          magma_error, cublas_error, work[1];
    magma_int_t M, N, info;
    magma_int_t Ak;
    magma_int_t sizeA, sizeB;
    magma_int_t lda, ldb, ldda, lddb;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
   
    magma_int_t *piv;
    magma_err_t err;

    magmaDoubleComplex *h_A, *h_B, *h_Bcublas, *h_Bmagma, *h_B1, *h_X1, *h_X2, *LU, *LUT;
    magmaDoubleComplex *d_A, *d_B;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex c_one = MAGMA_Z_ONE;
    magmaDoubleComplex alpha = MAGMA_Z_MAKE(  0.29, -0.86 );
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    printf("If running lapack (option --lapack), MAGMA and CUBLAS error are both computed\n"
           "relative to CPU BLAS result. Else, MAGMA error is computed relative to CUBLAS result.\n\n"
           "side = %c, uplo = %c, transA = %c, diag = %c \n", opts.side, opts.uplo, opts.transA, opts.diag );
    printf("    M     N  MAGMA Gflop/s (ms)  CUBLAS Gflop/s (ms)   CPU Gflop/s (ms)  MAGMA error  CUBLAS error\n");
    printf("==================================================================================================\n");
    for( int i = 0; i < opts.ntest; ++i ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[i];
            N = opts.nsize[i];
            gflops = FLOPS_ZTRSM(opts.side, M, N) / 1e9;

            if ( opts.side == MagmaLeft ) {
                lda = M;
                Ak = M;
            } else {
                lda = N;
                Ak = N;
            }
            
            ldb = M;
            
            ldda = ((lda+31)/32)*32;
            lddb = ((ldb+31)/32)*32;
            
            sizeA = lda*Ak;
            sizeB = ldb*N;
            
            TESTING_MALLOC( h_A,  magmaDoubleComplex, lda*Ak );
            TESTING_MALLOC( LU,      magmaDoubleComplex, lda*Ak );
            TESTING_MALLOC( LUT,  magmaDoubleComplex, lda*Ak );
            TESTING_MALLOC( h_B,  magmaDoubleComplex, ldb*N  );
            TESTING_MALLOC( h_B1,  magmaDoubleComplex, ldb*N );
            TESTING_MALLOC( h_X1,  magmaDoubleComplex, ldb*N );
            TESTING_MALLOC( h_X2,  magmaDoubleComplex, ldb*N );
            TESTING_MALLOC( h_Bcublas, magmaDoubleComplex, ldb*N  );
            TESTING_MALLOC( h_Bmagma, magmaDoubleComplex, ldb*N  );
            
            TESTING_DEVALLOC( d_A, magmaDoubleComplex, ldda*Ak );
            TESTING_DEVALLOC( d_B, magmaDoubleComplex, lddb*N  );
            
            /* Initialize the matrices */
            lapackf77_zlarnv( &ione, ISEED, &sizeA, LU );
            err = magma_malloc_cpu( (void**) &piv, Ak*sizeof(magma_int_t) );  assert( err == 0 );
            lapackf77_zgetrf( &Ak, &Ak, LU, &lda, piv, &info );
        
            int i, j;
            for(i=0;i<Ak;i++){
                for(j=0;j<Ak;j++){
                    LUT[j+i*lda] = LU[i+j*lda];
                }
            }

            lapackf77_zlacpy(MagmaUpperStr, &Ak, &Ak, LUT, &lda, LU, &lda);

            if(opts.uplo == MagmaLower){
                lapackf77_zlacpy(MagmaLowerStr, &Ak, &Ak, LU, &lda, h_A, &lda);
            }else{
                lapackf77_zlacpy(MagmaUpperStr, &Ak, &Ak, LU, &lda, h_A, &lda);
            }
            
            lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B );
            memcpy(h_B1, h_B, sizeB*sizeof(magmaDoubleComplex));
            /* =====================================================================
               Performs operation using MAGMA-BLAS
               =================================================================== */
            magma_zsetmatrix( Ak, Ak, h_A, lda, d_A, ldda );
            magma_zsetmatrix( M, N, h_B, ldb, d_B, lddb );
            
            magma_time = magma_sync_wtime( NULL );
            magmablas_ztrsm( opts.side, opts.uplo, opts.transA, opts.diag, 
                             M, N,
                             alpha, d_A, ldda,
                                    d_B, lddb );
            magma_time = magma_sync_wtime( NULL ) - magma_time;
            magma_perf = gflops / magma_time;
            
            magma_zgetmatrix( M, N, d_B, lddb, h_Bmagma, ldb );
            
            /* =====================================================================
               Performs operation using CUDA-BLAS
               =================================================================== */
            magma_zsetmatrix( M, N, h_B, ldb, d_B, lddb );
            
            cublas_time = magma_sync_wtime( NULL );
            cublasZtrsm( opts.side, opts.uplo, opts.transA, opts.diag,
                         M, N, 
                         alpha, d_A, ldda,
                                d_B, lddb );
            cublas_time = magma_sync_wtime( NULL ) - cublas_time;
            cublas_perf = gflops / cublas_time;
            
            magma_zgetmatrix( M, N, d_B, lddb, h_Bcublas, ldb );
            
            /* =====================================================================
               Performs operation using CPU BLAS
               =================================================================== */
            if ( opts.lapack ) {
                cpu_time = magma_wtime();
                blasf77_ztrsm( &opts.side, &opts.uplo, &opts.transA, &opts.diag, 
                               &M, &N,
                               &alpha, h_A, &lda,
                                       h_B, &ldb );
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
            }
            
            /* =====================================================================
               Check the result
               =================================================================== */
            // ||b - Ax|| / (||A||*||x||)
            memcpy(h_X1, h_Bmagma, sizeB*sizeof(magmaDoubleComplex));
            
            magmaDoubleComplex alpha2 = MAGMA_Z_DIV(  c_one, alpha );
            blasf77_ztrmm( &opts.side, &opts.uplo, &opts.transA, &opts.diag, 
                            &M, &N,
                            &alpha2, h_A, &lda,
                            h_X1, &ldb );

            blasf77_zaxpy( &sizeB, &c_neg_one, h_B1, &ione, h_X1, &ione );
            double norm1 =  lapackf77_zlange( "M", &M, &N, h_X1, &ldb, work );
            double normx =  lapackf77_zlange( "M", &M, &N, h_Bmagma, &ldb, work );
            double normA =  lapackf77_zlange( "M", &Ak, &Ak, h_A, &lda, work );


            magma_error = norm1/(normx*normA);

            memcpy(h_X2, h_Bcublas, sizeB*sizeof(magmaDoubleComplex));
            blasf77_ztrmm( &opts.side, &opts.uplo, &opts.transA, &opts.diag, 
                            &M, &N,
                            &alpha2, h_A, &lda,
                            h_X2, &ldb );

            blasf77_zaxpy( &sizeB, &c_neg_one, h_B1, &ione, h_X2, &ione );
            norm1 =  lapackf77_zlange( "M", &M, &N, h_X2, &ldb, work );
            normx =  lapackf77_zlange( "M", &M, &N, h_Bcublas, &ldb, work );
            normA =  lapackf77_zlange( "M", &Ak, &Ak, h_A, &lda, work );
            
            cublas_error = norm1/(normx*normA);
            
            if ( opts.lapack ) {
                printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)   %7.2f (%7.2f)   %8.2e     %8.2e\n",
                        (int) M, (int) N,
                        magma_perf,  1000.*magma_time,
                        cublas_perf, 1000.*cublas_time,
                        cpu_perf,    1000.*cpu_time,
                        magma_error, cublas_error );
            }
            else {
                printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)     ---   (  ---  )   %8.2e     %8.2e\n",
                        (int) M, (int) N,
                        magma_perf,  1000.*magma_time,
                        cublas_perf, 1000.*cublas_time,
                        magma_error, cublas_error );
            }
            
            TESTING_FREE( h_A  );
            TESTING_FREE( LU  );
            TESTING_FREE( LUT );
            TESTING_FREE( h_B  );
            TESTING_FREE( h_Bcublas );
            TESTING_FREE( h_Bmagma );
            TESTING_FREE( h_B1  );
            TESTING_FREE( h_X1 );
            TESTING_FREE( h_X2 );
            
            TESTING_DEVFREE( d_A );
            TESTING_DEVFREE( d_B );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    TESTING_FINALIZE();
    return 0;
}
Exemplo n.º 6
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing ztrsm
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    real_Double_t   gflops, magma_perf, magma_time=0, cublas_perf, cublas_time, cpu_perf=0, cpu_time=0;
    double          magma_error, cublas_error, work[1];
    magma_int_t M, N, info;
    magma_int_t Ak;
    magma_int_t sizeA, sizeB;
    magma_int_t lda, ldb, ldda, lddb;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t *ipiv;

    magmaDoubleComplex *h_A, *h_B, *h_Bcublas, *h_Bmagma, *h_B1, *h_X1, *h_X2;
    magmaDoubleComplex *d_A, *d_B;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex c_one = MAGMA_Z_ONE;
    magmaDoubleComplex alpha = MAGMA_Z_MAKE(  0.29, -0.86 );
    magma_int_t status = 0;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    double tol = opts.tolerance * lapackf77_dlamch("E");

    printf("side = %s, uplo = %s, transA = %s, diag = %s \n",
           lapack_side_const(opts.side), lapack_uplo_const(opts.uplo),
           lapack_trans_const(opts.transA), lapack_diag_const(opts.diag) );
    printf("    M     N  MAGMA Gflop/s (ms)  CUBLAS Gflop/s (ms)   CPU Gflop/s (ms)  MAGMA error  CUBLAS error\n");
    printf("==================================================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[itest];
            N = opts.nsize[itest];
            gflops = FLOPS_ZTRSM(opts.side, M, N) / 1e9;

            if ( opts.side == MagmaLeft ) {
                lda = M;
                Ak = M;
            } else {
                lda = N;
                Ak = N;
            }
            
            ldb = M;
            
            ldda = ((lda+31)/32)*32;
            lddb = ((ldb+31)/32)*32;
            
            sizeA = lda*Ak;
            sizeB = ldb*N;
            
            TESTING_MALLOC_CPU( h_A,       magmaDoubleComplex, lda*Ak  );
            TESTING_MALLOC_CPU( h_B,       magmaDoubleComplex, ldb*N   );
            TESTING_MALLOC_CPU( h_B1,      magmaDoubleComplex, ldb*N   );
            TESTING_MALLOC_CPU( h_X1,      magmaDoubleComplex, ldb*N   );
            TESTING_MALLOC_CPU( h_X2,      magmaDoubleComplex, ldb*N   );
            TESTING_MALLOC_CPU( h_Bcublas, magmaDoubleComplex, ldb*N   );
            TESTING_MALLOC_CPU( h_Bmagma,  magmaDoubleComplex, ldb*N   );
            TESTING_MALLOC_CPU( ipiv,      magma_int_t,        Ak      );
            
            TESTING_MALLOC_DEV( d_A,       magmaDoubleComplex, ldda*Ak );
            TESTING_MALLOC_DEV( d_B,       magmaDoubleComplex, lddb*N  );
            
            /* Initialize the matrices */
            /* Factor A into LU to get well-conditioned triangular matrix.
             * Copy L to U, since L seems okay when used with non-unit diagonal
             * (i.e., from U), while U fails when used with unit diagonal. */
            lapackf77_zlarnv( &ione, ISEED, &sizeA, h_A );
            lapackf77_zgetrf( &Ak, &Ak, h_A, &lda, ipiv, &info );
            for( int j = 0; j < Ak; ++j ) {
                for( int i = 0; i < j; ++i ) {
                    *h_A(i,j) = *h_A(j,i);
                }
            }
            
            lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B );
            memcpy(h_B1, h_B, sizeB*sizeof(magmaDoubleComplex));
            
            /* =====================================================================
               Performs operation using MAGMABLAS
               =================================================================== */
            magma_zsetmatrix( Ak, Ak, h_A, lda, d_A, ldda );
            magma_zsetmatrix( M, N, h_B, ldb, d_B, lddb );
            
            magma_time = magma_sync_wtime( NULL );
            magmablas_ztrsm( opts.side, opts.uplo, opts.transA, opts.diag, 
                             M, N,
                             alpha, d_A, ldda,
                                    d_B, lddb );
            magma_time = magma_sync_wtime( NULL ) - magma_time;
            magma_perf = gflops / magma_time;
            
            magma_zgetmatrix( M, N, d_B, lddb, h_Bmagma, ldb );
            
            /* =====================================================================
               Performs operation using CUBLAS
               =================================================================== */
            magma_zsetmatrix( M, N, h_B, ldb, d_B, lddb );
            
            cublas_time = magma_sync_wtime( NULL );
            cublasZtrsm( handle, cublas_side_const(opts.side), cublas_uplo_const(opts.uplo),
                         cublas_trans_const(opts.transA), cublas_diag_const(opts.diag),
                         M, N, 
                         &alpha, d_A, ldda,
                                 d_B, lddb );
            cublas_time = magma_sync_wtime( NULL ) - cublas_time;
            cublas_perf = gflops / cublas_time;
            
            magma_zgetmatrix( M, N, d_B, lddb, h_Bcublas, ldb );
            
            /* =====================================================================
               Performs operation using CPU BLAS
               =================================================================== */
            if ( opts.lapack ) {
                cpu_time = magma_wtime();
                blasf77_ztrsm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), 
                               &M, &N,
                               &alpha, h_A, &lda,
                                       h_B, &ldb );
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
            }
            
            /* =====================================================================
               Check the result
               =================================================================== */
            // ||b - Ax|| / (||A||*||x||)
            memcpy(h_X1, h_Bmagma, sizeB*sizeof(magmaDoubleComplex));
            
            magmaDoubleComplex alpha2 = MAGMA_Z_DIV(  c_one, alpha );
            blasf77_ztrmm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), 
                            &M, &N,
                            &alpha2, h_A, &lda,
                            h_X1, &ldb );

            blasf77_zaxpy( &sizeB, &c_neg_one, h_B1, &ione, h_X1, &ione );
            double norm1 =  lapackf77_zlange( "M", &M, &N, h_X1, &ldb, work );
            double normx =  lapackf77_zlange( "M", &M, &N, h_Bmagma, &ldb, work );
            double normA =  lapackf77_zlange( "M", &Ak, &Ak, h_A, &lda, work );

            magma_error = norm1/(normx*normA);

            memcpy(h_X2, h_Bcublas, sizeB*sizeof(magmaDoubleComplex));
            blasf77_ztrmm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), 
                            &M, &N,
                            &alpha2, h_A, &lda,
                            h_X2, &ldb );

            blasf77_zaxpy( &sizeB, &c_neg_one, h_B1, &ione, h_X2, &ione );
            norm1 =  lapackf77_zlange( "M", &M, &N, h_X2, &ldb, work );
            normx =  lapackf77_zlange( "M", &M, &N, h_Bcublas, &ldb, work );
            normA =  lapackf77_zlange( "M", &Ak, &Ak, h_A, &lda, work );
            
            cublas_error = norm1/(normx*normA);
            
            if ( opts.lapack ) {
                printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)   %7.2f (%7.2f)   %8.2e     %8.2e   %s\n",
                        (int) M, (int) N,
                        magma_perf,  1000.*magma_time,
                        cublas_perf, 1000.*cublas_time,
                        cpu_perf,    1000.*cpu_time,
                        magma_error, cublas_error,
                        (magma_error < tol && cublas_error < tol? "ok" : "failed"));
                status += ! (magma_error < tol && cublas_error < tol);
            }
            else {
                printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)     ---   (  ---  )   %8.2e     %8.2e   %s\n",
                        (int) M, (int) N,
                        magma_perf,  1000.*magma_time,
                        cublas_perf, 1000.*cublas_time,
                        magma_error, cublas_error,
                        (magma_error < tol && cublas_error < tol? "ok" : "failed"));
                status += ! (magma_error < tol && cublas_error < tol);
            }
            
            TESTING_FREE_CPU( h_A  );
            TESTING_FREE_CPU( h_B  );
            TESTING_FREE_CPU( h_B1 );
            TESTING_FREE_CPU( h_X1 );
            TESTING_FREE_CPU( h_X2 );
            TESTING_FREE_CPU( h_Bcublas );
            TESTING_FREE_CPU( h_Bmagma  );
            
            TESTING_FREE_DEV( d_A );
            TESTING_FREE_DEV( d_B );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    TESTING_FINALIZE();
    return status;
}