void magma_dtrsm( magma_side_t side, magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, double alpha, double const* dA, magma_int_t lda, double* dB, magma_int_t ldb ) { cublasDtrsm( cublas_side_const( side ), cublas_uplo_const( uplo ), cublas_trans_const( trans ), cublas_diag_const( diag ), m, n, alpha, dA, lda, dB, ldb ); }
void d_trsm(SEXP rside, SEXP ruplo, SEXP rtrans, SEXP rdiag, SEXP ralpha, SEXP ra, SEXP rlda, SEXP rb, SEXP rldb) { char trans = getTranspose(rtrans), diag = getUnitTri(rdiag), side = getSide(rside), uplo = getSymLoc(ruplo); double alpha = asReal(ralpha), * a, * b; int rowsa, colsa, lda = asInteger(rlda), rowsb, colsb, ldb = asInteger(rldb); unpackMatrix(ra, &rowsa, &colsa, &a); unpackMatrix(rb, &rowsb, &colsb, &b); cublasDtrsm(side, uplo, trans, diag, rowsb, colsb, alpha, a, lda, b, ldb); checkCublasError("d_trsm"); }
int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, t1, t2; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; const char trans[] = { 'N', 'C', 'T' }; const char uplo[] = { 'L', 'U' }; const char diag[] = { 'U', 'N' }; const char side[] = { 'L', 'R' }; double *A, *B, *C, *C2, *LU; double *dA, *dB, *dC1, *dC2; double alpha = MAGMA_D_MAKE( 0.5, 0.1 ); double beta = MAGMA_D_MAKE( 0.7, 0.2 ); double dalpha = 0.6; double dbeta = 0.8; double work[1], error, total_error; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t m, n, k, size, maxn, ld, info; magma_int_t *piv; magma_err_t err; magma_opts opts; parse_opts( argc, argv, &opts ); printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" ); total_error = 0.; for( int i = 0; i < opts.ntest; ++i ) { m = opts.msize[i]; n = opts.nsize[i]; k = opts.ksize[i]; printf("=========================================================================\n"); printf( "M %d, N %d, K %d\n", (int) m, (int) n, (int) k ); // allocate matrices // over-allocate so they can be any combination of {m,n,k} x {m,n,k}. maxn = max( max( m, n ), k ); ld = maxn; size = maxn*maxn; err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) ); assert( err == 0 ); err = magma_dmalloc_pinned( &A, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &B, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &C, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &C2, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &LU, size ); assert( err == 0 ); err = magma_dmalloc( &dA, size ); assert( err == 0 ); err = magma_dmalloc( &dB, size ); assert( err == 0 ); err = magma_dmalloc( &dC1, size ); assert( err == 0 ); err = magma_dmalloc( &dC2, size ); assert( err == 0 ); // initialize matrices size = maxn*maxn; lapackf77_dlarnv( &ione, ISEED, &size, A ); lapackf77_dlarnv( &ione, ISEED, &size, B ); lapackf77_dlarnv( &ione, ISEED, &size, C ); printf( "========== Level 1 BLAS ==========\n" ); // ----- test DSWAP // swap 2nd and 3rd columns of dA, then copy to C2 and compare with A assert( n >= 4 ); magma_dsetmatrix( m, n, A, ld, dA, ld ); magma_dsetmatrix( m, n, A, ld, dB, ld ); magma_dswap( m, dA(0,1), 1, dA(0,2), 1 ); magma_dswap( m, dB(0,1), 1, dB(0,2), 1 ); // check results, storing diff between magma and cuda calls in C2 cublasDaxpy( ld*n, c_neg_one, dA, 1, dB, 1 ); magma_dgetmatrix( m, n, dB, ld, C2, ld ); error = lapackf77_dlange( "F", &m, &k, C2, &ld, work ); total_error += error; printf( "dswap diff %.2g\n", error ); // ----- test IDAMAX // get argmax of column of A magma_dsetmatrix( m, k, A, ld, dA, ld ); error = 0; for( int j = 0; j < k; ++j ) { magma_int_t i1 = magma_idamax( m, dA(0,j), 1 ); magma_int_t i2 = cublasIdamax( m, dA(0,j), 1 ); assert( i1 == i2 ); error += abs( i1 - i2 ); } total_error += error; gflops = (double)m * k / 1e9; printf( "idamax diff %.2g\n", error ); printf( "\n" ); printf( "========== Level 2 BLAS ==========\n" ); // ----- test DGEMV // c = alpha*A*b + beta*c, with A m*n; b,c m or n-vectors // try no-trans/trans for( int ia = 0; ia < 3; ++ia ) { magma_dsetmatrix( m, n, A, ld, dA, ld ); magma_dsetvector( maxn, B, 1, dB, 1 ); magma_dsetvector( maxn, C, 1, dC1, 1 ); magma_dsetvector( maxn, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_dgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 size = (trans[ia] == 'N' ? m : n); cublasDaxpy( size, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetvector( size, dC2, 1, C2, 1 ); error = lapackf77_dlange( "F", &size, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_DGEMV( m, n ) / 1e9; printf( "dgemv( %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", trans[ia], error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test DSYMV // c = alpha*A*b + beta*c, with A m*m symmetric; b,c m-vectors // try upper/lower for( int iu = 0; iu < 2; ++iu ) { magma_dsetmatrix( m, m, A, ld, dA, ld ); magma_dsetvector( m, B, 1, dB, 1 ); magma_dsetvector( m, C, 1, dC1, 1 ); magma_dsetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_dsymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( m, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_dlange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYMV( m ) / 1e9; printf( "dsymv( %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test DTRSV // solve A*c = c, with A m*m triangular; c m-vector // try upper/lower, no-trans/trans, unit/non-unit diag // Factor A into LU to get well-conditioned triangles, else solve yields garbage. // Still can give garbage if solves aren't consistent with LU factors, // e.g., using unit diag for U, so copy lower triangle to upper triangle. // Also used for trsm later. lapackf77_dlacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld ); lapackf77_dgetrf( &maxn, &maxn, LU, &ld, piv, &info ); for( int j = 0; j < maxn; ++j ) { for( int i = 0; i < j; ++i ) { *LU(i,j) = *LU(j,i); } } for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { magma_dsetmatrix( m, m, LU, ld, dA, ld ); magma_dsetvector( m, C, 1, dC1, 1 ); magma_dsetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_dtrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDtrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( m, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_dlange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_DTRSM( MagmaLeft, m, 1 ) / 1e9; printf( "dtrsv( %c, %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], diag[id], error, gflops/t1, gflops/t2 ); }}} printf( "\n" ); printf( "========== Level 3 BLAS ==========\n" ); // ----- test DGEMM // C = alpha*A*B + beta*C, with A m*k or k*m; B k*n or n*k; C m*n // try combinations of no-trans/trans for( int ia = 0; ia < 3; ++ia ) { for( int ib = 0; ib < 3; ++ib ) { bool nta = (trans[ia] == 'N'); bool ntb = (trans[ib] == 'N'); magma_dsetmatrix( (nta ? m : k), (nta ? m : k), A, ld, dA, ld ); magma_dsetmatrix( (ntb ? k : n), (ntb ? n : k), B, ld, dB, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DGEMM( m, n, k ) / 1e9; printf( "dgemm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", trans[ia], trans[ib], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DSYMM // C = alpha*A*B + beta*C (left) with A m*m symmetric; B,C m*n; or // C = alpha*B*A + beta*C (right) with A n*n symmetric; B,C m*n // try left/right, upper/lower for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { magma_dsetmatrix( m, m, A, ld, dA, ld ); magma_dsetmatrix( m, n, B, ld, dB, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dsymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYMM( side[is], m, n ) / 1e9; printf( "dsymm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", side[is], uplo[iu], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DSYRK // C = alpha*A*A^H + beta*C (no-trans) with A m*k and C m*m symmetric; or // C = alpha*A^H*A + beta*C (trans) with A k*m and C m*m symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { magma_dsetmatrix( n, k, A, ld, dA, ld ); magma_dsetmatrix( n, n, C, ld, dC1, ld ); magma_dsetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dsyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYRK( k, n ) / 1e9; printf( "dsyrk( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DSYR2K // C = alpha*A*B^H + ^alpha*B*A^H + beta*C (no-trans) with A,B n*k; C n*n symmetric; or // C = alpha*A^H*B + ^alpha*B^H*A + beta*C (trans) with A,B k*n; C n*n symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { bool nt = (trans[it] == 'N'); magma_dsetmatrix( (nt ? n : k), (nt ? n : k), A, ld, dA, ld ); magma_dsetmatrix( n, n, C, ld, dC1, ld ); magma_dsetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dsyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYR2K( k, n ) / 1e9; printf( "dsyr2k( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DTRMM // C = alpha*A*C (left) with A m*m triangular; C m*n; or // C = alpha*C*A (right) with A n*n triangular; C m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == 'L'); magma_dsetmatrix( (left ? m : n), (left ? m : n), A, ld, dA, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dtrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDtrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DTRMM( side[is], m, n ) / 1e9; printf( "dtrmm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // ----- test DTRSM // solve A*X = alpha*B (left) with A m*m triangular; B m*n; or // solve X*A = alpha*B (right) with A n*n triangular; B m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == 'L'); magma_dsetmatrix( (left ? m : n), (left ? m : n), LU, ld, dA, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dtrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDtrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DTRSM( side[is], m, n ) / 1e9; printf( "dtrsm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // cleanup magma_free_cpu( piv ); magma_free_pinned( A ); magma_free_pinned( B ); magma_free_pinned( C ); magma_free_pinned( C2 ); magma_free_pinned( LU ); magma_free( dA ); magma_free( dB ); magma_free( dC1 ); magma_free( dC2 ); } if ( total_error != 0. ) { printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n", total_error ); } else { printf( "all tests passed\n" ); } TESTING_FINALIZE(); return 0; }
cublasStatus_t cublasXtrsm(cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const double *alpha, const double *A, int lda, double *B, int ldb) { return cublasDtrsm(g_context->cublasHandle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); }
int TEMPLATE2 (CHOLMOD (gpu_lower_potrf)) ( Int nscol2, /* S is nscol2-by-nscol2 */ Int nsrow, /* leading dimension of S */ Int psx, /* S is located at Lx + L_ENTRY*psx */ double *Lx, /* contains S; overwritten with Cholesky factor */ Int *info, /* BLAS info return value */ cholmod_common *Common, cholmod_gpu_pointers *gpu_p ) { double *devPtrA, *devPtrB, *A ; double alpha, beta ; cudaError_t cudaStat ; cublasStatus_t cublasStatus ; Int j, nsrow2, nb, n, gpu_lda, lda, gpu_ldb ; int ilda, ijb, iinfo ; #ifndef NTIMER double tstart ; #endif if (nscol2 * L_ENTRY < CHOLMOD_POTRF_LIMIT) { /* too small for the CUDA BLAS; use the CPU instead */ return (0) ; } #ifndef NTIMER tstart = SuiteSparse_time ( ) ; Common->CHOLMOD_GPU_POTRF_CALLS++ ; #endif nsrow2 = nsrow - nscol2 ; /* ---------------------------------------------------------------------- */ /* heuristic to get the block size depending of the problem size */ /* ---------------------------------------------------------------------- */ nb = 128 ; if (nscol2 > 4096) nb = 256 ; if (nscol2 > 8192) nb = 384 ; n = nscol2 ; gpu_lda = ((nscol2+31)/32)*32 ; lda = nsrow ; A = gpu_p->h_Lx[(Common->ibuffer+CHOLMOD_HOST_SUPERNODE_BUFFERS-1)% CHOLMOD_HOST_SUPERNODE_BUFFERS]; /* ---------------------------------------------------------------------- */ /* determine the GPU leading dimension of B */ /* ---------------------------------------------------------------------- */ gpu_ldb = 0 ; if (nsrow2 > 0) { gpu_ldb = ((nsrow2+31)/32)*32 ; } /* ---------------------------------------------------------------------- */ /* remember where device memory is, to be used by triangular solve later */ /* ---------------------------------------------------------------------- */ devPtrA = gpu_p->d_Lx[0]; devPtrB = gpu_p->d_Lx[1]; /* ---------------------------------------------------------------------- */ /* copy A from device to device */ /* ---------------------------------------------------------------------- */ cudaStat = cudaMemcpy2DAsync ( devPtrA, gpu_lda * L_ENTRY * sizeof (devPtrA[0]), gpu_p->d_A[1], nsrow * L_ENTRY * sizeof (Lx[0]), nscol2 * L_ENTRY * sizeof (devPtrA[0]), nscol2, cudaMemcpyDeviceToDevice, Common->gpuStream[0] ); if ( cudaStat ) { ERROR ( CHOLMOD_GPU_PROBLEM, "GPU memcopy device to device"); } /* ---------------------------------------------------------------------- */ /* copy B in advance, for gpu_triangular_solve */ /* ---------------------------------------------------------------------- */ if (nsrow2 > 0) { cudaStat = cudaMemcpy2DAsync (devPtrB, gpu_ldb * L_ENTRY * sizeof (devPtrB [0]), gpu_p->d_A[1] + L_ENTRY*nscol2, nsrow * L_ENTRY * sizeof (Lx [0]), nsrow2 * L_ENTRY * sizeof (devPtrB [0]), nscol2, cudaMemcpyDeviceToDevice, Common->gpuStream[0]) ; if (cudaStat) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy to device") ; } } /* ------------------------------------------------------------------ */ /* define the dpotrf stream */ /* ------------------------------------------------------------------ */ cublasStatus = cublasSetStream (Common->cublasHandle, Common->gpuStream [0]) ; if (cublasStatus != CUBLAS_STATUS_SUCCESS) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS stream") ; } /* ---------------------------------------------------------------------- */ /* block Cholesky factorization of S */ /* ---------------------------------------------------------------------- */ for (j = 0 ; j < n ; j += nb) { Int jb = nb < (n-j) ? nb : (n-j) ; /* ------------------------------------------------------------------ */ /* do the CUDA BLAS dsyrk */ /* ------------------------------------------------------------------ */ alpha = -1.0 ; beta = 1.0 ; #ifdef REAL cublasStatus = cublasDsyrk (Common->cublasHandle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, jb, j, &alpha, devPtrA + j, gpu_lda, &beta, devPtrA + j + j*gpu_lda, gpu_lda) ; #else cublasStatus = cublasZherk (Common->cublasHandle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, jb, j, &alpha, (cuDoubleComplex*)devPtrA + j, gpu_lda, &beta, (cuDoubleComplex*)devPtrA + j + j*gpu_lda, gpu_lda) ; #endif if (cublasStatus != CUBLAS_STATUS_SUCCESS) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ; } /* ------------------------------------------------------------------ */ cudaStat = cudaEventRecord (Common->cublasEventPotrf [0], Common->gpuStream [0]) ; if (cudaStat) { ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event failure") ; } cudaStat = cudaStreamWaitEvent (Common->gpuStream [1], Common->cublasEventPotrf [0], 0) ; if (cudaStat) { ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event failure") ; } /* ------------------------------------------------------------------ */ /* copy back the jb columns on two different streams */ /* ------------------------------------------------------------------ */ cudaStat = cudaMemcpy2DAsync (A + L_ENTRY*(j + j*lda), lda * L_ENTRY * sizeof (double), devPtrA + L_ENTRY*(j + j*gpu_lda), gpu_lda * L_ENTRY * sizeof (double), L_ENTRY * sizeof (double)*jb, jb, cudaMemcpyDeviceToHost, Common->gpuStream [1]) ; if (cudaStat) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy from device") ; } /* ------------------------------------------------------------------ */ /* do the CUDA BLAS dgemm */ /* ------------------------------------------------------------------ */ if ((j+jb) < n) { #ifdef REAL alpha = -1.0 ; beta = 1.0 ; cublasStatus = cublasDgemm (Common->cublasHandle, CUBLAS_OP_N, CUBLAS_OP_T, (n-j-jb), jb, j, &alpha, devPtrA + (j+jb), gpu_lda, devPtrA + (j) , gpu_lda, &beta, devPtrA + (j+jb + j*gpu_lda), gpu_lda) ; #else cuDoubleComplex calpha = {-1.0,0.0} ; cuDoubleComplex cbeta = { 1.0,0.0} ; cublasStatus = cublasZgemm (Common->cublasHandle, CUBLAS_OP_N, CUBLAS_OP_C, (n-j-jb), jb, j, &calpha, (cuDoubleComplex*)devPtrA + (j+jb), gpu_lda, (cuDoubleComplex*)devPtrA + (j), gpu_lda, &cbeta, (cuDoubleComplex*)devPtrA + (j+jb + j*gpu_lda), gpu_lda ) ; #endif if (cublasStatus != CUBLAS_STATUS_SUCCESS) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ; } } cudaStat = cudaStreamSynchronize (Common->gpuStream [1]) ; if (cudaStat) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy to device") ; } /* ------------------------------------------------------------------ */ /* compute the Cholesky factorization of the jbxjb block on the CPU */ /* ------------------------------------------------------------------ */ ilda = (int) lda ; ijb = jb ; #ifdef REAL LAPACK_DPOTRF ("L", &ijb, A + L_ENTRY * (j + j*lda), &ilda, &iinfo) ; #else LAPACK_ZPOTRF ("L", &ijb, A + L_ENTRY * (j + j*lda), &ilda, &iinfo) ; #endif *info = iinfo ; if (*info != 0) { *info = *info + j ; break ; } /* ------------------------------------------------------------------ */ /* copy the result back to the GPU */ /* ------------------------------------------------------------------ */ cudaStat = cudaMemcpy2DAsync (devPtrA + L_ENTRY*(j + j*gpu_lda), gpu_lda * L_ENTRY * sizeof (double), A + L_ENTRY * (j + j*lda), lda * L_ENTRY * sizeof (double), L_ENTRY * sizeof (double) * jb, jb, cudaMemcpyHostToDevice, Common->gpuStream [0]) ; if (cudaStat) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy to device") ; } /* ------------------------------------------------------------------ */ /* do the CUDA BLAS dtrsm */ /* ------------------------------------------------------------------ */ if ((j+jb) < n) { #ifdef REAL alpha = 1.0 ; cublasStatus = cublasDtrsm (Common->cublasHandle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, (n-j-jb), jb, &alpha, devPtrA + (j + j*gpu_lda), gpu_lda, devPtrA + (j+jb + j*gpu_lda), gpu_lda) ; #else cuDoubleComplex calpha = {1.0,0.0}; cublasStatus = cublasZtrsm (Common->cublasHandle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_C, CUBLAS_DIAG_NON_UNIT, (n-j-jb), jb, &calpha, (cuDoubleComplex *)devPtrA + (j + j*gpu_lda), gpu_lda, (cuDoubleComplex *)devPtrA + (j+jb + j*gpu_lda), gpu_lda) ; #endif if (cublasStatus != CUBLAS_STATUS_SUCCESS) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ; } /* -------------------------------------------------------------- */ /* Copy factored column back to host. */ /* -------------------------------------------------------------- */ cudaStat = cudaEventRecord (Common->cublasEventPotrf[2], Common->gpuStream[0]) ; if (cudaStat) { ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event failure") ; } cudaStat = cudaStreamWaitEvent (Common->gpuStream[1], Common->cublasEventPotrf[2], 0) ; if (cudaStat) { ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event failure") ; } cudaStat = cudaMemcpy2DAsync (A + L_ENTRY*(j + jb + j * lda), lda * L_ENTRY * sizeof (double), devPtrA + L_ENTRY* (j + jb + j * gpu_lda), gpu_lda * L_ENTRY * sizeof (double), L_ENTRY * sizeof (double)* (n - j - jb), jb, cudaMemcpyDeviceToHost, Common->gpuStream[1]) ; if (cudaStat) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy to device") ; } } } #ifndef NTIMER Common->CHOLMOD_GPU_POTRF_TIME += SuiteSparse_time ( ) - tstart ; #endif return (1) ; }
int TEMPLATE2 (CHOLMOD (gpu_triangular_solve)) ( Int nsrow2, /* L1 and S2 are nsrow2-by-nscol2 */ Int nscol2, /* L1 is nscol2-by-nscol2 */ Int nsrow, /* leading dimension of L1, L2, and S2 */ Int psx, /* L1 is at Lx+L_ENTRY*psx; * L2 at Lx+L_ENTRY*(psx+nscol2)*/ double *Lx, /* holds L1, L2, and S2 */ cholmod_common *Common, cholmod_gpu_pointers *gpu_p ) { double *devPtrA, *devPtrB ; cudaError_t cudaStat ; cublasStatus_t cublasStatus ; Int gpu_lda, gpu_ldb, gpu_rowstep ; Int gpu_row_start = 0 ; Int gpu_row_max_chunk, gpu_row_chunk; int ibuf = 0; int iblock = 0; int iHostBuff = (Common->ibuffer+CHOLMOD_HOST_SUPERNODE_BUFFERS-1) % CHOLMOD_HOST_SUPERNODE_BUFFERS; int i, j; Int iidx; int iwrap; #ifndef NTIMER double tstart ; #endif #ifdef REAL double alpha = 1.0 ; gpu_row_max_chunk = 768; #else cuDoubleComplex calpha = {1.0,0.0} ; gpu_row_max_chunk = 256; #endif if ( nsrow2 <= 0 ) { return (0) ; } #ifndef NTIMER tstart = SuiteSparse_time ( ) ; Common->CHOLMOD_GPU_TRSM_CALLS++ ; #endif gpu_lda = ((nscol2+31)/32)*32 ; gpu_ldb = ((nsrow2+31)/32)*32 ; devPtrA = gpu_p->d_Lx[0]; devPtrB = gpu_p->d_Lx[1]; /* make sure the copy of B has completed */ cudaStreamSynchronize( Common->gpuStream[0] ); /* ---------------------------------------------------------------------- */ /* do the CUDA BLAS dtrsm */ /* ---------------------------------------------------------------------- */ while ( gpu_row_start < nsrow2 ) { gpu_row_chunk = nsrow2 - gpu_row_start; if ( gpu_row_chunk > gpu_row_max_chunk ) { gpu_row_chunk = gpu_row_max_chunk; } cublasStatus = cublasSetStream ( Common->cublasHandle, Common->gpuStream[ibuf] ); if ( cublasStatus != CUBLAS_STATUS_SUCCESS ) { ERROR ( CHOLMOD_GPU_PROBLEM, "GPU CUBLAS stream"); } #ifdef REAL cublasStatus = cublasDtrsm (Common->cublasHandle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, gpu_row_chunk, nscol2, &alpha, devPtrA, gpu_lda, devPtrB + gpu_row_start, gpu_ldb) ; #else cublasStatus = cublasZtrsm (Common->cublasHandle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_C, CUBLAS_DIAG_NON_UNIT, gpu_row_chunk, nscol2, &calpha, (const cuDoubleComplex *) devPtrA, gpu_lda, (cuDoubleComplex *)devPtrB + gpu_row_start , gpu_ldb) ; #endif if (cublasStatus != CUBLAS_STATUS_SUCCESS) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ; } /* ------------------------------------------------------------------ */ /* copy result back to the CPU */ /* ------------------------------------------------------------------ */ cudaStat = cudaMemcpy2DAsync ( gpu_p->h_Lx[iHostBuff] + L_ENTRY*(nscol2+gpu_row_start), nsrow * L_ENTRY * sizeof (Lx [0]), devPtrB + L_ENTRY*gpu_row_start, gpu_ldb * L_ENTRY * sizeof (devPtrB [0]), gpu_row_chunk * L_ENTRY * sizeof (devPtrB [0]), nscol2, cudaMemcpyDeviceToHost, Common->gpuStream[ibuf]); if (cudaStat) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy from device") ; } cudaEventRecord ( Common->updateCBuffersFree[ibuf], Common->gpuStream[ibuf] ); gpu_row_start += gpu_row_chunk; ibuf++; ibuf = ibuf % CHOLMOD_HOST_SUPERNODE_BUFFERS; iblock ++; if ( iblock >= CHOLMOD_HOST_SUPERNODE_BUFFERS ) { Int gpu_row_start2 ; Int gpu_row_end ; /* then CHOLMOD_HOST_SUPERNODE_BUFFERS worth of work has been * scheduled, so check for completed events and copy result into * Lx before continuing. */ cudaEventSynchronize ( Common->updateCBuffersFree [iblock%CHOLMOD_HOST_SUPERNODE_BUFFERS] ); /* copy into Lx */ gpu_row_start2 = nscol2 + (iblock-CHOLMOD_HOST_SUPERNODE_BUFFERS) *gpu_row_max_chunk; gpu_row_end = gpu_row_start2+gpu_row_max_chunk; if ( gpu_row_end > nsrow ) gpu_row_end = nsrow; #pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS) \ private(iidx) if ( nscol2 > 32 ) for ( j=0; j<nscol2; j++ ) { for ( i=gpu_row_start2*L_ENTRY; i<gpu_row_end*L_ENTRY; i++ ) { iidx = j*nsrow*L_ENTRY+i; Lx[psx*L_ENTRY+iidx] = gpu_p->h_Lx[iHostBuff][iidx]; } } } } /* Convenient to copy the L1 block here */ #pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS) \ private ( iidx ) if ( nscol2 > 32 ) for ( j=0; j<nscol2; j++ ) { for ( i=j*L_ENTRY; i<nscol2*L_ENTRY; i++ ) { iidx = j*nsrow*L_ENTRY + i; Lx[psx*L_ENTRY+iidx] = gpu_p->h_Lx[iHostBuff][iidx]; } } /* now account for the last HSTREAMS buffers */ for ( iwrap=0; iwrap<CHOLMOD_HOST_SUPERNODE_BUFFERS; iwrap++ ) { int i, j; Int gpu_row_start2 = nscol2 + (iblock-CHOLMOD_HOST_SUPERNODE_BUFFERS) *gpu_row_max_chunk; if (iblock-CHOLMOD_HOST_SUPERNODE_BUFFERS >= 0 && gpu_row_start2 < nsrow ) { Int iidx; Int gpu_row_end = gpu_row_start2+gpu_row_max_chunk; if ( gpu_row_end > nsrow ) gpu_row_end = nsrow; cudaEventSynchronize ( Common->updateCBuffersFree [iblock%CHOLMOD_HOST_SUPERNODE_BUFFERS] ); /* copy into Lx */ #pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS) \ private(iidx) if ( nscol2 > 32 ) for ( j=0; j<nscol2; j++ ) { for ( i=gpu_row_start2*L_ENTRY; i<gpu_row_end*L_ENTRY; i++ ) { iidx = j*nsrow*L_ENTRY+i; Lx[psx*L_ENTRY+iidx] = gpu_p->h_Lx[iHostBuff][iidx]; } } } iblock++; } /* ---------------------------------------------------------------------- */ /* return */ /* ---------------------------------------------------------------------- */ #ifndef NTIMER Common->CHOLMOD_GPU_TRSM_TIME += SuiteSparse_time ( ) - tstart ; #endif return (1) ; }
int main ( void ){ cudaError_t cudaStat ; // cudaMalloc status cublasStatus_t stat ; // CUBLAS functions status cublasHandle_t handle ; // CUBLAS context int i,j; // i-row index , j-col. index double * a; // mxm matrix a on the host double * b; // mxn matrix b on the host a=( double *) malloc (m*m* sizeof ( double )); // host memory for a b=( double *) malloc (m*n* sizeof ( double )); // host memory for b int ind =11; // a: for(j=0;j<m;j ++){ // 11 for(i=0;i<m;i ++){ // 12 ,17 if(i >=j){ // 13 ,18 ,22 a[ IDX2C(i,j,m)]=( double )ind ++; // 14 ,19 ,23 ,26 } // 15 ,20 ,24 ,27 ,29 } // 16 ,21 ,25 ,28 ,30 ,31 } printf (" lower triangle of a:\n"); /* for (i=0;i<m;i ++){ for (j=0;j<m;j ++){ if(i >=j) printf (" %5.0f",a[ IDX2C(i,j,m)]); } printf ("\n"); } */ ind =11; // b: for(j=0;j<n;j ++){ // 11 ,17 ,23 ,29 ,35 for(i=0;i<m;i ++){ // 12 ,18 ,24 ,30 ,36 if(i == j) b[IDX2C(i,i,m)] = 1.0; else b[IDX2C(i,j,m)] = 0.0; } } // b[ IDX2C(i,j,m)] = ind++; /*if(i == j) b[ IDX2C(i,j,m)] = 1.0; // 13 ,19 ,25 ,31 ,37 else b[ IDX2C(i, j, m)] = 0.0;*/ //ind ++; // 14 ,20 ,26 ,32 ,38 printf ("b:\n"); /* for (i=0;i<m;i ++){ for (j=0;j<n;j ++){ printf (" %5.0f",b[IDX2C(i,j,m)]); // print b row by row } printf ("\n"); } */ double * d_a; // d_a - a on the device double * d_b; // d_b - b on the device cudaStat = cudaMalloc (( void **)& d_a ,m*m* sizeof (*a)); // device // memory alloc for a cudaStat = cudaMalloc (( void **)& d_b ,m*n* sizeof (*b)); // device // // memory alloc for b stat = cublasCreate (& handle ); // initialize CUBLAS context stat = cublasSetMatrix (m,m, sizeof (*a) ,a,m,d_a ,m); //a -> d_a stat = cublasSetMatrix (m,n, sizeof (*b) ,b,m,d_b ,m); //b -> d_b double al =1.0f; double startime = CycleTimer::currentSeconds(); (cublasDtrsm(handle,CUBLAS_SIDE_LEFT,CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N,CUBLAS_DIAG_NON_UNIT,m,n,&al,d_a,m,d_b,m)); stat = cublasGetMatrix (m,n, sizeof (*b) ,d_b ,m,b,m); // d_b -> b double endtime = CycleTimer::currentSeconds(); printf (" solution x from Strsm :\n"); /* for(i=0;i<m;i ++){ for(j=0;j<n;j ++){ printf (" %11.5f",b[IDX2C(i,j,m )]); // print b after Strsm } printf ("\n"); } */ cudaFree (d_a ); // free device memory cudaFree (d_b ); // free device memory cublasDestroy ( handle ); // destroy CUBLAS context free (a); // free host memory free (b); // free host memory printf("Time taken: %lf\n", endtime - startime); return EXIT_SUCCESS ; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dtrsm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf=0, cpu_time=0; double magma_error, cublas_error, work[1]; magma_int_t M, N, info; magma_int_t Ak; magma_int_t sizeA, sizeB; magma_int_t lda, ldb, ldda, lddb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t *piv; magma_err_t err; double *h_A, *h_B, *h_Bcublas, *h_Bmagma, *h_B1, *h_X1, *h_X2, *LU, *LUT; double *d_A, *d_B; double c_neg_one = MAGMA_D_NEG_ONE; double c_one = MAGMA_D_ONE; double alpha = MAGMA_D_MAKE( 0.29, -0.86 ); magma_opts opts; parse_opts( argc, argv, &opts ); printf("If running lapack (option --lapack), MAGMA and CUBLAS error are both computed\n" "relative to CPU BLAS result. Else, MAGMA error is computed relative to CUBLAS result.\n\n" "side = %c, uplo = %c, transA = %c, diag = %c \n", opts.side, opts.uplo, opts.transA, opts.diag ); printf(" M N MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA error CUBLAS error\n"); printf("==================================================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; gflops = FLOPS_DTRSM(opts.side, M, N) / 1e9; if ( opts.side == MagmaLeft ) { lda = M; Ak = M; } else { lda = N; Ak = N; } ldb = M; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; sizeA = lda*Ak; sizeB = ldb*N; TESTING_MALLOC( h_A, double, lda*Ak ); TESTING_MALLOC( LU, double, lda*Ak ); TESTING_MALLOC( LUT, double, lda*Ak ); TESTING_MALLOC( h_B, double, ldb*N ); TESTING_MALLOC( h_B1, double, ldb*N ); TESTING_MALLOC( h_X1, double, ldb*N ); TESTING_MALLOC( h_X2, double, ldb*N ); TESTING_MALLOC( h_Bcublas, double, ldb*N ); TESTING_MALLOC( h_Bmagma, double, ldb*N ); TESTING_DEVALLOC( d_A, double, ldda*Ak ); TESTING_DEVALLOC( d_B, double, lddb*N ); /* Initialize the matrices */ lapackf77_dlarnv( &ione, ISEED, &sizeA, LU ); err = magma_malloc_cpu( (void**) &piv, Ak*sizeof(magma_int_t) ); assert( err == 0 ); lapackf77_dgetrf( &Ak, &Ak, LU, &lda, piv, &info ); int i, j; for(i=0;i<Ak;i++){ for(j=0;j<Ak;j++){ LUT[j+i*lda] = LU[i+j*lda]; } } lapackf77_dlacpy(MagmaUpperStr, &Ak, &Ak, LUT, &lda, LU, &lda); if(opts.uplo == MagmaLower){ lapackf77_dlacpy(MagmaLowerStr, &Ak, &Ak, LU, &lda, h_A, &lda); }else{ lapackf77_dlacpy(MagmaUpperStr, &Ak, &Ak, LU, &lda, h_A, &lda); } lapackf77_dlarnv( &ione, ISEED, &sizeB, h_B ); memcpy(h_B1, h_B, sizeB*sizeof(double)); /* ===================================================================== Performs operation using MAGMA-BLAS =================================================================== */ magma_dsetmatrix( Ak, Ak, h_A, lda, d_A, ldda ); magma_dsetmatrix( M, N, h_B, ldb, d_B, lddb ); magma_time = magma_sync_wtime( NULL ); magmablas_dtrsm( opts.side, opts.uplo, opts.transA, opts.diag, M, N, alpha, d_A, ldda, d_B, lddb ); magma_time = magma_sync_wtime( NULL ) - magma_time; magma_perf = gflops / magma_time; magma_dgetmatrix( M, N, d_B, lddb, h_Bmagma, ldb ); /* ===================================================================== Performs operation using CUDA-BLAS =================================================================== */ magma_dsetmatrix( M, N, h_B, ldb, d_B, lddb ); cublas_time = magma_sync_wtime( NULL ); cublasDtrsm( opts.side, opts.uplo, opts.transA, opts.diag, M, N, alpha, d_A, ldda, d_B, lddb ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_dgetmatrix( M, N, d_B, lddb, h_Bcublas, ldb ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_dtrsm( &opts.side, &opts.uplo, &opts.transA, &opts.diag, &M, &N, &alpha, h_A, &lda, h_B, &ldb ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ // ||b - Ax|| / (||A||*||x||) memcpy(h_X1, h_Bmagma, sizeB*sizeof(double)); double alpha2 = MAGMA_D_DIV( c_one, alpha ); blasf77_dtrmm( &opts.side, &opts.uplo, &opts.transA, &opts.diag, &M, &N, &alpha2, h_A, &lda, h_X1, &ldb ); blasf77_daxpy( &sizeB, &c_neg_one, h_B1, &ione, h_X1, &ione ); double norm1 = lapackf77_dlange( "M", &M, &N, h_X1, &ldb, work ); double normx = lapackf77_dlange( "M", &M, &N, h_Bmagma, &ldb, work ); double normA = lapackf77_dlange( "M", &Ak, &Ak, h_A, &lda, work ); magma_error = norm1/(normx*normA); memcpy(h_X2, h_Bcublas, sizeB*sizeof(double)); blasf77_dtrmm( &opts.side, &opts.uplo, &opts.transA, &opts.diag, &M, &N, &alpha2, h_A, &lda, h_X2, &ldb ); blasf77_daxpy( &sizeB, &c_neg_one, h_B1, &ione, h_X2, &ione ); norm1 = lapackf77_dlange( "M", &M, &N, h_X2, &ldb, work ); normx = lapackf77_dlange( "M", &M, &N, h_Bcublas, &ldb, work ); normA = lapackf77_dlange( "M", &Ak, &Ak, h_A, &lda, work ); cublas_error = norm1/(normx*normA); if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error ); } else { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) --- ( --- ) %8.2e %8.2e\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, magma_error, cublas_error ); } TESTING_FREE( h_A ); TESTING_FREE( LU ); TESTING_FREE( LUT ); TESTING_FREE( h_B ); TESTING_FREE( h_Bcublas ); TESTING_FREE( h_Bmagma ); TESTING_FREE( h_B1 ); TESTING_FREE( h_X1 ); TESTING_FREE( h_X2 ); TESTING_DEVFREE( d_A ); TESTING_DEVFREE( d_B ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }