inline void herk( const Order order, const UpLo uplo, const Trans trans, const int n, const int k, const double alpha, const double* a, const int lda, const double beta, double* c, const int ldc ) { BOOST_STATIC_ASSERT( (is_same<Order, tag::column_major>::value) ); cublasDsyrk( blas_option< UpLo >::value, blas_option< Trans >::value, n, k, alpha, a, lda, beta, c, ldc ); }
void d_syrk(SEXP ruplo, SEXP rtrans, SEXP ralpha, SEXP ra, SEXP rlda, SEXP rbeta, SEXP rc, SEXP rldc) { char trans = getTranspose(rtrans), uplo = getSymLoc(ruplo); double alpha = asReal(ralpha), beta = asReal(rbeta), * a, * c; int k, rowsa, colsa, lda = asInteger(rlda), rowsc, colsc, ldc = asInteger(rldc); k = rowsa; if((trans == 'N') || (trans == 'n')) { k = colsa; } unpackMatrix(ra, &rowsa, &colsa, &a); unpackMatrix(rc, &rowsc, &colsc, &c); cublasDsyrk(uplo, trans, rowsc, k, alpha, a, lda, beta, c, ldc); checkCublasError("d_syrk"); }
void magma_dsyrk( magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k, double alpha, double const* dA, magma_int_t lda, double beta, double* dC, magma_int_t ldc ) { cublasDsyrk( cublas_uplo_const( uplo ), cublas_trans_const( trans ), n, k, alpha, dA, lda, beta, dC, ldc ); }
int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, t1, t2; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; const char trans[] = { 'N', 'C', 'T' }; const char uplo[] = { 'L', 'U' }; const char diag[] = { 'U', 'N' }; const char side[] = { 'L', 'R' }; double *A, *B, *C, *C2, *LU; double *dA, *dB, *dC1, *dC2; double alpha = MAGMA_D_MAKE( 0.5, 0.1 ); double beta = MAGMA_D_MAKE( 0.7, 0.2 ); double dalpha = 0.6; double dbeta = 0.8; double work[1], error, total_error; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t m, n, k, size, maxn, ld, info; magma_int_t *piv; magma_err_t err; magma_opts opts; parse_opts( argc, argv, &opts ); printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" ); total_error = 0.; for( int i = 0; i < opts.ntest; ++i ) { m = opts.msize[i]; n = opts.nsize[i]; k = opts.ksize[i]; printf("=========================================================================\n"); printf( "M %d, N %d, K %d\n", (int) m, (int) n, (int) k ); // allocate matrices // over-allocate so they can be any combination of {m,n,k} x {m,n,k}. maxn = max( max( m, n ), k ); ld = maxn; size = maxn*maxn; err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) ); assert( err == 0 ); err = magma_dmalloc_pinned( &A, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &B, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &C, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &C2, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &LU, size ); assert( err == 0 ); err = magma_dmalloc( &dA, size ); assert( err == 0 ); err = magma_dmalloc( &dB, size ); assert( err == 0 ); err = magma_dmalloc( &dC1, size ); assert( err == 0 ); err = magma_dmalloc( &dC2, size ); assert( err == 0 ); // initialize matrices size = maxn*maxn; lapackf77_dlarnv( &ione, ISEED, &size, A ); lapackf77_dlarnv( &ione, ISEED, &size, B ); lapackf77_dlarnv( &ione, ISEED, &size, C ); printf( "========== Level 1 BLAS ==========\n" ); // ----- test DSWAP // swap 2nd and 3rd columns of dA, then copy to C2 and compare with A assert( n >= 4 ); magma_dsetmatrix( m, n, A, ld, dA, ld ); magma_dsetmatrix( m, n, A, ld, dB, ld ); magma_dswap( m, dA(0,1), 1, dA(0,2), 1 ); magma_dswap( m, dB(0,1), 1, dB(0,2), 1 ); // check results, storing diff between magma and cuda calls in C2 cublasDaxpy( ld*n, c_neg_one, dA, 1, dB, 1 ); magma_dgetmatrix( m, n, dB, ld, C2, ld ); error = lapackf77_dlange( "F", &m, &k, C2, &ld, work ); total_error += error; printf( "dswap diff %.2g\n", error ); // ----- test IDAMAX // get argmax of column of A magma_dsetmatrix( m, k, A, ld, dA, ld ); error = 0; for( int j = 0; j < k; ++j ) { magma_int_t i1 = magma_idamax( m, dA(0,j), 1 ); magma_int_t i2 = cublasIdamax( m, dA(0,j), 1 ); assert( i1 == i2 ); error += abs( i1 - i2 ); } total_error += error; gflops = (double)m * k / 1e9; printf( "idamax diff %.2g\n", error ); printf( "\n" ); printf( "========== Level 2 BLAS ==========\n" ); // ----- test DGEMV // c = alpha*A*b + beta*c, with A m*n; b,c m or n-vectors // try no-trans/trans for( int ia = 0; ia < 3; ++ia ) { magma_dsetmatrix( m, n, A, ld, dA, ld ); magma_dsetvector( maxn, B, 1, dB, 1 ); magma_dsetvector( maxn, C, 1, dC1, 1 ); magma_dsetvector( maxn, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_dgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 size = (trans[ia] == 'N' ? m : n); cublasDaxpy( size, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetvector( size, dC2, 1, C2, 1 ); error = lapackf77_dlange( "F", &size, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_DGEMV( m, n ) / 1e9; printf( "dgemv( %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", trans[ia], error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test DSYMV // c = alpha*A*b + beta*c, with A m*m symmetric; b,c m-vectors // try upper/lower for( int iu = 0; iu < 2; ++iu ) { magma_dsetmatrix( m, m, A, ld, dA, ld ); magma_dsetvector( m, B, 1, dB, 1 ); magma_dsetvector( m, C, 1, dC1, 1 ); magma_dsetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_dsymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( m, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_dlange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYMV( m ) / 1e9; printf( "dsymv( %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test DTRSV // solve A*c = c, with A m*m triangular; c m-vector // try upper/lower, no-trans/trans, unit/non-unit diag // Factor A into LU to get well-conditioned triangles, else solve yields garbage. // Still can give garbage if solves aren't consistent with LU factors, // e.g., using unit diag for U, so copy lower triangle to upper triangle. // Also used for trsm later. lapackf77_dlacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld ); lapackf77_dgetrf( &maxn, &maxn, LU, &ld, piv, &info ); for( int j = 0; j < maxn; ++j ) { for( int i = 0; i < j; ++i ) { *LU(i,j) = *LU(j,i); } } for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { magma_dsetmatrix( m, m, LU, ld, dA, ld ); magma_dsetvector( m, C, 1, dC1, 1 ); magma_dsetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_dtrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDtrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( m, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_dlange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_DTRSM( MagmaLeft, m, 1 ) / 1e9; printf( "dtrsv( %c, %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], diag[id], error, gflops/t1, gflops/t2 ); }}} printf( "\n" ); printf( "========== Level 3 BLAS ==========\n" ); // ----- test DGEMM // C = alpha*A*B + beta*C, with A m*k or k*m; B k*n or n*k; C m*n // try combinations of no-trans/trans for( int ia = 0; ia < 3; ++ia ) { for( int ib = 0; ib < 3; ++ib ) { bool nta = (trans[ia] == 'N'); bool ntb = (trans[ib] == 'N'); magma_dsetmatrix( (nta ? m : k), (nta ? m : k), A, ld, dA, ld ); magma_dsetmatrix( (ntb ? k : n), (ntb ? n : k), B, ld, dB, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DGEMM( m, n, k ) / 1e9; printf( "dgemm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", trans[ia], trans[ib], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DSYMM // C = alpha*A*B + beta*C (left) with A m*m symmetric; B,C m*n; or // C = alpha*B*A + beta*C (right) with A n*n symmetric; B,C m*n // try left/right, upper/lower for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { magma_dsetmatrix( m, m, A, ld, dA, ld ); magma_dsetmatrix( m, n, B, ld, dB, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dsymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYMM( side[is], m, n ) / 1e9; printf( "dsymm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", side[is], uplo[iu], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DSYRK // C = alpha*A*A^H + beta*C (no-trans) with A m*k and C m*m symmetric; or // C = alpha*A^H*A + beta*C (trans) with A k*m and C m*m symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { magma_dsetmatrix( n, k, A, ld, dA, ld ); magma_dsetmatrix( n, n, C, ld, dC1, ld ); magma_dsetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dsyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYRK( k, n ) / 1e9; printf( "dsyrk( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DSYR2K // C = alpha*A*B^H + ^alpha*B*A^H + beta*C (no-trans) with A,B n*k; C n*n symmetric; or // C = alpha*A^H*B + ^alpha*B^H*A + beta*C (trans) with A,B k*n; C n*n symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { bool nt = (trans[it] == 'N'); magma_dsetmatrix( (nt ? n : k), (nt ? n : k), A, ld, dA, ld ); magma_dsetmatrix( n, n, C, ld, dC1, ld ); magma_dsetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dsyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYR2K( k, n ) / 1e9; printf( "dsyr2k( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DTRMM // C = alpha*A*C (left) with A m*m triangular; C m*n; or // C = alpha*C*A (right) with A n*n triangular; C m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == 'L'); magma_dsetmatrix( (left ? m : n), (left ? m : n), A, ld, dA, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dtrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDtrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DTRMM( side[is], m, n ) / 1e9; printf( "dtrmm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // ----- test DTRSM // solve A*X = alpha*B (left) with A m*m triangular; B m*n; or // solve X*A = alpha*B (right) with A n*n triangular; B m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == 'L'); magma_dsetmatrix( (left ? m : n), (left ? m : n), LU, ld, dA, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dtrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDtrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DTRSM( side[is], m, n ) / 1e9; printf( "dtrsm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // cleanup magma_free_cpu( piv ); magma_free_pinned( A ); magma_free_pinned( B ); magma_free_pinned( C ); magma_free_pinned( C2 ); magma_free_pinned( LU ); magma_free( dA ); magma_free( dB ); magma_free( dC1 ); magma_free( dC2 ); } if ( total_error != 0. ) { printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n", total_error ); } else { printf( "all tests passed\n" ); } TESTING_FINALIZE(); return 0; }
cublasStatus_t cublasXsyrk(cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const double *alpha, const double *A, int lda, const double *beta, double *C, int ldc) { return cublasDsyrk(g_context->cublasHandle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); }
SEXP magma_dgeMatrix_crossprod(SEXP x, SEXP trans) { #ifdef HIPLAR_WITH_MAGMA int tr = asLogical(trans);/* trans=TRUE: tcrossprod(x) */ SEXP val = PROTECT(NEW_OBJECT(MAKE_CLASS("dpoMatrix"))), nms = VECTOR_ELT(GET_SLOT(x, Matrix_DimNamesSym), tr ? 0 : 1), vDnms = ALLOC_SLOT(val, Matrix_DimNamesSym, VECSXP, 2); int *Dims = INTEGER(GET_SLOT(x, Matrix_DimSym)), *vDims = INTEGER(ALLOC_SLOT(val, Matrix_DimSym, INTSXP, 2)); int k = tr ? Dims[1] : Dims[0], n = tr ? Dims[0] : Dims[1]; double *vx = REAL(ALLOC_SLOT(val, Matrix_xSym, REALSXP, n * n)), one = 1.0, zero = 0.0; double *A = REAL(GET_SLOT(x, Matrix_xSym)); AZERO(vx, n * n); SET_SLOT(val, Matrix_uploSym, mkString("U")); ALLOC_SLOT(val, Matrix_factorSym, VECSXP, 0); vDims[0] = vDims[1] = n; SET_VECTOR_ELT(vDnms, 0, duplicate(nms)); SET_VECTOR_ELT(vDnms, 1, duplicate(nms)); if(n && GPUFlag == 1) { #ifdef HIPLAR_DBG R_ShowMessage("DBG: Performing crossproduct using cublasDsyrk"); #endif cublasStatus retStatus; double *d_A, *d_C; /*retStatus = cublasCreate(&handle); if ( retStatus != CUBLAS_STATUS_SUCCESS ) error(_("CUBLAS initialisation failed")); */ cublasAlloc(n * k, sizeof(double), (void**)&d_A); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasAlloc(n * n, sizeof(double), (void**)&d_C); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasSetVector( n * k , sizeof(double), A, 1, d_A, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer to Device")); /********************************************/ //cublasSetVector( n * n , sizeof(double), vx, 1, d_C, 1); /* Error Checking */ //retStatus = cublasGetError (); //if (retStatus != CUBLAS_STATUS_SUCCESS) // error(_("CUBLAS: Error in Data Transfer to Device")); /********************************************/ cublasDsyrk('U' , tr ? 'N' : 'T', n, k, one, d_A, Dims[0], zero, d_C, n); cublasGetVector( n * n , sizeof(double), d_C, 1, vx, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer from Device")); /********************************************/ cublasFree(d_A); cublasFree(d_C); } else if(n){ #ifdef HIPLAR_DBG R_ShowMessage("DBG: Performing cross prod with dsyrk"); #endif F77_CALL(dsyrk)("U", tr ? "N" : "T", &n, &k, &one, A, Dims, &zero, vx, &n); } SET_SLOT(val, Matrix_factorSym, allocVector(VECSXP, 0)); UNPROTECT(1); return val; #endif return R_NilValue; }
int TEMPLATE2 (CHOLMOD (gpu_lower_potrf)) ( Int nscol2, /* S is nscol2-by-nscol2 */ Int nsrow, /* leading dimension of S */ Int psx, /* S is located at Lx + L_ENTRY*psx */ double *Lx, /* contains S; overwritten with Cholesky factor */ Int *info, /* BLAS info return value */ cholmod_common *Common, cholmod_gpu_pointers *gpu_p ) { double *devPtrA, *devPtrB, *A ; double alpha, beta ; cudaError_t cudaStat ; cublasStatus_t cublasStatus ; Int j, nsrow2, nb, n, gpu_lda, lda, gpu_ldb ; int ilda, ijb, iinfo ; #ifndef NTIMER double tstart ; #endif if (nscol2 * L_ENTRY < CHOLMOD_POTRF_LIMIT) { /* too small for the CUDA BLAS; use the CPU instead */ return (0) ; } #ifndef NTIMER tstart = SuiteSparse_time ( ) ; Common->CHOLMOD_GPU_POTRF_CALLS++ ; #endif nsrow2 = nsrow - nscol2 ; /* ---------------------------------------------------------------------- */ /* heuristic to get the block size depending of the problem size */ /* ---------------------------------------------------------------------- */ nb = 128 ; if (nscol2 > 4096) nb = 256 ; if (nscol2 > 8192) nb = 384 ; n = nscol2 ; gpu_lda = ((nscol2+31)/32)*32 ; lda = nsrow ; A = gpu_p->h_Lx[(Common->ibuffer+CHOLMOD_HOST_SUPERNODE_BUFFERS-1)% CHOLMOD_HOST_SUPERNODE_BUFFERS]; /* ---------------------------------------------------------------------- */ /* determine the GPU leading dimension of B */ /* ---------------------------------------------------------------------- */ gpu_ldb = 0 ; if (nsrow2 > 0) { gpu_ldb = ((nsrow2+31)/32)*32 ; } /* ---------------------------------------------------------------------- */ /* remember where device memory is, to be used by triangular solve later */ /* ---------------------------------------------------------------------- */ devPtrA = gpu_p->d_Lx[0]; devPtrB = gpu_p->d_Lx[1]; /* ---------------------------------------------------------------------- */ /* copy A from device to device */ /* ---------------------------------------------------------------------- */ cudaStat = cudaMemcpy2DAsync ( devPtrA, gpu_lda * L_ENTRY * sizeof (devPtrA[0]), gpu_p->d_A[1], nsrow * L_ENTRY * sizeof (Lx[0]), nscol2 * L_ENTRY * sizeof (devPtrA[0]), nscol2, cudaMemcpyDeviceToDevice, Common->gpuStream[0] ); if ( cudaStat ) { ERROR ( CHOLMOD_GPU_PROBLEM, "GPU memcopy device to device"); } /* ---------------------------------------------------------------------- */ /* copy B in advance, for gpu_triangular_solve */ /* ---------------------------------------------------------------------- */ if (nsrow2 > 0) { cudaStat = cudaMemcpy2DAsync (devPtrB, gpu_ldb * L_ENTRY * sizeof (devPtrB [0]), gpu_p->d_A[1] + L_ENTRY*nscol2, nsrow * L_ENTRY * sizeof (Lx [0]), nsrow2 * L_ENTRY * sizeof (devPtrB [0]), nscol2, cudaMemcpyDeviceToDevice, Common->gpuStream[0]) ; if (cudaStat) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy to device") ; } } /* ------------------------------------------------------------------ */ /* define the dpotrf stream */ /* ------------------------------------------------------------------ */ cublasStatus = cublasSetStream (Common->cublasHandle, Common->gpuStream [0]) ; if (cublasStatus != CUBLAS_STATUS_SUCCESS) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS stream") ; } /* ---------------------------------------------------------------------- */ /* block Cholesky factorization of S */ /* ---------------------------------------------------------------------- */ for (j = 0 ; j < n ; j += nb) { Int jb = nb < (n-j) ? nb : (n-j) ; /* ------------------------------------------------------------------ */ /* do the CUDA BLAS dsyrk */ /* ------------------------------------------------------------------ */ alpha = -1.0 ; beta = 1.0 ; #ifdef REAL cublasStatus = cublasDsyrk (Common->cublasHandle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, jb, j, &alpha, devPtrA + j, gpu_lda, &beta, devPtrA + j + j*gpu_lda, gpu_lda) ; #else cublasStatus = cublasZherk (Common->cublasHandle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, jb, j, &alpha, (cuDoubleComplex*)devPtrA + j, gpu_lda, &beta, (cuDoubleComplex*)devPtrA + j + j*gpu_lda, gpu_lda) ; #endif if (cublasStatus != CUBLAS_STATUS_SUCCESS) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ; } /* ------------------------------------------------------------------ */ cudaStat = cudaEventRecord (Common->cublasEventPotrf [0], Common->gpuStream [0]) ; if (cudaStat) { ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event failure") ; } cudaStat = cudaStreamWaitEvent (Common->gpuStream [1], Common->cublasEventPotrf [0], 0) ; if (cudaStat) { ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event failure") ; } /* ------------------------------------------------------------------ */ /* copy back the jb columns on two different streams */ /* ------------------------------------------------------------------ */ cudaStat = cudaMemcpy2DAsync (A + L_ENTRY*(j + j*lda), lda * L_ENTRY * sizeof (double), devPtrA + L_ENTRY*(j + j*gpu_lda), gpu_lda * L_ENTRY * sizeof (double), L_ENTRY * sizeof (double)*jb, jb, cudaMemcpyDeviceToHost, Common->gpuStream [1]) ; if (cudaStat) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy from device") ; } /* ------------------------------------------------------------------ */ /* do the CUDA BLAS dgemm */ /* ------------------------------------------------------------------ */ if ((j+jb) < n) { #ifdef REAL alpha = -1.0 ; beta = 1.0 ; cublasStatus = cublasDgemm (Common->cublasHandle, CUBLAS_OP_N, CUBLAS_OP_T, (n-j-jb), jb, j, &alpha, devPtrA + (j+jb), gpu_lda, devPtrA + (j) , gpu_lda, &beta, devPtrA + (j+jb + j*gpu_lda), gpu_lda) ; #else cuDoubleComplex calpha = {-1.0,0.0} ; cuDoubleComplex cbeta = { 1.0,0.0} ; cublasStatus = cublasZgemm (Common->cublasHandle, CUBLAS_OP_N, CUBLAS_OP_C, (n-j-jb), jb, j, &calpha, (cuDoubleComplex*)devPtrA + (j+jb), gpu_lda, (cuDoubleComplex*)devPtrA + (j), gpu_lda, &cbeta, (cuDoubleComplex*)devPtrA + (j+jb + j*gpu_lda), gpu_lda ) ; #endif if (cublasStatus != CUBLAS_STATUS_SUCCESS) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ; } } cudaStat = cudaStreamSynchronize (Common->gpuStream [1]) ; if (cudaStat) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy to device") ; } /* ------------------------------------------------------------------ */ /* compute the Cholesky factorization of the jbxjb block on the CPU */ /* ------------------------------------------------------------------ */ ilda = (int) lda ; ijb = jb ; #ifdef REAL LAPACK_DPOTRF ("L", &ijb, A + L_ENTRY * (j + j*lda), &ilda, &iinfo) ; #else LAPACK_ZPOTRF ("L", &ijb, A + L_ENTRY * (j + j*lda), &ilda, &iinfo) ; #endif *info = iinfo ; if (*info != 0) { *info = *info + j ; break ; } /* ------------------------------------------------------------------ */ /* copy the result back to the GPU */ /* ------------------------------------------------------------------ */ cudaStat = cudaMemcpy2DAsync (devPtrA + L_ENTRY*(j + j*gpu_lda), gpu_lda * L_ENTRY * sizeof (double), A + L_ENTRY * (j + j*lda), lda * L_ENTRY * sizeof (double), L_ENTRY * sizeof (double) * jb, jb, cudaMemcpyHostToDevice, Common->gpuStream [0]) ; if (cudaStat) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy to device") ; } /* ------------------------------------------------------------------ */ /* do the CUDA BLAS dtrsm */ /* ------------------------------------------------------------------ */ if ((j+jb) < n) { #ifdef REAL alpha = 1.0 ; cublasStatus = cublasDtrsm (Common->cublasHandle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, (n-j-jb), jb, &alpha, devPtrA + (j + j*gpu_lda), gpu_lda, devPtrA + (j+jb + j*gpu_lda), gpu_lda) ; #else cuDoubleComplex calpha = {1.0,0.0}; cublasStatus = cublasZtrsm (Common->cublasHandle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_C, CUBLAS_DIAG_NON_UNIT, (n-j-jb), jb, &calpha, (cuDoubleComplex *)devPtrA + (j + j*gpu_lda), gpu_lda, (cuDoubleComplex *)devPtrA + (j+jb + j*gpu_lda), gpu_lda) ; #endif if (cublasStatus != CUBLAS_STATUS_SUCCESS) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ; } /* -------------------------------------------------------------- */ /* Copy factored column back to host. */ /* -------------------------------------------------------------- */ cudaStat = cudaEventRecord (Common->cublasEventPotrf[2], Common->gpuStream[0]) ; if (cudaStat) { ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event failure") ; } cudaStat = cudaStreamWaitEvent (Common->gpuStream[1], Common->cublasEventPotrf[2], 0) ; if (cudaStat) { ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event failure") ; } cudaStat = cudaMemcpy2DAsync (A + L_ENTRY*(j + jb + j * lda), lda * L_ENTRY * sizeof (double), devPtrA + L_ENTRY* (j + jb + j * gpu_lda), gpu_lda * L_ENTRY * sizeof (double), L_ENTRY * sizeof (double)* (n - j - jb), jb, cudaMemcpyDeviceToHost, Common->gpuStream[1]) ; if (cudaStat) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy to device") ; } } } #ifndef NTIMER Common->CHOLMOD_GPU_POTRF_TIME += SuiteSparse_time ( ) - tstart ; #endif return (1) ; }
int TEMPLATE2 (CHOLMOD (gpu_updateC)) ( Int ndrow1, /* C is ndrow2-by-ndrow2 */ Int ndrow2, Int ndrow, /* leading dimension of Lx */ Int ndcol, /* L1 is ndrow1-by-ndcol */ Int nsrow, Int pdx1, /* L1 starts at Lx + L_ENTRY*pdx1 */ /* L2 starts at Lx + L_ENTRY*(pdx1 + ndrow1) */ Int pdi1, double *Lx, double *C, cholmod_common *Common, cholmod_gpu_pointers *gpu_p ) { double *devPtrLx, *devPtrC ; double alpha, beta ; cublasStatus_t cublasStatus ; cudaError_t cudaStat [2] ; Int ndrow3 ; int icol, irow; int iHostBuff, iDevBuff ; #ifndef NTIMER double tstart = 0; #endif if ((ndrow2*L_ENTRY < CHOLMOD_ND_ROW_LIMIT) || (ndcol*L_ENTRY < CHOLMOD_ND_COL_LIMIT)) { /* too small for the CUDA BLAS; use the CPU instead */ return (0) ; } ndrow3 = ndrow2 - ndrow1 ; #ifndef NTIMER Common->syrkStart = SuiteSparse_time ( ) ; Common->CHOLMOD_GPU_SYRK_CALLS++ ; #endif /* ---------------------------------------------------------------------- */ /* allocate workspace on the GPU */ /* ---------------------------------------------------------------------- */ iHostBuff = (Common->ibuffer)%CHOLMOD_HOST_SUPERNODE_BUFFERS; iDevBuff = (Common->ibuffer)%CHOLMOD_DEVICE_STREAMS; /* cycle the device Lx buffer, d_Lx, through CHOLMOD_DEVICE_STREAMS, usually 2, so we can overlap the copy of this descendent supernode with the compute of the previous descendant supernode */ devPtrLx = (double *)(gpu_p->d_Lx[iDevBuff]); /* very little overlap between kernels for difference descendant supernodes (since we enforce the supernodes must be large enough to fill the device) so we only need one C buffer */ devPtrC = (double *)(gpu_p->d_C); /* ---------------------------------------------------------------------- */ /* copy Lx to the GPU */ /* ---------------------------------------------------------------------- */ /* copy host data to pinned buffer first for better H2D bandwidth */ #pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS) if (ndcol > 32) for ( icol=0; icol<ndcol; icol++ ) { for ( irow=0; irow<ndrow2*L_ENTRY; irow++ ) { gpu_p->h_Lx[iHostBuff][icol*ndrow2*L_ENTRY+irow] = Lx[pdx1*L_ENTRY+icol*ndrow*L_ENTRY + irow]; } } cudaStat[0] = cudaMemcpyAsync ( devPtrLx, gpu_p->h_Lx[iHostBuff], ndrow2*ndcol*L_ENTRY*sizeof(devPtrLx[0]), cudaMemcpyHostToDevice, Common->gpuStream[iDevBuff] ); if ( cudaStat[0] ) { CHOLMOD_GPU_PRINTF ((" ERROR cudaMemcpyAsync = %d \n", cudaStat[0])); return (0); } /* make the current stream wait for kernels in previous streams */ cudaStreamWaitEvent ( Common->gpuStream[iDevBuff], Common->updateCKernelsComplete, 0 ) ; /* ---------------------------------------------------------------------- */ /* create the relative map for this descendant supernode */ /* ---------------------------------------------------------------------- */ createRelativeMapOnDevice ( (Int *)(gpu_p->d_Map), (Int *)(gpu_p->d_Ls), (Int *)(gpu_p->d_RelativeMap), pdi1, ndrow2, &(Common->gpuStream[iDevBuff]) ); /* ---------------------------------------------------------------------- */ /* do the CUDA SYRK */ /* ---------------------------------------------------------------------- */ cublasStatus = cublasSetStream (Common->cublasHandle, Common->gpuStream[iDevBuff]) ; if (cublasStatus != CUBLAS_STATUS_SUCCESS) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS stream") ; } alpha = 1.0 ; beta = 0.0 ; #ifdef REAL cublasStatus = cublasDsyrk (Common->cublasHandle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, (int) ndrow1, (int) ndcol, /* N, K: L1 is ndrow1-by-ndcol */ &alpha, /* ALPHA: 1 */ devPtrLx, ndrow2, /* A, LDA: L1, ndrow2 */ &beta, /* BETA: 0 */ devPtrC, ndrow2) ; /* C, LDC: C1 */ #else cublasStatus = cublasZherk (Common->cublasHandle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, (int) ndrow1, (int) ndcol, /* N, K: L1 is ndrow1-by-ndcol*/ &alpha, /* ALPHA: 1 */ (const cuDoubleComplex *) devPtrLx, ndrow2, /* A, LDA: L1, ndrow2 */ &beta, /* BETA: 0 */ (cuDoubleComplex *) devPtrC, ndrow2) ; /* C, LDC: C1 */ #endif if (cublasStatus != CUBLAS_STATUS_SUCCESS) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ; } #ifndef NTIMER Common->CHOLMOD_GPU_SYRK_TIME += SuiteSparse_time() - Common->syrkStart; #endif /* ---------------------------------------------------------------------- */ /* compute remaining (ndrow2-ndrow1)-by-ndrow1 block of C, C2 = L2*L1' */ /* ---------------------------------------------------------------------- */ #ifndef NTIMER Common->CHOLMOD_GPU_GEMM_CALLS++ ; tstart = SuiteSparse_time(); #endif if (ndrow3 > 0) { #ifndef REAL cuDoubleComplex calpha = {1.0,0.0} ; cuDoubleComplex cbeta = {0.0,0.0} ; #endif /* ------------------------------------------------------------------ */ /* do the CUDA BLAS dgemm */ /* ------------------------------------------------------------------ */ #ifdef REAL alpha = 1.0 ; beta = 0.0 ; cublasStatus = cublasDgemm (Common->cublasHandle, CUBLAS_OP_N, CUBLAS_OP_T, ndrow3, ndrow1, ndcol, /* M, N, K */ &alpha, /* ALPHA: 1 */ devPtrLx + L_ENTRY*(ndrow1), /* A, LDA: L2*/ ndrow2, /* ndrow */ devPtrLx, /* B, LDB: L1 */ ndrow2, /* ndrow */ &beta, /* BETA: 0 */ devPtrC + L_ENTRY*ndrow1, /* C, LDC: C2 */ ndrow2) ; #else cublasStatus = cublasZgemm (Common->cublasHandle, CUBLAS_OP_N, CUBLAS_OP_C, ndrow3, ndrow1, ndcol, /* M, N, K */ &calpha, /* ALPHA: 1 */ (const cuDoubleComplex*) devPtrLx + ndrow1, ndrow2, /* ndrow */ (const cuDoubleComplex *) devPtrLx, ndrow2, /* ndrow */ &cbeta, /* BETA: 0 */ (cuDoubleComplex *)devPtrC + ndrow1, ndrow2) ; #endif if (cublasStatus != CUBLAS_STATUS_SUCCESS) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ; } } #ifndef NTIMER Common->CHOLMOD_GPU_GEMM_TIME += SuiteSparse_time() - tstart; #endif /* ------------------------------------------------------------------ */ /* Assemble the update C on the device using the d_RelativeMap */ /* ------------------------------------------------------------------ */ #ifdef REAL addUpdateOnDevice ( gpu_p->d_A[0], devPtrC, gpu_p->d_RelativeMap, ndrow1, ndrow2, nsrow, &(Common->gpuStream[iDevBuff]) ); #else addComplexUpdateOnDevice ( gpu_p->d_A[0], devPtrC, gpu_p->d_RelativeMap, ndrow1, ndrow2, nsrow, &(Common->gpuStream[iDevBuff]) ); #endif /* Record an event indicating that kernels for this descendant are complete */ cudaEventRecord ( Common->updateCKernelsComplete, Common->gpuStream[iDevBuff]); cudaEventRecord ( Common->updateCBuffersFree[iHostBuff], Common->gpuStream[iDevBuff]); return (1) ; }
FLA_Error FLA_Syrk_external_gpu( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj beta, FLA_Obj C, void* C_gpu ) { FLA_Datatype datatype; int k_A; int m_A, n_A; int m_C; int ldim_A; int ldim_C; char blas_uplo; char blas_trans; if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING ) FLA_Syrk_check( uplo, trans, alpha, A, beta, C ); if ( FLA_Obj_has_zero_dim( C ) ) return FLA_SUCCESS; datatype = FLA_Obj_datatype( A ); m_A = FLA_Obj_length( A ); n_A = FLA_Obj_width( A ); ldim_A = FLA_Obj_length( A ); m_C = FLA_Obj_length( C ); ldim_C = FLA_Obj_length( C ); if ( trans == FLA_NO_TRANSPOSE ) k_A = n_A; else k_A = m_A; FLA_Param_map_flame_to_netlib_uplo( uplo, &blas_uplo ); FLA_Param_map_flame_to_netlib_trans( trans, &blas_trans ); switch( datatype ){ case FLA_FLOAT: { float *buff_alpha = ( float * ) FLA_FLOAT_PTR( alpha ); float *buff_beta = ( float * ) FLA_FLOAT_PTR( beta ); cublasSsyrk( blas_uplo, blas_trans, m_C, k_A, *buff_alpha, ( float * ) A_gpu, ldim_A, *buff_beta, ( float * ) C_gpu, ldim_C ); break; } case FLA_DOUBLE: { double *buff_alpha = ( double * ) FLA_DOUBLE_PTR( alpha ); double *buff_beta = ( double * ) FLA_DOUBLE_PTR( beta ); cublasDsyrk( blas_uplo, blas_trans, m_C, k_A, *buff_alpha, ( double * ) A_gpu, ldim_A, *buff_beta, ( double * ) C_gpu, ldim_C ); break; } case FLA_COMPLEX: { cuComplex *buff_alpha = ( cuComplex * ) FLA_COMPLEX_PTR( alpha ); cuComplex *buff_beta = ( cuComplex * ) FLA_COMPLEX_PTR( beta ); cublasCsyrk( blas_uplo, blas_trans, m_C, k_A, *buff_alpha, ( cuComplex * ) A_gpu, ldim_A, *buff_beta, ( cuComplex * ) C_gpu, ldim_C ); break; } case FLA_DOUBLE_COMPLEX: { cuDoubleComplex *buff_alpha = ( cuDoubleComplex * ) FLA_DOUBLE_COMPLEX_PTR( alpha ); cuDoubleComplex *buff_beta = ( cuDoubleComplex * ) FLA_DOUBLE_COMPLEX_PTR( beta ); cublasZsyrk( blas_uplo, blas_trans, m_C, k_A, *buff_alpha, ( cuDoubleComplex * ) A_gpu, ldim_A, *buff_beta, ( cuDoubleComplex * ) C_gpu, ldim_C ); break; } } return FLA_SUCCESS; }