/*
 * GPU pipeline over OpenACC-managed device arrays A, B, C, D, driven by cuBLAS:
 *   1. C     = A^T * B^T                      (cublasDgemm, both inputs transposed,
 *                                              N x N column-major, leading dim N)
 *   2. *r    = dot(C, B); *r = -*r            (cublasDdot, then host-side negation)
 *   3. C     = (*r) * B + C                   (cublasDaxpy)
 *   4. *nrmC = ||C||_2                        (cublasDnrm2)
 *   5. D     = C                              (cublasDcopy)
 *   6. *nrmC = 1 / *nrmC; D = (*nrmC) * D     (cublasDscal)  => D = C / ||C||_2
 *
 * N is the matrix dimension; N2 is the vector length used by the level-1 calls
 * (presumably N2 == N*N -- TODO confirm at the call site).
 *
 * r and nrmC are HOST pointers: they are deliberately absent from the OpenACC
 * present/use_device clauses, and cuBLAS defaults to CUBLAS_POINTER_MODE_HOST,
 * under which the scalar-result calls (Ddot, Dnrm2) block until the value is
 * written -- so the host-side updates of *r and *nrmC below are ordered safely.
 *
 * NOTE(review): no cublasStatus_t return codes are checked, and a fresh handle
 * is created and destroyed on every invocation -- fine for a demo, wasteful if
 * this function is called in a loop.
 */
void gpu_cublas1(double *A, double *B, double *C, double *D, double *r, double *nrmC, int N, int N2) {
    #pragma acc data present(A, B, C, D)
    {
        /* host_data exposes the device addresses of A..D to the cuBLAS calls */
        #pragma acc host_data use_device(A, B, C, D)
        {
            cublasHandle_t handle;
            cublasCreate(&handle);
            const double alpha = 1.0;
            const double beta = 0.0;
            /* C = A^T * B^T */
            cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, N, N, N, &alpha, A, N, B, N, &beta, C, N);
            printf(" gpu gemm success \n");
            /* *r = dot(C, B); blocking in host pointer mode */
            cublasDdot(handle, N2, C, 1, B, 1, r);
            printf(" gpu dot success \n");
            *r = -1.0 * *r;               /* negate on the host */
            /* C = (*r)*B + C */
            cublasDaxpy(handle, N2, r, B, 1, C, 1);
            printf(" gpu axpy success \n");
            /* *nrmC = ||C||_2 */
            cublasDnrm2(handle, N2, C, 1, nrmC);
            printf(" gpu nrm2 success \n");
            /* D = C */
            cublasDcopy(handle, N2, C, 1, D, 1);
            printf(" gpu copy success \n");
            *nrmC = 1.0 / *nrmC;          /* reciprocal on the host */
            /* D = D / ||C||_2 */
            cublasDscal(handle, N2, nrmC, D, 1);
            printf(" gpu scal success \n");
            cublasDestroy(handle);
            printf(" gpu destroy success \n");
        }
    }
}
// dy := alpha*dx + dy (double precision).
// Thin MAGMA-style shim over the legacy (v1) cuBLAS axpy, which operates on
// the implicit global cuBLAS context -- no handle, no status return.
void magma_daxpy(
    magma_int_t n, double alpha,
    const double *dx, magma_int_t incx,
    double       *dy, magma_int_t incy )
{
    cublasDaxpy( n, alpha, dx, incx, dy, incy );
}
/*
 * GPU error term for a target/output pair, accumulated in place in `dedy`
 * via legacy (v1) cuBLAS.  Requires target == dedy so that the axpy below
 * leaves dedy = target - output.
 */
static double e_cuda(double* target, double* output, size_t count, double* dedy) {
    /* this is to sort of keep a uniform API w. the matrix */
    assert(target == dedy);
    /* dedy := dedy - output  (== target - output, since target aliases dedy) */
    cublasDaxpy(count, -1, output, 1, dedy, 1);
    /* NOTE(review): this returns 0.5 * ||target - output||_2 (the 2-norm
     * itself).  A conventional half-sum-of-squares loss would be
     * 0.5 * nrm2^2 -- confirm which is intended before changing. */
    return 0.5 * cublasDnrm2(count, dedy, 1);
}
/*
 * R-callable wrapper: y := alpha*x + y via legacy (v1) cuBLAS daxpy.
 *
 * ralpha        scalar multiplier (coerced with asReal)
 * rx, ry        vectors; unpackVector yields their lengths and data pointers
 *               (presumably device pointers suitable for cuBLAS -- TODO
 *               confirm against unpackVector's definition)
 * rincx, rincy  strides (coerced with asInteger)
 *
 * Operates on n = min(length(x), length(y)) elements so mismatched lengths
 * cannot overrun the shorter vector.  checkCublasError raises an R error if
 * the cuBLAS call failed.
 */
void d_axpy(SEXP ralpha, SEXP rx, SEXP rincx, SEXP ry, SEXP rincy) {
    int nx, ny, n, incx = asInteger(rincx), incy = asInteger(rincy);
    double alpha = asReal(ralpha), * x, * y;
    unpackVector(rx, &nx, &x);
    unpackVector(ry, &ny, &y);
    n = imin2(nx, ny);  /* clamp to the shorter vector */
    cublasDaxpy(n, alpha, x, incx, y, incy);
    checkCublasError("d_axpy");
}
static vl::Error axpy(vl::Context & context, ptrdiff_t n, type alpha, type const *x, ptrdiff_t incx, type *y, ptrdiff_t incy) { cublasHandle_t handle ; cublasStatus_t status ; status = context.getCudaHelper().getCublasHandle(&handle) ; if (status != CUBLAS_STATUS_SUCCESS) goto done ; status = cublasDaxpy(handle, (int)n, &alpha, x, (int)incx, y, (int)incy) ; done: return context.setError (context.getCudaHelper().catchCublasError(status, "cublasDaxpy"), __func__) ; }
/* y := (*alpha)*x + y on the device via the context's cuBLAS handle.
 * Silently returns if the cube context is already in an error state;
 * otherwise records the cuBLAS status on the context. */
void
cube_blas_d_axpy (cube_t *ctx,
                  int n, const double *alpha,
                  const double *x, int incx,
                  double *y, int incy)
{
  cublasStatus_t status;

  if (! cube_context_check (ctx))
    return;

  status = cublasDaxpy (ctx->h_blas, n, alpha, x, incx, y, incy);

  cube_blas_check (ctx, status);
}
void mat_add_mat(const double *x, double *y, double scalar, int n){ cudaError_t cudaStat ; // cudaMalloc status cublasStatus_t stat ; // CUBLAS functions status cublasHandle_t handle ; // CUBLAS context // on the device double *d_x; // d_x - x on the device double *d_y; // d_y - y on the device cudaStat = cudaMalloc (( void **)& d_x, n*sizeof(*x)); // device // memory alloc for x cudaStat = cudaMalloc (( void **)& d_y, n*sizeof(*y)); // device // memory alloc for y stat = cublasCreate (& handle ); // initialize CUBLAS context stat = cublasSetVector (n, sizeof (*x), x ,1 ,d_x, 1); // cp x- >d_x stat = cublasSetVector (n, sizeof (*y), y ,1 ,d_y, 1); // cp y- >d_y stat=cublasDaxpy(handle,n,&scalar,d_x,1,d_y,1); cudaFree (d_x ); // free device memory cudaFree (d_y ); // free device memory cublasDestroy ( handle ); // destroy CUBLAS context }
int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, t1, t2; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; const char trans[] = { 'N', 'C', 'T' }; const char uplo[] = { 'L', 'U' }; const char diag[] = { 'U', 'N' }; const char side[] = { 'L', 'R' }; double *A, *B, *C, *C2, *LU; double *dA, *dB, *dC1, *dC2; double alpha = MAGMA_D_MAKE( 0.5, 0.1 ); double beta = MAGMA_D_MAKE( 0.7, 0.2 ); double dalpha = 0.6; double dbeta = 0.8; double work[1], error, total_error; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t m, n, k, size, maxn, ld, info; magma_int_t *piv; magma_err_t err; magma_opts opts; parse_opts( argc, argv, &opts ); printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" ); total_error = 0.; for( int i = 0; i < opts.ntest; ++i ) { m = opts.msize[i]; n = opts.nsize[i]; k = opts.ksize[i]; printf("=========================================================================\n"); printf( "M %d, N %d, K %d\n", (int) m, (int) n, (int) k ); // allocate matrices // over-allocate so they can be any combination of {m,n,k} x {m,n,k}. 
maxn = max( max( m, n ), k ); ld = maxn; size = maxn*maxn; err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) ); assert( err == 0 ); err = magma_dmalloc_pinned( &A, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &B, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &C, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &C2, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &LU, size ); assert( err == 0 ); err = magma_dmalloc( &dA, size ); assert( err == 0 ); err = magma_dmalloc( &dB, size ); assert( err == 0 ); err = magma_dmalloc( &dC1, size ); assert( err == 0 ); err = magma_dmalloc( &dC2, size ); assert( err == 0 ); // initialize matrices size = maxn*maxn; lapackf77_dlarnv( &ione, ISEED, &size, A ); lapackf77_dlarnv( &ione, ISEED, &size, B ); lapackf77_dlarnv( &ione, ISEED, &size, C ); printf( "========== Level 1 BLAS ==========\n" ); // ----- test DSWAP // swap 2nd and 3rd columns of dA, then copy to C2 and compare with A assert( n >= 4 ); magma_dsetmatrix( m, n, A, ld, dA, ld ); magma_dsetmatrix( m, n, A, ld, dB, ld ); magma_dswap( m, dA(0,1), 1, dA(0,2), 1 ); magma_dswap( m, dB(0,1), 1, dB(0,2), 1 ); // check results, storing diff between magma and cuda calls in C2 cublasDaxpy( ld*n, c_neg_one, dA, 1, dB, 1 ); magma_dgetmatrix( m, n, dB, ld, C2, ld ); error = lapackf77_dlange( "F", &m, &k, C2, &ld, work ); total_error += error; printf( "dswap diff %.2g\n", error ); // ----- test IDAMAX // get argmax of column of A magma_dsetmatrix( m, k, A, ld, dA, ld ); error = 0; for( int j = 0; j < k; ++j ) { magma_int_t i1 = magma_idamax( m, dA(0,j), 1 ); magma_int_t i2 = cublasIdamax( m, dA(0,j), 1 ); assert( i1 == i2 ); error += abs( i1 - i2 ); } total_error += error; gflops = (double)m * k / 1e9; printf( "idamax diff %.2g\n", error ); printf( "\n" ); printf( "========== Level 2 BLAS ==========\n" ); // ----- test DGEMV // c = alpha*A*b + beta*c, with A m*n; b,c m or n-vectors // try no-trans/trans for( int ia = 0; ia < 3; ++ia ) { 
magma_dsetmatrix( m, n, A, ld, dA, ld ); magma_dsetvector( maxn, B, 1, dB, 1 ); magma_dsetvector( maxn, C, 1, dC1, 1 ); magma_dsetvector( maxn, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_dgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 size = (trans[ia] == 'N' ? m : n); cublasDaxpy( size, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetvector( size, dC2, 1, C2, 1 ); error = lapackf77_dlange( "F", &size, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_DGEMV( m, n ) / 1e9; printf( "dgemv( %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", trans[ia], error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test DSYMV // c = alpha*A*b + beta*c, with A m*m symmetric; b,c m-vectors // try upper/lower for( int iu = 0; iu < 2; ++iu ) { magma_dsetmatrix( m, m, A, ld, dA, ld ); magma_dsetvector( m, B, 1, dB, 1 ); magma_dsetvector( m, C, 1, dC1, 1 ); magma_dsetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_dsymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( m, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_dlange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYMV( m ) / 1e9; printf( "dsymv( %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test DTRSV // solve A*c = c, with A m*m triangular; c m-vector // try upper/lower, no-trans/trans, unit/non-unit diag // Factor A into LU to get well-conditioned triangles, else solve yields garbage. 
// Still can give garbage if solves aren't consistent with LU factors, // e.g., using unit diag for U, so copy lower triangle to upper triangle. // Also used for trsm later. lapackf77_dlacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld ); lapackf77_dgetrf( &maxn, &maxn, LU, &ld, piv, &info ); for( int j = 0; j < maxn; ++j ) { for( int i = 0; i < j; ++i ) { *LU(i,j) = *LU(j,i); } } for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { magma_dsetmatrix( m, m, LU, ld, dA, ld ); magma_dsetvector( m, C, 1, dC1, 1 ); magma_dsetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_dtrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDtrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( m, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_dlange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_DTRSM( MagmaLeft, m, 1 ) / 1e9; printf( "dtrsv( %c, %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], diag[id], error, gflops/t1, gflops/t2 ); }}} printf( "\n" ); printf( "========== Level 3 BLAS ==========\n" ); // ----- test DGEMM // C = alpha*A*B + beta*C, with A m*k or k*m; B k*n or n*k; C m*n // try combinations of no-trans/trans for( int ia = 0; ia < 3; ++ia ) { for( int ib = 0; ib < 3; ++ib ) { bool nta = (trans[ia] == 'N'); bool ntb = (trans[ib] == 'N'); magma_dsetmatrix( (nta ? m : k), (nta ? m : k), A, ld, dA, ld ); magma_dsetmatrix( (ntb ? k : n), (ntb ? 
n : k), B, ld, dB, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DGEMM( m, n, k ) / 1e9; printf( "dgemm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", trans[ia], trans[ib], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DSYMM // C = alpha*A*B + beta*C (left) with A m*m symmetric; B,C m*n; or // C = alpha*B*A + beta*C (right) with A n*n symmetric; B,C m*n // try left/right, upper/lower for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { magma_dsetmatrix( m, m, A, ld, dA, ld ); magma_dsetmatrix( m, n, B, ld, dB, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dsymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYMM( side[is], m, n ) / 1e9; printf( "dsymm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", side[is], uplo[iu], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DSYRK // C = alpha*A*A^H + beta*C (no-trans) with A m*k and C m*m 
symmetric; or // C = alpha*A^H*A + beta*C (trans) with A k*m and C m*m symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { magma_dsetmatrix( n, k, A, ld, dA, ld ); magma_dsetmatrix( n, n, C, ld, dC1, ld ); magma_dsetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dsyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYRK( k, n ) / 1e9; printf( "dsyrk( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DSYR2K // C = alpha*A*B^H + ^alpha*B*A^H + beta*C (no-trans) with A,B n*k; C n*n symmetric; or // C = alpha*A^H*B + ^alpha*B^H*A + beta*C (trans) with A,B k*n; C n*n symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { bool nt = (trans[it] == 'N'); magma_dsetmatrix( (nt ? n : k), (nt ? 
n : k), A, ld, dA, ld ); magma_dsetmatrix( n, n, C, ld, dC1, ld ); magma_dsetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dsyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYR2K( k, n ) / 1e9; printf( "dsyr2k( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DTRMM // C = alpha*A*C (left) with A m*m triangular; C m*n; or // C = alpha*C*A (right) with A n*n triangular; C m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == 'L'); magma_dsetmatrix( (left ? m : n), (left ? 
m : n), A, ld, dA, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dtrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDtrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DTRMM( side[is], m, n ) / 1e9; printf( "dtrmm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // ----- test DTRSM // solve A*X = alpha*B (left) with A m*m triangular; B m*n; or // solve X*A = alpha*B (right) with A n*n triangular; B m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == 'L'); magma_dsetmatrix( (left ? m : n), (left ? 
m : n), LU, ld, dA, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dtrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDtrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DTRSM( side[is], m, n ) / 1e9; printf( "dtrsm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // cleanup magma_free_cpu( piv ); magma_free_pinned( A ); magma_free_pinned( B ); magma_free_pinned( C ); magma_free_pinned( C2 ); magma_free_pinned( LU ); magma_free( dA ); magma_free( dB ); magma_free( dC1 ); magma_free( dC2 ); } if ( total_error != 0. ) { printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n", total_error ); } else { printf( "all tests passed\n" ); } TESTING_FINALIZE(); return 0; }
// Double-precision GPU axpy: Y := alpha*X + Y over N elements, on Caffe's
// shared cuBLAS handle.  CUBLAS_CHECK aborts on a non-success status.
// NOTE(review): this reads as an explicit specialization of a
// caffe_gpu_axpy<T> template; the "template <>" introducer is not visible in
// this chunk -- confirm it appears immediately before this definition in the
// full file.
void caffe_gpu_axpy<double>(const int N, const double alpha, const double* X,
    double* Y) {
  CUBLAS_CHECK(cublasDaxpy(Caffe::cublas_handle(), N, &alpha, X, 1, Y, 1));
}
// Reads a circuit netlist (argv[1]), builds the MNA (modified nodal analysis)
// sparse system A*x = b in CSR form, and solves it with an unpreconditioned
// conjugate-gradient loop on the GPU (cuSPARSE csrmv for the mat-vec,
// cuBLAS dot/axpy/scal/copy for the vector work).  Reports iteration count,
// wall time, and the max row residual, then exits 0 if CG converged within
// max_iter and 1 otherwise.
int main(int argc, char* argv[])
{
    const int bufsize = 512;
    char buffer[bufsize];   // only used by the disabled culaSparse tail below
    int m,n,S;              // NOTE(review): S is never used
    double time_st,time_end,time_avg;   // used only by commented-out timing code
    //omp_set_num_threads(2);
    // printf("\n-----------------\nnumber of threads fired = %d\n-----------------\n",(int)omp_get_num_threads());
    if(argc!=2)
    {
        cout<<"Insufficient arguments"<<endl;
        return 1;
    }

    graph G;

    cerr<<"Start reading ";
    // time_st=dsecnd();
    G.create_graph(argv[1]);
    // time_end=dsecnd();
    // time_avg = (time_end-time_st);
    // cout<<"Success "<<endl;
    // cerr<<"Reading time "<<time_avg<<endl;

    cerr<<"Constructing Matrices ";
    // time_st=dsecnd();
    G.construct_MNA();
    // time_end=dsecnd();
    // time_avg = (time_end-time_st);
    // cerr<<"Done "<<time_avg<<endl;
    // G.construct_sparse_MNA();

    // system dimension: m node voltages (ground excluded) + n branch currents
    m=G.node_array.size()-1;
    n=G.voltage_edge_id.size();
    cout<<endl;
    cout<<"MATRIX STAT:"<<endl;
    cout<<"Nonzero elements: "<<G.nonzero<<endl;
    cout<<"Number of Rows: "<<m+n<<endl;
    printf("\n Nonzero = %ld", G.nonzero);
    printf("\n Rows = %d", m+n);
    cout<<"MAT val: "<<endl;
    int i,j;
    // G.Mat_val[0] +=100;
    /* (debug, disabled) dumps of the CSR arrays Mat_val / columns / rowIndex,
     * both flat and row-by-row, plus a filter printing only entries > 10 --
     * condensed from the original commented-out blocks. */
    printf("\n Fine till here");
    printf("\n");
    // int* rowmIndex=(int*)calloc(m+1,sizeof(int));
    printf("\n Fine till here");
    printf("\n");
    //int rowmIndex[5]={1,2,3,4,5};
    /* (debug, disabled) copy of G.rowIndex into rowmIndex with printout. */

    cerr<<"Solving Equations "<<endl;

    // CG state: r1 = r.r (current), r0 = previous, b = r1/r0, a = step length
    double r1, b, alpha, alpham1, beta, r0, a, na;
    const double tol = 0.1;          // NOTE(review): compared against r1 = ||r||^2,
                                     // not ||r|| -- confirm intended tolerance
    const int max_iter = 1000000;
    int *d_col, *d_row;              // device CSR column indices / row pointers
    double *d_val, *d_x, dot;        // device CSR values, solution vector
    double *d_r, *d_p, *d_Ax;        // residual, search direction, A*p
    int k;                           // iteration counter

    cublasHandle_t cublasHandle = 0;
    cublasStatus_t cublasStatus;
    cublasStatus = cublasCreate(&cublasHandle);
    checkCudaErrors(cublasStatus);

    /* Get handle to the CUSPARSE context */
    cusparseHandle_t cusparseHandle = 0;
    cusparseStatus_t cusparseStatus;
    cusparseStatus = cusparseCreate(&cusparseHandle);
    checkCudaErrors(cusparseStatus);

    cusparseMatDescr_t descr = 0;
    cusparseStatus = cusparseCreateMatDescr(&descr);
    checkCudaErrors(cusparseStatus);
    cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO);

    // device buffers for the CSR matrix and the CG work vectors
    checkCudaErrors(cudaMalloc((void **)&d_col, G.nonzero*sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&d_row, (m+n+1)*sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&d_val, G.nonzero*sizeof(double)));
    checkCudaErrors(cudaMalloc((void **)&d_x, (m+n)*sizeof(double)));
    checkCudaErrors(cudaMalloc((void **)&d_r, (m+n)*sizeof(double)));
    checkCudaErrors(cudaMalloc((void **)&d_p, (m+n)*sizeof(double)));
    checkCudaErrors(cudaMalloc((void **)&d_Ax, (m+n)*sizeof(double)));

    cudaMemcpy(d_col, G.columns, G.nonzero*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_row, G.rowIndex, (m+n+1)*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_val, G.Mat_val, G.nonzero*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_x, G.x, (m+n)*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_r, G.b, (m+n)*sizeof(double), cudaMemcpyHostToDevice);   // r starts as b

    alpha = 1.0;
    alpham1 = -1.0;
    beta = 0.0;
    r0 = 0.;

    printf("\n Data transferred\n");

    cudaEvent_t start,stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    // r = b - A*x0 : compute Ax, then axpy with -1
    cusparseDcsrmv(cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, (m+n), (m+n), G.nonzero, &alpha, descr, d_val, d_row, d_col, d_x, &beta, d_Ax);
    cublasDaxpy(cublasHandle, (m+n), &alpham1, d_Ax, 1, d_r, 1);
    cublasStatus = cublasDdot(cublasHandle, (m+n), d_r, 1, d_r, 1, &r1);   // r1 = r.r

    // standard CG iteration (see the CUDA conjugateGradient sample)
    k = 1;
    while (r1 > tol && k <= max_iter)
    {
        if (k > 1)
        {
            // p = r + (r1/r0)*p
            b = r1 / r0;
            cublasStatus = cublasDscal(cublasHandle, (m+n), &b, d_p, 1);
            cublasStatus = cublasDaxpy(cublasHandle, (m+n), &alpha, d_r, 1, d_p, 1);
        }
        else
        {
            // first iteration: p = r
            cublasStatus = cublasDcopy(cublasHandle, (m+n), d_r, 1, d_p, 1);
        }

        // Ax = A*p;  a = r.r / (p.Ap)
        cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, (m+n), (m+n), G.nonzero, &alpha, descr, d_val, d_row, d_col, d_p, &beta, d_Ax);
        cublasStatus = cublasDdot(cublasHandle, (m+n), d_p, 1, d_Ax, 1, &dot);
        a = r1 / dot;

        // x += a*p;  r -= a*Ax
        cublasStatus = cublasDaxpy(cublasHandle, (m+n), &a, d_p, 1, d_x, 1);
        na = -a;
        cublasStatus = cublasDaxpy(cublasHandle, (m+n), &na, d_Ax, 1, d_r, 1);

        r0 = r1;
        cublasStatus = cublasDdot(cublasHandle, (m+n), d_r, 1, d_r, 1, &r1);
        // cudaThreadSynchronize();
        // printf("iteration = %3d, residual = %e\n", k, sqrt(r1));
        k++;
    }

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);
    printf("Iterations = %3d\tTime : %.6f milli-seconds : \n", k, elapsedTime);

    cudaMemcpy(G.x, d_x, (m+n)*sizeof(double), cudaMemcpyDeviceToHost);
    /* (debug, disabled) element-by-element print of the solution x. */

    // host-side verification: err = max_i | (A*x)_i - b_i |
    float rsum, diff, err = 0.0;
    for (int i = 0; i < (m+n); i++)
    {
        rsum = 0.0;
        for (int j = G.rowIndex[i]; j < G.rowIndex[i+1]; j++)
        {
            rsum += G.Mat_val[j]*G.x[G.columns[j]];
        }
        diff = fabs(rsum - G.b[i]);
        if (diff > err)
        {
            err = diff;
        }
    }

    cusparseDestroy(cusparseHandle);
    cublasDestroy(cublasHandle);
    /* free(I); free(J); free(val); free(x); free(rhs); */
    cudaFree(d_col);
    cudaFree(d_row);
    cudaFree(d_val);
    cudaFree(d_x);
    cudaFree(d_r);
    cudaFree(d_p);
    cudaFree(d_Ax);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice. It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits.
    cudaDeviceReset();
    /* (debug, disabled) element-by-element print of X. */
    printf("Test Summary: Error amount = %f\n", err);
    exit((k <= max_iter) ? 0 : 1);

    /*
     * (disabled) culaSparse-based alternative solver, retained from the
     * original source: created a culaSparse handle/plan, loaded the same CSR
     * data, and tried CG / BiCGSTAB / GMRES on host and CUDA platforms with
     * various preconditioners, then wrote x to "result.txt" and called
     * G.fillup_graph() / G.check_kcl() to verify the solution against KCL.
     * NOTE(review): in the original text this entire tail -- including those
     * apparently intended-live post-processing calls -- is swallowed by an
     * unterminated block comment opened here, and is unreachable anyway
     * because of the unconditional exit() above.  Confirm against the
     * upstream file before reviving any of it.
     */
}
// y := (*alpha)*x + y on the device, issued on the process-wide cuBLAS
// handle held by g_context.  The cuBLAS status is returned to the caller
// unchanged.
cublasStatus_t cublasXaxpy(int n, const double* alpha,
                           const double* x, int incx,
                           double* y, int incy)
{
    return cublasDaxpy(g_context->cublasHandle,
                       n, alpha, x, incx, y, incy);
}
// Circuit-solver driver. Reads a netlist, builds MNA block matrices, and
// solves the saddle-point system
//     [ G   P ] [x1]   [b1]
//     [ P^T 0 ] [x2] = [b2]
// by a Schur-complement approach:
//   1. Cholesky-factor G with CHOLMOD (SuiteSparse long-int interface).
//   2. res1 = P^T * (G^-1 b1) - b2            (cuSPARSE csrmv + cuBLAS axpy)
//   3. midMat = G^-1 P (column-wise CHOLMOD solves), S = P^T * midMat
//                                              (cuSPARSE csrmm)
//   4. Solve S * x2 = res1 with dense LU      (cuSOLVER getrf/getrs)
//   5. x1 = (G^-1 b1) - midMat * x2           (cuBLAS gemv + axpy)
// Usage: <prog> <netlist-file>.  Returns 1 on bad arguments, 0 otherwise.
// NOTE(review): uses legacy cusparseDcsrmv/Dcsrmm, removed in CUDA 11+.
int main(int argc, char* argv[])
{
    const int bufsize = 512;
    char buffer[bufsize];              // scratch text buffer — appears unused here
    int m,n,S;                         // m: #nodes-1, n: #voltage edges; S unused
    double time_st,time_end,time_avg;  // timing slots (dsecnd() calls commented out)
    // (OpenMP thread-count setup left commented out)
    if(argc!=2)
    {
        cout<<"Insufficient arguments"<<endl;
        return 1;
    }
    graph G;
    cerr<<"Start reading ";
    G.create_graph(argv[1]);           // parse the netlist into the graph
    // (dsecnd() read-timing instrumentation commented out)
    cerr<<"Constructing Matrices ";
    G.construct_MNA();                 // full MNA system (Mat_val/rowIndex/columns)
    G.construct_NA();                  // split blocks: G (Gmat) and P (Pmat)
    // (timing + construct_sparse_MNA() commented out)
    m=G.node_array.size()-1;
    n=G.voltage_edge_id.size();
    cout<<endl;
    cout<<"MATRIX STAT:"<<endl;
    cout<<"Nonzero elements: "<<G.nonzero<<endl;
    cout<<"Number of Rows: "<<m+n<<endl;
    cout<<"Nonzero in G: "<<G.Gnonzero<<endl;
    cout<<"Number of rows in G: "<<m<<endl;
    cout<<"Nonzero in P: "<<G.Pnonzero<<endl;
    cout<<"Number of rows in P: "<<m<<endl;
    cout<<"MAT val: "<<endl;
    int i,j;
    // Regularization bump on the (0,0) entry of both the full matrix and the
    // G block — presumably to make G positive definite; TODO confirm intent.
    G.Mat_val[0] += 100;
    G.Gmat[0] +=100;
    // Per-row nonzero counts for the G block (needed for CHOLMOD's nz array).
    SuiteSparse_long *Gnz = (SuiteSparse_long*)calloc(m,sizeof(SuiteSparse_long));
    for(i=0;i<m;i++)
    {
        SuiteSparse_long startindex=G.GrowIndex[i];
        SuiteSparse_long endindex=G.GrowIndex[i+1];
        Gnz[i] = endindex - startindex;
        // (per-row debug dump commented out)
    }
    // (large commented-out debug dumps of P/MNA CSR arrays and vectors elided)
    printf("\n Fine till here");
    printf("\n");
    printf("\n Fine till here");
    printf("\n");
    printf("\n Allocating GPU memory\n");
    cudaDeviceReset();
    size_t free, total;
    cudaMemGetInfo(&free, &total);
    printf("\n Free Mem = %lf MB, Total mem = %lf MB\n", (double)(free)/(1024*1024), (double)(total)/(1024*1024));
    // Device buffers; several (dev_csrValA, dev_b, dev_x, dev_Gcsr*) are
    // declared but never allocated/used in this path.
    double *dev_csrValA, *dev_b, *dev_x;
    int *dev_csrRowIdxA, *dev_csrColA;
    double *dev_GcsrVal, *dev_b1, *dev_x1;
    double *dev_PcsrVal, *dev_b2, *dev_x2;
    int *dev_GcsrRowIdx, *dev_PcsrRowIdx, *dev_GcsrCol, *dev_PcsrCol;
    // NOTE(review): cudaMalloc/cudaMemcpy return codes are never checked here.
    cudaMalloc((void**)&dev_PcsrVal, G.Pnonzero*sizeof(double));
    cudaMalloc((void**)&dev_PcsrRowIdx, (m+1)*sizeof(int));
    cudaMalloc((void**)&dev_PcsrCol, G.Pnonzero*sizeof(int));
    cudaMalloc((void**)&dev_b1, (m)*sizeof(double));
    cudaMalloc((void**)&dev_b2, n*sizeof(double));
    cudaMalloc((void**)&dev_x1, m*sizeof(double));
    cudaMalloc((void**)&dev_x2, n*sizeof(double));
    // Upload the P block (CSR) and the right-hand-side / solution vectors.
    cudaMemcpy(dev_b1, G.b1, (m)*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_x1, G.x1, (m)*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_PcsrVal, G.Pmat, G.Pnonzero*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b2, G.b2, (n)*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_x2, G.x2, (n)*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_PcsrRowIdx, G.ProwIndex, (m+1)*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_PcsrCol, G.Pcolumns, (G.Pnonzero)*sizeof(int), cudaMemcpyHostToDevice);
    /* Matrix has been created and stored in CSR format. However, CHOLMOD
       requires CSC. Since the matrix is symmetric positive definite we can
       simply reinterpret CSR row pointers as CSC column pointers. */
    printf("\n Initiating CHOLMOD\n");
    cholmod_sparse *A, *P;                 // A, P unused — only C below is filled
    cholmod_dense *x, *b, *r, *midvec;     // r unused
    cholmod_factor *L;
    cholmod_common *Common, cm;
    Common = &cm;
    cholmod_l_start(Common);
    // (Common->useGPU override commented out)
    printf("\n m = %d, G.Gnonzero = %d\n", m, G.Gnonzero);
    // Wrap the G block in a CHOLMOD sparse matrix without copying: the p/nz/i/x
    // pointers alias G's own arrays, so C must not be freed by CHOLMOD.
    cholmod_sparse *C = cholmod_l_allocate_sparse((size_t)(m), (size_t)(m), (size_t)(G.Gnonzero), 1, 0, 1, 1, Common);
    C->itype = CHOLMOD_LONG;
    C->p = &G.GrowIndex[0];
    C->nz = &Gnz[0];
    C->i = &G.Gcolumns[0];
    C->dtype = 0;                          // CHOLMOD_DOUBLE
    C->x = &G.Gmat[0];
    // (equivalent manual setup for P commented out)
    // Wrap b1 as a CHOLMOD dense RHS, again aliasing host memory.
    b = cholmod_l_allocate_dense((size_t)(m), 1, (size_t)(m), 1, Common);
    b->dtype=0;
    b->x = &G.b1[0];
    b->xtype = 1;                          // real
    printf("\n CHOLMOD manually set\n");
    cholmod_l_print_sparse(C, "A", Common);
    cholmod_l_print_dense(b, "b", Common);
    // --- Step 1: factor G and solve G * x = b1, timing with CUDA events. ---
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    L = cholmod_l_analyze(C, Common);
    printf("\n Analysis: Flops: %g \t lnz: %g\n", Common->fl, Common->lnz);
    cholmod_l_factorize(C, L, Common);
    x = cholmod_l_solve(CHOLMOD_A, L, b, Common);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);
    // NOTE(review): cudaEventElapsedTime reports milliseconds, label says secs.
    printf("\n Time : %.6f secs :\n", elapsedTime);
    cholmod_l_print_dense(x, "X", Common);
    // dev_x1 now holds G^-1 * b1.
    double *x1_mod = (double*)x->x;
    cudaMemcpy(dev_x1, x1_mod, m*sizeof(double), cudaMemcpyHostToDevice);
    cusparseStatus_t cuSparseStatus;
    cusparseHandle_t cuspHandle;
    cuSparseStatus = cusparseCreate(&cuspHandle);
    cusparseMatDescr_t descrP;
    cusparseCreateMatDescr(&descrP);
    cusparseSetMatType(descrP, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descrP, CUSPARSE_INDEX_BASE_ZERO);
    double *dev_res1, *dev_simple;         // dev_simple allocated but unused
    double *res1 = (double*)calloc(n,sizeof(double));
    cudaMalloc((void**)&dev_res1, n*sizeof(double));
    cudaMalloc((void**)&dev_simple, n*sizeof(double));
    const double alpha = 1.0, beta=0.0;
    // --- Step 2a: dev_res1 = P^T * (G^-1 b1). ---
    cuSparseStatus = cusparseDcsrmv(cuspHandle, CUSPARSE_OPERATION_TRANSPOSE, m, n, G.Pnonzero, &alpha, descrP, dev_PcsrVal, dev_PcsrRowIdx, dev_PcsrCol, dev_x1, &beta, dev_res1);
    if(cuSparseStatus == CUSPARSE_STATUS_SUCCESS)
    {
        // (debug readback/print of res1 commented out)
    }
    else
    {
        printf("\n P^T * G^-1 * b1 failed\n");
        exit(1);
    }
    const double alphaneg = -1.0;
    // --- Step 2b: dev_res1 -= b2. ---
    cublasStatus_t cuBlasStatus;
    cublasHandle_t cubHandle;
    cuBlasStatus = cublasCreate(&cubHandle);
    cuBlasStatus = cublasDaxpy(cubHandle, n, &alphaneg, dev_b2, 1, dev_res1, 1);
    if(cuBlasStatus == CUBLAS_STATUS_SUCCESS)
    {
        printf("\n res1 = res1 - b2 done\n");
    }
    else
    {
        printf("\n res1 = res1 - b2 failed\n");
    }
    // --- Step 3a: midMat = G^-1 * P, one CHOLMOD solve per column of P. ---
    int k = 0;
    int breakloop=0;
    double **midMat = (double**)malloc(m*sizeof(double*));
    for(i=0;i<m;i++)
    {
        midMat[i] = (double*)calloc(n,sizeof(double));
    }
    cudaEventRecord(start, 0);
    for(i=0;i<n;i++)
    {
        breakloop = 0;
        // Build dense column i of P by scanning its CSR rows.
        // NOTE(review): element size should be sizeof(double), not
        // sizeof(double*) — same size on LP64, but semantically wrong.
        // NOTE(review): breakloop stops after the FIRST nonzero found in the
        // column — assumes each column of P has at most one entry; confirm.
        double *vect = (double*)calloc(m,sizeof(double*));
        for(j=0;j<m;j++)
        {
            int startin = G.ProwIndex[j];
            int endin = G.ProwIndex[j+1];
            if(startin == endin) continue;
            k = startin;
            while(k<endin)
            {
                if(G.Pcolumns[k] == i)
                {
                    vect[j] = G.Pmat[k];
                    breakloop=1;
                    break;
                }
                k++;
            }
            if(breakloop == 1)
            {
                break;
            }
        }
        // Wrap vect and solve G * res2 = P(:,i).
        midvec = cholmod_l_allocate_dense((size_t)(m), 1, (size_t)(m), 1, Common);
        midvec->dtype=0;
        midvec->x=&vect[0];   // aliases vect; freed below via cholmod — TODO
                              // confirm allocator match (calloc vs cholmod free)
        midvec->xtype = 1;
        cholmod_dense *res2;
        res2 = cholmod_l_solve(CHOLMOD_A, L, midvec, Common);
        double *re = (double*)res2->x;
        int i1, j1, k1;       // j1, k1 unused
        for(i1=0;i1<m;i1++)
        {
            midMat[i1][i] = re[i1];
        }
        cholmod_l_free_dense(&midvec, Common);
        // NOTE(review): res2 is never freed — leaks one dense vector per column.
    }
    // Flatten midMat column-major (cuBLAS/cuSPARSE dense convention).
    double *midMatflat = (double*)calloc((m*n),sizeof(double));
    double *dev_midMat;
    double *dev_solut;
    int counter = 0;
    for(i=0;i<n;i++)
    {
        for(j=0;j<m;j++)
        {
            midMatflat[counter] = midMat[j][i];
            counter++;
        }
    }
    cudaMalloc((void**)&dev_midMat, m*n*sizeof(double));
    cudaMalloc((void**)&dev_solut, n*n*sizeof(double));
    cudaMemcpy(dev_midMat, midMatflat, m*n*sizeof(double), cudaMemcpyHostToDevice);
    // --- Step 3b: dev_solut = P^T * midMat = P^T G^-1 P (n x n). ---
    cuSparseStatus = cusparseDcsrmm(cuspHandle, CUSPARSE_OPERATION_TRANSPOSE, m, n, n, G.Pnonzero, &alpha, descrP, dev_PcsrVal, dev_PcsrRowIdx, dev_PcsrCol, dev_midMat, m, &beta, dev_solut, n);
    if(cuSparseStatus == CUSPARSE_STATUS_SUCCESS)
    {
        printf("\n Solved P^T * G^-1 * P. Result stored in solut\n");
    }
    else
    {
        printf("\n Failed to Solve P^T * G^-1 * P \n");
        exit(1);
    }
    // --- Step 4: dense LU of the Schur complement, then back-substitution. ---
    cusolverStatus_t cuSolverStatus;
    cusolverDnHandle_t cudenHandle;
    cuSolverStatus = cusolverDnCreate(&cudenHandle);
    int Lwork = 0;
    cuSolverStatus = cusolverDnDgetrf_bufferSize(cudenHandle, n, n, dev_solut, n, &Lwork);
    if(cuSolverStatus == CUSOLVER_STATUS_SUCCESS)
    {
        printf("\n Buffer works\n Lwork = %d\n", Lwork);
    }
    else
    {
        exit(1);
    }
    double *dev_Workspace;
    int *dev_Ipiv, *dev_Info;
    cudaMalloc((void**)&dev_Workspace, Lwork*sizeof(double));
    cudaMalloc((void**)&dev_Ipiv, n*sizeof(int));
    cudaMalloc((void**)&dev_Info, sizeof(int));
    // In-place LU factorization of dev_solut with partial pivoting.
    cuSolverStatus = cusolverDnDgetrf(cudenHandle, n, n, dev_solut, n, dev_Workspace, dev_Ipiv, dev_Info);
    if(cuSolverStatus == CUSOLVER_STATUS_SUCCESS)
    {
        printf("\n solut has be defactorized into L and U. dev_Ipiv * solut = L * U\n");
    }
    else
    {
        printf("\n Unable to defactorize solut into LU\n");
        exit(1);
    }
    // Solve S * x2 = res1 in place; x2 overwrites dev_res1.
    cuSolverStatus = cusolverDnDgetrs(cudenHandle, CUBLAS_OP_N, n, 1, dev_solut, n, dev_Ipiv, dev_res1, n, dev_Info);
    if(cuSolverStatus == CUSOLVER_STATUS_SUCCESS)
    {
        printf("\n Solution obtained for x2 \n");
    }
    else
    {
        printf("\n LU decomposition obtained by LU solver failed\n");
    }
    // --- Step 5: x1 = G^-1 b1 - midMat * x2. ---
    double *dev_dummy;
    cudaMalloc((void**)&dev_dummy, m*sizeof(double));
    // NOTE(review): cudaMemset takes an int byte value; 0.0 converts to 0 so
    // this zero-fill works, but the double literal is misleading.
    cudaMemset(dev_dummy, 0.0, m*sizeof(double));
    printf("\n Starting solving for x1 \n");
    cuBlasStatus = cublasDgemv(cubHandle, CUBLAS_OP_N, m, n, &alpha, dev_midMat, m, dev_res1, 1, &beta, dev_dummy, 1);
    if(cuBlasStatus == CUBLAS_STATUS_SUCCESS)
    {
        printf("\n midmat * x2 obtained. Stored in dummy\n");
    }
    else
    {
        printf("\n Failed to obtain midmat * x2\n");
    }
    cuBlasStatus = cublasDaxpy(cubHandle, m, &alphaneg, dev_dummy, 1, dev_x1, 1);
    if(cuBlasStatus == CUBLAS_STATUS_SUCCESS)
    {
        printf("\n x1 obtained");
    }
    else
    {
        printf("\n Failed to obtain x1");
    }
    printf("\n Solver finished its work\n");
    // NOTE(review): device buffers, handles, and host arrays (Gnz, midMat,
    // midMatflat, res1) are never freed; acceptable only because main exits.
    cholmod_l_finish(Common);
    return 0;
}
// y := alpha * x + y on the device via the legacy cuBLAS v1 DAXPY.
// Precondition: x and y have equal logical length (checked by assert).
void axpy(double alpha, const Vector<double> &x, Vector<double> &y)
{
    const int n = x.getSize();
    assert(n == y.getSize());
    cublasDaxpy(n, alpha, x, x.inc(), y, y.inc());
}
// In-place element-wise addition on the GPU: *this += rhs.
// Implemented as a cuBLAS v2 DAXPY with a unit scaling factor; the
// returned cuBLAS status is stored in the member `stat`.
GPUMat& GPUMat::operator+=(const GPUMat& rhs)
{
    const double unit = 1.0;
    stat = cublasDaxpy(handle, n_elem, &unit,
                       rhs.memptr_GPU(), 1,
                       this->memptr_GPU, 1);
    return *this;
}
// Distributed axpy: every process updates its locally owned slice of y
// as y := alpha * x + y using the cuBLAS v2 DAXPY on the instance handle.
// Assumes x and y share the same row distribution — the local count is
// taken from x only.
void mpla_daxpy(struct mpla_vector* y, double alpha, struct mpla_vector* x, struct mpla_instance* instance)
{
    // compute process-wise axpy on the current process' row block
    cublasDaxpy(instance->cublas_handle, x->cur_proc_row_count,
                &alpha, x->data, 1, y->data, 1);
}
//
// Overloaded function for dispatching to
//  * the CUBLAS backend, and
//  * the double value-type.
//
// Computes y := a * x + y through the legacy cuBLAS v1 interface
// (alpha passed by value, implicit library context, no status return).
//
inline void axpy( const int n, const double a, const double* x, const int incx,
                  double* y, const int incy )
{
    cublasDaxpy( n, a, x, incx, y, incy );
}
// MEX entry point: accumulate a shifted copy of a 3-D GPU array into a
// destination array, dst += alpha * shift(src), using cuBLAS v1.
// The shift is realized as a flat-index offset daxpy; boundary wrap-around
// is prevented by swapping the would-wrap YZ/XY boundary planes out into
// zeroed stack arrays before the daxpy and swapping them back afterwards.
// prhs layout is documented in the parameter comment below.
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
  // Exactly 6 right-hand-side arguments are required.
  if (nrhs!=6) mexErrMsgTxt("Wrong number of arguments");
  if (init == 0) {
    // One-time initialization: grab the GPUmat API table.
    // mexLock();
    gm = gmGetGPUmat();
    init = 1;
  }
  /* mex parameters are:
     0 Source array [X or one of the y-edge-zeroed copies of it]
     1 Destination array [Accumulator, basically]
     2 Stack array 1 [We swap XY or YZ planes with this before copying to
       assure clean shift] - Must be all zeroes and of size
       max(NxNy, NyNz, NxNz)
     3 Stack array 2
     4 Shift directions
     5 Coefficient on shift */
  // Get GPU array pointers.
  GPUtype srcArray = gm->gputype.getGPUtype(prhs[0]);
  GPUtype dstArray = gm->gputype.getGPUtype(prhs[1]);
  GPUtype stackArrayX = gm->gputype.getGPUtype(prhs[2]);
  //GPUtype stackArrayY = gm->gputype.getGPUtype(prhs[3]);
  GPUtype stackArrayZ = gm->gputype.getGPUtype(prhs[3]);
  // Control variables: per-axis shifts (expected -1/0/+1 — TODO confirm)
  // and the accumulation coefficient alpha.
  double *shiftdirs = mxGetPr(prhs[4]);
  const int *dims = gm->gputype.getSize(srcArray);
  double alpha = *mxGetPr(prhs[5]);
  int shifts[3];
  shifts[0] = (int)shiftdirs[0];
  shifts[1] = (int)shiftdirs[1];
  shifts[2] = (int)shiftdirs[2];
  double *cubSrc = (double*)gm->gputype.getGPUptr(srcArray);
  // Swap out the YZ boundary plane that would wrap under an x-shift
  // (stride dims[0] walks along x-columns; element count dims[1]*dims[2]).
  double *cubDst = (double*)gm->gputype.getGPUptr(stackArrayX);
  if(shifts[0] == -1) cublasDswap(dims[1]*dims[2], cubSrc, dims[0], cubDst, 1);
  if(shifts[0] == 1) cublasDswap(dims[1]*dims[2], cubSrc + dims[0]-1, dims[0], cubDst, 1);
  // XZ-plane handling for y-shifts is disabled (helper commented out).
  //stackSwapXZplane(cubSrc, (double*)gm->gputype.getGPUptr(stackArrayY), (int *)dims, shifts);
  // Swap out the XY boundary plane that would wrap under a z-shift.
  cubDst = (double*)gm->gputype.getGPUptr(stackArrayZ);
  if(shifts[2] == -1) cublasDswap(dims[0]*dims[1], cubSrc, 1, cubDst, 1);
  if(shifts[2] == 1) cublasDswap(dims[0]*dims[1], cubSrc + dims[0]*dims[1]*(dims[2]-1), 1, cubDst, 1);
  // Flat-index offset that realizes the requested (x,y,z) shift.
  int theta = shifts[0] + dims[0]*shifts[1] + dims[0]*dims[1]*shifts[2];
  int Ntot = dims[0] * dims[1] * dims[2];
  cubDst = (double*)gm->gputype.getGPUptr(dstArray);
  // dst[theta:] += alpha * src[:Ntot-theta] (or the mirrored form for
  // negative theta); the swapped-out planes are zero so nothing wraps.
  if(theta >= 0) {
    cublasDaxpy(Ntot-theta, alpha, cubSrc, 1, cubDst + theta, 1);
  } else {
    cublasDaxpy(Ntot+theta, alpha, cubSrc - theta, 1, cubDst, 1);
  }
  // Restore the XY plane if it was removed (swap is its own inverse).
  cubDst = (double*)gm->gputype.getGPUptr(stackArrayZ);
  if(shifts[2] == -1) cublasDswap(dims[0]*dims[1], cubSrc, 1, cubDst, 1);
  if(shifts[2] == 1) cublasDswap(dims[0]*dims[1], cubSrc + dims[0]*dims[1]*(dims[2]-1), 1, cubDst, 1);
  // XZ-plane restore likewise disabled.
  //stackSwapXZplane(cubSrc, (double*)gm->gputype.getGPUptr(stackArrayY), (int *)dims, shifts);
  // Restore the YZ plane if it was removed.
  cubDst = (double*)gm->gputype.getGPUptr(stackArrayX);
  if(shifts[0] == -1) cublasDswap(dims[1]*dims[2], cubSrc, dims[0], cubDst, 1);
  if(shifts[0] == 1) cublasDswap(dims[1]*dims[2], cubSrc + dims[0]-1, dims[0], cubDst, 1);
}