Esempio n. 1
0
void gpu_cublas1(double *A, double *B, double *C, double *D, double *r, double *nrmC, int N, int N2)
{
	#pragma acc data present(A, B, C, D)
	{
		#pragma acc host_data use_device(A, B, C, D)
		{
			cublasHandle_t handle;
			cublasCreate(&handle);
			const double alpha = 1.0;
			const double beta = 0.0;
			cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, N, N, N, &alpha, A, N, B, N, &beta, C, N);
			printf(" gpu gemm success \n");
			cublasDdot(handle, N2, C, 1, B, 1, r);
			printf(" gpu dot success \n");
			*r = -1.0 * *r;
			cublasDaxpy(handle, N2, r, B, 1, C, 1);
			printf(" gpu axpy success \n");
			cublasDnrm2(handle, N2, C, 1, nrmC);
			printf(" gpu nrm2 success \n");
			cublasDcopy(handle, N2, C, 1, D, 1);
			printf(" gpu copy success \n");
			*nrmC = 1.0 / *nrmC;
			cublasDscal(handle, N2, nrmC, D, 1);
			printf(" gpu scal success \n");
			cublasDestroy(handle);
			printf(" gpu destroy success \n");
		}
	}
}
Esempio n. 2
0
void magma_daxpy(
    magma_int_t n,
    double alpha,
    const double *dx, magma_int_t incx,
    double       *dy, magma_int_t incy )
{
    cublasDaxpy( n, alpha, dx, incx, dy, incy );
}
  static double e_cuda(double* target, double* output, size_t count, double* dedy) {
    /* this is to sort of keep a uniform API w. the matrix */
    assert(target == dedy);

    cublasDaxpy(count, -1, output, 1, dedy, 1);

    return 0.5 * cublasDnrm2(count, dedy, 1);
  }
Esempio n. 4
0
File: ardblas.c Progetto: rforge/gcb
void d_axpy(SEXP ralpha, SEXP rx, SEXP rincx, SEXP ry, SEXP rincy)
{
	int
		nx, ny, n,
		incx = asInteger(rincx),
		incy = asInteger(rincy);
	double
		alpha = asReal(ralpha),
		* x, * y;

	unpackVector(rx, &nx, &x);
	unpackVector(ry, &ny, &y);
	n = imin2(nx, ny);

	cublasDaxpy(n, alpha, x, incx, y, incy);
	checkCublasError("d_axpy");
}
 static vl::Error
 axpy(vl::Context & context,
      ptrdiff_t n,
      type alpha,
      type const *x, ptrdiff_t incx,
      type *y, ptrdiff_t incy)
 {
   cublasHandle_t handle ;
   cublasStatus_t status ;
   status = context.getCudaHelper().getCublasHandle(&handle) ;
   if (status != CUBLAS_STATUS_SUCCESS) goto done ;
   status = cublasDaxpy(handle,
                        (int)n,
                        &alpha,
                        x, (int)incx,
                        y, (int)incy) ;
 done:
   return context.setError
     (context.getCudaHelper().catchCublasError(status, "cublasDaxpy"), __func__) ;
 }
Esempio n. 6
0
void
cube_blas_d_axpy (cube_t       *ctx,
		  int           n,
		  const double *alpha,
		  const double *x, int incx,
		  double       *y, int incy)
{
  cublasStatus_t status;
  
  if (! cube_context_check (ctx))
    return;

  status = cublasDaxpy (ctx->h_blas,
			n,
			alpha,
			x, incx,
			y, incy);

  cube_blas_check (ctx, status);
}
Esempio n. 7
0
void mat_add_mat(const double *x, double *y, double scalar, int n){
	cudaError_t cudaStat ; // cudaMalloc status
	cublasStatus_t stat ; // CUBLAS functions status
	cublasHandle_t handle ; // CUBLAS context	
// on the device
	double *d_x; // d_x - x on the device
	double *d_y; // d_y - y on the device

	cudaStat = cudaMalloc (( void **)& d_x, n*sizeof(*x)); // device
	// memory alloc for x
	cudaStat = cudaMalloc (( void **)& d_y, n*sizeof(*y)); // device
	// memory alloc for y
	stat = cublasCreate (& handle ); // initialize CUBLAS context
	stat = cublasSetVector (n, sizeof (*x), x ,1 ,d_x, 1); // cp x- >d_x
	stat = cublasSetVector (n, sizeof (*y), y ,1 ,d_y, 1); // cp y- >d_y

	stat=cublasDaxpy(handle,n,&scalar,d_x,1,d_y,1);


	cudaFree (d_x ); // free device memory
	cudaFree (d_y ); // free device memory
	cublasDestroy ( handle ); // destroy CUBLAS context

}
Esempio n. 8
0
int main( int argc, char** argv )
{
    TESTING_INIT();
    
    real_Double_t   gflops, t1, t2;
    double c_neg_one = MAGMA_D_NEG_ONE;
    magma_int_t ione = 1;
    const char trans[] = { 'N', 'C', 'T' };
    const char uplo[]  = { 'L', 'U' };
    const char diag[]  = { 'U', 'N' };
    const char side[]  = { 'L', 'R' };
    
    double  *A,  *B,  *C,   *C2, *LU;
    double *dA, *dB, *dC1, *dC2;
    double alpha = MAGMA_D_MAKE( 0.5, 0.1 );
    double beta  = MAGMA_D_MAKE( 0.7, 0.2 );
    double dalpha = 0.6;
    double dbeta  = 0.8;
    double work[1], error, total_error;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t m, n, k, size, maxn, ld, info;
    magma_int_t *piv;
    magma_err_t err;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" );
    
    total_error = 0.;
    for( int i = 0; i < opts.ntest; ++i ) {
        m = opts.msize[i];
        n = opts.nsize[i];
        k = opts.ksize[i];
        printf("=========================================================================\n");
        printf( "M %d, N %d, K %d\n", (int) m, (int) n, (int) k );
        
        // allocate matrices
        // over-allocate so they can be any combination of {m,n,k} x {m,n,k}.
        maxn = max( max( m, n ), k );
        ld = maxn;
        size = maxn*maxn;
        err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) );  assert( err == 0 );
        err = magma_dmalloc_pinned( &A,  size );  assert( err == 0 );
        err = magma_dmalloc_pinned( &B,  size );  assert( err == 0 );
        err = magma_dmalloc_pinned( &C,  size );  assert( err == 0 );
        err = magma_dmalloc_pinned( &C2, size );  assert( err == 0 );
        err = magma_dmalloc_pinned( &LU, size );  assert( err == 0 );
        err = magma_dmalloc( &dA,  size );        assert( err == 0 );
        err = magma_dmalloc( &dB,  size );        assert( err == 0 );
        err = magma_dmalloc( &dC1, size );        assert( err == 0 );
        err = magma_dmalloc( &dC2, size );        assert( err == 0 );
        
        // initialize matrices
        size = maxn*maxn;
        lapackf77_dlarnv( &ione, ISEED, &size, A  );
        lapackf77_dlarnv( &ione, ISEED, &size, B  );
        lapackf77_dlarnv( &ione, ISEED, &size, C  );
        
        printf( "========== Level 1 BLAS ==========\n" );
        
        // ----- test DSWAP
        // swap 2nd and 3rd columns of dA, then copy to C2 and compare with A
        assert( n >= 4 );
        magma_dsetmatrix( m, n, A, ld, dA, ld );
        magma_dsetmatrix( m, n, A, ld, dB, ld );
        magma_dswap( m, dA(0,1), 1, dA(0,2), 1 );
        magma_dswap( m, dB(0,1), 1, dB(0,2), 1 );
        
        // check results, storing diff between magma and cuda calls in C2
        cublasDaxpy( ld*n, c_neg_one, dA, 1, dB, 1 );
        magma_dgetmatrix( m, n, dB, ld, C2, ld );
        error = lapackf77_dlange( "F", &m, &k, C2, &ld, work );
        total_error += error;
        printf( "dswap             diff %.2g\n", error );
        
        // ----- test IDAMAX
        // get argmax of column of A
        magma_dsetmatrix( m, k, A, ld, dA, ld );
        error = 0;
        for( int j = 0; j < k; ++j ) {
            magma_int_t i1 = magma_idamax( m, dA(0,j), 1 );
            magma_int_t i2 = cublasIdamax( m, dA(0,j), 1 );
            assert( i1 == i2 );
            error += abs( i1 - i2 );
        }
        total_error += error;
        gflops = (double)m * k / 1e9;
        printf( "idamax            diff %.2g\n", error );
        printf( "\n" );
        
        printf( "========== Level 2 BLAS ==========\n" );
        
        // ----- test DGEMV
        // c = alpha*A*b + beta*c,  with A m*n; b,c m or n-vectors
        // try no-trans/trans
        for( int ia = 0; ia < 3; ++ia ) {
            magma_dsetmatrix( m, n, A,  ld, dA,  ld );
            magma_dsetvector( maxn, B, 1, dB,  1 );
            magma_dsetvector( maxn, C, 1, dC1, 1 );
            magma_dsetvector( maxn, C, 1, dC2, 1 );
            t1 = magma_sync_wtime( 0 );
            magma_dgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC2, 1 );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            size = (trans[ia] == 'N' ? m : n);
            cublasDaxpy( size, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetvector( size, dC2, 1, C2, 1 );
            error = lapackf77_dlange( "F", &size, &ione, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DGEMV( m, n ) / 1e9;
            printf( "dgemv( %c )        diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    trans[ia], error, gflops/t1, gflops/t2 );
        }
        printf( "\n" );
        
        // ----- test DSYMV
        // c = alpha*A*b + beta*c,  with A m*m symmetric; b,c m-vectors
        // try upper/lower
        for( int iu = 0; iu < 2; ++iu ) {
            magma_dsetmatrix( m, m, A, ld, dA, ld );
            magma_dsetvector( m, B, 1, dB,  1 );
            magma_dsetvector( m, C, 1, dC1, 1 );
            magma_dsetvector( m, C, 1, dC2, 1 );
            t1 = magma_sync_wtime( 0 );
            magma_dsymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDsymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC2, 1 );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( m, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetvector( m, dC2, 1, C2, 1 );
            error = lapackf77_dlange( "F", &m, &ione, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DSYMV( m ) / 1e9;
            printf( "dsymv( %c )        diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], error, gflops/t1, gflops/t2 );
        }
        printf( "\n" );
        
        // ----- test DTRSV
        // solve A*c = c,  with A m*m triangular; c m-vector
        // try upper/lower, no-trans/trans, unit/non-unit diag
        // Factor A into LU to get well-conditioned triangles, else solve yields garbage.
        // Still can give garbage if solves aren't consistent with LU factors,
        // e.g., using unit diag for U, so copy lower triangle to upper triangle.
        // Also used for trsm later.
        lapackf77_dlacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld );
        lapackf77_dgetrf( &maxn, &maxn, LU, &ld, piv, &info );
        for( int j = 0; j < maxn; ++j ) {
            for( int i = 0; i < j; ++i ) {
                *LU(i,j) = *LU(j,i);
            }
        }
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
        for( int id = 0; id < 2; ++id ) {
            magma_dsetmatrix( m, m, LU, ld, dA, ld );
            magma_dsetvector( m, C, 1, dC1, 1 );
            magma_dsetvector( m, C, 1, dC2, 1 );
            t1 = magma_sync_wtime( 0 );
            magma_dtrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDtrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC2, 1 );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( m, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetvector( m, dC2, 1, C2, 1 );
            error = lapackf77_dlange( "F", &m, &ione, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DTRSM( MagmaLeft, m, 1 ) / 1e9;
            printf( "dtrsv( %c, %c, %c )  diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], diag[id], error, gflops/t1, gflops/t2 );
        }}}
        printf( "\n" );
        
        printf( "========== Level 3 BLAS ==========\n" );
        
        // ----- test DGEMM
        // C = alpha*A*B + beta*C,  with A m*k or k*m; B k*n or n*k; C m*n
        // try combinations of no-trans/trans
        for( int ia = 0; ia < 3; ++ia ) {
        for( int ib = 0; ib < 3; ++ib ) {
            bool nta = (trans[ia] == 'N');
            bool ntb = (trans[ib] == 'N');
            magma_dsetmatrix( (nta ? m : k), (nta ? m : k), A, ld, dA,  ld );
            magma_dsetmatrix( (ntb ? k : n), (ntb ? n : k), B, ld, dB,  ld );
            magma_dsetmatrix( m, n, C, ld, dC1, ld );
            magma_dsetmatrix( m, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_dgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_dlange( "F", &m, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DGEMM( m, n, k ) / 1e9;
            printf( "dgemm( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    trans[ia], trans[ib], error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test DSYMM
        // C = alpha*A*B + beta*C  (left)  with A m*m symmetric; B,C m*n; or
        // C = alpha*B*A + beta*C  (right) with A n*n symmetric; B,C m*n
        // try left/right, upper/lower
        for( int is = 0; is < 2; ++is ) {
        for( int iu = 0; iu < 2; ++iu ) {
            magma_dsetmatrix( m, m, A, ld, dA,  ld );
            magma_dsetmatrix( m, n, B, ld, dB,  ld );
            magma_dsetmatrix( m, n, C, ld, dC1, ld );
            magma_dsetmatrix( m, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_dsymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDsymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_dlange( "F", &m, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DSYMM( side[is], m, n ) / 1e9;
            printf( "dsymm( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    side[is], uplo[iu], error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test DSYRK
        // C = alpha*A*A^H + beta*C  (no-trans) with A m*k and C m*m symmetric; or
        // C = alpha*A^H*A + beta*C  (trans)    with A k*m and C m*m symmetric
        // try upper/lower, no-trans/trans
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
            magma_dsetmatrix( n, k, A, ld, dA,  ld );
            magma_dsetmatrix( n, n, C, ld, dC1, ld );
            magma_dsetmatrix( n, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_dsyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDsyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetmatrix( n, n, dC2, ld, C2, ld );
            error = lapackf77_dlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DSYRK( k, n ) / 1e9;
            printf( "dsyrk( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test DSYR2K
        // C = alpha*A*B^H + ^alpha*B*A^H + beta*C  (no-trans) with A,B n*k; C n*n symmetric; or
        // C = alpha*A^H*B + ^alpha*B^H*A + beta*C  (trans)    with A,B k*n; C n*n symmetric
        // try upper/lower, no-trans/trans
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
            bool nt = (trans[it] == 'N');
            magma_dsetmatrix( (nt ? n : k), (nt ? n : k), A, ld, dA,  ld );
            magma_dsetmatrix( n, n, C, ld, dC1, ld );
            magma_dsetmatrix( n, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_dsyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDsyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetmatrix( n, n, dC2, ld, C2, ld );
            error = lapackf77_dlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DSYR2K( k, n ) / 1e9;
            printf( "dsyr2k( %c, %c )    diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test DTRMM
        // C = alpha*A*C  (left)  with A m*m triangular; C m*n; or
        // C = alpha*C*A  (right) with A n*n triangular; C m*n
        // try left/right, upper/lower, no-trans/trans, unit/non-unit
        for( int is = 0; is < 2; ++is ) {
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
        for( int id = 0; id < 2; ++id ) {
            bool left = (side[is] == 'L');
            magma_dsetmatrix( (left ? m : n), (left ? m : n), A, ld, dA,  ld );
            magma_dsetmatrix( m, n, C, ld, dC1, ld );
            magma_dsetmatrix( m, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_dtrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDtrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_dlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DTRMM( side[is], m, n ) / 1e9;
            printf( "dtrmm( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], error, gflops/t1, gflops/t2 );
        }}}}
        printf( "\n" );
        
        // ----- test DTRSM
        // solve A*X = alpha*B  (left)  with A m*m triangular; B m*n; or
        // solve X*A = alpha*B  (right) with A n*n triangular; B m*n
        // try left/right, upper/lower, no-trans/trans, unit/non-unit
        for( int is = 0; is < 2; ++is ) {
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
        for( int id = 0; id < 2; ++id ) {
            bool left = (side[is] == 'L');
            magma_dsetmatrix( (left ? m : n), (left ? m : n), LU, ld, dA,  ld );
            magma_dsetmatrix( m, n, C, ld, dC1, ld );
            magma_dsetmatrix( m, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_dtrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDtrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_dlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DTRSM( side[is], m, n ) / 1e9;
            printf( "dtrsm( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], error, gflops/t1, gflops/t2 );
        }}}}
        printf( "\n" );
        
        // cleanup
        magma_free_cpu( piv );
        magma_free_pinned( A  );
        magma_free_pinned( B  );
        magma_free_pinned( C  );
        magma_free_pinned( C2 );
        magma_free_pinned( LU );
        magma_free( dA  );
        magma_free( dB  );
        magma_free( dC1 );
        magma_free( dC2 );
    }
    
    if ( total_error != 0. ) {
        printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n",
                total_error );
    }
    else {
        printf( "all tests passed\n" );
    }
    
    TESTING_FINALIZE();
    return 0;
}
void caffe_gpu_axpy<double>(const int N, const double alpha, const double* X,
    double* Y) {
  CUBLAS_CHECK(cublasDaxpy(Caffe::cublas_handle(), N, &alpha, X, 1, Y, 1));
}
int main(int argc, char* argv[])
{
	const int bufsize = 512;
    	char buffer[bufsize];
	int m,n,S;
	double time_st,time_end,time_avg;
	//omp_set_num_threads(2);
//	printf("\n-----------------\nnumber of threads fired = %d\n-----------------\n",(int)omp_get_num_threads());
	if(argc!=2)
	{
		cout<<"Insufficient arguments"<<endl;
		return 1;
	}
	
	graph G;

	cerr<<"Start reading                    ";
//	time_st=dsecnd();
	G.create_graph(argv[1]);
//	time_end=dsecnd();
//	time_avg = (time_end-time_st);
//	cout<<"Success              "<<endl;
//	cerr<<"Reading time                     "<<time_avg<<endl;

	cerr<<"Constructing Matrices            ";
//	time_st=dsecnd();
	G.construct_MNA();
//	time_end=dsecnd();
//	time_avg = (time_end-time_st);
//	cerr<<"Done                 "<<time_avg<<endl;

//	G.construct_sparse_MNA();
	m=G.node_array.size()-1;
	n=G.voltage_edge_id.size();
	
	cout<<endl;
	cout<<"MATRIX STAT:"<<endl;
	cout<<"Nonzero elements:               "<<G.nonzero<<endl;
	cout<<"Number of Rows:                 "<<m+n<<endl;
	printf("\n Nonzero = %ld", G.nonzero);
	printf("\n Rows = %d", m+n);

	cout<<"MAT val:		       "<<endl;
	int i,j;

//	G.Mat_val[0] +=100;
/*	
	for(i=0;i<G.nonzero;i++)
		cout<<" "<<G.Mat_val[i];
	cout<<endl;
	for(i=0;i<G.nonzero;i++)
		cout<<" "<<G.columns[i];
	cout<<endl;	
	for(i=0;i<m+n+1;i++)
		cout<<" "<<G.rowIndex[i];
	cout<<endl;
	
	
	for(i=0;i<m+n;i++)
	{
		cout<<endl;
		int startindex=G.rowIndex[i];
		int endindex=G.rowIndex[i+1];
		for(j=startindex;j<endindex;j++)
			cout<<" "<<G.Mat_val[j];
		cout<<endl;
		
	}
*/
/*	for (i=0;i<m+n+1;i++)
	{
		//cout<<endl;
		if(G.rowIndex[i]==G.rowIndex[i+1])
			break;
		
		for(j=G.rowIndex[i];j<G.rowIndex[i+1];j++)
		{
			if(G.Mat_val[j]>10)
				cout<<G.Mat_val[j]<<"\t";
		}
		//cout<<endl;
		/*for(j=G.rowIndex[i];j<G.rowIndex[i+1];j++)
		{
			cout<<G.columns[j]<<"\t";
		}
		//cout<<endl;
	}
	cout<<endl;
*/

//printing the matrix
	printf("\n Fine till here");
	printf("\n");
//	int* rowmIndex=(int*)calloc(m+1,sizeof(int));
	printf("\n Fine till here");
	printf("\n");
	//int rowmIndex[5]={1,2,3,4,5};
/*	for(i=0;i<m+1;i++)
	{
		rowmIndex[i]=G.rowIndex[i];
		printf(" %d", rowmIndex[i]);
	}
*/
	cerr<<"Solving Equations    "<<endl;


    double r1, b, alpha, alpham1, beta, r0, a, na;
    
    
    const double tol = 0.1;
    const int max_iter = 1000000;
    int *d_col, *d_row;
    double *d_val, *d_x, dot;
    double *d_r, *d_p, *d_Ax;
    int k;
    
    cublasHandle_t cublasHandle = 0;
    cublasStatus_t cublasStatus;
    cublasStatus = cublasCreate(&cublasHandle);

    checkCudaErrors(cublasStatus);

    /* Get handle to the CUSPARSE context */
    cusparseHandle_t cusparseHandle = 0;
    cusparseStatus_t cusparseStatus;
    cusparseStatus = cusparseCreate(&cusparseHandle);

    checkCudaErrors(cusparseStatus);

    cusparseMatDescr_t descr = 0;
    cusparseStatus = cusparseCreateMatDescr(&descr);

    checkCudaErrors(cusparseStatus);

    cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO);

    checkCudaErrors(cudaMalloc((void **)&d_col, G.nonzero*sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&d_row, (m+n+1)*sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&d_val, G.nonzero*sizeof(double)));
    checkCudaErrors(cudaMalloc((void **)&d_x, (m+n)*sizeof(double)));
    checkCudaErrors(cudaMalloc((void **)&d_r, (m+n)*sizeof(double)));
    checkCudaErrors(cudaMalloc((void **)&d_p, (m+n)*sizeof(double)));
    checkCudaErrors(cudaMalloc((void **)&d_Ax, (m+n)*sizeof(double)));

    cudaMemcpy(d_col, G.columns, G.nonzero*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_row, G.rowIndex, (m+n+1)*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_val, G.Mat_val, G.nonzero*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_x, G.x, (m+n)*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_r, G.b, (m+n)*sizeof(double), cudaMemcpyHostToDevice);

    alpha = 1.0;
    alpham1 = -1.0;
    beta = 0.0;
    r0 = 0.;

    printf("\n Data transferred\n");

    	cudaEvent_t start,stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	
	cudaEventRecord(start, 0);

    cusparseDcsrmv(cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, (m+n), (m+n), G.nonzero, &alpha, descr, d_val, d_row, d_col, d_x, &beta, d_Ax);

    cublasDaxpy(cublasHandle, (m+n), &alpham1, d_Ax, 1, d_r, 1);
    cublasStatus = cublasDdot(cublasHandle, (m+n), d_r, 1, d_r, 1, &r1);

    k = 1;

    while (r1 > tol && k <= max_iter)
    {
        if (k > 1)
        {
            b = r1 / r0;
            cublasStatus = cublasDscal(cublasHandle, (m+n), &b, d_p, 1);
            cublasStatus = cublasDaxpy(cublasHandle, (m+n), &alpha, d_r, 1, d_p, 1);
        }
        else
        {
            cublasStatus = cublasDcopy(cublasHandle, (m+n), d_r, 1, d_p, 1);
        }

        cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, (m+n), (m+n), G.nonzero, &alpha, descr, d_val, d_row, d_col, d_p, &beta, d_Ax);
        cublasStatus = cublasDdot(cublasHandle, (m+n), d_p, 1, d_Ax, 1, &dot);
        a = r1 / dot;

        cublasStatus = cublasDaxpy(cublasHandle, (m+n), &a, d_p, 1, d_x, 1);
        na = -a;
        cublasStatus = cublasDaxpy(cublasHandle, (m+n), &na, d_Ax, 1, d_r, 1);

        r0 = r1;
        cublasStatus = cublasDdot(cublasHandle, (m+n), d_r, 1, d_r, 1, &r1);
//        cudaThreadSynchronize();
//        printf("iteration = %3d, residual = %e\n", k, sqrt(r1));
        k++;
    }
    
    	cudaEventRecord(stop, 0);
	
	cudaEventSynchronize(stop);
	float elapsedTime;
	cudaEventElapsedTime(&elapsedTime, start, stop);
	printf("Iterations = %3d\tTime : %.6f milli-seconds : \n", k, elapsedTime);

    cudaMemcpy(G.x, d_x, (m+n)*sizeof(double), cudaMemcpyDeviceToHost);

/*        printf("\n x = \n");
	for(i=0;i<(m+n);i++)
	{
		printf("\n x[%d] = %.8f", i, G.x[i]);
	}	
*/    float rsum, diff, err = 0.0;

    for (int i = 0; i < (m+n); i++)
    {
        rsum = 0.0;

        for (int j = G.rowIndex[i]; j < G.rowIndex[i+1]; j++)
        {
            rsum += G.Mat_val[j]*G.x[G.columns[j]];
        }

        diff = fabs(rsum - G.b[i]);

        if (diff > err)
        {
            err = diff;
        }
    }

    cusparseDestroy(cusparseHandle);
    cublasDestroy(cublasHandle);

/*    free(I);
    free(J);
    free(val);
    free(x);
    free(rhs);
*/    cudaFree(d_col);
    cudaFree(d_row);
    cudaFree(d_val);
    cudaFree(d_x);
    cudaFree(d_r);
    cudaFree(d_p);
    cudaFree(d_Ax);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
    
/*    printf("\n X is:\n");
    for(i=0;i<(m+n);i++)
    {
    	printf("\n X[%d] = %.4f", i, G.x[i]);
    }
*/    	
    printf("Test Summary:  Error amount = %f\n", err);
    exit((k <= max_iter) ? 0 : 1);




/*	
	culaSparseHandle handle;
    	if (culaSparseCreate(&handle) != culaSparseNoError)
    	{
        	// this should only fail under extreme conditions
        	std::cout << "fatal error: failed to create library handle!" << std::endl;
        	exit(EXIT_FAILURE);
    	}
	
	StatusChecker sc(handle);

	culaSparsePlan plan;
	sc = culaSparseCreatePlan(handle, &plan);
	
	culaSparseCsrOptions formatOpts;
	culaSparseCsrOptionsInit(handle, &formatOpts);
	//formatOpts.indexing=1;	
	sc = culaSparseSetDcsrData(handle, plan, &formatOpts, m+n, G.nonzero, &G.Mat_val[0], &G.rowIndex[0], &G.columns[0], &G.x[0], &G.b[0]);
	printf("\n Fine till here");
	printf("\n");	
	culaSparseConfig config;
    	sc = culaSparseConfigInit(handle, &config);
    	config.relativeTolerance = 1e-2;
    	config.maxIterations = 100000;
	config.divergenceTolerance = 20;
	culaSparseResult result;
    	
/*    	sc = culaSparseSetHostPlatform(handle, plan, 0);

    // set cg solver
    	sc = culaSparseSetCgSolver(handle, plan, 0);
	printf("\n Fine till here");
	printf("\n");    
    // perform solve (cg + no preconditioner on host)
	culaSparseResult result;
	sc = culaSparseExecutePlan(handle, plan, &config, &result);
	sc = culaSparseGetResultString(handle, &result, buffer, bufsize);
	std::cout << buffer << std::endl;
    	
    	if (culaSparsePreinitializeCuda(handle) == culaSparseNoError)
    {
        // change to cuda accelerated platform
        sc = culaSparseSetCudaPlatform(handle, plan, 0);

        // perform solve (cg + ilu0 on cuda)
        culaSparseGmresOptions solverOpts;
	culaSparseStatus status = culaSparseGmresOptionsInit(handle, &solverOpts);
	solverOpts.restart = 20;


	sc = culaSparseSetCgSolver(handle, plan, 0);

//        sc = culaSparseSetJacobiPreconditioner(handle, plan, 0);
//        sc = culaSparseSetIlu0Preconditioner(handle, plan, 0);
        sc = culaSparseExecutePlan(handle, plan, &config, &result);
        sc = culaSparseGetResultString(handle, &result, buffer, bufsize);
        std::cout << buffer << std::endl;

        // change preconditioner to fainv 
        // this avoids data transfer by using data cached by the plan
//        sc = culaSparseSetIlu0Preconditioner(handle, plan, 0);

        // perform solve (cg + fainv on cuda)
        // these timing results should indicate minimal overhead
/*        sc = culaSparseExecutePlan(handle, plan, &config, &result);
        sc = culaSparseGetResultString(handle, &result, buffer, bufsize);
        std::cout << buffer << std::endl;

        // change solver
        // this avoids preconditioner regeneration by using data cached by the plan
        sc = culaSparseSetBicgstabSolver(handle, plan, 0);

        // perform solve (bicgstab + fainv on cuda)
        // the timing results should indicate minimal overhead and preconditioner generation time
        sc = culaSparseExecutePlan(handle, plan, &config, &result);
        sc = culaSparseGetResultString(handle, &result, buffer, bufsize);
        std::cout << buffer << std::endl;
        
        sc = culaSparseSetGmresSolver(handle, plan, 0);

        // perform solve (bicgstab + fainv on cuda)
        // the timing results should indicate minimal overhead and preconditioner generation time
        sc = culaSparseExecutePlan(handle, plan, &config, &result);
        sc = culaSparseGetResultString(handle, &result, buffer, bufsize);
        std::cout << buffer << std::endl;
    }
    else
    {
        std::cout << "alert: no cuda capable gpu found" << std::endl;
    }

    // cleanup plan
    culaSparseDestroyPlan(plan);

    // cleanup handle
    culaSparseDestroy(handle);

    	FILE* myWriteFile;
  	myWriteFile=fopen("result.txt","w");
  	for (i = 0; i < n; i++)
    	{
		fprintf(myWriteFile,"%1f\n",G.x[i]);
    		//  printf ("\n x [%d] = % f", i, x[i]);
    	}
  	fprintf(myWriteFile,".end\n");
  	fclose(myWriteFile);
 
  	printf ("\n");
    	
    	
//	time_st=dsecnd();
//	solver(G.rowIndex,G.columns,G.Mat_val,G.b,G.x,m+n,G.nonzero);
//	time_end=dsecnd();
//	time_avg = (time_end-time_st);
//	printf("Successfully Solved in : %.6f secs\n",time_avg);
	cerr<<endl;

	cerr<<"Fillup Graph                    ";	
//	time_st=dsecnd();
	G.fillup_graph();
//	time_end=dsecnd();
//	time_avg = (time_end-time_st);
//	cerr<<"Done                 "<<time_avg<<endl;

	//G.output_graph_stdout();
	cerr<<"Matching KCL                    ";
//	time_st=dsecnd();
	G.check_kcl();
//	time_end=dsecnd();
//	time_avg = (time_end-time_st);
//	cerr<<"Done                 "<<time_avg<<endl;
	/*for (int i=0;i<m+n;i++)
	  {
	  cout<<"M"<<i<<endl;
	  for (int j=0;j<m+n;j++)
	  cout<<" "<<j<<"#"<<M[i][j]<<endl;
	  }*/
} 
Esempio n. 11
0
		cublasStatus_t cublasXaxpy(int n, const double* alpha, const double* x, int incx, double* y, int incy) {
			return cublasDaxpy(g_context->cublasHandle, n, alpha, x, incx, y, incy);
		}
int main(int argc, char* argv[])
{
	const int bufsize = 512;
    	char buffer[bufsize];
	int m,n,S;
	double time_st,time_end,time_avg;
	//omp_set_num_threads(2);
//	printf("\n-----------------\nnumber of threads fired = %d\n-----------------\n",(int)omp_get_num_threads());
	if(argc!=2)
	{
		cout<<"Insufficient arguments"<<endl;
		return 1;
	}
	
	graph G;

	cerr<<"Start reading                    ";
//	time_st=dsecnd();
	G.create_graph(argv[1]);
//	time_end=dsecnd();
//	time_avg = (time_end-time_st);
//	cout<<"Success              "<<endl;
//	cerr<<"Reading time                     "<<time_avg<<endl;

	cerr<<"Constructing Matrices            ";
//	time_st=dsecnd();
	G.construct_MNA();
	G.construct_NA();
//	time_end=dsecnd();
//	time_avg = (time_end-time_st);
//	cerr<<"Done                 "<<time_avg<<endl;

//	G.construct_sparse_MNA();
	m=G.node_array.size()-1;
	n=G.voltage_edge_id.size();
	
	cout<<endl;
	cout<<"MATRIX STAT:"<<endl;
	cout<<"Nonzero elements:               "<<G.nonzero<<endl;
	cout<<"Number of Rows:                 "<<m+n<<endl;
	cout<<"Nonzero in G:			"<<G.Gnonzero<<endl;
	cout<<"Number of rows in G:		"<<m<<endl;
	cout<<"Nonzero in P: 			"<<G.Pnonzero<<endl;
	cout<<"Number of rows in P:		"<<m<<endl;


//	printf("\n Nonzero = %d", G.nonzero);
//	printf("\n Rows = %d", m+n);

	cout<<"MAT val:		       "<<endl;
	int i,j;

	G.Mat_val[0] += 100;
	G.Gmat[0] +=100;
/*
	for(i=0;i<G.Gnonzero;i++)
		cout<<" "<<G.Gmat[i];
	cout<<endl;
	for(i=0;i<G.Gnonzero;i++)
		cout<<" "<<G.Gcolumns[i];
	cout<<endl;	
	for(i=0;i<m+1;i++)
		cout<<" "<<G.GrowIndex[i];
	cout<<endl;
	for(i=0;i<m;i++)
		printf(" %.8f", G.b1[i]);
	cout<<endl;
	for(i=0;i<m;i++)
		printf(" %.8f", G.x1[i]);
	cout<<endl;	
	
	
*/	SuiteSparse_long *Gnz = (SuiteSparse_long*)calloc(m,sizeof(SuiteSparse_long));
	for(i=0;i<m;i++)
	{
	//	cout<<endl;
		SuiteSparse_long startindex=G.GrowIndex[i];
		SuiteSparse_long endindex=G.GrowIndex[i+1];
		Gnz[i] = endindex - startindex;

//		for(j=startindex;j<endindex;j++)
//			cout<<" "<<G.Gmat[j];
//		cout<<endl;
		
	}


/*	for(i=0;i<G.Pnonzero;i++)
		cout<<" "<<G.Pmat[i];
	cout<<endl;
	for(i=0;i<G.Pnonzero;i++)
		cout<<" "<<G.Pcolumns[i];
	cout<<endl;	
	for(i=0;i<m+1;i++)
		cout<<" "<<G.ProwIndex[i];
	cout<<endl;
/*	for(i=0;i<m;i++)
		printf(" %.8f", G.b1[i]);
	cout<<endl;
	for(i=0;i<m;i++)
		printf(" %.8f", G.x1[i]);
	cout<<endl;	
	
	
	for(i=0;i<m;i++)
	{
		cout<<endl;
		int startindex=G.ProwIndex[i];
		int endindex=G.ProwIndex[i+1];
		for(j=startindex;j<endindex;j++)
			cout<<" "<<G.Pmat[j];
		cout<<endl;
		
	}
	
/*	for(i=0;i<G.nonzero;i++)
		cout<<" "<<G.Mat_val[i];
	cout<<endl;
	for(i=0;i<G.nonzero;i++)
		cout<<" "<<G.columns[i];
	cout<<endl;	
	for(i=0;i<m+n+1;i++)
		cout<<" "<<G.rowIndex[i];
	cout<<endl;
	for(i=0;i<m+n;i++)
		printf(" %.8f", G.b[i]);
	cout<<endl;
	for(i=0;i<m+n;i++)
		printf(" %.8f", G.x[i]);
	cout<<endl;	
	
	
	for(i=0;i<m+n;i++)
	{
		cout<<endl;
		int startindex=G.rowIndex[i];
		int endindex=G.rowIndex[i+1];
		for(j=startindex;j<endindex;j++)
			cout<<" "<<G.Mat_val[j];
		cout<<endl;
		
	}
*/
/*	for (i=0;i<m+n+1;i++)
	{
		//cout<<endl;
		if(G.rowIndex[i]==G.rowIndex[i+1])
			break;
		
		for(j=G.rowIndex[i];j<G.rowIndex[i+1];j++)
		{
			if(G.Mat_val[j]>10)
				cout<<G.Mat_val[j]<<"\t";
		}
		//cout<<endl;
		/*for(j=G.rowIndex[i];j<G.rowIndex[i+1];j++)
		{
			cout<<G.columns[j]<<"\t";
		}
		//cout<<endl;
	}
	cout<<endl;
*/

//printing the matrix
	printf("\n Fine till here");
	printf("\n");
//	int* rowmIndex=(int*)calloc(m+1,sizeof(int));
	printf("\n Fine till here");
	printf("\n");
	//int rowmIndex[5]={1,2,3,4,5};
/*	for(i=0;i<m+1;i++)
	{
		rowmIndex[i]=G.rowIndex[i];
		printf(" %d", rowmIndex[i]);
	}
*/
	printf("\n Allocating GPU memory\n");
	cudaDeviceReset();
	size_t free, total;
	cudaMemGetInfo(&free, &total);
	printf("\n Free Mem = %lf MB, Total mem = %lf MB\n", (double)(free)/(1024*1024), (double)(total)/(1024*1024));


	double *dev_csrValA, *dev_b, *dev_x;
	int *dev_csrRowIdxA, *dev_csrColA;

	double *dev_GcsrVal, *dev_b1, *dev_x1;
	double *dev_PcsrVal, *dev_b2, *dev_x2;
	
	int *dev_GcsrRowIdx, *dev_PcsrRowIdx, *dev_GcsrCol, *dev_PcsrCol;
	

	
	cudaMalloc((void**)&dev_PcsrVal, G.Pnonzero*sizeof(double));
	cudaMalloc((void**)&dev_PcsrRowIdx, (m+1)*sizeof(int));
	cudaMalloc((void**)&dev_PcsrCol, G.Pnonzero*sizeof(int));


	cudaMalloc((void**)&dev_b1, (m)*sizeof(double));
	cudaMalloc((void**)&dev_b2, n*sizeof(double));
	cudaMalloc((void**)&dev_x1, m*sizeof(double));
	cudaMalloc((void**)&dev_x2, n*sizeof(double));

	cudaMemcpy(dev_b1, G.b1, (m)*sizeof(double), cudaMemcpyHostToDevice);
	cudaMemcpy(dev_x1, G.x1, (m)*sizeof(double), cudaMemcpyHostToDevice);

	cudaMemcpy(dev_PcsrVal, G.Pmat, G.Pnonzero*sizeof(double), cudaMemcpyHostToDevice);
	cudaMemcpy(dev_b2, G.b2, (n)*sizeof(double), cudaMemcpyHostToDevice);
	cudaMemcpy(dev_x2, G.x2, (n)*sizeof(double), cudaMemcpyHostToDevice);
	cudaMemcpy(dev_PcsrRowIdx, G.ProwIndex, (m+1)*sizeof(int), cudaMemcpyHostToDevice);
	cudaMemcpy(dev_PcsrCol, G.Pcolumns, (G.Pnonzero)*sizeof(int), cudaMemcpyHostToDevice);


	/* Matrix has been created and stored in CSR format.
	However, CHOLMOD requires CSC format. Since our matrix is symmetric positive definite, we can simply swap 
	csrColA with csrRowIdx and vice versa
	*/

	/* Starting the CHOLMOD routine now*/
	printf("\n Initiating CHOLMOD\n");
	cholmod_sparse *A, *P;
	cholmod_dense *x, *b, *r, *midvec;
	cholmod_factor *L;
	cholmod_common *Common, cm;
	Common = &cm;
	cholmod_l_start(Common);
//	&Common->useGPU=1;
	printf("\n m = %d, G.Gnonzero = %d\n", m, G.Gnonzero);	


	
	cholmod_sparse *C = cholmod_l_allocate_sparse((size_t)(m), (size_t)(m), (size_t)(G.Gnonzero), 1, 0, 1, 1, Common); 
//	P = cholmod_l_allocate_sparse((size_t)(m), (size_t)(n), (size_t)(G.Pnonzero), 1, 0, 0, 1, Common); 
//	printf("\n Allocated \n");

	C->itype = CHOLMOD_LONG;	
//	printf("\n Itype \n");
	C->p = &G.GrowIndex[0];
//	printf("\n Columns \n");
	C->nz = &Gnz[0];
//	printf("\n Rows \n");
	C->i = &G.Gcolumns[0];
	C->dtype = 0;
	C->x = &G.Gmat[0];
	 
/*	P->itype = CHOLMOD_LONG;
	P->p = &G.ProwIndex[0];
	P->nz = &Pnz[0];
	P->i = &G.Pcolumns[0];
	P->dtype = 0;
	P->x = &G.Pmat[0]; 
*/	 

	b = cholmod_l_allocate_dense((size_t)(m), 1, (size_t)(m), 1, Common);
	b->dtype=0;
	b->x = &G.b1[0];
	b->xtype = 1;

	printf("\n CHOLMOD manually set\n");
	cholmod_l_print_sparse(C, "A", Common);
	cholmod_l_print_dense(b, "b", Common);


	cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	cudaEventRecord(start, 0);

	L = cholmod_l_analyze(C, Common);
	printf("\n Analysis: Flops: %g \t lnz: %g\n", Common->fl, Common->lnz);
	cholmod_l_factorize(C, L, Common);
	x = cholmod_l_solve(CHOLMOD_A, L, b, Common);
	
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	
	

	float elapsedTime;
	cudaEventElapsedTime(&elapsedTime, start, stop);
	printf("\n Time : %.6f secs :\n", elapsedTime);
	
	cholmod_l_print_dense(x, "X", Common);
	
	double *x1_mod = (double*)x->x;
	
	cudaMemcpy(dev_x1, x1_mod, m*sizeof(double), cudaMemcpyHostToDevice);
	
	cusparseStatus_t cuSparseStatus;
	cusparseHandle_t cuspHandle;
	cuSparseStatus = cusparseCreate(&cuspHandle);

	cusparseMatDescr_t descrP;
	cusparseCreateMatDescr(&descrP);

	cusparseSetMatType(descrP, CUSPARSE_MATRIX_TYPE_GENERAL);	
	cusparseSetMatIndexBase(descrP, CUSPARSE_INDEX_BASE_ZERO);
	
	
	double *dev_res1, *dev_simple;
	double *res1 = (double*)calloc(n,sizeof(double));
	cudaMalloc((void**)&dev_res1, n*sizeof(double));
	cudaMalloc((void**)&dev_simple, n*sizeof(double));
	
	const double alpha = 1.0, beta=0.0;
	//alpha = 1.0;
	//beta = 0.0;
	
	//solving P^T * G^-1 * b1 Result stored in dev_res1
	
	cuSparseStatus = cusparseDcsrmv(cuspHandle, CUSPARSE_OPERATION_TRANSPOSE, m, n, G.Pnonzero, &alpha, descrP, dev_PcsrVal, dev_PcsrRowIdx, dev_PcsrCol, dev_x1, &beta, dev_res1);
		
	if(cuSparseStatus == CUSPARSE_STATUS_SUCCESS)
	{
/*		cudaMemcpy(res1, dev_res1, n*sizeof(double), cudaMemcpyDeviceToHost);
		for(i=0;i<n;i++)
		{
			printf("\nres1[%d] = %.8f", i, res1[i]);
		}
		printf("\n P^T * G^-1 * b1 done! Vector stored in res1");
*/	}
	else
	{
		printf("\n P^T * G^-1 * b1 failed\n");
		exit(1);
	}
	
	const double alphaneg = -1.0;
		
		//Solving P^T * G^-1 * b1 - b2 ; Result stored in dev_res1
	
		
	cublasStatus_t cuBlasStatus;
	cublasHandle_t cubHandle;
	cuBlasStatus = cublasCreate(&cubHandle);
		
	cuBlasStatus = cublasDaxpy(cubHandle, n, &alphaneg, dev_b2, 1, dev_res1, 1);
	if(cuBlasStatus == CUBLAS_STATUS_SUCCESS)
	{
//		cudaMemcpy(res1, dev_res1, n*sizeof(double), cudaMemcpyDeviceToHost);
//		for(i=0;i<n;i++)
//		{
//			printf("\nres1[%d] = %.8f", i, res1[i]);
//		}
		printf("\n res1 = res1 - b2 done\n");
		 
	}
	else
	{
		printf("\n res1 = res1 - b2 failed\n");
	}
		
	
	
	
	///NOW COMPUTING G^-1 * P
	
	
	int k = 0;
	int breakloop=0;
	
	double **midMat = (double**)malloc(m*sizeof(double*));
	for(i=0;i<m;i++)
	{
		midMat[i] = (double*)calloc(n,sizeof(double));
	}
	
	cudaEventRecord(start, 0);
	
	for(i=0;i<n;i++)
	{
		breakloop = 0;
		double *vect = (double*)calloc(m,sizeof(double*));

		for(j=0;j<m;j++)
		{
			int startin = G.ProwIndex[j];
			int endin = G.ProwIndex[j+1];
			if(startin == endin)
				continue;
				
			k = startin;

			while(k<endin)
			{

				if(G.Pcolumns[k] == i)
				{	
					vect[j] = G.Pmat[k];
					breakloop=1;
					break;
				
				}
				k++;
			}
			if(breakloop == 1)
			{
				break;
			}
		}
		
		midvec = cholmod_l_allocate_dense((size_t)(m), 1, (size_t)(m), 1, Common);
		midvec->dtype=0;
		midvec->x=&vect[0];
		midvec->xtype = 1;
		
		cholmod_dense *res2;
		
		res2 = cholmod_l_solve(CHOLMOD_A, L, midvec, Common);

		
		double *re = (double*)res2->x;
		
//		printf("\n vector %d is:\n", i);
		int i1, j1, k1;
//		for(j1=0;j1<m;j1++)
//		{
//			midmat2flat[i+j1*n] = re[j1];
//			printf(" %lf", re[j1]);
//		}
//		printf("\n");
		for(i1=0;i1<m;i1++)
		{
			midMat[i1][i] = re[i1];
		}
		
		cholmod_l_free_dense(&midvec, Common);
			
	}
	
/*	printf("\n Midmat = \n");
	for(i=0;i<m;i++)
	{
		for(j=0;j<n;j++)
		{
			printf(" %lf", midMat[i][j]);
		}
		printf("\n");
	} 
*/
	double *midMatflat = (double*)calloc((m*n),sizeof(double));
	double *dev_midMat;
	double *dev_solut;
	int counter = 0;
	for(i=0;i<n;i++)
	{
		for(j=0;j<m;j++)
		{
			midMatflat[counter] = midMat[j][i];
			counter++; 
		}
	}
	
	cudaMalloc((void**)&dev_midMat, m*n*sizeof(double));
	cudaMalloc((void**)&dev_solut, n*n*sizeof(double));
	
	cudaMemcpy(dev_midMat, midMatflat, m*n*sizeof(double), cudaMemcpyHostToDevice);
		
	//Solving P^T * midMat; Result stored in dev_solut
		
	cuSparseStatus = cusparseDcsrmm(cuspHandle, CUSPARSE_OPERATION_TRANSPOSE, m, n, n, G.Pnonzero, &alpha, descrP, dev_PcsrVal, dev_PcsrRowIdx, dev_PcsrCol, dev_midMat, m, &beta, dev_solut, n);
	
	if(cuSparseStatus == CUSPARSE_STATUS_SUCCESS)
	{
		printf("\n Solved P^T * G^-1 * P. Result stored in solut\n");
		
	}  
	else
	{
		printf("\n  Failed to Solve P^T * G^-1 * P \n");
		exit(1);
	}

/*	double *matGflat = (double*)calloc(n*n,sizeof(double));
	cudaMemcpy(matGflat, dev_solut, n*n*sizeof(double), cudaMemcpyDeviceToHost);
	counter = 0;
	printf("\nBefore LU starts\n");
	for(i=0;i<n;i++)
	{
		for(j=0;j<n;j++)
		{
			printf(" %lf ", matGflat[counter]);
			counter++;
		}
		printf("\n");
	}
	printf("\n");
*/	
	cusolverStatus_t cuSolverStatus;
	
	
	cusolverDnHandle_t cudenHandle;
	cuSolverStatus = cusolverDnCreate(&cudenHandle);
	int Lwork = 0;
	cuSolverStatus = cusolverDnDgetrf_bufferSize(cudenHandle, n, n, dev_solut, n, &Lwork);
	
	if(cuSolverStatus == CUSOLVER_STATUS_SUCCESS)
	{
		printf("\n Buffer works\n Lwork = %d\n", Lwork);
	}
	else
	{
		exit(1);	
	}
	double *dev_Workspace;
	int *dev_Ipiv, *dev_Info;
	
	cudaMalloc((void**)&dev_Workspace, Lwork*sizeof(double));
	cudaMalloc((void**)&dev_Ipiv, n*sizeof(int));
	cudaMalloc((void**)&dev_Info, sizeof(int));
	
	//Calculating LU for dev_solut
//	double *nnmat = (double*)calloc(n*n,sizeof(double));
//	cudaMemcpy(nnmat, dev_solut, n*n*sizeof(double), cudaMemcpyDeviceToHost);
//	cuSolverStatus = cusolverDnDgetrfHost(cudenHandle, n, n, 

	cuSolverStatus = cusolverDnDgetrf(cudenHandle, n, n, dev_solut, n, dev_Workspace, dev_Ipiv, dev_Info);
	
	if(cuSolverStatus == CUSOLVER_STATUS_SUCCESS)
	{
		printf("\n solut has be defactorized into L and U. dev_Ipiv * solut = L * U\n");
	}
	else
	{
	
		printf("\n Unable to defactorize solut into LU\n");
		exit(1);	
	}
	
	//solving dev_solut * x = dev_res1. Result stored in dev_res1
	
	cuSolverStatus = cusolverDnDgetrs(cudenHandle, CUBLAS_OP_N, n, 1, dev_solut, n, dev_Ipiv, dev_res1, n, dev_Info);
	if(cuSolverStatus == CUSOLVER_STATUS_SUCCESS)
	{
		printf("\n Solution obtained for x2 \n");
	}
	else
	{
		printf("\n LU decomposition obtained by LU solver failed\n");
	}

/*	cudaMemcpy(G.x2, dev_res1, n*sizeof(double), cudaMemcpyDeviceToHost);
	printf("\n x2 = \n");
	for(i=0;i<n;i++)
	{
		printf("\n x2[%d] = %lf", i, G.x2[i]); 
	}
*/	
	
	double *dev_dummy;
	cudaMalloc((void**)&dev_dummy, m*sizeof(double));
	cudaMemset(dev_dummy, 0.0, m*sizeof(double));
	
	printf("\n Starting solving for x1 \n");
	//Solving for x1
	
		//Solving G^-1 * P * x2;  G^-1 * P is stored in midMat
	cuBlasStatus = cublasDgemv(cubHandle, CUBLAS_OP_N, m, n, &alpha, dev_midMat, m, dev_res1, 1, &beta, dev_dummy, 1);
	if(cuBlasStatus == CUBLAS_STATUS_SUCCESS)
	{
/*		double *toprint = (double*)calloc(m,sizeof(double));
		cudaMemcpy(toprint, dev_dummy, m*sizeof(double), cudaMemcpyDeviceToHost);
		printf("\n Intermediate vector :\n");
		for(i=0;i<m;i++)
		{
			printf("\ndummy[%d] = %lf", i, toprint[i]);
		}
*/		printf("\n midmat * x2 obtained. Stored in dummy\n");
	}
	else
	{
		printf("\n Failed to obtain midmat * x2\n");
	}
	
	cuBlasStatus = cublasDaxpy(cubHandle, m, &alphaneg, dev_dummy, 1, dev_x1, 1);
	if(cuBlasStatus == CUBLAS_STATUS_SUCCESS)
	{
/*		cudaMemcpy(G.x1, dev_x1, m*sizeof(double), cudaMemcpyDeviceToHost);
		printf("\n x1 = \n");
		for(i=0;i<m;i++)
		{
			printf("\n x1[%d] = %.15f", i, G.x1[i]);
		}
*/		printf("\n x1 obtained");
	}
	else
	{
		printf("\n Failed to obtain x1");
	}

	printf("\n Solver finished its work\n");

/*		cudaEventRecord(stop, 0);
		cudaEventSynchronize(stop);

		cudaEventElapsedTime(&elapsedTime, start, stop);
		printf("\n Time: %.6f msecs :\n", elapsedTime);
*/








	
	
	
	
	

	cholmod_l_finish(Common);
	return 0;

} 
void axpy(double alpha, const Vector<double> &x, Vector<double> &y)
{
  assert(x.getSize() == y.getSize()); 
  cublasDaxpy(x.getSize(), alpha, x, x.inc(), y, y.inc());
}
Esempio n. 14
0
GPUMat& GPUMat::operator+=(const GPUMat& rhs) {

    double scale = 1;
    stat=cublasDaxpy(handle,n_elem,&scale,rhs.memptr_GPU(),1,this->memptr_GPU,1);
    return *this;
}
Esempio n. 15
0
File: mpla.cpp Progetto: zaspel/MPLA
void mpla_daxpy(struct mpla_vector* y, double alpha, struct mpla_vector* x, struct mpla_instance* instance)
{
	// compute process-wise axpy
	cublasDaxpy(instance->cublas_handle, x->cur_proc_row_count, &alpha, x->data, 1, y->data, 1);
}
Esempio n. 16
0
//
// Overloaded function for dispatching to
// * CUBLAS backend, and
// * double value-type.
//
inline void axpy( const int n, const double a, const double* x,
        const int incx, double* y, const int incy ) {
    cublasDaxpy( n, a, x, incx, y, incy );
}
Esempio n. 17
0
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
  // At least 2 arguments expected
  // Input and result
  if (nrhs!=6)
     mexErrMsgTxt("Wrong number of arguments");
  if (init == 0) {
    // Initialize function
    // mexLock();
    // load GPUmat
    gm = gmGetGPUmat();
    init = 1;
  }
  /* mex parameters are:
   0 Source array [X or one of the y-edge-zeroed copies of it]
   1 Destination array [Accumulator, basically]
   2 Stack array 1 [We swap XY or YZ planes with this before copying to assure clean shift] - Must be all zeroes and of size max(NxNy, NyNz, NxNz)
   3 Stack array 2
   4 Shift directions
   5 Coefficient on shift */

  // Get GPU array pointers
  GPUtype srcArray    = gm->gputype.getGPUtype(prhs[0]);
  GPUtype dstArray    = gm->gputype.getGPUtype(prhs[1]);
  GPUtype stackArrayX = gm->gputype.getGPUtype(prhs[2]);
//GPUtype stackArrayY = gm->gputype.getGPUtype(prhs[3]);
  GPUtype stackArrayZ = gm->gputype.getGPUtype(prhs[3]);

  // Get some control variables sorted out
  double *shiftdirs  = mxGetPr(prhs[4]);
  const int *dims    = gm->gputype.getSize(srcArray);

  double alpha       = *mxGetPr(prhs[5]);

  int shifts[3];
  shifts[0] = (int)shiftdirs[0];
  shifts[1] = (int)shiftdirs[1];
  shifts[2] = (int)shiftdirs[2];

  double *cubSrc = (double*)gm->gputype.getGPUptr(srcArray);

  // Remove appropriate YZ plane if any
  double *cubDst = (double*)gm->gputype.getGPUptr(stackArrayX);

  if(shifts[0] == -1) cublasDswap(dims[1]*dims[2], cubSrc,             dims[0], cubDst, 1);
  if(shifts[0] ==  1) cublasDswap(dims[1]*dims[2], cubSrc + dims[0]-1, dims[0], cubDst, 1);

  // Remove appropriate XZ plane if any
  //stackSwapXZplane(cubSrc, (double*)gm->gputype.getGPUptr(stackArrayY), (int *)dims, shifts);

  // Remove appropriate XY plane if any
  cubDst = (double*)gm->gputype.getGPUptr(stackArrayZ);

  if(shifts[2] == -1) cublasDswap(dims[0]*dims[1], cubSrc,                               1, cubDst, 1);
  if(shifts[2] ==  1) cublasDswap(dims[0]*dims[1], cubSrc + dims[0]*dims[1]*(dims[2]-1), 1, cubDst, 1);

  // Decide the amount of offset to acheive desired shift
  int theta = shifts[0] + dims[0]*shifts[1] + dims[0]*dims[1]*shifts[2];
  int Ntot  = dims[0] * dims[1] * dims[2];

  cubDst = (double*)gm->gputype.getGPUptr(dstArray);
  if(theta >= 0) {
    cublasDaxpy(Ntot-theta, alpha, cubSrc,         1, cubDst + theta, 1);
  } else {
    cublasDaxpy(Ntot+theta, alpha, cubSrc - theta, 1, cubDst,         1);
  }

  // Replace the XY plane if it was removed
  cubDst = (double*)gm->gputype.getGPUptr(stackArrayZ);
  if(shifts[2] == -1) cublasDswap(dims[0]*dims[1], cubSrc,                               1, cubDst, 1);
  if(shifts[2] ==  1) cublasDswap(dims[0]*dims[1], cubSrc + dims[0]*dims[1]*(dims[2]-1), 1, cubDst, 1);

  // replace the XZ plane if it was removed
  //stackSwapXZplane(cubSrc, (double*)gm->gputype.getGPUptr(stackArrayY), (int *)dims, shifts);

  // Replace the YZ plane if it was removed
  cubDst = (double*)gm->gputype.getGPUptr(stackArrayX);
  if(shifts[0] == -1) cublasDswap(dims[1]*dims[2], cubSrc,             dims[0], cubDst, 1);
  if(shifts[0] ==  1) cublasDswap(dims[1]*dims[2], cubSrc + dims[0]-1, dims[0], cubDst, 1);
}