// Double-precision gemv bridge for the legacy cuBLAS API:
//   y := alpha * op(A) * x + beta * y.
// Only column-major storage is supported, enforced at compile time via
// BOOST_STATIC_ASSERT; the transpose tag is mapped to the cuBLAS option
// through blas_option<Trans>. The legacy cublasDgemv reports errors via
// cublasGetError(), so no status is returned here.
// NOTE(review): Order and Trans appear to be template parameters whose
// template<...> header lies outside this excerpt — confirm.
inline void gemv( const Order, const Trans, const int m, const int n, const double alpha, const double* a, const int lda, const double* x, const int incx, const double beta, double* y, const int incy ) { BOOST_STATIC_ASSERT( (is_same<Order, tag::column_major>::value) ); cublasDgemv( blas_option< Trans >::value, m, n, alpha, a, lda, x, incx, beta, y, incy ); }
static vl::Error gemv(vl::Context& context, char op, ptrdiff_t m, ptrdiff_t n, type alpha, type const * a, ptrdiff_t lda, type const * x, ptrdiff_t incx, type beta, type * y, ptrdiff_t incy) { cublasHandle_t handle ; cublasStatus_t status ; status = context.getCudaHelper().getCublasHandle(&handle) ; if (status != CUBLAS_STATUS_SUCCESS) goto done ; status = cublasDgemv(handle, (op == 't') ? CUBLAS_OP_T : CUBLAS_OP_N, (int)m, (int)n, &alpha, a, lda, x, (int)incx, &beta, y, (int)incy); done: return context.setError (context.getCudaHelper().catchCublasError(status, "cublasDgemv"), __func__) ; }
// Double-precision gemv for Caffe: y := alpha * op(A) * x + beta * y.
// Caffe stores A row-major (M x N) while cuBLAS expects column-major, so the
// transpose flag is deliberately INVERTED (CblasNoTrans -> CUBLAS_OP_T) and
// A is presented to cuBLAS as an N x M matrix with leading dimension N.
// Both vector increments are fixed at 1. Errors abort via CUBLAS_CHECK.
// NOTE(review): the `template <>` specialization header for this explicit
// specialization lies outside this excerpt — confirm.
void caffe_gpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M, const int N, const double alpha, const double* A, const double* x, const double beta, double* y) { cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; CUBLAS_CHECK(cublasDgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1)); }
// gpuarray double gemv: Y := alpha * op(A) * X + beta * Y on device buffers
// A/X/Y with element offsets offA/offX/offY.
// - Rejects sizes/strides that would overflow the 32-bit cuBLAS interface
//   (LARGE_VAL) with GA_XLARGE_ERROR.
// - Row-major input (order == cb_c) is handled by swapping M/N and flipping
//   transA, since cuBLAS is column-major.
// - Runs inside a cuda_enter/cuda_exit section; synchronizes with pending
//   work via cuda_wait (read on A/X, CUDA_WAIT_ALL on Y) before the call and
//   cuda_record after it, each checked with GA_CUDA_EXIT_ON_ERROR.
// - cuBLAS failure maps CUBLAS_STATUS_ARCH_MISMATCH to GA_DEVSUP_ERROR
//   (e.g. doubles unsupported on the device) and anything else to
//   GA_BLAS_ERROR; success returns GA_NO_ERROR.
static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, double beta, gpudata *Y, size_t offY, int incY) { cuda_context *ctx = A->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; size_t t; ASSERT_BUF(A); ASSERT_BUF(X); ASSERT_BUF(Y); if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(M * N) || LARGE_VAL(lda) || LARGE_VAL(incX) || LARGE_VAL(incY)) return GA_XLARGE_ERROR; if (order == cb_c) { t = N; N = M; M = t; if (transA == cb_no_trans) { transA = cb_trans; } else { transA = cb_no_trans; } } cuda_enter(ctx); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_ALL)); h->err = cublasDgemv(h->h, convT(transA), M, N, &alpha, ((double *)A->ptr) + offA, lda, ((double *)X->ptr) + offX, incX, &beta, ((double *)Y->ptr) + offY, incY); if (h->err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); if (h->err == CUBLAS_STATUS_ARCH_MISMATCH) return GA_DEVSUP_ERROR; return GA_BLAS_ERROR; } GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_ALL)); cuda_exit(ctx); return GA_NO_ERROR; }
// gpuarray double gemv (older variant): Y := alpha * op(A) * X + beta * Y on
// device buffers A/X/Y with element offsets offA/offX/offY.
// - Row-major input (order == cb_c) is converted to the column-major form
//   cuBLAS expects by swapping M/N and flipping transA.
// - Synchronization: cuda_wait before the call (read on A/X, read|write on Y)
//   and cuda_record after, inside a cuda_enter/cuda_exit section. Unlike the
//   checked variant of this routine, the return values of cuda_wait and
//   cuda_record are ignored here.
// - No LARGE_VAL overflow guard: size_t M/N are narrowed implicitly at the
//   cublasDgemv call.
// - cuBLAS failure maps CUBLAS_STATUS_ARCH_MISMATCH to GA_DEVSUP_ERROR and
//   anything else to GA_BLAS_ERROR; success returns GA_NO_ERROR.
static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, double beta, gpudata *Y, size_t offY, int incY) { cuda_context *ctx = A->ctx; cublasStatus_t err; size_t t; ASSERT_BUF(A); ASSERT_BUF(X); ASSERT_BUF(Y); if (order == cb_c) { t = N; N = M; M = t; if (transA == cb_no_trans) { transA = cb_trans; } else { transA = cb_no_trans; } } cuda_enter(ctx); cuda_wait(A, CUDA_WAIT_READ); cuda_wait(X, CUDA_WAIT_READ); cuda_wait(Y, CUDA_WAIT_READ|CUDA_WAIT_WRITE); err = cublasDgemv(((blas_handle *)ctx->blas_handle)->h, convT(transA), M, N, &alpha, ((double *)A->ptr) + offA, lda, ((double *)X->ptr) + offX, incX, &beta, ((double *)Y->ptr) + offY, incY); if (err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); if (err == CUBLAS_STATUS_ARCH_MISMATCH) return GA_DEVSUP_ERROR; return GA_BLAS_ERROR; } cuda_record(A, CUDA_WAIT_READ); cuda_record(X, CUDA_WAIT_READ); cuda_record(Y, CUDA_WAIT_READ|CUDA_WAIT_WRITE); cuda_exit(ctx); return GA_NO_ERROR; }
// MAGMA wrapper for double-precision gemv on device data:
//   dy := alpha * op(dA) * dx + beta * dy,
// with dA m x n (lda >= m) and strided vectors dx/dy.
// Thin pass-through to the legacy cuBLAS dgemv; the MAGMA transpose enum is
// translated to the cuBLAS character code by cublas_trans_const(). The legacy
// API reports errors out-of-band, so nothing is returned here.
void magma_dgemv(
    magma_trans_t transA,
    magma_int_t m, magma_int_t n,
    double alpha,
    double const* dA, magma_int_t lda,
    double const* dx, magma_int_t incx,
    double beta,
    double* dy, magma_int_t incy )
{
    cublasDgemv(
        cublas_trans_const( transA ),
        m, n,
        alpha,
        dA, lda,
        dx, incx,
        beta,
        dy, incy );
}
void gemv_gpu(double *a, double *b, double *c1, int N) { #pragma acc data present(a, b, c1) { #pragma acc host_data use_device(a, b, c1) { cublasHandle_t handle; cublasCreate(&handle); const double alpha = 1.0; const double beta = 0.0; cublasDgemv(handle, CUBLAS_OP_T, N, N, &alpha, a, N, b, 1, &beta, c1, 1); cublasDestroy(handle); } } // end pragma data }
// R entry point for double-precision gemv on the GPU:
//   y := alpha * op(A) * x + beta * y  via the legacy cuBLAS API.
// Scalars (transpose flag, alpha/beta, lda, increments) are decoded from
// their R wrappers; unpackVector/unpackMatrix yield the payload pointers and
// dimensions. Errors from the cuBLAS call are surfaced afterwards through
// checkCublasError, tagged with this function's name.
void d_gemv(SEXP rtrans, SEXP ralpha, SEXP ra, SEXP rlda, SEXP rx,
            SEXP rincx, SEXP rbeta, SEXP ry, SEXP rincy)
{
    /* Decode scalar arguments from R. */
    char trans = getTranspose(rtrans);
    double alpha = asReal(ralpha);
    double beta = asReal(rbeta);
    int lda = asInteger(rlda);
    int incx = asInteger(rincx);
    int incy = asInteger(rincy);

    /* Unpack the operands (pointers plus dimensions). */
    double *a;
    double *x;
    double *y;
    int rowsa, colsa, nx, ny;
    unpackVector(rx, &nx, &x);
    unpackVector(ry, &ny, &y);
    unpackMatrix(ra, &rowsa, &colsa, &a);

    /* Perform the product and report any cuBLAS error. */
    cublasDgemv(trans, rowsa, colsa, alpha, a, lda, x, incx, beta, y, incy);
    checkCublasError("d_gemv");
}
// R entry point: matrix-vector product y = op(A) %*% x (right == TRUE) or
// y = x %*% op(A) (right == FALSE) for a "magma" matrix, double precision.
// TA fuses the user's transpose flag with the side flag (transa XOR !right),
// so a left-multiply is implemented as a transposed right-multiply; LENY is
// taken from the corresponding dimension of A and the result is shaped as a
// column (right) or row (!right) vector. Inputs are conformance-checked
// (DIMA[!TA] must equal length(x)).
// Device buffers for A, x, y are allocated with magma_malloc, A and x are
// uploaded, then the gemv is dispatched to magmablas_dgemv or the legacy
// cublasDgemv depending on the object's `gpu` slot, and y is copied back.
// PROTECT count: y, AS_NUMERIC(a), AS_NUMERIC(x) — balanced by UNPROTECT(3).
// NOTE(review): dY is used as a gemv output with beta == 0.0, so its
// uninitialized contents are overwritten — relies on beta==0 semantics.
SEXP magMultmv(SEXP a, SEXP transa, SEXP x, SEXP right) { SEXP gpu = magGetGPU(a, x), y = PROTECT(NEW_OBJECT(MAKE_CLASS("magma"))); int RHS = LOGICAL_VALUE(right), TA = (LOGICAL_VALUE(transa) ^ !RHS), *DIMA = INTEGER(GET_DIM(a)), M = DIMA[0], N = DIMA[1], LENX = LENGTH(x), LENY = DIMA[TA], LDA=M; char TRANSA = (TA ? 'T' : 'N'); double *A = REAL(PROTECT(AS_NUMERIC(a))), *X = REAL(PROTECT(AS_NUMERIC(x))), *dA, *dX, *dY; if(DIMA[!TA] != LENX) error("non-conformable matrices"); y = SET_SLOT(y, install(".Data"), allocMatrix(REALSXP, (RHS ? LENY : 1), (RHS ? 1 : LENY))); SET_SLOT(y, install("gpu"), duplicate(gpu)); magma_malloc((void**)&dA, (M*N)*sizeof(double)); magma_malloc((void**)&dX, LENX*sizeof(double)); magma_malloc((void**)&dY, LENY*sizeof(double)); magma_dsetmatrix(M, N, A, LDA, dA, LDA); magma_dsetvector(LENX, X, 1, dX, 1); if(LOGICAL_VALUE(gpu)) { magmablas_dgemv(TRANSA, M, N, 1.0, dA, LDA, dX, 1, 0.0, dY, 1); } else { cublasDgemv(TRANSA, M, N, 1.0, dA, LDA, dX, 1, 0.0, dY, 1); } magma_dgetvector(LENY, dY, 1, REAL(y), 1); magma_free(dA); magma_free(dX); magma_free(dY); UNPROTECT(3); return y; }
void mpla_dgemv(struct mpla_vector* b, struct mpla_matrix* A, struct mpla_vector* x, struct mpla_instance* instance) { double one = 1; double zero = 0; // allocate redistributed vector struct mpla_vector x_redist; mpla_init_vector_for_block_rows(&x_redist, instance, x->vec_row_count); // redistribute input vector with row-block parallel distribution to column-block parallel distribution mpla_redistribute_vector_for_dgesv(&x_redist, x, A, instance); // computation core: matrix-vector product cublasDgemv((instance->cublas_handle), CUBLAS_OP_N, A->cur_proc_row_count, A->cur_proc_col_count, &one, A->data, A->cur_proc_row_count, x_redist.data, 1, &zero, b->data, 1); // create sub-communicator for each process row int remain_dims[2]; remain_dims[0]=0; remain_dims[1]=1; MPI_Comm row_comm; MPI_Cart_sub(instance->comm, remain_dims, &row_comm); // summation of block row results double* sum; cudaMalloc((void**)&sum, sizeof(double)*b->cur_proc_row_count); cudaThreadSynchronize(); checkCUDAError("cudaMalloc"); MPI_Allreduce(b->data, sum, b->cur_proc_row_count, MPI_DOUBLE, MPI_SUM, row_comm); cudaMemcpy(b->data, sum, sizeof(double)*b->cur_proc_row_count, cudaMemcpyDeviceToDevice); // cleanup cudaFree(sum); mpla_free_vector(&x_redist, instance); MPI_Comm_free(&row_comm); }
// MAGMA-vs-cuBLAS comparison driver (double precision): for each test size,
// runs every wrapped MAGMA BLAS routine (Level 1/2/3) and its raw legacy
// cuBLAS counterpart on identical inputs, stores the difference of the two
// results via cublasDaxpy with coefficient -1, and accumulates its
// lapackf77_dlange norm into total_error — all diffs should be exactly 0.
// Matrices are over-allocated to maxn x maxn so any {m,n,k} x {m,n,k}
// combination fits; dA()/dB()/LU() appear to be element-access macros
// defined outside this excerpt.
int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, t1, t2; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; const char trans[] = { 'N', 'C', 'T' }; const char uplo[] = { 'L', 'U' }; const char diag[] = { 'U', 'N' }; const char side[] = { 'L', 'R' }; double *A, *B, *C, *C2, *LU; double *dA, *dB, *dC1, *dC2; double alpha = MAGMA_D_MAKE( 0.5, 0.1 ); double beta = MAGMA_D_MAKE( 0.7, 0.2 ); double dalpha = 0.6; double dbeta = 0.8; double work[1], error, total_error; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t m, n, k, size, maxn, ld, info; magma_int_t *piv; magma_err_t err; magma_opts opts; parse_opts( argc, argv, &opts ); printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" ); total_error = 0.; for( int i = 0; i < opts.ntest; ++i ) { m = opts.msize[i]; n = opts.nsize[i]; k = opts.ksize[i]; printf("=========================================================================\n"); printf( "M %d, N %d, K %d\n", (int) m, (int) n, (int) k ); // allocate matrices // over-allocate so they can be any combination of {m,n,k} x {m,n,k}. 
// Workspace allocation (pinned host + device buffers, maxn x maxn each),
// random initialization via dlarnv, then the Level-1 tests: DSWAP (swap two
// columns with magma and cublas, diff the matrices) and IDAMAX (compare
// argmax indices column by column).
// NOTE(review): the DSWAP error norm is taken over m x k (&m, &k) although
// the swapped/diffed matrices are set up as m x n — confirm intent.
maxn = max( max( m, n ), k ); ld = maxn; size = maxn*maxn; err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) ); assert( err == 0 ); err = magma_dmalloc_pinned( &A, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &B, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &C, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &C2, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &LU, size ); assert( err == 0 ); err = magma_dmalloc( &dA, size ); assert( err == 0 ); err = magma_dmalloc( &dB, size ); assert( err == 0 ); err = magma_dmalloc( &dC1, size ); assert( err == 0 ); err = magma_dmalloc( &dC2, size ); assert( err == 0 ); // initialize matrices size = maxn*maxn; lapackf77_dlarnv( &ione, ISEED, &size, A ); lapackf77_dlarnv( &ione, ISEED, &size, B ); lapackf77_dlarnv( &ione, ISEED, &size, C ); printf( "========== Level 1 BLAS ==========\n" ); // ----- test DSWAP // swap 2nd and 3rd columns of dA, then copy to C2 and compare with A assert( n >= 4 ); magma_dsetmatrix( m, n, A, ld, dA, ld ); magma_dsetmatrix( m, n, A, ld, dB, ld ); magma_dswap( m, dA(0,1), 1, dA(0,2), 1 ); magma_dswap( m, dB(0,1), 1, dB(0,2), 1 ); // check results, storing diff between magma and cuda calls in C2 cublasDaxpy( ld*n, c_neg_one, dA, 1, dB, 1 ); magma_dgetmatrix( m, n, dB, ld, C2, ld ); error = lapackf77_dlange( "F", &m, &k, C2, &ld, work ); total_error += error; printf( "dswap diff %.2g\n", error ); // ----- test IDAMAX // get argmax of column of A magma_dsetmatrix( m, k, A, ld, dA, ld ); error = 0; for( int j = 0; j < k; ++j ) { magma_int_t i1 = magma_idamax( m, dA(0,j), 1 ); magma_int_t i2 = cublasIdamax( m, dA(0,j), 1 ); assert( i1 == i2 ); error += abs( i1 - i2 ); } total_error += error; gflops = (double)m * k / 1e9; printf( "idamax diff %.2g\n", error ); printf( "\n" ); printf( "========== Level 2 BLAS ==========\n" ); // ----- test DGEMV // c = alpha*A*b + beta*c, with A m*n; b,c m or n-vectors // try no-trans/trans for( int ia = 0; ia < 3; ++ia ) { 
// Level-2 tests: DGEMV over all three transpose modes (result length m or n
// depending on trans), DSYMV over upper/lower — each timed with
// magma_sync_wtime and diffed magma-vs-cublas as above.
magma_dsetmatrix( m, n, A, ld, dA, ld ); magma_dsetvector( maxn, B, 1, dB, 1 ); magma_dsetvector( maxn, C, 1, dC1, 1 ); magma_dsetvector( maxn, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_dgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 size = (trans[ia] == 'N' ? m : n); cublasDaxpy( size, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetvector( size, dC2, 1, C2, 1 ); error = lapackf77_dlange( "F", &size, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_DGEMV( m, n ) / 1e9; printf( "dgemv( %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", trans[ia], error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test DSYMV // c = alpha*A*b + beta*c, with A m*m symmetric; b,c m-vectors // try upper/lower for( int iu = 0; iu < 2; ++iu ) { magma_dsetmatrix( m, m, A, ld, dA, ld ); magma_dsetvector( m, B, 1, dB, 1 ); magma_dsetvector( m, C, 1, dC1, 1 ); magma_dsetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_dsymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( m, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_dlange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYMV( m ) / 1e9; printf( "dsymv( %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test DTRSV // solve A*c = c, with A m*m triangular; c m-vector // try upper/lower, no-trans/trans, unit/non-unit diag // Factor A into LU to get well-conditioned triangles, else solve yields garbage. 
// DTRSV preparation and test: LU-factor A for well-conditioned triangles,
// mirror the lower triangle into the upper one, then sweep all combinations
// of uplo x trans x diag. Afterwards the Level-3 sweeps begin with DGEMM
// over all transpose combinations.
// Still can give garbage if solves aren't consistent with LU factors, // e.g., using unit diag for U, so copy lower triangle to upper triangle. // Also used for trsm later. lapackf77_dlacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld ); lapackf77_dgetrf( &maxn, &maxn, LU, &ld, piv, &info ); for( int j = 0; j < maxn; ++j ) { for( int i = 0; i < j; ++i ) { *LU(i,j) = *LU(j,i); } } for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { magma_dsetmatrix( m, m, LU, ld, dA, ld ); magma_dsetvector( m, C, 1, dC1, 1 ); magma_dsetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_dtrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDtrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( m, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_dlange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_DTRSM( MagmaLeft, m, 1 ) / 1e9; printf( "dtrsv( %c, %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], diag[id], error, gflops/t1, gflops/t2 ); }}} printf( "\n" ); printf( "========== Level 3 BLAS ==========\n" ); // ----- test DGEMM // C = alpha*A*B + beta*C, with A m*k or k*m; B k*n or n*k; C m*n // try combinations of no-trans/trans for( int ia = 0; ia < 3; ++ia ) { for( int ib = 0; ib < 3; ++ib ) { bool nta = (trans[ia] == 'N'); bool ntb = (trans[ib] == 'N'); magma_dsetmatrix( (nta ? m : k), (nta ? m : k), A, ld, dA, ld ); magma_dsetmatrix( (ntb ? k : n), (ntb ? 
n : k), B, ld, dB, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DGEMM( m, n, k ) / 1e9; printf( "dgemm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", trans[ia], trans[ib], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DSYMM // C = alpha*A*B + beta*C (left) with A m*m symmetric; B,C m*n; or // C = alpha*B*A + beta*C (right) with A n*n symmetric; B,C m*n // try left/right, upper/lower for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { magma_dsetmatrix( m, m, A, ld, dA, ld ); magma_dsetmatrix( m, n, B, ld, dB, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dsymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYMM( side[is], m, n ) / 1e9; printf( "dsymm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", side[is], uplo[iu], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DSYRK // C = alpha*A*A^H + beta*C (no-trans) with A m*k and C m*m 
symmetric; or // C = alpha*A^H*A + beta*C (trans) with A k*m and C m*m symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { magma_dsetmatrix( n, k, A, ld, dA, ld ); magma_dsetmatrix( n, n, C, ld, dC1, ld ); magma_dsetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dsyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYRK( k, n ) / 1e9; printf( "dsyrk( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DSYR2K // C = alpha*A*B^H + ^alpha*B*A^H + beta*C (no-trans) with A,B n*k; C n*n symmetric; or // C = alpha*A^H*B + ^alpha*B^H*A + beta*C (trans) with A,B k*n; C n*n symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { bool nt = (trans[it] == 'N'); magma_dsetmatrix( (nt ? n : k), (nt ? 
n : k), A, ld, dA, ld ); magma_dsetmatrix( n, n, C, ld, dC1, ld ); magma_dsetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dsyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYR2K( k, n ) / 1e9; printf( "dsyr2k( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DTRMM // C = alpha*A*C (left) with A m*m triangular; C m*n; or // C = alpha*C*A (right) with A n*n triangular; C m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == 'L'); magma_dsetmatrix( (left ? m : n), (left ? 
m : n), A, ld, dA, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dtrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDtrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DTRMM( side[is], m, n ) / 1e9; printf( "dtrmm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // ----- test DTRSM // solve A*X = alpha*B (left) with A m*m triangular; B m*n; or // solve X*A = alpha*B (right) with A n*n triangular; B m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == 'L'); magma_dsetmatrix( (left ? m : n), (left ? 
m : n), LU, ld, dA, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dtrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDtrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DTRSM( side[is], m, n ) / 1e9; printf( "dtrsm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // cleanup magma_free_cpu( piv ); magma_free_pinned( A ); magma_free_pinned( B ); magma_free_pinned( C ); magma_free_pinned( C2 ); magma_free_pinned( LU ); magma_free( dA ); magma_free( dB ); magma_free( dC1 ); magma_free( dC2 ); } if ( total_error != 0. ) { printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n", total_error ); } else { printf( "all tests passed\n" ); } TESTING_FINALIZE(); return 0; }
// MAGMA dgemv performance/accuracy tester: for each requested size, times
// cublasDgemv (v2 API), magmablas_dgemv, and the CPU blasf77_dgemv on the
// same random inputs, then reports Gflop/s and the max-norm error of each
// GPU result against the CPU reference (scaled by Ym, compared to a
// machine-epsilon-based tolerance). `status` accumulates failures and is
// the process exit code. Xm/Ym are the x/y lengths implied by transA; lda
// is rounded up to a multiple of 32.
// NOTE(review): `handle` in the cublasDgemv call is not declared in this
// excerpt — presumably provided by TESTING_INIT() or opts; confirm.
// NOTE(review): the magma_dgetvector after the MAGMABLAS run passes incx
// for Y's increment while every other Y transfer uses incy — harmless
// while both are 1, but inconsistent; confirm intent.
int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf, cpu_time; double magma_error, cublas_error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t M, N, Xm, Ym, lda, sizeA, sizeX, sizeY; magma_int_t incx = 1; magma_int_t incy = 1; double c_neg_one = MAGMA_D_NEG_ONE; double alpha = MAGMA_D_MAKE( 1.5, -2.3 ); double beta = MAGMA_D_MAKE( -0.6, 0.8 ); double *A, *X, *Y, *Ycublas, *Ymagma; double *dA, *dX, *dY; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("trans = %s\n", lapack_trans_const(opts.transA) ); printf(" M N MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA error CUBLAS error\n"); printf("===================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = ((M+31)/32)*32; gflops = FLOPS_DGEMV( M, N ) / 1e9; if ( opts.transA == MagmaNoTrans ) { Xm = N; Ym = M; } else { Xm = M; Ym = N; } sizeA = lda*N; sizeX = incx*Xm; sizeY = incy*Ym; TESTING_MALLOC_CPU( A, double, sizeA ); TESTING_MALLOC_CPU( X, double, sizeX ); TESTING_MALLOC_CPU( Y, double, sizeY ); TESTING_MALLOC_CPU( Ycublas, double, sizeY ); TESTING_MALLOC_CPU( Ymagma, double, sizeY ); TESTING_MALLOC_DEV( dA, double, sizeA ); TESTING_MALLOC_DEV( dX, double, sizeX ); TESTING_MALLOC_DEV( dY, double, sizeY ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &sizeA, A ); lapackf77_dlarnv( &ione, ISEED, &sizeX, X ); lapackf77_dlarnv( &ione, ISEED, &sizeY, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_dsetmatrix( M, N, A, lda, dA, lda ); magma_dsetvector( Xm, X, incx, dX, 
incx ); magma_dsetvector( Ym, Y, incy, dY, incy ); cublas_time = magma_sync_wtime( 0 ); cublasDgemv( handle, cublas_trans_const(opts.transA), M, N, &alpha, dA, lda, dX, incx, &beta, dY, incy ); cublas_time = magma_sync_wtime( 0 ) - cublas_time; cublas_perf = gflops / cublas_time; magma_dgetvector( Ym, dY, incy, Ycublas, incy ); /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ magma_dsetvector( Ym, Y, incy, dY, incy ); magma_time = magma_sync_wtime( 0 ); magmablas_dgemv( opts.transA, M, N, alpha, dA, lda, dX, incx, beta, dY, incy ); magma_time = magma_sync_wtime( 0 ) - magma_time; magma_perf = gflops / magma_time; magma_dgetvector( Ym, dY, incx, Ymagma, incx ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_dgemv( lapack_trans_const(opts.transA), &M, &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ blasf77_daxpy( &Ym, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_dlange( "M", &Ym, &ione, Ymagma, &Ym, work ) / Ym; blasf77_daxpy( &Ym, &c_neg_one, Y, &incy, Ycublas, &incy ); cublas_error = lapackf77_dlange( "M", &Ym, &ione, Ycublas, &Ym, work ) / Ym; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error, (magma_error < tol && cublas_error < tol ? "ok" : "failed")); status += ! 
(magma_error < tol && cublas_error < tol); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ycublas ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/*
 * Circuit solver via block elimination of the MNA (modified nodal analysis)
 * system  [ G  P ] [x1]   [b1]
 *         [ P^T 0] [x2] = [b2]
 *
 * Steps (as implemented below):
 *   1. Cholesky-factorize G with CHOLMOD and solve G*y = b1.
 *   2. res1  = P^T * y - b2                       (cuSPARSE csrmv + cuBLAS axpy)
 *   3. midMat = G^{-1} * P, column by column      (CHOLMOD solves)
 *   4. solut = P^T * midMat  (the Schur complement, n x n)   (cuSPARSE csrmm)
 *   5. x2 solves solut * x2 = res1                (cuSOLVER LU: getrf/getrs)
 *   6. x1 = y - midMat * x2                       (cuBLAS gemv + axpy)
 *
 * Usage: <program> <netlist-file>
 * Returns 0 on success, 1 on bad arguments; exits(1) on GPU-library failure.
 */
int main(int argc, char* argv[])
{
    int m, n;   // m: number of G rows (nodes minus reference), n: columns of P
    int i, j;

    if (argc != 2) {
        cout << "Insufficient arguments" << endl;
        return 1;
    }

    graph G;
    cerr << "Start reading ";
    G.create_graph(argv[1]);

    cerr << "Constructing Matrices ";
    G.construct_MNA();
    G.construct_NA();

    m = G.node_array.size() - 1;
    n = G.voltage_edge_id.size();

    cout << endl;
    cout << "MATRIX STAT:" << endl;
    cout << "Nonzero elements: " << G.nonzero << endl;
    cout << "Number of Rows: " << m + n << endl;
    cout << "Nonzero in G: " << G.Gnonzero << endl;
    cout << "Number of rows in G: " << m << endl;
    cout << "Nonzero in P: " << G.Pnonzero << endl;
    cout << "Number of rows in P: " << m << endl;

    cout << "MAT val: " << endl;
    // NOTE(review): ad-hoc bump of the (0,0) entries — presumably a grounding /
    // regularization hack to make G positive definite.  TODO confirm intent.
    G.Mat_val[0] += 100;
    G.Gmat[0] += 100;

    // Per-row nonzero counts of G; CHOLMOD's unpacked sparse format reads ->nz.
    SuiteSparse_long *Gnz = (SuiteSparse_long*)calloc(m, sizeof(SuiteSparse_long));
    for (i = 0; i < m; i++) {
        Gnz[i] = G.GrowIndex[i + 1] - G.GrowIndex[i];
    }

    printf("\n Fine till here");
    printf("\n");
    printf("\n Fine till here");
    printf("\n");

    printf("\n Allocating GPU memory\n");
    cudaDeviceReset();
    // Renamed from free/total: `free` shadowed ::free.
    size_t free_mem, total_mem;
    cudaMemGetInfo(&free_mem, &total_mem);
    printf("\n Free Mem = %lf MB, Total mem = %lf MB\n",
           (double)(free_mem) / (1024 * 1024), (double)(total_mem) / (1024 * 1024));

    // Device copies of P (CSR) and the RHS / solution vectors.
    double *dev_PcsrVal, *dev_b1, *dev_b2, *dev_x1, *dev_x2;
    int *dev_PcsrRowIdx, *dev_PcsrCol;

    cudaMalloc((void**)&dev_PcsrVal, G.Pnonzero * sizeof(double));
    cudaMalloc((void**)&dev_PcsrRowIdx, (m + 1) * sizeof(int));
    cudaMalloc((void**)&dev_PcsrCol, G.Pnonzero * sizeof(int));
    cudaMalloc((void**)&dev_b1, m * sizeof(double));
    cudaMalloc((void**)&dev_b2, n * sizeof(double));
    cudaMalloc((void**)&dev_x1, m * sizeof(double));
    cudaMalloc((void**)&dev_x2, n * sizeof(double));

    cudaMemcpy(dev_b1, G.b1, m * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_x1, G.x1, m * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_PcsrVal, G.Pmat, G.Pnonzero * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b2, G.b2, n * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_x2, G.x2, n * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_PcsrRowIdx, G.ProwIndex, (m + 1) * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_PcsrCol, G.Pcolumns, G.Pnonzero * sizeof(int), cudaMemcpyHostToDevice);

    /* G is stored in CSR but CHOLMOD wants CSC.  Since G is symmetric,
       CSR(G) and CSC(G) share the same arrays, so they are used as-is. */
    printf("\n Initiating CHOLMOD\n");
    cholmod_dense *x, *b, *midvec;
    cholmod_factor *L;
    cholmod_common *Common, cm;
    Common = &cm;
    cholmod_l_start(Common);

    printf("\n m = %d, G.Gnonzero = %d\n", m, G.Gnonzero);
    // args: nrow, ncol, nzmax, sorted=1, packed=0, stype=1 (upper), xtype=1 (real)
    cholmod_sparse *C = cholmod_l_allocate_sparse((size_t)m, (size_t)m,
                                                  (size_t)G.Gnonzero,
                                                  1, 0, 1, 1, Common);
    // NOTE(review): the arrays allocated inside C are orphaned by these pointer
    // swaps; C now borrows G's storage, so C itself is deliberately not freed.
    C->itype = CHOLMOD_LONG;
    C->p = &G.GrowIndex[0];
    C->nz = &Gnz[0];
    C->i = &G.Gcolumns[0];
    C->dtype = 0;
    C->x = &G.Gmat[0];

    b = cholmod_l_allocate_dense((size_t)m, 1, (size_t)m, 1, Common);
    b->dtype = 0;
    // Copy b1 into CHOLMOD-owned storage instead of repointing b->x
    // (the original pointer swap leaked the allocated buffer).
    memcpy(b->x, &G.b1[0], m * sizeof(double));
    b->xtype = 1;

    printf("\n CHOLMOD manually set\n");
    cholmod_l_print_sparse(C, "A", Common);
    cholmod_l_print_dense(b, "b", Common);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    L = cholmod_l_analyze(C, Common);
    printf("\n Analysis: Flops: %g \t lnz: %g\n", Common->fl, Common->lnz);
    cholmod_l_factorize(C, L, Common);
    x = cholmod_l_solve(CHOLMOD_A, L, b, Common);   // x = G^{-1} b1

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);
    // BUG FIX: cudaEventElapsedTime reports milliseconds; convert before
    // printing as seconds (the original printed the raw ms value as "secs").
    printf("\n Time : %.6f secs :\n", elapsedTime / 1000.0f);

    cholmod_l_print_dense(x, "X", Common);
    double *x1_mod = (double*)x->x;
    cudaMemcpy(dev_x1, x1_mod, m * sizeof(double), cudaMemcpyHostToDevice);

    cusparseStatus_t cuSparseStatus;
    cusparseHandle_t cuspHandle;
    cuSparseStatus = cusparseCreate(&cuspHandle);

    cusparseMatDescr_t descrP;
    cusparseCreateMatDescr(&descrP);
    cusparseSetMatType(descrP, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descrP, CUSPARSE_INDEX_BASE_ZERO);

    double *dev_res1;
    cudaMalloc((void**)&dev_res1, n * sizeof(double));

    const double alpha = 1.0, beta = 0.0;

    // res1 = P^T * (G^{-1} b1)
    cuSparseStatus = cusparseDcsrmv(cuspHandle, CUSPARSE_OPERATION_TRANSPOSE,
                                    m, n, G.Pnonzero, &alpha, descrP,
                                    dev_PcsrVal, dev_PcsrRowIdx, dev_PcsrCol,
                                    dev_x1, &beta, dev_res1);
    if (cuSparseStatus != CUSPARSE_STATUS_SUCCESS) {
        printf("\n P^T * G^-1 * b1 failed\n");
        exit(1);
    }

    const double alphaneg = -1.0;

    cublasStatus_t cuBlasStatus;
    cublasHandle_t cubHandle;
    cuBlasStatus = cublasCreate(&cubHandle);

    // res1 = res1 - b2
    cuBlasStatus = cublasDaxpy(cubHandle, n, &alphaneg, dev_b2, 1, dev_res1, 1);
    if (cuBlasStatus == CUBLAS_STATUS_SUCCESS) {
        printf("\n res1 = res1 - b2 done\n");
    } else {
        printf("\n res1 = res1 - b2 failed\n");
    }

    // midMat = G^{-1} * P, solved one column of P at a time.
    int k = 0;
    int breakloop = 0;
    double **midMat = (double**)malloc(m * sizeof(double*));
    for (i = 0; i < m; i++) {
        midMat[i] = (double*)calloc(n, sizeof(double));
    }

    cudaEventRecord(start, 0);
    for (i = 0; i < n; i++) {
        breakloop = 0;
        // Build column i of P directly in CHOLMOD-owned storage.
        // (BUG FIX: originally a host buffer was calloc'd with
        // sizeof(double*) and swapped into midvec->x, leaking the buffer
        // cholmod_l_allocate_dense had allocated.)
        midvec = cholmod_l_allocate_dense((size_t)m, 1, (size_t)m, 1, Common);
        midvec->dtype = 0;
        midvec->xtype = 1;
        double *mv = (double*)midvec->x;
        memset(mv, 0, m * sizeof(double));
        for (j = 0; j < m; j++) {
            int startin = G.ProwIndex[j];
            int endin = G.ProwIndex[j + 1];
            if (startin == endin) continue;
            k = startin;
            while (k < endin) {
                if (G.Pcolumns[k] == i) {
                    mv[j] = G.Pmat[k];
                    breakloop = 1;
                    break;
                }
                k++;
            }
            // Stops after the first hit — assumes each column of P has at
            // most one nonzero (incidence-style matrix).  TODO confirm.
            if (breakloop == 1) break;
        }
        cholmod_dense *res2 = cholmod_l_solve(CHOLMOD_A, L, midvec, Common);
        double *re = (double*)res2->x;
        int i1;
        for (i1 = 0; i1 < m; i1++) {
            midMat[i1][i] = re[i1];
        }
        cholmod_l_free_dense(&midvec, Common);
        cholmod_l_free_dense(&res2, Common);   // BUG FIX: res2 leaked per iteration
    }

    // Flatten midMat to column-major, as cuBLAS/cuSPARSE dense routines expect.
    double *midMatflat = (double*)calloc((size_t)m * n, sizeof(double));
    double *dev_midMat;
    double *dev_solut;
    int counter = 0;
    for (i = 0; i < n; i++) {
        for (j = 0; j < m; j++) {
            midMatflat[counter] = midMat[j][i];
            counter++;
        }
    }

    cudaMalloc((void**)&dev_midMat, m * n * sizeof(double));
    cudaMalloc((void**)&dev_solut, n * n * sizeof(double));
    cudaMemcpy(dev_midMat, midMatflat, m * n * sizeof(double), cudaMemcpyHostToDevice);

    // solut = P^T * midMat  (n x n Schur complement), ldb = m, ldc = n.
    cuSparseStatus = cusparseDcsrmm(cuspHandle, CUSPARSE_OPERATION_TRANSPOSE,
                                    m, n, n, G.Pnonzero, &alpha, descrP,
                                    dev_PcsrVal, dev_PcsrRowIdx, dev_PcsrCol,
                                    dev_midMat, m, &beta, dev_solut, n);
    if (cuSparseStatus == CUSPARSE_STATUS_SUCCESS) {
        printf("\n Solved P^T * G^-1 * P. Result stored in solut\n");
    } else {
        printf("\n Failed to Solve P^T * G^-1 * P \n");
        exit(1);
    }

    cusolverStatus_t cuSolverStatus;
    cusolverDnHandle_t cudenHandle;
    cuSolverStatus = cusolverDnCreate(&cudenHandle);

    int Lwork = 0;
    cuSolverStatus = cusolverDnDgetrf_bufferSize(cudenHandle, n, n, dev_solut, n, &Lwork);
    if (cuSolverStatus == CUSOLVER_STATUS_SUCCESS) {
        printf("\n Buffer works\n Lwork = %d\n", Lwork);
    } else {
        exit(1);
    }

    double *dev_Workspace;
    int *dev_Ipiv, *dev_Info;
    cudaMalloc((void**)&dev_Workspace, Lwork * sizeof(double));
    cudaMalloc((void**)&dev_Ipiv, n * sizeof(int));
    cudaMalloc((void**)&dev_Info, sizeof(int));

    // LU-factorize solut in place (P_ipiv * solut = L * U).
    cuSolverStatus = cusolverDnDgetrf(cudenHandle, n, n, dev_solut, n,
                                      dev_Workspace, dev_Ipiv, dev_Info);
    if (cuSolverStatus == CUSOLVER_STATUS_SUCCESS) {
        printf("\n solut has be defactorized into L and U. dev_Ipiv * solut = L * U\n");
    } else {
        printf("\n Unable to defactorize solut into LU\n");
        exit(1);
    }

    // Solve solut * x2 = res1; x2 overwrites dev_res1.
    cuSolverStatus = cusolverDnDgetrs(cudenHandle, CUBLAS_OP_N, n, 1, dev_solut, n,
                                      dev_Ipiv, dev_res1, n, dev_Info);
    if (cuSolverStatus == CUSOLVER_STATUS_SUCCESS) {
        printf("\n Solution obtained for x2 \n");
    } else {
        printf("\n LU decomposition obtained by LU solver failed\n");
    }

    double *dev_dummy;
    cudaMalloc((void**)&dev_dummy, m * sizeof(double));
    // BUG FIX: cudaMemset takes an int byte value; 0.0 (a double) was passed.
    cudaMemset(dev_dummy, 0, m * sizeof(double));

    printf("\n Starting solving for x1 \n");

    // dummy = midMat * x2   (column-major m x n times n-vector)
    cuBlasStatus = cublasDgemv(cubHandle, CUBLAS_OP_N, m, n, &alpha, dev_midMat, m,
                               dev_res1, 1, &beta, dev_dummy, 1);
    if (cuBlasStatus == CUBLAS_STATUS_SUCCESS) {
        printf("\n midmat * x2 obtained. Stored in dummy\n");
    } else {
        printf("\n Failed to obtain midmat * x2\n");
    }

    // x1 = x1 - dummy  (dev_x1 currently holds G^{-1} b1)
    cuBlasStatus = cublasDaxpy(cubHandle, m, &alphaneg, dev_dummy, 1, dev_x1, 1);
    if (cuBlasStatus == CUBLAS_STATUS_SUCCESS) {
        printf("\n x1 obtained");
    } else {
        printf("\n Failed to obtain x1");
    }

    printf("\n Solver finished its work\n");

    // ---- cleanup (previously everything leaked) ----
    cudaFree(dev_dummy);
    cudaFree(dev_Workspace);
    cudaFree(dev_Ipiv);
    cudaFree(dev_Info);
    cudaFree(dev_midMat);
    cudaFree(dev_solut);
    cudaFree(dev_res1);
    cudaFree(dev_PcsrVal);
    cudaFree(dev_PcsrRowIdx);
    cudaFree(dev_PcsrCol);
    cudaFree(dev_b1);
    cudaFree(dev_b2);
    cudaFree(dev_x1);
    cudaFree(dev_x2);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cusolverDnDestroy(cudenHandle);
    cusparseDestroyMatDescr(descrP);
    cusparseDestroy(cuspHandle);
    cublasDestroy(cubHandle);

    for (i = 0; i < m; i++) free(midMat[i]);
    free(midMat);
    free(midMatflat);

    cholmod_l_free_dense(&x, Common);
    cholmod_l_free_dense(&b, Common);
    cholmod_l_free_factor(&L, Common);
    // C is intentionally not freed: it borrows G's CSR arrays and Gnz.
    cholmod_l_finish(Common);
    free(Gnz);

    return 0;
}