cudaError_t cudaFreeHost(void *ptr)
{
    /* Unpin the buffer in the guest before asking the host to release it. */
    cudaHostUnregister(ptr);

    VirtioQCArg arg;
    memset(&arg, 0, sizeof(VirtioQCArg));

    /* ptr() is a helper defined elsewhere that packs the pointer (with a
     * size of 0) into the command argument. */
    ptr(arg.pA, ptr, 0);

    send_cmd_to_device(VIRTQC_cudaFreeHost, &arg);
    return (cudaError_t)arg.cmd;
}
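All of the snippets collected here follow the same discipline: a buffer obtained from the host allocator is page-locked with cudaHostRegister() and must be released with cudaHostUnregister() before it is handed back to free(). A minimal, error-checked sketch of that pairing (illustrative only, not taken from any of the projects above):

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

/* Minimal sketch: pin an existing heap allocation, use it, unpin before free. */
int main(void)
{
    size_t bytes = 1 << 20;
    float *buf = (float *)malloc(bytes);
    if (!buf) return 1;

    /* Page-lock the allocation so the GPU can DMA directly from it. */
    cudaError_t err = cudaHostRegister(buf, bytes, cudaHostRegisterDefault);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaHostRegister failed: %s\n", cudaGetErrorString(err));
        free(buf);
        return 1;
    }

    /* ... transfers or kernels that read or write buf ... */

    /* Always unregister before releasing the memory back to the allocator. */
    err = cudaHostUnregister(buf);
    if (err != cudaSuccess)
        fprintf(stderr, "cudaHostUnregister failed: %s\n", cudaGetErrorString(err));
    free(buf);
    return 0;
}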
void dfft_cuda_destroy_plan(dfft_plan plan)
{
    dfft_destroy_plan_common(plan, 1);

#ifndef ENABLE_MPI_CUDA
    cudaHostUnregister(plan.h_stage_in);
    cudaHostUnregister(plan.h_stage_out);
    free(plan.h_stage_in);
    free(plan.h_stage_out);
#endif

    int dmax = plan.max_depth + 2;
    int d;
    for (d = 0; d < dmax; ++d)
    {
        cudaFree(plan.d_rev_j1[d]);
        cudaFree(plan.d_rev_partial[d]);
        cudaFree(plan.d_rev_global[d]);
        cudaFree(plan.d_c0[d]);
        cudaFree(plan.d_c1[d]);
    }
    for (d = 0; d < plan.max_depth; ++d)
    {
        cudaFree(plan.d_alpha[d]);
        free(plan.h_alpha[d]);
    }

    free(plan.d_c0);
    free(plan.d_c1);
    free(plan.d_rev_j1);
    free(plan.d_rev_partial);
    free(plan.d_rev_global);
    if (plan.max_depth)
    {
        free(plan.d_alpha);
        free(plan.h_alpha);
    }

    cudaFree(plan.d_pidx);
    cudaFree(plan.d_pdim);
    cudaFree(plan.d_iembed);
    cudaFree(plan.d_oembed);
    cudaFree(plan.d_length);
}
template <typename DT>
void DataBuffer<DT>::page_unlk ()
{
    cuda_check (cudaHostUnregister ( data_.dptr));
    cuda_check (cudaHostUnregister ( pred_.dptr));
    cuda_check (cudaHostUnregister (label_.dptr));
}
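page_unlk() above releases page-locked buffers; the registering counterpart is not part of this excerpt. A hypothetical sketch of what it might look like, assuming invented data_bytes()/pred_bytes()/label_bytes() helpers for the buffer sizes (none of these names come from the original code):

/* Hypothetical counterpart to page_unlk(): pin the same three buffers.
 * data_bytes(), pred_bytes() and label_bytes() are invented here purely for
 * illustration; the real class would supply its own size information. */
template <typename DT>
void DataBuffer<DT>::page_lk ()
{
    cuda_check (cudaHostRegister ( data_.dptr,  data_bytes(), cudaHostRegisterDefault));
    cuda_check (cudaHostRegister ( pred_.dptr,  pred_bytes(), cudaHostRegisterDefault));
    cuda_check (cudaHostRegister (label_.dptr, label_bytes(), cudaHostRegisterDefault));
}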
void cblas_sgemm(const enum CBLAS_ORDER Order,
                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
                 const int M, const int N, const int K,
                 const float alpha, const float *A, const int lda,
                 const float *B, const int ldb,
                 const float beta, float *C, const int ldc)
{
    cublasOperation_t transa, transb;
    cublasStatus_t status;

    /*---error handler---*/
    int nrowa, ncola, nrowb, ncolb;
    if (TransA == CblasNoTrans) {
        nrowa = M;
        ncola = K;
    } else {
        nrowa = K;
        ncola = M;
    }
    if (TransB == CblasNoTrans) {
        nrowb = K;
        ncolb = N;
    } else {
        nrowb = N;
        ncolb = K;
    }
    int nrowc = M;
    int ncolc = N;
    int info = 0;
    if      (CBLasTransToCuBlasTrans(TransA, &transa) < 0) info = 1;
    else if (CBLasTransToCuBlasTrans(TransB, &transb) < 0) info = 2;
    else if (M < 0)                                        info = 3;
    else if (N < 0)                                        info = 4;
    else if (K < 0)                                        info = 5;
    else if (lda < MAX(1, nrowa))                          info = 8;
    else if (ldb < MAX(1, nrowb))                          info = 10;
    else if (ldc < MAX(1, M))                              info = 13;
    if (info != 0) {
        xerbla_(ERROR_NAME, &info);
        return;
    }
    /*-------------------*/

    /*----dispatcher-----*/
    int type = 0; //1:cpu 2:cublasxt 3:blasx
    if (M <= 0 || N <= 0 || K <= 0) type = 1;
    if (type == 0 && (M > 1000 || N > 1000 || K > 1000)) type = 3;
    else type = 1;
    //Blasx_Debug_Output("type after dispatcher:%d\n",type);
    /*-------------------*/

    switch (type) {
        case 1:
        CPU_BLAS:
            Blasx_Debug_Output("calling cblas_sgemm:");
            if (cpublas_handle == NULL) blasx_init(CPU);
            if (cblas_sgemm_p == NULL) blasx_init_cblas_func(&cblas_sgemm_p, "cblas_sgemm");
            (*cblas_sgemm_p)(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
            break;
        case 2:
            if (cublasXt_handle == NULL) blasx_init(CUBLASXT);
            Blasx_Debug_Output("calling cublasSgemmXt:");
            status = cublasXtSgemm(cublasXt_handle,
                                   transa, transb,
                                   M, N, K,
                                   (float*)&alpha, (float*)A, lda,
                                   (float*)B, ldb,
                                   (float*)&beta, (float*)C, ldc);
            if (status != CUBLAS_STATUS_SUCCESS) goto CPU_BLAS;
            break;
        case 3:
            Blasx_Debug_Output("calling BLASX:\n");
            /* Pin the input/output matrices so tile transfers can use DMA; the
             * const qualifiers are cast away because cudaHostRegister takes void*. */
            cudaHostRegister((void*)A, sizeof(float)*nrowa*ncola, cudaHostRegisterPortable);
            cudaHostRegister((void*)B, sizeof(float)*nrowb*ncolb, cudaHostRegisterPortable);
            cudaHostRegister((void*)C, sizeof(float)*nrowc*ncolc, cudaHostRegisterPortable);
#ifdef BENCHMARK
            double Gflops = FLOPS_DGEMM(M, N, K)/(1000000000);
            double gpu_start, gpu_end;
            gpu_start = get_cur_time();
#endif
            if (is_blasx_enable == 0) blasx_init(BLASX);
            assert( is_blasx_enable == 1 );
            assert( SYS_GPUS > 0 );
            assert( event_SGEMM[0] != NULL );
            assert( C_dev_SGEMM[0] != NULL );
            assert( handles_SGEMM[0] != NULL );
            assert( streams_SGEMM[0] != NULL );
            LRU_t* LRUs[10];
            int GPU_id = 0;
            for (GPU_id = 0; GPU_id < SYS_GPUS; GPU_id++) LRUs[GPU_id] = LRU_init( GPU_id );
            blasx_sgemm(SYS_GPUS, handles_SGEMM, LRUs,
                        TransA, TransB,
                        M, N, K,
                        alpha, A, lda,
                        B, ldb,
                        beta, C, ldc);
            for (GPU_id = 0; GPU_id < SYS_GPUS; GPU_id++) LRU_free( LRUs[GPU_id], GPU_id );
#ifdef BENCHMARK
            gpu_end = get_cur_time();
            printf("BLASX (M:%5d,N:%5d,K:%5d) Speed:%9.1f type:%2d\n",
                   M, N, K, (double)Gflops/(gpu_end - gpu_start), type);
#endif
            /* Unpin before returning; the caller still owns these buffers. */
            cudaHostUnregister((void*)A);
            cudaHostUnregister((void*)B);
            cudaHostUnregister((void*)C);
            break;
        default:
            break;
    }
    //Blasx_Debug_Output("eventually use type:%d to compute\n",type);
}
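The BLASX branch above ignores the return value of cudaHostRegister(), so a failed registration (for instance on a very large matrix) goes unnoticed and the later cudaHostUnregister() calls may then report errors. A small hedged wrapper, with the invented name try_pin(), that makes the fallback explicit:

#include <cuda_runtime.h>
#include <stdio.h>

/* Illustrative helper (not part of BLASX): try to page-lock a buffer and
 * report whether it succeeded, so the caller knows if unregister is needed. */
static int try_pin(const void *ptr, size_t bytes)
{
    cudaError_t err = cudaHostRegister((void *)ptr, bytes, cudaHostRegisterPortable);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaHostRegister(%zu bytes) failed: %s\n",
                bytes, cudaGetErrorString(err));
        return 0;
    }
    return 1;
}

/* Usage sketch (variable names as in the sgemm snippet above):
 *   int A_pinned = try_pin(A, sizeof(float)*nrowa*ncola);
 *   ...
 *   if (A_pinned) cudaHostUnregister((void *)A);
 */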
void cg_solve(OperatorType& A,
              const VectorType& b,
              VectorType& x,
              Matvec matvec,
              typename OperatorType::LocalOrdinalType max_iter,
              typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& tolerance,
              typename OperatorType::LocalOrdinalType& num_iters,
              typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& normr,
              timer_type* my_cg_times)
{
  typedef typename OperatorType::ScalarType ScalarType;
  typedef typename OperatorType::GlobalOrdinalType GlobalOrdinalType;
  typedef typename OperatorType::LocalOrdinalType LocalOrdinalType;
  typedef typename TypeTraits<ScalarType>::magnitude_type magnitude_type;

  timer_type t0 = 0, tWAXPY = 0, tDOT = 0, tMATVEC = 0, tMATVECDOT = 0;
  timer_type total_time = mytimer();

  int myproc = 0;
#ifdef HAVE_MPI
  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
#endif

  if (!A.has_local_indices) {
    std::cerr << "miniFE::cg_solve ERROR, A.has_local_indices is false, needs to be true. This probably means "
              << "miniFE::make_local_matrix(A) was not called prior to calling miniFE::cg_solve."
              << std::endl;
    return;
  }

  size_t nrows = A.rows.size();
  LocalOrdinalType ncols = A.num_cols;

  nvtxRangeId_t r1 = nvtxRangeStartA("Allocation of Temporary Vectors");
  VectorType r(b.startIndex, nrows);
  VectorType p(0, ncols);
  VectorType Ap(b.startIndex, nrows);
  nvtxRangeEnd(r1);

#ifdef HAVE_MPI
#ifndef GPUDIRECT
  //TODO move outside?
  cudaHostRegister(&p.coefs[0], ncols*sizeof(typename VectorType::ScalarType), 0);
  cudaCheckError();
  if (A.send_buffer.size() > 0)
    cudaHostRegister(&A.send_buffer[0], A.send_buffer.size()*sizeof(typename VectorType::ScalarType), 0);
  cudaCheckError();
#endif
#endif

  normr = 0;
  magnitude_type rtrans = 0;
  magnitude_type oldrtrans = 0;

  LocalOrdinalType print_freq = max_iter/10;
  if (print_freq > 50) print_freq = 50;
  if (print_freq < 1)  print_freq = 1;

  ScalarType one = 1.0;
  ScalarType zero = 0.0;

  TICK(); waxpby(one, x, zero, x, p); TOCK(tWAXPY);
  TICK(); matvec(A, p, Ap); TOCK(tMATVEC);
  TICK(); waxpby(one, b, -one, Ap, r); TOCK(tWAXPY);
  TICK(); rtrans = dot(r, r); TOCK(tDOT);

  normr = std::sqrt(rtrans);

  if (myproc == 0) {
    std::cout << "Initial Residual = " << normr << std::endl;
  }

  magnitude_type brkdown_tol = std::numeric_limits<magnitude_type>::epsilon();

#ifdef MINIFE_DEBUG
  std::ostream& os = outstream();
  os << "brkdown_tol = " << brkdown_tol << std::endl;
#endif

  for (LocalOrdinalType k = 1; k <= max_iter && normr > tolerance; ++k) {
    if (k == 1) {
      TICK(); waxpby(one, r, zero, r, p); TOCK(tWAXPY);
    }
    else {
      oldrtrans = rtrans;
      TICK(); rtrans = dot(r, r); TOCK(tDOT);
      magnitude_type beta = rtrans/oldrtrans;
      TICK(); waxpby(one, r, beta, p, p); TOCK(tWAXPY);
    }

    normr = std::sqrt(rtrans);

    if (myproc == 0 && (k%print_freq == 0 || k == max_iter)) {
      std::cout << "Iteration = " << k << " Residual = " << normr << std::endl;
    }

    magnitude_type alpha = 0;
    magnitude_type p_ap_dot = 0;

    TICK(); matvec(A, p, Ap); TOCK(tMATVEC);
    TICK(); p_ap_dot = dot(Ap, p); TOCK(tDOT);

#ifdef MINIFE_DEBUG
    os << "iter " << k << ", p_ap_dot = " << p_ap_dot;
    os.flush();
#endif
    //TODO remove false below
    if (false && p_ap_dot < brkdown_tol) {
      if (p_ap_dot < 0 || breakdown(p_ap_dot, Ap, p)) {
        std::cerr << "miniFE::cg_solve ERROR, numerical breakdown!" << std::endl;
#ifdef MINIFE_DEBUG
        os << "ERROR, numerical breakdown!" << std::endl;
#endif
        //update the timers before jumping out.
        my_cg_times[WAXPY] = tWAXPY;
        my_cg_times[DOT] = tDOT;
        my_cg_times[MATVEC] = tMATVEC;
        my_cg_times[TOTAL] = mytimer() - total_time;
        return;
      }
      else brkdown_tol = 0.1 * p_ap_dot;
    }

    alpha = rtrans/p_ap_dot;
#ifdef MINIFE_DEBUG
    os << ", rtrans = " << rtrans << ", alpha = " << alpha << std::endl;
#endif

    TICK(); waxpby(one, x, alpha, p, x);
            waxpby(one, r, -alpha, Ap, r); TOCK(tWAXPY);

    num_iters = k;
  }

#ifdef HAVE_MPI
#ifndef GPUDIRECT
  //TODO move outside?
  cudaHostUnregister(&p.coefs[0]);
  cudaCheckError();
  if (A.send_buffer.size() > 0)
    cudaHostUnregister(&A.send_buffer[0]);
  cudaCheckError();
#endif
#endif

  my_cg_times[WAXPY] = tWAXPY;
  my_cg_times[DOT] = tDOT;
  my_cg_times[MATVEC] = tMATVEC;
  my_cg_times[MATVECDOT] = tMATVECDOT;
  my_cg_times[TOTAL] = mytimer() - total_time;
}
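cg_solve() leans on a cudaCheckError() helper defined elsewhere in miniFE. One plausible definition, shown here only as an assumption so the calls above read in context, is a macro around cudaGetLastError():

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

/* Assumed definition, for illustration only: the real miniFE helper may differ. */
#define cudaCheckError()                                                    \
    do {                                                                    \
        cudaError_t e = cudaGetLastError();                                 \
        if (e != cudaSuccess) {                                             \
            fprintf(stderr, "CUDA error %s:%d: %s\n",                       \
                    __FILE__, __LINE__, cudaGetErrorString(e));             \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)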
static int RunTest(int *iparam, double *dparam, real_Double_t *t_)
{
    double *A = NULL, *AT, *b, *bT, *x;
    PLASMA_desc *descA, *descB, *descL;
    real_Double_t t;
    int *piv;
    int nb, nb2, nt;
    int n     = iparam[TIMING_N];
    int nrhs  = iparam[TIMING_NRHS];
    int check = iparam[TIMING_CHECK];
    int lda = n;
    int ldb = n;

    /* Initialize Plasma */
    PLASMA_Init( iparam[TIMING_THRDNBR] );
    if ( iparam[TIMING_SCHEDULER] )
        PLASMA_Set(PLASMA_SCHEDULING_MODE, PLASMA_DYNAMIC_SCHEDULING );
    else
        PLASMA_Set(PLASMA_SCHEDULING_MODE, PLASMA_STATIC_SCHEDULING );

#if defined(PLASMA_CUDA)
    core_cublas_init();
#endif

    /*if ( !iparam[TIMING_AUTOTUNING] ) {*/
    PLASMA_Disable(PLASMA_AUTOTUNING);
    PLASMA_Set(PLASMA_TILE_SIZE,        iparam[TIMING_NB] );
    PLASMA_Set(PLASMA_INNER_BLOCK_SIZE, iparam[TIMING_IB] );
    /* } else { */
    /*     PLASMA_Get(PLASMA_TILE_SIZE,        &iparam[TIMING_NB] ); */
    /*     PLASMA_Get(PLASMA_INNER_BLOCK_SIZE, &iparam[TIMING_IB] ); */
    /* } */

    nb  = iparam[TIMING_NB];
    nb2 = nb * nb;
    nt  = n / nb + ((n % nb == 0) ? 0 : 1);

    /* Allocate Data */
    AT = (double *)malloc(nt*nt*nb2*sizeof(double));

    /* Check if unable to allocate memory */
    if ( !AT ) {
        printf("Out of Memory \n ");
        exit(0);
    }

#if defined(PLASMA_CUDA)
    /* Pin the tile matrix so host<->device tile transfers can use DMA. */
    cudaHostRegister(AT, nt*nt*nb2*sizeof(double), cudaHostRegisterPortable);
#endif

    /* Initialize Data */
    PLASMA_Desc_Create(&descA, AT, PlasmaRealDouble, nb, nb, nb*nb, n, n, 0, 0, n, n);
    LAPACKE_dlarnv_work(1, ISEED, nt*nt*nb2, AT);

    /* Allocate Workspace */
    PLASMA_Alloc_Workspace_dgesv_incpiv_Tile(n, &descL, &piv);
    {
        int NB, MT, NT;
        size_t size;
        NB = nb;
        NT = (n%NB==0) ? (n/NB) : ((n/NB)+1);
        MT = (n%NB==0) ? (n/NB) : ((n/NB)+1);
        size = (size_t)MT*NT*NB * sizeof(int);
#if defined(PLASMA_CUDA)
        cudaHostRegister((void*)piv, size, cudaHostRegisterPortable);
#endif
    }
#if defined(PLASMA_CUDA)
    cudaHostRegister((void*)descL->mat, descL->lm*descL->ln*sizeof(double), cudaHostRegisterPortable);
#endif

    /* Save AT in lapack layout for check */
    if ( check ) {
        A = (double *)malloc(lda*n*sizeof(double));
        PLASMA_Tile_to_Lapack(descA, (void*)A, n);
    }

    t = -cWtime();
    PLASMA_dgetrf_incpiv_Tile( descA, descL, piv );
    t += cWtime();
    *t_ = t;

    /* Check the solution */
    if ( check ) {
        b  = (double *)malloc(ldb*nrhs*sizeof(double));
        bT = (double *)malloc(nt*nb2 *sizeof(double));
        x  = (double *)malloc(ldb*nrhs*sizeof(double));

        LAPACKE_dlarnv_work(1, ISEED, n*nrhs, b);
        PLASMA_Desc_Create(&descB, bT, PlasmaRealDouble, nb, nb, nb*nb, n, nrhs, 0, 0, n, nrhs);
        PLASMA_Lapack_to_Tile((void*)b, n, descB);

        PLASMA_dgetrs_incpiv_Tile( descA, descL, piv, descB );

        PLASMA_Tile_to_Lapack(descB, (void*)x, n);

        dparam[TIMING_RES] = d_check_solution(n, n, nrhs, A, lda, b, x, ldb,
                                              &(dparam[TIMING_ANORM]),
                                              &(dparam[TIMING_BNORM]),
                                              &(dparam[TIMING_XNORM]));
        PLASMA_Desc_Destroy(&descB);
        free( A ); free( b ); free( bT ); free( x );
    }

    /* Deallocate Workspace */
#if defined(PLASMA_CUDA)
    /* Unpin the workspace before its memory is released with the handle. */
    cudaHostUnregister(descL->mat);
#endif
    PLASMA_Dealloc_Handle_Tile(&descL);
    PLASMA_Desc_Destroy(&descA);
    PLASMA_Finalize();

#if defined(PLASMA_CUDA)
    cudaHostUnregister(AT);
    cudaHostUnregister(piv);
#endif
    free( AT );
    free( piv );

    return 0;
}
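The PLASMA driver registers its tile matrix and pivot array purely to speed up host/device transfers during the factorization. A standalone sketch (independent of PLASMA, sizes chosen arbitrarily) that measures the effect of cudaHostRegister() on a single cudaMemcpy:

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
    size_t bytes = (size_t)256 << 20;            /* 256 MiB host buffer */
    double *h = (double *)malloc(bytes);
    double *d = NULL;
    if (!h || cudaMalloc((void **)&d, bytes) != cudaSuccess) return 1;
    memset(h, 0, bytes);

    cudaEvent_t t0, t1;
    cudaEventCreate(&t0);
    cudaEventCreate(&t1);
    float ms_pageable = 0.f, ms_pinned = 0.f;

    /* Transfer from ordinary pageable memory. */
    cudaEventRecord(t0, 0);
    cudaMemcpy(d, h, bytes, cudaMemcpyHostToDevice);
    cudaEventRecord(t1, 0);
    cudaEventSynchronize(t1);
    cudaEventElapsedTime(&ms_pageable, t0, t1);

    /* Pin the same allocation and repeat the transfer. */
    cudaHostRegister(h, bytes, cudaHostRegisterPortable);
    cudaEventRecord(t0, 0);
    cudaMemcpy(d, h, bytes, cudaMemcpyHostToDevice);
    cudaEventRecord(t1, 0);
    cudaEventSynchronize(t1);
    cudaEventElapsedTime(&ms_pinned, t0, t1);
    cudaHostUnregister(h);

    printf("pageable: %.2f ms, pinned: %.2f ms\n", ms_pageable, ms_pinned);

    cudaEventDestroy(t0);
    cudaEventDestroy(t1);
    cudaFree(d);
    free(h);
    return 0;
}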