// Drop-in replacement for cudaHostAlloc() built on malloc + cudaHostRegister:
// over-allocate, align the pointer, then pin the aligned region.
cudaError_t cudaHostAlloc(void **pHost, size_t size, unsigned int flags)
{
    void *a_UA = malloc(size + MEMORY_ALIGNMENT);
    *pHost = (void *) ALIGN_UP(a_UA, MEMORY_ALIGNMENT);
    return cudaHostRegister(*pHost, size, flags);
}
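// A minimal usage sketch of the wrapper above (MEMORY_ALIGNMENT and ALIGN_UP
// come from the surrounding sample; the demo function below is hypothetical).
// Two caveats are baked into the wrapper: the original unaligned malloc
// pointer is discarded, so the block can never be handed back to free(); and
// memory pinned via cudaHostRegister must be released with
// cudaHostUnregister, never cudaFreeHost.
void demo(size_t n)
{
    float *h_data = NULL;
    if (cudaHostAlloc((void**)&h_data, n * sizeof(float), 0) == cudaSuccess) {
        /* ... use h_data as pinned memory for async copies ... */
        cudaHostUnregister(h_data);  /* unpin; the underlying malloc block leaks */
    }
}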
void setTransposeCudaMpi(domain_t domain)
{
    CHECK_CUBLAS( cublasCreate(&cublasHandle) );

    alpha[0].x = 1.0f;
    alpha[0].y = 0.0f;

    size = NXSIZE*NY*NZ*sizeof(float2);

    // Allocate host staging buffers and pin them so they can be used for
    // asynchronous transfers and MPI communication.
    aux_host1 = (float2*)malloc(size);
    aux_host2 = (float2*)malloc(size);

    CHECK_CUDART( cudaHostRegister(aux_host1, size, 0) );
    CHECK_CUDART( cudaHostRegister(aux_host2, size, 0) );

    return;
}
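// The corresponding teardown for setTransposeCudaMpi is not shown; a minimal
// sketch of what it would need, assuming the same globals (the function name
// unsetTransposeCudaMpi is hypothetical):
void unsetTransposeCudaMpi(void)
{
    CHECK_CUDART( cudaHostUnregister(aux_host1) );  /* unpin before free */
    CHECK_CUDART( cudaHostUnregister(aux_host2) );
    free(aux_host1);
    free(aux_host2);
    CHECK_CUBLAS( cublasDestroy(cublasHandle) );
}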
static int RunTest(int *iparam, float *dparam, real_Double_t *t_)
{
    float *AT, *BT, *CT;
    float *A = NULL, *B = NULL, *C1 = NULL, *C2 = NULL;
    float alpha, beta;
    PLASMA_desc *descA, *descB, *descC;
    real_Double_t t;
    int nb, nb2, nt;
    int n     = iparam[TIMING_N];
    int check = iparam[TIMING_CHECK];
    int lda   = n;

    /* Initialize Plasma */
    PLASMA_Init( iparam[TIMING_THRDNBR] );
    if ( iparam[TIMING_SCHEDULER] )
        PLASMA_Set(PLASMA_SCHEDULING_MODE, PLASMA_DYNAMIC_SCHEDULING );
    else
        PLASMA_Set(PLASMA_SCHEDULING_MODE, PLASMA_STATIC_SCHEDULING );

    /* if ( !iparam[TIMING_AUTOTUNING] ) { */
    PLASMA_Disable(PLASMA_AUTOTUNING);
    PLASMA_Set(PLASMA_TILE_SIZE, iparam[TIMING_NB] );
    /* } else { */
    /*     PLASMA_Get(PLASMA_TILE_SIZE, &iparam[TIMING_NB] ); */
    /* } */

    nb  = iparam[TIMING_NB];
    nb2 = nb * nb;
    nt  = n / nb + ((n % nb == 0) ? 0 : 1);

    /* Allocate Data */
    AT = (float *)malloc(nt*nt*nb2*sizeof(float));
    BT = (float *)malloc(nt*nt*nb2*sizeof(float));
    CT = (float *)malloc(nt*nt*nb2*sizeof(float));

    /* Check if unable to allocate memory */
    if ( (!AT) || (!BT) || (!CT) ) {
        printf("Out of memory\n");
        exit(0);
    }

#if defined(PLASMA_CUDA)
    /* Pin the tile storage so GPU transfers can be asynchronous */
    cudaHostRegister(AT, nt*nt*nb2*sizeof(float), cudaHostRegisterPortable);
    cudaHostRegister(BT, nt*nt*nb2*sizeof(float), cudaHostRegisterPortable);
    cudaHostRegister(CT, nt*nt*nb2*sizeof(float), cudaHostRegisterPortable);
#endif

    /* Initialize Data */
    LAPACKE_slarnv_work(1, ISEED, 1, &alpha);
    LAPACKE_slarnv_work(1, ISEED, 1, &beta);
    LAPACKE_slarnv_work(1, ISEED, nt*nt*nb2, AT);
    LAPACKE_slarnv_work(1, ISEED, nt*nt*nb2, BT);
    LAPACKE_slarnv_work(1, ISEED, nt*nt*nb2, CT);

    /* Create tile descriptors for A, B and C */
    PLASMA_Desc_Create(&descA, AT, PlasmaRealFloat, nb, nb, nb*nb, n, n, 0, 0, n, n);
    PLASMA_Desc_Create(&descB, BT, PlasmaRealFloat, nb, nb, nb*nb, n, n, 0, 0, n, n);
    PLASMA_Desc_Create(&descC, CT, PlasmaRealFloat, nb, nb, nb*nb, n, n, 0, 0, n, n);

    if (check) {
        C2 = (float *)malloc(n*lda*sizeof(float));
        PLASMA_Tile_to_Lapack(descC, (void*)C2, n);
    }

#if defined(PLASMA_CUDA)
    core_cublas_init();
#endif

    t = -cWtime();
    PLASMA_sgemm_Tile( PlasmaNoTrans, PlasmaNoTrans, alpha, descA, descB, beta, descC );
    t += cWtime();
    *t_ = t;

    /* Check the solution */
    if (check) {
        A = (float *)malloc(n*lda*sizeof(float));
        PLASMA_Tile_to_Lapack(descA, (void*)A, n);
        free(AT);

        B = (float *)malloc(n*lda*sizeof(float));
        PLASMA_Tile_to_Lapack(descB, (void*)B, n);
        free(BT);

        C1 = (float *)malloc(n*lda*sizeof(float));
        PLASMA_Tile_to_Lapack(descC, (void*)C1, n);
        free(CT);

        dparam[TIMING_RES] = s_check_gemm( PlasmaNoTrans, PlasmaNoTrans, n, n, n,
                                           alpha, A, lda, B, lda, beta, C1, C2, lda,
                                           &(dparam[TIMING_ANORM]),
                                           &(dparam[TIMING_BNORM]),
                                           &(dparam[TIMING_XNORM]));
        free(C2);
    }
    else {
        free( AT );
        free( BT );
        free( CT );
    }

    PLASMA_Desc_Destroy(&descA);
    PLASMA_Desc_Destroy(&descB);
    PLASMA_Desc_Destroy(&descC);

    PLASMA_Finalize();

    return 0;
}
static int RunTest(int *iparam, double *dparam, real_Double_t *t_)
{
    double *A = NULL, *AT, *b = NULL, *bT, *x;
    PLASMA_desc *descA, *descB, *descT;
    real_Double_t t;
    int nb, nb2, nt;
    int n     = iparam[TIMING_N];
    int nrhs  = iparam[TIMING_NRHS];
    int check = iparam[TIMING_CHECK];
    int lda = n;
    int ldb = n;

    /* Initialize Plasma */
    PLASMA_Init( iparam[TIMING_THRDNBR] );
    if ( iparam[TIMING_SCHEDULER] )
        PLASMA_Set(PLASMA_SCHEDULING_MODE, PLASMA_DYNAMIC_SCHEDULING );
    else
        PLASMA_Set(PLASMA_SCHEDULING_MODE, PLASMA_STATIC_SCHEDULING );

#if defined(PLASMA_CUDA)
    core_cublas_init();
#endif

    /* if ( !iparam[TIMING_AUTOTUNING] ) { */
    PLASMA_Disable(PLASMA_AUTOTUNING);
    PLASMA_Set(PLASMA_TILE_SIZE,        iparam[TIMING_NB] );
    PLASMA_Set(PLASMA_INNER_BLOCK_SIZE, iparam[TIMING_IB] );
    /* } else { */
    /*     PLASMA_Get(PLASMA_TILE_SIZE,        &iparam[TIMING_NB] ); */
    /*     PLASMA_Get(PLASMA_INNER_BLOCK_SIZE, &iparam[TIMING_IB] ); */
    /* } */

    nb  = iparam[TIMING_NB];
    nb2 = nb * nb;
    nt  = n / nb + ((n % nb == 0) ? 0 : 1);

    /* Allocate Data */
    AT = (double *)malloc(nt*nt*nb2*sizeof(double));

    /* Check if unable to allocate memory */
    if ( !AT ) {
        printf("Out of memory\n");
        exit(0);
    }

#if defined(PLASMA_CUDA)
    /* Pin the tile storage so GPU transfers can be asynchronous */
    cudaHostRegister((void*)AT, nt*nt*nb2*sizeof(double), cudaHostRegisterPortable);
#endif

    /* Initialize Data */
    PLASMA_Desc_Create(&descA, AT, PlasmaRealDouble, nb, nb, nb*nb, n, n, 0, 0, n, n);
    LAPACKE_dlarnv_work(1, ISEED, nt*nt*nb2, AT);

    /* Allocate Workspace */
    PLASMA_Alloc_Workspace_dgels_Tile(n, n, &descT);
#if defined(PLASMA_CUDA)
    cudaHostRegister((void*)descT->mat, descT->lm*descT->ln*sizeof(double),
                     cudaHostRegisterPortable);
#endif

    /* Save AT in lapack layout for check */
    if ( check ) {
        A = (double *)malloc(lda*n*sizeof(double));
        PLASMA_Tile_to_Lapack(descA, (void*)A, n);
    }

    t = -cWtime();
    PLASMA_dgeqrf_Tile( descA, descT );
    t += cWtime();
    *t_ = t;

    /* Check the solution */
    if ( check ) {
        b  = (double *)malloc(ldb*nrhs*sizeof(double));
        bT = (double *)malloc(nt*nb2 *sizeof(double));
        x  = (double *)malloc(ldb*nrhs*sizeof(double));

        LAPACKE_dlarnv_work(1, ISEED, nt*nb2, bT);
        PLASMA_Desc_Create(&descB, bT, PlasmaRealDouble, nb, nb, nb*nb, n, nrhs, 0, 0, n, nrhs);
        PLASMA_Tile_to_Lapack(descB, (void*)b, n);

        PLASMA_dgeqrs_Tile( descA, descT, descB );

        PLASMA_Tile_to_Lapack(descB, (void*)x, n);

        dparam[TIMING_RES] = d_check_solution(n, n, nrhs, A, lda, b, x, ldb,
                                              &(dparam[TIMING_ANORM]),
                                              &(dparam[TIMING_BNORM]),
                                              &(dparam[TIMING_XNORM]));

        PLASMA_Desc_Destroy(&descB);
        free( A );
        free( b );
        free( bT );
        free( x );
    }

    /* Free Workspace */
    PLASMA_Dealloc_Handle_Tile(&descT);
    PLASMA_Desc_Destroy(&descA);
    free( AT );
    PLASMA_Finalize();

    return 0;
}
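// Note on the two PLASMA timing drivers above: when PLASMA_CUDA is defined,
// AT/BT/CT (and descT->mat) are pinned with cudaHostRegister but later
// released with a plain free(), with no matching cudaHostUnregister(). A
// minimal sketch of a paired helper that keeps register/unregister balanced;
// the names pinned_malloc/pinned_free are hypothetical, not part of PLASMA:
#include <stdlib.h>
#include <cuda_runtime.h>

static void *pinned_malloc(size_t bytes)
{
    void *p = malloc(bytes);
    if (p != NULL && cudaHostRegister(p, bytes, cudaHostRegisterPortable) != cudaSuccess) {
        free(p);        /* registration failed: give the memory back */
        return NULL;
    }
    return p;
}

static void pinned_free(void *p)
{
    if (p != NULL) {
        cudaHostUnregister(p);  /* unpin before returning pages to the allocator */
        free(p);
    }
}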
template <typename DT>
void DataBuffer<DT>::page_lock ()
{
    // Pin the data, prediction and label buffers so host<->device copies
    // can run asynchronously.
    cuda_check (cudaHostRegister ( data_.dptr,  data_.size_d(), cudaHostRegisterPortable));
    cuda_check (cudaHostRegister ( pred_.dptr,  pred_.size_d(), cudaHostRegisterPortable));
    cuda_check (cudaHostRegister (label_.dptr, label_.size_d(), cudaHostRegisterPortable));
}
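// A matching unpin routine is implied but not shown. A minimal sketch, assuming
// DataBuffer declares such a member and keeps the same three buffers (the name
// page_unlock is hypothetical):
template <typename DT>
void DataBuffer<DT>::page_unlock ()
{
    cuda_check (cudaHostUnregister ( data_.dptr));
    cuda_check (cudaHostUnregister ( pred_.dptr));
    cuda_check (cudaHostUnregister (label_.dptr));
}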
/// The force exchange is considerably simpler than the atom exchange.
/// In the force case we only need to exchange data that is needed to
/// complete the force calculation.  Since the atoms have not moved we
/// only need to send data from local link cells and we are guaranteed
/// that the same atoms exist in the same order in corresponding halo
/// cells on remote tasks.  The only tricky part is that the size of the
/// plane of local cells that needs to be sent grows in each direction.
/// This is because the y-axis send must send some of the data that was
/// received from the x-axis send, and the z-axis send must send some data
/// from the y-axis send.  This accumulation of data to send is
/// responsible for data reaching neighbor cells that share only edges
/// or corners.
///
/// \see eam.c for an explanation of the requirement to exchange
/// force data.
HaloExchange* initForceHaloExchange(Domain* domain, LinkCell* boxes, int useGPU)
{
   HaloExchange* hh = initHaloExchange(domain);

   if (useGPU) {
      hh->loadBuffer = loadForceBuffer;
      hh->unloadBuffer = unloadForceBuffer;
   }
   else {
      hh->loadBuffer = loadForceBufferCpu;
      hh->unloadBuffer = unloadForceBufferCpu;
   }
   hh->destroy = destroyForceExchange;

   int size0 = (boxes->gridSize[1])*(boxes->gridSize[2]);
   int size1 = (boxes->gridSize[0]+2)*(boxes->gridSize[2]);
   int size2 = (boxes->gridSize[0]+2)*(boxes->gridSize[1]+2);
   int maxSize = MAX(size0, size1);
   maxSize = MAX(maxSize, size2); // was MAX(size1, size2), which ignored size0
   hh->bufCapacity = maxSize*MAXATOMS*sizeof(ForceMsg);

   hh->sendBufM = (char*)comdMalloc(hh->bufCapacity);
   hh->sendBufP = (char*)comdMalloc(hh->bufCapacity);
   hh->recvBufP = (char*)comdMalloc(hh->bufCapacity);
   hh->recvBufM = (char*)comdMalloc(hh->bufCapacity);

   // pin memory so MPI staging copies to/from the GPU can overlap
   cudaHostRegister(hh->sendBufM, hh->bufCapacity, 0);
   cudaHostRegister(hh->sendBufP, hh->bufCapacity, 0);
   cudaHostRegister(hh->recvBufP, hh->bufCapacity, 0);
   cudaHostRegister(hh->recvBufM, hh->bufCapacity, 0);

   ForceExchangeParms* parms = (ForceExchangeParms*)comdMalloc(sizeof(ForceExchangeParms));

   parms->nCells[HALO_X_MINUS] = (boxes->gridSize[1]  )*(boxes->gridSize[2]  );
   parms->nCells[HALO_Y_MINUS] = (boxes->gridSize[0]+2)*(boxes->gridSize[2]  );
   parms->nCells[HALO_Z_MINUS] = (boxes->gridSize[0]+2)*(boxes->gridSize[1]+2);
   parms->nCells[HALO_X_PLUS]  = parms->nCells[HALO_X_MINUS];
   parms->nCells[HALO_Y_PLUS]  = parms->nCells[HALO_Y_MINUS];
   parms->nCells[HALO_Z_PLUS]  = parms->nCells[HALO_Z_MINUS];

   for (int ii=0; ii<6; ++ii)
   {
      parms->sendCells[ii] = mkForceSendCellList(boxes, ii, parms->nCells[ii]);
      parms->recvCells[ii] = mkForceRecvCellList(boxes, ii, parms->nCells[ii]);

      // copy cell list to gpu
      cudaMalloc((void**)&parms->sendCellsGpu[ii], parms->nCells[ii] * sizeof(int));
      cudaMalloc((void**)&parms->recvCellsGpu[ii], parms->nCells[ii] * sizeof(int));
      cudaMemcpy(parms->sendCellsGpu[ii], parms->sendCells[ii], parms->nCells[ii] * sizeof(int), cudaMemcpyHostToDevice);
      cudaMemcpy(parms->recvCellsGpu[ii], parms->recvCells[ii], parms->nCells[ii] * sizeof(int), cudaMemcpyHostToDevice);

      // allocate temp buf for per-cell atom counts, rounded up to a
      // multiple of 256 for the scan kernels
      int size = parms->nCells[ii]+1;
      if (size % 256 != 0) size = ((size + 255)/256)*256;
      cudaMalloc((void**)&parms->natoms_buf[ii], size * sizeof(int));
      cudaMalloc((void**)&parms->partial_sums[ii], (size/256 + 1) * sizeof(int));
   }

   hh->hashTable = NULL;
   hh->type = 1;
   hh->parms = parms;
   return hh;
}
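// destroyForceExchange is referenced above but not shown. A minimal sketch of
// the cleanup it would need, assuming the ForceExchangeParms fields used in
// initForceHaloExchange and that comdFree is the matching deallocator for
// comdMalloc (this is a sketch, not CoMD's actual implementation):
static void destroyForceExchangeSketch(HaloExchange* hh)
{
   ForceExchangeParms* parms = (ForceExchangeParms*)hh->parms;
   for (int ii=0; ii<6; ++ii)
   {
      cudaFree(parms->sendCellsGpu[ii]);
      cudaFree(parms->recvCellsGpu[ii]);
      cudaFree(parms->natoms_buf[ii]);
      cudaFree(parms->partial_sums[ii]);
      comdFree(parms->sendCells[ii]);
      comdFree(parms->recvCells[ii]);
   }
   // unpin before freeing: memory pinned with cudaHostRegister must be
   // cudaHostUnregister'd before it is returned to the allocator
   cudaHostUnregister(hh->sendBufM);
   cudaHostUnregister(hh->sendBufP);
   cudaHostUnregister(hh->recvBufP);
   cudaHostUnregister(hh->recvBufM);
   comdFree(hh->sendBufM);
   comdFree(hh->sendBufP);
   comdFree(hh->recvBufP);
   comdFree(hh->recvBufM);
   comdFree(parms);
}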
/// \details
/// When called in proper sequence by redistributeAtoms, the atom halo
/// exchange helps serve three purposes:
/// - Send ghost atom data to neighbor tasks.
/// - Shift atom coordinates by the global simulation size when they cross
///   periodic boundaries.  This shift is performed in loadAtomsBuffer.
/// - Transfer ownership of atoms between tasks as the atoms move across
///   spatial domain boundaries.  This transfer of ownership occurs in
///   two places.  The former owner gives up ownership when
///   updateLinkCells moves a formerly local atom into a halo link cell.
///   The new owner accepts ownership when unloadAtomsBuffer calls
///   putAtomInBox to place a received atom into a local link cell.
///
/// This constructor does the following:
///
/// - Sets the bufCapacity to hold the largest possible number of atoms
///   that can be sent across a face.
/// - Initializes function pointers to the atom-specific versions.
/// - Sets the number of link cells to send across each face.
/// - Builds the list of link cells to send across each face.  As
///   explained in the comments for mkAtomCellList, this list must
///   include any link cell, local or halo, that could possibly contain
///   an atom that needs to be sent across the face.  Atoms that need to
///   be sent include "ghost atoms" that are located in local link
///   cells that correspond to halo link cells on receiving tasks as well as
///   formerly local atoms that have just moved into halo link cells and
///   need to be sent to the rank that owns the spatial domain the atom
///   has moved into.
/// - Sets a coordinate shift factor for each face to account for
///   periodic boundary conditions.  For most faces the factor is zero.
///   For faces on the +x, +y, or +z face of the simulation domain
///   the factor is -1.0 (to shift the coordinates by -1 times the
///   simulation domain size).  For -x, -y, and -z faces of the
///   simulation domain, the factor is +1.0.
///
/// \see redistributeAtoms
HaloExchange* initAtomHaloExchange(Domain* domain, LinkCell* boxes)
{
   HaloExchange* hh = initHaloExchange(domain);

   int size0 = (boxes->gridSize[1]+2)*(boxes->gridSize[2]+2);
   int size1 = (boxes->gridSize[0]+2)*(boxes->gridSize[2]+2);
   int size2 = (boxes->gridSize[0]+2)*(boxes->gridSize[1]+2);
   int maxSize = MAX(size0, size1);
   maxSize = MAX(maxSize, size2); // was MAX(size1, size2), which ignored size0
   hh->bufCapacity = maxSize*2*MAXATOMS*sizeof(AtomMsg);

   hh->sendBufM = (char*)comdMalloc(hh->bufCapacity);
   hh->sendBufP = (char*)comdMalloc(hh->bufCapacity);
   hh->recvBufP = (char*)comdMalloc(hh->bufCapacity);
   hh->recvBufM = (char*)comdMalloc(hh->bufCapacity);

   // pin memory so MPI staging copies to/from the GPU can overlap
   cudaHostRegister(hh->sendBufM, hh->bufCapacity, 0);
   cudaHostRegister(hh->sendBufP, hh->bufCapacity, 0);
   cudaHostRegister(hh->recvBufP, hh->bufCapacity, 0);
   cudaHostRegister(hh->recvBufM, hh->bufCapacity, 0);

   hh->loadBuffer = loadAtomsBuffer;
   hh->unloadBuffer = unloadAtomsBuffer;
   hh->destroy = destroyAtomsExchange;

   hh->hashTable = initHashTable((boxes->nTotalBoxes - boxes->nLocalBoxes) * MAXATOMS * 2);

   AtomExchangeParms* parms = (AtomExchangeParms*)comdMalloc(sizeof(AtomExchangeParms));

   parms->nCells[HALO_X_MINUS] = 2*(boxes->gridSize[1]+2)*(boxes->gridSize[2]+2);
   parms->nCells[HALO_Y_MINUS] = 2*(boxes->gridSize[0]+2)*(boxes->gridSize[2]+2);
   parms->nCells[HALO_Z_MINUS] = 2*(boxes->gridSize[0]+2)*(boxes->gridSize[1]+2);
   parms->nCells[HALO_X_PLUS]  = parms->nCells[HALO_X_MINUS];
   parms->nCells[HALO_Y_PLUS]  = parms->nCells[HALO_Y_MINUS];
   parms->nCells[HALO_Z_PLUS]  = parms->nCells[HALO_Z_MINUS];

   for (int ii=0; ii<6; ++ii)
   {
      parms->cellList[ii] = mkAtomCellList(boxes, (enum HaloFaceOrder)ii, parms->nCells[ii]);

      // copy cell list to gpu
      cudaMalloc((void**)&parms->cellListGpu[ii], parms->nCells[ii] * sizeof(int));
      cudaMemcpy(parms->cellListGpu[ii], parms->cellList[ii], parms->nCells[ii] * sizeof(int), cudaMemcpyHostToDevice);
   }

   // allocate scan buffers, rounded up to a multiple of 256 for the scan kernels
   int size = boxes->nLocalBoxes+1;
   if (size % 256 != 0) size = ((size + 255)/256)*256;
   int partial_size = size/256 + 1;
   if (partial_size % 256 != 0) partial_size = ((partial_size + 255)/256)*256;
   cudaMalloc((void**)&parms->d_natoms_buf, size * sizeof(int));
   parms->h_natoms_buf = (int*)malloc(size * sizeof(int));
   cudaMalloc((void**)&parms->d_partial_sums, partial_size * sizeof(int));

   for (int ii=0; ii<6; ++ii)
   {
      parms->pbcFactor[ii] = (real_t*)comdMalloc(3*sizeof(real_t));
      for (int jj=0; jj<3; ++jj)
         parms->pbcFactor[ii][jj] = 0.0;
   }
   int* procCoord = domain->procCoord; // alias
   int* procGrid  = domain->procGrid;  // alias
   if (procCoord[HALO_X_AXIS] == 0)
      parms->pbcFactor[HALO_X_MINUS][HALO_X_AXIS] = +1.0;
   if (procCoord[HALO_X_AXIS] == procGrid[HALO_X_AXIS]-1)
      parms->pbcFactor[HALO_X_PLUS][HALO_X_AXIS]  = -1.0;
   if (procCoord[HALO_Y_AXIS] == 0)
      parms->pbcFactor[HALO_Y_MINUS][HALO_Y_AXIS] = +1.0;
   if (procCoord[HALO_Y_AXIS] == procGrid[HALO_Y_AXIS]-1)
      parms->pbcFactor[HALO_Y_PLUS][HALO_Y_AXIS]  = -1.0;
   if (procCoord[HALO_Z_AXIS] == 0)
      parms->pbcFactor[HALO_Z_MINUS][HALO_Z_AXIS] = +1.0;
   if (procCoord[HALO_Z_AXIS] == procGrid[HALO_Z_AXIS]-1)
      parms->pbcFactor[HALO_Z_PLUS][HALO_Z_AXIS]  = -1.0;

   hh->type = 0;
   hh->parms = parms;
   return hh;
}
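// None of the CUDA calls in the two initializers above check their return
// status. A minimal sketch of a checking wrapper that could be applied to
// them (the CUDA_CHECK name is hypothetical; CoMD itself does not define it):
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define CUDA_CHECK(call)                                             \
   do {                                                              \
      cudaError_t err_ = (call);                                     \
      if (err_ != cudaSuccess) {                                     \
         fprintf(stderr, "CUDA error %s at %s:%d\n",                 \
                 cudaGetErrorString(err_), __FILE__, __LINE__);      \
         exit(EXIT_FAILURE);                                         \
      }                                                              \
   } while (0)

// e.g. CUDA_CHECK(cudaHostRegister(hh->sendBufM, hh->bufCapacity, 0));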
void cblas_sgemm(const enum CBLAS_ORDER Order,
                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
                 const int M, const int N, const int K,
                 const float alpha, const float *A, const int lda,
                 const float *B, const int ldb,
                 const float beta, float *C, const int ldc)
{
    cublasOperation_t transa, transb;
    cublasStatus_t status;

    /*---error handler---*/
    int nrowa, ncola, nrowb, ncolb;
    if (TransA == CblasNoTrans) { nrowa = M; ncola = K; }
    else                        { nrowa = K; ncola = M; }
    if (TransB == CblasNoTrans) { nrowb = K; ncolb = N; }
    else                        { nrowb = N; ncolb = K; }
    int nrowc = M;
    int ncolc = N;

    int info = 0;
    if      (CBLasTransToCuBlasTrans(TransA, &transa) < 0) info = 1;
    else if (CBLasTransToCuBlasTrans(TransB, &transb) < 0) info = 2;
    else if (M < 0)               info = 3;
    else if (N < 0)               info = 4;
    else if (K < 0)               info = 5;
    else if (lda < MAX(1, nrowa)) info = 8;
    else if (ldb < MAX(1, nrowb)) info = 10;
    else if (ldc < MAX(1, M))     info = 13;
    if (info != 0) {
        xerbla_(ERROR_NAME, &info);
        return;
    }
    /*-------------------*/

    /*----dispatcher-----*/
    int type = 0;  // 1: cpu, 2: cublasxt, 3: blasx
    if (M <= 0 || N <= 0 || K <= 0)                      type = 1;
    if (type == 0 && (M > 1000 || N > 1000 || K > 1000)) type = 3;
    else                                                 type = 1;
    // note: with this logic type is only ever 1 or 3, so the
    // cublasXt path (case 2) is currently unreachable
    /*-------------------*/

    switch (type) {
        case 1:
        CPU_BLAS:
            Blasx_Debug_Output("calling cblas_sgemm:");
            if (cpublas_handle == NULL) blasx_init(CPU);
            if (cblas_sgemm_p == NULL) blasx_init_cblas_func(&cblas_sgemm_p, "cblas_sgemm");
            (*cblas_sgemm_p)(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
            break;
        case 2:
            if (cublasXt_handle == NULL) blasx_init(CUBLASXT);
            Blasx_Debug_Output("calling cublasSgemmXt:");
            status = cublasXtSgemm(cublasXt_handle,
                                   transa, transb,
                                   M, N, K,
                                   (float*)&alpha, (float*)A, lda,
                                   (float*)B, ldb,
                                   (float*)&beta,  (float*)C, ldc);
            if (status != CUBLAS_STATUS_SUCCESS) goto CPU_BLAS;
            break;
        case 3:
            Blasx_Debug_Output("calling BLASX:\n");
            /* pin the caller's matrices for the duration of the call;
               cudaHostRegister takes a non-const pointer, hence the casts */
            cudaHostRegister((void*)A, sizeof(float)*nrowa*ncola, cudaHostRegisterPortable);
            cudaHostRegister((void*)B, sizeof(float)*nrowb*ncolb, cudaHostRegisterPortable);
            cudaHostRegister((void*)C, sizeof(float)*nrowc*ncolc, cudaHostRegisterPortable);
#ifdef BENCHMARK
            double Gflops = FLOPS_DGEMM(M, N, K)/(1000000000);
            double gpu_start, gpu_end;
            gpu_start = get_cur_time();
#endif
            if (is_blasx_enable == 0) blasx_init(BLASX);
            assert( is_blasx_enable == 1 );
            assert( SYS_GPUS > 0 );
            assert( event_SGEMM[0]   != NULL );
            assert( C_dev_SGEMM[0]   != NULL );
            assert( handles_SGEMM[0] != NULL );
            assert( streams_SGEMM[0] != NULL );
            LRU_t* LRUs[10];  /* assumes SYS_GPUS <= 10 */
            int GPU_id = 0;
            for (GPU_id = 0; GPU_id < SYS_GPUS; GPU_id++)
                LRUs[GPU_id] = LRU_init( GPU_id );
            blasx_sgemm(SYS_GPUS, handles_SGEMM, LRUs,
                        TransA, TransB, M, N, K,
                        alpha, A, lda, B, ldb, beta, C, ldc);
            for (GPU_id = 0; GPU_id < SYS_GPUS; GPU_id++)
                LRU_free( LRUs[GPU_id], GPU_id );
#ifdef BENCHMARK
            gpu_end = get_cur_time();
            printf("BLASX (M:%5d,N:%5d,K:%5d) Speed:%9.1f type:%2d\n",
                   M, N, K, (double)Gflops/(gpu_end - gpu_start), type);
#endif
            cudaHostUnregister((void*)A);
            cudaHostUnregister((void*)B);
            cudaHostUnregister((void*)C);
            break;
        default:
            break;
    }
    //Blasx_Debug_Output("eventually use type:%d to compute\n",type);
}
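// Registration cost note: cudaHostRegister and cudaHostUnregister pin and
// unpin every page of the buffer, which is why the dispatcher above only
// takes the pinning path for large matrices. When the same host buffer feeds
// many transfers, it pays to register once and reuse it; a minimal
// self-contained sketch (the function and its parameters are illustrative):
#include <cuda_runtime.h>

void repeated_transfers(float *hostBuf, float *devBuf, size_t bytes,
                        int iterations, cudaStream_t stream)
{
    cudaHostRegister(hostBuf, bytes, cudaHostRegisterPortable);  /* pin once */
    for (int i = 0; i < iterations; ++i) {
        /* pinned memory lets this copy run truly asynchronously */
        cudaMemcpyAsync(devBuf, hostBuf, bytes, cudaMemcpyHostToDevice, stream);
        cudaStreamSynchronize(stream);
    }
    cudaHostUnregister(hostBuf);  /* unpin once at the end */
}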
int dfft_cuda_create_plan(dfft_plan *p,
    int ndim, int *gdim,
    int *inembed, int *oembed,
    int *pdim, int *pidx, int row_m,
    int input_cyclic, int output_cyclic,
    MPI_Comm comm,
    int *proc_map)
{
    int res = dfft_create_plan_common(p, ndim, gdim, inembed, oembed,
        pdim, pidx, row_m, input_cyclic, output_cyclic, comm, proc_map, 1);

#ifndef ENABLE_MPI_CUDA
    /* allocate staging bufs */
    /* we need to use posix_memalign/cudaHostRegister instead
     * of cudaHostAlloc, because cudaHostAlloc doesn't have hooks
     * in the MPI library, and using it would lead to data corruption
     */
    int size = p->scratch_size*sizeof(cuda_cpx_t);
    int page_size = getpagesize();
    size = ((size + page_size - 1) / page_size) * page_size;
    posix_memalign((void **)&(p->h_stage_in),  page_size, size);
    posix_memalign((void **)&(p->h_stage_out), page_size, size);
    /* note: the flag argument takes cudaHostRegister* constants; the
     * original passed cudaHostAllocDefault, which only works because it
     * shares the value 0 with cudaHostRegisterDefault */
    cudaHostRegister(p->h_stage_in, size, cudaHostRegisterDefault);
    CHECK_CUDA();
    cudaHostRegister(p->h_stage_out, size, cudaHostRegisterDefault);
    CHECK_CUDA();
#endif

    /* allocate memory for passing variables */
    cudaMalloc((void **)&(p->d_pidx),   sizeof(int)*ndim);
    CHECK_CUDA();
    cudaMalloc((void **)&(p->d_pdim),   sizeof(int)*ndim);
    CHECK_CUDA();
    cudaMalloc((void **)&(p->d_iembed), sizeof(int)*ndim);
    CHECK_CUDA();
    cudaMalloc((void **)&(p->d_oembed), sizeof(int)*ndim);
    CHECK_CUDA();
    cudaMalloc((void **)&(p->d_length), sizeof(int)*ndim);
    CHECK_CUDA();

    /* initialize cuda buffers */
    int *h_length = (int *)malloc(sizeof(int)*ndim);
    int i;
    for (i = 0; i < ndim; ++i)
        h_length[i] = gdim[i]/pdim[i];
    cudaMemcpy(p->d_pidx, pidx, sizeof(int)*ndim, cudaMemcpyDefault);
    CHECK_CUDA();
    cudaMemcpy(p->d_pdim, pdim, sizeof(int)*ndim, cudaMemcpyDefault);
    CHECK_CUDA();
    cudaMemcpy(p->d_iembed, p->inembed, sizeof(int)*ndim, cudaMemcpyDefault);
    CHECK_CUDA();
    cudaMemcpy(p->d_oembed, p->oembed, sizeof(int)*ndim, cudaMemcpyDefault);
    CHECK_CUDA();
    cudaMemcpy(p->d_length, h_length, sizeof(int)*ndim, cudaMemcpyDefault);
    CHECK_CUDA();
    free(h_length);

    int dmax = p->max_depth + 2;
    p->d_rev_j1      = (int **) malloc(sizeof(int *)*dmax);
    p->d_rev_global  = (int **) malloc(sizeof(int *)*dmax);
    p->d_rev_partial = (int **) malloc(sizeof(int *)*dmax);
    p->d_c0          = (int **) malloc(sizeof(int *)*dmax);
    p->d_c1          = (int **) malloc(sizeof(int *)*dmax);

    if (p->max_depth) {
        p->h_alpha = (cuda_scalar_t **) malloc(sizeof(cuda_scalar_t *)*p->max_depth);
        p->d_alpha = (cuda_scalar_t **) malloc(sizeof(cuda_scalar_t *)*p->max_depth);
    }

    int d;
    for (d = 0; d < dmax; ++d) {
        cudaMalloc((void **)&(p->d_rev_j1[d]),      sizeof(int)*ndim);
        CHECK_CUDA();
        cudaMalloc((void **)&(p->d_rev_partial[d]), sizeof(int)*ndim);
        CHECK_CUDA();
        cudaMalloc((void **)&(p->d_rev_global[d]),  sizeof(int)*ndim);
        CHECK_CUDA();
        cudaMalloc((void **)&(p->d_c0[d]),          sizeof(int)*ndim);
        CHECK_CUDA();
        cudaMalloc((void **)&(p->d_c1[d]),          sizeof(int)*ndim);
        CHECK_CUDA();
    }
    for (d = 0; d < p->max_depth; ++d) {
        cudaMalloc((void **)&(p->d_alpha[d]), sizeof(cuda_scalar_t)*ndim);
        CHECK_CUDA();
        p->h_alpha[d] = (cuda_scalar_t *) malloc(sizeof(cuda_scalar_t)*ndim);
    }

    /* perform initialization run */
    dfft_cuda_execute(NULL, NULL, 0, p);

    /* initialization finished */
    p->init = 0;

    return res;
}
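// The matching destroy path is not shown here. A sketch of the teardown this
// plan would require, using the field names above (this mirrors, but is not
// claimed to be, dfft's actual dfft_cuda_destroy_plan):
void dfft_cuda_destroy_plan_sketch(dfft_plan *p)
{
#ifndef ENABLE_MPI_CUDA
    cudaHostUnregister(p->h_stage_in);   /* unpin before free */
    cudaHostUnregister(p->h_stage_out);
    free(p->h_stage_in);
    free(p->h_stage_out);
#endif
    cudaFree(p->d_pidx);
    cudaFree(p->d_pdim);
    cudaFree(p->d_iembed);
    cudaFree(p->d_oembed);
    cudaFree(p->d_length);

    int dmax = p->max_depth + 2;
    int d;
    for (d = 0; d < dmax; ++d) {
        cudaFree(p->d_rev_j1[d]);
        cudaFree(p->d_rev_partial[d]);
        cudaFree(p->d_rev_global[d]);
        cudaFree(p->d_c0[d]);
        cudaFree(p->d_c1[d]);
    }
    for (d = 0; d < p->max_depth; ++d) {
        cudaFree(p->d_alpha[d]);
        free(p->h_alpha[d]);
    }
    free(p->d_rev_j1);
    free(p->d_rev_global);
    free(p->d_rev_partial);
    free(p->d_c0);
    free(p->d_c1);
    if (p->max_depth) {
        free(p->h_alpha);
        free(p->d_alpha);
    }
}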
void cg_solve(OperatorType& A,
              const VectorType& b,
              VectorType& x,
              Matvec matvec,
              typename OperatorType::LocalOrdinalType max_iter,
              typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& tolerance,
              typename OperatorType::LocalOrdinalType& num_iters,
              typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& normr,
              timer_type* my_cg_times)
{
  typedef typename OperatorType::ScalarType ScalarType;
  typedef typename OperatorType::GlobalOrdinalType GlobalOrdinalType;
  typedef typename OperatorType::LocalOrdinalType LocalOrdinalType;
  typedef typename TypeTraits<ScalarType>::magnitude_type magnitude_type;

  timer_type t0 = 0, tWAXPY = 0, tDOT = 0, tMATVEC = 0, tMATVECDOT = 0;
  timer_type total_time = mytimer();

  int myproc = 0;
#ifdef HAVE_MPI
  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
#endif

  if (!A.has_local_indices) {
    std::cerr << "miniFE::cg_solve ERROR, A.has_local_indices is false, needs to be true. This probably means "
              << "miniFE::make_local_matrix(A) was not called prior to calling miniFE::cg_solve."
              << std::endl;
    return;
  }

  size_t nrows = A.rows.size();
  LocalOrdinalType ncols = A.num_cols;

  nvtxRangeId_t r1 = nvtxRangeStartA("Allocation of Temporary Vectors");
  VectorType r(b.startIndex, nrows);
  VectorType p(0, ncols);
  VectorType Ap(b.startIndex, nrows);
  nvtxRangeEnd(r1);

#ifdef HAVE_MPI
#ifndef GPUDIRECT
  // without GPU-Direct MPI, pin the exchange buffers so device<->host
  // staging copies can be asynchronous
  //TODO move outside?
  cudaHostRegister(&p.coefs[0], ncols*sizeof(typename VectorType::ScalarType), 0);
  cudaCheckError();
  if (A.send_buffer.size() > 0)
    cudaHostRegister(&A.send_buffer[0], A.send_buffer.size()*sizeof(typename VectorType::ScalarType), 0);
  cudaCheckError();
#endif
#endif

  normr = 0;
  magnitude_type rtrans = 0;
  magnitude_type oldrtrans = 0;

  LocalOrdinalType print_freq = max_iter/10;
  if (print_freq > 50) print_freq = 50;
  if (print_freq < 1)  print_freq = 1;

  ScalarType one = 1.0;
  ScalarType zero = 0.0;

  TICK(); waxpby(one, x, zero, x, p); TOCK(tWAXPY);
  TICK(); matvec(A, p, Ap); TOCK(tMATVEC);
  TICK(); waxpby(one, b, -one, Ap, r); TOCK(tWAXPY);
  TICK(); rtrans = dot(r, r); TOCK(tDOT);

  normr = std::sqrt(rtrans);

  if (myproc == 0) {
    std::cout << "Initial Residual = " << normr << std::endl;
  }

  magnitude_type brkdown_tol = std::numeric_limits<magnitude_type>::epsilon();

#ifdef MINIFE_DEBUG
  std::ostream& os = outstream();
  os << "brkdown_tol = " << brkdown_tol << std::endl;
#endif

  for (LocalOrdinalType k = 1; k <= max_iter && normr > tolerance; ++k) {
    if (k == 1) {
      TICK(); waxpby(one, r, zero, r, p); TOCK(tWAXPY);
    }
    else {
      oldrtrans = rtrans;
      TICK(); rtrans = dot(r, r); TOCK(tDOT);
      magnitude_type beta = rtrans/oldrtrans;
      TICK(); waxpby(one, r, beta, p, p); TOCK(tWAXPY);
    }

    normr = std::sqrt(rtrans);

    if (myproc == 0 && (k%print_freq == 0 || k == max_iter)) {
      std::cout << "Iteration = " << k << "   Residual = " << normr << std::endl;
    }

    magnitude_type alpha = 0;
    magnitude_type p_ap_dot = 0;

    TICK(); matvec(A, p, Ap); TOCK(tMATVEC);

    TICK(); p_ap_dot = dot(Ap, p); TOCK(tDOT);

#ifdef MINIFE_DEBUG
    os << "iter " << k << ", p_ap_dot = " << p_ap_dot;
    os.flush();
#endif
    //TODO remove false below
    if (false && p_ap_dot < brkdown_tol) {
      if (p_ap_dot < 0 || breakdown(p_ap_dot, Ap, p)) {
        std::cerr << "miniFE::cg_solve ERROR, numerical breakdown!" << std::endl;
#ifdef MINIFE_DEBUG
        os << "ERROR, numerical breakdown!" << std::endl;
#endif
        // update the timers before jumping out.
        my_cg_times[WAXPY] = tWAXPY;
        my_cg_times[DOT] = tDOT;
        my_cg_times[MATVEC] = tMATVEC;
        my_cg_times[TOTAL] = mytimer() - total_time;
        return;
      }
      else brkdown_tol = 0.1 * p_ap_dot;
    }
    alpha = rtrans/p_ap_dot;
#ifdef MINIFE_DEBUG
    os << ", rtrans = " << rtrans << ", alpha = " << alpha << std::endl;
#endif

    TICK();
    waxpby(one, x, alpha, p, x);
    waxpby(one, r, -alpha, Ap, r);
    TOCK(tWAXPY);

    num_iters = k;
  }

#ifdef HAVE_MPI
#ifndef GPUDIRECT
  //TODO move outside?
  cudaHostUnregister(&p.coefs[0]);
  cudaCheckError();
  if (A.send_buffer.size() > 0)
    cudaHostUnregister(&A.send_buffer[0]);
  cudaCheckError();
#endif
#endif

  my_cg_times[WAXPY] = tWAXPY;
  my_cg_times[DOT] = tDOT;
  my_cg_times[MATVEC] = tMATVEC;
  my_cg_times[MATVECDOT] = tMATVECDOT;
  my_cg_times[TOTAL] = mytimer() - total_time;
}
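// TICK/TOCK above are miniFE timing macros built on the local t0 and
// mytimer(). A minimal sketch of what they plausibly expand to (illustrative;
// check miniFE's headers for the real definitions):
#define TICK()  t0 = mytimer()            // start a timing interval
#define TOCK(t) (t) += mytimer() - t0     // accumulate the elapsed time into t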
HostReflectionHost::BootUp::BootUp(const std::string& module)
: _module(module)
{
    report("Booting up host reflection...");

    // add message handlers
    _addMessageHandlers();

    // allocate memory for the queues
    size_t queueDataSize = maxMessageSize() * 2;
    size_t size = 2 * (queueDataSize + sizeof(QueueMetaData));

    _deviceHostSharedMemory = new char[size];

    // setup the queue meta data
    QueueMetaData* hostToDeviceMetaData =
        (QueueMetaData*)_deviceHostSharedMemory;
    QueueMetaData* deviceToHostMetaData =
        (QueueMetaData*)_deviceHostSharedMemory + 1;

    char* hostToDeviceData = _deviceHostSharedMemory + 2 * sizeof(QueueMetaData);
    char* deviceToHostData = _deviceHostSharedMemory + 2 * sizeof(QueueMetaData) + queueDataSize;

    hostToDeviceMetaData->hostBegin = hostToDeviceData;
    hostToDeviceMetaData->size      = queueDataSize;
    hostToDeviceMetaData->head      = 0;
    hostToDeviceMetaData->tail      = 0;
    hostToDeviceMetaData->mutex     = (size_t)-1;

    deviceToHostMetaData->hostBegin = deviceToHostData;
    deviceToHostMetaData->size      = queueDataSize;
    deviceToHostMetaData->head      = 0;
    deviceToHostMetaData->tail      = 0;
    deviceToHostMetaData->mutex     = (size_t)-1;

    // allocate the queues
    _hostToDeviceQueue = new HostQueue(hostToDeviceMetaData);
    _deviceToHostQueue = new HostQueue(deviceToHostMetaData);

    // map the memory onto the device: pin the block, then obtain the
    // device-side alias of the same allocation (note: on devices without
    // unified addressing this would require the cudaHostRegisterMapped flag)
    cudaHostRegister(_deviceHostSharedMemory, size, 0);

    char* devicePointer = 0;
    cudaHostGetDevicePointer((void**)&devicePointer, _deviceHostSharedMemory, 0);

    // send the metadata to the device
    QueueMetaData* hostToDeviceMetaDataPointer = (QueueMetaData*)devicePointer;
    QueueMetaData* deviceToHostMetaDataPointer = (QueueMetaData*)devicePointer + 1;

    hostToDeviceMetaData->deviceBegin = devicePointer + 2 * sizeof(QueueMetaData);
    deviceToHostMetaData->deviceBegin = devicePointer + 2 * sizeof(QueueMetaData) + queueDataSize;

    // launch the bootstrap kernel through Ocelot
    // (cudaConfigureCall/cudaSetupArgument are the pre-CUDA 7 launch API)
    cudaConfigureCall(dim3(1, 1, 1), dim3(1, 1, 1), 0, 0);
    cudaSetupArgument(&hostToDeviceMetaDataPointer, 8, 0);
    cudaSetupArgument(&deviceToHostMetaDataPointer, 8, 8);
    ocelot::launch(_module, "_bootupHostReflection");

    // start up the host worker thread
    _kill   = false;
    _thread = new boost::thread(_runThread, this);
}
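// cudaConfigureCall/cudaSetupArgument were deprecated in CUDA 7 and later
// removed, so on a current toolkit the bootstrap launch above would go through
// cudaLaunchKernel (or the <<<...>>> syntax). A minimal sketch, assuming a
// __global__ void _bootupHostReflection(QueueMetaData*, QueueMetaData*) is
// linked into the binary instead of being launched by name through Ocelot:
void* args[] = { &hostToDeviceMetaDataPointer, &deviceToHostMetaDataPointer };
cudaLaunchKernel((const void*)_bootupHostReflection,
                 dim3(1, 1, 1), dim3(1, 1, 1),
                 args, 0 /*sharedMem*/, 0 /*stream*/);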