int main(int argc, char **argv)
{
    int i;
    if ((argc >= 2) && (atoi(argv[1]) != RANK)) error("rank %d mandatory", RANK);
    printf("CUDA RANK=%d\n", RANK);
    kernel.print();

    // build busylist
    busylist = (uint32_t*)malloc_file(CNK*sizeof(uint32_t), FMODE_RO, BLIST_FORMAT, RANK);

    // put busylist
    SafeCall(cuMemHostRegister(busylist, CNK*sizeof*busylist, CU_MEMHOSTREGISTER_DEVICEMAP));
    SafeCall(cuMemHostGetDevicePointer(&host_busylist, busylist, 0));
    SafeCall(cuModuleGetGlobal(&dev_busylist, &bytes, kernel.module[0].module, "busylist"));
    if (bytes != sizeof(host_busylist)) error("busylist!");
    SafeCall(cuMemcpyHtoD(dev_busylist, &host_busylist, bytes));

    // put array
#ifdef IN_mk_data
    mkdir(DATADIR, 0755);
    errno = 0;
    array = (unsigned char *)malloc_file(abytes(RANK,CNK), 1, DATADIR"%d", RANK);
#else
    array = (unsigned char *)malloc_file(abytes(RANK,CNK), 0, DATADIR"%d", RANK);
#endif
    SafeCall(cuMemHostRegister(array, abytes(RANK,CNK), CU_MEMHOSTREGISTER_DEVICEMAP));
    SafeCall(cuMemHostGetDevicePointer(&host_array, array, 0));
    SafeCall(cuModuleGetGlobal(&dev_array, &bytes, kernel.module[0].module, "array"));
    if (bytes != sizeof(host_array)) error("array!");
    SafeCall(cuMemcpyHtoD(dev_array, &host_array, bytes));

#define THREADS 512
#define MAXG 65535
    uint64_t nado  = (cnk[RANK] + (THREADS-1)) / THREADS;
    uint32_t gridx = nado > MAXG ? MAXG : nado;
    uint32_t gridy = (nado + (MAXG-1)) / MAXG;
    printf("gridy=%d gridx=%d THREAD=%d\n", gridy, gridx, THREADS);

    kernel.launch(params, THREADS, gridx, gridy);
    kernel.wait();

    SafeCall(cuMemHostUnregister(busylist));
    SafeCall(cuMemHostUnregister(array));

    SafeCall(cuModuleGetGlobal(&dev_changed, &bytes, kernel.module[0].module, "changed"));
    if (bytes != sizeof(changed)) error("changed!");
    SafeCall(cuMemcpyDtoH(changed, dev_changed, bytes));
    for (i = 0; i < CACHESIZE; i++) total += changed[i];
    printf("changed=%ju\n", total);
    return 0;
}
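The SafeCall wrapper used above is not defined in this listing. A minimal sketch of such a driver-API error-checking macro, assuming cuGetErrorName is available and that aborting on failure is acceptable (the name SafeCall and its exact behaviour here are assumptions, not the original definition):

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

/* Hypothetical SafeCall: evaluate a driver-API call, print a readable
 * error for anything other than CUDA_SUCCESS, and abort. */
#define SafeCall(call)                                                  \
    do {                                                                \
        CUresult _err = (call);                                         \
        if (_err != CUDA_SUCCESS) {                                     \
            const char *_name = NULL;                                   \
            cuGetErrorName(_err, &_name);                               \
            fprintf(stderr, "%s:%d: %s failed: %s\n",                   \
                    __FILE__, __LINE__, #call,                          \
                    _name ? _name : "unknown CUDA error");              \
            exit(EXIT_FAILURE);                                         \
        }                                                               \
    } while (0)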
SEXP R_auto_cuMemHostRegister(SEXP r_p, SEXP r_bytesize, SEXP r_Flags)
{
    SEXP r_ans = R_NilValue;
    void *p = GET_REF(r_p, void);
    size_t bytesize = REAL(r_bytesize)[0];
    unsigned int Flags = REAL(r_Flags)[0];
    CUresult ans;

    ans = cuMemHostRegister(p, bytesize, Flags);
    r_ans = Renum_convert_CUresult(ans);
    return(r_ans);
}
int * transpose(int *matrix, int height, int width)
{
    if (cuda.get() == NULL) {
        cuda.reset(new Cuda());
    }
    int *result = new int[height*width];
    if (width >= BLOCK_DIM_X && height >= BLOCK_DIM_X) {
        // gpu_transpose_naive(result, matrix, height, width);
        cuMemHostRegister(result, sizeof(int)*height*width, 0); // TODO check result value
        gpu_transpose_with_shared_mem(result, matrix, height, width);
        cuMemHostUnregister(result); // TODO check result value
    } else {
        cpu_transpose(result, matrix, height, width);
    }
    return result;
}
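The two TODOs above leave the CUresult from cuMemHostRegister and cuMemHostUnregister unchecked. One possible way to address the first of them, assuming a CPU fallback is acceptable when pinning fails (the helper name try_pin is hypothetical, not part of the original code):

#include <stdio.h>
#include <cuda.h>

/* Hypothetical helper: attempt to pin a host buffer and report failure
 * instead of ignoring it, so the caller can fall back to the CPU path. */
static int try_pin(void *buf, size_t bytes)
{
    CUresult err = cuMemHostRegister(buf, bytes, 0);
    if (err != CUDA_SUCCESS) {
        const char *name = NULL;
        cuGetErrorName(err, &name);
        fprintf(stderr, "cuMemHostRegister failed: %s\n",
                name ? name : "unknown CUDA error");
        return 0;   /* not pinned: caller should use cpu_transpose */
    }
    return 1;       /* pinned: caller must cuMemHostUnregister later */
}

In transpose(), the GPU branch would then be taken only when try_pin succeeds, with failure falling through to cpu_transpose(); the matching cuMemHostUnregister call deserves the same treatment.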
/**
 * Call the CUDA register function so we pin the memory in the CUDA
 * space.
 */
void mca_common_cuda_register(void *ptr, size_t amount, char *msg)
{
    int res;

    if (!initialized) {
        mca_common_cuda_init();
    }

    if (mca_common_cuda_enabled && mca_common_cuda_register_memory) {
        res = cuMemHostRegister(ptr, amount, 0);
        if (res != CUDA_SUCCESS) {
            /* If registering the memory fails, print a message and continue.
             * This is not a fatal error. */
            orte_show_help("help-mpi-common-cuda.txt", "cuMemHostRegister failed",
                           true, ptr, amount, res, msg);
        } else {
            opal_output_verbose(20, mca_common_cuda_output,
                                "CUDA: cuMemHostRegister OK on mpool %s: "
                                "address=%p, bufsize=%d",
                                msg, ptr, (int)amount);
        }
    }
}
cl_int
pocl_cuda_alloc_mem_obj (cl_device_id device, cl_mem mem_obj, void *host_ptr)
{
  cuCtxSetCurrent (((pocl_cuda_device_data_t *)device->data)->context);
  CUresult result;
  void *b = NULL;

  /* if memory for this global memory is not yet allocated -> do it */
  if (mem_obj->device_ptrs[device->global_mem_id].mem_ptr == NULL)
    {
      cl_mem_flags flags = mem_obj->flags;

      if (flags & CL_MEM_USE_HOST_PTR)
        {
#if defined __arm__
          // cuMemHostRegister is not supported on ARM.
          // Allocate device memory and perform explicit copies
          // before and after running a kernel.
          result = cuMemAlloc ((CUdeviceptr *)&b, mem_obj->size);
          CUDA_CHECK (result, "cuMemAlloc");
#else
          result = cuMemHostRegister (host_ptr, mem_obj->size,
                                      CU_MEMHOSTREGISTER_DEVICEMAP);
          if (result != CUDA_SUCCESS
              && result != CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED)
            CUDA_CHECK (result, "cuMemHostRegister");
          result = cuMemHostGetDevicePointer ((CUdeviceptr *)&b, host_ptr, 0);
          CUDA_CHECK (result, "cuMemHostGetDevicePointer");
#endif
        }
      else if (flags & CL_MEM_ALLOC_HOST_PTR)
        {
          result = cuMemHostAlloc (&mem_obj->mem_host_ptr, mem_obj->size,
                                   CU_MEMHOSTREGISTER_DEVICEMAP);
          CUDA_CHECK (result, "cuMemHostAlloc");
          result = cuMemHostGetDevicePointer ((CUdeviceptr *)&b,
                                              mem_obj->mem_host_ptr, 0);
          CUDA_CHECK (result, "cuMemHostGetDevicePointer");
        }
      else
        {
          result = cuMemAlloc ((CUdeviceptr *)&b, mem_obj->size);
          if (result != CUDA_SUCCESS)
            {
              const char *err;
              cuGetErrorName (result, &err);
              POCL_MSG_PRINT2 (__FUNCTION__, __LINE__,
                               "-> Failed to allocate memory: %s\n", err);
              return CL_MEM_OBJECT_ALLOCATION_FAILURE;
            }
        }

      if (flags & CL_MEM_COPY_HOST_PTR)
        {
          result = cuMemcpyHtoD ((CUdeviceptr)b, host_ptr, mem_obj->size);
          CUDA_CHECK (result, "cuMemcpyHtoD");
        }

      mem_obj->device_ptrs[device->global_mem_id].mem_ptr = b;
      mem_obj->device_ptrs[device->global_mem_id].global_mem_id
          = device->global_mem_id;
    }

  /* copy already allocated global mem info to the device's own slot */
  mem_obj->device_ptrs[device->dev_id]
      = mem_obj->device_ptrs[device->global_mem_id];

  return CL_SUCCESS;
}
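Stripped of their surrounding frameworks, these examples share one driver-API pattern: register an existing host allocation (optionally with CU_MEMHOSTREGISTER_DEVICEMAP to obtain a device-visible pointer), use it, then unregister it before freeing. A minimal self-contained sketch of that round trip, with error handling reduced to a simple check (illustrative only, not taken from any of the projects above):

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

#define CHECK(call)                                              \
    do {                                                         \
        CUresult _e = (call);                                    \
        if (_e != CUDA_SUCCESS) {                                \
            fprintf(stderr, "%s failed (%d)\n", #call, (int)_e); \
            exit(EXIT_FAILURE);                                  \
        }                                                        \
    } while (0)

int main(void)
{
    CUdevice dev;
    CUcontext ctx;
    CUdeviceptr dptr;
    size_t bytes = 1 << 20;
    void *host = malloc(bytes);

    CHECK(cuInit(0));
    CHECK(cuDeviceGet(&dev, 0));
    CHECK(cuCtxCreate(&ctx, 0, dev));

    /* Pin the existing host allocation, map it into the device address
     * space, and fetch the device-visible pointer for kernel use. */
    CHECK(cuMemHostRegister(host, bytes, CU_MEMHOSTREGISTER_DEVICEMAP));
    CHECK(cuMemHostGetDevicePointer(&dptr, host, 0));

    /* ... launch kernels that read/write dptr here ... */

    /* Unpin before freeing the host memory. */
    CHECK(cuMemHostUnregister(host));
    free(host);
    CHECK(cuCtxDestroy(ctx));
    return 0;
}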