int main(int argc, char **argv)
{
  int i;

  if ((argc >= 2) && (atoi(argv[1]) != RANK))
    error("rank %d mandatory", RANK);
  printf("CUDA RANK=%d\n", RANK);
  kernel.print();

  // build busylist
  busylist = (uint32_t *)malloc_file(CNK * sizeof(uint32_t), FMODE_RO,
                                     BLIST_FORMAT, RANK);

  // put busylist: pin the file-backed mapping, get its device alias, and
  // store that alias into the module-scope global "busylist"
  SafeCall(cuMemHostRegister(busylist, CNK * sizeof *busylist,
                             CU_MEMHOSTREGISTER_DEVICEMAP));
  SafeCall(cuMemHostGetDevicePointer(&host_busylist, busylist, 0));
  SafeCall(cuModuleGetGlobal(&dev_busylist, &bytes,
                             kernel.module[0].module, "busylist"));
  if (bytes != sizeof(host_busylist))
    error("busylist!");
  SafeCall(cuMemcpyHtoD(dev_busylist, &host_busylist, bytes));

  // put array
#ifdef IN_mk_data
  mkdir(DATADIR, 0755);
  errno = 0;
  array = (unsigned char *)malloc_file(abytes(RANK, CNK), 1, DATADIR"%d", RANK);
#else
  array = (unsigned char *)malloc_file(abytes(RANK, CNK), 0, DATADIR"%d", RANK);
#endif
  SafeCall(cuMemHostRegister(array, abytes(RANK, CNK),
                             CU_MEMHOSTREGISTER_DEVICEMAP));
  SafeCall(cuMemHostGetDevicePointer(&host_array, array, 0));
  SafeCall(cuModuleGetGlobal(&dev_array, &bytes,
                             kernel.module[0].module, "array"));
  if (bytes != sizeof(host_array))
    error("array!");
  SafeCall(cuMemcpyHtoD(dev_array, &host_array, bytes));

#define THREADS 512
#define MAXG    65535
  uint64_t nado  = (cnk[RANK] + (THREADS - 1)) / THREADS;
  uint32_t gridx = nado > MAXG ? MAXG : nado;
  uint32_t gridy = (nado + (MAXG - 1)) / MAXG;
  printf("gridy=%u gridx=%u THREADS=%d\n", gridy, gridx, THREADS);

  kernel.launch(params, THREADS, gridx, gridy);
  kernel.wait();

  SafeCall(cuMemHostUnregister(busylist));
  SafeCall(cuMemHostUnregister(array));

  SafeCall(cuModuleGetGlobal(&dev_changed, &bytes,
                             kernel.module[0].module, "changed"));
  if (bytes != sizeof(changed))
    error("changed!");
  SafeCall(cuMemcpyDtoH(changed, dev_changed, bytes));
  for (i = 0; i < CACHESIZE; i++)
    total += changed[i];
  printf("changed=%ju\n", (uintmax_t)total);

  return 0;
}
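The snippet above registers file-backed host mappings and then plants their device aliases in module-scope globals, so kernels can dereference them without taking a parameter. A minimal sketch of just that publish step, assuming an initialized context created with CU_CTX_MAP_HOST and a loaded module whose PTX declares `__device__ uint32_t *busylist;` (all names here are illustrative, not from the snippet; error checking omitted):

#include <cuda.h>
#include <stdint.h>

static void publish_mapped_buffer(CUmodule mod, size_t n)
{
    uint32_t *host_buf;
    CUdeviceptr mapped, global;
    size_t bytes;

    /* pinned, device-mapped host allocation */
    cuMemHostAlloc((void **)&host_buf, n * sizeof *host_buf,
                   CU_MEMHOSTALLOC_DEVICEMAP);
    /* device-side alias of the same memory */
    cuMemHostGetDevicePointer(&mapped, host_buf, 0);

    /* write the alias into the module-scope pointer so kernels can
       dereference `busylist` directly */
    cuModuleGetGlobal(&global, &bytes, mod, "busylist");
    cuMemcpyHtoD(global, &mapped, bytes);
}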
static void
map_init (struct ptx_stream *s)
{
  CUresult r;
  int size = getpagesize ();

  assert (s);
  assert (!s->d);
  assert (!s->h);

  r = cuMemAllocHost (&s->h, size);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r));

  r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r));

  assert (s->h);

  s->h_begin = s->h;
  s->h_end = s->h_begin + size;
  s->h_next = s->h_prev = s->h_tail = s->h_begin;

  assert (s->h_next);
  assert (s->h_end);
}
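map_init relies on its context being able to map host memory; cuMemHostGetDevicePointer fails otherwise. A minimal standalone sketch of that precondition (not part of the plugin above; error handling reduced to aborts):

#include <cuda.h>
#include <stdio.h>

#define CHECK(x) do { CUresult r_ = (x); if (r_ != CUDA_SUCCESS) { \
    fprintf(stderr, #x " failed: %d\n", (int)r_); return 1; } } while (0)

int main(void)
{
    CUdevice dev;
    CUcontext ctx;
    void *h;
    CUdeviceptr d;

    CHECK(cuInit(0));
    CHECK(cuDeviceGet(&dev, 0));
    CHECK(cuCtxCreate(&ctx, CU_CTX_MAP_HOST, dev)); /* mapping enabled */

    CHECK(cuMemAllocHost(&h, 4096));                /* page-locked host */
    CHECK(cuMemHostGetDevicePointer(&d, h, 0));     /* device alias */
    printf("host %p maps to device %#llx\n", h, (unsigned long long)d);

    CHECK(cuMemFreeHost(h));
    CHECK(cuCtxDestroy(ctx));
    return 0;
}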
void *swanDevicePtrForHostPtr( void *ptr )
{
	CUdeviceptr ptrd;
	/* the host pointer is passed as a void *, per the API signature */
	CUresult err = cuMemHostGetDevicePointer( &ptrd, ptr, 0 );
	if ( err != CUDA_SUCCESS ) {
		error( "swanDevicePtrForHostPtr failed\n" );
	}
	return (void *) ptrd;
}
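A hypothetical usage sketch for the wrapper above. It assumes swan's pinned-memory allocator swanMallocHost (referenced in the original error string) has a malloc-like signature; check the swan headers for the exact prototype:

/* allocate pinned, device-mappable host memory via swan (assumed API) */
float *h_data = (float *) swanMallocHost( 1024 * sizeof(float) );
/* d_data names the same bytes from the device's address space */
float *d_data = (float *) swanDevicePtrForHostPtr( h_data );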
SEXP
R_auto_cuMemHostGetDevicePointer(SEXP r_p, SEXP r_Flags)
{
    SEXP r_ans = R_NilValue;
    CUdeviceptr pdptr;
    void *p = GET_REF(r_p, void);
    unsigned int Flags = REAL(r_Flags)[0];
    CUresult ans;

    ans = cuMemHostGetDevicePointer(&pdptr, p, Flags);
    if (ans)
        return (R_cudaErrorInfo(ans));

    r_ans = R_createRef((void *) pdptr, "CUdeviceptr");
    return (r_ans);
}
GPUPtr GPUInterface::GetDevicePointer(void* hPtr) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\t\t\tEntering GPUInterface::GetDevicePointer\n");
#endif

    GPUPtr dPtr;
    SAFE_CUPP(cuMemHostGetDevicePointer(&dPtr, hPtr, 0));

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\t\t\tLeaving GPUInterface::GetDevicePointer\n");
#endif

    return dPtr;
}
cl_int
pocl_cuda_alloc_mem_obj (cl_device_id device, cl_mem mem_obj, void *host_ptr)
{
  cuCtxSetCurrent (((pocl_cuda_device_data_t *)device->data)->context);
  CUresult result;
  void *b = NULL;

  /* If memory for this global memory object is not yet allocated, do it. */
  if (mem_obj->device_ptrs[device->global_mem_id].mem_ptr == NULL)
    {
      cl_mem_flags flags = mem_obj->flags;

      if (flags & CL_MEM_USE_HOST_PTR)
        {
#if defined __arm__
          /* cuMemHostRegister is not supported on ARM.
             Allocate device memory and perform explicit copies
             before and after running a kernel. */
          result = cuMemAlloc ((CUdeviceptr *)&b, mem_obj->size);
          CUDA_CHECK (result, "cuMemAlloc");
#else
          result = cuMemHostRegister (host_ptr, mem_obj->size,
                                      CU_MEMHOSTREGISTER_DEVICEMAP);
          if (result != CUDA_SUCCESS
              && result != CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED)
            CUDA_CHECK (result, "cuMemHostRegister");
          result = cuMemHostGetDevicePointer ((CUdeviceptr *)&b, host_ptr, 0);
          CUDA_CHECK (result, "cuMemHostGetDevicePointer");
#endif
        }
      else if (flags & CL_MEM_ALLOC_HOST_PTR)
        {
          result = cuMemHostAlloc (&mem_obj->mem_host_ptr, mem_obj->size,
                                   CU_MEMHOSTALLOC_DEVICEMAP);
          CUDA_CHECK (result, "cuMemHostAlloc");
          result = cuMemHostGetDevicePointer ((CUdeviceptr *)&b,
                                              mem_obj->mem_host_ptr, 0);
          CUDA_CHECK (result, "cuMemHostGetDevicePointer");
        }
      else
        {
          result = cuMemAlloc ((CUdeviceptr *)&b, mem_obj->size);
          if (result != CUDA_SUCCESS)
            {
              const char *err;
              cuGetErrorName (result, &err);
              POCL_MSG_PRINT2 (__FUNCTION__, __LINE__,
                               "-> Failed to allocate memory: %s\n", err);
              return CL_MEM_OBJECT_ALLOCATION_FAILURE;
            }
        }

      if (flags & CL_MEM_COPY_HOST_PTR)
        {
          result = cuMemcpyHtoD ((CUdeviceptr)b, host_ptr, mem_obj->size);
          CUDA_CHECK (result, "cuMemcpyHtoD");
        }

      mem_obj->device_ptrs[device->global_mem_id].mem_ptr = b;
      mem_obj->device_ptrs[device->global_mem_id].global_mem_id
          = device->global_mem_id;
    }

  /* Copy the already-allocated global memory info to the device's own slot. */
  mem_obj->device_ptrs[device->dev_id]
      = mem_obj->device_ptrs[device->global_mem_id];

  return CL_SUCCESS;
}
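In the CL_MEM_USE_HOST_PTR branch above, the mapped device pointer stays valid only while the registration does. A minimal sketch of that full lifecycle on an ordinary host allocation, assuming an initialized context created with CU_CTX_MAP_HOST (error checking omitted for brevity):

#include <cuda.h>
#include <stdlib.h>

static void zero_copy_lifecycle(size_t size)
{
    void *host = NULL;
    CUdeviceptr dev;

    /* page-aligned allocation; older drivers require registration on
       page boundaries */
    if (posix_memalign(&host, 4096, size) != 0)
        return;

    /* pin the pages and map them into the device address space */
    cuMemHostRegister(host, size, CU_MEMHOSTREGISTER_DEVICEMAP);
    cuMemHostGetDevicePointer(&dev, host, 0);

    /* ... launch kernels that read and write `dev` in place ... */

    /* the device alias dies with the registration */
    cuMemHostUnregister(host);
    free(host);
}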
int
gib_recover ( void *buffers, int buf_size, int *buf_ids, int recover_last,
              gib_context c )
{
  ERROR_CHECK_FAIL(cuCtxPushCurrent(((gpu_context)(c->acc_context))->pCtx));

#if !GIB_USE_MMAP
  if (buf_size > gib_buf_size) {
    int rc = gib_cpu_recover(buffers, buf_size, buf_ids, recover_last, c);
    ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
    return rc;
  }
#endif

  int i, j;
  int n = c->n;
  int m = c->m;
  unsigned char A[128*128], inv[128*128], modA[128*128];

  for (i = n; i < n+recover_last; i++)
    if (buf_ids[i] >= n) {
      fprintf(stderr, "Attempting to recover a parity buffer, not allowed\n");
      /* pop the context pushed above before bailing out */
      ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
      return GIB_ERR;
    }

  gib_galois_gen_A(A, m+n, n);

  /* Modify the matrix to have the failed drives reflected */
  for (i = 0; i < n; i++)
    for (j = 0; j < n; j++)
      modA[i*n+j] = A[buf_ids[i]*n+j];

  gib_galois_gaussian_elim(modA, inv, n, n);

  /* Copy row buf_ids[i] into row i */
  for (i = n; i < n+recover_last; i++)
    for (j = 0; j < n; j++)
      modA[i*n+j] = inv[buf_ids[i]*n+j];

  int nthreads_per_block = 128;
  int fetch_size = sizeof(int)*nthreads_per_block;
  int nblocks = (buf_size + fetch_size - 1)/fetch_size;
  gpu_context gpu_c = (gpu_context) c->acc_context;

  CUdeviceptr F_d;
  ERROR_CHECK_FAIL(cuModuleGetGlobal(&F_d, NULL, gpu_c->module, "F_d"));
  ERROR_CHECK_FAIL(cuMemcpyHtoD(F_d, modA+n*n, (c->m)*(c->n)));
#if !GIB_USE_MMAP
  ERROR_CHECK_FAIL(cuMemcpyHtoD(gpu_c->buffers, buffers, (c->n)*buf_size));
#endif

  ERROR_CHECK_FAIL(cuFuncSetBlockShape(gpu_c->recover, nthreads_per_block,
                                       1, 1));
  int offset = 0;
  void *ptr;
#if GIB_USE_MMAP
  /* The kernel works on the user's buffers in place, through the
     device-mapped alias of the mmap'd region. */
  CUdeviceptr cpu_buffers;
  ERROR_CHECK_FAIL(cuMemHostGetDevicePointer(&cpu_buffers, buffers, 0));
  ptr = (void *)cpu_buffers;
#else
  ptr = (void *)gpu_c->buffers;
#endif
  ERROR_CHECK_FAIL(cuParamSetv(gpu_c->recover, offset, &ptr, sizeof(ptr)));
  offset += sizeof(ptr);
  ERROR_CHECK_FAIL(cuParamSetv(gpu_c->recover, offset, &buf_size,
                               sizeof(buf_size)));
  offset += sizeof(buf_size);
  ERROR_CHECK_FAIL(cuParamSetv(gpu_c->recover, offset, &recover_last,
                               sizeof(recover_last)));
  offset += sizeof(recover_last);
  ERROR_CHECK_FAIL(cuParamSetSize(gpu_c->recover, offset));
  ERROR_CHECK_FAIL(cuLaunchGrid(gpu_c->recover, nblocks, 1));

#if !GIB_USE_MMAP
  CUdeviceptr tmp_d = gpu_c->buffers + c->n*buf_size;
  void *tmp_h = (void *)((unsigned char *)(buffers) + c->n*buf_size);
  ERROR_CHECK_FAIL(cuMemcpyDtoH(tmp_h, tmp_d, recover_last*buf_size));
#else
  ERROR_CHECK_FAIL(cuCtxSynchronize());
#endif
  ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
  return GIB_SUC;
}
int
gib_generate ( void *buffers, int buf_size, gib_context c )
{
  ERROR_CHECK_FAIL(cuCtxPushCurrent(((gpu_context)(c->acc_context))->pCtx));

  /* Do it all at once if the buffers are small enough */
#if !GIB_USE_MMAP
  /* This is too large to do at once in the GPU memory we have allocated.
   * Split it into several noncontiguous jobs. */
  if (buf_size > gib_buf_size) {
    int rc = gib_generate_nc(buffers, buf_size, buf_size, c);
    ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
    return rc;
  }
#endif

  int nthreads_per_block = 128;
  int fetch_size = sizeof(int)*nthreads_per_block;
  int nblocks = (buf_size + fetch_size - 1)/fetch_size;
  gpu_context gpu_c = (gpu_context) c->acc_context;

  unsigned char F[256*256];
  gib_galois_gen_F(F, c->m, c->n);

  CUdeviceptr F_d;
  ERROR_CHECK_FAIL(cuModuleGetGlobal(&F_d, NULL, gpu_c->module, "F_d"));
  ERROR_CHECK_FAIL(cuMemcpyHtoD(F_d, F, (c->m)*(c->n)));

#if !GIB_USE_MMAP
  /* Copy the buffers to memory */
  ERROR_CHECK_FAIL(cuMemcpyHtoD(gpu_c->buffers, buffers, (c->n)*buf_size));
#endif

  /* Configure and launch */
  ERROR_CHECK_FAIL(cuFuncSetBlockShape(gpu_c->checksum, nthreads_per_block,
                                       1, 1));
  int offset = 0;
  void *ptr;
#if GIB_USE_MMAP
  CUdeviceptr cpu_buffers;
  ERROR_CHECK_FAIL(cuMemHostGetDevicePointer(&cpu_buffers, buffers, 0));
  ptr = (void *)cpu_buffers;
#else
  ptr = (void *)(gpu_c->buffers);
#endif
  ERROR_CHECK_FAIL(cuParamSetv(gpu_c->checksum, offset, &ptr, sizeof(ptr)));
  offset += sizeof(ptr);
  ERROR_CHECK_FAIL(cuParamSetv(gpu_c->checksum, offset, &buf_size,
                               sizeof(buf_size)));
  offset += sizeof(buf_size);
  ERROR_CHECK_FAIL(cuParamSetSize(gpu_c->checksum, offset));
  ERROR_CHECK_FAIL(cuLaunchGrid(gpu_c->checksum, nblocks, 1));

  /* Get the results back */
#if !GIB_USE_MMAP
  CUdeviceptr tmp_d = gpu_c->buffers + c->n*buf_size;
  void *tmp_h = (void *)((unsigned char *)(buffers) + c->n*buf_size);
  ERROR_CHECK_FAIL(cuMemcpyDtoH(tmp_h, tmp_d, (c->m)*buf_size));
#else
  ERROR_CHECK_FAIL(cuCtxSynchronize());
#endif
  ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
  return GIB_SUC;
}
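The two gibraltar functions above use the driver API's legacy execution-control calls (cuFuncSetBlockShape/cuParamSetv/cuLaunchGrid), which were deprecated in CUDA 4.0 and later removed. A sketch of the same launch expressed with cuLaunchKernel, passing the device-mapped alias of a host buffer as an ordinary kernel argument; `f`, `buffers`, `buf_size`, and `nblocks` stand in for the fields used above:

#include <cuda.h>

static CUresult launch_checksum(CUfunction f, void *buffers,
                                int buf_size, int nblocks)
{
    CUdeviceptr mapped;
    CUresult r = cuMemHostGetDevicePointer(&mapped, buffers, 0);
    if (r != CUDA_SUCCESS)
        return r;

    /* kernel arguments are passed as an array of pointers to the values */
    void *args[] = { &mapped, &buf_size };
    r = cuLaunchKernel(f, nblocks, 1, 1,   /* grid dimensions    */
                       128, 1, 1,          /* block dimensions   */
                       0, 0,               /* shared mem, stream */
                       args, NULL);
    if (r != CUDA_SUCCESS)
        return r;

    /* kernel writes land in the mapped host buffer; synchronize
       before the CPU reads the results */
    return cuCtxSynchronize();
}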