/*! */ void util::KernelExtractorDriver::kernelLaunch(CUfunction f, int gridX, int gridY) { state.launch.gridDim = ir::Dim3(gridX, gridY, 1); FunctionNameMap::const_iterator f_it = functionNameMap.find(f); cuCtxSynchronize(); // wait for previous launches to conclude [somehow] synchronizeFromDevice(); if (f_it != functionNameMap.end()) { state.launch.moduleName = f_it->second.first; state.launch.kernelName = f_it->second.second; } else { state.launch.moduleName = "unknown_module"; state.launch.kernelName = "unknown_kernel"; } // serialize 'before' state std::string launchName = state.launch.moduleName + "-" + state.launch.kernelName; std::ofstream file(state.application.name + "-" + launchName + ".json"); std::string app = state.application.name; state.application.name += "-before-" + launchName; state.serialize(file); state.application.name = app; }
int madd_gpu(struct device_info *device_info, CUdeviceptr *a_dev, CUdeviceptr *b_dev, CUdeviceptr *c_dev, unsigned int rows, unsigned int cols) { CUresult res; /* set kernel parameters */ void *kernel_params[] = {a_dev, b_dev, c_dev, &rows, &cols}; /* execute kernel */ unsigned int gridWidth = cols >> X_THREADS_PER_BLOCK_SHIFT; unsigned int gridHeight = rows >> Y_THREADS_PER_BLOCK_SHIFT; unsigned int shmemBytes = 0x40; /* random value */ if ((res = cuLaunchKernel(device_info->kernel, gridWidth, gridHeight, 1, X_THREADS_PER_BLOCK, Y_THREADS_PER_BLOCK, 1, shmemBytes, 0, kernel_params, 0)) != CUDA_SUCCESS) { printf("cuLaunchKernel failed: res = %lu\n", (unsigned long)res); return -1; } cuCtxSynchronize(); return 0; }
WEAK void halide_release() { // CUcontext ignore; // TODO: this is for timing; bad for release-mode performance CHECK_CALL( cuCtxSynchronize(), "cuCtxSynchronize on exit" ); // Only destroy the context if we own it if (weak_cuda_ctx) { CHECK_CALL( cuCtxDestroy(weak_cuda_ctx), "cuCtxDestroy on exit" ); weak_cuda_ctx = 0; } // Destroy the events if (__start) { cuEventDestroy(__start); cuEventDestroy(__end); __start = __end = 0; } // Unload the module if (__mod) { CHECK_CALL( cuModuleUnload(__mod), "cuModuleUnload" ); __mod = 0; } //CHECK_CALL( cuCtxPopCurrent(&ignore), "cuCtxPopCurrent" ); }
// Used to generate correct timings when tracing WEAK int halide_dev_sync(void *user_context) { DEBUG_PRINTF( user_context, "CUDA: halide_dev_sync (user_context: %p)\n", user_context ); CudaContext ctx(user_context); if (ctx.error != CUDA_SUCCESS) { return ctx.error; } #ifdef DEBUG uint64_t t_before = halide_current_time_ns(user_context); #endif CUresult err = cuCtxSynchronize(); if (err != CUDA_SUCCESS) { halide_error_varargs(user_context, "CUDA: cuCtxSynchronize failed (%s)", _get_error_name(err)); return err; } #ifdef DEBUG uint64_t t_after = halide_current_time_ns(user_context); halide_printf(user_context, " Time: %f ms\n", (t_after - t_before) / 1.0e6); #endif return 0; }
WEAK void halide_release() { // It's possible that this is being called from the destructor of // a static variable, in which case the driver may already be // shutting down. For this reason we allow the deinitialized // error. CHECK_CALL_DEINIT_OK( cuCtxSynchronize(), "cuCtxSynchronize on exit" ); // Only destroy the context if we own it if (weak_cuda_ctx) { CHECK_CALL_DEINIT_OK( cuCtxDestroy(weak_cuda_ctx), "cuCtxDestroy on exit" ); weak_cuda_ctx = 0; } // Destroy the events if (__start) { cuEventDestroy(__start); cuEventDestroy(__end); __start = __end = 0; } // Unload the module if (__mod) { CHECK_CALL_DEINIT_OK( cuModuleUnload(__mod), "cuModuleUnload" ); __mod = 0; } //CHECK_CALL( cuCtxPopCurrent(&ignore), "cuCtxPopCurrent" ); }
WEAK int halide_dev_run(void *user_context, void *state_ptr, const char* entry_name, int blocksX, int blocksY, int blocksZ, int threadsX, int threadsY, int threadsZ, int shared_mem_bytes, size_t arg_sizes[], void* args[]) { DEBUG_PRINTF( user_context, "CUDA: halide_dev_run (user_context: %p, entry: %s, blocks: %dx%dx%d, threads: %dx%dx%d, shmem: %d)\n", user_context, entry_name, blocksX, blocksY, blocksZ, threadsX, threadsY, threadsZ, shared_mem_bytes ); CUresult err; CudaContext ctx(user_context); if (ctx.error != CUDA_SUCCESS) { return ctx.error; } #ifdef DEBUG uint64_t t_before = halide_current_time_ns(user_context); #endif halide_assert(user_context, state_ptr); CUmodule mod = ((module_state*)state_ptr)->module; halide_assert(user_context, mod); CUfunction f; err = cuModuleGetFunction(&f, mod, entry_name); if (err != CUDA_SUCCESS) { halide_error_varargs(user_context, "CUDA: cuModuleGetFunction failed (%s)", _get_error_name(err)); return err; } err = cuLaunchKernel(f, blocksX, blocksY, blocksZ, threadsX, threadsY, threadsZ, shared_mem_bytes, NULL, // stream args, NULL); if (err != CUDA_SUCCESS) { halide_error_varargs(user_context, "CUDA: cuLaunchKernel failed (%s)", _get_error_name(err)); return err; } #ifdef DEBUG err = cuCtxSynchronize(); if (err != CUDA_SUCCESS) { halide_error_varargs(user_context, "CUDA: cuCtxSynchronize failed (%s)\n", _get_error_name(err)); return err; } uint64_t t_after = halide_current_time_ns(user_context); halide_printf(user_context, " Time: %f ms\n", (t_after - t_before) / 1.0e6); #endif return 0; }
/** * Frees the memory space pointed to by dptr, which must have been returned * by a previous call to cuMemAlloc() or cuMemAllocPitch(). * * Parameters: * dptr - Pointer to memory to free * * Returns: * CUDA_SUCCESS, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_NOT_INITIALIZED, * CUDA_ERROR_INVALID_CONTEXT, CUDA_ERROR_INVALID_VALUE */ CUresult cuMemFree_v2(CUdeviceptr dptr) { CUresult res; struct CUctx_st *ctx; Ghandle handle; uint64_t addr = dptr; uint64_t size; if (!gdev_initialized) return CUDA_ERROR_NOT_INITIALIZED; res = cuCtxGetCurrent(&ctx); if (res != CUDA_SUCCESS) return res; /* wait for all kernels to complete - some may be using the memory. */ cuCtxSynchronize(); handle = ctx->gdev_handle; if (!(size = gfree(handle, addr))) return CUDA_ERROR_INVALID_VALUE; return CUDA_SUCCESS; }
WEAK void halide_release(void *user_context) { // Do not do any of this if there is not context set. E.g. // if halide_release is called and no CUDA calls have been made. if (cuda_ctx_ptr != NULL) { // It's possible that this is being called from the destructor of // a static variable, in which case the driver may already be // shutting down. For this reason we allow the deinitialized // error. CHECK_CALL_DEINIT_OK( cuCtxSynchronize(), "cuCtxSynchronize on exit" ); // Destroy the events if (__start) { cuEventDestroy(__start); cuEventDestroy(__end); __start = __end = 0; } // Unload the module if (__mod) { CHECK_CALL_DEINIT_OK( cuModuleUnload(__mod), "cuModuleUnload" ); __mod = 0; } // Only destroy the context if we own it if (weak_cuda_ctx) { CHECK_CALL_DEINIT_OK( cuCtxDestroy(weak_cuda_ctx), "cuCtxDestroy on exit" ); weak_cuda_ctx = 0; } cuda_ctx_ptr = NULL; } //CHECK_CALL( cuCtxPopCurrent(&ignore), "cuCtxPopCurrent" ); }
void GPUInterface::ResizeStreamCount(int newStreamCount) { #ifdef BEAGLE_DEBUG_FLOW fprintf(stderr,"\t\t\tEntering GPUInterface::ResizeStreamCount\n"); #endif SAFE_CUDA(cuCtxPushCurrent(cudaContext)); SAFE_CUDA(cuCtxSynchronize()); if (cudaStreams != NULL) { for(int i=0; i<numStreams; i++) { if (cudaStreams[i] != NULL) SAFE_CUDA(cuStreamDestroy(cudaStreams[i])); } free(cudaStreams); } if (cudaEvents != NULL) { for(int i=0; i<numStreams; i++) { if (cudaEvents[i] != NULL) SAFE_CUDA(cuEventDestroy(cudaEvents[i])); } free(cudaEvents); } if (newStreamCount == 1) { numStreams = 1; cudaStreams = (CUstream*) malloc(sizeof(CUstream) * numStreams); cudaEvents = (CUevent*) malloc(sizeof(CUevent) * (numStreams + 1)); cudaStreams[0] = NULL; CUevent event; for(int i=0; i<2; i++) { SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING)); cudaEvents[i] = event; } } else { numStreams = newStreamCount; if (numStreams > BEAGLE_STREAM_COUNT) { numStreams = BEAGLE_STREAM_COUNT; } cudaStreams = (CUstream*) malloc(sizeof(CUstream) * numStreams); CUstream stream; cudaEvents = (CUevent*) malloc(sizeof(CUevent) * (numStreams + 1)); CUevent event; for(int i=0; i<numStreams; i++) { SAFE_CUDA(cuStreamCreate(&stream, CU_STREAM_DEFAULT)); cudaStreams[i] = stream; SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING)); cudaEvents[i] = event; } SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING)); cudaEvents[numStreams] = event; } SAFE_CUDA(cuCtxPopCurrent(&cudaContext)); #ifdef BEAGLE_DEBUG_FLOW fprintf(stderr,"\t\t\tLeaving GPUInterface::ResizeStreamCount\n"); #endif }
CAMLprim value spoc_cuda_flush_all(value gi, value dev){ CAMLparam2(gi, dev); CUDA_GET_CONTEXT; cuCtxSynchronize(); CUDA_RESTORE_CONTEXT; CAMLreturn(Val_unit); }
/* * Class: edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2 * Method: runBlocks * Signature: (I)V */ JNIEXPORT jint JNICALL Java_edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2_runBlocks (JNIEnv *env, jobject this_obj, jint num_blocks, jint block_shape, jint grid_shape){ CUresult status; jlong * infoSpace = (jlong *) malloc(gc_space_size); infoSpace[1] = heapEndPtr; cuMemcpyHtoD(gcInfoSpace, infoSpace, gc_space_size); cuMemcpyHtoD(gpuToSpace, toSpace, heapEndPtr); //cuMemcpyHtoD(gpuTexture, textureMemory, textureMemSize); cuMemcpyHtoD(gpuHandlesMemory, handlesMemory, num_blocks * sizeof(jlong)); cuMemcpyHtoD(gpuHeapEndPtr, &heapEndPtr, sizeof(jlong)); cuMemcpyHtoD(gpuBufferSize, &bufferSize, sizeof(jlong)); /* status = cuModuleGetTexRef(&cache, cuModule, "m_Cache"); if (CUDA_SUCCESS != status) { printf("error in cuModuleGetTexRef %d\n", status); } status = cuTexRefSetAddress(0, cache, gpuTexture, textureMemSize); if (CUDA_SUCCESS != status) { printf("error in cuTextRefSetAddress %d\n", status); } */ status = cuFuncSetBlockShape(cuFunction, block_shape, 1, 1); if (CUDA_SUCCESS != status) { printf("error in cuFuncSetBlockShape %d\n", status); return (jint) status; } status = cuLaunchGrid(cuFunction, grid_shape, 1); if (CUDA_SUCCESS != status) { printf("error in cuLaunchGrid %d\n", status); fflush(stdout); return (jint) status; } status = cuCtxSynchronize(); if (CUDA_SUCCESS != status) { printf("error in cuCtxSynchronize %d\n", status); return (jint) status; } cuMemcpyDtoH(infoSpace, gcInfoSpace, gc_space_size); heapEndPtr = infoSpace[1]; cuMemcpyDtoH(toSpace, gpuToSpace, heapEndPtr); cuMemcpyDtoH(exceptionsMemory, gpuExceptionsMemory, num_blocks * sizeof(jlong)); free(infoSpace); return 0; }
CUresult TestSAXPY( chCUDADevice *chDevice, size_t N, float alpha ) { CUresult status; CUdeviceptr dptrOut = 0; CUdeviceptr dptrIn = 0; float *hostOut = 0; float *hostIn = 0; CUDA_CHECK( cuCtxPushCurrent( chDevice->context() ) ); CUDA_CHECK( cuMemAlloc( &dptrOut, N*sizeof(float) ) ); CUDA_CHECK( cuMemsetD32( dptrOut, 0, N ) ); CUDA_CHECK( cuMemAlloc( &dptrIn, N*sizeof(float) ) ); CUDA_CHECK( cuMemHostAlloc( (void **) &hostOut, N*sizeof(float), 0 ) ); CUDA_CHECK( cuMemHostAlloc( (void **) &hostIn, N*sizeof(float), 0 ) ); for ( size_t i = 0; i < N; i++ ) { hostIn[i] = (float) rand() / (float) RAND_MAX; } CUDA_CHECK( cuMemcpyHtoDAsync( dptrIn, hostIn, N*sizeof(float ), NULL ) ); { CUmodule moduleSAXPY; CUfunction kernelSAXPY; void *params[] = { &dptrOut, &dptrIn, &N, &alpha }; moduleSAXPY = chDevice->module( "saxpy.ptx" ); if ( ! moduleSAXPY ) { status = CUDA_ERROR_NOT_FOUND; goto Error; } CUDA_CHECK( cuModuleGetFunction( &kernelSAXPY, moduleSAXPY, "saxpy" ) ); CUDA_CHECK( cuLaunchKernel( kernelSAXPY, 1500, 1, 1, 512, 1, 1, 0, NULL, params, NULL ) ); } CUDA_CHECK( cuMemcpyDtoHAsync( hostOut, dptrOut, N*sizeof(float), NULL ) ); CUDA_CHECK( cuCtxSynchronize() ); for ( size_t i = 0; i < N; i++ ) { if ( fabsf( hostOut[i] - alpha*hostIn[i] ) > 1e-5f ) { status = CUDA_ERROR_UNKNOWN; goto Error; } } status = CUDA_SUCCESS; printf( "Well it worked!\n" ); Error: cuCtxPopCurrent( NULL ); cuMemFreeHost( hostOut ); cuMemFreeHost( hostIn ); cuMemFree( dptrOut ); cuMemFree( dptrIn ); return status; }
void swanSynchronize( void ) { CUresult err =cuCtxSynchronize(); if ( err != CUDA_SUCCESS ) { error("swanSynchronize failed\n" ); } if( state.debug ) { printf("# swanSynchronize()\n"); } }
void CudaModule::sync(bool yield) { if (!s_inited) { return; } if (!yield || !s_endEvent) { checkError("cuCtxSynchronize", cuCtxSynchronize()); return; } }
void GPUInterface::SynchronizeHost() { #ifdef BEAGLE_DEBUG_FLOW fprintf(stderr,"\t\t\tEntering GPUInterface::SynchronizeHost\n"); #endif SAFE_CUPP(cuCtxSynchronize()); #ifdef BEAGLE_DEBUG_FLOW fprintf(stderr,"\t\t\tLeaving GPUInterface::SynchronizeHost\n"); #endif }
SEXP R_cuCtxSynchronize() { SEXP r_ans = R_NilValue; CUresult ans; ans = cuCtxSynchronize(); r_ans = Renum_convert_CUresult(ans) ; return(r_ans); }
void util::KernelExtractorDriver::kernelReturn(CUresult result) { cuCtxSynchronize(); synchronizeFromDevice(); std::string launchName = state.launch.moduleName + "-" + state.launch.kernelName; std::ofstream file(state.application.name + "-" + launchName + ".json", std::ios_base::app); std::string app = state.application.name; state.application.name += "-after-" + state.launch.moduleName + "-" + state.launch.kernelName; state.serialize(file); state.application.name = app; }
void swanRunKernel( const char *kernel, block_config_t grid , block_config_t block, size_t shmem, int flags, void *ptrs[], int *types ) { CUresult err; swanRunKernelAsync( kernel, grid, block, shmem, flags, ptrs, types ); if(state.debug) { err =cuCtxSynchronize(); } // if( err != CUDA_SUCCESS ) { // fprintf( stderr , "SWAN : FATAL : Failure executing kernel sync [%s] [%d]\n", kernel, err ); // assert(0); // exit(-99); // } }
/* * execute kernel function */ void execute_cuda(void){ res = cuLaunchGrid(function, 1, block_num); if(res != CUDA_SUCCESS){ printf("cuLaunchGrid() failed: res = %s\n", conv(res)); exit(1); } res = cuCtxSynchronize(); if(res != CUDA_SUCCESS){ printf("cuCtxSynchronize() failed: res = %s\n", conv(res)); exit(1); } }
WEAK void halide_release(void *user_context) { DEBUG_PRINTF( user_context, "CUDA: halide_release (user_context: %p)\n", user_context ); int err; CUcontext ctx; err = halide_acquire_cuda_context(user_context, &ctx); if (err != CUDA_SUCCESS || !ctx) { return; } // It's possible that this is being called from the destructor of // a static variable, in which case the driver may already be // shutting down. err = cuCtxSynchronize(); halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED); // Unload the modules attached to this context. Note that the list // nodes themselves are not freed, only the module objects are // released. Subsequent calls to halide_init_kernels might re-create // the program object using the same list node to store the module // object. module_state *state = state_list; while (state) { if (state->module) { DEBUG_PRINTF(user_context, " cuModuleUnload %p\n", state->module); err = cuModuleUnload(state->module); halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED); state->module = 0; } state = state->next; } // Only destroy the context if we own it if (ctx == weak_cuda_ctx) { DEBUG_PRINTF(user_context, " cuCtxDestroy %p\n", weak_cuda_ctx); err = cuCtxDestroy(weak_cuda_ctx); halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED); weak_cuda_ctx = NULL; } halide_release_cuda_context(user_context); }
bool VideoDecoderCUDAPrivate::processDecodedData(CUVIDPARSERDISPINFO *cuviddisp, VideoFrame* outFrame) { int num_fields = cuviddisp->progressive_frame ? 1 : 2+cuviddisp->repeat_first_field; for (int active_field = 0; active_field < num_fields; ++active_field) { CUVIDPROCPARAMS proc_params; memset(&proc_params, 0, sizeof(CUVIDPROCPARAMS)); proc_params.progressive_frame = cuviddisp->progressive_frame; //check user config proc_params.second_field = active_field == 1; //check user config proc_params.top_field_first = cuviddisp->top_field_first; proc_params.unpaired_field = cuviddisp->progressive_frame == 1; CUdeviceptr devptr; unsigned int pitch; cuvidCtxLock(vid_ctx_lock, 0); CUresult cuStatus = cuvidMapVideoFrame(dec, cuviddisp->picture_index, &devptr, &pitch, &proc_params); if (cuStatus != CUDA_SUCCESS) { qWarning("cuvidMapVideoFrame failed on index %d (%#x, %s)", cuviddisp->picture_index, cuStatus, _cudaGetErrorEnum(cuStatus)); cuvidUnmapVideoFrame(dec, devptr); cuvidCtxUnlock(vid_ctx_lock, 0); return false; } #define PAD_ALIGN(x,mask) ( (x + mask) & ~mask ) //uint w = dec_create_info.ulWidth;//PAD_ALIGN(dec_create_info.ulWidth, 0x3F); uint h = dec_create_info.ulHeight;//PAD_ALIGN(dec_create_info.ulHeight, 0x0F); //? #undef PAD_ALIGN int size = pitch*h*3/2; if (size > host_data_size && host_data) { cuMemFreeHost(host_data); host_data = 0; host_data_size = 0; } if (!host_data) { cuStatus = cuMemAllocHost((void**)&host_data, size); if (cuStatus != CUDA_SUCCESS) { qWarning("cuMemAllocHost failed (%#x, %s)", cuStatus, _cudaGetErrorEnum(cuStatus)); cuvidUnmapVideoFrame(dec, devptr); cuvidCtxUnlock(vid_ctx_lock, 0); return false; } host_data_size = size; } if (!host_data) { qWarning("No valid staging memory!"); cuvidUnmapVideoFrame(dec, devptr); cuvidCtxUnlock(vid_ctx_lock, 0); return false; } cuStatus = cuMemcpyDtoHAsync(host_data, devptr, size, stream); if (cuStatus != CUDA_SUCCESS) { qWarning("cuMemcpyDtoHAsync failed (%#x, %s)", cuStatus, _cudaGetErrorEnum(cuStatus)); cuvidUnmapVideoFrame(dec, devptr); cuvidCtxUnlock(vid_ctx_lock, 0); return false; } cuStatus = cuCtxSynchronize(); if (cuStatus != CUDA_SUCCESS) { qWarning("cuCtxSynchronize failed (%#x, %s)", cuStatus, _cudaGetErrorEnum(cuStatus)); } cuvidUnmapVideoFrame(dec, devptr); cuvidCtxUnlock(vid_ctx_lock, 0); //qDebug("mark not in use pic_index: %d", cuviddisp->picture_index); surface_in_use[cuviddisp->picture_index] = false; uchar *planes[] = { host_data, host_data + pitch * h }; int pitches[] = { (int)pitch, (int)pitch }; VideoFrame frame(codec_ctx->width, codec_ctx->height, VideoFormat::Format_NV12); frame.setBits(planes); frame.setBytesPerLine(pitches); //TODO: is clone required? may crash on clone, I should review clone() //frame = frame.clone(); if (outFrame) { *outFrame = frame.clone(); } #if COPY_ON_DECODE frame_queue.put(frame.clone()); #endif //qDebug("frame queue size: %d", frame_queue.size()); } return true; }
static CUT_THREADPROC dt_thread_func(void *p) { dt_partition *pt = (dt_partition *)p; struct timeval tv; CUresult res; int thread_num_x=0, thread_num_y=0; int block_num_x=0, block_num_y=0; res = cuCtxSetCurrent(ctx[pt->pid]); if(res != CUDA_SUCCESS) { printf("cuCtxSetCurrent(ctx[%d]) failed: res = %s\n", pt->pid, cuda_response_to_string(res)); exit(1); } /* allocate GPU memory */ //printf("part_error_array_num = %d\n",part_error_array_num); if(pt->pid == 0){ gettimeofday(&tv_memcpy_start, NULL); } res = cuMemcpyHtoD(part_C_dev[pt->pid], dst_C, SUM_SIZE_C); if(res != CUDA_SUCCESS) { printf("cuMemcpyHtoD(part_C_dev) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemcpyHtoD(part_error_array_dev[pt->pid], part_error_array, part_error_array_num*sizeof(int)); if(res != CUDA_SUCCESS) { printf("cuMemcpyHtoD(part_error_array_dev) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemcpyHtoD(pm_size_array_dev[pt->pid], &pt->size_array[0][0], pt->NoP*2*pt->L_MAX*sizeof(int)); if(res != CUDA_SUCCESS) { printf("cuMemcpyHtoD(pm_size_array_dev) falied: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemcpyHtoD(def_array_dev[pt->pid], pt->def, sum_size_def_array); if(res != CUDA_SUCCESS) { printf("cuMemcpyHtoD(def_array_dev) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemcpyHtoD(numpart_dev[pt->pid], pt->numpart, pt->NoC*sizeof(int)); if(res != CUDA_SUCCESS) { printf("cuMemcpyHtoD(cuMemcpyHtoD(numpart_dev) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemcpyHtoD(PIDX_array_dev[pt->pid], pt->dst_PIDX, pt->tmp_array_size); if(res != CUDA_SUCCESS) { printf("cuMemcpyHtoD(PIDX_array) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemcpyHtoD(DID_4_array_dev[pt->pid], pt->dst_DID_4, pt->tmp_array_size); if(res != CUDA_SUCCESS) { printf("cuMemcpyHtoD(DID_4__array) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } if(pt->pid == 0){ gettimeofday(&tv_memcpy_end, NULL); tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv); time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0; } int sharedMemBytes = 0; /* get max thread num per block */ int max_threads_num = 0; res = cuDeviceGetAttribute(&max_threads_num, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev[pt->pid]); if(res != CUDA_SUCCESS){ printf("\ncuDeviceGetAttribute() failed: res = %s\n", cuda_response_to_string(res)); exit(1); } /* prepare for launch inverse_Q */ void* kernel_args_inverse[] = { &part_C_dev[pt->pid], &pm_size_array_dev[pt->pid], &part_error_array_dev[pt->pid], &part_error_array_num, (void*)&(pt->NoP), &PIDX_array_dev[pt->pid], &numpart_dev[pt->pid], (void*)&(pt->NoC), (void*)&(pt->max_numpart), (void*)&(pt->interval), (void*)&(pt->L_MAX), (void*)&(pt->pid), (void*)&(device_num) }; /* define CUDA block shape */ int upper_limit_th_num_x = max_threads_num/(pt->max_numpart*pt->NoC); int upper_limit_th_num_y = max_threads_num/upper_limit_th_num_x; if(upper_limit_th_num_x < 1) upper_limit_th_num_x++; if(upper_limit_th_num_y < 1) upper_limit_th_num_y++; thread_num_x = (pt->max_dim0*pt->max_dim1 < upper_limit_th_num_x) ? (pt->max_dim0*pt->max_dim1) : upper_limit_th_num_x; thread_num_y = (pt->max_numpart < upper_limit_th_num_y) ? pt->max_numpart : upper_limit_th_num_y; block_num_x = (pt->max_dim0*pt->max_dim1) / thread_num_x; block_num_y = (pt->max_numpart) / thread_num_y; if((pt->max_dim0*pt->max_dim1) % thread_num_x != 0) block_num_x++; if(pt->max_numpart % thread_num_y != 0) block_num_y++; int blockDimY = thread_num_y / device_num; if(thread_num_y%device_num != 0){ blockDimY++; } /* launch iverse_Q */ if(pt->pid == 0){ gettimeofday(&tv_kernel_start, NULL); } res = cuLaunchKernel( func_inverse_Q[pt->pid], // call function block_num_x, // gridDimX block_num_y, // gridDimY pt->L_MAX-pt->interval, // gridDimZ thread_num_x, // blockDimX blockDimY, // blockDimY pt->NoC, // blockDimZ sharedMemBytes, // sharedMemBytes NULL, // hStream kernel_args_inverse, // kernelParams NULL // extra ); if(res != CUDA_SUCCESS) { printf("block_num_x %d, block_num_y %d, thread_num_x %d, thread_num_y %d\n", block_num_x, block_num_y, thread_num_x, thread_num_y); printf("cuLaunchKernel(inverse_Q) failed : res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuCtxSynchronize(); if(res != CUDA_SUCCESS) { printf("cuCtxSynchronize(inverse_Q) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } if(pt->pid == 0){ gettimeofday(&tv_kernel_end, NULL); tvsub(&tv_kernel_end, &tv_kernel_start, &tv); time_kernel += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0; } /* prepare for launch dt1d_x */ void* kernel_args_x[] = { &part_C_dev[pt->pid], // FLOAT *src_start &tmpM_dev[pt->pid], // FLOTA *dst &tmpIy_dev[pt->pid], // int *ptr &DID_4_array_dev[pt->pid], // int *DID_4_array, &def_array_dev[pt->pid], // FLOAT *def_array, &pm_size_array_dev[pt->pid], // int *size_array (void*)&(pt->NoP), // int NoP &PIDX_array_dev[pt->pid], // int *PIDX_array &part_error_array_dev[pt->pid], // int *error_array (void*)&(part_error_array_num), // int error_array_num &numpart_dev[pt->pid], // int *numpart (void*)&(pt->NoC), // int NoC (void*)&(pt->max_numpart), // int max_numpart (void*)&(pt->interval), // int interval (void*)&(pt->L_MAX), // int L_MAX (void*)&(pt->pid), // int pid (void*)&(device_num) // int device_num }; max_threads_num = 64/pt->NoC; if(max_threads_num < 1) max_threads_num++; thread_num_x = (pt->max_dim1 < max_threads_num) ? pt->max_dim1 : max_threads_num; thread_num_y = (pt->max_numpart < max_threads_num) ? pt->max_numpart : max_threads_num; block_num_x = pt->max_dim1 / thread_num_x; block_num_y = pt->max_numpart / thread_num_y; if(pt->max_dim1 % thread_num_x != 0) block_num_x++; if(pt->max_numpart % thread_num_y != 0) block_num_y++; blockDimY = thread_num_y / device_num; if(thread_num_y%device_num != 0){ blockDimY++; } /* launch dt1d_x */ if(pt->pid == 0){ gettimeofday(&tv_kernel_start, NULL); } res = cuLaunchKernel( func_dt1d_x[pt->pid], // call function block_num_x, // gridDimX block_num_y, // gridDimY pt->L_MAX-pt->interval, // gridDimZ thread_num_x, // blockDimX blockDimY, // blockDimY pt->NoC, // blockDimZ sharedMemBytes, // sharedMemBytes NULL, // hStream kernel_args_x, // kernelParams NULL // extra ); if(res != CUDA_SUCCESS) { printf("block_num_x %d, block_num_y %d, thread_num_x %d, thread_num_y %d\n", block_num_x, block_num_y, thread_num_x, thread_num_y); printf("cuLaunchKernel(dt1d_x) failed : res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuCtxSynchronize(); if(res != CUDA_SUCCESS) { printf("cuCtxSynchronize(dt1d_x) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } if(pt->pid == 0){ gettimeofday(&tv_kernel_end, NULL); tvsub(&tv_kernel_end, &tv_kernel_start, &tv); time_kernel += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0; } /* prepare for launch dt1d_y */ void* kernel_args_y[] = { &tmpM_dev[pt->pid], // FLOAT *src_start &M_dev[pt->pid], // FLOAT *dst_start &tmpIx_dev[pt->pid], // int *ptr_start &DID_4_array_dev[pt->pid], // int *DID_4_array, &def_array_dev[pt->pid], // FLOAT *def_array, (void*)&(pt->NoP), // int NoP &pm_size_array_dev[pt->pid], // int *size_array &numpart_dev[pt->pid], // int *numpart, &PIDX_array_dev[pt->pid], // int *PIDX_array, (void*)&(pt->NoC), // int NoC (void*)&(pt->max_numpart), // int max_numpart (void*)&(pt->interval), // int interval (void*)&(pt->L_MAX), // int L_MAX &part_error_array_dev[pt->pid], // int *error_array (void*)&(part_error_array_num), // int error_array_num (void*)&(pt->pid), // int pid (void*)&(device_num) // int device_num }; thread_num_x = (pt->max_dim0 < max_threads_num) ? pt->max_dim0 : max_threads_num; thread_num_y = (pt->max_numpart < max_threads_num) ? pt->max_numpart : max_threads_num; block_num_x = pt->max_dim0 / thread_num_x; block_num_y = pt->max_numpart / thread_num_y; if(pt->max_dim0 % thread_num_x != 0) block_num_x++; if(pt->max_numpart % thread_num_y != 0) block_num_y++; blockDimY = thread_num_y / device_num; if(thread_num_y%device_num != 0){ blockDimY++; } /* prepare for launch dt1d_y */ if(pt->pid == 0){ gettimeofday(&tv_kernel_start, NULL); } res = cuLaunchKernel( func_dt1d_y[pt->pid], // call functions block_num_x, // gridDimX block_num_y, // gridDimY pt->L_MAX-pt->interval, // gridDimZ thread_num_x, // blockDimX blockDimY, // blockDimY pt->NoC, // blockDimZ sharedMemBytes, // sharedMemBytes NULL, // hStream kernel_args_y, // kernelParams NULL // extra ); if(res != CUDA_SUCCESS) { printf("cuLaunchKernel(dt1d_y failed : res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuCtxSynchronize(); if(res != CUDA_SUCCESS) { printf("cuCtxSynchronize(dt1d_y) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } if(pt->pid == 0){ gettimeofday(&tv_kernel_end, NULL); tvsub(&tv_kernel_end, &tv_kernel_start, &tv); time_kernel += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0; } /* downloads datas from GPU */ /* downloads M from GPU */ int sum_part_size = 0; int sum_pointer_size = 0; int sum_move_size = 0; int part_size = 0; int pointer_size = 0; int part_y = 0; int move_size = 0; int start_kk = 0; int end_kk = 0; int part_end_kk = 0; unsigned long long int pointer_dst_M = (unsigned long long int)pt->dst_M; unsigned long long int pointer_M_dev = (unsigned long long int)M_dev[pt->pid]; for(int L=0; L<(pt->L_MAX-pt->interval); L++) { /**************************************************************************/ /* loop condition */ if( (pt->FSIZE[(L+pt->interval)*2]+2*pt->pady < pt->max_Y) || (pt->FSIZE[(L+pt->interval)*2+1]+2*pt->padx < pt->max_X) ) { continue; } /* loop conditon */ /**************************************************************************/ for(int jj=0; jj<pt->NoC; jj++) { part_y = pt->numpart[jj] / device_num; if(pt->numpart[jj]%device_num != 0){ part_y++; } start_kk = part_y * pt->pid; end_kk = part_y * (pt->pid + 1); if(end_kk > pt->numpart[jj]){ end_kk = pt->numpart[jj]; } if(pt->pid > 0){ part_end_kk = part_y * pt->pid; } for(int kk=0; kk<pt->numpart[jj]; kk++) { int PIDX = pt->PIDX_array[L][jj][kk]; int dims0 = pt->size_array[L][PIDX*2]; int dims1 = pt->size_array[L][PIDX*2+1]; if(start_kk <= kk && kk < end_kk){ part_size += dims0 * dims1; } //if(pt->pid > 0 && part_start_kk <= kk && kk < part_end_kk){ if(pt->pid > 0 && 0 <= kk && kk < part_end_kk){ pointer_size += dims0 * dims1; } move_size += dims0 * dims1; } sum_part_size += part_size; sum_pointer_size += pointer_size; sum_move_size += move_size; // error pt->pid == 2 && L == 24 && jj == 1 if(pt->pid*part_y < pt->numpart[jj]){ if(pt->pid == 0){ gettimeofday(&tv_memcpy_start, NULL); } res = cuMemcpyDtoH((void *)(pointer_dst_M+(unsigned long long int)(pointer_size*sizeof(FLOAT))), (CUdeviceptr)(pointer_M_dev+(unsigned long long int)(pointer_size*sizeof(FLOAT))), part_size*sizeof(FLOAT)); if(res != CUDA_SUCCESS) { printf("error pid = %d\n",pt->pid); printf("cuMemcpyDtoH(dst_M) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } if(pt->pid == 0){ gettimeofday(&tv_memcpy_end, NULL); tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv); time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0; } } pointer_dst_M += (unsigned long long int)(move_size * sizeof(FLOAT)); pointer_M_dev += (unsigned long long int)(move_size * sizeof(FLOAT)); part_size = 0; pointer_size = 0; move_size = 0; } } /* downloads tmpIx from GPU */ sum_part_size = 0; sum_pointer_size = 0; part_size = 0; pointer_size = 0; part_y = 0; move_size = 0; start_kk = 0; end_kk = 0; part_end_kk = 0; unsigned long long int pointer_dst_tmpIx = (unsigned long long int)pt->dst_tmpIx; unsigned long long int pointer_tmpIx_dev = (unsigned long long int)tmpIx_dev[pt->pid]; for(int L=0; L<(pt->L_MAX-pt->interval); L++) { /**************************************************************************/ /* loop condition */ if( (pt->FSIZE[(L+pt->interval)*2]+2*pt->pady < pt->max_Y) || (pt->FSIZE[(L+pt->interval)*2+1]+2*pt->padx < pt->max_X) ) { continue; } /* loop conditon */ /**************************************************************************/ for(int jj=0; jj<pt->NoC; jj++) { part_y = pt->numpart[jj] / device_num; if(pt->numpart[jj]%device_num != 0){ part_y++; } start_kk = part_y * pt->pid; end_kk = part_y * (pt->pid + 1); if(end_kk > pt->numpart[jj]){ end_kk = pt->numpart[jj]; } if(pt->pid > 0){ part_end_kk = part_y * pt->pid; } for(int kk=0; kk<pt->numpart[jj]; kk++) { int PIDX = pt->PIDX_array[L][jj][kk]; int dims0 = pt->size_array[L][PIDX*2]; int dims1 = pt->size_array[L][PIDX*2+1]; if(start_kk <= kk && kk < end_kk){ part_size += dims0 * dims1; } if(pt->pid > 0){ if(0 <= kk && kk < part_end_kk){ pointer_size += dims0 * dims1; } } move_size += dims0 * dims1; } sum_part_size += part_size; sum_pointer_size += pointer_size; if(pt->pid*part_y < pt->numpart[jj]){ if(pt->pid == 0){ gettimeofday(&tv_memcpy_start, NULL); } res = cuMemcpyDtoH((void *)(pointer_dst_tmpIx+(unsigned long long int)(pointer_size*sizeof(int))), (CUdeviceptr)(pointer_tmpIx_dev+(unsigned long long int)(pointer_size*sizeof(int))), part_size*sizeof(int)); if(res != CUDA_SUCCESS) { printf("error pid = %d\n",pt->pid); printf("cuMemcpyDtoH(tmpIx) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } if(pt->pid == 0){ gettimeofday(&tv_memcpy_end, NULL); tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv); time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0; } } pointer_dst_tmpIx += (unsigned long long int)(move_size * sizeof(int)); pointer_tmpIx_dev += (unsigned long long int)(move_size * sizeof(int)); part_size = 0; pointer_size = 0; move_size = 0; } } /* downloads tmpIy from GPU */ sum_part_size = 0; sum_pointer_size = 0; part_size = 0; pointer_size = 0; part_y = 0; move_size = 0; start_kk = 0; end_kk = 0; part_end_kk = 0; unsigned long long int pointer_dst_tmpIy = (unsigned long long int)pt->dst_tmpIy; unsigned long long int pointer_tmpIy_dev = (unsigned long long int)tmpIy_dev[pt->pid]; for(int L=0; L<(pt->L_MAX-pt->interval); L++) { /**************************************************************************/ /* loop condition */ if( (pt->FSIZE[(L+pt->interval)*2]+2*pt->pady < pt->max_Y) || (pt->FSIZE[(L+pt->interval)*2+1]+2*pt->padx < pt->max_X) ) { continue; } /* loop conditon */ /**************************************************************************/ for(int jj=0; jj<pt->NoC; jj++) { part_y = pt->numpart[jj] / device_num; if(pt->numpart[jj]%device_num != 0){ part_y++; } start_kk = part_y * pt->pid; end_kk = part_y * (pt->pid + 1); if(end_kk > pt->numpart[jj]){ end_kk = pt->numpart[jj]; } if(pt->pid > 0){ part_end_kk = part_y * pt->pid; } for(int kk=0; kk<pt->numpart[jj]; kk++) { int PIDX = pt->PIDX_array[L][jj][kk]; int dims0 = pt->size_array[L][PIDX*2]; int dims1 = pt->size_array[L][PIDX*2+1]; if(start_kk <= kk && kk < end_kk){ part_size += dims0 * dims1; } if(pt->pid > 0){ if(0 <= kk && kk < part_end_kk){ pointer_size += dims0 * dims1; } } move_size += dims0 * dims1; } sum_part_size += part_size; sum_pointer_size += pointer_size; if(pt->pid*part_y < pt->numpart[jj]){ if(pt->pid == 0){ gettimeofday(&tv_memcpy_start, NULL); } res = cuMemcpyDtoH((void *)(pointer_dst_tmpIy+(unsigned long long int)(pointer_size*sizeof(int))), (CUdeviceptr)(pointer_tmpIy_dev+(unsigned long long int)(pointer_size*sizeof(int))), part_size*sizeof(int)); if(res != CUDA_SUCCESS) { printf("error pid = %d\n",pt->pid); printf("cuMemcpyDtoH(tmpIy) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } if(pt->pid == 0){ gettimeofday(&tv_memcpy_end, NULL); tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv); time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0; } } pointer_dst_tmpIy += (unsigned long long int)(move_size * sizeof(int)); pointer_tmpIy_dev += (unsigned long long int)(move_size * sizeof(int)); part_size = 0; pointer_size = 0; move_size = 0; } } /* end of thread */ CUT_THREADEND; }
void draw() { rmt_LogText("start profiling"); //rmt_BeginCPUSample(uv_run); uv_run(uv_default_loop(), UV_RUN_NOWAIT); //rmt_EndCPUSample(); CUstream stream0 = 0; rmt_BeginCUDASample(main, stream0); { if (isResized()) { setupSizeResource(); } // Launch the Vector Add CUDA Kernel int threadsPerBlock = 256; int blocksPerGrid = (img.width * img.height + threadsPerBlock - 1) / threadsPerBlock; //printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock); dim3 blockDim = { 32, 32, 1 }; dim3 gridDim = { width / blockDim.x, height / blockDim.y, 1 }; float3 iResolution = { width, height, 1 }; float iGlobalTime = glfwGetTime(); float4 iMouse = { mouseX, mouseY, mouseX, mouseY }; rmt_BeginCUDASample(cuMemcpyHtoD, stream0); checkCudaErrors(cuMemcpyHtoD(d_iResolution, &iResolution, sizeof iResolution)); checkCudaErrors(cuMemcpyHtoD(d_iGlobalTime, &iGlobalTime, sizeof iGlobalTime)); checkCudaErrors(cuMemcpyHtoD(d_iMouse, &iMouse, sizeof iMouse)); rmt_EndCUDASample(stream0); rmt_BeginCUDASample(cuLaunchKernel, stream0); checkCudaErrors(cuLaunchKernel(kernel_addr, gridDim.x, gridDim.y, gridDim.z, /* grid dim */ blockDim.x, blockDim.y, blockDim.z, /* block dim */ 0, 0, /* shared mem, stream */ 0, /* arguments */ 0)); rmt_EndCUDASample(stream0); rmt_BeginCUDASample(cuCtxSynchronize, stream0); checkCudaErrors(cuCtxSynchronize()); rmt_EndCUDASample(stream0); rmt_BeginCUDASample(cuMemcpyDtoH, stream0); checkCudaErrors(cuMemcpyDtoH(img_content, d_img_content, item_size)); rmt_EndCUDASample(stream0); } rmt_EndCUDASample(stream0); rmt_BeginOpenGLSample(main); { background(color(0,0,0)); updateImage(img, img_content); image(img, 0, 0, width, height); TwDraw(); } rmt_EndOpenGLSample(); rmt_LogText("end profiling"); }
static void calc_a_score_GPU(FLOAT *ac_score, FLOAT **score, int *ssize_start, Model_info *MI, FLOAT scale, int *size_score_array, int NoC) { CUresult res; const int IHEI = MI->IM_HEIGHT; const int IWID = MI->IM_WIDTH; int pady_n = MI->pady; int padx_n = MI->padx; int block_pad = (int)(scale/2.0); struct timeval tv; int *RY_array, *RX_array; res = cuMemHostAlloc((void**)&RY_array, NoC*sizeof(int), CU_MEMHOSTALLOC_DEVICEMAP); if(res != CUDA_SUCCESS) { printf("cuMemHostAlloc(RY_array) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemHostAlloc((void**)&RX_array, NoC*sizeof(int), CU_MEMHOSTALLOC_DEVICEMAP); if(res != CUDA_SUCCESS) { printf("cuMemHostAlloc(RX_array) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } for(int i = 0; i < NoC; i++) { int rsize[2] = {MI->rsize[i*2], MI->rsize[i*2+1]}; RY_array[i] = (int)((FLOAT)rsize[0]*scale/2.0-1.0+block_pad); RX_array[i] = (int)((FLOAT)rsize[1]*scale/2.0-1.0+block_pad); } CUdeviceptr ac_score_dev, score_dev; CUdeviceptr ssize_dev, size_score_dev; CUdeviceptr RY_dev, RX_dev; int size_score=0; for(int i = 0; i < NoC; i++) { size_score += size_score_array[i]; } /* allocate GPU memory */ res = cuMemAlloc(&ac_score_dev, gpu_size_A_SCORE); if(res != CUDA_SUCCESS) { printf("cuMemAlloc(ac_score) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemAlloc(&score_dev, size_score); if(res != CUDA_SUCCESS) { printf("cuMemAlloc(score) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemAlloc(&ssize_dev, NoC*sizeof(int)); if(res != CUDA_SUCCESS) { printf("cuMemAlloc(ssize) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemAlloc(&size_score_dev, NoC*sizeof(int)); if(res != CUDA_SUCCESS) { printf("cuMemAlloc(size_score) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemAlloc(&RY_dev, NoC*sizeof(int)); if(res != CUDA_SUCCESS) { printf("cuMemAlloc(RY) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemAlloc(&RX_dev, NoC*sizeof(int)); if(res != CUDA_SUCCESS) { printf("cuMemAlloc(RX) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } gettimeofday(&tv_memcpy_start, nullptr); /* upload date to GPU */ res = cuMemcpyHtoD(ac_score_dev, &ac_score[0], gpu_size_A_SCORE); if(res != CUDA_SUCCESS) { printf("cuMemcpyHtoD(ac_score) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemcpyHtoD(score_dev, &score[0][0], size_score); if(res != CUDA_SUCCESS) { printf("cuMemcpyHtoD(score) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemcpyHtoD(ssize_dev, &ssize_start[0], NoC*sizeof(int)); if(res != CUDA_SUCCESS) { printf("cuMemcpyHtoD(ssize) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemcpyHtoD(size_score_dev, &size_score_array[0], NoC*sizeof(int)); if(res != CUDA_SUCCESS) { printf("cuMemcpyHtoD(size_score) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemcpyHtoD(RY_dev, &RY_array[0], NoC*sizeof(int)); if(res != CUDA_SUCCESS) { printf("cuMemcpyHtoD(RY) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemcpyHtoD(RX_dev, &RX_array[0], NoC*sizeof(int)); if(res != CUDA_SUCCESS) { printf("cuMemcpyHtoD(RX) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } gettimeofday(&tv_memcpy_end, nullptr); tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv); time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0; void* kernel_args[] = { (void*)&IWID, (void*)&IHEI, (void*)&scale, (void*)&padx_n, (void*)&pady_n, &RX_dev, &RY_dev, &ac_score_dev, &score_dev, &ssize_dev, (void*)&NoC, &size_score_dev }; int sharedMemBytes = 0; /* define CUDA block shape */ int max_threads_num = 0; int thread_num_x, thread_num_y; int block_num_x, block_num_y; res = cuDeviceGetAttribute(&max_threads_num, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev[0]); if(res != CUDA_SUCCESS){ printf("\ncuDeviceGetAttribute() failed: res = %s\n", cuda_response_to_string(res)); exit(1); } NR_MAXTHREADS_X[0] = (int)sqrt((double)max_threads_num/NoC); NR_MAXTHREADS_Y[0] = (int)sqrt((double)max_threads_num/NoC); thread_num_x = (IWID < NR_MAXTHREADS_X[0]) ? IWID : NR_MAXTHREADS_X[0]; thread_num_y = (IHEI < NR_MAXTHREADS_Y[0]) ? IHEI : NR_MAXTHREADS_Y[0]; block_num_x = IWID / thread_num_x; block_num_y = IHEI / thread_num_y; if(IWID % thread_num_x != 0) block_num_x++; if(IHEI % thread_num_y != 0) block_num_y++; gettimeofday(&tv_kernel_start, nullptr); /* launch GPU kernel */ res = cuLaunchKernel( func_calc_a_score[0], // call function block_num_x, // gridDimX block_num_y, // gridDimY 1, // gridDimZ thread_num_x, // blockDimX thread_num_y, // blockDimY NoC, // blockDimZ sharedMemBytes, // sharedMemBytes nullptr, // hStream kernel_args, // kernelParams nullptr // extra ); if(res != CUDA_SUCCESS) { printf("cuLaunchKernel(calc_a_score) failed : res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuCtxSynchronize(); if(res != CUDA_SUCCESS) { printf("cuCtxSynchronize(calc_a_score) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } gettimeofday(&tv_kernel_end, nullptr); tvsub(&tv_kernel_end, &tv_kernel_start, &tv); time_kernel += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0; gettimeofday(&tv_memcpy_start, nullptr); /* download data from GPU */ res = cuMemcpyDtoH(ac_score, ac_score_dev, gpu_size_A_SCORE); if(res != CUDA_SUCCESS) { printf("cuMemcpyDtoH(ac_score) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } gettimeofday(&tv_memcpy_end, nullptr); tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv); time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0; /* free GPU memory */ res = cuMemFree(ac_score_dev); if(res != CUDA_SUCCESS) { printf("cuMemFree(ac_score_dev) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemFree(score_dev); if(res != CUDA_SUCCESS) { printf("cuMemFree(score_dev) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemFree(ssize_dev); if(res != CUDA_SUCCESS) { printf("cuMemFree(ssize_dev) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemFree(size_score_dev); if(res != CUDA_SUCCESS) { printf("cuMemFree(size_score_dev) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemFree(RY_dev); if(res != CUDA_SUCCESS) { printf("cuMemFree(RY_dev) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemFree(RX_dev); if(res != CUDA_SUCCESS) { printf("cuMemFree(RX_dev) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } /* free CPU memory */ res = cuMemFreeHost(RY_array); if(res != CUDA_SUCCESS) { printf("cuMemFreeHost(RY_array) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemFreeHost(RX_array); if(res != CUDA_SUCCESS) { printf("cuMemFreeHost(RX_array) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } }
void nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, int async, unsigned *dims, void *targ_mem_desc) { struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn; CUfunction function; CUresult r; int i; struct ptx_stream *dev_str; void *kargs[1]; void *hp, *dp; struct nvptx_thread *nvthd = nvptx_thread (); const char *maybe_abort_msg = "(perhaps abort was called)"; function = targ_fn->fn; dev_str = select_stream_for_async (async, pthread_self (), false, NULL); assert (dev_str == nvthd->current_stream); /* Initialize the launch dimensions. Typically this is constant, provided by the device compiler, but we must permit runtime values. */ for (i = 0; i != 3; i++) if (targ_fn->launch->dim[i]) dims[i] = targ_fn->launch->dim[i]; /* This reserves a chunk of a pre-allocated page of memory mapped on both the host and the device. HP is a host pointer to the new chunk, and DP is the corresponding device pointer. */ map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp); GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__); /* Copy the array of arguments to the mapped page. */ for (i = 0; i < mapnum; i++) ((void **) hp)[i] = devaddrs[i]; /* Copy the (device) pointers to arguments to the device (dp and hp might in fact have the same value on a unified-memory system). */ r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *)); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r)); GOMP_PLUGIN_debug (0, " %s: kernel %s: launch" " gangs=%u, workers=%u, vectors=%u\n", __FUNCTION__, targ_fn->launch->fn, dims[0], dims[1], dims[2]); // OpenACC CUDA // // num_gangs nctaid.x // num_workers ntid.y // vector length ntid.x kargs[0] = &dp; r = cuLaunchKernel (function, dims[GOMP_DIM_GANG], 1, 1, dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1, 0, dev_str->stream, kargs, 0); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r)); #ifndef DISABLE_ASYNC if (async < acc_async_noval) { r = cuStreamSynchronize (dev_str->stream); if (r == CUDA_ERROR_LAUNCH_FAILED) GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r), maybe_abort_msg); else if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r)); } else { CUevent *e; e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent)); r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING); if (r == CUDA_ERROR_LAUNCH_FAILED) GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r), maybe_abort_msg); else if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r)); event_gc (true); r = cuEventRecord (*e, dev_str->stream); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r)); event_add (PTX_EVT_KNL, e, (void *)dev_str); } #else r = cuCtxSynchronize (); if (r == CUDA_ERROR_LAUNCH_FAILED) GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r), maybe_abort_msg); else if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r)); #endif GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__, targ_fn->launch->fn); #ifndef DISABLE_ASYNC if (async < acc_async_noval) #endif map_pop (dev_str); }
int main(int argc, char **argv) { // Start logs printf("[%s] - Starting...\n", argv[0]); //'h_' prefix - CPU (host) memory space float //Results calculated by CPU for reference *h_CallResultCPU, *h_PutResultCPU, //CPU copy of GPU results *h_CallResultGPU, *h_PutResultGPU, //CPU instance of input data *h_StockPrice, *h_OptionStrike, *h_OptionYears; //'d_' prefix - GPU (device) memory space CUdeviceptr //Results calculated by GPU d_CallResult, d_PutResult, //GPU instance of input data d_StockPrice, d_OptionStrike, d_OptionYears; double delta, ref, sum_delta, sum_ref, max_delta, L1norm, gpuTime; StopWatchInterface *hTimer = NULL; int i; sdkCreateTimer(&hTimer); printf("Initializing data...\n"); printf("...allocating CPU memory for options.\n"); h_CallResultCPU = (float *)malloc(OPT_SZ); h_PutResultCPU = (float *)malloc(OPT_SZ); h_CallResultGPU = (float *)malloc(OPT_SZ); h_PutResultGPU = (float *)malloc(OPT_SZ); h_StockPrice = (float *)malloc(OPT_SZ); h_OptionStrike = (float *)malloc(OPT_SZ); h_OptionYears = (float *)malloc(OPT_SZ); char *ptx, *kernel_file; size_t ptxSize; kernel_file = sdkFindFilePath("BlackScholes_kernel.cuh", argv[0]); // Set a Compiler Option to have maximum register to be used by each thread. char *compile_options[1]; compile_options[0] = (char *) malloc(sizeof(char)*(strlen("--maxrregcount=16"))); strcpy((char *)compile_options[0],"--maxrregcount=16"); // Compile the kernel BlackScholes_kernel. compileFileToPTX(kernel_file, 1, (const char **)compile_options, &ptx, &ptxSize); CUmodule module = loadPTX(ptx, argc, argv); CUfunction kernel_addr; checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "BlackScholesGPU")); printf("...allocating GPU memory for options.\n"); checkCudaErrors(cuMemAlloc(&d_CallResult, OPT_SZ)); checkCudaErrors(cuMemAlloc(&d_PutResult, OPT_SZ)); checkCudaErrors(cuMemAlloc(&d_StockPrice, OPT_SZ)); checkCudaErrors(cuMemAlloc(&d_OptionStrike,OPT_SZ)); checkCudaErrors(cuMemAlloc(&d_OptionYears, OPT_SZ)); printf("...generating input data in CPU mem.\n"); srand(5347); //Generate options set for (i = 0; i < OPT_N; i++) { h_CallResultCPU[i] = 0.0f; h_PutResultCPU[i] = -1.0f; h_StockPrice[i] = RandFloat(5.0f, 30.0f); h_OptionStrike[i] = RandFloat(1.0f, 100.0f); h_OptionYears[i] = RandFloat(0.25f, 10.0f); } printf("...copying input data to GPU mem.\n"); //Copy options data to GPU memory for further processing checkCudaErrors(cuMemcpyHtoD(d_StockPrice, h_StockPrice, OPT_SZ)); checkCudaErrors(cuMemcpyHtoD(d_OptionStrike, h_OptionStrike, OPT_SZ)); checkCudaErrors(cuMemcpyHtoD(d_OptionYears, h_OptionYears, OPT_SZ)); printf("Data init done.\n\n"); printf("Executing Black-Scholes GPU kernel (%i iterations)...\n", NUM_ITERATIONS); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); dim3 cudaBlockSize( 128, 1, 1); dim3 cudaGridSize(DIV_UP(OPT_N/2, 128),1,1); float risk = RISKFREE; float volatility = VOLATILITY; int optval = OPT_N; void *arr[] = { (void *)&d_CallResult, (void *)&d_PutResult, (void *)&d_StockPrice, (void *)&d_OptionStrike, (void *)&d_OptionYears, (void *)&risk, (void *)&volatility, (void *)&optval }; for (i = 0; i < NUM_ITERATIONS; i++) { checkCudaErrors(cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y, cudaGridSize.z, /* grid dim */ cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, /* block dim */ 0,0, /* shared mem, stream */ &arr[0], /* arguments */ 0)); } checkCudaErrors(cuCtxSynchronize()); sdkStopTimer(&hTimer); gpuTime = sdkGetTimerValue(&hTimer) / NUM_ITERATIONS; //Both call and put is calculated printf("Options count : %i \n", 2 * OPT_N); printf("BlackScholesGPU() time : %f msec\n", gpuTime); printf("Effective memory bandwidth: %f GB/s\n", ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (gpuTime * 1E-3)); printf("Gigaoptions per second : %f \n\n", ((double)(2 * OPT_N) * 1E-9) / (gpuTime * 1E-3)); printf("BlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u options, NumDevsUsed = %u, Workgroup = %u\n", (((double)(2.0 * OPT_N) * 1.0E-9) / (gpuTime * 1.0E-3)), gpuTime*1e-3, (2 * OPT_N), 1, 128); printf("\nReading back GPU results...\n"); //Read back GPU results to compare them to CPU results checkCudaErrors(cuMemcpyDtoH(h_CallResultGPU, d_CallResult, OPT_SZ)); checkCudaErrors(cuMemcpyDtoH(h_PutResultGPU, d_PutResult, OPT_SZ)); printf("Checking the results...\n"); printf("...running CPU calculations.\n\n"); //Calculate options values on CPU BlackScholesCPU( h_CallResultCPU, h_PutResultCPU, h_StockPrice, h_OptionStrike, h_OptionYears, RISKFREE, VOLATILITY, OPT_N ); printf("Comparing the results...\n"); //Calculate max absolute difference and L1 distance //between CPU and GPU results sum_delta = 0; sum_ref = 0; max_delta = 0; for (i = 0; i < OPT_N; i++) { ref = h_CallResultCPU[i]; delta = fabs(h_CallResultCPU[i] - h_CallResultGPU[i]); if (delta > max_delta) { max_delta = delta; } sum_delta += delta; sum_ref += fabs(ref); } L1norm = sum_delta / sum_ref; printf("L1 norm: %E\n", L1norm); printf("Max absolute error: %E\n\n", max_delta); printf("Shutting down...\n"); printf("...releasing GPU memory.\n"); checkCudaErrors(cuMemFree(d_OptionYears)); checkCudaErrors(cuMemFree(d_OptionStrike)); checkCudaErrors(cuMemFree(d_StockPrice)); checkCudaErrors(cuMemFree(d_PutResult)); checkCudaErrors(cuMemFree(d_CallResult)); printf("...releasing CPU memory.\n"); free(h_OptionYears); free(h_OptionStrike); free(h_StockPrice); free(h_PutResultGPU); free(h_CallResultGPU); free(h_PutResultCPU); free(h_CallResultCPU); sdkDeleteTimer(&hTimer); printf("Shutdown done.\n"); printf("\n[%s] - Test Summary\n", argv[0]); cuProfilerStop(); if (L1norm > 1e-6) { printf("Test failed!\n"); exit(EXIT_FAILURE); } printf("Test passed\n"); exit(EXIT_SUCCESS); }
int cuda_test_madd_vmmap_hybrid(unsigned int n, char *path) { int i, j, idx; CUresult res; CUdevice dev; CUcontext ctx; CUfunction function; CUmodule module; CUdeviceptr a_dev, b_dev, c_dev; unsigned int *a_buf, *b_buf, *c_buf; unsigned long long int a_phys, b_phys, c_phys; unsigned int *c = (unsigned int *) malloc (n*n * sizeof(unsigned int)); int block_x, block_y, grid_x, grid_y; char fname[256]; int ret = 0; struct timeval tv; struct timeval tv_total_start, tv_total_end; float total; struct timeval tv_h2d_start, tv_h2d_end; float h2d; struct timeval tv_d2h_start, tv_d2h_end; float d2h; struct timeval tv_exec_start, tv_exec_end; struct timeval tv_mem_alloc_start; struct timeval tv_data_init_start; float data_init; struct timeval tv_conf_kern_start; struct timeval tv_close_start; float mem_alloc; float exec; float init_gpu; float configure_kernel; float close_gpu; float data_read; unsigned int dummy_b, dummy_c; /* block_x * block_y should not exceed 512. */ block_x = n < 16 ? n : 16; block_y = n < 16 ? n : 16; grid_x = n / block_x; if (n % block_x != 0) grid_x++; grid_y = n / block_y; if (n % block_y != 0) grid_y++; gettimeofday(&tv_total_start, NULL); res = cuInit(0); if (res != CUDA_SUCCESS) { printf("cuInit failed: res = %lu\n", (unsigned long)res); return -1; } res = cuDeviceGet(&dev, 0); if (res != CUDA_SUCCESS) { printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res); return -1; } res = cuCtxCreate(&ctx, 0, dev); if (res != CUDA_SUCCESS) { printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res); return -1; } sprintf(fname, "%s/madd_gpu.cubin", path); res = cuModuleLoad(&module, fname); if (res != CUDA_SUCCESS) { printf("cuModuleLoad() failed\n"); return -1; } res = cuModuleGetFunction(&function, module, "_Z3addPjS_S_j"); if (res != CUDA_SUCCESS) { printf("cuModuleGetFunction() failed\n"); return -1; } res = cuFuncSetBlockShape(function, block_x, block_y, 1); if (res != CUDA_SUCCESS) { printf("cuFuncSetBlockShape() failed\n"); return -1; } gettimeofday(&tv_mem_alloc_start, NULL); /* a[] */ res = cuMemAlloc(&a_dev, n*n * sizeof(unsigned int)); if (res != CUDA_SUCCESS) { printf("cuMemAlloc (a) failed\n"); return -1; } res = cuMemMap((void**)&a_buf, a_dev, n*n * sizeof(unsigned int)); if (res != CUDA_SUCCESS) { printf("cuMemMap (a) failed\n"); return -1; } res = cuMemGetPhysAddr(&a_phys, (void*)a_buf); if (res != CUDA_SUCCESS) { printf("cuMemGetPhysAddress (a) failed\n"); return -1; } /*printf("a[]: Physical Address 0x%llx\n", a_phys);*/ /* b[] */ res = cuMemAlloc(&b_dev, n*n * sizeof(unsigned int)); if (res != CUDA_SUCCESS) { printf("cuMemAlloc (b) failed\n"); return -1; } res = cuMemMap((void**)&b_buf, b_dev, n*n * sizeof(unsigned int)); if (res != CUDA_SUCCESS) { printf("cuMemMap (b) failed\n"); return -1; } res = cuMemGetPhysAddr(&b_phys, (void*)b_buf); if (res != CUDA_SUCCESS) { printf("cuMemGetPhysAddress (b) failed\n"); return -1; } /*printf("b[]: Physical Address 0x%llx\n", b_phys);*/ /* c[] */ res = cuMemAlloc(&c_dev, n*n * sizeof(unsigned int)); if (res != CUDA_SUCCESS) { printf("cuMemAlloc (c) failed\n"); return -1; } res = cuMemMap((void**)&c_buf, c_dev, n*n * sizeof(unsigned int)); if (res != CUDA_SUCCESS) { printf("cuMemMap (c) failed\n"); return -1; } res = cuMemGetPhysAddr(&c_phys, (void*)c_buf); if (res != CUDA_SUCCESS) { printf("cuMemGetPhysAddress (c) failed\n"); return -1; } /*printf("c[]: Physical Address 0x%llx\n", c_phys);*/ gettimeofday(&tv_data_init_start, NULL); /* initialize A[] & B[] */ for (i = 0; i < n; i++) { idx = i*n; for(j = 0; j < n; j++) { a_buf[idx++] = i; } } for (i = 0; i < n; i++) { idx = i*n; for(j = 0; j < n; j++) { b_buf[idx++] = i; } } gettimeofday(&tv_h2d_start, NULL); gettimeofday(&tv_h2d_end, NULL); gettimeofday(&tv_conf_kern_start, NULL); /* set kernel parameters */ res = cuParamSeti(function, 0, a_dev); if (res != CUDA_SUCCESS) { printf("cuParamSeti (a) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuParamSeti(function, 4, a_dev >> 32); if (res != CUDA_SUCCESS) { printf("cuParamSeti (a) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuParamSeti(function, 8, b_dev); if (res != CUDA_SUCCESS) { printf("cuParamSeti (b) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuParamSeti(function, 12, b_dev >> 32); if (res != CUDA_SUCCESS) { printf("cuParamSeti (b) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuParamSeti(function, 16, c_dev); if (res != CUDA_SUCCESS) { printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuParamSeti(function, 20, c_dev >> 32); if (res != CUDA_SUCCESS) { printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuParamSeti(function, 24, n); if (res != CUDA_SUCCESS) { printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuParamSetSize(function, 28); if (res != CUDA_SUCCESS) { printf("cuParamSetSize failed: res = %lu\n", (unsigned long)res); return -1; } gettimeofday(&tv_exec_start, NULL); /* launch the kernel */ res = cuLaunchGrid(function, grid_x, grid_y); if (res != CUDA_SUCCESS) { printf("cuLaunchGrid failed: res = %lu\n", (unsigned long)res); return -1; } cuCtxSynchronize(); gettimeofday(&tv_exec_end, NULL); gettimeofday(&tv_d2h_start, NULL); /* download c[] */ memcpy(c, c_buf, n*n*sizeof(unsigned int)); gettimeofday(&tv_d2h_end, NULL); /* Read back */ for (i = 0; i < n; i++) { idx = i*n; for(j = 0; j < n; j++) { dummy_c = c[idx++]; } } gettimeofday(&tv_close_start, NULL); res = cuMemUnmap((void*)a_buf); if (res != CUDA_SUCCESS) { printf("cuMemUnmap (a) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemFree(a_dev); if (res != CUDA_SUCCESS) { printf("cuMemFree (a) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemUnmap((void*)b_buf); if (res != CUDA_SUCCESS) { printf("cuMemUnmap (b) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemFree(b_dev); if (res != CUDA_SUCCESS) { printf("cuMemFree (b) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemUnmap((void*)c_buf); if (res != CUDA_SUCCESS) { printf("cuMemUnmap (c) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemFree(c_dev); if (res != CUDA_SUCCESS) { printf("cuMemFree (c) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuModuleUnload(module); if (res != CUDA_SUCCESS) { printf("cuModuleUnload failed: res = %lu\n", (unsigned long)res); return -1; } res = cuCtxDestroy(ctx); if (res != CUDA_SUCCESS) { printf("cuCtxDestroy failed: res = %lu\n", (unsigned long)res); return -1; } gettimeofday(&tv_total_end, NULL); tvsub(&tv_mem_alloc_start, &tv_total_start, &tv); init_gpu = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_data_init_start, &tv_mem_alloc_start, &tv); mem_alloc = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_h2d_start, &tv_data_init_start, &tv); data_init = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_h2d_end, &tv_h2d_start, &tv); h2d = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_exec_start, &tv_conf_kern_start, &tv); configure_kernel = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_exec_end, &tv_exec_start, &tv); exec = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_d2h_end, &tv_d2h_start, &tv); d2h = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_close_start, &tv_d2h_end, &tv); data_read = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_total_end, &tv_close_start, &tv); close_gpu = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_total_end, &tv_total_start, &tv); total = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; printf("Init: %f\n", init_gpu); printf("MemAlloc: %f\n", mem_alloc); printf("DataInit: %f\n", data_init); printf("HtoD: %f\n", h2d); printf("KernConf: %f\n", configure_kernel); printf("Exec: %f\n", exec); printf("DtoH: %f\n", d2h); printf("DataRead: %f\n", data_read); printf("Close: %f\n", close_gpu); printf("Total: %f\n", total); return ret; }
static CUresult synchronize() { CU_ERROR_CHECK(cuCtxSynchronize()); return CUDA_SUCCESS; }
int cuda_test_fmadd(unsigned int n, char *path) { int i, j, idx; CUresult res; CUdevice dev; CUcontext ctx; CUfunction function; CUmodule module; CUdeviceptr a_dev, b_dev, c_dev; float *a = (float *) malloc (n*n * sizeof(float)); float *b = (float *) malloc (n*n * sizeof(float)); float *c = (float *) malloc (n*n * sizeof(float)); int block_x, block_y, grid_x, grid_y; int offset; char fname[256]; struct timeval tv; struct timeval tv_total_start, tv_total_end; float total; struct timeval tv_h2d_start, tv_h2d_end; float h2d; struct timeval tv_d2h_start, tv_d2h_end; float d2h; struct timeval tv_exec_start, tv_exec_end; float exec; /* initialize A[] & B[] */ for (i = 0; i < n; i++) { for(j = 0; j < n; j++) { idx = i * n + j; a[idx] = i + 0.1; b[idx] = i + 0.1; } } /* block_x * block_y should not exceed 512. */ block_x = n < 16 ? n : 16; block_y = n < 16 ? n : 16; grid_x = n / block_x; if (n % block_x != 0) grid_x++; grid_y = n / block_y; if (n % block_y != 0) grid_y++; printf("block = (%d, %d)\n", block_x, block_y); printf("grid = (%d, %d)\n", grid_x, grid_y); gettimeofday(&tv_total_start, NULL); res = cuInit(0); if (res != CUDA_SUCCESS) { printf("cuInit failed: res = %lu\n", (unsigned long)res); return -1; } res = cuDeviceGet(&dev, 0); if (res != CUDA_SUCCESS) { printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res); return -1; } res = cuCtxCreate(&ctx, 0, dev); if (res != CUDA_SUCCESS) { printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res); return -1; } sprintf(fname, "%s/fmadd_gpu.cubin", path); res = cuModuleLoad(&module, fname); if (res != CUDA_SUCCESS) { printf("cuModuleLoad() failed\n"); return -1; } res = cuModuleGetFunction(&function, module, "_Z3addPfS_S_i"); if (res != CUDA_SUCCESS) { printf("cuModuleGetFunction() failed\n"); return -1; } res = cuFuncSetSharedSize(function, 0x40); /* just random */ if (res != CUDA_SUCCESS) { printf("cuFuncSetSharedSize() failed\n"); return -1; } res = cuFuncSetBlockShape(function, block_x, block_y, 1); if (res != CUDA_SUCCESS) { printf("cuFuncSetBlockShape() failed\n"); return -1; } /* a[] */ res = cuMemAlloc(&a_dev, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemAlloc (a) failed\n"); return -1; } /* b[] */ res = cuMemAlloc(&b_dev, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemAlloc (b) failed\n"); return -1; } /* c[] */ res = cuMemAlloc(&c_dev, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemAlloc (c) failed\n"); return -1; } gettimeofday(&tv_h2d_start, NULL); /* upload a[] and b[] */ res = cuMemcpyHtoD(a_dev, a, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemcpyHtoD (a) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemcpyHtoD(b_dev, b, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemcpyHtoD (b) failed: res = %lu\n", (unsigned long)res); return -1; } gettimeofday(&tv_h2d_end, NULL); /* set kernel parameters */ offset = 0; res = cuParamSetv(function, offset, &a_dev, sizeof(a_dev)); if (res != CUDA_SUCCESS) { printf("cuParamSeti (a) failed: res = %lu\n", (unsigned long)res); return -1; } offset += sizeof(a_dev); res = cuParamSetv(function, offset, &b_dev, sizeof(b_dev)); if (res != CUDA_SUCCESS) { printf("cuParamSeti (b) failed: res = %lu\n", (unsigned long)res); return -1; } offset += sizeof(b_dev); res = cuParamSetv(function, offset, &c_dev, sizeof(c_dev)); if (res != CUDA_SUCCESS) { printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res); return -1; } offset += sizeof(c_dev); res = cuParamSetv(function, offset, &n, sizeof(n)); if (res != CUDA_SUCCESS) { printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res); return -1; } offset += sizeof(n); res = cuParamSetSize(function, offset); if (res != CUDA_SUCCESS) { printf("cuParamSetSize failed: res = %lu\n", (unsigned long)res); return -1; } gettimeofday(&tv_exec_start, NULL); /* launch the kernel */ res = cuLaunchGrid(function, grid_x, grid_y); if (res != CUDA_SUCCESS) { printf("cuLaunchGrid failed: res = %lu\n", (unsigned long)res); return -1; } cuCtxSynchronize(); gettimeofday(&tv_exec_end, NULL); gettimeofday(&tv_d2h_start, NULL); /* download c[] */ res = cuMemcpyDtoH(c, c_dev, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemcpyDtoH (c) failed: res = %lu\n", (unsigned long)res); return -1; } gettimeofday(&tv_d2h_end, NULL); res = cuMemFree(a_dev); if (res != CUDA_SUCCESS) { printf("cuMemFree (a) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemFree(b_dev); if (res != CUDA_SUCCESS) { printf("cuMemFree (b) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemFree(c_dev); if (res != CUDA_SUCCESS) { printf("cuMemFree (c) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuModuleUnload(module); if (res != CUDA_SUCCESS) { printf("cuModuleUnload failed: res = %lu\n", (unsigned long)res); return -1; } res = cuCtxDestroy(ctx); if (res != CUDA_SUCCESS) { printf("cuCtxDestroy failed: res = %lu\n", (unsigned long)res); return -1; } gettimeofday(&tv_total_end, NULL); /* check the results */ i = j = idx = 0; while (i < n) { while (j < n) { idx = i * n + j; if (c[idx] != a[idx] + b[idx]) { printf("c[%d] = %f\n", idx, c[idx]); printf("a[%d]+b[%d] = %f\n", idx, idx, a[idx]+b[idx]); return -1; } j++; } i++; } free(a); free(b); free(c); tvsub(&tv_h2d_end, &tv_h2d_start, &tv); h2d = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_d2h_end, &tv_d2h_start, &tv); d2h = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_exec_end, &tv_exec_start, &tv); exec = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_total_end, &tv_total_start, &tv); total = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; printf("HtoD: %f\n", h2d); printf("DtoH: %f\n", d2h); printf("Exec: %f\n", exec); printf("Time (Memcpy + Launch): %f\n", h2d + d2h + exec); printf("Total: %f\n", total); return 0; }
void octree::compute_properties (tree_structure &tree) { #if 0 fprintf(stderr,"This file is not up to date anymore! %s\n", __FILE__); exit(0); //Computes the tree-properties (size, cm, monopole, quadropole, etc) //start the kernel for the leaf-type nodes propsLeaf.set_arg<int>(0, &tree.n_leafs); propsLeaf.set_arg<cl_mem>(1, tree.leafNodeIdx.p()); propsLeaf.set_arg<cl_mem>(2, tree.node_bodies.p()); propsLeaf.set_arg<cl_mem>(3, tree.bodies_Ppos.p()); // propsLeaf.set_arg<cl_mem>(3, tree.bodies_pos.p()); propsLeaf.set_arg<cl_mem>(4, tree.multipole.p()); propsLeaf.set_arg<cl_mem>(5, tree.nodeLowerBounds.p()); propsLeaf.set_arg<cl_mem>(6, tree.nodeUpperBounds.p()); propsLeaf.set_arg<cl_mem>(7, tree.lowerBounds.p()); propsLeaf.set_arg<cl_mem>(8, tree.upperBounds.p()); propsLeaf.set_arg<cl_mem>(9, tree.bodies_Pvel.p()); //Velocity to get max eps propsLeaf.setWork(tree.n_leafs, 128); printf("PropsLeaf: "); propsLeaf.printWorkSize(); propsLeaf.execute(); int temp = tree.n_nodes-tree.n_leafs; propsNonLeaf.set_arg<int>(0, &temp); propsNonLeaf.set_arg<cl_mem>(1, tree.leafNodeIdx.p()); propsNonLeaf.set_arg<cl_mem>(2, tree.node_level_list.p()); propsNonLeaf.set_arg<cl_mem>(3, tree.n_children.p()); propsNonLeaf.set_arg<cl_mem>(4, tree.multipole.p()); propsNonLeaf.set_arg<cl_mem>(5, tree.nodeLowerBounds.p()); propsNonLeaf.set_arg<cl_mem>(6, tree.nodeUpperBounds.p()); for(int i=tree.n_levels; i >= 1; i--) { propsNonLeaf.set_arg<int>(0, &i); { vector<size_t> localWork(2), globalWork(2); int totalOnThisLevel; totalOnThisLevel = tree.node_level_list[i]-tree.node_level_list[i-1]; propsNonLeaf.setWork(totalOnThisLevel, 128); printf("PropsNonLeaf, nodes on level %d : %d (start: %d end: %d) , config: \t", i, totalOnThisLevel, tree.node_level_list[i-1], tree.node_level_list[i]); propsNonLeaf.printWorkSize(); } propsNonLeaf.set_arg<int>(0, &i); //set the level propsNonLeaf.execute(); } float theta2 = theta; propsScaling.set_arg<int>(0, &tree.n_nodes); propsScaling.set_arg<real4>(1, &tree.corner); propsScaling.set_arg<cl_mem>(2, tree.multipole.p()); propsScaling.set_arg<cl_mem>(3, tree.nodeLowerBounds.p()); propsScaling.set_arg<cl_mem>(4, tree.nodeUpperBounds.p()); propsScaling.set_arg<cl_mem>(5, tree.n_children.p()); propsScaling.set_arg<cl_mem>(6, tree.node_data.p()); propsScaling.set_arg<float >(7, &theta2); propsScaling.set_arg<cl_mem>(8, tree.boxSizeInfo.p()); propsScaling.set_arg<cl_mem>(9, tree.boxCenterInfo.p()); propsScaling.setWork(tree.n_nodes, 128); printf("propsScaling: \t "); propsScaling.printWorkSize(); propsScaling.execute(); //tree.multipole.d2h(); //printf("COM: %f %f %f %f \n",tree.multipole[0].x, tree.multipole[0].y, tree.multipole[0].z, tree.multipole[0].w); #ifdef USE_CUDA cuCtxSynchronize(); #else clFinish(devContext.get_command_queue()); #endif tree.nodeLowerBounds.d2h(); tree.nodeUpperBounds.d2h(); copyNodeDataToGroupData.set_arg<int>(0, &tree.n_groups); copyNodeDataToGroupData.set_arg<int>(1, &tree.n_nodes); copyNodeDataToGroupData.set_arg<cl_mem>(2, tree.node_data.p()); copyNodeDataToGroupData.set_arg<cl_mem>(3, tree.group_data.p()); copyNodeDataToGroupData.set_arg<cl_mem>(4, tree.node_bodies.p()); copyNodeDataToGroupData.set_arg<cl_mem>(5, tree.group_list.p()); copyNodeDataToGroupData.set_arg<cl_mem>(6, tree.boxCenterInfo.p()); copyNodeDataToGroupData.set_arg<cl_mem>(7, tree.boxSizeInfo.p()); copyNodeDataToGroupData.set_arg<cl_mem>(8, tree.groupCenterInfo.p()); copyNodeDataToGroupData.set_arg<cl_mem>(9, tree.groupSizeInfo.p()); copyNodeDataToGroupData.setWork(tree.n_nodes, 128); printf("copyNodeDataToGroupData: \t "); copyNodeDataToGroupData.printWorkSize(); copyNodeDataToGroupData.execute(); // tree.multipole.d2h(); // testRes.d2h(); // for(int i=0; i < tree.n_nodes; i++) // for(int i=tree.n_nodes-10; i < tree.n_nodes; i++) /* for(int i=0; i < 10; i++) { fprintf(stderr,"%d\t%f\t%f\t%f\t%f\n", i, tree.multipole[i*3+0].x,tree.multipole[i*3+0].y,tree.multipole[i*3+0].z, tree.multipole[i*3+0].w); // fprintf(stderr,"%d\t%f\t%f\t%f\t%f\t%f\n", i, tree.multipole[i*3+1].x,tree.multipole[i*3+1].y,tree.multipole[i*3+1].z, tree.multipole[i*3+1].w, testRes[i]); fprintf(stderr,"%d\t%f\t%f\t%f\t%f\t%f\n", i, tree.multipole[i*3+1].x,tree.multipole[i*3+1].y,tree.multipole[i*3+1].z, tree.multipole[i*3+1].w, 0); fprintf(stderr,"%d\t%f\t%f\t%f\t%f\n", i, tree.multipole[i*3+2].x,tree.multipole[i*3+2].y,tree.multipole[i*3+2].z, tree.multipole[i*3+2].w); } exit(0); */ #else compute_properties_double(tree); #endif }