/*
 * TestSAXPY
 *
 * Runs the "saxpy" kernel from the "saxpy.ptx" module on chDevice and checks
 * the result on the host.
 *
 * The output buffer is zero-filled on the device, the input buffer is filled
 * with random values in [0, 1], and after the launch every output element is
 * compared against alpha*hostIn[i] with an absolute tolerance of 1e-5.
 *
 * Parameters:
 *   chDevice - device wrapper supplying the context and the loaded module.
 *   N        - number of float elements to process.
 *   alpha    - scale factor passed to the kernel.
 *
 * Returns CUDA_SUCCESS when every element matches, CUDA_ERROR_NOT_FOUND when
 * the module is missing, CUDA_ERROR_UNKNOWN on a mismatch, or the status left
 * by the failing driver call.
 *
 * NOTE(review): CUDA_CHECK is assumed to store the call's result in `status`
 * and `goto Error` on failure - confirm against its definition.
 */
CUresult
TestSAXPY( chCUDADevice *chDevice, size_t N, float alpha )
{
    CUresult status;
    CUdeviceptr dptrOut = 0;
    CUdeviceptr dptrIn = 0;
    float *hostOut = 0;
    float *hostIn = 0;
    int contextPushed = 0;   // set once cuCtxPushCurrent has succeeded

    CUDA_CHECK( cuCtxPushCurrent( chDevice->context() ) );
    contextPushed = 1;

    CUDA_CHECK( cuMemAlloc( &dptrOut, N*sizeof(float) ) );
    CUDA_CHECK( cuMemsetD32( dptrOut, 0, N ) );
    CUDA_CHECK( cuMemAlloc( &dptrIn, N*sizeof(float) ) );
    CUDA_CHECK( cuMemHostAlloc( (void **) &hostOut, N*sizeof(float), 0 ) );
    CUDA_CHECK( cuMemHostAlloc( (void **) &hostIn, N*sizeof(float), 0 ) );
    for ( size_t i = 0; i < N; i++ ) {
        hostIn[i] = (float) rand() / (float) RAND_MAX;
    }
    CUDA_CHECK( cuMemcpyHtoDAsync( dptrIn, hostIn, N*sizeof(float ), NULL ) );

    {
        CUmodule moduleSAXPY;
        CUfunction kernelSAXPY;
        void *params[] = { &dptrOut, &dptrIn, &N, &alpha };

        moduleSAXPY = chDevice->module( "saxpy.ptx" );
        if ( ! moduleSAXPY ) {
            status = CUDA_ERROR_NOT_FOUND;
            goto Error;
        }
        CUDA_CHECK( cuModuleGetFunction( &kernelSAXPY, moduleSAXPY, "saxpy" ) );

        // Fixed launch shape: 1500 blocks of 512 threads on the NULL stream.
        CUDA_CHECK( cuLaunchKernel( kernelSAXPY, 1500, 1, 1, 512, 1, 1, 0, NULL, params, NULL ) );
    }

    CUDA_CHECK( cuMemcpyDtoHAsync( hostOut, dptrOut, N*sizeof(float), NULL ) );
    // The copies above are asynchronous; wait before reading hostOut.
    CUDA_CHECK( cuCtxSynchronize() );
    for ( size_t i = 0; i < N; i++ ) {
        if ( fabsf( hostOut[i] - alpha*hostIn[i] ) > 1e-5f ) {
            status = CUDA_ERROR_UNKNOWN;
            goto Error;
        }
    }
    status = CUDA_SUCCESS;
    printf( "Well it worked!\n" );

Error:
    // Nothing was allocated (and nothing must be popped) unless the push
    // succeeded. Release the buffers while the context is still current,
    // then undo the push.
    if ( contextPushed ) {
        if ( hostOut ) cuMemFreeHost( hostOut );
        if ( hostIn )  cuMemFreeHost( hostIn );
        if ( dptrOut ) cuMemFree( dptrOut );
        if ( dptrIn )  cuMemFree( dptrIn );
        cuCtxPopCurrent( NULL );
    }
    return status;
}
/*
 * Drop one reference to ctx and destroy it when the count reaches zero.
 *
 * On the last reference this tears down the BLAS state (if a handle was
 * set), frees the error buffer, destroys the stream ctx->s, releases every
 * block on the free list, destroys the CUDA context unless DONTFREE is set
 * in ctx->flags, destroys the extcopy cache and finally frees ctx itself.
 */
static void cuda_free_ctx(cuda_context *ctx) {
  /* NULL so a failed property lookup cannot leave a garbage pointer. */
  gpuarray_blas_ops *blas_ops = NULL;
  gpudata *next, *curr;

  ASSERT_CTX(ctx);
  ctx->refcnt--;
  if (ctx->refcnt != 0)
    return;

  assert(ctx->enter == 0 && "Context was active when freed!");

  if (ctx->blas_handle != NULL) {
    ctx->err = cuda_property(ctx, NULL, NULL, GA_CTX_PROP_BLAS_OPS, &blas_ops);
    /* Only call through the ops table if the lookup actually produced one. */
    if (blas_ops != NULL)
      blas_ops->teardown(ctx);
  }

  cuMemFreeHost((void *)ctx->errbuf->ptr);
  deallocate(ctx->errbuf);

  cuStreamDestroy(ctx->s);

  /* Clear out the freelist */
  for (curr = ctx->freeblocks; curr != NULL; curr = next) {
    next = curr->next; /* read before curr is deallocated */
    cuMemFree(curr->ptr);
    deallocate(curr);
  }

  if (!(ctx->flags & DONTFREE))
    cuCtxDestroy(ctx->ctx);
  cache_destroy(ctx->extcopy_cache);
  CLEAR(ctx);
  free(ctx);
}
/*
 * Class:     edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2
 * Method:    reinit
 * Signature: ()V
 *
 * Releases every host and device buffer held in the file-level globals,
 * destroys the current CUDA context and then calls initDevice() again with
 * the supplied limits. Return codes of the release calls are not inspected.
 */
JNIEXPORT void JNICALL Java_edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2_reinit
  (JNIEnv * env, jobject this_ref, jint max_blocks_per_proc, jint max_threads_per_block, jlong free_space)
{
  /* host-side buffers */
  cuMemFreeHost(toSpace);
  cuMemFreeHost(handlesMemory);
  cuMemFreeHost(exceptionsMemory);

  /* device-side buffers */
  cuMemFree(gpuToSpace);
  cuMemFree(gpuClassMemory);
  cuMemFree(gpuHandlesMemory);
  cuMemFree(gpuExceptionsMemory);
  cuMemFree(gcInfoSpace);
  cuMemFree(gpuHeapEndPtr);
  cuMemFree(gpuBufferSize);

  /* drop the context last, then rebuild everything */
  cuCtxDestroy(cuContext);
  initDevice(env, this_ref, max_blocks_per_proc, max_threads_per_block, free_space);
}
bool VideoDecoderCUDAPrivate::releaseCuda() { available = false; if (!can_load) return true; if (dec) { cuvidDestroyDecoder(dec); dec = 0; } if (parser) { cuvidDestroyVideoParser(parser); parser = 0; } if (stream) { cuStreamDestroy(stream); stream = 0; } if (host_data) { cuMemFreeHost(host_data); host_data = 0; host_data_size = 0; } if (vid_ctx_lock) { cuvidCtxLockDestroy(vid_ctx_lock); vid_ctx_lock = 0; } if (cuctx) { checkCudaErrors(cuCtxDestroy(cuctx)); } // TODO: dllapi unload return true; }
void swanFreeHost( void *ptr ) { //printf("FreeHost %p\n", ptr ); CUresult err = cuMemFreeHost( ptr ); if ( err != CUDA_SUCCESS ) { error("swanFreeHost failed\n" ); } }
/*
 * Wrap an existing CUcontext in a freshly allocated cuda_context.
 *
 * Sets up, in order: the architecture id (detect_arch), the extcopy LRU
 * cache, the stream `s`, the non-blocking stream `mem_s`, and a 16-byte
 * zeroed host buffer that is wrapped in a gpudata and flagged
 * CUDA_MAPPED_PTR as the error buffer.
 *
 * Returns the new context, or NULL if any step fails; everything created
 * before the failing step is released again in reverse order.
 *
 * `err` is not declared here; it is a variable from the enclosing file.
 */
void *cuda_make_ctx(CUcontext ctx, int flags) {
  cuda_context *self;
  void *hostbuf;

  self = malloc(sizeof(*self));
  if (self == NULL)
    return NULL;

  self->ctx = ctx;
  self->err = CUDA_SUCCESS;
  self->blas_handle = NULL;
  self->refcnt = 1;
  self->flags = flags;
  self->enter = 0;
  self->freeblocks = NULL;

  if (detect_arch(ARCH_PREFIX, self->bin_id, &err))
    goto drop_ctx;

  self->extcopy_cache = cache_lru(64, 32, (cache_eq_fn)extcopy_eq,
                                  (cache_hash_fn)extcopy_hash,
                                  (cache_freek_fn)extcopy_free,
                                  (cache_freev_fn)cuda_freekernel);
  if (self->extcopy_cache == NULL)
    goto drop_ctx;

  if ((err = cuStreamCreate(&self->s, 0)) != CUDA_SUCCESS)
    goto drop_cache;

  if ((err = cuStreamCreate(&self->mem_s, CU_STREAM_NON_BLOCKING)) != CUDA_SUCCESS)
    goto drop_stream;

  if ((err = cuMemAllocHost(&hostbuf, 16)) != CUDA_SUCCESS)
    goto drop_mem_stream;
  memset(hostbuf, 0, 16);

  /* Need to tag for new_gpudata */
  TAG_CTX(self);

  self->errbuf = new_gpudata(self, (CUdeviceptr)hostbuf, 16);
  if (self->errbuf == NULL) {
    err = self->err;
    goto drop_hostbuf;
  }
  self->errbuf->flags |= CUDA_MAPPED_PTR;

  return self;

  /* Unwind in reverse order of construction. */
drop_hostbuf:
  cuMemFreeHost(hostbuf);
drop_mem_stream:
  cuStreamDestroy(self->mem_s);
drop_stream:
  cuStreamDestroy(self->s);
drop_cache:
  cache_destroy(self->extcopy_cache);
drop_ctx:
  free(self);
  return NULL;
}
static void map_fini (struct ptx_stream *s) { CUresult r; r = cuMemFreeHost (s->h); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuda_error (r)); }
void GPUInterface::FreePinnedHostMemory(void* hPtr) { #ifdef BEAGLE_DEBUG_FLOW fprintf(stderr, "\t\t\tEntering GPUInterface::FreePinnedHostMemory\n"); #endif SAFE_CUPP(cuMemFreeHost(hPtr)); #ifdef BEAGLE_DEBUG_FLOW fprintf(stderr,"\t\t\tLeaving GPUInterface::FreePinnedHostMemory\n"); #endif }
/*
 * R entry point for cuMemFreeHost: extracts the pointer from r_p with
 * GET_REF, frees it, and returns the CUresult converted by
 * Renum_convert_CUresult.
 */
SEXP
R_auto_cuMemFreeHost(SEXP r_p)
{
    void *hostPtr = GET_REF(r_p, void );
    CUresult status = cuMemFreeHost(hostPtr);

    return Renum_convert_CUresult(status);
}
/* Release the storage behind MEM_OBJ on DEVICE.

   The device's context is made current first.  Buffers flagged
   CL_MEM_ALLOC_HOST_PTR are released with cuMemFreeHost and their
   mem_host_ptr is cleared; all other buffers release this device's
   mem_ptr with cuMemFree.  */
void
pocl_cuda_free (cl_device_id device, cl_mem mem_obj)
{
  pocl_cuda_device_data_t *dev_data = (pocl_cuda_device_data_t *)device->data;

  cuCtxSetCurrent (dev_data->context);

  if (!(mem_obj->flags & CL_MEM_ALLOC_HOST_PTR))
    {
      void *dev_ptr = mem_obj->device_ptrs[device->dev_id].mem_ptr;
      cuMemFree ((CUdeviceptr)dev_ptr);
      return;
    }

  cuMemFreeHost (mem_obj->mem_host_ptr);
  mem_obj->mem_host_ptr = NULL;
}
void* InteropResource::mapToHost(const VideoFormat &format, void *handle, int picIndex, const CUVIDPROCPARAMS ¶m, int width, int height, int coded_height) { AutoCtxLock locker((cuda_api*)this, lock); Q_UNUSED(locker); CUdeviceptr devptr; unsigned int pitch; CUDA_ENSURE(cuvidMapVideoFrame(dec, picIndex, &devptr, &pitch, const_cast<CUVIDPROCPARAMS*>(¶m)), NULL); CUVIDAutoUnmapper unmapper(this, dec, devptr); Q_UNUSED(unmapper); uchar* host_data = NULL; const size_t host_size = pitch*coded_height*3/2; CUDA_ENSURE(cuMemAllocHost((void**)&host_data, host_size), NULL); // copy to the memory not allocated by cuda is possible but much slower CUDA_ENSURE(cuMemcpyDtoH(host_data, devptr, host_size), NULL); VideoFrame frame(width, height, VideoFormat::Format_NV12); uchar *planes[] = { host_data, host_data + pitch * coded_height }; frame.setBits(planes); int pitches[] = { (int)pitch, (int)pitch }; frame.setBytesPerLine(pitches); VideoFrame *f = reinterpret_cast<VideoFrame*>(handle); frame.setTimestamp(f->timestamp()); frame.setDisplayAspectRatio(f->displayAspectRatio()); if (format == frame.format()) *f = frame.clone(); else *f = frame.to(format); cuMemFreeHost(host_data); return f; }
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //release model void free_model(MODEL *MO) { CUresult res; //free model information for(int ii=0;ii<MO->MI->numcomponent;ii++) { s_free(MO->MI->didx[ii]); s_free(MO->MI->pidx[ii]); s_free(MO->MI->psize[ii]); s_free(MO->MI->x1[ii]); s_free(MO->MI->x2[ii]); s_free(MO->MI->y1[ii]); s_free(MO->MI->y2[ii]); } s_free(MO->MI->anchor); // s_free(MO->MI->def); res = cuMemFreeHost((void *)MO->MI->def); if(res != CUDA_SUCCESS) { printf("cuMemFreeHost(MO->MI->def) failed: res = %s\n", conv(res)); exit(1); } s_free(MO->MI->numpart); s_free(MO->MI->offw); s_free(MO->MI->oidx); s_free(MO->MI->ridx); s_free(MO->MI->rsize); s_free(MO->MI->x1); s_free(MO->MI->x2); s_free(MO->MI->y1); s_free(MO->MI->y2); s_free(MO->MI); //free root-filter information for(int ii=0;ii<MO->RF->NoR;ii++) { s_free(MO->RF->root_size[ii]); #ifdef ORIGINAL s_free(MO->RF->rootfilter[ii]); #else #ifdef SEPARETE_MEM res = cuMemFreeHost((void *)MO->RF->rootfilter[ii]); if(res != CUDA_SUCCESS){ printf("cuMemFreeHost(MO->RF->rootfilter) failed: res = %s\n", conv(res)); exit(1); } #endif #endif } #ifndef ORIGINAL #ifndef SEPARETE_MEM /* free heap region in a lump */ res = cuMemFreeHost((void *)MO->RF->rootfilter[0]); if(res != CUDA_SUCCESS){ printf("cuMemFreeHost(MO->RF->rootfilter[0]) failed: res = %s\n", conv(res)); exit(1); } 
#endif #endif s_free(MO->RF->rootsym); s_free(MO->RF); //free root-filter information for(int ii=0;ii<MO->PF->NoP;ii++) { s_free(MO->PF->part_size[ii]); #ifdef ORIGINAL s_free(MO->PF->partfilter[ii]); #else #ifdef SEPARETE_MEM res = cuMemFreeHost((void *)MO->PF->partfilter[ii]); if(res != CUDA_SUCCESS){ printf("cuMemFreeHost(MO->PF->partfilter) failed: res = %s\n", conv(res)); exit(1); } #endif #endif } #ifndef ORIGINAL #ifndef SEPARETE_MEM /* free heap region in a lump */ res = cuMemFreeHost((void *)MO->PF->partfilter[0]); if(res != CUDA_SUCCESS){ printf("cuMemFreeHost(MO->PF->partfilter[0] failed: res = %s\n", conv(res)); exit(1); } #endif #endif s_free(MO->PF->part_partner); s_free(MO->PF->part_sym); s_free(MO->PF); s_free(MO); }
bool VideoDecoderCUDAPrivate::processDecodedData(CUVIDPARSERDISPINFO *cuviddisp, VideoFrame* outFrame) { int num_fields = cuviddisp->progressive_frame ? 1 : 2+cuviddisp->repeat_first_field; for (int active_field = 0; active_field < num_fields; ++active_field) { CUVIDPROCPARAMS proc_params; memset(&proc_params, 0, sizeof(CUVIDPROCPARAMS)); proc_params.progressive_frame = cuviddisp->progressive_frame; //check user config proc_params.second_field = active_field == 1; //check user config proc_params.top_field_first = cuviddisp->top_field_first; proc_params.unpaired_field = cuviddisp->progressive_frame == 1; CUdeviceptr devptr; unsigned int pitch; cuvidCtxLock(vid_ctx_lock, 0); CUresult cuStatus = cuvidMapVideoFrame(dec, cuviddisp->picture_index, &devptr, &pitch, &proc_params); if (cuStatus != CUDA_SUCCESS) { qWarning("cuvidMapVideoFrame failed on index %d (%#x, %s)", cuviddisp->picture_index, cuStatus, _cudaGetErrorEnum(cuStatus)); cuvidUnmapVideoFrame(dec, devptr); cuvidCtxUnlock(vid_ctx_lock, 0); return false; } #define PAD_ALIGN(x,mask) ( (x + mask) & ~mask ) //uint w = dec_create_info.ulWidth;//PAD_ALIGN(dec_create_info.ulWidth, 0x3F); uint h = dec_create_info.ulHeight;//PAD_ALIGN(dec_create_info.ulHeight, 0x0F); //? 
#undef PAD_ALIGN int size = pitch*h*3/2; if (size > host_data_size && host_data) { cuMemFreeHost(host_data); host_data = 0; host_data_size = 0; } if (!host_data) { cuStatus = cuMemAllocHost((void**)&host_data, size); if (cuStatus != CUDA_SUCCESS) { qWarning("cuMemAllocHost failed (%#x, %s)", cuStatus, _cudaGetErrorEnum(cuStatus)); cuvidUnmapVideoFrame(dec, devptr); cuvidCtxUnlock(vid_ctx_lock, 0); return false; } host_data_size = size; } if (!host_data) { qWarning("No valid staging memory!"); cuvidUnmapVideoFrame(dec, devptr); cuvidCtxUnlock(vid_ctx_lock, 0); return false; } cuStatus = cuMemcpyDtoHAsync(host_data, devptr, size, stream); if (cuStatus != CUDA_SUCCESS) { qWarning("cuMemcpyDtoHAsync failed (%#x, %s)", cuStatus, _cudaGetErrorEnum(cuStatus)); cuvidUnmapVideoFrame(dec, devptr); cuvidCtxUnlock(vid_ctx_lock, 0); return false; } cuStatus = cuCtxSynchronize(); if (cuStatus != CUDA_SUCCESS) { qWarning("cuCtxSynchronize failed (%#x, %s)", cuStatus, _cudaGetErrorEnum(cuStatus)); } cuvidUnmapVideoFrame(dec, devptr); cuvidCtxUnlock(vid_ctx_lock, 0); //qDebug("mark not in use pic_index: %d", cuviddisp->picture_index); surface_in_use[cuviddisp->picture_index] = false; uchar *planes[] = { host_data, host_data + pitch * h }; int pitches[] = { (int)pitch, (int)pitch }; VideoFrame frame(codec_ctx->width, codec_ctx->height, VideoFormat::Format_NV12); frame.setBits(planes); frame.setBytesPerLine(pitches); //TODO: is clone required? may crash on clone, I should review clone() //frame = frame.clone(); if (outFrame) { *outFrame = frame.clone(); } #if COPY_ON_DECODE frame_queue.put(frame.clone()); #endif //qDebug("frame queue size: %d", frame_queue.size()); } return true; }
// Run the Cuda part of the computation bool copyDecodedFrameToTexture(unsigned int &nRepeats, int bUseInterop, int *pbIsProgressive) { CUVIDPARSERDISPINFO oDisplayInfo; if (g_pFrameQueue->dequeue(&oDisplayInfo)) { CCtxAutoLock lck(g_CtxLock); // Push the current CUDA context (only if we are using CUDA decoding path) CUresult result = cuCtxPushCurrent(g_oContext); CUdeviceptr pDecodedFrame[2] = { 0, 0 }; CUdeviceptr pInteropFrame[2] = { 0, 0 }; int num_fields = (oDisplayInfo.progressive_frame ? (1) : (2+oDisplayInfo.repeat_first_field)); *pbIsProgressive = oDisplayInfo.progressive_frame; g_bIsProgressive = oDisplayInfo.progressive_frame ? true : false; for (int active_field=0; active_field<num_fields; active_field++) { nRepeats = oDisplayInfo.repeat_first_field; CUVIDPROCPARAMS oVideoProcessingParameters; memset(&oVideoProcessingParameters, 0, sizeof(CUVIDPROCPARAMS)); oVideoProcessingParameters.progressive_frame = oDisplayInfo.progressive_frame; oVideoProcessingParameters.second_field = active_field; oVideoProcessingParameters.top_field_first = oDisplayInfo.top_field_first; oVideoProcessingParameters.unpaired_field = (num_fields == 1); unsigned int nDecodedPitch = 0; unsigned int nWidth = 0; unsigned int nHeight = 0; // map decoded video frame to CUDA surfae g_pVideoDecoder->mapFrame(oDisplayInfo.picture_index, &pDecodedFrame[active_field], &nDecodedPitch, &oVideoProcessingParameters); nWidth = g_pVideoDecoder->targetWidth(); nHeight = g_pVideoDecoder->targetHeight(); // map DirectX texture to CUDA surface size_t nTexturePitch = 0; // If we are Encoding and this is the 1st Frame, we make sure we allocate system memory for readbacks if (g_bReadback && g_bFirstFrame && g_ReadbackSID) { CUresult result; checkCudaErrors(result = cuMemAllocHost((void **)&g_bFrameData[0], (nDecodedPitch * nHeight * 3 / 2))); checkCudaErrors(result = cuMemAllocHost((void **)&g_bFrameData[1], (nDecodedPitch * nHeight * 3 / 2))); g_bFirstFrame = false; if (result != CUDA_SUCCESS) { 
printf("cuMemAllocHost returned %d\n", (int)result); } } // If streams are enabled, we can perform the readback to the host while the kernel is executing if (g_bReadback && g_ReadbackSID) { CUresult result = cuMemcpyDtoHAsync(g_bFrameData[active_field], pDecodedFrame[active_field], (nDecodedPitch * nHeight * 3 / 2), g_ReadbackSID); if (result != CUDA_SUCCESS) { printf("cuMemAllocHost returned %d\n", (int)result); } } #if ENABLE_DEBUG_OUT printf("%s = %02d, PicIndex = %02d, OutputPTS = %08d\n", (oDisplayInfo.progressive_frame ? "Frame" : "Field"), g_DecodeFrameCount, oDisplayInfo.picture_index, oDisplayInfo.timestamp); #endif if (g_pImageDX) { // map the texture surface g_pImageDX->map(&pInteropFrame[active_field], &nTexturePitch, active_field); } else { pInteropFrame[active_field] = g_pInteropFrame[active_field]; nTexturePitch = g_pVideoDecoder->targetWidth() * 2; } // perform post processing on the CUDA surface (performs colors space conversion and post processing) // comment this out if we inclue the line of code seen above cudaPostProcessFrame(&pDecodedFrame[active_field], nDecodedPitch, &pInteropFrame[active_field], nTexturePitch, g_pCudaModule->getModule(), gfpNV12toARGB, g_KernelSID); if (g_pImageDX) { // unmap the texture surface g_pImageDX->unmap(active_field); } // unmap video frame // unmapFrame() synchronizes with the VideoDecode API (ensures the frame has finished decoding) g_pVideoDecoder->unmapFrame(pDecodedFrame[active_field]); // release the frame, so it can be re-used in decoder g_pFrameQueue->releaseFrame(&oDisplayInfo); g_DecodeFrameCount++; } // Detach from the Current thread checkCudaErrors(cuCtxPopCurrent(NULL)); } else { return false; } // check if decoding has come to an end. // if yes, signal the app to shut down. 
if (!g_pVideoSource->isStarted() || g_pFrameQueue->isEndOfDecode()) { // Let's free the Frame Data if (g_ReadbackSID && g_bFrameData) { cuMemFreeHost((void *)g_bFrameData[0]); cuMemFreeHost((void *)g_bFrameData[1]); g_bFrameData[0] = NULL; g_bFrameData[1] = NULL; } // Let's just stop, and allow the user to quit, so they can at least see the results g_pVideoSource->stop(); // If we want to loop reload the video file and restart if (g_bLoop && !g_bAutoQuit) { reinitCudaResources(); g_FrameCount = 0; g_DecodeFrameCount = 0; g_pVideoSource->start(); } if (g_bAutoQuit) { g_bDone = true; } } return true; }
/*
 * Class:     edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2
 * Method:    findReserveMem
 * Signature: ()I
 *
 * Searches for the amount of memory that has to be held back so that the
 * full set of runtime buffers can be allocated.
 *
 * Starting at 1 MiB and growing in steps of 100 MiB, the candidate reserve
 * `i` is subtracted from the size reported by initContext() (less the
 * handle, exception, GC and class-table space) and all ten buffers are
 * allocated as a trial. The first `i` for which every allocation succeeds
 * is returned after the trial buffers are freed again. When a trial fails,
 * the context is destroyed and re-created before the next step.
 *
 * If no candidate works, throw_cuda_errror_exception() is called and 0 is
 * returned.
 */
JNIEXPORT jlong JNICALL Java_edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2_findReserveMem
  (JNIEnv * env, jobject this_ref, jint max_blocks_per_proc, jint max_threads_per_block)
{
  size_t to_space_size;
  size_t temp_size;
  size_t per_block_size;
  int status;
  jlong i;
  jint num_blocks;

  status = cuInit(0);
  CHECK_STATUS(env,"error in cuInit",status)

  printf("automatically determining CUDA reserve space...\n");

  to_space_size = initContext(env, max_blocks_per_proc, max_threads_per_block);

  //space for 100 types in the scene
  classMemSize = sizeof(jint)*100;

  num_blocks = numMultiProcessors * max_threads_per_block * max_blocks_per_proc;
  /* size of the handles and of the exceptions buffer */
  per_block_size = num_blocks * sizeof(jlong);

  gc_space_size = 1024;
  to_space_size -= per_block_size;
  to_space_size -= per_block_size;
  to_space_size -= gc_space_size;
  to_space_size -= classMemSize;

  for(i = 1024L*1024L; i < to_space_size; i += 100L*1024L*1024L){
    temp_size = to_space_size - i;

    /* size_t and jlong are not guaranteed to match %lu / %ld, so print
       through explicit long long casts. */
    printf("attempting allocation with temp_size: %llu to_space_size: %llu i: %lld\n",
           (unsigned long long) temp_size, (unsigned long long) to_space_size, (long long) i);

    /* Trial allocation: each step runs only if all previous ones succeeded. */
    status = cuMemHostAlloc(&toSpace, temp_size, 0);
    if(status == CUDA_SUCCESS)
      status = cuMemAlloc(&gpuToSpace, temp_size);
    if(status == CUDA_SUCCESS)
      status = cuMemAlloc(&gpuClassMemory, classMemSize);
    if(status == CUDA_SUCCESS)
      status = cuMemHostAlloc(&handlesMemory, per_block_size, CU_MEMHOSTALLOC_WRITECOMBINED);
    if(status == CUDA_SUCCESS)
      status = cuMemAlloc(&gpuHandlesMemory, per_block_size);
    if(status == CUDA_SUCCESS)
      status = cuMemHostAlloc(&exceptionsMemory, per_block_size, 0);
    if(status == CUDA_SUCCESS)
      status = cuMemAlloc(&gpuExceptionsMemory, per_block_size);
    if(status == CUDA_SUCCESS)
      status = cuMemAlloc(&gcInfoSpace, gc_space_size);
    if(status == CUDA_SUCCESS)
      status = cuMemAlloc(&gpuHeapEndPtr, 8);
    if(status == CUDA_SUCCESS)
      status = cuMemAlloc(&gpuBufferSize, 8);

    if(status != CUDA_SUCCESS){
      /* Destroy and re-create the context, then try a larger reserve. */
      cuCtxDestroy(cuContext);
      initContext(env, max_blocks_per_proc, max_threads_per_block);
      continue;
    }

    //done, free everything
    cuMemFree(gpuToSpace);
    cuMemFree(gpuClassMemory);
    cuMemFree(gpuHandlesMemory);
    cuMemFree(gpuExceptionsMemory);
    cuMemFree(gcInfoSpace);
    cuMemFree(gpuHeapEndPtr);
    cuMemFree(gpuBufferSize);
    cuMemFreeHost(toSpace);
    cuMemFreeHost(handlesMemory);
    cuMemFreeHost(exceptionsMemory);

    return i;
  }

  throw_cuda_errror_exception(env, "unable to find enough space using CUDA", 0);
  return 0;
}
static void calc_a_score_GPU(FLOAT *ac_score, FLOAT **score, int *ssize_start, Model_info *MI, FLOAT scale, int *size_score_array, int NoC) { CUresult res; const int IHEI = MI->IM_HEIGHT; const int IWID = MI->IM_WIDTH; int pady_n = MI->pady; int padx_n = MI->padx; int block_pad = (int)(scale/2.0); struct timeval tv; int *RY_array, *RX_array; res = cuMemHostAlloc((void**)&RY_array, NoC*sizeof(int), CU_MEMHOSTALLOC_DEVICEMAP); if(res != CUDA_SUCCESS) { printf("cuMemHostAlloc(RY_array) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemHostAlloc((void**)&RX_array, NoC*sizeof(int), CU_MEMHOSTALLOC_DEVICEMAP); if(res != CUDA_SUCCESS) { printf("cuMemHostAlloc(RX_array) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } for(int i = 0; i < NoC; i++) { int rsize[2] = {MI->rsize[i*2], MI->rsize[i*2+1]}; RY_array[i] = (int)((FLOAT)rsize[0]*scale/2.0-1.0+block_pad); RX_array[i] = (int)((FLOAT)rsize[1]*scale/2.0-1.0+block_pad); } CUdeviceptr ac_score_dev, score_dev; CUdeviceptr ssize_dev, size_score_dev; CUdeviceptr RY_dev, RX_dev; int size_score=0; for(int i = 0; i < NoC; i++) { size_score += size_score_array[i]; } /* allocate GPU memory */ res = cuMemAlloc(&ac_score_dev, gpu_size_A_SCORE); if(res != CUDA_SUCCESS) { printf("cuMemAlloc(ac_score) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemAlloc(&score_dev, size_score); if(res != CUDA_SUCCESS) { printf("cuMemAlloc(score) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemAlloc(&ssize_dev, NoC*sizeof(int)); if(res != CUDA_SUCCESS) { printf("cuMemAlloc(ssize) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemAlloc(&size_score_dev, NoC*sizeof(int)); if(res != CUDA_SUCCESS) { printf("cuMemAlloc(size_score) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemAlloc(&RY_dev, NoC*sizeof(int)); if(res != CUDA_SUCCESS) { printf("cuMemAlloc(RY) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } 
res = cuMemAlloc(&RX_dev, NoC*sizeof(int)); if(res != CUDA_SUCCESS) { printf("cuMemAlloc(RX) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } gettimeofday(&tv_memcpy_start, nullptr); /* upload date to GPU */ res = cuMemcpyHtoD(ac_score_dev, &ac_score[0], gpu_size_A_SCORE); if(res != CUDA_SUCCESS) { printf("cuMemcpyHtoD(ac_score) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemcpyHtoD(score_dev, &score[0][0], size_score); if(res != CUDA_SUCCESS) { printf("cuMemcpyHtoD(score) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemcpyHtoD(ssize_dev, &ssize_start[0], NoC*sizeof(int)); if(res != CUDA_SUCCESS) { printf("cuMemcpyHtoD(ssize) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemcpyHtoD(size_score_dev, &size_score_array[0], NoC*sizeof(int)); if(res != CUDA_SUCCESS) { printf("cuMemcpyHtoD(size_score) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemcpyHtoD(RY_dev, &RY_array[0], NoC*sizeof(int)); if(res != CUDA_SUCCESS) { printf("cuMemcpyHtoD(RY) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemcpyHtoD(RX_dev, &RX_array[0], NoC*sizeof(int)); if(res != CUDA_SUCCESS) { printf("cuMemcpyHtoD(RX) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } gettimeofday(&tv_memcpy_end, nullptr); tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv); time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0; void* kernel_args[] = { (void*)&IWID, (void*)&IHEI, (void*)&scale, (void*)&padx_n, (void*)&pady_n, &RX_dev, &RY_dev, &ac_score_dev, &score_dev, &ssize_dev, (void*)&NoC, &size_score_dev }; int sharedMemBytes = 0; /* define CUDA block shape */ int max_threads_num = 0; int thread_num_x, thread_num_y; int block_num_x, block_num_y; res = cuDeviceGetAttribute(&max_threads_num, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev[0]); if(res != CUDA_SUCCESS){ printf("\ncuDeviceGetAttribute() failed: res = %s\n", cuda_response_to_string(res)); 
exit(1); } NR_MAXTHREADS_X[0] = (int)sqrt((double)max_threads_num/NoC); NR_MAXTHREADS_Y[0] = (int)sqrt((double)max_threads_num/NoC); thread_num_x = (IWID < NR_MAXTHREADS_X[0]) ? IWID : NR_MAXTHREADS_X[0]; thread_num_y = (IHEI < NR_MAXTHREADS_Y[0]) ? IHEI : NR_MAXTHREADS_Y[0]; block_num_x = IWID / thread_num_x; block_num_y = IHEI / thread_num_y; if(IWID % thread_num_x != 0) block_num_x++; if(IHEI % thread_num_y != 0) block_num_y++; gettimeofday(&tv_kernel_start, nullptr); /* launch GPU kernel */ res = cuLaunchKernel( func_calc_a_score[0], // call function block_num_x, // gridDimX block_num_y, // gridDimY 1, // gridDimZ thread_num_x, // blockDimX thread_num_y, // blockDimY NoC, // blockDimZ sharedMemBytes, // sharedMemBytes nullptr, // hStream kernel_args, // kernelParams nullptr // extra ); if(res != CUDA_SUCCESS) { printf("cuLaunchKernel(calc_a_score) failed : res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuCtxSynchronize(); if(res != CUDA_SUCCESS) { printf("cuCtxSynchronize(calc_a_score) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } gettimeofday(&tv_kernel_end, nullptr); tvsub(&tv_kernel_end, &tv_kernel_start, &tv); time_kernel += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0; gettimeofday(&tv_memcpy_start, nullptr); /* download data from GPU */ res = cuMemcpyDtoH(ac_score, ac_score_dev, gpu_size_A_SCORE); if(res != CUDA_SUCCESS) { printf("cuMemcpyDtoH(ac_score) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } gettimeofday(&tv_memcpy_end, nullptr); tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv); time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0; /* free GPU memory */ res = cuMemFree(ac_score_dev); if(res != CUDA_SUCCESS) { printf("cuMemFree(ac_score_dev) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemFree(score_dev); if(res != CUDA_SUCCESS) { printf("cuMemFree(score_dev) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemFree(ssize_dev); if(res != 
CUDA_SUCCESS) { printf("cuMemFree(ssize_dev) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemFree(size_score_dev); if(res != CUDA_SUCCESS) { printf("cuMemFree(size_score_dev) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemFree(RY_dev); if(res != CUDA_SUCCESS) { printf("cuMemFree(RY_dev) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemFree(RX_dev); if(res != CUDA_SUCCESS) { printf("cuMemFree(RX_dev) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } /* free CPU memory */ res = cuMemFreeHost(RY_array); if(res != CUDA_SUCCESS) { printf("cuMemFreeHost(RY_array) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } res = cuMemFreeHost(RX_array); if(res != CUDA_SUCCESS) { printf("cuMemFreeHost(RX_array) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } }
//detect boundary box FLOAT *dpm_ttic_gpu_get_boxes(FLOAT **features,FLOAT *scales,int *feature_size, GPUModel *MO, int *detected_count, FLOAT *acc_score, FLOAT thresh) { //constant parameters const int max_scale = MO->MI->max_scale; const int interval = MO->MI->interval; const int sbin = MO->MI->sbin; const int padx = MO->MI->padx; const int pady = MO->MI->pady; const int NoR = MO->RF->NoR; const int NoP = MO->PF->NoP; const int NoC = MO->MI->numcomponent; const int *numpart = MO->MI->numpart; const int LofFeat=(max_scale+interval)*NoC; const int L_MAX = max_scale+interval; /* for measurement */ struct timeval tv; struct timeval tv_make_c_start, tv_make_c_end; struct timeval tv_nucom_start, tv_nucom_end; struct timeval tv_box_start, tv_box_end; float time_box=0; struct timeval tv_root_score_start, tv_root_score_end; float time_root_score = 0; struct timeval tv_part_score_start, tv_part_score_end; float time_part_score = 0; struct timeval tv_dt_start, tv_dt_end; float time_dt = 0; struct timeval tv_calc_a_score_start, tv_calc_a_score_end; float time_calc_a_score = 0; gettimeofday(&tv_make_c_start, nullptr); int **RF_size = MO->RF->root_size; int *rootsym = MO->RF->rootsym; int *part_sym = MO->PF->part_sym; int **part_size = MO->PF->part_size; FLOAT **rootfilter = MO->RF->rootfilter; FLOAT **partfilter=MO->PF->partfilter; int **psize = MO->MI->psize; int **rm_size_array = (int **)malloc(sizeof(int *)*L_MAX); int **pm_size_array = (int **)malloc(sizeof(int *)*L_MAX); pm_size_array = (int **)malloc(sizeof(int *)*L_MAX); FLOAT **Tboxes=(FLOAT**)calloc(LofFeat,sizeof(FLOAT*)); //box coordinate information(Temp) int *b_nums =(int*)calloc(LofFeat,sizeof(int)); //length of Tboxes int count = 0; int detected_boxes=0; CUresult res; /* matched score (root and part) */ FLOAT ***rootmatch,***partmatch = nullptr; int *new_PADsize; // need new_PADsize[L_MAX*3] size_t SUM_SIZE_feat = 0; FLOAT **featp2 = (FLOAT **)malloc(L_MAX*sizeof(FLOAT *)); if(featp2 == nullptr) { // error 
semantics printf("allocate featp2 failed\n"); exit(1); } /* allocate required memory for new_PADsize */ new_PADsize = (int *)malloc(L_MAX*3*sizeof(int)); if(new_PADsize == nullptr) { // error semantics printf("allocate new_PADsize failed\n"); exit(1); } /* do padarray once and reuse it at calculating root and part time */ /* calculate sum of size of padded feature */ for(int tmpL=0; tmpL<L_MAX; tmpL++) { int PADsize[3] = { feature_size[tmpL*2], feature_size[tmpL*2+1], 31 }; int NEW_Y = PADsize[0] + pady*2; int NEW_X = PADsize[1] + padx*2; SUM_SIZE_feat += (NEW_X*NEW_Y*PADsize[2])*sizeof(FLOAT); } /* allocate region for padded feat in a lump */ FLOAT *dst_feat; res = cuMemHostAlloc((void **)&dst_feat, SUM_SIZE_feat, CU_MEMHOSTALLOC_DEVICEMAP); if(res != CUDA_SUCCESS) { printf("cuMemHostAlloc(dst_feat) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } memset(dst_feat, 0, SUM_SIZE_feat); // zero clear /* distribute allocated region */ uintptr_t pointer_feat = (uintptr_t)dst_feat; for(int tmpL=0; tmpL<L_MAX; tmpL++) { featp2[tmpL] = (FLOAT *)pointer_feat; int PADsize[3] = { feature_size[tmpL*2], feature_size[tmpL*2+1], 31 }; int NEW_Y = PADsize[0] + pady*2; int NEW_X = PADsize[1] + padx*2; pointer_feat += (uintptr_t)(NEW_X*NEW_Y*PADsize[2]*sizeof(FLOAT)); } /* copy feat to feat2 */ for(int tmpL=0; tmpL<L_MAX; tmpL++) { int PADsize[3] = { feature_size[tmpL*2], feature_size[tmpL*2+1], 31 }; int NEW_Y = PADsize[0] + pady*2; int NEW_X = PADsize[1] + padx*2; int L = NEW_Y*padx; int SPL = PADsize[0] + pady; int M_S = sizeof(FLOAT)*PADsize[0]; FLOAT *P = featp2[tmpL]; FLOAT *S = features[tmpL]; for(int i=0; i<PADsize[2]; i++) { P += L; for(int j=0; j<PADsize[1]; j++) { P += pady; memcpy(P, S, M_S); S += PADsize[0]; P += SPL; } P += L; } new_PADsize[tmpL*3] = NEW_Y; new_PADsize[tmpL*3 + 1] = NEW_X; new_PADsize[tmpL*3 + 2] = PADsize[2]; } /* do padarray once and reuse it at calculating root and part time */ /* allocation in a lump */ int *dst_rm_size = (int 
*)malloc(sizeof(int)*NoC*2*L_MAX); if(dst_rm_size == nullptr) { printf("allocate dst_rm_size failed\n"); exit(1); } /* distribution to rm_size_array[L_MAX] */ uintptr_t ptr = (uintptr_t)dst_rm_size; for(int i=0; i<L_MAX; i++) { rm_size_array[i] = (int *)ptr; ptr += (uintptr_t)(NoC*2*sizeof(int)); } /* allocation in a lump */ int *dst_pm_size = (int *)malloc(sizeof(int)*NoP*2*L_MAX); if(dst_pm_size == nullptr) { printf("allocate dst_pm_size failed\n"); exit(1); } /* distribution to pm_size_array[L_MAX] */ ptr = (uintptr_t)dst_pm_size; for(int i=0; i<L_MAX; i++) { pm_size_array[i] = (int *)ptr; ptr += (uintptr_t)(NoP*2*sizeof(int)); } ///////level for (int level=interval; level<L_MAX; level++) // feature's loop(A's loop) 1level 1picture { if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X)) { Tboxes[count]=nullptr; count++; continue; } } //for (level) // feature's loop(A's loop) 1level 1picture ///////root calculation///////// /* calculate model score (only root) */ gettimeofday(&tv_root_score_start, nullptr); rootmatch = fconvsMT_GPU( featp2, SUM_SIZE_feat, rootfilter, rootsym, 1, NoR, new_PADsize, RF_size, rm_size_array, L_MAX, interval, feature_size, padx, pady, MO->MI->max_X, MO->MI->max_Y, ROOT ); gettimeofday(&tv_root_score_end, nullptr); tvsub(&tv_root_score_end, &tv_root_score_start, &tv); time_root_score += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0; ///////part calculation///////// if(NoP>0) { /* calculate model score (only part) */ gettimeofday(&tv_part_score_start, nullptr); partmatch = fconvsMT_GPU( featp2, SUM_SIZE_feat, partfilter, part_sym, 1, NoP, new_PADsize, part_size, pm_size_array, L_MAX, interval, feature_size, padx, pady, MO->MI->max_X, MO->MI->max_Y, PART ); gettimeofday(&tv_part_score_end, nullptr); tvsub(&tv_part_score_end, &tv_part_score_start, &tv); time_part_score += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0; } res = cuCtxSetCurrent(ctx[0]); if(res != CUDA_SUCCESS) { 
printf("cuCtxSetCurrent(ctx[0]) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } gettimeofday(&tv_make_c_end, nullptr); gettimeofday(&tv_nucom_start, nullptr); count = 0; detected_boxes = 0; int **RL_array = (int **)malloc((L_MAX-interval)*sizeof(int*)); int *dst_RL = (int *) malloc(NoC*(L_MAX-interval)*sizeof(int)); int **RI_array = (int **)malloc((L_MAX-interval)*sizeof(int*)); int *dst_RI = (int *)malloc(NoC*(L_MAX-interval)*sizeof(int)); int **OI_array = (int **)malloc((L_MAX-interval)*sizeof(int*)); int *dst_OI = (int *)malloc((NoC)*(L_MAX-interval)*sizeof(int)); int **RL_S_array = (int **)malloc((L_MAX-interval)*sizeof(int*)); int *dst_RL_S = (int *)malloc(NoC*(L_MAX-interval)*sizeof(int)); FLOAT **OFF_array = (FLOAT **)malloc((L_MAX-interval)*sizeof(FLOAT*)); FLOAT *dst_OFF = (FLOAT *)malloc(NoC*(L_MAX-interval)*sizeof(FLOAT)); FLOAT ***SCORE_array = (FLOAT ***)malloc((L_MAX-interval)*sizeof(FLOAT **)); FLOAT **sub_dst_SCORE = (FLOAT **)malloc(NoC*(L_MAX-interval)*sizeof(FLOAT*)); uintptr_t pointer_RL = (uintptr_t)dst_RL; uintptr_t pointer_RI = (uintptr_t)dst_RI; uintptr_t pointer_OI = (uintptr_t)dst_OI; uintptr_t pointer_RL_S = (uintptr_t)dst_RL_S; uintptr_t pointer_OFF = (uintptr_t)dst_OFF; uintptr_t pointer_SCORE = (uintptr_t)sub_dst_SCORE; for (int level=interval; level<L_MAX; level++) { int L=level-interval; RL_array[L] = (int *)pointer_RL; pointer_RL += (uintptr_t)NoC*sizeof(int); RI_array[L] = (int *)pointer_RI; pointer_RI += (uintptr_t)NoC*sizeof(int); OI_array[L] = (int *)pointer_OI; pointer_OI += (uintptr_t)NoC*sizeof(int); RL_S_array[L] = (int *)pointer_RL_S; pointer_RL_S += (uintptr_t)NoC*sizeof(int); OFF_array[L] = (FLOAT *)pointer_OFF; pointer_OFF += (uintptr_t)NoC*sizeof(FLOAT); SCORE_array[L] = (FLOAT **)pointer_SCORE; pointer_SCORE += (uintptr_t)NoC*sizeof(FLOAT*); } int sum_RL_S = 0; int sum_SNJ = 0; /* prepare for parallel execution */ for(int level=interval; level<L_MAX; level++) { int L = level - interval; 
if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X)) { continue; } for(int j=0; j<NoC; j++) { /* root score + offset */ RL_array[L][j] = rm_size_array[level][j*2]*rm_size_array[level][j*2+1]; //length of root-matching RI_array[L][j] = MO->MI->ridx[j]; //root-index OI_array[L][j] = MO->MI->oidx[j]; //offset-index RL_S_array[L][j] =sizeof(FLOAT)*RL_array[L][j]; OFF_array[L][j] = MO->MI->offw[RI_array[L][j]]; //offset information /* search max values */ max_RL_S = (max_RL_S < RL_S_array[L][j]) ? RL_S_array[L][j] : max_RL_S; max_numpart = (max_numpart < numpart[j]) ? numpart[j] : max_numpart; } } sum_RL_S = max_RL_S*NoC*(L_MAX-interval); /* root matching size */ sum_SNJ = sizeof(int*)*max_numpart*NoC*(L_MAX-interval); /* consolidated allocation for SCORE_array and distribute region */ FLOAT *dst_SCORE = (FLOAT *)malloc(sum_RL_S); pointer_SCORE = (uintptr_t)dst_SCORE; for(int level=interval; level<L_MAX; level++) { int L = level - interval; if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X)) { continue; } for(int j=0; j<NoC; j++) { SCORE_array[L][j] = (FLOAT *)pointer_SCORE; pointer_SCORE += (uintptr_t)max_RL_S; } } /* add offset */ for(int level=interval; level<L_MAX; level++) { int L = level - interval; if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X)) { continue; } for(int j=0; j<NoC; j++) { memcpy(SCORE_array[L][j], rootmatch[level][j], RL_S_array[L][j]); FLOAT *SC_S = SCORE_array[L][j]; FLOAT *SC_E = SCORE_array[L][j]+RL_array[L][j]; while(SC_S<SC_E) *(SC_S++)+=OFF_array[L][j]; } } /* anchor matrix */ // consolidated allocation int ***ax_array = (int ***)malloc((L_MAX-interval)*sizeof(int **)); int **sub_dst_ax = (int **)malloc(NoC*(L_MAX-interval)*sizeof(int *)); int *dst_ax = (int *)malloc(sum_SNJ); int ***ay_array = (int ***)malloc((L_MAX-interval)*sizeof(int **)); int **sub_dst_ay = (int **)malloc(NoC*(L_MAX-interval)*sizeof(int 
*)); int *dst_ay = (int *)malloc(sum_SNJ); /* boudary index */ // consolidated allocation int ****Ix_array =(int ****)malloc((L_MAX-interval)*sizeof(int ***)); int ***sub_dst_Ix = (int ***)malloc(NoC*(L_MAX-interval)*sizeof(int **)); int **dst_Ix = (int **)malloc(sum_SNJ); int ****Iy_array = (int ****)malloc((L_MAX-interval)*sizeof(int ***)); int ***sub_dst_Iy = (int ***)malloc(NoC*(L_MAX-interval)*sizeof(int **)); int **dst_Iy = (int **)malloc(sum_SNJ); /* distribute region */ uintptr_t pointer_ax = (uintptr_t)sub_dst_ax; uintptr_t pointer_ay = (uintptr_t)sub_dst_ay; uintptr_t pointer_Ix = (uintptr_t)sub_dst_Ix; uintptr_t pointer_Iy = (uintptr_t)sub_dst_Iy; for(int level=interval; level<L_MAX; level++) { int L = level - interval; if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X)) { continue; } ax_array[L] = (int **)pointer_ax; pointer_ax += (uintptr_t)(NoC*sizeof(int*)); ay_array[L] = (int **)pointer_ay; pointer_ay += (uintptr_t)(NoC*sizeof(int*)); Ix_array[L] = (int ***)pointer_Ix; pointer_Ix += (uintptr_t)(NoC*sizeof(int**)); Iy_array[L] = (int ***)pointer_Iy; pointer_Iy += (uintptr_t)(NoC*sizeof(int**)); } pointer_ax = (uintptr_t)dst_ax; pointer_ay = (uintptr_t)dst_ay; pointer_Ix = (uintptr_t)dst_Ix; pointer_Iy = (uintptr_t)dst_Iy; for(int level=interval; level<L_MAX; level++) { int L = level - interval; if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X)) { continue; } for(int j=0; j<NoC; j++) { uintptr_t pointer_offset = sizeof(int*)*max_numpart; ax_array[L][j] = (int *)pointer_ax; pointer_ax += pointer_offset; ay_array[L][j] = (int *)pointer_ay; pointer_ay += pointer_offset; Ix_array[L][j] = (int **)pointer_Ix; pointer_Ix += pointer_offset; Iy_array[L][j] = (int **)pointer_Iy; pointer_Iy += pointer_offset; } } /* add parts */ if(NoP>0) { /* arrays to store temporary loop variables */ int tmp_array_size = 0; for(int level=interval; level<L_MAX; level++) { 
if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X)) { continue; } for(int j=0; j<NoC; j++) { tmp_array_size += max_numpart*sizeof(int); } } int ***DIDX_array = (int ***)malloc((L_MAX-interval)*sizeof(int**)); int **sub_dst_DIDX = (int **)malloc(NoC*(L_MAX-interval)*sizeof(int*)); int *dst_DIDX = (int *)malloc(tmp_array_size); int ***DID_4_array = (int ***)malloc((L_MAX-interval)*sizeof(int **)); int **sub_dst_DID_4 = (int **)malloc(NoC*(L_MAX-interval)*sizeof(int*)); int *dst_DID_4; res = cuMemHostAlloc((void **)&dst_DID_4, tmp_array_size, CU_MEMHOSTALLOC_DEVICEMAP); if(res != CUDA_SUCCESS) { printf("cuMemHostAlloc(dst_DID_4) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } int ***PIDX_array = (int ***)malloc((L_MAX-interval)*sizeof(int **)); int **sub_dst_PIDX = (int **)malloc(NoC*(L_MAX-interval)*sizeof(int*)); int *dst_PIDX; res = cuMemHostAlloc((void **)&dst_PIDX, tmp_array_size, CU_MEMHOSTALLOC_DEVICEMAP); if(res != CUDA_SUCCESS) { printf("cuMemHostAlloc(dst_PIDX) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } /* distribute consolidated region */ uintptr_t pointer_DIDX = (uintptr_t)sub_dst_DIDX; uintptr_t pointer_DID_4 = (uintptr_t)sub_dst_DID_4; uintptr_t pointer_PIDX = (uintptr_t)sub_dst_PIDX; for(int level=interval; level<L_MAX; level++) { int L = level - interval; if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X)) { continue; } DIDX_array[L] = (int **)pointer_DIDX; pointer_DIDX += (uintptr_t)(NoC*sizeof(int*)); DID_4_array[L] = (int **)pointer_DID_4; pointer_DID_4 += (uintptr_t)(NoC*sizeof(int*)); PIDX_array[L] = (int **)pointer_PIDX; pointer_PIDX += (uintptr_t)(NoC*sizeof(int*)); } pointer_DIDX = (uintptr_t)dst_DIDX; pointer_DID_4 = (uintptr_t)dst_DID_4; pointer_PIDX = (uintptr_t)dst_PIDX; for(int level=interval; level<L_MAX; level++) { int L = level - interval; if(feature_size[level*2]+2*pady<MO->MI->max_Y 
||(feature_size[level*2+1]+2*padx<MO->MI->max_X)) { continue; } for(int j=0; j<NoC; j++) { uintptr_t pointer_offset = (uintptr_t)(max_numpart*sizeof(int)); DIDX_array[L][j] = (int *)pointer_DIDX; pointer_DIDX += pointer_offset; DID_4_array[L][j] = (int *)pointer_DID_4; pointer_DID_4 += pointer_offset; PIDX_array[L][j] = (int *)pointer_PIDX; pointer_PIDX += pointer_offset; } } /* prepare for parallel execution */ int sum_size_index_matrix = 0; for(int level=interval; level<L_MAX; level++) { int L = level - interval; if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X)) { continue; } for(int j=0; j<NoC; j++) { for (int k=0;k<numpart[j];k++) { /* assign values to each element */ DIDX_array[L][j][k] = MO->MI->didx[j][k]; DID_4_array[L][j][k] = DIDX_array[L][j][k]*4; PIDX_array[L][j][k] = MO->MI->pidx[j][k]; /* anchor */ ax_array[L][j][k] = MO->MI->anchor[DIDX_array[L][j][k]*2]+1; ay_array[L][j][k] = MO->MI->anchor[DIDX_array[L][j][k]*2+1]+1; int PSSIZE[2] ={pm_size_array[L][PIDX_array[L][j][k]*2], pm_size_array[L][PIDX_array[L][j][k]*2+1]}; // size of C /* index matrix */ sum_size_index_matrix += sizeof(int)*PSSIZE[0]*PSSIZE[1]; } } } int *dst_Ix_kk = (int *)malloc(sum_size_index_matrix); int *dst_Iy_kk = (int *)malloc(sum_size_index_matrix); uintptr_t pointer_Ix_kk = (uintptr_t)dst_Ix_kk; uintptr_t pointer_Iy_kk = (uintptr_t)dst_Iy_kk; for(int level=interval; level<L_MAX; level++) { int L = level - interval; if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X)) { continue; } for(int j=0; j<NoC; j++) { for (int k=0;k<numpart[j];k++) { int PSSIZE[2] ={pm_size_array[L][PIDX_array[L][j][k]*2], pm_size_array[L][PIDX_array[L][j][k]*2+1]}; // size of C Ix_array[L][j][k] = (int *)pointer_Ix_kk; Iy_array[L][j][k] = (int *)pointer_Iy_kk; pointer_Ix_kk += (uintptr_t)(sizeof(int)*PSSIZE[0]*PSSIZE[1]); pointer_Iy_kk += (uintptr_t)(sizeof(int)*PSSIZE[0]*PSSIZE[1]); } } } gettimeofday(&tv_dt_start, 
nullptr); FLOAT ****M_array = dt_GPU( Ix_array, // int ****Ix_array Iy_array, // int ****Iy_array PIDX_array, // int ***PIDX_array pm_size_array, // int **size_array NoP, // int NoP numpart, // int *numpart NoC, // int NoC interval, // int interval L_MAX, // int L_MAX feature_size, // int *feature_size, padx, // int padx, pady, // int pady, MO->MI->max_X, // int max_X MO->MI->max_Y, // int max_Y MO->MI->def, // FLOAT *def tmp_array_size, // int tmp_array_size dst_PIDX, // int *dst_PIDX dst_DID_4 // int *DID_4 ); gettimeofday(&tv_dt_end, nullptr); tvsub(&tv_dt_end, &tv_dt_start, &tv); time_dt += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0; /* add part score */ for(int level=interval; level<L_MAX; level++){ int L = level - interval; if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X)) { continue; } for(int j=0; j<NoC; j++) { for(int k=0; k<numpart[j]; k++) { int PSSIZE[2] ={pm_size_array[L][PIDX_array[L][j][k]*2], pm_size_array[L][PIDX_array[L][j][k]*2+1]}; // Size of C int R_S[2]={rm_size_array[level][j*2], rm_size_array[level][j*2+1]}; dpm_ttic_add_part_calculation(SCORE_array[L][j], M_array[L][j][k], R_S, PSSIZE, ax_array[L][j][k], ay_array[L][j][k]); } } } s_free(M_array[0][0][0]); s_free(M_array[0][0]); s_free(M_array[0]); s_free(M_array); /* free temporary arrays */ free(dst_DIDX); free(sub_dst_DIDX); free(DIDX_array); res = cuMemFreeHost(dst_DID_4); if(res != CUDA_SUCCESS) { printf("cuMemFreeHost(dst_DID_4) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } free(sub_dst_DID_4); free(DID_4_array); res = cuMemFreeHost(dst_PIDX); if(res != CUDA_SUCCESS) { printf("cuMemFreeHost(dst_PIDX) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } free(sub_dst_PIDX); free(PIDX_array); res = cuCtxSetCurrent(ctx[0]); if(res != CUDA_SUCCESS) { printf("cuCtxSetCurrent(ctx[0]) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } } // start from if(NoP>0) /* combine root and part score and detect 
boundary box for each-component */ FLOAT *scale_array = (FLOAT *)malloc((L_MAX-interval)*sizeof(FLOAT)); for(int level=interval; level<L_MAX; level++) { int L = level - interval; if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X)) { Tboxes[count]=nullptr; count++; continue; } scale_array[L] = (FLOAT)sbin/scales[level]; } for (int level=interval; level<L_MAX; level++) // feature's loop(A's loop) 1level 1picture { /* parameters (related for level) */ int L=level-interval; /* matched score size matrix */ FLOAT scale=(FLOAT)sbin/scales[level]; /* loop conditon */ if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X)) { Tboxes[count]=nullptr; count++; continue; } /* calculate accumulated score */ gettimeofday(&tv_calc_a_score_start, nullptr); calc_a_score_GPU( acc_score, // FLOAT *ac_score SCORE_array[L], // FLOAT **score rm_size_array[level], // int *ssize_start MO->MI, // Model_info *MI scale, // FLOAT scale RL_S_array[L], // int *size_score_array NoC // int NoC ); gettimeofday(&tv_calc_a_score_end, nullptr); tvsub(&tv_calc_a_score_end, &tv_calc_a_score_start, &tv); time_calc_a_score += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0; for(int j = 0; j <NoC; j++) { int R_S[2]={rm_size_array[level][j*2], rm_size_array[level][j*2+1]}; /* get all good matches */ int GMN; int *GMPC = get_gmpc(SCORE_array[L][j],thresh,R_S,&GMN); int RSIZE[2]={MO->MI->rsize[j*2], MO->MI->rsize[j*2+1]}; int GL = (numpart[j]+1)*4+3; //31 /* detected box coordinate(current level) */ FLOAT *t_boxes = (FLOAT*)calloc(GMN*GL,sizeof(FLOAT)); gettimeofday(&tv_box_start, nullptr); // NO NEED TO USE GPU for(int k = 0;k < GMN;k++) { FLOAT *P_temp = t_boxes+GL*k; int y = GMPC[2*k]; int x = GMPC[2*k+1]; /* calculate root box coordinate */ FLOAT *RB =rootbox(x,y,scale,padx,pady,RSIZE); memcpy(P_temp, RB,sizeof(FLOAT)*4); s_free(RB); P_temp+=4; for(int pp=0;pp<numpart[j];pp++) { int PBSIZE[2]={psize[j][pp*2], 
psize[j][pp*2+1]}; int Isize[2]={pm_size_array[L][MO->MI->pidx[j][pp]*2], pm_size_array[L][MO->MI->pidx[j][pp]*2+1]}; /* calculate part box coordinate */ FLOAT *PB = partbox(x,y,ax_array[L][j][pp],ay_array[L][j][pp],scale,padx,pady,PBSIZE,Ix_array[L][j][pp],Iy_array[L][j][pp],Isize); memcpy(P_temp, PB,sizeof(FLOAT)*4); P_temp+=4; s_free(PB); } /* component number and score */ *(P_temp++)=(FLOAT)j; //component number *(P_temp++)=SCORE_array[L][j][x*R_S[0]+y]; //score of good match *P_temp = scale; } // NO NEED TO USE GPU gettimeofday(&tv_box_end, nullptr); tvsub(&tv_box_end, &tv_box_start, &tv); time_box += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0; /* save box information */ if (GMN > 0) Tboxes[count] = t_boxes; else Tboxes[count] = nullptr; b_nums[count]=GMN; count++; detected_boxes+=GMN; //number of detected box /* release */ s_free(GMPC); } ////numcom } ////level /* free temporary arrays */ free(dst_RL); free(RL_array); free(dst_RI); free(RI_array); free(dst_OI); free(OI_array); free(dst_RL_S); free(RL_S_array); free(dst_OFF); free(OFF_array); free(dst_SCORE); free(sub_dst_SCORE); free(SCORE_array); free(dst_ax); free(sub_dst_ax); free(ax_array); free(dst_ay); free(sub_dst_ay); free(ay_array); free(Ix_array[0][0][0]); free(dst_Ix); free(sub_dst_Ix); free(Ix_array); free(Iy_array[0][0][0]); free(dst_Iy); free(sub_dst_Iy); free(Iy_array); free(scale_array); gettimeofday(&tv_nucom_end, nullptr); #ifdef PRINT_INFO printf("root SCORE : %f\n", time_root_score); printf("part SCORE : %f\n", time_part_score); printf("dt : %f\n", time_dt); printf("calc_a_score : %f\n", time_calc_a_score); #endif res = cuCtxSetCurrent(ctx[0]); if(res != CUDA_SUCCESS) { printf("cuCtxSetCurrent(ctx[0]) failed: res = %s\n",cuda_response_to_string(res)); exit(1); } /* free memory regions */ res = cuMemFreeHost((void *)featp2[0]); if(res != CUDA_SUCCESS) { printf("cuMemFreeHost(featp2[0]) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } s_free(featp2); res = 
cuMemFreeHost((void *)rootmatch[interval][0]); if(res != CUDA_SUCCESS) { printf("cuMemFreeHost(rootmatch[0][0]) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } s_free(rootmatch[0]); s_free(rootmatch); if (partmatch != nullptr) { res = cuMemFreeHost((void *)partmatch[0][0]); if(res != CUDA_SUCCESS) { printf("cuMemFreeHost(partmatch[0][0]) failed: res = %s\n", cuda_response_to_string(res)); exit(1); } s_free(partmatch[0]); s_free(partmatch); s_free(new_PADsize); } /* release */ s_free(rm_size_array[0]); s_free(rm_size_array); s_free(pm_size_array[0]); s_free(pm_size_array); /* Output boundary-box coorinate information */ int GL=(numpart[0]+1)*4+3; FLOAT *boxes=(FLOAT*)calloc(detected_boxes*GL,sizeof(FLOAT)); //box coordinate information(Temp) FLOAT *T1 = boxes; for(int i = 0; i < LofFeat; i++) { int num_t = b_nums[i]*GL; if(num_t > 0) { FLOAT *T2 = Tboxes[i]; //memcpy_s(T1,sizeof(FLOAT)*num_t,T2,sizeof(FLOAT)*num_t); memcpy(T1, T2,sizeof(FLOAT)*num_t); T1 += num_t; } } FLOAT abs_threshold = abs(thresh); /* accumulated score calculation */ FLOAT max_score = 0.0; /* add offset to accumulated score */ for(int i = 0; i < MO->MI->IM_HEIGHT*MO->MI->IM_WIDTH; i++) { if (acc_score[i] < thresh) { acc_score[i] = 0.0; } else { acc_score[i] += abs_threshold; if (acc_score[i] > max_score) max_score = acc_score[i]; } } /* normalization */ if (max_score > 0.0) { FLOAT ac_ratio = 1.0 / max_score; for (int i = 0; i < MO->MI->IM_HEIGHT*MO->MI->IM_WIDTH; i++) { acc_score[i] *= ac_ratio; } } /* release */ free_boxes(Tboxes,LofFeat); s_free(b_nums); /* output result */ *detected_count = detected_boxes; return boxes; }
int cuda_test_memcpy_async(unsigned int size) { int i; CUresult res; CUdevice dev; CUcontext ctx; CUstream stream; CUdeviceptr data_addr; unsigned int *in, *out; struct timeval tv; struct timeval tv_total_start, tv_total_end; unsigned long total; struct timeval tv_h2d_start, tv_h2d_end; float h2d; struct timeval tv_d2h_start, tv_d2h_end; float d2h; gettimeofday(&tv_total_start, NULL); res = cuInit(0); if (res != CUDA_SUCCESS) { printf("cuInit failed: res = %u\n", (unsigned int)res); return -1; } res = cuDeviceGet(&dev, 0); if (res != CUDA_SUCCESS) { printf("cuDeviceGet failed: res = %u\n", (unsigned int)res); return -1; } res = cuCtxCreate(&ctx, 0, dev); if (res != CUDA_SUCCESS) { printf("cuCtxCreate failed: res = %u\n", (unsigned int)res); return -1; } res = cuStreamCreate(&stream, 0); if (res != CUDA_SUCCESS) { printf("cuStreamCreate failed: res = %u\n", (unsigned int)res); return -1; } res = cuMemAlloc(&data_addr, size); if (res != CUDA_SUCCESS) { printf("cuMemAlloc failed: res = %u\n", (unsigned int)res); return -1; } res = cuMemAllocHost((void **)&in, size); if (res != CUDA_SUCCESS) { printf("cuMemAllocHost(in) failed: res = %u\n", (unsigned int)res); return -1; } res = cuMemAllocHost((void **)&out, size); if (res != CUDA_SUCCESS) { printf("cuMemAllocHost(out) failed: res = %u\n", (unsigned int)res); return -1; } for (i = 0; i < size / 4; i++) { in[i] = i+1; out[i] = 0; } gettimeofday(&tv_h2d_start, NULL); res = cuMemcpyHtoDAsync(data_addr, in, size, stream); if (res != CUDA_SUCCESS) { printf("cuMemcpyHtoDAsync failed: res = %u\n", (unsigned int)res); return -1; } res = cuStreamSynchronize(stream); if (res != CUDA_SUCCESS) { printf("cuStreamSynchronize() failed: res = %u\n", (unsigned int)res); return -1; } gettimeofday(&tv_h2d_end, NULL); gettimeofday(&tv_d2h_start, NULL); res = cuMemcpyDtoHAsync(out, data_addr, size, stream); if (res != CUDA_SUCCESS) { printf("cuMemcpyDtoHAsync failed: res = %u\n", (unsigned int)res); return -1; } res = 
cuStreamSynchronize(stream); if (res != CUDA_SUCCESS) { printf("cuStreamSynchronize() failed: res = %u\n", (unsigned int)res); return -1; } gettimeofday(&tv_d2h_end, NULL); for (i = 0; i < size / 4; i++) { if (in[i] != out[i]) { printf("in[%d] = %u, out[%d] = %u\n", i, in[i], i, out[i]); } } res = cuMemFreeHost(out); if (res != CUDA_SUCCESS) { printf("cuMemFreeHost(out) failed: res = %u\n", (unsigned int)res); return -1; } res = cuMemFreeHost(in); if (res != CUDA_SUCCESS) { printf("cuMemFreeHost(in) failed: res = %u\n", (unsigned int)res); return -1; } res = cuMemFree(data_addr); if (res != CUDA_SUCCESS) { printf("cuMemFree failed: res = %u\n", (unsigned int)res); return -1; } res = cuStreamDestroy(stream); if (res != CUDA_SUCCESS) { printf("cuStreamDestroy failed: res = %u\n", (unsigned int)res); return -1; } res = cuCtxDestroy(ctx); if (res != CUDA_SUCCESS) { printf("cuCtxDestroy failed: res = %u\n", (unsigned int)res); return -1; } gettimeofday(&tv_total_end, NULL); tvsub(&tv_h2d_end, &tv_h2d_start, &tv); h2d = tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0; tvsub(&tv_d2h_end, &tv_d2h_start, &tv); d2h = tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0; tvsub(&tv_total_end, &tv_total_start, &tv); total = tv.tv_sec * 1000 + tv.tv_usec / 1000; printf("HtoD: %f\n", h2d); printf("DtoH: %f\n", d2h); return 0; end: return -1; }
/*
 * Release a buffer with cuMemFreeHost inside the GPU context stored in `c`.
 *
 * buffers: pointer handed to cuMemFreeHost (presumably obtained from the
 *          matching allocation routine of this library - TODO confirm).
 * c:       context whose acc_context is cast to gpu_context to reach pCtx.
 *
 * Returns GIB_SUC when the end of the function is reached.
 * NOTE(review): what ERROR_CHECK_FAIL does on a failing driver call is not
 * visible here - confirm against its definition.
 */
int gib_free ( void *buffers, gib_context c ) {
	/* make the stored CUDA context current for the free below */
	ERROR_CHECK_FAIL(cuCtxPushCurrent(((gpu_context)(c->acc_context))->pCtx));
	ERROR_CHECK_FAIL(cuMemFreeHost(buffers));
	/* pop the context again; the popped handle is written back into pCtx */
	ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
	return GIB_SUC;
}
int main() { CU_ERROR_CHECK(cuInit(0)); int count; CU_ERROR_CHECK(cuDeviceGetCount(&count)); count = (count > 2) ? 2 : count; CUdevice devices[count]; for (int i = 0; i < count; i++) CU_ERROR_CHECK(cuDeviceGet(&devices[i], i)); // Question 1: Can you create multiple contexts on the same device? { fprintf(stderr, "Attempting to create multiple contexts on each device...\n"); CUcontext contexts[count * N]; size_t j = 0; for (int i = 0; i < count; i++) { CUresult error = CUDA_SUCCESS; size_t k; for (k = 0; k < N && error == CUDA_SUCCESS; k++) { error = cuCtxCreate(&contexts[j], CU_CTX_SCHED_AUTO, devices[i]); if (error == CUDA_SUCCESS) CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[j++])); } fprintf(stderr, " created %zu contexts on device %d before cuCtxCreate returned \"%s\"\n", (k - 1), i, cuGetErrorString(error)); } CUresult error = CUDA_SUCCESS; size_t k; for (k = 0; k < j && error == CUDA_SUCCESS; k++) error = cuCtxPushCurrent(contexts[k]); if (error == CUDA_SUCCESS) fprintf(stderr, " successfully pushed %zu contexts with cuCtxPushCurrent\n", k); else fprintf(stderr, " pushed %zu contexts before cuCtxPushCurrent returned \"%s\"\n", (k - 1), cuGetErrorString(error)); for (size_t k = 0; k < j; k++) CU_ERROR_CHECK(cuCtxDestroy(contexts[k])); fprintf(stderr, "\n"); } CUcontext contexts[count][2]; for (int i = 0; i < count; i++) { for (size_t j = 0; j < 2; j++) { CU_ERROR_CHECK(cuCtxCreate(&contexts[i][j], CU_CTX_SCHED_AUTO, devices[i])); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[i][j])); } } // Question 2: Can you access a host pointer in a different context from // which it was created? // Question 3: Can you free a host pointer in a different context from which // it was created? 
{ void * hPtr; CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); CU_ERROR_CHECK(cuMemAllocHost(&hPtr, 1024)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); CUdeviceptr dPtr[count]; CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1])); CU_ERROR_CHECK(cuMemAlloc(&dPtr[0], 1024)); // Different context, same device fprintf(stderr, "Accessing a host pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(cuMemcpyHtoD(dPtr[0], hPtr, 1024))); CU_ERROR_CHECK(cuMemFree(dPtr[0])); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1])); if (count > 1) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0])); CU_ERROR_CHECK(cuMemAlloc(&dPtr[1], 1024)); // Different context, different device fprintf(stderr, "Accessing a host pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(cuMemcpyHtoD(dPtr[1], hPtr, 1024))); CU_ERROR_CHECK(cuMemFree(dPtr[1])); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0])); } fprintf(stderr, "\n"); CUresult error = CUDA_ERROR_UNKNOWN; if (count > 1) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0])); error = cuMemFreeHost(hPtr); fprintf(stderr, "Freeing a host pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(error)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0])); } if (error != CUDA_SUCCESS) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1])); error = cuMemFreeHost(hPtr); fprintf(stderr, "Freeing a host pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(error)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1])); } if (error != CUDA_SUCCESS) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); error = cuMemFreeHost(hPtr); fprintf(stderr, "Freeing a host pointer from the same context to which it was allocated returns \"%s\"\n", cuGetErrorString(error)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); } 
fprintf(stderr, "\n"); } // Question 4: Can you access a device pointer in a different context from // which it was created? // Question 5: Can you free a device pointer in a different context from which // it was created? { CUdeviceptr dPtr[count][2]; CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); CU_ERROR_CHECK(cuMemAlloc(&dPtr[0][0], 1024)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1])); CU_ERROR_CHECK(cuMemAlloc(&dPtr[0][1], 1024)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1])); CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1])); fprintf(stderr, "Accessing a device pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(cuMemcpyDtoD(dPtr[0][0], dPtr[0][1], 1024))); CU_ERROR_CHECK(cuMemFree(dPtr[0][1])); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1])); if (count > 1) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0])); CU_ERROR_CHECK(cuMemAlloc(&dPtr[1][0], 1024)); // Different context, different device fprintf(stderr, "Accessing a device pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(cuMemcpyDtoD(dPtr[0][0], dPtr[1][0], 1024))); CU_ERROR_CHECK(cuMemFree(dPtr[1][0])); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0])); } fprintf(stderr, "\n"); CUresult error = CUDA_ERROR_UNKNOWN; if (count > 1) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0])); error = cuMemFree(dPtr[0][0]); fprintf(stderr, "Freeing a device pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(error)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0])); } if (error != CUDA_SUCCESS) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1])); error = cuMemFree(dPtr[0][0]); fprintf(stderr, "Freeing a device pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(error)); 
CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1])); } if (error != CUDA_SUCCESS) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); error = cuMemFree(dPtr[0][0]); fprintf(stderr, "Freeing a device pointer from the same context to which it was allocated returns \"%s\"\n", cuGetErrorString(error)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); } fprintf(stderr, "\n"); } // Question 6: Can you access a module in a different context from which it // was loaded? // Question 7: Can you unload a module in a different context from which it // was loaded? { CUmodule module; CUdeviceptr ptr; CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); CU_ERROR_CHECK(cuModuleLoad(&module, "kernel-test.ptx")); CU_ERROR_CHECK(cuMemAlloc(&ptr, sizeof(float))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); CUfunction function = 0; if (count > 0) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0])); fprintf(stderr, "Getting a function pointer from a different context to which it was loaded (on a different device) returns \"%s\"\n", cuGetErrorString(cuModuleGetFunction(&function, module, "kernel"))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0])); } if (function == 0) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1])); fprintf(stderr, "Getting a function pointer from a different context to which it was loaded (on the same device) returns \"%s\"\n", cuGetErrorString(cuModuleGetFunction(&function, module, "kernel"))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1])); } if (function == 0) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); fprintf(stderr, "Getting a function pointer from the same context to which it was loaded returns \"%s\"\n", cuGetErrorString(cuModuleGetFunction(&function, module, "kernel"))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); } fprintf(stderr, "\n"); CUdeviceptr a, b; CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); CU_ERROR_CHECK(cuMemAlloc(&a, sizeof(float))); CU_ERROR_CHECK(cuMemAlloc(&b, sizeof(float))); 
CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); void * params[] = { &a, & b }; CUresult error = CUDA_ERROR_UNKNOWN; if (count > 0) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0])); fprintf(stderr, "Launching a function from a different context to which it was loaded (on a different device) returns \"%s\"\n", cuGetErrorString(error = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0])); } if (error != CUDA_SUCCESS) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1])); fprintf(stderr, "Launching a function from a different context to which it was loaded (on the same device) returns \"%s\"\n", cuGetErrorString(error = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1])); } if (error != CUDA_SUCCESS) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); fprintf(stderr, "Launching a function from the same context to which it was loaded returns \"%s\"\n", cuGetErrorString(error = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); } fprintf(stderr, "\n"); error = CUDA_ERROR_UNKNOWN; if (count > 0) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0])); fprintf(stderr, "Unloading a module from a different context to which it was loaded (on a different device) returns \"%s\"\n", cuGetErrorString(error = cuModuleUnload(module))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0])); } if (error != CUDA_SUCCESS) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1])); fprintf(stderr, "Unloading a module from a different context to which it was loaded (on the same device) returns \"%s\"\n", cuGetErrorString(error = cuModuleUnload(module))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1])); } if (error != CUDA_SUCCESS) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); fprintf(stderr, "Unloading a module from the same context to which it was loaded returns \"%s\"\n", cuGetErrorString(error = 
cuModuleUnload(module))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); } CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); CU_ERROR_CHECK(cuMemFree(a)); CU_ERROR_CHECK(cuMemFree(b)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); } for (int i = 0; i < count; i++) { for (size_t j = 0; j < 2; j++) CU_ERROR_CHECK(cuCtxDestroy(contexts[i][j])); } return 0; }