CAMLprim value spoc_cuda_flush(value gi, value dev, value q)
{
  CAMLparam3(gi, dev, q);
  cuda_event_list *events;
  events = (cuda_event_list*)(Field(dev, 3));
  CUDA_GET_CONTEXT;
  cuStreamSynchronize(queue[Int_val(q)]);
  /*while (events != NULL && events->next != NULL)
    {
      if (events != NULL)
        {
          CUDA_CHECK_CALL(cuEventSynchronize(events->evt));
          if (events->vec)
            cuMemFree(events->vec);
          CUDA_CHECK_CALL(cuEventDestroy(events->evt));
        }
      {
        cuda_event_list *tmp = events;
        events = events->next;
        if (tmp != NULL) free(tmp);
      }
    }
  if (events) free(events);
  events = NULL;*/
  CUDA_RESTORE_CONTEXT;
  Store_field(dev, 3, (value)events);
  CAMLreturn(Val_unit);
}
static void
nvptx_wait_all (void)
{
  CUresult r;
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  /* Wait for active streams initiated by this thread (or by multiple
     threads) to complete.  */
  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if (s->multithreaded || pthread_equal (s->host_thread, self))
        {
          r = cuStreamQuery (s->stream);
          if (r == CUDA_SUCCESS)
            continue;
          else if (r != CUDA_ERROR_NOT_READY)
            GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

          r = cuStreamSynchronize (s->stream);
          if (r != CUDA_SUCCESS)
            GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
        }
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);
}
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  #ifdef OPENCL_API
    auto queue_plain = queue();
    auto event = cl_event{};
    auto status = Col2im<T>(args.kernel_mode,
                            args.channels, args.height, args.width,
                            args.kernel_h, args.kernel_w,
                            args.pad_h, args.pad_w,
                            args.stride_h, args.stride_w,
                            args.dilation_h, args.dilation_w,
                            buffers.b_mat(), args.b_offset,  // col
                            buffers.a_mat(), args.a_offset,  // im
                            &queue_plain, &event);
    if (status == StatusCode::kSuccess) {
      clWaitForEvents(1, &event);
      clReleaseEvent(event);
    }
  #elif CUDA_API
    auto status = Col2im<T>(args.kernel_mode,
                            args.channels, args.height, args.width,
                            args.kernel_h, args.kernel_w,
                            args.pad_h, args.pad_w,
                            args.stride_h, args.stride_w,
                            args.dilation_h, args.dilation_w,
                            buffers.b_mat(), args.b_offset,  // col
                            buffers.a_mat(), args.a_offset,  // im
                            queue.GetContext()(), queue.GetDevice()());
    cuStreamSynchronize(queue());
  #endif
  return status;
}
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  #ifdef OPENCL_API
    auto queue_plain = queue();
    auto event = cl_event{};
    auto status = Scal(args.n, args.alpha,
                       buffers.x_vec(), args.x_offset, args.x_inc,
                       &queue_plain, &event);
    if (status == StatusCode::kSuccess) {
      clWaitForEvents(1, &event);
      clReleaseEvent(event);
    }
  #elif CUDA_API
    auto status = Scal(args.n, args.alpha,
                       buffers.x_vec(), args.x_offset, args.x_inc,
                       queue.GetContext()(), queue.GetDevice()());
    cuStreamSynchronize(queue());
  #endif
  return status;
}
static void
nvptx_wait (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);
  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamSynchronize (s->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));

  event_gc (true);
}
void CudaModuleScene::onAfterLaunchApexCudaFunc(const ApexCudaFunc& func, CUstream stream)
{
    if (mCudaProfileSession)
    {
        mCudaProfileSession->onFuncFinish(func.getProfileId(), stream);
    }
#if !CUDA_KERNEL_CHECK_ALWAYS
    if (mSceneIntl.getCudaKernelCheckEnabled())
#endif
    {
        CUresult ret = cuStreamSynchronize(stream);
        if (CUDA_SUCCESS != ret)
        {
            APEX_INTERNAL_ERROR("Cuda Error %d after launch of func '%s'", ret, func.getName());
            PX_ALWAYS_ASSERT();
        }
    }
}
bool GLInteropResource::unmap(GLuint tex)
{
    Q_UNUSED(tex);
    if (WORKAROUND_UNMAP_CONTEXT_SWITCH)
        return true;
    int plane = -1;
    if (res[0].texture == tex)
        plane = 0;
    else if (res[1].texture == tex)
        plane = 1;
    else
        return false;
    // FIXME: why does cuCtxPushCurrent give CUDA_ERROR_INVALID_CONTEXT if the OpenGL viewport changed?
    CUDA_WARN(cuCtxPushCurrent(ctx));
    CUDA_WARN(cuStreamSynchronize(res[plane].stream));
    // FIXME: need a correct context. But why do we have to push a context even though map/unmap are called in the same thread?
    // Because the decoder switches the context in another thread, so we have to switch the context back?
    // To work around the context issue, we must pop the context that was valid in map() and push it here.
    CUDA_ENSURE(cuGraphicsUnmapResources(1, &res[plane].cuRes, 0), false);
    CUDA_ENSURE(cuCtxPopCurrent(&ctx), false);
    return true;
}
/*
// Property Message
//
// API
// static int getPathOfFeaturePyramidGPUStream(IplImage * image, float step,
//         int numStep, int startIndex, int sideLength, int bx, int by,
//         CvLSVMFeaturePyramid **maps)
// INPUT
// image
// step
// numStep
// startIndex
// sideLength
// bx
// by
// OUTPUT
// maps
// RESULT
// Error status
*/
static int getPathOfFeaturePyramidGPUStream(IplImage * image, float step,
        int numStep, int startIndex, int sideLength, int bx, int by,
        CvLSVMFeaturePyramid **maps)
{
    CvLSVMFeatureMap **feature_maps;
    int i;
    int width, height, numChannels, sizeX, sizeY, p, pp, newSizeX, newSizeY;
    float *scales;
    CvLSVMFeatureMapGPU **devs_img, **devs_map_pre_norm, **devs_map_pre_pca;
    CUstream *streams;
    CUresult res;

    scales = (float *) malloc(sizeof(float) * (numStep));
    devs_img = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * (numStep));
    devs_map_pre_norm = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * (numStep));
    devs_map_pre_pca = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * (numStep));
    streams = (CUstream *) malloc(sizeof(CUstream) * (numStep));
    feature_maps = (CvLSVMFeatureMap **) malloc(
            sizeof(CvLSVMFeatureMap *) * (numStep));

    // allocate device memory
    for (i = 0; i < numStep; i++)
    {
        scales[i] = 1.0f / powf(step, (float) i);
        width  = (int) (((float) image->width ) * scales[i] + 0.5);
        height = (int) (((float) image->height) * scales[i] + 0.5);
        numChannels = image->nChannels;
        sizeX = width  / sideLength;
        sizeY = height / sideLength;
        p  = NUM_SECTOR * 3;
        pp = NUM_SECTOR * 12;
        newSizeX = sizeX - 2;
        newSizeY = sizeY - 2;
        allocFeatureMapObjectGPU<float>(&devs_img[i], width, height, numChannels);
        allocFeatureMapObjectGPU<float>(&devs_map_pre_norm[i], sizeX, sizeY, p);
        allocFeatureMapObjectGPU<float>(&devs_map_pre_pca[i], newSizeX, newSizeY, pp);
        res = cuStreamCreate(&streams[i], CU_STREAM_DEFAULT);
        CUDA_CHECK(res, "cuStreamCreate(stream)");
    }

    // execute the main functions
    resizeGPUStream(numStep, image, scales, devs_img, streams);
    getFeatureMapsGPUStream(numStep, sideLength, devs_img, devs_map_pre_norm, streams);
    normalizeAndTruncateGPUStream(numStep, Val_Of_Truncate, devs_map_pre_norm, devs_map_pre_pca, streams);
    PCAFeatureMapsGPUStream(numStep, bx, by, devs_map_pre_pca, feature_maps, streams);

    // synchronize and destroy the CUDA streams
    for (i = 0; i < numStep; i++)
    {
        cuStreamSynchronize(streams[i]);
        cuStreamDestroy(streams[i]);
    }

    for (i = 0; i < numStep; i++)
    {
        (*maps)->pyramid[startIndex + i] = feature_maps[i];
    } /* for (i = 0; i < numStep; i++) */

    // free device memory
    for (i = 0; i < numStep; i++)
    {
        freeFeatureMapObjectGPU(&devs_img[i]);
        freeFeatureMapObjectGPU(&devs_map_pre_norm[i]);
        freeFeatureMapObjectGPU(&devs_map_pre_pca[i]);
    }
    free(scales);
    free(devs_img);
    free(devs_map_pre_norm);
    free(devs_map_pre_pca);
    free(streams);
    free(feature_maps);

    return LATENT_SVM_OK;
}
/*
// Feature map normalization and truncation on the GPU
//
// API
// int normalizeAndTruncateGPUStream(const int numStep, const float alfa,
//         CvLSVMFeatureMapGPU **devs_map_in, CvLSVMFeatureMapGPU **devs_map_out,
//         CUstream *streams)
// INPUT
// numStep
// alfa
// devs_map_in
// streams
// OUTPUT
// devs_map_out
// RESULT
// Error status
*/
int normalizeAndTruncateGPUStream(const int numStep, const float alfa,
        CvLSVMFeatureMapGPU **devs_map_in, CvLSVMFeatureMapGPU **devs_map_out,
        CUstream *streams)
{
    int sizeX, sizeY, newSizeX, newSizeY, pp;
    int size_norm, size_map_out;
    int i;
    CUresult res;
    CvLSVMFeatureMapGPU **devs_norm;

    pp = NUM_SECTOR * 12;

    devs_norm = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * (numStep));

    // allocate device memory
    for (i = 0; i < numStep; i++)
    {
        sizeX = devs_map_in[i]->sizeX;
        sizeY = devs_map_in[i]->sizeY;
        newSizeX = sizeX - 2;
        newSizeY = sizeY - 2;
        allocFeatureMapObjectGPU<float>(&devs_norm[i], sizeX, sizeY, 1);
    }

    // execute asynchronously
    for (i = 0; i < numStep; i++)
    {
        sizeX = devs_map_in[i]->sizeX;
        sizeY = devs_map_in[i]->sizeY;
        newSizeX = sizeX - 2;
        newSizeY = sizeY - 2;
        size_norm = sizeX * sizeY;
        size_map_out = newSizeX * newSizeY * pp;

        // initialize device memory to 0
        res = cuMemsetD32Async(devs_norm[i]->map, 0, size_norm, streams[i]);
        CUDA_CHECK(res, "cuMemset(dev_norm)");
        res = cuMemsetD32Async(devs_map_out[i]->map, 0, size_map_out, streams[i]);
        CUDA_CHECK(res, "cuMemset(dev_map_out)");

        // launch kernel
        calculateNormGPULaunch(devs_map_in[i], devs_norm[i], streams[i]);
    }

    for (i = 0; i < numStep; i++)
    {
        // launch kernel
        normalizeGPULaunch(alfa, devs_map_in[i], devs_norm[i], devs_map_out[i], streams[i]);
    }

    // synchronize the CUDA streams
    for (i = 0; i < numStep; i++)
    {
        cuStreamSynchronize(streams[i]);
    }

    // free device memory
    for (i = 0; i < numStep; i++)
    {
        freeFeatureMapObjectGPU(&devs_norm[i]);
    }
    free(devs_norm);

    return LATENT_SVM_OK;
}
int cuda_test_memcpy_async(unsigned int size)
{
    int i;
    CUresult res;
    CUdevice dev;
    CUcontext ctx;
    CUstream stream;
    CUdeviceptr data_addr;
    unsigned int *in, *out;
    struct timeval tv;
    struct timeval tv_total_start, tv_total_end;
    unsigned long total;
    struct timeval tv_h2d_start, tv_h2d_end;
    float h2d;
    struct timeval tv_d2h_start, tv_d2h_end;
    float d2h;

    gettimeofday(&tv_total_start, NULL);

    res = cuInit(0);
    if (res != CUDA_SUCCESS) {
        printf("cuInit failed: res = %u\n", (unsigned int)res);
        return -1;
    }

    res = cuDeviceGet(&dev, 0);
    if (res != CUDA_SUCCESS) {
        printf("cuDeviceGet failed: res = %u\n", (unsigned int)res);
        return -1;
    }

    res = cuCtxCreate(&ctx, 0, dev);
    if (res != CUDA_SUCCESS) {
        printf("cuCtxCreate failed: res = %u\n", (unsigned int)res);
        return -1;
    }

    res = cuStreamCreate(&stream, 0);
    if (res != CUDA_SUCCESS) {
        printf("cuStreamCreate failed: res = %u\n", (unsigned int)res);
        return -1;
    }

    res = cuMemAlloc(&data_addr, size);
    if (res != CUDA_SUCCESS) {
        printf("cuMemAlloc failed: res = %u\n", (unsigned int)res);
        return -1;
    }

    res = cuMemAllocHost((void **)&in, size);
    if (res != CUDA_SUCCESS) {
        printf("cuMemAllocHost(in) failed: res = %u\n", (unsigned int)res);
        return -1;
    }

    res = cuMemAllocHost((void **)&out, size);
    if (res != CUDA_SUCCESS) {
        printf("cuMemAllocHost(out) failed: res = %u\n", (unsigned int)res);
        return -1;
    }

    for (i = 0; i < size / 4; i++) {
        in[i] = i + 1;
        out[i] = 0;
    }

    gettimeofday(&tv_h2d_start, NULL);
    res = cuMemcpyHtoDAsync(data_addr, in, size, stream);
    if (res != CUDA_SUCCESS) {
        printf("cuMemcpyHtoDAsync failed: res = %u\n", (unsigned int)res);
        return -1;
    }
    res = cuStreamSynchronize(stream);
    if (res != CUDA_SUCCESS) {
        printf("cuStreamSynchronize() failed: res = %u\n", (unsigned int)res);
        return -1;
    }
    gettimeofday(&tv_h2d_end, NULL);

    gettimeofday(&tv_d2h_start, NULL);
    res = cuMemcpyDtoHAsync(out, data_addr, size, stream);
    if (res != CUDA_SUCCESS) {
        printf("cuMemcpyDtoHAsync failed: res = %u\n", (unsigned int)res);
        return -1;
    }
    res = cuStreamSynchronize(stream);
    if (res != CUDA_SUCCESS) {
        printf("cuStreamSynchronize() failed: res = %u\n", (unsigned int)res);
        return -1;
    }
    gettimeofday(&tv_d2h_end, NULL);

    for (i = 0; i < size / 4; i++) {
        if (in[i] != out[i]) {
            printf("in[%d] = %u, out[%d] = %u\n", i, in[i], i, out[i]);
        }
    }

    res = cuMemFreeHost(out);
    if (res != CUDA_SUCCESS) {
        printf("cuMemFreeHost(out) failed: res = %u\n", (unsigned int)res);
        return -1;
    }

    res = cuMemFreeHost(in);
    if (res != CUDA_SUCCESS) {
        printf("cuMemFreeHost(in) failed: res = %u\n", (unsigned int)res);
        return -1;
    }

    res = cuMemFree(data_addr);
    if (res != CUDA_SUCCESS) {
        printf("cuMemFree failed: res = %u\n", (unsigned int)res);
        return -1;
    }

    res = cuStreamDestroy(stream);
    if (res != CUDA_SUCCESS) {
        printf("cuStreamDestroy failed: res = %u\n", (unsigned int)res);
        return -1;
    }

    res = cuCtxDestroy(ctx);
    if (res != CUDA_SUCCESS) {
        printf("cuCtxDestroy failed: res = %u\n", (unsigned int)res);
        return -1;
    }

    gettimeofday(&tv_total_end, NULL);

    tvsub(&tv_h2d_end, &tv_h2d_start, &tv);
    h2d = tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
    tvsub(&tv_d2h_end, &tv_d2h_start, &tv);
    d2h = tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
    tvsub(&tv_total_end, &tv_total_start, &tv);
    total = tv.tv_sec * 1000 + tv.tv_usec / 1000;

    printf("HtoD: %f\n", h2d);
    printf("DtoH: %f\n", d2h);

    return 0;
}
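/* A minimal, hedged sketch (not part of the test above): the numeric
 * "res = %u" reports could be made human-readable with the documented
 * driver-API call cuGetErrorString (available since CUDA 6.0). The helper
 * name cu_perror is hypothetical. */
#include <stdio.h>
#include <cuda.h>

static void cu_perror(const char *what, CUresult res)
{
    const char *msg = NULL;
    /* cuGetErrorString sets msg to a static, human-readable description. */
    if (cuGetErrorString(res, &msg) != CUDA_SUCCESS || msg == NULL)
        msg = "unrecognized CUDA error";
    printf("%s failed: %s (res = %u)\n", what, msg, (unsigned int)res);
}

/* Usage, mirroring the checks in the test above:
 *   res = cuStreamSynchronize(stream);
 *   if (res != CUDA_SUCCESS) { cu_perror("cuStreamSynchronize", res); return -1; }
 */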
void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
            int async, unsigned *dims, void *targ_mem_desc)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  CUresult r;
  int i;
  struct ptx_stream *dev_str;
  void *kargs[1];
  void *hp, *dp;
  struct nvptx_thread *nvthd = nvptx_thread ();
  const char *maybe_abort_msg = "(perhaps abort was called)";

  function = targ_fn->fn;

  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
  assert (dev_str == nvthd->current_stream);

  /* Initialize the launch dimensions.  Typically this is constant,
     provided by the device compiler, but we must permit runtime
     values.  */
  for (i = 0; i != 3; i++)
    if (targ_fn->launch->dim[i])
      dims[i] = targ_fn->launch->dim[i];

  /* This reserves a chunk of a pre-allocated page of memory mapped on both
     the host and the device.  HP is a host pointer to the new chunk, and DP is
     the corresponding device pointer.  */
  map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);

  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  /* Copy the array of arguments to the mapped page.  */
  for (i = 0; i < mapnum; i++)
    ((void **) hp)[i] = devaddrs[i];

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
                     " gangs=%u, workers=%u, vectors=%u\n",
                     __FUNCTION__, targ_fn->launch->fn,
                     dims[0], dims[1], dims[2]);

  // OpenACC        CUDA
  //
  // num_gangs      nctaid.x
  // num_workers    ntid.y
  // vector length  ntid.x

  kargs[0] = &dp;
  r = cuLaunchKernel (function,
                      dims[GOMP_DIM_GANG], 1, 1,
                      dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
                      0, dev_str->stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
    {
      r = cuStreamSynchronize (dev_str->stream);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
                           maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
    }
  else
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
                           maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (true);

      r = cuEventRecord (*e, dev_str->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_KNL, e, (void *)dev_str);
    }
#else
  r = cuCtxSynchronize ();
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
                       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
#endif

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
                     targ_fn->launch->fn);

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
#endif
    map_pop (dev_str);
}
bool GLInteropResource::map(int picIndex, const CUVIDPROCPARAMS &param, GLuint tex, int w, int h, int H, int plane)
{
    AutoCtxLock locker((cuda_api*)this, lock);
    Q_UNUSED(locker);
    if (!ensureResource(w, h, H, tex, plane)) // TODO: surface size instead of frame size, because we copy the device data
        return false;
    //CUDA_ENSURE(cuCtxPushCurrent(ctx), false);
    CUdeviceptr devptr;
    unsigned int pitch;
    CUDA_ENSURE(cuvidMapVideoFrame(dec, picIndex, &devptr, &pitch, const_cast<CUVIDPROCPARAMS*>(&param)), false);
    CUVIDAutoUnmapper unmapper(this, dec, devptr);
    Q_UNUSED(unmapper);
    // TODO: why can we not use res[plane].stream? CUDA_ERROR_INVALID_HANDLE
    CUDA_ENSURE(cuGraphicsMapResources(1, &res[plane].cuRes, 0), false);
    CUarray array;
    CUDA_ENSURE(cuGraphicsSubResourceGetMappedArray(&array, res[plane].cuRes, 0, 0), false);

    CUDA_MEMCPY2D cu2d;
    memset(&cu2d, 0, sizeof(cu2d));
    cu2d.srcDevice = devptr;
    cu2d.srcMemoryType = CU_MEMORYTYPE_DEVICE;
    cu2d.srcPitch = pitch;
    cu2d.dstArray = array;
    cu2d.dstMemoryType = CU_MEMORYTYPE_ARRAY;
    cu2d.dstPitch = pitch; // the whole size or copy size?
    cu2d.WidthInBytes = pitch;
    cu2d.Height = h;
    if (plane == 1) {
        cu2d.srcXInBytes = 0; // + srcY*srcPitch + srcXInBytes
        cu2d.srcY = H; // skip the padding height
        cu2d.Height /= 2;
    }
    if (res[plane].stream)
        CUDA_ENSURE(cuMemcpy2DAsync(&cu2d, res[plane].stream), false);
    else
        CUDA_ENSURE(cuMemcpy2D(&cu2d), false);
    // TODO: delay cuCtxSynchronize && unmap; do it in unmap(tex)?
    // Mapping an already mapped resource will crash, and sometimes the resource can not be
    // unmapped in unmap(tex) because of a context switch error, so simply unmap it here.
    if (WORKAROUND_UNMAP_CONTEXT_SWITCH) {
        if (res[plane].stream) {
            //CUDA_WARN(cuCtxSynchronize(), false); // waits too long? use cuStreamQuery?
            CUDA_WARN(cuStreamSynchronize(res[plane].stream)); // slower than cuCtxSynchronize
        }
        /*
         * This function provides the synchronization guarantee that any CUDA work issued
         * in \p stream before ::cuGraphicsUnmapResources() will complete before any
         * subsequently issued graphics work begins.
         * The graphics API from which \p resources were registered
         * should not access any resources while they are mapped by CUDA. If an
         * application does so, the results are undefined.
         */
        CUDA_ENSURE(cuGraphicsUnmapResources(1, &res[plane].cuRes, 0), false);
    } else {
        // call it last; the current context will be used by other cuda calls (unmap() for example)
        CUDA_ENSURE(cuCtxPopCurrent(&ctx), false); // not required
    }
    return true;
}
bool EGLInteropResource::map(int picIndex, const CUVIDPROCPARAMS &param, GLuint tex, int w, int h, int H, int plane)
{
    // plane is always 0 because the frame is rgb
    AutoCtxLock locker((cuda_api*)this, lock);
    Q_UNUSED(locker);
    if (!ensureResource(w, h, param.Reserved[0], H, tex)) // TODO: surface size instead of frame size, because we copy the device data
        return false;
    //CUDA_ENSURE(cuCtxPushCurrent(ctx), false);
    CUdeviceptr devptr;
    unsigned int pitch;
    CUDA_ENSURE(cuvidMapVideoFrame(dec, picIndex, &devptr, &pitch, const_cast<CUVIDPROCPARAMS*>(&param)), false);
    CUVIDAutoUnmapper unmapper(this, dec, devptr);
    Q_UNUSED(unmapper);
    // TODO: why can we not use res[plane].stream? CUDA_ERROR_INVALID_HANDLE
    CUDA_ENSURE(cuGraphicsMapResources(1, &res[plane].cuRes, 0), false);
    CUarray array;
    CUDA_ENSURE(cuGraphicsSubResourceGetMappedArray(&array, res[plane].cuRes, 0, 0), false);
    CUDA_ENSURE(cuGraphicsUnmapResources(1, &res[plane].cuRes, 0), false); // mapped array still accessible!

    CUDA_MEMCPY2D cu2d;
    memset(&cu2d, 0, sizeof(cu2d));
    // Y plane
    cu2d.srcDevice = devptr;
    cu2d.srcMemoryType = CU_MEMORYTYPE_DEVICE;
    cu2d.srcPitch = pitch;
    cu2d.dstArray = array;
    cu2d.dstMemoryType = CU_MEMORYTYPE_ARRAY;
    cu2d.dstPitch = pitch; // the whole size or copy size?
    cu2d.WidthInBytes = res[plane].W; // the same value as texture9_nv12
    cu2d.Height = H*3/2;
    if (res[plane].stream)
        CUDA_ENSURE(cuMemcpy2DAsync(&cu2d, res[plane].stream), false);
    else
        CUDA_ENSURE(cuMemcpy2D(&cu2d), false);
    // TODO: delay cuCtxSynchronize && unmap; do it in unmap(tex)?
    // Mapping an already mapped resource will crash, and sometimes the resource can not be
    // unmapped in unmap(tex) because of a context switch error, so simply unmap it here.
    if (WORKAROUND_UNMAP_CONTEXT_SWITCH) {
        if (res[plane].stream) {
            //CUDA_WARN(cuCtxSynchronize(), false); // waits too long? use cuStreamQuery?
            CUDA_WARN(cuStreamSynchronize(res[plane].stream)); // slower than cuCtxSynchronize
        }
        /*
         * This function provides the synchronization guarantee that any CUDA work issued
         * in \p stream before ::cuGraphicsUnmapResources() will complete before any
         * subsequently issued graphics work begins.
         * The graphics API from which \p resources were registered
         * should not access any resources while they are mapped by CUDA. If an
         * application does so, the results are undefined.
         */
        // CUDA_ENSURE(cuGraphicsUnmapResources(1, &res[plane].cuRes, 0), false);
    }
    D3DLOCKED_RECT rect_src, rect_dst;
    DX_ENSURE(texture9_nv12->LockRect(0, &rect_src, NULL, D3DLOCK_READONLY), false);
    DX_ENSURE(surface9_nv12->LockRect(&rect_dst, NULL, D3DLOCK_DISCARD), false);
    memcpy(rect_dst.pBits, rect_src.pBits, res[plane].W*H*3/2); // exactly w and h
    DX_ENSURE(surface9_nv12->UnlockRect(), false);
    DX_ENSURE(texture9_nv12->UnlockRect(0), false);
#if 0
    //IDirect3DSurface9 *raw_surface = NULL;
    //DX_ENSURE(texture9_nv12->GetSurfaceLevel(0, &raw_surface), false);
    const RECT src = { 0, 0, w, h*3/2 };
    DX_ENSURE(device9->StretchRect(raw_surface, &src, surface9_nv12, NULL, D3DTEXF_NONE), false);
#endif
    if (!map(surface9_nv12, tex, w, h, H))
        return false;
    return true;
}
void device_t<CUDA>::finish(){
  OCCA_CUDA_CHECK("Device: Finish",
                  cuStreamSynchronize(*((CUstream*) dev->currentStream)));
}
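// Caller-side sketch (hypothetical, not part of the snippet above): finish()
// is the blocking fence a host thread calls after asynchronous launches, e.g.
//   device.finish();  // returns only once every kernel and copy queued on
//                     // dev->currentStream has completed on the GPU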
static void vq_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
    VirtQueueElement elem;

    while (virtqueue_pop(vq, &elem)) {
        struct param *p = elem.out_sg[0].iov_base;

        // for all library routines: get the required arguments from the buffer,
        // execute, and push the results back into the virtqueue
        switch (p->syscall_type) {
        case CUINIT: {
            p->result = cuInit(p->flags);
            break;
        }
        case CUDRIVERGETVERSION: {
            p->result = cuDriverGetVersion(&p->val1);
            break;
        }
        case CUDEVICEGETCOUNT: {
            p->result = cuDeviceGetCount(&p->val1);
            break;
        }
        case CUDEVICEGET: {
            p->result = cuDeviceGet(&p->device, p->val1);
            break;
        }
        case CUDEVICECOMPUTECAPABILITY: {
            p->result = cuDeviceComputeCapability(&p->val1, &p->val2, p->device);
            break;
        }
        case CUDEVICEGETNAME: {
            p->result = cuDeviceGetName(elem.in_sg[0].iov_base, p->val1, p->device);
            break;
        }
        case CUDEVICEGETATTRIBUTE: {
            p->result = cuDeviceGetAttribute(&p->val1, p->attrib, p->device);
            break;
        }
        case CUCTXCREATE: {
            p->result = cuCtxCreate(&p->ctx, p->flags, p->device);
            break;
        }
        case CUCTXDESTROY: {
            p->result = cuCtxDestroy(p->ctx);
            break;
        }
        case CUCTXGETCURRENT: {
            p->result = cuCtxGetCurrent(&p->ctx);
            break;
        }
        case CUCTXGETDEVICE: {
            p->result = cuCtxGetDevice(&p->device);
            break;
        }
        case CUCTXPOPCURRENT: {
            p->result = cuCtxPopCurrent(&p->ctx);
            break;
        }
        case CUCTXSETCURRENT: {
            p->result = cuCtxSetCurrent(p->ctx);
            break;
        }
        case CUCTXSYNCHRONIZE: {
            p->result = cuCtxSynchronize();
            break;
        }
        case CUMODULELOAD: {
            // hardcoded path - needs improvement
            // all .cubin files should be stored in $QEMU_NFS_PATH - currently
            // $QEMU_NFS_PATH is shared between host and guest with NFS
            char *binname = malloc(strlen((char *)elem.out_sg[1].iov_base) +
                                   strlen(getenv("QEMU_NFS_PATH")) + 1);
            if (!binname) {
                p->result = 0;
                break;
            }
            strcpy(binname, getenv("QEMU_NFS_PATH"));
            strcat(binname, (char *)elem.out_sg[1].iov_base);
            // change the current CUDA context
            // each CUDA context has its own virtual memory space - isolation
            // is ensured by switching contexts
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuModuleLoad(&p->module, binname);
            free(binname);
            break;
        }
        case CUMODULEGETGLOBAL: {
            char *name = malloc(100 * sizeof(char));
            if (!name) {
                p->result = 999;
                break;
            }
            strcpy(name, (char *)elem.out_sg[1].iov_base);
            p->result = cuModuleGetGlobal(&p->dptr, &p->size1, p->module, (const char *)name);
            break;
        }
        case CUMODULEUNLOAD: {
            p->result = cuModuleUnload(p->module);
            break;
        }
        case CUMEMALLOC: {
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuMemAlloc(&p->dptr, p->bytesize);
            break;
        }
        case CUMEMALLOCPITCH: {
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuMemAllocPitch(&p->dptr, &p->size3, p->size1, p->size2, p->bytesize);
            break;
        }
        // large buffers are allocated in smaller chunks in guest kernel space;
        // get each chunk separately and copy it to device memory
        case CUMEMCPYHTOD: {
            int i;
            size_t offset;
            unsigned long s, nr_pages = p->nr_pages;
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            offset = 0;
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.out_sg[1+2*i+1].iov_base;
                p->result = cuMemcpyHtoD(p->dptr + offset, elem.out_sg[1+2*i].iov_base, s);
                if (p->result != 0)
                    break;
                offset += s;
            }
            break;
        }
        case CUMEMCPYHTODASYNC: {
            int i;
            size_t offset;
            unsigned long s, nr_pages = p->nr_pages;
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            offset = 0;
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.out_sg[1+2*i+1].iov_base;
                p->result = cuMemcpyHtoDAsync(p->dptr + offset, elem.out_sg[1+2*i].iov_base, s, p->stream);
                if (p->result != 0)
                    break;
                offset += s;
            }
            break;
        }
        case CUMEMCPYDTODASYNC: {
            p->result = cuMemcpyDtoDAsync(p->dptr, p->dptr1, p->size1, p->stream);
            break;
        }
        case CUMEMCPYDTOH: {
            int i;
            unsigned long s, nr_pages = p->nr_pages;
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            size_t offset = 0;
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.in_sg[0+2*i+1].iov_base;
                p->result = cuMemcpyDtoH(elem.in_sg[0+2*i].iov_base, p->dptr + offset, s);
                if (p->result != 0)
                    break;
                offset += s;
            }
            break;
        }
        case CUMEMCPYDTOHASYNC: {
            int i;
            unsigned long s, nr_pages = p->nr_pages;
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            size_t offset = 0;
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.in_sg[0+2*i+1].iov_base;
                p->result = cuMemcpyDtoHAsync(elem.in_sg[0+2*i].iov_base, p->dptr + offset, s, p->stream);
                if (p->result != 0)
                    break;
                offset += s;
            }
            break;
        }
        case CUMEMSETD32: {
            p->result = cuMemsetD32(p->dptr, p->bytecount, p->bytesize);
            break;
        }
        case CUMEMFREE: {
            p->result = cuMemFree(p->dptr);
            break;
        }
        case CUMODULEGETFUNCTION: {
            char *name = (char *)elem.out_sg[1].iov_base;
            name[p->length] = '\0';
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuModuleGetFunction(&p->function, p->module, name);
            break;
        }
        case CULAUNCHKERNEL: {
            void **args = malloc(p->val1 * sizeof(void *));
            if (!args) {
                p->result = 9999;
                break;
            }
            int i;
            for (i = 0; i < p->val1; i++) {
                args[i] = elem.out_sg[1+i].iov_base;
            }
            if (cuCtxSetCurrent(p->ctx) != 0) {
                free(args);
                p->result = 999;
                break;
            }
            p->result = cuLaunchKernel(p->function,
                                       p->gridDimX, p->gridDimY, p->gridDimZ,
                                       p->blockDimX, p->blockDimY, p->blockDimZ,
                                       p->bytecount, 0, args, 0);
            free(args);
            break;
        }
        case CUEVENTCREATE: {
            p->result = cuEventCreate(&p->event1, p->flags);
            break;
        }
        case CUEVENTDESTROY: {
            p->result = cuEventDestroy(p->event1);
            break;
        }
        case CUEVENTRECORD: {
            p->result = cuEventRecord(p->event1, p->stream);
            break;
        }
        case CUEVENTSYNCHRONIZE: {
            p->result = cuEventSynchronize(p->event1);
            break;
        }
        case CUEVENTELAPSEDTIME: {
            p->result = cuEventElapsedTime(&p->pMilliseconds, p->event1, p->event2);
            break;
        }
        case CUSTREAMCREATE: {
            p->result = cuStreamCreate(&p->stream, 0);
            break;
        }
        case CUSTREAMSYNCHRONIZE: {
            p->result = cuStreamSynchronize(p->stream);
            break;
        }
        case CUSTREAMQUERY: {
            p->result = cuStreamQuery(p->stream);
            break;
        }
        case CUSTREAMDESTROY: {
            p->result = cuStreamDestroy(p->stream);
            break;
        }
        default:
            printf("Unknown syscall_type\n");
        }
        virtqueue_push(vq, &elem, 0);
    }

    // notify the frontend - trigger a virtual interrupt
    virtio_notify(vdev, vq);
    return;
}
/// Blocks until all previously queued commands in command_queue are issued to
/// the associated device and have completed.
void finish() const {
    ctx.set_current();
    cuda_check( cuStreamSynchronize( s.get() ) );
}
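/* A minimal sketch of the wrapper pattern the vexcl and OCCA examples above
 * share: own a CUstream and expose a blocking finish(). All names here are
 * hypothetical, and an initialized CUDA context is assumed to be current. */
#include <cuda.h>
#include <stdexcept>

class StreamWrapper {
    CUstream s_ = nullptr;
    static void check(CUresult r) {
        if (r != CUDA_SUCCESS) throw std::runtime_error("CUDA driver call failed");
    }
public:
    StreamWrapper()  { check(cuStreamCreate(&s_, CU_STREAM_DEFAULT)); }
    ~StreamWrapper() { cuStreamDestroy(s_); }  // best effort; never throw from a destructor
    CUstream get() const { return s_; }
    // Blocks the calling host thread until all work queued on the stream completes.
    void finish() const { check(cuStreamSynchronize(s_)); }
};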