static void map_init (struct ptx_stream *s) { CUresult r; int size = getpagesize (); assert (s); assert (!s->d); assert (!s->h); r = cuMemAllocHost (&s->h, size); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r)); r = cuMemHostGetDevicePointer (&s->d, s->h, 0); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r)); assert (s->h); s->h_begin = s->h; s->h_end = s->h_begin + size; s->h_next = s->h_prev = s->h_tail = s->h_begin; assert (s->h_next); assert (s->h_end); }
/*
 * Wrap an existing driver-API context CTX in a freshly allocated
 * cuda_context object (refcount 1).  Sets up the compute and memory
 * streams, a 16-byte pinned error buffer, and the extcopy kernel cache.
 * Returns the new object, or NULL on any failure (with all partially
 * acquired resources released via the goto ladder below).
 */
void *cuda_make_ctx(CUcontext ctx, int flags) {
  cuda_context *res;
  void *p;

  res = malloc(sizeof(*res));
  if (res == NULL)
    return NULL;
  res->ctx = ctx;
  res->err = CUDA_SUCCESS;
  res->blas_handle = NULL;
  res->refcnt = 1;
  res->flags = flags;
  res->enter = 0;
  res->freeblocks = NULL;
  /* NOTE(review): `err` is not declared in this function — presumably a
     file-scope CUresult shared with detect_arch; confirm against the rest
     of the file.  detect_arch fills res->bin_id with the device's
     architecture identifier (nonzero return = failure). */
  if (detect_arch(ARCH_PREFIX, res->bin_id, &err)) {
    goto fail_cache;
  }
  /* LRU cache of generated "extcopy" kernels, keyed/hashed/freed by the
     extcopy_* callbacks; values are kernels released via cuda_freekernel. */
  res->extcopy_cache = cache_lru(64, 32,
                                 (cache_eq_fn)extcopy_eq,
                                 (cache_hash_fn)extcopy_hash,
                                 (cache_freek_fn)extcopy_free,
                                 (cache_freev_fn)cuda_freekernel);
  if (res->extcopy_cache == NULL) {
    goto fail_cache;
  }
  /* Main compute stream, plus a separate non-blocking stream for
     memory traffic. */
  err = cuStreamCreate(&res->s, 0);
  if (err != CUDA_SUCCESS) {
    goto fail_stream;
  }
  err = cuStreamCreate(&res->mem_s, CU_STREAM_NON_BLOCKING);
  if (err != CUDA_SUCCESS) {
    goto fail_mem_stream;
  }
  /* 16-byte pinned, zeroed buffer used to report kernel errors. */
  err = cuMemAllocHost(&p, 16);
  if (err != CUDA_SUCCESS) {
    goto fail_errbuf;
  }
  memset(p, 0, 16);
  /* Need to tag for new_gpudata */
  TAG_CTX(res);
  /* NOTE(review): p is a host pointer cast to CUdeviceptr — this relies
     on the pinned allocation being mapped/unified-addressable; see the
     CUDA_MAPPED_PTR flag below. */
  res->errbuf = new_gpudata(res, (CUdeviceptr)p, 16);
  if (res->errbuf == NULL) {
    err = res->err;
    goto fail_end;
  }
  res->errbuf->flags |= CUDA_MAPPED_PTR;
  return res;
  /* Unwind in reverse order of acquisition. */
 fail_end:
  cuMemFreeHost(p);
 fail_errbuf:
  cuStreamDestroy(res->mem_s);
 fail_mem_stream:
  cuStreamDestroy(res->s);
 fail_stream:
  cache_destroy(res->extcopy_cache);
 fail_cache:
  free(res);
  return NULL;
}
/* Allocate one contiguous host buffer holding all (n + m) data/coding
 * blocks of buf_size bytes each, inside the context owned by C.
 * *buffers receives the pointer and *ld the leading dimension (stride)
 * between blocks.  Returns GIB_SUC on success; on a CUDA failure
 * ERROR_CHECK_FAIL presumably returns early with an error code — NOTE
 * (review): if it does, the context pushed below is left current; verify
 * the macro's behavior.  */
int gib_alloc ( void **buffers, int buf_size, int *ld, gib_context c ) {
	ERROR_CHECK_FAIL(cuCtxPushCurrent(((gpu_context)(c->acc_context))->pCtx));
#if GIB_USE_MMAP
	/* Mapped pinned memory: device can access the buffer directly via
	   cuMemHostGetDevicePointer. */
	ERROR_CHECK_FAIL(cuMemHostAlloc(buffers, (c->n+c->m)*buf_size,
					CU_MEMHOSTALLOC_DEVICEMAP));
#else
	/* Plain page-locked host memory. */
	ERROR_CHECK_FAIL(cuMemAllocHost(buffers, (c->n+c->m)*buf_size));
#endif
	/* Blocks are packed back-to-back, so the stride equals buf_size. */
	*ld = buf_size;
	ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
	return GIB_SUC;
}
/* R-level binding for cuMemAllocHost: allocates r_bytesize bytes of
 * page-locked host memory.  On success returns an external reference
 * tagged "voidPtr"; on failure returns the CUDA error wrapped by
 * R_cudaErrorInfo.  r_bytesize arrives as an R numeric (double) and is
 * truncated to size_t.  */
SEXP R_auto_cuMemAllocHost(SEXP r_bytesize)
{
    void *pp;
    size_t bytesize = REAL(r_bytesize)[0];

    CUresult ans = cuMemAllocHost(&pp, bytesize);
    if (ans)
        return R_cudaErrorInfo(ans);

    return R_createRef(pp, "voidPtr");
}
/**
 * Allocates bytesize bytes of host memory that is page-locked and accessible
 * to the device. The driver tracks the virtual memory ranges allocated with
 * this function and automatically accelerates calls to functions such as
 * cuMemcpyHtoD(). Since the memory can be accessed directly by the device, it
 * can be read or written with much higher bandwidth than pageable memory
 * obtained with functions such as malloc(). Allocating excessive amounts of
 * pinned memory may degrade system performance, since it reduces the amount of
 * memory available to the system for paging. As a result, this function is
 * best used sparingly to allocate staging areas for data exchange between host
 * and device.
 *
 * The Flags parameter enables different options to be specified that affect
 * the allocation, as follows.
 *
 * CU_MEMHOSTALLOC_PORTABLE: The memory returned by this call will be
 * considered as pinned memory by all CUDA contexts, not just the one that
 * performed the allocation.
 *
 * CU_MEMHOSTALLOC_DEVICEMAP: Maps the allocation into the CUDA address space.
 * The device pointer to the memory may be obtained by calling
 * cuMemHostGetDevicePointer(). This feature is available only on GPUs with
 * compute capability greater than or equal to 1.1.
 *
 * CU_MEMHOSTALLOC_WRITECOMBINED: Allocates the memory as write-combined (WC).
 * WC memory can be transferred across the PCI Express bus more quickly on some
 * system configurations, but cannot be read efficiently by most CPUs. WC
 * memory is a good option for buffers that will be written by the CPU and read
 * by the GPU via mapped pinned memory or host->device transfers.
 *
 * All of these flags are orthogonal to one another: a developer may allocate
 * memory that is portable, mapped and/or write-combined with no restrictions.
 *
 * The CUDA context must have been created with the CU_CTX_MAP_HOST flag in
 * order for the CU_MEMHOSTALLOC_DEVICEMAP flag to have any effect.
 *
 * The CU_MEMHOSTALLOC_DEVICEMAP flag may be specified on CUDA contexts for
 * devices that do not support mapped pinned memory. The failure is deferred to
 * cuMemHostGetDevicePointer() because the memory may be mapped into other CUDA
 * contexts via the CU_MEMHOSTALLOC_PORTABLE flag.
 *
 * The memory allocated by this function must be freed with cuMemFreeHost().
 *
 * Note all host memory allocated using cuMemHostAlloc() will automatically be
 * immediately accessible to all contexts on all devices which support unified
 * addressing (as may be queried using CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING).
 * Unless the flag CU_MEMHOSTALLOC_WRITECOMBINED is specified, the device
 * pointer that may be used to access this host memory from those contexts is
 * always equal to the returned host pointer *pp. If the flag
 * CU_MEMHOSTALLOC_WRITECOMBINED is specified, then the function
 * cuMemHostGetDevicePointer() must be used to query the device pointer, even
 * if the context supports unified addressing. See Unified Addressing for
 * additional details.
 *
 * Parameters:
 * pp - Returned host pointer to page-locked memory
 * bytesize - Requested allocation size in bytes
 * Flags - Flags for allocation request
 *
 * Returns:
 * CUDA_SUCCESS, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_NOT_INITIALIZED,
 * CUDA_ERROR_INVALID_CONTEXT, CUDA_ERROR_INVALID_VALUE,
 * CUDA_ERROR_OUT_OF_MEMORY
 */
CUresult cuMemHostAlloc(void **pp, unsigned int bytesize, unsigned int Flags)
{
	/* Only the DEVICEMAP behavior is supported by this implementation;
	   the other two flags are rejected outright. */
	if (Flags & CU_MEMHOSTALLOC_PORTABLE) {
		GDEV_PRINT("CU_MEMHOSTALLOC_PORTABLE: Not Implemented Yet\n");
		return CUDA_ERROR_UNKNOWN;
	}
	if (Flags & CU_MEMHOSTALLOC_WRITECOMBINED) {
		GDEV_PRINT("CU_MEMHOSTALLOC_WRITECOMBINED: Not Implemented Yet\n");
		return CUDA_ERROR_UNKNOWN;
	}
	/* our implementation uses CU_MEMHOSTALLOC_DEVICEMAP by default. */
	return cuMemAllocHost(pp, bytesize);
}
void *swanMallocHost( size_t len ) { CUresult err; void *ptr; try_init(); #ifdef DEV_EXTENSIONS err= cuMemHostAlloc( &ptr, len, CU_MEMHOSTALLOC_PORTABLE ); //| CU_MEMHOSTALLOC_DEVICEMAP ); //| CU_MEMHOSTALLOC_WRITECOMBINED ); #else err = cuMemAllocHost( &ptr, len ); #endif if ( err != CUDA_SUCCESS ) { fprintf( stderr, "swanMallocHost error: %d\n", err ); error("swanMallocHost failed\n" ); } // printf("MallocHost %p\n", ptr ); memset( ptr, 0, len ); return ptr; }
void* InteropResource::mapToHost(const VideoFormat &format, void *handle, int picIndex, const CUVIDPROCPARAMS ¶m, int width, int height, int coded_height) { AutoCtxLock locker((cuda_api*)this, lock); Q_UNUSED(locker); CUdeviceptr devptr; unsigned int pitch; CUDA_ENSURE(cuvidMapVideoFrame(dec, picIndex, &devptr, &pitch, const_cast<CUVIDPROCPARAMS*>(¶m)), NULL); CUVIDAutoUnmapper unmapper(this, dec, devptr); Q_UNUSED(unmapper); uchar* host_data = NULL; const size_t host_size = pitch*coded_height*3/2; CUDA_ENSURE(cuMemAllocHost((void**)&host_data, host_size), NULL); // copy to the memory not allocated by cuda is possible but much slower CUDA_ENSURE(cuMemcpyDtoH(host_data, devptr, host_size), NULL); VideoFrame frame(width, height, VideoFormat::Format_NV12); uchar *planes[] = { host_data, host_data + pitch * coded_height }; frame.setBits(planes); int pitches[] = { (int)pitch, (int)pitch }; frame.setBytesPerLine(pitches); VideoFrame *f = reinterpret_cast<VideoFrame*>(handle); frame.setTimestamp(f->timestamp()); frame.setDisplayAspectRatio(f->displayAspectRatio()); if (format == frame.format()) *f = frame.clone(); else *f = frame.to(format); cuMemFreeHost(host_data); return f; }
/* Map each field of a decoded CUVID picture, copy it to the (lazily
 * grown) pinned staging buffer, and wrap it as an NV12 VideoFrame.
 * Progressive frames produce one field; interlaced frames produce two
 * (plus repeat_first_field).  Returns false on any CUDA/CUVID failure,
 * true otherwise.  All device work is bracketed by the cuvidCtxLock. */
bool VideoDecoderCUDAPrivate::processDecodedData(CUVIDPARSERDISPINFO *cuviddisp, VideoFrame* outFrame) {
    int num_fields = cuviddisp->progressive_frame ? 1 : 2+cuviddisp->repeat_first_field;
    for (int active_field = 0; active_field < num_fields; ++active_field) {
        CUVIDPROCPARAMS proc_params;
        memset(&proc_params, 0, sizeof(CUVIDPROCPARAMS));
        proc_params.progressive_frame = cuviddisp->progressive_frame; //check user config
        proc_params.second_field = active_field == 1; //check user config
        proc_params.top_field_first = cuviddisp->top_field_first;
        proc_params.unpaired_field = cuviddisp->progressive_frame == 1;
        CUdeviceptr devptr;
        unsigned int pitch;
        cuvidCtxLock(vid_ctx_lock, 0);
        CUresult cuStatus = cuvidMapVideoFrame(dec, cuviddisp->picture_index, &devptr, &pitch, &proc_params);
        if (cuStatus != CUDA_SUCCESS) {
            qWarning("cuvidMapVideoFrame failed on index %d (%#x, %s)", cuviddisp->picture_index, cuStatus, _cudaGetErrorEnum(cuStatus));
            // NOTE(review): devptr is uninitialized here if the map failed;
            // unmapping it looks questionable — confirm cuvidUnmapVideoFrame
            // tolerates it.
            cuvidUnmapVideoFrame(dec, devptr);
            cuvidCtxUnlock(vid_ctx_lock, 0);
            return false;
        }
#define PAD_ALIGN(x,mask) ( (x + mask) & ~mask )
        //uint w = dec_create_info.ulWidth;//PAD_ALIGN(dec_create_info.ulWidth, 0x3F);
        uint h = dec_create_info.ulHeight;//PAD_ALIGN(dec_create_info.ulHeight, 0x0F); //?
#undef PAD_ALIGN
        // NV12 frame size: luma (pitch*h) plus half-height chroma plane.
        int size = pitch*h*3/2;
        // Grow-only staging buffer: release it if the new frame is larger.
        if (size > host_data_size && host_data) {
            cuMemFreeHost(host_data);
            host_data = 0;
            host_data_size = 0;
        }
        if (!host_data) {
            cuStatus = cuMemAllocHost((void**)&host_data, size);
            if (cuStatus != CUDA_SUCCESS) {
                qWarning("cuMemAllocHost failed (%#x, %s)", cuStatus, _cudaGetErrorEnum(cuStatus));
                cuvidUnmapVideoFrame(dec, devptr);
                cuvidCtxUnlock(vid_ctx_lock, 0);
                return false;
            }
            host_data_size = size;
        }
        if (!host_data) {
            qWarning("No valid staging memory!");
            cuvidUnmapVideoFrame(dec, devptr);
            cuvidCtxUnlock(vid_ctx_lock, 0);
            return false;
        }
        // Async copy into pinned memory, then synchronize before touching it.
        cuStatus = cuMemcpyDtoHAsync(host_data, devptr, size, stream);
        if (cuStatus != CUDA_SUCCESS) {
            qWarning("cuMemcpyDtoHAsync failed (%#x, %s)", cuStatus, _cudaGetErrorEnum(cuStatus));
            cuvidUnmapVideoFrame(dec, devptr);
            cuvidCtxUnlock(vid_ctx_lock, 0);
            return false;
        }
        cuStatus = cuCtxSynchronize();
        if (cuStatus != CUDA_SUCCESS) {
            qWarning("cuCtxSynchronize failed (%#x, %s)", cuStatus, _cudaGetErrorEnum(cuStatus));
        }
        cuvidUnmapVideoFrame(dec, devptr);
        cuvidCtxUnlock(vid_ctx_lock, 0);
        //qDebug("mark not in use pic_index: %d", cuviddisp->picture_index);
        // The decoder surface can now be reused.
        surface_in_use[cuviddisp->picture_index] = false;
        uchar *planes[] = { host_data, host_data + pitch * h };
        int pitches[] = { (int)pitch, (int)pitch };
        VideoFrame frame(codec_ctx->width, codec_ctx->height, VideoFormat::Format_NV12);
        frame.setBits(planes);
        frame.setBytesPerLine(pitches);
        //TODO: is clone required? may crash on clone, I should review clone()
        //frame = frame.clone();
        // clone() detaches the frame from the shared staging buffer, which
        // will be overwritten by the next field/frame.
        if (outFrame) {
            *outFrame = frame.clone();
        }
#if COPY_ON_DECODE
        frame_queue.put(frame.clone());
#endif
        //qDebug("frame queue size: %d", frame_queue.size());
    }
    return true;
}
int cuda_test_memcpy_async(unsigned int size) { int i; CUresult res; CUdevice dev; CUcontext ctx; CUstream stream; CUdeviceptr data_addr; unsigned int *in, *out; struct timeval tv; struct timeval tv_total_start, tv_total_end; unsigned long total; struct timeval tv_h2d_start, tv_h2d_end; float h2d; struct timeval tv_d2h_start, tv_d2h_end; float d2h; gettimeofday(&tv_total_start, NULL); res = cuInit(0); if (res != CUDA_SUCCESS) { printf("cuInit failed: res = %u\n", (unsigned int)res); return -1; } res = cuDeviceGet(&dev, 0); if (res != CUDA_SUCCESS) { printf("cuDeviceGet failed: res = %u\n", (unsigned int)res); return -1; } res = cuCtxCreate(&ctx, 0, dev); if (res != CUDA_SUCCESS) { printf("cuCtxCreate failed: res = %u\n", (unsigned int)res); return -1; } res = cuStreamCreate(&stream, 0); if (res != CUDA_SUCCESS) { printf("cuStreamCreate failed: res = %u\n", (unsigned int)res); return -1; } res = cuMemAlloc(&data_addr, size); if (res != CUDA_SUCCESS) { printf("cuMemAlloc failed: res = %u\n", (unsigned int)res); return -1; } res = cuMemAllocHost((void **)&in, size); if (res != CUDA_SUCCESS) { printf("cuMemAllocHost(in) failed: res = %u\n", (unsigned int)res); return -1; } res = cuMemAllocHost((void **)&out, size); if (res != CUDA_SUCCESS) { printf("cuMemAllocHost(out) failed: res = %u\n", (unsigned int)res); return -1; } for (i = 0; i < size / 4; i++) { in[i] = i+1; out[i] = 0; } gettimeofday(&tv_h2d_start, NULL); res = cuMemcpyHtoDAsync(data_addr, in, size, stream); if (res != CUDA_SUCCESS) { printf("cuMemcpyHtoDAsync failed: res = %u\n", (unsigned int)res); return -1; } res = cuStreamSynchronize(stream); if (res != CUDA_SUCCESS) { printf("cuStreamSynchronize() failed: res = %u\n", (unsigned int)res); return -1; } gettimeofday(&tv_h2d_end, NULL); gettimeofday(&tv_d2h_start, NULL); res = cuMemcpyDtoHAsync(out, data_addr, size, stream); if (res != CUDA_SUCCESS) { printf("cuMemcpyDtoHAsync failed: res = %u\n", (unsigned int)res); return -1; } res = 
cuStreamSynchronize(stream); if (res != CUDA_SUCCESS) { printf("cuStreamSynchronize() failed: res = %u\n", (unsigned int)res); return -1; } gettimeofday(&tv_d2h_end, NULL); for (i = 0; i < size / 4; i++) { if (in[i] != out[i]) { printf("in[%d] = %u, out[%d] = %u\n", i, in[i], i, out[i]); } } res = cuMemFreeHost(out); if (res != CUDA_SUCCESS) { printf("cuMemFreeHost(out) failed: res = %u\n", (unsigned int)res); return -1; } res = cuMemFreeHost(in); if (res != CUDA_SUCCESS) { printf("cuMemFreeHost(in) failed: res = %u\n", (unsigned int)res); return -1; } res = cuMemFree(data_addr); if (res != CUDA_SUCCESS) { printf("cuMemFree failed: res = %u\n", (unsigned int)res); return -1; } res = cuStreamDestroy(stream); if (res != CUDA_SUCCESS) { printf("cuStreamDestroy failed: res = %u\n", (unsigned int)res); return -1; } res = cuCtxDestroy(ctx); if (res != CUDA_SUCCESS) { printf("cuCtxDestroy failed: res = %u\n", (unsigned int)res); return -1; } gettimeofday(&tv_total_end, NULL); tvsub(&tv_h2d_end, &tv_h2d_start, &tv); h2d = tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0; tvsub(&tv_d2h_end, &tv_d2h_start, &tv); d2h = tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0; tvsub(&tv_total_end, &tv_total_start, &tv); total = tv.tv_sec * 1000 + tv.tv_usec / 1000; printf("HtoD: %f\n", h2d); printf("DtoH: %f\n", d2h); return 0; end: return -1; }
int main() { CU_ERROR_CHECK(cuInit(0)); int count; CU_ERROR_CHECK(cuDeviceGetCount(&count)); count = (count > 2) ? 2 : count; CUdevice devices[count]; for (int i = 0; i < count; i++) CU_ERROR_CHECK(cuDeviceGet(&devices[i], i)); // Question 1: Can you create multiple contexts on the same device? { fprintf(stderr, "Attempting to create multiple contexts on each device...\n"); CUcontext contexts[count * N]; size_t j = 0; for (int i = 0; i < count; i++) { CUresult error = CUDA_SUCCESS; size_t k; for (k = 0; k < N && error == CUDA_SUCCESS; k++) { error = cuCtxCreate(&contexts[j], CU_CTX_SCHED_AUTO, devices[i]); if (error == CUDA_SUCCESS) CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[j++])); } fprintf(stderr, " created %zu contexts on device %d before cuCtxCreate returned \"%s\"\n", (k - 1), i, cuGetErrorString(error)); } CUresult error = CUDA_SUCCESS; size_t k; for (k = 0; k < j && error == CUDA_SUCCESS; k++) error = cuCtxPushCurrent(contexts[k]); if (error == CUDA_SUCCESS) fprintf(stderr, " successfully pushed %zu contexts with cuCtxPushCurrent\n", k); else fprintf(stderr, " pushed %zu contexts before cuCtxPushCurrent returned \"%s\"\n", (k - 1), cuGetErrorString(error)); for (size_t k = 0; k < j; k++) CU_ERROR_CHECK(cuCtxDestroy(contexts[k])); fprintf(stderr, "\n"); } CUcontext contexts[count][2]; for (int i = 0; i < count; i++) { for (size_t j = 0; j < 2; j++) { CU_ERROR_CHECK(cuCtxCreate(&contexts[i][j], CU_CTX_SCHED_AUTO, devices[i])); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[i][j])); } } // Question 2: Can you access a host pointer in a different context from // which it was created? // Question 3: Can you free a host pointer in a different context from which // it was created? 
{ void * hPtr; CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); CU_ERROR_CHECK(cuMemAllocHost(&hPtr, 1024)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); CUdeviceptr dPtr[count]; CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1])); CU_ERROR_CHECK(cuMemAlloc(&dPtr[0], 1024)); // Different context, same device fprintf(stderr, "Accessing a host pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(cuMemcpyHtoD(dPtr[0], hPtr, 1024))); CU_ERROR_CHECK(cuMemFree(dPtr[0])); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1])); if (count > 1) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0])); CU_ERROR_CHECK(cuMemAlloc(&dPtr[1], 1024)); // Different context, different device fprintf(stderr, "Accessing a host pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(cuMemcpyHtoD(dPtr[1], hPtr, 1024))); CU_ERROR_CHECK(cuMemFree(dPtr[1])); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0])); } fprintf(stderr, "\n"); CUresult error = CUDA_ERROR_UNKNOWN; if (count > 1) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0])); error = cuMemFreeHost(hPtr); fprintf(stderr, "Freeing a host pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(error)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0])); } if (error != CUDA_SUCCESS) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1])); error = cuMemFreeHost(hPtr); fprintf(stderr, "Freeing a host pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(error)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1])); } if (error != CUDA_SUCCESS) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); error = cuMemFreeHost(hPtr); fprintf(stderr, "Freeing a host pointer from the same context to which it was allocated returns \"%s\"\n", cuGetErrorString(error)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); } 
fprintf(stderr, "\n"); } // Question 4: Can you access a device pointer in a different context from // which it was created? // Question 5: Can you free a device pointer in a different context from which // it was created? { CUdeviceptr dPtr[count][2]; CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); CU_ERROR_CHECK(cuMemAlloc(&dPtr[0][0], 1024)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1])); CU_ERROR_CHECK(cuMemAlloc(&dPtr[0][1], 1024)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1])); CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1])); fprintf(stderr, "Accessing a device pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(cuMemcpyDtoD(dPtr[0][0], dPtr[0][1], 1024))); CU_ERROR_CHECK(cuMemFree(dPtr[0][1])); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1])); if (count > 1) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0])); CU_ERROR_CHECK(cuMemAlloc(&dPtr[1][0], 1024)); // Different context, different device fprintf(stderr, "Accessing a device pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(cuMemcpyDtoD(dPtr[0][0], dPtr[1][0], 1024))); CU_ERROR_CHECK(cuMemFree(dPtr[1][0])); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0])); } fprintf(stderr, "\n"); CUresult error = CUDA_ERROR_UNKNOWN; if (count > 1) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0])); error = cuMemFree(dPtr[0][0]); fprintf(stderr, "Freeing a device pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(error)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0])); } if (error != CUDA_SUCCESS) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1])); error = cuMemFree(dPtr[0][0]); fprintf(stderr, "Freeing a device pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(error)); 
CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1])); } if (error != CUDA_SUCCESS) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); error = cuMemFree(dPtr[0][0]); fprintf(stderr, "Freeing a device pointer from the same context to which it was allocated returns \"%s\"\n", cuGetErrorString(error)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); } fprintf(stderr, "\n"); } // Question 6: Can you access a module in a different context from which it // was loaded? // Question 7: Can you unload a module in a different context from which it // was loaded? { CUmodule module; CUdeviceptr ptr; CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); CU_ERROR_CHECK(cuModuleLoad(&module, "kernel-test.ptx")); CU_ERROR_CHECK(cuMemAlloc(&ptr, sizeof(float))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); CUfunction function = 0; if (count > 0) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0])); fprintf(stderr, "Getting a function pointer from a different context to which it was loaded (on a different device) returns \"%s\"\n", cuGetErrorString(cuModuleGetFunction(&function, module, "kernel"))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0])); } if (function == 0) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1])); fprintf(stderr, "Getting a function pointer from a different context to which it was loaded (on the same device) returns \"%s\"\n", cuGetErrorString(cuModuleGetFunction(&function, module, "kernel"))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1])); } if (function == 0) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); fprintf(stderr, "Getting a function pointer from the same context to which it was loaded returns \"%s\"\n", cuGetErrorString(cuModuleGetFunction(&function, module, "kernel"))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); } fprintf(stderr, "\n"); CUdeviceptr a, b; CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); CU_ERROR_CHECK(cuMemAlloc(&a, sizeof(float))); CU_ERROR_CHECK(cuMemAlloc(&b, sizeof(float))); 
CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); void * params[] = { &a, & b }; CUresult error = CUDA_ERROR_UNKNOWN; if (count > 0) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0])); fprintf(stderr, "Launching a function from a different context to which it was loaded (on a different device) returns \"%s\"\n", cuGetErrorString(error = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0])); } if (error != CUDA_SUCCESS) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1])); fprintf(stderr, "Launching a function from a different context to which it was loaded (on the same device) returns \"%s\"\n", cuGetErrorString(error = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1])); } if (error != CUDA_SUCCESS) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); fprintf(stderr, "Launching a function from the same context to which it was loaded returns \"%s\"\n", cuGetErrorString(error = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); } fprintf(stderr, "\n"); error = CUDA_ERROR_UNKNOWN; if (count > 0) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0])); fprintf(stderr, "Unloading a module from a different context to which it was loaded (on a different device) returns \"%s\"\n", cuGetErrorString(error = cuModuleUnload(module))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0])); } if (error != CUDA_SUCCESS) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1])); fprintf(stderr, "Unloading a module from a different context to which it was loaded (on the same device) returns \"%s\"\n", cuGetErrorString(error = cuModuleUnload(module))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1])); } if (error != CUDA_SUCCESS) { CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); fprintf(stderr, "Unloading a module from the same context to which it was loaded returns \"%s\"\n", cuGetErrorString(error = 
cuModuleUnload(module))); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); } CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0])); CU_ERROR_CHECK(cuMemFree(a)); CU_ERROR_CHECK(cuMemFree(b)); CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0])); } for (int i = 0; i < count; i++) { for (size_t j = 0; j < 2; j++) CU_ERROR_CHECK(cuCtxDestroy(contexts[i][j])); } return 0; }
/* Construct an NVENC H.264 encoder instance:
 *  - loads nvEncodeAPI64.dll and resolves the NVENC entry point,
 *  - creates a CUDA context and the NV12/RGB32 device (and optionally
 *    page-locked host) buffers, depending on whether color conversion is
 *    done by swscale on the CPU or by CUDA,
 *  - opens an NVENC session, applies the HP preset with a baseline-profile
 *    configuration, registers the NV12 device buffer as the input surface,
 *    and creates the output bitstream buffer,
 *  - optionally opens a per-instance .h264 dump file.
 * NOTE(review): nStatus from NvEncodeAPICreateInstance and
 * nvEncOpenEncodeSessionEx is assigned but never checked — confirm whether
 * failures here are caught later. */
CNvidiaNvencCodec(DWORD nCodecInstanceId, const CCodecContextBase& CodecContext) :
    m_NvidiaNvencCodecContext(static_cast<const CNvidiaNvencCodecContext&>(CodecContext)),
    m_hNvEncodeAPI64(LoadLibraryA("nvEncodeAPI64.dll"))
{
    // Resolve the NVENC API factory and populate the function table.
    PNVENCODEAPICREATEINSTANCE pNvEncodeAPICreateInstance = reinterpret_cast<PNVENCODEAPICREATEINSTANCE>(GetProcAddress(m_hNvEncodeAPI64, "NvEncodeAPICreateInstance"));
    memset(&m_FunctionList, 0, sizeof(m_FunctionList));
    m_FunctionList.version = NV_ENCODE_API_FUNCTION_LIST_VER;
    NVENCSTATUS nStatus = pNvEncodeAPICreateInstance(&m_FunctionList);
    CHECK_CUDA_DRV_STATUS(cuCtxCreate(&m_Context, 0, 0));
    if (m_NvidiaNvencCodecContext.GetUseSwscaleInsteadOfCuda()) {
        // CPU path: swscale converts BGR32 -> NV12 into a pinned host
        // buffer, which is then uploaded to a tightly packed (pitch ==
        // width) NV12 device buffer of width*height*3/2 bytes.
        CHECK_CUDA_DRV_STATUS(cuMemAlloc(&m_pNv12Buffer, m_NvidiaNvencCodecContext.GetWidth() * m_NvidiaNvencCodecContext.GetHeight() * 3 / 2));
        m_nNv12BufferPitch = m_NvidiaNvencCodecContext.GetWidth();
        CHECK_CUDA_DRV_STATUS(cuMemAllocHost(&m_pPageLockedNv12Buffer, m_NvidiaNvencCodecContext.GetWidth() * m_NvidiaNvencCodecContext.GetHeight() * 3 / 2));
        // NV12 plane layout inside the pinned buffer: luma, then
        // interleaved chroma immediately after width*height bytes.
        m_pNv12Planes[0] = reinterpret_cast<unsigned char*>(m_pPageLockedNv12Buffer);
        m_pNv12Planes[1] = reinterpret_cast<unsigned char*>(m_pPageLockedNv12Buffer) + m_NvidiaNvencCodecContext.GetWidth() * m_NvidiaNvencCodecContext.GetHeight();
        m_pNv12Strides[0] = m_NvidiaNvencCodecContext.GetWidth();
        m_pNv12Strides[1] = m_NvidiaNvencCodecContext.GetWidth();
        m_SwscaleContext = sws_getContext(m_NvidiaNvencCodecContext.GetWidth(), m_NvidiaNvencCodecContext.GetHeight(), AV_PIX_FMT_BGR32, m_NvidiaNvencCodecContext.GetWidth(), m_NvidiaNvencCodecContext.GetHeight(), AV_PIX_FMT_NV12, 0, 0, 0, 0);
    } else {
        // CUDA path: pitched NV12 device buffer plus an RGB32 device
        // buffer (and optionally a pinned host staging copy of the RGB32
        // input) for on-GPU color conversion.
        CHECK_CUDA_DRV_STATUS(cuMemAllocPitch(&m_pNv12Buffer, &m_nNv12BufferPitch, m_NvidiaNvencCodecContext.GetWidth(), m_NvidiaNvencCodecContext.GetHeight() * 3 / 2, 16));
        if (m_NvidiaNvencCodecContext.GetUsePageLockedIntermediateBuffer()) {
            CHECK_CUDA_DRV_STATUS(cuMemAllocHost(&m_pPageLockedRgb32Buffer, m_NvidiaNvencCodecContext.GetWidth() * m_NvidiaNvencCodecContext.GetHeight() * 4));
        }
        CHECK_CUDA_DRV_STATUS(cuMemAlloc(&m_pRgb32Buffer, m_NvidiaNvencCodecContext.GetWidth() * m_NvidiaNvencCodecContext.GetHeight() * 4));
    }
    CHECK_CUDA_DRV_STATUS(cuStreamCreate(&m_Stream, 0));
    // Open the NVENC session on the CUDA context created above.
    NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS SessionParameters;
    memset(&SessionParameters, 0, sizeof(SessionParameters));
    SessionParameters.version = NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER;
    SessionParameters.apiVersion = NVENCAPI_VERSION;
    SessionParameters.device = m_Context;
    SessionParameters.deviceType = NV_ENC_DEVICE_TYPE_CUDA;
    nStatus = m_FunctionList.nvEncOpenEncodeSessionEx(&SessionParameters, &m_pEncoder);
    m_PictureParameters.version = NV_ENC_PIC_PARAMS_VER;
    // Start from the high-performance preset and customize it.
    auto PresetGuid = NV_ENC_PRESET_HP_GUID;
    NV_ENC_PRESET_CONFIG PresetConfiguration = { NV_ENC_PRESET_CONFIG_VER, 0 };
    PresetConfiguration.presetCfg.version = NV_ENC_CONFIG_VER;
    CHECK_NVENC_STATUS(m_FunctionList.nvEncGetEncodePresetConfig(m_pEncoder, NV_ENC_CODEC_H264_GUID, PresetGuid, &PresetConfiguration));
    NV_ENC_CONFIG EncoderConfiguration = { NV_ENC_CONFIG_VER, 0 };
    EncoderConfiguration = PresetConfiguration.presetCfg;
    EncoderConfiguration.gopLength = NVENC_INFINITE_GOPLENGTH;
    EncoderConfiguration.profileGUID = NV_ENC_H264_PROFILE_BASELINE_GUID;
    EncoderConfiguration.frameIntervalP = 1; // No B frames
    EncoderConfiguration.frameFieldMode = NV_ENC_PARAMS_FRAME_FIELD_MODE_FRAME;
    // One IDR per full clip; 4:2:0 chroma; single-slice frames.
    EncoderConfiguration.encodeCodecConfig.h264Config.idrPeriod = m_NvidiaNvencCodecContext.GetFrameCount();
    EncoderConfiguration.encodeCodecConfig.h264Config.chromaFormatIDC = 1;
    EncoderConfiguration.encodeCodecConfig.h264Config.sliceMode = 0;
    EncoderConfiguration.encodeCodecConfig.h264Config.sliceModeData = 0;
    NV_ENC_INITIALIZE_PARAMS InitializationParameters = { NV_ENC_INITIALIZE_PARAMS_VER, 0 };
    InitializationParameters.encodeGUID = NV_ENC_CODEC_H264_GUID;
    InitializationParameters.presetGUID = PresetGuid;
    InitializationParameters.frameRateNum = m_NvidiaNvencCodecContext.GetFps();
    InitializationParameters.frameRateDen = 1;
#ifdef ASYNCHRONOUS
    InitializationParameters.enableEncodeAsync = 1;
#else
    InitializationParameters.enableEncodeAsync = 0;
#endif
    InitializationParameters.enablePTD = 1; // Let the encoder decide the picture type
    InitializationParameters.reportSliceOffsets = 0;
    InitializationParameters.maxEncodeWidth = m_NvidiaNvencCodecContext.GetWidth();
    InitializationParameters.maxEncodeHeight = m_NvidiaNvencCodecContext.GetHeight();
    InitializationParameters.encodeConfig = &EncoderConfiguration;
    InitializationParameters.encodeWidth = m_NvidiaNvencCodecContext.GetWidth();
    InitializationParameters.encodeHeight = m_NvidiaNvencCodecContext.GetHeight();
    // NOTE(review): display aspect ratio is hard-coded to 16:9 regardless
    // of the configured dimensions — confirm this is intended.
    InitializationParameters.darWidth = 16;
    InitializationParameters.darHeight = 9;
    CHECK_NVENC_STATUS(m_FunctionList.nvEncInitializeEncoder(m_pEncoder, &InitializationParameters));
    // Picture parameters that are known ahead of encoding
    m_PictureParameters = { NV_ENC_PIC_PARAMS_VER, 0 };
    m_PictureParameters.codecPicParams.h264PicParams.sliceMode = 0;
    m_PictureParameters.codecPicParams.h264PicParams.sliceModeData = 0;
    m_PictureParameters.inputWidth = m_NvidiaNvencCodecContext.GetWidth();
    m_PictureParameters.inputHeight = m_NvidiaNvencCodecContext.GetHeight();
    m_PictureParameters.bufferFmt = NV_ENC_BUFFER_FORMAT_NV12_PL;
    m_PictureParameters.inputPitch = static_cast<uint32_t>(m_nNv12BufferPitch);
    m_PictureParameters.pictureStruct = NV_ENC_PIC_STRUCT_FRAME;
#ifdef ASYNCHRONOUS
    // Async mode: completion is signalled through a Win32 event.
    m_hCompletionEvent = CreateEvent(NULL, FALSE, FALSE, NULL);
    m_EventParameters = { NV_ENC_EVENT_PARAMS_VER, 0 };
    m_EventParameters.completionEvent = m_hCompletionEvent;
    CHECK_NVENC_STATUS(m_FunctionList.nvEncRegisterAsyncEvent(m_pEncoder, &m_EventParameters));
    m_PictureParameters.completionEvent = m_hCompletionEvent;
#endif
    // Register CUDA input pointer
    NV_ENC_REGISTER_RESOURCE RegisterResource = { NV_ENC_REGISTER_RESOURCE_VER, NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR, m_NvidiaNvencCodecContext.GetWidth(), m_NvidiaNvencCodecContext.GetHeight(), static_cast<uint32_t>(m_nNv12BufferPitch), 0, reinterpret_cast<void*>(m_pNv12Buffer), NULL, NV_ENC_BUFFER_FORMAT_NV12_PL };
    CHECK_NVENC_STATUS(m_FunctionList.nvEncRegisterResource(m_pEncoder, &RegisterResource));
    NV_ENC_MAP_INPUT_RESOURCE MapInputResource = { NV_ENC_MAP_INPUT_RESOURCE_VER, 0, 0, RegisterResource.registeredResource };
    m_pRegisteredResource = RegisterResource.registeredResource;
    CHECK_NVENC_STATUS(m_FunctionList.nvEncMapInputResource(m_pEncoder, &MapInputResource));
    m_PictureParameters.inputBuffer = MapInputResource.mappedResource;
    // Create output bitstream buffer
    m_nOutputBitstreamSize = 2 * 1024 * 1024;
    NV_ENC_CREATE_BITSTREAM_BUFFER CreateBitstreamBuffer = { NV_ENC_CREATE_BITSTREAM_BUFFER_VER, m_nOutputBitstreamSize, NV_ENC_MEMORY_HEAP_AUTOSELECT, 0 };
    CHECK_NVENC_STATUS(m_FunctionList.nvEncCreateBitstreamBuffer(m_pEncoder, &CreateBitstreamBuffer));
    m_pOutputBitstream = CreateBitstreamBuffer.bitstreamBuffer;
    m_PictureParameters.outputBitstream = m_pOutputBitstream;
    if (m_NvidiaNvencCodecContext.GetSaveOutputToFile()) {
        // One dump file per codec instance.
        char pOutputFilename[MAX_PATH];
        sprintf_s(pOutputFilename, "nvenc-%d.h264", nCodecInstanceId);
        if (fopen_s(&m_pOutputFile, pOutputFilename, "wb") != 0) {
            throw std::runtime_error(std::string("could not open ").append(pOutputFilename).append(" for writing!"));
        }
    }
}
/* Allocate and register the encoder's input and output buffers.
 *
 * Input side: one CUDA device allocation sized for a tightly packed NV12
 * frame (width * height * 3/2 bytes, 12 bits/pixel) at the encoder's
 * maximum dimensions, registered with NVENC and mapped as the input
 * surface.
 * Output side: a 1 MiB NVENC bitstream buffer.
 *
 * Returns 0 on success; CHK_CUDA_ERR / CHK_NVENC_ERR handle failures.
 *
 * NOTE(review): uNV12Stride is set equal to dwWidth with no pitch
 * alignment — this assumes the device allocation is consumed as a
 * tightly packed NV12 surface; confirm against the encoder's pitch
 * requirements.
 */
static int init_buffers(NvEncoder *enc)
{
    int ret = 0;
    NVENCSTATUS nvResult;
    CUresult cuResult;

    enc->encBuf.stInputBfr.dwWidth = enc->encCfg.maxWidth;
    enc->encBuf.stInputBfr.dwHeight = enc->encCfg.maxHeight;
    enc->encBuf.stInputBfr.bufferFmt = NV_ENC_BUFFER_FORMAT_NV12_PL;

    ////////////////////////////
    // Allocate INPUT resource: raw NV12 frame in device memory.
    CUdeviceptr devPtr;
    /* Widen to size_t before multiplying so large frame dimensions cannot
       overflow 32-bit arithmetic. NV12 = full-res luma + half-res
       interleaved chroma = 3/2 bytes per pixel. */
    cuResult = cuMemAlloc(&devPtr,
                          (size_t)enc->encCfg.maxWidth * enc->encCfg.maxHeight * 3 / 2);
    CHK_CUDA_ERR(cuResult);
    enc->encBuf.stInputBfr.pNV12devPtr = devPtr;
    enc->encBuf.stInputBfr.uNV12Stride = enc->encBuf.stInputBfr.dwWidth;
    enc->encBuf.stInputBfr.LumaSize =
        enc->encBuf.stInputBfr.uNV12Stride * enc->encBuf.stInputBfr.dwHeight;

    ///////////////////////
    // Register the device pointer with NVENC, then map it as the
    // encode-session input surface.
    void *resource;
    nvResult = enc->hwEncoder->NvEncRegisterResource(
        NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR,
        (void *)enc->encBuf.stInputBfr.pNV12devPtr,
        enc->encBuf.stInputBfr.dwWidth,
        enc->encBuf.stInputBfr.dwHeight,
        enc->encBuf.stInputBfr.uNV12Stride,
        &resource);
    CHK_NVENC_ERR(nvResult);
    enc->encBuf.stInputBfr.nvRegisteredResource = resource;

    void *insurf;
    nvResult = enc->hwEncoder->NvEncMapInputResource(
        enc->encBuf.stInputBfr.nvRegisteredResource, &insurf);
    CHK_NVENC_ERR(nvResult);
    enc->encBuf.stInputBfr.hInputSurface = insurf;

    ////////////////////////////
    // Allocate OUTPUT resource: compressed bitstream buffer.
    enc->encBuf.stOutputBfr.dwBitstreamBufferSize = 1024 * 1024; // 1 MiB
    nvResult = enc->hwEncoder->NvEncCreateBitstreamBuffer(
        enc->encBuf.stOutputBfr.dwBitstreamBufferSize,
        &enc->encBuf.stOutputBfr.hBitstreamBuffer);
    CHK_NVENC_ERR(nvResult);
    enc->encBuf.stOutputBfr.bEOSFlag = false;
    enc->encBuf.stOutputBfr.bWaitOnEvent = false;

    return ret;
}
// Run the Cuda part of the computation bool copyDecodedFrameToTexture(unsigned int &nRepeats, int bUseInterop, int *pbIsProgressive) { CUVIDPARSERDISPINFO oDisplayInfo; if (g_pFrameQueue->dequeue(&oDisplayInfo)) { CCtxAutoLock lck(g_CtxLock); // Push the current CUDA context (only if we are using CUDA decoding path) CUresult result = cuCtxPushCurrent(g_oContext); CUdeviceptr pDecodedFrame[2] = { 0, 0 }; CUdeviceptr pInteropFrame[2] = { 0, 0 }; int num_fields = (oDisplayInfo.progressive_frame ? (1) : (2+oDisplayInfo.repeat_first_field)); *pbIsProgressive = oDisplayInfo.progressive_frame; g_bIsProgressive = oDisplayInfo.progressive_frame ? true : false; for (int active_field=0; active_field<num_fields; active_field++) { nRepeats = oDisplayInfo.repeat_first_field; CUVIDPROCPARAMS oVideoProcessingParameters; memset(&oVideoProcessingParameters, 0, sizeof(CUVIDPROCPARAMS)); oVideoProcessingParameters.progressive_frame = oDisplayInfo.progressive_frame; oVideoProcessingParameters.second_field = active_field; oVideoProcessingParameters.top_field_first = oDisplayInfo.top_field_first; oVideoProcessingParameters.unpaired_field = (num_fields == 1); unsigned int nDecodedPitch = 0; unsigned int nWidth = 0; unsigned int nHeight = 0; // map decoded video frame to CUDA surfae g_pVideoDecoder->mapFrame(oDisplayInfo.picture_index, &pDecodedFrame[active_field], &nDecodedPitch, &oVideoProcessingParameters); nWidth = g_pVideoDecoder->targetWidth(); nHeight = g_pVideoDecoder->targetHeight(); // map DirectX texture to CUDA surface size_t nTexturePitch = 0; // If we are Encoding and this is the 1st Frame, we make sure we allocate system memory for readbacks if (g_bReadback && g_bFirstFrame && g_ReadbackSID) { CUresult result; checkCudaErrors(result = cuMemAllocHost((void **)&g_bFrameData[0], (nDecodedPitch * nHeight * 3 / 2))); checkCudaErrors(result = cuMemAllocHost((void **)&g_bFrameData[1], (nDecodedPitch * nHeight * 3 / 2))); g_bFirstFrame = false; if (result != CUDA_SUCCESS) { 
printf("cuMemAllocHost returned %d\n", (int)result); } } // If streams are enabled, we can perform the readback to the host while the kernel is executing if (g_bReadback && g_ReadbackSID) { CUresult result = cuMemcpyDtoHAsync(g_bFrameData[active_field], pDecodedFrame[active_field], (nDecodedPitch * nHeight * 3 / 2), g_ReadbackSID); if (result != CUDA_SUCCESS) { printf("cuMemAllocHost returned %d\n", (int)result); } } #if ENABLE_DEBUG_OUT printf("%s = %02d, PicIndex = %02d, OutputPTS = %08d\n", (oDisplayInfo.progressive_frame ? "Frame" : "Field"), g_DecodeFrameCount, oDisplayInfo.picture_index, oDisplayInfo.timestamp); #endif if (g_pImageDX) { // map the texture surface g_pImageDX->map(&pInteropFrame[active_field], &nTexturePitch, active_field); } else { pInteropFrame[active_field] = g_pInteropFrame[active_field]; nTexturePitch = g_pVideoDecoder->targetWidth() * 2; } // perform post processing on the CUDA surface (performs colors space conversion and post processing) // comment this out if we inclue the line of code seen above cudaPostProcessFrame(&pDecodedFrame[active_field], nDecodedPitch, &pInteropFrame[active_field], nTexturePitch, g_pCudaModule->getModule(), gfpNV12toARGB, g_KernelSID); if (g_pImageDX) { // unmap the texture surface g_pImageDX->unmap(active_field); } // unmap video frame // unmapFrame() synchronizes with the VideoDecode API (ensures the frame has finished decoding) g_pVideoDecoder->unmapFrame(pDecodedFrame[active_field]); // release the frame, so it can be re-used in decoder g_pFrameQueue->releaseFrame(&oDisplayInfo); g_DecodeFrameCount++; } // Detach from the Current thread checkCudaErrors(cuCtxPopCurrent(NULL)); } else { return false; } // check if decoding has come to an end. // if yes, signal the app to shut down. 
if (!g_pVideoSource->isStarted() || g_pFrameQueue->isEndOfDecode()) { // Let's free the Frame Data if (g_ReadbackSID && g_bFrameData) { cuMemFreeHost((void *)g_bFrameData[0]); cuMemFreeHost((void *)g_bFrameData[1]); g_bFrameData[0] = NULL; g_bFrameData[1] = NULL; } // Let's just stop, and allow the user to quit, so they can at least see the results g_pVideoSource->stop(); // If we want to loop reload the video file and restart if (g_bLoop && !g_bAutoQuit) { reinitCudaResources(); g_FrameCount = 0; g_DecodeFrameCount = 0; g_pVideoSource->start(); } if (g_bAutoQuit) { g_bDone = true; } } return true; }