Example #1
static void
map_init (struct ptx_stream *s)
{
  CUresult r;

  int size = getpagesize ();

  assert (s);
  assert (!s->d);
  assert (!s->h);

  r = cuMemAllocHost (&s->h, size);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r));

  r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r));

  assert (s->h);

  s->h_begin = s->h;
  s->h_end = s->h_begin + size;
  s->h_next = s->h_prev = s->h_tail = s->h_begin;

  assert (s->h_next);
  assert (s->h_end);
}
Example #2
void *cuda_make_ctx(CUcontext ctx, int flags) {
  cuda_context *res;
  void *p;
  CUresult err;

  res = malloc(sizeof(*res));
  if (res == NULL)
    return NULL;
  res->ctx = ctx;
  res->err = CUDA_SUCCESS;
  res->blas_handle = NULL;
  res->refcnt = 1;
  res->flags = flags;
  res->enter = 0;
  res->freeblocks = NULL;
  if (detect_arch(ARCH_PREFIX, res->bin_id, &err)) {
    goto fail_cache;
  }
  res->extcopy_cache = cache_lru(64, 32, (cache_eq_fn)extcopy_eq,
                                 (cache_hash_fn)extcopy_hash,
                                 (cache_freek_fn)extcopy_free,
                                 (cache_freev_fn)cuda_freekernel);
  if (res->extcopy_cache == NULL) {
    goto fail_cache;
  }
  err = cuStreamCreate(&res->s, 0);
  if (err != CUDA_SUCCESS) {
    goto fail_stream;
  }
  err = cuStreamCreate(&res->mem_s, CU_STREAM_NON_BLOCKING);
  if (err != CUDA_SUCCESS) {
    goto fail_mem_stream;
  }
  err = cuMemAllocHost(&p, 16);
  if (err != CUDA_SUCCESS) {
    goto fail_errbuf;
  }
  memset(p, 0, 16);
  /* Need to tag for new_gpudata */
  TAG_CTX(res);
  res->errbuf = new_gpudata(res, (CUdeviceptr)p, 16);
  if (res->errbuf == NULL) {
    err = res->err;
    goto fail_end;
  }
  res->errbuf->flags |= CUDA_MAPPED_PTR;
  return res;
 fail_end:
  cuMemFreeHost(p);
 fail_errbuf:
  cuStreamDestroy(res->mem_s);
 fail_mem_stream:
  cuStreamDestroy(res->s);
 fail_stream:
  cache_destroy(res->extcopy_cache);
 fail_cache:
  free(res);
  return NULL;
}
Example #3
int gib_alloc ( void **buffers, int buf_size, int *ld, gib_context c ) {
  ERROR_CHECK_FAIL(cuCtxPushCurrent(((gpu_context)(c->acc_context))->pCtx));
#if GIB_USE_MMAP
  ERROR_CHECK_FAIL(cuMemHostAlloc(buffers, (c->n+c->m)*buf_size, 
				  CU_MEMHOSTALLOC_DEVICEMAP));
#else
  ERROR_CHECK_FAIL(cuMemAllocHost(buffers, (c->n+c->m)*buf_size));
#endif
  *ld = buf_size;
  ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
  return GIB_SUC;
}
Example #4
SEXP
R_auto_cuMemAllocHost(SEXP r_bytesize)
{
    SEXP r_ans = R_NilValue;
    void * pp;
    size_t bytesize = REAL(r_bytesize)[0];
    CUresult ans;
    ans = cuMemAllocHost(& pp,  bytesize);
    if(ans)
       return(R_cudaErrorInfo(ans));
    r_ans = R_createRef(pp, "voidPtr") ;
    return(r_ans);
}
Example #5
/**
 * Allocates bytesize bytes of host memory that is page-locked and accessible 
 * to the device. The driver tracks the virtual memory ranges allocated with 
 * this function and automatically accelerates calls to functions such as 
 * cuMemcpyHtoD(). Since the memory can be accessed directly by the device, it 
 * can be read or written with much higher bandwidth than pageable memory 
 * obtained with functions such as malloc(). Allocating excessive amounts of 
 * pinned memory may degrade system performance, since it reduces the amount of 
 * memory available to the system for paging. As a result, this function is 
 * best used sparingly to allocate staging areas for data exchange between host
 * and device.
 *
 * The Flags parameter enables different options to be specified that affect 
 * the allocation, as follows.
 *
 * CU_MEMHOSTALLOC_PORTABLE: The memory returned by this call will be 
 * considered as pinned memory by all CUDA contexts, not just the one that 
 * performed the allocation.
 *
 * CU_MEMHOSTALLOC_DEVICEMAP: Maps the allocation into the CUDA address space. 
 * The device pointer to the memory may be obtained by calling 
 * cuMemHostGetDevicePointer(). This feature is available only on GPUs with 
 * compute capability greater than or equal to 1.1.
 *
 * CU_MEMHOSTALLOC_WRITECOMBINED: Allocates the memory as write-combined (WC). 
 * WC memory can be transferred across the PCI Express bus more quickly on some
 * system configurations, but cannot be read efficiently by most CPUs. WC 
 * memory is a good option for buffers that will be written by the CPU and read
 * by the GPU via mapped pinned memory or host->device transfers.
 *
 * All of these flags are orthogonal to one another: a developer may allocate 
 * memory that is portable, mapped and/or write-combined with no restrictions.
 *
 * The CUDA context must have been created with the CU_CTX_MAP_HOST flag in
 * order for the CU_MEMHOSTALLOC_DEVICEMAP flag to have any effect.
 *
 * The CU_MEMHOSTALLOC_DEVICEMAP flag may be specified on CUDA contexts for
 * devices that do not support mapped pinned memory. The failure is deferred to
 * cuMemHostGetDevicePointer() because the memory may be mapped into other CUDA
 * contexts via the CU_MEMHOSTALLOC_PORTABLE flag.
 *
 * The memory allocated by this function must be freed with cuMemFreeHost().
 *
 * Note all host memory allocated using cuMemHostAlloc() will automatically be
 * immediately accessible to all contexts on all devices which support unified 
 * addressing (as may be queried using CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING).
 * Unless the flag CU_MEMHOSTALLOC_WRITECOMBINED is specified, the device 
 * pointer that may be used to access this host memory from those contexts is 
 * always equal to the returned host pointer *pp. If the flag 
 * CU_MEMHOSTALLOC_WRITECOMBINED is specified, then the function 
 * cuMemHostGetDevicePointer() must be used to query the device pointer, even 
 * if the context supports unified addressing. See Unified Addressing for 
 * additional details.
 *
 * Parameters:
 * pp - Returned host pointer to page-locked memory
 * bytesize - Requested allocation size in bytes
 * Flags - Flags for allocation request
 *
 * Returns:
 * CUDA_SUCCESS, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_NOT_INITIALIZED, 
 * CUDA_ERROR_INVALID_CONTEXT, CUDA_ERROR_INVALID_VALUE, 
 * CUDA_ERROR_OUT_OF_MEMORY 
 */
CUresult cuMemHostAlloc(void **pp, unsigned int bytesize, unsigned int Flags)
{
	if (Flags & CU_MEMHOSTALLOC_PORTABLE) {
		GDEV_PRINT("CU_MEMHOSTALLOC_PORTABLE: Not Implemented Yet\n");
		return CUDA_ERROR_UNKNOWN;
	}

	if (Flags & CU_MEMHOSTALLOC_WRITECOMBINED) {
		GDEV_PRINT("CU_MEMHOSTALLOC_WRITECOMBINED: Not Implemented Yet\n");
		return CUDA_ERROR_UNKNOWN;
	}

	/* our implementation uses CU_MEMHOSTALLOC_DEVICEMAP by default. */
	return cuMemAllocHost(pp, bytesize);
}
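The documentation above pairs naturally with a short usage sketch. The following is a minimal, hypothetical example (not taken from any of the projects in this listing) of the CU_MEMHOSTALLOC_DEVICEMAP flow it describes: allocate device-mapped pinned memory, query the device-visible alias with cuMemHostGetDevicePointer(), and release it with cuMemFreeHost(). It assumes cuInit() has been called and that a context created with CU_CTX_MAP_HOST is current, and it abbreviates error reporting.

#include <cuda.h>
#include <string.h>

/* Hypothetical helper: allocate device-mapped pinned host memory,
 * obtain the device-side alias, and free it again. */
static int demo_mapped_alloc(size_t bytes)
{
	void *host = NULL;
	CUdeviceptr dev = 0;

	if (cuMemHostAlloc(&host, bytes, CU_MEMHOSTALLOC_DEVICEMAP) != CUDA_SUCCESS)
		return -1;

	/* Device pointer aliasing the same page-locked pages. */
	if (cuMemHostGetDevicePointer(&dev, host, 0) != CUDA_SUCCESS) {
		cuMemFreeHost(host);
		return -1;
	}

	memset(host, 0, bytes);	/* CPU writes are visible to the device. */
	/* ... a kernel could now read or write through `dev` ... */

	cuMemFreeHost(host);	/* pinned memory must be freed with cuMemFreeHost() */
	return 0;
}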
Example #6
void *swanMallocHost( size_t len ) {
	CUresult err;
	void *ptr;
	try_init();

#ifdef DEV_EXTENSIONS
	err= cuMemHostAlloc( &ptr, len, CU_MEMHOSTALLOC_PORTABLE  ); //| CU_MEMHOSTALLOC_DEVICEMAP  ); //| CU_MEMHOSTALLOC_WRITECOMBINED );
#else
	err = cuMemAllocHost( &ptr, len );
#endif

	if ( err != CUDA_SUCCESS ) {
		fprintf( stderr, "swanMallocHost error: %d\n", err );
		error("swanMallocHost failed\n" );
	}
//	printf("MallocHost %p\n", ptr );
	memset( ptr, 0, len );
	return ptr;
}
Example #7
void* InteropResource::mapToHost(const VideoFormat &format, void *handle, int picIndex, const CUVIDPROCPARAMS &param, int width, int height, int coded_height)
{
    AutoCtxLock locker((cuda_api*)this, lock);
    Q_UNUSED(locker);
    CUdeviceptr devptr;
    unsigned int pitch;

    CUDA_ENSURE(cuvidMapVideoFrame(dec, picIndex, &devptr, &pitch, const_cast<CUVIDPROCPARAMS*>(&param)), NULL);
    CUVIDAutoUnmapper unmapper(this, dec, devptr);
    Q_UNUSED(unmapper);
    uchar* host_data = NULL;
    const size_t host_size = pitch*coded_height*3/2;
    CUDA_ENSURE(cuMemAllocHost((void**)&host_data, host_size), NULL);
    // copy to the memory not allocated by cuda is possible but much slower
    CUDA_ENSURE(cuMemcpyDtoH(host_data, devptr, host_size), NULL);

    VideoFrame frame(width, height, VideoFormat::Format_NV12);
    uchar *planes[] = {
        host_data,
        host_data + pitch * coded_height
    };
    frame.setBits(planes);
    int pitches[] = { (int)pitch, (int)pitch };
    frame.setBytesPerLine(pitches);

    VideoFrame *f = reinterpret_cast<VideoFrame*>(handle);
    frame.setTimestamp(f->timestamp());
    frame.setDisplayAspectRatio(f->displayAspectRatio());
    if (format == frame.format())
        *f = frame.clone();
    else
        *f = frame.to(format);

    cuMemFreeHost(host_data);
    return f;
}
Example #8
bool VideoDecoderCUDAPrivate::processDecodedData(CUVIDPARSERDISPINFO *cuviddisp, VideoFrame* outFrame) {
    int num_fields = cuviddisp->progressive_frame ? 1 : 2+cuviddisp->repeat_first_field;

    for (int active_field = 0; active_field < num_fields; ++active_field) {
        CUVIDPROCPARAMS proc_params;
        memset(&proc_params, 0, sizeof(CUVIDPROCPARAMS));
        proc_params.progressive_frame = cuviddisp->progressive_frame; //check user config
        proc_params.second_field = active_field == 1; //check user config
        proc_params.top_field_first = cuviddisp->top_field_first;
        proc_params.unpaired_field = cuviddisp->progressive_frame == 1;

        CUdeviceptr devptr;
        unsigned int pitch;
        cuvidCtxLock(vid_ctx_lock, 0);
        CUresult cuStatus = cuvidMapVideoFrame(dec, cuviddisp->picture_index, &devptr, &pitch, &proc_params);
        if (cuStatus != CUDA_SUCCESS) {
            qWarning("cuvidMapVideoFrame failed on index %d (%#x, %s)", cuviddisp->picture_index, cuStatus, _cudaGetErrorEnum(cuStatus));
            cuvidUnmapVideoFrame(dec, devptr);
            cuvidCtxUnlock(vid_ctx_lock, 0);
            return false;
        }
#define PAD_ALIGN(x,mask) ( (x + mask) & ~mask )
        //uint w = dec_create_info.ulWidth;//PAD_ALIGN(dec_create_info.ulWidth, 0x3F);
        uint h = dec_create_info.ulHeight;//PAD_ALIGN(dec_create_info.ulHeight, 0x0F); //?
#undef PAD_ALIGN
        int size = pitch*h*3/2;
        if (size > host_data_size && host_data) {
            cuMemFreeHost(host_data);
            host_data = 0;
            host_data_size = 0;
        }
        if (!host_data) {
            cuStatus = cuMemAllocHost((void**)&host_data, size);
            if (cuStatus != CUDA_SUCCESS) {
                qWarning("cuMemAllocHost failed (%#x, %s)", cuStatus, _cudaGetErrorEnum(cuStatus));
                cuvidUnmapVideoFrame(dec, devptr);
                cuvidCtxUnlock(vid_ctx_lock, 0);
                return false;
            }
            host_data_size = size;
        }
        if (!host_data) {
            qWarning("No valid staging memory!");
            cuvidUnmapVideoFrame(dec, devptr);
            cuvidCtxUnlock(vid_ctx_lock, 0);
            return false;
        }
        cuStatus = cuMemcpyDtoHAsync(host_data, devptr, size, stream);
        if (cuStatus != CUDA_SUCCESS) {
            qWarning("cuMemcpyDtoHAsync failed (%#x, %s)", cuStatus, _cudaGetErrorEnum(cuStatus));
            cuvidUnmapVideoFrame(dec, devptr);
            cuvidCtxUnlock(vid_ctx_lock, 0);
            return false;
        }
        cuStatus = cuCtxSynchronize();
        if (cuStatus != CUDA_SUCCESS) {
            qWarning("cuCtxSynchronize failed (%#x, %s)", cuStatus, _cudaGetErrorEnum(cuStatus));
        }
        cuvidUnmapVideoFrame(dec, devptr);
        cuvidCtxUnlock(vid_ctx_lock, 0);
        //qDebug("mark not in use pic_index: %d", cuviddisp->picture_index);
        surface_in_use[cuviddisp->picture_index] = false;

        uchar *planes[] = {
            host_data,
            host_data + pitch * h
        };
        int pitches[] = { (int)pitch, (int)pitch };
        VideoFrame frame(codec_ctx->width, codec_ctx->height, VideoFormat::Format_NV12);
        frame.setBits(planes);
        frame.setBytesPerLine(pitches);
        //TODO: is clone required? may crash on clone, I should review clone()
        //frame = frame.clone();
        if (outFrame) {
            *outFrame = frame.clone();
        }
#if COPY_ON_DECODE
        frame_queue.put(frame.clone());
#endif
        //qDebug("frame queue size: %d", frame_queue.size());
    }
    return true;
}
Example #9
int cuda_test_memcpy_async(unsigned int size)
{
	int i;
	CUresult res;
	CUdevice dev;
	CUcontext ctx;
	CUstream stream;
	CUdeviceptr data_addr;
	unsigned int *in, *out;
	struct timeval tv;
	struct timeval tv_total_start, tv_total_end;
	unsigned long total;
	struct timeval tv_h2d_start, tv_h2d_end;
	float h2d;
	struct timeval tv_d2h_start, tv_d2h_end;
	float d2h;

	gettimeofday(&tv_total_start, NULL);

	res = cuInit(0);
	if (res != CUDA_SUCCESS) {
		printf("cuInit failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuDeviceGet(&dev, 0);
	if (res != CUDA_SUCCESS) {
		printf("cuDeviceGet failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuCtxCreate(&ctx, 0, dev);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxCreate failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuStreamCreate(&stream, 0);
	if (res != CUDA_SUCCESS) {
		printf("cuStreamCreate failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuMemAlloc(&data_addr, size);
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuMemAllocHost((void **)&in, size);
	if (res != CUDA_SUCCESS) {
		printf("cuMemAllocHost(in) failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuMemAllocHost((void **)&out, size);
	if (res != CUDA_SUCCESS) {
		printf("cuMemAllocHost(out) failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	for (i = 0; i < size / 4; i++) {
		in[i] = i+1;
		out[i] = 0;
	}

	gettimeofday(&tv_h2d_start, NULL);
	res = cuMemcpyHtoDAsync(data_addr, in, size, stream);
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoDAsync failed: res = %u\n", (unsigned int)res);
		return -1;
	}
	res = cuStreamSynchronize(stream);
	if (res != CUDA_SUCCESS) {
		printf("cuStreamSynchronize() failed: res = %u\n", (unsigned int)res);
		return -1;
	}
	gettimeofday(&tv_h2d_end, NULL);

	gettimeofday(&tv_d2h_start, NULL);
	res = cuMemcpyDtoHAsync(out, data_addr, size, stream);
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyDtoHAsync failed: res = %u\n", (unsigned int)res);
		return -1;
	}
	res = cuStreamSynchronize(stream);
	if (res != CUDA_SUCCESS) {
		printf("cuStreamSynchronize() failed: res = %u\n", (unsigned int)res);
		return -1;
	}
	gettimeofday(&tv_d2h_end, NULL);

	for (i = 0; i < size / 4; i++) {
		if (in[i] != out[i]) {
			printf("in[%d] = %u, out[%d] = %u\n",
				   i, in[i], i, out[i]);
		}
	}

	res = cuMemFreeHost(out);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFreeHost(out) failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuMemFreeHost(in);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFreeHost(in) failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuMemFree(data_addr);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuStreamDestroy(stream);
	if (res != CUDA_SUCCESS) {
		printf("cuStreamDestroy failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuCtxDestroy(ctx);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxDestroy failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	gettimeofday(&tv_total_end, NULL);

	tvsub(&tv_h2d_end, &tv_h2d_start, &tv);
	h2d = tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
	tvsub(&tv_d2h_end, &tv_d2h_start, &tv);
	d2h = tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
	tvsub(&tv_total_end, &tv_total_start, &tv);
	total = tv.tv_sec * 1000 + tv.tv_usec / 1000;

	printf("HtoD: %f\n", h2d);
	printf("DtoH: %f\n", d2h);

	return 0;

end:

	return -1;
}
Example #10
int main() {
  CU_ERROR_CHECK(cuInit(0));

  int count;
  CU_ERROR_CHECK(cuDeviceGetCount(&count));

  count = (count > 2) ? 2 : count;

  CUdevice devices[count];
  for (int i = 0; i < count; i++)
    CU_ERROR_CHECK(cuDeviceGet(&devices[i], i));

  // Question 1:  Can you create multiple contexts on the same device?
  {
    fprintf(stderr, "Attempting to create multiple contexts on each device...\n");
    CUcontext contexts[count * N];
    size_t j = 0;
    for (int i = 0; i < count; i++) {
      CUresult error = CUDA_SUCCESS;
      size_t k;
      for (k = 0; k < N && error == CUDA_SUCCESS; k++) {
        error = cuCtxCreate(&contexts[j], CU_CTX_SCHED_AUTO, devices[i]);
        if (error == CUDA_SUCCESS)
          CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[j++]));
      }
      fprintf(stderr, "  created %zu contexts on device %d before cuCtxCreate returned \"%s\"\n", (k - 1), i, cuGetErrorString(error));
    }

    CUresult error = CUDA_SUCCESS;
    size_t k;
    for (k = 0; k < j && error == CUDA_SUCCESS; k++)
      error = cuCtxPushCurrent(contexts[k]);
    if (error == CUDA_SUCCESS)
      fprintf(stderr, "  successfully pushed %zu contexts with cuCtxPushCurrent\n", k);
    else
      fprintf(stderr, "  pushed %zu contexts before cuCtxPushCurrent returned \"%s\"\n", (k - 1), cuGetErrorString(error));

    for (size_t k = 0; k < j; k++)
      CU_ERROR_CHECK(cuCtxDestroy(contexts[k]));

    fprintf(stderr, "\n");
  }

  CUcontext contexts[count][2];
  for (int i = 0; i < count; i++) {
    for (size_t j = 0; j < 2; j++) {
      CU_ERROR_CHECK(cuCtxCreate(&contexts[i][j], CU_CTX_SCHED_AUTO, devices[i]));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[i][j]));
    }
  }

  // Question 2:  Can you access a host pointer in a different context from
  // which it was created?
  // Question 3:  Can you free a host pointer in a different context from which
  // it was created?
  {
    void * hPtr;
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
    CU_ERROR_CHECK(cuMemAllocHost(&hPtr, 1024));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));

    CUdeviceptr dPtr[count];
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
    CU_ERROR_CHECK(cuMemAlloc(&dPtr[0], 1024)); // Different context, same device
    fprintf(stderr, "Accessing a host pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(cuMemcpyHtoD(dPtr[0], hPtr, 1024)));
    CU_ERROR_CHECK(cuMemFree(dPtr[0]));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      CU_ERROR_CHECK(cuMemAlloc(&dPtr[1], 1024)); // Different context, different device
      fprintf(stderr, "Accessing a host pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(cuMemcpyHtoD(dPtr[1], hPtr, 1024)));
      CU_ERROR_CHECK(cuMemFree(dPtr[1]));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }

    fprintf(stderr, "\n");

    CUresult error = CUDA_ERROR_UNKNOWN;
    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      error = cuMemFreeHost(hPtr);
      fprintf(stderr, "Freeing a host pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
      error = cuMemFreeHost(hPtr);
      fprintf(stderr, "Freeing a host pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
      error = cuMemFreeHost(hPtr);
      fprintf(stderr, "Freeing a host pointer from the same context to which it was allocated returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    }

    fprintf(stderr, "\n");
  }

  // Question 4:  Can you access a device pointer in a different context from
  // which it was created?
  // Question 5:  Can you free a device pointer in a different context from which
  // it was created?
  {
    CUdeviceptr dPtr[count][2];
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
    CU_ERROR_CHECK(cuMemAlloc(&dPtr[0][0], 1024));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
    CU_ERROR_CHECK(cuMemAlloc(&dPtr[0][1], 1024));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));

    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
    fprintf(stderr, "Accessing a device pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(cuMemcpyDtoD(dPtr[0][0], dPtr[0][1], 1024)));
    CU_ERROR_CHECK(cuMemFree(dPtr[0][1]));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));

    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      CU_ERROR_CHECK(cuMemAlloc(&dPtr[1][0], 1024)); // Different context, different device
      fprintf(stderr, "Accessing a device pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(cuMemcpyDtoD(dPtr[0][0], dPtr[1][0], 1024)));
      CU_ERROR_CHECK(cuMemFree(dPtr[1][0]));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }

    fprintf(stderr, "\n");

    CUresult error = CUDA_ERROR_UNKNOWN;
    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      error = cuMemFree(dPtr[0][0]);
      fprintf(stderr, "Freeing a device pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
      error = cuMemFree(dPtr[0][0]);
      fprintf(stderr, "Freeing a device pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
      error = cuMemFree(dPtr[0][0]);
      fprintf(stderr, "Freeing a device pointer from the same context to which it was allocated returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    }

    fprintf(stderr, "\n");
  }

  // Question 6:  Can you access a module in a different context from which it
  // was loaded?
  // Question 7:  Can you unload a module in a different context from which it
  // was loaded?
  {
    CUmodule module;
    CUdeviceptr ptr;
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
    CU_ERROR_CHECK(cuModuleLoad(&module,  "kernel-test.ptx"));
    CU_ERROR_CHECK(cuMemAlloc(&ptr, sizeof(float)));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));

    CUfunction function = 0;
    if (count > 1) { // contexts[1][0] only exists with a second device
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      fprintf(stderr, "Getting a function pointer from a different context to which it was loaded (on a different device) returns \"%s\"\n", cuGetErrorString(cuModuleGetFunction(&function, module, "kernel")));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }
    if (function == 0) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
      fprintf(stderr, "Getting a function pointer from a different context to which it was loaded (on the same device) returns \"%s\"\n", cuGetErrorString(cuModuleGetFunction(&function, module, "kernel")));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    }
    if (function == 0) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
      fprintf(stderr, "Getting a function pointer from the same context to which it was loaded returns \"%s\"\n", cuGetErrorString(cuModuleGetFunction(&function, module, "kernel")));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    }

    fprintf(stderr, "\n");

    CUdeviceptr a, b;
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
    CU_ERROR_CHECK(cuMemAlloc(&a, sizeof(float)));
    CU_ERROR_CHECK(cuMemAlloc(&b, sizeof(float)));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    void * params[] = { &a, & b };

    CUresult error = CUDA_ERROR_UNKNOWN;
    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      fprintf(stderr, "Launching a function from a different context to which it was loaded (on a different device) returns \"%s\"\n", cuGetErrorString(error = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
      fprintf(stderr, "Launching a function from a different context to which it was loaded (on the same device) returns \"%s\"\n", cuGetErrorString(error = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
      fprintf(stderr, "Launching a function from the same context to which it was loaded returns \"%s\"\n", cuGetErrorString(error = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    }

    fprintf(stderr, "\n");

    error = CUDA_ERROR_UNKNOWN;
    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      fprintf(stderr, "Unloading a module from a different context to which it was loaded (on a different device) returns \"%s\"\n", cuGetErrorString(error = cuModuleUnload(module)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
      fprintf(stderr, "Unloading a module from a different context to which it was loaded (on the same device) returns \"%s\"\n", cuGetErrorString(error = cuModuleUnload(module)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
      fprintf(stderr, "Unloading a module from the same context to which it was loaded returns \"%s\"\n", cuGetErrorString(error = cuModuleUnload(module)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    }

    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
    CU_ERROR_CHECK(cuMemFree(a));
    CU_ERROR_CHECK(cuMemFree(b));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
  }

  for (int i = 0; i < count; i++) {
    for (size_t j = 0; j < 2; j++)
      CU_ERROR_CHECK(cuCtxDestroy(contexts[i][j]));
  }

  return 0;
}
Example #11
	CNvidiaNvencCodec(DWORD nCodecInstanceId, const CCodecContextBase& CodecContext) : m_NvidiaNvencCodecContext(static_cast<const CNvidiaNvencCodecContext&>(CodecContext)), m_hNvEncodeAPI64(LoadLibraryA("nvEncodeAPI64.dll"))
	{
		PNVENCODEAPICREATEINSTANCE pNvEncodeAPICreateInstance = reinterpret_cast<PNVENCODEAPICREATEINSTANCE>(GetProcAddress(m_hNvEncodeAPI64, "NvEncodeAPICreateInstance"));
		memset(&m_FunctionList, 0, sizeof(m_FunctionList));
		m_FunctionList.version = NV_ENCODE_API_FUNCTION_LIST_VER;
		NVENCSTATUS nStatus = pNvEncodeAPICreateInstance(&m_FunctionList);
		CHECK_CUDA_DRV_STATUS(cuCtxCreate(&m_Context, 0, 0));
		if (m_NvidiaNvencCodecContext.GetUseSwscaleInsteadOfCuda())
		{
			CHECK_CUDA_DRV_STATUS(cuMemAlloc(&m_pNv12Buffer, m_NvidiaNvencCodecContext.GetWidth() * m_NvidiaNvencCodecContext.GetHeight() * 3 / 2));
			m_nNv12BufferPitch = m_NvidiaNvencCodecContext.GetWidth();
			CHECK_CUDA_DRV_STATUS(cuMemAllocHost(&m_pPageLockedNv12Buffer, m_NvidiaNvencCodecContext.GetWidth() * m_NvidiaNvencCodecContext.GetHeight() * 3 / 2));
			m_pNv12Planes[0] = reinterpret_cast<unsigned char*>(m_pPageLockedNv12Buffer);
			m_pNv12Planes[1] = reinterpret_cast<unsigned char*>(m_pPageLockedNv12Buffer) + m_NvidiaNvencCodecContext.GetWidth() * m_NvidiaNvencCodecContext.GetHeight();
			m_pNv12Strides[0] = m_NvidiaNvencCodecContext.GetWidth();
			m_pNv12Strides[1] = m_NvidiaNvencCodecContext.GetWidth();
			m_SwscaleContext = sws_getContext(m_NvidiaNvencCodecContext.GetWidth(), m_NvidiaNvencCodecContext.GetHeight(), AV_PIX_FMT_BGR32, m_NvidiaNvencCodecContext.GetWidth(), m_NvidiaNvencCodecContext.GetHeight(), AV_PIX_FMT_NV12, 0, 0, 0, 0);
		}
		else
		{
			CHECK_CUDA_DRV_STATUS(cuMemAllocPitch(&m_pNv12Buffer, &m_nNv12BufferPitch, m_NvidiaNvencCodecContext.GetWidth(), m_NvidiaNvencCodecContext.GetHeight() * 3 / 2, 16));
			if (m_NvidiaNvencCodecContext.GetUsePageLockedIntermediateBuffer())
			{
				CHECK_CUDA_DRV_STATUS(cuMemAllocHost(&m_pPageLockedRgb32Buffer, m_NvidiaNvencCodecContext.GetWidth() * m_NvidiaNvencCodecContext.GetHeight() * 4));
			}
			CHECK_CUDA_DRV_STATUS(cuMemAlloc(&m_pRgb32Buffer, m_NvidiaNvencCodecContext.GetWidth() * m_NvidiaNvencCodecContext.GetHeight() * 4));
		}
		CHECK_CUDA_DRV_STATUS(cuStreamCreate(&m_Stream, 0));
		NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS SessionParameters;
		memset(&SessionParameters, 0, sizeof(SessionParameters));
		SessionParameters.version = NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER;
		SessionParameters.apiVersion = NVENCAPI_VERSION;
		SessionParameters.device = m_Context;
		SessionParameters.deviceType = NV_ENC_DEVICE_TYPE_CUDA;
		nStatus = m_FunctionList.nvEncOpenEncodeSessionEx(&SessionParameters, &m_pEncoder);
		m_PictureParameters.version = NV_ENC_PIC_PARAMS_VER;
		auto PresetGuid = NV_ENC_PRESET_HP_GUID;
		NV_ENC_PRESET_CONFIG PresetConfiguration = { NV_ENC_PRESET_CONFIG_VER, 0 };
		PresetConfiguration.presetCfg.version = NV_ENC_CONFIG_VER;
		CHECK_NVENC_STATUS(m_FunctionList.nvEncGetEncodePresetConfig(m_pEncoder, NV_ENC_CODEC_H264_GUID, PresetGuid, &PresetConfiguration));
		NV_ENC_CONFIG EncoderConfiguration = { NV_ENC_CONFIG_VER, 0 };
		EncoderConfiguration = PresetConfiguration.presetCfg;
		EncoderConfiguration.gopLength = NVENC_INFINITE_GOPLENGTH;
		EncoderConfiguration.profileGUID = NV_ENC_H264_PROFILE_BASELINE_GUID;
		EncoderConfiguration.frameIntervalP = 1; // No B frames
		EncoderConfiguration.frameFieldMode = NV_ENC_PARAMS_FRAME_FIELD_MODE_FRAME;
		EncoderConfiguration.encodeCodecConfig.h264Config.idrPeriod = m_NvidiaNvencCodecContext.GetFrameCount();
		EncoderConfiguration.encodeCodecConfig.h264Config.chromaFormatIDC = 1;
		EncoderConfiguration.encodeCodecConfig.h264Config.sliceMode = 0;
		EncoderConfiguration.encodeCodecConfig.h264Config.sliceModeData = 0;
		NV_ENC_INITIALIZE_PARAMS InitializationParameters = { NV_ENC_INITIALIZE_PARAMS_VER, 0 };
		InitializationParameters.encodeGUID = NV_ENC_CODEC_H264_GUID;
		InitializationParameters.presetGUID = PresetGuid;
		InitializationParameters.frameRateNum = m_NvidiaNvencCodecContext.GetFps();
		InitializationParameters.frameRateDen = 1;
#ifdef ASYNCHRONOUS
		InitializationParameters.enableEncodeAsync = 1;
#else
		InitializationParameters.enableEncodeAsync = 0;
#endif
		InitializationParameters.enablePTD = 1; // Let the encoder decide the picture type
		InitializationParameters.reportSliceOffsets = 0;
		InitializationParameters.maxEncodeWidth = m_NvidiaNvencCodecContext.GetWidth();
		InitializationParameters.maxEncodeHeight = m_NvidiaNvencCodecContext.GetHeight();
		InitializationParameters.encodeConfig = &EncoderConfiguration;
		InitializationParameters.encodeWidth = m_NvidiaNvencCodecContext.GetWidth();
		InitializationParameters.encodeHeight = m_NvidiaNvencCodecContext.GetHeight();
		InitializationParameters.darWidth = 16;
		InitializationParameters.darHeight = 9;
		CHECK_NVENC_STATUS(m_FunctionList.nvEncInitializeEncoder(m_pEncoder, &InitializationParameters));
		// Picture parameters that are known ahead of encoding
		m_PictureParameters = { NV_ENC_PIC_PARAMS_VER, 0 };
		m_PictureParameters.codecPicParams.h264PicParams.sliceMode = 0;
		m_PictureParameters.codecPicParams.h264PicParams.sliceModeData = 0;
		m_PictureParameters.inputWidth = m_NvidiaNvencCodecContext.GetWidth();
		m_PictureParameters.inputHeight = m_NvidiaNvencCodecContext.GetHeight();
		m_PictureParameters.bufferFmt = NV_ENC_BUFFER_FORMAT_NV12_PL;
		m_PictureParameters.inputPitch = static_cast<uint32_t>(m_nNv12BufferPitch);
		m_PictureParameters.pictureStruct = NV_ENC_PIC_STRUCT_FRAME;
#ifdef ASYNCHRONOUS
		m_hCompletionEvent = CreateEvent(NULL, FALSE, FALSE, NULL);
		m_EventParameters = { NV_ENC_EVENT_PARAMS_VER, 0 };
		m_EventParameters.completionEvent = m_hCompletionEvent;
		CHECK_NVENC_STATUS(m_FunctionList.nvEncRegisterAsyncEvent(m_pEncoder, &m_EventParameters));
		m_PictureParameters.completionEvent = m_hCompletionEvent;
#endif
		// Register CUDA input pointer
		NV_ENC_REGISTER_RESOURCE RegisterResource = { NV_ENC_REGISTER_RESOURCE_VER, NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR, m_NvidiaNvencCodecContext.GetWidth(), m_NvidiaNvencCodecContext.GetHeight(), static_cast<uint32_t>(m_nNv12BufferPitch), 0, reinterpret_cast<void*>(m_pNv12Buffer), NULL, NV_ENC_BUFFER_FORMAT_NV12_PL };
		CHECK_NVENC_STATUS(m_FunctionList.nvEncRegisterResource(m_pEncoder, &RegisterResource));
		NV_ENC_MAP_INPUT_RESOURCE MapInputResource = { NV_ENC_MAP_INPUT_RESOURCE_VER, 0, 0, RegisterResource.registeredResource };
		m_pRegisteredResource = RegisterResource.registeredResource;
		CHECK_NVENC_STATUS(m_FunctionList.nvEncMapInputResource(m_pEncoder, &MapInputResource));
		m_PictureParameters.inputBuffer = MapInputResource.mappedResource;
		// Create output bitstream buffer
		m_nOutputBitstreamSize = 2 * 1024 * 1024;
		NV_ENC_CREATE_BITSTREAM_BUFFER CreateBitstreamBuffer = { NV_ENC_CREATE_BITSTREAM_BUFFER_VER, m_nOutputBitstreamSize, NV_ENC_MEMORY_HEAP_AUTOSELECT, 0 };
		CHECK_NVENC_STATUS(m_FunctionList.nvEncCreateBitstreamBuffer(m_pEncoder, &CreateBitstreamBuffer));
		m_pOutputBitstream = CreateBitstreamBuffer.bitstreamBuffer;
		m_PictureParameters.outputBitstream = m_pOutputBitstream;
		if (m_NvidiaNvencCodecContext.GetSaveOutputToFile())
		{
			char pOutputFilename[MAX_PATH];
			sprintf_s(pOutputFilename, "nvenc-%d.h264", nCodecInstanceId);
			if (fopen_s(&m_pOutputFile, pOutputFilename, "wb") != 0)
			{
				throw std::runtime_error(std::string("could not open ").append(pOutputFilename).append(" for writing!"));
			}
		}
	}
Example #12
static int init_buffers(NvEncoder *enc)
{
    int ret = 0;
    NVENCSTATUS nvResult;
    CUresult cuResult;

    NV_ENC_REGISTER_RESOURCE reg_res = {0,};

    enc->encBuf.stInputBfr.dwWidth = enc->encCfg.maxWidth;
    enc->encBuf.stInputBfr.dwHeight = enc->encCfg.maxHeight;
    enc->encBuf.stInputBfr.bufferFmt = NV_ENC_BUFFER_FORMAT_NV12_PL;

    ////////////////////////////
    // Allocate INPUT resource

    CUdeviceptr devPtr;

    cuResult = cuMemAlloc(&devPtr, enc->encCfg.maxWidth * enc->encCfg.maxHeight * 3 / 2);
    CHK_CUDA_ERR(cuResult);

    enc->encBuf.stInputBfr.pNV12devPtr = devPtr;
    enc->encBuf.stInputBfr.uNV12Stride = enc->encBuf.stInputBfr.dwWidth;
    enc->encBuf.stInputBfr.LumaSize = enc->encBuf.stInputBfr.uNV12Stride * enc->encBuf.stInputBfr.dwHeight;

#if 0
    /* Optional page-locked host staging buffer (disabled). */
    void *phost;

    cuResult = cuMemAllocHost(&phost,
            enc->encBuf.stInputBfr.uNV12Stride * enc->encBuf.stInputBfr.dwHeight * 3 / 2);
    CHK_CUDA_ERR(cuResult);

    enc->encBuf.stInputBfr.pNV12hostPtr = phost;
    enc->encBuf.stInputBfr.uNV12hostStride = enc->encBuf.stInputBfr.uNV12Stride;
#endif

    ///////////////////////
    // Register and map the CUDA device pointer as an NVENC input resource

    void *resource;
    nvResult = enc->hwEncoder->NvEncRegisterResource(NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR,
            (void*)enc->encBuf.stInputBfr.pNV12devPtr,
            enc->encBuf.stInputBfr.dwWidth, enc->encBuf.stInputBfr.dwHeight,
            enc->encBuf.stInputBfr.uNV12Stride, &resource);
    CHK_NVENC_ERR(nvResult);

    enc->encBuf.stInputBfr.nvRegisteredResource = resource;

    void *insurf;
    nvResult = enc->hwEncoder->NvEncMapInputResource(enc->encBuf.stInputBfr.nvRegisteredResource, &insurf);
    CHK_NVENC_ERR(nvResult);

    enc->encBuf.stInputBfr.hInputSurface = insurf;

    ////////////////////////////
    // Allocate OUTPUT resource

    enc->encBuf.stOutputBfr.dwBitstreamBufferSize = 1024 * 1024; // 1 MiB
    nvResult = enc->hwEncoder->NvEncCreateBitstreamBuffer(enc->encBuf.stOutputBfr.dwBitstreamBufferSize,
            &enc->encBuf.stOutputBfr.hBitstreamBuffer);
    CHK_NVENC_ERR(nvResult);

    enc->encBuf.stOutputBfr.bEOSFlag = false;
    enc->encBuf.stOutputBfr.bWaitOnEvent = false;

    return ret;
}
Example #13
// Run the Cuda part of the computation
bool copyDecodedFrameToTexture(unsigned int &nRepeats, int bUseInterop, int *pbIsProgressive)
{
    CUVIDPARSERDISPINFO oDisplayInfo;

    if (g_pFrameQueue->dequeue(&oDisplayInfo))
    {
        CCtxAutoLock lck(g_CtxLock);
        // Push the current CUDA context (only if we are using CUDA decoding path)
        CUresult result = cuCtxPushCurrent(g_oContext);

        CUdeviceptr  pDecodedFrame[2] = { 0, 0 };
        CUdeviceptr  pInteropFrame[2] = { 0, 0 };

        int num_fields = (oDisplayInfo.progressive_frame ? (1) : (2+oDisplayInfo.repeat_first_field));
        *pbIsProgressive = oDisplayInfo.progressive_frame;
        g_bIsProgressive = oDisplayInfo.progressive_frame ? true : false;

        for (int active_field=0; active_field<num_fields; active_field++)
        {
            nRepeats = oDisplayInfo.repeat_first_field;
            CUVIDPROCPARAMS oVideoProcessingParameters;
            memset(&oVideoProcessingParameters, 0, sizeof(CUVIDPROCPARAMS));

            oVideoProcessingParameters.progressive_frame = oDisplayInfo.progressive_frame;
            oVideoProcessingParameters.second_field      = active_field;
            oVideoProcessingParameters.top_field_first   = oDisplayInfo.top_field_first;
            oVideoProcessingParameters.unpaired_field    = (num_fields == 1);

            unsigned int nDecodedPitch = 0;
            unsigned int nWidth = 0;
            unsigned int nHeight = 0;

            // map decoded video frame to CUDA surface
            g_pVideoDecoder->mapFrame(oDisplayInfo.picture_index, &pDecodedFrame[active_field], &nDecodedPitch, &oVideoProcessingParameters);
            nWidth  = g_pVideoDecoder->targetWidth();
            nHeight = g_pVideoDecoder->targetHeight();
            // map DirectX texture to CUDA surface
            size_t nTexturePitch = 0;

            // If we are Encoding and this is the 1st Frame, we make sure we allocate system memory for readbacks
            if (g_bReadback && g_bFirstFrame && g_ReadbackSID)
            {
                CUresult result;
                checkCudaErrors(result = cuMemAllocHost((void **)&g_bFrameData[0], (nDecodedPitch * nHeight * 3 / 2)));
                checkCudaErrors(result = cuMemAllocHost((void **)&g_bFrameData[1], (nDecodedPitch * nHeight * 3 / 2)));
                g_bFirstFrame = false;

                if (result != CUDA_SUCCESS)
                {
                    printf("cuMemAllocHost returned %d\n", (int)result);
                }
            }

            // If streams are enabled, we can perform the readback to the host while the kernel is executing
            if (g_bReadback && g_ReadbackSID)
            {
                CUresult result = cuMemcpyDtoHAsync(g_bFrameData[active_field], pDecodedFrame[active_field], (nDecodedPitch * nHeight * 3 / 2), g_ReadbackSID);

                if (result != CUDA_SUCCESS)
                {
                    printf("cuMemAllocHost returned %d\n", (int)result);
                }
            }

#if ENABLE_DEBUG_OUT
            printf("%s = %02d, PicIndex = %02d, OutputPTS = %08d\n",
                   (oDisplayInfo.progressive_frame ? "Frame" : "Field"),
                   g_DecodeFrameCount, oDisplayInfo.picture_index, oDisplayInfo.timestamp);
#endif

            if (g_pImageDX)
            {
                // map the texture surface
                g_pImageDX->map(&pInteropFrame[active_field], &nTexturePitch, active_field);
            }
            else
            {
                pInteropFrame[active_field] = g_pInteropFrame[active_field];
                nTexturePitch = g_pVideoDecoder->targetWidth() * 2;
            }

            // perform post processing on the CUDA surface (performs color space conversion and post processing)
            // comment this out if we include the line of code seen above
            cudaPostProcessFrame(&pDecodedFrame[active_field], nDecodedPitch, &pInteropFrame[active_field], nTexturePitch, g_pCudaModule->getModule(), gfpNV12toARGB, g_KernelSID);

            if (g_pImageDX)
            {
                // unmap the texture surface
                g_pImageDX->unmap(active_field);
            }

            // unmap video frame
            // unmapFrame() synchronizes with the VideoDecode API (ensures the frame has finished decoding)
            g_pVideoDecoder->unmapFrame(pDecodedFrame[active_field]);
            // release the frame, so it can be re-used in decoder
            g_pFrameQueue->releaseFrame(&oDisplayInfo);
            g_DecodeFrameCount++;
        }

        // Detach from the Current thread
        checkCudaErrors(cuCtxPopCurrent(NULL));
    }
    else
    {
        return false;
    }

    // check if decoding has come to an end.
    // if yes, signal the app to shut down.
    if (!g_pVideoSource->isStarted() || g_pFrameQueue->isEndOfDecode())
    {
        // Let's free the Frame Data
        if (g_ReadbackSID && g_bFrameData)
        {
            cuMemFreeHost((void *)g_bFrameData[0]);
            cuMemFreeHost((void *)g_bFrameData[1]);
            g_bFrameData[0] = NULL;
            g_bFrameData[1] = NULL;
        }

        // Let's just stop, and allow the user to quit, so they can at least see the results
        g_pVideoSource->stop();

        // If we want to loop reload the video file and restart
        if (g_bLoop && !g_bAutoQuit)
        {
            reinitCudaResources();
            g_FrameCount = 0;
            g_DecodeFrameCount = 0;
            g_pVideoSource->start();
        }

        if (g_bAutoQuit)
        {
            g_bDone = true;
        }
    }

    return true;
}