Example #1
CAMLprim value spoc_cuda_flush(value gi, value dev, value q){
	CAMLparam3(gi, dev, q);
	cuda_event_list *events;
	events =  (cuda_event_list*)(Field(dev,3));

	CUDA_GET_CONTEXT;

	cuStreamSynchronize(queue[Int_val(q)]);
	/*while(events != NULL && events->next != NULL)
	{
		if (events != NULL) {
			CUDA_CHECK_CALL(cuEventSynchronize (events->evt));
			if (events->vec)
			{
				cuMemFree (events->vec);
			}
			CUDA_CHECK_CALL(cuEventDestroy(events->evt));
		}
		{cuda_event_list *tmp= events;
		events = events->next;
		if (tmp != NULL)
			free(tmp);}
	}
	if (events) free(events);
	events = NULL;*/
	CUDA_RESTORE_CONTEXT;
	Store_field(dev, 3, (value)events);

	CAMLreturn(Val_unit);
}
Example #2
static void
nvptx_wait_all (void)
{
  CUresult r;
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  /* Wait for active streams initiated by this thread (or by multiple threads)
     to complete.  */
  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if (s->multithreaded || pthread_equal (s->host_thread, self))
	{
	  r = cuStreamQuery (s->stream);
	  if (r == CUDA_SUCCESS)
	    continue;
	  else if (r != CUDA_ERROR_NOT_READY)
	    GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

	  r = cuStreamSynchronize (s->stream);
	  if (r != CUDA_SUCCESS)
	    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
	}
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);
}
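Note: the query-then-synchronize pattern above avoids blocking on streams that are already idle. A minimal standalone sketch of the same idea, assuming only a valid CUstream and <cuda.h> (the helper name wait_if_busy is ours, not from the source):

static void
wait_if_busy (CUstream stream)
{
  CUresult r = cuStreamQuery (stream);  /* non-blocking status probe */
  if (r == CUDA_SUCCESS)
    return;                             /* stream already drained */
  if (r != CUDA_ERROR_NOT_READY)
    return;                             /* real error: report as needed */
  /* Work still pending: block until it completes.  */
  cuStreamSynchronize (stream);
}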
Example #3
 // Describes how to run the CLBlast routine
 static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
   #ifdef OPENCL_API
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Col2im<T>(args.kernel_mode,
                             args.channels, args.height, args.width,
                             args.kernel_h, args.kernel_w,
                             args.pad_h, args.pad_w,
                             args.stride_h, args.stride_w,
                             args.dilation_h, args.dilation_w,
                             buffers.b_mat(), args.b_offset, // col
                             buffers.a_mat(), args.a_offset, // im
                             &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
   #elif CUDA_API
     auto status = Col2im<T>(args.kernel_mode,
                             args.channels, args.height, args.width,
                             args.kernel_h, args.kernel_w,
                             args.pad_h, args.pad_w,
                             args.stride_h, args.stride_w,
                             args.dilation_h, args.dilation_w,
                             buffers.b_mat(), args.b_offset, // col
                             buffers.a_mat(), args.a_offset, // im
                             queue.GetContext()(), queue.GetDevice()());
     cuStreamSynchronize(queue());
   #endif
   return status;
 }
Example #4
 // Describes how to run the CLBlast routine
 static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
   #ifdef OPENCL_API
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Scal(args.n, args.alpha,
                        buffers.x_vec(), args.x_offset, args.x_inc,
                        &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
   #elif CUDA_API
     auto status = Scal(args.n, args.alpha,
                        buffers.x_vec(), args.x_offset, args.x_inc,
                        queue.GetContext()(), queue.GetDevice()());
     cuStreamSynchronize(queue());
   #endif
   return status;
 }
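Note: in the CUDA branches of Examples #3 and #4 the return value of cuStreamSynchronize is discarded. A minimal checked variant, assuming nothing beyond the driver API (sync_stream_checked is our name; cuGetErrorName and cuGetErrorString are standard driver-API calls):

#include <cuda.h>
#include <stdio.h>

static int sync_stream_checked(CUstream stream) {
  CUresult r = cuStreamSynchronize(stream);
  if (r != CUDA_SUCCESS) {
    const char *name = NULL, *desc = NULL;
    cuGetErrorName(r, &name);    // e.g. "CUDA_ERROR_LAUNCH_FAILED"
    cuGetErrorString(r, &desc);  // human-readable description
    fprintf(stderr, "cuStreamSynchronize: %s (%s)\n",
            name ? name : "?", desc ? desc : "?");
    return -1;
  }
  return 0;
}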
Example #5
static void
nvptx_wait (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamSynchronize (s->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));

  event_gc (true);
}
Example #6
void CudaModuleScene::onAfterLaunchApexCudaFunc(const ApexCudaFunc& func, CUstream stream)
{
	if (mCudaProfileSession)
	{
		mCudaProfileSession->onFuncFinish(func.getProfileId(), stream);
	}

#if !CUDA_KERNEL_CHECK_ALWAYS
	if (mSceneIntl.getCudaKernelCheckEnabled())
#endif
	{
		CUresult ret = cuStreamSynchronize(stream);
		if ( CUDA_SUCCESS != ret )
		{
			APEX_INTERNAL_ERROR("Cuda Error %d after launch of func '%s'", ret, func.getName());
			PX_ALWAYS_ASSERT();
		}
	}
}
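Note: Example #6 gates the blocking check behind a debug flag so release builds stay asynchronous. A hedged sketch of the same pattern, assuming <cuda.h> and <stdio.h> (KERNEL_CHECK and check_after_launch are placeholder names, not APEX symbols):

static void check_after_launch(CUstream stream, const char *kernel_name)
{
#ifdef KERNEL_CHECK
	/* Debug builds: surface launch failures at the call site by
	   draining the stream right after the launch. */
	CUresult ret = cuStreamSynchronize(stream);
	if (ret != CUDA_SUCCESS)
		fprintf(stderr, "kernel '%s' failed: %d\n", kernel_name, (int)ret);
#else
	(void)stream; (void)kernel_name;  /* release builds stay asynchronous */
#endif
}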
Example #7
bool GLInteropResource::unmap(GLuint tex)
{
    Q_UNUSED(tex);
    if (WORKAROUND_UNMAP_CONTEXT_SWITCH)
        return true;
    int plane = -1;
    if (res[0].texture == tex)
        plane = 0;
    else if (res[1].texture == tex)
        plane = 1;
    else
        return false;
    // FIXME: why does cuCtxPushCurrent give CUDA_ERROR_INVALID_CONTEXT if the OpenGL viewport changed?
    CUDA_WARN(cuCtxPushCurrent(ctx));
    CUDA_WARN(cuStreamSynchronize(res[plane].stream));
    // FIXME: need a correct context. But why do we have to push a context even though map/unmap are called in the same thread?
    // Probably because the decoder switches the context in another thread, so we have to switch it back.
    // To work around the context issue, we must pop the context that was valid in map() and push it here.
    CUDA_ENSURE(cuGraphicsUnmapResources(1, &res[plane].cuRes, 0), false);
    CUDA_ENSURE(cuCtxPopCurrent(&ctx), false);
    return true;
}
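Note: Example #7 brackets the stream sync with a context push/pop because the CUDA context may be current on another thread. A minimal sketch of that bracket, assuming <cuda.h> and that ctx owns stream (sync_in_context is our name):

static CUresult sync_in_context(CUcontext ctx, CUstream stream)
{
    CUresult r = cuCtxPushCurrent(ctx); // make ctx current on this thread
    if (r != CUDA_SUCCESS)
        return r;
    r = cuStreamSynchronize(stream);    // drain the stream
    CUcontext popped;
    cuCtxPopCurrent(&popped);           // restore the previous context
    return r;
}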
Example #8
/*
// Property Message
//
// API
//static int getPathOfFeaturePyramidGPUStream(IplImage * image, float step,
          int numStep, int startIndex, int sideLength, int bx, int by,
          CvLSVMFeaturePyramid **maps)
// INPUT
// image
// step
// numStep
// startIndex
// sideLength
// bx
// by
// OUTPUT
// maps
// RESULT
// Error status
*/
static int getPathOfFeaturePyramidGPUStream(IplImage * image, float step,
        int numStep, int startIndex, int sideLength, int bx, int by,
        CvLSVMFeaturePyramid **maps)
{
    CvLSVMFeatureMap **feature_maps;

    int i;
    int width, height, numChannels, sizeX, sizeY, p, pp, newSizeX, newSizeY;
    float *scales;
    CvLSVMFeatureMapGPU **devs_img, **devs_map_pre_norm, **devs_map_pre_pca;
    CUstream *streams;
    CUresult res;

    scales = (float *) malloc(sizeof(float) * (numStep));
    devs_img = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * (numStep));
    devs_map_pre_norm = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * (numStep));
    devs_map_pre_pca = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * (numStep));
    streams = (CUstream *) malloc(sizeof(CUstream) * (numStep));
    feature_maps = (CvLSVMFeatureMap **) malloc(
            sizeof(CvLSVMFeatureMap *) * (numStep));

    // allocate device memory
    for (i = 0; i < numStep; i++)
    {
        scales[i] = 1.0f / powf(step, (float) i);
        width  = (int) (((float) image->width ) * scales[i] + 0.5);
        height = (int) (((float) image->height) * scales[i] + 0.5);
        numChannels = image->nChannels;
        sizeX = width  / sideLength;
        sizeY = height / sideLength;
        p  = NUM_SECTOR * 3;
        pp = NUM_SECTOR * 12;
        newSizeX = sizeX - 2;
        newSizeY = sizeY - 2;

        allocFeatureMapObjectGPU<float>(&devs_img[i], width, height,
                numChannels);
        allocFeatureMapObjectGPU<float>(&devs_map_pre_norm[i], sizeX, sizeY, p);
        allocFeatureMapObjectGPU<float>(&devs_map_pre_pca[i], newSizeX,
                newSizeY, pp);
        res = cuStreamCreate(&streams[i], CU_STREAM_DEFAULT);
        CUDA_CHECK(res, "cuStreamCreate(stream)");
    }

    // execute the main functions
    resizeGPUStream(numStep, image, scales, devs_img, streams);

    getFeatureMapsGPUStream(numStep, sideLength, devs_img, devs_map_pre_norm,
            streams);

    normalizeAndTruncateGPUStream(numStep, Val_Of_Truncate, devs_map_pre_norm,
            devs_map_pre_pca, streams);

    PCAFeatureMapsGPUStream(numStep, bx, by, devs_map_pre_pca, feature_maps,
            streams);

    // synchronize cuda stream
    for (i = 0; i < numStep; i++)
    {
        cuStreamSynchronize(streams[i]);
        cuStreamDestroy(streams[i]);
    }

    for (i = 0; i < numStep; i++)
    {
        (*maps)->pyramid[startIndex + i] = feature_maps[i];
    }/*for(i = 0; i < numStep; i++)*/

    // free device memory
    for (i = 0; i < numStep; i++)
    {
        freeFeatureMapObjectGPU(&devs_img[i]);
        freeFeatureMapObjectGPU(&devs_map_pre_norm[i]);
        freeFeatureMapObjectGPU(&devs_map_pre_pca[i]);
    }

    free(scales);
    free(devs_img);
    free(devs_map_pre_norm);
    free(devs_map_pre_pca);
    free(streams);
    free(feature_maps);

    return LATENT_SVM_OK;
}
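Note: Example #8 follows a fan-out/fan-in pattern: one stream per pyramid step, asynchronous launches, then a per-stream synchronize-and-destroy pass. A reduced sketch of just that skeleton (launch_work stands in for the *GPUStream helpers and is hypothetical):

#include <cuda.h>
#include <stdlib.h>

void launch_work(CUstream s); /* hypothetical: enqueues async kernels/copies */

CUresult run_on_streams(int n)
{
    CUstream *streams = (CUstream *) malloc(sizeof(CUstream) * n);
    CUresult res = CUDA_SUCCESS;
    int i;
    for (i = 0; i < n; i++)
    {
        res = cuStreamCreate(&streams[i], CU_STREAM_DEFAULT);
        if (res != CUDA_SUCCESS) { n = i; break; }
    }
    for (i = 0; i < n; i++)
        launch_work(streams[i]);         // fan-out: async work per stream
    for (i = 0; i < n; i++)
    {
        cuStreamSynchronize(streams[i]); // fan-in: wait for each stream
        cuStreamDestroy(streams[i]);
    }
    free(streams);
    return res;
}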
Example #9
/*
// Feature map Normalization and Truncation in GPU
//
// API
//int normalizeAndTruncateGPUStream(const int numStep, const float alfa,
          CvLSVMFeatureMapGPU **devs_map_in, CvLSVMFeatureMapGPU **devs_map_out,
          CUstream *streams)
// INPUT
// numStep
// alfa
// devs_map_in
// streams
// OUTPUT
// devs_map_out
// RESULT
// Error status
*/
int normalizeAndTruncateGPUStream(const int numStep, const float alfa,
        CvLSVMFeatureMapGPU **devs_map_in, CvLSVMFeatureMapGPU **devs_map_out,
        CUstream *streams)
{

    int sizeX, sizeY, newSizeX, newSizeY, pp;
    int size_norm, size_map_out;
    int i;
    CUresult res;
    CvLSVMFeatureMapGPU **devs_norm;

    pp = NUM_SECTOR * 12;

    devs_norm = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * (numStep));

    // allocate device memory
    for (i = 0; i < numStep; i++)
    {
        sizeX = devs_map_in[i]->sizeX;
        sizeY = devs_map_in[i]->sizeY;
        newSizeX = sizeX - 2;
        newSizeY = sizeY - 2;

        allocFeatureMapObjectGPU<float>(&devs_norm[i], sizeX, sizeY, 1);
    }

    // execute asynchronously
    for (i = 0; i < numStep; i++)
    {
        sizeX = devs_map_in[i]->sizeX;
        sizeY = devs_map_in[i]->sizeY;
        newSizeX = sizeX - 2;
        newSizeY = sizeY - 2;
        size_norm = sizeX * sizeY;
        size_map_out = newSizeX * newSizeY * pp;

        // initialize device memory to 0
        res = cuMemsetD32Async(devs_norm[i]->map, 0, size_norm, streams[i]);
        CUDA_CHECK(res, "cuMemset(dev_norm)");
        res = cuMemsetD32Async(devs_map_out[i]->map, 0, size_map_out,
                streams[i]);
        CUDA_CHECK(res, "cuMemset(dev_map_out)");

        // launch kernel
        calculateNormGPULaunch(devs_map_in[i], devs_norm[i], streams[i]);

    }

    for (i = 0; i < numStep; i++)
    {
        // launch kernel
        normalizeGPULaunch(alfa, devs_map_in[i], devs_norm[i], devs_map_out[i],
                streams[i]);
    }

    // synchronize cuda stream
    for (i = 0; i < numStep; i++)
    {
        cuStreamSynchronize(streams[i]);
    }

    // free device memory
    for (i = 0; i < numStep; i++)
    {
        freeFeatureMapObjectGPU(&devs_norm[i]);
    }

    free(devs_norm);

    return LATENT_SVM_OK;
}
Example #10
int cuda_test_memcpy_async(unsigned int size)
{
	int i;
	CUresult res;
	CUdevice dev;
	CUcontext ctx;
	CUstream stream;
	CUdeviceptr data_addr;
	unsigned int *in, *out;
	struct timeval tv;
	struct timeval tv_total_start, tv_total_end;
	unsigned long total;
	struct timeval tv_h2d_start, tv_h2d_end;
	float h2d;
	struct timeval tv_d2h_start, tv_d2h_end;
	float d2h;

	gettimeofday(&tv_total_start, NULL);

	res = cuInit(0);
	if (res != CUDA_SUCCESS) {
		printf("cuInit failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuDeviceGet(&dev, 0);
	if (res != CUDA_SUCCESS) {
		printf("cuDeviceGet failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuCtxCreate(&ctx, 0, dev);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxCreate failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuStreamCreate(&stream, 0);
	if (res != CUDA_SUCCESS) {
		printf("cuStreamCreate failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuMemAlloc(&data_addr, size);
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuMemAllocHost((void **)&in, size);
	if (res != CUDA_SUCCESS) {
		printf("cuMemAllocHost(in) failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuMemAllocHost((void **)&out, size);
	if (res != CUDA_SUCCESS) {
		printf("cuMemAllocHost(out) failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	for (i = 0; i < size / 4; i++) {
		in[i] = i+1;
		out[i] = 0;
	}

	gettimeofday(&tv_h2d_start, NULL);
	res = cuMemcpyHtoDAsync(data_addr, in, size, stream);
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoDAsync failed: res = %u\n", (unsigned int)res);
		return -1;
	}
	res = cuStreamSynchronize(stream);
	if (res != CUDA_SUCCESS) {
		printf("cuStreamSynchronize() failed: res = %u\n", (unsigned int)res);
		return -1;
	}
	gettimeofday(&tv_h2d_end, NULL);

	gettimeofday(&tv_d2h_start, NULL);
	res = cuMemcpyDtoHAsync(out, data_addr, size, stream);
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyDtoHAsync failed: res = %u\n", (unsigned int)res);
		return -1;
	}
	res = cuStreamSynchronize(stream);
	if (res != CUDA_SUCCESS) {
		printf("cuStreamSynchronize() failed: res = %u\n", (unsigned int)res);
		return -1;
	}
	gettimeofday(&tv_d2h_end, NULL);

	for (i = 0; i < size / 4; i++) {
		if (in[i] != out[i]) {
			printf("in[%d] = %u, out[%d] = %u\n",
				   i, in[i], i, out[i]);
		}
	}

	res = cuMemFreeHost(out);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFreeHost(out) failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuMemFreeHost(in);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFreeHost(in) failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuMemFree(data_addr);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuStreamDestroy(stream);
	if (res != CUDA_SUCCESS) {
		printf("cuStreamDestroy failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuCtxDestroy(ctx);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxDestroy failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	gettimeofday(&tv_total_end, NULL);

	tvsub(&tv_h2d_end, &tv_h2d_start, &tv);
	h2d = tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
	tvsub(&tv_d2h_end, &tv_d2h_start, &tv);
	d2h = tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
	tvsub(&tv_total_end, &tv_total_start, &tv);
	total = tv.tv_sec * 1000 + tv.tv_usec / 1000;

	printf("HtoD: %f\n", h2d);
	printf("DtoH: %f\n", d2h);

	return 0;
}
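Note: Example #10 times the copies with gettimeofday around a host-side synchronize. CUDA events measure on the GPU timeline instead and need only one blocking call per measurement; a hedged alternative sketch, assuming <cuda.h> (time_h2d_ms is our name):

static float time_h2d_ms(CUdeviceptr dst, const void *src, size_t size, CUstream s)
{
	CUevent start, stop;
	float ms = 0.0f;
	cuEventCreate(&start, CU_EVENT_DEFAULT);
	cuEventCreate(&stop, CU_EVENT_DEFAULT);
	cuEventRecord(start, s);
	cuMemcpyHtoDAsync(dst, src, size, s);
	cuEventRecord(stop, s);
	cuEventSynchronize(stop);             /* block until `stop` is reached */
	cuEventElapsedTime(&ms, start, stop); /* elapsed milliseconds */
	cuEventDestroy(start);
	cuEventDestroy(stop);
	return ms;
}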
Example #11
void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
	    int async, unsigned *dims, void *targ_mem_desc)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  CUresult r;
  int i;
  struct ptx_stream *dev_str;
  void *kargs[1];
  void *hp, *dp;
  struct nvptx_thread *nvthd = nvptx_thread ();
  const char *maybe_abort_msg = "(perhaps abort was called)";

  function = targ_fn->fn;

  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
  assert (dev_str == nvthd->current_stream);

  /* Initialize the launch dimensions.  Typically this is constant,
     provided by the device compiler, but we must permit runtime
     values.  */
  for (i = 0; i != 3; i++)
    if (targ_fn->launch->dim[i])
      dims[i] = targ_fn->launch->dim[i];

  /* This reserves a chunk of a pre-allocated page of memory mapped on both
     the host and the device. HP is a host pointer to the new chunk, and DP is
     the corresponding device pointer.  */
  map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);

  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  /* Copy the array of arguments to the mapped page.  */
  for (i = 0; i < mapnum; i++)
    ((void **) hp)[i] = devaddrs[i];

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
		     " gangs=%u, workers=%u, vectors=%u\n",
		     __FUNCTION__, targ_fn->launch->fn,
		     dims[0], dims[1], dims[2]);

  // OpenACC		CUDA
  //
  // num_gangs		nctaid.x
  // num_workers	ntid.y
  // vector length	ntid.x

  kargs[0] = &dp;
  r = cuLaunchKernel (function,
		      dims[GOMP_DIM_GANG], 1, 1,
		      dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
		      0, dev_str->stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
    {
      r = cuStreamSynchronize (dev_str->stream);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
			   maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
    }
  else
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
			   maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (true);

      r = cuEventRecord (*e, dev_str->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_KNL, e, (void *)dev_str);
    }
#else
  r = cuCtxSynchronize ();
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
#endif

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
		     targ_fn->launch->fn);

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
#endif
    map_pop (dev_str);
}
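Note: the async branch of Example #11 records an event instead of calling cuStreamSynchronize, deferring the wait to a later garbage-collection pass. A stripped-down sketch of that record-then-poll idea, assuming <cuda.h> (record_completion and is_done are our names, not libgomp's):

static CUresult
record_completion (CUstream stream, CUevent *out)
{
  CUresult r = cuEventCreate (out, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    return r;
  return cuEventRecord (*out, stream);
}

/* Later, e.g. from an event_gc-style sweep: */
static int
is_done (CUevent e)
{
  /* CUDA_ERROR_NOT_READY means work recorded before `e` is still running.  */
  return cuEventQuery (e) == CUDA_SUCCESS;
}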
Example #12
bool GLInteropResource::map(int picIndex, const CUVIDPROCPARAMS &param, GLuint tex, int w, int h, int H, int plane)
{
    AutoCtxLock locker((cuda_api*)this, lock);
    Q_UNUSED(locker);
    if (!ensureResource(w, h, H, tex, plane)) // TODO surface size instead of frame size because we copy the device data
        return false;
    //CUDA_ENSURE(cuCtxPushCurrent(ctx), false);
    CUdeviceptr devptr;
    unsigned int pitch;

    CUDA_ENSURE(cuvidMapVideoFrame(dec, picIndex, &devptr, &pitch, const_cast<CUVIDPROCPARAMS*>(&param)), false);
    CUVIDAutoUnmapper unmapper(this, dec, devptr);
    Q_UNUSED(unmapper);
    // TODO: why can't we use res[plane].stream? CUDA_ERROR_INVALID_HANDLE
    CUDA_ENSURE(cuGraphicsMapResources(1, &res[plane].cuRes, 0), false);
    CUarray array;
    CUDA_ENSURE(cuGraphicsSubResourceGetMappedArray(&array, res[plane].cuRes, 0, 0), false);

    CUDA_MEMCPY2D cu2d;
    memset(&cu2d, 0, sizeof(cu2d));
    cu2d.srcDevice = devptr;
    cu2d.srcMemoryType = CU_MEMORYTYPE_DEVICE;
    cu2d.srcPitch = pitch;
    cu2d.dstArray = array;
    cu2d.dstMemoryType = CU_MEMORYTYPE_ARRAY;
    cu2d.dstPitch = pitch;
    // the whole size or copy size?
    cu2d.WidthInBytes = pitch;
    cu2d.Height = h;
    if (plane == 1) {
        cu2d.srcXInBytes = 0;// +srcY*srcPitch + srcXInBytes
        cu2d.srcY = H; // skip the padding height
        cu2d.Height /= 2;
    }
    if (res[plane].stream)
        CUDA_ENSURE(cuMemcpy2DAsync(&cu2d, res[plane].stream), false);
    else
        CUDA_ENSURE(cuMemcpy2D(&cu2d), false);
    //TODO: delay cuCtxSynchronize && unmap. do it in unmap(tex)?
    // mapping an already mapped resource will crash, and sometimes the resource cannot be unmapped in unmap(tex) because of a context switch error,
    // so we simply unmap the resource here
    if (WORKAROUND_UNMAP_CONTEXT_SWITCH) {
        if (res[plane].stream) {
            //CUDA_WARN(cuCtxSynchronize(), false); //wait too long time? use cuStreamQuery?
            CUDA_WARN(cuStreamSynchronize(res[plane].stream)); //slower than CtxSynchronize
        }
        /*
         * This function provides the synchronization guarantee that any CUDA work issued
         * in \p stream before ::cuGraphicsUnmapResources() will complete before any
         * subsequently issued graphics work begins.
         * The graphics API from which \p resources were registered
         * should not access any resources while they are mapped by CUDA. If an
         * application does so, the results are undefined.
         */
        CUDA_ENSURE(cuGraphicsUnmapResources(1, &res[plane].cuRes, 0), false);
    } else {
        // call it at last. current context will be used by other cuda calls (unmap() for example)
        CUDA_ENSURE(cuCtxPopCurrent(&ctx), false); // not required
    }
    return true;
}
Example #13
bool EGLInteropResource::map(int picIndex, const CUVIDPROCPARAMS &param, GLuint tex, int w, int h, int H, int plane)
{
    // plane is always 0 because frame is rgb
    AutoCtxLock locker((cuda_api*)this, lock);
    Q_UNUSED(locker);
    if (!ensureResource(w, h, param.Reserved[0], H, tex)) // TODO surface size instead of frame size because we copy the device data
        return false;
    //CUDA_ENSURE(cuCtxPushCurrent(ctx), false);
    CUdeviceptr devptr;
    unsigned int pitch;

    CUDA_ENSURE(cuvidMapVideoFrame(dec, picIndex, &devptr, &pitch, const_cast<CUVIDPROCPARAMS*>(&param)), false);
    CUVIDAutoUnmapper unmapper(this, dec, devptr);
    Q_UNUSED(unmapper);
    // TODO: why can't we use res[plane].stream? CUDA_ERROR_INVALID_HANDLE
    CUDA_ENSURE(cuGraphicsMapResources(1, &res[plane].cuRes, 0), false);
    CUarray array;
    CUDA_ENSURE(cuGraphicsSubResourceGetMappedArray(&array, res[plane].cuRes, 0, 0), false);
    CUDA_ENSURE(cuGraphicsUnmapResources(1, &res[plane].cuRes, 0), false); // mapped array still accessible!

    CUDA_MEMCPY2D cu2d;
    memset(&cu2d, 0, sizeof(cu2d));
    // Y plane
    cu2d.srcDevice = devptr;
    cu2d.srcMemoryType = CU_MEMORYTYPE_DEVICE;
    cu2d.srcPitch = pitch;
    cu2d.dstArray = array;
    cu2d.dstMemoryType = CU_MEMORYTYPE_ARRAY;
    cu2d.dstPitch = pitch;
    // the whole size or copy size?
    cu2d.WidthInBytes = res[plane].W; // the same value as texture9_nv12
    cu2d.Height = H*3/2;
    if (res[plane].stream)
        CUDA_ENSURE(cuMemcpy2DAsync(&cu2d, res[plane].stream), false);
    else
        CUDA_ENSURE(cuMemcpy2D(&cu2d), false);
    //TODO: delay cuCtxSynchronize && unmap. do it in unmap(tex)?
    // mapping an already mapped resource will crash, and sometimes the resource cannot be unmapped in unmap(tex) because of a context switch error,
    // so we simply unmap the resource here
    if (WORKAROUND_UNMAP_CONTEXT_SWITCH) {
        if (res[plane].stream) {
            //CUDA_WARN(cuCtxSynchronize(), false); //wait too long time? use cuStreamQuery?
            CUDA_WARN(cuStreamSynchronize(res[plane].stream)); //slower than CtxSynchronize
        }
        /*
         * This function provides the synchronization guarantee that any CUDA work issued
         * in \p stream before ::cuGraphicsUnmapResources() will complete before any
         * subsequently issued graphics work begins.
         * The graphics API from which \p resources were registered
         * should not access any resources while they are mapped by CUDA. If an
         * application does so, the results are undefined.
         */
//        CUDA_ENSURE(cuGraphicsUnmapResources(1, &res[plane].cuRes, 0), false);
    }
    D3DLOCKED_RECT rect_src, rect_dst;
    DX_ENSURE(texture9_nv12->LockRect(0, &rect_src, NULL, D3DLOCK_READONLY), false);
    DX_ENSURE(surface9_nv12->LockRect(&rect_dst, NULL, D3DLOCK_DISCARD), false);
    memcpy(rect_dst.pBits, rect_src.pBits, res[plane].W*H*3/2); // exactly w and h
    DX_ENSURE(surface9_nv12->UnlockRect(), false);
    DX_ENSURE(texture9_nv12->UnlockRect(0), false);
#if 0
    //IDirect3DSurface9 *raw_surface = NULL;
    //DX_ENSURE(texture9_nv12->GetSurfaceLevel(0, &raw_surface), false);
    const RECT src = { 0, 0, w, h*3/2};
    DX_ENSURE(device9->StretchRect(raw_surface, &src, surface9_nv12, NULL, D3DTEXF_NONE), false);
#endif
    if (!map(surface9_nv12, tex, w, h, H))
        return false;
    return true;
}
Example #14
 void device_t<CUDA>::finish(){
   OCCA_CUDA_CHECK("Device: Finish",
                   cuStreamSynchronize(*((CUstream*) dev->currentStream)) );
 }
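Note: OCCA_CUDA_CHECK in Example #14 wraps the call with error reporting. A minimal macro in the same spirit, assuming only the driver API plus stdio/stdlib (CU_CHECK is our name, not OCCA's):

#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

#define CU_CHECK(msg, call)                                        \
  do {                                                             \
    CUresult _r = (call);                                          \
    if (_r != CUDA_SUCCESS) {                                      \
      const char *_s = NULL;                                       \
      cuGetErrorString(_r, &_s);                                   \
      fprintf(stderr, "%s: %s\n", (msg), _s ? _s : "unknown");     \
      abort();                                                     \
    }                                                              \
  } while (0)

Usage would then mirror the example: CU_CHECK("Device: Finish", cuStreamSynchronize(stream));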
Example #15
static void vq_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
	VirtQueueElement elem;
	
	while(virtqueue_pop(vq, &elem)) {
		struct param *p = elem.out_sg[0].iov_base;
	
		//for all library routines: get required arguments from buffer, execute, and push results back in virtqueue
		switch (p->syscall_type) {
		case CUINIT: {
			p->result = cuInit(p->flags);
			break;
		}
		case CUDRIVERGETVERSION: {
			p->result = cuDriverGetVersion(&p->val1);
			break;
		}
		case CUDEVICEGETCOUNT: {
			p->result = cuDeviceGetCount(&p->val1);
			break;
		}
		case CUDEVICEGET: {
			p->result = cuDeviceGet(&p->device, p->val1);
			break;
		}
		case CUDEVICECOMPUTECAPABILITY: {
			p->result = cuDeviceComputeCapability(&p->val1, &p->val2, p->device);
			break;
		}
		case CUDEVICEGETNAME: {
			p->result = cuDeviceGetName(elem.in_sg[0].iov_base, p->val1, p->device);
			break;
		}
		case CUDEVICEGETATTRIBUTE: {
			p->result = cuDeviceGetAttribute(&p->val1, p->attrib, p->device);
			break;
		}
		case CUCTXCREATE: {
                        p->result = cuCtxCreate(&p->ctx, p->flags, p->device);				
			break;
		}
		case CUCTXDESTROY: {
			p->result = cuCtxDestroy(p->ctx);
			break;
		}
		case CUCTXGETCURRENT: {
			p->result = cuCtxGetCurrent(&p->ctx);
			break;
		}
		case CUCTXGETDEVICE: {
			p->result = cuCtxGetDevice(&p->device);
			break;
		}
		case CUCTXPOPCURRENT: {
			p->result = cuCtxPopCurrent(&p->ctx);
			break;
		}
		case CUCTXSETCURRENT: {
			p->result = cuCtxSetCurrent(p->ctx);
	                break;
		}
	        case CUCTXSYNCHRONIZE: {
		        p->result = cuCtxSynchronize();
	                break;
	        }
		case CUMODULELOAD: {
			//hardcoded path - needs improvement
			//all .cubin files should be stored in $QEMU_NFS_PATH - currently $QEMU_NFS_PATH is shared between host and guest with NFS
			char *binname = malloc((strlen((char *)elem.out_sg[1].iov_base)+strlen(getenv("QEMU_NFS_PATH"))+1)*sizeof(char));
			if (!binname) {
				p->result = 0;
		                virtqueue_push(vq, &elem, 0);
				break;
			}
		        strcpy(binname, getenv("QEMU_NFS_PATH"));
		        strcat(binname, (char *)elem.out_sg[1].iov_base);
			//change current CUDA context
			//each CUDA context has its own virtual memory space - isolation is ensured by switching contexts
                        if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
                                break;
			}
			p->result = cuModuleLoad(&p->module, binname);
			free(binname);
			break;
		}
                case CUMODULEGETGLOBAL: {
                        char *name = malloc(100*sizeof(char));
                        if (!name) {
                                p->result = 999;
                                break;
                        }
                        strcpy(name, (char *)elem.out_sg[1].iov_base);
                        p->result = cuModuleGetGlobal(&p->dptr,&p->size1,p->module,(const char *)name);
                        break;
                }
		case CUMODULEUNLOAD: {
			p->result = cuModuleUnload(p->module);
			break;			
		}
		case CUMEMALLOC: {
			if (cuCtxSetCurrent(p->ctx) != 0) {
                                p->result = 999;
                                break;
                        }
			p->result = cuMemAlloc(&p->dptr, p->bytesize);
			break;
		}
                case CUMEMALLOCPITCH: {
                        if (cuCtxSetCurrent(p->ctx) != 0) {
                                p->result = 999;
                                break;
                        }
			p->result = cuMemAllocPitch(&p->dptr, &p->size3, p->size1, p->size2, p->bytesize);
			break;
		}
		//large buffers are allocated in smaller chunks in guest kernel space
		//get each chunk separately and copy it to device memory
	        case CUMEMCPYHTOD: {
			int i;
			size_t offset;
                        unsigned long s, nr_pages = p->nr_pages;
                        if (cuCtxSetCurrent(p->ctx) != 0) {
                                p->result = 999;
                                break;
                        }
			offset = 0;
			for (i=0; i<nr_pages; i++) {
				s = *(long *)elem.out_sg[1+2*i+1].iov_base;
				p->result = cuMemcpyHtoD(p->dptr+offset, elem.out_sg[1+2*i].iov_base, s);
				if (p->result != 0) break;
				offset += s;
			}
	                break;
		}
		case CUMEMCPYHTODASYNC: {
			int i;
                        size_t offset;
                        unsigned long s, nr_pages = p->nr_pages;
                        if (cuCtxSetCurrent(p->ctx) != 0) {
                                p->result = 999;
                                break;
                        }
                        offset = 0;
			for (i=0; i<nr_pages; i++) {
                                s = *(long *)elem.out_sg[1+2*i+1].iov_base;
                                p->result = cuMemcpyHtoDAsync(p->dptr+offset, elem.out_sg[1+2*i].iov_base, s, p->stream);
                                if (p->result != 0) break;
                                offset += s;
                        }
                        break;
		}
		case CUMEMCPYDTODASYNC: {
			p->result = cuMemcpyDtoDAsync(p->dptr, p->dptr1, p->size1, p->stream);
                        break;		
		}
	        case CUMEMCPYDTOH: {
			int i;
                        unsigned long s, nr_pages = p->nr_pages;
                        if (cuCtxSetCurrent(p->ctx) != 0) {
                                p->result = 999;
                                break;
                        }
			size_t offset = 0;
			for (i=0; i<nr_pages; i++) {
				s = *(long *)elem.in_sg[0+2*i+1].iov_base;
				p->result = cuMemcpyDtoH(elem.in_sg[0+2*i].iov_base, p->dptr+offset, s);
				if (p->result != 0) break;
				offset += s;
			}
			break;
		}
		case CUMEMCPYDTOHASYNC: {
			int i;
                        unsigned long s, nr_pages = p->nr_pages;
                        if (cuCtxSetCurrent(p->ctx) != 0) {
                                p->result = 999;
                                break;
                        }
                        size_t offset = 0;
			for (i=0; i<nr_pages; i++) {
                                s = *(long *)elem.in_sg[0+2*i+1].iov_base;
                                p->result = cuMemcpyDtoHAsync(elem.in_sg[0+2*i].iov_base, p->dptr+offset, s, p->stream);
                                if (p->result != 0) break;
                                offset += s;
                        }
			break;
		}
		case CUMEMSETD32: {
			p->result = cuMemsetD32(p->dptr, p->bytecount, p->bytesize);
			break;
		}
	        case CUMEMFREE: {
	                p->result = cuMemFree(p->dptr);
	                break;
	        }
		case CUMODULEGETFUNCTION: {
			char *name = (char *)elem.out_sg[1].iov_base;
			name[p->length] = '\0';
                        if (cuCtxSetCurrent(p->ctx) != 0) {
                                p->result = 999;
                                break;
                        }
			p->result = cuModuleGetFunction(&p->function, p->module, name);
			break;	
		}
		case CULAUNCHKERNEL: {
			void **args = malloc(p->val1*sizeof(void *));
	                if (!args) {
				p->result = 9999;
	                        break;
        	        }
			int i;
			for (i=0; i<p->val1; i++) {
				args[i] = elem.out_sg[1+i].iov_base;
			}
                        if (cuCtxSetCurrent(p->ctx) != 0) {
                                p->result = 999;
                                break;
                        }
			p->result = cuLaunchKernel(p->function,
					p->gridDimX, p->gridDimY, p->gridDimZ,
			                p->blockDimX, p->blockDimY, p->blockDimZ,
					p->bytecount, 0, args, 0);
			free(args);
			break;
		}
		case CUEVENTCREATE: {
			p->result = cuEventCreate(&p->event1, p->flags);
			break;
		}
		case CUEVENTDESTROY: {
			p->result = cuEventDestroy(p->event1);
			break;
		}
		case CUEVENTRECORD: {
			p->result = cuEventRecord(p->event1, p->stream);
			break;
		}
		case CUEVENTSYNCHRONIZE: {
			p->result = cuEventSynchronize(p->event1);
			break;
		}
		case CUEVENTELAPSEDTIME: {
			p->result = cuEventElapsedTime(&p->pMilliseconds, p->event1, p->event2);
			break;
		}
		case CUSTREAMCREATE: {
			p->result =  cuStreamCreate(&p->stream, 0);
			break;
		}		
                case CUSTREAMSYNCHRONIZE: {
                        p->result = cuStreamSynchronize(p->stream);
                        break;
                }
                case CUSTREAMQUERY: {
                        p->result = cuStreamQuery(p->stream);
                        break;
                }
		case CUSTREAMDESTROY: {
                        p->result = cuStreamDestroy(p->stream);
                        break;
                }

		default: 
			printf("Unknown syscall_type\n");
		}
		virtqueue_push(vq, &elem, 0);
	}
	//notify frontend - trigger virtual interrupt
	virtio_notify(vdev, vq);
	return;
}
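Note: the CUMEMCPYHTOD case in Example #15 reassembles a large buffer from scatter-gather pages by copying each chunk at an increasing device offset. The bare pattern, with a hypothetical get_chunk accessor standing in for the virtqueue plumbing:

#include <cuda.h>
#include <stddef.h>

void get_chunk(unsigned long i, void **data, size_t *s); /* hypothetical accessor */

static CUresult copy_chunked(CUdeviceptr dptr, unsigned long nr_pages)
{
	CUresult res = CUDA_SUCCESS;
	size_t offset = 0;
	unsigned long i;
	for (i = 0; i < nr_pages; i++) {
		void *data; size_t s;
		get_chunk(i, &data, &s);
		res = cuMemcpyHtoD(dptr + offset, data, s); /* per-chunk copy */
		if (res != CUDA_SUCCESS) break;
		offset += s;
	}
	return res;
}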
Example #16
 /// Blocks until all previously queued commands in the stream have been issued to the associated device and have completed.
 void finish() const {
     ctx.set_current();
     cuda_check( cuStreamSynchronize( s.get() ) );
 }
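Note: a finish() like Example #16's is what makes host reads after asynchronous copies safe. A hedged illustration (h_buf, d_buf, and stream are assumed to be set up elsewhere):

unsigned int h_buf[256];
CUdeviceptr d_buf;   /* assumed allocated with cuMemAlloc elsewhere */
CUstream stream;     /* assumed created with cuStreamCreate elsewhere */

cuMemcpyDtoHAsync(h_buf, d_buf, sizeof(h_buf), stream); /* returns immediately */
cuStreamSynchronize(stream); /* what finish() does: block until the copy lands */
/* h_buf is now safe to read on the host */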