Beispiel #1
0
/**
 * Frees the memory space pointed to by dptr, which must have been returned 
 * by a previous call to cuMemAlloc() or cuMemAllocPitch().
 *
 * Parameters:
 * dptr 	- Pointer to memory to free
 *
 * Returns:
 * CUDA_SUCCESS, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_NOT_INITIALIZED, 
 * CUDA_ERROR_INVALID_CONTEXT, CUDA_ERROR_INVALID_VALUE 
 */
CUresult cuMemFree_v2(CUdeviceptr dptr)
{
	CUresult res;
	struct CUctx_st *ctx;
	Ghandle handle;
	uint64_t addr = dptr;
	uint64_t size;

	if (!gdev_initialized)
		return CUDA_ERROR_NOT_INITIALIZED;

	res = cuCtxGetCurrent(&ctx);
	if (res != CUDA_SUCCESS)
		return res;

	/* wait for all kernels to complete - some may be using the memory. */
	cuCtxSynchronize();

	handle = ctx->gdev_handle;

	if (!(size = gfree(handle, addr)))
		return CUDA_ERROR_INVALID_VALUE;

	return CUDA_SUCCESS;
}
Beispiel #2
0
/**
 * Copies from device memory to device memory. dstDevice and srcDevice are the 
 * base pointers of the destination and source, respectively. ByteCount 
 * specifies the number of bytes to copy. Note that this function is 
 * asynchronous.
 *
 * Parameters:
 * dstDevice - Destination device pointer
 * srcDevice - Source device pointer
 * ByteCount - Size of memory copy in bytes
 *
 * Returns:
 * CUDA_SUCCESS, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_NOT_INITIALIZED, 
 * CUDA_ERROR_INVALID_CONTEXT, CUDA_ERROR_INVALID_VALUE 
 */
CUresult cuMemcpyDtoD_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount)
{
	CUresult res;
	struct CUctx_st *ctx;
	Ghandle handle;
	uint64_t dst_addr = dstDevice;
	uint64_t src_addr = srcDevice;
	uint32_t size = ByteCount;

	if (!gdev_initialized)
		return CUDA_ERROR_NOT_INITIALIZED;

	res = cuCtxGetCurrent(&ctx);
	if (res != CUDA_SUCCESS)
		return res;

	if (!dst_addr || !src_addr || !size)
		return CUDA_ERROR_INVALID_VALUE;

	handle = ctx->gdev_handle;

	if (gmemcpy(handle, dst_addr, src_addr, size))
		return CUDA_ERROR_UNKNOWN;

	return CUDA_SUCCESS;
}
Beispiel #3
0
/**
 * Allocates bytesize bytes of host memory that is page-locked and accessible 
 * to the device. The driver tracks the virtual memory ranges allocated with 
 * this function and automatically accelerates calls to functions such as 
 * cuMemcpy(). Since the memory can be accessed directly by the device, it can
 * be read or written with much higher bandwidth than pageable memory obtained
 * with functions such as malloc(). Allocating excessive amounts of memory 
 * with cuMemAllocHost() may degrade system performance, since it reduces the 
 * amount of memory available to the system for paging. As a result, this 
 * function is best used sparingly to allocate staging areas for data exchange
 * between host and device.
 *
 * Note all host memory allocated using cuMemHostAlloc() will automatically 
 * be immediately accessible to all contexts on all devices which support 
 * unified addressing (as may be queried using 
 * CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING). The device pointer that may be 
 * used to access this host memory from those contexts is always equal to the 
 * returned host pointer *pp. See Unified Addressing for additional details.
 *
 * Parameters:
 * pp - Returned host pointer to page-locked memory
 * bytesize - Requested allocation size in bytes
 *
 * Returns:
 * CUDA_SUCCESS, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_NOT_INITIALIZED, 
 * CUDA_ERROR_INVALID_CONTEXT, CUDA_ERROR_INVALID_VALUE, 
 * CUDA_ERROR_OUT_OF_MEMORY 
 */
CUresult cuMemAllocHost_v2(void **pp, unsigned int bytesize)
{
	CUresult res;
	struct CUctx_st *ctx;
	Ghandle handle;
	void *buf;
	uint64_t size = bytesize;

	if (!gdev_initialized)
		return CUDA_ERROR_NOT_INITIALIZED;

	res = cuCtxGetCurrent(&ctx);
	if (res != CUDA_SUCCESS)
		return res;

	if (!pp)
		return CUDA_ERROR_INVALID_VALUE;

	handle = ctx->gdev_handle;
	if (!(buf = gmalloc_dma(handle, size)))
		return CUDA_ERROR_OUT_OF_MEMORY;

	*pp = buf;

	return CUDA_SUCCESS;
}
Beispiel #4
0
/**
 * Allocates bytesize bytes of linear memory on the device and returns in 
 * @dptr a pointer to the allocated memory. The allocated memory is suitably 
 * aligned for any kind of variable. The memory is not cleared. If bytesize 
 * is 0, cuMemAlloc() returns CUDA_ERROR_INVALID_VALUE.
 *
 * Parameters:
 * dptr - Returned device pointer
 * bytesize - Requested allocation size in bytes
 *
 * Returns:
 * CUDA_SUCCESS, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_NOT_INITIALIZED, 
 * CUDA_ERROR_INVALID_CONTEXT, CUDA_ERROR_INVALID_VALUE, 
 * CUDA_ERROR_OUT_OF_MEMORY 
 */
CUresult cuMemAlloc_v2(CUdeviceptr *dptr, unsigned int bytesize)
{
	CUresult res;
	struct CUctx_st *ctx;
	Ghandle handle;
	uint64_t addr;
	uint64_t size = bytesize;

	if (!gdev_initialized)
		return CUDA_ERROR_NOT_INITIALIZED;

	res = cuCtxGetCurrent(&ctx);
	if (res != CUDA_SUCCESS)
		return res;

	if (!dptr)
		return CUDA_ERROR_INVALID_VALUE;

	handle = ctx->gdev_handle;
	if (!(addr = gmalloc(handle, size))) {
		return CUDA_ERROR_OUT_OF_MEMORY;
	}

	*dptr = addr;

	return CUDA_SUCCESS;
}
Beispiel #5
0
/**
 * Returns in *dptr and *bytes the base pointer and size of the global of name 
 * name located in module hmod. If no variable of that name exists, 
 * cuModuleGetGlobal() returns CUDA_ERROR_NOT_FOUND. Both parameters dptr and
 * bytes are optional. If one of them is NULL, it is ignored.
 *
 * Parameters:
 * dptr 	- Returned global device pointer
 * bytes 	- Returned global size in bytes
 * hmod 	- Module to retrieve global from
 * name 	- Name of global to retrieve
 *
 * Returns:
 * CUDA_SUCCESS, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_NOT_INITIALIZED, 
 * CUDA_ERROR_INVALID_CONTEXT, CUDA_ERROR_INVALID_VALUE, CUDA_ERROR_NOT_FOUND 
 */
CUresult cuModuleGetGlobal_v2
(CUdeviceptr *dptr, unsigned int *bytes, CUmodule hmod, const char *name)
{
	CUresult res;
	uint64_t addr;
	uint32_t size;
	struct CUctx_st *ctx;
	struct CUmod_st *mod = hmod;

	if (!gdev_initialized)
		return CUDA_ERROR_NOT_INITIALIZED;
	if (!dptr || !bytes || !mod || !name)
		return CUDA_ERROR_INVALID_VALUE;

	res = cuCtxGetCurrent(&ctx);
	if (res != CUDA_SUCCESS)
		return res;

	if ((res = gdev_cuda_search_symbol(&addr, &size, mod, name)) 
		!= CUDA_SUCCESS)
		return res;

	*dptr = addr;
	*bytes = size;

	return CUDA_SUCCESS;
}
Beispiel #6
0
/**
 * This function gets called once to check if the program is running in a cuda
 * environment. 
 */
static void opal_cuda_support_init(void)
{
    int id;
    CUresult res;
    CUcontext cuContext;

    if (initialized) {
        return;
    }

    /* Set different levels of verbosity in the cuda related code. */
    id = mca_base_param_reg_int_name("opal", "cuda_verbose", 
                                     "Set level of opal cuda verbosity",
                                     false, false, 0, &opal_cuda_verbose);
    opal_cuda_output = opal_output_open(NULL);
    opal_output_set_verbosity(opal_cuda_output, opal_cuda_verbose);

    /* Check to see if this process is running in a CUDA context.  If so,
     * all is good.  Currently, just print out a message in verbose mode
     * to help with debugging. */
    res = cuCtxGetCurrent(&cuContext);
    if (CUDA_SUCCESS != res) {
        opal_output_verbose(10, opal_cuda_output,
                            "CUDA: cuCtxGetCurrent failed, CUDA device pointers will not work");
    } else {
        opal_output_verbose(10, opal_cuda_output,
                            "CUDA: cuCtxGetCurrent succeeded, CUDA device pointers will work");
    }

    initialized = true;
}
Beispiel #7
0
/**
 * Unloads a module hmod from the current context.
 *
 * Parameters:
 * hmod - Module to unload
 *
 * Returns:
 * CUDA_SUCCESS, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_NOT_INITIALIZED, 
 * CUDA_ERROR_INVALID_CONTEXT, CUDA_ERROR_INVALID_VALUE 
 */
CUresult cuModuleUnload(CUmodule hmod)
{
	CUresult res;
	struct CUmod_st *mod = hmod;
	struct CUctx_st *ctx;
	Ghandle handle;

	if (!gdev_initialized)
		return CUDA_ERROR_NOT_INITIALIZED;
	if (!mod)
		return CUDA_ERROR_INVALID_VALUE;

	res = cuCtxGetCurrent(&ctx);
	if (res != CUDA_SUCCESS)
		return res;

	handle = ctx->gdev_handle;

	gfree(handle, mod->code_addr);
	if (mod->sdata_size > 0)
		gfree(handle, mod->sdata_addr);

	if ((res = gdev_cuda_destruct_kernels(mod)) != CUDA_SUCCESS)
		return res;

	if ((res = gdev_cuda_unload_cubin(mod)) != CUDA_SUCCESS)
		return res;

	FREE(mod);

	return CUDA_SUCCESS;
}
Beispiel #8
0
void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord)
{
  struct ptx_device *ptx_dev;
  struct nvptx_thread *nvthd
    = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
  CUresult r;
  CUcontext thd_ctx;

  ptx_dev = ptx_devices[ord];

  assert (ptx_dev);

  r = cuCtxGetCurrent (&thd_ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  assert (ptx_dev->ctx);

  if (!thd_ctx)
    {
      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }

  nvthd->current_stream = ptx_dev->null_stream;
  nvthd->ptx_dev = ptx_dev;

  return (void *) nvthd;
}
Beispiel #9
0
void
context_check (CUcontext ctx1)
{
    CUcontext ctx2, ctx3;
    CUresult r;

    r = cuCtxGetCurrent (&ctx2);
    if (r != CUDA_SUCCESS)
    {
        fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r);
        exit (EXIT_FAILURE);
    }

    if (ctx1 != ctx2)
    {
        fprintf (stderr, "new context established\n");
        exit (EXIT_FAILURE);
    }

    ctx3 = (CUcontext) acc_get_current_cuda_context ();

    if (ctx1 != ctx3)
    {
        fprintf (stderr, "acc_get_current_cuda_context returned wrong value\n");
        exit (EXIT_FAILURE);
    }

    return;
}
Beispiel #10
0
static CUresult destroyContext(const void * args) {
  (void)args;

  CUcontext context;
  CU_ERROR_CHECK(cuCtxGetCurrent(&context));
  CU_ERROR_CHECK(cuCtxDestroy(context));

  return CUDA_SUCCESS;
}
void cuda_enter(cuda_context *ctx) {
  ASSERT_CTX(ctx);
  cuCtxGetCurrent(&ctx->old);
  if (ctx->old != ctx->ctx)
    ctx->err = cuCtxSetCurrent(ctx->ctx);
  /* If no context was there in the first place, then we take over
     to avoid the set dance on the thread */
  if (ctx->old == NULL) ctx->old = ctx->ctx;
}
/*
 * Create a VampirTrace CUPTI Activity context.
 * 
 * @param ctxID ID of the CUDA context
 * @param devID ID of the CUDA device
 * 
 * @return pointer to created VampirTrace CUPTI Activity context
 */
static vt_cuptiact_ctx_t* vt_cuptiact_createContext(uint32_t ctxID, 
                                                    CUcontext cuCtx, 
                                                    uint32_t devID)
{
  vt_cuptiact_ctx_t* vtCtx = NULL;
  
  /* create new context, as it is not listed */
  vtCtx = (vt_cuptiact_ctx_t *)malloc(sizeof(vt_cuptiact_ctx_t));
  if(vtCtx == NULL) 
    vt_error_msg("[CUPTI Activity] Could not allocate memory for context!");
  vtCtx->ctxID = ctxID;
  vtCtx->next = NULL;
  vtCtx->strmList = NULL;
  vtCtx->gpuMemAllocated = 0;
  vtCtx->gpuMemList = NULL;
  vtCtx->buffer = NULL;
  vtCtx->vtLastGPUTime = vt_gpu_init_time;
  vtCtx->gpuIdleOn = 1;
  
  /* 
   * Get time synchronization factor between host and GPU time for measurement 
   * interval 
   */
  {
    VT_CUPTI_CALL(cuptiGetTimestamp(&(vtCtx->sync.gpuStart)), "cuptiGetTimestamp");
    vtCtx->sync.hostStart = vt_pform_wtime();
  }
  
  VT_CHECK_THREAD;
  vtCtx->ptid = VT_MY_THREAD;
  
  if(cuCtx == NULL) CHECK_CU_ERROR(cuCtxGetCurrent(&cuCtx), NULL);
  vtCtx->cuCtx = cuCtx;
  
  /* set default CUPTI stream ID (needed for memory usage and idle tracing) */
  VT_CUPTI_CALL(cuptiGetStreamId(vtCtx->cuCtx, NULL, &(vtCtx->defaultStrmID)), 
                                 "cuptiGetStreamId");
  
  if(devID == (uint32_t)-1){
    CUdevice cuDev;
    
    /* driver API prog: correct cuDev, but result is 201 (invalid context) */
    if(CUDA_SUCCESS != cuCtxGetDevice(&cuDev)){
      devID = VT_NO_ID;
    }else{
      devID = (uint32_t)cuDev;
    }
  }
  
  vtCtx->devID = devID;
  vtCtx->cuDev = devID;
  
  /*vt_cntl_msg(1,"device id: %d", devID);*/
  
  return vtCtx;
}
Beispiel #13
0
// Helper function that fetches the current CUDA context
void cgGetCtx(CUcontext *ctx) {
   CUresult res;
   
   res = cuCtxGetCurrent(ctx);
   if (res != CUDA_SUCCESS) {
      VALGRIND_PRINTF_BACKTRACE("Error: Retrieving CUDA context in VG-wrapper failed.\n"
      );
   } else if (*ctx == NULL) {
      VALGRIND_PRINTF_BACKTRACE("Error: Retrieved NULL context in Valgrind wrapper.\n:"
      );
   }
}
Beispiel #14
0
SEXP
R_cuCtxGetCurrent()
{
    SEXP r_ans = R_NilValue;
    CUcontext pctx;
    CUresult ans;
    ans = cuCtxGetCurrent(& pctx);
    if(ans)
       return(R_cudaErrorInfo(ans));
    r_ans = R_createRef(pctx, "CUcontext") ;
    return(r_ans);
}
Beispiel #15
0
CUresult AttachCuContext(ContextPtr* ppContext) {
	ContextPtr context(new CuContext(false));
	CUresult result = cuCtxGetCurrent(&context->_h);
	if(CUDA_SUCCESS != result || !context->_h)
		return CUDA_ERROR_INVALID_CONTEXT;

	int ordinal;
	cuCtxGetDevice(&ordinal);
	CreateCuDevice(ordinal, &context->_device);
	
	ppContext->swap(context);
	return CUDA_SUCCESS;
}
/*
 * Finalizes CUPTI device.
 * 
 * @param ptid VampirTrace process/thread id
 * @param cleanExit 1 to cleanup CUPTI event group, otherwise 0
 */
void vt_cuptievt_finalize_device(uint8_t cleanExit){
  CUptiResult cuptiErr = CUPTI_SUCCESS;
  vt_cupti_ctx_t *vtcuptiCtx = NULL;

  vt_cntl_msg(2, "[CUPTI Events] Finalize device ... ");

  {
    CUcontext cuCtx;
    
#if (defined(CUDA_VERSION) && (CUDA_VERSION < 4000))
    VT_CUDRV_CALL(cuCtxPopCurrent(&cuCtx), "cuCtxPopCurrent");
    VT_CUDRV_CALL(cuCtxPushCurrent(cuCtx), "cuCtxPushCurrent");
#else
    VT_CUDRV_CALL(cuCtxGetCurrent(&cuCtx), "cuCtxGetCurrent");
#endif

    vtcuptiCtx = vt_cupti_removeCtx(&cuCtx);
    if(vtcuptiCtx == NULL) 
      return;
  }
  
  if(vtcuptiCtx->events == NULL)
    return;

  if(cleanExit && vt_gpu_debug != 0){
    /*uint64_t time = vt_pform_wtime();

    vt_cupti_resetCounter(vtcuptiCtx, 0, &time);*/

    /* stop CUPTI counter capturing */
    vt_cuptievt_stop(vtcuptiCtx->events);

    /* destroy all CUPTI event groups, which have been created */
    {
      vt_cupti_evtgrp_t *vtcuptiGrp = vtcuptiCtx->events->vtGrpList;

      while(vtcuptiGrp != NULL){
        cuptiErr = cuptiEventGroupRemoveAllEvents(vtcuptiGrp->evtGrp);
        VT_CUPTI_CALL(cuptiErr, "cuptiEventGroupRemoveAllEvents");

        cuptiErr = cuptiEventGroupDestroy(vtcuptiGrp->evtGrp);
        VT_CUPTI_CALL(cuptiErr, "cuptiEventGroupDestroy");

        vtcuptiGrp = vtcuptiGrp->next;
      }
    }
  }

  /* free VampirTrace CUPTI event context */
  vt_cuptievt_freeEventCtx(vtcuptiCtx->events);
}
Beispiel #17
0
/*
 * Finalizes CUPTI device.
 * 
 * @param cleanExit 1 to cleanup CUPTI event group, otherwise 0
 */
void vt_cupti_finalize_device(uint32_t ptid, uint8_t cleanExit){
  CUptiResult cuptiErr = CUPTI_SUCCESS;
  vt_cupti_ctx_t *vtcuptiCtx = NULL;

  vt_cntl_msg(2, "[CUPTI] Finalize device ... ");

  {
    CUcontext cuCtx = NULL;

    VT_SUSPEND_CUDA_TRACING(ptid);
    
#if (defined(CUDA_VERSION) && (CUDA_VERSION < 4000))
    CHECK_CU_ERROR(cuCtxPopCurrent(&cuCtx), "cuCtxPopCurrent");
    CHECK_CU_ERROR(cuCtxPushCurrent(cuCtx), "cuCtxPushCurrent");
#else
    CHECK_CU_ERROR(cuCtxGetCurrent(&cuCtx), "cuCtxGetCurrent");
#endif
    
    VT_RESUME_CUDA_TRACING(ptid);

    vtcuptiCtx = vt_cupti_takeCtxFromList(cuCtx);
    if(vtcuptiCtx == NULL) return;
  }

  if(cleanExit && vt_gpu_debug != 0){
    /*uint64_t time = vt_pform_wtime();

    vt_cupti_resetCounter(vtcuptiCtx, 0, &time);*/

    /* stop CUPTI counter capturing */
    vt_cupti_stop(vtcuptiCtx);

    /* destroy all CUPTI event groups, which have been created */
    {
      vt_cupti_grp_t *vtcuptiGrp = vtcuptiCtx->vtGrpList;

      while(vtcuptiGrp != NULL){
        cuptiErr = cuptiEventGroupRemoveAllEvents(vtcuptiGrp->evtGrp);
        CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupRemoveAllEvents");

        cuptiErr = cuptiEventGroupDestroy(vtcuptiGrp->evtGrp);
        CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupDestroy");

        vtcuptiGrp = vtcuptiGrp->next;
      }
    }
  }

  /* free VampirTrace CUPTI context */
  vt_cupti_freeCtx(vtcuptiCtx);
}
Beispiel #18
0
static void *cuda_init(int ord, int flags, int *ret) {
    CUdevice dev;
    cuda_context *res;
    static int init_done = 0;

    if (ord == -2) {
      CUcontext ctx;
      /* Grab the ambient context */
      err = cuCtxGetCurrent(&ctx);
      CHKFAIL(NULL);
      /* If somebody made a context, then the api is initialized */
      init_done = 1;
      res = cuda_make_ctx(ctx, DONTFREE);
      if (res == NULL) {
        FAIL(NULL, GA_IMPL_ERROR);
      }
      res->flags |= flags;
      return res;
    }

    if (!init_done) {
      err = cuInit(0);
      CHKFAIL(NULL);
      init_done = 1;
    }

    if (ord == -1) {
      int i, c;
      err = cuDeviceGetCount(&c);
      CHKFAIL(NULL);
      for (i = 0; i < c; i++) {
        err = cuDeviceGet(&dev, i);
        CHKFAIL(NULL);
        res = do_init(dev, flags, NULL);
        if (res != NULL)
          return res;
      }
      FAIL(NULL, GA_NODEV_ERROR);
    } else {
      err = cuDeviceGet(&dev, ord);
      CHKFAIL(NULL);
      return do_init(dev, flags, ret);
    }
}
static CUdevice get_device_from_ctx(CUcontext ctx)
{

    // Strangely, there does not seem to be a way to get this from the
    // context without making it current. Feels hacky, possibly
    // subject to future change.
    CUcontext curCtx = 0;
    CUdevice device = 0;
    cuCtxGetCurrent(&curCtx);
    if (curCtx != ctx) {
        cuCtxPushCurrent(ctx);
    }
    cuCtxGetDevice(&device);
    if (curCtx != ctx) {
        cuCtxPopCurrent(NULL);
    }

    return device;
}
static void *cuda_init(int ord, int flags, int *ret) {
    CUdevice dev;
    CUcontext ctx;
    cuda_context *res;
    static int init_done = 0;
    unsigned int fl = CU_CTX_SCHED_AUTO;

    if (ord == -1) {
      /* Grab the ambient context */
      err = cuCtxGetCurrent(&ctx);
      CHKFAIL(NULL);
      res = cuda_make_ctx(ctx, DONTFREE);
      if (res == NULL) {
        FAIL(NULL, GA_IMPL_ERROR);
      }
      res->flags |= flags;
      return res;
    }

    if (!init_done) {
      err = cuInit(0);
      CHKFAIL(NULL);
      init_done = 1;
    }
    err = cuDeviceGet(&dev, ord);
    CHKFAIL(NULL);
    if (flags & GA_CTX_SINGLE_THREAD)
      fl = CU_CTX_SCHED_SPIN;
    if (flags & GA_CTX_MULTI_THREAD)
      fl = CU_CTX_SCHED_YIELD;
    err = cuCtxCreate(&ctx, fl, dev);
    CHKFAIL(NULL);
    res = cuda_make_ctx(ctx, 0);
    res->flags |= flags;
    if (res == NULL) {
      cuCtxDestroy(ctx);
      FAIL(NULL, GA_IMPL_ERROR);
    }
    /* Don't leave the context on the thread stack */
    cuCtxPopCurrent(NULL);
    return res;
}
/*
 * Retrieves the VampirTrace CUPTI context for the CUDA context associated with
 * the calling host thread. Initiates context creation, if it is not available 
 * yet.
 *
 * @param ptid the VampirTrace thread id of the calling host thread
 * 
 * @return VampirTrace CUPTI context
 */
vt_cupti_ctx_t* vt_cuptievt_getOrCreateCurrentCtx(uint32_t ptid)
{
  CUcontext cuCtx = NULL;
  
  if(!vt_cuptievt_initialized) vt_cupti_events_init();

# if (defined(CUDA_VERSION) && (CUDA_VERSION < 4000))
  VT_CUDRV_CALL(cuCtxPopCurrent(&cuCtx), "cuCtxPopCurrent");
  VT_CUDRV_CALL(cuCtxPushCurrent(cuCtx), "cuCtxPushCurrent");
# else
  VT_CUDRV_CALL(cuCtxGetCurrent(&cuCtx), "cuCtxGetCurrent");
# endif
  
  if(cuCtx == NULL){
    vt_cntl_msg(2, "[CUPTI Events] No context is bound to the calling CPU thread!");
    return NULL;
  }
  
  return vt_cuptievt_getOrCreateCtx(cuCtx, ptid);
}
Beispiel #22
0
static void
nvptx_attach_host_thread_to_device (int n)
{
  CUdevice dev;
  CUresult r;
  struct ptx_device *ptx_dev;
  CUcontext thd_ctx;

  r = cuCtxGetDevice (&dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
    return;
  else
    {
      CUcontext old_ctx;

      ptx_dev = ptx_devices[n];
      assert (ptx_dev);

      r = cuCtxGetCurrent (&thd_ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

      /* We don't necessarily have a current context (e.g. if it has been
         destroyed.  Pop it if we do though.  */
      if (thd_ctx != NULL)
	{
	  r = cuCtxPopCurrent (&old_ctx);
	  if (r != CUDA_SUCCESS)
            GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
	}

      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }
}
Beispiel #23
0
/**
 * Returns in *hfunc the handle of the function of name name located in module
 *  hmod. If no function of that name exists, cuModuleGetFunction() returns 
 * CUDA_ERROR_NOT_FOUND.
 *
 * Parameters:
 * hfunc - Returned function handle
 * hmod	- Module to retrieve function from
 * name - Name of function to retrieve
 *
 * Returns:
 * CUDA_SUCCESS, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_NOT_INITIALIZED, 
 * CUDA_ERROR_INVALID_CONTEXT, CUDA_ERROR_INVALID_VALUE, CUDA_ERROR_NOT_FOUND 
 */
CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name)
{
	CUresult res;
	struct CUctx_st *ctx;
	struct CUfunc_st *func;
	struct CUmod_st *mod = hmod;

	if (!gdev_initialized)
		return CUDA_ERROR_NOT_INITIALIZED;
	if (!hfunc || !mod || !name)
		return CUDA_ERROR_INVALID_VALUE;

	res = cuCtxGetCurrent(&ctx);
	if (res != CUDA_SUCCESS)
		return res;

	if ((res = gdev_cuda_search_function(&func, mod, name)) != CUDA_SUCCESS)
		return res;

	*hfunc = func;

	return CUDA_SUCCESS;
}
Beispiel #24
0
/**
 * Passes back the device pointer pdptr corresponding to the mapped, pinned 
 * host buffer p allocated by cuMemHostAlloc.
 *
 * cuMemHostGetDevicePointer() will fail if the CU_MEMALLOCHOST_DEVICEMAP flag 
 * was not specified at the time the memory was allocated, or if the function 
 * is called on a GPU that does not support mapped pinned memory.
 *
 * Flags provides for future releases. For now, it must be set to 0.
 *
 * Parameters:
 * pdptr - Returned device pointer
 * p - Host pointer
 * Flags - Options (must be 0)
 *
 * Returns:
 * CUDA_SUCCESS, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_NOT_INITIALIZED, 
 * CUDA_ERROR_INVALID_CONTEXT, CUDA_ERROR_INVALID_VALUE 
 */
CUresult cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags)
{
	CUresult res;
	struct CUctx_st *ctx;
	Ghandle handle;
	uint64_t addr;

	if (!gdev_initialized)
		return CUDA_ERROR_NOT_INITIALIZED;

	res = cuCtxGetCurrent(&ctx);
	if (res != CUDA_SUCCESS)
		return res;

	if (!pdptr || !p || Flags != 0)
		return CUDA_ERROR_INVALID_VALUE;

	handle = ctx->gdev_handle;
	addr = gvirtget(handle, p);
	*pdptr = (CUdeviceptr)addr;

	return CUDA_SUCCESS;
}
Beispiel #25
0
cv::gpu::VideoReader_GPU::Impl::Impl(const cv::Ptr<VideoSource>& source) :
    videoSource_(source),
    lock_(0)
{
    // init context
    GpuMat temp(1, 1, CV_8UC1);
    temp.release();

    DeviceInfo devInfo;
    CV_Assert( devInfo.supports(FEATURE_SET_COMPUTE_11) );

    CUcontext ctx;
    cuSafeCall( cuCtxGetCurrent(&ctx) );
    cuSafeCall( cuvidCtxLockCreate(&lock_, ctx) );

    frameQueue_.reset(new detail::FrameQueue);
    videoDecoder_.reset(new detail::VideoDecoder(videoSource_->format(), lock_));
    videoParser_.reset(new detail::VideoParser(videoDecoder_.get(), frameQueue_.get()));

    videoSource_->setFrameQueue(frameQueue_.get());
    videoSource_->setVideoParser(videoParser_.get());

    videoSource_->start();
}
Beispiel #26
0
/*
 * Returns the VampirTrace CUPTI context for the CUDA context associated with
 * the calling host thread.
 *
 * @param ptid the VampirTrace thread id of the calling host thread
 */
vt_cupti_ctx_t* vt_cupti_getCurrentContext(uint32_t ptid)
{
  CUcontext cuCtx = NULL;
  
  if(!vt_cupti_initialized) vt_cupti_init();

  VT_SUSPEND_CUDA_TRACING(ptid);

# if (defined(CUDA_VERSION) && (CUDA_VERSION < 4000))
  CHECK_CU_ERROR(cuCtxPopCurrent(&cuCtx), "cuCtxPopCurrent");
  CHECK_CU_ERROR(cuCtxPushCurrent(cuCtx), "cuCtxPushCurrent");
# else
  CHECK_CU_ERROR(cuCtxGetCurrent(&cuCtx), "cuCtxGetCurrent");
# endif

  VT_RESUME_CUDA_TRACING(ptid);
  
  if(cuCtx == NULL) {
    vt_cntl_msg(2, "[CUPTI] No context is bound to the calling CPU thread", cuCtx);
    return NULL;
  }
  
  return vt_cupti_getCtx(cuCtx, ptid);
}
Beispiel #27
0
static struct ptx_device *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev, ctx_dev;
  CUresult r;
  int async_engines, pi;

  r = cuDeviceGet (&dev, n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r));

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->ord = n;
  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = cuCtxGetDevice (&ctx_dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));
  
  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
    {
      /* The current host thread has an active context for a different device.
         Detach it.  */
      CUcontext old_ctx;
      
      r = cuCtxPopCurrent (&old_ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
    }

  r = cuCtxGetCurrent (&ptx_dev->ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  if (!ptx_dev->ctx)
    {
      r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r));
    }
  else
    ptx_dev->ctx_shared = true;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->overlap = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->map = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->concur = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mode = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mkern = pi;

  r = cuDeviceGetAttribute (&async_engines,
			    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  ptx_dev->images = NULL;
  pthread_mutex_init (&ptx_dev->image_lock, NULL);

  init_streams_for_device (ptx_dev, async_engines);

  return ptx_dev;
}
Beispiel #28
0
/*
 * Initializes a CUPTI host thread and create the event group.
 *
 * @param ptid the VampirTrace thread id
 * @param cuCtx optionally given CUDA context
 *
 * @return the created VampirTrace CUPTI host thread structure
 */
static vt_cupti_ctx_t* vt_cupti_initCtx(uint32_t ptid, CUcontext cuCtx)
{
  vt_cupti_ctx_t *vtcuptiCtx = NULL;
  uint64_t time;

  vt_cntl_msg(2, "[CUPTI] Initializing VampirTrace CUPTI context (ptid=%d)",
              ptid);
  
  time = vt_pform_wtime();
  vt_enter(ptid, &time, rid_cupti_init);

  /* do not trace CUDA functions invoked here */
  VT_SUSPEND_CUDA_TRACING(ptid);

  /* initialize CUDA driver API, if necessary and get context handle */
  if(cuCtx == NULL){
#if (defined(CUDA_VERSION) && (CUDA_VERSION < 4000))
    CHECK_CU_ERROR(cuCtxPopCurrent(&cuCtx), "cuCtxPopCurrent");
    CHECK_CU_ERROR(cuCtxPushCurrent(cuCtx), "cuCtxPushCurrent");
#else
    CHECK_CU_ERROR(cuCtxGetCurrent(&cuCtx), "cuCtxGetCurrent");
#endif
  }

  /* get a pointer to eventIDArray */
  {
    CUresult cuErr = CUDA_SUCCESS;
    int dev_major, dev_minor;
    CUdevice cuDev = 0;
    vt_cupti_dev_t *cuptiDev;

    CHECK_CU_ERROR(cuCtxGetDevice(&cuDev), "cuCtxGetDevice");

    cuErr = cuDeviceComputeCapability(&dev_major, &dev_minor, cuDev);
    CHECK_CU_ERROR(cuErr, "cuDeviceComputeCapability");

    /* check if device capability already listed */
    CUPTI_LOCK();
      cuptiDev = vt_cupti_capList;
    CUPTI_UNLOCK();
    
    cuptiDev = vt_cupti_checkMetricList(cuptiDev, dev_major, dev_minor);
    if(cuptiDev){
      vtcuptiCtx = (vt_cupti_ctx_t*)malloc(sizeof(vt_cupti_ctx_t));
      if(vtcuptiCtx == NULL)
        vt_error_msg("malloc(sizeof(VTCUPTIhostThrd)) failed!");
      vtcuptiCtx->cuCtx = cuCtx;
      vtcuptiCtx->vtDevCap = cuptiDev;
      vtcuptiCtx->vtGrpList = NULL;
      vtcuptiCtx->counterData = NULL;
      vtcuptiCtx->cuptiEvtIDs = NULL;
      vtcuptiCtx->next = NULL;
    }else{
      time = vt_pform_wtime();
      vt_exit(ptid, &time);
      VT_RESUME_CUDA_TRACING(ptid);
      return NULL;
    }
  }

  VT_RESUME_CUDA_TRACING(ptid);

  /* create and add the VampirTrace CUPTI groups to the context */
  vt_cupti_addEvtGrpsToCtx(vtcuptiCtx);

  /* allocate memory for CUPTI counter reads */
  {
    size_t allocSize = vtcuptiCtx->vtGrpList->evtNum;
    
    vtcuptiCtx->counterData = (uint64_t *)malloc(allocSize*sizeof(uint64_t));
    vtcuptiCtx->cuptiEvtIDs = (CUpti_EventID *)malloc(allocSize*sizeof(CUpti_EventID));
  }

  /* add VampirTrace CUPTI context entry to list (as first element) */
  CUPTI_LOCK();
    vtcuptiCtx->next = vtcuptiCtxlist;
    vtcuptiCtxlist = vtcuptiCtx;
  CUPTI_UNLOCK();

  time = vt_pform_wtime();
  vt_exit(ptid, &time);

  return vtcuptiCtx;
}
/*
 * Create a VampirTrace CUPTI context. If the CUDA context is not given, the 
 * current context will be requested and used.
 * 
 * @param cuCtx CUDA context
 * @param cuDev CUDA device
 * @param ctxID ID of the CUDA context
 * @param devID ID of the CUDA device
 * 
 * @return pointer to created VampirTrace CUPTI context
 */
vt_cupti_ctx_t* vt_cupti_createCtx(CUcontext cuCtx, CUdevice cuDev,
                                   uint32_t cuCtxID, uint32_t cuDevID)
{
  vt_cupti_ctx_t* vtCtx = NULL;
  
  /* create new context */
  vtCtx = (vt_cupti_ctx_t *)malloc(sizeof(vt_cupti_ctx_t));
  if(vtCtx == NULL) 
    vt_error_msg("[CUPTI] Could not allocate memory for VT CUPTI context!");
  vtCtx->ctxID = cuCtxID;
#if (defined(VT_CUPTI_ACTIVITY) || defined(VT_CUPTI_CALLBACKS))
  vtCtx->gpuMemAllocated = 0;
  vtCtx->gpuMemList = NULL;
  vtCtx->strmList = NULL;
#endif
  vtCtx->next = NULL;
  
  VT_CHECK_THREAD;
  vtCtx->ptid = VT_MY_THREAD;
  
  /* try to get CUDA device (ID), if they are not given */
  if(cuDevID == VT_CUPTI_NO_DEVICE_ID){
    if(cuDev == VT_CUPTI_NO_CUDA_DEVICE){
      CUcontext cuCurrCtx;
      
      if(cuCtx != NULL){
        cuCtxGetCurrent(&cuCurrCtx);
      
        /* if given context does not match the current one, get the device for 
           the given one */
        if(cuCtx != cuCurrCtx)
          VT_CUDRV_CALL(cuCtxSetCurrent(cuCtx), NULL);
      }
      
      if(CUDA_SUCCESS == cuCtxGetDevice(&cuDev))
        cuDevID = (uint32_t)cuDev;
      
      /* reset the active context */
      if(cuCtx != NULL && cuCtx != cuCurrCtx)
        VT_CUDRV_CALL(cuCtxSetCurrent(cuCurrCtx), NULL);
      
    }else{
      /* no device ID, but CUDA device is given */
      cuDevID = (uint32_t)cuDev;
    }
  }
  
  vtCtx->devID = cuDevID;
  vtCtx->cuDev = cuDev;
  
  /* get the current CUDA context, if it is not given */
  if(cuCtx == NULL) 
    VT_CUDRV_CALL(cuCtxGetCurrent(&cuCtx), NULL);
  
  /* set the CUDA context */
  vtCtx->cuCtx = cuCtx;
  
#if defined(VT_CUPTI_ACTIVITY)
  vtCtx->activity = NULL;
#endif

#if defined(VT_CUPTI_CALLBACKS)
  vtCtx->callbacks = NULL;
#endif
  
#if defined(VT_CUPTI_EVENTS)
  vtCtx->events = NULL;
#endif

  vt_cntl_msg(2, "[CUPTI] Created context for CUcontext %d, CUdevice %d", 
              cuCtx, cuDev);
  
  return vtCtx;
}
Beispiel #30
0
int
main (int argc, char **argv)
{
    cublasStatus_t s;
    cublasHandle_t h;
    CUcontext pctx;
    CUresult r;
    int i;
    const int N = 256;
    float *h_X, *h_Y1, *h_Y2;
    float *d_X,*d_Y;
    float alpha = 2.0f;
    float error_norm;
    float ref_norm;

    /* Test 4 - OpenACC creates, cuBLAS shares.  */

    acc_set_device_num (0, acc_device_nvidia);

    r = cuCtxGetCurrent (&pctx);
    if (r != CUDA_SUCCESS)
    {
        fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r);
        exit (EXIT_FAILURE);
    }

    h_X = (float *) malloc (N * sizeof (float));
    if (h_X == 0)
    {
        fprintf (stderr, "malloc failed: for h_X\n");
        exit (EXIT_FAILURE);
    }

    h_Y1 = (float *) malloc (N * sizeof (float));
    if (h_Y1 == 0)
    {
        fprintf (stderr, "malloc failed: for h_Y1\n");
        exit (EXIT_FAILURE);
    }

    h_Y2 = (float *) malloc (N * sizeof (float));
    if (h_Y2 == 0)
    {
        fprintf (stderr, "malloc failed: for h_Y2\n");
        exit (EXIT_FAILURE);
    }

    for (i = 0; i < N; i++)
    {
        h_X[i] = rand () / (float) RAND_MAX;
        h_Y2[i] = h_Y1[i] = rand () / (float) RAND_MAX;
    }

#pragma acc parallel copyin (h_X[0:N]), copy (h_Y2[0:N]) copy (alpha)
    {
        int i;

        for (i = 0; i < N; i++)
        {
            h_Y2[i] = alpha * h_X[i] + h_Y2[i];
        }
    }

    r = cuCtxGetCurrent (&pctx);
    if (r != CUDA_SUCCESS)
    {
        fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r);
        exit (EXIT_FAILURE);
    }

    d_X = (float *) acc_copyin (&h_X[0], N * sizeof (float));
    if (d_X == NULL)
    {
        fprintf (stderr, "copyin error h_Y1\n");
        exit (EXIT_FAILURE);
    }

    d_Y = (float *) acc_copyin (&h_Y1[0], N * sizeof (float));
    if (d_Y == NULL)
    {
        fprintf (stderr, "copyin error h_Y1\n");
        exit (EXIT_FAILURE);
    }

    s = cublasCreate (&h);
    if (s != CUBLAS_STATUS_SUCCESS)
    {
        fprintf (stderr, "cublasCreate failed: %d\n", s);
        exit (EXIT_FAILURE);
    }

    context_check (pctx);

    s = cublasSaxpy (h, N, &alpha, d_X, 1, d_Y, 1);
    if (s != CUBLAS_STATUS_SUCCESS)
    {
        fprintf (stderr, "cublasSaxpy failed: %d\n", s);
        exit (EXIT_FAILURE);
    }

    context_check (pctx);

    acc_memcpy_from_device (&h_Y1[0], d_Y, N * sizeof (float));

    context_check (pctx);

    error_norm = 0;
    ref_norm = 0;

    for (i = 0; i < N; ++i)
    {
        float diff;

        diff = h_Y1[i] - h_Y2[i];
        error_norm += diff * diff;
        ref_norm += h_Y2[i] * h_Y2[i];
    }

    error_norm = (float) sqrt ((double) error_norm);
    ref_norm = (float) sqrt ((double) ref_norm);

    if ((fabs (ref_norm) < 1e-7) || ((error_norm / ref_norm) >= 1e-6f))
    {
        fprintf (stderr, "math error\n");
        exit (EXIT_FAILURE);
    }

    free (h_X);
    free (h_Y1);
    free (h_Y2);

    acc_free (d_X);
    acc_free (d_Y);

    context_check (pctx);

    s = cublasDestroy (h);
    if (s != CUBLAS_STATUS_SUCCESS)
    {
        fprintf (stderr, "cublasDestroy failed: %d\n", s);
        exit (EXIT_FAILURE);
    }

    context_check (pctx);

    acc_shutdown (acc_device_nvidia);

    r = cuCtxGetCurrent (&pctx);
    if (r != CUDA_SUCCESS)
    {
        fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r);
        exit (EXIT_FAILURE);
    }

    if (pctx)
    {
        fprintf (stderr, "Unexpected context\n");
        exit (EXIT_FAILURE);
    }

    return EXIT_SUCCESS;
}