Example #1
0
/*
 * Specify device(s): Counts number of cuda events available in this system
 *
 * Initializes the CUDA driver API, queries the number of devices, allocates
 * the global device table and, for every device whose name differs from the
 * previously enumerated one, calls enumEventDomains().
 *
 * Side effects: resets totalEventCount, sets deviceCount and (re)allocates
 * the global device table.
 *
 * Returns totalEventCount on success (presumably accumulated by
 * enumEventDomains() -- confirm there), or PAPI_ENOSUPP if CUDA could not be
 * initialized, no device is present, the table could not be allocated or
 * domain enumeration failed.
 */
static int
detectDevice( void )
{
	CUresult err;
	int skipDevice = 0;
	int id;
	char deviceName_tmp[PAPI_MIN_STR_LEN] = "init";

	totalEventCount = 0;

	/* CUDA initialization  */
	err = (*cuInitPtr)( 0 );
	if ( err != CUDA_SUCCESS ) {
		SUBDBG ("Info: Error from cuInit(): %d\n", err);
		return ( PAPI_ENOSUPP );
	}

	/* How many gpgpu devices do we have? */
	err = (*cuDeviceGetCountPtr)( &deviceCount );
	CHECK_CU_ERROR( err, "cuDeviceGetCount" );
	if ( deviceCount <= 0 )
		return ( PAPI_ENOSUPP );

	/* allocate memory for device data table; calloc() zero-fills the
	   entries, so devices skipped below never expose uninitialized data */
	device = ( DeviceData_t * ) calloc( deviceCount, sizeof ( DeviceData_t ) );
	if ( device == NULL ) {
		perror( "malloc(): Failed to allocate memory to CUDA device table" );
		return ( PAPI_ENOSUPP );
	}

	/* What are the devices? Get Name and # of domains per device */
	for ( id = 0; id < deviceCount; id++ ) {
		err = (*cuDeviceGetPtr)( &device[id].dev, id );
		CHECK_CU_ERROR( err, "cuDeviceGet" );

		err = (*cuDeviceGetNamePtr)( device[id].name, PAPI_MIN_STR_LEN, device[id].dev );
		CHECK_CU_ERROR( err, "cuDeviceGetName" );

		SUBDBG ("Cuda deviceName: %s\n", device[id].name);

		/* Skip device if there are multiple of the same type 
		   and if it has been already added to the list.
		   Only the immediately preceding enumerated name is compared. */
		if ( 0 == strcmp( deviceName_tmp, device[id].name ) ) {
			skipDevice++;
			continue;
		}

		strcpy( deviceName_tmp, device[id].name );

		/* enumerate the domains on the device */
		if ( 0 != enumEventDomains( device[id].dev, id ) ) {
			/* do not leak the table on failure; reset the globals so
			   later code cannot walk a dangling table */
			free( device );
			device = NULL;
			deviceCount = 0;
			return ( PAPI_ENOSUPP );
		}
	}

	/* NOTE(review): skipped entries stay at their original index in the
	   table, yet deviceCount is reduced here -- confirm that consumers of
	   device[0..deviceCount) expect this layout. */
	deviceCount = deviceCount - skipDevice;

	/* return number of events provided via CuPTI */
	return totalEventCount;
}
Example #2
0
/*
 * Finalizes CUPTI device.
 *
 * Determines the CUDA context current to the calling thread (with CUDA
 * tracing suspended), fetches the matching VampirTrace CUPTI context via
 * vt_cupti_takeCtxFromList() and frees it. Nothing is done if no such
 * context is found.
 *
 * @param ptid      the VampirTrace thread id, handed to the tracing
 *                  suspend/resume macros
 * @param cleanExit 1 to cleanup CUPTI event group, otherwise 0
 */
void vt_cupti_finalize_device(uint32_t ptid, uint8_t cleanExit){
  CUptiResult status = CUPTI_SUCCESS;
  vt_cupti_ctx_t *ctx = NULL;
  CUcontext currentCuCtx = NULL;

  vt_cntl_msg(2, "[CUPTI] Finalize device ... ");

  /* the driver calls below must not show up in the trace */
  VT_SUSPEND_CUDA_TRACING(ptid);

#if (defined(CUDA_VERSION) && (CUDA_VERSION < 4000))
  /* pre-4.0: pop and immediately push again to learn the current context */
  CHECK_CU_ERROR(cuCtxPopCurrent(&currentCuCtx), "cuCtxPopCurrent");
  CHECK_CU_ERROR(cuCtxPushCurrent(currentCuCtx), "cuCtxPushCurrent");
#else
  CHECK_CU_ERROR(cuCtxGetCurrent(&currentCuCtx), "cuCtxGetCurrent");
#endif

  VT_RESUME_CUDA_TRACING(ptid);

  ctx = vt_cupti_takeCtxFromList(currentCuCtx);
  if(ctx == NULL){
    return;
  }

  /* event groups are only torn down on a clean exit with GPU debug set */
  if(cleanExit && vt_gpu_debug != 0){
    vt_cupti_grp_t *grp;

    /* stop CUPTI counter capturing */
    vt_cupti_stop(ctx);

    /* destroy every CUPTI event group attached to this context */
    for(grp = ctx->vtGrpList; grp != NULL; grp = grp->next){
      status = cuptiEventGroupRemoveAllEvents(grp->evtGrp);
      CHECK_CUPTI_ERROR(status, "cuptiEventGroupRemoveAllEvents");

      status = cuptiEventGroupDestroy(grp->evtGrp);
      CHECK_CUPTI_ERROR(status, "cuptiEventGroupDestroy");
    }
  }

  /* release the VampirTrace CUPTI context itself */
  vt_cupti_freeCtx(ctx);
}
/*
 * Create a VampirTrace CUPTI Activity context.
 *
 * The new context records the calling thread, a host/GPU timestamp pair for
 * later time synchronization and the default CUPTI stream id.
 * 
 * @param ctxID ID of the CUDA context
 * @param cuCtx CUDA context handle; if NULL, the context current to the
 *              calling thread is queried and used instead
 * @param devID ID of the CUDA device; if (uint32_t)-1, the device is queried
 *              with cuCtxGetDevice() and VT_NO_ID is stored when that fails
 * 
 * @return pointer to created VampirTrace CUPTI Activity context
 */
static vt_cuptiact_ctx_t* vt_cuptiact_createContext(uint32_t ctxID, 
                                                    CUcontext cuCtx, 
                                                    uint32_t devID)
{
  vt_cuptiact_ctx_t* newCtx = (vt_cuptiact_ctx_t *)malloc(sizeof(*newCtx));

  if(newCtx == NULL) 
    vt_error_msg("[CUPTI Activity] Could not allocate memory for context!");

  /* plain field defaults */
  newCtx->ctxID = ctxID;
  newCtx->next = NULL;
  newCtx->strmList = NULL;
  newCtx->gpuMemList = NULL;
  newCtx->gpuMemAllocated = 0;
  newCtx->buffer = NULL;
  newCtx->gpuIdleOn = 1;
  newCtx->vtLastGPUTime = vt_gpu_init_time;
  
  /* 
   * Take a GPU timestamp and a host timestamp back to back; the pair is the
   * start point for the host/GPU time synchronization factor.
   */
  VT_CUPTI_CALL(cuptiGetTimestamp(&(newCtx->sync.gpuStart)), "cuptiGetTimestamp");
  newCtx->sync.hostStart = vt_pform_wtime();
  
  /* remember which VampirTrace thread created the context */
  VT_CHECK_THREAD;
  newCtx->ptid = VT_MY_THREAD;
  
  /* fall back to the context bound to the calling thread */
  if(cuCtx == NULL) CHECK_CU_ERROR(cuCtxGetCurrent(&cuCtx), NULL);
  newCtx->cuCtx = cuCtx;
  
  /* set default CUPTI stream ID (needed for memory usage and idle tracing) */
  VT_CUPTI_CALL(cuptiGetStreamId(newCtx->cuCtx, NULL, &(newCtx->defaultStrmID)), 
                                 "cuptiGetStreamId");
  
  if(devID == (uint32_t)-1){
    CUdevice queriedDev;
    
    /* driver API prog: correct cuDev, but result is 201 (invalid context) */
    devID = (CUDA_SUCCESS == cuCtxGetDevice(&queriedDev))
              ? (uint32_t)queriedDev
              : VT_NO_ID;
  }
  
  /* the same id is stored both as device id and as CUDA device */
  newCtx->devID = devID;
  newCtx->cuDev = devID;
  
  return newCtx;
}
Example #4
0
/*
 * Returns the VampirTrace CUPTI context for the CUDA context associated with
 * the calling host thread.
 *
 * Initializes the VampirTrace CUPTI layer first if that has not happened
 * yet. The CUDA driver calls used to look up the current context are made
 * with CUDA tracing suspended.
 *
 * @param ptid the VampirTrace thread id of the calling host thread
 *
 * @return the result of vt_cupti_getCtx() for the current CUDA context, or
 *         NULL if no CUDA context is bound to the calling thread
 */
vt_cupti_ctx_t* vt_cupti_getCurrentContext(uint32_t ptid)
{
  CUcontext cuCtx = NULL;
  
  if(!vt_cupti_initialized) vt_cupti_init();

  VT_SUSPEND_CUDA_TRACING(ptid);

# if (defined(CUDA_VERSION) && (CUDA_VERSION < 4000))
  /* pre-4.0: pop and immediately push again to learn the current context */
  CHECK_CU_ERROR(cuCtxPopCurrent(&cuCtx), "cuCtxPopCurrent");
  CHECK_CU_ERROR(cuCtxPushCurrent(cuCtx), "cuCtxPushCurrent");
# else
  CHECK_CU_ERROR(cuCtxGetCurrent(&cuCtx), "cuCtxGetCurrent");
# endif

  VT_RESUME_CUDA_TRACING(ptid);
  
  if(cuCtx == NULL) {
    /* the format string has no conversion, so no extra argument is passed */
    vt_cntl_msg(2, "[CUPTI] No context is bound to the calling CPU thread");
    return NULL;
  }
  
  return vt_cupti_getCtx(cuCtx, ptid);
}
Example #5
0
static vt_cupti_dev_t* vt_cupti_setupMetricList(void)
{
  CUresult err;
  int deviceCount, id;
  vt_cupti_dev_t *capList = NULL;

  /* CUDA initialization */
	err = cuInit( 0 );
	if ( err != CUDA_SUCCESS ) {
		printf( "Initialization of CUDA library failed.\n" );
		exit( EXIT_FAILURE );
	}

  /* How many gpgpu devices do we have? */
	err = cuDeviceGetCount( &deviceCount );
	CHECK_CU_ERROR(err, "cuDeviceGetCount");
	if(deviceCount == 0){
		printf("[CUPTI]There is no device supporting CUDA.\n");
		exit(EXIT_FAILURE);
	}

  /* create list with available compute capabilities */
  for(id = 0; id < deviceCount; id++){
    CUdevice cuDev;
    vt_cupti_dev_t *cuptiDev;
    int dev_major, dev_minor;

    err = cuDeviceGet(&cuDev, id);
		CHECK_CU_ERROR(err, "cuDeviceGet");

    err = cuDeviceComputeCapability(&dev_major, &dev_minor, cuDev);
    CHECK_CU_ERROR(err, "cuDeviceComputeCapability");

    /* check if device capability already listed */
    cuptiDev = vt_cupti_checkMetricList(capList, dev_major, dev_minor);

    if(cuptiDev == NULL){
      /* allocate memory for device list entry */
      cuptiDev = (vt_cupti_dev_t *)malloc(sizeof(vt_cupti_dev_t));
      cuptiDev->dev_major = dev_major;
      cuptiDev->dev_minor = dev_minor;
      cuptiDev->cuDev = cuDev;
      cuptiDev->vtcuptiEvtList = NULL;
      cuptiDev->evtNum = 0;
      cuptiDev->next = NULL;

      /* prepend to list */
      cuptiDev->next = capList;
      capList = cuptiDev;
    }
  }

  vt_cupti_fillMetricList(capList);

  /* cleanup list: remove entries, which don't have metrics */
  {
    vt_cupti_dev_t *curr = capList;
    vt_cupti_dev_t *last = capList;

    while(curr != NULL){
      vt_cupti_dev_t *freeDev = curr;
      curr = curr->next;

      if(freeDev->evtNum == 0){
        /* first element */
        if(freeDev == capList){
          capList = capList->next;
        }else{
          last->next = freeDev->next;
        }
        free(freeDev);
      }else last = freeDev;
    }
  }

  return capList;
}
Example #6
0
/*
 * Initializes a CUPTI host thread and create the event group.
 *
 * The whole routine is recorded as a rid_cupti_init region for the given
 * thread. A context is only created when vt_cupti_checkMetricList() finds an
 * entry in vt_cupti_capList for the compute capability of the context's
 * device. The new context is prepended to vtcuptiCtxlist.
 *
 * @param ptid the VampirTrace thread id
 * @param cuCtx optionally given CUDA context; if NULL, the context current
 *              to the calling thread is used
 *
 * @return the created VampirTrace CUPTI host thread structure, or NULL if
 *         the device's compute capability is not listed
 */
static vt_cupti_ctx_t* vt_cupti_initCtx(uint32_t ptid, CUcontext cuCtx)
{
  vt_cupti_ctx_t *vtcuptiCtx = NULL;
  uint64_t time;

  vt_cntl_msg(2, "[CUPTI] Initializing VampirTrace CUPTI context (ptid=%d)",
              ptid);
  
  time = vt_pform_wtime();
  vt_enter(ptid, &time, rid_cupti_init);

  /* do not trace CUDA functions invoked here */
  VT_SUSPEND_CUDA_TRACING(ptid);

  /* initialize CUDA driver API, if necessary and get context handle */
  if(cuCtx == NULL){
#if (defined(CUDA_VERSION) && (CUDA_VERSION < 4000))
    CHECK_CU_ERROR(cuCtxPopCurrent(&cuCtx), "cuCtxPopCurrent");
    CHECK_CU_ERROR(cuCtxPushCurrent(cuCtx), "cuCtxPushCurrent");
#else
    CHECK_CU_ERROR(cuCtxGetCurrent(&cuCtx), "cuCtxGetCurrent");
#endif
  }

  /* get a pointer to eventIDArray */
  {
    CUresult cuErr = CUDA_SUCCESS;
    int dev_major, dev_minor;
    CUdevice cuDev = 0;
    vt_cupti_dev_t *cuptiDev;

    CHECK_CU_ERROR(cuCtxGetDevice(&cuDev), "cuCtxGetDevice");

    cuErr = cuDeviceComputeCapability(&dev_major, &dev_minor, cuDev);
    CHECK_CU_ERROR(cuErr, "cuDeviceComputeCapability");

    /* check if device capability already listed; only the read of the list
       head happens under the lock */
    CUPTI_LOCK();
      cuptiDev = vt_cupti_capList;
    CUPTI_UNLOCK();
    
    cuptiDev = vt_cupti_checkMetricList(cuptiDev, dev_major, dev_minor);
    if(cuptiDev){
      vtcuptiCtx = (vt_cupti_ctx_t*)malloc(sizeof(vt_cupti_ctx_t));
      if(vtcuptiCtx == NULL)
        vt_error_msg("malloc(sizeof(VTCUPTIhostThrd)) failed!");
      vtcuptiCtx->cuCtx = cuCtx;
      vtcuptiCtx->vtDevCap = cuptiDev;
      vtcuptiCtx->vtGrpList = NULL;
      vtcuptiCtx->counterData = NULL;
      vtcuptiCtx->cuptiEvtIDs = NULL;
      vtcuptiCtx->next = NULL;
    }else{
      /* capability not listed: close the region, resume tracing, give up */
      time = vt_pform_wtime();
      vt_exit(ptid, &time);
      VT_RESUME_CUDA_TRACING(ptid);
      return NULL;
    }
  }

  VT_RESUME_CUDA_TRACING(ptid);

  /* create and add the VampirTrace CUPTI groups to the context */
  vt_cupti_addEvtGrpsToCtx(vtcuptiCtx);

  /* allocate memory for CUPTI counter reads */
  {
    /* NOTE(review): assumes vt_cupti_addEvtGrpsToCtx() always leaves at
       least one group in vtGrpList and that the first group's evtNum is
       large enough for every group -- confirm against that function. */
    size_t allocSize = vtcuptiCtx->vtGrpList->evtNum;
    
    vtcuptiCtx->counterData = (uint64_t *)malloc(allocSize*sizeof(uint64_t));
    vtcuptiCtx->cuptiEvtIDs = (CUpti_EventID *)malloc(allocSize*sizeof(CUpti_EventID));

    /* malloc(0) may legitimately return NULL, so only a failed non-empty
       request is reported */
    if(allocSize > 0 &&
       (vtcuptiCtx->counterData == NULL || vtcuptiCtx->cuptiEvtIDs == NULL))
      vt_error_msg("[CUPTI] Could not allocate memory for counter reads!");
  }

  /* add VampirTrace CUPTI context entry to list (as first element) */
  CUPTI_LOCK();
    vtcuptiCtx->next = vtcuptiCtxlist;
    vtcuptiCtxlist = vtcuptiCtx;
  CUPTI_UNLOCK();

  time = vt_pform_wtime();
  vt_exit(ptid, &time);

  return vtcuptiCtx;
}
Example #7
0
/* Initialize hardware counters, setup the function vector table
 * and get hardware information, this routine is called when the 
 * PAPI process is initialized (IE PAPI_library_init)
 *
 * NOTE: only called by main thread (not by every thread) !!!
 *
 * Starting in CUDA 4.0, multiple CPU threads can access the same CUDA context.
 * This is a much easier programming model than pre-4.0 as threads - using the 
 * same context - can share memory, data, etc. 
 * It's possible to create a different context for each thread, but then we are
 * likely running into a limitation that only one context can be profiled at a time.
 * ==> and we don't want this. That's why CUDA context creation is done in 
 * CUDA_init_component() (called only by main thread) rather than CUDA_init() 
 * or CUDA_init_control_state() (both called by each thread).
 *
 * cidx is the component index, stored in _cuda_vector.cmp_info.CmpIdx on
 * success.
 *
 * Returns PAPI_OK on success. Returns PAPI_ENOSUPP if the CUDA libraries
 * cannot be linked, device detection fails, no current device can be
 * obtained, the context cannot be initialized or the native event table
 * cannot be built; most of these paths also fill in
 * _cuda_vector.cmp_info.disabled_reason.
 */
int
CUDA_init_component( int cidx )
{
	SUBDBG ("Entry: cidx: %d\n", cidx);
	CUresult cuErr = CUDA_SUCCESS;

	/* link in all the cuda libraries and resolve the symbols we need to use */
	if (linkCudaLibraries() != PAPI_OK) {
		SUBDBG ("Dynamic link of CUDA libraries failed, component will be disabled.\n");
		SUBDBG ("See disable reason in papi_component_avail output for more details.\n");
		return (PAPI_ENOSUPP);
	}

	/* Count the available events; a negative value signals failure */
	NUM_EVENTS = detectDevice(  );
	if (NUM_EVENTS < 0) {
		strncpy(_cuda_vector.cmp_info.disabled_reason, "Call to detectDevice failed.",PAPI_MAX_STR_LEN);
		return (PAPI_ENOSUPP);
	}
	/* TODO: works only for one device right now;
	 need to find out if user can use 2 or more devices at same time */

	/* we want to create a CUDA context for either the default device or
	 the device specified with cudaSetDevice() in user code */
	if ( CUDA_SUCCESS != (*cudaGetDevicePtr)( &currentDeviceID ) ) {
		strncpy(_cuda_vector.cmp_info.disabled_reason, "No NVIDIA GPU's found.",PAPI_MAX_STR_LEN);
		return ( PAPI_ENOSUPP );
	}
	
	if ( getenv( "PAPI_VERBOSE" ) ) {
		printf( "DEVICE USED: %s (%d)\n", device[currentDeviceID].name,
			   currentDeviceID );
	}
	
	/* get the CUDA context from the calling CPU thread */
	cuErr = (*cuCtxGetCurrentPtr)( &cuCtx );

	/* if no CUDA context is bound to the calling CPU thread yet, create one */
	if ( cuErr != CUDA_SUCCESS || cuCtx == NULL ) {
		cuErr = (*cuCtxCreatePtr)( &cuCtx, 0, device[currentDeviceID].dev );
		CHECK_CU_ERROR( cuErr, "cuCtxCreate" );
	}

	/* cuCtxGetCurrent() can return a non-null context that is not valid 
	   because the context has not yet been initialized.
	   Here is a workaround: 
	   cudaFree(NULL) forces the context to be initialized
	   if cudaFree(NULL) returns success then we are able to use the context in subsequent calls
	   if cudaFree(NULL) returns an error (or subsequent cupti* calls) then the context is not usable,
	   and will never be usable */
	if ( CUDA_SUCCESS != (*cudaFreePtr)( NULL ) ) {
		strncpy(_cuda_vector.cmp_info.disabled_reason, "Problem initializing CUDA context.",PAPI_MAX_STR_LEN);
		return ( PAPI_ENOSUPP );
	}

	/* Create dynamic event table */
	cuda_native_table = ( CUDA_native_event_entry_t * )
		malloc( sizeof ( CUDA_native_event_entry_t ) * NUM_EVENTS );
	if ( cuda_native_table == NULL ) {
		perror( "malloc(): Failed to allocate memory to events table" );
		strncpy(_cuda_vector.cmp_info.disabled_reason, "Failed to allocate memory to events table.",PAPI_MAX_STR_LEN);
		return ( PAPI_ENOSUPP );
	}

	if ( NUM_EVENTS != createNativeEvents(  ) ) {
		strncpy(_cuda_vector.cmp_info.disabled_reason, "Error creating CUDA event list.",PAPI_MAX_STR_LEN);
		/* the component is being disabled: release the table instead of
		   leaking it, and clear the pointer so it cannot be reused */
		free( cuda_native_table );
		cuda_native_table = NULL;
		return ( PAPI_ENOSUPP );
	}
	
	/* Export the component id */
	_cuda_vector.cmp_info.CmpIdx = cidx;

	return ( PAPI_OK );
}