Ejemplo n.º 1
0
/** Initialize hardware counters, setup the function vector table
 * and get hardware information, this routine is called when the
 * PAPI process is initialized (IE PAPI_library_init)
 */
		int
_papi_nvml_init_substrate( int cidx )
{
		nvmlReturn_t ret;
		cudaError_t cuerr;

		int cuda_count = 0;
		unsigned int nvml_count = 0;

		ret = nvmlInit();
		if ( NVML_SUCCESS != ret ) {
				strcpy(_nvml_vector.cmp_info.disabled_reason, "The NVIDIA managament library failed to initialize.");
				goto disable;
		}

		cuerr = cuInit( 0 );
		if ( CUDA_SUCCESS != cuerr ) {
				strcpy(_nvml_vector.cmp_info.disabled_reason, "The CUDA library failed to initialize.");
				goto disable;
		}

		/* Figure out the number of CUDA devices in the system */
		ret = nvmlDeviceGetCount( &nvml_count );
		if ( NVML_SUCCESS != ret ) {
				strcpy(_nvml_vector.cmp_info.disabled_reason, "Unable to get a count of devices from the NVIDIA managament library.");
				goto disable;
		}

		cuerr = cudaGetDeviceCount( &cuda_count );
		if ( CUDA_SUCCESS != cuerr ) {
				strcpy(_nvml_vector.cmp_info.disabled_reason, "Unable to get a device count from CUDA.");
				goto disable;
		}

		/* We can probably recover from this, when we're clever */
		if ( nvml_count != cuda_count ) {
				strcpy(_nvml_vector.cmp_info.disabled_reason, "Cuda and the NVIDIA managament library have different device counts.");
				goto disable;
		}

		device_count = cuda_count;

		/* A per device representation of what events are present */
		features = (int*)papi_malloc(sizeof(int) * device_count );

		/* Handles to each device */
		devices = (nvmlDevice_t*)papi_malloc(sizeof(nvmlDevice_t) * device_count);

		/* Figure out what events are supported on each card. */
		if ( (papi_errorcode = detectDevices( ) ) != PAPI_OK ) {
			papi_free(features);
			papi_free(devices);
			sprintf(_nvml_vector.cmp_info.disabled_reason, "An error occured in device feature detection, please check your NVIDIA Management Library and CUDA install." );
			goto disable;
		}

		/* The assumption is that if everything went swimmingly in detectDevices, 
			all nvml calls here should be fine. */
		createNativeEvents( );

		/* Export the total number of events available */
		_nvml_vector.cmp_info.num_native_events = num_events;

		/* Export the component id */
		_nvml_vector.cmp_info.CmpIdx = cidx;

		/* Export the number of 'counters' */
		_nvml_vector.cmp_info.num_cntrs = num_events;

		return PAPI_OK;

disable:
		_nvml_vector.cmp_info.num_cntrs = 0;
		return PAPI_OK;	
}
Ejemplo n.º 2
0
/* Initialize hardware counters, setup the function vector table
 * and get hardware information, this routine is called when the 
 * PAPI process is initialized (IE PAPI_library_init)
 *
 * NOTE: only called by main thread (not by every thread) !!!
 *
 * Starting in CUDA 4.0, multiple CPU threads can access the same CUDA context.
 * This is a much easier programming model then pre-4.0 as threads - using the 
 * same context - can share memory, data, etc. 
 * It's possible to create a different context for each thread, but then we are
 * likely running into a limitation that only one context can be profiled at a time.
 * ==> and we don't want this. That's why CUDA context creation is done in 
 * CUDA_init_component() (called only by main thread) rather than CUDA_init() 
 * or CUDA_init_control_state() (both called by each thread).
 */
int
CUDA_init_component( int cidx )
{
	SUBDBG ("Entry: cidx: %d\n", cidx);
	CUresult cuErr = CUDA_SUCCESS;

	/* link in all the cuda libraries and resolve the symbols we need to use */
	if (linkCudaLibraries() != PAPI_OK) {
		SUBDBG ("Dynamic link of CUDA libraries failed, component will be disabled.\n");
		SUBDBG ("See disable reason in papi_component_avail output for more details.\n");
		return (PAPI_ENOSUPP);
	}

	/* Create dynamic event table */
	NUM_EVENTS = detectDevice(  );
	if (NUM_EVENTS < 0) {
		strncpy(_cuda_vector.cmp_info.disabled_reason, "Call to detectDevice failed.",PAPI_MAX_STR_LEN);
		return (PAPI_ENOSUPP);
	}
	/* TODO: works only for one device right now;
	 need to find out if user can use 2 or more devices at same time */

	/* want create a CUDA context for either the default device or
	 the device specified with cudaSetDevice() in user code */
	if ( CUDA_SUCCESS != (*cudaGetDevicePtr)( &currentDeviceID ) ) {
		strncpy(_cuda_vector.cmp_info.disabled_reason, "No NVIDIA GPU's found.",PAPI_MAX_STR_LEN);
		return ( PAPI_ENOSUPP );
	}
	
	if ( getenv( "PAPI_VERBOSE" ) ) {
		printf( "DEVICE USED: %s (%d)\n", device[currentDeviceID].name,
			   currentDeviceID );
	}
	
	/* get the CUDA context from the calling CPU thread */
	cuErr = (*cuCtxGetCurrentPtr)( &cuCtx );

	/* if no CUDA context is bound to the calling CPU thread yet, create one */
	if ( cuErr != CUDA_SUCCESS || cuCtx == NULL ) {
		cuErr = (*cuCtxCreatePtr)( &cuCtx, 0, device[currentDeviceID].dev );
		CHECK_CU_ERROR( cuErr, "cuCtxCreate" );
	}

	/* cuCtxGetCurrent() can return a non-null context that is not valid 
	   because the context has not yet been initialized.
	   Here is a workaround: 
	   cudaFree(NULL) forces the context to be initialized
	   if cudaFree(NULL) returns success then we are able to use the context in subsequent calls
	   if cudaFree(NULL) returns an error (or subsequent cupti* calls) then the context is not usable,
	   and will never be useable */
	if ( CUDA_SUCCESS != (*cudaFreePtr)( NULL ) ) {
		strncpy(_cuda_vector.cmp_info.disabled_reason, "Problem initializing CUDA context.",PAPI_MAX_STR_LEN);
		return ( PAPI_ENOSUPP );
	}

	/* Create dynamic event table */
	cuda_native_table = ( CUDA_native_event_entry_t * )
		malloc( sizeof ( CUDA_native_event_entry_t ) * NUM_EVENTS );
	if ( cuda_native_table == NULL ) {
		perror( "malloc(): Failed to allocate memory to events table" );
		strncpy(_cuda_vector.cmp_info.disabled_reason, "Failed to allocate memory to events table.",PAPI_MAX_STR_LEN);
		return ( PAPI_ENOSUPP );
	}

	if ( NUM_EVENTS != createNativeEvents(  ) ) {
		strncpy(_cuda_vector.cmp_info.disabled_reason, "Error creating CUDA event list.",PAPI_MAX_STR_LEN);
		return ( PAPI_ENOSUPP );
	}
	
	/* Export the component id */
	_cuda_vector.cmp_info.CmpIdx = cidx;

	return ( PAPI_OK );
}