/** Initialize hardware counters, setup the function vector table * and get hardware information, this routine is called when the * PAPI process is initialized (IE PAPI_library_init) */ int _papi_nvml_init_substrate( int cidx ) { nvmlReturn_t ret; cudaError_t cuerr; int cuda_count = 0; unsigned int nvml_count = 0; ret = nvmlInit(); if ( NVML_SUCCESS != ret ) { strcpy(_nvml_vector.cmp_info.disabled_reason, "The NVIDIA managament library failed to initialize."); goto disable; } cuerr = cuInit( 0 ); if ( CUDA_SUCCESS != cuerr ) { strcpy(_nvml_vector.cmp_info.disabled_reason, "The CUDA library failed to initialize."); goto disable; } /* Figure out the number of CUDA devices in the system */ ret = nvmlDeviceGetCount( &nvml_count ); if ( NVML_SUCCESS != ret ) { strcpy(_nvml_vector.cmp_info.disabled_reason, "Unable to get a count of devices from the NVIDIA managament library."); goto disable; } cuerr = cudaGetDeviceCount( &cuda_count ); if ( CUDA_SUCCESS != cuerr ) { strcpy(_nvml_vector.cmp_info.disabled_reason, "Unable to get a device count from CUDA."); goto disable; } /* We can probably recover from this, when we're clever */ if ( nvml_count != cuda_count ) { strcpy(_nvml_vector.cmp_info.disabled_reason, "Cuda and the NVIDIA managament library have different device counts."); goto disable; } device_count = cuda_count; /* A per device representation of what events are present */ features = (int*)papi_malloc(sizeof(int) * device_count ); /* Handles to each device */ devices = (nvmlDevice_t*)papi_malloc(sizeof(nvmlDevice_t) * device_count); /* Figure out what events are supported on each card. */ if ( (papi_errorcode = detectDevices( ) ) != PAPI_OK ) { papi_free(features); papi_free(devices); sprintf(_nvml_vector.cmp_info.disabled_reason, "An error occured in device feature detection, please check your NVIDIA Management Library and CUDA install." ); goto disable; } /* The assumption is that if everything went swimmingly in detectDevices, all nvml calls here should be fine. */ createNativeEvents( ); /* Export the total number of events available */ _nvml_vector.cmp_info.num_native_events = num_events; /* Export the component id */ _nvml_vector.cmp_info.CmpIdx = cidx; /* Export the number of 'counters' */ _nvml_vector.cmp_info.num_cntrs = num_events; return PAPI_OK; disable: _nvml_vector.cmp_info.num_cntrs = 0; return PAPI_OK; }
/* Initialize hardware counters, setup the function vector table * and get hardware information, this routine is called when the * PAPI process is initialized (IE PAPI_library_init) * * NOTE: only called by main thread (not by every thread) !!! * * Starting in CUDA 4.0, multiple CPU threads can access the same CUDA context. * This is a much easier programming model then pre-4.0 as threads - using the * same context - can share memory, data, etc. * It's possible to create a different context for each thread, but then we are * likely running into a limitation that only one context can be profiled at a time. * ==> and we don't want this. That's why CUDA context creation is done in * CUDA_init_component() (called only by main thread) rather than CUDA_init() * or CUDA_init_control_state() (both called by each thread). */ int CUDA_init_component( int cidx ) { SUBDBG ("Entry: cidx: %d\n", cidx); CUresult cuErr = CUDA_SUCCESS; /* link in all the cuda libraries and resolve the symbols we need to use */ if (linkCudaLibraries() != PAPI_OK) { SUBDBG ("Dynamic link of CUDA libraries failed, component will be disabled.\n"); SUBDBG ("See disable reason in papi_component_avail output for more details.\n"); return (PAPI_ENOSUPP); } /* Create dynamic event table */ NUM_EVENTS = detectDevice( ); if (NUM_EVENTS < 0) { strncpy(_cuda_vector.cmp_info.disabled_reason, "Call to detectDevice failed.",PAPI_MAX_STR_LEN); return (PAPI_ENOSUPP); } /* TODO: works only for one device right now; need to find out if user can use 2 or more devices at same time */ /* want create a CUDA context for either the default device or the device specified with cudaSetDevice() in user code */ if ( CUDA_SUCCESS != (*cudaGetDevicePtr)( ¤tDeviceID ) ) { strncpy(_cuda_vector.cmp_info.disabled_reason, "No NVIDIA GPU's found.",PAPI_MAX_STR_LEN); return ( PAPI_ENOSUPP ); } if ( getenv( "PAPI_VERBOSE" ) ) { printf( "DEVICE USED: %s (%d)\n", device[currentDeviceID].name, currentDeviceID ); } /* get the CUDA context from the calling CPU thread */ cuErr = (*cuCtxGetCurrentPtr)( &cuCtx ); /* if no CUDA context is bound to the calling CPU thread yet, create one */ if ( cuErr != CUDA_SUCCESS || cuCtx == NULL ) { cuErr = (*cuCtxCreatePtr)( &cuCtx, 0, device[currentDeviceID].dev ); CHECK_CU_ERROR( cuErr, "cuCtxCreate" ); } /* cuCtxGetCurrent() can return a non-null context that is not valid because the context has not yet been initialized. Here is a workaround: cudaFree(NULL) forces the context to be initialized if cudaFree(NULL) returns success then we are able to use the context in subsequent calls if cudaFree(NULL) returns an error (or subsequent cupti* calls) then the context is not usable, and will never be useable */ if ( CUDA_SUCCESS != (*cudaFreePtr)( NULL ) ) { strncpy(_cuda_vector.cmp_info.disabled_reason, "Problem initializing CUDA context.",PAPI_MAX_STR_LEN); return ( PAPI_ENOSUPP ); } /* Create dynamic event table */ cuda_native_table = ( CUDA_native_event_entry_t * ) malloc( sizeof ( CUDA_native_event_entry_t ) * NUM_EVENTS ); if ( cuda_native_table == NULL ) { perror( "malloc(): Failed to allocate memory to events table" ); strncpy(_cuda_vector.cmp_info.disabled_reason, "Failed to allocate memory to events table.",PAPI_MAX_STR_LEN); return ( PAPI_ENOSUPP ); } if ( NUM_EVENTS != createNativeEvents( ) ) { strncpy(_cuda_vector.cmp_info.disabled_reason, "Error creating CUDA event list.",PAPI_MAX_STR_LEN); return ( PAPI_ENOSUPP ); } /* Export the component id */ _cuda_vector.cmp_info.CmpIdx = cidx; return ( PAPI_OK ); }