// Probe the sensor at the I2C address selected by the SA0 pin state.
// With sa0_auto, the SA0-high address is tried first, then SA0-low.
// On success the matching address is latched into `address` and true
// is returned; false means the device answered at neither address.
bool LPS::detectDeviceAndAddress(deviceType device, sa0State sa0)
{
  const bool probeHigh = (sa0 == sa0_auto) || (sa0 == sa0_high);
  const bool probeLow  = (sa0 == sa0_auto) || (sa0 == sa0_low);

  if (probeHigh)
  {
    address = SA0_HIGH_ADDRESS;
    if (detectDevice(device))
    {
      return true;
    }
  }

  if (probeLow)
  {
    address = SA0_LOW_ADDRESS;
    if (detectDevice(device))
    {
      return true;
    }
  }

  return false;
}
/* Initialize hardware counters, setup the function vector table * and get hardware information, this routine is called when the * PAPI process is initialized (IE PAPI_library_init) * * NOTE: only called by main thread (not by every thread) !!! * * Starting in CUDA 4.0, multiple CPU threads can access the same CUDA context. * This is a much easier programming model then pre-4.0 as threads - using the * same context - can share memory, data, etc. * It's possible to create a different context for each thread, but then we are * likely running into a limitation that only one context can be profiled at a time. * ==> and we don't want this. That's why CUDA context creation is done in * CUDA_init_component() (called only by main thread) rather than CUDA_init() * or CUDA_init_control_state() (both called by each thread). */ int CUDA_init_component( int cidx ) { SUBDBG ("Entry: cidx: %d\n", cidx); CUresult cuErr = CUDA_SUCCESS; /* link in all the cuda libraries and resolve the symbols we need to use */ if (linkCudaLibraries() != PAPI_OK) { SUBDBG ("Dynamic link of CUDA libraries failed, component will be disabled.\n"); SUBDBG ("See disable reason in papi_component_avail output for more details.\n"); return (PAPI_ENOSUPP); } /* Create dynamic event table */ NUM_EVENTS = detectDevice( ); if (NUM_EVENTS < 0) { strncpy(_cuda_vector.cmp_info.disabled_reason, "Call to detectDevice failed.",PAPI_MAX_STR_LEN); return (PAPI_ENOSUPP); } /* TODO: works only for one device right now; need to find out if user can use 2 or more devices at same time */ /* want create a CUDA context for either the default device or the device specified with cudaSetDevice() in user code */ if ( CUDA_SUCCESS != (*cudaGetDevicePtr)( ¤tDeviceID ) ) { strncpy(_cuda_vector.cmp_info.disabled_reason, "No NVIDIA GPU's found.",PAPI_MAX_STR_LEN); return ( PAPI_ENOSUPP ); } if ( getenv( "PAPI_VERBOSE" ) ) { printf( "DEVICE USED: %s (%d)\n", device[currentDeviceID].name, currentDeviceID ); } /* get the CUDA context 
from the calling CPU thread */ cuErr = (*cuCtxGetCurrentPtr)( &cuCtx ); /* if no CUDA context is bound to the calling CPU thread yet, create one */ if ( cuErr != CUDA_SUCCESS || cuCtx == NULL ) { cuErr = (*cuCtxCreatePtr)( &cuCtx, 0, device[currentDeviceID].dev ); CHECK_CU_ERROR( cuErr, "cuCtxCreate" ); } /* cuCtxGetCurrent() can return a non-null context that is not valid because the context has not yet been initialized. Here is a workaround: cudaFree(NULL) forces the context to be initialized if cudaFree(NULL) returns success then we are able to use the context in subsequent calls if cudaFree(NULL) returns an error (or subsequent cupti* calls) then the context is not usable, and will never be useable */ if ( CUDA_SUCCESS != (*cudaFreePtr)( NULL ) ) { strncpy(_cuda_vector.cmp_info.disabled_reason, "Problem initializing CUDA context.",PAPI_MAX_STR_LEN); return ( PAPI_ENOSUPP ); } /* Create dynamic event table */ cuda_native_table = ( CUDA_native_event_entry_t * ) malloc( sizeof ( CUDA_native_event_entry_t ) * NUM_EVENTS ); if ( cuda_native_table == NULL ) { perror( "malloc(): Failed to allocate memory to events table" ); strncpy(_cuda_vector.cmp_info.disabled_reason, "Failed to allocate memory to events table.",PAPI_MAX_STR_LEN); return ( PAPI_ENOSUPP ); } if ( NUM_EVENTS != createNativeEvents( ) ) { strncpy(_cuda_vector.cmp_info.disabled_reason, "Error creating CUDA event list.",PAPI_MAX_STR_LEN); return ( PAPI_ENOSUPP ); } /* Export the component id */ _cuda_vector.cmp_info.CmpIdx = cidx; return ( PAPI_OK ); }