/* * Specify device(s): Counts number of cuda events available in this system */ static int detectDevice( void ) { CUresult err; int skipDevice = 0; int id; char deviceName_tmp[PAPI_MIN_STR_LEN] = "init"; totalEventCount = 0; /* CUDA initialization */ err = (*cuInitPtr)( 0 ); if ( err != CUDA_SUCCESS ) { SUBDBG ("Info: Error from cuInit(): %d\n", err); return ( PAPI_ENOSUPP ); } /* How many gpgpu devices do we have? */ err = (*cuDeviceGetCountPtr)( &deviceCount ); CHECK_CU_ERROR( err, "cuDeviceGetCount" ); if ( deviceCount == 0 ) return ( PAPI_ENOSUPP ); /* allocate memory for device data table */ device = ( DeviceData_t * ) malloc( sizeof ( DeviceData_t ) * deviceCount ); if ( device == NULL ) { perror( "malloc(): Failed to allocate memory to CUDA device table" ); return ( PAPI_ENOSUPP ); } /* What are the devices? Get Name and # of domains per device */ for ( id = 0; id < deviceCount; id++ ) { err = (*cuDeviceGetPtr)( &device[id].dev, id ); CHECK_CU_ERROR( err, "cuDeviceGet" ); err = (*cuDeviceGetNamePtr)( device[id].name, PAPI_MIN_STR_LEN, device[id].dev ); CHECK_CU_ERROR( err, "cuDeviceGetName" ); SUBDBG ("Cuda deviceName: %s\n", device[id].name); /* Skip device if there are multiple of the same type and if it has been already added to the list */ if ( 0 == strcmp( deviceName_tmp, device[id].name ) ) { skipDevice++; continue; } strcpy( deviceName_tmp, device[id].name ); /* enumerate the domains on the device */ if ( 0 != enumEventDomains( device[id].dev, id ) ) return ( PAPI_ENOSUPP ); } deviceCount = deviceCount - skipDevice; /* return number of events provided via CuPTI */ return totalEventCount; }
/*
 * Finalizes CUPTI device.
 *
 * Looks up the CUDA context bound to the calling host thread, removes the
 * matching VampirTrace CUPTI context from the global list and frees it.
 * In clean-exit debug mode the CUPTI event groups are destroyed first.
 *
 * @param ptid      the VampirTrace thread id of the calling host thread
 * @param cleanExit 1 to cleanup CUPTI event group, otherwise 0
 */
void vt_cupti_finalize_device(uint32_t ptid, uint8_t cleanExit){
  CUptiResult cuptiErr = CUPTI_SUCCESS;
  vt_cupti_ctx_t *vtcuptiCtx = NULL;

  vt_cntl_msg(2, "[CUPTI] Finalize device ... ");

  {
    CUcontext cuCtx = NULL;

    /* do not trace the CUDA driver calls used to query the context */
    VT_SUSPEND_CUDA_TRACING(ptid);

#if (defined(CUDA_VERSION) && (CUDA_VERSION < 4000))
    /* CUDA < 4.0 has no cuCtxGetCurrent(): pop the current context and
       immediately push it back to read it without changing the stack */
    CHECK_CU_ERROR(cuCtxPopCurrent(&cuCtx), "cuCtxPopCurrent");
    CHECK_CU_ERROR(cuCtxPushCurrent(cuCtx), "cuCtxPushCurrent");
#else
    CHECK_CU_ERROR(cuCtxGetCurrent(&cuCtx), "cuCtxGetCurrent");
#endif

    VT_RESUME_CUDA_TRACING(ptid);

    /* detach the context from the global list; nothing to finalize if it
       was never registered */
    vtcuptiCtx = vt_cupti_takeCtxFromList(cuCtx);
    if(vtcuptiCtx == NULL) return;
  }

  /* NOTE(review): event groups are only torn down when GPU debugging is
     enabled — confirm that skipping the teardown otherwise is intended */
  if(cleanExit && vt_gpu_debug != 0){
    /*uint64_t time = vt_pform_wtime();
    vt_cupti_resetCounter(vtcuptiCtx, 0, &time);*/

    /* stop CUPTI counter capturing */
    vt_cupti_stop(vtcuptiCtx);

    /* destroy all CUPTI event groups, which have been created */
    {
      vt_cupti_grp_t *vtcuptiGrp = vtcuptiCtx->vtGrpList;

      while(vtcuptiGrp != NULL){
        /* events must be removed from a group before it can be destroyed */
        cuptiErr = cuptiEventGroupRemoveAllEvents(vtcuptiGrp->evtGrp);
        CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupRemoveAllEvents");

        cuptiErr = cuptiEventGroupDestroy(vtcuptiGrp->evtGrp);
        CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupDestroy");

        vtcuptiGrp = vtcuptiGrp->next;
      }
    }
  }

  /* free VampirTrace CUPTI context */
  vt_cupti_freeCtx(vtcuptiCtx);
}
/*
 * Create a VampirTrace CUPTI Activity context.
 *
 * Allocates and initializes the context structure, records a host/GPU
 * timestamp pair for time synchronization, binds the (possibly current)
 * CUDA context and resolves the default CUPTI stream ID and device ID.
 *
 * @param ctxID ID of the CUDA context
 * @param cuCtx CUDA context handle (NULL to use the calling thread's
 *              current context)
 * @param devID ID of the CUDA device ((uint32_t)-1 to query it from the
 *              current context)
 *
 * @return pointer to created VampirTrace CUPTI Activity context
 */
static vt_cuptiact_ctx_t* vt_cuptiact_createContext(uint32_t ctxID,
                                                    CUcontext cuCtx,
                                                    uint32_t devID)
{
  vt_cuptiact_ctx_t* vtCtx = NULL;

  /* create new context, as it is not listed */
  vtCtx = (vt_cuptiact_ctx_t *)malloc(sizeof(vt_cuptiact_ctx_t));
  if(vtCtx == NULL)
    vt_error_msg("[CUPTI Activity] Could not allocate memory for context!");
  /* NOTE(review): execution continues after vt_error_msg(); presumably it
     aborts the program — otherwise the writes below dereference NULL.
     Confirm vt_error_msg() does not return. */

  vtCtx->ctxID = ctxID;
  vtCtx->next = NULL;
  vtCtx->strmList = NULL;
  vtCtx->gpuMemAllocated = 0;
  vtCtx->gpuMemList = NULL;
  vtCtx->buffer = NULL;
  vtCtx->vtLastGPUTime = vt_gpu_init_time;
  vtCtx->gpuIdleOn = 1;

  /*
   * Get time synchronization factor between host and GPU time for measurement
   * interval
   */
  {
    VT_CUPTI_CALL(cuptiGetTimestamp(&(vtCtx->sync.gpuStart)),
                  "cuptiGetTimestamp");
    vtCtx->sync.hostStart = vt_pform_wtime();
  }

  VT_CHECK_THREAD;
  vtCtx->ptid = VT_MY_THREAD;

  /* if no context was given, bind the current context of this thread */
  if(cuCtx == NULL) CHECK_CU_ERROR(cuCtxGetCurrent(&cuCtx), NULL);
  vtCtx->cuCtx = cuCtx;

  /* set default CUPTI stream ID (needed for memory usage and idle tracing) */
  VT_CUPTI_CALL(cuptiGetStreamId(vtCtx->cuCtx, NULL, &(vtCtx->defaultStrmID)),
                "cuptiGetStreamId");

  if(devID == (uint32_t)-1){
    CUdevice cuDev;

    /* driver API prog: correct cuDev, but result is 201 (invalid context) */
    if(CUDA_SUCCESS != cuCtxGetDevice(&cuDev)){
      devID = VT_NO_ID;
    }else{
      devID = (uint32_t)cuDev;
    }
  }

  vtCtx->devID = devID;
  vtCtx->cuDev = devID;

  /*vt_cntl_msg(1,"device id: %d", devID);*/

  return vtCtx;
}
/*
 * Returns the VampirTrace CUPTI context for the CUDA context associated with
 * the calling host thread.
 *
 * @param ptid the VampirTrace thread id of the calling host thread
 *
 * @return the VampirTrace CUPTI context, or NULL if no CUDA context is
 *         bound to the calling host thread
 */
vt_cupti_ctx_t* vt_cupti_getCurrentContext(uint32_t ptid)
{
  CUcontext cuCtx = NULL;

  /* lazy one-time initialization of the CUPTI support */
  if(!vt_cupti_initialized) vt_cupti_init();

  /* do not trace the CUDA driver calls used to query the context */
  VT_SUSPEND_CUDA_TRACING(ptid);

# if (defined(CUDA_VERSION) && (CUDA_VERSION < 4000))
  /* CUDA < 4.0 has no cuCtxGetCurrent(): pop the current context and
     immediately push it back to read it without changing the stack */
  CHECK_CU_ERROR(cuCtxPopCurrent(&cuCtx), "cuCtxPopCurrent");
  CHECK_CU_ERROR(cuCtxPushCurrent(cuCtx), "cuCtxPushCurrent");
# else
  CHECK_CU_ERROR(cuCtxGetCurrent(&cuCtx), "cuCtxGetCurrent");
# endif

  VT_RESUME_CUDA_TRACING(ptid);

  if(cuCtx == NULL)
  {
    /* BUGFIX: dropped the stray 'cuCtx' argument that had no matching
       conversion specifier in the message format string */
    vt_cntl_msg(2, "[CUPTI] No context is bound to the calling CPU thread");
    return NULL;
  }

  return vt_cupti_getCtx(cuCtx, ptid);
}
static vt_cupti_dev_t* vt_cupti_setupMetricList(void) { CUresult err; int deviceCount, id; vt_cupti_dev_t *capList = NULL; /* CUDA initialization */ err = cuInit( 0 ); if ( err != CUDA_SUCCESS ) { printf( "Initialization of CUDA library failed.\n" ); exit( EXIT_FAILURE ); } /* How many gpgpu devices do we have? */ err = cuDeviceGetCount( &deviceCount ); CHECK_CU_ERROR(err, "cuDeviceGetCount"); if(deviceCount == 0){ printf("[CUPTI]There is no device supporting CUDA.\n"); exit(EXIT_FAILURE); } /* create list with available compute capabilities */ for(id = 0; id < deviceCount; id++){ CUdevice cuDev; vt_cupti_dev_t *cuptiDev; int dev_major, dev_minor; err = cuDeviceGet(&cuDev, id); CHECK_CU_ERROR(err, "cuDeviceGet"); err = cuDeviceComputeCapability(&dev_major, &dev_minor, cuDev); CHECK_CU_ERROR(err, "cuDeviceComputeCapability"); /* check if device capability already listed */ cuptiDev = vt_cupti_checkMetricList(capList, dev_major, dev_minor); if(cuptiDev == NULL){ /* allocate memory for device list entry */ cuptiDev = (vt_cupti_dev_t *)malloc(sizeof(vt_cupti_dev_t)); cuptiDev->dev_major = dev_major; cuptiDev->dev_minor = dev_minor; cuptiDev->cuDev = cuDev; cuptiDev->vtcuptiEvtList = NULL; cuptiDev->evtNum = 0; cuptiDev->next = NULL; /* prepend to list */ cuptiDev->next = capList; capList = cuptiDev; } } vt_cupti_fillMetricList(capList); /* cleanup list: remove entries, which don't have metrics */ { vt_cupti_dev_t *curr = capList; vt_cupti_dev_t *last = capList; while(curr != NULL){ vt_cupti_dev_t *freeDev = curr; curr = curr->next; if(freeDev->evtNum == 0){ /* first element */ if(freeDev == capList){ capList = capList->next; }else{ last->next = freeDev->next; } free(freeDev); }else last = freeDev; } } return capList; }
/*
 * Initializes a CUPTI host thread and create the event group.
 *
 * @param ptid the VampirTrace thread id
 * @param cuCtx optionally given CUDA context (NULL to use the calling
 *              thread's current context)
 *
 * @return the created VampirTrace CUPTI host thread structure, or NULL if
 *         the device's compute capability has no configured events
 */
static vt_cupti_ctx_t* vt_cupti_initCtx(uint32_t ptid, CUcontext cuCtx)
{
  vt_cupti_ctx_t *vtcuptiCtx = NULL;
  uint64_t time;

  vt_cntl_msg(2, "[CUPTI] Initializing VampirTrace CUPTI context (ptid=%d)",
              ptid);

  time = vt_pform_wtime();
  vt_enter(ptid, &time, rid_cupti_init);

  /* do not trace CUDA functions invoked here */
  VT_SUSPEND_CUDA_TRACING(ptid);

  /* initialize CUDA driver API, if necessary and get context handle */
  if(cuCtx == NULL){
#if (defined(CUDA_VERSION) && (CUDA_VERSION < 4000))
    /* CUDA < 4.0: pop and re-push the current context to read it */
    CHECK_CU_ERROR(cuCtxPopCurrent(&cuCtx), "cuCtxPopCurrent");
    CHECK_CU_ERROR(cuCtxPushCurrent(cuCtx), "cuCtxPushCurrent");
#else
    CHECK_CU_ERROR(cuCtxGetCurrent(&cuCtx), "cuCtxGetCurrent");
#endif
  }

  /* get a pointer to eventIDArray */
  {
    CUresult cuErr = CUDA_SUCCESS;
    int dev_major, dev_minor;
    CUdevice cuDev = 0;
    vt_cupti_dev_t *cuptiDev;

    CHECK_CU_ERROR(cuCtxGetDevice(&cuDev), "cuCtxGetDevice");

    cuErr = cuDeviceComputeCapability(&dev_major, &dev_minor, cuDev);
    CHECK_CU_ERROR(cuErr, "cuDeviceComputeCapability");

    /* check if device capability already listed */
    CUPTI_LOCK();
    cuptiDev = vt_cupti_capList;
    CUPTI_UNLOCK();

    cuptiDev = vt_cupti_checkMetricList(cuptiDev, dev_major, dev_minor);
    if(cuptiDev){
      vtcuptiCtx = (vt_cupti_ctx_t*)malloc(sizeof(vt_cupti_ctx_t));
      if(vtcuptiCtx == NULL)
        vt_error_msg("malloc(sizeof(VTCUPTIhostThrd)) failed!");

      vtcuptiCtx->cuCtx = cuCtx;
      vtcuptiCtx->vtDevCap = cuptiDev;
      vtcuptiCtx->vtGrpList = NULL;
      vtcuptiCtx->counterData = NULL;
      vtcuptiCtx->cuptiEvtIDs = NULL;
      vtcuptiCtx->next = NULL;
    }else{
      /* no events configured for this compute capability */
      time = vt_pform_wtime();
      vt_exit(ptid, &time);
      VT_RESUME_CUDA_TRACING(ptid);
      return NULL;
    }
  }

  VT_RESUME_CUDA_TRACING(ptid);

  /* create and add the VampirTrace CUPTI groups to the context */
  vt_cupti_addEvtGrpsToCtx(vtcuptiCtx);

  /* allocate memory for CUPTI counter reads */
  /* NOTE(review): assumes vt_cupti_addEvtGrpsToCtx() created at least one
     group — vtGrpList is dereferenced below; confirm it cannot stay NULL */
  {
    size_t allocSize = vtcuptiCtx->vtGrpList->evtNum;

    vtcuptiCtx->counterData = (uint64_t *)malloc(allocSize*sizeof(uint64_t));
    vtcuptiCtx->cuptiEvtIDs = (CUpti_EventID *)malloc(allocSize*sizeof(CUpti_EventID));
    /* BUGFIX: these two allocations were previously used unchecked */
    if(vtcuptiCtx->counterData == NULL || vtcuptiCtx->cuptiEvtIDs == NULL)
      vt_error_msg("malloc of CUPTI counter read buffers failed!");
  }

  /* add VampirTrace CUPTI context entry to list (as first element) */
  CUPTI_LOCK();
    vtcuptiCtx->next = vtcuptiCtxlist;
    vtcuptiCtxlist = vtcuptiCtx;
  CUPTI_UNLOCK();

  time = vt_pform_wtime();
  vt_exit(ptid, &time);

  return vtcuptiCtx;
}
/* Initialize hardware counters, setup the function vector table * and get hardware information, this routine is called when the * PAPI process is initialized (IE PAPI_library_init) * * NOTE: only called by main thread (not by every thread) !!! * * Starting in CUDA 4.0, multiple CPU threads can access the same CUDA context. * This is a much easier programming model then pre-4.0 as threads - using the * same context - can share memory, data, etc. * It's possible to create a different context for each thread, but then we are * likely running into a limitation that only one context can be profiled at a time. * ==> and we don't want this. That's why CUDA context creation is done in * CUDA_init_component() (called only by main thread) rather than CUDA_init() * or CUDA_init_control_state() (both called by each thread). */ int CUDA_init_component( int cidx ) { SUBDBG ("Entry: cidx: %d\n", cidx); CUresult cuErr = CUDA_SUCCESS; /* link in all the cuda libraries and resolve the symbols we need to use */ if (linkCudaLibraries() != PAPI_OK) { SUBDBG ("Dynamic link of CUDA libraries failed, component will be disabled.\n"); SUBDBG ("See disable reason in papi_component_avail output for more details.\n"); return (PAPI_ENOSUPP); } /* Create dynamic event table */ NUM_EVENTS = detectDevice( ); if (NUM_EVENTS < 0) { strncpy(_cuda_vector.cmp_info.disabled_reason, "Call to detectDevice failed.",PAPI_MAX_STR_LEN); return (PAPI_ENOSUPP); } /* TODO: works only for one device right now; need to find out if user can use 2 or more devices at same time */ /* want create a CUDA context for either the default device or the device specified with cudaSetDevice() in user code */ if ( CUDA_SUCCESS != (*cudaGetDevicePtr)( ¤tDeviceID ) ) { strncpy(_cuda_vector.cmp_info.disabled_reason, "No NVIDIA GPU's found.",PAPI_MAX_STR_LEN); return ( PAPI_ENOSUPP ); } if ( getenv( "PAPI_VERBOSE" ) ) { printf( "DEVICE USED: %s (%d)\n", device[currentDeviceID].name, currentDeviceID ); } /* get the CUDA context 
from the calling CPU thread */ cuErr = (*cuCtxGetCurrentPtr)( &cuCtx ); /* if no CUDA context is bound to the calling CPU thread yet, create one */ if ( cuErr != CUDA_SUCCESS || cuCtx == NULL ) { cuErr = (*cuCtxCreatePtr)( &cuCtx, 0, device[currentDeviceID].dev ); CHECK_CU_ERROR( cuErr, "cuCtxCreate" ); } /* cuCtxGetCurrent() can return a non-null context that is not valid because the context has not yet been initialized. Here is a workaround: cudaFree(NULL) forces the context to be initialized if cudaFree(NULL) returns success then we are able to use the context in subsequent calls if cudaFree(NULL) returns an error (or subsequent cupti* calls) then the context is not usable, and will never be useable */ if ( CUDA_SUCCESS != (*cudaFreePtr)( NULL ) ) { strncpy(_cuda_vector.cmp_info.disabled_reason, "Problem initializing CUDA context.",PAPI_MAX_STR_LEN); return ( PAPI_ENOSUPP ); } /* Create dynamic event table */ cuda_native_table = ( CUDA_native_event_entry_t * ) malloc( sizeof ( CUDA_native_event_entry_t ) * NUM_EVENTS ); if ( cuda_native_table == NULL ) { perror( "malloc(): Failed to allocate memory to events table" ); strncpy(_cuda_vector.cmp_info.disabled_reason, "Failed to allocate memory to events table.",PAPI_MAX_STR_LEN); return ( PAPI_ENOSUPP ); } if ( NUM_EVENTS != createNativeEvents( ) ) { strncpy(_cuda_vector.cmp_info.disabled_reason, "Error creating CUDA event list.",PAPI_MAX_STR_LEN); return ( PAPI_ENOSUPP ); } /* Export the component id */ _cuda_vector.cmp_info.CmpIdx = cidx; return ( PAPI_OK ); }