/*
 * Print all available counters for a given CUDA device to stdout.
 *
 * @param cuDev the CUDA device
 */
static void vt_cupti_showAllCounters(CUdevice cuDev)
{
  CUptiResult cuptiErr = CUPTI_SUCCESS;
  CUpti_EventDomainID *domains = NULL;
  uint32_t numDomains = 0;
  uint32_t idx;
  size_t bytes = 0;

  cuptiErr = cuptiDeviceGetNumEventDomains(cuDev, &numDomains);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiDeviceGetNumEventDomains");

  /* nothing to print for a device without event domains */
  if(numDomains == 0){
    vt_cntl_msg(1, "[CUPTI] No domain is exposed by dev = %d\n", cuDev);
    return;
  }

  bytes = sizeof(CUpti_EventDomainID) * numDomains;
  domains = (CUpti_EventDomainID*)malloc(bytes);
  if(domains == NULL){
    vt_cntl_msg(1, "[CUPTI] Failed to allocate memory to domain ID");
    return;
  }
  memset(domains, 0, bytes);

  /* fetch the domain identifiers for this device */
  cuptiErr = cuptiDeviceEnumEventDomains(cuDev, &bytes, domains);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiDeviceEnumEventDomains");

  /* print the events of every domain */
  for(idx = 0; idx < numDomains; idx++)
    enumEvents(cuDev, domains[idx]);

  free(domains);
}
/*
 * De-initialize the VampirTrace CUPTI context without destroying it.
 *
 * @param vtcuptiCtx pointer to the VampirTrace CUPTI context
 */
static void vt_cupti_finish(vt_cupti_ctx_t *vtcuptiCtx)
{
  CUptiResult cuptiErr = CUPTI_SUCCESS;
  vt_cupti_grp_t *grp = NULL;

  /* nothing to do without a context; in GPU debug mode the groups are kept */
  if(vtcuptiCtx == NULL || vt_gpu_debug) return;

  /*uint64_t time = vt_pform_wtime();
  vt_cupti_resetCounter(vtcuptiCtx, 0, &time);*/

  /* stop CUPTI counter capturing */
  vt_cupti_stop(vtcuptiCtx);

  /* tear down every CUPTI event group that has been created */
  for(grp = vtcuptiCtx->vtGrpList; grp != NULL; grp = grp->next){
    cuptiErr = cuptiEventGroupRemoveAllEvents(grp->evtGrp);
    CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupRemoveAllEvents");

    cuptiErr = cuptiEventGroupDestroy(grp->evtGrp);
    CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupDestroy");
  }
}
/*
 * Library constructor: runs before main().
 * Subscribes getTimestampCallback() to the CUPTI runtime-API callback
 * domain (passing the file-scope 'trace' buffer as userdata) and
 * registers Trace_end() to run at process exit.
 */
__attribute__((constructor)) void Trace_start()
{
  /* attach the callback handler; &trace is handed back as userdata */
  cuptierr = cuptiSubscribe(&subscriber, (CUpti_CallbackFunc)getTimestampCallback , &trace);
  CHECK_CUPTI_ERROR(cuptierr, "cuptiSubscribe");

  /* enable (1) callbacks for the whole CUDA runtime-API domain */
  cuptierr = cuptiEnableDomain(1, subscriber, CUPTI_CB_DOMAIN_RUNTIME_API);
  CHECK_CUPTI_ERROR(cuptierr, "cuptiEnableDomain");

  printf("<-----------register Trace_end--------------->\n");
  atexit (Trace_end);
}
/* * Finalizes CUPTI device. * * @param cleanExit 1 to cleanup CUPTI event group, otherwise 0 */ void vt_cupti_finalize_device(uint32_t ptid, uint8_t cleanExit){ CUptiResult cuptiErr = CUPTI_SUCCESS; vt_cupti_ctx_t *vtcuptiCtx = NULL; vt_cntl_msg(2, "[CUPTI] Finalize device ... "); { CUcontext cuCtx = NULL; VT_SUSPEND_CUDA_TRACING(ptid); #if (defined(CUDA_VERSION) && (CUDA_VERSION < 4000)) CHECK_CU_ERROR(cuCtxPopCurrent(&cuCtx), "cuCtxPopCurrent"); CHECK_CU_ERROR(cuCtxPushCurrent(cuCtx), "cuCtxPushCurrent"); #else CHECK_CU_ERROR(cuCtxGetCurrent(&cuCtx), "cuCtxGetCurrent"); #endif VT_RESUME_CUDA_TRACING(ptid); vtcuptiCtx = vt_cupti_takeCtxFromList(cuCtx); if(vtcuptiCtx == NULL) return; } if(cleanExit && vt_gpu_debug != 0){ /*uint64_t time = vt_pform_wtime(); vt_cupti_resetCounter(vtcuptiCtx, 0, &time);*/ /* stop CUPTI counter capturing */ vt_cupti_stop(vtcuptiCtx); /* destroy all CUPTI event groups, which have been created */ { vt_cupti_grp_t *vtcuptiGrp = vtcuptiCtx->vtGrpList; while(vtcuptiGrp != NULL){ cuptiErr = cuptiEventGroupRemoveAllEvents(vtcuptiGrp->evtGrp); CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupRemoveAllEvents"); cuptiErr = cuptiEventGroupDestroy(vtcuptiGrp->evtGrp); CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupDestroy"); vtcuptiGrp = vtcuptiGrp->next; } } } /* free VampirTrace CUPTI context */ vt_cupti_freeCtx(vtcuptiCtx); }
/*
 * Rebuild the CuPTI eventGroup from the PAPI native event list.
 *
 * @param ptr    component control state (holds the eventGroup)
 * @param native list of native events to program
 * @param count  number of entries in 'native'
 * @param ctx    unused hardware context
 * @return PAPI_OK on success, PAPI_ENOSUPP if an event does not belong
 *         to the device the code is running on
 */
int CUDA_update_control_state( hwd_control_state_t * ptr, NativeInfo_t * native,
                               int count, hwd_context_t * ctx )
{
	( void ) ctx;
	CUDA_control_state_t * CUDA_ptr = ( CUDA_control_state_t * ) ptr;
	int index, i;
	size_t devNameLen;
	CUptiResult cuptiErr = CUPTI_SUCCESS;

	/* Disable the CUDA eventGroup;
	   it also frees the perfmon hardware on the GPU */
	cuptiErr = (*cuptiEventGroupDisablePtr)( CUDA_ptr->eventGroup );
	CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupDisable" );

	cuptiErr = (*cuptiEventGroupRemoveAllEventsPtr)( CUDA_ptr->eventGroup );
	CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupRemoveAllEvents" );

	/* store events, that have been added to the CuPTI eventGroup in a
	   separate place (addedEvents). Needed, so that we can read the values
	   for the added events only.
	   (hoisted out of the loop below: the count is loop-invariant) */
	CUDA_ptr->addedEvents.count = count;

	/* loop-invariant: length of the current device's name */
	devNameLen = strlen( device[currentDeviceID].name );

	// otherwise, add the events to the eventset
	for ( i = 0; i < count; i++ ) {
		index = native[i].ni_event;
		native[i].ni_position = index;

		CUDA_ptr->addedEvents.list[i] = index;

		/* if this device name is different from the actual device the code
		   is running on, then exit */
		if ( 0 != strncmp( device[currentDeviceID].name,
						   cuda_native_table[index].name,
						   devNameLen ) ) {
			fprintf( stderr, "Device %s is used -- BUT event %s is collected. \n ---> ERROR: Specify events for the device that is used!\n\n",
					 device[currentDeviceID].name,
					 cuda_native_table[index].name );

			return ( PAPI_ENOSUPP );	// Not supported
		}

		/* Add events to the CuPTI eventGroup */
		cuptiErr = (*cuptiEventGroupAddEventPtr)( CUDA_ptr->eventGroup,
						cuda_native_table[index].resources.eventId );
		CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupAddEvent" );
	}

	return ( PAPI_OK );
}
/*
 * Control of counters (Reading/Writing/Starting/Stopping/Setup)
 * functions.
 *
 * Allocates the zero-initialized bookkeeping list for events added to
 * the CuPTI eventGroup and creates the eventGroup itself.
 */
int CUDA_init_control_state( hwd_control_state_t * ctrl )
{
	CUDA_control_state_t * state = ( CUDA_control_state_t * ) ctrl;
	CUptiResult err = CUPTI_SUCCESS;

	/* calloc = allocate + zero-initialize the list of events that are
	   added to the CuPTI eventGroup */
	state->addedEvents.list = calloc( NUM_EVENTS, sizeof ( int ) );
	if ( state->addedEvents.list == NULL ) {
		perror ( "malloc(): Failed to allocate memory to table of events that are added to CuPTI eventGroup" );
		return ( PAPI_ENOSUPP );
	}

	err = (*cuptiEventGroupCreatePtr)( cuCtx, &state->eventGroup, 0 );
	CHECK_CUPTI_ERROR( err, "cuptiEventGroupCreate" );

	return PAPI_OK;
}
/*
 * Reset the VampirTrace counter values (to zero) for active CUPTI counters.
 *
 * @param vtcuptiCtx pointer to the VampirTrace CUPTI context
 * @param strmid the stream id for the counter values
 * @param time the VampirTrace timestamps
 */
void vt_cupti_resetCounter(vt_cupti_ctx_t *vtcuptiCtx, uint32_t strmid,
                           uint64_t *time)
{
  vt_cupti_grp_t *grp = NULL;

  /* fall back to the calling thread's context if none was given */
  if(vtcuptiCtx == NULL){
    VT_CHECK_THREAD;
    vtcuptiCtx = vt_cupti_getCurrentContext(VT_MY_THREAD);
    if(vtcuptiCtx == NULL) return;
  }

  for(grp = vtcuptiCtx->vtGrpList; grp != NULL; grp = grp->next){
    size_t idx;

    /* write a zero sample for every counter of this group */
    for(idx = 0; idx < grp->evtNum; idx++)
      vt_count(strmid, time, grp->vtCIDs[idx], 0);

    /* reset counter values of this group */
    CHECK_CUPTI_ERROR(cuptiEventGroupResetAllEvents(grp->evtGrp),
                      "cuptiEventGroupResetAllEvents");
  }
}
/*
 * Start counting: clear the accumulated values, enable the CuPTI
 * eventGroup and reset its hardware counters to zero.
 *
 * @param ctx  unused hardware context
 * @param ctrl component control state holding the eventGroup
 * @return PAPI_OK
 */
int CUDA_start( hwd_context_t * ctx, hwd_control_state_t * ctrl )
{
	( void ) ctx;
	CUDA_control_state_t * state = ( CUDA_control_state_t * ) ctrl;
	CUptiResult err = CUPTI_SUCCESS;
	int idx;

	// reset all event values to 0
	for ( idx = 0; idx < NUM_EVENTS; idx++ )
		state->counts[idx] = 0;

	err = (*cuptiEventGroupEnablePtr)( state->eventGroup );
	CHECK_CUPTI_ERROR( err, "cuptiEventGroupEnable" );

	/* Resets all events in the CuPTI eventGroup to zero */
	err = (*cuptiEventGroupResetAllEventsPtr)( state->eventGroup );
	CHECK_CUPTI_ERROR( err, "cuptiEventGroupResetAllEvents" );

	return ( PAPI_OK );
}
/*
 * Disable and Destroy the CUDA eventGroup.
 *
 * @param ctrl component control state holding the eventGroup
 * @return PAPI_OK (the actual cleanup is compiled out, see TODO below)
 */
int CUDA_cleanup_eventset( hwd_control_state_t * ctrl )
{
	( void ) ctrl;

	// TODO: after cleanup_eventset() which destroys the eventset, update_control_state()
	// is called, which operates on the already destroyed eventset. Bad!
	/* the body is disabled until the call-ordering issue above is resolved */
#if 0
	CUDA_control_state_t * CUDA_ctrl = ( CUDA_control_state_t * ) ctrl;
	CUptiResult cuptiErr = CUPTI_SUCCESS;

	/* Disable the CUDA eventGroup;
	   it also frees the perfmon hardware on the GPU */
	cuptiErr = (*cuptiEventGroupDisablePtr)( CUDA_ctrl->eventGroup );
	CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupDisable" );

	/* Call the CuPTI cleaning function before leaving */
	cuptiErr = (*cuptiEventGroupDestroyPtr)( CUDA_ctrl->eventGroup );
	CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupDestroy" );
#endif
	return ( PAPI_OK );
}
/*
 * Reset the hardware counters of the CuPTI eventGroup to zero.
 *
 * @param ctx  unused hardware context
 * @param ctrl component control state holding the eventGroup
 * @return PAPI_OK
 */
int CUDA_reset( hwd_context_t * ctx, hwd_control_state_t * ctrl )
{
	( void ) ctx;
	CUDA_control_state_t * state = ( CUDA_control_state_t * ) ctrl;
	CUptiResult err;

	/* Resets all events in the CuPTI eventGroup to zero */
	err = (*cuptiEventGroupResetAllEventsPtr)( state->eventGroup );
	CHECK_CUPTI_ERROR( err, "cuptiEventGroupResetAllEvents" );

	return ( PAPI_OK );
}
/*
 * Returns all event values from the CuPTI eventGroup.
 *
 * Since there is no guarantee that returned counter values are in the
 * same order as the counters in the PAPI addedEvents.list, the returned
 * CUpti_EventIDs are mapped back to PAPI event indices.
 *
 * @param counts      accumulator array indexed by native-event index
 * @param eventGroup  the CuPTI eventGroup to read
 * @param addedEvents list/count of events previously added to the group
 * @return 0 on success, -1 on allocation failure or short read
 */
static int getEventValue( long long *counts, CUpti_EventGroup eventGroup,
						  AddedEvents_t addedEvents )
{
	CUptiResult cuptiErr = CUPTI_SUCCESS;
	size_t events_read, bufferSizeBytes, arraySizeBytes, i;
	uint64_t *counterDataBuffer;
	CUpti_EventID *eventIDArray;
	int j;
	int retval = 0;

	bufferSizeBytes = addedEvents.count * sizeof ( uint64_t );
	counterDataBuffer = ( uint64_t * ) malloc( bufferSizeBytes );

	arraySizeBytes = addedEvents.count * sizeof ( CUpti_EventID );
	eventIDArray = ( CUpti_EventID * ) malloc( arraySizeBytes );

	/* BUGFIX: both allocations were used without a NULL check */
	if ( counterDataBuffer == NULL || eventIDArray == NULL ) {
		free( counterDataBuffer );
		free( eventIDArray );
		return -1;
	}

	/* read counter data for the specified event from the CuPTI eventGroup */
	cuptiErr = (*cuptiEventGroupReadAllEventsPtr)( eventGroup,
				CUPTI_EVENT_READ_FLAG_NONE, &bufferSizeBytes, counterDataBuffer,
				&arraySizeBytes, eventIDArray, &events_read );
	CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupReadAllEvents" );

	if ( events_read != ( size_t ) addedEvents.count ) {
		/* BUGFIX: this early-return path leaked both buffers */
		retval = -1;
	} else {
		/* map each returned CUpti_EventID back to its PAPI event index;
		   counter values in counterDataBuffer correspond positionally to
		   the IDs in eventIDArray */
		for ( i = 0; i < events_read; i++ )
			for ( j = 0; j < addedEvents.count; j++ )
				if ( cuda_native_table[addedEvents.list[j]].resources.eventId ==
					 eventIDArray[i] )
					// since cuptiEventGroupReadAllEvents() resets counter values to 0;
					// we have to accumulate ourselves
					counts[addedEvents.list[j]] =
						counts[addedEvents.list[j]] + counterDataBuffer[i];
	}

	free( counterDataBuffer );
	free( eventIDArray );
	return retval;
}
/*
 * Create and initialize a VampirTrace CUPTI event group record for the
 * given VampirTrace CUPTI context.
 *
 * @param vtcuptiCtx the VampirTrace CUPTI context the group belongs to
 * @return the newly allocated group (caller owns it)
 */
static vt_cupti_grp_t* vt_cupti_createEvtGrp(vt_cupti_ctx_t *vtcuptiCtx)
{
  CUptiResult cuptiErr = CUPTI_SUCCESS;
  size_t evtCap = vtcuptiCtx->vtDevCap->evtNum;
  vt_cupti_grp_t *grp = (vt_cupti_grp_t*)malloc(sizeof(vt_cupti_grp_t));

  /* NOTE(review): malloc results are used unchecked here, matching the
     original code — a failed allocation crashes on first use; confirm
     whether an abort-on-OOM wrapper should be used instead */
  grp->evtNum = 0;
  grp->enabled = 0;
  grp->next = NULL;

  /* create initial CUPTI counter group */
  cuptiErr = cuptiEventGroupCreate(vtcuptiCtx->cuCtx, &(grp->evtGrp), 0);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupCreate");

  /* parallel arrays sized for the device's event capacity */
  grp->cuptiEvtIDs = (CUpti_EventID *)malloc(evtCap*sizeof(CUpti_EventID));
  grp->vtCIDs = (uint32_t *)malloc(evtCap*sizeof(uint32_t));

  return grp;
}
/*
 * Stop CUPTI counter capturing by disabling the CUPTI event groups.
 *
 * @param vtcuptiCtx pointer to the VampirTrace CUPTI context
 */
static void vt_cupti_stop(vt_cupti_ctx_t *vtcuptiCtx)
{
  vt_cupti_grp_t *grp = NULL;

  /* no context, or groups intentionally kept alive in GPU debug mode */
  if(vtcuptiCtx == NULL || vt_gpu_debug) return;

  /* stop counter reading for all groups */
  for(grp = vtcuptiCtx->vtGrpList; grp != NULL; grp = grp->next){
    if(!grp->enabled)
      continue;

    CHECK_CUPTI_ERROR(cuptiEventGroupDisable(grp->evtGrp),
                      "cuptiEventGroupDisable");
    grp->enabled = 0;
  }
}
/*
 * atexit() handler registered by Trace_start(): prints the collected
 * timestamps and detaches the CUPTI callback subscriber.
 * Uses the file-scope globals 'trace', 'cuptierr' and 'subscriber'.
 */
void Trace_end ()
{
  /* dump all recorded enter/exit timestamps */
  displayTimestamps(trace);

  /* unsubscribe the callback handler from CUPTI */
  cuptierr = cuptiUnsubscribe(subscriber);
  CHECK_CUPTI_ERROR(cuptierr, "cuptiUnsubscribe");
}
/*
 * Request the CUPTI counter values and write them to the given VampirTrace
 * stream with the given timestamps.
 *
 * @param vtcuptiCtx pointer to the VampirTrace CUPTI context
 * @param strmid the stream id for the counter values
 * @param time the VampirTrace timestamps
 */
void vt_cupti_writeCounter(vt_cupti_ctx_t *vtcuptiCtx, uint32_t strmid,
                           uint64_t *time)
{
  CUptiResult cuptiErr = CUPTI_SUCCESS;
  vt_cupti_grp_t *vtcuptiGrp = NULL;

  size_t bufferSizeBytes;
  size_t arraySizeBytes;
  size_t numCountersRead;

  /* fall back to the calling thread's context if none was given */
  if(vtcuptiCtx == NULL){
    VT_CHECK_THREAD;
    vtcuptiCtx = vt_cupti_getCurrentContext(VT_MY_THREAD);
    if(vtcuptiCtx == NULL) return;
  }

  vtcuptiGrp = vtcuptiCtx->vtGrpList;
  while(vtcuptiGrp != NULL){
    /* read events only, if the event group is enabled */
    if(vtcuptiGrp->enabled){
      bufferSizeBytes = vtcuptiGrp->evtNum * sizeof(uint64_t);
      arraySizeBytes = vtcuptiGrp->evtNum * sizeof(CUpti_EventID);

      /* read events */
      cuptiErr = cuptiEventGroupReadAllEvents(vtcuptiGrp->evtGrp,
                                              CUPTI_EVENT_READ_FLAG_NONE,
                                              &bufferSizeBytes,
                                              vtcuptiCtx->counterData,
                                              &arraySizeBytes,
                                              vtcuptiCtx->cuptiEvtIDs,
                                              &numCountersRead);
      CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupReadAllEvents");

      if(vtcuptiGrp->evtNum != numCountersRead){
        /* BUGFIX: cast size_t arguments to int to match the "%d"
           conversion specifiers (passing size_t for %d is undefined
           behavior on LP64 platforms) */
        vt_error_msg("[CUPTI] %d counter reads, %d metrics specified in "
                     "VT_CUPTI_METRICS!", (int)numCountersRead,
                     (int)vtcuptiGrp->evtNum);
      }

      /* For all events of the event group: map added event IDs to just read event
       * IDs, as the order may not be the same. For small numbers of counter reads
       * this simple mapping should be fast enough. */
      {
        size_t j;

        for(j = 0; j < numCountersRead; j++){
          size_t i;

          for(i = 0; i < vtcuptiGrp->evtNum; i++){
            if(vtcuptiCtx->cuptiEvtIDs[j] == *(vtcuptiGrp->cuptiEvtIDs+i)){
              /* BUGFIX: the counter value at read position j belongs to the
                 event ID at read position j (was counterData[i], which
                 assumed read order equals added order and thereby defeated
                 the purpose of this mapping) */
              vt_count(strmid, time, *(vtcuptiGrp->vtCIDs+i),
                       vtcuptiCtx->counterData[j]);
              break; /* each read ID matches exactly one added event */
            }
          }
        }
      }
    }

    vtcuptiGrp = vtcuptiGrp->next;
  }
}
/*
 * Detect supported domains for specified device.
 *
 * @param dev      the CUDA device handle
 * @param deviceId index into the file-scope 'device' table
 * @return 0 on success, -1 on error (no domains, allocation failure,
 *         or a failing enumEvents() call)
 */
static int
enumEventDomains( CUdevice dev, int deviceId )
{
	CUptiResult err = CUPTI_SUCCESS;
	CUpti_EventDomainID *domainId = NULL;
	uint32_t id = 0;
	size_t size = 0;

	device[deviceId].domainCount = 0;

	/* get number of domains for device dev */
	err = (*cuptiDeviceGetNumEventDomainsPtr)( dev, &device[deviceId].domainCount );
	CHECK_CUPTI_ERROR( err, "cuptiDeviceGetNumEventDomains" );

	if ( device[deviceId].domainCount == 0 ) {
		printf( "No domain is exposed by dev = %d\n", dev );
		return -1;
	}

	/* CuPTI domain struct */
	size = sizeof ( CUpti_EventDomainID ) * device[deviceId].domainCount;
	domainId = ( CUpti_EventDomainID * ) malloc( size );
	if ( domainId == NULL ) {
		perror( "malloc(): Failed to allocate memory to CuPTI domain ID" );
		return -1;
	}
	memset( domainId, 0, size );

	/* PAPI domain struct */
	device[deviceId].domain =
		( DomainData_t * ) malloc( sizeof ( DomainData_t ) *
								   device[deviceId].domainCount );
	if ( device[deviceId].domain == NULL ) {
		perror( "malloc(): Failed to allocate memory to PAPI domain struct" );
		free(domainId);
		return -1;
	}

	/* Enumerates the event domains for a device dev */
	err = (*cuptiDeviceEnumEventDomainsPtr)( dev, &size, domainId );
	CHECK_CUPTI_ERROR( err, "cuptiDeviceEnumEventDomains" );

	/* enum domains: copy each CuPTI domain id into the PAPI table and
	   query its name, event count and events */
	for ( id = 0; id < device[deviceId].domainCount; id++ ) {
		device[deviceId].domain[id].domainId = domainId[id];

		/* query domain name */
		size = PAPI_MIN_STR_LEN;
#ifdef CUDA_4_0
		/* CUDA 4.0 API: attribute queries take the device handle */
		err = cuptiEventDomainGetAttribute( dev,
											device[deviceId].domain[id].
											domainId,
											CUPTI_EVENT_DOMAIN_ATTR_NAME,
											&size,
											( void * ) device[deviceId].
											domain[id].name );
		CHECK_CUPTI_ERROR( err, "cuptiEventDomainGetAttribute" );

		/* query num of events available in the domain */
		size = sizeof ( device[deviceId].domain[id].eventCount );
		err = cuptiEventDomainGetAttribute( dev,
											device[deviceId].domain[id].
											domainId,
											CUPTI_EVENT_DOMAIN_MAX_EVENTS,
											&size,
											( void * ) &device[deviceId].
											domain[id].eventCount );
		CHECK_CUPTI_ERROR( err, "cuptiEventDomainGetAttribute" );

		/* enumerate the events for the domain[id] on the device dev */
		if ( 0 != enumEvents( dev, deviceId, id ) )
			return -1;
#else
		/* CUDA > 4.0 API: domain attributes are queried per domain id */
		err = (*cuptiDeviceGetEventDomainAttributePtr)( dev,
							device[deviceId].domain[id].domainId,
							CUPTI_EVENT_DOMAIN_ATTR_NAME, &size,
							( void * ) device[deviceId].domain[id].name );
		CHECK_CUPTI_ERROR( err, "cuptiDeviceGetEventDomainAttribute" );

		/* query num of events available in the domain */
		err = (*cuptiEventDomainGetNumEventsPtr)(
							device[deviceId].domain[id].domainId,
							&device[deviceId].domain[id].eventCount );
		CHECK_CUPTI_ERROR( err, "cuptiEventDomainGetNumEvents" );

		/* enumerate the events for the domain[id] on the device deviceId */
		if ( 0 != enumEvents( deviceId, id ) )
			return -1;
#endif
	}

	totalDomainCount += device[deviceId].domainCount;
	free( domainId );
	return 0;
}
/*
 * Enumerate the events available in one domain of a device and fill the
 * PAPI event table with their ids, names and short descriptions.
 *
 * (The CUDA_4_0 variant of this signature is declared in the #ifdef
 * branch above this window; the #endif below closes it.)
 *
 * @param deviceId index into the file-scope 'device' table
 * @param domainId index into that device's domain table
 * @return 0 on success, -1 on allocation failure
 */
static int
enumEvents( int deviceId, int domainId )
#endif
{
	CUptiResult err = CUPTI_SUCCESS;
	CUpti_EventID *eventId = NULL;
	size_t size = 0;
	uint32_t id = 0;

	/* CuPTI event struct */
	size =
		sizeof ( CUpti_EventID ) *
		device[deviceId].domain[domainId].eventCount;
	eventId = ( CUpti_EventID * ) malloc( size );
	if ( eventId == NULL ) {
		perror( "malloc(): Failed to allocate memory to CuPTI event ID" );
		return -1;
	}
	memset( eventId, 0, size );

	/* PAPI event struct */
	device[deviceId].domain[domainId].event =
		( EventData_t * ) malloc( sizeof ( EventData_t ) *
								  device[deviceId].domain[domainId].
								  eventCount );
	if ( device[deviceId].domain[domainId].event == NULL ) {
		perror( "malloc(): Failed to allocate memory to PAPI event struct" );
		free(eventId);
		return -1;
	}

	/* enumerate the events for the domain[domainId] on the device[deviceId] */
#ifdef CUDA_4_0
	/* CUDA 4.0 API additionally takes the device handle 'dev' */
	err = (*cuptiEventDomainEnumEventsPtr)( dev,
						( CUpti_EventDomainID ) device[deviceId].
						domain[domainId].domainId, &size, eventId );
#else
	err = (*cuptiEventDomainEnumEventsPtr)(
						( CUpti_EventDomainID ) device[deviceId].
						domain[domainId].domainId, &size, eventId );
#endif
	CHECK_CUPTI_ERROR( err, "cuptiEventDomainEnumEvents" );

	/* query event info */
	for ( id = 0; id < device[deviceId].domain[domainId].eventCount; id++ ) {
		device[deviceId].domain[domainId].event[id].eventId = eventId[id];

		/* query event name; 'size' is an in/out parameter, reset it
		   before every attribute query */
		size = PAPI_MIN_STR_LEN;
#ifdef CUDA_4_0
		err = (*cuptiEventGetAttributePtr)( dev,
						device[deviceId].domain[domainId].
						event[id].eventId, CUPTI_EVENT_ATTR_NAME, &size,
						( uint8_t * ) device[deviceId].
						domain[domainId].event[id].name );
#else
		err = (*cuptiEventGetAttributePtr)(
						device[deviceId].domain[domainId].
						event[id].eventId, CUPTI_EVENT_ATTR_NAME, &size,
						( uint8_t * ) device[deviceId].
						domain[domainId].event[id].name );
#endif
		CHECK_CUPTI_ERROR( err, "cuptiEventGetAttribute" );

		/* query event description */
		size = PAPI_2MAX_STR_LEN;
#ifdef CUDA_4_0
		err = (*cuptiEventGetAttributePtr)( dev,
						device[deviceId].domain[domainId].
						event[id].eventId,
						CUPTI_EVENT_ATTR_SHORT_DESCRIPTION, &size,
						( uint8_t * ) device[deviceId].
						domain[domainId].event[id].desc );
#else
		err = (*cuptiEventGetAttributePtr)(
						device[deviceId].domain[domainId].
						event[id].eventId,
						CUPTI_EVENT_ATTR_SHORT_DESCRIPTION, &size,
						( uint8_t * ) device[deviceId].
						domain[domainId].event[id].desc );
#endif
		CHECK_CUPTI_ERROR( err, "cuptiEventGetAttribute" );
	}

	totalEventCount += device[deviceId].domain[domainId].eventCount;
	free( eventId );
	return 0;
}
/*
 * Enumerate/Print the available CUPTI events for a given CUDA device and
 * domain.
 *
 * @param cuDev the CUDA device
 * @param domainId the CUPTI event domain
 */
static void enumEvents(CUdevice cuDev, CUpti_EventDomainID domainId)
{
  CUptiResult cuptiErr = CUPTI_SUCCESS;
  /* size_t DESC_SHORT = 512; */
  CUpti_EventID *eventId = NULL;
  uint32_t maxEvents = 0;
  uint32_t i = 0;
  size_t size = 0;

  /* query num of events available in the domain */
  cuptiErr = cuptiEventDomainGetNumEvents(cuDev,
                                          (CUpti_EventDomainID)domainId,
                                          &maxEvents);
  if(cuptiErr == CUPTI_ERROR_INVALID_EVENT_DOMAIN_ID){
    /* NOTE(review): this assumes vt_error_msg() terminates the program —
       otherwise execution continues with maxEvents == 0; confirm */
    vt_error_msg("Domain Id %d is not supported by device", domainId);
  }else{
    CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventDomainGetNumEvents");
  }

  size = sizeof(CUpti_EventID) * maxEvents;
  eventId = (CUpti_EventID*)malloc(size);
  /* NOTE(review): relies on vt_error_msg() not returning — a returning
     implementation would memset() a NULL pointer below; confirm */
  if(eventId == NULL)
    vt_error_msg("Failed to allocate memory to event ID");
  memset(eventId, 0, size);

  /* fetch the event identifiers of this domain */
  cuptiErr = cuptiEventDomainEnumEvents(cuDev,
                                        (CUpti_EventDomainID)domainId,
                                        &size, eventId);
  CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventDomainEnumEvents");

  /* query event info */
  {
    size_t NAME_SHORT = 32;
    char *eventname = (char*)malloc(NAME_SHORT*sizeof(char)); /* event name */
    /*char *shortdesc = malloc(DESC_SHORT*sizeof(char)); short desc of the event */

    for(i = 0; i < maxEvents; i++){
      /* the size argument is in/out — reset it before every query */
      NAME_SHORT = 32;
      cuptiErr = cuptiEventGetAttribute(cuDev, eventId[i],
                                        CUPTI_EVENT_ATTR_NAME,
                                        &NAME_SHORT, eventname);
      CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGetAttribute");

      /*cuptiErr = cuptiEventGetAttribute(cuDev, eventId[i],
                              CUPTI_EVENT_ATTR_SHORT_DESCRIPTION,
                              &DESC_SHORT, (uint8_t*)shortdesc);
      CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGetAttribute");*/

      vt_cntl_msg(1, "Id:Name = %d: %s", eventId[i], eventname);
      /*vt_cntl_msg(1, "Shortdesc = %s\n", shortdesc);*/
    }

    free(eventname);
  }

  free(eventId);
}
/*
 * CUPTI runtime-API callback: records enter/exit device timestamps for
 * cudaLaunch, cudaThreadSynchronize and cudaMemcpy calls into the
 * RuntimeApiTrace_t array passed as userdata.
 *
 * @param userdata the RuntimeApiTrace_t array registered via cuptiSubscribe()
 * @param domain   the callback domain (unused; runtime API is the only one enabled)
 * @param cbid     identifies which runtime-API function was intercepted
 * @param cbInfo   per-invocation data supplied by CUPTI
 */
void CUPTIAPI getTimestampCallback(void *userdata, CUpti_CallbackDomain domain,
                                   CUpti_CallbackId cbid,
                                   const CUpti_CallbackData *cbInfo)
{
  /* number of cudaMemcpy calls seen so far; selects successive
     memory-transfer slots in the trace array.
     NOTE(review): there is no bounds check — more transfers than slots
     writes past the array; confirm the array size matches the workload */
  static int memTransCount = 0;
  uint64_t startTimestamp;
  uint64_t endTimestamp;
  printf ("<------------getTimestampCallback--------------->\n");
  RuntimeApiTrace_t *traceData = (RuntimeApiTrace_t*)userdata;
  CUptiResult cuptiErr;

  // Data is collected only for the following API
  if ((cbid == CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020) ||
      (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaThreadSynchronize_v3020) ||
      (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020)) {

    // Set pointer depending on API
    if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020)
      traceData = traceData + KERNEL;
    else if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaThreadSynchronize_v3020)
      traceData = traceData + THREAD_SYNC;
    else if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020)
      traceData = traceData + MEMCPY_H2D1 + memTransCount;

    if (cbInfo->callbackSite == CUPTI_API_ENTER) {
      // for a kernel launch report the kernel name, otherwise use the API
      // function name.
      if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020) {
        traceData->functionName = cbInfo->symbolName;
      }
      else {
        traceData->functionName = cbInfo->functionName;
      }
      printf ("%s\t",traceData->functionName);

      // Store parameters passed to cudaMemcpy
      if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020) {
        traceData->memcpy_bytes =
          ((cudaMemcpy_v3020_params *)(cbInfo->functionParams))->count;
        traceData->memcpy_kind =
          ((cudaMemcpy_v3020_params *)(cbInfo->functionParams))->kind;
      }

      // Collect timestamp for API start
      cuptiErr = cuptiDeviceGetTimestamp(cbInfo->context, &startTimestamp);
      CHECK_CUPTI_ERROR(cuptiErr, "cuptiDeviceGetTimestamp");

      traceData->startTimestamp = startTimestamp;
      /* NOTE(review): "%llu" assumes uint64_t == unsigned long long;
         PRIu64 from <inttypes.h> would be portable — confirm target ABI */
      printf ("%llu\n", traceData->startTimestamp);
    }

    if (cbInfo->callbackSite == CUPTI_API_EXIT) {
      // Collect timestamp for API exit
      cuptiErr = cuptiDeviceGetTimestamp(cbInfo->context, &endTimestamp);
      CHECK_CUPTI_ERROR(cuptiErr, "cuptiDeviceGetTimestamp");

      traceData->endTimestamp = endTimestamp;

      // Advance to the next memory transfer operation
      if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020) {
        memTransCount++;
      }
    }
  }
  //displayTimestamps(traceData);
}