static void vt_cuptievt_start(vt_cupti_events_t *vtcuptiEvtCtx) { CUptiResult cuptiErr = CUPTI_SUCCESS; vt_cupti_evtgrp_t *vtcuptiGrp = NULL; vt_cupti_evtgrp_t *lastGrp = NULL; /* start gathering counter values, if context was successfully initialized */ if(NULL == vtcuptiEvtCtx){ /* no performance counters for this thread available */ VT_CHECK_THREAD; vt_gpu_prop[VT_MY_THREAD] |= VTGPU_NO_PC; vt_cntl_msg(2, "[CUPTI Events] Context not initialized!"); return; } /* start all groups */ vtcuptiGrp = vtcuptiEvtCtx->vtGrpList; lastGrp = vtcuptiEvtCtx->vtGrpList; while(vtcuptiGrp != NULL){ cuptiErr = cuptiEventGroupEnable(vtcuptiGrp->evtGrp); /* if the event group could not be enabled, remove it */ if(cuptiErr != CUPTI_SUCCESS){ size_t i; vt_cupti_evtgrp_t *freeGrp = vtcuptiGrp; size_t valueSize = 32; char name[32]; vtcuptiGrp = vtcuptiGrp->next; /* give user information about the group, which cannot be enabled */ for(i = 0; i < freeGrp->evtNum; i++){ VTCUPTIEVENTGETATTRIBUTE(vtcuptiEvtCtx->vtDevCap->cuDev, *(freeGrp->cuptiEvtIDs)+i, CUPTI_EVENT_ATTR_NAME, &valueSize, (char*)name); vt_warning("[CUPTI Events] Event '%s' (%d) cannot be enabled", name, *(freeGrp->cuptiEvtIDs)+i); } /* group is first element in linked list */ if(vtcuptiEvtCtx->vtGrpList == freeGrp){ vtcuptiEvtCtx->vtGrpList = vtcuptiEvtCtx->vtGrpList->next; }else{/* has to be at least the second group in linked list */ lastGrp->next = freeGrp->next; } free(freeGrp); freeGrp = NULL; }else{ vtcuptiGrp->enabled = 1; lastGrp= vtcuptiGrp; vtcuptiGrp = vtcuptiGrp->next; } } }
static void vt_cupti_start(vt_cupti_ctx_t *vtcuptiCtx) { CUptiResult cuptiErr = CUPTI_SUCCESS; vt_cupti_grp_t *vtcuptiGrp = NULL; vt_cupti_grp_t *lastGrp = NULL; if(vtcuptiCtx == NULL) return; /* start all groups */ vtcuptiGrp = vtcuptiCtx->vtGrpList; lastGrp = vtcuptiCtx->vtGrpList; while(vtcuptiGrp != NULL){ cuptiErr = cuptiEventGroupEnable(vtcuptiGrp->evtGrp); /* if the event group could not be enabled, remove it */ if(cuptiErr != CUPTI_SUCCESS){ size_t i; vt_cupti_grp_t *freeGrp = vtcuptiGrp; size_t valueSize = 32; char name[32]; vtcuptiGrp = vtcuptiGrp->next; /* give user information about the group, which cannot be enabled */ for(i = 0; i < freeGrp->evtNum; i++){ cuptiEventGetAttribute(vtcuptiCtx->vtDevCap->cuDev, *(freeGrp->cuptiEvtIDs)+i, CUPTI_EVENT_ATTR_NAME, &valueSize, (char*)name); vt_warning("[CUPTI] Event '%s' (%d) cannot be enabled", name, *(freeGrp->cuptiEvtIDs)+i); } /* group is first element in linked list */ if(vtcuptiCtx->vtGrpList == freeGrp){ vtcuptiCtx->vtGrpList = vtcuptiCtx->vtGrpList->next; }else{/* has to be at least the second group in linked list */ lastGrp->next = freeGrp->next; } free(freeGrp); freeGrp = NULL; }else{ vtcuptiGrp->enabled = 1; lastGrp= vtcuptiGrp; vtcuptiGrp = vtcuptiGrp->next; } } }
static void cupti_callback_launch_kernel(cupti_user_t *user, CUpti_CallbackData *cbdata) { // Find associated counter data pthread_mutex_lock(&mutex); struct context_counter_data *counter_data; for (counter_data = allCounterData; counter_data; counter_data = counter_data->next) { if (counter_data->context == cbdata->context) { break; } } pthread_mutex_unlock(&mutex); if (!counter_data) { if (cbdata->callbackSite == CUPTI_API_ENTER) { fprintf(stderr, "CUPTI warning: Could not find context for kernel start!\n"); // Simply generate it. Use user data as a cheap way to // prevent an infinite loop. if (user) { CUpti_ResourceData rdata; memset(&rdata, 0, sizeof(rdata)); rdata.context = cbdata->context; cupti_callback_context_created(user, &rdata); cupti_callback_launch_kernel(NULL, cbdata); } } return; } CUpti_EventGroupSets *eventGroupPasses = counter_data->eventGroupSets; if (cbdata->callbackSite == CUPTI_API_ENTER) { //cudaDeviceSynchronize(); // Set collection mode. Kernel mode is the only one that is // guaranteed to work, even if it forces us to sum up metrics // manually. CUPTI_ASSERT(cuptiSetEventCollectionMode(cbdata->context, CUPTI_EVENT_COLLECTION_MODE_KERNEL)); // Enable the counters! int i; for (i = 0; i < eventGroupPasses->sets->numEventGroups; i++) { uint32_t all = 1; CUPTI_ASSERT(cuptiEventGroupSetAttribute(eventGroupPasses->sets->eventGroups[i], CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES, sizeof(all), &all)); CUPTI_ASSERT(cuptiEventGroupEnable(eventGroupPasses->sets->eventGroups[i])); } } else if (cbdata->callbackSite == CUPTI_API_EXIT) { CUdevice device = get_device_from_ctx(cbdata->context); // Find out how many events we have in total. Note that // cuptiMetricGetNumEvents wouldn't help us here, as we are // collecting multiple metrics, which *might* have overlapping // events. uint32_t numEvents = 0; int i; for (i = 0; i < eventGroupPasses->sets->numEventGroups; i++) { uint32_t num = 0; size_t numSize = sizeof(num); CUPTI_ASSERT(cuptiEventGroupGetAttribute(eventGroupPasses->sets->eventGroups[i], CUPTI_EVENT_GROUP_ATTR_NUM_EVENTS, &numSize, &num)); numEvents += num; } // Allocate arrays for event IDs & values size_t eventIdsSize = sizeof(CUpti_EventID) * numEvents; CUpti_EventID *eventIds = (CUpti_EventID *)alloca(eventIdsSize); size_t eventValuesSize = sizeof(uint64_t) * numEvents; uint64_t *eventValues = (uint64_t *)alloca(eventValuesSize); memset(eventValues, 0, sizeof(uint64_t) * numEvents); // Now read all events, per group int eventIx = 0; for (i = 0; i < eventGroupPasses->sets->numEventGroups; i++) { CUpti_EventGroup eventGroup = eventGroupPasses->sets->eventGroups[i]; // Get event IDs uint32_t num = 0; size_t numSize = sizeof(num); CUPTI_ASSERT(cuptiEventGroupGetAttribute(eventGroup, CUPTI_EVENT_GROUP_ATTR_NUM_EVENTS, &numSize, &num)); // Get how many domain instances were actually counting uint32_t domInstNum = 0; size_t domInstNumSize = sizeof(domInstNum); CUPTI_ASSERT(cuptiEventGroupGetAttribute(eventGroup, CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT, &domInstNumSize, &domInstNum)); // Get counter values from all instances size_t idsSize = sizeof(CUpti_EventID) * num; size_t valsSize = sizeof(uint64_t) * num * domInstNum; uint64_t *vals = (uint64_t *)alloca(valsSize); size_t numRead = 0; CUPTI_ASSERT(cuptiEventGroupReadAllEvents(eventGroup, CUPTI_EVENT_READ_FLAG_NONE, &valsSize, vals, &idsSize, eventIds + eventIx, &numRead)); if (numRead != num) { fprintf(stderr, "CUPTI warning: ReadAllEvents returned unexpected number of values (expected %u, got %u)!\n", (unsigned)num, (unsigned)numRead); } // For normalisation we need the *total* number of domain // instances (not only the ones that were available for counting) CUpti_EventDomainID domainId = 0; size_t domainIdSize = sizeof(domainId); CUPTI_ASSERT(cuptiEventGroupGetAttribute(eventGroup, CUPTI_EVENT_GROUP_ATTR_EVENT_DOMAIN_ID, &domainIdSize, &domainId)); uint32_t totalDomInstNum = 0; size_t totalDomInstNumSize = sizeof(totalDomInstNum); CUPTI_ASSERT(cuptiDeviceGetEventDomainAttribute(device, domainId, CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT, &totalDomInstNumSize, &totalDomInstNum)); // Determine true counter values int j; for (j = 0; j < numRead; j++) { // First, sum up across instances uint64_t val = 0; int k; for (k = 0; k < domInstNum; k++) { val += vals[j+k*num]; } // Then normalise and add to proper event count eventValues[eventIx + j] = (val * totalDomInstNum) / domInstNum; } // Progress! eventIx += num; } // Now calculate metrics. for (i = 0; i < metricCount; i++) { // This only works if the metric does not depend on kernel // time (because we set it to zero here - use // cupti_activity facilities to measure kernel time // separately). CUpti_MetricValue metric; CUPTI_ASSERT(cuptiMetricGetValue(device, counter_data->metricIds[i], eventIdsSize, eventIds, eventValuesSize, eventValues, 0, &metric)); // Sum up metrics. Note that this might not actually make // sense for all of them, we warn about that before. switch (counter_data->metricKinds[i]) { case CUPTI_METRIC_VALUE_KIND_DOUBLE: metrics[i] += metric.metricValueDouble; break; case CUPTI_METRIC_VALUE_KIND_UINT64: metrics[i] += metric.metricValueUint64; break; case CUPTI_METRIC_VALUE_KIND_INT64: metrics[i] += metric.metricValueInt64; break; case CUPTI_METRIC_VALUE_KIND_PERCENT: metrics[i] += metric.metricValuePercent; break; case CUPTI_METRIC_VALUE_KIND_THROUGHPUT: metrics[i] += metric.metricValueThroughput; break; case CUPTI_METRIC_VALUE_KIND_UTILIZATION_LEVEL: metrics[i] += metric.metricValueUtilizationLevel; break; } } } }