/**
 * CUPTI buffer-completed callback.
 *
 * Prints every activity record found in 'buffer', reports how many records
 * were dropped for the given context/stream, and finally prints the stall
 * summary table (one line per non-NULL entry of stall_name).
 *
 * ctx        context forwarded to the dropped-record query
 * streamId   stream forwarded to the dropped-record query
 * buffer     start of the activity records to walk
 * size       not used by this function
 * validSize  number of bytes of 'buffer' handed to cuptiActivityGetNextRecord
 *
 * NOTE(review): 'buffer' is neither freed nor handed back to CUPTI here;
 * confirm against bufferRequested() that ownership is dealt with elsewhere.
 */
static void CUPTIAPI
bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize)
{
    CUpti_Activity *rec = NULL;
    CUptiResult status;

    // Walk the records until CUPTI signals that the buffer is exhausted.
    for (;;) {
        status = cuptiActivityGetNextRecord(buffer, validSize, &rec);
        if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
            break;
        }
        if (status == CUPTI_SUCCESS) {
            printActivity(rec);
        } else {
            CUPTI_CALL(status);
        }
    }

    // Report any records that were dropped for this context/stream.
    size_t dropped;
    CUPTI_CALL(cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped));
    if (dropped != 0) {
        printf("Dropped %u activity records\n", (unsigned int)dropped);
    }

    printf("\n\n\n\n\n\n");
    printf("************* STALL SUMMARY ********************\n");
    for (int idx = 0; idx < 12; ++idx) {
        // slots without a name are skipped
        if (stall_name[idx] == NULL) {
            continue;
        }
        printf("%s = %d \n", stall_name[idx], val[idx]);
    }
    printf("*************************************************\n\n");
}
/**
 * Synchronize-domain callback handler.
 *
 * Always checks the global queue first; then, depending on 'cbid', checks
 * the queue of the synchronized context or stream. Every buffer that
 * dumpIfFull() hands back is put straight back on the queue it came from.
 *
 * cbid      which synchronize event fired
 * syncData  supplies the context (and stream) that was synchronized
 */
static void
handleSync(CUpti_CallbackId cbid, const CUpti_SynchronizeData *syncData)
{
    // Global queue: dump its top buffer if it is full, then re-queue it.
    uint8_t *buffer = dumpIfFull(NULL, 0);
    if (buffer != NULL) {
        CUPTI_CALL(cuptiActivityEnqueueBuffer(NULL, 0, buffer, BUF_SIZE));
    }

    switch (cbid) {
    case CUPTI_CBID_SYNCHRONIZE_CONTEXT_SYNCHRONIZED:
        // context sync: handle the context's own queue (stream id 0)
        buffer = dumpIfFull(syncData->context, 0);
        if (buffer != NULL) {
            CUPTI_CALL(cuptiActivityEnqueueBuffer(syncData->context, 0, buffer, BUF_SIZE));
        }
        break;

    case CUPTI_CBID_SYNCHRONIZE_STREAM_SYNCHRONIZED: {
        // stream sync: look up the stream's id and handle that queue
        uint32_t streamId;
        CUPTI_CALL(cuptiGetStreamId(syncData->context, syncData->stream, &streamId));
        buffer = dumpIfFull(syncData->context, streamId);
        if (buffer != NULL) {
            CUPTI_CALL(cuptiActivityEnqueueBuffer(syncData->context, streamId, buffer, BUF_SIZE));
        }
        break;
    }

    default:
        // other synchronize events need no per-context work
        break;
    }
}
/**
 * Resource-domain callback handler.
 *
 * On context/stream creation two fresh buffers are queued for the new
 * queue; when a context/stream is about to be destroyed its queue is
 * dumped until dump() reports it empty.
 *
 * cbid          which resource event fired
 * resourceData  supplies the context and, for stream events, the stream
 */
static void
handleResource(CUpti_CallbackId cbid, const CUpti_ResourceData *resourceData)
{
    switch (cbid) {
    case CUPTI_CBID_RESOURCE_CONTEXT_CREATED:
        // a new context gets two buffers on its own queue
        queueNewBuffer(resourceData->context, 0);
        queueNewBuffer(resourceData->context, 0);
        break;

    case CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING: {
        // drain every buffer still queued for the context
        uint8_t *drained;
        do {
            drained = dump(resourceData->context, 0);
        } while (drained != NULL);
        break;
    }

    case CUPTI_CBID_RESOURCE_STREAM_CREATED: {
        // a newly created non-default stream gets two buffers on its queue
        uint32_t streamId;
        CUPTI_CALL(cuptiGetStreamId(resourceData->context, resourceData->resourceHandle.stream, &streamId));
        queueNewBuffer(resourceData->context, streamId);
        queueNewBuffer(resourceData->context, streamId);
        break;
    }

    case CUPTI_CBID_RESOURCE_STREAM_DESTROY_STARTING: {
        // drain every buffer still queued for the stream
        uint32_t streamId;
        uint8_t *drained;
        CUPTI_CALL(cuptiGetStreamId(resourceData->context, resourceData->resourceHandle.stream, &streamId));
        do {
            drained = dump(resourceData->context, streamId);
        } while (drained != NULL);
        break;
    }

    default:
        // other resource events are ignored
        break;
    }
}
void initTrace() { // Enqueue a couple of buffers in the global queue queueNewBuffer(NULL, 0); queueNewBuffer(NULL, 0); // device activity record is created when CUDA initializes, so we // want to enable it before cuInit() or any CUDA runtime call CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE)); // CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY)); cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT); // cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER); // cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME); cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY); cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET); cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL); CUpti_SubscriberHandle subscriber; CUPTI_CALL(cuptiSubscribe(&subscriber, (CUpti_CallbackFunc)traceCallback, NULL)); CUPTI_CALL(cuptiEnableDomain(1, subscriber, CUPTI_CB_DOMAIN_RESOURCE)); CUPTI_CALL(cuptiEnableDomain(1, subscriber, CUPTI_CB_DOMAIN_SYNCHRONIZE)); //add by wukai //CUPIT_CALL(cuptiEnableDomain(1, subscriber, CUPTI_CB_DOMAIN_RUNTIME_API)); CUPTI_CALL(cuptiGetTimestamp(&startTimestamp)); }
/**
 * Dequeue the top buffer of the queue identified by 'context' and
 * 'streamId', print every activity record it holds plus the number of
 * dropped records, and return that buffer to the caller.
 *
 * Returns NULL, without printing anything, when the queue is empty.
 */
static uint8_t *
dump(CUcontext context, uint32_t streamId)
{
    uint8_t *top = NULL;
    size_t validBytes;
    CUptiResult status;

    status = cuptiActivityDequeueBuffer(context, streamId, &top, &validBytes);
    if (status == CUPTI_ERROR_QUEUE_EMPTY) {
        return NULL;
    }
    CUPTI_CALL(status);

    // Banner naming the queue that is about to be dumped.
    if (context != NULL && streamId != 0) {
        printf("[CUPTI] Starting dump for context %p, stream %u\n", context, streamId);
    } else if (context != NULL) {
        printf("[CUPTI] Starting dump for context %p\n", context);
    } else {
        printf("[CUPTI] Starting dump for global\n");
    }

    // Print records until CUPTI reports that the buffer is exhausted.
    CUpti_Activity *rec = NULL;
    for (;;) {
        status = cuptiActivityGetNextRecord(top, validBytes, &rec);
        if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
            printf ("[CUPTI] CUPTI_ERROR_MAX_LIMIT_REACHED\n");
            break;
        }
        if (status == CUPTI_SUCCESS) {
            printActivity(rec);
        } else {
            CUPTI_CALL(status);
        }
    }

    // report any records dropped from the queue
    size_t dropped;
    CUPTI_CALL(cuptiActivityGetNumDroppedRecords(context, streamId, &dropped));
    if (dropped != 0) {
        printf("Dropped %u activity records\n", (unsigned int)dropped);
    }

    // Matching banner once the dump is complete.
    if (context != NULL && streamId != 0) {
        printf("[CUPTI] Finished dump for context %p, stream %u\n", context, streamId);
    } else if (context != NULL) {
        printf("[CUPTI] Finished dump for context %p \n", context);
    } else {
        printf("[CUPTI] Finished dump for global\n");
    }

    return top;
}
/**
 * Allocate a new BUF_SIZE buffer and add it to the queue specified by
 * 'context' and 'streamId'.
 *
 * ALIGN_SIZE extra bytes are allocated so that ALIGN_BUFFER can adjust the
 * start address; the adjusted address is what CUPTI receives. The pointer
 * returned by malloc is not retained by this function.
 *
 * If the allocation fails, an error is printed to stderr and the process
 * exits, rather than handing an invalid address to CUPTI.
 */
static void
queueNewBuffer(CUcontext context, uint32_t streamId)
{
    size_t size = BUF_SIZE;
    uint8_t *buffer = (uint8_t *)malloc(size+ALIGN_SIZE);

    if (buffer == NULL) {
        fprintf(stderr, "[CUPTI] error: out of memory allocating a %zu-byte activity buffer\n",
                (size_t)(size + ALIGN_SIZE));
        exit(EXIT_FAILURE);
    }

    CUPTI_CALL(cuptiActivityEnqueueBuffer(context, streamId, ALIGN_BUFFER(buffer, ALIGN_SIZE), size));
}
/**
 * Query the top buffer of the queue identified by 'context' and
 * 'streamId'. When the query reports CUPTI_ERROR_MAX_LIMIT_REACHED (the
 * buffer is full) the buffer is dumped and returned; otherwise NULL is
 * returned. An empty queue is not treated as an error; any other failure
 * status is passed to CUPTI_CALL.
 */
static uint8_t *
dumpIfFull(CUcontext context, uint32_t streamId)
{
    size_t validBufferSizeBytes;   // filled in by the query, not otherwise used
    CUptiResult status = cuptiActivityQueryBuffer(context, streamId, &validBufferSizeBytes);

    if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
        return dump(context, streamId);
    }

    if (status != CUPTI_SUCCESS && status != CUPTI_ERROR_QUEUE_EMPTY) {
        CUPTI_CALL(status);
    }

    return NULL;
}
int cupti_metrics_enable() { CUptiResult res; if (!initialised || !metricCount) { return 0; } // We cannot initialise anything here - we only hook CUDA context // creation in order to do the actual work later. if (subscriber == NULL) { // Create subscription CUPTI_CALL(cuptiSubscribe(&subscriber, (CUpti_CallbackFunc)cupti_callback, NULL)); // Bind callbacks CUPTI_CALL(cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_RESOURCE, CUPTI_CBID_RESOURCE_CONTEXT_CREATED)); CUPTI_CALL(cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_RESOURCE, CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING)); // Hook kernel launch. This feels hacky, but is the approach // callback_metric.cu takes. #ifndef LOG_ALL_DRIVER_CALLS CUPTI_CALL(cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API, CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)); #else int i; for (i = CUPTI_DRIVER_TRACE_CBID_cuInit; i < CUPTI_DRIVER_TRACE_CBID_SIZE; i++) { CUPTI_CALL(cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API, i)); } #endif } }
__attribute__((constructor)) void initTrace() { //get the arguments from the environment variables int deviceId, sampRate; CUcontext cuCtx; deviceId = atoi(getenv("GPU_DEVICE_ID")); cuInit(0); cuCtxCreate(&cuCtx,0,deviceId); CUPTI_CALL(cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted)); CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_PC_SAMPLING)); //CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_BRANCH)); CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL)); CUPTI_CALL(cuptiSubscribe(&g_subscriber, (CUpti_CallbackFunc)traceCallback, NULL)); CUPTI_CALL(cuptiEnableDomain(1, g_subscriber, CUPTI_CB_DOMAIN_RESOURCE)); CUpti_ActivityPCSamplingConfig config; sampRate=atoi(getenv("PC_SAMPLING_RATE")); config.samplingPeriod= sampRate; CUPTI_CALL(cuptiActivityConfigurePCSampling(cuCtx, &config)); }
/**
 * Destructor counterpart of initTrace(): calls cuptiActivityFlushAll with
 * flag 0, routed through CUPTI_CALL for error handling.
 */
__attribute__((destructor)) void finiTrace()
{
    CUPTI_CALL(cuptiActivityFlushAll(0));
}