// Resource callback handler: keeps the activity buffer queues in step with
// the lifetime of contexts and streams.
//
//   cbid         - identifies which resource event fired
//   resourceData - callback payload; its context and stream handle are read
//
// On a "created" event two fresh buffers are queued; on a "destroy starting"
// event the matching queue is dumped until dump() returns NULL. Any other
// callback id is ignored.
static void handleResource(CUpti_CallbackId cbid, const CUpti_ResourceData *resourceData)
{
    const int created = (cbid == CUPTI_CBID_RESOURCE_CONTEXT_CREATED) ||
                        (cbid == CUPTI_CBID_RESOURCE_STREAM_CREATED);
    const int destroying = (cbid == CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING) ||
                           (cbid == CUPTI_CBID_RESOURCE_STREAM_DESTROY_STARTING);
    const int streamEvent = (cbid == CUPTI_CBID_RESOURCE_STREAM_CREATED) ||
                            (cbid == CUPTI_CBID_RESOURCE_STREAM_DESTROY_STARTING);

    // Queue id 0 is what the context-level events use; stream events
    // replace it with the stream's CUPTI id below.
    uint32_t queueId = 0;

    if (!created && !destroying) {
        return;
    }

    if (streamEvent) {
        CUPTI_CALL(cuptiGetStreamId(resourceData->context,
                                    resourceData->resourceHandle.stream,
                                    &queueId));
    }

    if (created) {
        // two buffers are queued per new context / stream
        queueNewBuffer(resourceData->context, queueId);
        queueNewBuffer(resourceData->context, queueId);
    } else {
        // drain every remaining buffer before the resource goes away
        while (dump(resourceData->context, queueId) != NULL) {
        }
    }
}
/*
 * Allocate and initialize a VampirTrace CUPTI activity context.
 *
 * All list/buffer pointers start out empty, the memory counter starts at
 * zero, the last GPU time is seeded with vt_gpu_init_time and GPU idle is
 * marked as on.
 *
 * @param cuCtx CUDA context used to look up the default CUPTI stream ID
 *
 * @return pointer to the newly created VampirTrace CUPTI activity context
 */
static vt_cupti_activity_t* vt_cuptiact_createCtxActivity(CUcontext cuCtx)
{
  vt_cupti_activity_t *act = malloc(sizeof(*act));

  /* NOTE(review): vt_error_msg is presumably fatal; the code below
     dereferences the pointer unconditionally - confirm */
  if(NULL == act){
    vt_error_msg("[CUPTI Activity] Could not allocate memory for activity context!");
  }

  /* nothing recorded yet for this context */
  act->strmList        = NULL;
  act->gpuMemList      = NULL;
  act->buffer          = NULL;
  act->gpuMemAllocated = 0;
  act->gpuIdleOn       = 1;
  act->vtLastGPUTime   = vt_gpu_init_time;

  /*
   * Record the GPU timestamp and then the host time as the start points
   * used for host/GPU time synchronization of the measurement interval.
   */
  VT_CUPTI_CALL(cuptiGetTimestamp(&(act->sync.gpuStart)), "cuptiGetTimestamp");
  act->sync.hostStart = vt_pform_wtime();

  /* default CUPTI stream ID (needed for memory usage and idle tracing) */
  VT_CUPTI_CALL(cuptiGetStreamId(cuCtx, NULL, &(act->defaultStrmID)),
                "cuptiGetStreamId");

  return act;
}
// Synchronization callback handler.
//
//   cbid     - identifies which synchronize event fired
//   syncData - callback payload; its context and stream are read
//
// The global queue is serviced on every call. After that, the queue that
// belongs to the synchronized context (queue id 0) or to the synchronized
// stream (its CUPTI stream id) is serviced. Whenever dumpIfFull() hands a
// buffer back, that buffer is re-enqueued on the queue it came from.
static void handleSync(CUpti_CallbackId cbid, const CUpti_SynchronizeData *syncData)
{
    uint32_t queueId = 0;
    uint8_t *recycled;

    // global queue first, independent of the kind of synchronization
    recycled = dumpIfFull(NULL, 0);
    if (recycled != NULL) {
        CUPTI_CALL(cuptiActivityEnqueueBuffer(NULL, 0, recycled, BUF_SIZE));
    }

    // decide which per-context queue (if any) this event refers to
    if (cbid == CUPTI_CBID_SYNCHRONIZE_CONTEXT_SYNCHRONIZED) {
        queueId = 0;
    } else if (cbid == CUPTI_CBID_SYNCHRONIZE_STREAM_SYNCHRONIZED) {
        CUPTI_CALL(cuptiGetStreamId(syncData->context, syncData->stream, &queueId));
    } else {
        return;
    }

    // service the context or stream queue selected above
    recycled = dumpIfFull(syncData->context, queueId);
    if (recycled != NULL) {
        CUPTI_CALL(cuptiActivityEnqueueBuffer(syncData->context, queueId,
                                              recycled, BUF_SIZE));
    }
}
/*
 * Allocate and initialize a VampirTrace CUPTI Activity context.
 *
 * @param ctxID ID of the CUDA context
 * @param cuCtx CUDA context handle; if NULL, the current context is
 *              queried via cuCtxGetCurrent()
 * @param devID ID of the CUDA device; if (uint32_t)-1, the device is
 *              queried via cuCtxGetDevice() and VT_NO_ID is stored when
 *              that query fails
 *
 * @return pointer to the newly created VampirTrace CUPTI Activity context
 */
static vt_cuptiact_ctx_t* vt_cuptiact_createContext(uint32_t ctxID,
                                                    CUcontext cuCtx,
                                                    uint32_t devID)
{
  vt_cuptiact_ctx_t *ctx = malloc(sizeof(*ctx));

  /* NOTE(review): vt_error_msg is presumably fatal; the code below
     dereferences the pointer unconditionally - confirm */
  if(NULL == ctx){
    vt_error_msg("[CUPTI Activity] Could not allocate memory for context!");
  }

  /* identity and (still empty) bookkeeping */
  ctx->ctxID           = ctxID;
  ctx->next            = NULL;
  ctx->strmList        = NULL;
  ctx->gpuMemAllocated = 0;
  ctx->gpuMemList      = NULL;
  ctx->buffer          = NULL;
  ctx->vtLastGPUTime   = vt_gpu_init_time;
  ctx->gpuIdleOn       = 1;

  /*
   * Record the GPU timestamp and then the host time as the start points
   * used for host/GPU time synchronization of the measurement interval.
   */
  VT_CUPTI_CALL(cuptiGetTimestamp(&(ctx->sync.gpuStart)), "cuptiGetTimestamp");
  ctx->sync.hostStart = vt_pform_wtime();

  /* remember the thread that created this context */
  VT_CHECK_THREAD;
  ctx->ptid = VT_MY_THREAD;

  /* no context handle supplied: fall back to the current one */
  if(NULL == cuCtx){
    CHECK_CU_ERROR(cuCtxGetCurrent(&cuCtx), NULL);
  }
  ctx->cuCtx = cuCtx;

  /* default CUPTI stream ID (needed for memory usage and idle tracing) */
  VT_CUPTI_CALL(cuptiGetStreamId(ctx->cuCtx, NULL, &(ctx->defaultStrmID)),
                "cuptiGetStreamId");

  /* no device ID supplied: ask the driver for the device of the context */
  if((uint32_t)-1 == devID){
    CUdevice cuDev;

    /* driver API prog: correct cuDev, but result is 201 (invalid context) */
    if(CUDA_SUCCESS == cuCtxGetDevice(&cuDev)){
      devID = (uint32_t)cuDev;
    }else{
      devID = VT_NO_ID;
    }
  }

  /* the same value is stored as device ID and as CUDA device */
  ctx->devID = devID;
  ctx->cuDev = devID;

  return ctx;
}
/*
 * Create a VampirTrace CUPTI stream.
 *
 * Allocates the stream object, resolves its stream ID, registers a
 * VampirTrace GPU thread for it and writes the initial events/counter
 * values on that thread. As a side effect vt_gpu_init_time is raised to
 * vt_start_time if it is smaller.
 *
 * The new stream is NOT linked into vtCtx->strmList by this function.
 *
 * @param vtCtx VampirTrace CUPTI context
 * @param cuStrm CUDA stream
 * @param strmID ID of the CUDA stream
 *
 * @return pointer to created VampirTrace CUPTI stream, or NULL if vtCtx is
 *         NULL or (with VT_CUPTI_ACTIVITY) neither a CUDA stream nor a
 *         stream ID was given
 */
vt_cupti_strm_t* vt_cupti_createStream(vt_cupti_ctx_t *vtCtx,
                                       CUstream cuStrm, uint32_t strmID)
{
  vt_cupti_strm_t *vtStrm = NULL;

  if(vtCtx == NULL){
    vt_warning("[CUPTI] Cannot create stream without VampirTrace CUPTI context");
    return NULL;
  }

  vtStrm = (vt_cupti_strm_t *)malloc(sizeof(vt_cupti_strm_t));
  /* NOTE(review): vt_error_msg is presumably fatal; vtStrm is dereferenced
     unconditionally below - confirm */
  if(vtStrm == NULL)
    vt_error_msg("[CUPTI] Could not allocate memory for stream!");

  vtStrm->cuStrm = cuStrm;
  vtStrm->vtLastTime = vt_gpu_init_time;
  vtStrm->destroyed = 0;
  vtStrm->next = NULL;

#if defined(VT_CUPTI_ACTIVITY)
  /* create stream by VT CUPTI callbacks implementation (CUstream is given) */
  if(strmID == VT_CUPTI_NO_STREAM_ID){
    if(cuStrm != VT_CUPTI_NO_STREAM){
      VT_CUPTI_CALL(cuptiGetStreamId(vtCtx->cuCtx, cuStrm, &strmID),
                    "cuptiGetStreamId");
    }else{
      vt_warning("[CUPTI] Neither CUDA stream nor stream ID given!");
      free(vtStrm);
      return NULL;
    }
  }
#else /* only VT_CUPTI_CALLBACKS is defined */
  /* stream IDs are handed out from the per-context creation counter */
  if(vtCtx->callbacks != NULL){
    strmID = vtCtx->callbacks->streamsCreated;
    vtCtx->callbacks->streamsCreated++;
  }
#endif

  vtStrm->cuStrmID = strmID;

  /* create VampirTrace thread */
  {
    /* name layout: "CUDA" followed by an optional "[...]" suffix */
    char thread_name[16] = "CUDA";
    const size_t prefix_len = 4; /* length of "CUDA" */
    char *suffix = thread_name + prefix_len;
    const size_t suffix_size = sizeof(thread_name) - prefix_len;
    int written = 0;

    /* IDs are unsigned, so they are printed with %u (not %d) */
    if(vt_gpu_stream_reuse){
      /* one thread per device: "CUDA[dev]", or plain "CUDA" if unknown */
      if(vtCtx->devID != VT_NO_ID)
        written = snprintf(suffix, suffix_size, "[%u]",
                           (unsigned int)vtCtx->devID);
    }else if(vtCtx->devID == VT_NO_ID){
      written = snprintf(suffix, suffix_size, "[?:%u]", (unsigned int)strmID);
    }else{
      written = snprintf(suffix, suffix_size, "[%u:%u]",
                         (unsigned int)vtCtx->devID, (unsigned int)strmID);
    }

    /* report formatting errors as well as truncated names */
    if(written < 0 || (size_t)written >= suffix_size)
      vt_cntl_msg(1, "Could not create thread name for CUDA thread!");

    VT_CHECK_THREAD;
    vt_gpu_registerThread(thread_name, VT_MY_THREAD, &(vtStrm->vtThrdID));
  }

  /* initial events must not be written before the trace start time */
  if(vt_gpu_init_time < vt_start_time)
    vt_gpu_init_time = vt_start_time;

  /* for the first stream created for this context */
  if(vtCtx->strmList == NULL){
    if(vt_gpu_trace_idle > 0){
      /* write enter event for GPU_IDLE on first stream */
      vt_enter(vtStrm->vtThrdID, &vt_gpu_init_time, vt_gpu_rid_idle);

#if defined(VT_CUPTI_ACTIVITY)
      if(vtCtx->activity != NULL)
        vtCtx->activity->gpuIdleOn = 1;
#endif
    }

    /* set the counter value for cudaMalloc to 0 on first stream */
    if(vt_gpu_trace_memusage > 0)
      vt_count(vtStrm->vtThrdID, &vt_gpu_init_time, vt_gpu_cid_memusage, 0);
  }

  if(vt_gpu_trace_kernels > 1){
    /* set count values to zero */
    vt_count(vtStrm->vtThrdID, &vt_gpu_init_time, vt_cupti_cid_blocksPerGrid, 0);
    vt_count(vtStrm->vtThrdID, &vt_gpu_init_time, vt_cupti_cid_threadsPerBlock, 0);
    vt_count(vtStrm->vtThrdID, &vt_gpu_init_time, vt_cupti_cid_threadsPerKernel, 0);
  }

  /* The stream is intentionally not prepended to vtCtx->strmList here;
     presumably the caller links it - verify against the call sites. */

  return vtStrm;
}