void initTrace() { // Enqueue a couple of buffers in the global queue queueNewBuffer(NULL, 0); queueNewBuffer(NULL, 0); // device activity record is created when CUDA initializes, so we // want to enable it before cuInit() or any CUDA runtime call CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE)); // CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY)); cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT); // cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER); // cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME); cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY); cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET); cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL); CUpti_SubscriberHandle subscriber; CUPTI_CALL(cuptiSubscribe(&subscriber, (CUpti_CallbackFunc)traceCallback, NULL)); CUPTI_CALL(cuptiEnableDomain(1, subscriber, CUPTI_CB_DOMAIN_RESOURCE)); CUPTI_CALL(cuptiEnableDomain(1, subscriber, CUPTI_CB_DOMAIN_SYNCHRONIZE)); //add by wukai //CUPIT_CALL(cuptiEnableDomain(1, subscriber, CUPTI_CB_DOMAIN_RUNTIME_API)); CUPTI_CALL(cuptiGetTimestamp(&startTimestamp)); }
/*
 * Create a VampirTrace CUPTI activity context.
 *
 * @param cuCtx CUDA context used to query the default stream ID
 *
 * @return pointer to created VampirTrace CUPTI Activity context
 */
static vt_cupti_activity_t* vt_cuptiact_createCtxActivity(CUcontext cuCtx)
{
  vt_cupti_activity_t *vtCtxAct = NULL;

  /* create new context, as it is not listed
   * (idiomatic C: no cast of malloc, sizeof of the dereferenced pointer) */
  vtCtxAct = malloc(sizeof *vtCtxAct);
  if(vtCtxAct == NULL)
    vt_error_msg("[CUPTI Activity] Could not allocate memory for activity context!");
    /* NOTE(review): assumes vt_error_msg() aborts and never returns --
     * otherwise the initializations below dereference NULL; confirm */

  vtCtxAct->strmList = NULL;
  vtCtxAct->gpuMemAllocated = 0;
  vtCtxAct->gpuMemList = NULL;
  vtCtxAct->buffer = NULL;
  vtCtxAct->vtLastGPUTime = vt_gpu_init_time;
  vtCtxAct->gpuIdleOn = 1;

  /*
   * Get time synchronization factor between host and GPU time for measurement
   * interval
   */
  {
    VT_CUPTI_CALL(cuptiGetTimestamp(&(vtCtxAct->sync.gpuStart)),
                  "cuptiGetTimestamp");
    vtCtxAct->sync.hostStart = vt_pform_wtime();
  }

  /* set default CUPTI stream ID (needed for memory usage and idle tracing) */
  VT_CUPTI_CALL(cuptiGetStreamId(cuCtx, NULL, &(vtCtxAct->defaultStrmID)),
                "cuptiGetStreamId");

  return vtCtxAct;
}
/*
 * Create a VampirTrace CUPTI Activity context.
 *
 * @param ctxID ID of the CUDA context
 * @param cuCtx the CUDA context (queried via cuCtxGetCurrent() if NULL)
 * @param devID ID of the CUDA device ((uint32_t)-1 to look it up here)
 *
 * @return pointer to created VampirTrace CUPTI Activity context
 */
static vt_cuptiact_ctx_t* vt_cuptiact_createContext(uint32_t ctxID,
                                                    CUcontext cuCtx,
                                                    uint32_t devID)
{
  vt_cuptiact_ctx_t *vtCtx = NULL;

  /* create new context, as it is not listed
   * (idiomatic C: no cast of malloc, sizeof of the dereferenced pointer) */
  vtCtx = malloc(sizeof *vtCtx);
  if(vtCtx == NULL)
    vt_error_msg("[CUPTI Activity] Could not allocate memory for context!");
    /* NOTE(review): assumes vt_error_msg() aborts and never returns --
     * otherwise the initializations below dereference NULL; confirm */

  vtCtx->ctxID = ctxID;
  vtCtx->next = NULL;
  vtCtx->strmList = NULL;
  vtCtx->gpuMemAllocated = 0;
  vtCtx->gpuMemList = NULL;
  vtCtx->buffer = NULL;
  vtCtx->vtLastGPUTime = vt_gpu_init_time;
  vtCtx->gpuIdleOn = 1;

  /*
   * Get time synchronization factor between host and GPU time for measurement
   * interval
   */
  {
    VT_CUPTI_CALL(cuptiGetTimestamp(&(vtCtx->sync.gpuStart)),
                  "cuptiGetTimestamp");
    vtCtx->sync.hostStart = vt_pform_wtime();
  }

  VT_CHECK_THREAD;
  vtCtx->ptid = VT_MY_THREAD;

  /* fall back to the current CUDA context if none was given */
  if(cuCtx == NULL)
    CHECK_CU_ERROR(cuCtxGetCurrent(&cuCtx), NULL);
  vtCtx->cuCtx = cuCtx;

  /* set default CUPTI stream ID (needed for memory usage and idle tracing) */
  VT_CUPTI_CALL(cuptiGetStreamId(vtCtx->cuCtx, NULL, &(vtCtx->defaultStrmID)),
                "cuptiGetStreamId");

  if(devID == (uint32_t)-1){
    CUdevice cuDev;

    /* driver API prog: correct cuDev, but result is 201 (invalid context) */
    if(CUDA_SUCCESS != cuCtxGetDevice(&cuDev)){
      devID = VT_NO_ID;
    }else{
      devID = (uint32_t)cuDev;
    }
  }

  vtCtx->devID = devID;
  vtCtx->cuDev = devID;

  /*vt_cntl_msg(1,"device id: %d", devID);*/

  return vtCtx;
}
/*
 * Flush the CUPTI activity buffer of the given VampirTrace CUPTI context:
 * dequeue the buffer, write all contained records to the trace, report
 * dropped records, re-enqueue the buffer and set a new host/GPU time
 * synchronization point.
 *
 * @param vtCtx VampirTrace CUPTI context whose activities are flushed
 */
void vt_cuptiact_flushCtxActivities(vt_cupti_ctx_t *vtCtx)
{
  CUptiResult status;
  uint8_t *buffer = NULL;
  size_t bufSize;
  CUpti_Activity *record = NULL;
  uint64_t hostStop, gpuStop;
  uint32_t ptid = VT_NO_ID;
  vt_cupti_activity_t *vtcuptiActivity = NULL;

  /* check for VampirTrace CUPTI context */
  if(vtCtx == NULL || vtCtx->activity == NULL){
    vt_warning("[CUPTI Activity] Context not found!");
    return;
  }

  vtcuptiActivity = vtCtx->activity;

  /* check if the buffer contains records */
  status = cuptiActivityQueryBuffer(vtCtx->cuCtx, 0, &bufSize);
  if(status != CUPTI_SUCCESS){
    /* NOTE(review): the second operand subsumes the first (QUEUE_EMPTY can
     * never equal MAX_LIMIT_REACHED), so this returns for every error except
     * CUPTI_ERROR_MAX_LIMIT_REACHED -- confirm this is the intended logic */
    if(CUPTI_ERROR_QUEUE_EMPTY == status ||
       CUPTI_ERROR_MAX_LIMIT_REACHED != status){
      return;
    }
  }

  /* expose VampirTrace CUPTI activity flush as measurement overhead */
  VT_CHECK_THREAD;
  ptid = VT_MY_THREAD;
  hostStop = vt_pform_wtime();
  vt_enter(ptid, &hostStop, vt_cuptiact_rid_flush);

  /* %p: CUcontext is an opaque pointer type (the original %d mismatched) */
  vt_cntl_msg(2, "[CUPTI Activity] Handle context %p activities",
              (void *)vtCtx->cuCtx);

  /* lock the whole buffer flush
  VT_CUPTI_LOCK();*/

  /* dump the contents of the global queue */
  VT_CUPTI_CALL(cuptiActivityDequeueBuffer(vtCtx->cuCtx, 0, &buffer,
                                           &bufSize),
                "cuptiActivityDequeueBuffer");

  /*
   * Get time synchronization factor between host and GPU time for measured
   * period
   */
  {
    VT_CUPTI_CALL(cuptiGetTimestamp(&gpuStop), "cuptiGetTimestamp");
    hostStop = vt_pform_wtime();
    vtcuptiActivity->sync.hostStop = hostStop;

    vtcuptiActivity->sync.factor =
        (double)(hostStop - vtcuptiActivity->sync.hostStart)
        / (double)(gpuStop - vtcuptiActivity->sync.gpuStart);
  }

  /*vt_cntl_msg(1, "hostStop: %llu , gpuStop: %llu", hostStopTS, gpuStopTS);
    vt_cntl_msg(1, "factor: %lf", syncFactor);*/

  /* write each record until the buffer is exhausted */
  do{
    status = cuptiActivityGetNextRecord(buffer, bufSize, &record);
    if(status == CUPTI_SUCCESS){
      vt_cuptiact_writeRecord(record, vtCtx);
    }else if(status == CUPTI_ERROR_MAX_LIMIT_REACHED){
      break;
    }else{
      VT_CUPTI_CALL(status, "cuptiActivityGetNextRecord");
    }
  }while(1);

  /* report any records dropped from the global queue */
  {
    size_t dropped;

    VT_CUPTI_CALL(cuptiActivityGetNumDroppedRecords(vtCtx->cuCtx, 0,
                                                    &dropped),
                  "cuptiActivityGetNumDroppedRecords");
    if(dropped != 0)
      /* cast the size expressions explicitly so %llu is always correct */
      vt_warning("[CUPTI Activity] Dropped %u records. Current buffer size: %llu bytes\n"
                 "To avoid dropping of records increase the buffer size!\n"
                 "Proposed minimum VT_CUDATRACE_BUFFER_SIZE=%llu",
                 (unsigned int)dropped,
                 (unsigned long long)vt_cuptiact_bufSize,
                 (unsigned long long)(vt_cuptiact_bufSize
                     + dropped/2 * (sizeof(CUpti_ActivityKernel)
                                    + sizeof(CUpti_ActivityMemcpy))));
  }

  /* enter GPU idle region after last kernel, if exited before */
  if(vtcuptiActivity->gpuIdleOn == 0){
    vt_enter(vtcuptiActivity->strmList->vtThrdID,
             &(vtcuptiActivity->vtLastGPUTime), vt_gpu_rid_idle);
    vtcuptiActivity->gpuIdleOn = 1;
    /*vt_warning("IDLfente: %llu (%d)", vtCtx->vtLastGPUTime,
                 vtCtx->strmList->vtThrdID);*/
  }

  /* enqueue buffer again */
  VT_CUPTI_CALL(cuptiActivityEnqueueBuffer(vtCtx->cuCtx, 0, buffer,
                                           vt_cuptiact_bufSize),
                "cuptiActivityEnqueueBuffer");

  /* set new synchronization point */
  vtcuptiActivity->sync.hostStart = hostStop;
  vtcuptiActivity->sync.gpuStart = gpuStop;

  /*VT_CUPTI_UNLOCK();*/

  /* use local variable hostStop to write exit event for activity flush */
  hostStop = vt_pform_wtime();
  vt_exit(ptid, &hostStop);
}