/*
 * Create a VampirTrace CUPTI activity context.
 *
 * @param cuCtx the CUDA context the activity context is created for
 *
 * @return pointer to created VampirTrace CUPTI Activity context
 */
static vt_cupti_activity_t* vt_cuptiact_createCtxActivity(CUcontext cuCtx)
{
  /* allocate a fresh activity context, as it is not listed yet */
  vt_cupti_activity_t *actCtx =
          (vt_cupti_activity_t *)malloc(sizeof(vt_cupti_activity_t));

  if(NULL == actCtx)
    vt_error_msg("[CUPTI Activity] Could not allocate memory for activity context!");

  /* initialize all members with their defaults */
  actCtx->strmList        = NULL;
  actCtx->gpuMemAllocated = 0;
  actCtx->gpuMemList      = NULL;
  actCtx->buffer          = NULL;
  actCtx->vtLastGPUTime   = vt_gpu_init_time;
  actCtx->gpuIdleOn       = 1;

  /* take the first host/GPU timestamp pair; it is used later to compute the
     time synchronization factor for the measurement interval */
  VT_CUPTI_CALL(cuptiGetTimestamp(&(actCtx->sync.gpuStart)),
                "cuptiGetTimestamp");
  actCtx->sync.hostStart = vt_pform_wtime();

  /* set default CUPTI stream ID (needed for memory usage and idle tracing) */
  VT_CUPTI_CALL(cuptiGetStreamId(cuCtx, NULL, &(actCtx->defaultStrmID)),
                "cuptiGetStreamId");

  return actCtx;
}
/*
 * Create a VampirTrace CUPTI Activity context.
 *
 * @param ctxID ID of the CUDA context
 * @param cuCtx CUDA context handle (NULL: the current context is queried)
 * @param devID ID of the CUDA device ((uint32_t)-1: determined via
 *              cuCtxGetDevice() below)
 *
 * @return pointer to created VampirTrace CUPTI Activity context
 */
static vt_cuptiact_ctx_t* vt_cuptiact_createContext(uint32_t ctxID,
                                                    CUcontext cuCtx,
                                                    uint32_t devID)
{
  vt_cuptiact_ctx_t* vtCtx = NULL;

  /* create new context, as it is not listed */
  vtCtx = (vt_cuptiact_ctx_t *)malloc(sizeof(vt_cuptiact_ctx_t));
  if(vtCtx == NULL)
    vt_error_msg("[CUPTI Activity] Could not allocate memory for context!");
  vtCtx->ctxID = ctxID;
  vtCtx->next = NULL;
  vtCtx->strmList = NULL;
  vtCtx->gpuMemAllocated = 0;
  vtCtx->gpuMemList = NULL;
  vtCtx->buffer = NULL;
  /* GPU time line starts at the common measurement initialization time */
  vtCtx->vtLastGPUTime = vt_gpu_init_time;
  vtCtx->gpuIdleOn = 1;

  /*
   * Get time synchronization factor between host and GPU time for measurement
   * interval
   */
  {
    VT_CUPTI_CALL(cuptiGetTimestamp(&(vtCtx->sync.gpuStart)),
                  "cuptiGetTimestamp");
    vtCtx->sync.hostStart = vt_pform_wtime();
  }

  /* remember the host thread that created this context */
  VT_CHECK_THREAD;
  vtCtx->ptid = VT_MY_THREAD;

  /* fall back to the current CUDA context if none was given */
  if(cuCtx == NULL) CHECK_CU_ERROR(cuCtxGetCurrent(&cuCtx), NULL);
  vtCtx->cuCtx = cuCtx;

  /* set default CUPTI stream ID (needed for memory usage and idle tracing) */
  VT_CUPTI_CALL(cuptiGetStreamId(vtCtx->cuCtx, NULL, &(vtCtx->defaultStrmID)),
                "cuptiGetStreamId");

  if(devID == (uint32_t)-1){
    CUdevice cuDev;

    /* driver API prog: correct cuDev, but result is 201 (invalid context) */
    if(CUDA_SUCCESS != cuCtxGetDevice(&cuDev)){
      devID = VT_NO_ID;
    }else{
      devID = (uint32_t)cuDev;
    }
  }

  vtCtx->devID = devID;
  /* NOTE(review): the device ID is also stored in the CUdevice field — with
     the driver API these values coincide, but when devID == VT_NO_ID the
     CUdevice is set to an invalid handle too; confirm this is intended */
  vtCtx->cuDev = devID;

  /*vt_cntl_msg(1,"device id: %d", devID);*/

  return vtCtx;
}
/*
 * Print all available counters to stdout and terminate the application.
 *
 * Note: this function is reached while VT_CUPTI_LOCK() is held (see the
 * call path comment at the bottom); therefore error paths must NOT return
 * directly, but fall through to the unlock/exit code.
 *
 * @param capList list of CUDA devices with different capabilities
 */
static void vt_cupti_showAllCounters(vt_cupti_device_t *capList)
{
  CUptiResult cuptiErr = CUPTI_SUCCESS;
  CUpti_EventDomainID *domainId = NULL;
  uint32_t maxDomains = 0;
  uint32_t i;
  size_t size = 0;

  while(capList != NULL){
    CUdevice cuDev = capList->cuDev;
    vt_cntl_msg(1, "[CUPTI Events] Available events for device %d (SM %d.%d):",
                   cuDev, capList->dev_major, capList->dev_minor);
    vt_cntl_msg(1, "Id:Name");
    vt_cntl_msg(1, "Description\n"
             "-------------------------------------------------------------------");

    cuptiErr = cuptiDeviceGetNumEventDomains(cuDev, &maxDomains);
    VT_CUPTI_CALL(cuptiErr, "cuptiDeviceGetNumEventDomains");

    if(maxDomains == 0){
      vt_warning("[CUPTI Events] No domain is exposed by dev = %d\n", cuDev);
      /* bugfix: was 'return', which skipped the remaining devices AND the
         VT_CUPTI_UNLOCK()/exit(0) below, leaving the CUPTI lock held;
         skip only this device instead */
      capList = capList->next;
      continue;
    }

    size = sizeof(CUpti_EventDomainID) * maxDomains;
    domainId = (CUpti_EventDomainID*)malloc(size);
    if(domainId == NULL){
      vt_warning("[CUPTI Events] Failed to allocate memory to domain ID");
      /* bugfix: was 'return' — fall through to the unlock/exit code below */
      break;
    }
    memset(domainId, 0, size);

    cuptiErr = cuptiDeviceEnumEventDomains(cuDev, &size, domainId);
    VT_CUPTI_CALL(cuptiErr, "cuptiDeviceEnumEventDomains");

    /* enum domains */
    for(i = 0; i < maxDomains; i++)
      vt_cuptievt_enumEvents(cuDev, domainId[i]);

    vt_cntl_msg(1, "------------------------------------------------------");

    free(domainId);
    domainId = NULL;

    capList = capList->next;
  }

  /* as this function is in the call-path of the initialize functions
   * -> vt_cupti_setupMetrics
   * -> vt_cupti_fillMetricList
   * -> vt_cupti_showAllCounters
   */
  vt_cuptievt_initialized = 1;
  VT_CUPTI_UNLOCK();
  exit(0);
}
/*
 * Finalizes CUPTI device: detaches the VampirTrace CUPTI context of the
 * current CUDA context from the global list, optionally destroys the CUPTI
 * event groups and frees the events part of the context.
 *
 * @param cleanExit 1 to cleanup CUPTI event group, otherwise 0
 */
void vt_cuptievt_finalize_device(uint8_t cleanExit){
  CUptiResult cuptiErr = CUPTI_SUCCESS;
  vt_cupti_ctx_t *vtcuptiCtx = NULL;

  vt_cntl_msg(2, "[CUPTI Events] Finalize device ... ");

  {
    CUcontext cuCtx;

    /* determine the current CUDA context; CUDA < 4.0 lacks cuCtxGetCurrent,
       so pop and immediately re-push the context stack instead */
#if (defined(CUDA_VERSION) && (CUDA_VERSION < 4000))
    VT_CUDRV_CALL(cuCtxPopCurrent(&cuCtx), "cuCtxPopCurrent");
    VT_CUDRV_CALL(cuCtxPushCurrent(cuCtx), "cuCtxPushCurrent");
#else
    VT_CUDRV_CALL(cuCtxGetCurrent(&cuCtx), "cuCtxGetCurrent");
#endif

    /* detach the VampirTrace CUPTI context from the global list */
    vtcuptiCtx = vt_cupti_removeCtx(&cuCtx);
    if(vtcuptiCtx == NULL)
      return;
  }

  /* NOTE(review): vtcuptiCtx was removed from the list above but is never
     freed in this function (only its 'events' member below) — confirm
     ownership; both return paths look like a memory leak */
  if(vtcuptiCtx->events == NULL)
    return;

  /* NOTE(review): event groups are destroyed only when vt_gpu_debug != 0 —
     verify this condition is not inverted (debug modes usually SKIP the
     cleanup rather than being required for it; vt_cuptievt_stop() below is
     a no-op when vt_gpu_debug is set, so the groups are destroyed while
     still enabled on this path) */
  if(cleanExit && vt_gpu_debug != 0){
    /*uint64_t time = vt_pform_wtime();

    vt_cupti_resetCounter(vtcuptiCtx, 0, &time);*/

    /* stop CUPTI counter capturing */
    vt_cuptievt_stop(vtcuptiCtx->events);

    /* destroy all CUPTI event groups, which have been created */
    {
      vt_cupti_evtgrp_t *vtcuptiGrp = vtcuptiCtx->events->vtGrpList;

      while(vtcuptiGrp != NULL){
        cuptiErr = cuptiEventGroupRemoveAllEvents(vtcuptiGrp->evtGrp);
        VT_CUPTI_CALL(cuptiErr, "cuptiEventGroupRemoveAllEvents");

        cuptiErr = cuptiEventGroupDestroy(vtcuptiGrp->evtGrp);
        VT_CUPTI_CALL(cuptiErr, "cuptiEventGroupDestroy");

        vtcuptiGrp = vtcuptiGrp->next;
      }
    }
  }

  /* free VampirTrace CUPTI event context */
  vt_cuptievt_freeEventCtx(vtcuptiCtx->events);
}
/*
 * Create a VampirTrace CUPTI event group for the given VampirTrace CUPTI
 * context and allocate its event ID and counter ID arrays.
 *
 * @param vtcuptiCtx pointer to the VampirTrace CUPTI context
 *
 * @return pointer to the created VampirTrace CUPTI event group
 */
static vt_cupti_evtgrp_t* vt_cuptievt_createEvtGrp(vt_cupti_ctx_t *vtcuptiCtx)
{
  CUptiResult cuptiErr = CUPTI_SUCCESS;
  vt_cupti_evtgrp_t *vtcuptiGrp = NULL;

  vtcuptiGrp = (vt_cupti_evtgrp_t*)malloc(sizeof(vt_cupti_evtgrp_t));
  /* bugfix: the allocation was dereferenced unchecked before; report OOM the
     same way the other allocation sites in this file do */
  if(vtcuptiGrp == NULL)
    vt_error_msg("[CUPTI Events] Could not allocate memory for event group!");
  vtcuptiGrp->evtNum = 0;
  vtcuptiGrp->enabled = 0;
  vtcuptiGrp->next = NULL;

  /* create initial CUPTI counter group */
  cuptiErr = cuptiEventGroupCreate(vtcuptiCtx->cuCtx, &(vtcuptiGrp->evtGrp), 0);
  VT_CUPTI_CALL(cuptiErr, "cuptiEventGroupCreate");

  /* allocate the CUPTI event ID and VampirTrace counter ID arrays, sized for
     the maximum number of events the device capability exposes */
  {
    size_t evtNum = vtcuptiCtx->events->vtDevCap->evtNum;

    vtcuptiGrp->cuptiEvtIDs = (CUpti_EventID *)malloc(evtNum*sizeof(CUpti_EventID));
    vtcuptiGrp->vtCIDs = (uint32_t *)malloc(evtNum*sizeof(uint32_t));
    /* bugfix: these two allocations were not checked either */
    if(vtcuptiGrp->cuptiEvtIDs == NULL || vtcuptiGrp->vtCIDs == NULL)
      vt_error_msg("[CUPTI Events] Could not allocate memory for event IDs!");
  }

  return vtcuptiGrp;
}
/*
 * Reset the VampirTrace counter values (to zero) for active CUPTI counters.
 *
 * @param vtcuptiEvtCtx pointer to the VampirTrace CUPTI events context
 * @param strmid the stream id for the counter values
 * @param time the VampirTrace timestamps
 */
void vt_cuptievt_resetCounter(vt_cupti_events_t *vtcuptiEvtCtx,
                              uint32_t strmid, uint64_t *time)
{
  vt_cupti_evtgrp_t *grp;
  size_t idx;

  /* lazily create a VampirTrace CUPTI events context, if none was given */
  if(NULL == vtcuptiEvtCtx){
    VT_CHECK_THREAD;
    vtcuptiEvtCtx = vt_cuptievt_getOrCreateCurrentCtx(VT_MY_THREAD)->events;
    if(NULL == vtcuptiEvtCtx) return;
  }

  for(grp = vtcuptiEvtCtx->vtGrpList; grp != NULL; grp = grp->next){
    /* write a zero sample for every counter of this group */
    for(idx = 0; idx < grp->evtNum; idx++)
      vt_count(strmid, time, grp->vtCIDs[idx], 0);

    /* reset counter values of this group */
    VT_CUPTI_CALL(cuptiEventGroupResetAllEvents(grp->evtGrp),
                  "cuptiEventGroupResetAllEvents");
  }
}
/*
 * Allocate a new buffer and add it to the queue specified by a CUDA context.
 *
 * @param cuCtx the CUDA context, specifying the queue
 *
 * @return pointer to the created buffer (ownership stays with the caller;
 *         CUPTI only borrows it until it is dequeued again)
 */
static uint8_t* vt_cuptiact_queueNewBuffer(CUcontext cuCtx)
{
  uint8_t *buffer = (uint8_t *)malloc(vt_cuptiact_bufSize);

  /* bugfix: a failed allocation was handed to CUPTI unchecked before */
  if(buffer == NULL)
    vt_error_msg("[CUPTI Activity] Could not allocate memory for buffer!");

  /* CUPTI requires 8-byte alignment of the enqueued buffer */
  VT_CUPTI_CALL(cuptiActivityEnqueueBuffer(cuCtx, 0, ALIGN_BUFFER(buffer, 8),
                                           vt_cuptiact_bufSize),
                "cuptiActivityEnqueueBuffer");

  return buffer;
}
/*
 * Switch kernel tracing from the serial (lower overhead) activity kind to the
 * concurrent activity kind for the whole process.
 *
 * Per-context switching via cuptiActivityDisableContext()/
 * cuptiActivityEnableContext() does not work yet, therefore the activity
 * kind is toggled globally after flushing the given context's buffer.
 *
 * @param vtCtx pointer to the VampirTrace CUPTI context
 */
void vt_cuptiact_enableConcurrentKernel(vt_cupti_ctx_t* vtCtx)
{
  /* nothing to do, if concurrent kernel tracing is already enabled */
  if((vt_gpu_config & VT_GPU_TRACE_CONCURRENT_KERNEL)
     == VT_GPU_TRACE_CONCURRENT_KERNEL)
    return;

  vt_cntl_msg(2, "[CUPTI Activity] Enable concurrent kernel tracing.");

  /* disable normal (lower overhead) kernel tracing */
  VT_CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_KERNEL),
                "cuptiActivityDisable");

  /* flush the already buffered activities for this CUDA context */
  VT_CUPTI_LOCK();
  vt_cuptiact_flushCtxActivities(vtCtx);
  VT_CUPTI_UNLOCK();

  /* enable concurrent kernel tracing (higher overhead) */
  VT_CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL),
                "cuptiActivityEnable");

  /* remember that concurrent kernel tracing is active from now on */
  vt_gpu_config |= VT_GPU_TRACE_CONCURRENT_KERNEL;
}
/*
 * Stop CUPTI counter capturing by disabling the CUPTI event groups.
 *
 * @param vtcuptiEvtCtx pointer to the VampirTrace CUPTI events context
 */
static void vt_cuptievt_stop(vt_cupti_events_t *vtcuptiEvtCtx)
{
  vt_cupti_evtgrp_t *grp;

  /* nothing to stop without a context; in GPU debug mode the counters are
     left running on purpose */
  if(NULL == vtcuptiEvtCtx || vt_gpu_debug) return;

  /* disable counter reading for all currently enabled groups */
  for(grp = vtcuptiEvtCtx->vtGrpList; grp != NULL; grp = grp->next){
    if(!grp->enabled)
      continue;

    VT_CUPTI_CALL(cuptiEventGroupDisable(grp->evtGrp),
                  "cuptiEventGroupDisable");
    grp->enabled = 0;
  }
}
/*
 * Create a VampirTrace CUPTI stream.
 *
 * @param vtCtx VampirTrace CUPTI context
 * @param cuStrm CUDA stream
 * @param strmID ID of the CUDA stream
 *
 * @return pointer to created VampirTrace CUPTI stream (NULL if neither a
 *         CUDA stream nor a stream ID could be determined)
 */
vt_cupti_strm_t* vt_cupti_createStream(vt_cupti_ctx_t *vtCtx,
                                       CUstream cuStrm, uint32_t strmID)
{
  vt_cupti_strm_t *vtStrm = NULL;

  if(vtCtx == NULL){
    vt_warning("[CUPTI] Cannot create stream without VampirTrace CUPTI context");
    return NULL;
  }

  vtStrm = (vt_cupti_strm_t *)malloc(sizeof(vt_cupti_strm_t));
  if(vtStrm == NULL)
    vt_error_msg("[CUPTI] Could not allocate memory for stream!");
  vtStrm->cuStrm = cuStrm;
  vtStrm->vtLastTime = vt_gpu_init_time;
  vtStrm->destroyed = 0;
  vtStrm->next = NULL;

#if defined(VT_CUPTI_ACTIVITY)
  /* create stream by VT CUPTI callbacks implementation (CUstream is given) */
  if(strmID == VT_CUPTI_NO_STREAM_ID){
    if(cuStrm != VT_CUPTI_NO_STREAM){
      VT_CUPTI_CALL(cuptiGetStreamId(vtCtx->cuCtx, cuStrm, &strmID),
                    "cuptiGetStreamId");
    }else{
      vt_warning("[CUPTI] Neither CUDA stream nor stream ID given!");
      free(vtStrm);
      return NULL;
    }
  }
#else /* only VT_CUPTI_CALLBACKS is defined */
  if(vtCtx->callbacks != NULL){
    strmID = vtCtx->callbacks->streamsCreated;
    vtCtx->callbacks->streamsCreated++;
  }
#endif

  vtStrm->cuStrmID = strmID;

  /* create VampirTrace thread */
  {
    char thread_name[16] = "CUDA";
    int written; /* snprintf result, for error AND truncation detection */

    if(vt_gpu_stream_reuse){
      if(vtCtx->devID != VT_NO_ID){
        /* bugfix: the old check (-1 == snprintf(...)) never fired —
           snprintf signals truncation by returning >= size, and errors by
           returning any negative value, not specifically -1 */
        written = snprintf(thread_name+4, 12, "[%d]", vtCtx->devID);
        if(written < 0 || written >= 12)
          vt_cntl_msg(1, "Could not create thread name for CUDA thread!");
      }
    }else{
      if(vtCtx->devID == VT_NO_ID){
        written = snprintf(thread_name+4, 12, "[?:%d]", strmID);
        if(written < 0 || written >= 12)
          vt_cntl_msg(1, "Could not create thread name for CUDA thread!");
      }else{
        written = snprintf(thread_name+4, 12, "[%d:%d]", vtCtx->devID, strmID);
        if(written < 0 || written >= 12)
          vt_cntl_msg(1, "Could not create thread name for CUDA thread!");
      }
    }

    VT_CHECK_THREAD;
    vt_gpu_registerThread(thread_name, VT_MY_THREAD, &(vtStrm->vtThrdID));
  }

  /* never write GPU events before the measurement start time */
  if(vt_gpu_init_time < vt_start_time)
    vt_gpu_init_time = vt_start_time;

  /* for the first stream created for this context */
  if(vtCtx->strmList == NULL){
    if(vt_gpu_trace_idle > 0){
      /* write enter event for GPU_IDLE on first stream */
      vt_enter(vtStrm->vtThrdID, &vt_gpu_init_time, vt_gpu_rid_idle);
      /*vt_warning("IDLEente: %llu (%d)", vt_gpu_init_time, vtStrm->vtThrdID);*/
#if defined(VT_CUPTI_ACTIVITY)
      if(vtCtx->activity != NULL)
        vtCtx->activity->gpuIdleOn = 1;
#endif
    }

    /* set the counter value for cudaMalloc to 0 on first stream */
    if(vt_gpu_trace_memusage > 0)
      vt_count(vtStrm->vtThrdID, &vt_gpu_init_time, vt_gpu_cid_memusage, 0);
  }

  if(vt_gpu_trace_kernels > 1){
    /* set count values to zero */
    vt_count(vtStrm->vtThrdID, &vt_gpu_init_time, vt_cupti_cid_blocksPerGrid, 0);
    vt_count(vtStrm->vtThrdID, &vt_gpu_init_time, vt_cupti_cid_threadsPerBlock, 0);
    vt_count(vtStrm->vtThrdID, &vt_gpu_init_time, vt_cupti_cid_threadsPerKernel, 0);
  }

  /* NOTE(review): prepending the stream to vtCtx->strmList is commented out
     here — confirm the caller inserts the returned stream into the context's
     stream list
  vtStrm->next = vtCtx->strmList;
  vtCtx->strmList = vtStrm;*/

  return vtStrm;
}
/*
 * Finalize the events part of a VampirTrace CUPTI context: write final
 * (zero) counter samples for all streams, stop capturing, destroy the CUPTI
 * event groups and free the events context.
 *
 * @param vtCtx pointer to the VampirTrace CUPTI context
 */
void vt_cupti_events_finalizeContext(vt_cupti_ctx_t *vtCtx)
{
  uint64_t time = vt_pform_wtime();
  vt_cupti_strm_t *curStrm = NULL;
  vt_cupti_evtgrp_t *vtcuptiGrp = NULL;

  if(vtCtx == NULL || vtCtx->events == NULL)
    return;

  /* These CUPTI calls may fail, as CUPTI has implicitly destroyed something */
  if(vt_gpu_debug == 0){
    curStrm = vtCtx->strmList;

    /* for all streams of this context: write zero samples and reset the
       CUPTI counters */
    while(curStrm != NULL){
      /* ensure increasing time stamps */
      if(time < curStrm->vtLastTime){
        curStrm = curStrm->next;
        continue;
      }

      vt_cuptievt_resetCounter(vtCtx->events, curStrm->vtThrdID, &time);

      curStrm = curStrm->next;
    }

    /* stop CUPTI counter capturing */
    vt_cuptievt_stop(vtCtx->events);

    /* destroy all CUPTI event groups, which have been created */
    vtcuptiGrp = vtCtx->events->vtGrpList;
    while(vtcuptiGrp != NULL){
      VT_CUPTI_CALL(cuptiEventGroupRemoveAllEvents(vtcuptiGrp->evtGrp),
                    "cuptiEventGroupRemoveAllEvents");
      VT_CUPTI_CALL(cuptiEventGroupDestroy(vtcuptiGrp->evtGrp),
                    "cuptiEventGroupDestroy");

      vtcuptiGrp = vtcuptiGrp->next;
    }
  }else{
    /* debug mode: avoid the (possibly failing) CUPTI calls and
       set at least the VampirTrace counter to zero */
    curStrm = vtCtx->strmList;

    /* for all streams of this context */
    while(curStrm != NULL){
      /* ensure increasing time stamps */
      if(time < curStrm->vtLastTime){
        curStrm = curStrm->next;
        continue;
      }

      vtcuptiGrp = vtCtx->events->vtGrpList;
      while(vtcuptiGrp != NULL){
        size_t i;

        for(i = 0; i < vtcuptiGrp->evtNum; i++){
          vt_count(curStrm->vtThrdID, &time, *(vtcuptiGrp->vtCIDs+i), 0);
        }

        vtcuptiGrp = vtcuptiGrp->next;
      }

      curStrm = curStrm->next;
    }
  }

  /* free previously allocated memory */
  vt_cuptievt_freeEventCtx(vtCtx->events);
}
/*
 * Request the CUPTI counter values and write them to the given VampirTrace
 * stream with the given timestamps.
 *
 * @param vtcuptiEvtCtx pointer to the VampirTrace CUPTI events context
 *                      (NULL: the current thread's context is used)
 * @param strmid the stream id for the counter values
 * @param time the VampirTrace timestamps
 */
void vt_cuptievt_writeCounter(vt_cupti_events_t *vtcuptiEvtCtx, uint32_t strmid,
                              uint64_t *time)
{
  CUptiResult cuptiErr = CUPTI_SUCCESS;
  vt_cupti_evtgrp_t *vtcuptiGrp = NULL;

  size_t bufferSizeBytes;
  size_t arraySizeBytes;
  size_t numCountersRead;

  /* fall back to the current thread's events context, if none was given */
  if(vtcuptiEvtCtx == NULL){
    VT_CHECK_THREAD;
    vtcuptiEvtCtx = vt_cuptievt_getOrCreateCurrentCtx(VT_MY_THREAD)->events;
    if(vtcuptiEvtCtx == NULL) return;
  }

  vtcuptiGrp = vtcuptiEvtCtx->vtGrpList;
  while(vtcuptiGrp != NULL){
    /* read events only, if the event group is enabled */
    if(vtcuptiGrp->enabled){
      bufferSizeBytes = vtcuptiGrp->evtNum * sizeof(uint64_t);
      arraySizeBytes = vtcuptiGrp->evtNum * sizeof(CUpti_EventID);

      /* read events */
      cuptiErr = cuptiEventGroupReadAllEvents(vtcuptiGrp->evtGrp,
                                              CUPTI_EVENT_READ_FLAG_NONE,
                                              &bufferSizeBytes, vtcuptiEvtCtx->counterData,
                                              &arraySizeBytes, vtcuptiEvtCtx->cuptiEvtIDs,
                                              &numCountersRead);
      VT_CUPTI_CALL(cuptiErr, "cuptiEventGroupReadAllEvents");

      if(vtcuptiGrp->evtNum != numCountersRead){
        /* bugfix: use %zu — both arguments are size_t; "%d" is undefined
           behavior on LP64 platforms */
        vt_error_msg("[CUPTI Events] %zu counter reads, %zu metrics specified in "
                     "VT_CUPTI_METRICS!", numCountersRead, vtcuptiGrp->evtNum);
      }

      /* For all events of the event group: map added event IDs to just read event
       * IDs, as the order may not be the same. For small numbers of counter reads
       * this simple mapping should be fast enough.
       */
      {
        size_t j;

        for(j = 0; j < numCountersRead; j++){
          size_t i;
          for(i = 0; i < vtcuptiGrp->evtNum; i++){
            if(vtcuptiEvtCtx->cuptiEvtIDs[j] == *(vtcuptiGrp->cuptiEvtIDs+i)){
              /* bugfix: the value read for event ID cuptiEvtIDs[j] is
                 counterData[j]; indexing with 'i' paired values with the
                 wrong events whenever the read order differed from the add
                 order — which is exactly the case this mapping loop exists
                 to handle */
              vt_count(strmid, time, *(vtcuptiGrp->vtCIDs+i),
                       vtcuptiEvtCtx->counterData[j]);
            }
          }
        }
      }
    }

    vtcuptiGrp = vtcuptiGrp->next;
  }
}
/*
 * Initialize the VampirTrace CUPTI activity layer: set up the activity
 * buffer size, define kernel counters and the flush region, and enable the
 * requested CUPTI activity kinds.
 *
 * no need to lock, because it is only called by vt_cupti_callback_init()
 */
void vt_cupti_activity_init()
{
  /*if(!vt_cuptiact_initialized){
    vt_cupti_init();
    VT_CUPTI_LOCK();*/
  if(!vt_cuptiact_initialized){
    vt_cntl_msg(2, "[CUPTI Activity] Initializing ... ");

    /* determine the CUPTI activity buffer size from the environment */
    {
      vt_cuptiact_bufSize = vt_env_cudatrace_bsize();

      /* no buffer size < 1024 bytes allowed (see CUPTI documentation) */
      if(vt_cuptiact_bufSize < 1024){
        if(vt_cuptiact_bufSize > 0){
          vt_warning("[CUPTI Activity] Buffer size has to be at least 1024 "
                     "bytes! It has been set to %d.", vt_cuptiact_bufSize);
        }
        vt_cuptiact_bufSize = VT_CUPTI_ACT_DEFAULT_BSIZE;
      }

      /* queue a global buffer to initialize CUPTI before CUDA init
      vt_cuptiact_buffer = (uint8_t *)malloc(vt_cuptiact_bufSize);
      VT_CUPTI_CALL(cuptiActivityEnqueueBuffer(NULL, 0, vt_cuptiact_buffer,
                                        vt_cuptiact_bufSize),
                    "cuptiActivityEnqueueBuffer");*/
    }

    /* ID definitions below mutate global tables -> guard them in MT builds */
#if (defined(VT_MT) || defined(VT_HYB))
    VTTHRD_LOCK_IDS();
#endif
    if(vt_gpu_trace_kernels > 1){
      /* define kernel counters */
      vt_cuptiact_cid_knStaticSharedMem = vt_def_counter(VT_MASTER_THREAD,
                    "staticSharedMemory", "Bytes",
                    VT_CNTR_ABS | VT_CNTR_NEXT | VT_CNTR_UNSIGNED,
                    vt_cupti_cgid_cuda_kernel, 0);
      vt_cuptiact_cid_knDynamicSharedMem = vt_def_counter(VT_MASTER_THREAD,
                    "dynamicSharedMemory", "Bytes",
                    VT_CNTR_ABS | VT_CNTR_NEXT | VT_CNTR_UNSIGNED,
                    vt_cupti_cgid_cuda_kernel, 0);
      vt_cuptiact_cid_knLocalMemTotal = vt_def_counter(VT_MASTER_THREAD,
                    "localMemoryPerKernel", "Bytes",
                    VT_CNTR_ABS | VT_CNTR_NEXT | VT_CNTR_UNSIGNED,
                    vt_cupti_cgid_cuda_kernel, 0);
      vt_cuptiact_cid_knRegistersPerThread = vt_def_counter(VT_MASTER_THREAD,
                    "registersPerThread", "#",
                    VT_CNTR_ABS | VT_CNTR_NEXT | VT_CNTR_UNSIGNED,
                    vt_cupti_cgid_cuda_kernel, 0);
    }

    /* define region for GPU activity flush */
    vt_cuptiact_rid_flush = vt_def_region(VT_MASTER_THREAD, "flushActivities",
                        VT_NO_ID, VT_NO_LNO, VT_NO_LNO, "VT_CUDA", VT_FUNCTION);
#if (defined(VT_MT) || defined(VT_HYB))
    VTTHRD_UNLOCK_IDS();
#endif

    /*** enable the activities ***/
    /* enable kernel tracing */
    if(vt_gpu_trace_kernels > 0){
#if (defined(CUPTI_API_VERSION) && (CUPTI_API_VERSION >= 3))
      /* concurrent kernel tracing needs CUPTI API version >= 3 */
      if((vt_gpu_config & VT_GPU_TRACE_CONCURRENT_KERNEL)
         == VT_GPU_TRACE_CONCURRENT_KERNEL){
        /*VT_CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL),
                      "cuptiActivityEnable");*/
        VT_CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL),
                      "cuptiActivityEnable");
      }else
#endif
        VT_CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL),
                      "cuptiActivityEnable");
    }

    /* enable memory copy tracing */
    if(vt_gpu_trace_mcpy){
      VT_CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY),
                    "cuptiActivityEnable");
    }

    /* register the finalize function of VampirTrace CUPTI to be called before
     * the program exits
    atexit(vt_cupti_activity_finalize);*/

    vt_cuptiact_initialized = 1;
    /*VT_CUPTI_UNLOCK();
  }*/
  }
}
/*
 * Flush the CUPTI activity buffer of the given VampirTrace CUPTI context:
 * dequeue the buffer, write all contained records as VampirTrace events,
 * re-enqueue the buffer and set a new host/GPU synchronization point.
 *
 * @param vtCtx pointer to the VampirTrace CUPTI context
 */
void vt_cuptiact_flushCtxActivities(vt_cupti_ctx_t *vtCtx)
{
  CUptiResult status;
  uint8_t *buffer = NULL;
  size_t bufSize;
  CUpti_Activity *record = NULL;
  uint64_t hostStop, gpuStop;
  uint32_t ptid = VT_NO_ID;
  vt_cupti_activity_t *vtcuptiActivity = NULL;

  /* check for VampirTrace CUPTI context */
  if(vtCtx == NULL || vtCtx->activity == NULL){
    vt_warning("[CUPTI Activity] Context not found!");
    return;
  }

  vtcuptiActivity = vtCtx->activity;

  /* check if the buffer contains records; proceed only when the query
     returns CUPTI_SUCCESS or CUPTI_ERROR_MAX_LIMIT_REACHED (the first
     disjunct is redundant: QUEUE_EMPTY already differs from
     MAX_LIMIT_REACHED, so the second condition covers it) */
  status = cuptiActivityQueryBuffer(vtCtx->cuCtx, 0, &bufSize);
  if(status != CUPTI_SUCCESS){
    if(CUPTI_ERROR_QUEUE_EMPTY == status ||
       CUPTI_ERROR_MAX_LIMIT_REACHED != status){
      return;
    }
  }

  /* expose VampirTrace CUPTI activity flush as measurement overhead */
  VT_CHECK_THREAD;
  ptid = VT_MY_THREAD;
  hostStop = vt_pform_wtime();
  vt_enter(ptid, &hostStop, vt_cuptiact_rid_flush);

  /* NOTE(review): "%d" is used to print a CUcontext handle (a pointer type
     in the driver API) — confirm; "%p" would be the portable choice */
  vt_cntl_msg(2,"[CUPTI Activity] Handle context %d activities", vtCtx->cuCtx);

  /* lock the whole buffer flush
  VT_CUPTI_LOCK();*/

  /* dump the contents of the global queue */
  VT_CUPTI_CALL(cuptiActivityDequeueBuffer(vtCtx->cuCtx, 0, &buffer,
                &bufSize), "cuptiActivityDequeueBuffer");

  /*
   * Get time synchronization factor between host and GPU time for measured
   * period
   */
  {
    VT_CUPTI_CALL(cuptiGetTimestamp(&gpuStop), "cuptiGetTimestamp");
    hostStop = vt_pform_wtime();
    vtcuptiActivity->sync.hostStop = hostStop;

    /* scale GPU time deltas onto the host clock for this flush interval */
    vtcuptiActivity->sync.factor = (double)(hostStop - vtcuptiActivity->sync.hostStart)
                     /(double)(gpuStop - vtcuptiActivity->sync.gpuStart);
  }

  /*vt_cntl_msg(1, "hostStop: %llu , gpuStop: %llu", hostStopTS, gpuStopTS);
  vt_cntl_msg(1, "factor: %lf", syncFactor);*/

  /* write each record until the buffer is exhausted */
  do{
    status = cuptiActivityGetNextRecord(buffer, bufSize, &record);
    if(status == CUPTI_SUCCESS) {
      vt_cuptiact_writeRecord(record, vtCtx);
    }else if(status == CUPTI_ERROR_MAX_LIMIT_REACHED){
      /* no more records in the buffer */
      break;
    }else{
      VT_CUPTI_CALL(status, "cuptiActivityGetNextRecord");
    }
  }while(1);

  /* report any records dropped from the global queue */
  {
    size_t dropped;

    VT_CUPTI_CALL(cuptiActivityGetNumDroppedRecords(vtCtx->cuCtx, 0, &dropped),
                  "cuptiActivityGetNumDroppedRecords");
    if(dropped != 0)
      /* NOTE(review): "%llu" is used for vt_cuptiact_bufSize and the derived
         proposal — confirm their types match unsigned long long (for size_t
         "%zu" would be correct) */
      vt_warning("[CUPTI Activity] Dropped %u records. Current buffer size: %llu bytes\n"
                 "To avoid dropping of records increase the buffer size!\n"
                 "Proposed minimum VT_CUDATRACE_BUFFER_SIZE=%llu",
                 (unsigned int)dropped, vt_cuptiact_bufSize,
                 vt_cuptiact_bufSize + dropped/2 *
                 (sizeof(CUpti_ActivityKernel) + sizeof(CUpti_ActivityMemcpy)));
  }

  /* enter GPU idle region after last kernel, if exited before */
  if(vtcuptiActivity->gpuIdleOn == 0){
    vt_enter(vtcuptiActivity->strmList->vtThrdID,
             &(vtcuptiActivity->vtLastGPUTime), vt_gpu_rid_idle);
    vtcuptiActivity->gpuIdleOn = 1;
    /*vt_warning("IDLfente: %llu (%d)", vtCtx->vtLastGPUTime, vtCtx->strmList->vtThrdID);*/
  }

  /* enqueue buffer again */
  VT_CUPTI_CALL(cuptiActivityEnqueueBuffer(vtCtx->cuCtx, 0, buffer,
                vt_cuptiact_bufSize), "cuptiActivityEnqueueBuffer");

  /* set new synchronization point */
  vtcuptiActivity->sync.hostStart = hostStop;
  vtcuptiActivity->sync.gpuStart = gpuStop;

  /*VT_CUPTI_UNLOCK();*/

  /* use local variable hostStop to write exit event for activity flush */
  hostStop = vt_pform_wtime();
  vt_exit(ptid, &hostStop);
}