/*
 * Get a VampirTrace CUPTI context by CUDA context.
 *
 * @param cuCtx the CUDA context
 *
 * @return the VampirTrace CUPTI context
 */
vt_cupti_ctx_t* vt_cupti_getCtx(CUcontext cuCtx)
{
  vt_cupti_ctx_t* vtCtx = NULL;

  /* lookup the CUDA context in the global context list */
  VT_CUPTI_LOCK();
  vtCtx = vt_cupti_ctxList;
  while(vtCtx != NULL){
    if(vtCtx->cuCtx == cuCtx){
      /* workaround to set the correct device number
      if(vtCtx->devID == VT_CUPTI_NO_DEVICE_ID){
        CUdevice cuDev;

        if(CUDA_SUCCESS != cuCtxGetDevice(&cuDev)){
          vt_warning("[CUPTI] Could not get CUdevice from context");
        }

        vtCtx->devID = (uint32_t)cuDev;
        vtCtx->cuDev = cuDev;
      }*/

      VT_CUPTI_UNLOCK();
      return vtCtx;
    }
    vtCtx = vtCtx->next;
  }
  VT_CUPTI_UNLOCK();

  return NULL;
}
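/*
 * Illustrative sketch (an assumption, not part of the original sources):
 * vt_cupti_getCtxNoLock() is called in vt_cuptiact_markStreamAsDestroyed()
 * below, but its definition is not shown in this excerpt. Given the list
 * traversal above, a minimal variant for callers that already hold
 * VT_CUPTI_LOCK() could look like this:
 */
vt_cupti_ctx_t* vt_cupti_getCtxNoLock(CUcontext cuCtx)
{
  vt_cupti_ctx_t* vtCtx = vt_cupti_ctxList;

  /* the caller holds VT_CUPTI_LOCK(), so the list can be traversed directly */
  while(vtCtx != NULL){
    if(vtCtx->cuCtx == cuCtx)
      return vtCtx;
    vtCtx = vtCtx->next;
  }

  return NULL;
}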
/*
 * Remove a context from the global context list and return it.
 *
 * @param cuCtx pointer to the CUDA context
 *
 * @return the VampirTrace CUPTI context that has been removed
 */
vt_cupti_ctx_t* vt_cupti_removeCtx(CUcontext *cuCtx)
{
  vt_cupti_ctx_t *currCtx = NULL;
  vt_cupti_ctx_t *lastCtx = NULL;

  VT_CUPTI_LOCK();
  currCtx = vt_cupti_ctxList;
  lastCtx = vt_cupti_ctxList;
  while(currCtx != NULL){
    if(currCtx->cuCtx == *cuCtx){
      /* if the context is the first element in the list */
      if(currCtx == vt_cupti_ctxList){
        vt_cupti_ctxList = vt_cupti_ctxList->next;
      }else{
        lastCtx->next = currCtx->next;
      }
      VT_CUPTI_UNLOCK();
      return currCtx;
    }
    lastCtx = currCtx;
    currCtx = currCtx->next;
  }
  VT_CUPTI_UNLOCK();

  vt_cntl_msg(2, "[CUPTI] Could not remove context (CUDA context not found)!");

  return NULL;
}
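/*
 * Usage sketch (an assumption, not from the original sources): a typical
 * caller is a CUDA context destruction handler, which unlinks the tracing
 * context first and releases its resources afterwards, e.g.:
 *
 *   vt_cupti_ctx_t *vtCtx = vt_cupti_removeCtx(&cuCtx);
 *   if(vtCtx != NULL)
 *     vt_cupti_finalizeCtx(vtCtx);
 *
 * Removing the element inside the lock but freeing it outside keeps the
 * critical section short.
 */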
/*
 * Mark a CUDA stream as destroyed, so that it can be reused afterwards.
 *
 * @param cuCtx the CUDA context containing the stream
 * @param strmID the CUPTI stream ID
 */
void vt_cuptiact_markStreamAsDestroyed(CUcontext cuCtx, uint32_t strmID)
{
  vt_cupti_ctx_t *vtCtx = NULL;
  vt_cuptiact_strm_t *currStrm = NULL;

  VT_CUPTI_LOCK();

  if(cuCtx == NULL){
    vt_warning("[CUPTI Activity] No CUDA context given in "
               "vt_cuptiact_markStreamAsDestroyed()!");
    VT_CUPTI_UNLOCK();
    return;
  }

  vtCtx = vt_cupti_getCtxNoLock(cuCtx);
  if(vtCtx == NULL){
    vt_warning("[CUPTI Activity] No context found in "
               "vt_cuptiact_markStreamAsDestroyed()!");
    VT_CUPTI_UNLOCK();
    return;
  }

  currStrm = vtCtx->activity->strmList;
  while(currStrm != NULL){
    if(currStrm->strmID == strmID){
      currStrm->destroyed = 1;
      VT_CUPTI_UNLOCK();
      return;
    }
    currStrm = currStrm->next;
  }

  VT_CUPTI_UNLOCK();
}
/*
 * Finalize the CUPTI common interface.
 * - free the VampirTrace CUPTI context list
 */
void vt_cupti_finalize()
{
  if(!vt_cupti_finalized && vt_cupti_initialized){
    VT_CUPTI_LOCK();
    /* re-check after acquiring the lock (double-checked locking) */
    if(!vt_cupti_finalized && vt_cupti_initialized){
      vt_cntl_msg(2, "[CUPTI] Finalizing ... ");

      /* free VampirTrace CUPTI context structures */
      while(vt_cupti_ctxList != NULL){
        vt_cupti_ctx_t *tmp = vt_cupti_ctxList;

        vt_cupti_ctxList = vt_cupti_ctxList->next;

        vt_cupti_finalizeCtx(tmp);
        tmp = NULL;
      }

      vt_cupti_finalized = 1;
      VT_CUPTI_UNLOCK();

#if (defined(VT_MT) || defined(VT_HYB))
      VTTHRD_LOCK_ENV();
      VTThrd_deleteMutex(&VTThrdMutexCupti);
      VTTHRD_UNLOCK_ENV();
#endif /* VT_MT || VT_HYB */
    }else{
      /* release the lock, if another thread finalized the interface already */
      VT_CUPTI_UNLOCK();
    }
  }
}
/*
 * Prepend the given VampirTrace CUPTI context to the global context list.
 *
 * @param vtCtx pointer to the VampirTrace CUPTI context to be prepended
 */
void vt_cupti_prependCtx(vt_cupti_ctx_t *vtCtx)
{
  VT_CUPTI_LOCK();
  vtCtx->next = vt_cupti_ctxList;
  vt_cupti_ctxList = vtCtx;
  VT_CUPTI_UNLOCK();
}
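/*
 * Hypothetical convenience helper (not in the original sources): the
 * lookup-or-create-and-prepend pattern is repeated verbatim in
 * vt_cuptiact_writeMalloc() and vt_cuptiact_writeFree() below and could be
 * factored out roughly like this. Note that, exactly as in the original
 * call sites, the lock is released between lookup and prepend, so two
 * threads may race to create a context for the same CUcontext.
 */
static vt_cupti_ctx_t* vt_cupti_getOrCreateCtx(CUcontext cuCtx, uint32_t ctxID)
{
  vt_cupti_ctx_t *vtCtx = vt_cupti_getCtx(cuCtx);

  if(vtCtx == NULL){
    vtCtx = vt_cupti_createCtx(cuCtx, VT_CUPTI_NO_CUDA_DEVICE,
                               ctxID, VT_CUPTI_NO_DEVICE_ID);
    vt_cupti_prependCtx(vtCtx);
  }

  return vtCtx;
}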
/*
 * Finalize the CUPTI activity interface.
 * - flush the buffered activities of all contexts
 * - destroy the VampirTrace CUPTI activity contexts
 */
void vt_cupti_activity_finalize()
{
  if(!vt_cuptiact_finalized && vt_cuptiact_initialized){
    VT_CUPTI_LOCK();
    if(!vt_cuptiact_finalized && vt_cuptiact_initialized){
      vt_cupti_ctx_t *vtCtx = vt_cupti_ctxList;

      vt_cntl_msg(2, "[CUPTI Activity] Finalizing ... ");

      while(vtCtx != NULL){
        /* write buffered activities, which have not been dumped yet */
        vt_cuptiact_flushCtxActivities(vtCtx);

        /* free the activity context */
        vt_cuptiact_destroyContext(vtCtx->activity);
        vtCtx->activity = NULL;

        /* set pointer to next context */
        vtCtx = vtCtx->next;
      }

      vt_cuptiact_finalized = 1;
      VT_CUPTI_UNLOCK();
    }else{
      /* release the lock, if another thread finalized the interface already */
      VT_CUPTI_UNLOCK();
    }
  }
}
/*
 * Print all available counters to stdout.
 *
 * @param capList list of CUDA devices with different capabilities
 */
static void vt_cupti_showAllCounters(vt_cupti_device_t *capList)
{
  CUptiResult cuptiErr = CUPTI_SUCCESS;
  CUpti_EventDomainID *domainId = NULL;
  uint32_t maxDomains = 0;
  uint32_t i;
  size_t size = 0;

  while(capList != NULL){
    CUdevice cuDev = capList->cuDev;

    vt_cntl_msg(1, "[CUPTI Events] Available events for device %d (SM %d.%d):",
                cuDev, capList->dev_major, capList->dev_minor);
    vt_cntl_msg(1, "Id:Name");
    vt_cntl_msg(1, "Description\n"
                "-------------------------------------------------------------------");

    cuptiErr = cuptiDeviceGetNumEventDomains(cuDev, &maxDomains);
    VT_CUPTI_CALL(cuptiErr, "cuptiDeviceGetNumEventDomains");

    if(maxDomains == 0){
      vt_warning("[CUPTI Events] No domain is exposed by dev = %d\n", cuDev);
      return;
    }

    size = sizeof(CUpti_EventDomainID) * maxDomains;
    domainId = (CUpti_EventDomainID*)malloc(size);
    if(domainId == NULL){
      vt_warning("[CUPTI Events] Failed to allocate memory to domain ID");
      return;
    }
    memset(domainId, 0, size);

    cuptiErr = cuptiDeviceEnumEventDomains(cuDev, &size, domainId);
    VT_CUPTI_CALL(cuptiErr, "cuptiDeviceEnumEventDomains");

    /* enumerate the events of all domains */
    for(i = 0; i < maxDomains; i++)
      vt_cuptievt_enumEvents(cuDev, domainId[i]);

    vt_cntl_msg(1, "------------------------------------------------------");

    free(domainId);
    capList = capList->next;
  }

  /* this function is only reached from the initialization call path
   * (vt_cupti_setupMetrics -> vt_cupti_fillMetricList ->
   *  vt_cupti_showAllCounters), which holds the CUPTI lock; mark the
   * events interface as initialized and release the lock before exiting */
  vt_cuptievt_initialized = 1;
  VT_CUPTI_UNLOCK();

  exit(0);
}
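/*
 * Illustrative sketch (an assumption, not the original implementation):
 * vt_cuptievt_enumEvents() is called above but not shown in this excerpt.
 * Using the public CUPTI event API (signatures as of CUPTI for CUDA 4.1+,
 * where the domain ID alone identifies its events), it could print the
 * "Id:Name" lines roughly like this:
 */
static void vt_cuptievt_enumEvents(CUdevice cuDev, CUpti_EventDomainID domainId)
{
  uint32_t numEvents = 0;
  size_t size = 0;
  CUpti_EventID *eventIds = NULL;
  uint32_t j;

  (void)cuDev; /* not needed with the CUPTI 4.1+ domain API */

  VT_CUPTI_CALL(cuptiEventDomainGetNumEvents(domainId, &numEvents),
                "cuptiEventDomainGetNumEvents");
  if(numEvents == 0)
    return;

  size = numEvents * sizeof(CUpti_EventID);
  eventIds = (CUpti_EventID*)calloc(numEvents, sizeof(CUpti_EventID));
  if(eventIds == NULL)
    return;

  VT_CUPTI_CALL(cuptiEventDomainEnumEvents(domainId, &size, eventIds),
                "cuptiEventDomainEnumEvents");

  for(j = 0; j < numEvents; j++){
    char name[128];
    size_t nameSize = sizeof(name);

    VT_CUPTI_CALL(cuptiEventGetAttribute(eventIds[j], CUPTI_EVENT_ATTR_NAME,
                                         &nameSize, name),
                  "cuptiEventGetAttribute");

    vt_cntl_msg(1, "%u:%s", (unsigned int)eventIds[j], name);
  }

  free(eventIds);
}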
/*
 * Initialize the CUPTI events data of the given VampirTrace CUPTI context.
 *
 * @param vtcuptiCtx pointer to the VampirTrace CUPTI context
 */
void vt_cupti_events_initContext(vt_cupti_ctx_t *vtcuptiCtx)
{
  vt_cupti_events_t *vtcuptiEvtCtx = NULL;

  vt_cntl_msg(2, "[CUPTI Events] Initializing VampirTrace CUPTI events context");

  /* get a pointer to eventIDArray */
  {
    CUresult cuErr = CUDA_SUCCESS;
    int dev_major, dev_minor;
    vt_cupti_device_t *cuptiDev;

    /* TODO: do not trace this driver API function call */
    cuErr = cuDeviceComputeCapability(&dev_major, &dev_minor, vtcuptiCtx->cuDev);
    VT_CUDRV_CALL(cuErr, "cuDeviceComputeCapability");

    /* check if the device capability is already listed */
    VT_CUPTI_LOCK();
    cuptiDev = vtcuptievtCapList;
    VT_CUPTI_UNLOCK();

    cuptiDev = vt_cupti_checkMetricList(cuptiDev, dev_major, dev_minor);
    if(cuptiDev){
      /* allocate the VampirTrace CUPTI events context */
      vtcuptiEvtCtx = (vt_cupti_events_t *)malloc(sizeof(vt_cupti_events_t));
      if(vtcuptiEvtCtx == NULL)
        vt_error_msg("[CUPTI Events] malloc(sizeof(vt_cupti_events_t)) failed!");

      vtcuptiEvtCtx->vtDevCap = cuptiDev;
      vtcuptiEvtCtx->vtGrpList = NULL;
      vtcuptiEvtCtx->counterData = NULL;
      vtcuptiEvtCtx->cuptiEvtIDs = NULL;

      vtcuptiCtx->events = vtcuptiEvtCtx;
    }else{
      return;
    }
  }

  /* create and add the VampirTrace CUPTI groups to the context */
  vt_cupti_addEvtGrpsToCtx(vtcuptiCtx);

  /* allocate memory for CUPTI counter reads */
  {
    size_t allocSize = vtcuptiEvtCtx->vtGrpList->evtNum;

    vtcuptiEvtCtx->counterData =
            (uint64_t *)malloc(allocSize*sizeof(uint64_t));
    vtcuptiEvtCtx->cuptiEvtIDs =
            (CUpti_EventID *)malloc(allocSize*sizeof(CUpti_EventID));

    if(vtcuptiEvtCtx->counterData == NULL || vtcuptiEvtCtx->cuptiEvtIDs == NULL)
      vt_error_msg("[CUPTI Events] malloc of counter read buffers failed!");
  }

  vt_cuptievt_start(vtcuptiEvtCtx);
}
/*
 * Finalize the VampirTrace CUPTI events interface.
 */
void vt_cupti_events_finalize()
{
  if(!vt_cuptievt_finalized && vt_cuptievt_initialized){ /* fast check without lock */
    VT_CUPTI_LOCK();
    if(!vt_cuptievt_finalized && vt_cuptievt_initialized){
      vt_cupti_ctx_t *vtcuptiCtxList = vt_cupti_ctxList;

      /* needed because of the atexit in vt_cupti_events_init() */
      VT_SUSPEND_MALLOC_TRACING(VT_CURRENT_THREAD);

      vt_cntl_msg(2, "[CUPTI Events] Finalizing ...");

      /* free VampirTrace CUPTI events context structures */
      while(vtcuptiCtxList != NULL){
        if(vtcuptiCtxList->events != NULL){
          vt_cupti_events_finalizeContext(vtcuptiCtxList);
          free(vtcuptiCtxList->events);
          vtcuptiCtxList->events = NULL;
        }

        vtcuptiCtxList = vtcuptiCtxList->next;
      }

      /* free the capability metric list */
      while(vtcuptievtCapList != NULL){
        vt_cupti_device_t *tmp = vtcuptievtCapList;

        vtcuptievtCapList = vtcuptievtCapList->next;

        /* free VampirTrace CUPTI events */
        while(tmp->vtcuptiEvtList != NULL){
          vt_cupti_evtctr_t *tmpEvt = tmp->vtcuptiEvtList;

          tmp->vtcuptiEvtList = tmp->vtcuptiEvtList->next;
          free(tmpEvt);
          tmpEvt = NULL;
        }

        free(tmp);
        tmp = NULL;
      }

      VT_RESUME_MALLOC_TRACING(VT_CURRENT_THREAD);

      vt_cuptievt_finalized = 1;
      VT_CUPTI_UNLOCK();
    }else{
      /* release the lock, if another thread finalized the interface already */
      VT_CUPTI_UNLOCK();
    }
  }
}
/*
 * Initialize the CUPTI common interface.
 * - create the mutex for mutual exclusion (multi-threaded builds only)
 * - register the finalize function
 */
void vt_cupti_init()
{
  if(!vt_cupti_initialized){
#if (defined(VT_MT) || defined(VT_HYB))
    VTThrd_createMutex(&VTThrdMutexCupti);
#endif
    VT_CUPTI_LOCK();
    if(!vt_cupti_initialized){
      vt_cntl_msg(2, "[CUPTI] Initializing ... ");

      /* register the finalize function of VampirTrace CUPTI to be called
       * before the program exits */
      atexit(vt_cupti_finalize);

      vt_cupti_initialized = 1;
      VT_CUPTI_UNLOCK();
    }else{
      /* release the lock, if another thread initialized in the meantime */
      VT_CUPTI_UNLOCK();
    }
  }
}
/*
 * Enable tracing of concurrent kernels. Disable normal kernel tracing,
 * if necessary.
 *
 * @param vtCtx pointer to the VampirTrace CUPTI context
 */
void vt_cuptiact_enableConcurrentKernel(vt_cupti_ctx_t* vtCtx)
{
  /*
   * Disable collection of kernels for the given CUDA context.
   * !!! does not work yet !!!

  VT_CUPTI_CALL(cuptiActivityDisableContext(cuCtx, CUPTI_ACTIVITY_KIND_KERNEL),
                "cuptiActivityDisableContext");

   * flush the already buffered activities for this CUDA context *
  vt_cuptiact_flushCtxActivities(cuCtx);

   * Enable collection of kernels for the given CUDA context
  VT_CUPTI_CALL(cuptiActivityEnableContext(cuCtx, CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL),
                "cuptiActivityEnableContext");*/

  if((vt_gpu_config & VT_GPU_TRACE_CONCURRENT_KERNEL)
     != VT_GPU_TRACE_CONCURRENT_KERNEL){
    vt_cntl_msg(2, "[CUPTI Activity] Enable concurrent kernel tracing.");

    /* disable normal (lower overhead) kernel tracing */
    VT_CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_KERNEL),
                  "cuptiActivityDisable");

    /* flush the already buffered activities for this CUDA context */
    VT_CUPTI_LOCK();
    vt_cuptiact_flushCtxActivities(vtCtx);
    VT_CUPTI_UNLOCK();

    /* enable concurrent kernel tracing (higher overhead) */
    VT_CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL),
                  "cuptiActivityEnable");

    vt_gpu_config |= VT_GPU_TRACE_CONCURRENT_KERNEL;
  }
}
/*
 * Initialize VampirTrace IDs and register the finalize function.
 * This may be done implicitly by vt_cupti_count().
 */
void vt_cupti_events_init()
{
  if(!vt_cuptievt_initialized){ /* fast check without lock */
    vt_cupti_init();
    VT_CUPTI_LOCK();
    if(!vt_cuptievt_initialized){
      vt_cntl_msg(2, "[CUPTI Events] Initializing ... ");

      /* create the VampirTrace counter group ID only once */
#if (defined(VT_MT) || defined(VT_HYB))
      VTTHRD_LOCK_IDS();
#endif
      vt_cuptievt_rid_init = vt_def_region(VT_MASTER_THREAD, "vtcuptiHostThreadInit",
                                           VT_NO_ID, VT_NO_LNO, VT_NO_LNO,
                                           "VT_CUPTI", VT_FUNCTION);

      vt_cuptievt_cgid = vt_def_counter_group(VT_MASTER_THREAD, "CUPTI");
#if (defined(VT_MT) || defined(VT_HYB))
      VTTHRD_UNLOCK_IDS();
#endif

      vt_cupti_events_sampling = (uint8_t)vt_env_cupti_sampling();

      vtcuptievtCapList = vt_cuptievt_setupMetricList();

      if(NULL == vtcuptievtCapList){
        vt_cupti_events_enabled = 0;
      }else{
        /* register the finalize function of VampirTrace CUPTI events to be
         * called before the program exits */
        atexit(vt_cupti_events_finalize);
      }

      vt_cuptievt_initialized = 1;
      VT_CUPTI_UNLOCK();
    }else{
      /* release the lock, if another thread initialized in the meantime */
      VT_CUPTI_UNLOCK();
    }
  }
}
/* * Increases the "Allocated CUDA memory" counter. * * @param ctxUID CUDA context identifier (@see CUPTI callback info) * @param devPtr pointer to the allocated memory (needed for vtcudaFree()) * @param size the number of bytes allocated */ void vt_cuptiact_writeMalloc(uint32_t ctxID, CUcontext cuCtx, void *devPtr, size_t size) { uint64_t vtTime; vt_cupti_ctx_t* vtCtx = NULL; vt_cupti_activity_t *vtcuptiActivity = NULL; vt_cupti_gpumem_t *vtMalloc = NULL; if(devPtr == NULL) return; VT_SUSPEND_MALLOC_TRACING(VT_CURRENT_THREAD); vtMalloc = (vt_cupti_gpumem_t*)malloc(sizeof(vt_cupti_gpumem_t)); vtMalloc->memPtr = devPtr; vtMalloc->size = size; /* check for VampirTrace CUPTI context */ vtCtx = vt_cupti_getCtx(cuCtx); if(vtCtx == NULL){ vtCtx = vt_cupti_createCtx(cuCtx, VT_CUPTI_NO_CUDA_DEVICE, ctxID, VT_CUPTI_NO_DEVICE_ID); vt_cupti_prependCtx(vtCtx); } /* check for VampirTrace CUPTI activity context */ if(vtCtx->activity == NULL){ vtCtx->activity = vt_cuptiact_createCtxActivity(cuCtx); } vtcuptiActivity = vtCtx->activity; /* lock the work on the context */ VT_CUPTI_LOCK(); /* flush activity buffer */ vt_cuptiact_flushCtxActivities(vtCtx); /* add malloc entry to list */ vtMalloc->next = vtcuptiActivity->gpuMemList; vtcuptiActivity->gpuMemList = vtMalloc; /* increase allocated memory counter */ vtcuptiActivity->gpuMemAllocated += size; /* check if first CUDA stream is available */ if(vtcuptiActivity->strmList == NULL){ if(vt_gpu_init_time < vt_start_time) vt_gpu_init_time = vt_start_time; vtcuptiActivity->strmList = vt_cuptiact_createStream(vtCtx, vtcuptiActivity->defaultStrmID); vt_count(vtcuptiActivity->strmList->vtThrdID, &vt_gpu_init_time, vt_gpu_cid_memusage, 0); } VT_CUPTI_UNLOCK(); VT_RESUME_MALLOC_TRACING(VT_CURRENT_THREAD); /* write counter value */ vtTime = vt_pform_wtime(); vt_count(vtcuptiActivity->strmList->vtThrdID, &vtTime, vt_gpu_cid_memusage, (uint64_t)(vtcuptiActivity->gpuMemAllocated)); }
/* * Decreases the "Allocated CUDA memory" counter. * * @param ctxUID CUDA context identifier (@see CUPTI callback info) * @param devPtr pointer to the allocated memory */ void vt_cuptiact_writeFree(uint32_t ctxID, CUcontext cuCtx, void *devPtr) { uint64_t vtTime; vt_cupti_ctx_t* vtCtx = NULL; vt_cupti_activity_t *vtcuptiActivity = NULL; vt_cupti_gpumem_t *curMalloc = NULL; vt_cupti_gpumem_t *lastMalloc = NULL; if(devPtr == NULL) return; VT_SUSPEND_MALLOC_TRACING(VT_CURRENT_THREAD); /* check for VampirTrace CUPTI context */ vtCtx = vt_cupti_getCtx(cuCtx); if(vtCtx == NULL){ vtCtx = vt_cupti_createCtx(cuCtx, VT_CUPTI_NO_CUDA_DEVICE, ctxID, VT_CUPTI_NO_DEVICE_ID); vt_cupti_prependCtx(vtCtx); } /* check for VampirTrace CUPTI activity context */ if(vtCtx->activity == NULL){ vtCtx->activity = vt_cuptiact_createCtxActivity(cuCtx); } vtcuptiActivity = vtCtx->activity; VT_CUPTI_LOCK(); /* flush activity buffer */ vt_cuptiact_flushCtxActivities(vtCtx); curMalloc = vtcuptiActivity->gpuMemList; lastMalloc = curMalloc; /* lookup the CUDA malloc entry by its memory pointer */ while(curMalloc != NULL){ if(devPtr == curMalloc->memPtr){ /* decrease allocated counter value and write it */ vtTime = vt_pform_wtime(); vtcuptiActivity->gpuMemAllocated -= curMalloc->size; vt_count(vtcuptiActivity->strmList->vtThrdID, &vtTime, vt_gpu_cid_memusage, (uint64_t)(vtcuptiActivity->gpuMemAllocated)); /* set pointer over current element to next one */ lastMalloc->next = curMalloc->next; /* if current element is the first list entry, set the list entry */ if(curMalloc == vtcuptiActivity->gpuMemList){ vtcuptiActivity->gpuMemList = curMalloc->next; } /* free VT memory of CUDA malloc */ curMalloc->next = NULL; free(curMalloc); curMalloc = NULL; /* set mallocList to NULL, if last element freed */ if(vtcuptiActivity->gpuMemAllocated == 0) { vtcuptiActivity->gpuMemList = NULL; } VT_CUPTI_UNLOCK(); VT_RESUME_MALLOC_TRACING(VT_CURRENT_THREAD); return; } lastMalloc = curMalloc; curMalloc = curMalloc->next; } VT_CUPTI_UNLOCK(); VT_RESUME_MALLOC_TRACING(VT_CURRENT_THREAD); vt_warning("[CUPTI Activity] free CUDA memory, which has not been allocated!"); }