/*
 * Get the VampirTrace CUPTI context for the given CUDA context, or create it
 * if it is not available yet.
 *
 * @param cuCtx the CUDA context
 *
 * @return the VampirTrace CUPTI context
 */
vt_cupti_ctx_t* vt_cupti_getCreateCtx(CUcontext cuCtx)
{
  vt_cupti_ctx_t* vtCtx = vt_cupti_getCtx(cuCtx);

  if(vtCtx == NULL){
    /* do not trace the memory allocation for the context structure itself */
    VT_SUSPEND_MALLOC_TRACING(VT_CURRENT_THREAD);

    vtCtx = vt_cupti_createCtx(cuCtx, VT_CUPTI_NO_CUDA_DEVICE,
                               VT_CUPTI_NO_CONTEXT_ID, VT_CUPTI_NO_DEVICE_ID);

    VT_RESUME_MALLOC_TRACING(VT_CURRENT_THREAD);

    vt_cupti_prependCtx(vtCtx);
  }

  return vtCtx;
}
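/*
 * Illustrative sketch (not part of VampirTrace): any code path that has a
 * CUcontext in hand, e.g. a callback handler, can resolve it to its
 * VampirTrace CUPTI context with a single call; on a lookup miss the context
 * is created and prepended to the global list. The function name
 * example_handle_context is hypothetical.
 */
static void example_handle_context(CUcontext cuCtx)
{
  /* guaranteed to return a valid list entry for cuCtx */
  vt_cupti_ctx_t *vtCtx = vt_cupti_getCreateCtx(cuCtx);

  /* ... record events against vtCtx ... */
  (void)vtCtx;
}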
/*
 * Returns the VampirTrace CUPTI context for the CUDA context associated with
 * the calling host thread.
 *
 * @param ptid the VampirTrace thread ID of the calling host thread
 *
 * @return the VampirTrace CUPTI context
 */
vt_cupti_ctx_t* vt_cupti_getCurrentContext(uint32_t ptid)
{
  CUcontext cuCtx = NULL;

  if(!vt_cupti_initialized) vt_cupti_init();

  VT_SUSPEND_CUDA_TRACING(ptid);

# if (defined(CUDA_VERSION) && (CUDA_VERSION < 4000))
  /* CUDA < 4.0 has no cuCtxGetCurrent(): pop the current context and push it
     right back to read it without changing the thread's context stack */
  CHECK_CU_ERROR(cuCtxPopCurrent(&cuCtx), "cuCtxPopCurrent");
  CHECK_CU_ERROR(cuCtxPushCurrent(cuCtx), "cuCtxPushCurrent");
# else
  CHECK_CU_ERROR(cuCtxGetCurrent(&cuCtx), "cuCtxGetCurrent");
# endif

  VT_RESUME_CUDA_TRACING(ptid);

  if(cuCtx == NULL){
    vt_cntl_msg(2, "[CUPTI] No context is bound to the calling CPU thread");
    return NULL;
  }

  return vt_cupti_getCreateCtx(cuCtx);
}
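/*
 * Illustrative sketch (assumption, not VampirTrace code): a typical caller
 * first determines its own VampirTrace thread ID and then asks for the
 * VampirTrace CUPTI context of the CUDA context bound to this host thread.
 * VT_CHECK_THREAD/VT_MY_THREAD are assumed to be the usual VampirTrace
 * thread macros from vt_thrd.h.
 */
static void example_trace_current_thread(void)
{
  uint32_t ptid;
  vt_cupti_ctx_t *vtCtx;

  VT_CHECK_THREAD;
  ptid = VT_MY_THREAD;

  vtCtx = vt_cupti_getCurrentContext(ptid);
  if(vtCtx == NULL){
    /* no CUDA context is bound to this host thread: nothing to trace */
    return;
  }

  /* ... use vtCtx ... */
}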
/*
 * Increases the "Allocated CUDA memory" counter.
 *
 * @param ctxID CUDA context identifier (@see CUPTI callback info)
 * @param cuCtx the CUDA context
 * @param devPtr pointer to the allocated memory (needed for vtcudaFree())
 * @param size the number of bytes allocated
 */
void vt_cuptiact_writeMalloc(uint32_t ctxID, CUcontext cuCtx,
                             void *devPtr, size_t size)
{
  uint64_t vtTime;
  vt_cupti_ctx_t* vtCtx = NULL;
  vt_cupti_activity_t *vtcuptiActivity = NULL;
  vt_cupti_gpumem_t *vtMalloc = NULL;

  if(devPtr == NULL) return;

  VT_SUSPEND_MALLOC_TRACING(VT_CURRENT_THREAD);

  vtMalloc = (vt_cupti_gpumem_t*)malloc(sizeof(vt_cupti_gpumem_t));
  vtMalloc->memPtr = devPtr;
  vtMalloc->size = size;

  /* check for VampirTrace CUPTI context */
  vtCtx = vt_cupti_getCtx(cuCtx);
  if(vtCtx == NULL){
    vtCtx = vt_cupti_createCtx(cuCtx, VT_CUPTI_NO_CUDA_DEVICE, ctxID,
                               VT_CUPTI_NO_DEVICE_ID);
    vt_cupti_prependCtx(vtCtx);
  }

  /* check for VampirTrace CUPTI activity context */
  if(vtCtx->activity == NULL){
    vtCtx->activity = vt_cuptiact_createCtxActivity(cuCtx);
  }
  vtcuptiActivity = vtCtx->activity;

  /* lock the work on the context */
  VT_CUPTI_LOCK();

  /* flush activity buffer */
  vt_cuptiact_flushCtxActivities(vtCtx);

  /* prepend the malloc entry to the GPU memory list */
  vtMalloc->next = vtcuptiActivity->gpuMemList;
  vtcuptiActivity->gpuMemList = vtMalloc;

  /* increase allocated memory counter */
  vtcuptiActivity->gpuMemAllocated += size;

  /* create the first CUDA stream, if it is not available yet */
  if(vtcuptiActivity->strmList == NULL){
    if(vt_gpu_init_time < vt_start_time)
      vt_gpu_init_time = vt_start_time;

    vtcuptiActivity->strmList =
          vt_cuptiact_createStream(vtCtx, vtcuptiActivity->defaultStrmID);

    /* write an initial counter value of zero for the new stream */
    vt_count(vtcuptiActivity->strmList->vtThrdID, &vt_gpu_init_time,
             vt_gpu_cid_memusage, 0);
  }

  VT_CUPTI_UNLOCK();
  VT_RESUME_MALLOC_TRACING(VT_CURRENT_THREAD);

  /* write the updated counter value */
  vtTime = vt_pform_wtime();
  vt_count(vtcuptiActivity->strmList->vtThrdID, &vtTime,
           vt_gpu_cid_memusage, (uint64_t)(vtcuptiActivity->gpuMemAllocated));
}
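/*
 * Illustrative sketch of how a CUPTI runtime API callback might feed
 * vt_cuptiact_writeMalloc() (hypothetical wiring; the parameter extraction
 * follows the CUPTI callback API from cupti.h). The allocation is recorded
 * on API exit, when cudaMalloc() has filled in the device pointer.
 */
static void example_on_cudaMalloc(const CUpti_CallbackData *cbInfo)
{
  if(cbInfo->callbackSite == CUPTI_API_EXIT){
    const cudaMalloc_v3020_params *params =
          (const cudaMalloc_v3020_params *)cbInfo->functionParams;

    vt_cuptiact_writeMalloc(cbInfo->contextUid, cbInfo->context,
                            *(params->devPtr), params->size);
  }
}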
/*
 * Decreases the "Allocated CUDA memory" counter.
 *
 * @param ctxID CUDA context identifier (@see CUPTI callback info)
 * @param cuCtx the CUDA context
 * @param devPtr pointer to the allocated memory
 */
void vt_cuptiact_writeFree(uint32_t ctxID, CUcontext cuCtx, void *devPtr)
{
  uint64_t vtTime;
  vt_cupti_ctx_t* vtCtx = NULL;
  vt_cupti_activity_t *vtcuptiActivity = NULL;
  vt_cupti_gpumem_t *curMalloc = NULL;
  vt_cupti_gpumem_t *lastMalloc = NULL;

  if(devPtr == NULL) return;

  VT_SUSPEND_MALLOC_TRACING(VT_CURRENT_THREAD);

  /* check for VampirTrace CUPTI context */
  vtCtx = vt_cupti_getCtx(cuCtx);
  if(vtCtx == NULL){
    vtCtx = vt_cupti_createCtx(cuCtx, VT_CUPTI_NO_CUDA_DEVICE, ctxID,
                               VT_CUPTI_NO_DEVICE_ID);
    vt_cupti_prependCtx(vtCtx);
  }

  /* check for VampirTrace CUPTI activity context */
  if(vtCtx->activity == NULL){
    vtCtx->activity = vt_cuptiact_createCtxActivity(cuCtx);
  }
  vtcuptiActivity = vtCtx->activity;

  VT_CUPTI_LOCK();

  /* flush activity buffer */
  vt_cuptiact_flushCtxActivities(vtCtx);

  curMalloc = vtcuptiActivity->gpuMemList;
  lastMalloc = curMalloc;

  /* look up the CUDA malloc entry by its memory pointer */
  while(curMalloc != NULL){
    if(devPtr == curMalloc->memPtr){

      /* decrease allocated counter value and write it */
      vtTime = vt_pform_wtime();
      vtcuptiActivity->gpuMemAllocated -= curMalloc->size;
      vt_count(vtcuptiActivity->strmList->vtThrdID, &vtTime,
               vt_gpu_cid_memusage,
               (uint64_t)(vtcuptiActivity->gpuMemAllocated));

      /* unlink the current element from the list */
      lastMalloc->next = curMalloc->next;

      /* if the current element is the first list entry, move the list head */
      if(curMalloc == vtcuptiActivity->gpuMemList){
        vtcuptiActivity->gpuMemList = curMalloc->next;
      }

      /* free VampirTrace memory for this CUDA malloc entry */
      curMalloc->next = NULL;
      free(curMalloc);
      curMalloc = NULL;

      /* set gpuMemList to NULL, if the last element has been freed */
      if(vtcuptiActivity->gpuMemAllocated == 0){
        vtcuptiActivity->gpuMemList = NULL;
      }

      VT_CUPTI_UNLOCK();
      VT_RESUME_MALLOC_TRACING(VT_CURRENT_THREAD);
      return;
    }

    lastMalloc = curMalloc;
    curMalloc = curMalloc->next;
  }

  VT_CUPTI_UNLOCK();
  VT_RESUME_MALLOC_TRACING(VT_CURRENT_THREAD);

  vt_warning("[CUPTI Activity] Free of CUDA memory that has not been allocated!");
}
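/*
 * Counterpart sketch for cudaFree() (hypothetical wiring, same assumptions
 * as the cudaMalloc() sketch above): the free is recorded on API entry,
 * while the device pointer is still valid.
 */
static void example_on_cudaFree(const CUpti_CallbackData *cbInfo)
{
  if(cbInfo->callbackSite == CUPTI_API_ENTER){
    const cudaFree_v3020_params *params =
          (const cudaFree_v3020_params *)cbInfo->functionParams;

    vt_cuptiact_writeFree(cbInfo->contextUid, cbInfo->context,
                          params->devPtr);
  }
}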