/*
 * Increases the "Allocated CUDA memory" counter.
 *
 * @param ctxID CUDA context identifier (@see CUPTI callback info)
 * @param cuCtx CUDA context handle
 * @param devPtr pointer to the allocated memory (needed for vtcudaFree())
 * @param size the number of bytes allocated
 */
void vt_cuptiact_writeMalloc(uint32_t ctxID, CUcontext cuCtx, void *devPtr,
                             size_t size)
{
  uint64_t vtTime;
  vt_cuptiact_ctx_t *vtCtx = NULL;
  vt_cuptiact_gpumem_t *vtMalloc = NULL;

  /* nothing to record for a NULL device pointer; checking this BEFORE the
     malloc below avoids leaking the list entry on early return */
  if(devPtr == NULL) return;

  vtMalloc = (vt_cuptiact_gpumem_t*)malloc(sizeof(vt_cuptiact_gpumem_t));
  if(vtMalloc == NULL){
    /* without the list entry the allocation cannot be tracked */
    vt_warning("[CUPTI Activity] Could not allocate GPU memory list entry!");
    return;
  }

  /* flush activity buffer */
  vt_cuptiact_flushCtxActivities(cuCtx);

  vtMalloc->memPtr = devPtr;
  vtMalloc->size = size;

  /* get (or lazily create) the VampirTrace CUPTI activity context */
  vtCtx = vt_cuptiact_getCtx(cuCtx);
  if(vtCtx == NULL){
    vtCtx = vt_cuptiact_createContext(ctxID, cuCtx, (uint32_t)-1);
  }

  /* lock the work on the context */
  VT_CUPTI_ACT_LOCK();

  /* add malloc entry to the head of the GPU memory list */
  vtMalloc->next = vtCtx->gpuMemList;
  vtCtx->gpuMemList = vtMalloc;

  /* increase allocated memory counter */
  vtCtx->gpuMemAllocated += size;

  /* check if first CUDA stream is available; create it so the counter has a
     thread to be written on */
  if(vtCtx->strmList == NULL){
    if(vt_gpu_init_time < vt_start_time)
      vt_gpu_init_time = vt_start_time;

    vtCtx->strmList = vt_cuptiact_createStream(vtCtx, vtCtx->defaultStrmID);

    /* write initial zero value for the memory usage counter */
    vt_count(vtCtx->strmList->vtThrdID, &vt_gpu_init_time,
             vt_gpu_cid_memusage, 0);
  }

  VT_CUPTI_ACT_UNLOCK();

  /* write counter value */
  vtTime = vt_pform_wtime();
  vt_count(vtCtx->strmList->vtThrdID, &vtTime, vt_gpu_cid_memusage,
           (uint64_t)(vtCtx->gpuMemAllocated));
}
/*
 * Increases the "Allocated CUDA memory" counter.
 *
 * @param ctxID CUDA context identifier (@see CUPTI callback info)
 * @param cuCtx CUDA context handle
 * @param devPtr pointer to the allocated memory (needed for vtcudaFree())
 * @param size the number of bytes allocated
 */
void vt_cuptiact_writeMalloc(uint32_t ctxID, CUcontext cuCtx, void *devPtr,
                             size_t size)
{
  uint64_t vtTime;
  vt_cupti_ctx_t *vtCtx = NULL;
  vt_cupti_activity_t *vtcuptiActivity = NULL;
  vt_cupti_gpumem_t *vtMalloc = NULL;

  /* nothing to record for a NULL device pointer */
  if(devPtr == NULL) return;

  /* suspend tracing of our own malloc below */
  VT_SUSPEND_MALLOC_TRACING(VT_CURRENT_THREAD);

  vtMalloc = (vt_cupti_gpumem_t*)malloc(sizeof(vt_cupti_gpumem_t));
  if(vtMalloc == NULL){
    /* allocation failed: resume malloc tracing before bailing out */
    vt_warning("[CUPTI Activity] Could not allocate GPU memory list entry!");
    VT_RESUME_MALLOC_TRACING(VT_CURRENT_THREAD);
    return;
  }

  vtMalloc->memPtr = devPtr;
  vtMalloc->size = size;

  /* check for VampirTrace CUPTI context; create and register it lazily */
  vtCtx = vt_cupti_getCtx(cuCtx);
  if(vtCtx == NULL){
    vtCtx = vt_cupti_createCtx(cuCtx, VT_CUPTI_NO_CUDA_DEVICE, ctxID,
                               VT_CUPTI_NO_DEVICE_ID);

    vt_cupti_prependCtx(vtCtx);
  }

  /* check for VampirTrace CUPTI activity context */
  if(vtCtx->activity == NULL){
    vtCtx->activity = vt_cuptiact_createCtxActivity(cuCtx);
  }
  vtcuptiActivity = vtCtx->activity;

  /* lock the work on the context */
  VT_CUPTI_LOCK();

  /* flush activity buffer */
  vt_cuptiact_flushCtxActivities(vtCtx);

  /* add malloc entry to the head of the GPU memory list */
  vtMalloc->next = vtcuptiActivity->gpuMemList;
  vtcuptiActivity->gpuMemList = vtMalloc;

  /* increase allocated memory counter */
  vtcuptiActivity->gpuMemAllocated += size;

  /* check if first CUDA stream is available; create it so the counter has a
     thread to be written on */
  if(vtcuptiActivity->strmList == NULL){
    if(vt_gpu_init_time < vt_start_time)
      vt_gpu_init_time = vt_start_time;

    vtcuptiActivity->strmList =
        vt_cuptiact_createStream(vtCtx, vtcuptiActivity->defaultStrmID);

    /* write initial zero value for the memory usage counter */
    vt_count(vtcuptiActivity->strmList->vtThrdID, &vt_gpu_init_time,
             vt_gpu_cid_memusage, 0);
  }

  VT_CUPTI_UNLOCK();

  VT_RESUME_MALLOC_TRACING(VT_CURRENT_THREAD);

  /* write counter value */
  vtTime = vt_pform_wtime();
  vt_count(vtcuptiActivity->strmList->vtThrdID, &vtTime, vt_gpu_cid_memusage,
           (uint64_t)(vtcuptiActivity->gpuMemAllocated));
}
/*
 * Check for a VampirTrace activity stream by stream ID. If it does not exist,
 * create it.
 *
 * @param vtCtx VampirTrace CUPTI Activity context
 * @param strmID the CUDA stream ID provided by CUPTI callback API
 *
 * @return the VampirTrace CUDA stream (NULL if no context was given)
 */
static vt_cuptiact_strm_t* vt_cuptiact_checkStream(vt_cupti_ctx_t* vtCtx,
                                                   uint32_t strmID)
{
  vt_cupti_activity_t *vtcuptiActivity = NULL;
  vt_cuptiact_strm_t *currStrm = NULL;
  vt_cuptiact_strm_t *lastStrm = NULL;
  vt_cuptiact_strm_t *reusableStrm = NULL;

  /* the NULL check must come BEFORE reading vtCtx->activity
     (the original dereferenced vtCtx in the initializer, making this
      guard dead code) */
  if(vtCtx == NULL){
    vt_warning("[CUPTI Activity] No context given in vt_cuptiact_checkStream()!");
    return NULL;
  }

  vtcuptiActivity = vtCtx->activity;

  /* lookup stream */
  /*VT_CUPTI_LOCK();*/
  currStrm = vtcuptiActivity->strmList;
  lastStrm = vtcuptiActivity->strmList;
  while(currStrm != NULL){
    /* check for existing stream */
    if(currStrm->strmID == strmID){
      /*VT_CUPTI_UNLOCK();*/
      return currStrm;
    }

    /* remember the first destroyed stream as a reuse candidate */
    if(vt_gpu_stream_reuse && reusableStrm == NULL && currStrm->destroyed == 1){
      reusableStrm = currStrm;
    }

    lastStrm = currStrm;
    currStrm = currStrm->next;
  }

  /* reuse a destroyed stream, if there is any available */
  if(vt_gpu_stream_reuse && reusableStrm){
    vt_cntl_msg(2, "[CUPTI Activity] Reusing CUDA stream %d with stream %d",
                   reusableStrm->strmID, strmID);

    reusableStrm->destroyed = 0;
    reusableStrm->strmID = strmID;

    return reusableStrm;
  }

  /*
   * If stream list is empty, the stream to be created is not the default
   * stream and GPU idle and memory copy tracing is enabled, then create
   * a default stream.
   */
  if(vtcuptiActivity->strmList == NULL &&
     strmID != vtcuptiActivity->defaultStrmID &&
     vt_gpu_trace_idle == 1 && vt_gpu_trace_mcpy){
    vtcuptiActivity->strmList =
        vt_cuptiact_createStream(vtCtx, vtcuptiActivity->defaultStrmID);
    lastStrm = vtcuptiActivity->strmList;
  }

  /* create the stream, which has not been created yet */
  currStrm = vt_cuptiact_createStream(vtCtx, strmID);

  /* append the newly created stream (lastStrm is NULL iff list was empty) */
  if(NULL != lastStrm)
    lastStrm->next = currStrm;
  else
    vtcuptiActivity->strmList = currStrm;

  /*VT_CUPTI_UNLOCK();*/
  return currStrm;
}