void vt_cuptiact_markStreamAsDestroyed(CUcontext cuCtx, uint32_t strmID)
{
  vt_cupti_ctx_t *vtCtx = NULL;
  vt_cuptiact_strm_t *currStrm = NULL;

  VT_CUPTI_LOCK();

  if(cuCtx == NULL){
    vt_warning("[CUPTI Activity] No CUDA context given in "
               "vt_cuptiact_markStreamAsDestroyed()!");
    VT_CUPTI_UNLOCK();
    return;
  }

  vtCtx = vt_cupti_getCtxNoLock(cuCtx);

  if(vtCtx == NULL){
    vt_warning("[CUPTI Activity] No context found in "
               "vt_cuptiact_markStreamAsDestroyed()!");
    VT_CUPTI_UNLOCK();
    return;
  }

  currStrm = vtCtx->activity->strmList;
  while(currStrm != NULL){
    if(currStrm->strmID == strmID){
      currStrm->destroyed = 1;
      VT_CUPTI_UNLOCK();
      return;
    }
    currStrm = currStrm->next;
  }

  VT_CUPTI_UNLOCK();
}
/*
 * Print all available counters to stdout.
 *
 * @param capList list of CUDA devices with different capabilities
 */
static void vt_cupti_showAllCounters(vt_cupti_device_t *capList)
{
  CUptiResult cuptiErr = CUPTI_SUCCESS;
  CUpti_EventDomainID *domainId = NULL;
  uint32_t maxDomains = 0;
  uint32_t i;
  size_t size = 0;

  while(capList != NULL){
    CUdevice cuDev = capList->cuDev;

    vt_cntl_msg(1, "[CUPTI Events] Available events for device %d (SM %d.%d):",
                cuDev, capList->dev_major, capList->dev_minor);
    vt_cntl_msg(1, "Id:Name");
    vt_cntl_msg(1, "Description\n"
        "-------------------------------------------------------------------");

    cuptiErr = cuptiDeviceGetNumEventDomains(cuDev, &maxDomains);
    VT_CUPTI_CALL(cuptiErr, "cuptiDeviceGetNumEventDomains");

    if(maxDomains == 0){
      vt_warning("[CUPTI Events] No domain is exposed by dev = %d\n", cuDev);
      return;
    }

    size = sizeof(CUpti_EventDomainID) * maxDomains;
    domainId = (CUpti_EventDomainID*)malloc(size);
    if(domainId == NULL){
      vt_warning("[CUPTI Events] Failed to allocate memory to domain ID");
      return;
    }
    memset(domainId, 0, size);

    cuptiErr = cuptiDeviceEnumEventDomains(cuDev, &size, domainId);
    VT_CUPTI_CALL(cuptiErr, "cuptiDeviceEnumEventDomains");

    /* enum domains */
    for(i = 0; i < maxDomains; i++)
      vt_cuptievt_enumEvents(cuDev, domainId[i]);

    vt_cntl_msg(1, "------------------------------------------------------");

    free(domainId);

    capList = capList->next;
  }

  /* as this function is in the call path of the initialization functions
   * (vt_cupti_setupMetrics -> vt_cupti_fillMetricList ->
   *  vt_cupti_showAllCounters), mark CUPTI events as initialized and
   * release the lock before exiting */
  vt_cuptievt_initialized = 1;
  VT_CUPTI_UNLOCK();
  exit(0);
}
int vt_env_exectrace()
{
  static int exectrace = -1;
  char* tmp;

  if (exectrace == -1)
  {
    tmp = getenv("VT_EXECTRACE");
    if (tmp != NULL && strlen(tmp) > 0)
    {
      vt_cntl_msg(2, "VT_EXECTRACE=%s", tmp);
      exectrace = parse_bool(tmp);
    }
    else
    {
      tmp = getenv("VT_LIBCTRACE");
      if (tmp != NULL && strlen(tmp) > 0)
      {
        exectrace = parse_bool(tmp);
        vt_warning("VT_LIBCTRACE is deprecated, use VT_EXECTRACE instead!");
      }
      else
      {
        exectrace = 1;
      }
    }
  }
  return exectrace;
}
int vt_env_gputrace_kernel()
{
  static int cudakernel = -1;

  if (cudakernel == -1)
  {
    char* tmp = getenv("VT_GPUTRACE_KERNEL");
    if (tmp != NULL && strlen(tmp) > 0)
    {
      vt_cntl_msg(2, "VT_GPUTRACE_KERNEL=%s", tmp);

      cudakernel = atoi(tmp);

      /* perhaps user wrote 'yes' or 'true' */
      if(cudakernel == 0 && parse_bool(tmp) == 1)
        cudakernel = 1;

      if(cudakernel == 1)
        vt_warning("VT_GPUTRACE_KERNEL is deprecated, "
                   "use option 'kernel' with VT_GPUTRACE instead!");
    }
    else
    {
      cudakernel = 1;
    }
  }
  return cudakernel;
}
int vt_env_gputrace_memusage()
{
  static int gpumem = -1;

  if (gpumem == -1)
  {
    char* tmp = getenv("VT_GPUTRACE_MEMUSAGE");
    if (tmp != NULL && strlen(tmp) > 0)
    {
      vt_cntl_msg(2, "VT_GPUTRACE_MEMUSAGE=%s", tmp);

      gpumem = atoi(tmp);

      /* if user wrote 'yes' or 'true' */
      if(gpumem == 0 && parse_bool(tmp) == 1)
        gpumem = 1;
    }
    else
    {
      gpumem = 0;
    }

    if (gpumem > 0)
      vt_warning("VT_GPUTRACE_MEMUSAGE is deprecated, "
                 "use option 'memusage' with VT_GPUTRACE instead!");
  }
  return gpumem;
}
size_t vt_env_thread_bsize()
{
  static size_t buffer_size = 0;
  char* tmp;

  if (buffer_size == 0)
  {
    tmp = getenv("VT_THREAD_BUFFER_SIZE");
    if (tmp != NULL && strlen(tmp) > 0)
    {
      vt_cntl_msg(2, "VT_THREAD_BUFFER_SIZE=%s", tmp);

      buffer_size = parse_size(tmp);
      if (buffer_size == 0)
      {
        vt_error_msg("VT_THREAD_BUFFER_SIZE not properly set");
      }
      else if (buffer_size < VT_MIN_BUFSIZE)
      {
        vt_warning("VT_THREAD_BUFFER_SIZE=%d resized to %d bytes",
                   buffer_size, VT_MIN_BUFSIZE);
        buffer_size = VT_MIN_BUFSIZE;
      }
    }
    else
    {
      buffer_size = 0;
    }
  }
  return buffer_size;
}
static void vt_cuptievt_start(vt_cupti_events_t *vtcuptiEvtCtx)
{
  CUptiResult cuptiErr = CUPTI_SUCCESS;
  vt_cupti_evtgrp_t *vtcuptiGrp = NULL;
  vt_cupti_evtgrp_t *lastGrp = NULL;

  /* start gathering counter values, if context was successfully initialized */
  if(NULL == vtcuptiEvtCtx){
    /* no performance counters for this thread available */
    VT_CHECK_THREAD;
    vt_gpu_prop[VT_MY_THREAD] |= VTGPU_NO_PC;
    vt_cntl_msg(2, "[CUPTI Events] Context not initialized!");
    return;
  }

  /* start all groups */
  vtcuptiGrp = vtcuptiEvtCtx->vtGrpList;
  lastGrp = vtcuptiEvtCtx->vtGrpList;
  while(vtcuptiGrp != NULL){
    cuptiErr = cuptiEventGroupEnable(vtcuptiGrp->evtGrp);

    /* if the event group could not be enabled, remove it */
    if(cuptiErr != CUPTI_SUCCESS){
      size_t i;
      vt_cupti_evtgrp_t *freeGrp = vtcuptiGrp;
      size_t valueSize = 32;
      char name[32];

      vtcuptiGrp = vtcuptiGrp->next;

      /* give user information about the group, which cannot be enabled */
      for(i = 0; i < freeGrp->evtNum; i++){
        /* look up the i-th event ID of the group */
        VTCUPTIEVENTGETATTRIBUTE(vtcuptiEvtCtx->vtDevCap->cuDev,
                                 *(freeGrp->cuptiEvtIDs+i),
                                 CUPTI_EVENT_ATTR_NAME,
                                 &valueSize, (char*)name);
        vt_warning("[CUPTI Events] Event '%s' (%d) cannot be enabled",
                   name, *(freeGrp->cuptiEvtIDs+i));
      }

      /* group is first element in linked list */
      if(vtcuptiEvtCtx->vtGrpList == freeGrp){
        vtcuptiEvtCtx->vtGrpList = vtcuptiEvtCtx->vtGrpList->next;
      }else{/* has to be at least the second group in linked list */
        lastGrp->next = freeGrp->next;
      }

      free(freeGrp);
      freeGrp = NULL;
    }else{
      vtcuptiGrp->enabled = 1;
      lastGrp = vtcuptiGrp;
      vtcuptiGrp = vtcuptiGrp->next;
    }
  }
}
/* may be called per thread */
void vt_plugin_cntr_write_post_mortem(VTThrd * thrd)
{
  uint32_t counter_index;
  vt_plugin_cntr_timevalue * time_values = NULL;
  uint32_t number_of_counters;
  uint64_t number_of_values = 0;
  uint64_t i;
  uint64_t dummy_time;
  uint32_t tid;

  struct vt_plugin_single_counter current_counter;
  struct vt_plugin_cntr_defines * plugin_cntr_defines =
      (struct vt_plugin_cntr_defines *) thrd->plugin_cntr_defines;

  if (plugin_cntr_defines == NULL)
    return;
  if (plugin_cntr_defines->size_of_counters[VT_PLUGIN_CNTR_ASYNCH_POST_MORTEM] == 0)
    return;
  if (VTTHRD_TRACE_STATUS(thrd) != VT_TRACE_ON)
    return;

  for (tid = 0; tid < VTThrdn; tid++)
    if ( VTThrdv[tid] == thrd )
      break;
  if ( tid == VTThrdn ){
    vt_warning("Can not determine internal TID when gathering post-mortem counters");
    return;
  }

  /* for all post_mortem counters */
  number_of_counters =
      plugin_cntr_defines->size_of_counters[VT_PLUGIN_CNTR_ASYNCH_POST_MORTEM];

  dummy_time = vt_pform_wtime();

  /* set flag for writing post mortem counters; prevents writing of flush
   * enter/exit event when flushing */
  thrd->plugin_cntr_writing_post_mortem = 1;

  /* we assume that for each counter (not plugin),
   * the data is monotonically increasing */

  /* for all counters of this thread */
  for (counter_index = 0; counter_index < number_of_counters; counter_index++) {
    current_counter =
        plugin_cntr_defines->counters[VT_PLUGIN_CNTR_ASYNCH_POST_MORTEM]
                                     [counter_index];

    /* get data */
    number_of_values = current_counter.getAllValues(
        current_counter.from_plugin_id, &time_values);

    if (time_values == NULL)
      return;

    for (i = 0; i < number_of_values; i++) {
      WRITE_ASYNCH_DATA(thrd, tid, current_counter, time_values[i], &dummy_time);
    }
    free(time_values);
  }

  /* unset flag for writing post mortem counters */
  thrd->plugin_cntr_writing_post_mortem = 0;
}
static void metric_warning(int errcode, char *note)
{
  char errstring[PAPI_MAX_STR_LEN];

  PAPI_perror(errcode, errstring, PAPI_MAX_STR_LEN);

  if (errcode == PAPI_ESYS) {
    strncat(errstring, ": ", PAPI_MAX_STR_LEN - strlen(errstring));
    strncat(errstring, strerror(errno), PAPI_MAX_STR_LEN - strlen(errstring));
  }

  vt_warning("%s: %s (ignored)\n", note ? note : "PAPI", errstring);
}
static void vt_cupti_start(vt_cupti_ctx_t *vtcuptiCtx)
{
  CUptiResult cuptiErr = CUPTI_SUCCESS;
  vt_cupti_grp_t *vtcuptiGrp = NULL;
  vt_cupti_grp_t *lastGrp = NULL;

  if(vtcuptiCtx == NULL) return;

  /* start all groups */
  vtcuptiGrp = vtcuptiCtx->vtGrpList;
  lastGrp = vtcuptiCtx->vtGrpList;
  while(vtcuptiGrp != NULL){
    cuptiErr = cuptiEventGroupEnable(vtcuptiGrp->evtGrp);

    /* if the event group could not be enabled, remove it */
    if(cuptiErr != CUPTI_SUCCESS){
      size_t i;
      vt_cupti_grp_t *freeGrp = vtcuptiGrp;
      size_t valueSize = 32;
      char name[32];

      vtcuptiGrp = vtcuptiGrp->next;

      /* give user information about the group, which cannot be enabled */
      for(i = 0; i < freeGrp->evtNum; i++){
        /* look up the i-th event ID of the group */
        cuptiEventGetAttribute(vtcuptiCtx->vtDevCap->cuDev,
                               *(freeGrp->cuptiEvtIDs+i),
                               CUPTI_EVENT_ATTR_NAME,
                               &valueSize, (char*)name);
        vt_warning("[CUPTI] Event '%s' (%d) cannot be enabled",
                   name, *(freeGrp->cuptiEvtIDs+i));
      }

      /* group is first element in linked list */
      if(vtcuptiCtx->vtGrpList == freeGrp){
        vtcuptiCtx->vtGrpList = vtcuptiCtx->vtGrpList->next;
      }else{/* has to be at least the second group in linked list */
        lastGrp->next = freeGrp->next;
      }

      free(freeGrp);
      freeGrp = NULL;
    }else{
      vtcuptiGrp->enabled = 1;
      lastGrp = vtcuptiGrp;
      vtcuptiGrp = vtcuptiGrp->next;
    }
  }
}
/*
 * Handles errors returned from CUPTI function calls.
 *
 * @param err the CUPTI error code
 * @param msg a message to get more detailed information about the error
 * @param file the corresponding source file
 * @param line the line in which the error occurred
 */
void vt_cupti_handleError(CUptiResult err, const char* msg,
                          const char *file, const int line)
{
  const char *errstr;

  if(msg != NULL) vt_cntl_msg(1, msg);

  cuptiGetResultString(err, &errstr);

  if(vt_gpu_error){
    vt_error_msg("[CUPTI] %s:%d:'%s'", file, line, errstr);
  }else{
    vt_warning("[CUPTI] %s:%d:'%s'", file, line, errstr);
  }
}
void vt_cuptiact_setupActivityContext(vt_cupti_ctx_t *vtCtx)
{
  /* try to get the global VampirTrace CUPTI context */
  if(vtCtx == NULL){
    vt_warning("[CUPTI Activity] No context given!");
    return;
  }

  VT_SUSPEND_MALLOC_TRACING(vtCtx->ptid);

  /* create the VampirTrace CUPTI activity context */
  if(vtCtx->activity == NULL)
    vtCtx->activity = vt_cuptiact_createCtxActivity(vtCtx->cuCtx);

  /* queue new buffer to context to record activities */
  vtCtx->activity->buffer = vt_cuptiact_queueNewBuffer(vtCtx->cuCtx);

  VT_RESUME_MALLOC_TRACING(vtCtx->ptid);
}
int execvp(const char* path, char* const argv[])
{
  int rc;

  VT_MEMHOOKS_OFF();

  if ( DO_TRACE(execvp) )
  {
    /* mark enter function */
    uint64_t time = vt_pform_wtime();
    vt_enter(&time, libc_funcs[FUNCIDX(execvp)].rid);
  }

  /* close VT for current process */
  vt_close();

  /* call (real) function; exec only returns on failure */
  CALL_FUNC(execvp, rc, (path, argv));

  vt_warning("execvp failed: %s", strerror(errno));

  return rc;
}
int execle(const char* path, const char* arg, ...)
{
  int rc;
  char* argv[100];
  char** envp;
  char* tmp;
  int i;
  va_list ap;

  VT_MEMHOOKS_OFF();

  /* collect the NULL-terminated argument list and the trailing envp pointer */
  va_start(ap, arg);
  i = 0;
  argv[i++] = (char*)arg;
  while((tmp = va_arg(ap, char*)))
    argv[i++] = tmp;
  argv[i] = NULL;
  envp = va_arg(ap, char**);
  va_end(ap);

  if ( DO_TRACE(execle) )
  {
    /* mark enter function */
    uint64_t time = vt_pform_wtime();
    vt_enter(&time, libc_funcs[FUNCIDX(execle)].rid);
  }

  /* close VT for current process */
  vt_close();

  /* call (real) function via execve; exec only returns on failure */
  CALL_FUNC(execve, rc, (path, argv, envp));

  vt_warning("execle failed: %s", strerror(errno));

  return rc;
}
char* vt_env_ldir()
{
  static char* ldir = NULL;
  char* tmp;

  if (! ldir)
  {
    tmp = getenv("VT_PFORM_LDIR");
#if defined(VT_IOFSL)
    if (vt_env_iofsl_servers())
    {
      ldir = vt_env_gdir();
      if (tmp != NULL && strlen(tmp) > 0)
      {
        vt_warning("Setting of VT_PFORM_LDIR isn't allowed in IOFSL "
                   "mode; reset it to VT_PFORM_GDIR (=%s)", ldir);
      }
    }
    else
#endif /* VT_IOFSL */
    {
      if (tmp != NULL && strlen(tmp) > 0)
      {
        vt_cntl_msg(2, "VT_PFORM_LDIR=%s", tmp);
        ldir = replace_vars(tmp);
      }
      else
      {
        ldir = replace_vars(vt_pform_ldir());
      }
    }
  }
  return ldir;
}
/* no need to lock, because it is only called by vt_cupti_callback_init() */
void vt_cupti_activity_init()
{
  /*if(!vt_cuptiact_initialized){
    vt_cupti_init();
    VT_CUPTI_LOCK();*/
  if(!vt_cuptiact_initialized){
    vt_cntl_msg(2, "[CUPTI Activity] Initializing ... ");

    {
      vt_cuptiact_bufSize = vt_env_cudatrace_bsize();

      /* no buffer size < 1024 bytes allowed (see CUPTI documentation) */
      if(vt_cuptiact_bufSize < 1024){
        if(vt_cuptiact_bufSize > 0){
          vt_warning("[CUPTI Activity] Buffer size has to be at least 1024 "
                     "bytes! It has been set to %d.", vt_cuptiact_bufSize);
        }
        vt_cuptiact_bufSize = VT_CUPTI_ACT_DEFAULT_BSIZE;
      }

      /* queue a global buffer to initialize CUPTI before CUDA init
      vt_cuptiact_buffer = (uint8_t *)malloc(vt_cuptiact_bufSize);
      VT_CUPTI_CALL(cuptiActivityEnqueueBuffer(NULL, 0, vt_cuptiact_buffer,
                                               vt_cuptiact_bufSize),
                    "cuptiActivityEnqueueBuffer");*/
    }

#if (defined(VT_MT) || defined(VT_HYB))
    VTTHRD_LOCK_IDS();
#endif
    if(vt_gpu_trace_kernels > 1){
      /* define kernel counters */
      vt_cuptiact_cid_knStaticSharedMem = vt_def_counter(VT_MASTER_THREAD,
                    "staticSharedMemory", "Bytes",
                    VT_CNTR_ABS | VT_CNTR_NEXT | VT_CNTR_UNSIGNED,
                    vt_cupti_cgid_cuda_kernel, 0);
      vt_cuptiact_cid_knDynamicSharedMem = vt_def_counter(VT_MASTER_THREAD,
                    "dynamicSharedMemory", "Bytes",
                    VT_CNTR_ABS | VT_CNTR_NEXT | VT_CNTR_UNSIGNED,
                    vt_cupti_cgid_cuda_kernel, 0);
      vt_cuptiact_cid_knLocalMemTotal = vt_def_counter(VT_MASTER_THREAD,
                    "localMemoryPerKernel", "Bytes",
                    VT_CNTR_ABS | VT_CNTR_NEXT | VT_CNTR_UNSIGNED,
                    vt_cupti_cgid_cuda_kernel, 0);
      vt_cuptiact_cid_knRegistersPerThread = vt_def_counter(VT_MASTER_THREAD,
                    "registersPerThread", "#",
                    VT_CNTR_ABS | VT_CNTR_NEXT | VT_CNTR_UNSIGNED,
                    vt_cupti_cgid_cuda_kernel, 0);
    }

    /* define region for GPU activity flush */
    vt_cuptiact_rid_flush = vt_def_region(VT_MASTER_THREAD, "flushActivities",
                                          VT_NO_ID, VT_NO_LNO, VT_NO_LNO,
                                          "VT_CUDA", VT_FUNCTION);
#if (defined(VT_MT) || defined(VT_HYB))
    VTTHRD_UNLOCK_IDS();
#endif

    /*** enable the activities ***/
    /* enable kernel tracing */
    if(vt_gpu_trace_kernels > 0){
#if (defined(CUPTI_API_VERSION) && (CUPTI_API_VERSION >= 3))
      if((vt_gpu_config & VT_GPU_TRACE_CONCURRENT_KERNEL)
         == VT_GPU_TRACE_CONCURRENT_KERNEL){
        /*VT_CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL),
                        "cuptiActivityEnable");*/
        VT_CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL),
                      "cuptiActivityEnable");
      }else
#endif
        VT_CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL),
                      "cuptiActivityEnable");
    }

    /* enable memory copy tracing */
    if(vt_gpu_trace_mcpy){
      VT_CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY),
                    "cuptiActivityEnable");
    }

    /* register the finalize function of VampirTrace CUPTI to be called before
     * the program exits
    atexit(vt_cupti_activity_finalize);*/

    vt_cuptiact_initialized = 1;

    /*VT_CUPTI_UNLOCK();
  }*/
  }
}
/*
 * Parse the environment variable for CUPTI metrics (including CUDA device
 * capabilities) and fill the capability metric list.
 *
 * @param capList points to the first element of the capability metric list
 */
static void vt_cupti_fillMetricList(vt_cupti_device_t *capList)
{
  char *metricString = vt_env_cupti_events();
  char *metric_sep = vt_env_metrics_sep();
  char *metric, *metric_cap;

  metric = strtok(metricString, metric_sep);

  while (metric != NULL){
    CUptiResult cuptiErr = CUPTI_SUCCESS;
    vt_cupti_device_t *cuptiDev = NULL;
    vt_cupti_evtctr_t *vtcuptiEvt = NULL;
    int metr_major = 0;
    int metr_minor = 0;

    /* try to get CUDA device capability parsed from metric */
    metr_major = atoi(metric);
    metric_cap = strchr(metric+1, '.');
    if(metric_cap){
      metr_minor = atoi(metric_cap+1);
      metric_cap = strchr(metric_cap+1, '_');
    }

    /* check whether device capability is given or not */
    if(metric_cap){
      metric = metric_cap + 1;

      vt_cntl_msg(2, "Metric '%s', %d.%d", metric, metr_major, metr_minor);

      cuptiDev = vt_cupti_checkMetricList(capList, metr_major, metr_minor);
      if(cuptiDev == NULL){
        metric = strtok(NULL, metric_sep);
        continue;
      }

      vtcuptiEvt = (vt_cupti_evtctr_t*)malloc(sizeof(vt_cupti_evtctr_t));
      cuptiErr = cuptiEventGetIdFromName(cuptiDev->cuDev, metric,
                                         &vtcuptiEvt->cuptiEvtID);
      if(cuptiErr != CUPTI_SUCCESS){
        if(!strncmp(metric, "help", 4)) vt_cupti_showAllCounters(capList);
        vt_warning("[CUPTI Events] Skipping invalid event '%s' for device %d",
                   metric, cuptiDev->cuDev);
        free(vtcuptiEvt); /* release the unused counter entry */
        metric = strtok(NULL, metric_sep);
        continue;
      }

      /* create VampirTrace counter ID */
#if (defined(VT_MT) || defined(VT_HYB))
      VTTHRD_LOCK_IDS();
#endif
      vtcuptiEvt->vtCID = vt_def_counter(VT_MASTER_THREAD, metric, "#",
                                VT_CNTR_ABS | VT_CNTR_LAST | VT_CNTR_UNSIGNED,
                                vt_cuptievt_cgid, 0);
#if (defined(VT_MT) || defined(VT_HYB))
      VTTHRD_UNLOCK_IDS();
#endif

      cuptiDev->evtNum++;
      vtcuptiEvt->next = cuptiDev->vtcuptiEvtList;
      cuptiDev->vtcuptiEvtList = vtcuptiEvt;
    }else{
      /* device capability is not given. Try to add metric to all devices */
      uint32_t cid_metric = VT_NO_ID;

      cuptiDev = capList;
      while(cuptiDev != NULL){
        vtcuptiEvt = (vt_cupti_evtctr_t*)malloc(sizeof(vt_cupti_evtctr_t));
        cuptiErr = cuptiEventGetIdFromName(cuptiDev->cuDev, metric,
                                           &vtcuptiEvt->cuptiEvtID);

        if(cuptiErr != CUPTI_SUCCESS){
          if(!strncmp(metric, "help", 4)) vt_cupti_showAllCounters(capList);
          vt_warning("[CUPTI Events] Skipping invalid event '%s' for device %d",
                     metric, cuptiDev->cuDev);
          free(vtcuptiEvt); /* release the unused counter entry */
        }else{
          /* create VampirTrace counter ID, if not yet done for other device */
          if(cid_metric == VT_NO_ID){
#if (defined(VT_MT) || defined(VT_HYB))
            VTTHRD_LOCK_IDS();
#endif
            cid_metric = vt_def_counter(VT_MASTER_THREAD, metric, "#",
                              VT_CNTR_ABS | VT_CNTR_LAST | VT_CNTR_UNSIGNED,
                              vt_cuptievt_cgid, 0);
#if (defined(VT_MT) || defined(VT_HYB))
            VTTHRD_UNLOCK_IDS();
#endif
          }

          cuptiDev->evtNum++;
          vtcuptiEvt->vtCID = cid_metric;
          vtcuptiEvt->next = cuptiDev->vtcuptiEvtList;
          cuptiDev->vtcuptiEvtList = vtcuptiEvt;
        }

        cuptiDev = cuptiDev->next;
      }
    }

    metric = strtok(NULL, metric_sep);
  }
}
/*
 * Decreases the "Allocated CUDA memory" counter.
 *
 * @param ctxID CUDA context identifier (@see CUPTI callback info)
 * @param cuCtx CUDA context
 * @param devPtr pointer to the allocated memory
 */
void vt_cuptiact_writeFree(uint32_t ctxID, CUcontext cuCtx, void *devPtr)
{
  uint64_t vtTime;
  vt_cuptiact_ctx_t* vtCtx = NULL;
  vt_cuptiact_gpumem_t *curMalloc = NULL;
  vt_cuptiact_gpumem_t *lastMalloc = NULL;

  if(devPtr == NULL) return;

  /* flush activity buffer */
  vt_cuptiact_flushCtxActivities(cuCtx);

  vtCtx = vt_cuptiact_getCtx(cuCtx);
  if(vtCtx == NULL){
    vtCtx = vt_cuptiact_createContext(ctxID, cuCtx, (uint32_t)-1);
  }

  VT_CUPTI_ACT_LOCK();

  curMalloc = vtCtx->gpuMemList;
  lastMalloc = vtCtx->gpuMemList;

  /* lookup the CUDA malloc entry by its memory pointer */
  while(curMalloc != NULL){
    if(devPtr == curMalloc->memPtr){

      /* decrease allocated counter value and write it */
      vtTime = vt_pform_wtime();
      vtCtx->gpuMemAllocated -= curMalloc->size;
      vt_count(vtCtx->strmList->vtThrdID, &vtTime, vt_gpu_cid_memusage,
               (uint64_t)(vtCtx->gpuMemAllocated));

      /* set pointer over current element to next one */
      lastMalloc->next = curMalloc->next;

      /* if current element is the first list entry, set the list entry */
      if(curMalloc == vtCtx->gpuMemList){
        vtCtx->gpuMemList = curMalloc->next;
      }

      /* free VT memory of CUDA malloc */
      curMalloc->next = NULL;
      free(curMalloc);
      curMalloc = NULL;

      /* set mallocList to NULL, if last element freed */
      if(vtCtx->gpuMemAllocated == 0) {
        vtCtx->gpuMemList = NULL;
      }

      VT_CUPTI_ACT_UNLOCK();
      return;
    }

    lastMalloc = curMalloc;
    curMalloc = curMalloc->next;
  }

  VT_CUPTI_ACT_UNLOCK();

  vt_warning("[CUPTI Activity] Attempt to free CUDA memory that has not been allocated!");
}
/* may be called per thread */
void vt_plugin_cntr_write_post_mortem(VTThrd * thrd)
{
  uint32_t counter_index;
  uint64_t * counter_current_indices;
  vt_plugin_cntr_timevalue ** time_values_by_counter = NULL;
  uint32_t number_of_counters;
  uint64_t * number_of_values_by_counter;
  uint64_t dummy_time;
  uint32_t tid;

  struct vt_plugin_single_counter current_counter;
  struct vt_plugin_cntr_defines * plugin_cntr_defines =
      (struct vt_plugin_cntr_defines *) thrd->plugin_cntr_defines;

  if (plugin_cntr_defines == NULL)
    return;
  if (plugin_cntr_defines->size_of_counters[VT_PLUGIN_CNTR_ASYNCH_POST_MORTEM] == 0)
    return;
  if (VTTHRD_TRACE_STATUS(thrd) != VT_TRACE_ON)
    return;

  for (tid = 0; tid < VTThrdn; tid++)
    if ( VTThrdv[tid] == thrd )
      break;
  if ( tid == VTThrdn ){
    vt_warning("Can not determine internal TID when gathering post-mortem counters");
    return;
  }

  /* for all post_mortem counters */
  number_of_counters =
      plugin_cntr_defines->size_of_counters[VT_PLUGIN_CNTR_ASYNCH_POST_MORTEM];

  dummy_time = vt_pform_wtime();

  /* set flag for writing post mortem counters; prevents writing of flush
   * enter/exit event when flushing */
  thrd->plugin_cntr_writing_post_mortem = 1;

  /* we assume that for each counter (not plugin),
   * the data is monotonically increasing */

  /* for all counters of this thread */
  time_values_by_counter =
      calloc(number_of_counters, sizeof(*time_values_by_counter));
  vt_libassert(time_values_by_counter);
  number_of_values_by_counter =
      calloc(number_of_counters, sizeof(*number_of_values_by_counter));
  vt_libassert(number_of_values_by_counter);

  for (counter_index = 0; counter_index < number_of_counters; counter_index++) {
    current_counter =
        plugin_cntr_defines->counters[VT_PLUGIN_CNTR_ASYNCH_POST_MORTEM]
                                     [counter_index];

    /* get data */
    number_of_values_by_counter[counter_index] = current_counter.getAllValues(
        current_counter.from_plugin_id,
        &(time_values_by_counter[counter_index]));

    if (time_values_by_counter[counter_index] == NULL) {
      free(time_values_by_counter);
      free(number_of_values_by_counter);
      return;
    }
  }

  /* initialized with 0! */
  counter_current_indices =
      calloc(number_of_counters, sizeof(*counter_current_indices));
  vt_libassert(counter_current_indices);

  while (1) {
    vt_plugin_cntr_timevalue *min_tvp = NULL;
    uint32_t min_counter;

    for (counter_index = 0; counter_index < number_of_counters; counter_index++) {
      /*
       * TODO optimize for "nice" plugins by looking if the "next" counter has the
       * _same_ timestamp (so there cannot be anyone else with a smaller one
       */
      vt_plugin_cntr_timevalue *tvp;
      if (counter_current_indices[counter_index]
          >= number_of_values_by_counter[counter_index]) {
        continue;
      }
      tvp = &(time_values_by_counter[counter_index]
                                    [counter_current_indices[counter_index]]);
      if (!min_tvp || tvp->timestamp < min_tvp->timestamp) {
        min_tvp = tvp;
        min_counter = counter_index;
      }
    }
    if (min_tvp == NULL) {
      /* we are done */
      break;
    }

    current_counter =
        plugin_cntr_defines->counters[VT_PLUGIN_CNTR_ASYNCH_POST_MORTEM]
                                     [min_counter];
    WRITE_ASYNCH_DATA(thrd, tid, current_counter, *min_tvp, &dummy_time);
    counter_current_indices[min_counter]++;
  }

  free(time_values_by_counter);
  free(counter_current_indices);
  free(number_of_values_by_counter);

  /* unset flag for writing post mortem counters */
  thrd->plugin_cntr_writing_post_mortem = 0;
}
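/*
 * Illustrative sketch (not part of VampirTrace): the loop above performs a
 * k-way merge of the per-counter value arrays by repeatedly scanning for the
 * smallest pending timestamp, relying on each array being sorted by time.
 * The self-contained example below shows the same min-scan merge on plain
 * arrays; the type and function names (sample_t, merge_by_timestamp) are
 * hypothetical and only serve to clarify the algorithm.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct { uint64_t timestamp; uint64_t value; } sample_t; /* hypothetical */

/* Merge 'num_streams' timestamp-sorted arrays and emit samples in global order. */
static void merge_by_timestamp(sample_t **streams, const size_t *counts,
                               size_t num_streams)
{
  size_t *pos = calloc(num_streams, sizeof(*pos)); /* read position per stream */
  if (pos == NULL) return;

  while (1) {
    sample_t *min_sample = NULL;
    size_t min_stream = 0;

    /* linear scan over the current head of every stream */
    for (size_t i = 0; i < num_streams; i++) {
      if (pos[i] >= counts[i]) continue;               /* stream exhausted */
      sample_t *cand = &streams[i][pos[i]];
      if (min_sample == NULL || cand->timestamp < min_sample->timestamp) {
        min_sample = cand;
        min_stream = i;
      }
    }
    if (min_sample == NULL) break;                     /* all streams consumed */

    printf("t=%llu v=%llu (stream %zu)\n",
           (unsigned long long)min_sample->timestamp,
           (unsigned long long)min_sample->value, min_stream);
    pos[min_stream]++;
  }
  free(pos);
}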
/*
 * Decreases the "Allocated CUDA memory" counter.
 *
 * @param ctxID CUDA context identifier (@see CUPTI callback info)
 * @param cuCtx CUDA context
 * @param devPtr pointer to the allocated memory
 */
void vt_cuptiact_writeFree(uint32_t ctxID, CUcontext cuCtx, void *devPtr)
{
  uint64_t vtTime;
  vt_cupti_ctx_t* vtCtx = NULL;
  vt_cupti_activity_t *vtcuptiActivity = NULL;
  vt_cupti_gpumem_t *curMalloc = NULL;
  vt_cupti_gpumem_t *lastMalloc = NULL;

  if(devPtr == NULL) return;

  VT_SUSPEND_MALLOC_TRACING(VT_CURRENT_THREAD);

  /* check for VampirTrace CUPTI context */
  vtCtx = vt_cupti_getCtx(cuCtx);
  if(vtCtx == NULL){
    vtCtx = vt_cupti_createCtx(cuCtx, VT_CUPTI_NO_CUDA_DEVICE, ctxID,
                               VT_CUPTI_NO_DEVICE_ID);
    vt_cupti_prependCtx(vtCtx);
  }

  /* check for VampirTrace CUPTI activity context */
  if(vtCtx->activity == NULL){
    vtCtx->activity = vt_cuptiact_createCtxActivity(cuCtx);
  }
  vtcuptiActivity = vtCtx->activity;

  VT_CUPTI_LOCK();

  /* flush activity buffer */
  vt_cuptiact_flushCtxActivities(vtCtx);

  curMalloc = vtcuptiActivity->gpuMemList;
  lastMalloc = curMalloc;

  /* lookup the CUDA malloc entry by its memory pointer */
  while(curMalloc != NULL){
    if(devPtr == curMalloc->memPtr){

      /* decrease allocated counter value and write it */
      vtTime = vt_pform_wtime();
      vtcuptiActivity->gpuMemAllocated -= curMalloc->size;
      vt_count(vtcuptiActivity->strmList->vtThrdID, &vtTime,
               vt_gpu_cid_memusage,
               (uint64_t)(vtcuptiActivity->gpuMemAllocated));

      /* set pointer over current element to next one */
      lastMalloc->next = curMalloc->next;

      /* if current element is the first list entry, set the list entry */
      if(curMalloc == vtcuptiActivity->gpuMemList){
        vtcuptiActivity->gpuMemList = curMalloc->next;
      }

      /* free VT memory of CUDA malloc */
      curMalloc->next = NULL;
      free(curMalloc);
      curMalloc = NULL;

      /* set mallocList to NULL, if last element freed */
      if(vtcuptiActivity->gpuMemAllocated == 0) {
        vtcuptiActivity->gpuMemList = NULL;
      }

      VT_CUPTI_UNLOCK();
      VT_RESUME_MALLOC_TRACING(VT_CURRENT_THREAD);
      return;
    }

    lastMalloc = curMalloc;
    curMalloc = curMalloc->next;
  }

  VT_CUPTI_UNLOCK();
  VT_RESUME_MALLOC_TRACING(VT_CURRENT_THREAD);

  vt_warning("[CUPTI Activity] Attempt to free CUDA memory that has not been allocated!");
}
/*
 * Check for a VampirTrace activity stream by stream ID. If it does not exist,
 * create it.
 *
 * @param vtCtx VampirTrace CUPTI Activity context
 * @param strmID the CUDA stream ID provided by CUPTI callback API
 *
 * @return the VampirTrace CUDA stream
 */
static vt_cuptiact_strm_t* vt_cuptiact_checkStream(vt_cupti_ctx_t* vtCtx,
                                                   uint32_t strmID)
{
  vt_cupti_activity_t *vtcuptiActivity = NULL;
  vt_cuptiact_strm_t *currStrm = NULL;
  vt_cuptiact_strm_t *lastStrm = NULL;
  vt_cuptiact_strm_t *reusableStrm = NULL;

  /* check the context before dereferencing it */
  if(vtCtx == NULL){
    vt_warning("[CUPTI Activity] No context given in vt_cuptiact_checkStream()!");
    return NULL;
  }

  vtcuptiActivity = vtCtx->activity;

  /* lookup stream */
  /*VT_CUPTI_LOCK();*/
  currStrm = vtcuptiActivity->strmList;
  lastStrm = vtcuptiActivity->strmList;
  while(currStrm != NULL){
    /* check for existing stream */
    if(currStrm->strmID == strmID){
      /*VT_CUPTI_UNLOCK();*/
      return currStrm;
    }

    /* check for reusable stream */
    if(vt_gpu_stream_reuse && reusableStrm == NULL && currStrm->destroyed == 1){
      reusableStrm = currStrm;
    }

    lastStrm = currStrm;
    currStrm = currStrm->next;
  }

  /* reuse a destroyed stream, if there is any available */
  if(vt_gpu_stream_reuse && reusableStrm){
    vt_cntl_msg(2, "[CUPTI Activity] Reusing CUDA stream %d with stream %d",
                reusableStrm->strmID, strmID);
    reusableStrm->destroyed = 0;
    reusableStrm->strmID = strmID;

    return reusableStrm;
  }

  /*
   * If stream list is empty, the stream to be created is not the default
   * stream and GPU idle and memory copy tracing is enabled, then create
   * a default stream.
   */
  if(vtcuptiActivity->strmList == NULL &&
     strmID != vtcuptiActivity->defaultStrmID &&
     vt_gpu_trace_idle == 1 && vt_gpu_trace_mcpy){
    vtcuptiActivity->strmList =
        vt_cuptiact_createStream(vtCtx, vtcuptiActivity->defaultStrmID);
    lastStrm = vtcuptiActivity->strmList;
  }

  /* create the stream, which has not been created yet */
  currStrm = vt_cuptiact_createStream(vtCtx, strmID);

  /* append the newly created stream */
  if(NULL != lastStrm) lastStrm->next = currStrm;
  else vtcuptiActivity->strmList = currStrm;

  /*VT_CUPTI_UNLOCK();*/
  return currStrm;
}
void vt_cuptiact_flushCtxActivities(vt_cupti_ctx_t *vtCtx)
{
  CUptiResult status;
  uint8_t *buffer = NULL;
  size_t bufSize;
  CUpti_Activity *record = NULL;
  uint64_t hostStop, gpuStop;
  uint32_t ptid = VT_NO_ID;
  vt_cupti_activity_t *vtcuptiActivity = NULL;

  /* check for VampirTrace CUPTI context */
  if(vtCtx == NULL || vtCtx->activity == NULL){
    vt_warning("[CUPTI Activity] Context not found!");
    return;
  }

  vtcuptiActivity = vtCtx->activity;

  /* check if the buffer contains records */
  status = cuptiActivityQueryBuffer(vtCtx->cuCtx, 0, &bufSize);
  if(status != CUPTI_SUCCESS){
    if(CUPTI_ERROR_QUEUE_EMPTY == status ||
       CUPTI_ERROR_MAX_LIMIT_REACHED != status){
      return;
    }
  }

  /* expose VampirTrace CUPTI activity flush as measurement overhead */
  VT_CHECK_THREAD;
  ptid = VT_MY_THREAD;
  hostStop = vt_pform_wtime();
  vt_enter(ptid, &hostStop, vt_cuptiact_rid_flush);

  vt_cntl_msg(2,"[CUPTI Activity] Handle context %d activities", vtCtx->cuCtx);

  /* lock the whole buffer flush
  VT_CUPTI_LOCK();*/

  /* dump the contents of the global queue */
  VT_CUPTI_CALL(cuptiActivityDequeueBuffer(vtCtx->cuCtx, 0, &buffer,
                                           &bufSize),
                "cuptiActivityDequeueBuffer");

  /*
   * Get time synchronization factor between host and GPU time for measured
   * period
   */
  {
    VT_CUPTI_CALL(cuptiGetTimestamp(&gpuStop), "cuptiGetTimestamp");
    hostStop = vt_pform_wtime();
    vtcuptiActivity->sync.hostStop = hostStop;

    vtcuptiActivity->sync.factor =
        (double)(hostStop - vtcuptiActivity->sync.hostStart)
       /(double)(gpuStop - vtcuptiActivity->sync.gpuStart);
  }

  /*vt_cntl_msg(1, "hostStop: %llu , gpuStop: %llu", hostStopTS, gpuStopTS);
    vt_cntl_msg(1, "factor: %lf", syncFactor);*/

  do{
    status = cuptiActivityGetNextRecord(buffer, bufSize, &record);
    if(status == CUPTI_SUCCESS) {
      vt_cuptiact_writeRecord(record, vtCtx);
    }else if(status == CUPTI_ERROR_MAX_LIMIT_REACHED){
      break;
    }else{
      VT_CUPTI_CALL(status, "cuptiActivityGetNextRecord");
    }
  }while(1);

  /* report any records dropped from the global queue */
  {
    size_t dropped;

    VT_CUPTI_CALL(cuptiActivityGetNumDroppedRecords(vtCtx->cuCtx, 0, &dropped),
                  "cuptiActivityGetNumDroppedRecords");
    if(dropped != 0)
      vt_warning("[CUPTI Activity] Dropped %u records. Current buffer size: %llu bytes\n"
                 "To avoid dropping of records increase the buffer size!\n"
                 "Proposed minimum VT_CUDATRACE_BUFFER_SIZE=%llu",
                 (unsigned int)dropped, vt_cuptiact_bufSize,
                 vt_cuptiact_bufSize + dropped/2 *
                 (sizeof(CUpti_ActivityKernel) + sizeof(CUpti_ActivityMemcpy)));
  }

  /* enter GPU idle region after last kernel, if exited before */
  if(vtcuptiActivity->gpuIdleOn == 0){
    vt_enter(vtcuptiActivity->strmList->vtThrdID,
             &(vtcuptiActivity->vtLastGPUTime), vt_gpu_rid_idle);
    vtcuptiActivity->gpuIdleOn = 1;
    /*vt_warning("IDLfente: %llu (%d)", vtCtx->vtLastGPUTime,
                 vtCtx->strmList->vtThrdID);*/
  }

  /* enqueue buffer again */
  VT_CUPTI_CALL(cuptiActivityEnqueueBuffer(vtCtx->cuCtx, 0, buffer,
                                           vt_cuptiact_bufSize),
                "cuptiActivityEnqueueBuffer");

  /* set new synchronization point */
  vtcuptiActivity->sync.hostStart = hostStop;
  vtcuptiActivity->sync.gpuStart = gpuStop;

  /*VT_CUPTI_UNLOCK();*/

  /* use local variable hostStop to write exit event for activity flush */
  hostStop = vt_pform_wtime();
  vt_exit(ptid, &hostStop);
}
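/*
 * Illustrative sketch (not part of VampirTrace): the flush above derives a
 * linear mapping between CUPTI GPU timestamps and host timestamps from two
 * synchronization points (hostStart/gpuStart and hostStop/gpuStop):
 *   host = hostStart + (gpu - gpuStart) * factor,
 *   factor = (hostStop - hostStart) / (gpuStop - gpuStart).
 * Kernel and memcpy records are placed on the host time line with exactly
 * this arithmetic. The minimal helpers below restate it in isolation; the
 * struct and function names are hypothetical.
 */
#include <stdint.h>

typedef struct {
  uint64_t hostStart, hostStop;  /* host timestamps of the two sync points */
  uint64_t gpuStart,  gpuStop;   /* GPU timestamps of the same sync points */
  double   factor;               /* host ticks per GPU tick                */
} sync_interval_t;               /* hypothetical */

/* Recompute the scaling factor after a new synchronization point was taken. */
static void sync_update_factor(sync_interval_t *s)
{
  s->factor = (double)(s->hostStop - s->hostStart)
            / (double)(s->gpuStop  - s->gpuStart);
}

/* Map a GPU timestamp inside the measured interval onto the host time line. */
static uint64_t sync_gpu_to_host(const sync_interval_t *s, uint64_t gpuTime)
{
  return s->hostStart + (uint64_t)((gpuTime - s->gpuStart) * s->factor);
}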
/*
 * Create a VampirTrace CUPTI stream.
 *
 * @param vtCtx VampirTrace CUPTI context
 * @param cuStrm CUDA stream
 * @param strmID ID of the CUDA stream
 *
 * @return pointer to created VampirTrace CUPTI stream
 */
vt_cupti_strm_t* vt_cupti_createStream(vt_cupti_ctx_t *vtCtx,
                                       CUstream cuStrm, uint32_t strmID)
{
  vt_cupti_strm_t *vtStrm = NULL;

  if(vtCtx == NULL){
    vt_warning("[CUPTI] Cannot create stream without VampirTrace CUPTI context");
    return NULL;
  }

  vtStrm = (vt_cupti_strm_t *)malloc(sizeof(vt_cupti_strm_t));
  if(vtStrm == NULL)
    vt_error_msg("[CUPTI] Could not allocate memory for stream!");
  vtStrm->cuStrm = cuStrm;
  vtStrm->vtLastTime = vt_gpu_init_time;
  vtStrm->destroyed = 0;
  vtStrm->next = NULL;

#if defined(VT_CUPTI_ACTIVITY)
  /* create stream by VT CUPTI callbacks implementation (CUstream is given) */
  if(strmID == VT_CUPTI_NO_STREAM_ID){
    if(cuStrm != VT_CUPTI_NO_STREAM){
      VT_CUPTI_CALL(cuptiGetStreamId(vtCtx->cuCtx, cuStrm, &strmID),
                    "cuptiGetStreamId");
    }else{
      vt_warning("[CUPTI] Neither CUDA stream nor stream ID given!");
      free(vtStrm);
      return NULL;
    }
  }
#else /* only VT_CUPTI_CALLBACKS is defined */
  if(vtCtx->callbacks != NULL){
    strmID = vtCtx->callbacks->streamsCreated;
    vtCtx->callbacks->streamsCreated++;
  }
#endif

  vtStrm->cuStrmID = strmID;

  /* create VampirTrace thread */
  {
    char thread_name[16] = "CUDA";

    if(vt_gpu_stream_reuse){
      if(vtCtx->devID != VT_NO_ID){
        if(-1 == snprintf(thread_name+4, 12, "[%d]", vtCtx->devID))
          vt_cntl_msg(1, "Could not create thread name for CUDA thread!");
      }
    }else{
      if(vtCtx->devID == VT_NO_ID){
        if(-1 == snprintf(thread_name+4, 12, "[?:%d]", strmID))
          vt_cntl_msg(1, "Could not create thread name for CUDA thread!");
      }else{
        if(-1 == snprintf(thread_name+4, 12, "[%d:%d]", vtCtx->devID, strmID))
          vt_cntl_msg(1, "Could not create thread name for CUDA thread!");
      }
    }

    VT_CHECK_THREAD;
    vt_gpu_registerThread(thread_name, VT_MY_THREAD, &(vtStrm->vtThrdID));
  }

  if(vt_gpu_init_time < vt_start_time)
    vt_gpu_init_time = vt_start_time;

  /* for the first stream created for this context */
  if(vtCtx->strmList == NULL){
    if(vt_gpu_trace_idle > 0){
      /* write enter event for GPU_IDLE on first stream */
      vt_enter(vtStrm->vtThrdID, &vt_gpu_init_time, vt_gpu_rid_idle);
      /*vt_warning("IDLEente: %llu (%d)", vt_gpu_init_time, vtStrm->vtThrdID);*/
#if defined(VT_CUPTI_ACTIVITY)
      if(vtCtx->activity != NULL)
        vtCtx->activity->gpuIdleOn = 1;
#endif
    }

    /* set the counter value for cudaMalloc to 0 on first stream */
    if(vt_gpu_trace_memusage > 0)
      vt_count(vtStrm->vtThrdID, &vt_gpu_init_time, vt_gpu_cid_memusage, 0);
  }

  if(vt_gpu_trace_kernels > 1){
    /* set count values to zero */
    vt_count(vtStrm->vtThrdID, &vt_gpu_init_time, vt_cupti_cid_blocksPerGrid, 0);
    vt_count(vtStrm->vtThrdID, &vt_gpu_init_time, vt_cupti_cid_threadsPerBlock, 0);
    vt_count(vtStrm->vtThrdID, &vt_gpu_init_time, vt_cupti_cid_threadsPerKernel, 0);
  }

  /* prepend the stream
  vtStrm->next = vtCtx->strmList;
  vtCtx->strmList = vtStrm;*/

  return vtStrm;
}
/*
 * Use the CUPTI activity kernel record to write the corresponding VampirTrace
 * events.
 *
 * @param kernel the CUPTI activity kernel record
 * @param vtCtx the VampirTrace CUPTI activity context
 */
static void vt_cuptiact_writeKernelRecord(CUpti_ActivityKernel *kernel,
                                          vt_cupti_ctx_t *vtCtx)
{
  vt_cupti_activity_t *vtcuptiActivity = vtCtx->activity;
  vt_cuptiact_strm_t *vtStrm = NULL;
  uint32_t vtThrdID = VT_NO_ID;
  uint32_t knRID = VT_NO_ID;
  vt_gpu_hn_string_t *hn = NULL;

  VT_SUSPEND_MALLOC_TRACING(vtCtx->ptid);

  /* get VampirTrace thread ID for the kernel's stream */
  vtStrm = vt_cuptiact_checkStream(vtCtx, kernel->streamId);
  vtThrdID = vtStrm->vtThrdID;

  VT_RESUME_MALLOC_TRACING(vtCtx->ptid);

  /* get the VampirTrace region ID for the kernel */
  hn = vt_gpu_stringHashGet(kernel->name);
  if(hn){
    knRID = hn->rid;
  }else{
    char *knName = vt_cuda_demangleKernel(kernel->name);

    if(knName == NULL || *knName == '\0') {
      knName = (char *)kernel->name;
      if(knName == NULL) knName = "unknownKernel";
    }

    knRID = vt_def_region(VT_MASTER_THREAD, knName, VT_NO_ID, VT_NO_LNO,
                          VT_NO_LNO, "CUDA_KERNEL", VT_FUNCTION);

    hn = vt_gpu_stringHashPut(kernel->name, knRID);
  }

  /* write events */
  {
    uint64_t start = vtcuptiActivity->sync.hostStart
                   + (kernel->start - vtcuptiActivity->sync.gpuStart)
                   * vtcuptiActivity->sync.factor;
    uint64_t stop = start + (kernel->end - kernel->start)
                  * vtcuptiActivity->sync.factor;

    /* if current activity's start time is before last written timestamp */
    if(start < vtStrm->vtLastTime){
      vt_warning("[CUPTI Activity] Kernel: start time < last written timestamp!");
      vt_warning("[CUPTI Activity] Kernel: '%s', CUdevice: %d, "
                 "CUDA stream ID: %d, Thread ID: %d",
                 hn->sname, vtCtx->cuDev, vtStrm->strmID, vtStrm->vtThrdID);

      if(vtStrm->vtLastTime < stop){
        vt_warning("[CUPTI Activity] Set kernel start time to sync-point time"
                   "(cut %.4lf%%)",
                   (double)(vtStrm->vtLastTime - start)/(double)(stop-start));
        start = vtStrm->vtLastTime;
      }else{
        vt_warning("[CUPTI Activity] Skipping ...");
        return;
      }
    }

    /* check if time between start and stop is increasing */
    if(stop < start){
      vt_warning("[CUPTI Activity] Kernel: start time > stop time!");
      vt_warning("[CUPTI Activity] Skipping '%s' on CUDA device:stream [%d:%d],"
                 " Thread ID %d",
                 hn->sname, vtCtx->cuDev, vtStrm->strmID, vtStrm->vtThrdID);
      return;
    }

    /* check if synchronization stop time is before kernel stop time */
    if(vtcuptiActivity->sync.hostStop < stop){
      vt_warning("[CUPTI Activity] Kernel: sync-point time < kernel stop time");
      vt_warning("[CUPTI Activity] Kernel: '%s', CUdevice: %d, "
                 "CUDA stream ID: %d, Thread ID: %d",
                 hn->sname, vtCtx->cuDev, vtStrm->strmID, vtStrm->vtThrdID);

      /* Write kernel with sync.hostStop stop time stamp, if possible */
      if(vtcuptiActivity->sync.hostStop > start){
        vt_warning("[CUPTI Activity] Set kernel-stop-time to sync-point-time "
                   "(cut %.4lf%%)",
                   (double)(stop - vtcuptiActivity->sync.hostStop)
                  /(double)(stop-start));
        stop = vtcuptiActivity->sync.hostStop;
      }else{
        vt_warning("[CUPTI Activity] Skipping ...");
        return;
      }
    }

    /* set the last VampirTrace timestamp, written in this stream */
    vtStrm->vtLastTime = stop;

    /*vt_cntl_msg(1, "'%s'(%d) start: %llu; stop: %llu (tid: %d)",
                  kernel->name, knRID, start, stop, vtThrdID);*/

    /* GPU idle time will be written to first CUDA stream in list */
    if(vt_gpu_trace_idle){
      if(vtcuptiActivity->gpuIdleOn){
        /*vt_warning("IDLEexit: %llu (%d)", start, vtCtx->strmList->vtThrdID);*/
        vt_exit(vtcuptiActivity->strmList->vtThrdID, &start);
        vtcuptiActivity->gpuIdleOn = 0;
      }else if(start > vtcuptiActivity->vtLastGPUTime){
        /* idle is off and kernels are consecutive */
        /*vt_warning("IDLEente: %llu (%d)", vtCtx->vtLastGPUTime, vtCtx->strmList->vtThrdID);
          vt_warning("IDLEexit: %llu (%d)", start, vtCtx->strmList->vtThrdID);*/
        vt_enter(vtcuptiActivity->strmList->vtThrdID,
                 &(vtcuptiActivity->vtLastGPUTime), vt_gpu_rid_idle);
        vt_exit(vtcuptiActivity->strmList->vtThrdID, &start);
      }
    }

    vt_enter(vtThrdID, &start, knRID);
    /*vt_warning("KERNente: %llu (%d)", start, vtThrdID);*/

    /* use counter to provide additional information for kernels */
    if(vt_gpu_trace_kernels > 1){
      /* grid and block size counter (start) */
      {
        uint32_t threadsPerBlock = kernel->blockX * kernel->blockY * kernel->blockZ;
        uint32_t blocksPerGrid = kernel->gridX * kernel->gridY * kernel->gridZ;

        vt_count(vtThrdID, &start, vt_cupti_cid_blocksPerGrid, blocksPerGrid);
        vt_count(vtThrdID, &start, vt_cupti_cid_threadsPerBlock, threadsPerBlock);
        vt_count(vtThrdID, &start, vt_cupti_cid_threadsPerKernel,
                 threadsPerBlock * blocksPerGrid);
      }

      /* memory counter (start) */
      vt_count(vtThrdID, &start, vt_cuptiact_cid_knStaticSharedMem,
               kernel->staticSharedMemory);
      vt_count(vtThrdID, &start, vt_cuptiact_cid_knDynamicSharedMem,
               kernel->dynamicSharedMemory);
      vt_count(vtThrdID, &start, vt_cuptiact_cid_knLocalMemTotal,
               kernel->localMemoryTotal);
      vt_count(vtThrdID, &start, vt_cuptiact_cid_knRegistersPerThread,
               kernel->registersPerThread);

      /* memory counter (stop) */
      vt_count(vtThrdID, &stop, vt_cuptiact_cid_knStaticSharedMem, 0);
      vt_count(vtThrdID, &stop, vt_cuptiact_cid_knDynamicSharedMem, 0);
      vt_count(vtThrdID, &stop, vt_cuptiact_cid_knLocalMemTotal, 0);
      vt_count(vtThrdID, &stop, vt_cuptiact_cid_knRegistersPerThread, 0);

      /* grid and block size counter (stop) */
      vt_count(vtThrdID, &stop, vt_cupti_cid_blocksPerGrid, 0);
      vt_count(vtThrdID, &stop, vt_cupti_cid_threadsPerBlock, 0);
      vt_count(vtThrdID, &stop, vt_cupti_cid_threadsPerKernel, 0);
    }

    vt_exit(vtThrdID, &stop);
    /*vt_warning("KERNexit: %llu (%d)", stop, vtThrdID);*/

    if(vtcuptiActivity->vtLastGPUTime < stop)
      vtcuptiActivity->vtLastGPUTime = stop;
  }

  /*vt_cntl_msg(1, "KERNEL '%s' [%llu ns] device %u, context %u, stream %u, "
                   "correlation %u/r%u\n"
                   "\t grid [%u,%u,%u], block [%u,%u,%u], "
                   "shared memory (static %u, dynamic %u)",
                   kernel->name, (unsigned long long)(kernel->end - kernel->start),
                   kernel->deviceId, kernel->contextId, kernel->streamId,
                   kernel->correlationId, kernel->runtimeCorrelationId,
                   kernel->gridX, kernel->gridY, kernel->gridZ,
                   kernel->blockX, kernel->blockY, kernel->blockZ,
                   kernel->staticSharedMemory, kernel->dynamicSharedMemory);*/
}
/*
 * Use the CUPTI activity memory copy record to write the corresponding
 * VampirTrace events.
 *
 * @param mcpy the CUPTI activity memory copy record
 * @param vtCtx the VampirTrace CUPTI activity context
 */
static void vt_cuptiact_writeMemcpyRecord(CUpti_ActivityMemcpy *mcpy,
                                          vt_cupti_ctx_t *vtCtx)
{
  vt_cupti_activity_t *vtcuptiActivity = vtCtx->activity;
  vt_gpu_copy_kind_t kind = VT_GPU_COPYDIRECTION_UNKNOWN;
  uint32_t vtThrdID;
  uint64_t start, stop;
  vt_cuptiact_strm_t *vtStrm = NULL;

  /*vt_cntl_msg(1,"mcpycopykind: %d (strm %d)", mcpy->copyKind, mcpy->streamId);*/
  if(mcpy->copyKind == CUPTI_ACTIVITY_MEMCPY_KIND_DTOD) return;

  start = vtcuptiActivity->sync.hostStart
        + (mcpy->start - vtcuptiActivity->sync.gpuStart)
        * vtcuptiActivity->sync.factor;
  stop = start + (mcpy->end - mcpy->start) * vtcuptiActivity->sync.factor;

  VT_SUSPEND_MALLOC_TRACING(vtCtx->ptid);

  /* get VampirTrace thread ID for the kernel's stream */
  vtStrm = vt_cuptiact_checkStream(vtCtx, mcpy->streamId);
  vtThrdID = vtStrm->vtThrdID;

  VT_RESUME_MALLOC_TRACING(vtCtx->ptid);

  /* if current activity's start time is before last written timestamp */
  if(start < vtStrm->vtLastTime){
    vt_warning("[CUPTI Activity] Memcpy: start time < last written timestamp! "
               "(CUDA device:stream [%d:%d], Thread ID: %d)",
               vtCtx->cuDev, vtStrm->strmID, vtStrm->vtThrdID);

    if(vtStrm->vtLastTime < stop){
      vt_warning("[CUPTI Activity] Set memcpy start time to sync-point time"
                 "(truncate %.4lf%%)",
                 (double)(vtStrm->vtLastTime - start)/(double)(stop - start));
      start = vtStrm->vtLastTime;
    }else{
      vt_warning("[CUPTI Activity] Skipping ...");
      return;
    }
  }

  /* check if time between start and stop is increasing */
  if(stop < start){
    vt_warning("[CUPTI Activity] Skipping memcpy (start time > stop time) on "
               "CUdevice:Stream %d:%d, Thread ID %d",
               vtCtx->cuDev, vtStrm->strmID, vtStrm->vtThrdID);
    return;
  }

  /* check if synchronization stop time is before kernel stop time */
  if(vtcuptiActivity->sync.hostStop < stop){
    vt_warning("[CUPTI Activity] Memcpy: sync stop time < stop time! "
               "(CUDA device:stream [%d:%d], Thread ID: %d)",
               vtCtx->cuDev, vtStrm->strmID, vtStrm->vtThrdID);

    /* Write memcpy with sync.hostStop stop time stamp, if possible */
    if(vtcuptiActivity->sync.hostStop > start){
      vt_warning("[CUPTI Activity] Set memcpy-stop-time to sync-point-time "
                 "(truncate %.4lf%%)",
                 (double)(stop - vtcuptiActivity->sync.hostStop)/
                 (double)(stop - start));
      stop = vtcuptiActivity->sync.hostStop;
    }else{
      vt_warning("[CUPTI Activity] Skipping ...");
      return;
    }
  }

  /* set the last VampirTrace timestamp, written in this stream */
  vtStrm->vtLastTime = stop;

  /* check copy direction */
  if(mcpy->srcKind == CUPTI_ACTIVITY_MEMORY_KIND_DEVICE){
    if(mcpy->dstKind == CUPTI_ACTIVITY_MEMORY_KIND_DEVICE){
      kind = VT_GPU_DEV2DEV;
    }else{
      kind = VT_GPU_DEV2HOST;
    }
  }else{
    if(mcpy->dstKind == CUPTI_ACTIVITY_MEMORY_KIND_DEVICE){
      kind = VT_GPU_HOST2DEV;
    }else{
      kind = VT_GPU_HOST2HOST;
    }
  }

  if(vtcuptiActivity->gpuIdleOn == 0 &&
     mcpy->streamId == vtcuptiActivity->defaultStrmID){
    vt_enter(vtcuptiActivity->strmList->vtThrdID,
             &(vtcuptiActivity->vtLastGPUTime), vt_gpu_rid_idle);
    vtcuptiActivity->gpuIdleOn = 1;
    /*vt_warning("IDLMente: %llu (%d)", vtCtx->vtLastGPUTime,
                 vtCtx->strmList->vtThrdID);*/
  }

  /*VT_CUPTI_LOCK();*/
  if(kind != VT_GPU_DEV2DEV) vt_gpu_prop[vtCtx->ptid] |= VTGPU_GPU_COMM;
  vt_gpu_prop[vtThrdID] |= VTGPU_GPU_COMM;
  /*VT_CUPTI_UNLOCK();*/

  /*
  vt_warning("MCPYente: %llu (%d)", start, vtThrdID);
  vt_warning("MCPYexit: %llu (%d)", stop, vtThrdID);
  */

  if(kind == VT_GPU_HOST2DEV){
    vt_mpi_rma_get(vtThrdID, &start, VT_GPU_RANK_ID(vtCtx->ptid),
                   vt_gpu_commCID, 0, mcpy->bytes);
  }else if(kind == VT_GPU_DEV2HOST){
    vt_mpi_rma_put(vtThrdID, &start, VT_GPU_RANK_ID(vtCtx->ptid),
                   vt_gpu_commCID, 0, mcpy->bytes);
  }else if(kind == VT_GPU_DEV2DEV){
    vt_mpi_rma_get(vtThrdID, &start, VT_GPU_RANK_ID(vtThrdID),
                   vt_gpu_commCID, 0, mcpy->bytes);
  }

  if(kind != VT_GPU_HOST2HOST){
    vt_mpi_rma_end(vtThrdID, &stop, vt_gpu_commCID, 0);
  }

  /*if(vtCtx->vtLastGPUTime < stop) vtCtx->vtLastGPUTime = stop;*/

  /*vt_cntl_msg(1, "MEMCPY %llu -> %llu[%llu ns] device %u, context %u, stream %u, "
                   "correlation %u/r%u",
                   mcpy->start, mcpy->end,
                   (unsigned long long)(mcpy->end - mcpy->start),
                   mcpy->deviceId, mcpy->contextId, mcpy->streamId,
                   mcpy->correlationId, mcpy->runtimeCorrelationId);*/
}
static int get_new_trcid()
{
  int new_trcid;
  int fd;
  int8_t tmp_len;
  struct flock fl;
  char tmp[10] = "";
  uint8_t do_unlock = 1;

  vt_libassert(trcid_filename[0] != '\0');

  VT_SUSPEND_IO_TRACING(VT_CURRENT_THREAD);

  /* open/create temp. id file */
  if ( (fd = open(trcid_filename, (O_RDWR | O_CREAT),
                  (S_IRUSR | S_IWUSR))) == -1 )
    vt_error_msg("Cannot open file %s: %s", trcid_filename, strerror(errno));

  /* lock temp. id file */
  fl.l_type = F_WRLCK;
  fl.l_whence = SEEK_SET;
  fl.l_start = 0;
  fl.l_len = 0;
  if (fcntl(fd, F_SETLKW, &fl) == -1)
  {
    do_unlock = 0;
    vt_warning("Cannot lock file %s: %s", trcid_filename, strerror(errno));
  }

  /* read current trace id */
  if ( read(fd, tmp, 9) == -1 )
    vt_error_msg("Cannot read file %s: %s", trcid_filename, strerror(errno));
  tmp[9] = '\0';

  if ( tmp[0] == '\0' )
    new_trcid = 1;              /* set trace id to 1, if file is empty */
  else
    new_trcid = atoi(tmp) + 1;  /* increment trace id */

  /* write new trace id */
  lseek(fd, 0, SEEK_SET);
  snprintf(tmp, sizeof(tmp)-1, "%i\n", new_trcid);
  tmp_len = strlen( tmp );
  if( tmp_len > write( fd, tmp, tmp_len ) )
  {
    vt_error_msg( "Failed to write to file %s: %s",
                  trcid_filename, strerror(errno) );
  }

  /* unlock temp. id file */
  if ( do_unlock )
  {
    fl.l_type = F_UNLCK;
    if ( fcntl(fd, F_SETLK, &fl) == -1 )
      vt_error_msg("Cannot unlock file %s: %s", trcid_filename, strerror(errno));
  }

  /* close temp. id file */
  close(fd);

  vt_cntl_msg(2, "Updated trace-id in %s to %i", trcid_filename, new_trcid);

  VT_RESUME_IO_TRACING(VT_CURRENT_THREAD);

  return new_trcid;
}
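/*
 * Illustrative sketch (not part of VampirTrace): get_new_trcid() serializes
 * concurrent processes with a POSIX advisory write lock (fcntl/F_SETLKW) on
 * the id file around the read-increment-write sequence. The standalone
 * example below shows the same lock/unlock pattern using only POSIX calls;
 * the function name and simplified error handling are hypothetical.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

/* Atomically increment a counter stored as text in 'path';
 * returns the new value or -1 on error. */
static int increment_locked_counter(const char *path)
{
  char buf[16] = "";
  int value;
  struct flock fl;
  ssize_t n;
  int fd = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
  if (fd == -1) return -1;

  /* block until an exclusive (write) lock on the whole file is granted */
  memset(&fl, 0, sizeof(fl));
  fl.l_type = F_WRLCK;
  fl.l_whence = SEEK_SET;
  if (fcntl(fd, F_SETLKW, &fl) == -1) { close(fd); return -1; }

  /* read the current value (an empty file counts as 0), then store value+1 */
  n = read(fd, buf, sizeof(buf) - 1);
  value = (n > 0) ? atoi(buf) + 1 : 1;
  lseek(fd, 0, SEEK_SET);
  snprintf(buf, sizeof(buf), "%d\n", value);
  if (write(fd, buf, strlen(buf)) == -1) value = -1;

  /* release the lock and close the descriptor */
  fl.l_type = F_UNLCK;
  fcntl(fd, F_SETLK, &fl);
  close(fd);
  return value;
}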