/*
 * Reset the VampirTrace counter values (to zero) for active CUPTI counters.
 *
 * @param vtcuptiEvtCtx pointer to the VampirTrace CUPTI events context
 * @param strmid the stream id for the counter values
 * @param time the VampirTrace timestamps
 */
void vt_cuptievt_resetCounter(vt_cupti_events_t *vtcuptiEvtCtx,
                              uint32_t strmid, uint64_t *time)
{
  size_t i;
  vt_cupti_evtgrp_t *vtcuptiGrp = NULL;

  /* create a VampirTrace CUPTI events context, if it is not available */
  if(vtcuptiEvtCtx == NULL){
    VT_CHECK_THREAD;
    vtcuptiEvtCtx = vt_cuptievt_getOrCreateCurrentCtx(VT_MY_THREAD)->events;
    if(vtcuptiEvtCtx == NULL) return;
  }

  vtcuptiGrp = vtcuptiEvtCtx->vtGrpList;
  while(vtcuptiGrp != NULL){
    for(i = 0; i < vtcuptiGrp->evtNum; i++){
      vt_count(strmid, time, *(vtcuptiGrp->vtCIDs+i), 0);
    }

    /* reset counter values of this group */
    VT_CUPTI_CALL(cuptiEventGroupResetAllEvents(vtcuptiGrp->evtGrp),
                  "cuptiEventGroupResetAllEvents");

    vtcuptiGrp = vtcuptiGrp->next;
  }
}
/*
 * Reset the VampirTrace counter values (to zero) for active CUPTI counters.
 *
 * @param vtcuptiCtx pointer to the VampirTrace CUPTI context
 * @param strmid the stream id for the counter values
 * @param time the VampirTrace timestamps
 */
void vt_cupti_resetCounter(vt_cupti_ctx_t *vtcuptiCtx, uint32_t strmid,
                           uint64_t *time)
{
  size_t i;
  vt_cupti_grp_t *vtcuptiGrp = NULL;

  if(vtcuptiCtx == NULL){
    VT_CHECK_THREAD;
    vtcuptiCtx = vt_cupti_getCurrentContext(VT_MY_THREAD);
    if(vtcuptiCtx == NULL) return;
  }

  vtcuptiGrp = vtcuptiCtx->vtGrpList;
  while(vtcuptiGrp != NULL){
    for(i = 0; i < vtcuptiGrp->evtNum; i++){
      vt_count(strmid, time, *(vtcuptiGrp->vtCIDs+i), 0);
    }

    /* reset counter values of this group */
    CHECK_CUPTI_ERROR(cuptiEventGroupResetAllEvents(vtcuptiGrp->evtGrp),
                      "cuptiEventGroupResetAllEvents");

    vtcuptiGrp = vtcuptiGrp->next;
  }
}
/* * Increases the "Allocated CUDA memory" counter. * * @param ctxUID CUDA context identifier (@see CUPTI callback info) * @param devPtr pointer to the allocated memory (needed for vtcudaFree()) * @param size the number of bytes allocated */ void vt_cuptiact_writeMalloc(uint32_t ctxID, CUcontext cuCtx, void *devPtr, size_t size) { uint64_t vtTime; vt_cuptiact_ctx_t* vtCtx = NULL; vt_cuptiact_gpumem_t *vtMalloc = (vt_cuptiact_gpumem_t*)malloc(sizeof(vt_cuptiact_gpumem_t)); if(devPtr == NULL) return; /* flush activity buffer */ vt_cuptiact_flushCtxActivities(cuCtx); vtMalloc->memPtr = devPtr; vtMalloc->size = size; vtCtx = vt_cuptiact_getCtx(cuCtx); if(vtCtx == NULL){ vtCtx = vt_cuptiact_createContext(ctxID, cuCtx, (uint32_t)-1); } /* lock the work on the context */ VT_CUPTI_ACT_LOCK(); /* add malloc entry to list */ vtMalloc->next = vtCtx->gpuMemList; vtCtx->gpuMemList = vtMalloc; /* increase allocated memory counter */ vtCtx->gpuMemAllocated += size; /* check if first CUDA stream is available */ if(vtCtx->strmList == NULL){ if(vt_gpu_init_time < vt_start_time) vt_gpu_init_time = vt_start_time; vtCtx->strmList = vt_cuptiact_createStream(vtCtx, vtCtx->defaultStrmID); vt_count(vtCtx->strmList->vtThrdID, &vt_gpu_init_time, vt_gpu_cid_memusage, 0); } VT_CUPTI_ACT_UNLOCK(); /* write counter value */ vtTime = vt_pform_wtime(); vt_count(vtCtx->strmList->vtThrdID, &vtTime, vt_gpu_cid_memusage, (uint64_t)(vtCtx->gpuMemAllocated)); }
void* vt_malloc_hook(size_t size, const void* caller)
{
  void* result;
  uint64_t bytes;
  uint64_t time;
  uint8_t was_recorded;

  VT_MEMHOOKS_OFF(); /* restore original hooks */

  time = vt_pform_wtime();
  was_recorded = vt_enter(VT_CURRENT_THREAD, &time,
                          memhook_regid[MEMHOOK_REG_MALLOC]);

  result = malloc(size); /* call recursively */

  /* get total allocated memory: read the size field of the glibc heap chunk
     preceding the user pointer and mask off the low-order status bits */
  if ( result != NULL )
  {
    bytes = ( ~ (uint64_t) 3 ) &
            (uint64_t) *( (size_t*) ( (char*)result - SIZEOF_VOIDP ) );
  }
  else
  {
    bytes = 0;
  }

  /* update counter value */
  memalloc_val += bytes;

  time = vt_pform_wtime();

  if ( was_recorded && bytes > 0 )
  {
    /* write marker, if desired */
    if( memalloc_marker )
    {
      vt_marker(VT_CURRENT_THREAD, &time, memalloc_mid[MEMHOOK_MARK_ALLOC],
                "Allocated %llu Bytes", (unsigned long long)bytes);
    }

    /* write counter value */
    vt_count(VT_CURRENT_THREAD, &time, memalloc_cid, memalloc_val);
  }

  vt_exit(VT_CURRENT_THREAD, &time);

  VT_MEMHOOKS_ON(); /* restore our own hooks */

  return result;
}
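/*
 * Aside: a minimal, standalone sketch of the (now deprecated) glibc
 * __malloc_hook protocol that vt_malloc_hook plugs into. This is NOT the
 * VampirTrace implementation; it only illustrates why the hook has to be
 * swapped out before calling malloc() recursively and reinstalled afterwards,
 * which is roughly what VT_MEMHOOKS_OFF()/VT_MEMHOOKS_ON() are assumed to do.
 */
#include <malloc.h>
#include <stdio.h>

static void* (*org_malloc_hook)(size_t, const void*);

static void* my_malloc_hook(size_t size, const void* caller)
{
  void* result;

  __malloc_hook = org_malloc_hook;  /* uninstall ourselves */
  result = malloc(size);            /* safe now: calls the real malloc */
  fprintf(stderr, "malloc(%zu) -> %p\n", size, result);
  org_malloc_hook = __malloc_hook;  /* the hook may have been changed */
  __malloc_hook = my_malloc_hook;   /* reinstall ourselves */

  return result;
}

static void install_malloc_hook(void)
{
  org_malloc_hook = __malloc_hook;
  __malloc_hook = my_malloc_hook;
}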
VT_DECLDEF(void VT_User_count_double_val___f(unsigned int* cid, double* val))
{
  uint64_t time;
  uint64_t cval;

  VT_INIT;

  VT_MEMHOOKS_OFF();

  time = vt_pform_wtime();
  cval = OTF_Double2Counter(*val);
  vt_count(VT_CURRENT_THREAD, &time, *cid, cval);

  VT_MEMHOOKS_ON();
} VT_GENERATE_F77_BINDINGS(vt_user_count_double_val__,
void VT_User_count_double_val__(unsigned int cid, double val)
{
  uint64_t time;
  uint64_t cval;

  VT_INIT;

  VT_MEMHOOKS_OFF();

  time = vt_pform_wtime();
  cval = OTF_Double2Counter(val);
  vt_count(VT_CURRENT_THREAD, &time, cid, cval);

  VT_MEMHOOKS_ON();
}
void VT_User_count_unsigned_val__(unsigned int cid, unsigned long long val)
{
  uint64_t time;
  uint64_t cval;

  VT_INIT;

  VT_MEMHOOKS_OFF();

  time = vt_pform_wtime();
  cval = OTF_Unsigned2Counter((uint64_t)val);
  vt_count(VT_CURRENT_THREAD, &time, cid, cval);

  VT_MEMHOOKS_ON();
}
void vt_free_hook(void* ptr, const void* caller)
{
  uint64_t bytes;
  uint64_t time;
  uint8_t was_recorded;

  VT_MEMHOOKS_OFF(); /* restore original hooks */

  time = vt_pform_wtime();
  was_recorded = vt_enter(VT_CURRENT_THREAD, &time,
                          memhook_regid[MEMHOOK_REG_FREE]);

  if ( NULL != ptr )
  {
    bytes = ( ~ (uint64_t) 3 ) &
            (uint64_t) *( (size_t*) ( (char*)ptr - SIZEOF_VOIDP ) );
  }
  else
  {
    bytes = 0;
  }

  free(ptr); /* call recursively */

  /* update counter value */
  if ( bytes <= memalloc_val )
    memalloc_val -= bytes;
  else
    memalloc_val = 0;

  time = vt_pform_wtime();

  if ( was_recorded && bytes > 0 )
  {
    /* write marker, if desired */
    if( memalloc_marker )
    {
      vt_marker(VT_CURRENT_THREAD, &time, memalloc_mid[MEMHOOK_MARK_FREE],
                "Freed %llu Bytes", (unsigned long long)bytes);
    }

    /* write counter value */
    vt_count(VT_CURRENT_THREAD, &time, memalloc_cid, memalloc_val);
  }

  vt_exit(VT_CURRENT_THREAD, &time);

  VT_MEMHOOKS_ON(); /* restore our own hooks */
}
/*
 * Request the CUPTI counter values and write them to the given VampirTrace
 * stream with the given timestamps.
 *
 * @param vtcuptiEvtCtx pointer to the VampirTrace CUPTI events context
 * @param strmid the stream id for the counter values
 * @param time the VampirTrace timestamps
 */
void vt_cuptievt_writeCounter(vt_cupti_events_t *vtcuptiEvtCtx,
                              uint32_t strmid, uint64_t *time)
{
  CUptiResult cuptiErr = CUPTI_SUCCESS;
  vt_cupti_evtgrp_t *vtcuptiGrp = NULL;

  size_t bufferSizeBytes;
  size_t arraySizeBytes;
  size_t numCountersRead;

  if(vtcuptiEvtCtx == NULL){
    VT_CHECK_THREAD;
    vtcuptiEvtCtx = vt_cuptievt_getOrCreateCurrentCtx(VT_MY_THREAD)->events;
    if(vtcuptiEvtCtx == NULL) return;
  }

  vtcuptiGrp = vtcuptiEvtCtx->vtGrpList;
  while(vtcuptiGrp != NULL){
    /* read events only, if the event group is enabled */
    if(vtcuptiGrp->enabled){

      bufferSizeBytes = vtcuptiGrp->evtNum * sizeof(uint64_t);
      arraySizeBytes = vtcuptiGrp->evtNum * sizeof(CUpti_EventID);

      /* read events */
      cuptiErr = cuptiEventGroupReadAllEvents(vtcuptiGrp->evtGrp,
                                              CUPTI_EVENT_READ_FLAG_NONE,
                                              &bufferSizeBytes,
                                              vtcuptiEvtCtx->counterData,
                                              &arraySizeBytes,
                                              vtcuptiEvtCtx->cuptiEvtIDs,
                                              &numCountersRead);
      VT_CUPTI_CALL(cuptiErr, "cuptiEventGroupReadAllEvents");

      if(vtcuptiGrp->evtNum != numCountersRead){
        vt_error_msg("[CUPTI Events] %d counter reads, %d metrics specified "
                     "in VT_CUPTI_METRICS!",
                     (int)numCountersRead, (int)vtcuptiGrp->evtNum);
      }

      /* For all events of the event group: map added event IDs to just read
       * event IDs, as the order may not be the same. For small numbers of
       * counter reads this simple mapping should be fast enough. */
      {
        size_t j;

        for(j = 0; j < numCountersRead; j++){
          size_t i;
          for(i = 0; i < vtcuptiGrp->evtNum; i++){
            if(vtcuptiEvtCtx->cuptiEvtIDs[j] == *(vtcuptiGrp->cuptiEvtIDs+i)){
              /* write the counter value as VampirTrace counter; index with j,
                 as counterData is ordered like the event IDs just read */
              vt_count(strmid, time, *(vtcuptiGrp->vtCIDs+i),
                       vtcuptiEvtCtx->counterData[j]);
            }
          }
        }
      }
    }

    vtcuptiGrp = vtcuptiGrp->next;
  }
}
/* -- stdlib.h:realloc -- */

void* realloc(void* ptr, size_t size)
{
  void* ret;

  /* initialize this wrapper function */
  MALLOCWRAP_FUNC_INIT("realloc", void*, (void*, size_t));

  /* once, get the actual function pointer */
  MALLOCWRAP_GET_FUNC_PTR();

  if( MALLOCWRAP_DO_TRACE() )
  {
    uint32_t tid;
    uint64_t time;
    uint64_t bytes;
    uint64_t bytes1;
    uint64_t bytes2;
    uint64_t* counter_val;
    uint8_t was_recorded;

    /* get calling thread id */
    tid = VT_MY_THREAD;

    /* suspend LIBC memory (de)allocation tracing */
    VT_SUSPEND_MALLOC_TRACING(tid);

    /* get current timestamp for the following function enter event */
    time = vt_pform_wtime();

    /* once, get unique function identifier */
    MALLOCWRAP_GET_FUNC_ID();

    /* record function enter event */
    was_recorded = vt_enter(tid, &time, VT_LIBWRAP_FUNC_ID);

    /* get total allocated memory before realloc */
    if( ptr != NULL )
    {
      /* bytes1 = ( ~ (uint64_t) 3 ) &
           (uint64_t) *( (size_t*) ( (char*)ptr - SIZEOF_VOIDP ) );*/
      bytes1 = (uint64_t)malloc_usable_size(ptr);
    }
    else
    {
      bytes1 = bytes = 0;
    }

    /* call the actual library function */
    ret = MALLOCWRAP_FUNC_CALL((ptr, size));

    /* get total allocated memory after realloc */
    if( ret != NULL )
    {
      /* bytes2 = ( ~ (uint64_t) 3 ) &
           (uint64_t) *( (size_t*) ( (char*)ret - SIZEOF_VOIDP ) );*/
      bytes2 = (uint64_t)malloc_usable_size(ret);
      bytes = bytes2 < bytes1 ? bytes1 - bytes2 : bytes2 - bytes1;
    }
    else
    {
      bytes2 = bytes = 0;
    }

    /* get pointer to thread's memory allocation counter value and update */
    counter_val = &(VTTHRD_MALLOC_TRACING_COUNTER_VAL(VTThrdv[tid]));
    if( bytes2 < bytes1 )
    {
      if( bytes <= *counter_val )
        *counter_val -= bytes;
      else
        *counter_val = 0;
    }
    else
    {
      *counter_val += bytes;
    }

    /* get timestamp for the following function exit event [+ marker] */
    time = vt_pform_wtime();

    if( was_recorded && bytes > 0 )
    {
      /* write marker, if desired */
      if( mallocwrap_write_markers )
      {
        static const char* marker_prefix_alloced = "Allocated";
        static const char* marker_prefix_freed = "Freed";

        uint32_t marker_id;
        const char* marker_prefix;

        if ( bytes2 < bytes1 )
        {
          marker_id = mallocwrap_marker_free_id;
          marker_prefix = marker_prefix_freed;
        }
        else
        {
          marker_id = mallocwrap_marker_alloc_id;
          marker_prefix = marker_prefix_alloced;
        }

        vt_marker(tid, &time, marker_id, "%s %llu Bytes",
                  marker_prefix, (unsigned long long)bytes);
      }

      /* write counter value */
      vt_count(tid, &time, mallocwrap_counter_id, *counter_val);
    }

    /* record function exit event */
    vt_exit(tid, &time);

    /* resume LIBC memory (de)allocation tracing */
    VT_RESUME_MALLOC_TRACING(tid);
  }
  else
  {
    /* call the actual library function */
    ret = MALLOCWRAP_FUNC_CALL((ptr, size));
  }

  /* get errno from external LIBC (not necessary if using RTLD_NEXT) */
  /*errno = vt_libwrap_get_libc_errno();*/

  return ret;
}
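/*
 * Aside: a minimal sketch of the interposition idea behind the wrapper above,
 * assuming an LD_PRELOAD setup where the real realloc() is resolved via
 * dlsym(RTLD_NEXT). The MALLOCWRAP_* macros are opaque here, so this is only
 * a plausible reading, not the actual VampirTrace mechanism. Compile into a
 * shared object and preload it; link with -ldl.
 */
#define _GNU_SOURCE
#include <dlfcn.h>
#include <malloc.h>
#include <stddef.h>

static void* (*real_realloc)(void*, size_t);
static size_t allocated; /* crude process-wide counter (not thread-safe) */

void* realloc(void* ptr, size_t size)
{
  size_t before, after;
  void* ret;

  /* once, resolve the next realloc in the lookup chain (the real one) */
  if( real_realloc == NULL )
    real_realloc = (void* (*)(void*, size_t))dlsym(RTLD_NEXT, "realloc");

  before = (ptr != NULL) ? malloc_usable_size(ptr) : 0;
  ret = real_realloc(ptr, size);
  after = (ret != NULL) ? malloc_usable_size(ret) : 0;

  /* account the difference, as the wrapper above does
     (no underflow clamping here, for brevity) */
  if( after >= before )
    allocated += after - before; /* grown */
  else
    allocated -= before - after; /* shrunk */

  return ret;
}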
/* -- stdlib.h:calloc -- */

void* calloc(size_t nmemb, size_t size)
{
  void* ret;

  /* initialize this wrapper function */
  MALLOCWRAP_FUNC_INIT("calloc", void*, (size_t, size_t));

  /* once, get the actual function pointer

     NOTE: The dlsym function, which is used to determine the actual function
     pointer of calloc, itself uses calloc, which would end up in an infinite
     recursion. In order to make it work we have to perform a quite dirty
     hack found on http://blog.bigpixel.ro/2010/09/interposing-calloc-on-linux:
     While we are trying to get the actual function pointer, we return NULL
     for the memory which needs to be allocated by dlsym, in the hope that
     dlsym can handle this situation. If this workaround causes any problems,
     just undefine the MALLOCWRAP_CALLOC macro above to disable the calloc
     wrapper function completely. */
  if( VT_LIBWRAP_FUNC_PTR == VT_LIBWRAP_NULL )
  {
    /* flag indicating that we are trying to get the actual function pointer
       of calloc */
    static uint8_t getting_func_ptr = 0;

    if( !getting_func_ptr )
    {
      /* before trying to get the actual function pointer of calloc, set an
         indicator in order to return NULL from the next calloc called from
         dlsym */
      getting_func_ptr = 1;
      VTLibwrap_func_init(mallocwrap_lw, VT_LIBWRAP_FUNC_NAME, NULL, 0,
                          (void**)(&VT_LIBWRAP_FUNC_PTR), NULL);
      getting_func_ptr = 0;
    }
    else
    {
      /* assuming that this calloc is called from dlsym, return NULL */
      return NULL;
    }
  }

  if( MALLOCWRAP_DO_TRACE() )
  {
    uint32_t tid;
    uint64_t time;
    uint64_t bytes;
    uint64_t* counter_val;
    uint8_t was_recorded;

    /* get calling thread id */
    tid = VT_MY_THREAD;

    /* suspend LIBC memory (de)allocation tracing */
    VT_SUSPEND_MALLOC_TRACING(tid);

    /* get current timestamp for the following function enter event */
    time = vt_pform_wtime();

    /* once, get unique function identifier */
    MALLOCWRAP_GET_FUNC_ID();

    /* record function enter event */
    was_recorded = vt_enter(tid, &time, VT_LIBWRAP_FUNC_ID);

    /* call the actual library function */
    ret = MALLOCWRAP_FUNC_CALL((nmemb, size));

    /* get total allocated memory */
    if( ret != NULL )
    {
      /* bytes = ( ~ (uint64_t) 3 ) &
           (uint64_t) *( (size_t*) ( (char*)ret - SIZEOF_VOIDP ) );*/
      bytes = (uint64_t)malloc_usable_size(ret);
    }
    else
    {
      bytes = 0;
    }

    /* get pointer to thread's memory allocation counter value and update */
    counter_val = &(VTTHRD_MALLOC_TRACING_COUNTER_VAL(VTThrdv[tid]));
    *counter_val += bytes;

    /* get timestamp for the following function exit event [+ marker] */
    time = vt_pform_wtime();

    if( was_recorded && bytes > 0 )
    {
      /* write marker, if desired */
      if( mallocwrap_write_markers )
      {
        vt_marker(tid, &time, mallocwrap_marker_alloc_id,
                  "Allocated %llu Bytes", (unsigned long long)bytes);
      }

      /* write counter value */
      vt_count(tid, &time, mallocwrap_counter_id, *counter_val);
    }

    /* record function exit event */
    vt_exit(tid, &time);

    /* resume LIBC memory (de)allocation tracing */
    VT_RESUME_MALLOC_TRACING(tid);
  }
  else
  {
    /* call the actual library function */
    ret = MALLOCWRAP_FUNC_CALL((nmemb, size));
  }

  /* get errno from external LIBC (not necessary if using RTLD_NEXT) */
  /*errno = vt_libwrap_get_libc_errno();*/

  return ret;
}
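/*
 * Aside: the calloc/dlsym bootstrap hack from the NOTE above, reduced to a
 * standalone sketch. dlsym() may itself call calloc(); while the real
 * function pointer is being resolved, the wrapper answers that one nested
 * call with NULL and relies on dlsym() coping with it.
 */
#define _GNU_SOURCE
#include <dlfcn.h>
#include <stddef.h>

static void* (*real_calloc)(size_t, size_t);

void* calloc(size_t nmemb, size_t size)
{
  static int resolving = 0; /* set while dlsym() is in progress */

  if( real_calloc == NULL )
  {
    if( resolving )
      return NULL; /* nested call from dlsym: return NULL, hope it copes */

    resolving = 1;
    real_calloc = (void* (*)(size_t, size_t))dlsym(RTLD_NEXT, "calloc");
    resolving = 0;
  }

  return real_calloc(nmemb, size);
}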
/* * Increases the "Allocated CUDA memory" counter. * * @param ctxUID CUDA context identifier (@see CUPTI callback info) * @param devPtr pointer to the allocated memory (needed for vtcudaFree()) * @param size the number of bytes allocated */ void vt_cuptiact_writeMalloc(uint32_t ctxID, CUcontext cuCtx, void *devPtr, size_t size) { uint64_t vtTime; vt_cupti_ctx_t* vtCtx = NULL; vt_cupti_activity_t *vtcuptiActivity = NULL; vt_cupti_gpumem_t *vtMalloc = NULL; if(devPtr == NULL) return; VT_SUSPEND_MALLOC_TRACING(VT_CURRENT_THREAD); vtMalloc = (vt_cupti_gpumem_t*)malloc(sizeof(vt_cupti_gpumem_t)); vtMalloc->memPtr = devPtr; vtMalloc->size = size; /* check for VampirTrace CUPTI context */ vtCtx = vt_cupti_getCtx(cuCtx); if(vtCtx == NULL){ vtCtx = vt_cupti_createCtx(cuCtx, VT_CUPTI_NO_CUDA_DEVICE, ctxID, VT_CUPTI_NO_DEVICE_ID); vt_cupti_prependCtx(vtCtx); } /* check for VampirTrace CUPTI activity context */ if(vtCtx->activity == NULL){ vtCtx->activity = vt_cuptiact_createCtxActivity(cuCtx); } vtcuptiActivity = vtCtx->activity; /* lock the work on the context */ VT_CUPTI_LOCK(); /* flush activity buffer */ vt_cuptiact_flushCtxActivities(vtCtx); /* add malloc entry to list */ vtMalloc->next = vtcuptiActivity->gpuMemList; vtcuptiActivity->gpuMemList = vtMalloc; /* increase allocated memory counter */ vtcuptiActivity->gpuMemAllocated += size; /* check if first CUDA stream is available */ if(vtcuptiActivity->strmList == NULL){ if(vt_gpu_init_time < vt_start_time) vt_gpu_init_time = vt_start_time; vtcuptiActivity->strmList = vt_cuptiact_createStream(vtCtx, vtcuptiActivity->defaultStrmID); vt_count(vtcuptiActivity->strmList->vtThrdID, &vt_gpu_init_time, vt_gpu_cid_memusage, 0); } VT_CUPTI_UNLOCK(); VT_RESUME_MALLOC_TRACING(VT_CURRENT_THREAD); /* write counter value */ vtTime = vt_pform_wtime(); vt_count(vtcuptiActivity->strmList->vtThrdID, &vtTime, vt_gpu_cid_memusage, (uint64_t)(vtcuptiActivity->gpuMemAllocated)); }
/*
 * Use the CUPTI activity kernel record to write the corresponding VampirTrace
 * events.
 *
 * @param kernel the CUPTI activity kernel record
 * @param vtCtx the VampirTrace CUPTI activity context
 */
static void vt_cuptiact_writeKernelRecord(CUpti_ActivityKernel *kernel,
                                          vt_cupti_ctx_t *vtCtx)
{
  vt_cupti_activity_t *vtcuptiActivity = vtCtx->activity;
  vt_cuptiact_strm_t *vtStrm = NULL;
  uint32_t vtThrdID = VT_NO_ID;
  uint32_t knRID = VT_NO_ID;
  vt_gpu_hn_string_t *hn = NULL;

  VT_SUSPEND_MALLOC_TRACING(vtCtx->ptid);

  /* get VampirTrace thread ID for the kernel's stream */
  vtStrm = vt_cuptiact_checkStream(vtCtx, kernel->streamId);
  vtThrdID = vtStrm->vtThrdID;

  VT_RESUME_MALLOC_TRACING(vtCtx->ptid);

  /* get the VampirTrace region ID for the kernel */
  hn = vt_gpu_stringHashGet(kernel->name);
  if(hn){
    knRID = hn->rid;
  }else{
    char *knName = vt_cuda_demangleKernel(kernel->name);

    if(knName == NULL || *knName == '\0')
    {
      knName = (char *)kernel->name;
      if(knName == NULL) knName = "unknownKernel";
    }

    knRID = vt_def_region(VT_MASTER_THREAD, knName, VT_NO_ID, VT_NO_LNO,
                          VT_NO_LNO, "CUDA_KERNEL", VT_FUNCTION);

    hn = vt_gpu_stringHashPut(kernel->name, knRID);
  }

  /* write events */
  {
    uint64_t start = vtcuptiActivity->sync.hostStart
                   + (kernel->start - vtcuptiActivity->sync.gpuStart)
                     * vtcuptiActivity->sync.factor;
    uint64_t stop = start + (kernel->end - kernel->start)
                  * vtcuptiActivity->sync.factor;

    /* if current activity's start time is before last written timestamp */
    if(start < vtStrm->vtLastTime){
      vt_warning("[CUPTI Activity] Kernel: start time < last written timestamp!");
      vt_warning("[CUPTI Activity] Kernel: '%s', CUdevice: %d, "
                 "CUDA stream ID: %d, Thread ID: %d",
                 hn->sname, vtCtx->cuDev, vtStrm->strmID, vtStrm->vtThrdID);

      if(vtStrm->vtLastTime < stop){
        vt_warning("[CUPTI Activity] Set kernel start time to last written "
                   "timestamp (cut %.4lf%%)",
                   (double)(vtStrm->vtLastTime - start)/(double)(stop-start));
        start = vtStrm->vtLastTime;
      }else{
        vt_warning("[CUPTI Activity] Skipping ...");
        return;
      }
    }

    /* check if time between start and stop is increasing */
    if(stop < start){
      vt_warning("[CUPTI Activity] Kernel: start time > stop time!");
      vt_warning("[CUPTI Activity] Skipping '%s' on CUDA device:stream [%d:%d],"
                 " Thread ID %d",
                 hn->sname, vtCtx->cuDev, vtStrm->strmID, vtStrm->vtThrdID);
      return;
    }

    /* check if synchronization stop time is before kernel stop time */
    if(vtcuptiActivity->sync.hostStop < stop){
      vt_warning("[CUPTI Activity] Kernel: sync-point time < kernel stop time");
      vt_warning("[CUPTI Activity] Kernel: '%s', CUdevice: %d, "
                 "CUDA stream ID: %d, Thread ID: %d",
                 hn->sname, vtCtx->cuDev, vtStrm->strmID, vtStrm->vtThrdID);

      /* write kernel with sync.hostStop as stop timestamp, if possible */
      if(vtcuptiActivity->sync.hostStop > start){
        vt_warning("[CUPTI Activity] Set kernel stop time to sync-point time "
                   "(cut %.4lf%%)",
                   (double)(stop - vtcuptiActivity->sync.hostStop)
                   /(double)(stop-start));
        stop = vtcuptiActivity->sync.hostStop;
      }else{
        vt_warning("[CUPTI Activity] Skipping ...");
        return;
      }
    }

    /* set the last VampirTrace timestamp, written in this stream */
    vtStrm->vtLastTime = stop;

    /*vt_cntl_msg(1, "'%s'(%d) start: %llu; stop: %llu (tid: %d)",
                  kernel->name, knRID, start, stop, vtThrdID);*/

    /* GPU idle time will be written to first CUDA stream in list */
    if(vt_gpu_trace_idle){
      if(vtcuptiActivity->gpuIdleOn){
        /*vt_warning("IDLEexit: %llu (%d)", start, vtCtx->strmList->vtThrdID);*/
        vt_exit(vtcuptiActivity->strmList->vtThrdID, &start);
        vtcuptiActivity->gpuIdleOn = 0;
      }else if(start > vtcuptiActivity->vtLastGPUTime){
        /* idle is off and kernels are consecutive */
        /*vt_warning("IDLEente: %llu (%d)", vtCtx->vtLastGPUTime, vtCtx->strmList->vtThrdID);
          vt_warning("IDLEexit: %llu (%d)", start, vtCtx->strmList->vtThrdID);*/
        vt_enter(vtcuptiActivity->strmList->vtThrdID,
                 &(vtcuptiActivity->vtLastGPUTime), vt_gpu_rid_idle);
        vt_exit(vtcuptiActivity->strmList->vtThrdID, &start);
      }
    }

    vt_enter(vtThrdID, &start, knRID);
    /*vt_warning("KERNente: %llu (%d)", start, vtThrdID);*/

    /* use counter to provide additional information for kernels */
    if(vt_gpu_trace_kernels > 1){
      /* grid and block size counter (start) */
      {
        uint32_t threadsPerBlock = kernel->blockX * kernel->blockY * kernel->blockZ;
        uint32_t blocksPerGrid = kernel->gridX * kernel->gridY * kernel->gridZ;

        vt_count(vtThrdID, &start, vt_cupti_cid_blocksPerGrid, blocksPerGrid);
        vt_count(vtThrdID, &start, vt_cupti_cid_threadsPerBlock, threadsPerBlock);
        vt_count(vtThrdID, &start, vt_cupti_cid_threadsPerKernel,
                 threadsPerBlock * blocksPerGrid);
      }

      /* memory counter (start) */
      vt_count(vtThrdID, &start, vt_cuptiact_cid_knStaticSharedMem,
               kernel->staticSharedMemory);
      vt_count(vtThrdID, &start, vt_cuptiact_cid_knDynamicSharedMem,
               kernel->dynamicSharedMemory);
      vt_count(vtThrdID, &start, vt_cuptiact_cid_knLocalMemTotal,
               kernel->localMemoryTotal);
      vt_count(vtThrdID, &start, vt_cuptiact_cid_knRegistersPerThread,
               kernel->registersPerThread);

      /* memory counter (stop) */
      vt_count(vtThrdID, &stop, vt_cuptiact_cid_knStaticSharedMem, 0);
      vt_count(vtThrdID, &stop, vt_cuptiact_cid_knDynamicSharedMem, 0);
      vt_count(vtThrdID, &stop, vt_cuptiact_cid_knLocalMemTotal, 0);
      vt_count(vtThrdID, &stop, vt_cuptiact_cid_knRegistersPerThread, 0);

      /* grid and block size counter (stop) */
      vt_count(vtThrdID, &stop, vt_cupti_cid_blocksPerGrid, 0);
      vt_count(vtThrdID, &stop, vt_cupti_cid_threadsPerBlock, 0);
      vt_count(vtThrdID, &stop, vt_cupti_cid_threadsPerKernel, 0);
    }

    vt_exit(vtThrdID, &stop);
    /*vt_warning("KERNexit: %llu (%d)", stop, vtThrdID);*/

    if(vtcuptiActivity->vtLastGPUTime < stop)
      vtcuptiActivity->vtLastGPUTime = stop;
  }

  /*vt_cntl_msg(1, "KERNEL '%s' [%llu ns] device %u, context %u, stream %u, "
                "correlation %u/r%u\n"
                "\t grid [%u,%u,%u], block [%u,%u,%u], "
                "shared memory (static %u, dynamic %u)",
                kernel->name, (unsigned long long)(kernel->end - kernel->start),
                kernel->deviceId, kernel->contextId, kernel->streamId,
                kernel->correlationId, kernel->runtimeCorrelationId,
                kernel->gridX, kernel->gridY, kernel->gridZ,
                kernel->blockX, kernel->blockY, kernel->blockZ,
                kernel->staticSharedMemory, kernel->dynamicSharedMemory);*/
}
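/*
 * Aside: the GPU-to-host time translation used by the kernel record writer,
 * isolated as a sketch. Only hostStart, hostStop, gpuStart and factor mirror
 * fields of the sync structure referenced above; the type and function names
 * here are hypothetical. Two synchronization points define a linear mapping
 * from the GPU clock onto the VampirTrace host clock.
 */
#include <stdint.h>

typedef struct {
  uint64_t hostStart, hostStop; /* host timestamps at the two sync points */
  uint64_t gpuStart, gpuStop;   /* GPU timestamps at the same two points */
  double factor;                /* host ticks per GPU tick */
} sync_sketch_t;

static double sync_sketch_factor(const sync_sketch_t* s)
{
  return (double)(s->hostStop - s->hostStart) /
         (double)(s->gpuStop - s->gpuStart);
}

static uint64_t sync_sketch_gpu2host(const sync_sketch_t* s, uint64_t gpuTime)
{
  /* e.g. kernel start: hostStart + (kernel->start - gpuStart) * factor */
  return s->hostStart
         + (uint64_t)((double)(gpuTime - s->gpuStart) * s->factor);
}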
/* * Decreases the "Allocated CUDA memory" counter. * * @param ctxUID CUDA context identifier (@see CUPTI callback info) * @param devPtr pointer to the allocated memory */ void vt_cuptiact_writeFree(uint32_t ctxID, CUcontext cuCtx, void *devPtr) { uint64_t vtTime; vt_cupti_ctx_t* vtCtx = NULL; vt_cupti_activity_t *vtcuptiActivity = NULL; vt_cupti_gpumem_t *curMalloc = NULL; vt_cupti_gpumem_t *lastMalloc = NULL; if(devPtr == NULL) return; VT_SUSPEND_MALLOC_TRACING(VT_CURRENT_THREAD); /* check for VampirTrace CUPTI context */ vtCtx = vt_cupti_getCtx(cuCtx); if(vtCtx == NULL){ vtCtx = vt_cupti_createCtx(cuCtx, VT_CUPTI_NO_CUDA_DEVICE, ctxID, VT_CUPTI_NO_DEVICE_ID); vt_cupti_prependCtx(vtCtx); } /* check for VampirTrace CUPTI activity context */ if(vtCtx->activity == NULL){ vtCtx->activity = vt_cuptiact_createCtxActivity(cuCtx); } vtcuptiActivity = vtCtx->activity; VT_CUPTI_LOCK(); /* flush activity buffer */ vt_cuptiact_flushCtxActivities(vtCtx); curMalloc = vtcuptiActivity->gpuMemList; lastMalloc = curMalloc; /* lookup the CUDA malloc entry by its memory pointer */ while(curMalloc != NULL){ if(devPtr == curMalloc->memPtr){ /* decrease allocated counter value and write it */ vtTime = vt_pform_wtime(); vtcuptiActivity->gpuMemAllocated -= curMalloc->size; vt_count(vtcuptiActivity->strmList->vtThrdID, &vtTime, vt_gpu_cid_memusage, (uint64_t)(vtcuptiActivity->gpuMemAllocated)); /* set pointer over current element to next one */ lastMalloc->next = curMalloc->next; /* if current element is the first list entry, set the list entry */ if(curMalloc == vtcuptiActivity->gpuMemList){ vtcuptiActivity->gpuMemList = curMalloc->next; } /* free VT memory of CUDA malloc */ curMalloc->next = NULL; free(curMalloc); curMalloc = NULL; /* set mallocList to NULL, if last element freed */ if(vtcuptiActivity->gpuMemAllocated == 0) { vtcuptiActivity->gpuMemList = NULL; } VT_CUPTI_UNLOCK(); VT_RESUME_MALLOC_TRACING(VT_CURRENT_THREAD); return; } lastMalloc = curMalloc; curMalloc = curMalloc->next; } VT_CUPTI_UNLOCK(); VT_RESUME_MALLOC_TRACING(VT_CURRENT_THREAD); vt_warning("[CUPTI Activity] free CUDA memory, which has not been allocated!"); }
void* vt_realloc_hook(void* ptr, size_t size, const void* caller)
{
  void* result;
  uint64_t bytes;
  uint64_t bytes1;
  uint64_t bytes2;
  uint64_t time;
  uint8_t was_recorded;

  VT_MEMHOOKS_OFF(); /* restore original hooks */

  time = vt_pform_wtime();
  was_recorded = vt_enter(VT_CURRENT_THREAD, &time,
                          memhook_regid[MEMHOOK_REG_REALLOC]);

  /* get total allocated memory before realloc */
  if ( NULL != ptr )
  {
    bytes1 = ( ~ (uint64_t) 3 ) &
             (uint64_t) *( (size_t*) ( (char*)ptr - SIZEOF_VOIDP ) );
  }
  else
  {
    bytes1 = bytes = 0;
  }

  result = realloc(ptr, size); /* call recursively */

  /* get total allocated memory after realloc */
  if ( NULL != result )
  {
    bytes2 = ( ~ (uint64_t) 3 ) &
             (uint64_t) *( (size_t*) ( (char*)result - SIZEOF_VOIDP ) );
    bytes = bytes2 < bytes1 ? bytes1 - bytes2 : bytes2 - bytes1;
  }
  else
  {
    bytes2 = bytes = 0;
  }

  /* update counter value */
  if ( bytes2 < bytes1 )
  {
    if ( bytes <= memalloc_val )
      memalloc_val -= bytes;
    else
      memalloc_val = 0;
  }
  else
  {
    memalloc_val += bytes;
  }

  time = vt_pform_wtime();

  if( was_recorded && bytes > 0 )
  {
    /* write marker, if desired */
    if( memalloc_marker )
    {
      uint32_t marker_type;
      const char* marker_prefix;

      if ( bytes2 < bytes1 )
      {
        marker_type = MEMHOOK_MARK_FREE;
        marker_prefix = "Freed";
      }
      else
      {
        marker_type = MEMHOOK_MARK_ALLOC;
        marker_prefix = "Allocated";
      }

      /* write marker */
      vt_marker(VT_CURRENT_THREAD, &time, memalloc_mid[marker_type],
                "%s %llu Bytes", marker_prefix, (unsigned long long)bytes);
    }

    /* write counter value */
    vt_count(VT_CURRENT_THREAD, &time, memalloc_cid, memalloc_val);
  }

  vt_exit(VT_CURRENT_THREAD, &time);

  VT_MEMHOOKS_ON(); /* restore our own hooks */

  return result;
}
void vt_cupti_events_finalizeContext(vt_cupti_ctx_t *vtCtx)
{
  uint64_t time = vt_pform_wtime();
  vt_cupti_strm_t *curStrm = NULL;
  vt_cupti_evtgrp_t *vtcuptiGrp = NULL;

  if(vtCtx == NULL || vtCtx->events == NULL)
    return;

  /* These CUPTI calls may fail, as CUPTI may have implicitly destroyed the
     event groups already */
  if(vt_gpu_debug == 0){
    curStrm = vtCtx->strmList;

    /* for all streams of this context */
    while(curStrm != NULL){

      /* ensure increasing time stamps */
      if(time < curStrm->vtLastTime){
        curStrm = curStrm->next;
        continue;
      }

      vt_cuptievt_resetCounter(vtCtx->events, curStrm->vtThrdID, &time);

      curStrm = curStrm->next;
    }

    /* stop CUPTI counter capturing */
    vt_cuptievt_stop(vtCtx->events);

    /* destroy all CUPTI event groups, which have been created */
    vtcuptiGrp = vtCtx->events->vtGrpList;
    while(vtcuptiGrp != NULL){
      VT_CUPTI_CALL(cuptiEventGroupRemoveAllEvents(vtcuptiGrp->evtGrp),
                    "cuptiEventGroupRemoveAllEvents");

      VT_CUPTI_CALL(cuptiEventGroupDestroy(vtcuptiGrp->evtGrp),
                    "cuptiEventGroupDestroy");

      vtcuptiGrp = vtcuptiGrp->next;
    }
  }else{
    /* set at least the VampirTrace counters to zero */
    curStrm = vtCtx->strmList;

    /* for all streams of this context */
    while(curStrm != NULL){

      /* ensure increasing time stamps */
      if(time < curStrm->vtLastTime){
        curStrm = curStrm->next;
        continue;
      }

      vtcuptiGrp = vtCtx->events->vtGrpList;
      while(vtcuptiGrp != NULL){
        size_t i;

        for(i = 0; i < vtcuptiGrp->evtNum; i++){
          vt_count(curStrm->vtThrdID, &time, *(vtcuptiGrp->vtCIDs+i), 0);
        }

        vtcuptiGrp = vtcuptiGrp->next;
      }

      curStrm = curStrm->next;
    }
  }

  /* free previously allocated memory */
  vt_cuptievt_freeEventCtx(vtCtx->events);
}
/*
 * Create a VampirTrace CUPTI stream.
 *
 * @param vtCtx VampirTrace CUPTI context
 * @param cuStrm CUDA stream
 * @param strmID ID of the CUDA stream
 *
 * @return pointer to created VampirTrace CUPTI stream
 */
vt_cupti_strm_t* vt_cupti_createStream(vt_cupti_ctx_t *vtCtx,
                                       CUstream cuStrm, uint32_t strmID)
{
  vt_cupti_strm_t *vtStrm = NULL;

  if(vtCtx == NULL){
    vt_warning("[CUPTI] Cannot create stream without VampirTrace CUPTI context");
    return NULL;
  }

  vtStrm = (vt_cupti_strm_t *)malloc(sizeof(vt_cupti_strm_t));
  if(vtStrm == NULL)
    vt_error_msg("[CUPTI] Could not allocate memory for stream!");
  vtStrm->cuStrm = cuStrm;
  vtStrm->vtLastTime = vt_gpu_init_time;
  vtStrm->destroyed = 0;
  vtStrm->next = NULL;

#if defined(VT_CUPTI_ACTIVITY)
  /* create stream by VT CUPTI callbacks implementation (CUstream is given) */
  if(strmID == VT_CUPTI_NO_STREAM_ID){
    if(cuStrm != VT_CUPTI_NO_STREAM){
      VT_CUPTI_CALL(cuptiGetStreamId(vtCtx->cuCtx, cuStrm, &strmID),
                    "cuptiGetStreamId");
    }else{
      vt_warning("[CUPTI] Neither CUDA stream nor stream ID given!");
      free(vtStrm);
      return NULL;
    }
  }
#else /* only VT_CUPTI_CALLBACKS is defined */
  if(vtCtx->callbacks != NULL){
    strmID = vtCtx->callbacks->streamsCreated;
    vtCtx->callbacks->streamsCreated++;
  }
#endif

  vtStrm->cuStrmID = strmID;

  /* create VampirTrace thread */
  {
    char thread_name[16] = "CUDA";

    if(vt_gpu_stream_reuse){
      if(vtCtx->devID != VT_NO_ID){
        if(-1 == snprintf(thread_name+4, 12, "[%d]", vtCtx->devID))
          vt_cntl_msg(1, "Could not create thread name for CUDA thread!");
      }
    }else{
      if(vtCtx->devID == VT_NO_ID){
        if(-1 == snprintf(thread_name+4, 12, "[?:%d]", strmID))
          vt_cntl_msg(1, "Could not create thread name for CUDA thread!");
      }else{
        if(-1 == snprintf(thread_name+4, 12, "[%d:%d]", vtCtx->devID, strmID))
          vt_cntl_msg(1, "Could not create thread name for CUDA thread!");
      }
    }

    VT_CHECK_THREAD;
    vt_gpu_registerThread(thread_name, VT_MY_THREAD, &(vtStrm->vtThrdID));
  }

  if(vt_gpu_init_time < vt_start_time)
    vt_gpu_init_time = vt_start_time;

  /* for the first stream created for this context */
  if(vtCtx->strmList == NULL){
    if(vt_gpu_trace_idle > 0){
      /* write enter event for GPU_IDLE on first stream */
      vt_enter(vtStrm->vtThrdID, &vt_gpu_init_time, vt_gpu_rid_idle);
      /*vt_warning("IDLEente: %llu (%d)", vt_gpu_init_time, vtStrm->vtThrdID);*/

#if defined(VT_CUPTI_ACTIVITY)
      if(vtCtx->activity != NULL)
        vtCtx->activity->gpuIdleOn = 1;
#endif
    }

    /* set the counter value for cudaMalloc to 0 on first stream */
    if(vt_gpu_trace_memusage > 0)
      vt_count(vtStrm->vtThrdID, &vt_gpu_init_time, vt_gpu_cid_memusage, 0);
  }

  if(vt_gpu_trace_kernels > 1){
    /* set count values to zero */
    vt_count(vtStrm->vtThrdID, &vt_gpu_init_time, vt_cupti_cid_blocksPerGrid, 0);
    vt_count(vtStrm->vtThrdID, &vt_gpu_init_time, vt_cupti_cid_threadsPerBlock, 0);
    vt_count(vtStrm->vtThrdID, &vt_gpu_init_time, vt_cupti_cid_threadsPerKernel, 0);
  }

  /* prepend the stream
  vtStrm->next = vtCtx->strmList;
  vtCtx->strmList = vtStrm;*/

  return vtStrm;
}
/* * Decreases the "Allocated CUDA memory" counter. * * @param ctxUID CUDA context identifier (@see CUPTI callback info) * @param devPtr pointer to the allocated memory */ void vt_cuptiact_writeFree(uint32_t ctxID, CUcontext cuCtx, void *devPtr) { uint64_t vtTime; vt_cuptiact_ctx_t* vtCtx = NULL; vt_cuptiact_gpumem_t *curMalloc = NULL; vt_cuptiact_gpumem_t *lastMalloc = NULL; if(devPtr == NULL) return; /* flush activity buffer */ vt_cuptiact_flushCtxActivities(cuCtx); vtCtx = vt_cuptiact_getCtx(cuCtx); if(vtCtx == NULL){ vtCtx = vt_cuptiact_createContext(ctxID, cuCtx, (uint32_t)-1); } VT_CUPTI_ACT_LOCK(); curMalloc = vtCtx->gpuMemList; lastMalloc = vtCtx->gpuMemList; /* lookup the CUDA malloc entry by its memory pointer */ while(curMalloc != NULL){ if(devPtr == curMalloc->memPtr){ /* decrease allocated counter value and write it */ vtTime = vt_pform_wtime(); vtCtx->gpuMemAllocated -= curMalloc->size; vt_count(vtCtx->strmList->vtThrdID, &vtTime, vt_gpu_cid_memusage, (uint64_t)(vtCtx->gpuMemAllocated)); /* set pointer over current element to next one */ lastMalloc->next = curMalloc->next; /* if current element is the first list entry, set the list entry */ if(curMalloc == vtCtx->gpuMemList){ vtCtx->gpuMemList = curMalloc->next; } /* free VT memory of CUDA malloc */ curMalloc->next = NULL; free(curMalloc); curMalloc = NULL; /* set mallocList to NULL, if last element freed */ if(vtCtx->gpuMemAllocated == 0) { vtCtx->gpuMemList = NULL; } VT_CUPTI_ACT_UNLOCK(); return; } lastMalloc = curMalloc; curMalloc = curMalloc->next; } VT_CUPTI_ACT_UNLOCK(); vt_warning("[CUPTI Activity] free CUDA memory, which has not been allocated!"); }