void initTrace()
{
  // Enqueue a couple of buffers in the global queue.
  queueNewBuffer(NULL, 0);
  queueNewBuffer(NULL, 0);

  // The device activity record is created when CUDA initializes, so we
  // want to enable it before cuInit() or any CUDA runtime call.
  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE));
  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT));
  // CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER));
  // CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY));
  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));
  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL));

  // Subscribe to resource and synchronization callbacks so that buffers
  // can be managed per context/stream.
  CUpti_SubscriberHandle subscriber;
  CUPTI_CALL(cuptiSubscribe(&subscriber, (CUpti_CallbackFunc)traceCallback, NULL));
  CUPTI_CALL(cuptiEnableDomain(1, subscriber, CUPTI_CB_DOMAIN_RESOURCE));
  CUPTI_CALL(cuptiEnableDomain(1, subscriber, CUPTI_CB_DOMAIN_SYNCHRONIZE));
  // added by wukai:
  // CUPTI_CALL(cuptiEnableDomain(1, subscriber, CUPTI_CB_DOMAIN_RUNTIME_API));

  CUPTI_CALL(cuptiGetTimestamp(&startTimestamp));
}
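/*
 * For context, a minimal sketch of the queueNewBuffer() helper the function
 * above relies on; its real definition is not part of this listing. It
 * assumes the legacy CUPTI buffering model, in which the profiler hands
 * CUPTI pre-allocated, aligned buffers via cuptiActivityEnqueueBuffer()
 * (superseded later by the asynchronous buffer callbacks). The buffer size
 * and alignment are illustrative values; CUPTI_CALL is the error-checking
 * macro already used above.
 */
#include <stdint.h>
#include <stdlib.h>
#include <cupti.h>

#define BUF_SIZE   (32 * 1024)
#define ALIGN_SIZE (8)
#define ALIGN_BUFFER(buffer, align) \
  (((uintptr_t)(buffer) & ((align)-1)) \
   ? ((buffer) + (align) - ((uintptr_t)(buffer) & ((align)-1))) : (buffer))

static void queueNewBuffer(CUcontext context, uint32_t streamId)
{
  // Over-allocate so the start of the usable region can be aligned.
  uint8_t *buffer = (uint8_t *)malloc(BUF_SIZE + ALIGN_SIZE);
  CUPTI_CALL(cuptiActivityEnqueueBuffer(context, streamId,
                                        ALIGN_BUFFER(buffer, ALIGN_SIZE),
                                        BUF_SIZE));
}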
// Runs at library load time (constructor attribute), so PC sampling is
// configured before the traced application launches any kernels.
__attribute__((constructor)) void initTrace()
{
  // Get the arguments from the environment variables (assumed to be set
  // by the launcher).
  int deviceId, sampRate;
  CUcontext cuCtx;

  deviceId = atoi(getenv("GPU_DEVICE_ID"));
  cuInit(0);
  cuCtxCreate(&cuCtx, 0, deviceId);

  CUPTI_CALL(cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted));
  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_PC_SAMPLING));
  // CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_BRANCH));
  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL));

  CUPTI_CALL(cuptiSubscribe(&g_subscriber, (CUpti_CallbackFunc)traceCallback, NULL));
  CUPTI_CALL(cuptiEnableDomain(1, g_subscriber, CUPTI_CB_DOMAIN_RESOURCE));

  // Configure the PC sampling period on the freshly created context.
  CUpti_ActivityPCSamplingConfig config;
  config.size = sizeof(config);  // CUPTI requires the struct size to be set
  sampRate = atoi(getenv("PC_SAMPLING_RATE"));
  config.samplingPeriod = (CUpti_ActivityPCSamplingPeriod)sampRate;
  CUPTI_CALL(cuptiActivityConfigurePCSampling(cuCtx, &config));
}
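/*
 * A sketch of the two callbacks registered above via
 * cuptiActivityRegisterCallbacks(). The signatures are fixed by CUPTI's
 * asynchronous buffering API; the record handling inside bufferCompleted()
 * (printing the record kind) is illustrative only, standing in for whatever
 * the real implementation does with PC-sampling and kernel records.
 */
#include <stdio.h>
#include <stdlib.h>
#include <cupti.h>

void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size,
                              size_t *maxNumRecords)
{
  *size = 32 * 1024;                  // illustrative buffer size
  *buffer = (uint8_t *)malloc(*size);
  *maxNumRecords = 0;                 // 0 = fill with as many records as fit
}

void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId,
                              uint8_t *buffer, size_t size, size_t validSize)
{
  CUpti_Activity *record = NULL;
  CUptiResult status;

  // Drain all completed records from the returned buffer.
  do {
    status = cuptiActivityGetNextRecord(buffer, validSize, &record);
    if (status == CUPTI_SUCCESS) {
      printf("activity record of kind %d\n", (int)record->kind);
    } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
      break;  // no more records in this buffer
    } else {
      break;  // unexpected status; a real implementation would report it
    }
  } while (1);

  free(buffer);
}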
void vt_cuptiact_enableConcurrentKernel(vt_cupti_ctx_t* vtCtx)
{
  /*
   * Disable collection of kernels for the given CUDA context.
   * !!! does not work yet !!!
   * VT_CUPTI_CALL(cuptiActivityDisableContext(cuCtx, CUPTI_ACTIVITY_KIND_KERNEL),
   *               "cuptiActivityDisableContext");
   *
   * Flush the already buffered activities for this CUDA context:
   * vt_cuptiact_flushCtxActivities(cuCtx);
   *
   * Enable collection of kernels for the given CUDA context:
   * VT_CUPTI_CALL(cuptiActivityEnableContext(cuCtx, CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL),
   *               "cuptiActivityEnableContext");
   */

  if((vt_gpu_config & VT_GPU_TRACE_CONCURRENT_KERNEL)
     != VT_GPU_TRACE_CONCURRENT_KERNEL){
    vt_cntl_msg(2, "[CUPTI Activity] Enable concurrent kernel tracing.");

    /* disable normal (lower overhead) kernel tracing */
    VT_CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_KERNEL),
                  "cuptiActivityDisable");

    /* flush the already buffered activities for this CUDA context */
    VT_CUPTI_LOCK();
    vt_cuptiact_flushCtxActivities(vtCtx);
    VT_CUPTI_UNLOCK();

    /* enable concurrent kernel tracing (higher overhead) */
    VT_CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL),
                  "cuptiActivityEnable");

    vt_gpu_config |= VT_GPU_TRACE_CONCURRENT_KERNEL;
  }
}
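/*
 * For reference, a plausible definition of the VT_CUPTI_CALL() error-checking
 * wrapper used throughout this code. This is a sketch, not the actual
 * VampirTrace macro: it assumes the second argument is a message to emit when
 * the wrapped CUPTI call fails, and it uses cuptiGetResultString() to turn
 * the CUptiResult into readable text.
 */
#define VT_CUPTI_CALL(call, msg)                                   \
  do {                                                             \
    CUptiResult _status = (call);                                  \
    if (_status != CUPTI_SUCCESS) {                                \
      const char *_errstr = NULL;                                  \
      cuptiGetResultString(_status, &_errstr);                     \
      vt_warning("[CUPTI] %s failed: %s", (msg), _errstr);         \
    }                                                              \
  } while (0)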
/* no need to lock, because it is only called by vt_cupti_callback_init() */
void vt_cupti_activity_init()
{
  /*
  if(!vt_cuptiact_initialized){
    vt_cupti_init();
    VT_CUPTI_LOCK();
  */
  if(!vt_cuptiact_initialized){
    vt_cntl_msg(2, "[CUPTI Activity] Initializing ... ");

    {
      vt_cuptiact_bufSize = vt_env_cudatrace_bsize();

      /* no buffer size < 1024 bytes allowed (see CUPTI documentation) */
      if(vt_cuptiact_bufSize < 1024){
        if(vt_cuptiact_bufSize > 0){
          vt_warning("[CUPTI Activity] Buffer size has to be at least 1024 "
                     "bytes! It has been set to %d.", vt_cuptiact_bufSize);
        }
        vt_cuptiact_bufSize = VT_CUPTI_ACT_DEFAULT_BSIZE;
      }

      /* queue a global buffer to initialize CUPTI before CUDA init
      vt_cuptiact_buffer = (uint8_t *)malloc(vt_cuptiact_bufSize);
      VT_CUPTI_CALL(cuptiActivityEnqueueBuffer(NULL, 0, vt_cuptiact_buffer,
                                               vt_cuptiact_bufSize),
                    "cuptiActivityEnqueueBuffer");*/
    }

#if (defined(VT_MT) || defined(VT_HYB))
    VTTHRD_LOCK_IDS();
#endif
    if(vt_gpu_trace_kernels > 1){
      /* define kernel counters */
      vt_cuptiact_cid_knStaticSharedMem = vt_def_counter(VT_MASTER_THREAD,
          "staticSharedMemory", "Bytes",
          VT_CNTR_ABS | VT_CNTR_NEXT | VT_CNTR_UNSIGNED,
          vt_cupti_cgid_cuda_kernel, 0);
      vt_cuptiact_cid_knDynamicSharedMem = vt_def_counter(VT_MASTER_THREAD,
          "dynamicSharedMemory", "Bytes",
          VT_CNTR_ABS | VT_CNTR_NEXT | VT_CNTR_UNSIGNED,
          vt_cupti_cgid_cuda_kernel, 0);
      vt_cuptiact_cid_knLocalMemTotal = vt_def_counter(VT_MASTER_THREAD,
          "localMemoryPerKernel", "Bytes",
          VT_CNTR_ABS | VT_CNTR_NEXT | VT_CNTR_UNSIGNED,
          vt_cupti_cgid_cuda_kernel, 0);
      vt_cuptiact_cid_knRegistersPerThread = vt_def_counter(VT_MASTER_THREAD,
          "registersPerThread", "#",
          VT_CNTR_ABS | VT_CNTR_NEXT | VT_CNTR_UNSIGNED,
          vt_cupti_cgid_cuda_kernel, 0);
    }

    /* define region for GPU activity flush */
    vt_cuptiact_rid_flush = vt_def_region(VT_MASTER_THREAD, "flushActivities",
        VT_NO_ID, VT_NO_LNO, VT_NO_LNO, "VT_CUDA", VT_FUNCTION);
#if (defined(VT_MT) || defined(VT_HYB))
    VTTHRD_UNLOCK_IDS();
#endif

    /*** enable the activities ***/
    /* enable kernel tracing */
    if(vt_gpu_trace_kernels > 0){
#if (defined(CUPTI_API_VERSION) && (CUPTI_API_VERSION >= 3))
      if((vt_gpu_config & VT_GPU_TRACE_CONCURRENT_KERNEL)
         == VT_GPU_TRACE_CONCURRENT_KERNEL){
        /*VT_CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL),
                        "cuptiActivityEnable");*/
        VT_CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL),
                      "cuptiActivityEnable");
      }else
#endif
        VT_CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL),
                      "cuptiActivityEnable");
    }

    /* enable memory copy tracing */
    if(vt_gpu_trace_mcpy){
      VT_CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY),
                    "cuptiActivityEnable");
    }

    /* register the finalize function of VampirTrace CUPTI to be called
     * before the program exits
    atexit(vt_cupti_activity_finalize);*/

    vt_cuptiact_initialized = 1;
    /*
    VT_CUPTI_UNLOCK();
  }
    */
  }
}
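/*
 * To show how the counters defined above are meant to be consumed: when a
 * kernel activity record is later read from a CUPTI buffer, its resource
 * fields map directly onto the four counters. The function below is a sketch
 * under two assumptions not confirmed by this listing: records arrive as
 * CUpti_ActivityKernel (the later versioned CUpti_ActivityKernel2/3 types
 * carry the same fields), and vt_count() writes one counter sample for the
 * given VampirTrace thread at the given timestamp.
 */
static void vt_cuptiact_writeKernelCounters(const CUpti_ActivityKernel *kn,
                                            uint32_t vtThrdId, uint64_t *time)
{
  /* hypothetical helper, named for illustration only */
  vt_count(vtThrdId, time, vt_cuptiact_cid_knStaticSharedMem,
           (uint64_t)kn->staticSharedMemory);
  vt_count(vtThrdId, time, vt_cuptiact_cid_knDynamicSharedMem,
           (uint64_t)kn->dynamicSharedMemory);
  vt_count(vtThrdId, time, vt_cuptiact_cid_knLocalMemTotal,
           (uint64_t)kn->localMemoryTotal);
  vt_count(vtThrdId, time, vt_cuptiact_cid_knRegistersPerThread,
           (uint64_t)kn->registersPerThread);
}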