Beispiel #1
0
void
initTrace()
{

  // Enqueue a couple of buffers in the global queue
  queueNewBuffer(NULL, 0);
  queueNewBuffer(NULL, 0);

  // device activity record is created when CUDA initializes, so we
  // want to enable it before cuInit() or any CUDA runtime call
  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE));
//  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY));
  cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT);
 //                       cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER);
//                        cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME);
                        cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY);
                        cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET);
                        cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL);

	
  CUpti_SubscriberHandle subscriber;
  CUPTI_CALL(cuptiSubscribe(&subscriber, (CUpti_CallbackFunc)traceCallback, NULL));

  CUPTI_CALL(cuptiEnableDomain(1, subscriber, CUPTI_CB_DOMAIN_RESOURCE));
  CUPTI_CALL(cuptiEnableDomain(1, subscriber, CUPTI_CB_DOMAIN_SYNCHRONIZE));
//add by wukai
  //CUPIT_CALL(cuptiEnableDomain(1, subscriber, CUPTI_CB_DOMAIN_RUNTIME_API));

  CUPTI_CALL(cuptiGetTimestamp(&startTimestamp));
}
Beispiel #2
0
__attribute__((constructor)) void
initTrace()
{
	//get the arguments from the environment variables
	int deviceId, sampRate;
	
    CUcontext cuCtx;
	deviceId = atoi(getenv("GPU_DEVICE_ID"));
    cuInit(0);
	cuCtxCreate(&cuCtx,0,deviceId);
	CUPTI_CALL(cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted));
    CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_PC_SAMPLING));
	//CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_BRANCH));

	CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL));
	CUPTI_CALL(cuptiSubscribe(&g_subscriber, (CUpti_CallbackFunc)traceCallback, NULL));
	CUPTI_CALL(cuptiEnableDomain(1, g_subscriber, CUPTI_CB_DOMAIN_RESOURCE));
	CUpti_ActivityPCSamplingConfig config;
	sampRate=atoi(getenv("PC_SAMPLING_RATE"));
	config.samplingPeriod= sampRate;
	CUPTI_CALL(cuptiActivityConfigurePCSampling(cuCtx, &config));
}
void vt_cuptiact_enableConcurrentKernel(vt_cupti_ctx_t* vtCtx)
{
    /* 
     * Disable collection of kernels for the given CUDA context. 
     * !!! does not work yet !!!
     
    VT_CUPTI_CALL(cuptiActivityDisableContext(cuCtx, CUPTI_ACTIVITY_KIND_KERNEL),
                  "cuptiActivityDisableContext");*
  
    * flush the already buffered activities for this CUDA context *
    vt_cuptiact_flushCtxActivities(cuCtx);

    * Enable collection of kernels for the given CUDA context 
    VT_CUPTI_CALL(cuptiActivityEnableContext(cuCtx, CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL), 
                  "cuptiActivityEnableContext");*/
  
  if((vt_gpu_config & VT_GPU_TRACE_CONCURRENT_KERNEL) 
         != VT_GPU_TRACE_CONCURRENT_KERNEL){

    vt_cntl_msg(2, "[CUPTI Activity] Enable concurrent kernel tracing.");
    
    /*
     * Disable normal (lower overhead) kernel tracing.
     */
    VT_CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_KERNEL),
                  "cuptiActivityDisable");
    
    /* 
     * Flush the already buffered activities for this CUDA context.
     */
    VT_CUPTI_LOCK();
    vt_cuptiact_flushCtxActivities(vtCtx);
    VT_CUPTI_UNLOCK();

    /*
     * Enable concurrent kernel tracing (higher overhead).
     */
    VT_CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL), 
                  "cuptiActivityEnable");
    
    vt_gpu_config |= VT_GPU_TRACE_CONCURRENT_KERNEL;
  }
}
/* no need to lock, because it is only called by vt_cupti_callback_init() */
void vt_cupti_activity_init()
{
  /*if(!vt_cuptiact_initialized){
    vt_cupti_init();
    VT_CUPTI_LOCK();*/
    if(!vt_cuptiact_initialized){
      vt_cntl_msg(2, "[CUPTI Activity] Initializing ... ");
      
      {        
        vt_cuptiact_bufSize = vt_env_cudatrace_bsize();
        
        /* no buffer size < 1024 bytes allowed (see CUPTI documentation) */
        if(vt_cuptiact_bufSize < 1024){
          if(vt_cuptiact_bufSize > 0){
            vt_warning("[CUPTI Activity] Buffer size has to be at least 1024 "
                       "bytes! It has been set to %d.", vt_cuptiact_bufSize);
          }
          vt_cuptiact_bufSize = VT_CUPTI_ACT_DEFAULT_BSIZE;
        }
        
        /* queue a global buffer to initialize CUPTI before CUDA init 
        vt_cuptiact_buffer = (uint8_t *)malloc(vt_cuptiact_bufSize);
        VT_CUPTI_CALL(cuptiActivityEnqueueBuffer(NULL, 0, 
                                      vt_cuptiact_buffer, vt_cuptiact_bufSize), 
                      "cuptiActivityEnqueueBuffer");*/
      }
      
#if (defined(VT_MT) || defined(VT_HYB))
      VTTHRD_LOCK_IDS();
#endif
      if(vt_gpu_trace_kernels > 1){
        /* define kernel counters */
        vt_cuptiact_cid_knStaticSharedMem = vt_def_counter(VT_MASTER_THREAD, 
                      "staticSharedMemory", "Bytes",
                      VT_CNTR_ABS | VT_CNTR_NEXT | VT_CNTR_UNSIGNED, 
                      vt_cupti_cgid_cuda_kernel, 0);
        vt_cuptiact_cid_knDynamicSharedMem = vt_def_counter(VT_MASTER_THREAD, 
                      "dynamicSharedMemory", "Bytes",
                      VT_CNTR_ABS | VT_CNTR_NEXT | VT_CNTR_UNSIGNED, 
                      vt_cupti_cgid_cuda_kernel, 0);
        vt_cuptiact_cid_knLocalMemTotal = vt_def_counter(VT_MASTER_THREAD, 
                      "localMemoryPerKernel", "Bytes",
                      VT_CNTR_ABS | VT_CNTR_NEXT | VT_CNTR_UNSIGNED, 
                      vt_cupti_cgid_cuda_kernel, 0);
        vt_cuptiact_cid_knRegistersPerThread = vt_def_counter(VT_MASTER_THREAD, 
                      "registersPerThread", "#",
                      VT_CNTR_ABS | VT_CNTR_NEXT | VT_CNTR_UNSIGNED, 
                      vt_cupti_cgid_cuda_kernel, 0);
      }
     
      /* define region for GPU activity flush */
      vt_cuptiact_rid_flush = vt_def_region(VT_MASTER_THREAD, "flushActivities", 
                        VT_NO_ID, VT_NO_LNO, VT_NO_LNO, "VT_CUDA", VT_FUNCTION);
#if (defined(VT_MT) || defined(VT_HYB))
      VTTHRD_UNLOCK_IDS();
#endif
      
      /*** enable the activities ***/
      /* enable kernel tracing */
      if(vt_gpu_trace_kernels > 0){
#if (defined(CUPTI_API_VERSION) && (CUPTI_API_VERSION >= 3))
        if((vt_gpu_config & VT_GPU_TRACE_CONCURRENT_KERNEL) 
           == VT_GPU_TRACE_CONCURRENT_KERNEL){
          /*VT_CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL), 
                        "cuptiActivityEnable");*/
          VT_CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL), 
                        "cuptiActivityEnable");
        }else
#endif
          VT_CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL), 
                        "cuptiActivityEnable");
      }
      
      /* enable memory copy tracing */
      if(vt_gpu_trace_mcpy){
        VT_CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY), 
                      "cuptiActivityEnable");
      }
      
      /* register the finalize function of VampirTrace CUPTI to be called before
       * the program exits 
      atexit(vt_cupti_activity_finalize);*/

      vt_cuptiact_initialized = 1;
      /*VT_CUPTI_UNLOCK();
    }*/
  }
}