Example #1
/*
 * Create a VampirTrace CUPTI activity context.
 * 
 * @param cuCtx the CUDA context this activity context belongs to
 * 
 * @return pointer to the created VampirTrace CUPTI activity context
 */
static vt_cupti_activity_t* vt_cuptiact_createCtxActivity(CUcontext cuCtx)
{
  vt_cupti_activity_t* vtCtxAct = NULL;
  
  /* create new context, as it is not listed */
  vtCtxAct = (vt_cupti_activity_t *)malloc(sizeof(vt_cupti_activity_t));
  if(vtCtxAct == NULL) 
    vt_error_msg("[CUPTI Activity] Could not allocate memory for activity context!");
  vtCtxAct->strmList = NULL;
  vtCtxAct->gpuMemAllocated = 0;
  vtCtxAct->gpuMemList = NULL;
  vtCtxAct->buffer = NULL;
  vtCtxAct->vtLastGPUTime = vt_gpu_init_time;
  vtCtxAct->gpuIdleOn = 1;
  
  /* 
   * Get time synchronization factor between host and GPU time for measurement 
   * interval 
   */
  {
    VT_CUPTI_CALL(cuptiGetTimestamp(&(vtCtxAct->sync.gpuStart)), "cuptiGetTimestamp");
    vtCtxAct->sync.hostStart = vt_pform_wtime();
  }
  
  /* set default CUPTI stream ID (needed for memory usage and idle tracing) */
  VT_CUPTI_CALL(cuptiGetStreamId(cuCtx, NULL, &(vtCtxAct->defaultStrmID)), 
                                 "cuptiGetStreamId");
  
  return vtCtxAct;
}
Example #2
/*
 * Create a VampirTrace CUPTI Activity context.
 * 
 * @param ctxID ID of the CUDA context
 * @param cuCtx the CUDA context
 * @param devID ID of the CUDA device
 * 
 * @return pointer to created VampirTrace CUPTI Activity context
 */
static vt_cuptiact_ctx_t* vt_cuptiact_createContext(uint32_t ctxID, 
                                                    CUcontext cuCtx, 
                                                    uint32_t devID)
{
  vt_cuptiact_ctx_t* vtCtx = NULL;
  
  /* create new context, as it is not listed */
  vtCtx = (vt_cuptiact_ctx_t *)malloc(sizeof(vt_cuptiact_ctx_t));
  if(vtCtx == NULL) 
    vt_error_msg("[CUPTI Activity] Could not allocate memory for context!");
  vtCtx->ctxID = ctxID;
  vtCtx->next = NULL;
  vtCtx->strmList = NULL;
  vtCtx->gpuMemAllocated = 0;
  vtCtx->gpuMemList = NULL;
  vtCtx->buffer = NULL;
  vtCtx->vtLastGPUTime = vt_gpu_init_time;
  vtCtx->gpuIdleOn = 1;
  
  /* 
   * Get time synchronization factor between host and GPU time for measurement 
   * interval 
   */
  {
    VT_CUPTI_CALL(cuptiGetTimestamp(&(vtCtx->sync.gpuStart)), "cuptiGetTimestamp");
    vtCtx->sync.hostStart = vt_pform_wtime();
  }
  
  VT_CHECK_THREAD;
  vtCtx->ptid = VT_MY_THREAD;
  
  if(cuCtx == NULL) CHECK_CU_ERROR(cuCtxGetCurrent(&cuCtx), NULL);
  vtCtx->cuCtx = cuCtx;
  
  /* set default CUPTI stream ID (needed for memory usage and idle tracing) */
  VT_CUPTI_CALL(cuptiGetStreamId(vtCtx->cuCtx, NULL, &(vtCtx->defaultStrmID)), 
                                 "cuptiGetStreamId");
  
  if(devID == (uint32_t)-1){
    CUdevice cuDev;
    
    /* driver API prog: correct cuDev, but result is 201 (invalid context) */
    if(CUDA_SUCCESS != cuCtxGetDevice(&cuDev)){
      devID = VT_NO_ID;
    }else{
      devID = (uint32_t)cuDev;
    }
  }
  
  vtCtx->devID = devID;
  vtCtx->cuDev = devID;
  
  /*vt_cntl_msg(1,"device id: %d", devID);*/
  
  return vtCtx;
}
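
The list management around vt_cuptiact_createContext is not part of this excerpt. A minimal get-or-create sketch over the singly linked list implied by the ctxID and next fields might look as follows; the list head, the function name, and the absence of locking are assumptions, not the original implementation.

/* Hypothetical list head; the real VampirTrace code keeps its own. */
static vt_cuptiact_ctx_t *vt_cuptiact_ctxList = NULL;

/* Sketch: look up a context by its ID, create and prepend it if not listed. */
static vt_cuptiact_ctx_t* vt_cuptiact_getOrCreateCtx(uint32_t ctxID, 
                                                     CUcontext cuCtx, 
                                                     uint32_t devID)
{
  vt_cuptiact_ctx_t *vtCtx = vt_cuptiact_ctxList;
  
  while(vtCtx != NULL){
    if(vtCtx->ctxID == ctxID) return vtCtx;
    vtCtx = vtCtx->next;
  }
  
  /* not listed: create and prepend */
  vtCtx = vt_cuptiact_createContext(ctxID, cuCtx, devID);
  vtCtx->next = vt_cuptiact_ctxList;
  vt_cuptiact_ctxList = vtCtx;
  
  return vtCtx;
}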
/*
 * Print all available counters to stdout.
 *
 * @param capList list of CUDA devices with different capabilities
 */
static void vt_cupti_showAllCounters(vt_cupti_device_t *capList)
{
  CUptiResult cuptiErr = CUPTI_SUCCESS;
  CUpti_EventDomainID *domainId = NULL;
  uint32_t maxDomains = 0;
  uint32_t i;
  size_t size = 0;
  
  while(capList != NULL){
    CUdevice cuDev = capList->cuDev;
    vt_cntl_msg(1, "[CUPTI Events] Available events for device %d (SM %d.%d):", 
                   cuDev, capList->dev_major, capList->dev_minor);
    vt_cntl_msg(1, "Id:Name");
    vt_cntl_msg(1, "Description\n"
         "-------------------------------------------------------------------");
    
    cuptiErr = cuptiDeviceGetNumEventDomains(cuDev, &maxDomains);
    VT_CUPTI_CALL(cuptiErr, "cuptiDeviceGetNumEventDomains");

    if(maxDomains == 0){
      vt_warning("[CUPTI Events] No domain is exposed by dev = %d\n", cuDev);
      return;
    }

    size = sizeof(CUpti_EventDomainID) * maxDomains;
    domainId = (CUpti_EventDomainID*)malloc(size);
    if(domainId == NULL){
      vt_warning("[CUPTI Events] Failed to allocate memory to domain ID");
      return;
    }
    memset(domainId, 0, size);

    cuptiErr = cuptiDeviceEnumEventDomains(cuDev, &size, domainId);
    VT_CUPTI_CALL(cuptiErr, "cuptiDeviceEnumEventDomains");

    /* enum domains */
    for(i = 0; i < maxDomains; i++) vt_cuptievt_enumEvents(cuDev, domainId[i]);

    vt_cntl_msg(1, "------------------------------------------------------");
    
    free(domainId);
    
    capList = capList->next;
  }
  
  /* 
   * As this function is in the call path of the initialization functions
   * (vt_cupti_setupMetrics -> vt_cupti_fillMetricList ->
   * vt_cupti_showAllCounters), mark CUPTI events as initialized and release
   * the lock before exiting.
   */
  vt_cuptievt_initialized = 1;
  VT_CUPTI_UNLOCK();
  exit(0);
}
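
vt_cuptievt_enumEvents is called above but not shown. A hedged sketch of what it might do for a single domain, using only documented CUPTI calls (cuptiEventDomainGetNumEvents, cuptiEventDomainEnumEvents, cuptiEventGetAttribute); the function name suffix and output format are assumptions.

/* Sketch (not the original implementation): print "Id:Name" for every
 * event in one domain of the given device. */
static void vt_cuptievt_enumEvents_sketch(CUdevice cuDev, 
                                          CUpti_EventDomainID domainId)
{
  uint32_t numEvents = 0;
  size_t size;
  CUpti_EventID *eventIds = NULL;
  uint32_t i;
  
  (void)cuDev; /* domain IDs are device specific; the handle is unused here */
  
  VT_CUPTI_CALL(cuptiEventDomainGetNumEvents(domainId, &numEvents),
                "cuptiEventDomainGetNumEvents");
  
  size = numEvents * sizeof(CUpti_EventID);
  eventIds = (CUpti_EventID*)malloc(size);
  if(eventIds == NULL) return;
  
  VT_CUPTI_CALL(cuptiEventDomainEnumEvents(domainId, &size, eventIds),
                "cuptiEventDomainEnumEvents");
  
  for(i = 0; i < numEvents; i++){
    char name[64];
    size_t nameSize = sizeof(name);
    
    VT_CUPTI_CALL(cuptiEventGetAttribute(eventIds[i], CUPTI_EVENT_ATTR_NAME,
                                         &nameSize, name),
                  "cuptiEventGetAttribute");
    vt_cntl_msg(1, "%u:%s", (unsigned int)eventIds[i], name);
  }
  
  free(eventIds);
}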
/*
 * Finalizes CUPTI device.
 * 
 * @param cleanExit 1 to cleanup CUPTI event group, otherwise 0
 */
void vt_cuptievt_finalize_device(uint8_t cleanExit){
  CUptiResult cuptiErr = CUPTI_SUCCESS;
  vt_cupti_ctx_t *vtcuptiCtx = NULL;

  vt_cntl_msg(2, "[CUPTI Events] Finalize device ... ");

  {
    CUcontext cuCtx;
    
#if (defined(CUDA_VERSION) && (CUDA_VERSION < 4000))
    VT_CUDRV_CALL(cuCtxPopCurrent(&cuCtx), "cuCtxPopCurrent");
    VT_CUDRV_CALL(cuCtxPushCurrent(cuCtx), "cuCtxPushCurrent");
#else
    VT_CUDRV_CALL(cuCtxGetCurrent(&cuCtx), "cuCtxGetCurrent");
#endif

    vtcuptiCtx = vt_cupti_removeCtx(&cuCtx);
    if(vtcuptiCtx == NULL) 
      return;
  }
  
  if(vtcuptiCtx->events == NULL)
    return;

  if(cleanExit && vt_gpu_debug != 0){
    /*uint64_t time = vt_pform_wtime();

    vt_cupti_resetCounter(vtcuptiCtx, 0, &time);*/

    /* stop CUPTI counter capturing */
    vt_cuptievt_stop(vtcuptiCtx->events);

    /* destroy all CUPTI event groups, which have been created */
    {
      vt_cupti_evtgrp_t *vtcuptiGrp = vtcuptiCtx->events->vtGrpList;

      while(vtcuptiGrp != NULL){
        cuptiErr = cuptiEventGroupRemoveAllEvents(vtcuptiGrp->evtGrp);
        VT_CUPTI_CALL(cuptiErr, "cuptiEventGroupRemoveAllEvents");

        cuptiErr = cuptiEventGroupDestroy(vtcuptiGrp->evtGrp);
        VT_CUPTI_CALL(cuptiErr, "cuptiEventGroupDestroy");

        vtcuptiGrp = vtcuptiGrp->next;
      }
    }
  }

  /* free VampirTrace CUPTI event context */
  vt_cuptievt_freeEventCtx(vtcuptiCtx->events);
}
static vt_cupti_evtgrp_t* vt_cuptievt_createEvtGrp(vt_cupti_ctx_t *vtcuptiCtx)
{
  CUptiResult cuptiErr = CUPTI_SUCCESS;
  vt_cupti_evtgrp_t *vtcuptiGrp = NULL;

  vtcuptiGrp = (vt_cupti_evtgrp_t*)malloc(sizeof(vt_cupti_evtgrp_t));
  if(vtcuptiGrp == NULL)
    vt_error_msg("[CUPTI Events] Could not allocate memory for event group!");
  vtcuptiGrp->evtNum = 0;
  vtcuptiGrp->enabled = 0;
  vtcuptiGrp->next = NULL;

  /* create initial CUPTI counter group */
  cuptiErr = cuptiEventGroupCreate(vtcuptiCtx->cuCtx, &(vtcuptiGrp->evtGrp), 0);
  VT_CUPTI_CALL(cuptiErr, "cuptiEventGroupCreate");

  {
    size_t evtNum = vtcuptiCtx->events->vtDevCap->evtNum;
    
    vtcuptiGrp->cuptiEvtIDs = 
            (CUpti_EventID *)malloc(evtNum*sizeof(CUpti_EventID));
    
    vtcuptiGrp->vtCIDs = 
            (uint32_t *)malloc(evtNum*sizeof(uint32_t));
  }

  return vtcuptiGrp;
}
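
The cuptiEvtIDs and vtCIDs arrays allocated above are filled elsewhere. A minimal sketch of adding one counter to the created group; the helper name is hypothetical, while cuptiEventGroupAddEvent is the documented CUPTI call.

/* Sketch: register one CUPTI event with the group and remember the mapping
 * from CUPTI event ID to VampirTrace counter ID at the same index. */
static void vt_cuptievt_addEvent_sketch(vt_cupti_evtgrp_t *vtcuptiGrp,
                                        CUpti_EventID cuptiEvtID, 
                                        uint32_t vtCID)
{
  VT_CUPTI_CALL(cuptiEventGroupAddEvent(vtcuptiGrp->evtGrp, cuptiEvtID),
                "cuptiEventGroupAddEvent");
  
  vtcuptiGrp->cuptiEvtIDs[vtcuptiGrp->evtNum] = cuptiEvtID;
  vtcuptiGrp->vtCIDs[vtcuptiGrp->evtNum] = vtCID;
  vtcuptiGrp->evtNum++;
}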
/*
 * Reset the VampirTrace counter values (to zero) for active CUPTI counters.
 *
 * @param vtcuptiEvtCtx pointer to the VampirTrace CUPTI events context
 * @param strmid the stream id for the counter values
 * @param time the VampirTrace timestamps
 */
void vt_cuptievt_resetCounter(vt_cupti_events_t *vtcuptiEvtCtx, uint32_t strmid,
                              uint64_t *time)
{
  size_t i;
  vt_cupti_evtgrp_t *vtcuptiGrp = NULL;

  /* create a VampirTrace CUPTI events context, if it is not available */
  if(vtcuptiEvtCtx == NULL){
    VT_CHECK_THREAD;
    vtcuptiEvtCtx = vt_cuptievt_getOrCreateCurrentCtx(VT_MY_THREAD)->events;
    if(vtcuptiEvtCtx == NULL) return;
  }

  vtcuptiGrp = vtcuptiEvtCtx->vtGrpList;
  
  while(vtcuptiGrp != NULL){
    for(i = 0; i < vtcuptiGrp->evtNum; i++){
      vt_count(strmid, time, *(vtcuptiGrp->vtCIDs+i), 0);
    }
    
    /* reset counter values of this group */
    VT_CUPTI_CALL(cuptiEventGroupResetAllEvents(vtcuptiGrp->evtGrp),
                      "cuptiEventGroupResetAllEvents");
    
    vtcuptiGrp = vtcuptiGrp->next;
  }
}
Example #7
/*
 * Allocate a new buffer and add it to the queue specified by a CUDA context.
 * 
 * @param cuCtx the CUDA context, specifying the queue
 * 
 * @return pointer to the created buffer
 */
static uint8_t* vt_cuptiact_queueNewBuffer(CUcontext cuCtx)
{
  uint8_t *buffer = (uint8_t *)malloc(vt_cuptiact_bufSize);
  if(buffer == NULL)
    vt_error_msg("[CUPTI Activity] Could not allocate memory for activity buffer!");

  VT_CUPTI_CALL(cuptiActivityEnqueueBuffer(cuCtx, 0, ALIGN_BUFFER(buffer, 8), 
                                           vt_cuptiact_bufSize), 
                "cuptiActivityEnqueueBuffer");
  
  return buffer;
}
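
ALIGN_BUFFER is not defined in this excerpt. A definition along the lines of the CUPTI SDK samples (assumed here, and requiring <stdint.h>) rounds the pointer up to the requested power-of-two alignment; since malloc already returns suitably aligned memory, it mainly matters when handing out sub-buffers.

/* Assumed definition, modeled on the CUPTI samples: round `buffer` up to
 * the next multiple of `align` bytes (align must be a power of two). */
#define ALIGN_BUFFER(buffer, align) \
  (((uintptr_t)(buffer) & ((align)-1)) \
    ? ((buffer) + (align) - ((uintptr_t)(buffer) & ((align)-1))) \
    : (buffer))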
Example #8
void vt_cuptiact_enableConcurrentKernel(vt_cupti_ctx_t* vtCtx)
{
    /* 
     * Disable collection of kernels for the given CUDA context. 
     * !!! does not work yet !!!
     *
     * VT_CUPTI_CALL(cuptiActivityDisableContext(cuCtx, CUPTI_ACTIVITY_KIND_KERNEL),
     *               "cuptiActivityDisableContext");
     *
     * Flush the already buffered activities for this CUDA context:
     * vt_cuptiact_flushCtxActivities(cuCtx);
     *
     * Enable collection of kernels for the given CUDA context:
     * VT_CUPTI_CALL(cuptiActivityEnableContext(cuCtx, CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL), 
     *               "cuptiActivityEnableContext");
     */
  
  if((vt_gpu_config & VT_GPU_TRACE_CONCURRENT_KERNEL) 
         != VT_GPU_TRACE_CONCURRENT_KERNEL){

    vt_cntl_msg(2, "[CUPTI Activity] Enable concurrent kernel tracing.");
    
    /*
     * Disable normal (lower overhead) kernel tracing.
     */
    VT_CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_KERNEL),
                  "cuptiActivityDisable");
    
    /* 
     * Flush the already buffered activities for this CUDA context.
     */
    VT_CUPTI_LOCK();
    vt_cuptiact_flushCtxActivities(vtCtx);
    VT_CUPTI_UNLOCK();

    /*
     * Enable concurrent kernel tracing (higher overhead).
     */
    VT_CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL), 
                  "cuptiActivityEnable");
    
    vt_gpu_config |= VT_GPU_TRACE_CONCURRENT_KERNEL;
  }
}
/*
 * Stop CUPTI counter capturing by disabling the CUPTI event groups.
 * 
 * @param vtcuptiEvtCtx pointer to the VampirTrace CUPTI events context
 */
static void vt_cuptievt_stop(vt_cupti_events_t *vtcuptiEvtCtx)
{
  vt_cupti_evtgrp_t *vtcuptiGrp = NULL;

  if(vtcuptiEvtCtx == NULL || vt_gpu_debug) 
    return;

  /* stop counter reading for all groups */
  vtcuptiGrp = vtcuptiEvtCtx->vtGrpList;
  while(vtcuptiGrp != NULL){
    if(vtcuptiGrp->enabled){
      CUptiResult cuptiErr = CUPTI_SUCCESS;
      
      cuptiErr = cuptiEventGroupDisable(vtcuptiGrp->evtGrp);
      VT_CUPTI_CALL(cuptiErr, "cuptiEventGroupDisable");

      vtcuptiGrp->enabled = 0;
    }

    vtcuptiGrp = vtcuptiGrp->next;
  }
}
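
The matching start routine is not shown. A minimal counterpart, assuming the same group list and using the documented cuptiEventGroupEnable call (the name suffix marks it as a sketch):

/* Sketch: enable counter reading for all event groups of the context. */
static void vt_cuptievt_start_sketch(vt_cupti_events_t *vtcuptiEvtCtx)
{
  vt_cupti_evtgrp_t *vtcuptiGrp = NULL;
  
  if(vtcuptiEvtCtx == NULL) 
    return;
  
  vtcuptiGrp = vtcuptiEvtCtx->vtGrpList;
  while(vtcuptiGrp != NULL){
    VT_CUPTI_CALL(cuptiEventGroupEnable(vtcuptiGrp->evtGrp),
                  "cuptiEventGroupEnable");
    vtcuptiGrp->enabled = 1;
    
    vtcuptiGrp = vtcuptiGrp->next;
  }
}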
Example #10
/*
 * Create a VampirTrace CUPTI stream.
 * 
 * @param vtCtx VampirTrace CUPTI context
 * @param cuStrm CUDA stream
 * @param strmID ID of the CUDA stream
 * 
 * @return pointer to created VampirTrace CUPTI stream
 */
vt_cupti_strm_t* vt_cupti_createStream(vt_cupti_ctx_t *vtCtx, 
                                       CUstream cuStrm, uint32_t strmID)
{
  vt_cupti_strm_t *vtStrm = NULL;
  
  if(vtCtx == NULL){
    vt_warning("[CUPTI] Cannot create stream without VampirTrace CUPTI context");
    return NULL;
  }
  
  vtStrm = (vt_cupti_strm_t *)malloc(sizeof(vt_cupti_strm_t));
  if(vtStrm == NULL)
    vt_error_msg("[CUPTI] Could not allocate memory for stream!");
  vtStrm->cuStrm = cuStrm;
  vtStrm->vtLastTime = vt_gpu_init_time;
  vtStrm->destroyed = 0;
  vtStrm->next = NULL;
  
#if defined(VT_CUPTI_ACTIVITY)
  /* create stream by VT CUPTI callbacks implementation (CUstream is given) */
  if(strmID == VT_CUPTI_NO_STREAM_ID){
    if(cuStrm != VT_CUPTI_NO_STREAM){
      VT_CUPTI_CALL(cuptiGetStreamId(vtCtx->cuCtx, cuStrm, &strmID), 
                                     "cuptiGetStreamId");
    }else{
      vt_warning("[CUPTI] Neither CUDA stream nor stream ID given!");
      free(vtStrm);
      return NULL;
    }
  }
#else /* only VT_CUPTI_CALLBACKS is defined */
  if(vtCtx->callbacks != NULL){
    strmID = vtCtx->callbacks->streamsCreated;
    vtCtx->callbacks->streamsCreated++;
  }
#endif

  vtStrm->cuStrmID = strmID;
  
  /* create VampirTrace thread */
  {
    char thread_name[16] = "CUDA";

    if(vt_gpu_stream_reuse){
      if(vtCtx->devID != VT_NO_ID){
        if(-1 == snprintf(thread_name+4, 12, "[%d]", vtCtx->devID))
          vt_cntl_msg(1, "Could not create thread name for CUDA thread!");
      }
    }else{
      if(vtCtx->devID == VT_NO_ID){
        if(-1 == snprintf(thread_name+4, 12, "[?:%d]", strmID))
          vt_cntl_msg(1, "Could not create thread name for CUDA thread!");
      }else{
        if(-1 == snprintf(thread_name+4, 12, "[%d:%d]", vtCtx->devID, strmID))
          vt_cntl_msg(1, "Could not create thread name for CUDA thread!");
      }
    }
    
    VT_CHECK_THREAD;
    vt_gpu_registerThread(thread_name, VT_MY_THREAD, &(vtStrm->vtThrdID));
  }
  
  if(vt_gpu_init_time < vt_start_time)
      vt_gpu_init_time = vt_start_time;
  
  /* for the first stream created for this context */
  if(vtCtx->strmList == NULL){
    if(vt_gpu_trace_idle > 0){      
      /* write enter event for GPU_IDLE on first stream */
      vt_enter(vtStrm->vtThrdID, &vt_gpu_init_time, vt_gpu_rid_idle);
      /*vt_warning("IDLEente: %llu (%d)", vt_gpu_init_time, vtStrm->vtThrdID);*/
      
#if defined(VT_CUPTI_ACTIVITY)
      if(vtCtx->activity != NULL)
        vtCtx->activity->gpuIdleOn = 1;
#endif
    }
    
    /* set the counter value for cudaMalloc to 0 on first stream */
    if(vt_gpu_trace_memusage > 0)
      vt_count(vtStrm->vtThrdID, &vt_gpu_init_time, vt_gpu_cid_memusage, 0);
  }

  if(vt_gpu_trace_kernels > 1){
    /* set count values to zero */
    vt_count(vtStrm->vtThrdID, &vt_gpu_init_time, vt_cupti_cid_blocksPerGrid, 0);
    vt_count(vtStrm->vtThrdID, &vt_gpu_init_time, vt_cupti_cid_threadsPerBlock, 0);
    vt_count(vtStrm->vtThrdID, &vt_gpu_init_time, vt_cupti_cid_threadsPerKernel, 0);
  }
  
  /* prepend the stream 
  vtStrm->next = vtCtx->strmList;
  vtCtx->strmList = vtStrm;*/
  
  return vtStrm;
}
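
For reference, the snprintf calls above yield thread names such as "CUDA[0]", "CUDA[0:2]", and "CUDA[?:2]". A standalone illustration (not VampirTrace code):

#include <stdio.h>

int main(void)
{
  char thread_name[16] = "CUDA";

  /* device ID 0, stream ID 2 known */
  snprintf(thread_name + 4, 12, "[%d:%d]", 0, 2);
  printf("%s\n", thread_name);              /* prints "CUDA[0:2]" */

  /* device ID unknown, stream ID 2 */
  snprintf(thread_name + 4, 12, "[?:%d]", 2);
  printf("%s\n", thread_name);              /* prints "CUDA[?:2]" */

  return 0;
}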
Example #11
void vt_cupti_events_finalizeContext(vt_cupti_ctx_t *vtCtx)
{
  uint64_t time = vt_pform_wtime();
  vt_cupti_strm_t *curStrm = NULL;
  vt_cupti_evtgrp_t *vtcuptiGrp = NULL;
  
  if(vtCtx == NULL || vtCtx->events == NULL)
    return;
  
  /* These CUPTI calls may fail, as CUPTI has implicitly destroyed something */
  if(vt_gpu_debug == 0){
    curStrm = vtCtx->strmList;


    /* for all streams of this context */
    while(curStrm != NULL){

      /* ensure increasing time stamps */
      if(time < curStrm->vtLastTime){
        curStrm = curStrm->next;
        continue;
      }

      vt_cuptievt_resetCounter(vtCtx->events, curStrm->vtThrdID, &time);

      curStrm = curStrm->next;
    }

    /* stop CUPTI counter capturing */
    vt_cuptievt_stop(vtCtx->events);

    /* destroy all CUPTI event groups, which have been created */
    vtcuptiGrp = vtCtx->events->vtGrpList;

    while(vtcuptiGrp != NULL){
      VT_CUPTI_CALL(cuptiEventGroupRemoveAllEvents(vtcuptiGrp->evtGrp), 
                    "cuptiEventGroupRemoveAllEvents");

      VT_CUPTI_CALL(cuptiEventGroupDestroy(vtcuptiGrp->evtGrp), 
                    "cuptiEventGroupDestroy");

      vtcuptiGrp = vtcuptiGrp->next;
    }
  }else{
    /* set at least the VampirTrace counter to zero */
    curStrm = vtCtx->strmList;

    /* for all streams of this context */
    while(curStrm != NULL){

      /* ensure increasing time stamps */
      if(time < curStrm->vtLastTime){
        curStrm = curStrm->next;
        continue;
      }
      
      vtcuptiGrp = vtCtx->events->vtGrpList;

      while(vtcuptiGrp != NULL){
        size_t i;
        
        for(i = 0; i < vtcuptiGrp->evtNum; i++){
          vt_count(curStrm->vtThrdID, &time, *(vtcuptiGrp->vtCIDs+i), 0);
        }

        vtcuptiGrp = vtcuptiGrp->next;
      }
      
      curStrm = curStrm->next;
    }
  }
  
  /* free previously allocated memory */
  vt_cuptievt_freeEventCtx(vtCtx->events);
}
Example #12
/*
 * Request the CUPTI counter values and write them to the given VampirTrace
 * stream with the given timestamps.
 *
 * @param vtcuptiEvtCtx pointer to the VampirTrace CUPTI events context
 * @param strmid the stream id for the counter values
 * @param time the VampirTrace timestamps
 */
void vt_cuptievt_writeCounter(vt_cupti_events_t *vtcuptiEvtCtx, uint32_t strmid,
                              uint64_t *time)
{
  CUptiResult cuptiErr = CUPTI_SUCCESS;
  vt_cupti_evtgrp_t *vtcuptiGrp = NULL;

  size_t bufferSizeBytes;
  size_t arraySizeBytes;
  size_t numCountersRead;

  if(vtcuptiEvtCtx == NULL){
    VT_CHECK_THREAD;
    vtcuptiEvtCtx = vt_cuptievt_getOrCreateCurrentCtx(VT_MY_THREAD)->events;
    if(vtcuptiEvtCtx == NULL) return;
  }

  vtcuptiGrp = vtcuptiEvtCtx->vtGrpList;
  while(vtcuptiGrp != NULL){
    /* read events only, if the event group is enabled */
    if(vtcuptiGrp->enabled){

      bufferSizeBytes = vtcuptiGrp->evtNum * sizeof(uint64_t);
      arraySizeBytes = vtcuptiGrp->evtNum * sizeof(CUpti_EventID);

      /* read events */
      cuptiErr = cuptiEventGroupReadAllEvents(vtcuptiGrp->evtGrp,
                                              CUPTI_EVENT_READ_FLAG_NONE,
                                              &bufferSizeBytes, vtcuptiEvtCtx->counterData,
                                              &arraySizeBytes, vtcuptiEvtCtx->cuptiEvtIDs,
                                              &numCountersRead);
      VT_CUPTI_CALL(cuptiErr, "cuptiEventGroupReadAllEvents");
      
      if(vtcuptiGrp->evtNum != numCountersRead){
        vt_error_msg("[CUPTI Events] %d counter reads, %d metrics specified in "
                     "VT_CUPTI_METRICS!", numCountersRead, vtcuptiGrp->evtNum);
      }

      /* For all events of the event group: map added event IDs to just read event
       * IDs, as the order may not be the same. For small numbers of counter reads
       * this simple mapping should be fast enough.
       */
      {
        size_t j;

        for(j = 0; j < numCountersRead; j++){
          size_t i;
          for(i = 0; i < vtcuptiGrp->evtNum; i++){
            if(vtcuptiEvtCtx->cuptiEvtIDs[j] == *(vtcuptiGrp->cuptiEvtIDs+i)){
              /* write the counter value as VampirTrace counter */
              vt_count(strmid, time, *(vtcuptiGrp->vtCIDs+i), vtcuptiEvtCtx->counterData[j]);
            }
          }
        }
      }

    }

    vtcuptiGrp = vtcuptiGrp->next;
  }
  
}
Example #13
/* no need to lock, because it is only called by vt_cupti_callback_init() */
void vt_cupti_activity_init()
{
  /*if(!vt_cuptiact_initialized){
    vt_cupti_init();
    VT_CUPTI_LOCK();*/
    if(!vt_cuptiact_initialized){
      vt_cntl_msg(2, "[CUPTI Activity] Initializing ... ");
      
      {        
        vt_cuptiact_bufSize = vt_env_cudatrace_bsize();
        
        /* no buffer size < 1024 bytes allowed (see CUPTI documentation) */
        if(vt_cuptiact_bufSize < 1024){
          if(vt_cuptiact_bufSize > 0){
            vt_warning("[CUPTI Activity] Buffer size has to be at least 1024 "
                       "bytes! %zu bytes were requested; using the default "
                       "instead.", vt_cuptiact_bufSize);
          }
          vt_cuptiact_bufSize = VT_CUPTI_ACT_DEFAULT_BSIZE;
        }
        
        /* queue a global buffer to initialize CUPTI before CUDA init 
        vt_cuptiact_buffer = (uint8_t *)malloc(vt_cuptiact_bufSize);
        VT_CUPTI_CALL(cuptiActivityEnqueueBuffer(NULL, 0, 
                                      vt_cuptiact_buffer, vt_cuptiact_bufSize), 
                      "cuptiActivityEnqueueBuffer");*/
      }
      
#if (defined(VT_MT) || defined(VT_HYB))
      VTTHRD_LOCK_IDS();
#endif
      if(vt_gpu_trace_kernels > 1){
        /* define kernel counters */
        vt_cuptiact_cid_knStaticSharedMem = vt_def_counter(VT_MASTER_THREAD, 
                      "staticSharedMemory", "Bytes",
                      VT_CNTR_ABS | VT_CNTR_NEXT | VT_CNTR_UNSIGNED, 
                      vt_cupti_cgid_cuda_kernel, 0);
        vt_cuptiact_cid_knDynamicSharedMem = vt_def_counter(VT_MASTER_THREAD, 
                      "dynamicSharedMemory", "Bytes",
                      VT_CNTR_ABS | VT_CNTR_NEXT | VT_CNTR_UNSIGNED, 
                      vt_cupti_cgid_cuda_kernel, 0);
        vt_cuptiact_cid_knLocalMemTotal = vt_def_counter(VT_MASTER_THREAD, 
                      "localMemoryPerKernel", "Bytes",
                      VT_CNTR_ABS | VT_CNTR_NEXT | VT_CNTR_UNSIGNED, 
                      vt_cupti_cgid_cuda_kernel, 0);
        vt_cuptiact_cid_knRegistersPerThread = vt_def_counter(VT_MASTER_THREAD, 
                      "registersPerThread", "#",
                      VT_CNTR_ABS | VT_CNTR_NEXT | VT_CNTR_UNSIGNED, 
                      vt_cupti_cgid_cuda_kernel, 0);
      }
     
      /* define region for GPU activity flush */
      vt_cuptiact_rid_flush = vt_def_region(VT_MASTER_THREAD, "flushActivities", 
                        VT_NO_ID, VT_NO_LNO, VT_NO_LNO, "VT_CUDA", VT_FUNCTION);
#if (defined(VT_MT) || defined(VT_HYB))
      VTTHRD_UNLOCK_IDS();
#endif
      
      /*** enable the activities ***/
      /* enable kernel tracing */
      if(vt_gpu_trace_kernels > 0){
#if (defined(CUPTI_API_VERSION) && (CUPTI_API_VERSION >= 3))
        if((vt_gpu_config & VT_GPU_TRACE_CONCURRENT_KERNEL) 
           == VT_GPU_TRACE_CONCURRENT_KERNEL){
          /*VT_CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL), 
                        "cuptiActivityEnable");*/
          VT_CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL), 
                        "cuptiActivityEnable");
        }else
#endif
          VT_CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL), 
                        "cuptiActivityEnable");
      }
      
      /* enable memory copy tracing */
      if(vt_gpu_trace_mcpy){
        VT_CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY), 
                      "cuptiActivityEnable");
      }
      
      /* register the finalize function of VampirTrace CUPTI to be called before
       * the program exits 
      atexit(vt_cupti_activity_finalize);*/

      vt_cuptiact_initialized = 1;
      /*VT_CUPTI_UNLOCK();
    }*/
  }
}
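
The finalize routine registered via the commented-out atexit call is not shown. A hedged sketch, assuming it only needs to disable what was enabled above (the function body is an assumption; cuptiActivityDisable is the documented CUPTI call):

/* Sketch: disable the activity kinds enabled in vt_cupti_activity_init(). */
static void vt_cupti_activity_finalize_sketch(void)
{
  if(!vt_cuptiact_initialized) 
    return;
  
  if(vt_gpu_trace_kernels > 0){
    if((vt_gpu_config & VT_GPU_TRACE_CONCURRENT_KERNEL) 
       == VT_GPU_TRACE_CONCURRENT_KERNEL)
      VT_CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL), 
                    "cuptiActivityDisable");
    else
      VT_CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_KERNEL), 
                    "cuptiActivityDisable");
  }
  
  if(vt_gpu_trace_mcpy)
    VT_CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY), 
                  "cuptiActivityDisable");
  
  vt_cuptiact_initialized = 0;
}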
Example #14
void vt_cuptiact_flushCtxActivities(vt_cupti_ctx_t *vtCtx)
{ 
  CUptiResult status;
  uint8_t *buffer = NULL;
  size_t bufSize;
  CUpti_Activity *record = NULL;
  uint64_t hostStop, gpuStop;
  uint32_t ptid = VT_NO_ID;
  vt_cupti_activity_t *vtcuptiActivity = NULL;
  
  /* check for VampirTrace CUPTI context */
  if(vtCtx == NULL || vtCtx->activity == NULL){
    vt_warning("[CUPTI Activity] Context not found!");
    return;
  }
  vtcuptiActivity = vtCtx->activity;
  
  /* check if the buffer contains records: continue only if the query
   * succeeded or the buffer is already full (CUPTI_ERROR_MAX_LIMIT_REACHED) */
  status = cuptiActivityQueryBuffer(vtCtx->cuCtx, 0, &bufSize);
  if(status != CUPTI_SUCCESS && status != CUPTI_ERROR_MAX_LIMIT_REACHED)
    return;

  /* expose VampirTrace CUPTI activity flush as measurement overhead */
  VT_CHECK_THREAD;
  ptid = VT_MY_THREAD;
  hostStop = vt_pform_wtime();
  vt_enter(ptid, &hostStop, vt_cuptiact_rid_flush);
  
  vt_cntl_msg(2,"[CUPTI Activity] Handle context %d activities", vtCtx->cuCtx);
  
  /* lock the whole buffer flush 
  VT_CUPTI_LOCK();*/
  
  /* dump the contents of the global queue */
  VT_CUPTI_CALL(cuptiActivityDequeueBuffer(vtCtx->cuCtx, 0, &buffer, 
                &bufSize), "cuptiActivityDequeueBuffer");

  /* 
   * Get time synchronization factor between host and GPU time for measured 
   * period 
   */
  {
    VT_CUPTI_CALL(cuptiGetTimestamp(&gpuStop), "cuptiGetTimestamp");
    hostStop = vt_pform_wtime();
    vtcuptiActivity->sync.hostStop = hostStop;
    
    vtcuptiActivity->sync.factor = (double)(hostStop - vtcuptiActivity->sync.hostStart)
                       /(double)(gpuStop - vtcuptiActivity->sync.gpuStart);
  }

  /*vt_cntl_msg(1, "hostStop: %llu , gpuStop: %llu", hostStopTS, gpuStopTS);
  vt_cntl_msg(1, "factor: %lf", syncFactor);*/
  
  do{
    status = cuptiActivityGetNextRecord(buffer, bufSize, &record);
    if(status == CUPTI_SUCCESS) {
      vt_cuptiact_writeRecord(record, vtCtx);
    }else if(status == CUPTI_ERROR_MAX_LIMIT_REACHED){
      break;
    }else{
      VT_CUPTI_CALL(status, "cuptiActivityGetNextRecord");
    }
  }while(1);

  /* report any records dropped from the global queue */
  {
    size_t dropped;
    
    VT_CUPTI_CALL(cuptiActivityGetNumDroppedRecords(vtCtx->cuCtx, 0, &dropped), 
                  "cuptiActivityGetNumDroppedRecords");
    if(dropped != 0)
      vt_warning("[CUPTI Activity] Dropped %u records. Current buffer size: %llu bytes\n"
                 "To avoid dropping of records increase the buffer size!\n"
                 "Proposed minimum VT_CUDATRACE_BUFFER_SIZE=%llu", 
                 (unsigned int)dropped, 
                 (unsigned long long)vt_cuptiact_bufSize, 
                 (unsigned long long)(vt_cuptiact_bufSize + dropped/2 * 
                 (sizeof(CUpti_ActivityKernel) + sizeof(CUpti_ActivityMemcpy))));
  }
  
  /* enter GPU idle region after last kernel, if exited before */
  if(vtcuptiActivity->gpuIdleOn == 0){
    vt_enter(vtcuptiActivity->strmList->vtThrdID, 
             &(vtcuptiActivity->vtLastGPUTime), vt_gpu_rid_idle);
    vtcuptiActivity->gpuIdleOn = 1;
    /*vt_warning("IDLfente: %llu (%d)", vtCtx->vtLastGPUTime, vtCtx->strmList->vtThrdID);*/
  }
  
  /* enqueue buffer again */
  VT_CUPTI_CALL(cuptiActivityEnqueueBuffer(vtCtx->cuCtx, 0, buffer, 
                vt_cuptiact_bufSize), "cuptiActivityEnqueueBuffer");
  
    
  /* set new synchronization point */
  vtcuptiActivity->sync.hostStart = hostStop;
  vtcuptiActivity->sync.gpuStart = gpuStop;
  
  /*VT_CUPTI_UNLOCK();*/
  
  /* use local variable hostStop to write exit event for activity flush */
  hostStop = vt_pform_wtime();
  vt_exit(ptid, &hostStop);
}
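
vt_cuptiact_writeRecord, which consumes the dequeued records above, is not part of this excerpt. A hedged sketch of how it could map a record's GPU timestamp onto the host timer using the synchronization fields maintained by this flush (the helper name is hypothetical):

/* Sketch (not the original implementation): interpolate from the last
 * synchronization point using the measured host/GPU drift factor. */
static uint64_t vt_cuptiact_convertTime_sketch(vt_cupti_activity_t *vtcuptiActivity, 
                                               uint64_t gpuTime)
{
  return vtcuptiActivity->sync.hostStart
         + (uint64_t)((double)(gpuTime - vtcuptiActivity->sync.gpuStart)
                      * vtcuptiActivity->sync.factor);
}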