Example #1
0
/*
 * Initialize CUPTI activity tracing: enqueue initial buffers, enable the
 * activity record kinds of interest, and subscribe the trace callback for
 * resource and synchronization domain events.
 *
 * Uses the globals queueNewBuffer, traceCallback and startTimestamp.
 */
void
initTrace()
{
  // Enqueue a couple of buffers in the global queue
  queueNewBuffer(NULL, 0);
  queueNewBuffer(NULL, 0);

  // device activity record is created when CUDA initializes, so we
  // want to enable it before cuInit() or any CUDA runtime call
  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE));

  /* enable the remaining activity kinds; wrap each call in CUPTI_CALL so a
     failed enable is reported instead of being silently ignored */
  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT));
  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY));
  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));
  CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL));

  CUpti_SubscriberHandle subscriber;
  CUPTI_CALL(cuptiSubscribe(&subscriber, (CUpti_CallbackFunc)traceCallback, NULL));

  /* receive callbacks for context/stream creation and synchronization */
  CUPTI_CALL(cuptiEnableDomain(1, subscriber, CUPTI_CB_DOMAIN_RESOURCE));
  CUPTI_CALL(cuptiEnableDomain(1, subscriber, CUPTI_CB_DOMAIN_SYNCHRONIZE));

  /* record the trace start time for later timestamp normalization */
  CUPTI_CALL(cuptiGetTimestamp(&startTimestamp));
}
Example #2
0
/*
 * Create a VampirTrace CUPTI activity context.
 * 
 * @return pointer to created VampirTrace CUPTI Activity context
 */
/*
 * Create a VampirTrace CUPTI activity context.
 *
 * @param cuCtx CUDA context used to look up the default CUPTI stream ID
 *
 * @return pointer to created VampirTrace CUPTI Activity context
 */
static vt_cupti_activity_t* vt_cuptiact_createCtxActivity(CUcontext cuCtx)
{
  /* create new context, as it is not listed;
     sizeof *ptr (no cast) keeps the allocation tied to the variable's type */
  vt_cupti_activity_t* vtCtxAct = malloc(sizeof *vtCtxAct);

  if(vtCtxAct == NULL)
    /* NOTE(review): assumes vt_error_msg() aborts; otherwise the
       initialization below would dereference NULL — confirm */
    vt_error_msg("[CUPTI Activity] Could not allocate memory for activity context!");

  /* start with empty stream/memory lists and no activity buffer */
  vtCtxAct->strmList = NULL;
  vtCtxAct->gpuMemAllocated = 0;
  vtCtxAct->gpuMemList = NULL;
  vtCtxAct->buffer = NULL;
  vtCtxAct->vtLastGPUTime = vt_gpu_init_time;
  vtCtxAct->gpuIdleOn = 1;

  /*
   * Get time synchronization factor between host and GPU time for measurement
   * interval: sample both clocks back-to-back as the interval start point.
   */
  {
    VT_CUPTI_CALL(cuptiGetTimestamp(&(vtCtxAct->sync.gpuStart)), "cuptiGetTimestamp");
    vtCtxAct->sync.hostStart = vt_pform_wtime();
  }

  /* set default CUPTI stream ID (needed for memory usage and idle tracing) */
  VT_CUPTI_CALL(cuptiGetStreamId(cuCtx, NULL, &(vtCtxAct->defaultStrmID)),
                                 "cuptiGetStreamId");

  return vtCtxAct;
}
Example #3
0
/*
 * Create a VampirTrace CUPTI Activity context.
 * 
 * @param ctxID ID of the CUDA context
 * @param devID ID of the CUDA device
 * 
 * @return pointer to created VampirTrace CUPTI Activity context
 */
/*
 * Create a VampirTrace CUPTI Activity context.
 *
 * @param ctxID ID of the CUDA context
 * @param cuCtx CUDA context handle; if NULL, the current context is queried
 * @param devID ID of the CUDA device, or (uint32_t)-1 to auto-detect it
 *
 * @return pointer to created VampirTrace CUPTI Activity context
 */
static vt_cuptiact_ctx_t* vt_cuptiact_createContext(uint32_t ctxID,
                                                    CUcontext cuCtx,
                                                    uint32_t devID)
{
  /* create new context, as it is not listed;
     sizeof *ptr (no cast) keeps the allocation tied to the variable's type */
  vt_cuptiact_ctx_t* vtCtx = malloc(sizeof *vtCtx);

  if(vtCtx == NULL)
    /* NOTE(review): assumes vt_error_msg() aborts; otherwise the
       initialization below would dereference NULL — confirm */
    vt_error_msg("[CUPTI Activity] Could not allocate memory for context!");

  /* start with empty stream/memory lists and no activity buffer */
  vtCtx->ctxID = ctxID;
  vtCtx->next = NULL;
  vtCtx->strmList = NULL;
  vtCtx->gpuMemAllocated = 0;
  vtCtx->gpuMemList = NULL;
  vtCtx->buffer = NULL;
  vtCtx->vtLastGPUTime = vt_gpu_init_time;
  vtCtx->gpuIdleOn = 1;

  /*
   * Get time synchronization factor between host and GPU time for measurement
   * interval: sample both clocks back-to-back as the interval start point.
   */
  {
    VT_CUPTI_CALL(cuptiGetTimestamp(&(vtCtx->sync.gpuStart)), "cuptiGetTimestamp");
    vtCtx->sync.hostStart = vt_pform_wtime();
  }

  /* remember the VampirTrace thread that created this context */
  VT_CHECK_THREAD;
  vtCtx->ptid = VT_MY_THREAD;

  /* fall back to the current CUDA context if none was supplied */
  if(cuCtx == NULL) CHECK_CU_ERROR(cuCtxGetCurrent(&cuCtx), NULL);
  vtCtx->cuCtx = cuCtx;

  /* set default CUPTI stream ID (needed for memory usage and idle tracing) */
  VT_CUPTI_CALL(cuptiGetStreamId(vtCtx->cuCtx, NULL, &(vtCtx->defaultStrmID)),
                                 "cuptiGetStreamId");

  /* auto-detect the device ID if the caller did not provide one */
  if(devID == (uint32_t)-1){
    CUdevice cuDev;

    /* driver API prog: correct cuDev, but result is 201 (invalid context) */
    if(CUDA_SUCCESS != cuCtxGetDevice(&cuDev)){
      devID = VT_NO_ID;
    }else{
      devID = (uint32_t)cuDev;
    }
  }

  vtCtx->devID = devID;
  /* NOTE(review): cuDev is set from devID, so it may be VT_NO_ID when
     detection failed — callers must tolerate that */
  vtCtx->cuDev = devID;

  /*vt_cntl_msg(1,"device id: %d", devID);*/

  return vtCtx;
}
Example #4
0
/*
 * Flush all buffered CUPTI activity records of the given VampirTrace CUPTI
 * context: dequeue the context's activity buffer, update the host/GPU time
 * synchronization factor, write every record, report drops, and re-enqueue
 * the buffer. The flush itself is exposed as a measurement-overhead region
 * (vt_enter/vt_exit around the work).
 *
 * @param vtCtx VampirTrace CUPTI context to flush; silently returns (with a
 *              warning) if vtCtx or its activity member is NULL
 */
void vt_cuptiact_flushCtxActivities(vt_cupti_ctx_t *vtCtx)
{ 
  CUptiResult status;
  uint8_t *buffer = NULL;
  size_t bufSize;
  CUpti_Activity *record = NULL;
  uint64_t hostStop, gpuStop;
  uint32_t ptid = VT_NO_ID;
  vt_cupti_activity_t *vtcuptiActivity = NULL;
  
  /* check for VampirTrace CUPTI context */
  if(vtCtx == NULL || vtCtx->activity == NULL){
    vt_warning("[CUPTI Activity] Context not found!");
    return;
  }
  vtcuptiActivity = vtCtx->activity;
  
  /* check if the buffer contains records */
  status = cuptiActivityQueryBuffer(vtCtx->cuCtx, 0, &bufSize);
  if(status != CUPTI_SUCCESS){
    /* NOTE(review): because the two operands differ, this condition is
       equivalent to (status != CUPTI_ERROR_MAX_LIMIT_REACHED) — i.e. bail
       out on an empty queue or any other error, and continue only when the
       buffer hit its size limit. The first clause is redundant; confirm
       the intent before simplifying. */
    if(CUPTI_ERROR_QUEUE_EMPTY == status || 
       CUPTI_ERROR_MAX_LIMIT_REACHED != status){
      return;
    }
  }

  /* expose VampirTrace CUPTI activity flush as measurement overhead */
  VT_CHECK_THREAD;
  ptid = VT_MY_THREAD;
  hostStop = vt_pform_wtime();
  vt_enter(ptid, &hostStop, vt_cuptiact_rid_flush);
  
  vt_cntl_msg(2,"[CUPTI Activity] Handle context %d activities", vtCtx->cuCtx);
  
  /* lock the whole buffer flush 
  VT_CUPTI_LOCK();*/
  
  /* dump the contents of the global queue */
  VT_CUPTI_CALL(cuptiActivityDequeueBuffer(vtCtx->cuCtx, 0, &buffer, 
                &bufSize), "cuptiActivityDequeueBuffer");

  /* 
   * Get time synchronization factor between host and GPU time for measured 
   * period: sample both clocks back-to-back at the interval end, then scale
   * GPU durations by (host elapsed)/(GPU elapsed).
   */
  {
    VT_CUPTI_CALL(cuptiGetTimestamp(&gpuStop), "cuptiGetTimestamp");
    hostStop = vt_pform_wtime();
    vtcuptiActivity->sync.hostStop = hostStop;
    
    vtcuptiActivity->sync.factor = (double)(hostStop - vtcuptiActivity->sync.hostStart)
                       /(double)(gpuStop - vtcuptiActivity->sync.gpuStart);
  }

  /*vt_cntl_msg(1, "hostStop: %llu , gpuStop: %llu", hostStopTS, gpuStopTS);
  vt_cntl_msg(1, "factor: %lf", syncFactor);*/
  
  /* iterate over all records in the dequeued buffer; MAX_LIMIT_REACHED
     here means "no more records", any other non-success status is fatal */
  do{
    status = cuptiActivityGetNextRecord(buffer, bufSize, &record);
    if(status == CUPTI_SUCCESS) {
      vt_cuptiact_writeRecord(record, vtCtx);
    }else if(status == CUPTI_ERROR_MAX_LIMIT_REACHED){
      break;
    }else{
      VT_CUPTI_CALL(status, "cuptiActivityGetNextRecord");
    }
  }while(1);

  /* report any records dropped from the global queue */
  {
    size_t dropped;
    
    VT_CUPTI_CALL(cuptiActivityGetNumDroppedRecords(vtCtx->cuCtx, 0, &dropped), 
                  "cuptiActivityGetNumDroppedRecords");
    if(dropped != 0)
      /* propose a larger buffer sized by the average of the two biggest
         record types per dropped record pair */
      vt_warning("[CUPTI Activity] Dropped %u records. Current buffer size: %llu bytes\n"
                 "To avoid dropping of records increase the buffer size!\n"
                 "Proposed minimum VT_CUDATRACE_BUFFER_SIZE=%llu", 
                 (unsigned int)dropped, vt_cuptiact_bufSize, 
                 vt_cuptiact_bufSize + dropped/2 * 
                 (sizeof(CUpti_ActivityKernel) + sizeof(CUpti_ActivityMemcpy)));
  }
  
  /* enter GPU idle region after last kernel, if exited before */
  if(vtcuptiActivity->gpuIdleOn == 0){
    vt_enter(vtcuptiActivity->strmList->vtThrdID, 
             &(vtcuptiActivity->vtLastGPUTime), vt_gpu_rid_idle);
    vtcuptiActivity->gpuIdleOn = 1;
    /*vt_warning("IDLfente: %llu (%d)", vtCtx->vtLastGPUTime, vtCtx->strmList->vtThrdID);*/
  }
  
  /* enqueue buffer again so CUPTI can keep recording into it */
  VT_CUPTI_CALL(cuptiActivityEnqueueBuffer(vtCtx->cuCtx, 0, buffer, 
                vt_cuptiact_bufSize), "cuptiActivityEnqueueBuffer");
  
    
  /* set new synchronization point: this flush's end becomes the next
     interval's start */
  vtcuptiActivity->sync.hostStart = hostStop;
  vtcuptiActivity->sync.gpuStart = gpuStop;
  
  /*VT_CUPTI_UNLOCK();*/
  
  /* use local variable hostStop to write exit event for activity flush */
  hostStop = vt_pform_wtime();
  vt_exit(ptid, &hostStop);
}