void vt_cuptiact_markStreamAsDestroyed(CUcontext cuCtx, uint32_t strmID)
{
  vt_cupti_ctx_t *vtCtx = NULL;
  vt_cuptiact_strm_t *currStrm = NULL;

  VT_CUPTI_LOCK();
  
  if(cuCtx == NULL){
    vt_warning("[CUPTI Activity] No CUDA context given in "
               "vt_cuptiact_markStreamAsDestroyed()!");
    VT_CUPTI_UNLOCK();
    return;
  }
  
  vtCtx = vt_cupti_getCtxNoLock(cuCtx);
  
  if(vtCtx == NULL){
    vt_warning("[CUPTI Activity] No context found in "
               "vt_cuptiact_markStreamAsDestroyed()!");
    VT_CUPTI_UNLOCK();
    return;
  }
  
  /* the activity context has to exist before its stream list is accessed */
  if(vtCtx->activity == NULL){
    VT_CUPTI_UNLOCK();
    return;
  }
  
  currStrm = vtCtx->activity->strmList;
  while(currStrm != NULL){
    if(currStrm->strmID == strmID){
      currStrm->destroyed = 1;
      VT_CUPTI_UNLOCK();
      return;
    }
    currStrm = currStrm->next;
  }
  
  VT_CUPTI_UNLOCK();
}
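/*
 * A usage sketch (not part of the original sources): how a CUPTI driver-API
 * callback on stream destruction might flag the stream for reuse. The
 * handler itself (handleStreamDestroy) is hypothetical; only
 * vt_cuptiact_markStreamAsDestroyed() above and cuptiGetStreamId() are taken
 * from this code base.
 */
static void handleStreamDestroy(CUcontext cuCtx, CUstream cuStrm)
{
  uint32_t strmID;

  /* map the CUDA stream handle to its CUPTI stream ID */
  VT_CUPTI_CALL(cuptiGetStreamId(cuCtx, cuStrm, &strmID),
                "cuptiGetStreamId");

  /* mark the VampirTrace stream as destroyed, so it can be reused later */
  vt_cuptiact_markStreamAsDestroyed(cuCtx, strmID);
}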
/*
 * Print all available counters to stdout.
 *
 * @param capList list of CUDA devices with different capabilities
 */
static void vt_cupti_showAllCounters(vt_cupti_device_t *capList)
{
  CUptiResult cuptiErr = CUPTI_SUCCESS;
  CUpti_EventDomainID *domainId = NULL;
  uint32_t maxDomains = 0;
  uint32_t i;
  size_t size = 0;
  
  while(capList != NULL){
    CUdevice cuDev = capList->cuDev;
    vt_cntl_msg(1, "[CUPTI Events] Available events for device %d (SM %d.%d):", 
                   cuDev, capList->dev_major, capList->dev_minor);
    vt_cntl_msg(1, "Id:Name");
    vt_cntl_msg(1, "Description\n"
         "-------------------------------------------------------------------");
    
    cuptiErr = cuptiDeviceGetNumEventDomains(cuDev, &maxDomains);
    VT_CUPTI_CALL(cuptiErr, "cuptiDeviceGetNumEventDomains");

    if(maxDomains == 0){
      vt_warning("[CUPTI Events] No domain is exposed by dev = %d", cuDev);
      /* do not return here: the lock has to be released and exit(0) called
       * at the end of this function */
      capList = capList->next;
      continue;
    }

    size = sizeof(CUpti_EventDomainID) * maxDomains;
    domainId = (CUpti_EventDomainID*)malloc(size);
    if(domainId == NULL){
      vt_warning("[CUPTI Events] Failed to allocate memory for domain IDs");
      break; /* fall through to unlock and exit */
    }
    memset(domainId, 0, size);

    cuptiErr = cuptiDeviceEnumEventDomains(cuDev, &size, domainId);
    VT_CUPTI_CALL(cuptiErr, "cuptiDeviceEnumEventDomains");

    /* enum domains */
    for(i = 0; i < maxDomains; i++) vt_cuptievt_enumEvents(cuDev, domainId[i]);

    vt_cntl_msg(1, "------------------------------------------------------");
    
    free(domainId);
    
    capList = capList->next;
  }
  
  /* As this function is in the call path of the initialization functions
   *   vt_cupti_setupMetrics
   *   -> vt_cupti_fillMetricList
   *   -> vt_cupti_showAllCounters
   * mark the CUPTI events layer as initialized and release the lock before
   * exiting. */
  vt_cuptievt_initialized = 1;
  VT_CUPTI_UNLOCK();
  exit(0);
}
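/*
 * vt_cuptievt_enumEvents() is called above but not included in this
 * collection. A minimal sketch of such a helper, assuming the CUPTI event
 * enumeration API (cuptiEventDomainGetNumEvents, cuptiEventDomainEnumEvents,
 * cuptiEventGetAttribute); note that early CUPTI versions qualified these
 * calls with a CUdevice, as the VTCUPTIEVENTGETATTRIBUTE compatibility macro
 * further below suggests. Buffer size and output format are guesses.
 */
static void vt_cuptievt_enumEvents(CUdevice cuDev, CUpti_EventDomainID domainId)
{
  uint32_t maxEvents = 0;
  size_t size = 0;
  CUpti_EventID *eventId = NULL;
  uint32_t i;

  (void)cuDev; /* the device handle is not needed in this sketch */

  /* query the number of events available in this domain */
  VT_CUPTI_CALL(cuptiEventDomainGetNumEvents(domainId, &maxEvents),
                "cuptiEventDomainGetNumEvents");

  size = sizeof(CUpti_EventID) * maxEvents;
  eventId = (CUpti_EventID*)malloc(size);
  if(eventId == NULL){
    vt_warning("[CUPTI Events] Failed to allocate memory for event IDs");
    return;
  }

  VT_CUPTI_CALL(cuptiEventDomainEnumEvents(domainId, &size, eventId),
                "cuptiEventDomainEnumEvents");

  /* print ID and name of every event in the domain */
  for(i = 0; i < maxEvents; i++){
    char name[128];
    size_t nameSize = sizeof(name);

    VT_CUPTI_CALL(cuptiEventGetAttribute(eventId[i], CUPTI_EVENT_ATTR_NAME,
                                         &nameSize, name),
                  "cuptiEventGetAttribute");
    vt_cntl_msg(1, "%u:%s", (unsigned int)eventId[i], name);
  }

  free(eventId);
}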
int vt_env_exectrace()
{
  static int exectrace = -1;
  char* tmp;

  if (exectrace == -1)
    {
      tmp = getenv("VT_EXECTRACE");
      if (tmp != NULL && strlen(tmp) > 0)
        {
          vt_cntl_msg(2, "VT_EXECTRACE=%s", tmp);

          exectrace = parse_bool(tmp);
        }
      else
        {
          tmp = getenv("VT_LIBCTRACE");
          if (tmp != NULL && strlen(tmp) > 0)
            {
              exectrace = parse_bool(tmp);

              vt_warning("VT_LIBCTRACE is deprecated, use VT_EXECTRACE instead!");
            }
          else
            {
              exectrace = 1;
            }
        }
    }
  return exectrace;
}
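/*
 * parse_bool() is used by the vt_env_* readers in these examples but is not
 * shown. A minimal sketch of such a helper (the exact set of accepted
 * spellings in VampirTrace may differ): returns 1 for "yes"/"true"/"1",
 * otherwise 0.
 */
#include <string.h>
#include <strings.h>

static int parse_bool(char* str)
{
  if (strcasecmp(str, "yes") == 0 ||
      strcasecmp(str, "true") == 0 ||
      strcmp(str, "1") == 0)
    return 1;

  return 0;
}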
int vt_env_gputrace_kernel()
{
  static int cudakernel = -1;

  if (cudakernel == -1)
    {
      char* tmp = getenv("VT_GPUTRACE_KERNEL");
      if (tmp != NULL && strlen(tmp) > 0)
        {
          vt_cntl_msg(2, "VT_GPUTRACE_KERNEL=%s", tmp);

          cudakernel = atoi(tmp);
          /* perhaps user wrote 'yes' or 'true' */
          if(cudakernel == 0 && parse_bool(tmp) == 1) cudakernel = 1;
          
          if(cudakernel == 1)
            vt_warning("VT_GPUTRACE_KERNEL is deprecated, "
                      "use option 'kernel' with VT_GPUTRACE instead!");
        }
      else
        {
          cudakernel = 1;
        }
    }
  return cudakernel;
}
int vt_env_gputrace_memusage()
{
  static int gpumem = -1;
  
  if (gpumem == -1)
    {
      char* tmp = getenv("VT_GPUTRACE_MEMUSAGE");
      if (tmp != NULL && strlen(tmp) > 0)
        {
          vt_cntl_msg(2, "VT_GPUTRACE_MEMUSAGE=%s", tmp);

          gpumem = atoi(tmp);
          /* if user wrote 'yes' or 'true' */
          if(gpumem == 0 && parse_bool(tmp) == 1) gpumem = 1;
        }
      else
        {
          gpumem = 0;
        }
      
      if (gpumem > 0)
        vt_warning("VT_GPUTRACE_MEMUSAGE is deprecated, "
                  "use option 'memusage' with VT_GPUTRACE instead!");
    }
  return gpumem;
}
size_t vt_env_thread_bsize()
{
  static size_t buffer_size = 0;
  char* tmp;

  if (buffer_size == 0)
    {
      tmp = getenv("VT_THREAD_BUFFER_SIZE");
      if (tmp != NULL && strlen(tmp) > 0)
        {
          vt_cntl_msg(2, "VT_THREAD_BUFFER_SIZE=%s", tmp);

          buffer_size = parse_size(tmp);
          /* buffer_size is unsigned, so a failed parse shows up as 0 */
          if (buffer_size == 0)
            {
              vt_error_msg("VT_THREAD_BUFFER_SIZE not properly set");
            }
          else if (buffer_size < VT_MIN_BUFSIZE)
            {
              vt_warning("VT_THREAD_BUFFER_SIZE=%lu resized to %d bytes", 
                         (unsigned long)buffer_size, VT_MIN_BUFSIZE);
              buffer_size = VT_MIN_BUFSIZE;
            }
        }
      else
        {
          buffer_size = 0;
        }
    }

  return buffer_size;
}
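/*
 * parse_size() is likewise not part of this collection. A sketch of a size
 * parser that accepts an optional K/M/G suffix (binary multiples assumed;
 * VampirTrace's actual helper may behave differently). A failed parse yields
 * 0, which the caller above reports as an error.
 */
#include <stdlib.h>
#include <ctype.h>

static size_t parse_size(char* str)
{
  char* end = NULL;
  size_t size = (size_t)strtoul(str, &end, 10);

  if (end != NULL)
    {
      switch (toupper((unsigned char)*end))
        {
        case 'G': size *= 1024; /* fall through */
        case 'M': size *= 1024; /* fall through */
        case 'K': size *= 1024; break;
        default: break;
        }
    }

  return size;
}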
static void vt_cuptievt_start(vt_cupti_events_t *vtcuptiEvtCtx)
{
  CUptiResult cuptiErr = CUPTI_SUCCESS;
  vt_cupti_evtgrp_t *vtcuptiGrp = NULL;
  vt_cupti_evtgrp_t *lastGrp = NULL;

  /* start gathering counter values, if context was successfully initialized */
  if(NULL == vtcuptiEvtCtx){
    /* no performance counters for this thread available */
    VT_CHECK_THREAD;
    vt_gpu_prop[VT_MY_THREAD] |= VTGPU_NO_PC;
    vt_cntl_msg(2, "[CUPTI Events] Context not initialized!");
    return;
  }

  /* start all groups */
  vtcuptiGrp = vtcuptiEvtCtx->vtGrpList;
  lastGrp = vtcuptiEvtCtx->vtGrpList;
  while(vtcuptiGrp != NULL){
    cuptiErr = cuptiEventGroupEnable(vtcuptiGrp->evtGrp);
    
    /* if the event group could not be enabled, remove it */
    if(cuptiErr != CUPTI_SUCCESS){
      size_t i;
      vt_cupti_evtgrp_t *freeGrp = vtcuptiGrp;
      size_t valueSize = 32;
      char name[32];

      vtcuptiGrp = vtcuptiGrp->next;

      /* inform the user about the events of the group that cannot be enabled */
      for(i = 0; i < freeGrp->evtNum; i++){
        /* reset valueSize: the query writes back the actual string length */
        valueSize = sizeof(name);
        VTCUPTIEVENTGETATTRIBUTE(vtcuptiEvtCtx->vtDevCap->cuDev,
                                 freeGrp->cuptiEvtIDs[i],
                                 CUPTI_EVENT_ATTR_NAME,
                                 &valueSize, (char*)name);
        vt_warning("[CUPTI Events] Event '%s' (%d) cannot be enabled",
                   name, freeGrp->cuptiEvtIDs[i]);
      }

      /* group is first element in linked list */
      if(vtcuptiEvtCtx->vtGrpList == freeGrp){
        vtcuptiEvtCtx->vtGrpList = vtcuptiEvtCtx->vtGrpList->next;
      }else{/* has to be at least the second group in linked list */
        lastGrp->next = freeGrp->next;
      }

      free(freeGrp);
      freeGrp = NULL;
    }else{
      vtcuptiGrp->enabled = 1;
      lastGrp= vtcuptiGrp;
      vtcuptiGrp = vtcuptiGrp->next;
    }
  }
  
}
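/*
 * vt_cuptievt_start() above and vt_cupti_start() below both remove nodes
 * from a singly linked list while iterating over it. The pattern, isolated
 * on a hypothetical node type for illustration: keep a trailing pointer and
 * special-case the list head.
 */
#include <stdlib.h>

typedef struct node { struct node *next; int bad; } node_t;

static node_t* remove_bad_nodes(node_t *head)
{
  node_t *curr = head;
  node_t *last = head;

  while(curr != NULL){
    if(curr->bad){
      node_t *freeNode = curr;

      curr = curr->next;

      if(head == freeNode)  /* node is the first list element */
        head = curr;
      else                  /* node is at least the second element */
        last->next = freeNode->next;

      free(freeNode);
    }else{
      last = curr;
      curr = curr->next;
    }
  }

  return head;
}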
/* may be called per thread */
void vt_plugin_cntr_write_post_mortem(VTThrd * thrd) {
  uint32_t counter_index;
  vt_plugin_cntr_timevalue * time_values = NULL;
  uint32_t number_of_counters;
  uint64_t number_of_values = 0;
  uint64_t i;
  uint64_t dummy_time;
  uint32_t tid;
  struct vt_plugin_single_counter current_counter;
  struct vt_plugin_cntr_defines * plugin_cntr_defines =
      (struct vt_plugin_cntr_defines *) thrd->plugin_cntr_defines;
  if (plugin_cntr_defines == NULL)
    return;

  if (plugin_cntr_defines->size_of_counters[VT_PLUGIN_CNTR_ASYNCH_POST_MORTEM]
      == 0)
    return;
  if (VTTHRD_TRACE_STATUS(thrd) != VT_TRACE_ON)
    return;
  for (tid=0;tid<VTThrdn;tid++)
    if ( VTThrdv[tid] == thrd )
      break;
  if ( tid == VTThrdn ){
    vt_warning("Can not determine internal TID when gathering post-mortem counters");
    return;
  }
  /* for all post_mortem counters */
  number_of_counters
    = plugin_cntr_defines->size_of_counters[VT_PLUGIN_CNTR_ASYNCH_POST_MORTEM];
  dummy_time = vt_pform_wtime();
  /* set flag for writing post mortem counters; prevents writing of flush
   * enter/exit event when flushing */
  thrd->plugin_cntr_writing_post_mortem = 1;
  /* we assume that for each counter (not plugin),
   * the data is monotonically increasing */
  /* for all counters of this thread */
  for (counter_index = 0;
       counter_index < number_of_counters;
       counter_index++) {
    current_counter
        = plugin_cntr_defines->counters[VT_PLUGIN_CNTR_ASYNCH_POST_MORTEM]
                                       [counter_index];
    /* get data */
    number_of_values = current_counter.getAllValues(
        current_counter.from_plugin_id, &time_values);
    if (time_values == NULL) {
      /* reset the flag set above before bailing out */
      thrd->plugin_cntr_writing_post_mortem = 0;
      return;
    }
    for (i = 0; i < number_of_values; i++) {
      WRITE_ASYNCH_DATA(thrd, tid, current_counter, time_values[i], &dummy_time);
    }
    free(time_values);
  }
  /* unset flag for writing post mortem counters */
  thrd->plugin_cntr_writing_post_mortem = 0;
}
static void metric_warning(int errcode, char *note)
{
  char errstring[PAPI_MAX_STR_LEN];

  PAPI_perror(errcode, errstring, PAPI_MAX_STR_LEN);
  if (errcode == PAPI_ESYS) {
    strncat(errstring, ": ", PAPI_MAX_STR_LEN-strlen(errstring));
    strncat(errstring, strerror(errno), PAPI_MAX_STR_LEN-strlen(errstring));
  }
  vt_warning("%s: %s (ignored)\n", note?note:"PAPI", errstring);
}
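/*
 * A usage sketch for metric_warning(): how a PAPI call site might report a
 * non-fatal error. PAPI_event_name_to_code() is a real PAPI call; the
 * surrounding helper is hypothetical.
 */
static void add_papi_event(char* name)
{
  int code;
  int retval = PAPI_event_name_to_code(name, &code);

  if (retval != PAPI_OK)
    metric_warning(retval, "PAPI_event_name_to_code");
}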
static void vt_cupti_start(vt_cupti_ctx_t *vtcuptiCtx)
{
  CUptiResult cuptiErr = CUPTI_SUCCESS;
  vt_cupti_grp_t *vtcuptiGrp = NULL;
  vt_cupti_grp_t *lastGrp = NULL;

  if(vtcuptiCtx == NULL) return;

  /* start all groups */
  vtcuptiGrp = vtcuptiCtx->vtGrpList;
  lastGrp = vtcuptiCtx->vtGrpList;
  while(vtcuptiGrp != NULL){
    cuptiErr = cuptiEventGroupEnable(vtcuptiGrp->evtGrp);
    
    /* if the event group could not be enabled, remove it */
    if(cuptiErr != CUPTI_SUCCESS){
      size_t i;
      vt_cupti_grp_t *freeGrp = vtcuptiGrp;
      size_t valueSize = 32;
      char name[32];

      vtcuptiGrp = vtcuptiGrp->next;

      /* inform the user about the events of the group that cannot be enabled */
      for(i = 0; i < freeGrp->evtNum; i++){
        /* reset valueSize: the query writes back the actual string length */
        valueSize = sizeof(name);
        cuptiEventGetAttribute(vtcuptiCtx->vtDevCap->cuDev,
                               freeGrp->cuptiEvtIDs[i],
                               CUPTI_EVENT_ATTR_NAME,
                               &valueSize, (char*)name);
        vt_warning("[CUPTI] Event '%s' (%d) cannot be enabled",
                   name, freeGrp->cuptiEvtIDs[i]);
      }

      /* group is first element in linked list */
      if(vtcuptiCtx->vtGrpList == freeGrp){
        vtcuptiCtx->vtGrpList = vtcuptiCtx->vtGrpList->next;
      }else{/* has to be at least the second group in linked list */
        lastGrp->next = freeGrp->next;
      }

      free(freeGrp);
      freeGrp = NULL;
    }else{
      vtcuptiGrp->enabled = 1;
      lastGrp= vtcuptiGrp;
      vtcuptiGrp = vtcuptiGrp->next;
    }
  }
  
}
/*
 * Handles errors returned from CUPTI function calls.
 * 
 * @param err the CUPTI error code
 * @param msg a message to get more detailed information about the error
 * @param file the corresponding source file
 * @param line the line where the error occurred
 */
void vt_cupti_handleError(CUptiResult err, const char* msg,
                          const char *file, const int line)
{
  const char *errstr;
  
  if(msg != NULL) vt_cntl_msg(1, "%s", msg);
  
  cuptiGetResultString(err, &errstr);
  
  if(vt_gpu_error){
    vt_error_msg("[CUPTI] %s:%d:'%s'", file, line, errstr);
  }else{
    vt_warning("[CUPTI] %s:%d:'%s'", file, line, errstr);
  }
}
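/*
 * The VT_CUPTI_CALL macro used throughout these examples is a thin wrapper
 * around vt_cupti_handleError(). It is not shown in this collection; it
 * might be defined along these lines (the do/while(0) wrapper is our
 * addition for safe use inside if/else statements):
 */
#define VT_CUPTI_CALL(_err, _msg)                             \
  do{                                                         \
    if((_err) != CUPTI_SUCCESS)                               \
      vt_cupti_handleError(_err, _msg, __FILE__, __LINE__);   \
  }while(0)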
void vt_cuptiact_setupActivityContext(vt_cupti_ctx_t *vtCtx)
{  
  /* try to get the global VampirTrace CUPTI context */
  if(vtCtx == NULL){
    vt_warning("[CUPTI Activity] No context given!");
    return;
  }
  
  VT_SUSPEND_MALLOC_TRACING(vtCtx->ptid);
  
  /* create the VampirTrace CUPTI activity context */
  if(vtCtx->activity == NULL)
    vtCtx->activity = vt_cuptiact_createCtxActivity(vtCtx->cuCtx);
  
  /* queue new buffer to context to record activities */
  vtCtx->activity->buffer = vt_cuptiact_queueNewBuffer(vtCtx->cuCtx);
  
  VT_RESUME_MALLOC_TRACING(vtCtx->ptid);
}
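/*
 * vt_cuptiact_queueNewBuffer() is called above but not included. A sketch,
 * assuming the pre-CUPTI-4 buffer API (cuptiActivityEnqueueBuffer) that the
 * flush routine further below also uses:
 */
static uint8_t* vt_cuptiact_queueNewBuffer(CUcontext cuCtx)
{
  uint8_t *buffer = (uint8_t*)malloc(vt_cuptiact_bufSize);

  if(buffer == NULL)
    vt_error_msg("[CUPTI Activity] Could not allocate activity buffer!");

  VT_CUPTI_CALL(cuptiActivityEnqueueBuffer(cuCtx, 0, buffer,
                                           vt_cuptiact_bufSize),
                "cuptiActivityEnqueueBuffer");

  return buffer;
}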
int execvp(const char* path, char* const argv[])
{
  int rc;

  VT_MEMHOOKS_OFF();

  if ( DO_TRACE(execvp) )
  {
    /* mark enter function */
    uint64_t time = vt_pform_wtime();
    vt_enter(&time, libc_funcs[FUNCIDX(execvp)].rid);
  }

  /* close VT for current process */
  vt_close();

  /* call (real) function; the exec family returns only if the call failed */
  CALL_FUNC(execvp, rc, (path, argv));

  vt_warning("execvp failed: %s", strerror(errno));
  return rc;
}
int execle(const char* path, const char* arg, ...)
{
  int rc;
  char* argv[100];
  char** envp;
  char* tmp;
  int i;
  va_list ap;

  VT_MEMHOOKS_OFF();

  va_start(ap, arg);

  i = 0;
  argv[i++] = (char*)arg;
  /* argv holds at most 100 entries, including the terminating NULL */
  while((tmp = va_arg(ap, char*)) && i < 99)
    argv[i++] = tmp;
  argv[i] = NULL;
  envp = va_arg(ap, char**);

  va_end(ap);

  if ( DO_TRACE(execle) )
  {
    /* mark enter function */
    uint64_t time = vt_pform_wtime();
    vt_enter(&time, libc_funcs[FUNCIDX(execle)].rid);
  }

  /* close VT for current process */
  vt_close();

  /* call (real) function; the exec family returns only if the call failed */
  CALL_FUNC(execve, rc, (path, argv, envp));

  vt_warning("execle failed: %s", strerror(errno));
  return rc;
}
char* vt_env_ldir()
{
  static char* ldir = NULL;
  char* tmp;

  if (! ldir)
    {
      tmp = getenv("VT_PFORM_LDIR");

#if defined(VT_IOFSL)
      if (vt_env_iofsl_servers())
        {
          ldir = vt_env_gdir();

          if (tmp != NULL && strlen(tmp) > 0)
            {
              vt_warning("Setting of VT_PFORM_LDIR isn't allowed in IOFSL "
                         "mode; reset it to VT_PFORM_GDIR (=%s)", ldir);
            }
        }
      else
#endif /* VT_IOFSL */
        {
          if (tmp != NULL && strlen(tmp) > 0)
            {
              vt_cntl_msg(2, "VT_PFORM_LDIR=%s", tmp);

              ldir = replace_vars(tmp);
            }
          else
            {
              ldir = replace_vars(vt_pform_ldir());
            }
        }
    }
  return ldir;
}
/* no need to lock, because it is only called by vt_cupti_callback_init() */
void vt_cupti_activity_init()
{
  /*if(!vt_cuptiact_initialized){
    vt_cupti_init();
    VT_CUPTI_LOCK();*/
    if(!vt_cuptiact_initialized){
      vt_cntl_msg(2, "[CUPTI Activity] Initializing ... ");
      
      {        
        vt_cuptiact_bufSize = vt_env_cudatrace_bsize();
        
        /* no buffer size < 1024 bytes allowed (see CUPTI documentation) */
        if(vt_cuptiact_bufSize < 1024){
          if(vt_cuptiact_bufSize > 0){
            vt_warning("[CUPTI Activity] Buffer size has to be at least 1024 "
                       "bytes! Using the default of %d bytes instead.", 
                       VT_CUPTI_ACT_DEFAULT_BSIZE);
          }
          vt_cuptiact_bufSize = VT_CUPTI_ACT_DEFAULT_BSIZE;
        }
        
        /* queue a global buffer to initialize CUPTI before CUDA init 
        vt_cuptiact_buffer = (uint8_t *)malloc(vt_cuptiact_bufSize);
        VT_CUPTI_CALL(cuptiActivityEnqueueBuffer(NULL, 0, 
                                      vt_cuptiact_buffer, vt_cuptiact_bufSize), 
                      "cuptiActivityEnqueueBuffer");*/
      }
      
#if (defined(VT_MT) || defined(VT_HYB))
      VTTHRD_LOCK_IDS();
#endif
      if(vt_gpu_trace_kernels > 1){
        /* define kernel counters */
        vt_cuptiact_cid_knStaticSharedMem = vt_def_counter(VT_MASTER_THREAD, 
                      "staticSharedMemory", "Bytes",
                      VT_CNTR_ABS | VT_CNTR_NEXT | VT_CNTR_UNSIGNED, 
                      vt_cupti_cgid_cuda_kernel, 0);
        vt_cuptiact_cid_knDynamicSharedMem = vt_def_counter(VT_MASTER_THREAD, 
                      "dynamicSharedMemory", "Bytes",
                      VT_CNTR_ABS | VT_CNTR_NEXT | VT_CNTR_UNSIGNED, 
                      vt_cupti_cgid_cuda_kernel, 0);
        vt_cuptiact_cid_knLocalMemTotal = vt_def_counter(VT_MASTER_THREAD, 
                      "localMemoryPerKernel", "Bytes",
                      VT_CNTR_ABS | VT_CNTR_NEXT | VT_CNTR_UNSIGNED, 
                      vt_cupti_cgid_cuda_kernel, 0);
        vt_cuptiact_cid_knRegistersPerThread = vt_def_counter(VT_MASTER_THREAD, 
                      "registersPerThread", "#",
                      VT_CNTR_ABS | VT_CNTR_NEXT | VT_CNTR_UNSIGNED, 
                      vt_cupti_cgid_cuda_kernel, 0);
      }
     
      /* define region for GPU activity flush */
      vt_cuptiact_rid_flush = vt_def_region(VT_MASTER_THREAD, "flushActivities", 
                        VT_NO_ID, VT_NO_LNO, VT_NO_LNO, "VT_CUDA", VT_FUNCTION);
#if (defined(VT_MT) || defined(VT_HYB))
      VTTHRD_UNLOCK_IDS();
#endif
      
      /*** enable the activities ***/
      /* enable kernel tracing */
      if(vt_gpu_trace_kernels > 0){
#if (defined(CUPTI_API_VERSION) && (CUPTI_API_VERSION >= 3))
        if((vt_gpu_config & VT_GPU_TRACE_CONCURRENT_KERNEL) 
           == VT_GPU_TRACE_CONCURRENT_KERNEL){
          /*VT_CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL), 
                        "cuptiActivityEnable");*/
          VT_CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL), 
                        "cuptiActivityEnable");
        }else
#endif
          VT_CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL), 
                        "cuptiActivityEnable");
      }
      
      /* enable memory copy tracing */
      if(vt_gpu_trace_mcpy){
        VT_CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY), 
                      "cuptiActivityEnable");
      }
      
      /* register the finalize function of VampirTrace CUPTI to be called before
       * the program exits 
      atexit(vt_cupti_activity_finalize);*/

      vt_cuptiact_initialized = 1;
      /*VT_CUPTI_UNLOCK();
    }*/
  }
}
/*
 * Parse the environment variable for CUPTI metrics (including CUDA device
 * capabilities) and fill the capability metric list.
 *
 * @param capList points to the first element of the capability metric list
 */
static void vt_cupti_fillMetricList(vt_cupti_device_t *capList)
{
  char *metricString = vt_env_cupti_events();
  char *metric_sep = vt_env_metrics_sep();
  char *metric, *metric_cap;

  metric = strtok(metricString, metric_sep);

  while (metric != NULL){
    CUptiResult cuptiErr = CUPTI_SUCCESS;
    vt_cupti_device_t *cuptiDev = NULL;
    vt_cupti_evtctr_t *vtcuptiEvt = NULL;
    int metr_major = 0;
    int metr_minor = 0;

    /* try to get CUDA device capability parsed from metric */
    metr_major = atoi(metric);
    metric_cap = strchr(metric+1, '.');
    if(metric_cap){
      metr_minor = atoi(metric_cap+1);
      metric_cap = strchr(metric_cap+1, '_');
    }
    
    /* check whether device capability is given or not */
    if(metric_cap){
      metric = metric_cap + 1;

      vt_cntl_msg(2, "Metric '%s', %d.%d", metric, metr_major, metr_minor);

      cuptiDev = vt_cupti_checkMetricList(capList, metr_major, metr_minor);
      if(cuptiDev == NULL){
        metric = strtok(NULL, metric_sep);
        continue;
      }
      
      vtcuptiEvt = (vt_cupti_evtctr_t*)malloc(sizeof(vt_cupti_evtctr_t));
      cuptiErr = cuptiEventGetIdFromName(cuptiDev->cuDev, metric,
                                         &vtcuptiEvt->cuptiEvtID);
      if(cuptiErr != CUPTI_SUCCESS){
        if(!strncmp(metric, "help", 4)) vt_cupti_showAllCounters(capList);
        vt_warning("[CUPTI Events] Skipping invalid event '%s' for device %d",
                   metric, cuptiDev->cuDev);
        metric = strtok(NULL, metric_sep);
        continue;
      }

      /* create VampirTrace counter ID */
#if (defined(VT_MT) || defined(VT_HYB))
      VTTHRD_LOCK_IDS();
#endif
      vtcuptiEvt->vtCID = vt_def_counter(VT_MASTER_THREAD, metric, "#",
            VT_CNTR_ABS | VT_CNTR_LAST | VT_CNTR_UNSIGNED, vt_cuptievt_cgid, 0);
#if (defined(VT_MT) || defined(VT_HYB))
      VTTHRD_UNLOCK_IDS();
#endif

      cuptiDev->evtNum++;
      vtcuptiEvt->next = cuptiDev->vtcuptiEvtList;
      cuptiDev->vtcuptiEvtList = vtcuptiEvt;
    }else{ 
      /* device capability is not given. Try to add metric to all devices */
      uint32_t cid_metric = VT_NO_ID;

      cuptiDev = capList;
      while(cuptiDev != NULL){
        vtcuptiEvt = (vt_cupti_evtctr_t*)malloc(sizeof(vt_cupti_evtctr_t));
        cuptiErr = cuptiEventGetIdFromName(cuptiDev->cuDev, metric,
                                           &vtcuptiEvt->cuptiEvtID);

        if(cuptiErr != CUPTI_SUCCESS){
          if(!strncmp(metric, "help", 4)) vt_cupti_showAllCounters(capList);
          vt_warning("[CUPTI Events] Skipping invalid event '%s' for device %d",
                     metric, cuptiDev->cuDev);
        }else{
          /* create VampirTrace counter ID, if not yet done for other device */
          if(cid_metric == VT_NO_ID){
#if (defined(VT_MT) || defined(VT_HYB))
      VTTHRD_LOCK_IDS();
#endif
      cid_metric = vt_def_counter(VT_MASTER_THREAD, metric, "#", 
            VT_CNTR_ABS | VT_CNTR_LAST | VT_CNTR_UNSIGNED, vt_cuptievt_cgid, 0);
#if (defined(VT_MT) || defined(VT_HYB))
      VTTHRD_UNLOCK_IDS();
#endif
          }
          
          cuptiDev->evtNum++;
          vtcuptiEvt->vtCID = cid_metric;
          vtcuptiEvt->next = cuptiDev->vtcuptiEvtList;
          cuptiDev->vtcuptiEvtList = vtcuptiEvt;
        }

        cuptiDev = cuptiDev->next;
      }
    }

    metric = strtok(NULL, metric_sep);
  }
}
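/*
 * A standalone sketch (hypothetical main) of the token format that
 * vt_cupti_fillMetricList() parses: an optional "MAJOR.MINOR_" device
 * capability prefix followed by the event name, e.g. "2.0_branch".
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
  char token[] = "2.0_branch";
  int metr_major = atoi(token);            /* "2"    -> major = 2 */
  char *cap = strchr(token+1, '.');
  int metr_minor = cap ? atoi(cap+1) : 0;  /* "0_.." -> minor = 0 */
  char *name = cap ? strchr(cap+1, '_') : NULL;

  if(name != NULL)
    printf("event '%s' for devices with capability %d.%d\n",
           name+1, metr_major, metr_minor);
  else
    printf("event '%s' for all devices\n", token);

  return 0;
}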
/*
 * Decreases the "Allocated CUDA memory" counter.
 *
 * @param ctxID CUDA context identifier (@see CUPTI callback info)
 * @param cuCtx CUDA context
 * @param devPtr pointer to the allocated memory
 */
void vt_cuptiact_writeFree(uint32_t ctxID, CUcontext cuCtx, void *devPtr)
{
  uint64_t vtTime;
  vt_cuptiact_ctx_t* vtCtx = NULL;
  vt_cuptiact_gpumem_t *curMalloc = NULL;
  vt_cuptiact_gpumem_t *lastMalloc = NULL;

  if(devPtr == NULL) return;
  
  /* flush activity buffer */
  vt_cuptiact_flushCtxActivities(cuCtx);
  
  vtCtx = vt_cuptiact_getCtx(cuCtx);
  if(vtCtx == NULL){
    vtCtx = vt_cuptiact_createContext(ctxID, cuCtx, (uint32_t)-1);
  }
  
  VT_CUPTI_ACT_LOCK();

  curMalloc = vtCtx->gpuMemList;
  lastMalloc = vtCtx->gpuMemList;

  /* lookup the CUDA malloc entry by its memory pointer */
  while(curMalloc != NULL){
    if(devPtr == curMalloc->memPtr){

      /* decrease allocated counter value and write it */
      vtTime = vt_pform_wtime();
      vtCtx->gpuMemAllocated -= curMalloc->size;
      vt_count(vtCtx->strmList->vtThrdID, &vtTime, vt_gpu_cid_memusage,
               (uint64_t)(vtCtx->gpuMemAllocated));


      /* set pointer over current element to next one */
      lastMalloc->next = curMalloc->next;

      /* if current element is the first list entry, set the list entry */
      if(curMalloc == vtCtx->gpuMemList){
        vtCtx->gpuMemList = curMalloc->next;
      }

      /* free VT memory of CUDA malloc */
      curMalloc->next = NULL;
      free(curMalloc);
      curMalloc = NULL;

      /* set mallocList to NULL, if last element freed */
      if(vtCtx->gpuMemAllocated == 0) {
        vtCtx->gpuMemList = NULL;
      }
      
      VT_CUPTI_ACT_UNLOCK();
      return;
    }

    lastMalloc = curMalloc;
    curMalloc = curMalloc->next;
  }

  VT_CUPTI_ACT_UNLOCK();

  vt_warning("[CUPTI Activity] free CUDA memory, which has not been allocated!");
}
/* may be called per thread */
void vt_plugin_cntr_write_post_mortem(VTThrd * thrd) {
  uint32_t counter_index;
  uint64_t * counter_current_indices;
  vt_plugin_cntr_timevalue ** time_values_by_counter = NULL;
  uint32_t number_of_counters;
  uint64_t * number_of_values_by_counter;
  uint64_t dummy_time;
  uint32_t tid;
  struct vt_plugin_single_counter current_counter;
  struct vt_plugin_cntr_defines * plugin_cntr_defines =
      (struct vt_plugin_cntr_defines *) thrd->plugin_cntr_defines;
  if (plugin_cntr_defines == NULL)
    return;

  if (plugin_cntr_defines->size_of_counters[VT_PLUGIN_CNTR_ASYNCH_POST_MORTEM]
      == 0)
    return;
  if (VTTHRD_TRACE_STATUS(thrd) != VT_TRACE_ON)
    return;
  for (tid=0;tid<VTThrdn;tid++)
    if ( VTThrdv[tid] == thrd )
      break;
  if ( tid == VTThrdn ){
    vt_warning("Can not determine internal TID when gathering post-mortem counters");
    return;
  }
  /* for all post_mortem counters */
  number_of_counters
    = plugin_cntr_defines->size_of_counters[VT_PLUGIN_CNTR_ASYNCH_POST_MORTEM];
  dummy_time = vt_pform_wtime();
  /* set flag for writing post mortem counters; prevents writing of flush
   * enter/exit event when flushing */
  thrd->plugin_cntr_writing_post_mortem = 1;
  /* we assume that for each counter (not plugin),
   * the data is monotonically increasing */
  /* for all counters of this thread */

  time_values_by_counter = calloc(number_of_counters, sizeof(*time_values_by_counter));
  vt_libassert(time_values_by_counter);
  number_of_values_by_counter = calloc(number_of_counters, sizeof(*number_of_values_by_counter));
  vt_libassert(number_of_values_by_counter);
  for (counter_index = 0;
       counter_index < number_of_counters;
       counter_index++) {
    current_counter
        = plugin_cntr_defines->counters[VT_PLUGIN_CNTR_ASYNCH_POST_MORTEM][counter_index];
    /* get data */
    number_of_values_by_counter[counter_index] = current_counter.getAllValues(
        current_counter.from_plugin_id, &(time_values_by_counter[counter_index]));
    if (time_values_by_counter[counter_index] == NULL) {
      /* free the per-counter arrays fetched so far and reset the flag */
      uint32_t j;
      for (j = 0; j < counter_index; j++)
        free(time_values_by_counter[j]);
      free(time_values_by_counter);
      free(number_of_values_by_counter);
      thrd->plugin_cntr_writing_post_mortem = 0;
      return;
    }
  }
  /* initialized with 0! */
  counter_current_indices = calloc(number_of_counters, sizeof(*counter_current_indices));
  vt_libassert(counter_current_indices);
  while (1) {
    vt_plugin_cntr_timevalue *min_tvp = NULL;
    uint32_t min_counter = 0;
    for (counter_index = 0;
         counter_index < number_of_counters;
         counter_index++)
    {
      /*
       * TODO optimize for "nice" plugins by looking if the "next" counter has
       * the _same_ timestamp (so there cannot be anyone else with a smaller
       * one)
       */
      vt_plugin_cntr_timevalue *tvp;
      if (counter_current_indices[counter_index]
          >= number_of_values_by_counter[counter_index]) {
        continue;
      }
      tvp = &(time_values_by_counter[counter_index][counter_current_indices[counter_index]]);
      if (!min_tvp || tvp->timestamp < min_tvp->timestamp) {
        min_tvp = tvp;
        min_counter = counter_index;
      }
    }
    if (min_tvp == NULL) {
      /* we are done */
      break;
    }
    current_counter
        = plugin_cntr_defines->counters[VT_PLUGIN_CNTR_ASYNCH_POST_MORTEM][min_counter];
    WRITE_ASYNCH_DATA(thrd, tid, current_counter, *min_tvp, &dummy_time);
    counter_current_indices[min_counter]++;
  }
  /* free the per-counter value arrays returned by the plugins */
  for (counter_index = 0; counter_index < number_of_counters; counter_index++)
    free(time_values_by_counter[counter_index]);
  free(time_values_by_counter);
  free(counter_current_indices);
  free(number_of_values_by_counter);
  /* unset flag for writing post mortem counters */
  thrd->plugin_cntr_writing_post_mortem = 0;
}
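/*
 * The while(1) loop above performs an N-way merge: it repeatedly emits the
 * sample with the smallest timestamp across all counters. The idea,
 * isolated on plain arrays (the sample type and output are hypothetical):
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct { uint64_t timestamp; uint64_t value; } sample_t;

/* print all samples of all streams in global timestamp order */
static void merge_by_timestamp(sample_t **streams, uint64_t *counts,
                               uint32_t num_streams)
{
  uint64_t *pos = (uint64_t*)calloc(num_streams, sizeof(*pos));
  uint32_t i;

  if (pos == NULL)
    return;

  while (1) {
    sample_t *min_sample = NULL;
    uint32_t min_stream = 0;

    /* find the stream whose next sample has the smallest timestamp */
    for (i = 0; i < num_streams; i++) {
      if (pos[i] >= counts[i])
        continue;
      if (min_sample == NULL ||
          streams[i][pos[i]].timestamp < min_sample->timestamp) {
        min_sample = &streams[i][pos[i]];
        min_stream = i;
      }
    }

    if (min_sample == NULL)
      break; /* all streams exhausted */

    printf("%llu: %llu\n", (unsigned long long)min_sample->timestamp,
           (unsigned long long)min_sample->value);
    pos[min_stream]++;
  }

  free(pos);
}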
/*
 * Decreases the "Allocated CUDA memory" counter.
 *
 * @param ctxID CUDA context identifier (@see CUPTI callback info)
 * @param cuCtx CUDA context
 * @param devPtr pointer to the allocated memory
 */
void vt_cuptiact_writeFree(uint32_t ctxID, CUcontext cuCtx, void *devPtr)
{
  uint64_t vtTime;
  vt_cupti_ctx_t* vtCtx = NULL;
  vt_cupti_activity_t *vtcuptiActivity = NULL;
  vt_cupti_gpumem_t *curMalloc = NULL;
  vt_cupti_gpumem_t *lastMalloc = NULL;

  if(devPtr == NULL) return;
  
  VT_SUSPEND_MALLOC_TRACING(VT_CURRENT_THREAD);
  
  /* check for VampirTrace CUPTI context */
  vtCtx = vt_cupti_getCtx(cuCtx);
  if(vtCtx == NULL){
    
    vtCtx = vt_cupti_createCtx(cuCtx, VT_CUPTI_NO_CUDA_DEVICE, ctxID, VT_CUPTI_NO_DEVICE_ID);
    
    vt_cupti_prependCtx(vtCtx);
  }
  
  /* check for VampirTrace CUPTI activity context */
  if(vtCtx->activity == NULL){
    vtCtx->activity = vt_cuptiact_createCtxActivity(cuCtx);
  }
  vtcuptiActivity = vtCtx->activity;
  
  VT_CUPTI_LOCK();
  
  /* flush activity buffer */
  vt_cuptiact_flushCtxActivities(vtCtx);

  curMalloc = vtcuptiActivity->gpuMemList;
  lastMalloc = curMalloc;

  /* lookup the CUDA malloc entry by its memory pointer */
  while(curMalloc != NULL){
    if(devPtr == curMalloc->memPtr){

      /* decrease allocated counter value and write it */
      vtTime = vt_pform_wtime();
      vtcuptiActivity->gpuMemAllocated -= curMalloc->size;
      vt_count(vtcuptiActivity->strmList->vtThrdID, &vtTime, vt_gpu_cid_memusage,
               (uint64_t)(vtcuptiActivity->gpuMemAllocated));


      /* set pointer over current element to next one */
      lastMalloc->next = curMalloc->next;

      /* if current element is the first list entry, set the list entry */
      if(curMalloc == vtcuptiActivity->gpuMemList){
        vtcuptiActivity->gpuMemList = curMalloc->next;
      }

      /* free VT memory of CUDA malloc */
      curMalloc->next = NULL;
      free(curMalloc);
      curMalloc = NULL;

      /* set mallocList to NULL, if last element freed */
      if(vtcuptiActivity->gpuMemAllocated == 0) {
        vtcuptiActivity->gpuMemList = NULL;
      }
  
      VT_CUPTI_UNLOCK();
      VT_RESUME_MALLOC_TRACING(VT_CURRENT_THREAD);
      return;
    }

    lastMalloc = curMalloc;
    curMalloc = curMalloc->next;
  }

  VT_CUPTI_UNLOCK();
  
  VT_RESUME_MALLOC_TRACING(VT_CURRENT_THREAD);

  vt_warning("[CUPTI Activity] free CUDA memory, which has not been allocated!");
}
/*
 * Check for a VampirTrace activity stream by stream ID. If it does not exist,
 * create it.
 * 
 * @param vtCtx VampirTrace CUPTI Activity context
 * @param strmID the CUDA stream ID provided by CUPTI callback API
 * 
 * @return the VampirTrace CUDA stream
 */
static vt_cuptiact_strm_t* vt_cuptiact_checkStream(vt_cupti_ctx_t* vtCtx, 
                                                   uint32_t strmID)
{
  vt_cupti_activity_t *vtcuptiActivity = NULL;
  vt_cuptiact_strm_t *currStrm = NULL;
  vt_cuptiact_strm_t *lastStrm = NULL;
  vt_cuptiact_strm_t *reusableStrm = NULL;
  
  if(vtCtx == NULL){
    vt_warning("[CUPTI Activity] No context given in vt_cuptiact_checkStream()!");
    return NULL;
  }
  
  /* dereference the activity context only after the NULL check */
  vtcuptiActivity = vtCtx->activity;
  
  /* lookup stream */
  /*VT_CUPTI_LOCK();*/
  currStrm = vtcuptiActivity->strmList;
  lastStrm = vtcuptiActivity->strmList;
  while(currStrm != NULL){
    /* check for existing stream */
    if(currStrm->strmID == strmID){
      /*VT_CUPTI_UNLOCK();*/
      return currStrm;
    }
    
    /* check for reusable stream */
    if(vt_gpu_stream_reuse && reusableStrm == NULL && currStrm->destroyed == 1){
      reusableStrm = currStrm;
    }
    
    lastStrm = currStrm;
    currStrm = currStrm->next;
  }
  
  /* reuse a destroyed stream, if there is any available */
  if(vt_gpu_stream_reuse && reusableStrm){
    vt_cntl_msg(2, "[CUPTI Activity] Reusing CUDA stream %d with stream %d",
                   reusableStrm->strmID, strmID);
    reusableStrm->destroyed = 0;
    reusableStrm->strmID = strmID;

    return reusableStrm;
  }
  
  /* 
   * If stream list is empty, the stream to be created is not the default
   * stream and GPU idle and memory copy tracing is enabled, then create
   * a default stream.
   */
  if(vtcuptiActivity->strmList == NULL && strmID != vtcuptiActivity->defaultStrmID && 
     vt_gpu_trace_idle == 1 && vt_gpu_trace_mcpy){
    vtcuptiActivity->strmList = 
            vt_cuptiact_createStream(vtCtx, vtcuptiActivity->defaultStrmID);
    lastStrm = vtcuptiActivity->strmList;
  }
  
  /* create the stream, which has not been created yet */
  currStrm = vt_cuptiact_createStream(vtCtx, strmID);
  
  /* append the newly created stream */
  if(NULL != lastStrm) lastStrm->next = currStrm;
  else vtcuptiActivity->strmList = currStrm;
  
  /*VT_CUPTI_UNLOCK();*/
  return currStrm;
}
void vt_cuptiact_flushCtxActivities(vt_cupti_ctx_t *vtCtx)
{ 
  CUptiResult status;
  uint8_t *buffer = NULL;
  size_t bufSize;
  CUpti_Activity *record = NULL;
  uint64_t hostStop, gpuStop;
  uint32_t ptid = VT_NO_ID;
  vt_cupti_activity_t *vtcuptiActivity = NULL;
  
  /* check for VampirTrace CUPTI context */
  if(vtCtx == NULL || vtCtx->activity == NULL){
    vt_warning("[CUPTI Activity] Context not found!");
    return;
  }
  vtcuptiActivity = vtCtx->activity;
  
  /* check if the buffer contains records: only proceed if the query succeeds
   * or the buffer is full (CUPTI_ERROR_MAX_LIMIT_REACHED) */
  status = cuptiActivityQueryBuffer(vtCtx->cuCtx, 0, &bufSize);
  if(status != CUPTI_SUCCESS && status != CUPTI_ERROR_MAX_LIMIT_REACHED){
    return;
  }

  /* expose VampirTrace CUPTI activity flush as measurement overhead */
  VT_CHECK_THREAD;
  ptid = VT_MY_THREAD;
  hostStop = vt_pform_wtime();
  vt_enter(ptid, &hostStop, vt_cuptiact_rid_flush);
  
  vt_cntl_msg(2,"[CUPTI Activity] Handle context %d activities", vtCtx->cuCtx);
  
  /* lock the whole buffer flush 
  VT_CUPTI_LOCK();*/
  
  /* dump the contents of the global queue */
  VT_CUPTI_CALL(cuptiActivityDequeueBuffer(vtCtx->cuCtx, 0, &buffer, 
                &bufSize), "cuptiActivityDequeueBuffer");

  /* 
   * Get time synchronization factor between host and GPU time for measured 
   * period 
   */
  {
    VT_CUPTI_CALL(cuptiGetTimestamp(&gpuStop), "cuptiGetTimestamp");
    hostStop = vt_pform_wtime();
    vtcuptiActivity->sync.hostStop = hostStop;
    
    vtcuptiActivity->sync.factor = (double)(hostStop - vtcuptiActivity->sync.hostStart)
                       /(double)(gpuStop - vtcuptiActivity->sync.gpuStart);
  }

  /*vt_cntl_msg(1, "hostStop: %llu , gpuStop: %llu", hostStopTS, gpuStopTS);
  vt_cntl_msg(1, "factor: %lf", syncFactor);*/
  
  do{
    status = cuptiActivityGetNextRecord(buffer, bufSize, &record);
    if(status == CUPTI_SUCCESS) {
      vt_cuptiact_writeRecord(record, vtCtx);
    }else if(status == CUPTI_ERROR_MAX_LIMIT_REACHED){
      break;
    }else{
      VT_CUPTI_CALL(status, "cuptiActivityGetNextRecord");
    }
  }while(1);

  /* report any records dropped from the global queue */
  {
    size_t dropped;
    
    VT_CUPTI_CALL(cuptiActivityGetNumDroppedRecords(vtCtx->cuCtx, 0, &dropped), 
                  "cuptiActivityGetNumDroppedRecords");
    if(dropped != 0)
      vt_warning("[CUPTI Activity] Dropped %u records. Current buffer size: %llu bytes\n"
                 "To avoid dropping of records increase the buffer size!\n"
                 "Proposed minimum VT_CUDATRACE_BUFFER_SIZE=%llu", 
                 (unsigned int)dropped, 
                 (unsigned long long)vt_cuptiact_bufSize, 
                 (unsigned long long)(vt_cuptiact_bufSize + dropped/2 * 
                   (sizeof(CUpti_ActivityKernel) + sizeof(CUpti_ActivityMemcpy))));
  }
  
  /* enter GPU idle region after last kernel, if exited before */
  if(vtcuptiActivity->gpuIdleOn == 0){
    vt_enter(vtcuptiActivity->strmList->vtThrdID, 
             &(vtcuptiActivity->vtLastGPUTime), vt_gpu_rid_idle);
    vtcuptiActivity->gpuIdleOn = 1;
    /*vt_warning("IDLfente: %llu (%d)", vtCtx->vtLastGPUTime, vtCtx->strmList->vtThrdID);*/
  }
  
  /* enqueue buffer again */
  VT_CUPTI_CALL(cuptiActivityEnqueueBuffer(vtCtx->cuCtx, 0, buffer, 
                vt_cuptiact_bufSize), "cuptiActivityEnqueueBuffer");
  
    
  /* set new synchronization point */
  vtcuptiActivity->sync.hostStart = hostStop;
  vtcuptiActivity->sync.gpuStart = gpuStop;
  
  /*VT_CUPTI_UNLOCK();*/
  
  /* use local variable hostStop to write exit event for activity flush */
  hostStop = vt_pform_wtime();
  vt_exit(ptid, &hostStop);
}
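/*
 * The flush routine above maintains a per-context synchronization point
 * (sync.hostStart, sync.gpuStart, sync.factor). Mapping a GPU timestamp
 * into the host time base is then a linear interpolation; a hypothetical
 * helper mirroring the arithmetic used by the record writers below:
 */
static uint64_t vt_cuptiact_gpuToHostTime(vt_cupti_activity_t *vtcuptiActivity,
                                          uint64_t gpuTime)
{
  return vtcuptiActivity->sync.hostStart
         + (uint64_t)((double)(gpuTime - vtcuptiActivity->sync.gpuStart)
                      * vtcuptiActivity->sync.factor);
}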
/*
 * Create a VampirTrace CUPTI stream.
 * 
 * @param vtCtx VampirTrace CUPTI context
 * @param cuStrm CUDA stream
 * @param strmID ID of the CUDA stream
 * 
 * @return pointer to created VampirTrace CUPTI stream
 */
vt_cupti_strm_t* vt_cupti_createStream(vt_cupti_ctx_t *vtCtx, 
                                       CUstream cuStrm, uint32_t strmID)
{
  vt_cupti_strm_t *vtStrm = NULL;
  
  if(vtCtx == NULL){
    vt_warning("[CUPTI] Cannot create stream without VampirTrace CUPTI context");
    return NULL;
  }
  
  vtStrm = (vt_cupti_strm_t *)malloc(sizeof(vt_cupti_strm_t));
  if(vtStrm == NULL)
    vt_error_msg("[CUPTI] Could not allocate memory for stream!");
  vtStrm->cuStrm = cuStrm;
  vtStrm->vtLastTime = vt_gpu_init_time;
  vtStrm->destroyed = 0;
  vtStrm->next = NULL;
  
#if defined(VT_CUPTI_ACTIVITY)
  /* create stream by VT CUPTI callbacks implementation (CUstream is given) */
  if(strmID == VT_CUPTI_NO_STREAM_ID){
    if(cuStrm != VT_CUPTI_NO_STREAM){
      VT_CUPTI_CALL(cuptiGetStreamId(vtCtx->cuCtx, cuStrm, &strmID), 
                                     "cuptiGetStreamId");
    }else{
      vt_warning("[CUPTI] Neither CUDA stream nor stream ID given!");
      free(vtStrm);
      return NULL;
    }
  }
#else /* only VT_CUPTI_CALLBACKS is defined */
  if(vtCtx->callbacks != NULL){
    strmID = vtCtx->callbacks->streamsCreated;
    vtCtx->callbacks->streamsCreated++;
  }
#endif

  vtStrm->cuStrmID = strmID;
  
  /* create VampirTrace thread */
  {
    char thread_name[16] = "CUDA";

    if(vt_gpu_stream_reuse){
      if(vtCtx->devID != VT_NO_ID){
        if(-1 == snprintf(thread_name+4, 12, "[%d]", vtCtx->devID))
          vt_cntl_msg(1, "Could not create thread name for CUDA thread!");
      }
    }else{
      if(vtCtx->devID == VT_NO_ID){
        if(-1 == snprintf(thread_name+4, 12, "[?:%d]", strmID))
          vt_cntl_msg(1, "Could not create thread name for CUDA thread!");
      }else{
        if(-1 == snprintf(thread_name+4, 12, "[%d:%d]", vtCtx->devID, strmID))
          vt_cntl_msg(1, "Could not create thread name for CUDA thread!");
      }
    }
    
    VT_CHECK_THREAD;
    vt_gpu_registerThread(thread_name, VT_MY_THREAD, &(vtStrm->vtThrdID));
  }
  
  if(vt_gpu_init_time < vt_start_time)
      vt_gpu_init_time = vt_start_time;
  
  /* for the first stream created for this context */
  if(vtCtx->strmList == NULL){
    if(vt_gpu_trace_idle > 0){      
      /* write enter event for GPU_IDLE on first stream */
      vt_enter(vtStrm->vtThrdID, &vt_gpu_init_time, vt_gpu_rid_idle);
      /*vt_warning("IDLEente: %llu (%d)", vt_gpu_init_time, vtStrm->vtThrdID);*/
      
#if defined(VT_CUPTI_ACTIVITY)
      if(vtCtx->activity != NULL)
        vtCtx->activity->gpuIdleOn = 1;
#endif
    }
    
    /* set the counter value for cudaMalloc to 0 on first stream */
    if(vt_gpu_trace_memusage > 0)
      vt_count(vtStrm->vtThrdID, &vt_gpu_init_time, vt_gpu_cid_memusage, 0);
  }

  if(vt_gpu_trace_kernels > 1){
    /* set count values to zero */
    vt_count(vtStrm->vtThrdID, &vt_gpu_init_time, vt_cupti_cid_blocksPerGrid, 0);
    vt_count(vtStrm->vtThrdID, &vt_gpu_init_time, vt_cupti_cid_threadsPerBlock, 0);
    vt_count(vtStrm->vtThrdID, &vt_gpu_init_time, vt_cupti_cid_threadsPerKernel, 0);
  }
  
  /* prepend the stream 
  vtStrm->next = vtCtx->strmList;
  vtCtx->strmList = vtStrm;*/
  
  return vtStrm;
}
/*
 * Use the CUPTI activity kernel record to write the corresponding VampirTrace
 * events.
 * 
 * @param kernel the CUPTI activity kernel record
 * @param vtCtx the VampirTrace CUPTI activity context
 */
static void vt_cuptiact_writeKernelRecord(CUpti_ActivityKernel *kernel, 
                                          vt_cupti_ctx_t *vtCtx)
{
  vt_cupti_activity_t *vtcuptiActivity = vtCtx->activity;
  vt_cuptiact_strm_t *vtStrm = NULL;
  uint32_t vtThrdID = VT_NO_ID;
  uint32_t knRID = VT_NO_ID;
  vt_gpu_hn_string_t *hn = NULL;
  
  VT_SUSPEND_MALLOC_TRACING(vtCtx->ptid);
  
  /* get VampirTrace thread ID for the kernel's stream */  
  vtStrm = vt_cuptiact_checkStream(vtCtx, kernel->streamId);
  vtThrdID = vtStrm->vtThrdID;
  
  VT_RESUME_MALLOC_TRACING(vtCtx->ptid);
  
  /* get the VampirTrace region ID for the kernel */
  hn = vt_gpu_stringHashGet(kernel->name);
  if(hn){
    knRID = hn->rid;
  }else{
    char *knName = vt_cuda_demangleKernel(kernel->name);
    
    if(knName == NULL || *knName == '\0') {
      knName = (char *)kernel->name;
      
      if(knName == NULL) knName = "unknownKernel";
    }
    
    knRID = vt_def_region(VT_MASTER_THREAD, knName, VT_NO_ID,
                          VT_NO_LNO, VT_NO_LNO, "CUDA_KERNEL", VT_FUNCTION);

    hn = vt_gpu_stringHashPut(kernel->name, knRID);
  }

  /* write events */
  {
    uint64_t start = vtcuptiActivity->sync.hostStart 
                   + (kernel->start - vtcuptiActivity->sync.gpuStart) * vtcuptiActivity->sync.factor;
    uint64_t stop = start + (kernel->end - kernel->start) * vtcuptiActivity->sync.factor;
    
    /* if current activity's start time is before last written timestamp */
    if(start < vtStrm->vtLastTime){
      vt_warning("[CUPTI Activity] Kernel: start time < last written timestamp!");
      vt_warning("[CUPTI Activity] Kernel: '%s', CUdevice: %d, "
                 "CUDA stream ID: %d, Thread ID: %d", 
                 hn->sname, vtCtx->cuDev, vtStrm->strmID, vtStrm->vtThrdID);
      
      if(vtStrm->vtLastTime < stop){
        vt_warning("[CUPTI Activity] Set kernel start time to sync-point time "
                   "(cut %.4lf%%)", 
                   100.0*(double)(vtStrm->vtLastTime - start)/(double)(stop-start));
        start = vtStrm->vtLastTime;
      }else{
        vt_warning("[CUPTI Activity] Skipping ...");
        return;
      }
    }
    
    /* check if time between start and stop is increasing */
    if(stop < start){
      vt_warning("[CUPTI Activity] Kernel: start time > stop time!");
      vt_warning("[CUPTI Activity] Skipping '%s' on CUDA device:stream [%d:%d],"
                 " Thread ID %d", 
                 hn->sname, vtCtx->cuDev, vtStrm->strmID, vtStrm->vtThrdID);
      return;
    }
    
    /* check if synchronization stop time is before kernel stop time */
    if(vtcuptiActivity->sync.hostStop < stop){
      vt_warning("[CUPTI Activity] Kernel: sync-point time < kernel stop time");
      vt_warning("[CUPTI Activity] Kernel: '%s', CUdevice: %d, "
                 "CUDA stream ID: %d, Thread ID: %d", 
                 hn->sname, vtCtx->cuDev, vtStrm->strmID, vtStrm->vtThrdID);
      
      /* Write kernel with sync.hostStop stop time stamp, if possible */
      if(vtcuptiActivity->sync.hostStop > start){
        vt_warning("[CUPTI Activity] Set kernel stop time to sync-point time "
                   "(cut %.4lf%%)", 
                   100.0*(double)(stop - vtcuptiActivity->sync.hostStop)/
                   (double)(stop-start));
        
        stop = vtcuptiActivity->sync.hostStop;
      }else{
        vt_warning("[CUPTI Activity] Skipping ...");
        return;
      }
    }
    
    /* set the last VampirTrace timestamp, written in this stream */
    vtStrm->vtLastTime = stop;

    /*vt_cntl_msg(1, "'%s'(%d) start: %llu; stop: %llu (tid: %d)", 
                   kernel->name, knRID, start, stop, vtThrdID);*/
    
    /* GPU idle time will be written to first CUDA stream in list */
    if(vt_gpu_trace_idle){
      if(vtcuptiActivity->gpuIdleOn){
        /*vt_warning("IDLEexit: %llu (%d)", start, vtCtx->strmList->vtThrdID);*/
        vt_exit(vtcuptiActivity->strmList->vtThrdID, &start);
        vtcuptiActivity->gpuIdleOn = 0;
      }else if(start > vtcuptiActivity->vtLastGPUTime){
        /* idle is off and kernels are consecutive */
        /*vt_warning("IDLEente: %llu (%d)", vtCtx->vtLastGPUTime, vtCtx->strmList->vtThrdID);
        vt_warning("IDLEexit: %llu (%d)", start, vtCtx->strmList->vtThrdID);*/
        vt_enter(vtcuptiActivity->strmList->vtThrdID, &(vtcuptiActivity->vtLastGPUTime), vt_gpu_rid_idle);
        vt_exit(vtcuptiActivity->strmList->vtThrdID, &start);
      }
    }

    vt_enter(vtThrdID, &start, knRID);
    /*vt_warning("KERNente: %llu (%d)", start, vtThrdID);*/
    
    /* use counter to provide additional information for kernels */
    if(vt_gpu_trace_kernels > 1){
      /* grid and block size counter (start) */
      {
        uint32_t threadsPerBlock = kernel->blockX * kernel->blockY * kernel->blockZ;
        uint32_t blocksPerGrid = kernel->gridX * kernel->gridY * kernel->gridZ;

        vt_count(vtThrdID, &start, vt_cupti_cid_blocksPerGrid, 
                 blocksPerGrid);
        vt_count(vtThrdID, &start, vt_cupti_cid_threadsPerBlock, 
                 threadsPerBlock);
        vt_count(vtThrdID, &start, vt_cupti_cid_threadsPerKernel,
                 threadsPerBlock * blocksPerGrid);
      }

      /* memory counter (start) */
      vt_count(vtThrdID, &start, vt_cuptiact_cid_knStaticSharedMem,
               kernel->staticSharedMemory);
      vt_count(vtThrdID, &start, vt_cuptiact_cid_knDynamicSharedMem,
               kernel->dynamicSharedMemory);
      vt_count(vtThrdID, &start, vt_cuptiact_cid_knLocalMemTotal,
               kernel->localMemoryTotal);
      vt_count(vtThrdID, &start, vt_cuptiact_cid_knRegistersPerThread,
               kernel->registersPerThread);

      /* memory counter (stop) */
      vt_count(vtThrdID, &stop, vt_cuptiact_cid_knStaticSharedMem, 0);
      vt_count(vtThrdID, &stop, vt_cuptiact_cid_knDynamicSharedMem, 0);
      vt_count(vtThrdID, &stop, vt_cuptiact_cid_knLocalMemTotal, 0);
      vt_count(vtThrdID, &stop, vt_cuptiact_cid_knRegistersPerThread, 0);

      /* grid and block size counter (stop) */
      vt_count(vtThrdID, &stop, vt_cupti_cid_blocksPerGrid, 0);
      vt_count(vtThrdID, &stop, vt_cupti_cid_threadsPerBlock, 0);
      vt_count(vtThrdID, &stop, vt_cupti_cid_threadsPerKernel, 0);
    }
    
    vt_exit(vtThrdID, &stop);
    /*vt_warning("KERNexit: %llu (%d)", stop, vtThrdID);*/
    
    if(vtcuptiActivity->vtLastGPUTime < stop) vtcuptiActivity->vtLastGPUTime = stop;
  }

  /*vt_cntl_msg(1, "KERNEL '%s' [%llu ns] device %u, context %u, stream %u, "
                 "correlation %u/r%u\n"
                 "\t grid [%u,%u,%u], block [%u,%u,%u], "
                 "shared memory (static %u, dynamic %u)",
             kernel->name, (unsigned long long)(kernel->end - kernel->start),
             kernel->deviceId, kernel->contextId, kernel->streamId, 
             kernel->correlationId, kernel->runtimeCorrelationId,
             kernel->gridX, kernel->gridY, kernel->gridZ,
             kernel->blockX, kernel->blockY, kernel->blockZ,
             kernel->staticSharedMemory, kernel->dynamicSharedMemory);*/
}
/*
 * Use the CUPTI activity memory copy record to write the corresponding 
 * VampirTrace events.
 * 
 * @param mcpy the CUPTI activity memory copy record
 * @param vtCtx the VampirTrace CUPTI activity context
 */
static void vt_cuptiact_writeMemcpyRecord(CUpti_ActivityMemcpy *mcpy, 
                                          vt_cupti_ctx_t *vtCtx)
{
  vt_cupti_activity_t *vtcuptiActivity = vtCtx->activity;
  vt_gpu_copy_kind_t kind = VT_GPU_COPYDIRECTION_UNKNOWN;

  uint32_t vtThrdID;
  uint64_t start, stop;
  vt_cuptiact_strm_t *vtStrm = NULL;
  
  /*vt_cntl_msg(1,"mcpycopykind: %d (strm %d)", mcpy->copyKind, mcpy->streamId);*/
  if(mcpy->copyKind == CUPTI_ACTIVITY_MEMCPY_KIND_DTOD) return;
  
  start = vtcuptiActivity->sync.hostStart 
                 + (mcpy->start - vtcuptiActivity->sync.gpuStart) * vtcuptiActivity->sync.factor;
  stop = start + (mcpy->end - mcpy->start) * vtcuptiActivity->sync.factor;
  
  VT_SUSPEND_MALLOC_TRACING(vtCtx->ptid);
  /* get VampirTrace thread ID for the memcpy's stream */
  vtStrm = vt_cuptiact_checkStream(vtCtx, mcpy->streamId);
  vtThrdID = vtStrm->vtThrdID;
  VT_RESUME_MALLOC_TRACING(vtCtx->ptid);
  
  /* if current activity's start time is before last written timestamp */
  if(start < vtStrm->vtLastTime){
    vt_warning("[CUPTI Activity] Memcpy: start time < last written timestamp! "
               "(CUDA device:stream [%d:%d], Thread ID: %d)", 
               vtCtx->cuDev, vtStrm->strmID, vtStrm->vtThrdID);
      

    if(vtStrm->vtLastTime < stop){
      vt_warning("[CUPTI Activity] Set memcpy start time to sync-point time "
                 "(truncate %.4lf%%)", 
                 100.0*(double)(vtStrm->vtLastTime - start)/(double)(stop - start));
      start = vtStrm->vtLastTime;
    }else{
      vt_warning("[CUPTI Activity] Skipping ...");
      return;
    }
  }
  
  /* check if time between start and stop is increasing */
  if(stop < start){
    vt_warning("[CUPTI Activity] Skipping memcpy (start time > stop time) on "
               "CUdevice:Stream %d:%d, Thread ID %d", 
               vtCtx->cuDev, vtStrm->strmID, vtStrm->vtThrdID);
    return;
  }

  /* check if synchronization stop time is before memcpy stop time */
  if(vtcuptiActivity->sync.hostStop < stop){
    vt_warning("[CUPTI Activity] Memcpy: sync stop time < stop time! "
               "(CUDA device:stream [%d:%d], Thread ID: %d)", 
               vtCtx->cuDev, vtStrm->strmID, vtStrm->vtThrdID);
      
    /* Write memcpy with sync.hostStop stop time stamp, if possible */
    if(vtcuptiActivity->sync.hostStop > start){
      vt_warning("[CUPTI Activity] Set memcpy stop time to sync-point time "
                 "(truncate %.4lf%%)", 
                 100.0*(double)(stop - vtcuptiActivity->sync.hostStop)/
                 (double)(stop - start));
      
      stop = vtcuptiActivity->sync.hostStop;
    }else{
      vt_warning("[CUPTI Activity] Skipping ...");
      return;
    }
  }
  
  /* set the last VampirTrace timestamp, written in this stream */
  vtStrm->vtLastTime = stop;
  
  /* check copy direction */
  if(mcpy->srcKind == CUPTI_ACTIVITY_MEMORY_KIND_DEVICE){
    if(mcpy->dstKind == CUPTI_ACTIVITY_MEMORY_KIND_DEVICE){
      kind = VT_GPU_DEV2DEV;
    }else{
      kind = VT_GPU_DEV2HOST;
    }
  }else{
    if(mcpy->dstKind == CUPTI_ACTIVITY_MEMORY_KIND_DEVICE){
      kind = VT_GPU_HOST2DEV;
    }else{
      kind = VT_GPU_HOST2HOST;
    }
  }
  
  if(vtcuptiActivity->gpuIdleOn == 0 && mcpy->streamId == vtcuptiActivity->defaultStrmID){
    vt_enter(vtcuptiActivity->strmList->vtThrdID, &(vtcuptiActivity->vtLastGPUTime), vt_gpu_rid_idle);
    vtcuptiActivity->gpuIdleOn = 1;
    /*vt_warning("IDLMente: %llu (%d)", vtCtx->vtLastGPUTime, vtCtx->strmList->vtThrdID);*/
  }
  
  /*VT_CUPTI_LOCK();*/
  if(kind != VT_GPU_DEV2DEV) vt_gpu_prop[vtCtx->ptid] |= VTGPU_GPU_COMM;
  vt_gpu_prop[vtThrdID] |= VTGPU_GPU_COMM;
  /*VT_CUPTI_UNLOCK();*/
  /*
  vt_warning("MCPYente: %llu (%d)", start, vtThrdID);
  vt_warning("MCPYexit: %llu (%d)", stop, vtThrdID);
  */
  if(kind == VT_GPU_HOST2DEV){
    vt_mpi_rma_get(vtThrdID, &start, VT_GPU_RANK_ID(vtCtx->ptid),
                   vt_gpu_commCID, 0, mcpy->bytes);
  }else if(kind == VT_GPU_DEV2HOST){
    vt_mpi_rma_put(vtThrdID, &start, VT_GPU_RANK_ID(vtCtx->ptid),
                   vt_gpu_commCID, 0, mcpy->bytes);
  }else if(kind == VT_GPU_DEV2DEV){
    vt_mpi_rma_get(vtThrdID, &start, VT_GPU_RANK_ID(vtThrdID),
                   vt_gpu_commCID, 0, mcpy->bytes);
  }
  
  if(kind != VT_GPU_HOST2HOST){
    vt_mpi_rma_end(vtThrdID, &stop, vt_gpu_commCID, 0);
  }
  
  /*if(vtCtx->vtLastGPUTime < stop) vtCtx->vtLastGPUTime = stop;*/
  
  /*vt_cntl_msg(1, "MEMCPY %llu -> %llu[%llu ns] device %u, context %u, stream %u, "
                     "correlation %u/r%u",
               mcpy->start, mcpy->end, 
               (unsigned long long)(mcpy->end - mcpy->start),
               mcpy->deviceId, mcpy->contextId, mcpy->streamId, 
               mcpy->correlationId, mcpy->runtimeCorrelationId);*/
}
static int get_new_trcid()
{
  int new_trcid;
  int fd;
  int8_t tmp_len;
  struct flock fl;
  char tmp[10] = "";
  uint8_t do_unlock = 1;

  vt_libassert(trcid_filename[0] != '\0');

  VT_SUSPEND_IO_TRACING(VT_CURRENT_THREAD);

  /* open/create temp. id file */
  if ( (fd = open(trcid_filename,
		  (O_RDWR | O_CREAT),
		  (S_IRUSR | S_IWUSR))) == -1 )
    vt_error_msg("Cannot open file %s: %s", trcid_filename, strerror(errno));

  /* lock temp. id file */
  fl.l_type = F_WRLCK; fl.l_whence = SEEK_SET; fl.l_start = 0; fl.l_len = 0;
  if (fcntl(fd, F_SETLKW, &fl) == -1)
  {
    do_unlock = 0;
    vt_warning("Cannot lock file %s: %s", trcid_filename, strerror(errno));
  }

  /* read current trace id */
  if ( read(fd, tmp, 9) == -1 )
    vt_error_msg("Cannot read file %s: %s", trcid_filename, strerror(errno));
  tmp[9] = '\0';

  if ( tmp[0] == '\0' )
    new_trcid = 1;             /* set trace id to 1, if file is empty */
  else
    new_trcid = atoi(tmp) + 1; /* increment trace id */

  /* write new trace id */
  lseek(fd, 0, SEEK_SET);
  snprintf(tmp, sizeof(tmp)-1, "%i\n", new_trcid);
  tmp_len = strlen( tmp );
  if( tmp_len > write( fd, tmp, tmp_len ) ){
    vt_error_msg( "Failed to write to file %s: %s", trcid_filename,strerror(errno) );
  }

  /* unlock temp. id file */
  if ( do_unlock )
  {
    fl.l_type = F_UNLCK;
    if ( fcntl(fd, F_SETLK, &fl) == -1 )
      vt_error_msg("Cannot unlock file %s: %s", trcid_filename, strerror(errno));
  }

  /* close temp. id file */
  close(fd);

  vt_cntl_msg(2, "Updated trace-id in %s to %i", trcid_filename, new_trcid);

  VT_RESUME_IO_TRACING(VT_CURRENT_THREAD);

  return new_trcid;
}