/*
 * Increases the "Allocated CUDA memory" counter.
 *
 * @param ctxID CUDA context identifier (@see CUPTI callback info)
 * @param cuCtx the CUDA context
 * @param devPtr pointer to the allocated memory (needed for vtcudaFree())
 * @param size the number of bytes allocated
 */
void vt_cuptiact_writeMalloc(uint32_t ctxID, CUcontext cuCtx, 
                             void *devPtr, size_t size)
{
  uint64_t vtTime;
  vt_cuptiact_ctx_t* vtCtx = NULL;
  vt_cuptiact_gpumem_t *vtMalloc = NULL;
  
  /* nothing to record for a failed/NULL device allocation */
  if(devPtr == NULL) return;
  
  /* allocate the list entry only after the early-out above
     (allocating before the check leaked the entry for NULL devPtr) */
  vtMalloc = (vt_cuptiact_gpumem_t*)malloc(sizeof(vt_cuptiact_gpumem_t));
  if(vtMalloc == NULL){
    vt_warning("[CUPTI Activity] Could not allocate GPU memory list entry!");
    return;
  }
  
  /* flush activity buffer */
  vt_cuptiact_flushCtxActivities(cuCtx);
  
  vtMalloc->memPtr = devPtr;
  vtMalloc->size = size;
  
  /* look up (or lazily create) the VampirTrace CUPTI activity context */
  vtCtx = vt_cuptiact_getCtx(cuCtx);
  if(vtCtx == NULL){
    vtCtx = vt_cuptiact_createContext(ctxID, cuCtx, (uint32_t)-1);
  }
  
  /* lock the work on the context */
  VT_CUPTI_ACT_LOCK();
  
  /* prepend malloc entry to the context's GPU memory list */
  vtMalloc->next = vtCtx->gpuMemList;
  vtCtx->gpuMemList = vtMalloc;
  
  /* increase allocated memory counter */
  vtCtx->gpuMemAllocated += size;

  /* check if first CUDA stream is available; if not, create the default
     stream and write an initial zero counter value at GPU init time */
  if(vtCtx->strmList == NULL){
    if(vt_gpu_init_time < vt_start_time)
      vt_gpu_init_time = vt_start_time;
        
    vtCtx->strmList = vt_cuptiact_createStream(vtCtx, vtCtx->defaultStrmID);
    vt_count(vtCtx->strmList->vtThrdID, &vt_gpu_init_time, vt_gpu_cid_memusage, 0);
  }
  
  VT_CUPTI_ACT_UNLOCK();
  
  /* write the updated counter value with the current timestamp */
  vtTime = vt_pform_wtime();
  vt_count(vtCtx->strmList->vtThrdID, &vtTime, vt_gpu_cid_memusage, 
           (uint64_t)(vtCtx->gpuMemAllocated));
}
/*
 * Increases the "Allocated CUDA memory" counter.
 *
 * @param ctxID CUDA context identifier (@see CUPTI callback info)
 * @param cuCtx the CUDA context
 * @param devPtr pointer to the allocated memory (needed for vtcudaFree())
 * @param size the number of bytes allocated
 */
void vt_cuptiact_writeMalloc(uint32_t ctxID, CUcontext cuCtx, 
                             void *devPtr, size_t size)
{
  uint64_t vtTime;
  vt_cupti_ctx_t* vtCtx = NULL;
  vt_cupti_activity_t *vtcuptiActivity = NULL;
  vt_cupti_gpumem_t *vtMalloc = NULL;
  
  /* nothing to record for a failed/NULL device allocation */
  if(devPtr == NULL) return;
  
  /* avoid recursive tracing of our own host-side malloc below */
  VT_SUSPEND_MALLOC_TRACING(VT_CURRENT_THREAD);
  
  vtMalloc = (vt_cupti_gpumem_t*)malloc(sizeof(vt_cupti_gpumem_t));
  if(vtMalloc == NULL){
    vt_warning("[CUPTI Activity] Could not allocate GPU memory list entry!");
    VT_RESUME_MALLOC_TRACING(VT_CURRENT_THREAD);
    return;
  }
  
  vtMalloc->memPtr = devPtr;
  vtMalloc->size = size;
  
  /* check for VampirTrace CUPTI context, create it if not yet available */
  vtCtx = vt_cupti_getCtx(cuCtx);
  if(vtCtx == NULL){
    vtCtx = vt_cupti_createCtx(cuCtx, VT_CUPTI_NO_CUDA_DEVICE, ctxID, VT_CUPTI_NO_DEVICE_ID);
    vt_cupti_prependCtx(vtCtx);
  }
  
  /* check for VampirTrace CUPTI activity context, create it lazily */
  if(vtCtx->activity == NULL){
    vtCtx->activity = vt_cuptiact_createCtxActivity(cuCtx);
  }
  vtcuptiActivity = vtCtx->activity;

  /* lock the work on the context */
  VT_CUPTI_LOCK();
  
  /* flush activity buffer */
  vt_cuptiact_flushCtxActivities(vtCtx);
  
  /* prepend malloc entry to the context's GPU memory list */
  vtMalloc->next = vtcuptiActivity->gpuMemList;
  vtcuptiActivity->gpuMemList = vtMalloc;
  
  /* increase allocated memory counter */
  vtcuptiActivity->gpuMemAllocated += size;

  /* check if first CUDA stream is available; if not, create the default
     stream and write an initial zero counter value at GPU init time */
  if(vtcuptiActivity->strmList == NULL){
    if(vt_gpu_init_time < vt_start_time)
      vt_gpu_init_time = vt_start_time;
        
    vtcuptiActivity->strmList = vt_cuptiact_createStream(vtCtx, vtcuptiActivity->defaultStrmID);
    vt_count(vtcuptiActivity->strmList->vtThrdID, &vt_gpu_init_time, vt_gpu_cid_memusage, 0);
  }
  
  VT_CUPTI_UNLOCK();
  
  VT_RESUME_MALLOC_TRACING(VT_CURRENT_THREAD);
  
  /* write the updated counter value with the current timestamp */
  vtTime = vt_pform_wtime();
  vt_count(vtcuptiActivity->strmList->vtThrdID, &vtTime, vt_gpu_cid_memusage, 
           (uint64_t)(vtcuptiActivity->gpuMemAllocated));
}
/*
 * Check for a VampirTrace activity stream by stream ID. If it does not exist,
 * create it.
 * 
 * @param vtCtx VampirTrace CUPTI context (must have its activity context set)
 * @param strmID the CUDA stream ID provided by CUPTI callback API
 * 
 * @return the VampirTrace CUDA stream (NULL, if no context was given)
 */
static vt_cuptiact_strm_t* vt_cuptiact_checkStream(vt_cupti_ctx_t* vtCtx, 
                                                   uint32_t strmID)
{
  vt_cupti_activity_t *vtcuptiActivity = NULL;
  vt_cuptiact_strm_t *currStrm = NULL;
  vt_cuptiact_strm_t *lastStrm = NULL;
  vt_cuptiact_strm_t *reusableStrm = NULL;
  
  /* guard BEFORE dereferencing vtCtx (the original read vtCtx->activity
     first, which made this NULL check unreachable in the failing case) */
  if(vtCtx == NULL){
    vt_warning("[CUPTI Activity] No context given in vt_cuptiact_checkStream()!");
    return NULL;
  }
  
  /* NOTE(review): assumes vtCtx->activity is non-NULL here — callers appear
     to create it before stream lookup; confirm against call sites */
  vtcuptiActivity = vtCtx->activity;
  
  /* lookup stream */
  /*VT_CUPTI_LOCK();*/
  currStrm = vtcuptiActivity->strmList;
  lastStrm = vtcuptiActivity->strmList;
  while(currStrm != NULL){
    /* check for existing stream */
    if(currStrm->strmID == strmID){
      /*VT_CUPTI_UNLOCK();*/
      return currStrm;
    }
    
    /* remember the first destroyed stream as a reuse candidate */
    if(vt_gpu_stream_reuse && reusableStrm == NULL && currStrm->destroyed == 1){
      reusableStrm = currStrm;
    }
    
    lastStrm = currStrm;
    currStrm = currStrm->next;
  }
  
  /* reuse a destroyed stream, if there is any available */
  if(vt_gpu_stream_reuse && reusableStrm){
    vt_cntl_msg(2, "[CUPTI Activity] Reusing CUDA stream %d with stream %d",
                   reusableStrm->strmID, strmID);
    reusableStrm->destroyed = 0;
    reusableStrm->strmID = strmID;

    return reusableStrm;
  }
  
  /* 
   * If stream list is empty, the stream to be created is not the default
   * stream and GPU idle and memory copy tracing is enabled, then create
   * a default stream.
   */
  if(vtcuptiActivity->strmList == NULL && strmID != vtcuptiActivity->defaultStrmID && 
     vt_gpu_trace_idle == 1 && vt_gpu_trace_mcpy){
    vtcuptiActivity->strmList = 
            vt_cuptiact_createStream(vtCtx, vtcuptiActivity->defaultStrmID);
    lastStrm = vtcuptiActivity->strmList;
  }
  
  /* create the stream, which has not been created yet */
  currStrm = vt_cuptiact_createStream(vtCtx, strmID);
  
  /* append the newly created stream (lastStrm is the list tail, or NULL
     when the list was empty and no default stream was created above) */
  if(NULL != lastStrm) lastStrm->next = currStrm;
  else vtcuptiActivity->strmList = currStrm;
  
  /*VT_CUPTI_UNLOCK();*/
  return currStrm;
}