/*
 * Retrieve the VampirTrace CUPTI context for the given CUDA context; if it is
 * not available yet, create and register it.
 * 
 * @param cuCtx the CUDA context
 * 
 * @return the VampirTrace CUPTI context
 */
vt_cupti_ctx_t* vt_cupti_getCreateCtx(CUcontext cuCtx)
{
  vt_cupti_ctx_t* vtCtx = vt_cupti_getCtx(cuCtx);
  
  if(vtCtx == NULL){
    /* do not record VampirTrace's own allocations for the context structure */
    VT_SUSPEND_MALLOC_TRACING(VT_CURRENT_THREAD);
    vtCtx = vt_cupti_createCtx(cuCtx, VT_CUPTI_NO_CUDA_DEVICE,
                               VT_CUPTI_NO_CONTEXT_ID, VT_CUPTI_NO_DEVICE_ID);
    VT_RESUME_MALLOC_TRACING(VT_CURRENT_THREAD);

    vt_cupti_prependCtx(vtCtx);
  }
  
  return vtCtx;
}
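
/*
 * Illustrative usage sketch (an assumption, not part of the original code):
 * the get-or-create pattern above is typically driven from a CUPTI callback,
 * where only the CUDA context is known. cbInfo stands for the
 * CUpti_CallbackData argument of such a callback.
 *
 *   vt_cupti_ctx_t *vtCtx = vt_cupti_getCreateCtx(cbInfo->context);
 *   // vtCtx is now registered in the global VampirTrace CUPTI context list
 */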
/*
 * Increases the "Allocated CUDA memory" counter.
 *
 * @param ctxID CUDA context identifier (@see CUPTI callback info)
 * @param cuCtx the CUDA context
 * @param devPtr pointer to the allocated memory (needed to identify the
 *               allocation again when the memory is freed)
 * @param size the number of bytes allocated
 */
void vt_cuptiact_writeMalloc(uint32_t ctxID, CUcontext cuCtx, 
                             void *devPtr, size_t size)
{
  uint64_t vtTime;
  vt_cupti_ctx_t* vtCtx = NULL;
  vt_cupti_activity_t *vtcuptiActivity = NULL;
  vt_cupti_gpumem_t *vtMalloc = NULL;
  
  if(devPtr == NULL) return;
  
  VT_SUSPEND_MALLOC_TRACING(VT_CURRENT_THREAD);
  
  vtMalloc = (vt_cupti_gpumem_t*)malloc(sizeof(vt_cupti_gpumem_t));
  if(vtMalloc == NULL){
    vt_warning("[CUPTI Activity] Could not allocate GPU memory list entry!");
    VT_RESUME_MALLOC_TRACING(VT_CURRENT_THREAD);
    return;
  }
  
  vtMalloc->memPtr = devPtr;
  vtMalloc->size = size;
  
  /* check for VampirTrace CUPTI context */
  vtCtx = vt_cupti_getCtx(cuCtx);
  if(vtCtx == NULL){
    vtCtx = vt_cupti_createCtx(cuCtx, VT_CUPTI_NO_CUDA_DEVICE, ctxID,
                               VT_CUPTI_NO_DEVICE_ID);
    vt_cupti_prependCtx(vtCtx);
  }
  
  /* check for VampirTrace CUPTI activity context */
  if(vtCtx->activity == NULL){
    vtCtx->activity = vt_cuptiact_createCtxActivity(cuCtx);
  }
  vtcuptiActivity = vtCtx->activity;

  /* lock access to the VampirTrace CUPTI context */
  VT_CUPTI_LOCK();
  
  /* flush activity buffer */
  vt_cuptiact_flushCtxActivities(vtCtx);
  
  /* add malloc entry to list */
  vtMalloc->next = vtcuptiActivity->gpuMemList;
  vtcuptiActivity->gpuMemList = vtMalloc;
  
  /* increase allocated memory counter */
  vtcuptiActivity->gpuMemAllocated += size;

  /* create the default stream, if no stream exists for this context yet */
  if(vtcuptiActivity->strmList == NULL){
    /* ensure the initial counter value is not written before trace begin */
    if(vt_gpu_init_time < vt_start_time)
      vt_gpu_init_time = vt_start_time;
        
    vtcuptiActivity->strmList = 
      vt_cuptiact_createStream(vtCtx, vtcuptiActivity->defaultStrmID);
    
    /* write the initial value (zero bytes allocated) of the memory counter */
    vt_count(vtcuptiActivity->strmList->vtThrdID, &vt_gpu_init_time, 
             vt_gpu_cid_memusage, 0);
  }
  
  VT_CUPTI_UNLOCK();
  
  VT_RESUME_MALLOC_TRACING(VT_CURRENT_THREAD);
  
  /* write counter value */
  vtTime = vt_pform_wtime();
  vt_count(vtcuptiActivity->strmList->vtThrdID, &vtTime, vt_gpu_cid_memusage, 
           (uint64_t)(vtcuptiActivity->gpuMemAllocated));
}
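
/*
 * Illustrative call site (an assumption, not part of the original code): a
 * CUPTI callback handling the exit of a cudaMalloc call could record the
 * allocation as sketched below. cbInfo stands for the CUpti_CallbackData
 * argument; devPtr and size stand for the intercepted call's arguments.
 *
 *   vt_cuptiact_writeMalloc(cbInfo->contextUid, cbInfo->context,
 *                           *devPtr, size);
 */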
/*
 * Decreases the "Allocated CUDA memory" counter.
 *
 * @param ctxID CUDA context identifier (@see CUPTI callback info)
 * @param cuCtx the CUDA context
 * @param devPtr pointer to the allocated memory
 */
void vt_cuptiact_writeFree(uint32_t ctxID, CUcontext cuCtx, void *devPtr)
{
  uint64_t vtTime;
  vt_cupti_ctx_t* vtCtx = NULL;
  vt_cupti_activity_t *vtcuptiActivity = NULL;
  vt_cupti_gpumem_t *curMalloc = NULL;
  vt_cupti_gpumem_t *lastMalloc = NULL;

  if(devPtr == NULL) return;
  
  VT_SUSPEND_MALLOC_TRACING(VT_CURRENT_THREAD);
  
  /* check for VampirTrace CUPTI context */
  vtCtx = vt_cupti_getCtx(cuCtx);
  if(vtCtx == NULL){
    vtCtx = vt_cupti_createCtx(cuCtx, VT_CUPTI_NO_CUDA_DEVICE, ctxID,
                               VT_CUPTI_NO_DEVICE_ID);
    vt_cupti_prependCtx(vtCtx);
  }
  
  /* check for VampirTrace CUPTI activity context */
  if(vtCtx->activity == NULL){
    vtCtx->activity = vt_cuptiact_createCtxActivity(cuCtx);
  }
  vtcuptiActivity = vtCtx->activity;
  
  VT_CUPTI_LOCK();
  
  /* flush activity buffer */
  vt_cuptiact_flushCtxActivities(vtCtx);

  curMalloc = vtcuptiActivity->gpuMemList;
  lastMalloc = curMalloc;

  /* lookup the CUDA malloc entry by its memory pointer */
  while(curMalloc != NULL){
    if(devPtr == curMalloc->memPtr){

      /* decrease allocated counter value and write it */
      vtTime = vt_pform_wtime();
      vtcuptiActivity->gpuMemAllocated -= curMalloc->size;
      vt_count(vtcuptiActivity->strmList->vtThrdID, &vtTime, vt_gpu_cid_memusage,
               (uint64_t)(vtcuptiActivity->gpuMemAllocated));


      /* unlink the current element from the list */
      lastMalloc->next = curMalloc->next;

      /* if the current element is the head of the list, advance the head */
      if(curMalloc == vtcuptiActivity->gpuMemList){
        vtcuptiActivity->gpuMemList = curMalloc->next;
      }

      /* free the VampirTrace list entry for this CUDA allocation */
      curMalloc->next = NULL;
      free(curMalloc);
      curMalloc = NULL;

      /* reset the list head, if no allocated memory is left */
      if(vtcuptiActivity->gpuMemAllocated == 0) {
        vtcuptiActivity->gpuMemList = NULL;
      }
  
      VT_CUPTI_UNLOCK();
      VT_RESUME_MALLOC_TRACING(VT_CURRENT_THREAD);
      return;
    }

    lastMalloc = curMalloc;
    curMalloc = curMalloc->next;
  }

  VT_CUPTI_UNLOCK();
  
  VT_RESUME_MALLOC_TRACING(VT_CURRENT_THREAD);

  vt_warning("[CUPTI Activity] Attempt to free CUDA memory that has not been allocated!");
}
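
/*
 * Illustrative call site (an assumption, not part of the original code): the
 * matching cudaFree exit callback would decrease the counter with the same
 * context information. cbInfo stands for the CUpti_CallbackData argument,
 * devPtr for the pointer passed to cudaFree.
 *
 *   vt_cuptiact_writeFree(cbInfo->contextUid, cbInfo->context, devPtr);
 */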