/*
 * Remove a context from the global context list and return it.
 * 
 * @param cuCtx pointer to the CUDA context
 * @return the removed VampirTrace CUPTI context, or NULL if it was not found
 */
vt_cupti_ctx_t* vt_cupti_removeCtx(CUcontext *cuCtx)
{
  vt_cupti_ctx_t *currCtx = NULL;
  vt_cupti_ctx_t *lastCtx = NULL;

  VT_CUPTI_LOCK();
  currCtx = vt_cupti_ctxList;
  lastCtx = vt_cupti_ctxList;
  while(currCtx != NULL){
    if(currCtx->cuCtx == *cuCtx){
      /* if first element in list */
      if(currCtx == vt_cupti_ctxList){
        vt_cupti_ctxList = vt_cupti_ctxList->next;
      }else{
        lastCtx->next = currCtx->next;
      }
      VT_CUPTI_UNLOCK();
      return currCtx;
    }
    lastCtx = currCtx;
    currCtx = currCtx->next;
  }
  VT_CUPTI_UNLOCK();

  vt_cntl_msg(2, "[CUPTI] Could not remove context (CUDA context not found)!");
  return NULL;
}
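/*
 * A minimal usage sketch (hypothetical, for illustration only): a handler for
 * CUDA context destruction would remove the context from the global list
 * first and finalize it afterwards, outside of the CUPTI lock.
 * vt_cupti_handleCtxDestroy() is an assumed name, not part of this interface.
 *
 *   static void vt_cupti_handleCtxDestroy(CUcontext cuCtx)
 *   {
 *     vt_cupti_ctx_t *vtCtx = vt_cupti_removeCtx(&cuCtx);
 *
 *     // the context is no longer reachable via vt_cupti_ctxList, so it can
 *     // be finalized without holding the CUPTI lock
 *     if(vtCtx != NULL)
 *       vt_cupti_finalizeCtx(vtCtx);
 *   }
 */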
/*
 * Finalize the CUPTI common interface.
 * - free the VampirTrace CUPTI context list
 */
void vt_cupti_finalize()
{
  if(!vt_cupti_finalized && vt_cupti_initialized){ /* fast check without lock */
    VT_CUPTI_LOCK();
    if(!vt_cupti_finalized && vt_cupti_initialized){
      vt_cntl_msg(2, "[CUPTI] Finalizing ... ");
      
      /* free VampirTrace CUPTI context structures */
      while(vt_cupti_ctxList != NULL){
        vt_cupti_ctx_t *tmp =  vt_cupti_ctxList;

        vt_cupti_ctxList = vt_cupti_ctxList->next;

        vt_cupti_finalizeCtx(tmp);
        tmp = NULL;
      }
      
      vt_cupti_finalized = 1;
      VT_CUPTI_UNLOCK();

#if (defined(VT_MT) || defined (VT_HYB))
      VTTHRD_LOCK_ENV();
      VTThrd_deleteMutex(&VTThrdMutexCupti);
      VTTHRD_UNLOCK_ENV();
#endif /* VT_MT || VT_HYB */
    }else{
      /* another thread finalized the interface while we were waiting for the lock */
      VT_CUPTI_UNLOCK();
    }
  }
}
/*
 * Prepend the given VampirTrace CUPTI context to the global context list.
 * 
 * @param vtCtx pointer to the VampirTrace CUPTI context to be prepended
 */
void vt_cupti_prependCtx(vt_cupti_ctx_t *vtCtx)
{
  VT_CUPTI_LOCK();
  vtCtx->next = vt_cupti_ctxList;
  vt_cupti_ctxList = vtCtx;
  VT_CUPTI_UNLOCK();
}
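/*
 * Usage note: callers typically combine lookup and creation of a context, as
 * e.g. in vt_cuptiact_writeMalloc() below. A minimal sketch of this pattern,
 * where ctxID is the CUDA context identifier known to the caller:
 *
 *   vt_cupti_ctx_t *vtCtx = vt_cupti_getCtx(cuCtx);
 *
 *   if(vtCtx == NULL){
 *     vtCtx = vt_cupti_createCtx(cuCtx, VT_CUPTI_NO_CUDA_DEVICE,
 *                                ctxID, VT_CUPTI_NO_DEVICE_ID);
 *     vt_cupti_prependCtx(vtCtx);
 *   }
 *
 * Note that this check-then-prepend sequence is not atomic, as the lock is
 * released between the lookup and the prepend.
 */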
/*
 * Get a VampirTrace CUPTI context by CUDA context.
 *
 * @param cuCtx the CUDA context
 *
 * @return the VampirTrace CUPTI context, or NULL if it was not found
 */
vt_cupti_ctx_t* vt_cupti_getCtx(CUcontext cuCtx)
{
  vt_cupti_ctx_t* vtCtx = NULL;
  
  /* lookup context */
  VT_CUPTI_LOCK();
  vtCtx = vt_cupti_ctxList;
  while(vtCtx != NULL){
    if(vtCtx->cuCtx == cuCtx){
      
      /* workaround to set the correct device number (currently disabled):
      if(vtCtx->devID == VT_CUPTI_NO_DEVICE_ID){
        CUdevice cuDev;
        
        if(CUDA_SUCCESS != cuCtxGetDevice(&cuDev)){
          vt_warning("[CUPTI] Could not get CUdevice from context");
        }
        
        vtCtx->devID = (uint32_t)cuDev;
        vtCtx->cuDev = cuDev;
      }*/
      
      VT_CUPTI_UNLOCK();
      return vtCtx;
    }
    vtCtx = vtCtx->next;
  }
  VT_CUPTI_UNLOCK();
  
  return NULL;
}
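/*
 * Mark a CUDA stream as destroyed, so that it can be reused afterwards.
 *
 * @param cuCtx CUDA context the stream is associated with
 * @param strmID ID of the CUDA stream to be marked as destroyed
 */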
void vt_cuptiact_markStreamAsDestroyed(CUcontext cuCtx, uint32_t strmID)
{
  vt_cupti_ctx_t *vtCtx = NULL;
  vt_cuptiact_strm_t *currStrm = NULL;

  VT_CUPTI_LOCK();
  
  if(cuCtx == NULL){
    vt_warning("[CUPTI Activity] No CUDA context given in "
               "vt_cuptiact_markStreamAsDestroyed()!");
    VT_CUPTI_UNLOCK();
    return;
  }
  
  vtCtx = vt_cupti_getCtxNoLock(cuCtx);
  
  if(vtCtx == NULL){
    vt_warning("[CUPTI Activity] No context found in "
               "vt_cuptiact_markStreamAsDestroyed()!");
    VT_CUPTI_UNLOCK();
    return;
  }
  
  currStrm = vtCtx->activity->strmList;
  while(currStrm != NULL){
    if(currStrm->strmID == strmID){
      currStrm->destroyed = 1;
      VT_CUPTI_UNLOCK();
      return;
    }
    currStrm = currStrm->next;
  }
  
  VT_CUPTI_UNLOCK();
}
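/*
 * Finalize the CUPTI activity interface.
 * - flush the buffered activities and free the activity context of every
 *   VampirTrace CUPTI context in the global list
 */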
void vt_cupti_activity_finalize()
{
  if(!vt_cuptiact_finalized && vt_cuptiact_initialized){ /* fast check without lock */
    VT_CUPTI_LOCK();
    if(!vt_cuptiact_finalized && vt_cuptiact_initialized){
      vt_cupti_ctx_t *vtCtx = vt_cupti_ctxList;
      
      vt_cntl_msg(2, "[CUPTI Activity] Finalizing ... ");

      while(vtCtx != NULL){
        
        /* write buffered activities that have not been flushed yet */
        vt_cuptiact_flushCtxActivities(vtCtx);
        
        /* free the context */
        vt_cuptiact_destroyContext(vtCtx->activity);
        vtCtx->activity = NULL;
        
        /* set pointer to next context */
        vtCtx = vtCtx->next;
      }
      
      vt_cuptiact_finalized = 1;
      VT_CUPTI_UNLOCK();
    }else{
      /* another thread finalized the interface while we were waiting for the lock */
      VT_CUPTI_UNLOCK();
    }
  }
}
/*
 * Initialize the CUPTI events data of the given VampirTrace CUPTI context.
 * 
 * @param vtcuptiCtx pointer to the VampirTrace CUPTI context
 */
void vt_cupti_events_initContext(vt_cupti_ctx_t *vtcuptiCtx)
{
  vt_cupti_events_t *vtcuptiEvtCtx = NULL;
  
  vt_cntl_msg(2, "[CUPTI Events] Initializing VampirTrace CUPTI events context");

  /* get a pointer to eventIDArray */
  {
    CUresult cuErr = CUDA_SUCCESS;
    int dev_major, dev_minor;
    vt_cupti_device_t *cuptiDev;

    /* TODO: do not trace this driver API function call */
    cuErr = cuDeviceComputeCapability(&dev_major, &dev_minor, vtcuptiCtx->cuDev);
    VT_CUDRV_CALL(cuErr, "cuDeviceComputeCapability");

    /* check if device capability already listed */
    VT_CUPTI_LOCK();
      cuptiDev = vtcuptievtCapList;
    VT_CUPTI_UNLOCK();
    
    cuptiDev = vt_cupti_checkMetricList(cuptiDev, dev_major, dev_minor);
    if(cuptiDev){
      /* allocate the VampirTrace CUPTI events context */
      vtcuptiEvtCtx = (vt_cupti_events_t *)malloc(sizeof(vt_cupti_events_t));
      if(vtcuptiEvtCtx == NULL)
        vt_error_msg("[CUPTI Events] malloc(sizeof(vt_cupti_events_t)) failed!");
      
      vtcuptiEvtCtx->vtDevCap = cuptiDev;
      vtcuptiEvtCtx->vtGrpList = NULL;
      vtcuptiEvtCtx->counterData = NULL;
      vtcuptiEvtCtx->cuptiEvtIDs = NULL;
      
      vtcuptiCtx->events = vtcuptiEvtCtx;
    }else{
      return;
    }
  }

  /* create and add the VampirTrace CUPTI groups to the context */
  vt_cupti_addEvtGrpsToCtx(vtcuptiCtx);

  /* allocate memory for CUPTI counter reads */
  {
    size_t allocSize = vtcuptiEvtCtx->vtGrpList->evtNum;
    
    vtcuptiEvtCtx->counterData = 
            (uint64_t *)malloc(allocSize*sizeof(uint64_t));
    vtcuptiEvtCtx->cuptiEvtIDs = 
            (CUpti_EventID *)malloc(allocSize*sizeof(CUpti_EventID));

    if(vtcuptiEvtCtx->counterData == NULL || vtcuptiEvtCtx->cuptiEvtIDs == NULL)
      vt_error_msg("[CUPTI Events] malloc of counter read buffers failed!");
  }
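
  /*
   * A minimal sketch of how these two buffers are used when reading counters
   * (assuming a single event group; vtGrp and its evtGrp handle are
   * hypothetical names for an entry of vtGrpList):
   *
   *   size_t valueSize = vtGrp->evtNum * sizeof(uint64_t);
   *   size_t idSize    = vtGrp->evtNum * sizeof(CUpti_EventID);
   *   size_t numRead   = 0;
   *
   *   VT_CUPTI_CALL(cuptiEventGroupReadAllEvents(vtGrp->evtGrp,
   *                     CUPTI_EVENT_READ_FLAG_NONE,
   *                     &valueSize, vtcuptiEvtCtx->counterData,
   *                     &idSize, vtcuptiEvtCtx->cuptiEvtIDs, &numRead),
   *                 "cuptiEventGroupReadAllEvents");
   */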
  
  vt_cuptievt_start(vtcuptiEvtCtx);
}
/*
 * Finalizes the VampirTrace CUPTI events interface.
 */
void vt_cupti_events_finalize()
{
  if(!vt_cuptievt_finalized && vt_cuptievt_initialized){ /* fast check without lock */
    VT_CUPTI_LOCK();
    if(!vt_cuptievt_finalized && vt_cuptievt_initialized){
      vt_cupti_ctx_t *vtcuptiCtxList =  vt_cupti_ctxList;
      
      /* needed because of the atexit in vt_cupti_events_init() */
      VT_SUSPEND_MALLOC_TRACING(VT_CURRENT_THREAD);

      vt_cntl_msg(2, "[CUPTI Events] Finalizing ...");

      /* free VampirTrace CUPTI events context structures */
      while(vtcuptiCtxList != NULL){
        if(vtcuptiCtxList->events != NULL){
          vt_cupti_events_finalizeContext(vtcuptiCtxList);
          free(vtcuptiCtxList->events);
          vtcuptiCtxList->events = NULL;
        }

        vtcuptiCtxList = vtcuptiCtxList->next;
      }

      /* free capability metric list */
      while(vtcuptievtCapList != NULL){
        vt_cupti_device_t *tmp = vtcuptievtCapList;
        vtcuptievtCapList = vtcuptievtCapList->next;
        
        /* free VampirTrace CUPTI events */
        while(tmp->vtcuptiEvtList != NULL){
          vt_cupti_evtctr_t *tmpEvt = tmp->vtcuptiEvtList;
          tmp->vtcuptiEvtList = tmp->vtcuptiEvtList->next;
          free(tmpEvt);
          tmpEvt = NULL;
        }

        free(tmp);
        tmp = NULL;
      }
      
      VT_RESUME_MALLOC_TRACING(VT_CURRENT_THREAD);

      vt_cuptievt_finalized = 1;
      VT_CUPTI_UNLOCK();
    }else{
      /* another thread finalized the interface while we were waiting for the lock */
      VT_CUPTI_UNLOCK();
    }
  }
}
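/*
 * Initialize the CUPTI common interface.
 * - create the CUPTI mutex (in multi-threaded builds)
 * - register vt_cupti_finalize() to be called at program exit
 */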
void vt_cupti_init()
{
  if(!vt_cupti_initialized){ /* fast check without lock */
#if (defined(VT_MT) || defined(VT_HYB))
    VTThrd_createMutex(&VTThrdMutexCupti);
#endif
    VT_CUPTI_LOCK();
    if(!vt_cupti_initialized){
      vt_cntl_msg(2, "[CUPTI] Initializing ... ");
      
      /* register the finalize function of VampirTrace CUPTI to be called before
       * the program exits */
      atexit(vt_cupti_finalize);
      
      vt_cupti_initialized = 1;
      VT_CUPTI_UNLOCK();
    }else{
      /* another thread initialized the interface while we were waiting for the lock */
      VT_CUPTI_UNLOCK();
    }
  }
}
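/*
 * Enable tracing of concurrent kernels: disable the normal (lower overhead)
 * kernel tracing, flush the activity buffer of the given VampirTrace CUPTI
 * context and enable the concurrent kernel activity kind globally.
 *
 * @param vtCtx pointer to the VampirTrace CUPTI context
 */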
void vt_cuptiact_enableConcurrentKernel(vt_cupti_ctx_t* vtCtx)
{
    /*
     * Disabling the collection of kernels for a single CUDA context does not
     * work yet. The context-specific variant would look like this:
     *
     *   VT_CUPTI_CALL(cuptiActivityDisableContext(cuCtx, CUPTI_ACTIVITY_KIND_KERNEL),
     *                 "cuptiActivityDisableContext");
     *
     *   // flush the already buffered activities for this CUDA context
     *   vt_cuptiact_flushCtxActivities(cuCtx);
     *
     *   // enable collection of concurrent kernels for the given CUDA context
     *   VT_CUPTI_CALL(cuptiActivityEnableContext(cuCtx, CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL),
     *                 "cuptiActivityEnableContext");
     */
  
  /* enable concurrent kernel tracing only once */
  if((vt_gpu_config & VT_GPU_TRACE_CONCURRENT_KERNEL) 
         != VT_GPU_TRACE_CONCURRENT_KERNEL){

    vt_cntl_msg(2, "[CUPTI Activity] Enable concurrent kernel tracing.");
    
    /*
     * Disable normal (lower overhead) kernel tracing.
     */
    VT_CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_KERNEL),
                  "cuptiActivityDisable");
    
    /* 
     * Flush the already buffered activities for this CUDA context.
     */
    VT_CUPTI_LOCK();
    vt_cuptiact_flushCtxActivities(vtCtx);
    VT_CUPTI_UNLOCK();

    /*
     * Enable concurrent kernel tracing (higher overhead).
     */
    VT_CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL), 
                  "cuptiActivityEnable");
    
    vt_gpu_config |= VT_GPU_TRACE_CONCURRENT_KERNEL;
  }
}
/*
 * Initialize VampirTrace IDs and register the finalize function.
 * This may be done implicitly by vt_cupti_count().
 */
void vt_cupti_events_init()
{
  if(!vt_cuptievt_initialized){ /* fast check without lock */
    vt_cupti_init();
    VT_CUPTI_LOCK();
    if(!vt_cuptievt_initialized){
      vt_cntl_msg(2, "[CUPTI Events] Initializing ... ");

      /* create VampirTrace counter group ID only once */
  #if (defined(VT_MT) || defined(VT_HYB))
      VTTHRD_LOCK_IDS();
  #endif
      vt_cuptievt_rid_init = vt_def_region(VT_MASTER_THREAD, "vtcuptiHostThreadInit",
                      VT_NO_ID, VT_NO_LNO, VT_NO_LNO, "VT_CUPTI", VT_FUNCTION);

      vt_cuptievt_cgid = vt_def_counter_group(VT_MASTER_THREAD, "CUPTI");
  #if (defined(VT_MT) || defined(VT_HYB))
      VTTHRD_UNLOCK_IDS();
  #endif

      vt_cupti_events_sampling = (uint8_t)vt_env_cupti_sampling();

      vtcuptievtCapList = vt_cuptievt_setupMetricList();

      if(NULL == vtcuptievtCapList){
        vt_cupti_events_enabled = 0;
      }else{
        /* register the finalize function of VampirTrace CUPTI to be called before
         * the program exits */
        atexit(vt_cupti_events_finalize);
      }

      vt_cuptievt_initialized = 1;
      VT_CUPTI_UNLOCK();
    }else{
      /* another thread initialized the interface while we were waiting for the lock */
      VT_CUPTI_UNLOCK();
    }
  }
}
/*
 * Increases the "Allocated CUDA memory" counter.
 *
 * @param ctxID CUDA context identifier (@see CUPTI callback info)
 * @param cuCtx the CUDA context
 * @param devPtr pointer to the allocated memory (needed for vt_cuptiact_writeFree())
 * @param size the number of bytes allocated
 */
void vt_cuptiact_writeMalloc(uint32_t ctxID, CUcontext cuCtx, 
                             void *devPtr, size_t size)
{
  uint64_t vtTime;
  vt_cupti_ctx_t* vtCtx = NULL;
  vt_cupti_activity_t *vtcuptiActivity = NULL;
  vt_cupti_gpumem_t *vtMalloc = NULL;
  
  if(devPtr == NULL) return;
  
  VT_SUSPEND_MALLOC_TRACING(VT_CURRENT_THREAD);
  
  vtMalloc = (vt_cupti_gpumem_t*)malloc(sizeof(vt_cupti_gpumem_t));
  if(vtMalloc == NULL)
    vt_error_msg("[CUPTI Activity] malloc(sizeof(vt_cupti_gpumem_t)) failed!");
  
  vtMalloc->memPtr = devPtr;
  vtMalloc->size = size;
  
  /* check for VampirTrace CUPTI context */
  vtCtx = vt_cupti_getCtx(cuCtx);
  if(vtCtx == NULL){
    vtCtx = vt_cupti_createCtx(cuCtx, VT_CUPTI_NO_CUDA_DEVICE, ctxID, VT_CUPTI_NO_DEVICE_ID);
    vt_cupti_prependCtx(vtCtx);
  }
  
  /* check for VampirTrace CUPTI activity context */
  if(vtCtx->activity == NULL){
    vtCtx->activity = vt_cuptiact_createCtxActivity(cuCtx);
  }
  vtcuptiActivity = vtCtx->activity;

  /* lock the work on the context */
  VT_CUPTI_LOCK();
  
  /* flush activity buffer */
  vt_cuptiact_flushCtxActivities(vtCtx);
  
  /* add malloc entry to list */
  vtMalloc->next = vtcuptiActivity->gpuMemList;
  vtcuptiActivity->gpuMemList = vtMalloc;
  
  /* increase allocated memory counter */
  vtcuptiActivity->gpuMemAllocated += size;

  /* check if first CUDA stream is available */
  if(vtcuptiActivity->strmList == NULL){
    if(vt_gpu_init_time < vt_start_time)
      vt_gpu_init_time = vt_start_time;
        
    vtcuptiActivity->strmList = vt_cuptiact_createStream(vtCtx, vtcuptiActivity->defaultStrmID);
    vt_count(vtcuptiActivity->strmList->vtThrdID, &vt_gpu_init_time, vt_gpu_cid_memusage, 0);
  }
  
  VT_CUPTI_UNLOCK();
  
  VT_RESUME_MALLOC_TRACING(VT_CURRENT_THREAD);
  
  /* write counter value */
  vtTime = vt_pform_wtime();
  vt_count(vtcuptiActivity->strmList->vtThrdID, &vtTime, vt_gpu_cid_memusage, 
           (uint64_t)(vtcuptiActivity->gpuMemAllocated));
}
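/*
 * A minimal sketch of a call site (hypothetical, for illustration): a CUPTI
 * runtime API callback for cudaMalloc could report a new allocation like
 * this, where cbInfo is the CUpti_CallbackData passed to the callback:
 *
 *   cudaMalloc_v3020_params *params =
 *           (cudaMalloc_v3020_params *)cbInfo->functionParams;
 *
 *   vt_cuptiact_writeMalloc(cbInfo->contextUid, cbInfo->context,
 *                           *(params->devPtr), params->size);
 */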
/*
 * Decreases the "Allocated CUDA memory" counter.
 *
 * @param ctxID CUDA context identifier (@see CUPTI callback info)
 * @param cuCtx the CUDA context
 * @param devPtr pointer to the memory to be freed
 */
void vt_cuptiact_writeFree(uint32_t ctxID, CUcontext cuCtx, void *devPtr)
{
  uint64_t vtTime;
  vt_cupti_ctx_t* vtCtx = NULL;
  vt_cupti_activity_t *vtcuptiActivity = NULL;
  vt_cupti_gpumem_t *curMalloc = NULL;
  vt_cupti_gpumem_t *lastMalloc = NULL;

  if(devPtr == NULL) return;
  
  VT_SUSPEND_MALLOC_TRACING(VT_CURRENT_THREAD);
  
  /* check for VampirTrace CUPTI context */
  vtCtx = vt_cupti_getCtx(cuCtx);
  if(vtCtx == NULL){
    
    vtCtx = vt_cupti_createCtx(cuCtx, VT_CUPTI_NO_CUDA_DEVICE, ctxID, VT_CUPTI_NO_DEVICE_ID);
    
    vt_cupti_prependCtx(vtCtx);
  }
  
  /* check for VampirTrace CUPTI activity context */
  if(vtCtx->activity == NULL){
    vtCtx->activity = vt_cuptiact_createCtxActivity(cuCtx);
  }
  vtcuptiActivity = vtCtx->activity;
  
  VT_CUPTI_LOCK();
  
  /* flush activity buffer */
  vt_cuptiact_flushCtxActivities(vtCtx);

  curMalloc = vtcuptiActivity->gpuMemList;
  lastMalloc = curMalloc;

  /* lookup the CUDA malloc entry by its memory pointer */
  while(curMalloc != NULL){
    if(devPtr == curMalloc->memPtr){

      /* decrease allocated counter value and write it */
      vtTime = vt_pform_wtime();
      vtcuptiActivity->gpuMemAllocated -= curMalloc->size;
      vt_count(vtcuptiActivity->strmList->vtThrdID, &vtTime, vt_gpu_cid_memusage,
               (uint64_t)(vtcuptiActivity->gpuMemAllocated));


      /* set pointer over current element to next one */
      lastMalloc->next = curMalloc->next;

      /* if current element is the first list entry, set the list entry */
      if(curMalloc == vtcuptiActivity->gpuMemList){
        vtcuptiActivity->gpuMemList = curMalloc->next;
      }

      /* free VT memory of CUDA malloc */
      curMalloc->next = NULL;
      free(curMalloc);
      curMalloc = NULL;

      /* safety net: reset the list head if no memory is allocated any more */
      if(vtcuptiActivity->gpuMemAllocated == 0) {
        vtcuptiActivity->gpuMemList = NULL;
      }
  
      VT_CUPTI_UNLOCK();
      VT_RESUME_MALLOC_TRACING(VT_CURRENT_THREAD);
      return;
    }

    lastMalloc = curMalloc;
    curMalloc = curMalloc->next;
  }

  VT_CUPTI_UNLOCK();
  
  VT_RESUME_MALLOC_TRACING(VT_CURRENT_THREAD);

  vt_warning("[CUPTI Activity] free CUDA memory, which has not been allocated!");
}