예제 #1
0
PetscErrorCode PetscLogEventBeginDefault(PetscLogEvent event, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4) 
{
  StageLog       stageLog;
  EventPerfLog   eventLog;
  int            stage;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
  ierr = StageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr);
  ierr = StageLogGetEventPerfLog(stageLog, stage, &eventLog);CHKERRQ(ierr);
  /* Check for double counting */
  eventLog->eventInfo[event].depth++;
  if (eventLog->eventInfo[event].depth > 1) PetscFunctionReturn(0);
  /* Log performance info */
  eventLog->eventInfo[event].count++;
  PetscTimeSubtract(eventLog->eventInfo[event].time);
#if defined(PETSC_HAVE_CHUD)
  eventLog->eventInfo[event].flops         -= chudGetPMCEventCount(chudCPU1Dev,PMC_1);
#elif defined(PETSC_HAVE_PAPI)
  { long_long values[2];
    ierr = PAPI_read(PAPIEventSet,values);CHKERRQ(ierr);
    eventLog->eventInfo[event].flops -= values[0];
    /*    printf("fma %g flops %g\n",(double)values[1],(double)values[0]); */
  }
#else
  eventLog->eventInfo[event].flops         -= _TotalFlops;
#endif
  eventLog->eventInfo[event].numMessages   -= irecv_ct  + isend_ct  + recv_ct  + send_ct;
  eventLog->eventInfo[event].messageLength -= irecv_len + isend_len + recv_len + send_len;
  eventLog->eventInfo[event].numReductions -= allreduce_ct;
  PetscFunctionReturn(0);
}
예제 #2
0
PetscErrorCode PetscLogEventBeginTrace(PetscLogEvent event, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
{
  PetscStageLog     stageLog;
  PetscEventRegLog  eventRegLog;
  PetscEventPerfLog eventPerfLog = NULL;
  PetscLogDouble    cur_time;
  PetscMPIInt       rank;
  int               stage,err;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  if (!petsc_tracetime) PetscTime(&petsc_tracetime);

  ierr = MPI_Comm_rank(PETSC_COMM_WORLD, &rank);CHKERRQ(ierr);
  ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
  ierr = PetscStageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr);
  ierr = PetscStageLogGetEventRegLog(stageLog, &eventRegLog);CHKERRQ(ierr);
  ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventPerfLog);CHKERRQ(ierr);
  /* Check for double counting */
  eventPerfLog->eventInfo[event].depth++;
  petsc_tracelevel++;
  if (eventPerfLog->eventInfo[event].depth > 1) PetscFunctionReturn(0);
  /* Log performance info */
  PetscTime(&cur_time);
  ierr = PetscFPrintf(PETSC_COMM_SELF,petsc_tracefile, "%s[%d] %g Event begin: %s\n", petsc_tracespace, rank, cur_time-petsc_tracetime, eventRegLog->eventInfo[event].name);CHKERRQ(ierr);
  ierr = PetscStrncpy(petsc_tracespace, petsc_traceblanks, 2*petsc_tracelevel);CHKERRQ(ierr);

  petsc_tracespace[2*petsc_tracelevel] = 0;

  err = fflush(petsc_tracefile);
  if (err) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SYS,"fflush() failed on file");
  PetscFunctionReturn(0);
}
예제 #3
0
PetscErrorCode PetscLogEventEndTrace(PetscLogEvent event,int t,PetscObject o1,PetscObject o2,PetscObject o3,PetscObject o4) 
{
  StageLog       stageLog;
  EventRegLog    eventRegLog;
  EventPerfLog   eventPerfLog;
  PetscLogDouble cur_time;
  int            stage,err;
  PetscMPIInt    rank;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  tracelevel--;
  ierr = MPI_Comm_rank(PETSC_COMM_WORLD, &rank);CHKERRQ(ierr);
  ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
  ierr = StageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr);
  ierr = StageLogGetEventRegLog(stageLog, &eventRegLog);CHKERRQ(ierr);
  ierr = StageLogGetEventPerfLog(stageLog, stage, &eventPerfLog);CHKERRQ(ierr);
  /* Check for double counting */
  eventPerfLog->eventInfo[event].depth--;
  if (eventPerfLog->eventInfo[event].depth > 0) {
    PetscFunctionReturn(0);
  } else if (eventPerfLog->eventInfo[event].depth < 0 || tracelevel < 0) {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Logging event had unbalanced begin/end pairs");
  }
  /* Log performance info */
  ierr = PetscStrncpy(tracespace, traceblanks, 2*tracelevel);CHKERRQ(ierr);
  tracespace[2*tracelevel] = 0;
  PetscTime(cur_time);
  ierr = PetscFPrintf(PETSC_COMM_SELF,tracefile, "%s[%d] %g Event end: %s\n", tracespace, rank, cur_time-tracetime, eventRegLog->eventInfo[event].name);CHKERRQ(ierr);
  err = fflush(tracefile);
  if (err) SETERRQ(PETSC_ERR_SYS,"fflush() failed on file");        
  PetscFunctionReturn(0);
}
예제 #4
0
PetscErrorCode PetscLogEventEndComplete(PetscLogEvent event, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
{
  PetscStageLog     stageLog;
  PetscEventRegLog  eventRegLog;
  PetscEventPerfLog eventPerfLog = NULL;
  Action            *tmpAction;
  PetscLogDouble    start, end;
  PetscLogDouble    curTime;
  int               stage;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  /* Dynamically enlarge logging structures */
  if (petsc_numActions >= petsc_maxActions) {
    PetscTime(&start);
    ierr = PetscMalloc(petsc_maxActions*2 * sizeof(Action), &tmpAction);CHKERRQ(ierr);
    ierr = PetscMemcpy(tmpAction, petsc_actions, petsc_maxActions * sizeof(Action));CHKERRQ(ierr);
    ierr = PetscFree(petsc_actions);CHKERRQ(ierr);

    petsc_actions     = tmpAction;
    petsc_maxActions *= 2;
    PetscTime(&end);
    petsc_BaseTime += (end - start);
  }
  /* Record the event */
  ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
  ierr = PetscStageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr);
  ierr = PetscStageLogGetEventRegLog(stageLog, &eventRegLog);CHKERRQ(ierr);
  ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventPerfLog);CHKERRQ(ierr);
  PetscTime(&curTime);
  if (petsc_logActions) {
    petsc_actions[petsc_numActions].time    = curTime - petsc_BaseTime;
    petsc_actions[petsc_numActions].action  = ACTIONEND;
    petsc_actions[petsc_numActions].event   = event;
    petsc_actions[petsc_numActions].classid = eventRegLog->eventInfo[event].classid;
    if (o1) petsc_actions[petsc_numActions].id1 = o1->id;
    else petsc_actions[petsc_numActions].id1 = -1;
    if (o2) petsc_actions[petsc_numActions].id2 = o2->id;
    else petsc_actions[petsc_numActions].id2 = -1;
    if (o3) petsc_actions[petsc_numActions].id3 = o3->id;
    else petsc_actions[petsc_numActions].id3 = -1;
    petsc_actions[petsc_numActions].flops = petsc_TotalFlops;

    ierr = PetscMallocGetCurrentUsage(&petsc_actions[petsc_numActions].mem);CHKERRQ(ierr);
    ierr = PetscMallocGetMaximumUsage(&petsc_actions[petsc_numActions].maxmem);CHKERRQ(ierr);
    petsc_numActions++;
  }
  /* Check for double counting */
  eventPerfLog->eventInfo[event].depth--;
  if (eventPerfLog->eventInfo[event].depth > 0) PetscFunctionReturn(0);
  else if (eventPerfLog->eventInfo[event].depth < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Logging event had unbalanced begin/end pairs");
  /* Log the performance info */
  eventPerfLog->eventInfo[event].count++;
  eventPerfLog->eventInfo[event].time          += curTime;
  eventPerfLog->eventInfo[event].flops         += petsc_TotalFlops;
  eventPerfLog->eventInfo[event].numMessages   += petsc_irecv_ct  + petsc_isend_ct  + petsc_recv_ct  + petsc_send_ct;
  eventPerfLog->eventInfo[event].messageLength += petsc_irecv_len + petsc_isend_len + petsc_recv_len + petsc_send_len;
  eventPerfLog->eventInfo[event].numReductions += petsc_allreduce_ct + petsc_gather_ct + petsc_scatter_ct;
  PetscFunctionReturn(0);
}
예제 #5
0
PetscErrorCode PetscLogEventBeginDefault(PetscLogEvent event,int t,PetscObject o1,PetscObject o2,PetscObject o3,PetscObject o4)
{
  PetscStageLog     stageLog;
  PetscEventPerfLog eventLog = NULL;
  int               stage;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
  ierr = PetscStageLogGetCurrent(stageLog,&stage);CHKERRQ(ierr);
  ierr = PetscStageLogGetEventPerfLog(stageLog,stage,&eventLog);CHKERRQ(ierr);
  /* Check for double counting */
  eventLog->eventInfo[event].depth++;
  if (eventLog->eventInfo[event].depth > 1) PetscFunctionReturn(0);
  /* Log performance info */
  eventLog->eventInfo[event].count++;
  eventLog->eventInfo[event].timeTmp = 0.0;
  PetscTimeSubtract(&eventLog->eventInfo[event].timeTmp);
  eventLog->eventInfo[event].flopsTmp       = 0.0;
  eventLog->eventInfo[event].flopsTmp      -= petsc_TotalFlops;
  eventLog->eventInfo[event].numMessages   -= petsc_irecv_ct  + petsc_isend_ct  + petsc_recv_ct  + petsc_send_ct;
  eventLog->eventInfo[event].messageLength -= petsc_irecv_len + petsc_isend_len + petsc_recv_len + petsc_send_len;
  eventLog->eventInfo[event].numReductions -= petsc_allreduce_ct + petsc_gather_ct + petsc_scatter_ct;
  PetscFunctionReturn(0);
}
예제 #6
0
PetscErrorCode PetscLogEventEndDefault(PetscLogEvent event,int t,PetscObject o1,PetscObject o2,PetscObject o3,PetscObject o4)
{
  PetscStageLog     stageLog;
  PetscEventPerfLog eventLog = NULL;
  int               stage;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
  ierr = PetscStageLogGetCurrent(stageLog,&stage);CHKERRQ(ierr);
  ierr = PetscStageLogGetEventPerfLog(stageLog,stage,&eventLog);CHKERRQ(ierr);
  /* Check for double counting */
  eventLog->eventInfo[event].depth--;
  if (eventLog->eventInfo[event].depth > 0) PetscFunctionReturn(0);
  else if (eventLog->eventInfo[event].depth < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Logging event had unbalanced begin/end pairs");
  /* Log performance info */
  PetscTimeAdd(&eventLog->eventInfo[event].timeTmp);
  eventLog->eventInfo[event].time          += eventLog->eventInfo[event].timeTmp;
  eventLog->eventInfo[event].time2         += eventLog->eventInfo[event].timeTmp*eventLog->eventInfo[event].timeTmp;
  eventLog->eventInfo[event].flopsTmp      += petsc_TotalFlops;
  eventLog->eventInfo[event].flops         += eventLog->eventInfo[event].flopsTmp;
  eventLog->eventInfo[event].flops2        += eventLog->eventInfo[event].flopsTmp*eventLog->eventInfo[event].flopsTmp;
  eventLog->eventInfo[event].numMessages   += petsc_irecv_ct  + petsc_isend_ct  + petsc_recv_ct  + petsc_send_ct;
  eventLog->eventInfo[event].messageLength += petsc_irecv_len + petsc_isend_len + petsc_recv_len + petsc_send_len;
  eventLog->eventInfo[event].numReductions += petsc_allreduce_ct + petsc_gather_ct + petsc_scatter_ct;
  PetscFunctionReturn(0);
}
예제 #7
0
PetscErrorCode PetscLogEventEndComplete(PetscLogEvent event, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4) 
{
  StageLog       stageLog;
  EventRegLog    eventRegLog;
  EventPerfLog   eventPerfLog;
  Action        *tmpAction;
  PetscLogDouble start, end;
  PetscLogDouble curTime;
  int            stage;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Dynamically enlarge logging structures */
  if (numActions >= maxActions) {
    PetscTime(start);
    ierr = PetscMalloc(maxActions*2 * sizeof(Action), &tmpAction);CHKERRQ(ierr);
    ierr = PetscMemcpy(tmpAction, actions, maxActions * sizeof(Action));CHKERRQ(ierr);
    ierr = PetscFree(actions);CHKERRQ(ierr);
    actions     = tmpAction;
    maxActions *= 2;
    PetscTime(end);
    BaseTime += (end - start);
  }
  /* Record the event */
  ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
  ierr = StageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr);
  ierr = StageLogGetEventRegLog(stageLog, &eventRegLog);CHKERRQ(ierr);
  ierr = StageLogGetEventPerfLog(stageLog, stage, &eventPerfLog);CHKERRQ(ierr);
  PetscTime(curTime);
  if (logActions) {
    actions[numActions].time   = curTime - BaseTime;
    actions[numActions].action = ACTIONEND;
    actions[numActions].event  = event;
    actions[numActions].cookie = eventRegLog->eventInfo[event].cookie;
    if (o1) actions[numActions].id1 = o1->id; else actions[numActions].id1 = -1;
    if (o2) actions[numActions].id2 = o2->id; else actions[numActions].id2 = -1;
    if (o3) actions[numActions].id3 = o3->id; else actions[numActions].id3 = -1;
    actions[numActions].flops    = _TotalFlops;
    ierr = PetscMallocGetCurrentUsage(&actions[numActions].mem);CHKERRQ(ierr);
    ierr = PetscMallocGetMaximumUsage(&actions[numActions].maxmem);CHKERRQ(ierr);
    numActions++;
  }
  /* Check for double counting */
  eventPerfLog->eventInfo[event].depth--;
  if (eventPerfLog->eventInfo[event].depth > 0) {
    PetscFunctionReturn(0);
  } else if (eventPerfLog->eventInfo[event].depth < 0) {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Logging event had unbalanced begin/end pairs");
  }
  /* Log the performance info */
  eventPerfLog->eventInfo[event].count++;
  eventPerfLog->eventInfo[event].time          += curTime;
  eventPerfLog->eventInfo[event].flops         += _TotalFlops;
  eventPerfLog->eventInfo[event].numMessages   += irecv_ct  + isend_ct  + recv_ct  + send_ct;
  eventPerfLog->eventInfo[event].messageLength += irecv_len + isend_len + recv_len + send_len;
  eventPerfLog->eventInfo[event].numReductions += allreduce_ct;
  PetscFunctionReturn(0);
}
예제 #8
0
PetscErrorCode PetscLogEventGetFlops(PetscLogEvent event, PetscLogDouble *flops)
{
  PetscStageLog     stageLog;
  PetscEventPerfLog eventLog = NULL;
  int               stage;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr   = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
  ierr   = PetscStageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr);
  ierr   = PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);CHKERRQ(ierr);
  *flops = eventLog->eventInfo[event].flops;
  PetscFunctionReturn(0);
}
예제 #9
0
파일: ex9.c 프로젝트: ZJLi2013/petsc
PetscErrorCode TestVecClosure(DM dm, AppCtx *user)
{
  PetscSection       s;
  Vec                v;
  PetscInt           numRuns, cStart, cEnd, c, i;
  PetscScalar        tmpArray[64];
  PetscScalar       *userArray     = user->reuseArray ? tmpArray : NULL;
  PetscReal          maxTimePerRun = user->maxVecClosureTime;
  PetscStageLog      stageLog;
  PetscEventPerfLog  eventLog;
  PetscInt           stage;
  PetscLogEvent      event;
  PetscEventPerfInfo eventInfo;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscLogStageRegister("DMPlex Vector Closure Test", &stage);CHKERRQ(ierr);
  ierr = PetscLogEventRegister("VecClosure", PETSC_OBJECT_CLASSID, &event);CHKERRQ(ierr);
  ierr = PetscLogStagePush(stage);CHKERRQ(ierr);
  ierr = DMPlexCreateSection(dm, user->dim, user->numFields, user->numComponents, user->numDof, 0, NULL, NULL, &s);CHKERRQ(ierr);
  ierr = DMSetDefaultSection(dm, s);CHKERRQ(ierr);
  ierr = PetscSectionDestroy(&s);CHKERRQ(ierr);
  ierr = DMPlexGetHeightStratum(dm, 0, &cStart, &cEnd);CHKERRQ(ierr);
  ierr = DMGetLocalVector(dm, &v);CHKERRQ(ierr);
  ierr = PetscLogEventBegin(event,0,0,0,0);CHKERRQ(ierr);
  for (i = 0; i < user->iterations; ++i) {
    for (c = cStart; c < cEnd; ++c) {
      PetscScalar *closure     = userArray;
      PetscInt     closureSize = 64;;

      ierr = DMPlexVecGetClosure(dm, s, v, c, &closureSize, &closure);CHKERRQ(ierr);
      if (!user->reuseArray) {ierr = DMPlexVecRestoreClosure(dm, s, v, c, &closureSize, &closure);CHKERRQ(ierr);}
    }
  }
  ierr = PetscLogEventEnd(event,0,0,0,0);CHKERRQ(ierr);
  ierr = DMRestoreLocalVector(dm, &v);CHKERRQ(ierr);
  ierr = PetscLogStagePop();CHKERRQ(ierr);

  ierr = PetscLogGetStageLog(&stageLog);
  ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);
  numRuns   = (cEnd-cStart) * user->iterations;
  eventInfo = eventLog->eventInfo[event];
  if (eventInfo.count != 1) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of event calls %d should be %d", eventInfo.count, 1);
  if ((PetscInt) eventInfo.flops != 0) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of event flops %d should be %d", (PetscInt) eventInfo.flops, 0);
  if (eventInfo.time > maxTimePerRun * numRuns) {
    ierr = PetscPrintf(PETSC_COMM_SELF, "VecClosures: %d Average time per cone: %gs standard: %gs\n", numRuns, eventInfo.time/numRuns, maxTimePerRun);
    if (user->errors) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Average time for vector closure %g > standard %g", eventInfo.time/numRuns, maxTimePerRun);
  }
  PetscFunctionReturn(0);
}
예제 #10
0
PetscErrorCode PetscLogEventZeroFlops(PetscLogEvent event)
{
  StageLog       stageLog;
  EventPerfLog   eventLog;
  int            stage;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
  ierr = StageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr);
  ierr = StageLogGetEventPerfLog(stageLog, stage, &eventLog);CHKERRQ(ierr);
  eventLog->eventInfo[event].flops = 0.0;
  PetscFunctionReturn(0);
}
예제 #11
0
PetscErrorCode PetscLogEventGetFlops(PetscLogEvent event,PetscLogDouble *flops)
{
  PetscStageLog     stageLog;
  PetscEventPerfLog eventLog = NULL;
  int               stage;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  if (!PetscLogPLB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Must use -log_summary or PetscLogDefaultBegin() before calling this routine");
  ierr   = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
  ierr   = PetscStageLogGetCurrent(stageLog,&stage);CHKERRQ(ierr);
  ierr   = PetscStageLogGetEventPerfLog(stageLog,stage,&eventLog);CHKERRQ(ierr);
  *flops = eventLog->eventInfo[event].flops;
  PetscFunctionReturn(0);
}
예제 #12
0
/*@C
  PetscLogEventGetPerfInfo - Return the performance information about the given event in the given stage

  Input Parameters:
+ stage - The stage number or PETSC_DETERMINE for the current stage
- event - The event number

  Output Parameters:
. info - This structure is filled with the performance information

  Level: Intermediate

.seealso: PetscLogEventGetFlops()
@*/
PetscErrorCode PetscLogEventGetPerfInfo(int stage,PetscLogEvent event,PetscEventPerfInfo *info)
{
  PetscStageLog     stageLog;
  PetscEventPerfLog eventLog = NULL;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  PetscValidPointer(info,3);
  if (!PetscLogPLB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Must use -log_summary or PetscLogDefaultBegin() before calling this routine");
  ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
  if (stage < 0) {ierr = PetscStageLogGetCurrent(stageLog,&stage);CHKERRQ(ierr);}
  ierr = PetscStageLogGetEventPerfLog(stageLog,stage,&eventLog);CHKERRQ(ierr);
  *info = eventLog->eventInfo[event];
  PetscFunctionReturn(0);
}
예제 #13
0
파일: ex9.c 프로젝트: ZJLi2013/petsc
PetscErrorCode TestTransitiveClosure(DM dm, AppCtx *user)
{
  PetscInt           numRuns, cStart, cEnd, c, i;
  PetscReal          maxTimePerRun = user->maxClosureTime;
  PetscStageLog      stageLog;
  PetscEventPerfLog  eventLog;
  PetscInt           stage;
  PetscLogEvent      event;
  PetscEventPerfInfo eventInfo;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscLogStageRegister("DMPlex Transitive Closure Test", &stage);CHKERRQ(ierr);
  ierr = PetscLogEventRegister("TransitiveClosure", PETSC_OBJECT_CLASSID, &event);CHKERRQ(ierr);
  ierr = PetscLogStagePush(stage);CHKERRQ(ierr);
  ierr = DMPlexGetHeightStratum(dm, 0, &cStart, &cEnd);CHKERRQ(ierr);
  ierr = PetscLogEventBegin(event,0,0,0,0);CHKERRQ(ierr);
  for (i = 0; i < user->iterations; ++i) {
    for (c = cStart; c < cEnd; ++c) {
      PetscInt *closure = NULL;
      PetscInt  closureSize;

      ierr = DMPlexGetTransitiveClosure(dm, c, PETSC_TRUE, &closureSize, &closure);CHKERRQ(ierr);
      ierr = DMPlexRestoreTransitiveClosure(dm, c, PETSC_TRUE, &closureSize, &closure);CHKERRQ(ierr);
    }
  }
  ierr = PetscLogEventEnd(event,0,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogStagePop();CHKERRQ(ierr);

  ierr = PetscLogGetStageLog(&stageLog);
  ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);
  numRuns   = (cEnd-cStart) * user->iterations;
  eventInfo = eventLog->eventInfo[event];
  if (eventInfo.count != 1) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of event calls %d should be %d", eventInfo.count, 1);
  if ((PetscInt) eventInfo.flops != 0) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of event flops %d should be %d", (PetscInt) eventInfo.flops, 0);
  if (eventInfo.time > maxTimePerRun * numRuns) {
    ierr = PetscPrintf(PETSC_COMM_SELF, "Closures: %d Average time per cone: %gs standard: %gs\n", numRuns, eventInfo.time/numRuns, maxTimePerRun);
    if (user->errors) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Average time for closure %g > standard %g", eventInfo.time/numRuns, maxTimePerRun);
  }
  PetscFunctionReturn(0);
}
예제 #14
0
PetscErrorCode PetscLogEventEndDefault(PetscLogEvent event, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
{
  PetscStageLog     stageLog;
  PetscEventPerfLog eventLog = NULL;
  int               stage;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
  ierr = PetscStageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr);
  ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);CHKERRQ(ierr);
  /* Check for double counting */
  eventLog->eventInfo[event].depth--;
  if (eventLog->eventInfo[event].depth > 0) PetscFunctionReturn(0);
  else if (eventLog->eventInfo[event].depth < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Logging event had unbalanced begin/end pairs");
  /* Log performance info */
  PetscTimeAdd(&eventLog->eventInfo[event].timeTmp);
  eventLog->eventInfo[event].time  += eventLog->eventInfo[event].timeTmp;
  eventLog->eventInfo[event].time2 += eventLog->eventInfo[event].timeTmp*eventLog->eventInfo[event].timeTmp;
#if defined(PETSC_HAVE_CHUD)
  eventLog->eventInfo[event].flopsTmp += chudGetPMCEventCount(chudCPU1Dev,PMC_1);
#elif defined(PETSC_HAVE_PAPI)
  { long_long values[2];
    ierr = PAPI_read(PAPIEventSet,values);CHKERRQ(ierr);

    eventLog->eventInfo[event].flopsTmp += values[0];
    /* printf("fma %g flops %g\n",(double)values[1],(double)values[0]); */
  }
#else
  eventLog->eventInfo[event].flopsTmp += petsc_TotalFlops;
#endif
  eventLog->eventInfo[event].flops         += eventLog->eventInfo[event].flopsTmp;
  eventLog->eventInfo[event].flops2        += eventLog->eventInfo[event].flopsTmp*eventLog->eventInfo[event].flopsTmp;
  eventLog->eventInfo[event].numMessages   += petsc_irecv_ct  + petsc_isend_ct  + petsc_recv_ct  + petsc_send_ct;
  eventLog->eventInfo[event].messageLength += petsc_irecv_len + petsc_isend_len + petsc_recv_len + petsc_send_len;
  eventLog->eventInfo[event].numReductions += petsc_allreduce_ct + petsc_gather_ct + petsc_scatter_ct;
  PetscFunctionReturn(0);
}
예제 #15
0
파일: eventlog.c 프로젝트: mchandra/petsc
PetscErrorCode PetscLogEventBeginDefault(PetscLogEvent event, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
{
    PetscStageLog     stageLog;
    PetscEventPerfLog eventLog = NULL;
    int               stage;
    PetscErrorCode    ierr;

    PetscFunctionBegin;
    ierr = PetscLogGetStageLog(&stageLog);
    CHKERRQ(ierr);
    ierr = PetscStageLogGetCurrent(stageLog, &stage);
    CHKERRQ(ierr);
    ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);
    CHKERRQ(ierr);
    /* Check for double counting */
    eventLog->eventInfo[event].depth++;
    if (eventLog->eventInfo[event].depth > 1) PetscFunctionReturn(0);
    /* Log performance info */
    eventLog->eventInfo[event].count++;
    eventLog->eventInfo[event].timeTmp = 0.0;
    PetscTimeSubtract(&eventLog->eventInfo[event].timeTmp);
    eventLog->eventInfo[event].flopsTmp = 0.0;
#if defined(PETSC_HAVE_PAPI)
    {   long_long values[2];
        ierr = PAPI_read(PAPIEventSet,values);
        CHKERRQ(ierr);

        eventLog->eventInfo[event].flopsTmp -= values[0];
        /*    printf("fma %g flops %g\n",(double)values[1],(double)values[0]); */
    }
#else
    eventLog->eventInfo[event].flopsTmp -= petsc_TotalFlops;
#endif
    eventLog->eventInfo[event].numMessages   -= petsc_irecv_ct  + petsc_isend_ct  + petsc_recv_ct  + petsc_send_ct;
    eventLog->eventInfo[event].messageLength -= petsc_irecv_len + petsc_isend_len + petsc_recv_len + petsc_send_len;
    eventLog->eventInfo[event].numReductions -= petsc_allreduce_ct + petsc_gather_ct + petsc_scatter_ct;
    PetscFunctionReturn(0);
}
예제 #16
0
파일: mg.c 프로젝트: ziolai/petsc
PetscErrorCode PCSetFromOptions_MG(PetscOptionItems *PetscOptionsObject,PC pc)
{
  PetscErrorCode ierr;
  PetscInt       m,levels = 1,cycles;
  PetscBool      flg,set;
  PC_MG          *mg        = (PC_MG*)pc->data;
  PC_MG_Levels   **mglevels;
  PCMGType       mgtype;
  PCMGCycleType  mgctype;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"Multigrid options");CHKERRQ(ierr);
  if (!mg->levels) {
    ierr = PetscOptionsInt("-pc_mg_levels","Number of Levels","PCMGSetLevels",levels,&levels,&flg);CHKERRQ(ierr);
    if (!flg && pc->dm) {
      ierr = DMGetRefineLevel(pc->dm,&levels);CHKERRQ(ierr);
      levels++;
      mg->usedmfornumberoflevels = PETSC_TRUE;
    }
    ierr = PCMGSetLevels(pc,levels,NULL);CHKERRQ(ierr);
  }
  mglevels = mg->levels;

  mgctype = (PCMGCycleType) mglevels[0]->cycles;
  ierr    = PetscOptionsEnum("-pc_mg_cycle_type","V cycle or for W-cycle","PCMGSetCycleType",PCMGCycleTypes,(PetscEnum)mgctype,(PetscEnum*)&mgctype,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = PCMGSetCycleType(pc,mgctype);CHKERRQ(ierr);
  }
  flg  = PETSC_FALSE;
  ierr = PetscOptionsBool("-pc_mg_galerkin","Use Galerkin process to compute coarser operators","PCMGSetGalerkin",flg,&flg,&set);CHKERRQ(ierr);
  if (set) {
    ierr = PCMGSetGalerkin(pc,flg);CHKERRQ(ierr);
  }
  ierr = PetscOptionsInt("-pc_mg_smoothup","Number of post-smoothing steps","PCMGSetNumberSmoothUp",mg->default_smoothu,&m,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = PCMGSetNumberSmoothUp(pc,m);CHKERRQ(ierr);
  }
  ierr = PetscOptionsInt("-pc_mg_smoothdown","Number of pre-smoothing steps","PCMGSetNumberSmoothDown",mg->default_smoothd,&m,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = PCMGSetNumberSmoothDown(pc,m);CHKERRQ(ierr);
  }
  mgtype = mg->am;
  ierr   = PetscOptionsEnum("-pc_mg_type","Multigrid type","PCMGSetType",PCMGTypes,(PetscEnum)mgtype,(PetscEnum*)&mgtype,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = PCMGSetType(pc,mgtype);CHKERRQ(ierr);
  }
  if (mg->am == PC_MG_MULTIPLICATIVE) {
    ierr = PetscOptionsInt("-pc_mg_multiplicative_cycles","Number of cycles for each preconditioner step","PCMGMultiplicativeSetCycles",mg->cyclesperpcapply,&cycles,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = PCMGMultiplicativeSetCycles(pc,cycles);CHKERRQ(ierr);
    }
  }
  flg  = PETSC_FALSE;
  ierr = PetscOptionsBool("-pc_mg_log","Log times for each multigrid level","None",flg,&flg,NULL);CHKERRQ(ierr);
  if (flg) {
    PetscInt i;
    char     eventname[128];
    if (!mglevels) SETERRQ(PetscObjectComm((PetscObject)pc),PETSC_ERR_ARG_WRONGSTATE,"Must set MG levels before calling");
    levels = mglevels[0]->levels;
    for (i=0; i<levels; i++) {
      sprintf(eventname,"MGSetup Level %d",(int)i);
      ierr = PetscLogEventRegister(eventname,((PetscObject)pc)->classid,&mglevels[i]->eventsmoothsetup);CHKERRQ(ierr);
      sprintf(eventname,"MGSmooth Level %d",(int)i);
      ierr = PetscLogEventRegister(eventname,((PetscObject)pc)->classid,&mglevels[i]->eventsmoothsolve);CHKERRQ(ierr);
      if (i) {
        sprintf(eventname,"MGResid Level %d",(int)i);
        ierr = PetscLogEventRegister(eventname,((PetscObject)pc)->classid,&mglevels[i]->eventresidual);CHKERRQ(ierr);
        sprintf(eventname,"MGInterp Level %d",(int)i);
        ierr = PetscLogEventRegister(eventname,((PetscObject)pc)->classid,&mglevels[i]->eventinterprestrict);CHKERRQ(ierr);
      }
    }

#if defined(PETSC_USE_LOG)
    {
      const char    *sname = "MG Apply";
      PetscStageLog stageLog;
      PetscInt      st;

      PetscFunctionBegin;
      ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
      for (st = 0; st < stageLog->numStages; ++st) {
        PetscBool same;

        ierr = PetscStrcmp(stageLog->stageInfo[st].name, sname, &same);CHKERRQ(ierr);
        if (same) mg->stageApply = st;
      }
      if (!mg->stageApply) {
        ierr = PetscLogStageRegister(sname, &mg->stageApply);CHKERRQ(ierr);
      }
    }
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
예제 #17
0
PetscErrorCode  PetscLogView_VecScatter(PetscViewer viewer)
{
  MPI_Comm           comm       = PetscObjectComm((PetscObject) viewer);
  PetscEventPerfInfo *eventInfo = NULL;
  PetscLogDouble     locTotalTime,stats[6],maxstats[6],minstats[6],sumstats[6],avetime,ksptime;
  PetscStageLog      stageLog;
  const int          stage = 2;
  int                event,events[] = {VEC_ScatterBegin,VEC_ScatterEnd};
  PetscMPIInt        rank,size;
  PetscErrorCode     ierr;
  PetscInt           i;
  char               arch[128],hostname[128],username[128],pname[PETSC_MAX_PATH_LEN],date[128],version[256];

  PetscFunctionBegin;
  PetscTime(&locTotalTime);  locTotalTime -= petsc_BaseTime;
  ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
  ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
  ierr = PetscViewerASCIIPrintf(viewer,"numProcs   = %d\n",size);CHKERRQ(ierr);

  ierr = PetscGetArchType(arch,sizeof(arch));CHKERRQ(ierr);
  ierr = PetscGetHostName(hostname,sizeof(hostname));CHKERRQ(ierr);
  ierr = PetscGetUserName(username,sizeof(username));CHKERRQ(ierr);
  ierr = PetscGetProgramName(pname,sizeof(pname));CHKERRQ(ierr);
  ierr = PetscGetDate(date,sizeof(date));CHKERRQ(ierr);
  ierr = PetscGetVersion(version,sizeof(version));CHKERRQ(ierr);
  ierr = PetscViewerASCIIPrintf(viewer,"%s on a %s named %s with %d processors, by %s %s\n", pname, arch, hostname, size, username, date);CHKERRQ(ierr);
  ierr = PetscViewerASCIIPrintf(viewer, "Using %s\n", version);CHKERRQ(ierr);
  ierr = PetscViewerASCIIPrintf(viewer, "Configure options: %s",petscconfigureoptions);CHKERRQ(ierr);
  ierr = PetscViewerASCIIPrintf(viewer, "%s", petscmachineinfo);CHKERRQ(ierr);
  ierr = PetscViewerASCIIPrintf(viewer, "%s", petsccompilerinfo);CHKERRQ(ierr);
  ierr = PetscViewerASCIIPrintf(viewer, "%s", petsccompilerflagsinfo);CHKERRQ(ierr);
  ierr = PetscViewerASCIIPrintf(viewer, "%s", petsclinkerinfo);CHKERRQ(ierr);
  ierr = PetscViewerASCIIPrintf(viewer, "%s\n", PETSC_MPICC_SHOW);CHKERRQ(ierr);
  ierr = PetscOptionsView(NULL,viewer);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HWLOC)
  ierr = PetscProcessPlacementView(viewer);CHKERRQ(ierr);
#endif
  ierr = PetscViewerASCIIPrintf(viewer, "----------------------------------------------------\n");CHKERRQ(ierr);

  ierr = PetscViewerASCIIPrintf(viewer,"                Time     Min to Max Range   Proportion of KSP\n");CHKERRQ(ierr);

  eventInfo = stageLog->stageInfo[stage].eventLog->eventInfo;
  ierr = MPI_Allreduce(&eventInfo[KSP_Solve].time,&ksptime,1,MPIU_PETSCLOGDOUBLE,MPI_SUM,PETSC_COMM_WORLD);CHKERRQ(ierr);
  ksptime = ksptime/size;

  for (i=0; i<(int)(sizeof(events)/sizeof(int)); i++) {
    event = events[i];
    stats[COUNT]   = eventInfo[event].count;
    stats[TIME]    = eventInfo[event].time;
    stats[NUMMESS] = eventInfo[event].numMessages;
    stats[MESSLEN] = eventInfo[event].messageLength;
    stats[REDUCT]  = eventInfo[event].numReductions;
    stats[FLOPS]   = eventInfo[event].flops;
    ierr = MPI_Allreduce(stats,maxstats,6,MPIU_PETSCLOGDOUBLE,MPI_MAX,PETSC_COMM_WORLD);CHKERRQ(ierr);
    ierr = MPI_Allreduce(stats,minstats,6,MPIU_PETSCLOGDOUBLE,MPI_MIN,PETSC_COMM_WORLD);CHKERRQ(ierr);
    ierr = MPI_Allreduce(stats,sumstats,6,MPIU_PETSCLOGDOUBLE,MPI_SUM,PETSC_COMM_WORLD);CHKERRQ(ierr);

    avetime  = sumstats[1]/size;
    ierr = PetscViewerASCIIPrintf(viewer,"%s %4.2e   -%5.1f %% %5.1f %%   %4.2e %%\n",stageLog->eventLog->eventInfo[event].name,
                                  avetime,100.*(avetime-minstats[1])/avetime,100.*(maxstats[1]-avetime)/avetime,100.*avetime/ksptime);CHKERRQ(ierr);
  }
  ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
예제 #18
0
PetscErrorCode PetscLogPrintSummaryToPy(MPI_Comm comm, PetscViewer viewer) 
{
  PetscViewer_ASCII *ascii = (PetscViewer_ASCII*)viewer->data;
  FILE              *fd = ascii->fd; 
  PetscLogDouble zero = 0.0;
  StageLog       stageLog;
  StageInfo     *stageInfo = PETSC_NULL;
  EventPerfInfo *eventInfo = PETSC_NULL;
  ClassPerfInfo *classInfo;
  const char    *name;
  PetscLogDouble locTotalTime, TotalTime, TotalFlops;
  PetscLogDouble numMessages, messageLength, avgMessLen, numReductions;
  PetscLogDouble stageTime, flops, mem, mess, messLen, red;
  PetscLogDouble fracTime, fracFlops, fracMessages, fracLength;
  PetscLogDouble fracReductions;
  PetscLogDouble tot,avg,x,y,*mydata;
  PetscMPIInt    minCt, maxCt;
  PetscMPIInt    size, rank, *mycount;
  PetscTruth    *localStageUsed,    *stageUsed;
  PetscTruth    *localStageVisible, *stageVisible;
  int            numStages, localNumEvents, numEvents;
  int            stage, lastStage;
  PetscLogEvent  event;
  PetscErrorCode ierr;
  PetscInt       i;

  /* remove these two lines! */
  PetscLogDouble PETSC_DLLEXPORT BaseTime = 0.0;
  int            numObjects = 0;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
  ierr = PetscMalloc(size*sizeof(PetscLogDouble), &mydata);CHKERRQ(ierr);
  ierr = PetscMalloc(size*sizeof(PetscMPIInt), &mycount);CHKERRQ(ierr);

  /* Pop off any stages the user forgot to remove */
  lastStage = 0;
  ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
  ierr = StageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr);
  while (stage >= 0) {
    lastStage = stage;
    ierr = StageLogPop(stageLog);CHKERRQ(ierr);
    ierr = StageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr);
  }
  /* Get the total elapsed time */
  PetscTime(locTotalTime);  locTotalTime -= BaseTime;

  ierr = PetscFPrintf(comm, fd, "\n#------ PETSc Performance Summary ----------\n\n");CHKERRQ(ierr);
  ierr = PetscFPrintf(comm, fd, "Nproc = %d\n",size);CHKERRQ(ierr);

  /* Must preserve reduction count before we go on */
  red  = (allreduce_ct + gather_ct + scatter_ct)/((PetscLogDouble) size);

  /* Calculate summary information */
 
  /*   Time */  
  ierr = MPI_Gather(&locTotalTime,1,MPIU_PETSCLOGDOUBLE,mydata,1,MPIU_PETSCLOGDOUBLE,0,comm);CHKERRQ(ierr);
  if (!rank){
    ierr = PetscFPrintf(comm, fd, "Time = [ " );CHKERRQ(ierr); 
    tot  = 0.0;
    for (i=0; i<size; i++){
      tot += mydata[i];
      ierr = PetscFPrintf(comm, fd, "  %5.3e,",mydata[i] );CHKERRQ(ierr); 
    }
    ierr = PetscFPrintf(comm, fd, "]\n" );CHKERRQ(ierr); 
    avg  = (tot)/((PetscLogDouble) size);
    TotalTime = tot;
  }

  /*   Objects */
  avg  = (PetscLogDouble) numObjects;
  ierr = MPI_Gather(&avg,1,MPIU_PETSCLOGDOUBLE,mydata,1,MPIU_PETSCLOGDOUBLE,0,comm);CHKERRQ(ierr);
  if (!rank){
    ierr = PetscFPrintf(comm, fd, "Objects = [ " );CHKERRQ(ierr); 
    for (i=0; i<size; i++){
      ierr = PetscFPrintf(comm, fd, "  %5.3e,",mydata[i] );CHKERRQ(ierr); 
    }
    ierr = PetscFPrintf(comm, fd, "]\n" );CHKERRQ(ierr); 
  }

  /*   Flops */
  ierr = MPI_Gather(&_TotalFlops,1,MPIU_PETSCLOGDOUBLE,mydata,1,MPIU_PETSCLOGDOUBLE,0,comm);CHKERRQ(ierr);
  if (!rank){
    ierr = PetscFPrintf(comm, fd, "Flops = [ " );CHKERRQ(ierr); 
    tot  = 0.0;
    for (i=0; i<size; i++){
      tot += mydata[i];
      ierr = PetscFPrintf(comm, fd, "  %5.3e,",mydata[i] );CHKERRQ(ierr); 
    }
    ierr = PetscFPrintf(comm, fd, "]\n");CHKERRQ(ierr); 
    TotalFlops = tot;
  }

  /*   Memory */
  ierr = PetscMallocGetMaximumUsage(&mem);CHKERRQ(ierr); 
  ierr = MPI_Gather(&mem,1,MPIU_PETSCLOGDOUBLE,mydata,1,MPIU_PETSCLOGDOUBLE,0,comm);CHKERRQ(ierr);
  if (!rank){
    ierr = PetscFPrintf(comm, fd, "Memory = [ " );CHKERRQ(ierr); 
    for (i=0; i<size; i++){
      ierr = PetscFPrintf(comm, fd, "  %5.3e,",mydata[i] );CHKERRQ(ierr); 
    }
    ierr = PetscFPrintf(comm, fd, "]\n" );CHKERRQ(ierr); 
  }
 
  /*   Messages */
  mess = 0.5*(irecv_ct + isend_ct + recv_ct + send_ct);
  ierr = MPI_Gather(&mess,1,MPIU_PETSCLOGDOUBLE,mydata,1,MPIU_PETSCLOGDOUBLE,0,comm);CHKERRQ(ierr);
  if (!rank){
    ierr = PetscFPrintf(comm, fd, "MPIMessages = [ " );CHKERRQ(ierr); 
    tot  = 0.0;
    for (i=0; i<size; i++){
      tot += mydata[i];
      ierr = PetscFPrintf(comm, fd, "  %5.3e,",mydata[i] );CHKERRQ(ierr); 
    }
    ierr = PetscFPrintf(comm, fd, "]\n" );CHKERRQ(ierr); 
    numMessages = tot;
  }

  /*   Message Lengths */
  mess = 0.5*(irecv_len + isend_len + recv_len + send_len);
  ierr = MPI_Gather(&mess,1,MPIU_PETSCLOGDOUBLE,mydata,1,MPIU_PETSCLOGDOUBLE,0,comm);CHKERRQ(ierr);
  if (!rank){
    ierr = PetscFPrintf(comm, fd, "MPIMessageLengths = [ " );CHKERRQ(ierr); 
    tot  = 0.0;
    for (i=0; i<size; i++){
      tot += mydata[i];
      ierr = PetscFPrintf(comm, fd, "  %5.3e,",mydata[i] );CHKERRQ(ierr); 
    }
    ierr = PetscFPrintf(comm, fd, "]\n" );CHKERRQ(ierr); 
    messageLength = tot;
  }

  /*   Reductions */
  ierr = MPI_Gather(&red,1,MPIU_PETSCLOGDOUBLE,mydata,1,MPIU_PETSCLOGDOUBLE,0,comm);CHKERRQ(ierr);
  if (!rank){
    ierr = PetscFPrintf(comm, fd, "MPIReductions = [ " );CHKERRQ(ierr); 
    tot  = 0.0;
    for (i=0; i<size; i++){
      tot += mydata[i];
      ierr = PetscFPrintf(comm, fd, "  %5.3e,",mydata[i] );CHKERRQ(ierr); 
    }
    ierr = PetscFPrintf(comm, fd, "]\n" );CHKERRQ(ierr); 
    numReductions = tot;
  }

  /* Get total number of stages --
       Currently, a single processor can register more stages than another, but stages must all be registered in order.
       We can removed this requirement if necessary by having a global stage numbering and indirection on the stage ID.
       This seems best accomplished by assoicating a communicator with each stage.
  */
  ierr = MPI_Allreduce(&stageLog->numStages, &numStages, 1, MPI_INT, MPI_MAX, comm);CHKERRQ(ierr);
  ierr = PetscMalloc(numStages * sizeof(PetscTruth), &localStageUsed);CHKERRQ(ierr);
  ierr = PetscMalloc(numStages * sizeof(PetscTruth), &stageUsed);CHKERRQ(ierr);
  ierr = PetscMalloc(numStages * sizeof(PetscTruth), &localStageVisible);CHKERRQ(ierr);
  ierr = PetscMalloc(numStages * sizeof(PetscTruth), &stageVisible);CHKERRQ(ierr);
  if (numStages > 0) {
    stageInfo = stageLog->stageInfo;
    for(stage = 0; stage < numStages; stage++) {
      if (stage < stageLog->numStages) {
        localStageUsed[stage]    = stageInfo[stage].used;
        localStageVisible[stage] = stageInfo[stage].perfInfo.visible;
      } else {
        localStageUsed[stage]    = PETSC_FALSE;
        localStageVisible[stage] = PETSC_TRUE;
      }
    }
    ierr = MPI_Allreduce(localStageUsed,    stageUsed,    numStages, MPI_INT, MPI_LOR,  comm);CHKERRQ(ierr);
    ierr = MPI_Allreduce(localStageVisible, stageVisible, numStages, MPI_INT, MPI_LAND, comm);CHKERRQ(ierr);
for(stage = 0; stage < numStages; stage++) {
      if (stageUsed[stage]) {
        ierr = PetscFPrintf(comm, fd, "\n#Summary of Stages:   ----- Time ------  ----- Flops -----  --- Messages ---  -- Message Lengths --  -- Reductions --\n");CHKERRQ(ierr);
        ierr = PetscFPrintf(comm, fd, "#                       Avg     %%Total     Avg     %%Total   counts   %%Total     Avg         %%Total   counts   %%Total \n");CHKERRQ(ierr);
        break;
      }
    }
    for(stage = 0; stage < numStages; stage++) {
      if (!stageUsed[stage]) continue;
      if (localStageUsed[stage]) {
        ierr = MPI_Allreduce(&stageInfo[stage].perfInfo.time,          &stageTime, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
        ierr = MPI_Allreduce(&stageInfo[stage].perfInfo.flops,         &flops,     1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
        ierr = MPI_Allreduce(&stageInfo[stage].perfInfo.numMessages,   &mess,      1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
        ierr = MPI_Allreduce(&stageInfo[stage].perfInfo.messageLength, &messLen,   1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
        ierr = MPI_Allreduce(&stageInfo[stage].perfInfo.numReductions, &red,       1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
        name = stageInfo[stage].name;
      } else {
        ierr = MPI_Allreduce(&zero,                           &stageTime, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
        ierr = MPI_Allreduce(&zero,                           &flops,     1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
        ierr = MPI_Allreduce(&zero,                           &mess,      1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
        ierr = MPI_Allreduce(&zero,                           &messLen,   1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
        ierr = MPI_Allreduce(&zero,                           &red,       1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
        name = "";
      }
      mess *= 0.5; messLen *= 0.5; red /= size;
      if (TotalTime     != 0.0) fracTime       = stageTime/TotalTime;    else fracTime       = 0.0;
      if (TotalFlops    != 0.0) fracFlops      = flops/TotalFlops;       else fracFlops      = 0.0;
      /* Talk to Barry if (stageTime     != 0.0) flops          = (size*flops)/stageTime; else flops          = 0.0; */
      if (numMessages   != 0.0) fracMessages   = mess/numMessages;       else fracMessages   = 0.0;
      if (numMessages   != 0.0) avgMessLen     = messLen/numMessages;    else avgMessLen     = 0.0;
      if (messageLength != 0.0) fracLength     = messLen/messageLength;  else fracLength     = 0.0;
      if (numReductions != 0.0) fracReductions = red/numReductions;      else fracReductions = 0.0;
      ierr = PetscFPrintf(comm, fd, "# ");
      ierr = PetscFPrintf(comm, fd, "%2d: %15s: %6.4e %5.1f%%  %6.4e %5.1f%%  %5.3e %5.1f%%  %5.3e      %5.1f%%  %5.3e %5.1f%% \n",
                          stage, name, stageTime/size, 100.0*fracTime, flops, 100.0*fracFlops,
                          mess, 100.0*fracMessages, avgMessLen, 100.0*fracLength, red, 100.0*fracReductions);CHKERRQ(ierr);
    }
  }

  /* Report events */
  ierr = PetscFPrintf(comm, fd,"\n# Event\n");CHKERRQ(ierr);                        
  ierr = PetscFPrintf(comm,fd,"# ------------------------------------------------------\n");
                                                                                                          CHKERRQ(ierr); 
  /* Problem: The stage name will not show up unless the stage executed on proc 1 */
  for(stage = 0; stage < numStages; stage++) {
    if (!stageVisible[stage]) continue;
    if (localStageUsed[stage]) {
      ierr = MPI_Allreduce(&stageInfo[stage].perfInfo.time,          &stageTime, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
      ierr = MPI_Allreduce(&stageInfo[stage].perfInfo.flops,         &flops,     1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
      ierr = MPI_Allreduce(&stageInfo[stage].perfInfo.numMessages,   &mess,      1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
      ierr = MPI_Allreduce(&stageInfo[stage].perfInfo.messageLength, &messLen,   1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
      ierr = MPI_Allreduce(&stageInfo[stage].perfInfo.numReductions, &red,       1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
    } else {
      ierr = PetscFPrintf(comm, fd, "\n--- Event Stage %d: Unknown\n\n", stage);CHKERRQ(ierr);
      ierr = MPI_Allreduce(&zero,                           &stageTime, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
      ierr = MPI_Allreduce(&zero,                           &flops,     1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
      ierr = MPI_Allreduce(&zero,                           &mess,      1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
      ierr = MPI_Allreduce(&zero,                           &messLen,   1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
      ierr = MPI_Allreduce(&zero,                           &red,       1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
    }
    mess *= 0.5; messLen *= 0.5; red /= size;

    /* Get total number of events in this stage --
       Currently, a single processor can register more events than another, but events must all be registered in order,
       just like stages. We can removed this requirement if necessary by having a global event numbering and indirection
       on the event ID. This seems best accomplished by assoicating a communicator with each stage.

       Problem: If the event did not happen on proc 1, its name will not be available.
       Problem: Event visibility is not implemented
    */
    
    if (!rank){
      ierr = PetscFPrintf(comm, fd, "class Dummy(object):\n");CHKERRQ(ierr);
      ierr = PetscFPrintf(comm, fd, "    def foo(x):\n");CHKERRQ(ierr);
      ierr = PetscFPrintf(comm, fd, "        print x\n");CHKERRQ(ierr);
      ierr = PetscFPrintf(comm, fd, "Event = {}\n");CHKERRQ(ierr);
    }

    if (localStageUsed[stage]) {
      eventInfo      = stageLog->stageInfo[stage].eventLog->eventInfo;
      localNumEvents = stageLog->stageInfo[stage].eventLog->numEvents;
    } else {
      localNumEvents = 0;
    }
    ierr = MPI_Allreduce(&localNumEvents, &numEvents, 1, MPI_INT, MPI_MAX, comm);CHKERRQ(ierr);
    for(event = 0; event < numEvents; event++) {
      if (localStageUsed[stage] && (event < stageLog->stageInfo[stage].eventLog->numEvents) && (eventInfo[event].depth == 0)) {
        ierr = MPI_Allreduce(&eventInfo[event].count, &maxCt, 1, MPI_INT, MPI_MAX, comm);CHKERRQ(ierr);
        name = stageLog->eventLog->eventInfo[event].name;
      } else {
        ierr = MPI_Allreduce(&ierr, &maxCt, 1, MPI_INT, MPI_MAX, comm);CHKERRQ(ierr);
        name = "";
      }
     
      if (maxCt != 0) {
        ierr = PetscFPrintf(comm, fd,"#\n");CHKERRQ(ierr);
        if (!rank){
          ierr = PetscFPrintf(comm, fd, "%s = Dummy()\n",name);CHKERRQ(ierr);
          ierr = PetscFPrintf(comm, fd, "Event['%s'] = %s\n",name,name);CHKERRQ(ierr);
        }
        /* Count */
        ierr = MPI_Gather(&eventInfo[event].count,1,MPI_INT,mycount,1,MPI_INT,0,comm);CHKERRQ(ierr);
        ierr = PetscFPrintf(comm, fd, "%s.Count = [ ", name);CHKERRQ(ierr); 
          for (i=0; i<size; i++){
            ierr = PetscFPrintf(comm, fd, "  %7d,",mycount[i] );CHKERRQ(ierr); 
          }
          ierr = PetscFPrintf(comm, fd, "]\n" );CHKERRQ(ierr); 

        /* Time */
        ierr = MPI_Gather(&eventInfo[event].time,1,MPIU_PETSCLOGDOUBLE,mydata,1,MPIU_PETSCLOGDOUBLE,0,comm);CHKERRQ(ierr);
        if (!rank){
          ierr = PetscFPrintf(comm, fd, "%s.Time  = [ ", name);CHKERRQ(ierr);
          for (i=0; i<size; i++){
            ierr = PetscFPrintf(comm, fd, "  %5.3e,",mydata[i] );CHKERRQ(ierr); 
          }
          ierr = PetscFPrintf(comm, fd, "]\n" );CHKERRQ(ierr); 
        }
        /* Flops */
        ierr = MPI_Gather(&eventInfo[event].flops,1,MPIU_PETSCLOGDOUBLE,mydata,1,MPIU_PETSCLOGDOUBLE,0,comm);CHKERRQ(ierr);
        if (!rank){
          ierr = PetscFPrintf(comm, fd, "%s.Flops = [ ", name);CHKERRQ(ierr);
          for (i=0; i<size; i++){
            ierr = PetscFPrintf(comm, fd, "  %5.3e,",mydata[i] );CHKERRQ(ierr); 
          }
          ierr = PetscFPrintf(comm, fd, "]\n" );CHKERRQ(ierr); 
        }       
      }
    }
  }

  /* Right now, only stages on the first processor are reported here, meaning only objects associated with
     the global communicator, or MPI_COMM_SELF for proc 1. We really should report global stats and then
     stats for stages local to processor sets.
  */
  for(stage = 0; stage < numStages; stage++) {
    if (localStageUsed[stage]) {
      classInfo = stageLog->stageInfo[stage].classLog->classInfo;
    } else {
      ierr = PetscFPrintf(comm, fd, "\n--- Event Stage %d: Unknown\n\n", stage);CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(localStageUsed);CHKERRQ(ierr);
  ierr = PetscFree(stageUsed);CHKERRQ(ierr);
  ierr = PetscFree(localStageVisible);CHKERRQ(ierr);
  ierr = PetscFree(stageVisible);CHKERRQ(ierr);
  ierr = PetscFree(mydata);CHKERRQ(ierr);
  ierr = PetscFree(mycount);CHKERRQ(ierr);

  /* Information unrelated to this particular run */
  ierr = PetscFPrintf(comm, fd,
    "# ========================================================================================================================\n");CHKERRQ(ierr);
  PetscTime(y); 
  PetscTime(x);
  PetscTime(y); PetscTime(y); PetscTime(y); PetscTime(y); PetscTime(y);
  PetscTime(y); PetscTime(y); PetscTime(y); PetscTime(y); PetscTime(y);
  ierr = PetscFPrintf(comm,fd,"AveragetimetogetPetscTime = %g\n", (y-x)/10.0);CHKERRQ(ierr);
  /* MPI information */
  if (size > 1) {
    MPI_Status  status;
    PetscMPIInt tag;
    MPI_Comm    newcomm;

    ierr = MPI_Barrier(comm);CHKERRQ(ierr);
    PetscTime(x);
    ierr = MPI_Barrier(comm);CHKERRQ(ierr);
    ierr = MPI_Barrier(comm);CHKERRQ(ierr);
    ierr = MPI_Barrier(comm);CHKERRQ(ierr);
    ierr = MPI_Barrier(comm);CHKERRQ(ierr);
    ierr = MPI_Barrier(comm);CHKERRQ(ierr);
    PetscTime(y);
    ierr = PetscFPrintf(comm, fd, "AveragetimeforMPI_Barrier = %g\n", (y-x)/5.0);CHKERRQ(ierr);
    ierr = PetscCommDuplicate(comm,&newcomm, &tag);CHKERRQ(ierr);
    ierr = MPI_Barrier(comm);CHKERRQ(ierr);
    if (rank) {
      ierr = MPI_Recv(0, 0, MPI_INT, rank-1,            tag, newcomm, &status);CHKERRQ(ierr);
      ierr = MPI_Send(0, 0, MPI_INT, (rank+1)%size, tag, newcomm);CHKERRQ(ierr);
    } else {
      PetscTime(x);
      ierr = MPI_Send(0, 0, MPI_INT, 1,          tag, newcomm);CHKERRQ(ierr);
      ierr = MPI_Recv(0, 0, MPI_INT, size-1, tag, newcomm, &status);CHKERRQ(ierr);
      PetscTime(y);
      ierr = PetscFPrintf(comm,fd,"AveragetimforzerosizeMPI_Send = %g\n", (y-x)/size);CHKERRQ(ierr);
    }
    ierr = PetscCommDestroy(&newcomm);CHKERRQ(ierr);
  }
  if (!rank) { /* print Optiontable */
    ierr = PetscFPrintf(comm,fd,"# ");CHKERRQ(ierr);
    //ierr = PetscOptionsPrint(fd);CHKERRQ(ierr);
  }

  /* Cleanup */
  ierr = PetscFPrintf(comm, fd, "\n");CHKERRQ(ierr);
  ierr = StageLogPush(stageLog, lastStage);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*
  IntegrateElementBatchOpenCL - Produces element vectors from input element solution and geometric information via quadrature

  Input Parameters:
+ Ne - The total number of cells, Nchunk * Ncb * Nbc
. Ncb - The number of serial cell batches
. Nbc - The number of cells per batch
. Nbl - The number of concurrent cells blocks per thread block
. coefficients - An array of the solution vector for each cell
. jacobianInverses - An array of the inverse Jacobian for each cell
. jacobianDeterminants - An array of the Jacobian determinant for each cell
. event - A PetscEvent, used to log flops
- debug - A flag for debugging information

  Output Parameter:
. elemVec - An array of the element vectors for each cell
*/
PETSC_EXTERN PetscErrorCode IntegrateElementBatchGPU(PetscInt spatial_dim, PetscInt Ne, PetscInt Ncb, PetscInt Nbc, PetscInt N_bl, const PetscScalar coefficients[],
                                                     const PetscReal jacobianInverses[], const PetscReal jacobianDeterminants[], PetscScalar elemVec[],
                                                     PetscLogEvent event, PetscInt debug, PetscInt pde_op)
{
  const cl_int numQuadraturePoints_0 = 1;

  const cl_int numBasisFunctions_0 = 3;
  const cl_int numBasisComponents_0 = (pde_op == LAPLACIAN) ? 1 : spatial_dim;

  const cl_int dim    = spatial_dim;
  const cl_int N_b    = numBasisFunctions_0;   /* The number of basis functions */
  const cl_int N_comp = numBasisComponents_0;  /* The number of basis function components */
  const cl_int N_bt   = N_b*N_comp;            /* The total number of scalar basis functions */
  const cl_int N_q    = numQuadraturePoints_0; /* The number of quadrature points */
  const cl_int N_bst  = N_bt*N_q;              /* The block size, LCM(N_bt, N_q), Notice that a block is not process simultaneously */
  const cl_int N_t    = N_bst*N_bl;            /* The number of threads, N_bst * N_bl */

  char            *program_buffer;
  char            build_buffer[8192];
  cl_build_status status;

  cl_event          ocl_ev;         /* The event for tracking kernel execution */
  cl_ulong          ns_start;       /* Nanoseconds counter on GPU at kernel start */
  cl_ulong          ns_end;         /* Nanoseconds counter on GPU at kernel stop */

  cl_mem            d_coefficients;
  cl_mem            d_jacobianInverses;
  cl_mem            d_jacobianDeterminants;
  cl_mem            d_elemVec;

  OpenCLEnvironment ocl_env;
  cl_program        ocl_prog;
  cl_kernel         ocl_kernel;
  size_t            ocl_source_length;
  size_t            local_work_size[3];
  size_t            global_work_size[3];
  size_t            i;
  unsigned int      x, y, z;
  PetscErrorCode    ierr;
  cl_int            ierr2;


  PetscFunctionBegin;
  ierr = initializeOpenCL(&ocl_env);CHKERRQ(ierr);
  ierr = PetscMalloc(8192 * sizeof(char), &program_buffer);CHKERRQ(ierr);
  ierr = generateOpenCLSource(&program_buffer, 8192, dim, N_bl, pde_op);CHKERRQ(ierr);
  ocl_source_length = strlen(program_buffer);
  ocl_prog = clCreateProgramWithSource(ocl_env.ctx_id, 1, (const char**)&program_buffer, &ocl_source_length, &ierr2);CHKERRQ(ierr2);
  ierr = clBuildProgram(ocl_prog, 0, NULL, NULL, NULL, NULL);
  if (ierr != CL_SUCCESS) {
    clGetProgramBuildInfo(ocl_prog, ocl_env.dev_id, CL_PROGRAM_BUILD_LOG, sizeof(char)*8192, &build_buffer, NULL);
    printf("Build failed! Log:\n %s", build_buffer);
  }
  CHKERRQ(ierr);
  ierr = PetscFree(program_buffer);CHKERRQ(ierr);

  ocl_kernel = clCreateKernel(ocl_prog, "integrateElementQuadrature", &ierr);CHKERRQ(ierr);

  if (Nbc*N_comp != N_t) SETERRQ3(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of threads %d should be %d * %d", N_t, Nbc, N_comp);
  if (!Ne) {
    PetscStageLog     stageLog;
    PetscEventPerfLog eventLog = NULL;
    PetscInt          stage;

    ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
    ierr = PetscStageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr);
    ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);CHKERRQ(ierr);
    /* Log performance info */
    eventLog->eventInfo[event].count++;
    eventLog->eventInfo[event].time  += 0.0;
    eventLog->eventInfo[event].flops += 0;
    PetscFunctionReturn(0);
  }

  /* Create buffers on the device and send data over */
  d_coefficients         = clCreateBuffer(ocl_env.ctx_id, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, Ne*N_bt    * sizeof(PetscReal), (void*)coefficients,         &ierr);CHKERRQ(ierr);
  d_jacobianInverses     = clCreateBuffer(ocl_env.ctx_id, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, Ne*dim*dim * sizeof(PetscReal), (void*)jacobianInverses,     &ierr);CHKERRQ(ierr);
  d_jacobianDeterminants = clCreateBuffer(ocl_env.ctx_id, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, Ne         * sizeof(PetscReal), (void*)jacobianDeterminants, &ierr);CHKERRQ(ierr);
  d_elemVec              = clCreateBuffer(ocl_env.ctx_id, CL_MEM_READ_WRITE,                        Ne*N_bt    * sizeof(PetscReal), NULL,                        &ierr);CHKERRQ(ierr);

  /* Work size preparations */
  ierr = calculateGridOpenCL(Ne, Ncb*Nbc, &x, &y, &z);CHKERRQ(ierr);
  local_work_size[0] = Nbc*N_comp;
  local_work_size[1] = 1;
  local_work_size[2] = 1;
  global_work_size[0] = x * local_work_size[0];
  global_work_size[1] = y * local_work_size[1];
  global_work_size[2] = z * local_work_size[2];

  /* if (debug) { */
  ierr = PetscPrintf(PETSC_COMM_SELF, "GPU layout grid(%d,%d,%d) block(%d,%d,%d) with %d batches\n",
                     x, y, z,
                     local_work_size[0], local_work_size[1], local_work_size[2], Ncb);CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, " N_t: %d, N_cb: %d\n", N_t, Ncb);
  /* } */

  /* Kernel launch */
  /* integrateElementQuadrature<<<grid, block>>>(Ncb, d_coefficients, d_jacobianInverses, d_jacobianDeterminants, d_elemVec); */
  ierr = clSetKernelArg(ocl_kernel, 0, sizeof(cl_int), (void*)&Ncb);CHKERRQ(ierr);
  ierr = clSetKernelArg(ocl_kernel, 1, sizeof(cl_mem), (void*)&d_coefficients);CHKERRQ(ierr);
  ierr = clSetKernelArg(ocl_kernel, 2, sizeof(cl_mem), (void*)&d_jacobianInverses);CHKERRQ(ierr);
  ierr = clSetKernelArg(ocl_kernel, 3, sizeof(cl_mem), (void*)&d_jacobianDeterminants);CHKERRQ(ierr);
  ierr = clSetKernelArg(ocl_kernel, 4, sizeof(cl_mem), (void*)&d_elemVec);CHKERRQ(ierr);

  ierr = clEnqueueNDRangeKernel(ocl_env.queue_id, ocl_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &ocl_ev);CHKERRQ(ierr);

  /* Read data back from device */
  ierr = clEnqueueReadBuffer(ocl_env.queue_id, d_elemVec, CL_TRUE, 0, Ne*N_bt * sizeof(PetscReal), elemVec, 0, NULL, NULL);CHKERRQ(ierr);

  {
    PetscStageLog     stageLog;
    PetscEventPerfLog eventLog = NULL;
    PetscInt          stage;

    ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
    ierr = PetscStageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr);
    ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);CHKERRQ(ierr);
    /* Log performance info */
    ierr = clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ns_start, NULL);CHKERRQ(ierr);
    ierr = clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_END,   sizeof(cl_ulong), &ns_end,   NULL);CHKERRQ(ierr);
    eventLog->eventInfo[event].count++;
    eventLog->eventInfo[event].time  += (ns_end - ns_start)*1.0e-9;
    eventLog->eventInfo[event].flops += (((2+(2+2*dim)*dim)*N_comp*N_b+(2+2)*dim*N_comp)*N_q + (2+2*dim)*dim*N_q*N_comp*N_b)*Ne;
  }

  /* We are done, clean up */
  ierr = clReleaseMemObject(d_coefficients);CHKERRQ(ierr);
  ierr = clReleaseMemObject(d_jacobianInverses);CHKERRQ(ierr);
  ierr = clReleaseMemObject(d_jacobianDeterminants);CHKERRQ(ierr);
  ierr = clReleaseMemObject(d_elemVec);CHKERRQ(ierr);
  ierr = clReleaseKernel(ocl_kernel);CHKERRQ(ierr);
  ierr = clReleaseProgram(ocl_prog);CHKERRQ(ierr);
  ierr = destroyOpenCL(&ocl_env);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}