/*
  PetscLogEventBeginDefault - Default handler run when a logged event begins (older
  StageLog/EventPerfLog API variant).

  Records *negative* baselines for time, flops, message counts/lengths and reductions;
  the matching End handler adds the then-current totals back, so the difference is the
  amount attributable to this event.

  Collective guards: nested begins of the same event increment `depth` and only the
  outermost begin (depth transitioning 0 -> 1) records performance data.

  NOTE(review): PetscTimeSubtract() here is called with the field by value (old macro
  API); newer PETSc passes a pointer — confirm against the PETSc version this file
  targets before changing.
*/
PetscErrorCode PetscLogEventBeginDefault(PetscLogEvent event, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
{
  StageLog       stageLog;
  EventPerfLog   eventLog;
  int            stage;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Locate the perf log of the currently active stage */
  ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
  ierr = StageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr);
  ierr = StageLogGetEventPerfLog(stageLog, stage, &eventLog);CHKERRQ(ierr);
  /* Check for double counting */
  eventLog->eventInfo[event].depth++;
  if (eventLog->eventInfo[event].depth > 1) PetscFunctionReturn(0);
  /* Log performance info */
  eventLog->eventInfo[event].count++;
  PetscTimeSubtract(eventLog->eventInfo[event].time);
#if defined(PETSC_HAVE_CHUD)
  /* Hardware flop counter via Apple CHUD performance-monitor counters */
  eventLog->eventInfo[event].flops -= chudGetPMCEventCount(chudCPU1Dev,PMC_1);
#elif defined(PETSC_HAVE_PAPI)
  {
    long_long values[2];
    ierr = PAPI_read(PAPIEventSet,values);CHKERRQ(ierr);
    eventLog->eventInfo[event].flops -= values[0];
    /* printf("fma %g flops %g\n",(double)values[1],(double)values[0]); */
  }
#else
  /* Software-counted flops maintained by PetscLogFlops() */
  eventLog->eventInfo[event].flops -= _TotalFlops;
#endif
  /* Negative MPI-traffic baselines; End adds the totals back */
  eventLog->eventInfo[event].numMessages -= irecv_ct + isend_ct + recv_ct + send_ct;
  eventLog->eventInfo[event].messageLength -= irecv_len + isend_len + recv_len + send_len;
  eventLog->eventInfo[event].numReductions -= allreduce_ct;
  PetscFunctionReturn(0);
}
/*
  PetscLogEventBeginTrace - Event-begin handler used when -log_trace is active:
  prints one line per event begin to the trace file and indents nested events.

  Uses the process-wide trace globals (petsc_tracetime, petsc_tracelevel,
  petsc_tracespace, petsc_traceblanks, petsc_tracefile). The first call
  initializes petsc_tracetime so all printed times are relative to it.
*/
PetscErrorCode PetscLogEventBeginTrace(PetscLogEvent event, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
{
  PetscStageLog     stageLog;
  PetscEventRegLog  eventRegLog;
  PetscEventPerfLog eventPerfLog = NULL;
  PetscLogDouble    cur_time;
  PetscMPIInt       rank;
  int               stage,err;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  /* Lazily establish the trace epoch on the first traced event */
  if (!petsc_tracetime) PetscTime(&petsc_tracetime);
  ierr = MPI_Comm_rank(PETSC_COMM_WORLD, &rank);CHKERRQ(ierr);
  ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
  ierr = PetscStageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr);
  ierr = PetscStageLogGetEventRegLog(stageLog, &eventRegLog);CHKERRQ(ierr);
  ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventPerfLog);CHKERRQ(ierr);
  /* Check for double counting */
  eventPerfLog->eventInfo[event].depth++;
  petsc_tracelevel++;
  if (eventPerfLog->eventInfo[event].depth > 1) PetscFunctionReturn(0);
  /* Log performance info: one trace line, then widen the indent for nested events */
  PetscTime(&cur_time);
  ierr = PetscFPrintf(PETSC_COMM_SELF,petsc_tracefile, "%s[%d] %g Event begin: %s\n", petsc_tracespace, rank, cur_time-petsc_tracetime, eventRegLog->eventInfo[event].name);CHKERRQ(ierr);
  /* Indent is two blanks per nesting level */
  ierr = PetscStrncpy(petsc_tracespace, petsc_traceblanks, 2*petsc_tracelevel);CHKERRQ(ierr);
  petsc_tracespace[2*petsc_tracelevel] = 0;
  err = fflush(petsc_tracefile);
  if (err) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SYS,"fflush() failed on file");
  PetscFunctionReturn(0);
}
/*
  PetscLogEventEndTrace - Event-end counterpart of the trace handler: un-indents
  and prints one "Event end" line per outermost event end (older StageLog API variant).

  NOTE(review): this is the old-API twin of PetscLogEventBeginTrace above — it uses
  the unprefixed globals (tracelevel, tracespace, tracefile, tracetime), the 2-argument
  SETERRQ, and PetscTime(cur_time) without '&'. That is internally consistent for the
  older PETSc API, but inconsistent with the new-style Begin handler in this file;
  confirm which PETSc version this file targets before unifying.
*/
PetscErrorCode PetscLogEventEndTrace(PetscLogEvent event,int t,PetscObject o1,PetscObject o2,PetscObject o3,PetscObject o4)
{
  StageLog       stageLog;
  EventRegLog    eventRegLog;
  EventPerfLog   eventPerfLog;
  PetscLogDouble cur_time;
  int            stage,err;
  PetscMPIInt    rank;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  tracelevel--;
  ierr = MPI_Comm_rank(PETSC_COMM_WORLD, &rank);CHKERRQ(ierr);
  ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
  ierr = StageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr);
  ierr = StageLogGetEventRegLog(stageLog, &eventRegLog);CHKERRQ(ierr);
  ierr = StageLogGetEventPerfLog(stageLog, stage, &eventPerfLog);CHKERRQ(ierr);
  /* Check for double counting: only the outermost end is traced; a negative
     depth or tracelevel means Begin/End calls were mismatched */
  eventPerfLog->eventInfo[event].depth--;
  if (eventPerfLog->eventInfo[event].depth > 0) {
    PetscFunctionReturn(0);
  } else if (eventPerfLog->eventInfo[event].depth < 0 || tracelevel < 0) {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Logging event had unbalanced begin/end pairs");
  }
  /* Log performance info: shrink the indent back, then emit the end line */
  ierr = PetscStrncpy(tracespace, traceblanks, 2*tracelevel);CHKERRQ(ierr);
  tracespace[2*tracelevel] = 0;
  PetscTime(cur_time);
  ierr = PetscFPrintf(PETSC_COMM_SELF,tracefile, "%s[%d] %g Event end: %s\n", tracespace, rank, cur_time-tracetime, eventRegLog->eventInfo[event].name);CHKERRQ(ierr);
  err = fflush(tracefile);
  if (err) SETERRQ(PETSC_ERR_SYS,"fflush() failed on file");
  PetscFunctionReturn(0);
}
/*
  PetscLogEventEndComplete - Event-end handler for "complete" logging: records an
  ACTIONEND entry in the global action trace (growing it on demand) and folds the
  event's performance totals into the current stage's perf log.

  The time spent reallocating the action array is added to petsc_BaseTime so it is
  excluded from reported run time.
*/
PetscErrorCode PetscLogEventEndComplete(PetscLogEvent event, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
{
  PetscStageLog     stageLog;
  PetscEventRegLog  eventRegLog;
  PetscEventPerfLog eventPerfLog = NULL;
  Action            *tmpAction;
  PetscLogDouble    start, end;
  PetscLogDouble    curTime;
  int               stage;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  /* Dynamically enlarge logging structures (doubling growth) */
  if (petsc_numActions >= petsc_maxActions) {
    PetscTime(&start);
    ierr = PetscMalloc(petsc_maxActions*2 * sizeof(Action), &tmpAction);CHKERRQ(ierr);
    ierr = PetscMemcpy(tmpAction, petsc_actions, petsc_maxActions * sizeof(Action));CHKERRQ(ierr);
    ierr = PetscFree(petsc_actions);CHKERRQ(ierr);
    petsc_actions     = tmpAction;
    petsc_maxActions *= 2;
    PetscTime(&end);
    /* Charge the reallocation to overhead, not to the application */
    petsc_BaseTime += (end - start);
  }
  /* Record the event */
  ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
  ierr = PetscStageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr);
  ierr = PetscStageLogGetEventRegLog(stageLog, &eventRegLog);CHKERRQ(ierr);
  ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventPerfLog);CHKERRQ(ierr);
  PetscTime(&curTime);
  if (petsc_logActions) {
    petsc_actions[petsc_numActions].time    = curTime - petsc_BaseTime;
    petsc_actions[petsc_numActions].action  = ACTIONEND;
    petsc_actions[petsc_numActions].event   = event;
    petsc_actions[petsc_numActions].classid = eventRegLog->eventInfo[event].classid;
    /* Record the ids of up to three associated objects; -1 marks "none" */
    if (o1) petsc_actions[petsc_numActions].id1 = o1->id; else petsc_actions[petsc_numActions].id1 = -1;
    if (o2) petsc_actions[petsc_numActions].id2 = o2->id; else petsc_actions[petsc_numActions].id2 = -1;
    if (o3) petsc_actions[petsc_numActions].id3 = o3->id; else petsc_actions[petsc_numActions].id3 = -1;
    petsc_actions[petsc_numActions].flops = petsc_TotalFlops;
    ierr = PetscMallocGetCurrentUsage(&petsc_actions[petsc_numActions].mem);CHKERRQ(ierr);
    ierr = PetscMallocGetMaximumUsage(&petsc_actions[petsc_numActions].maxmem);CHKERRQ(ierr);
    petsc_numActions++;
  }
  /* Check for double counting: only the outermost end accumulates */
  eventPerfLog->eventInfo[event].depth--;
  if (eventPerfLog->eventInfo[event].depth > 0) PetscFunctionReturn(0);
  else if (eventPerfLog->eventInfo[event].depth < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Logging event had unbalanced begin/end pairs");
  /* Log the performance info: the matching Begin recorded negative baselines,
     so adding the current totals yields this event's deltas */
  eventPerfLog->eventInfo[event].count++;
  eventPerfLog->eventInfo[event].time += curTime;
  eventPerfLog->eventInfo[event].flops += petsc_TotalFlops;
  eventPerfLog->eventInfo[event].numMessages += petsc_irecv_ct + petsc_isend_ct + petsc_recv_ct + petsc_send_ct;
  eventPerfLog->eventInfo[event].messageLength += petsc_irecv_len + petsc_isend_len + petsc_recv_len + petsc_send_len;
  eventPerfLog->eventInfo[event].numReductions += petsc_allreduce_ct + petsc_gather_ct + petsc_scatter_ct;
  PetscFunctionReturn(0);
}
/*
  PetscLogEventBeginDefault - Default event-begin handler (new-style API).

  Records negative baselines for time, flops, MPI traffic and reductions in the
  current stage's perf log; the matching End handler adds the current totals back
  so the difference is attributed to this event. Only the outermost begin of a
  nested event (depth 0 -> 1) records anything.
*/
PetscErrorCode PetscLogEventBeginDefault(PetscLogEvent event,int t,PetscObject o1,PetscObject o2,PetscObject o3,PetscObject o4)
{
  PetscStageLog      stageLog;
  PetscEventPerfLog  eventLog = NULL;
  PetscEventPerfInfo *info;
  int                curStage;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
  ierr = PetscStageLogGetCurrent(stageLog,&curStage);CHKERRQ(ierr);
  ierr = PetscStageLogGetEventPerfLog(stageLog,curStage,&eventLog);CHKERRQ(ierr);
  info = &eventLog->eventInfo[event];
  /* Nested begins of the same event are counted only once */
  if (++info->depth > 1) PetscFunctionReturn(0);
  /* Record negative baselines; PetscLogEventEndDefault adds the totals back */
  info->count++;
  info->timeTmp = 0.0;
  PetscTimeSubtract(&info->timeTmp);
  info->flopsTmp       = -petsc_TotalFlops;
  info->numMessages   -= petsc_irecv_ct + petsc_isend_ct + petsc_recv_ct + petsc_send_ct;
  info->messageLength -= petsc_irecv_len + petsc_isend_len + petsc_recv_len + petsc_send_len;
  info->numReductions -= petsc_allreduce_ct + petsc_gather_ct + petsc_scatter_ct;
  PetscFunctionReturn(0);
}
/*
  PetscLogEventEndDefault - Default event-end handler (new-style API).

  Closes out the baselines recorded by PetscLogEventBeginDefault: adds the current
  time/flops/traffic totals so the per-event deltas accumulate into the current
  stage's perf log; also accumulates squared time/flops for variance reporting.
  Errors if End is called more often than Begin.
*/
PetscErrorCode PetscLogEventEndDefault(PetscLogEvent event,int t,PetscObject o1,PetscObject o2,PetscObject o3,PetscObject o4)
{
  PetscStageLog      stageLog;
  PetscEventPerfLog  eventLog = NULL;
  PetscEventPerfInfo *info;
  int                curStage;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
  ierr = PetscStageLogGetCurrent(stageLog,&curStage);CHKERRQ(ierr);
  ierr = PetscStageLogGetEventPerfLog(stageLog,curStage,&eventLog);CHKERRQ(ierr);
  info = &eventLog->eventInfo[event];
  /* Only the outermost end accumulates; negative depth means unbalanced calls */
  info->depth--;
  if (info->depth > 0) PetscFunctionReturn(0);
  else if (info->depth < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Logging event had unbalanced begin/end pairs");
  /* Convert the negative baselines into deltas and accumulate them */
  PetscTimeAdd(&info->timeTmp);
  info->time  += info->timeTmp;
  info->time2 += info->timeTmp*info->timeTmp;
  info->flopsTmp += petsc_TotalFlops;
  info->flops  += info->flopsTmp;
  info->flops2 += info->flopsTmp*info->flopsTmp;
  info->numMessages   += petsc_irecv_ct + petsc_isend_ct + petsc_recv_ct + petsc_send_ct;
  info->messageLength += petsc_irecv_len + petsc_isend_len + petsc_recv_len + petsc_send_len;
  info->numReductions += petsc_allreduce_ct + petsc_gather_ct + petsc_scatter_ct;
  PetscFunctionReturn(0);
}
/*
  PetscLogEventEndComplete - Event-end handler for "complete" logging (older
  StageLog API variant using the unprefixed globals: numActions, actions,
  BaseTime, logActions, _TotalFlops, etc.).

  Appends an ACTIONEND entry to the action trace (doubling the array when full,
  with the reallocation time charged to BaseTime) and accumulates the event's
  performance totals into the current stage's perf log.
*/
PetscErrorCode PetscLogEventEndComplete(PetscLogEvent event, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
{
  StageLog       stageLog;
  EventRegLog    eventRegLog;
  EventPerfLog   eventPerfLog;
  Action         *tmpAction;
  PetscLogDouble start, end;
  PetscLogDouble curTime;
  int            stage;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Dynamically enlarge logging structures (doubling growth) */
  if (numActions >= maxActions) {
    PetscTime(start);
    ierr = PetscMalloc(maxActions*2 * sizeof(Action), &tmpAction);CHKERRQ(ierr);
    ierr = PetscMemcpy(tmpAction, actions, maxActions * sizeof(Action));CHKERRQ(ierr);
    ierr = PetscFree(actions);CHKERRQ(ierr);
    actions = tmpAction;
    maxActions *= 2;
    PetscTime(end);
    /* Exclude the reallocation cost from reported run time */
    BaseTime += (end - start);
  }
  /* Record the event */
  ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
  ierr = StageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr);
  ierr = StageLogGetEventRegLog(stageLog, &eventRegLog);CHKERRQ(ierr);
  ierr = StageLogGetEventPerfLog(stageLog, stage, &eventPerfLog);CHKERRQ(ierr);
  PetscTime(curTime);
  if (logActions) {
    actions[numActions].time   = curTime - BaseTime;
    actions[numActions].action = ACTIONEND;
    actions[numActions].event  = event;
    actions[numActions].cookie = eventRegLog->eventInfo[event].cookie;
    /* Ids of up to three associated objects; -1 marks "none" */
    if (o1) actions[numActions].id1 = o1->id; else actions[numActions].id1 = -1;
    if (o2) actions[numActions].id2 = o2->id; else actions[numActions].id2 = -1;
    if (o3) actions[numActions].id3 = o3->id; else actions[numActions].id3 = -1;
    actions[numActions].flops = _TotalFlops;
    ierr = PetscMallocGetCurrentUsage(&actions[numActions].mem);CHKERRQ(ierr);
    ierr = PetscMallocGetMaximumUsage(&actions[numActions].maxmem);CHKERRQ(ierr);
    numActions++;
  }
  /* Check for double counting: only the outermost end accumulates */
  eventPerfLog->eventInfo[event].depth--;
  if (eventPerfLog->eventInfo[event].depth > 0) {
    PetscFunctionReturn(0);
  } else if (eventPerfLog->eventInfo[event].depth < 0) {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Logging event had unbalanced begin/end pairs");
  }
  /* Log the performance info (Begin stored negative baselines) */
  eventPerfLog->eventInfo[event].count++;
  eventPerfLog->eventInfo[event].time += curTime;
  eventPerfLog->eventInfo[event].flops += _TotalFlops;
  eventPerfLog->eventInfo[event].numMessages += irecv_ct + isend_ct + recv_ct + send_ct;
  eventPerfLog->eventInfo[event].messageLength += irecv_len + isend_len + recv_len + send_len;
  eventPerfLog->eventInfo[event].numReductions += allreduce_ct;
  PetscFunctionReturn(0);
}
/*
  PetscLogEventGetFlops - Return the flop count accumulated for `event` in the
  currently active stage.

  Input Parameter:
. event - the event id

  Output Parameter:
. flops - the accumulated flop count

  Fix: added the same PetscLogPLB guard used by the other PetscLogEventGetFlops
  variant in this file — without active logging the stage/perf log is not
  populated and reading it is meaningless (and the current stage may be invalid).
*/
PetscErrorCode PetscLogEventGetFlops(PetscLogEvent event, PetscLogDouble *flops)
{
  PetscStageLog     stageLog;
  PetscEventPerfLog eventLog = NULL;
  int               stage;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  if (!PetscLogPLB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Must use -log_summary or PetscLogDefaultBegin() before calling this routine");
  ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
  ierr = PetscStageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr);
  ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);CHKERRQ(ierr);
  *flops = eventLog->eventInfo[event].flops;
  PetscFunctionReturn(0);
}
/*
  TestVecClosure - Benchmark DMPlexVecGetClosure over all cells of the mesh and
  check call count, flop count, and average time against user->maxVecClosureTime.

  Fixes: removed a stray double semicolon after `closureSize = 64`, and added the
  missing CHKERRQ checks on PetscLogGetStageLog, PetscStageLogGetEventPerfLog and
  PetscPrintf (every other PETSc call in this function checks its return code).
*/
PetscErrorCode TestVecClosure(DM dm, AppCtx *user)
{
  PetscSection       s;
  Vec                v;
  PetscInt           numRuns, cStart, cEnd, c, i;
  PetscScalar        tmpArray[64];
  PetscScalar        *userArray     = user->reuseArray ? tmpArray : NULL;
  PetscReal          maxTimePerRun  = user->maxVecClosureTime;
  PetscStageLog      stageLog;
  PetscEventPerfLog  eventLog;
  PetscInt           stage;
  PetscLogEvent      event;
  PetscEventPerfInfo eventInfo;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscLogStageRegister("DMPlex Vector Closure Test", &stage);CHKERRQ(ierr);
  ierr = PetscLogEventRegister("VecClosure", PETSC_OBJECT_CLASSID, &event);CHKERRQ(ierr);
  ierr = PetscLogStagePush(stage);CHKERRQ(ierr);
  ierr = DMPlexCreateSection(dm, user->dim, user->numFields, user->numComponents, user->numDof, 0, NULL, NULL, &s);CHKERRQ(ierr);
  ierr = DMSetDefaultSection(dm, s);CHKERRQ(ierr);
  /* NOTE(review): destroy nulls out s, so the closure calls below pass a NULL
     section and the DM's default section is used — confirm this is intended */
  ierr = PetscSectionDestroy(&s);CHKERRQ(ierr);
  ierr = DMPlexGetHeightStratum(dm, 0, &cStart, &cEnd);CHKERRQ(ierr);
  ierr = DMGetLocalVector(dm, &v);CHKERRQ(ierr);
  ierr = PetscLogEventBegin(event,0,0,0,0);CHKERRQ(ierr);
  for (i = 0; i < user->iterations; ++i) {
    for (c = cStart; c < cEnd; ++c) {
      PetscScalar *closure     = userArray;
      PetscInt     closureSize = 64;
      ierr = DMPlexVecGetClosure(dm, s, v, c, &closureSize, &closure);CHKERRQ(ierr);
      /* When reusing the stack buffer there is nothing to restore */
      if (!user->reuseArray) {ierr = DMPlexVecRestoreClosure(dm, s, v, c, &closureSize, &closure);CHKERRQ(ierr);}
    }
  }
  ierr = PetscLogEventEnd(event,0,0,0,0);CHKERRQ(ierr);
  ierr = DMRestoreLocalVector(dm, &v);CHKERRQ(ierr);
  ierr = PetscLogStagePop();CHKERRQ(ierr);
  ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
  ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);CHKERRQ(ierr);
  numRuns   = (cEnd-cStart) * user->iterations;
  eventInfo = eventLog->eventInfo[event];
  if (eventInfo.count != 1) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of event calls %d should be %d", eventInfo.count, 1);
  if ((PetscInt) eventInfo.flops != 0) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of event flops %d should be %d", (PetscInt) eventInfo.flops, 0);
  if (eventInfo.time > maxTimePerRun * numRuns) {
    ierr = PetscPrintf(PETSC_COMM_SELF, "VecClosures: %d Average time per cone: %gs standard: %gs\n", numRuns, eventInfo.time/numRuns, maxTimePerRun);CHKERRQ(ierr);
    if (user->errors) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Average time for vector closure %g > standard %g", eventInfo.time/numRuns, maxTimePerRun);
  }
  PetscFunctionReturn(0);
}
/*
  PetscLogEventZeroFlops - Reset the flop count accumulated for `event` in the
  currently active stage back to zero.
*/
PetscErrorCode PetscLogEventZeroFlops(PetscLogEvent event)
{
  StageLog       log;
  EventPerfLog   perf;
  int            current;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Resolve the perf log of the active stage, then clear the counter */
  ierr = PetscLogGetStageLog(&log);CHKERRQ(ierr);
  ierr = StageLogGetCurrent(log, &current);CHKERRQ(ierr);
  ierr = StageLogGetEventPerfLog(log, current, &perf);CHKERRQ(ierr);
  perf->eventInfo[event].flops = 0.0;
  PetscFunctionReturn(0);
}
/*
  PetscLogEventGetFlops - Return the flop count accumulated for `event` in the
  currently active stage. Requires logging to have been started (PetscLogPLB set),
  otherwise the perf log holds no data and an error is raised.
*/
PetscErrorCode PetscLogEventGetFlops(PetscLogEvent event,PetscLogDouble *flops)
{
  PetscStageLog     log;
  PetscEventPerfLog perf = NULL;
  int               current;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  if (!PetscLogPLB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Must use -log_summary or PetscLogDefaultBegin() before calling this routine");
  ierr = PetscLogGetStageLog(&log);CHKERRQ(ierr);
  ierr = PetscStageLogGetCurrent(log,&current);CHKERRQ(ierr);
  ierr = PetscStageLogGetEventPerfLog(log,current,&perf);CHKERRQ(ierr);
  *flops = perf->eventInfo[event].flops;
  PetscFunctionReturn(0);
}
/*@C
  PetscLogEventGetPerfInfo - Fetch the performance statistics recorded for an event
  within a particular stage

  Input Parameters:
+ stage - The stage number, or PETSC_DETERMINE (any negative value) to use the
          currently active stage
- event - The event number

  Output Parameter:
. info  - Filled with a copy of the accumulated performance data for the event

  Notes:
  Logging must have been started (e.g. via -log_summary or PetscLogDefaultBegin()),
  otherwise an error is raised.

  Level: Intermediate

.seealso: PetscLogEventGetFlops()
@*/
PetscErrorCode PetscLogEventGetPerfInfo(int stage,PetscLogEvent event,PetscEventPerfInfo *info)
{
  PetscStageLog     log;
  PetscEventPerfLog perf = NULL;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  PetscValidPointer(info,3);
  if (!PetscLogPLB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Must use -log_summary or PetscLogDefaultBegin() before calling this routine");
  ierr = PetscLogGetStageLog(&log);CHKERRQ(ierr);
  /* Negative stage means "use whatever stage is currently active" */
  if (stage < 0) {ierr = PetscStageLogGetCurrent(log,&stage);CHKERRQ(ierr);}
  ierr = PetscStageLogGetEventPerfLog(log,stage,&perf);CHKERRQ(ierr);
  *info = perf->eventInfo[event];
  PetscFunctionReturn(0);
}
/*
  TestTransitiveClosure - Benchmark DMPlexGetTransitiveClosure over all cells and
  check call count, flop count, and average time against user->maxClosureTime.

  Fixes: added the missing CHKERRQ checks on PetscLogGetStageLog,
  PetscStageLogGetEventPerfLog and PetscPrintf — every other PETSc call in this
  function checks its return code.
*/
PetscErrorCode TestTransitiveClosure(DM dm, AppCtx *user)
{
  PetscInt           numRuns, cStart, cEnd, c, i;
  PetscReal          maxTimePerRun = user->maxClosureTime;
  PetscStageLog      stageLog;
  PetscEventPerfLog  eventLog;
  PetscInt           stage;
  PetscLogEvent      event;
  PetscEventPerfInfo eventInfo;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscLogStageRegister("DMPlex Transitive Closure Test", &stage);CHKERRQ(ierr);
  ierr = PetscLogEventRegister("TransitiveClosure", PETSC_OBJECT_CLASSID, &event);CHKERRQ(ierr);
  ierr = PetscLogStagePush(stage);CHKERRQ(ierr);
  ierr = DMPlexGetHeightStratum(dm, 0, &cStart, &cEnd);CHKERRQ(ierr);
  ierr = PetscLogEventBegin(event,0,0,0,0);CHKERRQ(ierr);
  for (i = 0; i < user->iterations; ++i) {
    for (c = cStart; c < cEnd; ++c) {
      PetscInt *closure = NULL;
      PetscInt  closureSize;
      ierr = DMPlexGetTransitiveClosure(dm, c, PETSC_TRUE, &closureSize, &closure);CHKERRQ(ierr);
      ierr = DMPlexRestoreTransitiveClosure(dm, c, PETSC_TRUE, &closureSize, &closure);CHKERRQ(ierr);
    }
  }
  ierr = PetscLogEventEnd(event,0,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogStagePop();CHKERRQ(ierr);
  ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
  ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);CHKERRQ(ierr);
  numRuns   = (cEnd-cStart) * user->iterations;
  eventInfo = eventLog->eventInfo[event];
  if (eventInfo.count != 1) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of event calls %d should be %d", eventInfo.count, 1);
  if ((PetscInt) eventInfo.flops != 0) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of event flops %d should be %d", (PetscInt) eventInfo.flops, 0);
  if (eventInfo.time > maxTimePerRun * numRuns) {
    ierr = PetscPrintf(PETSC_COMM_SELF, "Closures: %d Average time per cone: %gs standard: %gs\n", numRuns, eventInfo.time/numRuns, maxTimePerRun);CHKERRQ(ierr);
    if (user->errors) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Average time for closure %g > standard %g", eventInfo.time/numRuns, maxTimePerRun);
  }
  PetscFunctionReturn(0);
}
/*
  PetscLogEventEndDefault - Default event-end handler, hardware-counter variant:
  flops come from CHUD or PAPI counters when configured, otherwise from the
  software counter petsc_TotalFlops.

  Converts the negative baselines recorded by the matching Begin into per-event
  deltas and accumulates them (including squared time/flops for variance) into
  the current stage's perf log. Errors on unbalanced Begin/End pairs.
*/
PetscErrorCode PetscLogEventEndDefault(PetscLogEvent event, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
{
  PetscStageLog     stageLog;
  PetscEventPerfLog eventLog = NULL;
  int               stage;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
  ierr = PetscStageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr);
  ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);CHKERRQ(ierr);
  /* Check for double counting: only the outermost end accumulates */
  eventLog->eventInfo[event].depth--;
  if (eventLog->eventInfo[event].depth > 0) PetscFunctionReturn(0);
  else if (eventLog->eventInfo[event].depth < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Logging event had unbalanced begin/end pairs");
  /* Log performance info: timeTmp held -startTime, so adding now yields the delta */
  PetscTimeAdd(&eventLog->eventInfo[event].timeTmp);
  eventLog->eventInfo[event].time += eventLog->eventInfo[event].timeTmp;
  eventLog->eventInfo[event].time2 += eventLog->eventInfo[event].timeTmp*eventLog->eventInfo[event].timeTmp;
#if defined(PETSC_HAVE_CHUD)
  /* Hardware flop counter via Apple CHUD performance-monitor counters */
  eventLog->eventInfo[event].flopsTmp += chudGetPMCEventCount(chudCPU1Dev,PMC_1);
#elif defined(PETSC_HAVE_PAPI)
  {
    long_long values[2];
    ierr = PAPI_read(PAPIEventSet,values);CHKERRQ(ierr);
    eventLog->eventInfo[event].flopsTmp += values[0];
    /* printf("fma %g flops %g\n",(double)values[1],(double)values[0]); */
  }
#else
  /* Software-counted flops maintained by PetscLogFlops() */
  eventLog->eventInfo[event].flopsTmp += petsc_TotalFlops;
#endif
  eventLog->eventInfo[event].flops += eventLog->eventInfo[event].flopsTmp;
  eventLog->eventInfo[event].flops2 += eventLog->eventInfo[event].flopsTmp*eventLog->eventInfo[event].flopsTmp;
  eventLog->eventInfo[event].numMessages += petsc_irecv_ct + petsc_isend_ct + petsc_recv_ct + petsc_send_ct;
  eventLog->eventInfo[event].messageLength += petsc_irecv_len + petsc_isend_len + petsc_recv_len + petsc_send_len;
  eventLog->eventInfo[event].numReductions += petsc_allreduce_ct + petsc_gather_ct + petsc_scatter_ct;
  PetscFunctionReturn(0);
}
/*
  PetscLogEventBeginDefault - Default event-begin handler, PAPI-aware variant:
  the flop baseline comes from a PAPI hardware counter when configured, otherwise
  from the software counter petsc_TotalFlops.

  Records negative baselines (time, flops, MPI traffic, reductions) that the
  matching End handler converts into per-event deltas. Nested begins of the same
  event are counted only once.
*/
PetscErrorCode PetscLogEventBeginDefault(PetscLogEvent event, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
{
  PetscStageLog     stageLog;
  PetscEventPerfLog eventLog = NULL;
  int               stage;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscLogGetStageLog(&stageLog);
  CHKERRQ(ierr);
  ierr = PetscStageLogGetCurrent(stageLog, &stage);
  CHKERRQ(ierr);
  ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);
  CHKERRQ(ierr);
  /* Check for double counting */
  eventLog->eventInfo[event].depth++;
  if (eventLog->eventInfo[event].depth > 1) PetscFunctionReturn(0);
  /* Log performance info */
  eventLog->eventInfo[event].count++;
  eventLog->eventInfo[event].timeTmp = 0.0;
  PetscTimeSubtract(&eventLog->eventInfo[event].timeTmp);
  eventLog->eventInfo[event].flopsTmp = 0.0;
#if defined(PETSC_HAVE_PAPI)
  {
    long_long values[2];
    ierr = PAPI_read(PAPIEventSet,values);
    CHKERRQ(ierr);
    eventLog->eventInfo[event].flopsTmp -= values[0];
    /* printf("fma %g flops %g\n",(double)values[1],(double)values[0]); */
  }
#else
  /* Software-counted flops maintained by PetscLogFlops() */
  eventLog->eventInfo[event].flopsTmp -= petsc_TotalFlops;
#endif
  /* Negative MPI-traffic baselines; End adds the totals back */
  eventLog->eventInfo[event].numMessages -= petsc_irecv_ct + petsc_isend_ct + petsc_recv_ct + petsc_send_ct;
  eventLog->eventInfo[event].messageLength -= petsc_irecv_len + petsc_isend_len + petsc_recv_len + petsc_send_len;
  eventLog->eventInfo[event].numReductions -= petsc_allreduce_ct + petsc_gather_ct + petsc_scatter_ct;
  PetscFunctionReturn(0);
}
/*
  PCSetFromOptions_MG - Process the options database for the multigrid (PCMG)
  preconditioner: number of levels, cycle type, Galerkin coarsening, smoothing
  steps, MG type, multiplicative cycles, and optional per-level event logging.

  Fix: removed a stray second `PetscFunctionBegin;` inside the nested
  PETSC_USE_LOG block — PetscFunctionBegin must appear exactly once at function
  entry; a second occurrence mid-function corrupts the PETSc debug call stack.
*/
PetscErrorCode PCSetFromOptions_MG(PetscOptionItems *PetscOptionsObject,PC pc)
{
  PetscErrorCode ierr;
  PetscInt       m,levels = 1,cycles;
  PetscBool      flg,set;
  PC_MG          *mg = (PC_MG*)pc->data;
  PC_MG_Levels   **mglevels;
  PCMGType       mgtype;
  PCMGCycleType  mgctype;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"Multigrid options");CHKERRQ(ierr);
  if (!mg->levels) {
    /* Level count: explicit option wins; otherwise derive it from the DM's refinement level */
    ierr = PetscOptionsInt("-pc_mg_levels","Number of Levels","PCMGSetLevels",levels,&levels,&flg);CHKERRQ(ierr);
    if (!flg && pc->dm) {
      ierr = DMGetRefineLevel(pc->dm,&levels);CHKERRQ(ierr);
      levels++;
      mg->usedmfornumberoflevels = PETSC_TRUE;
    }
    ierr = PCMGSetLevels(pc,levels,NULL);CHKERRQ(ierr);
  }
  mglevels = mg->levels;
  mgctype  = (PCMGCycleType) mglevels[0]->cycles;
  ierr = PetscOptionsEnum("-pc_mg_cycle_type","V cycle or for W-cycle","PCMGSetCycleType",PCMGCycleTypes,(PetscEnum)mgctype,(PetscEnum*)&mgctype,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = PCMGSetCycleType(pc,mgctype);CHKERRQ(ierr);
  }
  flg = PETSC_FALSE;
  ierr = PetscOptionsBool("-pc_mg_galerkin","Use Galerkin process to compute coarser operators","PCMGSetGalerkin",flg,&flg,&set);CHKERRQ(ierr);
  if (set) {
    ierr = PCMGSetGalerkin(pc,flg);CHKERRQ(ierr);
  }
  ierr = PetscOptionsInt("-pc_mg_smoothup","Number of post-smoothing steps","PCMGSetNumberSmoothUp",mg->default_smoothu,&m,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = PCMGSetNumberSmoothUp(pc,m);CHKERRQ(ierr);
  }
  ierr = PetscOptionsInt("-pc_mg_smoothdown","Number of pre-smoothing steps","PCMGSetNumberSmoothDown",mg->default_smoothd,&m,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = PCMGSetNumberSmoothDown(pc,m);CHKERRQ(ierr);
  }
  mgtype = mg->am;
  ierr = PetscOptionsEnum("-pc_mg_type","Multigrid type","PCMGSetType",PCMGTypes,(PetscEnum)mgtype,(PetscEnum*)&mgtype,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = PCMGSetType(pc,mgtype);CHKERRQ(ierr);
  }
  if (mg->am == PC_MG_MULTIPLICATIVE) {
    ierr = PetscOptionsInt("-pc_mg_multiplicative_cycles","Number of cycles for each preconditioner step","PCMGMultiplicativeSetCycles",mg->cyclesperpcapply,&cycles,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = PCMGMultiplicativeSetCycles(pc,cycles);CHKERRQ(ierr);
    }
  }
  flg = PETSC_FALSE;
  ierr = PetscOptionsBool("-pc_mg_log","Log times for each multigrid level","None",flg,&flg,NULL);CHKERRQ(ierr);
  if (flg) {
    PetscInt i;
    char     eventname[128];

    if (!mglevels) SETERRQ(PetscObjectComm((PetscObject)pc),PETSC_ERR_ARG_WRONGSTATE,"Must set MG levels before calling");
    levels = mglevels[0]->levels;
    /* Register per-level setup/smooth events; residual/interp only exist above the coarsest level */
    for (i=0; i<levels; i++) {
      sprintf(eventname,"MGSetup Level %d",(int)i);
      ierr = PetscLogEventRegister(eventname,((PetscObject)pc)->classid,&mglevels[i]->eventsmoothsetup);CHKERRQ(ierr);
      sprintf(eventname,"MGSmooth Level %d",(int)i);
      ierr = PetscLogEventRegister(eventname,((PetscObject)pc)->classid,&mglevels[i]->eventsmoothsolve);CHKERRQ(ierr);
      if (i) {
        sprintf(eventname,"MGResid Level %d",(int)i);
        ierr = PetscLogEventRegister(eventname,((PetscObject)pc)->classid,&mglevels[i]->eventresidual);CHKERRQ(ierr);
        sprintf(eventname,"MGInterp Level %d",(int)i);
        ierr = PetscLogEventRegister(eventname,((PetscObject)pc)->classid,&mglevels[i]->eventinterprestrict);CHKERRQ(ierr);
      }
    }
#if defined(PETSC_USE_LOG)
    {
      const char    *sname = "MG Apply";
      PetscStageLog stageLog;
      PetscInt      st;

      /* Reuse the "MG Apply" stage if it was already registered (e.g. by another PCMG) */
      ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
      for (st = 0; st < stageLog->numStages; ++st) {
        PetscBool same;

        ierr = PetscStrcmp(stageLog->stageInfo[st].name, sname, &same);CHKERRQ(ierr);
        if (same) mg->stageApply = st;
      }
      if (!mg->stageApply) {
        ierr = PetscLogStageRegister(sname, &mg->stageApply);CHKERRQ(ierr);
      }
    }
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*
  PetscLogView_VecScatter - Print a short performance report focused on
  VecScatterBegin/End: machine/run metadata followed by, for each scatter event,
  the average time, its min/max spread across ranks, and its share of KSP_Solve
  time. Reads performance data hard-wired from stage 2.

  NOTE(review): the MPI_Allreduce calls use PETSC_COMM_WORLD while size/rank are
  taken from the viewer's communicator `comm` — fine when they coincide, but
  confirm the viewer is always on PETSC_COMM_WORLD.
*/
PetscErrorCode PetscLogView_VecScatter(PetscViewer viewer)
{
  MPI_Comm           comm = PetscObjectComm((PetscObject) viewer);
  PetscEventPerfInfo *eventInfo = NULL;
  PetscLogDouble     locTotalTime,stats[6],maxstats[6],minstats[6],sumstats[6],avetime,ksptime;
  PetscStageLog      stageLog;
  const int          stage = 2;
  int                event,events[] = {VEC_ScatterBegin,VEC_ScatterEnd};
  PetscMPIInt        rank,size;
  PetscErrorCode     ierr;
  PetscInt           i;
  char               arch[128],hostname[128],username[128],pname[PETSC_MAX_PATH_LEN],date[128],version[256];

  PetscFunctionBegin;
  /* Wall-clock time since logging began */
  PetscTime(&locTotalTime);
  locTotalTime -= petsc_BaseTime;
  ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
  ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
  /* Header: run environment and build information */
  ierr = PetscViewerASCIIPrintf(viewer,"numProcs = %d\n",size);CHKERRQ(ierr);
  ierr = PetscGetArchType(arch,sizeof(arch));CHKERRQ(ierr);
  ierr = PetscGetHostName(hostname,sizeof(hostname));CHKERRQ(ierr);
  ierr = PetscGetUserName(username,sizeof(username));CHKERRQ(ierr);
  ierr = PetscGetProgramName(pname,sizeof(pname));CHKERRQ(ierr);
  ierr = PetscGetDate(date,sizeof(date));CHKERRQ(ierr);
  ierr = PetscGetVersion(version,sizeof(version));CHKERRQ(ierr);
  ierr = PetscViewerASCIIPrintf(viewer,"%s on a %s named %s with %d processors, by %s %s\n", pname, arch, hostname, size, username, date);CHKERRQ(ierr);
  ierr = PetscViewerASCIIPrintf(viewer, "Using %s\n", version);CHKERRQ(ierr);
  ierr = PetscViewerASCIIPrintf(viewer, "Configure options: %s",petscconfigureoptions);CHKERRQ(ierr);
  ierr = PetscViewerASCIIPrintf(viewer, "%s", petscmachineinfo);CHKERRQ(ierr);
  ierr = PetscViewerASCIIPrintf(viewer, "%s", petsccompilerinfo);CHKERRQ(ierr);
  ierr = PetscViewerASCIIPrintf(viewer, "%s", petsccompilerflagsinfo);CHKERRQ(ierr);
  ierr = PetscViewerASCIIPrintf(viewer, "%s", petsclinkerinfo);CHKERRQ(ierr);
  ierr = PetscViewerASCIIPrintf(viewer, "%s\n", PETSC_MPICC_SHOW);CHKERRQ(ierr);
  ierr = PetscOptionsView(NULL,viewer);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HWLOC)
  ierr = PetscProcessPlacementView(viewer);CHKERRQ(ierr);
#endif
  ierr = PetscViewerASCIIPrintf(viewer, "----------------------------------------------------\n");CHKERRQ(ierr);
  ierr = PetscViewerASCIIPrintf(viewer," Time Min to Max Range Proportion of KSP\n");CHKERRQ(ierr);
  /* Per-event data for the hard-wired stage; KSP_Solve time is the normalizer */
  eventInfo = stageLog->stageInfo[stage].eventLog->eventInfo;
  ierr = MPI_Allreduce(&eventInfo[KSP_Solve].time,&ksptime,1,MPIU_PETSCLOGDOUBLE,MPI_SUM,PETSC_COMM_WORLD);CHKERRQ(ierr);
  ksptime = ksptime/size;
  for (i=0; i<(int)(sizeof(events)/sizeof(int)); i++) {
    event = events[i];
    /* Pack this rank's counters, then reduce to max/min/sum across ranks */
    stats[COUNT]   = eventInfo[event].count;
    stats[TIME]    = eventInfo[event].time;
    stats[NUMMESS] = eventInfo[event].numMessages;
    stats[MESSLEN] = eventInfo[event].messageLength;
    stats[REDUCT]  = eventInfo[event].numReductions;
    stats[FLOPS]   = eventInfo[event].flops;
    ierr = MPI_Allreduce(stats,maxstats,6,MPIU_PETSCLOGDOUBLE,MPI_MAX,PETSC_COMM_WORLD);CHKERRQ(ierr);
    ierr = MPI_Allreduce(stats,minstats,6,MPIU_PETSCLOGDOUBLE,MPI_MIN,PETSC_COMM_WORLD);CHKERRQ(ierr);
    ierr = MPI_Allreduce(stats,sumstats,6,MPIU_PETSCLOGDOUBLE,MPI_SUM,PETSC_COMM_WORLD);CHKERRQ(ierr);
    /* avetime uses slot 1 (TIME); percentages show spread below/above the mean */
    avetime = sumstats[1]/size;
    ierr = PetscViewerASCIIPrintf(viewer,"%s %4.2e -%5.1f %% %5.1f %% %4.2e %%\n",stageLog->eventLog->eventInfo[event].name, avetime,100.*(avetime-minstats[1])/avetime,100.*(maxstats[1]-avetime)/avetime,100.*avetime/ksptime);CHKERRQ(ierr);
  }
  ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*
  PetscLogPrintSummaryToPy - Prints the PETSc log summary as a Python-readable
  script to the FILE underlying an ASCII viewer.

  Collective on comm.

  Input Parameters:
+ comm   - The communicator over which statistics are aggregated; rank 0 does all printing
- viewer - An ASCII viewer whose FILE pointer receives the output

  Notes:
  Pops any stages the user forgot to pop, gathers one value per rank for each
  global statistic (time, objects, flops, memory, messages, message lengths,
  reductions) to rank 0 and prints each as a Python list, then emits per-stage
  summaries and per-event Count/Time/Flops lists.  The last popped stage is
  pushed back before returning so logging can continue.
*/
PetscErrorCode PetscLogPrintSummaryToPy(MPI_Comm comm, PetscViewer viewer) {
  PetscViewer_ASCII *ascii = (PetscViewer_ASCII*)viewer->data;
  FILE              *fd    = ascii->fd;
  PetscLogDouble    zero   = 0.0;          /* contributed to SUM reductions by ranks with no data */
  StageLog          stageLog;
  StageInfo         *stageInfo = PETSC_NULL;
  EventPerfInfo     *eventInfo = PETSC_NULL;
  ClassPerfInfo     *classInfo;
  const char        *name;
  /* The global totals below are only meaningfully assigned on rank 0 (inside
     the !rank blocks); they are zero-initialized so the fraction computations
     executed on every rank do not read uninitialized memory. */
  PetscLogDouble    locTotalTime, TotalTime = 0.0, TotalFlops = 0.0;
  PetscLogDouble    numMessages = 0.0, messageLength = 0.0, avgMessLen, numReductions = 0.0;
  PetscLogDouble    stageTime, flops, mem, mess, messLen, red;
  PetscLogDouble    fracTime, fracFlops, fracMessages, fracLength;
  PetscLogDouble    fracReductions;
  PetscLogDouble    tot,avg,x,y,*mydata;
  PetscMPIInt       maxCt;
  PetscMPIInt       zeroCt = 0;            /* reduced in place of an event count on ranks lacking the event */
  PetscMPIInt       size, rank, *mycount;
  PetscTruth        *localStageUsed, *stageUsed;
  PetscTruth        *localStageVisible, *stageVisible;
  int               numStages, localNumEvents, numEvents;
  int               stage, lastStage;
  PetscLogEvent     event;
  PetscErrorCode    ierr;
  PetscInt          i;
  /* remove these two lines! */
  PetscLogDouble    PETSC_DLLEXPORT BaseTime = 0.0;
  int               numObjects = 0;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
  /* Scratch buffers for MPI_Gather of one value per rank (only read on rank 0) */
  ierr = PetscMalloc(size*sizeof(PetscLogDouble), &mydata);CHKERRQ(ierr);
  ierr = PetscMalloc(size*sizeof(PetscMPIInt), &mycount);CHKERRQ(ierr);

  /* Pop off any stages the user forgot to remove */
  lastStage = 0;
  ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
  ierr = StageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr);
  while (stage >= 0) {
    lastStage = stage;
    ierr = StageLogPop(stageLog);CHKERRQ(ierr);
    ierr = StageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr);
  }
  /* Get the total elapsed time */
  PetscTime(locTotalTime);
  locTotalTime -= BaseTime;

  ierr = PetscFPrintf(comm, fd, "\n#------ PETSc Performance Summary ----------\n\n");CHKERRQ(ierr);
  ierr = PetscFPrintf(comm, fd, "Nproc = %d\n",size);CHKERRQ(ierr);
  /* Must preserve reduction count before we go on: the gathers below perform
     collectives themselves and would inflate allreduce_ct */
  red = (allreduce_ct + gather_ct + scatter_ct)/((PetscLogDouble) size);

  /* Calculate summary information */

  /* Time */
  ierr = MPI_Gather(&locTotalTime,1,MPIU_PETSCLOGDOUBLE,mydata,1,MPIU_PETSCLOGDOUBLE,0,comm);CHKERRQ(ierr);
  if (!rank){
    ierr = PetscFPrintf(comm, fd, "Time = [ " );CHKERRQ(ierr);
    tot  = 0.0;
    for (i=0; i<size; i++){
      tot += mydata[i];
      ierr = PetscFPrintf(comm, fd, " %5.3e,",mydata[i] );CHKERRQ(ierr);
    }
    ierr = PetscFPrintf(comm, fd, "]\n" );CHKERRQ(ierr);
    avg       = (tot)/((PetscLogDouble) size);
    TotalTime = tot;
  }
  /* Objects */
  avg  = (PetscLogDouble) numObjects;
  ierr = MPI_Gather(&avg,1,MPIU_PETSCLOGDOUBLE,mydata,1,MPIU_PETSCLOGDOUBLE,0,comm);CHKERRQ(ierr);
  if (!rank){
    ierr = PetscFPrintf(comm, fd, "Objects = [ " );CHKERRQ(ierr);
    for (i=0; i<size; i++){
      ierr = PetscFPrintf(comm, fd, " %5.3e,",mydata[i] );CHKERRQ(ierr);
    }
    ierr = PetscFPrintf(comm, fd, "]\n" );CHKERRQ(ierr);
  }
  /* Flops */
  ierr = MPI_Gather(&_TotalFlops,1,MPIU_PETSCLOGDOUBLE,mydata,1,MPIU_PETSCLOGDOUBLE,0,comm);CHKERRQ(ierr);
  if (!rank){
    ierr = PetscFPrintf(comm, fd, "Flops = [ " );CHKERRQ(ierr);
    tot  = 0.0;
    for (i=0; i<size; i++){
      tot += mydata[i];
      ierr = PetscFPrintf(comm, fd, " %5.3e,",mydata[i] );CHKERRQ(ierr);
    }
    ierr = PetscFPrintf(comm, fd, "]\n");CHKERRQ(ierr);
    TotalFlops = tot;
  }
  /* Memory */
  ierr = PetscMallocGetMaximumUsage(&mem);CHKERRQ(ierr);
  ierr = MPI_Gather(&mem,1,MPIU_PETSCLOGDOUBLE,mydata,1,MPIU_PETSCLOGDOUBLE,0,comm);CHKERRQ(ierr);
  if (!rank){
    ierr = PetscFPrintf(comm, fd, "Memory = [ " );CHKERRQ(ierr);
    for (i=0; i<size; i++){
      ierr = PetscFPrintf(comm, fd, " %5.3e,",mydata[i] );CHKERRQ(ierr);
    }
    ierr = PetscFPrintf(comm, fd, "]\n" );CHKERRQ(ierr);
  }
  /* Messages: each message is counted on both sender and receiver, hence the 0.5 */
  mess = 0.5*(irecv_ct + isend_ct + recv_ct + send_ct);
  ierr = MPI_Gather(&mess,1,MPIU_PETSCLOGDOUBLE,mydata,1,MPIU_PETSCLOGDOUBLE,0,comm);CHKERRQ(ierr);
  if (!rank){
    ierr = PetscFPrintf(comm, fd, "MPIMessages = [ " );CHKERRQ(ierr);
    tot  = 0.0;
    for (i=0; i<size; i++){
      tot += mydata[i];
      ierr = PetscFPrintf(comm, fd, " %5.3e,",mydata[i] );CHKERRQ(ierr);
    }
    ierr = PetscFPrintf(comm, fd, "]\n" );CHKERRQ(ierr);
    numMessages = tot;
  }
  /* Message Lengths */
  mess = 0.5*(irecv_len + isend_len + recv_len + send_len);
  ierr = MPI_Gather(&mess,1,MPIU_PETSCLOGDOUBLE,mydata,1,MPIU_PETSCLOGDOUBLE,0,comm);CHKERRQ(ierr);
  if (!rank){
    ierr = PetscFPrintf(comm, fd, "MPIMessageLengths = [ " );CHKERRQ(ierr);
    tot  = 0.0;
    for (i=0; i<size; i++){
      tot += mydata[i];
      ierr = PetscFPrintf(comm, fd, " %5.3e,",mydata[i] );CHKERRQ(ierr);
    }
    ierr = PetscFPrintf(comm, fd, "]\n" );CHKERRQ(ierr);
    messageLength = tot;
  }
  /* Reductions */
  ierr = MPI_Gather(&red,1,MPIU_PETSCLOGDOUBLE,mydata,1,MPIU_PETSCLOGDOUBLE,0,comm);CHKERRQ(ierr);
  if (!rank){
    ierr = PetscFPrintf(comm, fd, "MPIReductions = [ " );CHKERRQ(ierr);
    tot  = 0.0;
    for (i=0; i<size; i++){
      tot += mydata[i];
      ierr = PetscFPrintf(comm, fd, " %5.3e,",mydata[i] );CHKERRQ(ierr);
    }
    ierr = PetscFPrintf(comm, fd, "]\n" );CHKERRQ(ierr);
    numReductions = tot;
  }
  /* Get total number of stages --
       Currently, a single processor can register more stages than another, but stages must all be
       registered in order.  We could remove this requirement by having a global stage numbering and
       indirection on the stage ID.  This seems best accomplished by associating a communicator with
       each stage.
  */
  ierr = MPI_Allreduce(&stageLog->numStages, &numStages, 1, MPI_INT, MPI_MAX, comm);CHKERRQ(ierr);
  ierr = PetscMalloc(numStages * sizeof(PetscTruth), &localStageUsed);CHKERRQ(ierr);
  ierr = PetscMalloc(numStages * sizeof(PetscTruth), &stageUsed);CHKERRQ(ierr);
  ierr = PetscMalloc(numStages * sizeof(PetscTruth), &localStageVisible);CHKERRQ(ierr);
  ierr = PetscMalloc(numStages * sizeof(PetscTruth), &stageVisible);CHKERRQ(ierr);
  if (numStages > 0) {
    stageInfo = stageLog->stageInfo;
    for(stage = 0; stage < numStages; stage++) {
      if (stage < stageLog->numStages) {
        localStageUsed[stage]    = stageInfo[stage].used;
        localStageVisible[stage] = stageInfo[stage].perfInfo.visible;
      } else {
        localStageUsed[stage]    = PETSC_FALSE;
        localStageVisible[stage] = PETSC_TRUE;
      }
    }
    /* A stage is used if ANY rank used it (LOR); visible only if ALL ranks marked it visible (LAND) */
    ierr = MPI_Allreduce(localStageUsed, stageUsed, numStages, MPI_INT, MPI_LOR, comm);CHKERRQ(ierr);
    ierr = MPI_Allreduce(localStageVisible, stageVisible, numStages, MPI_INT, MPI_LAND, comm);CHKERRQ(ierr);
    /* Print the stage-summary header once, iff at least one stage was used */
    for(stage = 0; stage < numStages; stage++) {
      if (stageUsed[stage]) {
        ierr = PetscFPrintf(comm, fd, "\n#Summary of Stages: ----- Time ------ ----- Flops ----- --- Messages --- -- Message Lengths -- -- Reductions --\n");CHKERRQ(ierr);
        ierr = PetscFPrintf(comm, fd, "# Avg %%Total Avg %%Total counts %%Total Avg %%Total counts %%Total \n");CHKERRQ(ierr);
        break;
      }
    }
    for(stage = 0; stage < numStages; stage++) {
      if (!stageUsed[stage]) continue;
      /* Every rank must participate in each collective, so ranks that never
         registered this stage contribute zeros */
      if (localStageUsed[stage]) {
        ierr = MPI_Allreduce(&stageInfo[stage].perfInfo.time, &stageTime, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
        ierr = MPI_Allreduce(&stageInfo[stage].perfInfo.flops, &flops, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
        ierr = MPI_Allreduce(&stageInfo[stage].perfInfo.numMessages, &mess, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
        ierr = MPI_Allreduce(&stageInfo[stage].perfInfo.messageLength, &messLen, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
        ierr = MPI_Allreduce(&stageInfo[stage].perfInfo.numReductions, &red, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
        name = stageInfo[stage].name;
      } else {
        ierr = MPI_Allreduce(&zero, &stageTime, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
        ierr = MPI_Allreduce(&zero, &flops, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
        ierr = MPI_Allreduce(&zero, &mess, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
        ierr = MPI_Allreduce(&zero, &messLen, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
        ierr = MPI_Allreduce(&zero, &red, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
        name = "";
      }
      mess *= 0.5; messLen *= 0.5; red /= size;
      if (TotalTime     != 0.0) fracTime       = stageTime/TotalTime;   else fracTime       = 0.0;
      if (TotalFlops    != 0.0) fracFlops      = flops/TotalFlops;      else fracFlops      = 0.0;
      /* Talk to Barry if (stageTime != 0.0) flops = (size*flops)/stageTime; else flops = 0.0; */
      if (numMessages   != 0.0) fracMessages   = mess/numMessages;      else fracMessages   = 0.0;
      if (numMessages   != 0.0) avgMessLen     = messLen/numMessages;   else avgMessLen     = 0.0;
      if (messageLength != 0.0) fracLength     = messLen/messageLength; else fracLength     = 0.0;
      if (numReductions != 0.0) fracReductions = red/numReductions;     else fracReductions = 0.0;
      /* Error code was previously dropped on this print; now checked */
      ierr = PetscFPrintf(comm, fd, "# ");CHKERRQ(ierr);
      ierr = PetscFPrintf(comm, fd, "%2d: %15s: %6.4e %5.1f%% %6.4e %5.1f%% %5.3e %5.1f%% %5.3e %5.1f%% %5.3e %5.1f%% \n",
                          stage, name, stageTime/size, 100.0*fracTime, flops, 100.0*fracFlops,
                          mess, 100.0*fracMessages, avgMessLen, 100.0*fracLength, red, 100.0*fracReductions);CHKERRQ(ierr);
    }
  }

  /* Report events */
  ierr = PetscFPrintf(comm, fd,"\n# Event\n");CHKERRQ(ierr);
  ierr = PetscFPrintf(comm,fd,"# ------------------------------------------------------\n");CHKERRQ(ierr);
  /* Problem: The stage name will not show up unless the stage executed on proc 1 */
  for(stage = 0; stage < numStages; stage++) {
    if (!stageVisible[stage]) continue;
    if (localStageUsed[stage]) {
      ierr = MPI_Allreduce(&stageInfo[stage].perfInfo.time, &stageTime, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
      ierr = MPI_Allreduce(&stageInfo[stage].perfInfo.flops, &flops, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
      ierr = MPI_Allreduce(&stageInfo[stage].perfInfo.numMessages, &mess, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
      ierr = MPI_Allreduce(&stageInfo[stage].perfInfo.messageLength, &messLen, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
      ierr = MPI_Allreduce(&stageInfo[stage].perfInfo.numReductions, &red, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
    } else {
      ierr = PetscFPrintf(comm, fd, "\n--- Event Stage %d: Unknown\n\n", stage);CHKERRQ(ierr);
      ierr = MPI_Allreduce(&zero, &stageTime, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
      ierr = MPI_Allreduce(&zero, &flops, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
      ierr = MPI_Allreduce(&zero, &mess, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
      ierr = MPI_Allreduce(&zero, &messLen, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
      ierr = MPI_Allreduce(&zero, &red, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);CHKERRQ(ierr);
    }
    mess *= 0.5; messLen *= 0.5; red /= size;
    /* Get total number of events in this stage --
       Currently, a single processor can register more events than another, but events must all be
       registered in order, just like stages.  We could remove this requirement by having a global
       event numbering and indirection on the event ID.  This seems best accomplished by associating
       a communicator with each stage.

       Problem: If the event did not happen on proc 1, its name will not be available.
       Problem: Event visibility is not implemented
    */
    if (!rank){
      /* Python preamble: a Dummy holder object and the Event registry dict */
      ierr = PetscFPrintf(comm, fd, "class Dummy(object):\n");CHKERRQ(ierr);
      ierr = PetscFPrintf(comm, fd, " def foo(x):\n");CHKERRQ(ierr);
      ierr = PetscFPrintf(comm, fd, " print x\n");CHKERRQ(ierr);
      ierr = PetscFPrintf(comm, fd, "Event = {}\n");CHKERRQ(ierr);
    }
    if (localStageUsed[stage]) {
      eventInfo      = stageLog->stageInfo[stage].eventLog->eventInfo;
      localNumEvents = stageLog->stageInfo[stage].eventLog->numEvents;
    } else {
      localNumEvents = 0;
    }
    ierr = MPI_Allreduce(&localNumEvents, &numEvents, 1, MPI_INT, MPI_MAX, comm);CHKERRQ(ierr);
    for(event = 0; event < numEvents; event++) {
      if (localStageUsed[stage] && (event < stageLog->stageInfo[stage].eventLog->numEvents) && (eventInfo[event].depth == 0)) {
        ierr = MPI_Allreduce(&eventInfo[event].count, &maxCt, 1, MPI_INT, MPI_MAX, comm);CHKERRQ(ierr);
        name = stageLog->eventLog->eventInfo[event].name;
      } else {
        /* Fixed: previously this reduced &ierr (the last error code), making maxCt
           depend on stale error values; ranks without the event now contribute 0 */
        ierr = MPI_Allreduce(&zeroCt, &maxCt, 1, MPI_INT, MPI_MAX, comm);CHKERRQ(ierr);
        name = "";
      }
      if (maxCt != 0) {
        ierr = PetscFPrintf(comm, fd,"#\n");CHKERRQ(ierr);
        if (!rank){
          ierr = PetscFPrintf(comm, fd, "%s = Dummy()\n",name);CHKERRQ(ierr);
          ierr = PetscFPrintf(comm, fd, "Event['%s'] = %s\n",name,name);CHKERRQ(ierr);
        }
        /* Count
           NOTE(review): on ranks where this event is not local, &eventInfo[event]
           may index past that rank's event array — confirm upstream guarantee */
        ierr = MPI_Gather(&eventInfo[event].count,1,MPI_INT,mycount,1,MPI_INT,0,comm);CHKERRQ(ierr);
        ierr = PetscFPrintf(comm, fd, "%s.Count = [ ", name);CHKERRQ(ierr);
        for (i=0; i<size; i++){
          ierr = PetscFPrintf(comm, fd, " %7d,",mycount[i] );CHKERRQ(ierr);
        }
        ierr = PetscFPrintf(comm, fd, "]\n" );CHKERRQ(ierr);
        /* Time */
        ierr = MPI_Gather(&eventInfo[event].time,1,MPIU_PETSCLOGDOUBLE,mydata,1,MPIU_PETSCLOGDOUBLE,0,comm);CHKERRQ(ierr);
        if (!rank){
          ierr = PetscFPrintf(comm, fd, "%s.Time = [ ", name);CHKERRQ(ierr);
          for (i=0; i<size; i++){
            ierr = PetscFPrintf(comm, fd, " %5.3e,",mydata[i] );CHKERRQ(ierr);
          }
          ierr = PetscFPrintf(comm, fd, "]\n" );CHKERRQ(ierr);
        }
        /* Flops */
        ierr = MPI_Gather(&eventInfo[event].flops,1,MPIU_PETSCLOGDOUBLE,mydata,1,MPIU_PETSCLOGDOUBLE,0,comm);CHKERRQ(ierr);
        if (!rank){
          ierr = PetscFPrintf(comm, fd, "%s.Flops = [ ", name);CHKERRQ(ierr);
          for (i=0; i<size; i++){
            ierr = PetscFPrintf(comm, fd, " %5.3e,",mydata[i] );CHKERRQ(ierr);
          }
          ierr = PetscFPrintf(comm, fd, "]\n" );CHKERRQ(ierr);
        }
      }
    }
  }
  /* Right now, only stages on the first processor are reported here, meaning only objects
     associated with the global communicator, or MPI_COMM_SELF for proc 1.  We really should
     report global stats and then stats for stages local to processor sets.
  */
  for(stage = 0; stage < numStages; stage++) {
    if (localStageUsed[stage]) {
      /* classInfo is fetched but per-class reporting is not implemented yet */
      classInfo = stageLog->stageInfo[stage].classLog->classInfo;
    } else {
      ierr = PetscFPrintf(comm, fd, "\n--- Event Stage %d: Unknown\n\n", stage);CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(localStageUsed);CHKERRQ(ierr);
  ierr = PetscFree(stageUsed);CHKERRQ(ierr);
  ierr = PetscFree(localStageVisible);CHKERRQ(ierr);
  ierr = PetscFree(stageVisible);CHKERRQ(ierr);
  ierr = PetscFree(mydata);CHKERRQ(ierr);
  ierr = PetscFree(mycount);CHKERRQ(ierr);

  /* Information unrelated to this particular run */
  ierr = PetscFPrintf(comm, fd, "# ========================================================================================================================\n");CHKERRQ(ierr);
  /* Time ten consecutive PetscTime() calls; the first two warm up the timer */
  PetscTime(y);
  PetscTime(x);
  PetscTime(y); PetscTime(y); PetscTime(y); PetscTime(y); PetscTime(y);
  PetscTime(y); PetscTime(y); PetscTime(y); PetscTime(y); PetscTime(y);
  ierr = PetscFPrintf(comm,fd,"AveragetimetogetPetscTime = %g\n", (y-x)/10.0);CHKERRQ(ierr);

  /* MPI information */
  if (size > 1) {
    MPI_Status  status;
    PetscMPIInt tag;
    MPI_Comm    newcomm;

    /* Time five barriers (the first barrier synchronizes the ranks) */
    ierr = MPI_Barrier(comm);CHKERRQ(ierr);
    PetscTime(x);
    ierr = MPI_Barrier(comm);CHKERRQ(ierr);
    ierr = MPI_Barrier(comm);CHKERRQ(ierr);
    ierr = MPI_Barrier(comm);CHKERRQ(ierr);
    ierr = MPI_Barrier(comm);CHKERRQ(ierr);
    ierr = MPI_Barrier(comm);CHKERRQ(ierr);
    PetscTime(y);
    ierr = PetscFPrintf(comm, fd, "AveragetimeforMPI_Barrier = %g\n", (y-x)/5.0);CHKERRQ(ierr);
    /* Ring of zero-size sends: rank 0 times a full trip around all ranks.
       NOTE: the output label "Averagetimfor..." typo is preserved for output compatibility */
    ierr = PetscCommDuplicate(comm,&newcomm, &tag);CHKERRQ(ierr);
    ierr = MPI_Barrier(comm);CHKERRQ(ierr);
    if (rank) {
      ierr = MPI_Recv(0, 0, MPI_INT, rank-1, tag, newcomm, &status);CHKERRQ(ierr);
      ierr = MPI_Send(0, 0, MPI_INT, (rank+1)%size, tag, newcomm);CHKERRQ(ierr);
    } else {
      PetscTime(x);
      ierr = MPI_Send(0, 0, MPI_INT, 1, tag, newcomm);CHKERRQ(ierr);
      ierr = MPI_Recv(0, 0, MPI_INT, size-1, tag, newcomm, &status);CHKERRQ(ierr);
      PetscTime(y);
      ierr = PetscFPrintf(comm,fd,"AveragetimforzerosizeMPI_Send = %g\n", (y-x)/size);CHKERRQ(ierr);
    }
    ierr = PetscCommDestroy(&newcomm);CHKERRQ(ierr);
  }
  if (!rank) { /* print Optiontable */
    ierr = PetscFPrintf(comm,fd,"# ");CHKERRQ(ierr);
    /* ierr = PetscOptionsPrint(fd);CHKERRQ(ierr); */
  }

  /* Cleanup: restore the stage the user had pushed */
  ierr = PetscFPrintf(comm, fd, "\n");CHKERRQ(ierr);
  ierr = StageLogPush(stageLog, lastStage);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*
  IntegrateElementBatchGPU - Produces element vectors from input element solution and
  geometric information via quadrature, using an OpenCL kernel that is generated,
  built, and launched on the fly.

  Input Parameters:
+ Ne - The total number of cells, Nchunk * Ncb * Nbc
. Ncb - The number of serial cell batches
. Nbc - The number of cells per batch
. Nbl - The number of concurrent cells blocks per thread block
. coefficients - An array of the solution vector for each cell
. jacobianInverses - An array of the inverse Jacobian for each cell
. jacobianDeterminants - An array of the Jacobian determinant for each cell
. event - A PetscEvent, used to log flops
- debug - A flag for debugging information

  Output Parameter:
. elemVec - An array of the element vectors for each cell
*/
PETSC_EXTERN PetscErrorCode IntegrateElementBatchGPU(PetscInt spatial_dim, PetscInt Ne, PetscInt Ncb, PetscInt Nbc, PetscInt N_bl, const PetscScalar coefficients[], const PetscReal jacobianInverses[], const PetscReal jacobianDeterminants[], PetscScalar elemVec[], PetscLogEvent event, PetscInt debug, PetscInt pde_op)
{
  /* Hard-coded quadrature/basis sizes; a scalar field (1 component) for the
     Laplacian operator, otherwise one component per spatial dimension */
  const cl_int numQuadraturePoints_0 = 1;
  const cl_int numBasisFunctions_0 = 3;
  const cl_int numBasisComponents_0 = (pde_op == LAPLACIAN) ? 1 : spatial_dim;
  const cl_int dim    = spatial_dim;
  const cl_int N_b    = numBasisFunctions_0;   /* The number of basis functions */
  const cl_int N_comp = numBasisComponents_0;  /* The number of basis function components */
  const cl_int N_bt   = N_b*N_comp;            /* The total number of scalar basis functions */
  const cl_int N_q    = numQuadraturePoints_0; /* The number of quadrature points */
  const cl_int N_bst  = N_bt*N_q;              /* The block size, LCM(N_bt, N_q), Notice that a block is not process simultaneously */
  const cl_int N_t    = N_bst*N_bl;            /* The number of threads, N_bst * N_bl */
  char            *program_buffer;             /* Generated OpenCL kernel source (8192-byte buffer) */
  char            build_buffer[8192];          /* Receives the OpenCL build log on compile failure */
  cl_build_status status;                      /* NOTE(review): declared but never used */
  cl_event        ocl_ev;                      /* The event for tracking kernel execution */
  cl_ulong        ns_start;                    /* Nanoseconds counter on GPU at kernel start */
  cl_ulong        ns_end;                      /* Nanoseconds counter on GPU at kernel stop */
  cl_mem          d_coefficients;
  cl_mem          d_jacobianInverses;
  cl_mem          d_jacobianDeterminants;
  cl_mem          d_elemVec;
  OpenCLEnvironment ocl_env;                   /* Holds context/device/queue ids (see initializeOpenCL) */
  cl_program      ocl_prog;
  cl_kernel       ocl_kernel;
  size_t          ocl_source_length;
  size_t          local_work_size[3];
  size_t          global_work_size[3];
  size_t          i;
  unsigned int    x, y, z;                     /* Grid dimensions returned by calculateGridOpenCL */
  PetscErrorCode  ierr;
  cl_int          ierr2;

  PetscFunctionBegin;
  /* Generate the kernel source, then compile it for this device.
     NOTE(review): several OpenCL calls below store their cl_int status into
     ierr (a PetscErrorCode) or pass &ierr where a cl_int* is expected
     (clCreateKernel, clCreateBuffer) — confirm the two types are
     layout-compatible and that CHKERRQ is meaningful for CL error codes. */
  ierr = initializeOpenCL(&ocl_env);CHKERRQ(ierr);
  ierr = PetscMalloc(8192 * sizeof(char), &program_buffer);CHKERRQ(ierr);
  ierr = generateOpenCLSource(&program_buffer, 8192, dim, N_bl, pde_op);CHKERRQ(ierr);
  ocl_source_length = strlen(program_buffer);
  ocl_prog = clCreateProgramWithSource(ocl_env.ctx_id, 1, (const char**)&program_buffer, &ocl_source_length, &ierr2);CHKERRQ(ierr2);
  ierr = clBuildProgram(ocl_prog, 0, NULL, NULL, NULL, NULL);
  if (ierr != CL_SUCCESS) {
    /* Dump the device compiler's build log before erroring out via CHKERRQ below */
    clGetProgramBuildInfo(ocl_prog, ocl_env.dev_id, CL_PROGRAM_BUILD_LOG, sizeof(char)*8192, &build_buffer, NULL);
    printf("Build failed! Log:\n %s", build_buffer);
  }
  CHKERRQ(ierr);
  ierr = PetscFree(program_buffer);CHKERRQ(ierr);
  ocl_kernel = clCreateKernel(ocl_prog, "integrateElementQuadrature", &ierr);CHKERRQ(ierr);
  if (Nbc*N_comp != N_t) SETERRQ3(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of threads %d should be %d * %d", N_t, Nbc, N_comp);
  if (!Ne) {
    /* Nothing to integrate: log a zero-work event occurrence and return.
       NOTE(review): this early return does not release ocl_kernel, ocl_prog,
       or ocl_env — resources created above are leaked on this path; confirm
       whether the check should precede the OpenCL setup instead. */
    PetscStageLog stageLog;
    PetscEventPerfLog eventLog = NULL;
    PetscInt stage;
    ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
    ierr = PetscStageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr);
    ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);CHKERRQ(ierr);
    /* Log performance info */
    eventLog->eventInfo[event].count++;
    eventLog->eventInfo[event].time += 0.0;
    eventLog->eventInfo[event].flops += 0;
    PetscFunctionReturn(0);
  }
  /* Create buffers on the device and send data over.
     NOTE(review): coefficients/elemVec are PetscScalar[] but the buffers are
     sized with sizeof(PetscReal) — these differ when PETSc is built with
     complex scalars; confirm a real-only build is assumed here. */
  d_coefficients = clCreateBuffer(ocl_env.ctx_id, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, Ne*N_bt * sizeof(PetscReal), (void*)coefficients, &ierr);CHKERRQ(ierr);
  d_jacobianInverses = clCreateBuffer(ocl_env.ctx_id, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, Ne*dim*dim * sizeof(PetscReal), (void*)jacobianInverses, &ierr);CHKERRQ(ierr);
  d_jacobianDeterminants = clCreateBuffer(ocl_env.ctx_id, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, Ne * sizeof(PetscReal), (void*)jacobianDeterminants, &ierr);CHKERRQ(ierr);
  d_elemVec = clCreateBuffer(ocl_env.ctx_id, CL_MEM_READ_WRITE, Ne*N_bt * sizeof(PetscReal), NULL, &ierr);CHKERRQ(ierr);
  /* Work size preparations: one work-item per (cell in batch) x component */
  ierr = calculateGridOpenCL(Ne, Ncb*Nbc, &x, &y, &z);CHKERRQ(ierr);
  local_work_size[0]  = Nbc*N_comp;
  local_work_size[1]  = 1;
  local_work_size[2]  = 1;
  global_work_size[0] = x * local_work_size[0];
  global_work_size[1] = y * local_work_size[1];
  global_work_size[2] = z * local_work_size[2];
  /* if (debug) { */
  ierr = PetscPrintf(PETSC_COMM_SELF, "GPU layout grid(%d,%d,%d) block(%d,%d,%d) with %d batches\n", x, y, z, local_work_size[0], local_work_size[1], local_work_size[2], Ncb);CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, " N_t: %d, N_cb: %d\n", N_t, Ncb);
  /* } */
  /* Kernel launch */
  /* integrateElementQuadrature<<<grid, block>>>(Ncb, d_coefficients, d_jacobianInverses, d_jacobianDeterminants, d_elemVec); */
  ierr = clSetKernelArg(ocl_kernel, 0, sizeof(cl_int), (void*)&Ncb);CHKERRQ(ierr);
  ierr = clSetKernelArg(ocl_kernel, 1, sizeof(cl_mem), (void*)&d_coefficients);CHKERRQ(ierr);
  ierr = clSetKernelArg(ocl_kernel, 2, sizeof(cl_mem), (void*)&d_jacobianInverses);CHKERRQ(ierr);
  ierr = clSetKernelArg(ocl_kernel, 3, sizeof(cl_mem), (void*)&d_jacobianDeterminants);CHKERRQ(ierr);
  ierr = clSetKernelArg(ocl_kernel, 4, sizeof(cl_mem), (void*)&d_elemVec);CHKERRQ(ierr);
  ierr = clEnqueueNDRangeKernel(ocl_env.queue_id, ocl_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &ocl_ev);CHKERRQ(ierr);
  /* Read data back from device (blocking read, so the kernel has finished) */
  ierr = clEnqueueReadBuffer(ocl_env.queue_id, d_elemVec, CL_TRUE, 0, Ne*N_bt * sizeof(PetscReal), elemVec, 0, NULL, NULL);CHKERRQ(ierr);
  {
    PetscStageLog stageLog;
    PetscEventPerfLog eventLog = NULL;
    PetscInt stage;
    ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
    ierr = PetscStageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr);
    ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);CHKERRQ(ierr);
    /* Log performance info: wall time from the GPU profiling counters
       (requires a profiling-enabled queue — presumably set up in
       initializeOpenCL; TODO confirm), flops from a hand-derived operation
       count for this quadrature kernel */
    ierr = clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ns_start, NULL);CHKERRQ(ierr);
    ierr = clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ns_end, NULL);CHKERRQ(ierr);
    eventLog->eventInfo[event].count++;
    eventLog->eventInfo[event].time += (ns_end - ns_start)*1.0e-9;
    eventLog->eventInfo[event].flops += (((2+(2+2*dim)*dim)*N_comp*N_b+(2+2)*dim*N_comp)*N_q + (2+2*dim)*dim*N_q*N_comp*N_b)*Ne;
  }
  /* We are done, clean up */
  ierr = clReleaseMemObject(d_coefficients);CHKERRQ(ierr);
  ierr = clReleaseMemObject(d_jacobianInverses);CHKERRQ(ierr);
  ierr = clReleaseMemObject(d_jacobianDeterminants);CHKERRQ(ierr);
  ierr = clReleaseMemObject(d_elemVec);CHKERRQ(ierr);
  ierr = clReleaseKernel(ocl_kernel);CHKERRQ(ierr);
  ierr = clReleaseProgram(ocl_prog);CHKERRQ(ierr);
  ierr = destroyOpenCL(&ocl_env);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}