/**
 * Per-PE continuation of AtSync(): builds this PE's statistics message and
 * either starts the object/communication count reduction (USE_REDUCTION) or
 * sends the statistics directly.
 *
 * Does nothing when a count reduction has already been started.
 * Compiles to an empty function when CMK_LBDB_ON is off.
 */
void CentralLB::ProcessAtSync()
{
#if CMK_LBDB_ON
  // A reduction is already in progress; do not start a second one.
  if (reduction_started) return;

  CmiAssert(CmiNodeAlive(CkMyPe()));

  // Only the PE currently acting as load balancer records the start time.
  const bool iAmBalancer = (CkMyPe() == cur_ld_balancer);
  if (iAmBalancer) {
    start_lb_time = CkWallTimer();
  }

#if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
  initMlogLBStep(thisgroup);
#endif

  // Build the statistics message for this PE.
  BuildStatsMsg();

#if USE_REDUCTION
  // Sum the number of objects and communication records over all PEs so
  // that processor 0 can pre-allocate the load balancing database.
  int sizes[2];
  sizes[0] = theLbdb->GetObjDataSz();
  sizes[1] = theLbdb->GetCommDataSz();

  CkCallback countsDone(CkIndex_CentralLB::ReceiveCounts((CkReductionMsg*)NULL),
                        thisProxy[0]);
  contribute(sizeof(sizes), sizes, CkReduction::sum_int, countsDone);
  reduction_started = 1;
#else
  SendStats();
#endif
#endif
}
void CentralLB::endMigrationDone(int balancing){ DEBUGF(("[%d] CentralLB::endMigrationDone step %d\n",CkMyPe(),step())); if (balancing && _lb_args.syncResume()) { CkCallback cb(CkIndex_CentralLB::ResumeClients((CkReductionMsg*)NULL), thisProxy); contribute(0, NULL, CkReduction::sum_int, cb); } else{ if(CmiNodeAlive(CkMyPe())){ DEBUGF(("[%d] Sending ResumeClients balancing %d \n",CkMyPe(),balancing)); thisProxy [CkMyPe()].ResumeClients(balancing); } } }
/**
 * Called when this PE reaches the load balancing synchronization point.
 *
 * If no balancing is due for the current step, or there is only one PE,
 * the step is finished immediately through MigrationDone(0). Otherwise
 * ProcessAtSync() is invoked on the local PE through the group proxy,
 * provided this PE is alive.
 */
void CentralLB::AtSync()
{
#if CMK_LBDB_ON
  DEBUGF(("[%d] CentralLB AtSync step %d!!!!!\n",CkMyPe(),step()));

#if CMK_MEM_CHECKPOINT
  CkSetInLdb();
#endif
#if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
  CpvAccess(_currentObj)=this;
#endif

  // With a single processor nothing should happen.
  const bool nothingToBalance = !QueryBalanceNow(step()) || CkNumPes() == 1;

  if (nothingToBalance) {
    MigrationDone(0);
  } else if (CmiNodeAlive(CkMyPe())) {
    thisProxy[CkMyPe()].ProcessAtSync();
  }
#endif
}
/**
 * Finishes the current load balancing step on this PE.
 *
 * @param balancing  non-zero when a load balancing step actually ran;
 *                   load statistics are cleared only in that case.
 *
 * Resets the migration counters, advances the step, notifies the load
 * balancing database and the registered callbacks, and then resumes the
 * clients. In the message-logging fault tolerance builds the resume part is
 * not performed here; startLoadBalancingMlog() is called instead.
 */
void CentralLB::MigrationDone(int balancing)
{
#if CMK_LBDB_ON
  // Reset migration bookkeeping.
  migrates_expected = -1;
  migrates_completed = 0;

  // Load statistics are only discarded when balancing really took place.
  if (balancing) {
    theLbdb->ClearLoads();
  }

  // Move on to the next step.
  theLbdb->incStep();
  DEBUGF(("[%d] Incrementing Step %d \n",CkMyPe(),step()));

#if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
  savedBalancing = balancing;
  startLoadBalancingMlog(&resumeCentralLbAfterChkpt,(void *)this);
#endif

  LBDatabase::Object()->MigrationDone();   // call registered callbacks
  LoadbalanceDone(balancing);              // callback

#if (!defined(_FAULT_MLOG_) && !defined(_FAULT_CAUSAL_))
  const bool resumeViaBarrier = balancing && _lb_args.syncResume();

  if (!resumeViaBarrier) {
    // Resume locally, but only on a PE that is alive.
    if (CmiNodeAlive(CkMyPe())) {
      thisProxy[CkMyPe()].ResumeClients(balancing);
    }
  } else {
    // Sync resume: invoke a barrier through an empty reduction.
    CkCallback resumeAll(CkIndex_CentralLB::ResumeClients((CkReductionMsg*)NULL),
                         thisProxy);
    contribute(0, NULL, CkReduction::sum_int, resumeAll);
  }

#if CMK_GRID_QUEUE_AVAILABLE
  CmiGridQueueDeregisterAll ();
  CpvAccess(CkGridObject) = NULL;
#endif
#endif
#endif
}
void CentralLB::ProcessReceiveMigration(CkReductionMsg *msg) { #if CMK_LBDB_ON int i; LBMigrateMsg *m = storedMigrateMsg; CmiAssert(m!=NULL); delete msg; #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_)) int *dummyCounts; DEBUGF(("[%d] Starting ReceiveMigration WITH step %d m->step %d\n",CkMyPe(),step(),m->step)); // CmiPrintf("[%d] Starting ReceiveMigration step %d m->step %d\n",CkMyPe(),step(),m->step); if(step() > m->step){ char str[100]; envelope *env = UsrToEnv(m); return; } lbDecisionCount = m->lbDecisionCount; #endif if (_lb_args.debug() > 1) if (CkMyPe()%1024==0) CmiPrintf("[%d] Starting ReceiveMigration step %d at %f\n",CkMyPe(),step(), CmiWallTimer()); for (i=0; i<CkNumPes(); i++) theLbdb->lastLBInfo.expectedLoad[i] = m->expectedLoad[i]; CmiAssert(migrates_expected <= 0 || migrates_completed == migrates_expected); /*FAULT_EVAC*/ if(!CmiNodeAlive(CkMyPe())){ delete m; return; } migrates_expected = 0; future_migrates_expected = 0; #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_)) int sending=0; int dummy=0; LBDB *_myLBDB = theLbdb->getLBDB(); if(_restartFlag){ dummyCounts = new int[CmiNumPes()]; bzero(dummyCounts,sizeof(int)*CmiNumPes()); } #endif for(i=0; i < m->n_moves; i++) { MigrateInfo& move = m->moves[i]; const int me = CkMyPe(); if (move.from_pe == me && move.to_pe != me) { DEBUGF(("[%d] migrating object to %d\n",move.from_pe,move.to_pe)); // migrate object, in case it is already gone, inform toPe #if (!defined(_FAULT_MLOG_) && !defined(_FAULT_CAUSAL_)) if (theLbdb->Migrate(move.obj,move.to_pe) == 0) thisProxy[move.to_pe].MissMigrate(!move.async_arrival); #else if(_restartFlag == 0){ DEBUG(CmiPrintf("[%d] need to move object from %d to %d \n",CkMyPe(),move.from_pe,move.to_pe)); theLbdb->Migrate(move.obj,move.to_pe); sending++; }else{ if(_myLBDB->validObjHandle(move.obj)){ DEBUG(CmiPrintf("[%d] need to move object from %d to %d \n",CkMyPe(),move.from_pe,move.to_pe)); theLbdb->Migrate(move.obj,move.to_pe); sending++; }else{ 
DEBUG(CmiPrintf("[%d] dummy move to pe %d detected after restart \n",CmiMyPe(),move.to_pe)); dummyCounts[move.to_pe]++; dummy++; } } #endif } else if (move.from_pe != me && move.to_pe == me) { DEBUGF(("[%d] expecting object from %d\n",move.to_pe,move.from_pe)); if (!move.async_arrival) migrates_expected++; else future_migrates_expected++; } else { #if CMK_GLOBAL_LOCATION_UPDATE UpdateLocation(move); #endif } } DEBUGF(("[%d] in ReceiveMigration %d moves expected: %d future expected: %d\n",CkMyPe(),m->n_moves, migrates_expected, future_migrates_expected)); // if (_lb_debug) CkPrintf("[%d] expecting %d objects migrating.\n", CkMyPe(), migrates_expected); #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_)) if(_restartFlag){ sendDummyMigrationCounts(dummyCounts); _restartFlag =0; delete []dummyCounts; } #endif #if 0 if (m->n_moves ==0) { theLbdb->SetLBPeriod(theLbdb->GetLBPeriod()*2); } #endif cur_ld_balancer = m->next_lb; if((CkMyPe() == cur_ld_balancer) && (cur_ld_balancer != 0)){ LBDatabaseObj()->set_avail_vector(m->avail_vector, -2); } if (migrates_expected == 0 || migrates_completed == migrates_expected) MigrationDone(1); delete m; // CkEvacuatedElement(); #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_)) // migrates_expected = 0; // // ResumeClients(1); #endif #endif }
void CentralLB::ReceiveStats(CkMarshalledCLBStatsMessage &msg) { #if CMK_LBDB_ON if (statsMsgsList == NULL) { statsMsgsList = new CLBStatsMsg*[CkNumPes()]; CmiAssert(statsMsgsList != NULL); for(int i=0; i < CkNumPes(); i++) statsMsgsList[i] = 0; } if (statsData == NULL) statsData = new LDStats; // loop through all CLBStatsMsg in the incoming msg int count = msg.getCount(); for (int num = 0; num < count; num++) { CLBStatsMsg *m = msg.getMessage(num); CmiAssert(m!=NULL); const int pe = m->from_pe; DEBUGF(("Stats msg received, %d %d %d %p step %d\n", pe,stats_msg_count,m->n_objs,m,step())); #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_)) /* * if(m->step < step()){ * //TODO: if a processor is redoing an old load balance step.. * //tell it that the step is done and that it should not perform any migrations * thisProxy[pe].ReceiveDummyMigration(); * }*/ #endif if(!CmiNodeAlive(pe)){ DEBUGF(("[%d] ReceiveStats called from invalidProcessor %d\n",CkMyPe(),pe)); continue; } if (m->avail_vector!=NULL) { LBDatabaseObj()->set_avail_vector(m->avail_vector, m->next_lb); } if (statsMsgsList[pe] != 0) { CkPrintf("*** Unexpected CLBStatsMsg in ReceiveStats from PE %d ***\n", pe); } else { statsMsgsList[pe] = m; #if USE_REDUCTION depositData(m); #else // store per processor data right away struct ProcStats &procStat = statsData->procs[pe]; procStat.pe = pe; procStat.total_walltime = m->total_walltime; procStat.idletime = m->idletime; procStat.bg_walltime = m->bg_walltime; #if CMK_LB_CPUTIMER procStat.total_cputime = m->total_cputime; procStat.bg_cputime = m->bg_cputime; #endif procStat.pe_speed = m->pe_speed; //procStat.utilization = 1.0; procStat.available = CmiTrue; procStat.n_objs = m->n_objs; statsData->n_objs += m->n_objs; statsData->n_comm += m->n_comm; #endif #if defined(TEMP_LDB) procStat.pe_temp=m->pe_temp; procStat.pe_speed=m->pe_speed; #endif stats_msg_count++; } } // end of for const int clients = CkNumValidPes(); DEBUGF(("THIS POINT count = %d, clients = 
%d\n",stats_msg_count,clients)); if (stats_msg_count == clients) { DEBUGF(("[%d] All stats messages received \n",CmiMyPe())); statsData->nprocs() = stats_msg_count; thisProxy[CkMyPe()].LoadBalance(); } #endif }
static void _exitHandler(envelope *env) { DEBUGF(("exitHandler called on %d msgtype: %d\n", CkMyPe(), env->getMsgtype())); switch(env->getMsgtype()) { case StartExitMsg: CkAssert(CkMyPe()==0); if (!_CkExitFnVec.isEmpty()) { CkExitFn fn = _CkExitFnVec.deq(); fn(); break; } // else goto next case ExitMsg: CkAssert(CkMyPe()==0); if(_exitStarted) { CmiFree(env); return; } _exitStarted = 1; CkNumberHandler(_charmHandlerIdx,(CmiHandler)_discardHandler); CkNumberHandler(_bocHandlerIdx, (CmiHandler)_discardHandler); env->setMsgtype(ReqStatMsg); env->setSrcPe(CkMyPe()); // if exit in ring, instead of broadcasting, send in ring if (_ringexit){ DEBUGF(("[%d] Ring Exit \n",CkMyPe())); const int stride = CkNumPes()/_ringtoken; int pe = 0; while (pe<CkNumPes()) { CmiSyncSend(pe, env->getTotalsize(), (char *)env); pe += stride; } CmiFree(env); }else{ CmiSyncBroadcastAllAndFree(env->getTotalsize(), (char *)env); } break; case ReqStatMsg: #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_)) _messageLoggingExit(); #endif DEBUGF(("ReqStatMsg on %d\n", CkMyPe())); CkNumberHandler(_charmHandlerIdx,(CmiHandler)_discardHandler); CkNumberHandler(_bocHandlerIdx, (CmiHandler)_discardHandler); /*FAULT_EVAC*/ if(CmiNodeAlive(CkMyPe())){ #if CMK_WITH_STATS _sendStats(); #endif _mainDone = 1; // This is needed because the destructors for // readonly variables will be called when the program // exits. If the destructor is called while _mainDone // is 0, it will assume that the readonly variable was // declared locally. On all processors other than 0, // _mainDone is never set to 1 before the program exits. 
#if CMK_TRACE_ENABLED if (_ringexit) traceClose(); #endif } if (_ringexit) { int stride = CkNumPes()/_ringtoken; int pe = CkMyPe()+1; if (pe < CkNumPes() && pe % stride != 0) CmiSyncSendAndFree(pe, env->getTotalsize(), (char *)env); else CmiFree(env); } else CmiFree(env); //everyone exits here - there may be issues with leftover messages in the queue #if CMK_WITH_STATS if(CkMyPe()) #endif { DEBUGF(("[%d] Calling converse exit \n",CkMyPe())); ConverseExit(); if(CharmLibInterOperate) CpvAccess(interopExitFlag) = 1; } break; #if CMK_WITH_STATS case StatMsg: CkAssert(CkMyPe()==0); _allStats[env->getSrcPe()] = (Stats*) EnvToUsr(env); _numStatsRecd++; DEBUGF(("StatMsg on %d with %d\n", CkMyPe(), _numStatsRecd)); /*FAULT_EVAC*/ if(_numStatsRecd==CkNumValidPes()) { _printStats(); DEBUGF(("[%d] Calling converse exit \n",CkMyPe())); ConverseExit(); if(CharmLibInterOperate) CpvAccess(interopExitFlag) = 1; } break; #endif default: CmiAbort("Internal Error(_exitHandler): Unknown-msg-type. Contact Developers.\n"); } }