static void _exitHandler(envelope *env) { DEBUGF(("exitHandler called on %d msgtype: %d\n", CkMyPe(), env->getMsgtype())); switch(env->getMsgtype()) { case StartExitMsg: CkAssert(CkMyPe()==0); if (!_CkExitFnVec.isEmpty()) { CkExitFn fn = _CkExitFnVec.deq(); fn(); break; } // else goto next case ExitMsg: CkAssert(CkMyPe()==0); if(_exitStarted) { CmiFree(env); return; } _exitStarted = 1; CkNumberHandler(_charmHandlerIdx,(CmiHandler)_discardHandler); CkNumberHandler(_bocHandlerIdx, (CmiHandler)_discardHandler); env->setMsgtype(ReqStatMsg); env->setSrcPe(CkMyPe()); // if exit in ring, instead of broadcasting, send in ring if (_ringexit){ DEBUGF(("[%d] Ring Exit \n",CkMyPe())); const int stride = CkNumPes()/_ringtoken; int pe = 0; while (pe<CkNumPes()) { CmiSyncSend(pe, env->getTotalsize(), (char *)env); pe += stride; } CmiFree(env); }else{ CmiSyncBroadcastAllAndFree(env->getTotalsize(), (char *)env); } break; case ReqStatMsg: #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_)) _messageLoggingExit(); #endif DEBUGF(("ReqStatMsg on %d\n", CkMyPe())); CkNumberHandler(_charmHandlerIdx,(CmiHandler)_discardHandler); CkNumberHandler(_bocHandlerIdx, (CmiHandler)_discardHandler); /*FAULT_EVAC*/ if(CmiNodeAlive(CkMyPe())){ #if CMK_WITH_STATS _sendStats(); #endif _mainDone = 1; // This is needed because the destructors for // readonly variables will be called when the program // exits. If the destructor is called while _mainDone // is 0, it will assume that the readonly variable was // declared locally. On all processors other than 0, // _mainDone is never set to 1 before the program exits. 
#if CMK_TRACE_ENABLED if (_ringexit) traceClose(); #endif } if (_ringexit) { int stride = CkNumPes()/_ringtoken; int pe = CkMyPe()+1; if (pe < CkNumPes() && pe % stride != 0) CmiSyncSendAndFree(pe, env->getTotalsize(), (char *)env); else CmiFree(env); } else CmiFree(env); //everyone exits here - there may be issues with leftover messages in the queue #if CMK_WITH_STATS if(CkMyPe()) #endif { DEBUGF(("[%d] Calling converse exit \n",CkMyPe())); ConverseExit(); if(CharmLibInterOperate) CpvAccess(interopExitFlag) = 1; } break; #if CMK_WITH_STATS case StatMsg: CkAssert(CkMyPe()==0); _allStats[env->getSrcPe()] = (Stats*) EnvToUsr(env); _numStatsRecd++; DEBUGF(("StatMsg on %d with %d\n", CkMyPe(), _numStatsRecd)); /*FAULT_EVAC*/ if(_numStatsRecd==CkNumValidPes()) { _printStats(); DEBUGF(("[%d] Calling converse exit \n",CkMyPe())); ConverseExit(); if(CharmLibInterOperate) CpvAccess(interopExitFlag) = 1; } break; #endif default: CmiAbort("Internal Error(_exitHandler): Unknown-msg-type. Contact Developers.\n"); } }
/// Builds one element: copies the collision handle, zeroes nTimes, logs the
/// element index and hosting PE, and registers the element index with the
/// collision library through CollideRegister().
Hello(const CollideHandle &collide_)
  : collide(collide_)
{
  nTimes = 0;
  CkPrintf("Creating element %d on PE %d\n", thisIndex, CkMyPe());
  CollideRegister(collide, thisIndex);
}
/**
 * Constructs the load-balancing coordinator.
 *
 * Records this object in the Ckpv slot LdbCoordinator_instance (reporting an
 * error and calling CkExit() if the slot is already taken), resets the
 * bookkeeping members, and then hooks into the Charm++ load-balancing
 * database: registers itself as an object manager, as a local barrier
 * receiver, and as a local barrier client.
 */
LdbCoordinator::LdbCoordinator()
{
  if (CkpvAccess(LdbCoordinator_instance) != NULL) {
    iout << iFILE << iERROR << iPE
         << "LdbCoordinator instanced twice on same node!" << endi;
    CkExit();
  } else {
    CkpvAccess(LdbCoordinator_instance) = this;
  }

#if 0
  // Create a load balancer
  if (CkMyPe() == 0) {
    // CreateCentralLB();
    CreateNamdCentLB();
    // CreateNamdNborLB();
  }
#endif

  // Counters and flags.
  ldbCycleNum = 1;
  takingLdbData = 1;
  totalStepsDone = 0;
  nLocalComputes = nLocalPatches = 0;

  // Nothing is allocated yet.
  patchNAtoms = (int *) NULL;
  sequencerThreads = (Sequencer **) NULL;
  ldbStatsFP = NULL;
  computeArray = NULL;
  patchArray = NULL;
  processorArray = NULL;

  // Register self as an object manager for new charm++ balancer framework
  theLbdb = LBDatabase::Object();

  // Set the load balancing period (in seconds).  Without this the
  // load balancing framework will hang until 1 second has passed
  // since the last load balancing, causing hiccups in very fast runs.
  // Unfortunately, the clock is already set for the first load
  // balancing, but only +LBPeriod 1.0e-5 can fix that in older charm.
  // For newer versions this is handled in initproc above.
  theLbdb->SetLBPeriod(1.0e-5);

  myOMid.id.idx = 1;
  LDCallbacks cb = {
    (LDMigrateFn)staticMigrateFn,
    (LDStatsFn)staticStatsFn,
    (LDQueryEstLoadFn)staticQueryEstLoadFn
  };
  myHandle = theLbdb->RegisterOM(myOMid, (void*)this, cb);

  // Add myself as a local barrier receiver, so I know when I might
  // be registering objects.
  theLbdb->AddLocalBarrierReceiver((LDBarrierFn)staticReceiveAtSync,
                                   (void*)this);

  // Also, add a local barrier client, to trigger load balancing
  ldBarrierHandle =
    theLbdb->AddLocalBarrierClient((LDResumeFn)staticResumeFromSync,
                                   (void*)this);

  migrateMsgs = 0; // linked list
  numComputes = 0;
  reg_all_objs = 1;
}
/**
 * Orthogonal-recursive-bisection load balancing strategy.
 *
 * Collects every migratable object in `stats` into computeLoad[] (using the
 * first three ints of its object ID as coordinates and its measured time as
 * load), sorts the objects along each axis, recursively divides the bounding
 * box into one partition per usable processor, maps partitions to nodes, and
 * finally writes the chosen destination of every object to stats->to_proc.
 * Non-migratable objects keep to_proc == from_proc.
 *
 * @param stats  load database for this step; to_proc is overwritten.
 *
 * Compiles to an empty function when CMK_LBDB_ON is not set.
 */
void OrbLB::work(LDStats* stats)
{
#if CMK_LBDB_ON
  int i,j;

  statsData = stats;

  P = stats->nprocs();

  // calculate total number of migratable objects
  nObjs = stats->n_migrateobjs;
#ifdef DEBUG
  CmiPrintf("ORB: num objects:%d\n", nObjs);
#endif

  // Nothing to balance: leave every object where it is. The code below
  // reads computeLoad[0] unconditionally, so it must not run with no objects.
  if (nObjs <= 0) {
    for (i=0; i<stats->n_objs; i++)
      stats->to_proc[i] = stats->from_proc[i];
    return;
  }

  // create computeLoad and calculate tentative computes coordinates
  computeLoad = new ComputeLoad[nObjs];
  for (i=XDIR; i<=ZDIR; i++) vArray[i] = new VecArray[nObjs];

  // v[0] = XDIR  v[1] = YDIR v[2] = ZDIR
  // vArray[XDIR] is an array holding the x vector for all computes
  int objIdx = 0;
  for (i=0; i<stats->n_objs; i++) {
    LDObjData &odata = stats->objData[i];
    if (odata.migratable == 0) continue;
    computeLoad[objIdx].id = objIdx;
    computeLoad[objIdx].v[XDIR] = odata.objID().id[0];
    computeLoad[objIdx].v[YDIR] = odata.objID().id[1];
    computeLoad[objIdx].v[ZDIR] = odata.objID().id[2];
#if CMK_LB_CPUTIMER
    computeLoad[objIdx].load = _lb_args.useCpuTime()?odata.cpuTime:odata.wallTime;
#else
    computeLoad[objIdx].load = odata.wallTime;
#endif
    computeLoad[objIdx].refno = 0;
    computeLoad[objIdx].partition = NULL;
    for (int k=XDIR; k<=ZDIR; k++) {
      vArray[k][objIdx].id = objIdx;
      vArray[k][objIdx].v = computeLoad[objIdx].v[k];
    }
#ifdef DEBUG
    CmiPrintf("Object %d: %d %d %d load:%f\n", objIdx, computeLoad[objIdx].v[XDIR], computeLoad[objIdx].v[YDIR], computeLoad[objIdx].v[ZDIR], computeLoad[objIdx].load);
#endif
    objIdx ++;
  }
  CmiAssert(nObjs == objIdx);

  double t = CkWallTimer();

  quicksort(XDIR);
  quicksort(YDIR);
  quicksort(ZDIR);
#ifdef DEBUG
  CmiPrintf("qsort time: %f\n", CkWallTimer() - t);
#endif

  // one partition per available processor
  npartition = 0;
  for (i=0; i<P; i++)
    if (stats->procs[i].available == CmiTrue) npartition++;
  partitions = new Partition[npartition];

  // Total object load and bounding box of all object coordinates.
  // The loop starts at 0 so that the first object's load is counted too;
  // for i == 0 the min/max comparisons are no-ops.
  double totalLoad = 0.0;
  int minx, miny, minz, maxx, maxy, maxz;
  minx = maxx= computeLoad[0].v[XDIR];
  miny = maxy= computeLoad[0].v[YDIR];
  minz = maxz= computeLoad[0].v[ZDIR];
  for (i=0; i<nObjs; i++) {
    totalLoad += computeLoad[i].load;
    if (computeLoad[i].v[XDIR] < minx) minx = computeLoad[i].v[XDIR];
    else if (computeLoad[i].v[XDIR] > maxx) maxx = computeLoad[i].v[XDIR];
    if (computeLoad[i].v[YDIR] < miny) miny = computeLoad[i].v[YDIR];
    else if (computeLoad[i].v[YDIR] > maxy) maxy = computeLoad[i].v[YDIR];
    if (computeLoad[i].v[ZDIR] < minz) minz = computeLoad[i].v[ZDIR];
    else if (computeLoad[i].v[ZDIR] > maxz) maxz = computeLoad[i].v[ZDIR];
  }

  top_partition.origin[XDIR] = minx;
  top_partition.origin[YDIR] = miny;
  top_partition.origin[ZDIR] = minz;
  top_partition.corner[XDIR] = maxx;
  top_partition.corner[YDIR] = maxy;
  top_partition.corner[ZDIR] = maxz;

  top_partition.refno = 0;
  top_partition.load = 0.0;
  top_partition.count = nObjs;

  // if we take background load into account
  if (!_lb_args.ignoreBgLoad()) {
    top_partition.bkpes.resize(0);
    double total = totalLoad;
    for (i=0; i<P; i++) {
      if (!stats->procs[i].available) continue;
      double bkload = stats->procs[i].bg_walltime;
      total += bkload;
    }
    double averageLoad = total / npartition;
    // Only PEs whose background load is below the average receive objects.
    for (i=0; i<P; i++) {
      if (!stats->procs[i].available) continue;
      double bkload = stats->procs[i].bg_walltime;
      if (bkload < averageLoad) top_partition.bkpes.push_back(i);
      else CkPrintf("OrbLB Info> PE %d with %f background load will have 0 object.\n", i, bkload);
    }
    npartition = top_partition.bkpes.size();
    // formally add these bg load to total load
    for (i=0; i<npartition; i++)
      totalLoad += stats->procs[top_partition.bkpes[i]].bg_walltime;
    if (_lb_args.debug()>=2) {
      CkPrintf("BG load: ");
      for (i=0; i<P; i++) CkPrintf(" %f", stats->procs[i].bg_walltime);
      CkPrintf("\n");
      CkPrintf("Partition BG load: ");
      for (i=0; i<npartition; i++) CkPrintf(" %f", stats->procs[top_partition.bkpes[i]].bg_walltime);
      CkPrintf("\n");
    }
  }

  top_partition.load = totalLoad;

  currentp = 0;
  refno = 0;

  // recursively divide
  rec_divide(npartition, top_partition);

  // mapping partitions to nodes
  mapPartitionsToNodes();

  // this is for sanity check
  int *num = new int[P];
  for (i=0; i<P; i++) num[i] = 0;

  for (i=0; i<nObjs; i++) {
    for (j=0; j<npartition; j++)
      if (computeLoad[i].refno == partitions[j].refno) {
        computeLoad[i].partition = partitions+j;
        num[j] ++;
      }
    CmiAssert(computeLoad[i].partition != NULL);
  }

  for (i=0; i<npartition; i++)
    if (num[i] != partitions[i].count)
      CmiAbort("OrbLB: Compute counts don't agree!\n");

  delete [] num;

  // Save output
  objIdx = 0;
  for(int obj=0;obj<stats->n_objs;obj++) {
    stats->to_proc[obj] = stats->from_proc[obj];
    LDObjData &odata = stats->objData[obj];
    if (odata.migratable == 0) { continue; }
    int frompe = stats->from_proc[obj];
    int tope = computeLoad[objIdx].partition->node;
    if (frompe != tope) {
      if (_lb_args.debug() >= 3) {
        CkPrintf("[%d] Obj %d migrating from %d to %d\n",
                 CkMyPe(),obj,frompe,tope);
      }
      stats->to_proc[obj] = tope;
    }
    objIdx ++;
  }

  // free memory
  delete [] computeLoad;
  for (i=0; i<3; i++) delete [] vArray[i];
  delete [] partitions;

  if (_lb_args.debug() >= 1)
    CkPrintf("OrbLB finished time: %fs\n", CkWallTimer() - t);
#endif
}
void ComputeDPME::doWork() { DebugM(4,"Entering ComputeDPME::doWork().\n"); Pme2Particle *localData; ResizeArrayIter<PatchElem> ap(patchList); // Skip computations if nothing to do. if ( ! patchList[0].p->flags.doFullElectrostatics ) { for (ap = ap.begin(); ap != ap.end(); ap++) { CompAtom *x = (*ap).positionBox->open(); Results *r = (*ap).forceBox->open(); (*ap).positionBox->close(&x); (*ap).forceBox->close(&r); } if ( master ) { master->reduction->submit(); } return; } // allocate storage numLocalAtoms = 0; for (ap = ap.begin(); ap != ap.end(); ap++) { numLocalAtoms += (*ap).p->getNumAtoms(); } Lattice lattice = patchList[0].p->flags.lattice; localData = new Pme2Particle[numLocalAtoms]; // given to message // get positions and charges Pme2Particle * data_ptr = localData; const BigReal coulomb_sqrt = sqrt( COULOMB * ComputeNonbondedUtil::scaling * ComputeNonbondedUtil::dielectric_1 ); for (ap = ap.begin(); ap != ap.end(); ap++) { CompAtom *x = (*ap).positionBox->open(); if ( patchList[0].p->flags.doMolly ) { (*ap).positionBox->close(&x); x = (*ap).avgPositionBox->open(); } int numAtoms = (*ap).p->getNumAtoms(); for(int i=0; i<numAtoms; ++i) { Vector tmp = lattice.delta(x[i].position); data_ptr->x = tmp.x; data_ptr->y = tmp.y; data_ptr->z = tmp.z; data_ptr->cg = coulomb_sqrt * x[i].charge; data_ptr->id = x[i].id; ++data_ptr; } if ( patchList[0].p->flags.doMolly ) { (*ap).avgPositionBox->close(&x); } else { (*ap).positionBox->close(&x); } } // send data to master ComputeDPMEDataMsg *msg = new ComputeDPMEDataMsg; msg->node = CkMyPe(); msg->numParticles = numLocalAtoms; msg->particles = localData; comm->sendComputeDPMEData(msg); }
/**
 * Topology-aware centralized strategy.
 *
 * Steps:
 *  1. Abort unless at least one processor is available; drop non-migratable
 *     objects from `stats`.
 *  2. Obtain an object -> partition map (computePartitions() when
 *     make_mapping is set, otherwise the current placement).
 *  3. Build the partition graph: objects become node members, object-to-object
 *     communication between different partitions becomes weighted edges.
 *  4. Look up the processor topology named by _lbtopo (the text before an
 *     optional ':'), and let calculateMST() map partitions onto processors.
 *  5. Write the resulting processor of every object to stats->to_proc.
 *
 * @param stats  load database for this step; to_proc is overwritten.
 */
void TopoCentLB :: work(LDStats *stats)
{
  int proc;
  int i,j;
  int n_pes = stats->nprocs();

  if (_lb_args.debug() >= 2) {
    CkPrintf("In TopoCentLB Strategy...\n");
  }

  // Make sure that there is at least one available processor.
  for (proc = 0; proc < n_pes; proc++) {
    if (stats->procs[proc].available) {
      break;
    }
  }

  if (proc == n_pes) {
    CmiAbort ("TopoCentLB: no available processors!");
  }

  removeNonMigratable(stats, n_pes);
  int *newmap = new int[stats->n_objs];

  if(make_mapping)
    computePartitions(stats, n_pes, newmap);
  else {
    //mapping taken from previous algo
    for(i=0;i<stats->n_objs;i++) {
      newmap[i]=stats->from_proc[i];
    }
  }

  //Debugging Code
  if(_lb_args.debug() >=2){
    CkPrintf("Map obtained from partitioning:\n");
    for(i=0;i<stats->n_objs;i++)
      CkPrintf(" %d,%d ",i,newmap[i]);
  }

  int max_objs = findMaxObjs(newmap,stats->n_objs, n_pes);

  partgraph = new PartGraph(n_pes, max_objs);

  //Fill up the partition graph - first fill the nodes and then, the edges
  for(i=0;i<stats->n_objs;i++) {
    PartGraph::Node* n = &partgraph->nodes[newmap[i]];
    n->obj_list[n->num_objs]=i;
    n->num_objs++;
  }

  int *addedComm=new int[n_pes];

  stats->makeCommHash();

  int max_comm_part=-1;

  double max_comm=0;

  //Try putting random amount of communication on the partition graph edges to see if things work fine
  //This also checks the running time of the algorithm since number of edges is high than in a practical scenario
#ifdef RAND_COMM
  for(i = 0; i < n_pes; i++) {
    for(j = i+1; j < n_pes; j++) {
      int val;
      if(rand()%5==0)
        val=0;
      else
        val= rand()%1000;

      partgraph->edges[i][j] = val;
      partgraph->edges[j][i] = val;

      partgraph->nodes[i].comm += val;
      partgraph->nodes[j].comm += val;

      if(partgraph->nodes[i].comm > max_comm){
        max_comm = partgraph->nodes[i].comm;
        max_comm_part = i;
      }
      if(partgraph->nodes[j].comm > max_comm){
        max_comm = partgraph->nodes[j].comm;
        max_comm_part = j;
      }
    }
  }
#else
  //Adding communication to the partition graph edges
  for(i=0;i<stats->n_comm;i++) {
    //DO I consider other comm too....i.e. to or from a processor
    LDCommData &cdata = stats->commData[i];
    if(!cdata.from_proc() && cdata.receiver.get_type() == LD_OBJ_MSG){
      int senderID = stats->getHash(cdata.sender);
      int recverID = stats->getHash(cdata.receiver.get_destObj());
      // A failed lookup yields -1, which must never be used to index
      // newmap[]; handle it exactly like the object-list branch below.
      if((senderID == -1)||(recverID == -1)) {
        if (_lb_args.migObjOnly()) continue;
        else CkAbort("Error in search\n");
      }
      CmiAssert(senderID < stats->n_objs);
      CmiAssert(recverID < stats->n_objs);

      if(newmap[senderID]==newmap[recverID])
        continue;

      if(partgraph->edges[newmap[senderID]][newmap[recverID]] == 0){
        partgraph->nodes[newmap[senderID]].degree++;
        partgraph->nodes[newmap[recverID]].degree++;
      }

      partgraph->edges[newmap[senderID]][newmap[recverID]] += cdata.bytes;
      partgraph->edges[newmap[recverID]][newmap[senderID]] += cdata.bytes;

      partgraph->nodes[newmap[senderID]].comm += cdata.bytes;
      partgraph->nodes[newmap[recverID]].comm += cdata.bytes;

      //Keeping track of maximum communiacting partition
      if(partgraph->nodes[newmap[senderID]].comm > max_comm){
        max_comm = partgraph->nodes[newmap[senderID]].comm;
        max_comm_part = newmap[senderID];
      }
      if(partgraph->nodes[newmap[recverID]].comm > max_comm){
        max_comm = partgraph->nodes[newmap[recverID]].comm;
        max_comm_part = newmap[recverID];
      }
    }
    else if(cdata.receiver.get_type() == LD_OBJLIST_MSG) {
      int nobjs;
      LDObjKey *objs = cdata.receiver.get_destObjs(nobjs);
      int senderID = stats->getHash(cdata.sender);
      for(j = 0; j < n_pes; j++)
        addedComm[j]=0;
      for (j=0; j<nobjs; j++) {
        int recverID = stats->getHash(objs[j]);
        if((senderID == -1)||(recverID == -1)) {
          if (_lb_args.migObjOnly()) continue;
          else CkAbort("Error in search\n");
        }

        if(newmap[senderID]==newmap[recverID])
          continue;

        if(partgraph->edges[newmap[senderID]][newmap[recverID]] == 0){
          partgraph->nodes[newmap[senderID]].degree++;
          partgraph->nodes[newmap[recverID]].degree++;
        }

        //Communication added only once for a message sent to many objects on a single processor
        if(!addedComm[newmap[recverID]]){
          partgraph->edges[newmap[senderID]][newmap[recverID]] += cdata.bytes;
          partgraph->edges[newmap[recverID]][newmap[senderID]] += cdata.bytes;

          partgraph->nodes[newmap[senderID]].comm += cdata.bytes;
          partgraph->nodes[newmap[recverID]].comm += cdata.bytes;

          if(partgraph->nodes[newmap[senderID]].comm > max_comm){
            max_comm = partgraph->nodes[newmap[senderID]].comm;
            max_comm_part = newmap[senderID];
          }
          if(partgraph->nodes[newmap[recverID]].comm > max_comm){
            max_comm = partgraph->nodes[newmap[recverID]].comm;
            max_comm_part = newmap[recverID];
          }

          //bytesComm[newmap[senderID]][newmap[recverID]] += cdata.bytes;
          //bytesComm[newmap[recverID]][newmap[senderID]] += cdata.bytes;
          addedComm[newmap[recverID]]=1;
        }
      }
    }
  }
#endif

  int *proc_mapping = new int[n_pes];

  delete [] addedComm;

  LBtopoFn topofn;

  //Parsing the command line input for getting the processor topology
  char *lbcopy = strdup(_lbtopo);
  char *ptr = strchr(lbcopy, ':');
  if (ptr!=NULL)
    ptr = strtok(lbcopy, ":");
  else
    ptr=lbcopy;

  topofn = LBTopoLookup(ptr);
  if (topofn == NULL) {
    char str[1024];
    CmiPrintf("TopoCentLB> Fatal error: Unknown topology: %s. Choose from:\n", ptr);
    printoutTopo();
    // bounded: the topology name comes from the command line
    snprintf(str, sizeof(str), "TopoCentLB> Fatal error: Unknown topology: %s", ptr);
    CmiAbort(str);
  }
  // `ptr` points into lbcopy, so release the copy only now that the name
  // is no longer needed.
  free(lbcopy);

  topo = topofn(n_pes);

  //Call the core routine to produce the partition processor mapping
  calculateMST(partgraph,topo,proc_mapping,max_comm_part);
  //Returned partition graph is a Maximum Spanning Tree -- converted in above function itself

  //Debugging code: Result of mapping partition graph onto processor graph
  if (_lb_args.debug()>1) {
    CkPrintf("Resultant mapping..(partition,processor)\n");
    for(i = 0; i < n_pes; i++)
      CkPrintf("%d,%d\n",i,proc_mapping[i]);
  }

  //Store the result in the load balancing database
  int pe;
  PartGraph::Node* n;
  for(i = 0; i < n_pes; i++){
    pe = proc_mapping[i];
    n = &partgraph->nodes[i];
    for(j=0;j<n->num_objs;j++){
      stats->to_proc[n->obj_list[j]] = pe;
      if (_lb_args.debug()>1)
        CkPrintf("[%d] Obj %d migrating from %d to %d\n", CkMyPe(),n->obj_list[j],stats->from_proc[n->obj_list[j]],pe);
    }
  }

  delete[] newmap;
  delete[] proc_mapping;
  //Delete hopCount
  for(i = 0; i < n_pes; i++)
    delete[] hopCount[i];

  delete[] hopCount;
  delete[] heapMapping;

  delete partgraph;
}
//! process command line arguments! void TraceCounter::traceInit(char **argv) { CpvInitialize(CountLogPool*, _logPool); CpvInitialize(char*, _logName); CpvInitialize(double, version); CpvInitialize(char**, _counterNames); CpvInitialize(char**, _counterDesc); CpvInitialize(int, _numCounters); CpvInitialize(int, _reductionID); CpvAccess(_logName) = (char *) malloc(strlen(argv[0])+1); _MEMCHECK(CpvAccess(_logName)); strcpy(CpvAccess(_logName), argv[0]); CpvAccess(version) = VER; int i; // parse command line args char* counters = NULL; commandLine_ = NULL; bool badArg = false; int numCounters = 0; if (CmiGetArgStringDesc(argv, "+counters", &counters, "Measure these performance counters")) { if (CmiMyPe()==0) { CmiPrintf("Counters: %s\n", counters); } int offset = 0; int limit = strlen(counters); char* ptr = counters; while (offset < limit && (ptr = strtok(&counters[offset], ",")) != NULL) { offset += strlen(ptr)+1; ptr = &ptr[strlen(ptr)+1]; numCounters++; } if (CmiMyPe()==0) { CmiPrintf("There are %d counters\n", numCounters); } commandLine_ = new CounterArg[numCounters]; ptr = counters; for (i=0; i<numCounters; i++) { commandLine_[i].arg = ptr; if (!matchArg(&commandLine_[i])) { if (CmiMyPe()==0) { CmiPrintf("Bad arg: [%s]\n", ptr); } badArg = true; } ptr = &ptr[strlen(ptr)+1]; } } commandLineSz_ = numCounters; // check to see if args are valid, output if not if (badArg || CmiGetArgFlagDesc(argv, "+count-help", "List available performance counters")) { if (CmiMyPe() == 0) { printHelp(); } ConverseExit(); return; } else if (counters == NULL) { if (CmiMyPe() == 0) { usage(); } ConverseExit(); return; } // get optional command line args overview_ = CmiGetArgFlag(argv, "+count-overview"); switchRandom_ = CmiGetArgFlag(argv, "+count-switchrandom"); switchByPhase_ = CmiGetArgFlag(argv, "+count-switchbyphase"); noLog_ = CmiGetArgFlag(argv, "+count-nolog"); writeByPhase_ = CmiGetArgFlag(argv, "+count-writebyphase"); char* logName = NULL; if (CmiGetArgString(argv, 
"+count-logname", &logName)) { CpvAccess(_logName) = logName; if (noLog_) { if (CkMyPe()==0) { CmiPrintf("+count-logname and +count-nolog are MUTUALLY EXCLUSIVE\n"); usage(); CmiAbort(""); } } } if (switchByPhase_ && overview_) { if (CkMyPe()==0) { CmiPrintf( "+count-switchbyphase and +count-overview are MUTUALLY EXCLUSIVE\n" "+count-overview automatically switches by phase.\n"); usage(); CmiAbort(""); } } if (writeByPhase_ && noLog_) { if (CkMyPe()==0) { CmiPrintf("+count-writebyphase and +count-nolog are MUTUALLY EXCLUSIVE\n"); usage(); CmiAbort(""); } } // parse through commandLine_, figure out which belongs on which list (1 vs 2) CounterArg* last1 = NULL; CounterArg* last2 = NULL; CounterArg* tmp = NULL; counter1Sz_ = counter2Sz_ = 0; for (i=0; i<commandLineSz_; i++) { tmp = &commandLine_[i]; if (tmp->code < NUM_COUNTER_ARGS/2) { if (counter1_ == NULL) { counter1_ = tmp; last1 = counter1_; } else { last1->next = tmp; last1 = tmp; } counter1Sz_++; } else { if (counter2_ == NULL) { counter2_ = tmp; last2 = counter2_; } else { last2->next = tmp; last2 = tmp; } counter2Sz_++; } } if (counter1_ == NULL) { printHelp(); if (CmiMyPe()==0) { CmiPrintf("\nMust specify some counters with code < %d\n", NUM_COUNTER_ARGS/2); } ConverseExit(); } if (counter2_ == NULL) { printHelp(); if (CmiMyPe()==0) { CmiPrintf("\nMust specify some counters with code >= %d\n", NUM_COUNTER_ARGS/2); } ConverseExit(); } last1->next = counter1_; last2->next = counter2_; // all args valid, now set up logging if (CmiMyPe() == 0) { CmiPrintf("Running with tracemode=counter and args:\n"); // print out counter1 set tmp = counter1_; i = 0; do { CmiPrintf(" <counter1-%d>=%d %s %s\n", i, tmp->code, tmp->arg, tmp->desc); tmp = tmp->next; i++; } while (tmp != counter1_); // print out counter2 set tmp = counter2_; i = 0; do { CmiPrintf(" <counter2-%d>=%d %s %s\n", i, tmp->code, tmp->arg, tmp->desc); tmp = tmp->next; i++; } while (tmp != counter2_); CmiPrintf( "+count-overview %d\n+count-switchrandom %d\n" 
"+count-switchbyphase %d\n+count-nolog %d\n" "+count-logname %s\n+count-writebyphase %d\n", overview_, switchRandom_, switchByPhase_, noLog_, logName, writeByPhase_); } // DEBUGF((" DEBUG: Counter1=%d Counter2=%d\n", counter1_, counter2_)); CpvAccess(_logPool) = new CountLogPool(); // allocate names so can do reduction/analysis on the fly char** counterNames = new char*[counter1Sz_+counter2Sz_]; char** counterDesc = new char*[counter1Sz_+counter2Sz_]; tmp = counter1_; for (i=0; i<counter1Sz_; i++) { tmp->index = i; counterNames[i] = tmp->arg; counterDesc[i] = tmp->desc; tmp = tmp->next; } tmp = counter2_; for (i=0; i<counter2Sz_; i++) { tmp->index = counter1Sz_+i; counterNames[counter1Sz_+i] = tmp->arg; counterDesc[counter1Sz_+i] = tmp->desc; tmp = tmp->next; } CpvAccess(_counterNames) = counterNames; CpvAccess(_counterDesc) = counterDesc; CpvAccess(_numCounters) = numCounters; // don't erase counterNames or counterDesc, // the reduction client will do it on the final reduction _MEMCHECK(CpvAccess(_logPool)); CpvAccess(_logPool)->init(numCounters); DEBUGF(("%d/%d DEBUG: Created _logPool at %08x\n", CmiMyPe(), CmiNumPes(), CpvAccess(_logPool))); }
/// ENTRY: Gathers PVT reports; calculates and broadcasts GVT to PVTs void GVT::computeGVT(UpdateMsg *m) { #ifndef CMK_OPTIMIZE if(pose_config.stats) localStats->TimerStart(GVT_TIMER); #endif CProxy_PVT p(ThePVT); CProxy_GVT g(TheGVT); GVTMsg *gmsg = new GVTMsg; POSE_TimeType lastGVT = 0, earliestMsg = POSE_UnsetTS, earlyAny = POSE_UnsetTS; SRentry *tmpSRs = SRs; if (CkMyPe() != 0) startOffset = 1; if (m->runGVTflag == 1) done++; else { // see if message provides new min optGVT or conGVT if ((optGVT < 0) || ((m->optPVT > POSE_UnsetTS) && (m->optPVT < optGVT))) optGVT = m->optPVT; if ((conGVT < 0) || ((m->conPVT > POSE_UnsetTS) && (m->conPVT < conGVT))) conGVT = m->conPVT; if (m->maxSR > earlyAny) earlyAny = m->maxSR; // add send/recv info to SRs /* if (m->numEntries > 0) CkPrintf("GVT recv'd %d SRs from a PE, earliest=%d\n", m->numEntries, m->SRs[0].timestamp);*/ addSR(&SRs, m->SRs, optGVT, m->numEntries); done++; } CkFreeMsg(m); if (done == reportsExpected+startOffset) { // all PVT reports are in #ifndef CMK_OPTIMIZE if(pose_config.stats) localStats->GvtInc(); #endif gvtIterationCount++; done = 0; startOffset = 1; lastGVT = estGVT; // store previous estimate if (lastGVT < 0) lastGVT = 0; estGVT = POSE_UnsetTS; // derive GVT estimate from min optimistic & conservative GVTs estGVT = optGVT; if ((conGVT > POSE_UnsetTS) && (estGVT > POSE_UnsetTS) && (conGVT < estGVT)) estGVT = conGVT; // Check if send/recv activity provides lower possible estimate /* if (SRs) SRs->dump(); else CkPrintf("No SRs reported to GVT!\n");*/ SRentry *tmp = SRs; POSE_TimeType lastSR = POSE_UnsetTS; while (tmp && ((tmp->timestamp <= estGVT) || (estGVT == POSE_UnsetTS))) { lastSR = tmp->timestamp; if (tmp->sends != tmp->recvs) { earliestMsg = tmp->timestamp; break; } tmp = tmp->next; } /* if ((earliestMsg > POSE_UnsetTS) || (earlyAny > POSE_UnsetTS)) CkPrintf("GVT: earlyDiff=%d earlyAny=%d estGVT was %d.\n", earliestMsg, earlyAny, estGVT);*/ if (((earliestMsg < estGVT) && (earliestMsg != 
POSE_UnsetTS)) || (estGVT == POSE_UnsetTS)) estGVT = earliestMsg; if ((lastSR != POSE_UnsetTS) && (estGVT == POSE_UnsetTS) && (lastSR > lastGVT)) estGVT = lastSR; // check for inactivity if ((optGVT == POSE_UnsetTS) && (earliestMsg == POSE_UnsetTS)) { inactive++; /* if (inactive == 1) { CkPrintf("[%d] Inactive... calling CkWaitQD...\n", CkMyPe()); CkWaitQD(); CkPrintf("[%d] Back from CkWaitQD...\n", CkMyPe()); } */ estGVT = lastGVT; if (inactive == 1) inactiveTime = lastGVT; } else if (estGVT < 0) { estGVT = lastGVT; inactive = 0; } else inactive = 0; // check the estimate //CkPrintf("opt=%d con=%d lastGVT=%d early=%d lastSR=%d et=%d\n", optGVT, conGVT, lastGVT, earliestMsg, lastSR, POSE_endtime); CmiAssert(estGVT >= lastGVT); //if (estGVT % 1000 == 0) //CkPrintf("[%d] New GVT = %d\n", CkMyPe(), estGVT); //CkPrintf("[%d] New GVT = %lld\n", CkMyPe(), estGVT); // check for termination conditions int term = 0; if ((estGVT >= POSE_endtime) && (POSE_endtime > POSE_UnsetTS)) { #if USE_LONG_TIMESTAMPS CkPrintf("At endtime: %lld\n", POSE_endtime); #else CkPrintf("At endtime: %d\n", POSE_endtime); #endif term = 1; } else if (inactive > 2) { #if USE_LONG_TIMESTAMPS CkPrintf("Simulation inactive at time: %lld\n", inactiveTime); #else CkPrintf("Simulation inactive at time: %d\n", inactiveTime); #endif term = 1; } // report the last new GVT estimate to all PVT branches gmsg->estGVT = estGVT; gmsg->done = term; if (term) { //if (POSE_endtime > POSE_UnsetTS) gmsg->estGVT = POSE_endtime + 1; // else gmsg->estGVT++; #if USE_LONG_TIMESTAMPS CkPrintf("Final GVT = %lld\n", gmsg->estGVT); #else CkPrintf("Final GVT = %d\n", gmsg->estGVT); #endif p.setGVT(gmsg); POSE_stop(); } else { p.setGVT(gmsg); if(pose_config.lb_on) { // perform load balancing #ifndef CMK_OPTIMIZE if(pose_config.stats) localStats->SwitchTimer(LB_TIMER); #endif if (CkNumPes() > 1) { nextLBstart++; if (pose_config.lb_skip == nextLBstart) { TheLBG.calculateLocalLoad(); nextLBstart = 0; } } #ifndef CMK_OPTIMIZE 
if(pose_config.stats) localStats->SwitchTimer(GVT_TIMER); #endif } // transmit data to start next GVT estimation on next GVT branch UpdateMsg *umsg = new UpdateMsg; umsg->maxSR=0; umsg->optPVT = estGVT; umsg->inactive = inactive; umsg->inactiveTime = inactiveTime; umsg->nextLB = nextLBstart; umsg->runGVTflag = 0; g[(CkMyPe()+1) % CkNumPes()].runGVT(umsg); } // reset static data optGVT = conGVT = POSE_UnsetTS; SRentry *cur = SRs; SRs = NULL; while (cur) { tmp = cur->next; delete cur; cur = tmp; } } #ifndef CMK_OPTIMIZE if(pose_config.stats) localStats->TimerStop(); #endif }
void CentralLB::ReceiveStats(CkMarshalledCLBStatsMessage &msg) { #if CMK_LBDB_ON if (statsMsgsList == NULL) { statsMsgsList = new CLBStatsMsg*[CkNumPes()]; CmiAssert(statsMsgsList != NULL); for(int i=0; i < CkNumPes(); i++) statsMsgsList[i] = 0; } if (statsData == NULL) statsData = new LDStats; // loop through all CLBStatsMsg in the incoming msg int count = msg.getCount(); for (int num = 0; num < count; num++) { CLBStatsMsg *m = msg.getMessage(num); CmiAssert(m!=NULL); const int pe = m->from_pe; DEBUGF(("Stats msg received, %d %d %d %p step %d\n", pe,stats_msg_count,m->n_objs,m,step())); #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_)) /* * if(m->step < step()){ * //TODO: if a processor is redoing an old load balance step.. * //tell it that the step is done and that it should not perform any migrations * thisProxy[pe].ReceiveDummyMigration(); * }*/ #endif if(!CmiNodeAlive(pe)){ DEBUGF(("[%d] ReceiveStats called from invalidProcessor %d\n",CkMyPe(),pe)); continue; } if (m->avail_vector!=NULL) { LBDatabaseObj()->set_avail_vector(m->avail_vector, m->next_lb); } if (statsMsgsList[pe] != 0) { CkPrintf("*** Unexpected CLBStatsMsg in ReceiveStats from PE %d ***\n", pe); } else { statsMsgsList[pe] = m; #if USE_REDUCTION depositData(m); #else // store per processor data right away struct ProcStats &procStat = statsData->procs[pe]; procStat.pe = pe; procStat.total_walltime = m->total_walltime; procStat.idletime = m->idletime; procStat.bg_walltime = m->bg_walltime; #if CMK_LB_CPUTIMER procStat.total_cputime = m->total_cputime; procStat.bg_cputime = m->bg_cputime; #endif procStat.pe_speed = m->pe_speed; //procStat.utilization = 1.0; procStat.available = CmiTrue; procStat.n_objs = m->n_objs; statsData->n_objs += m->n_objs; statsData->n_comm += m->n_comm; #endif #if defined(TEMP_LDB) procStat.pe_temp=m->pe_temp; procStat.pe_speed=m->pe_speed; #endif stats_msg_count++; } } // end of for const int clients = CkNumValidPes(); DEBUGF(("THIS POINT count = %d, clients = 
%d\n",stats_msg_count,clients)); if (stats_msg_count == clients) { DEBUGF(("[%d] All stats messages received \n",CmiMyPe())); statsData->nprocs() = stats_msg_count; thisProxy[CkMyPe()].LoadBalance(); } #endif }
/// Register poser with PVT int PVT::objRegister(int arrIdx, POSE_TimeType safeTime, int sync, sim *myPtr) { int i = objs.Insert(arrIdx, POSE_UnsetTS, sync, myPtr); // add to object list return(i*1000 + CkMyPe()); // return unique PVT idx }
// Unregister poser from PVT void PVT::objRemove(int pvtIdx) { int idx = (pvtIdx-CkMyPe())/1000; // calculate local index from unique index objs.Delete(idx); // delete the object }
/// ENTRY: receive GVT estimate; wake up objects void PVT::setGVT(GVTMsg *m) { #ifndef CMK_OPTIMIZE if(pose_config.stats) localStats->TimerStart(GVT_TIMER); #endif CProxy_PVT p(ThePVT); CkAssert(m->estGVT >= estGVT); estGVT = m->estGVT; int i, end = objs.getNumSpaces(); #ifdef POSE_COMM_ON //PrioStreaming *pstrat = (PrioStreaming *)(POSE_commlib_insthndl.getStrategy()); //pstrat->setBasePriority((estGVT+10) - POSE_TimeMax); //pstrat->setBasePriority(estGVT+10); #endif simdone = m->done; CkFreeMsg(m); waitForFirst = 1; objs.Commit(); objs.StratCalcs(); // sync strategy calculations #ifdef MEM_TEMPORAL localTimePool->set_min_time(estGVT); #endif // Parallel checkpointing: setGVT was broken into two functions, and // beginCheckpoint was added. Only initiate the checkpointing // procedure on PE 0, after commits have occurred. This should // minimize the amount of data written to disk. In order to ensure // a stable state, we wait for quiescence to be reached before // beginning the checkpoint. Inconsistent results were obtained // (possibly from messages still in transit) without this step. // Once quiescence is reached, PE 0 begins the checkpoint, and then // resumes the simulation in resumeAfterCheckpoint. This Callback // function is also the first POSE function to be called when // restarting from a checkpoint. // Checkpoints are initiated approximately every // pose_config.checkpoint_gvt_interval GVT ticks or // pose_config.checkpoint_time_interval seconds (both defined in // pose_config.h). 
if ((CkMyPe() == 0) && (parCheckpointInProgress == 0) && (estGVT > 0) && (((pose_config.checkpoint_gvt_interval > 0) && (estGVT >= (parLastCheckpointGVT + pose_config.checkpoint_gvt_interval))) || ((pose_config.checkpoint_time_interval > 0) && ((CmiWallTimer() + parStartTime) >= (parLastCheckpointTime + (double)pose_config.checkpoint_time_interval))))) { // ensure that everything that can be committed has been objs.CheckpointCommit(); // wait for quiescence to occur before checkpointing eventMsg *dummyMsg = new eventMsg(); CkCallback cb(CkIndex_PVT::beginCheckpoint(dummyMsg), CkMyPe(), ThePVT); parCheckpointInProgress = 1; parLastCheckpointTime = CmiWallTimer() + parStartTime; CkStartQD(cb); } else if ((CkMyPe() == 0) && (parLBInProgress == 0) && (((pose_config.lb_gvt_interval > 0) && (estGVT >= (parLastLBGVT + pose_config.lb_gvt_interval))))) { // wait for quiescence to occur before checkpointing eventMsg *dummyMsg = new eventMsg(); CkCallback cb(CkIndex_PVT::beginLoadbalancing(dummyMsg), CkMyPe(), ThePVT); parLBInProgress = 1; CkStartQD(cb); } else { // skip checkpointing eventMsg *dummyMsg = new eventMsg(); p[CkMyPe()].resumeAfterCheckpoint(dummyMsg); } #ifndef CMK_OPTIMIZE if(pose_config.stats) localStats->TimerStop(); #endif }
/// Basic Constructor PVT::PVT() { #ifdef VERBOSE_DEBUG CkPrintf("[%d] constructing PVT\n",CkMyPe()); #endif CpvInitialize(int, stateRecovery); CpvAccess(stateRecovery) = 0; CpvInitialize(eventID, theEventID); CpvAccess(theEventID)=eventID(); // CpvAccess(theEventID).dump(); //LBTurnInstrumentOff(); optGVT = POSE_UnsetTS; conGVT = POSE_UnsetTS; rdone=0; SRs=NULL; #ifdef POSE_COMM_ON //com_debug = 1; #endif #ifndef CMK_OPTIMIZE localStats = (localStat *)CkLocalBranch(theLocalStats); if (pose_config.stats) { localStats->TimerStart(GVT_TIMER); } #endif #ifdef MEM_TEMPORAL localTimePool = (TimePool *)CkLocalBranch(TempMemID); CkPrintf("NOTE: Temporal memory manager is ON!\n"); #endif optPVT = conPVT = estGVT = POSE_UnsetTS; startPhaseActive = gvtTurn = simdone = 0; SendsAndRecvs = new SRtable(); SendsAndRecvs->Initialize(); specEventCount = eventCount = waitForFirst = 0; iterMin = POSE_UnsetTS; int P=CkNumPes(), N=CkMyPe(); reportReduceTo = -1; if ((N < P-2) && (N%2 == 1)) { //odd reportTo = N-1; reportsExpected = reportEnd = 0; } else if (N < P-2) { //even reportTo = N; reportsExpected = 2; if (N == P-3) reportsExpected = 1; reportEnd = 0; if (N < (P-2)/2) reportReduceTo = P-2; else reportReduceTo = P-1; } if (N == P-2) { reportTo = N; reportEnd = 1; reportsExpected = 1 + (P-2)/4 + ((P-2)%4)/2; } else if (N == P-1) { reportTo = N; reportEnd = 1; if (P==1) reportsExpected = 1; else reportsExpected = 1 + (P-2)/4 + (P-2)%2; } // CkPrintf("PE %d reports to %d, receives %d reports, reduces and sends to %d, and reports directly to GVT if %d = 1!\n", CkMyPe(), reportTo, reportsExpected, reportReduceTo, reportEnd); parCheckpointInProgress = 0; parLastCheckpointGVT = 0; parLastCheckpointTime = parStartTime = 0.0; parLBInProgress = 0; parLastLBGVT = 0; // debugBufferLoc = debugBufferWrapped = debugBufferDumped = 0; #ifndef CMK_OPTIMIZE if(pose_config.stats) localStats->TimerStop(); #endif LBDatabase::Object()->AddMigrationDoneFn(staticDoneLB, this); }
/** This is the main charm setup routine. It's called on all processors after Converse initialization. This routine gets passed to Converse from "main.C". The main purpose of this routine is to set up the objects and Ckpv's used during a regular Charm run. See the comment at the top of the file for overall flow. */ void _initCharm(int unused_argc, char **argv) { int inCommThread = (CmiMyRank() == CmiMyNodeSize()); DEBUGF(("[%d,%.6lf ] _initCharm started\n",CmiMyPe(),CmiWallTimer())); CkpvInitialize(size_t *, _offsets); CkpvAccess(_offsets) = new size_t[32]; CkpvInitialize(PtrQ*,_buffQ); CkpvInitialize(PtrVec*,_bocInitVec); CkpvInitialize(void*, _currentChare); CkpvInitialize(int, _currentChareType); CkpvInitialize(CkGroupID, _currentGroup); CkpvInitialize(void *, _currentNodeGroupObj); CkpvInitialize(CkGroupID, _currentGroupRednMgr); CkpvInitialize(GroupTable*, _groupTable); CkpvInitialize(GroupIDTable*, _groupIDTable); CkpvInitialize(CmiImmediateLockType, _groupTableImmLock); CkpvInitialize(bool, _destroyingNodeGroup); CkpvAccess(_destroyingNodeGroup) = false; CkpvInitialize(UInt, _numGroups); CkpvInitialize(int, _numInitsRecd); CkpvInitialize(int, _initdone); CkpvInitialize(char**, Ck_argv); CkpvAccess(Ck_argv)=argv; CkpvInitialize(MsgPool*, _msgPool); CkpvInitialize(CkCoreState *, _coreState); /* Added for evacuation-sayantan */ #ifndef __BIGSIM__ CpvInitialize(char *,_validProcessors); #endif CkpvInitialize(char ,startedEvac); CpvInitialize(int,serializer); _initChareTables(); // for checkpointable plain chares CksvInitialize(UInt, _numNodeGroups); CksvInitialize(GroupTable*, _nodeGroupTable); CksvInitialize(GroupIDTable, _nodeGroupIDTable); CksvInitialize(CmiImmediateLockType, _nodeGroupTableImmLock); CksvInitialize(CmiNodeLock, _nodeLock); CksvInitialize(PtrVec*,_nodeBocInitVec); CksvInitialize(UInt,_numInitNodeMsgs); CkpvInitialize(int,_charmEpoch); CkpvAccess(_charmEpoch)=0; CksvInitialize(int, _triggersSent); CksvAccess(_triggersSent) = 0; 
CkpvInitialize(_CkOutStream*, _ckout); CkpvInitialize(_CkErrStream*, _ckerr); CkpvInitialize(Stats*, _myStats); CkpvAccess(_groupIDTable) = new GroupIDTable(0); CkpvAccess(_groupTable) = new GroupTable; CkpvAccess(_groupTable)->init(); CkpvAccess(_groupTableImmLock) = CmiCreateImmediateLock(); CkpvAccess(_numGroups) = 1; // make 0 an invalid group number CkpvAccess(_buffQ) = new PtrQ(); CkpvAccess(_bocInitVec) = new PtrVec(); CkpvAccess(_currentNodeGroupObj) = NULL; if(CkMyRank()==0) { CksvAccess(_numNodeGroups) = 1; //make 0 an invalid group number CksvAccess(_numInitNodeMsgs) = 0; CksvAccess(_nodeLock) = CmiCreateLock(); CksvAccess(_nodeGroupTable) = new GroupTable(); CksvAccess(_nodeGroupTable)->init(); CksvAccess(_nodeGroupTableImmLock) = CmiCreateImmediateLock(); CksvAccess(_nodeBocInitVec) = new PtrVec(); } CkCallbackInit(); CmiNodeAllBarrier(); #if ! CMK_BIGSIM_CHARM initQd(argv); // bigsim calls it in ConverseCommonInit #endif CkpvAccess(_coreState)=new CkCoreState(); CkpvAccess(_numInitsRecd) = 0; CkpvAccess(_initdone) = 0; CkpvAccess(_ckout) = new _CkOutStream(); CkpvAccess(_ckerr) = new _CkErrStream(); _charmHandlerIdx = CkRegisterHandler((CmiHandler)_bufferHandler); _initHandlerIdx = CkRegisterHandler((CmiHandler)_initHandler); CkNumberHandlerEx(_initHandlerIdx, (CmiHandlerEx)_initHandler, CkpvAccess(_coreState)); _roRestartHandlerIdx = CkRegisterHandler((CmiHandler)_roRestartHandler); _exitHandlerIdx = CkRegisterHandler((CmiHandler)_exitHandler); //added for interoperabilitY _libExitHandlerIdx = CkRegisterHandler((CmiHandler)_libExitHandler); _bocHandlerIdx = CkRegisterHandler((CmiHandler)_initHandler); CkNumberHandlerEx(_bocHandlerIdx, (CmiHandlerEx)_initHandler, CkpvAccess(_coreState)); #ifdef __BIGSIM__ if(BgNodeRank()==0) #endif _infoIdx = CldRegisterInfoFn((CldInfoFn)_infoFn); _triggerHandlerIdx = CkRegisterHandler((CmiHandler)_triggerHandler); _ckModuleInit(); CldRegisterEstimator((CldEstimator)_charmLoadEstimator); _futuresModuleInit(); // part 
of futures implementation is a converse module _loadbalancerInit(); _metabalancerInit(); #if CMK_MEM_CHECKPOINT init_memcheckpt(argv); #endif initCharmProjections(); #if CMK_TRACE_IN_CHARM // initialize trace module in ck traceCharmInit(argv); #endif CkpvInitialize(int, envelopeEventID); CkpvAccess(envelopeEventID) = 0; CkMessageWatcherInit(argv,CkpvAccess(_coreState)); /** The rank-0 processor of each node calls the translator-generated "_register" routines. _register routines call the charm.h "CkRegister*" routines, which record function pointers and class information for all Charm entities, like Chares, Arrays, and readonlies. There's one _register routine generated for each .ci file. _register routines *must* be called in the same order on every node, and *must not* be called by multiple threads simultaniously. */ #ifdef __BIGSIM__ if(BgNodeRank()==0) #else if(CkMyRank()==0) #endif { SDAG::registerPUPables(); CmiArgGroup("Charm++",NULL); _parseCommandLineOpts(argv); _registerInit(); CkRegisterMsg("System", 0, 0, CkFreeMsg, sizeof(int)); CkRegisterChareInCharm(CkRegisterChare("null", 0, TypeChare)); CkIndex_Chare::__idx=CkRegisterChare("Chare", sizeof(Chare), TypeChare); CkRegisterChareInCharm(CkIndex_Chare::__idx); CkIndex_Group::__idx=CkRegisterChare("Group", sizeof(Group), TypeGroup); CkRegisterChareInCharm(CkIndex_Group::__idx); CkRegisterEp("null", (CkCallFnPtr)_nullFn, 0, 0, 0+CK_EP_INTRINSIC); /** These _register calls are for the built-in Charm .ci files, like arrays and load balancing. If you add a .ci file to charm, you'll have to add a call to the _register routine here, or make your library into a "-module". */ _registerCkFutures(); _registerCkArray(); _registerLBDatabase(); _registerMetaBalancer(); _registerCkCallback(); _registertempo(); _registerwaitqd(); _registerCkCheckpoint(); #if CMK_MEM_CHECKPOINT _registerCkMemCheckpoint(); #endif /* Setup Control Point Automatic Tuning Framework. 
By default it is enabled as a part of charm, however it won't enable its tracing module unless a +CPEnableMeasurements command line argument is specified. See trace-common.C for more info Thus there should be no noticable overhead to always having the control point framework linked in. */ #if CMK_WITH_CONTROLPOINT _registerPathHistory(); _registerControlPoints(); _registerTraceControlPoints(); #endif /** CkRegisterMainModule is generated by the (unique) "mainmodule" .ci file. It will include calls to register all the .ci files. */ CkRegisterMainModule(); /** _registerExternalModules is actually generated by charmc at link time (as "moduleinit<pid>.C"). This generated routine calls the _register functions for the .ci files of libraries linked using "-module". This funny initialization is most useful for AMPI/FEM programs, which don't have a .ci file and hence have no other way to control the _register process. */ _registerExternalModules(argv); _registerDone(); } /* The following will happen on every virtual processor in BigEmulator, not just on once per real processor */ if (CkMyRank() == 0) { CpdBreakPointInit(); } CmiNodeAllBarrier(); // Execute the initcalls registered in modules _initCallTable.enumerateInitCalls(); #if CMK_CHARMDEBUG CpdFinishInitialization(); #endif //CmiNodeAllBarrier(); CkpvAccess(_myStats) = new Stats(); CkpvAccess(_msgPool) = new MsgPool(); CmiNodeAllBarrier(); #if !(__FAULT__) CmiBarrier(); CmiBarrier(); CmiBarrier(); #endif #if CMK_SMP_TRACE_COMMTHREAD _TRACE_BEGIN_COMPUTATION(); #else if (!inCommThread) { _TRACE_BEGIN_COMPUTATION(); } #endif #ifdef ADAPT_SCHED_MEM if(CkMyRank()==0){ memCriticalEntries = new int[numMemCriticalEntries]; int memcnt=0; for(int i=0; i<_entryTable.size(); i++){ if(_entryTable[i]->isMemCritical){ memCriticalEntries[memcnt++] = i; } } } #endif #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_)) _messageLoggingInit(); #endif #ifndef __BIGSIM__ /* FAULT_EVAC */ CpvAccess(_validProcessors) = new 
char[CkNumPes()]; for(int vProc=0;vProc<CkNumPes();vProc++){ CpvAccess(_validProcessors)[vProc]=1; } _ckEvacBcastIdx = CkRegisterHandler((CmiHandler)_ckEvacBcast); _ckAckEvacIdx = CkRegisterHandler((CmiHandler)_ckAckEvac); #endif CkpvAccess(startedEvac) = 0; CpvAccess(serializer) = 0; evacuate = 0; CcdCallOnCondition(CcdSIGUSR1,(CcdVoidFn)CkDecideEvacPe,0); #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_)) CcdCallOnCondition(CcdSIGUSR2,(CcdVoidFn)CkMlogRestart,0); #endif if(_raiseEvac){ processRaiseEvacFile(_raiseEvacFile); /* if(CkMyPe() == 2){ // CcdCallOnConditionKeep(CcdPERIODIC_10s,(CcdVoidFn)CkDecideEvacPe,0); CcdCallFnAfter((CcdVoidFn)CkDecideEvacPe, 0, 10000); } if(CkMyPe() == 3){ CcdCallFnAfter((CcdVoidFn)CkDecideEvacPe, 0, 10000); }*/ } if (CkMyRank() == 0) { TopoManager_init(); } CmiNodeAllBarrier(); if (!_replaySystem) { CkFtFn faultFunc_restart = CkRestartMain; if (faultFunc == NULL || faultFunc == faultFunc_restart) { // this is not restart from memory // these two are blocking calls for non-bigsim #if ! CMK_BIGSIM_CHARM CmiInitCPUAffinity(argv); CmiInitMemAffinity(argv); #endif } CmiInitCPUTopology(argv); #if CMK_SHARED_VARS_POSIX_THREADS_SMP if (CmiCpuTopologyEnabled()) { int *pelist; int num; CmiGetPesOnPhysicalNode(0, &pelist, &num); #if !CMK_MULTICORE && !CMK_SMP_NO_COMMTHD // Count communication threads, if present // XXX: Assuming uniformity of node size here num += num/CmiMyNodeSize(); #endif if (!_Cmi_forceSpinOnIdle && num > CmiNumCores()) { if (CmiMyPe() == 0) CmiPrintf("\nCharm++> Warning: the number of SMP threads (%d) is greater than the number of physical cores (%d), so threads will sleep while idling. Use +CmiSpinOnIdle or +CmiSleepOnIdle to control this directly.\n\n", num, CmiNumCores()); CmiLock(CksvAccess(_nodeLock)); if (! 
_Cmi_sleepOnIdle) _Cmi_sleepOnIdle = 1; CmiUnlock(CksvAccess(_nodeLock)); } } #endif } if(CmiMyPe() == 0) { char *topoFilename; if(CmiGetArgStringDesc(argv,"+printTopo",&topoFilename,"topo file name")) { std::stringstream sstm; sstm << topoFilename << "." << CmiMyPartition(); std::string result = sstm.str(); FILE *fp; fp = fopen(result.c_str(), "w"); if (fp == NULL) { CkPrintf("Error opening %s file, writing to stdout\n", topoFilename); fp = stdout; } TopoManager_printAllocation(fp); fclose(fp); } } #if CMK_USE_PXSHM && ( CMK_CRAYXE || CMK_CRAYXC ) && CMK_SMP // for SMP on Cray XE6 (hopper) it seems pxshm has to be initialized // again after cpuaffinity is done if (CkMyRank() == 0) { CmiInitPxshm(argv); } CmiNodeAllBarrier(); #endif //CldCallback(); #if CMK_BIGSIM_CHARM && CMK_CHARMDEBUG // Register the BG handler for CCS. Notice that this is put into a variable shared by // the whole real processor. This because converse needs to find it. We check that all // virtual processors register the same index for this handler. 
CpdBgInit(); #endif if (faultFunc) { #if CMK_WITH_STATS if (CkMyPe()==0) _allStats = new Stats*[CkNumPes()]; #endif if (!inCommThread) { CkArgMsg *msg = (CkArgMsg *)CkAllocMsg(0, sizeof(CkArgMsg), 0); msg->argc = CmiGetArgc(argv); msg->argv = argv; faultFunc(_restartDir, msg); CkFreeMsg(msg); } }else if(CkMyPe()==0){ #if CMK_WITH_STATS _allStats = new Stats*[CkNumPes()]; #endif register size_t i, nMains=_mainTable.size(); for(i=0;i<nMains;i++) /* Create all mainchares */ { register int size = _chareTable[_mainTable[i]->chareIdx]->size; register void *obj = malloc(size); _MEMCHECK(obj); _mainTable[i]->setObj(obj); CkpvAccess(_currentChare) = obj; CkpvAccess(_currentChareType) = _mainTable[i]->chareIdx; register CkArgMsg *msg = (CkArgMsg *)CkAllocMsg(0, sizeof(CkArgMsg), 0); msg->argc = CmiGetArgc(argv); msg->argv = argv; _entryTable[_mainTable[i]->entryIdx]->call(msg, obj); #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_)) CpvAccess(_currentObj) = (Chare *)obj; #endif } _mainDone = 1; _STATS_RECORD_CREATE_CHARE_N(nMains); _STATS_RECORD_PROCESS_CHARE_N(nMains); for(i=0;i<_readonlyMsgs.size();i++) /* Send out readonly messages */ { register void *roMsg = (void *) *((char **)(_readonlyMsgs[i]->pMsg)); if(roMsg==0) continue; //Pack the message and send it to all other processors register envelope *env = UsrToEnv(roMsg); env->setSrcPe(CkMyPe()); env->setMsgtype(ROMsgMsg); env->setRoIdx(i); CmiSetHandler(env, _initHandlerIdx); CkPackMessage(&env); CmiSyncBroadcast(env->getTotalsize(), (char *)env); CpvAccess(_qd)->create(CkNumPes()-1); //For processor 0, unpack and re-set the global CkUnpackMessage(&env); _processROMsgMsg(env); _numInitMsgs++; } //Determine the size of the RODataMessage PUP::sizer ps; for(i=0;i<_readonlyTable.size();i++) _readonlyTable[i]->pupData(ps); //Allocate and fill out the RODataMessage envelope *env = _allocEnv(RODataMsg, ps.size()); PUP::toMem pp((char *)EnvToUsr(env)); for(i=0;i<_readonlyTable.size();i++) _readonlyTable[i]->pupData(pp); 
env->setCount(++_numInitMsgs); env->setSrcPe(CkMyPe()); CmiSetHandler(env, _initHandlerIdx); DEBUGF(("[%d,%.6lf] RODataMsg being sent of size %d \n",CmiMyPe(),CmiWallTimer(),env->getTotalsize())); CmiSyncBroadcastAndFree(env->getTotalsize(), (char *)env); CpvAccess(_qd)->create(CkNumPes()-1); _initDone(); } DEBUGF(("[%d,%d%.6lf] inCommThread %d\n",CmiMyPe(),CmiMyRank(),CmiWallTimer(),inCommThread)); // when I am a communication thread, I don't participate initDone. if (inCommThread) { CkNumberHandlerEx(_bocHandlerIdx,(CmiHandlerEx)_processHandler, CkpvAccess(_coreState)); CkNumberHandlerEx(_charmHandlerIdx,(CmiHandlerEx)_processHandler , CkpvAccess(_coreState)); _processBufferedMsgs(); } #if CMK_CHARMDEBUG // Should not use CpdFreeze inside a thread (since this processor is really a user-level thread) if (CpvAccess(cpdSuspendStartup)) { //CmiPrintf("In Parallel Debugging mode .....\n"); CpdFreeze(); } #endif #if __FAULT__ if(killFlag){ readKillFile(); } #endif }
/// Constructor: announces on ckout which processor this chare is running
/// on, then stores pi in y.
simple(double pi)
{
  ckout << "I am a simple chare running from processor:";
  ckout << CkMyPe();
  ckout << endl;
  y = pi;
};
/// Runs one centralized load-balancing step (compiled only when CMK_LBDB_ON):
/// prepares statsData, calls Strategy() to obtain a migration message, fills
/// in its availability vector and next balancer, optionally prints load
/// summaries before and after, broadcasts the decision through
/// ReceiveMigration, and finally clears the statistics for the next cycle.
void CentralLB::LoadBalance()
{
#if CMK_LBDB_ON
  const int numPes = CkNumPes();

#if ! USE_REDUCTION
  // build data
  buildStats();
#else
  for (int pe = 0; pe < numPes; pe++) {
    statsMsgsList[pe] = NULL;
  }
#endif

  theLbdb->ResetAdaptive();
  if (!_lb_args.samePeSpeed()) {
    statsData->normalize_speed();
  }

  if (_lb_args.debug()) {
    CmiPrintf("\nCharmLB> %s: PE [%d] step %d starting at %f Memory: %f MB\n",
              lbname, cur_ld_balancer, step(), start_lb_time,
              CmiMemoryUsage()/(1024.0*1024.0));
  }

  // if we are in simulation mode read data
  if (LBSimulation::doSimulation) {
    simulationRead();
  }

  // copy the per-PE availability flags into the statistics
  char *availVector = LBDatabaseObj()->availVector();
  for (int pe = 0; pe < numPes; pe++) {
    statsData->procs[pe].available = (CmiBool)availVector[pe];
  }

  preprocess(statsData);

//    CkPrintf("Before Calling Strategy\n");

  if (_lb_args.printSummary()) {
    LBInfo info(numPes);
    // not take comm data
    info.getInfo(statsData, numPes, 0);
    LBRealType mLoad, mCpuLoad, totalLoad;
    info.getSummary(mLoad, mCpuLoad, totalLoad);
    int nmsgs, nbytes;
    statsData->computeNonlocalComm(nmsgs, nbytes);
    CkPrintf("[%d] Load Summary (before LB): max (with bg load): %f max (obj only): %f average: %f at step %d nonlocal: %d msgs %.2fKB.\n", CkMyPe(), mLoad, mCpuLoad, totalLoad/numPes, step(), nmsgs, 1.0*nbytes/1024);
//      if (_lb_args.debug() > 1) {
//        for (int i=0; i<statsData->n_objs; i++)
//          CmiPrintf("[%d] %.10f %.10f\n", i, statsData->objData[i].minWall, statsData->objData[i].maxWall);
//      }
  }

#if CMK_REPLAYSYSTEM
  // In replay mode, remember the ldb handle seen for each source PE so it
  // can be written back into the moves once the message has been handled.
  LDHandle *loadBalancer_pointers;
  if (_replaySystem) {
    loadBalancer_pointers = (LDHandle*)malloc(CkNumPes()*sizeof(LDHandle));
    for (int obj = 0; obj < statsData->n_objs; ++obj) {
      loadBalancer_pointers[statsData->from_proc[obj]] =
          statsData->objData[obj].handle.omhandle.ldb;
    }
  }
#endif

  LBMigrateMsg* migrateMsg = Strategy(statsData);
#if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
  migrateMsg->step = step();
#endif

#if CMK_REPLAYSYSTEM
  CpdHandleLBMessage(&migrateMsg);
  if (_replaySystem) {
    for (int mv = 0; mv < migrateMsg->n_moves; ++mv) {
      migrateMsg->moves[mv].obj.omhandle.ldb =
          loadBalancer_pointers[migrateMsg->moves[mv].from_pe];
    }
    free(loadBalancer_pointers);
  }
#endif

  LBDatabaseObj()->get_avail_vector(migrateMsg->avail_vector);
  migrateMsg->next_lb = LBDatabaseObj()->new_lbbalancer();

  // if this is the step at which we need to dump the database
  simulationWrite();

  // calculate predicted load
  // very time consuming though, so only happen when debugging is on
  if (_lb_args.printSummary()) {
    LBInfo info(numPes);
    // not take comm data
    getPredictedLoadWithMsg(statsData, numPes, migrateMsg, info, 0);
    LBRealType mLoad, mCpuLoad, totalLoad;
    info.getSummary(mLoad, mCpuLoad, totalLoad);
    int nmsgs, nbytes;
    statsData->computeNonlocalComm(nmsgs, nbytes);
    CkPrintf("[%d] Load Summary (after LB): max (with bg load): %f max (obj only): %f average: %f at step %d nonlocal: %d msgs %.2fKB useMem: %.2fKB.\n", CkMyPe(), mLoad, mCpuLoad, totalLoad/numPes, step(), nmsgs, 1.0*nbytes/1024, (1.0*useMem())/1024);
    for (int pe = 0; pe < numPes; pe++) {
      migrateMsg->expectedLoad[pe] = info.peLoads[pe];
    }
  }

  DEBUGF(("[%d]calling recv migration\n",CkMyPe()));
#if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
  lbDecisionCount++;
  migrateMsg->lbDecisionCount = lbDecisionCount;
#endif

  envelope *env = UsrToEnv(migrateMsg);  // not read below
  if (1) {
    // broadcast
    thisProxy.ReceiveMigration(migrateMsg);
  }
  else {
    // split the migration for each processor
    for (int p = 0; p < CkNumPes(); p++) {
      LBMigrateMsg *perPeMsg = extractMigrateMsg(migrateMsg, p);
      thisProxy[p].ReceiveMigration(perPeMsg);
    }
    delete migrateMsg;
  }

  // Zero out data structures for next cycle
  // CkPrintf("zeroing out data\n");
  statsData->clear();
  stats_msg_count=0;
#endif
}
int FEM_master_parallel_part(int fem_mesh,int masterRank,FEM_Comm_t comm_context){ const char *caller="FEM_Create_connmsa"; FEMAPI(caller); FEM_chunk *c=FEM_chunk::get(caller); FEM_Mesh *m=c->lookup(fem_mesh,caller); m->setAbsoluteGlobalno(); int nelem = m->nElems(); int numChunks; MPI_Comm_size((MPI_Comm)comm_context,&numChunks); printf("master -> number of elements %d \n",nelem); DEBUG(m->print(0)); /*load the connectivity information into the eptr and eind datastructure. It will be read by the other slave elements and used to call parmetis*/ MSA1DINT eptrMSA(nelem,numChunks); MSA1DINT eindMSA(nelem*10,numChunks); /* after the msa array has been created and loaded with connectivity data tell the slaves about the msa array */ struct conndata data; data.nelem = nelem; data.nnode = m->node.size(); data.arr1 = eptrMSA; data.arr2 = eindMSA; MPI_Bcast_pup(data,masterRank,(MPI_Comm)comm_context); eptrMSA.enroll(numChunks); eindMSA.enroll(numChunks); MSA1DINT::Write wPtr = eptrMSA.getInitialWrite(); MSA1DINT::Write wInd = eindMSA.getInitialWrite(); int indcount=0,ptrcount=0; for(int t=0;t<m->elem.size();t++){ if(m->elem.has(t)){ FEM_Elem &k=m->elem[t]; for(int e=0;e<k.size();e++){ wPtr.set(ptrcount)=indcount; ptrcount++; for(int n=0;n<k.getNodesPer();n++){ wInd.set(indcount)=k.getConn(e,n); indcount++; } } } } wPtr.set(ptrcount) = indcount; printf("master -> ptrcount %d indcount %d sizeof(MSA1DINT) %d sizeof(MSA1DINTLIST) %d memory %d\n",ptrcount,indcount,sizeof(MSA1DINT),sizeof(MSA1DINTLIST),CmiMemoryUsage()); /* break up the mesh such that each chunk gets the same number of elements and the nodes corresponding to those elements. However this is not the partition. This is just distributing the data, so that when partition is done using parmetis all the requests for data do not go to chunk 0. 
Instead after partition each chunk can send the element and node data to the chunks that will need it */ FEM_Mesh *mesh_array=FEM_break_mesh(m,ptrcount,numChunks); /* Send the broken up meshes to the different chunks. */ sendBrokenMeshes(mesh_array,comm_context); delete [] mesh_array; FEM_Mesh mypiece; MPI_Recv_pup(mypiece,masterRank,MESH_CHUNK_TAG,(MPI_Comm)comm_context); /* call parmetis */ double parStartTime = CkWallTimer(); MSA1DINT::Read rPtr = wPtr.syncToRead(); MSA1DINT::Read rInd = wInd.syncToRead(); printf("starting FEM_call_parmetis \n"); struct partconndata *partdata = FEM_call_parmetis(data.nelem, rPtr, rInd, comm_context); printf("done with parmetis %d FEM_Mesh %d in %.6lf \n",CmiMemoryUsage(),sizeof(FEM_Mesh),CkWallTimer()-parStartTime); double dataArrangeStartTime = CkWallTimer(); /* Set up a msa to store the partitions to which a node belongs. A node can belong to multiple partitions. */ int totalNodes = m->node.size(); MSA1DINTLIST nodepart(totalNodes,numChunks); MPI_Bcast_pup(nodepart,masterRank,(MPI_Comm)comm_context); nodepart.enroll(numChunks); MSA1DINTLIST::Accum nodepartAcc = nodepart.getInitialAccum(); FEM_write_nodepart(nodepartAcc,partdata,(MPI_Comm)comm_context); printf("Creating mapping of node to partition took %.6lf\n",CkWallTimer()-dataArrangeStartTime); dataArrangeStartTime = CkWallTimer(); MSA1DINTLIST::Read nodepartRead = nodepartAcc.syncToRead(); /* Set up a msa to store the nodes that belong to a partition */ MSA1DNODELIST part2node(numChunks,numChunks); MPI_Bcast_pup(part2node,masterRank,(MPI_Comm)comm_context); part2node.enroll(numChunks); MSA1DNODELIST::Accum part2nodeAcc = part2node.getInitialAccum(); FEM_write_part2node(nodepartRead, part2nodeAcc, partdata, (MPI_Comm)comm_context); /* Get the list of elements and nodes that belong to this partition */ MSA1DNODELIST::Read rPart2node = part2nodeAcc.syncToRead(); NodeList lnodes = rPart2node.get(masterRank); lnodes.uniquify(); // IntList lelems = part2elem.get(masterRank); 
printf("Creating mapping of partition to node took %.6lf\n",CkWallTimer()-dataArrangeStartTime); printf("Time spent doing +=ElemList %.6lf \n",elemlistaccTime); dataArrangeStartTime = CkWallTimer(); /* Build an MSA of FEM_Mesh, with each index containing the mesh for that chunk */ MSA1DFEMMESH part2mesh(numChunks,numChunks); MPI_Bcast_pup(part2mesh,masterRank,(MPI_Comm)comm_context); part2mesh.enroll(numChunks); MSA1DFEMMESH::Accum aPart2mesh = part2mesh.getInitialAccum(); FEM_write_part2mesh(aPart2mesh,partdata, &data,nodepartRead,numChunks,masterRank,&mypiece); /* Get your mesh consisting of elements and nodes out of the mesh MSA */ MSA1DFEMMESH::Read rPart2mesh = aPart2mesh.syncToRead(); MeshElem me = rPart2mesh.get(masterRank); //printf("[%d] Number of elements in my partitioned mesh %d number of nodes %d \n",masterRank,me.m->nElems(),me.m->node.size()); DEBUG(printf("[%d] Memory usage on vp 0 close to max %d \n",CkMyPe(),CmiMemoryUsage())); //Free up the eptr and eind MSA arrays stored in data delete &rPtr; delete &rInd; data.arr1.FreeMem(); data.arr2.FreeMem(); nodepart.FreeMem(); DEBUG(printf("[%d] Memory usage on vp 0 after FreeMem %d \n",CkMyPe(),CmiMemoryUsage())); addIDXLists(me.m,lnodes,masterRank); part2node.FreeMem(); DEBUG(printf("[%d] Memory usage on vp 0 after addIDXL %d \n",CkMyPe(),CmiMemoryUsage())); /* Broadcast the user data to all the meshes */ DEBUG(printf("[%d] Length of udata vector in master %d \n",masterRank,m->udata.size())); MPI_Bcast_pup(m->udata,masterRank,(MPI_Comm)comm_context); me.m->udata = m->udata; delete partdata; printf("[%d] Data Arrangement took %.6lf \n",masterRank,CkWallTimer()-dataArrangeStartTime); /* collect the ghost data and send it to all the chunks. 
*/ struct ghostdata *gdata = gatherGhosts(); DEBUG(printf("[%d] number of ghost layers %d \n",masterRank,gdata->numLayers)); MPI_Bcast_pup(*gdata,masterRank,(MPI_Comm)comm_context); /* make ghosts for this mesh */ printf("[%d] Starting to generate number of ghost layers %d \n",masterRank,gdata->numLayers); double _startTime = CkWallTimer(); makeGhosts(me.m,(MPI_Comm)comm_context,masterRank,gdata->numLayers,gdata->layers); delete gdata; printf("[%d] Ghost generation took %.6lf \n",masterRank,CkWallTimer()-_startTime); me.m->becomeGetting(); FEM_chunk *chunk = FEM_chunk::get("FEM_Mesh_Parallel_broadcast"); int tempMeshNo = chunk->meshes.put(me.m); int new_mesh = FEM_Mesh_copy(tempMeshNo); FEM_Mesh *nmesh = c->lookup(new_mesh,"master_parallel_broadcast"); DEBUG(printf("[%d] Length of udata vector in master new_mesh %d \n",masterRank,nmesh->udata.size())); part2mesh.FreeMem(); printf("[%d] Max Memory usage on vp 0 at end of parallel partition %d \n",CkMyPe(),CmiMaxMemoryUsage()); return new_mesh; }
void CentralLB::ProcessReceiveMigration(CkReductionMsg *msg) { #if CMK_LBDB_ON int i; LBMigrateMsg *m = storedMigrateMsg; CmiAssert(m!=NULL); delete msg; #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_)) int *dummyCounts; DEBUGF(("[%d] Starting ReceiveMigration WITH step %d m->step %d\n",CkMyPe(),step(),m->step)); // CmiPrintf("[%d] Starting ReceiveMigration step %d m->step %d\n",CkMyPe(),step(),m->step); if(step() > m->step){ char str[100]; envelope *env = UsrToEnv(m); return; } lbDecisionCount = m->lbDecisionCount; #endif if (_lb_args.debug() > 1) if (CkMyPe()%1024==0) CmiPrintf("[%d] Starting ReceiveMigration step %d at %f\n",CkMyPe(),step(), CmiWallTimer()); for (i=0; i<CkNumPes(); i++) theLbdb->lastLBInfo.expectedLoad[i] = m->expectedLoad[i]; CmiAssert(migrates_expected <= 0 || migrates_completed == migrates_expected); /*FAULT_EVAC*/ if(!CmiNodeAlive(CkMyPe())){ delete m; return; } migrates_expected = 0; future_migrates_expected = 0; #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_)) int sending=0; int dummy=0; LBDB *_myLBDB = theLbdb->getLBDB(); if(_restartFlag){ dummyCounts = new int[CmiNumPes()]; bzero(dummyCounts,sizeof(int)*CmiNumPes()); } #endif for(i=0; i < m->n_moves; i++) { MigrateInfo& move = m->moves[i]; const int me = CkMyPe(); if (move.from_pe == me && move.to_pe != me) { DEBUGF(("[%d] migrating object to %d\n",move.from_pe,move.to_pe)); // migrate object, in case it is already gone, inform toPe #if (!defined(_FAULT_MLOG_) && !defined(_FAULT_CAUSAL_)) if (theLbdb->Migrate(move.obj,move.to_pe) == 0) thisProxy[move.to_pe].MissMigrate(!move.async_arrival); #else if(_restartFlag == 0){ DEBUG(CmiPrintf("[%d] need to move object from %d to %d \n",CkMyPe(),move.from_pe,move.to_pe)); theLbdb->Migrate(move.obj,move.to_pe); sending++; }else{ if(_myLBDB->validObjHandle(move.obj)){ DEBUG(CmiPrintf("[%d] need to move object from %d to %d \n",CkMyPe(),move.from_pe,move.to_pe)); theLbdb->Migrate(move.obj,move.to_pe); sending++; }else{ 
DEBUG(CmiPrintf("[%d] dummy move to pe %d detected after restart \n",CmiMyPe(),move.to_pe)); dummyCounts[move.to_pe]++; dummy++; } } #endif } else if (move.from_pe != me && move.to_pe == me) { DEBUGF(("[%d] expecting object from %d\n",move.to_pe,move.from_pe)); if (!move.async_arrival) migrates_expected++; else future_migrates_expected++; } else { #if CMK_GLOBAL_LOCATION_UPDATE UpdateLocation(move); #endif } } DEBUGF(("[%d] in ReceiveMigration %d moves expected: %d future expected: %d\n",CkMyPe(),m->n_moves, migrates_expected, future_migrates_expected)); // if (_lb_debug) CkPrintf("[%d] expecting %d objects migrating.\n", CkMyPe(), migrates_expected); #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_)) if(_restartFlag){ sendDummyMigrationCounts(dummyCounts); _restartFlag =0; delete []dummyCounts; } #endif #if 0 if (m->n_moves ==0) { theLbdb->SetLBPeriod(theLbdb->GetLBPeriod()*2); } #endif cur_ld_balancer = m->next_lb; if((CkMyPe() == cur_ld_balancer) && (cur_ld_balancer != 0)){ LBDatabaseObj()->set_avail_vector(m->avail_vector, -2); } if (migrates_expected == 0 || migrates_completed == migrates_expected) MigrationDone(1); delete m; // CkEvacuatedElement(); #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_)) // migrates_expected = 0; // // ResumeClients(1); #endif #endif }
Hello(int _aNum, CkGroupID mcastMgrGID): aNum(_aNum), mcastMgr(NULL), isCookieSet(false) { CkPrintf("Array %d, Element %d created on PE %d\n", aNum, thisIndex, CkMyPe()); mcastMgr = CProxy_CkMulticastMgr(mcastMgrGID).ckLocalBranch(); }
/// Assigns every object whose `migratable` flag is false to the PE recorded
/// in its `from_pe` field, printing a notice for each one when the LB debug
/// level is above 1.
void GridCommLB::Map_NonMigratable_Objects_To_PEs ()
{
  for (int obj = 0; obj < Num_Objects; obj++) {
    if (Object_Data[obj].migratable) {
      continue;  // migratable objects are not handled here
    }

    if (_lb_args.debug() > 1) {
      CkPrintf ("[%d] GridCommLB identifies object %d as non-migratable.\n", CkMyPe(), obj);
    }

    Assign_Object_To_PE (obj, Object_Data[obj].from_pe);
  }
}
/// Constructor: forwards the options to CentralLB, sets the balancer name,
/// and announces creation on PE 0 only.
OrbLB::OrbLB(const CkLBOptions &opt)
  : CentralLB(opt)
{
  lbname = "OrbLB";

  const bool onPE0 = (CkMyPe() == 0);
  if (onPE0) {
    CkPrintf("[%d] OrbLB created\n",CkMyPe());
  }
}
/************************************************************************** ** This method locates the minimum WAN PE in terms of number of objects ** that communicate with objects across a wide-area connection. The search ** is constrained to PEs within the specified cluster. ** ** In the event of a "tie" (i.e., the number of WAN objects on a candidate ** PE is equal to the minimum number of WAN objects discovered so far) the ** tie is broken by considering the scaled CPU loads on the PEs. The PE ** with the smaller scaled load is the better candidate. In the event of ** a secondary tie, the secondary tie is broken by considering the number ** of LAN objects on the two PEs. ** ** The method returns -1 if no matching PE is found. */ int GridCommLB::Find_Minimum_PE (int cluster) { if (CK_LDB_GridCommLB_Mode == 0) { int min_index; int min_objs; int i; min_index = -1; min_objs = MAXINT; for (i = 0; i < Num_PEs; i++) { if (((&PE_Data[i])->available) && ((&PE_Data[i])->cluster == cluster)) { if ((&PE_Data[i])->num_objs < min_objs) { min_index = i; min_objs = (&PE_Data[i])->num_objs; } else if (((&PE_Data[i])->num_objs == min_objs) && ((&PE_Data[i])->num_wan_objs < (&PE_Data[min_index])->num_wan_objs)) { min_index = i; } else if (((&PE_Data[i])->num_objs == min_objs) && ((&PE_Data[i])->num_wan_objs == (&PE_Data[min_index])->num_wan_objs) && ((&PE_Data[i])->num_wan_msgs < (&PE_Data[min_index])->num_wan_msgs)) { min_index = i; } else if (((&PE_Data[i])->num_objs == min_objs) && ((&PE_Data[i])->num_wan_objs == (&PE_Data[min_index])->num_wan_objs) && ((&PE_Data[i])->num_wan_msgs == (&PE_Data[min_index])->num_wan_msgs) && ((&PE_Data[i])->scaled_load < (&PE_Data[min_index])->scaled_load)) { min_index = i; } } } return (min_index); } else if (CK_LDB_GridCommLB_Mode == 1) { int min_index; int min_load_index; double min_scaled_load; int min_wan_msgs_index; int min_wan_msgs; double load_tolerance; int i; min_index = -1; min_load_index = -1; min_scaled_load = MAXDOUBLE; 
min_wan_msgs_index = -1; min_wan_msgs = MAXINT; for (i = 0; i < Num_PEs; i++) { if (((&PE_Data[i])->available) && ((&PE_Data[i])->cluster == cluster)) { if ((&PE_Data[i])->scaled_load < min_scaled_load) { min_load_index = i; min_scaled_load = (&PE_Data[i])->scaled_load; } if ((&PE_Data[i])->num_wan_msgs < min_wan_msgs) { min_wan_msgs_index = i; min_wan_msgs = (&PE_Data[i])->num_wan_msgs; } } } // If no PE at all was found, return a -1. if (min_load_index < 0) { return (min_load_index); } // If the number of WAN messages on the lightest loaded PE happens to match the minimum number // of WAN messages overall, we win because this target PE is overall the minimum PE in terms // of both load *and* WAN messages. if ((&PE_Data[min_load_index])->num_wan_msgs <= (&PE_Data[min_wan_msgs_index])->num_wan_msgs) { return (min_load_index); } // Otherwise, we now search for PEs that have loads +/- our tolerance. If any PE has a load // within our tolerance, check its number of WAN messages. The one of these that has the // fewest WAN messages is probably the best candidate for placing the next object onto. load_tolerance = (&PE_Data[min_load_index])->scaled_load * CK_LDB_GridCommLB_Load_Tolerance; min_index = min_load_index; for (i = 0; i < Num_PEs; i++) { if (((&PE_Data[i])->available) && ((&PE_Data[i])->cluster == cluster)) { if (i != min_load_index) { if (fabs ((&PE_Data[i])->scaled_load - (&PE_Data[min_load_index])->scaled_load) <= load_tolerance) { if ((&PE_Data[i])->num_wan_msgs < (&PE_Data[min_index])->num_wan_msgs) { min_index = i; } } } } } return (min_index); } else { if (_lb_args.debug() > 0) { CkPrintf ("[%d] GridCommLB was told to use bad mode (%d).\n", CkMyPe(), CK_LDB_GridCommLB_Mode); } return (-1); } }
extern "C" void driver(void) { int ignored; int i, count; int myChunk=FEM_My_partition(); /*Add a refinement object to FEM array*/ CkPrintf("[%d] begin init\n",myChunk); FEM_REFINE2D_Init(); CkPrintf("[%d] end init\n",myChunk); myGlobals g; FEM_Register(&g,(FEM_PupFn)pup_myGlobals); init_myGlobal(&g); g.nnodes = FEM_Mesh_get_length(FEM_Mesh_default_read(),FEM_NODE); int maxNodes = g.nnodes; g.maxnodes=2*maxNodes; g.m_i_fid=FEM_Create_field(FEM_DOUBLE,1,0,sizeof(double)); resize_nodes((void *)&g,&g.nnodes,&maxNodes); int nghost=0; g.nelems=FEM_Mesh_get_length(FEM_Mesh_default_read(),FEM_ELEM); g.maxelems=g.nelems; resize_elems((void *)&g,&g.nelems,&g.maxelems); FEM_REFINE2D_Newmesh(FEM_Mesh_default_read(),FEM_NODE,FEM_ELEM); //Initialize associated data for (i=0;i<g.maxnodes;i++) { g.R_net[i]=g.d[i]=g.v[i]=g.a[i]=vector2d(0.0); } //Apply a small initial perturbation to positions for (i=0;i<g.nnodes;i++) { const double max=1.0e-15/15.0; //Tiny perturbation g.d[i].x+=max*(i&15); g.d[i].y+=max*((i+5)&15); } int fid=FEM_Create_field(FEM_DOUBLE,2,0,sizeof(vector2d)); for (i=0;i<g.nelems;i++){ checkTriangle(g,i); } sleep(5); //Timeloop if (CkMyPe()==0){ CkPrintf("Entering timeloop\n"); } // int tSteps=0x70FF00FF; int tSteps=4; int z=13; calcMasses(g); double startTime=CkWallTimer(); double curArea=2.5e-5/1024; int t = 0; // THIS IS THE INITIAL MESH SENT TO NetFEM if (1) { //Publish data to the net publishMeshToNetFEM(g,myChunk,t); } double desiredArea; /* //should not be necessary as it would have been set in the init for (i=0; i<g.nnodes; i++) { g.validNode[i] = 1; } for (i=0; i<g.nelems; i++) { g.validElem[i] = 1; }*/ double avgArea = 0.0; for (i=0;i<g.nelems;i++) { avgArea += calcArea(g, i); } avgArea /= g.nelems; for (t=1;t<=tSteps;t++) { /* if (1) { //Structural mechanics //Compute forces on nodes exerted by elements CST_NL(g.coord,g.conn,g.R_net,g.d,matConst,g.nnodes,g.nelems,g.S11,g.S22,g.S12); //Communicate net force on shared nodes FEM_Update_field(fid,g.R_net); 
//Advance node positions advanceNodes(dt,g.nnodes,g.coord,g.R_net,g.a,g.v,g.d,g.m_i,(t%4)==0); }*/ //Debugging/perf. output double curTime=CkWallTimer(); double total=curTime-startTime; startTime=curTime; vector2d *loc; double *areas; // prepare to coarsen loc=new vector2d[2*g.nnodes]; for (i=0;i<g.nnodes;i++) { loc[i]=g.coord[i];//+g.d[i]; } areas=new double[g.nelems]; for (i=0;i<g.nelems;i++) { areas[i] = avgArea; } //coarsen one element at a time //int coarseIdx = (23 + 4*t)%g.nnodes; //areas[coarseIdx] = calcArea(g,coarseIdx)*2.5; CkPrintf("[%d] Starting coarsening step: %d nodes, %d elements\n", myChunk,countValidEntities(g.validNode,g.nnodes),countValidEntities(g.validElem,g.nelems)); FEM_REFINE2D_Coarsen(FEM_Mesh_default_read(),FEM_NODE,(double *)g.coord,FEM_ELEM,areas,FEM_SPARSE); repeat_after_split((void *)&g); g.nelems = FEM_Mesh_get_length(FEM_Mesh_default_read(),FEM_ELEM); g.nnodes = FEM_Mesh_get_length(FEM_Mesh_default_read(),FEM_NODE); CkPrintf("[%d] Done with coarsening step: %d nodes, %d elements\n", myChunk,countValidEntities(g.validNode,g.nnodes),countValidEntities(g.validElem,g.nelems)); delete [] loc; delete[] areas; // THIS IS THE COARSENED MESH SENT TO NetFEM if (1) { //Publish data to the net publishMeshToNetFEM(g,myChunk,2*t-1); } //prepare to refine loc=new vector2d[2*g.nnodes]; for (i=0;i<g.nnodes;i++) { loc[i]=g.coord[i];//+g.d[i]; } areas=new double[g.nelems]; for (i=0;i<g.nelems;i++) { areas[i] = avgArea; } //refine one element at a time //int refIdx = (13 + 3*t)%g.nnodes; //areas[refIdx] = calcArea(g,refIdx)/1.5; CkPrintf("[%d] Starting refinement step: %d nodes, %d elements\n", myChunk,countValidEntities(g.validNode,g.nnodes),countValidEntities(g.validElem,g.nelems)); FEM_REFINE2D_Split(FEM_Mesh_default_read(),FEM_NODE,(double *)loc,FEM_ELEM,areas,FEM_SPARSE); repeat_after_split((void *)&g); g.nelems = FEM_Mesh_get_length(FEM_Mesh_default_read(),FEM_ELEM); g.nnodes = FEM_Mesh_get_length(FEM_Mesh_default_read(),FEM_NODE); 
CkPrintf("[%d] Done with refinement step: %d nodes, %d elements\n", myChunk,countValidEntities(g.validNode,g.nnodes),countValidEntities(g.validElem,g.nelems)); delete [] loc; delete[] areas; // THIS IS THE REFINED MESH SENT TO NetFEM if (1) { //Publish data to the net publishMeshToNetFEM(g,myChunk,2*t); } } if (CkMyPe()==0) CkPrintf("Driver finished\n"); }
/**************************************************************************
** Entry point the Charm++ load balancing framework calls so that this
** balancer can move objects to "better" PEs.
**
** Builds the PE and object tables from 'stats', maps non-migratable and then
** migratable objects onto PEs cluster by cluster, and writes the chosen PE
** of every object into stats->to_proc[]. If there is no available PE, or
** the cluster count comes back below one, it returns without changing
** stats->to_proc[].
*/
void GridCommLB::work (LDStats *stats)
{
  // Read the debug level once; it gates every diagnostic below.
  const int debugLevel = _lb_args.debug();
  const bool verbose = (debugLevel > 0);

  if (verbose) {
    CkPrintf ("[%d] GridCommLB is working (mode=%d, background load=%d, load tolerance=%f).\n",
              CkMyPe(), CK_LDB_GridCommLB_Mode, CK_LDB_GridCommLB_Background_Load,
              CK_LDB_GridCommLB_Load_Tolerance);
  }

  // Communication data is examined below, so the CommHash has to exist first.
  stats->makeCommHash ();

  // Cache the problem size in the object.
  Num_PEs = stats->nprocs();
  Num_Objects = stats->n_objs;

  if (verbose) {
    CkPrintf ("[%d] GridCommLB is examining %d PEs and %d objects.\n",
              CkMyPe(), Num_PEs, Num_Objects);
  }

  // Build PE_Data[].
  Initialize_PE_Data (stats);

  // Nothing can be balanced without at least one available PE.
  if (Available_PE_Count() < 1) {
    if (verbose) {
      CkPrintf ("[%d] GridCommLB finds no available PEs -- no balancing done.\n", CkMyPe());
    }
    delete [] PE_Data;
    return;
  }

  // Count the clusters; a result below one means some PE has no cluster, in
  // which case balancing is abandoned.
  Num_Clusters = Compute_Number_Of_Clusters ();
  if (Num_Clusters < 1) {
    if (verbose) {
      CkPrintf ("[%d] GridCommLB finds incomplete PE cluster map -- no balancing done.\n", CkMyPe());
    }
    delete [] PE_Data;
    return;
  }

  if (verbose) {
    CkPrintf ("[%d] GridCommLB finds %d clusters.\n", CkMyPe(), Num_Clusters);
  }

  // Build Object_Data[].
  Initialize_Object_Data (stats);

  // Classify object-to-object messages as intra-cluster or inter-cluster.
  Examine_InterObject_Messages (stats);

  // Place non-migratable objects first.
  Map_NonMigratable_Objects_To_PEs ();

  // Then place the migratable objects, one cluster at a time.
  for (int cluster = 0; cluster < Num_Clusters; ++cluster) {
    Map_Migratable_Objects_To_PEs (cluster);
  }

  // Hand the final assignment back to the framework. At debug level 3 and up
  // every object is reported; at level 2 only objects whose PE changed.
  for (int obj = 0; obj < Num_Objects; ++obj) {
    stats->to_proc[obj] = Object_Data[obj].to_pe;

    const bool moved = (stats->to_proc[obj] != stats->from_proc[obj]);
    if (debugLevel > 2 || (debugLevel > 1 && moved)) {
      CkPrintf ("[%d] GridCommLB migrates object %d from PE %d to PE %d.\n",
                CkMyPe(), obj, stats->from_proc[obj], stats->to_proc[obj]);
    }
  }

  // Release the working tables.
  delete [] Object_Data;
  delete [] PE_Data;
}
/**
 * Constructs one Slave element: announces itself, builds a Function from
 * log2(numProcesses), this element's index and the read-only parameters, and
 * then reports completion to the main chare through done_refine().
 */
Slave::Slave()
{
  /* ==> read-only variables set by the main chare
     k = kInput;
     thresh = threshInput;
     max_level = 30;
  */
  CkPrintf("Constructor of the Slave chare # %d is called on processor %d.\n", thisIndex, CkMyPe());

  const int n = (int) log2(numProcesses);  // truncated to an integer
  const int l = thisIndex;

  // Nothing here reads the Function after it is built, so it is kept only
  // for whatever its constructor does.
  Function *function = new Function(n, l, k, thresh, test1);

  mainProxy.done_refine();

  // The original never released this object, leaking one Function per Slave
  // element. It is freed after done_refine() so the notification is sent at
  // the same point as before.
  // NOTE(review): assumes Function's constructor does not hand 'this' to
  // anything that outlives this call -- confirm.
  delete function;
}
/**
 * Constructs the MetisLB strategy: passes the options to the CBase_MetisLB
 * base, sets the strategy name, and prints a creation message on PE 0 only.
 */
MetisLB::MetisLB(const CkLBOptions &opt)
  : CBase_MetisLB(opt)
{
  lbname = "MetisLB";

  // Only PE 0 announces the strategy, so the message appears once.
  const int pe = CkMyPe();
  if (pe != 0) {
    return;
  }
  CkPrintf("[%d] MetisLB created\n", pe);
}
void ComputeNonbondedUtil::select(void) { if ( CkMyRank() ) return; // These defaults die cleanly if nothing appropriate is assigned. ComputeNonbondedUtil::calcPair = calc_error; ComputeNonbondedUtil::calcPairEnergy = calc_error; ComputeNonbondedUtil::calcSelf = calc_error; ComputeNonbondedUtil::calcSelfEnergy = calc_error; ComputeNonbondedUtil::calcFullPair = calc_error; ComputeNonbondedUtil::calcFullPairEnergy = calc_error; ComputeNonbondedUtil::calcFullSelf = calc_error; ComputeNonbondedUtil::calcFullSelfEnergy = calc_error; ComputeNonbondedUtil::calcMergePair = calc_error; ComputeNonbondedUtil::calcMergePairEnergy = calc_error; ComputeNonbondedUtil::calcMergeSelf = calc_error; ComputeNonbondedUtil::calcMergeSelfEnergy = calc_error; ComputeNonbondedUtil::calcSlowPair = calc_error; ComputeNonbondedUtil::calcSlowPairEnergy = calc_error; ComputeNonbondedUtil::calcSlowSelf = calc_error; ComputeNonbondedUtil::calcSlowSelfEnergy = calc_error; SimParameters * simParams = Node::Object()->simParameters; Parameters * params = Node::Object()->parameters; table_ener = params->table_ener; rowsize = params->rowsize; columnsize = params->columnsize; commOnly = simParams->commOnly; fixedAtomsOn = ( simParams->fixedAtomsOn && ! 
simParams->fixedAtomsForces ); cutoff = simParams->cutoff; cutoff2 = cutoff*cutoff; //fepb alchFepOn = simParams->alchFepOn; Fep_WCA_repuOn = simParams->alchFepWCARepuOn; Fep_WCA_dispOn = simParams->alchFepWCADispOn; alchThermIntOn = simParams->alchThermIntOn; alchLambda = alchLambda2 = 0; lesOn = simParams->lesOn; lesScaling = lesFactor = 0; Bool tabulatedEnergies = simParams->tabulatedEnergies; alchVdwShiftCoeff = simParams->alchVdwShiftCoeff; WCA_rcut1 = simParams->alchFepWCArcut1; WCA_rcut2 = simParams->alchFepWCArcut2; alchVdwLambdaEnd = simParams->alchVdwLambdaEnd; alchElecLambdaStart = simParams->alchElecLambdaStart; alchDecouple = simParams->alchDecouple; delete [] lambda_table; lambda_table = 0; pairInteractionOn = simParams->pairInteractionOn; pairInteractionSelf = simParams->pairInteractionSelf; pressureProfileOn = simParams->pressureProfileOn; // Ported by JLai -- Original JE - Go goForcesOn = simParams->goForcesOn; goMethod = simParams->goMethod; // End of port accelMDOn = simParams->accelMDOn; drudeNbthole = simParams->drudeOn && (simParams->drudeNbtholeCut > 0.0); if ( drudeNbthole ) { #ifdef NAMD_CUDA NAMD_die("drudeNbthole is not supported in CUDA version"); #endif if ( alchFepOn ) NAMD_die("drudeNbthole is not supported with alchemical free-energy perturbation"); if ( alchThermIntOn ) NAMD_die("drudeNbthole is not supported with alchemical thermodynamic integration"); if ( lesOn ) NAMD_die("drudeNbthole is not supported with locally enhanced sampling"); if ( pairInteractionOn ) NAMD_die("drudeNbthole is not supported with pair interaction calculation"); if ( pressureProfileOn ) NAMD_die("drudeNbthole is not supported with pressure profile calculation"); } if ( alchFepOn ) { #ifdef NAMD_CUDA NAMD_die("Alchemical free-energy perturbation is not supported in CUDA version"); #endif alchLambda = simParams->alchLambda; alchLambda2 = simParams->alchLambda2; ComputeNonbondedUtil::calcPair = calc_pair_energy_fep; ComputeNonbondedUtil::calcPairEnergy = 
calc_pair_energy_fep; ComputeNonbondedUtil::calcSelf = calc_self_energy_fep; ComputeNonbondedUtil::calcSelfEnergy = calc_self_energy_fep; ComputeNonbondedUtil::calcFullPair = calc_pair_energy_fullelect_fep; ComputeNonbondedUtil::calcFullPairEnergy = calc_pair_energy_fullelect_fep; ComputeNonbondedUtil::calcFullSelf = calc_self_energy_fullelect_fep; ComputeNonbondedUtil::calcFullSelfEnergy = calc_self_energy_fullelect_fep; ComputeNonbondedUtil::calcMergePair = calc_pair_energy_merge_fullelect_fep; ComputeNonbondedUtil::calcMergePairEnergy = calc_pair_energy_merge_fullelect_fep; ComputeNonbondedUtil::calcMergeSelf = calc_self_energy_merge_fullelect_fep; ComputeNonbondedUtil::calcMergeSelfEnergy = calc_self_energy_merge_fullelect_fep; ComputeNonbondedUtil::calcSlowPair = calc_pair_energy_slow_fullelect_fep; ComputeNonbondedUtil::calcSlowPairEnergy = calc_pair_energy_slow_fullelect_fep; ComputeNonbondedUtil::calcSlowSelf = calc_self_energy_slow_fullelect_fep; ComputeNonbondedUtil::calcSlowSelfEnergy = calc_self_energy_slow_fullelect_fep; } else if ( alchThermIntOn ) { #ifdef NAMD_CUDA NAMD_die("Alchemical thermodynamic integration is not supported in CUDA version"); #endif alchLambda = simParams->alchLambda; ComputeNonbondedUtil::calcPair = calc_pair_ti; ComputeNonbondedUtil::calcPairEnergy = calc_pair_energy_ti; ComputeNonbondedUtil::calcSelf = calc_self_ti; ComputeNonbondedUtil::calcSelfEnergy = calc_self_energy_ti; ComputeNonbondedUtil::calcFullPair = calc_pair_fullelect_ti; ComputeNonbondedUtil::calcFullPairEnergy = calc_pair_energy_fullelect_ti; ComputeNonbondedUtil::calcFullSelf = calc_self_fullelect_ti; ComputeNonbondedUtil::calcFullSelfEnergy = calc_self_energy_fullelect_ti; ComputeNonbondedUtil::calcMergePair = calc_pair_merge_fullelect_ti; ComputeNonbondedUtil::calcMergePairEnergy = calc_pair_energy_merge_fullelect_ti; ComputeNonbondedUtil::calcMergeSelf = calc_self_merge_fullelect_ti; ComputeNonbondedUtil::calcMergeSelfEnergy = 
calc_self_energy_merge_fullelect_ti; ComputeNonbondedUtil::calcSlowPair = calc_pair_slow_fullelect_ti; ComputeNonbondedUtil::calcSlowPairEnergy = calc_pair_energy_slow_fullelect_ti; ComputeNonbondedUtil::calcSlowSelf = calc_self_slow_fullelect_ti; ComputeNonbondedUtil::calcSlowSelfEnergy = calc_self_energy_slow_fullelect_ti; } else if ( lesOn ) { #ifdef NAMD_CUDA NAMD_die("Locally enhanced sampling is not supported in CUDA version"); #endif lesFactor = simParams->lesFactor; lesScaling = 1.0 / (double)lesFactor; lambda_table = new BigReal[(lesFactor+1)*(lesFactor+1)]; for ( int ip=0; ip<=lesFactor; ++ip ) { for ( int jp=0; jp<=lesFactor; ++jp ) { BigReal lambda_pair = 1.0; if (ip || jp ) { if (ip && jp && ip != jp) { lambda_pair = 0.0; } else { lambda_pair = lesScaling; } } lambda_table[(lesFactor+1)*ip+jp] = lambda_pair; } } ComputeNonbondedUtil::calcPair = calc_pair_les; ComputeNonbondedUtil::calcPairEnergy = calc_pair_energy_les; ComputeNonbondedUtil::calcSelf = calc_self_les; ComputeNonbondedUtil::calcSelfEnergy = calc_self_energy_les; ComputeNonbondedUtil::calcFullPair = calc_pair_fullelect_les; ComputeNonbondedUtil::calcFullPairEnergy = calc_pair_energy_fullelect_les; ComputeNonbondedUtil::calcFullSelf = calc_self_fullelect_les; ComputeNonbondedUtil::calcFullSelfEnergy = calc_self_energy_fullelect_les; ComputeNonbondedUtil::calcMergePair = calc_pair_merge_fullelect_les; ComputeNonbondedUtil::calcMergePairEnergy = calc_pair_energy_merge_fullelect_les; ComputeNonbondedUtil::calcMergeSelf = calc_self_merge_fullelect_les; ComputeNonbondedUtil::calcMergeSelfEnergy = calc_self_energy_merge_fullelect_les; ComputeNonbondedUtil::calcSlowPair = calc_pair_slow_fullelect_les; ComputeNonbondedUtil::calcSlowPairEnergy = calc_pair_energy_slow_fullelect_les; ComputeNonbondedUtil::calcSlowSelf = calc_self_slow_fullelect_les; ComputeNonbondedUtil::calcSlowSelfEnergy = calc_self_energy_slow_fullelect_les; } else if ( pressureProfileOn) { #ifdef NAMD_CUDA NAMD_die("Pressure 
profile calculation is not supported in CUDA version"); #endif pressureProfileSlabs = simParams->pressureProfileSlabs; pressureProfileAtomTypes = simParams->pressureProfileAtomTypes; ComputeNonbondedUtil::calcPair = calc_pair_pprof; ComputeNonbondedUtil::calcPairEnergy = calc_pair_energy_pprof; ComputeNonbondedUtil::calcSelf = calc_self_pprof; ComputeNonbondedUtil::calcSelfEnergy = calc_self_energy_pprof; ComputeNonbondedUtil::calcFullPair = calc_pair_fullelect_pprof; ComputeNonbondedUtil::calcFullPairEnergy = calc_pair_energy_fullelect_pprof; ComputeNonbondedUtil::calcFullSelf = calc_self_fullelect_pprof; ComputeNonbondedUtil::calcFullSelfEnergy = calc_self_energy_fullelect_pprof; ComputeNonbondedUtil::calcMergePair = calc_pair_merge_fullelect_pprof; ComputeNonbondedUtil::calcMergePairEnergy = calc_pair_energy_merge_fullelect_pprof; ComputeNonbondedUtil::calcMergeSelf = calc_self_merge_fullelect_pprof; ComputeNonbondedUtil::calcMergeSelfEnergy = calc_self_energy_merge_fullelect_pprof; ComputeNonbondedUtil::calcSlowPair = calc_pair_slow_fullelect_pprof; ComputeNonbondedUtil::calcSlowPairEnergy = calc_pair_energy_slow_fullelect_pprof; ComputeNonbondedUtil::calcSlowSelf = calc_self_slow_fullelect_pprof; ComputeNonbondedUtil::calcSlowSelfEnergy = calc_self_energy_slow_fullelect_pprof; } else if ( pairInteractionOn ) { #ifdef NAMD_CUDA NAMD_die("Pair interaction calculation is not supported in CUDA version"); #endif ComputeNonbondedUtil::calcPairEnergy = calc_pair_energy_int; ComputeNonbondedUtil::calcSelfEnergy = calc_self_energy_int; ComputeNonbondedUtil::calcFullPairEnergy = calc_pair_energy_fullelect_int; ComputeNonbondedUtil::calcFullSelfEnergy = calc_self_energy_fullelect_int; ComputeNonbondedUtil::calcMergePairEnergy = calc_pair_energy_merge_fullelect_int; ComputeNonbondedUtil::calcMergeSelfEnergy = calc_self_energy_merge_fullelect_int; } else if ( tabulatedEnergies ) { #ifdef NAMD_CUDA NAMD_die("Tabulated energies is not supported in CUDA version"); #endif 
ComputeNonbondedUtil::calcPair = calc_pair_tabener; ComputeNonbondedUtil::calcPairEnergy = calc_pair_energy_tabener; ComputeNonbondedUtil::calcSelf = calc_self_tabener; ComputeNonbondedUtil::calcSelfEnergy = calc_self_energy_tabener; ComputeNonbondedUtil::calcFullPair = calc_pair_fullelect_tabener; ComputeNonbondedUtil::calcFullPairEnergy = calc_pair_energy_fullelect_tabener; ComputeNonbondedUtil::calcFullSelf = calc_self_fullelect_tabener; ComputeNonbondedUtil::calcFullSelfEnergy = calc_self_energy_fullelect_tabener; ComputeNonbondedUtil::calcMergePair = calc_pair_merge_fullelect_tabener; ComputeNonbondedUtil::calcMergePairEnergy = calc_pair_energy_merge_fullelect_tabener; ComputeNonbondedUtil::calcMergeSelf = calc_self_merge_fullelect_tabener; ComputeNonbondedUtil::calcMergeSelfEnergy = calc_self_energy_merge_fullelect_tabener; ComputeNonbondedUtil::calcSlowPair = calc_pair_slow_fullelect_tabener; ComputeNonbondedUtil::calcSlowPairEnergy = calc_pair_energy_slow_fullelect_tabener; ComputeNonbondedUtil::calcSlowSelf = calc_self_slow_fullelect_tabener; ComputeNonbondedUtil::calcSlowSelfEnergy = calc_self_energy_slow_fullelect_tabener; } else if ( goForcesOn ) { #ifdef NAMD_CUDA NAMD_die("Go forces is not supported in CUDA version"); #endif ComputeNonbondedUtil::calcPair = calc_pair_go; ComputeNonbondedUtil::calcPairEnergy = calc_pair_energy_go; ComputeNonbondedUtil::calcSelf = calc_self_go; ComputeNonbondedUtil::calcSelfEnergy = calc_self_energy_go; ComputeNonbondedUtil::calcFullPair = calc_pair_fullelect_go; ComputeNonbondedUtil::calcFullPairEnergy = calc_pair_energy_fullelect_go; ComputeNonbondedUtil::calcFullSelf = calc_self_fullelect_go; ComputeNonbondedUtil::calcFullSelfEnergy = calc_self_energy_fullelect_go; ComputeNonbondedUtil::calcMergePair = calc_pair_merge_fullelect_go; ComputeNonbondedUtil::calcMergePairEnergy = calc_pair_energy_merge_fullelect_go; ComputeNonbondedUtil::calcMergeSelf = calc_self_merge_fullelect_go; 
ComputeNonbondedUtil::calcMergeSelfEnergy = calc_self_energy_merge_fullelect_go; ComputeNonbondedUtil::calcSlowPair = calc_pair_slow_fullelect_go; ComputeNonbondedUtil::calcSlowPairEnergy = calc_pair_energy_slow_fullelect_go; ComputeNonbondedUtil::calcSlowSelf = calc_self_slow_fullelect_go; ComputeNonbondedUtil::calcSlowSelfEnergy = calc_self_energy_slow_fullelect_go; } else { ComputeNonbondedUtil::calcPair = calc_pair; ComputeNonbondedUtil::calcPairEnergy = calc_pair_energy; ComputeNonbondedUtil::calcSelf = calc_self; ComputeNonbondedUtil::calcSelfEnergy = calc_self_energy; ComputeNonbondedUtil::calcFullPair = calc_pair_fullelect; ComputeNonbondedUtil::calcFullPairEnergy = calc_pair_energy_fullelect; ComputeNonbondedUtil::calcFullSelf = calc_self_fullelect; ComputeNonbondedUtil::calcFullSelfEnergy = calc_self_energy_fullelect; ComputeNonbondedUtil::calcMergePair = calc_pair_merge_fullelect; ComputeNonbondedUtil::calcMergePairEnergy = calc_pair_energy_merge_fullelect; ComputeNonbondedUtil::calcMergeSelf = calc_self_merge_fullelect; ComputeNonbondedUtil::calcMergeSelfEnergy = calc_self_energy_merge_fullelect; ComputeNonbondedUtil::calcSlowPair = calc_pair_slow_fullelect; ComputeNonbondedUtil::calcSlowPairEnergy = calc_pair_energy_slow_fullelect; ComputeNonbondedUtil::calcSlowSelf = calc_self_slow_fullelect; ComputeNonbondedUtil::calcSlowSelfEnergy = calc_self_energy_slow_fullelect; } //fepe dielectric_1 = 1.0/simParams->dielectric; if ( ! 
ljTable ) ljTable = new LJTable; mol = Node::Object()->molecule; scaling = simParams->nonbondedScaling; if ( simParams->exclude == SCALED14 ) { scale14 = simParams->scale14; } else { scale14 = 1.; } if ( simParams->switchingActive ) { switchOn = simParams->switchingDist; switchOn_1 = 1.0/switchOn; // d0 = 1.0/(cutoff-switchOn); switchOn2 = switchOn*switchOn; c0 = 1.0/(cutoff2-switchOn2); if ( simParams->vdwForceSwitching ) { double switchOn3 = switchOn * switchOn2; double cutoff3 = cutoff * cutoff2; double switchOn6 = switchOn3 * switchOn3; double cutoff6 = cutoff3 * cutoff3; v_vdwa = -1. / ( switchOn6 * cutoff6 ); v_vdwb = -1. / ( switchOn3 * cutoff3 ); k_vdwa = cutoff6 / ( cutoff6 - switchOn6 ); k_vdwb = cutoff3 / ( cutoff3 - switchOn3 ); cutoff_3 = 1. / cutoff3; cutoff_6 = 1. / cutoff6; } } else { switchOn = cutoff; switchOn_1 = 1.0/switchOn; // d0 = 0.; // avoid division by zero switchOn2 = switchOn*switchOn; c0 = 0.; // avoid division by zero } c1 = c0*c0*c0; c3 = 3.0 * (cutoff2 - switchOn2); c5 = 0; c6 = 0; c7 = 0; c8 = 0; const int PMEOn = simParams->PMEOn; const int MSMOn = simParams->MSMOn; const int MSMSplit = simParams->MSMSplit; if ( PMEOn ) { ewaldcof = simParams->PMEEwaldCoefficient; BigReal TwoBySqrtPi = 1.12837916709551; pi_ewaldcof = TwoBySqrtPi * ewaldcof; } int splitType = SPLIT_NONE; if ( simParams->switchingActive ) splitType = SPLIT_SHIFT; if ( simParams->martiniSwitching ) splitType = SPLIT_MARTINI; if ( simParams->fullDirectOn || simParams->FMAOn || PMEOn || MSMOn ) { switch ( simParams->longSplitting ) { case C2: splitType = SPLIT_C2; break; case C1: splitType = SPLIT_C1; break; case XPLOR: NAMD_die("Sorry, XPLOR splitting not supported."); break; case SHARP: NAMD_die("Sorry, SHARP splitting not supported."); break; default: NAMD_die("Unknown splitting type found!"); } } BigReal r2_tol = 0.1; r2_delta = 1.0; r2_delta_exp = 0; while ( r2_delta > r2_tol ) { r2_delta /= 2.0; r2_delta_exp += 1; } r2_delta_1 = 1.0 / r2_delta; if ( ! 
CkMyPe() ) { iout << iINFO << "NONBONDED TABLE R-SQUARED SPACING: " << r2_delta << "\n" << endi; } BigReal r2_tmp = 1.0; int cutoff2_exp = 0; while ( (cutoff2 + r2_delta) > r2_tmp ) { r2_tmp *= 2.0; cutoff2_exp += 1; } int i; int n = (r2_delta_exp + cutoff2_exp) * 64 + 1; if ( ! CkMyPe() ) { iout << iINFO << "NONBONDED TABLE SIZE: " << n << " POINTS\n" << endi; } if ( table_alloc ) delete [] table_alloc; table_alloc = new BigReal[61*n+16]; BigReal *table_align = table_alloc; while ( ((long)table_align) % 128 ) ++table_align; table_noshort = table_align; table_short = table_align + 16*n; slow_table = table_align + 32*n; fast_table = table_align + 36*n; scor_table = table_align + 40*n; corr_table = table_align + 44*n; full_table = table_align + 48*n; vdwa_table = table_align + 52*n; vdwb_table = table_align + 56*n; r2_table = table_align + 60*n; BigReal *fast_i = fast_table + 4; BigReal *scor_i = scor_table + 4; BigReal *slow_i = slow_table + 4; BigReal *vdwa_i = vdwa_table + 4; BigReal *vdwb_i = vdwb_table + 4; BigReal *r2_i = r2_table; *(r2_i++) = r2_delta; BigReal r2_limit = simParams->limitDist * simParams->limitDist; if ( r2_limit < r2_delta ) r2_limit = r2_delta; int r2_delta_i = 0; // entry for r2 == r2_delta // fill in the table, fix up i==0 (r2==0) below for ( i=1; i<n; ++i ) { const BigReal r2_base = r2_delta * ( 1 << (i/64) ); const BigReal r2_del = r2_base / 64.0; const BigReal r2 = r2_base - r2_delta + r2_del * (i%64); if ( r2 <= r2_limit ) r2_delta_i = i; const BigReal r = sqrt(r2); const BigReal r_1 = 1.0/r; const BigReal r_2 = 1.0/r2; // fast_ is defined as (full_ - slow_) // corr_ and fast_ are both zero at the cutoff, full_ is not // all three are approx 1/r at short distances // for actual interpolation, we use fast_ for fast forces and // scor_ = slow_ + corr_ - full_ and slow_ for slow forces // since these last two are of small magnitude BigReal fast_energy, fast_gradient; BigReal scor_energy, scor_gradient; BigReal slow_energy, slow_gradient; 
// corr_ is PME direct sum, or similar correction term // corr_energy is multiplied by r until later // corr_gradient is multiplied by -r^2 until later BigReal corr_energy, corr_gradient; if ( PMEOn ) { BigReal tmp_a = r * ewaldcof; BigReal tmp_b = erfc(tmp_a); corr_energy = tmp_b; corr_gradient = pi_ewaldcof*exp(-(tmp_a*tmp_a))*r + tmp_b; } else if ( MSMOn ) { BigReal a_1 = 1.0/cutoff; BigReal r_a = r * a_1; BigReal g, dg; SPOLY(&g, &dg, r_a, MSMSplit); corr_energy = 1 - r_a * g; corr_gradient = 1 + r_a*r_a * dg; } else { corr_energy = corr_gradient = 0; } switch(splitType) { case SPLIT_NONE: fast_energy = 1.0/r; fast_gradient = -1.0/r2; scor_energy = scor_gradient = 0; slow_energy = slow_gradient = 0; break; case SPLIT_SHIFT: { BigReal shiftVal = r2/cutoff2 - 1.0; shiftVal *= shiftVal; BigReal dShiftVal = 2.0 * (r2/cutoff2 - 1.0) * 2.0*r/cutoff2; fast_energy = shiftVal/r; fast_gradient = dShiftVal/r - shiftVal/r2; scor_energy = scor_gradient = 0; slow_energy = slow_gradient = 0; } break; case SPLIT_MARTINI: { // in Martini, the Coulomb switching distance is zero const BigReal COUL_SWITCH = 0.; // Gromacs shifting function const BigReal p1 = 1.; BigReal A1 = p1 * ((p1+1)*COUL_SWITCH-(p1+4)*cutoff)/(pow(cutoff,p1+2)*pow(cutoff-COUL_SWITCH,2)); BigReal B1 = -p1 * ((p1+1)*COUL_SWITCH-(p1+3)*cutoff)/(pow(cutoff,p1+2)*pow(cutoff-COUL_SWITCH,3)); BigReal X1 = 1.0/pow(cutoff,p1)-A1/3.0*pow(cutoff-COUL_SWITCH,3)-B1/4.0*pow(cutoff-COUL_SWITCH,4); BigReal r12 = (r-COUL_SWITCH)*(r-COUL_SWITCH); BigReal r13 = (r-COUL_SWITCH)*(r-COUL_SWITCH)*(r-COUL_SWITCH); BigReal shiftVal = -(A1/3.0)*r13 - (B1/4.0)*r12*r12 - X1; BigReal dShiftVal = -A1*r12 - B1*r13; fast_energy = (1/r) + shiftVal; fast_gradient = -1/(r2) + dShiftVal; scor_energy = scor_gradient = 0; slow_energy = slow_gradient = 0; } break; case SPLIT_C1: // calculate actual energy and gradient slow_energy = 0.5/cutoff * (3.0 - (r2/cutoff2)); slow_gradient = -1.0/cutoff2 * (r/cutoff); // calculate scor from slow and corr 
scor_energy = slow_energy + (corr_energy - 1.0)/r; scor_gradient = slow_gradient - (corr_gradient - 1.0)/r2; // calculate fast from slow fast_energy = 1.0/r - slow_energy; fast_gradient = -1.0/r2 - slow_gradient; break; case SPLIT_C2: // // Quintic splitting function contributed by // Bruce Berne, Ruhong Zhou, and Joe Morrone // // calculate actual energy and gradient slow_energy = r2/(cutoff*cutoff2) * (6.0 * (r2/cutoff2) - 15.0*(r/cutoff) + 10.0); slow_gradient = r/(cutoff*cutoff2) * (24.0 * (r2/cutoff2) - 45.0 *(r/cutoff) + 20.0); // calculate scor from slow and corr scor_energy = slow_energy + (corr_energy - 1.0)/r; scor_gradient = slow_gradient - (corr_gradient - 1.0)/r2; // calculate fast from slow fast_energy = 1.0/r - slow_energy; fast_gradient = -1.0/r2 - slow_gradient; break; } // foo_gradient is calculated as ( d foo_energy / d r ) // and now divided by 2r to get ( d foo_energy / d r2 ) fast_gradient *= 0.5 * r_1; scor_gradient *= 0.5 * r_1; slow_gradient *= 0.5 * r_1; // let modf be 1 if excluded, 1-scale14 if modified, 0 otherwise, // add scor_ - modf * slow_ to slow terms and // add fast_ - modf * fast_ to fast terms. 
BigReal vdwa_energy, vdwa_gradient; BigReal vdwb_energy, vdwb_gradient; const BigReal r_6 = r_2*r_2*r_2; const BigReal r_12 = r_6*r_6; // Lennard-Jones switching function if ( simParams->vdwForceSwitching ) { // switch force // from Steinbach & Brooks, JCC 15, pgs 667-683, 1994, eqns 10-13 if ( r2 > switchOn2 ) { BigReal tmpa = r_6 - cutoff_6; vdwa_energy = k_vdwa * tmpa * tmpa; BigReal tmpb = r_1 * r_2 - cutoff_3; vdwb_energy = k_vdwb * tmpb * tmpb; vdwa_gradient = -6.0 * k_vdwa * tmpa * r_2 * r_6; vdwb_gradient = -3.0 * k_vdwb * tmpb * r_2 * r_2 * r_1; } else { vdwa_energy = r_12 + v_vdwa; vdwb_energy = r_6 + v_vdwb; vdwa_gradient = -6.0 * r_2 * r_12; vdwb_gradient = -3.0 * r_2 * r_6; } } else if ( simParams->martiniSwitching ) { // switching fxn for Martini RBCG BigReal r12 = (r-switchOn)*(r-switchOn); BigReal r13 = (r-switchOn)*(r-switchOn)*(r-switchOn); BigReal p6 = 6; BigReal A6 = p6 * ((p6+1)*switchOn-(p6+4)*cutoff)/(pow(cutoff,p6+2)*pow(cutoff-switchOn,2)); BigReal B6 = -p6 * ((p6+1)*switchOn-(p6+3)*cutoff)/(pow(cutoff,p6+2)*pow(cutoff-switchOn,3)); BigReal C6 = 1.0/pow(cutoff,p6)-A6/3.0*pow(cutoff-switchOn,3)-B6/4.0*pow(cutoff-switchOn,4); BigReal p12 = 12; BigReal A12 = p12 * ((p12+1)*switchOn-(p12+4)*cutoff)/(pow(cutoff,p12+2)*pow(cutoff-switchOn,2)); BigReal B12 = -p12 * ((p12+1)*switchOn-(p12+3)*cutoff)/(pow(cutoff,p12+2)*pow(cutoff-switchOn,3)); BigReal C12 = 1.0/pow(cutoff,p12)-A12/3.0*pow(cutoff-switchOn,3)-B12/4.0*pow(cutoff-switchOn,4); BigReal LJshifttempA = -(A12/3)*r13 - (B12/4)*r12*r12 - C12; BigReal LJshifttempB = -(A6/3)*r13 - (B6/4)*r12*r12 - C6; const BigReal shiftValA = // used for Lennard-Jones ( r2 > switchOn2 ? LJshifttempA : -C12); const BigReal shiftValB = // used for Lennard-Jones ( r2 > switchOn2 ? LJshifttempB : -C6); BigReal LJdshifttempA = -A12*r12 - B12*r13; BigReal LJdshifttempB = -A6*r12 - B6*r13; const BigReal dshiftValA = // used for Lennard-Jones ( r2 > switchOn2 ? 
LJdshifttempA*0.5*r_1 : 0 ); const BigReal dshiftValB = // used for Lennard-Jones ( r2 > switchOn2 ? LJdshifttempB*0.5*r_1 : 0 ); //have not addressed r > cutoff // dshiftValA*= 0.5*r_1; // dshiftValB*= 0.5*r_1; vdwa_energy = r_12 + shiftValA; vdwb_energy = r_6 + shiftValB; vdwa_gradient = -6/pow(r,14) + dshiftValA ; vdwb_gradient = -3/pow(r,8) + dshiftValB; } else { // switch energy const BigReal c2 = cutoff2-r2; const BigReal c4 = c2*(c3-2.0*c2); const BigReal switchVal = // used for Lennard-Jones ( r2 > switchOn2 ? c2*c4*c1 : 1.0 ); const BigReal dSwitchVal = // d switchVal / d r2 ( r2 > switchOn2 ? 2*c1*(c2*c2-c4) : 0.0 ); vdwa_energy = switchVal * r_12; vdwb_energy = switchVal * r_6; vdwa_gradient = ( dSwitchVal - 6.0 * switchVal * r_2 ) * r_12; vdwb_gradient = ( dSwitchVal - 3.0 * switchVal * r_2 ) * r_6; } *(fast_i++) = fast_energy; *(fast_i++) = fast_gradient; *(fast_i++) = 0; *(fast_i++) = 0; *(scor_i++) = scor_energy; *(scor_i++) = scor_gradient; *(scor_i++) = 0; *(scor_i++) = 0; *(slow_i++) = slow_energy; *(slow_i++) = slow_gradient; *(slow_i++) = 0; *(slow_i++) = 0; *(vdwa_i++) = vdwa_energy; *(vdwa_i++) = vdwa_gradient; *(vdwa_i++) = 0; *(vdwa_i++) = 0; *(vdwb_i++) = vdwb_energy; *(vdwb_i++) = vdwb_gradient; *(vdwb_i++) = 0; *(vdwb_i++) = 0; *(r2_i++) = r2 + r2_delta; } if ( ! 
r2_delta_i ) { NAMD_bug("Failed to find table entry for r2 == r2_limit\n"); } if ( r2_table[r2_delta_i] > r2_limit + r2_delta ) { NAMD_bug("Found bad table entry for r2 == r2_limit\n"); } int j; const char *table_name = "XXXX"; int smooth_short = 0; for ( j=0; j<5; ++j ) { BigReal *t0 = 0; switch (j) { case 0: t0 = fast_table; table_name = "FAST"; smooth_short = 1; break; case 1: t0 = scor_table; table_name = "SCOR"; smooth_short = 0; break; case 2: t0 = slow_table; table_name = "SLOW"; smooth_short = 0; break; case 3: t0 = vdwa_table; table_name = "VDWA"; smooth_short = 1; break; case 4: t0 = vdwb_table; table_name = "VDWB"; smooth_short = 1; break; } // patch up data for i=0 t0[0] = t0[4] - t0[5] * ( r2_delta / 64.0 ); // energy t0[1] = t0[5]; // gradient t0[2] = 0; t0[3] = 0; if ( smooth_short ) { BigReal energy0 = t0[4*r2_delta_i]; BigReal gradient0 = t0[4*r2_delta_i+1]; BigReal r20 = r2_table[r2_delta_i]; t0[0] = energy0 - gradient0 * (r20 - r2_table[0]); // energy t0[1] = gradient0; // gradient } BigReal *t; for ( i=0,t=t0; i<(n-1); ++i,t+=4 ) { BigReal x = ( r2_delta * ( 1 << (i/64) ) ) / 64.0; if ( r2_table[i+1] != r2_table[i] + x ) { NAMD_bug("Bad table delta calculation.\n"); } if ( smooth_short && i+1 < r2_delta_i ) { BigReal energy0 = t0[4*r2_delta_i]; BigReal gradient0 = t0[4*r2_delta_i+1]; BigReal r20 = r2_table[r2_delta_i]; t[4] = energy0 - gradient0 * (r20 - r2_table[i+1]); // energy t[5] = gradient0; // gradient } BigReal v1 = t[0]; BigReal g1 = t[1]; BigReal v2 = t[4]; BigReal g2 = t[5]; // explicit formulas for v1 + g1 x + c x^2 + d x^3 BigReal c = ( 3.0 * (v2 - v1) - x * (2.0 * g1 + g2) ) / ( x * x ); BigReal d = ( -2.0 * (v2 - v1) + x * (g1 + g2) ) / ( x * x * x ); // since v2 - v1 is imprecise, we refine c and d numerically // important because we need accurate forces (more than energies!) 
for ( int k=0; k < 2; ++k ) { BigReal dv = (v1 - v2) + ( ( d * x + c ) * x + g1 ) * x; BigReal dg = (g1 - g2) + ( 3.0 * d * x + 2.0 * c ) * x; c -= ( 3.0 * dv - x * dg ) / ( x * x ); d -= ( -2.0 * dv + x * dg ) / ( x * x * x ); } // store in the array; t[2] = c; t[3] = d; } if ( ! CkMyPe() ) { BigReal dvmax = 0; BigReal dgmax = 0; BigReal dvmax_r = 0; BigReal dgmax_r = 0; BigReal fdvmax = 0; BigReal fdgmax = 0; BigReal fdvmax_r = 0; BigReal fdgmax_r = 0; BigReal dgcdamax = 0; BigReal dgcdimax = 0; BigReal dgcaimax = 0; BigReal dgcdamax_r = 0; BigReal dgcdimax_r = 0; BigReal dgcaimax_r = 0; BigReal fdgcdamax = 0; BigReal fdgcdimax = 0; BigReal fdgcaimax = 0; BigReal fdgcdamax_r = 0; BigReal fdgcdimax_r = 0; BigReal fdgcaimax_r = 0; BigReal gcm = fabs(t0[1]); // gradient magnitude running average for ( i=0,t=t0; i<(n-1); ++i,t+=4 ) { const BigReal r2_base = r2_delta * ( 1 << (i/64) ); const BigReal r2_del = r2_base / 64.0; const BigReal r2 = r2_base - r2_delta + r2_del * (i%64); const BigReal r = sqrt(r2); if ( r > cutoff ) break; BigReal x = r2_del; BigReal dv = ( ( t[3] * x + t[2] ) * x + t[1] ) * x + t[0] - t[4]; BigReal dg = ( 3.0 * t[3] * x + 2.0 * t[2] ) * x + t[1] - t[5]; if ( t[4] != 0. && fabs(dv/t[4]) > fdvmax ) { fdvmax = fabs(dv/t[4]); fdvmax_r = r; } if ( fabs(dv) > dvmax ) { dvmax = fabs(dv); dvmax_r = r; } if ( t[5] != 0. && fabs(dg/t[5]) > fdgmax ) { fdgmax = fabs(dg/t[5]); fdgmax_r = r; } if ( fabs(dg) > dgmax ) { dgmax = fabs(dg); dgmax_r = r; } BigReal gcd = (t[4] - t[0]) / x; // centered difference gradient BigReal gcd_prec = (fabs(t[0]) + fabs(t[4])) * 1.e-15 / x; // roundoff gcm = 0.9 * gcm + 0.1 * fabs(t[5]); // magnitude running average BigReal gca = 0.5 * (t[1] + t[5]); // centered average gradient BigReal gci = ( 0.75 * t[3] * x + t[2] ) * x + t[1]; // interpolated BigReal rc = sqrt(r2 + 0.5 * x); BigReal dgcda = gcd - gca; if ( dgcda != 0. 
&& fabs(dgcda) < gcd_prec ) { // CkPrintf("ERROR %g < PREC %g AT %g AVG VAL %g\n", dgcda, gcd_prec, rc, gca); dgcda = 0.; } BigReal dgcdi = gcd - gci; if ( dgcdi != 0. && fabs(dgcdi) < gcd_prec ) { // CkPrintf("ERROR %g < PREC %g AT %g INT VAL %g\n", dgcdi, gcd_prec, rc, gci); dgcdi = 0.; } BigReal dgcai = gca - gci; if ( t[1]*t[5] > 0. && gcm != 0. && fabs(dgcda/gcm) > fdgcdamax ) { fdgcdamax = fabs(dgcda/gcm); fdgcdamax_r = rc; } if ( fabs(dgcda) > fdgcdamax ) { dgcdamax = fabs(dgcda); dgcdamax_r = rc; } if ( t[1]*t[5] > 0. && gcm != 0. && fabs(dgcdi/gcm) > fdgcdimax ) { fdgcdimax = fabs(dgcdi/gcm); fdgcdimax_r = rc; } if ( fabs(dgcdi) > fdgcdimax ) { dgcdimax = fabs(dgcdi); dgcdimax_r = rc; } if ( t[1]*t[5] > 0. && gcm != 0. && fabs(dgcai/gcm) > fdgcaimax ) { fdgcaimax = fabs(dgcai/gcm); fdgcaimax_r = rc; } if ( fabs(dgcai) > fdgcaimax ) { dgcaimax = fabs(dgcai); dgcaimax_r = rc; } #if 0 CkPrintf("TABLE %s %g %g %g %g\n",table_name,rc,dgcda/gcm,dgcda,gci); if (dv != 0.) CkPrintf("TABLE %d ENERGY ERROR %g AT %g (%d)\n",j,dv,r,i); if (dg != 0.) 
CkPrintf("TABLE %d FORCE ERROR %g AT %g (%d)\n",j,dg,r,i); #endif } if ( dvmax != 0.0 ) { iout << iINFO << "ABSOLUTE IMPRECISION IN " << table_name << " TABLE ENERGY: " << dvmax << " AT " << dvmax_r << "\n" << endi; } if ( fdvmax != 0.0 ) { iout << iINFO << "RELATIVE IMPRECISION IN " << table_name << " TABLE ENERGY: " << fdvmax << " AT " << fdvmax_r << "\n" << endi; } if ( dgmax != 0.0 ) { iout << iINFO << "ABSOLUTE IMPRECISION IN " << table_name << " TABLE FORCE: " << dgmax << " AT " << dgmax_r << "\n" << endi; } if ( fdgmax != 0.0 ) { iout << iINFO << "RELATIVE IMPRECISION IN " << table_name << " TABLE FORCE: " << fdgmax << " AT " << fdgmax_r << "\n" << endi; } if (fdgcdamax != 0.0 ) { iout << iINFO << "INCONSISTENCY IN " << table_name << " TABLE ENERGY VS FORCE: " << fdgcdamax << " AT " << fdgcdamax_r << "\n" << endi; if ( fdgcdamax > 0.1 ) { iout << iERROR << "\n"; iout << iERROR << "CALCULATED " << table_name << " FORCE MAY NOT MATCH ENERGY! POSSIBLE BUG!\n"; iout << iERROR << "\n"; } } if (0 && fdgcdimax != 0.0 ) { iout << iINFO << "INCONSISTENCY IN " << table_name << " TABLE ENERGY VS FORCE: " << fdgcdimax << " AT " << fdgcdimax_r << "\n" << endi; } if ( 0 && fdgcaimax != 0.0 ) { iout << iINFO << "INCONSISTENCY IN " << table_name << " TABLE AVG VS INT FORCE: " << fdgcaimax << " AT " << fdgcaimax_r << "\n" << endi; } } } for ( i=0; i<4*n; ++i ) { corr_table[i] = fast_table[i] + scor_table[i]; full_table[i] = fast_table[i] + slow_table[i]; } #if 0 for ( i=0; i<n; ++i ) { for ( int j=0; j<4; ++j ) { table_short[16*i+6-2*j] = table_noshort[16*i+6-2*j] = vdwa_table[4*i+j]; table_short[16*i+7-2*j] = table_noshort[16*i+7-2*j] = vdwb_table[4*i+j]; table_short[16*i+8+3-j] = fast_table[4*i+j]; table_short[16*i+12+3-j] = scor_table[4*i+j]; table_noshort[16*i+8+3-j] = corr_table[4*i+j]; table_noshort[16*i+12+3-j] = full_table[4*i+j]; } } #endif for ( i=0; i<n; ++i ) { table_short[16*i+ 0] = table_noshort[16*i+0] = -6.*vdwa_table[4*i+3]; table_short[16*i+ 2] = 
table_noshort[16*i+2] = -6.*vdwb_table[4*i+3]; table_short[16*i+ 4] = table_noshort[16*i+4] = -2.*vdwa_table[4*i+1]; table_short[16*i+ 6] = table_noshort[16*i+6] = -2.*vdwb_table[4*i+1]; table_short[16*i+1] = table_noshort[16*i+1] = -4.*vdwa_table[4*i+2]; table_short[16*i+3] = table_noshort[16*i+3] = -4.*vdwb_table[4*i+2]; table_short[16*i+5] = table_noshort[16*i+5] = -1.*vdwa_table[4*i+0]; table_short[16*i+7] = table_noshort[16*i+7] = -1.*vdwb_table[4*i+0]; table_short[16*i+8] = -6.*fast_table[4*i+3]; table_short[16*i+9] = -4.*fast_table[4*i+2]; table_short[16*i+10] = -2.*fast_table[4*i+1]; table_short[16*i+11] = -1.*fast_table[4*i+0]; table_noshort[16*i+8] = -6.*corr_table[4*i+3]; table_noshort[16*i+9] = -4.*corr_table[4*i+2]; table_noshort[16*i+10] = -2.*corr_table[4*i+1]; table_noshort[16*i+11] = -1.*corr_table[4*i+0]; table_short[16*i+12] = -6.*scor_table[4*i+3]; table_short[16*i+13] = -4.*scor_table[4*i+2]; table_short[16*i+14] = -2.*scor_table[4*i+1]; table_short[16*i+15] = -1.*scor_table[4*i+0]; table_noshort[16*i+12] = -6.*full_table[4*i+3]; table_noshort[16*i+13] = -4.*full_table[4*i+2]; table_noshort[16*i+14] = -2.*full_table[4*i+1]; table_noshort[16*i+15] = -1.*full_table[4*i+0]; } #if 0 char fname[100]; sprintf(fname,"/tmp/namd.table.pe%d.dat",CkMyPe()); FILE *f = fopen(fname,"w"); for ( i=0; i<(n-1); ++i ) { const BigReal r2_base = r2_delta * ( 1 << (i/64) ); const BigReal r2_del = r2_base / 64.0; const BigReal r2 = r2_base - r2_delta + r2_del * (i%64); BigReal *t; if ( r2 + r2_delta != r2_table[i] ) fprintf(f,"r2 error! 
"); fprintf(f,"%g",r2); t = fast_table + 4*i; fprintf(f," %g %g %g %g", t[0], t[1], t[2], t[3]); t = scor_table + 4*i; fprintf(f," %g %g %g %g", t[0], t[1], t[2], t[3]); t = slow_table + 4*i; fprintf(f," %g %g %g %g", t[0], t[1], t[2], t[3]); t = corr_table + 4*i; fprintf(f," %g %g %g %g", t[0], t[1], t[2], t[3]); t = full_table + 4*i; fprintf(f," %g %g %g %g", t[0], t[1], t[2], t[3]); t = vdwa_table + 4*i; fprintf(f," %g %g %g %g", t[0], t[1], t[2], t[3]); t = vdwb_table + 4*i; fprintf(f," %g %g %g %g", t[0], t[1], t[2], t[3]); fprintf(f,"\n"); } fclose(f); #endif #ifdef NAMD_CUDA send_build_cuda_force_table(); #endif }
/**
 * Partition the object communication graph with METIS and record the
 * resulting object-to-processor assignment.
 *
 * The object graph built from `stats` is converted to the CSR arrays METIS
 * expects (xadj/adjncy with vertex weights vwgt and edge weights adjwgt),
 * METIS_PartGraphRecursive is asked for one part per entry of
 * ProcArray::procs, and every object whose part differs from its current PE
 * gets that part as its new PE.  The decisions are written back through
 * ObjGraph::convertDecisions(stats).
 *
 * @param stats  load balancing statistics used to build the ProcArray and
 *               ObjGraph, and updated with the migration decisions.
 */
void MetisLB::work(LDStats* stats)
{
  /** ========================== INITIALIZATION ============================= */
  ProcArray *parr = new ProcArray(stats);
  ObjGraph *ogr = new ObjGraph(stats);

  /** ============================= STRATEGY ================================ */
  if (_lb_args.debug() >= 2) {
    CkPrintf("[%d] In MetisLB Strategy...\n", CkMyPe());
  }

  // convert ObjGraph to the adjacency structure
  int numVertices = ogr->vertices.size();   // number of vertices
  int numEdges = 0;                         // number of edges

  double maxLoad = 0.0;
  int i, j, k, vert;

  /** remove duplicate edges from recvFrom: an edge to a neighbor that is
   *  already present in sendToList is folded into the sendTo edge */
  for(i = 0; i < numVertices; i++) {
    for(j = 0; j < ogr->vertices[i].sendToList.size(); j++) {
      vert = ogr->vertices[i].sendToList[j].getNeighborId();
      k = 0;
      while(k < ogr->vertices[i].recvFromList.size()) {
        if(ogr->vertices[i].recvFromList[k].getNeighborId() == vert) {
          ogr->vertices[i].sendToList[j].setNumBytes(
              ogr->vertices[i].sendToList[j].getNumBytes() +
              ogr->vertices[i].recvFromList[k].getNumBytes());
          ogr->vertices[i].recvFromList.erase(
              ogr->vertices[i].recvFromList.begin() + k);
          // Do not advance k: erase() shifted the following edge into slot
          // k, and it must be examined too.
        } else {
          k++;
        }
      }
    }
  }

  /** find the maximum object load (used to normalize the vertex weights to
   *  integers between 0 and 256) and count the remaining edges */
  for(i = 0; i < numVertices; i++) {
    if(ogr->vertices[i].getVertexLoad() > maxLoad)
      maxLoad = ogr->vertices[i].getVertexLoad();
    numEdges = numEdges + ogr->vertices[i].sendToList.size() +
                ogr->vertices[i].recvFromList.size();
  }

  /* adjacency list */
  idx_t *xadj = new idx_t[numVertices + 1];
  /* id of the neighbors */
  idx_t *adjncy = new idx_t[numEdges];
  /* weights of the vertices */
  idx_t *vwgt = new idx_t[numVertices];
  /* weights of the edges */
  idx_t *adjwgt = new idx_t[numEdges];

  int edgeNum = 0;
  // If no object reports a positive load, 256.0/maxLoad would be infinite,
  // load * ratio would be NaN, and converting NaN to an integer is
  // undefined.  Use a zero scale instead so every vertex weight becomes 0.
  const double ratio = (maxLoad > 0.0) ? 256.0/maxLoad : 0.0;

  for(i = 0; i < numVertices; i++) {
    xadj[i] = edgeNum;
    vwgt[i] = (int)ceil(ogr->vertices[i].getVertexLoad() * ratio);
    for(j = 0; j < ogr->vertices[i].sendToList.size(); j++) {
      adjncy[edgeNum] = ogr->vertices[i].sendToList[j].getNeighborId();
      adjwgt[edgeNum] = ogr->vertices[i].sendToList[j].getNumBytes();
      edgeNum++;
    }
    for(j = 0; j < ogr->vertices[i].recvFromList.size(); j++) {
      adjncy[edgeNum] = ogr->vertices[i].recvFromList[j].getNeighborId();
      adjwgt[edgeNum] = ogr->vertices[i].recvFromList[j].getNumBytes();
      edgeNum++;
    }
  }
  // closing entry: one past the last edge of the last vertex
  xadj[numVertices] = edgeNum;
  CkAssert(edgeNum == numEdges);

  idx_t edgecut;    // number of edges cut by the partitioning
  idx_t *pemap;

  idx_t options[METIS_NOPTIONS];
  METIS_SetDefaultOptions(options);
  //options[METIS_OPTION_PTYPE] = METIS_PTYPE_RB;
  // C style numbering
  options[METIS_OPTION_NUMBERING] = 0;

  // number of constrains
  idx_t ncon = 1;
  // number of partitions
  idx_t numPes = parr->procs.size();
  // one tolerance per constraint; fixed size (ncon == 1) because a
  // variable-length array is not standard C++
  real_t ubvec[1];
  // allow 10% imbalance
  ubvec[0] = 1.1;

  // mapping of objs to partitions
  pemap = new idx_t[numVertices];

  // Specifies size of vertices for computing the total communication volume
  idx_t *vsize = NULL;
  // This array of size nparts specifies the desired weight for each partition
  // and setting it to NULL indicates graph should be equally divided among
  // partitions
  real_t *tpwgts = NULL;

  int option = 0;
  if (WEIGHTED == option) {
    // set up the different weights between 0 and 1
    tpwgts = new real_t[numPes];
    for (i = 0; i < numPes; i++) {
      tpwgts[i] = 1.0/(real_t)numPes;
    }
  } else if (MULTI_CONSTRAINT == option) {
    CkAbort("Multiple constraints not implemented.\n");
  }

  // numVertices: num vertices in the graph; ncon: num balancing constrains
  // xadj, adjncy: of size n+1 and adjncy of 2m, adjncy[xadj[i]] through and
  // including adjncy[xadj[i+1]-1];
  // vwgt: weight of the vertices; vsize: amt of data that needs to be sent
  // for ith vertex is vsize[i]
  // adjwght: the weight of edges; numPes: total parts
  // tpwghts: target partition weight, can pass NULL to equally divide
  // ubvec: of size ncon to indicate allowed load imbalance tolerance (> 1.0)
  // options: array of options; edgecut: stores the edgecut; pemap: mapping
  METIS_PartGraphRecursive(&numVertices, &ncon, xadj, adjncy, vwgt, vsize,
                           adjwgt, &numPes, tpwgts, ubvec, options,
                           &edgecut, pemap);

  delete[] xadj;
  delete[] adjncy;
  delete[] vwgt;
  delete[] adjwgt;
  delete[] vsize;     // currently always NULL; delete[] on NULL is a no-op
  delete[] tpwgts;

  if (_lb_args.debug() >= 1) {
    CkPrintf("[%d] MetisLB done! \n", CkMyPe());
  }

  // only objects whose partition differs from their current PE are moved
  for(i = 0; i < numVertices; i++) {
    if(pemap[i] != ogr->vertices[i].getCurrentPe())
      ogr->vertices[i].setNewPe(pemap[i]);
  }

  delete[] pemap;

  /** ============================== CLEANUP ================================ */
  ogr->convertDecisions(stats);
  delete parr;
  delete ogr;
}
void LdbCoordinator::initialize(PatchMap *pMap, ComputeMap *cMap, int reinit) { const SimParameters *simParams = Node::Object()->simParameters; #if 0 static int lbcreated = 0; // XXX static variables are unsafe for SMP // PE0 first time Create a load balancer if (CkMyPe() == 0 && !lbcreated) { if (simParams->ldbStrategy == LDBSTRAT_ALGNBOR) CreateNamdNborLB(); else { // CreateCentralLB(); CreateNamdCentLB(); } lbcreated = 1; } #endif // DebugM(10,"stepsPerLdbCycle initialized\n"); stepsPerLdbCycle = simParams->ldbPeriod; firstLdbStep = simParams->firstLdbStep; int lastLdbStep = simParams->lastLdbStep; int stepsPerCycle = simParams->stepsPerCycle; computeMap = cMap; patchMap = pMap; // Set the number of received messages correctly for node 0 nStatsMessagesExpected = Node::Object()->numNodes(); nStatsMessagesReceived = 0; if (patchNAtoms) delete [] patchNAtoms; // Depends on delete NULL to do nothing nPatches = patchMap->numPatches(); patchNAtoms = new int[nPatches]; typedef Sequencer *seqPtr; if ( ! reinit ) { delete [] sequencerThreads; // Depends on delete NULL to do nothing sequencerThreads = new seqPtr[nPatches]; } nLocalPatches=0; int i; for(i=0;i<nPatches;i++) { if (patchMap->node(i) == Node::Object()->myid()) { nLocalPatches++; patchNAtoms[i]=0; } else { patchNAtoms[i]=-1; } if ( ! reinit ) sequencerThreads[i]=NULL; } if ( ! 
reinit ) controllerThread = NULL; if (nLocalPatches != patchMap->numHomePatches()) NAMD_die("Disaggreement in patchMap data.\n"); const int oldNumComputes = numComputes; nLocalComputes = 0; numComputes = computeMap->numComputes(); for(i=0;i<numComputes;i++) { if ( (computeMap->node(i) == Node::Object()->myid()) && ( 0 #ifndef NAMD_CUDA || (computeMap->type(i) == computeNonbondedSelfType) || (computeMap->type(i) == computeNonbondedPairType) #endif || (computeMap->type(i) == computeLCPOType) || (computeMap->type(i) == computeSelfExclsType) || (computeMap->type(i) == computeSelfBondsType) || (computeMap->type(i) == computeSelfAnglesType) || (computeMap->type(i) == computeSelfDihedralsType) || (computeMap->type(i) == computeSelfImpropersType) || (computeMap->type(i) == computeSelfTholeType) || (computeMap->type(i) == computeSelfAnisoType) || (computeMap->type(i) == computeSelfCrosstermsType) || (computeMap->type(i) == computeBondsType) || (computeMap->type(i) == computeExclsType) || (computeMap->type(i) == computeAnglesType) || (computeMap->type(i) == computeDihedralsType) || (computeMap->type(i) == computeImpropersType) || (computeMap->type(i) == computeTholeType) || (computeMap->type(i) == computeAnisoType) || (computeMap->type(i) == computeCrosstermsType) ) ) { nLocalComputes++; } } // New LB frameworks registration // Allocate data structure to save incoming migrations. Processor // zero will get all migrations // If this is the first time through, we need it register patches if (ldbCycleNum == reg_all_objs) { if ( Node::Object()->simParameters->ldBalancer == LDBAL_CENTRALIZED ) { reg_all_objs = 3; } // Tell the lbdb that I'm registering objects, until I'm done // registering them. 
theLbdb->RegisteringObjects(myHandle); if ( ldbCycleNum == 1 ) { patchHandles = new LDObjHandle[nLocalPatches]; int patch_count=0; int i; for(i=0;i<nPatches;i++) if (patchMap->node(i) == Node::Object()->myid()) { LDObjid elemID; elemID.id[0] = i; elemID.id[1] = elemID.id[2] = elemID.id[3] = -2; if (patch_count >= nLocalPatches) { iout << iFILE << iERROR << iPE << "LdbCoordinator found too many local patches!" << endi; CkExit(); } HomePatch *p = patchMap->homePatch(i); p->ldObjHandle = patchHandles[patch_count] = theLbdb->RegisterObj(myHandle,elemID,0,0); patch_count++; } } if ( numComputes > oldNumComputes ) { // Register computes for(i=oldNumComputes; i<numComputes; i++) { if ( computeMap->node(i) == Node::Object()->myid()) { if ( 0 #ifndef NAMD_CUDA || (computeMap->type(i) == computeNonbondedSelfType) || (computeMap->type(i) == computeNonbondedPairType) #endif || (computeMap->type(i) == computeLCPOType) || (computeMap->type(i) == computeSelfExclsType) || (computeMap->type(i) == computeSelfBondsType) || (computeMap->type(i) == computeSelfAnglesType) || (computeMap->type(i) == computeSelfDihedralsType) || (computeMap->type(i) == computeSelfImpropersType) || (computeMap->type(i) == computeSelfTholeType) || (computeMap->type(i) == computeSelfAnisoType) || (computeMap->type(i) == computeSelfCrosstermsType) ) { // Register the object with the load balancer // Store the depended patch IDs in the rest of the element ID LDObjid elemID; elemID.id[0] = i; if (computeMap->numPids(i) > 2) elemID.id[3] = computeMap->pid(i,2); else elemID.id[3] = -1; if (computeMap->numPids(i) > 1) elemID.id[2] = computeMap->pid(i,1); else elemID.id[2] = -1; if (computeMap->numPids(i) > 0) elemID.id[1] = computeMap->pid(i,0); else elemID.id[1] = -1; Compute *c = computeMap->compute(i); if ( ! 
c ) NAMD_bug("LdbCoordinator::initialize() null compute pointer"); c->ldObjHandle = theLbdb->RegisterObj(myHandle,elemID,0,1); } else if ( (computeMap->type(i) == computeBondsType) || (computeMap->type(i) == computeExclsType) || (computeMap->type(i) == computeAnglesType) || (computeMap->type(i) == computeDihedralsType) || (computeMap->type(i) == computeImpropersType) || (computeMap->type(i) == computeTholeType) || (computeMap->type(i) == computeAnisoType) || (computeMap->type(i) == computeCrosstermsType) ) { // Register the object with the load balancer // Store the depended patch IDs in the rest of the element ID LDObjid elemID; elemID.id[0] = i; elemID.id[1] = elemID.id[2] = elemID.id[3] = -3; Compute *c = computeMap->compute(i); if ( ! c ) NAMD_bug("LdbCoordinator::initialize() null compute pointer"); c->ldObjHandle = theLbdb->RegisterObj(myHandle,elemID,0,0); } } } } theLbdb->DoneRegisteringObjects(myHandle); } // process saved migration messages, if any while ( migrateMsgs ) { LdbMigrateMsg *m = migrateMsgs; migrateMsgs = m->next; Compute *c = computeMap->compute(m->handle.id.id[0]); if ( ! 
c ) NAMD_bug("LdbCoordinator::initialize() null compute pointer 2"); c->ldObjHandle = m->handle; delete m; } // Fixup to take care of the extra timestep at startup // This is pretty ugly here, but it makes the count correct // iout << "LDB Cycle Num: " << ldbCycleNum << "\n"; if ( simParams->ldBalancer == LDBAL_CENTRALIZED ) { if (ldbCycleNum == 1 || ldbCycleNum == 3) { numStepsToRun = stepsPerCycle; totalStepsDone += numStepsToRun; takingLdbData = 0; theLbdb->CollectStatsOff(); } else if (ldbCycleNum == 2 || ldbCycleNum == 4) { numStepsToRun = firstLdbStep - stepsPerCycle; while ( numStepsToRun <= 0 ) numStepsToRun += stepsPerCycle; totalStepsDone += numStepsToRun; takingLdbData = 1; theLbdb->CollectStatsOn(); } else if ( (ldbCycleNum <= 6) || !takingLdbData ) { totalStepsDone += firstLdbStep; if(lastLdbStep != -1 && totalStepsDone > lastLdbStep) { numStepsToRun = -1; takingLdbData = 0; theLbdb->CollectStatsOff(); } else { numStepsToRun = firstLdbStep; takingLdbData = 1; theLbdb->CollectStatsOn(); } } else { totalStepsDone += stepsPerLdbCycle - firstLdbStep; if(lastLdbStep != -1 && totalStepsDone > lastLdbStep) { numStepsToRun = -1; takingLdbData = 0; theLbdb->CollectStatsOff(); } else { numStepsToRun = stepsPerLdbCycle - firstLdbStep; takingLdbData = 0; theLbdb->CollectStatsOff(); } } } else { if (ldbCycleNum==1) { totalStepsDone += firstLdbStep; numStepsToRun = firstLdbStep; takingLdbData = 0; theLbdb->CollectStatsOff(); } else if ( (ldbCycleNum <= 4) || !takingLdbData ) { totalStepsDone += firstLdbStep; if(lastLdbStep != -1 && totalStepsDone > lastLdbStep) { numStepsToRun = -1; takingLdbData = 0; theLbdb->CollectStatsOff(); } else { numStepsToRun = firstLdbStep; takingLdbData = 1; theLbdb->CollectStatsOn(); } } else { totalStepsDone += stepsPerLdbCycle - firstLdbStep; if(lastLdbStep != -1 && totalStepsDone > lastLdbStep) { numStepsToRun = -1; takingLdbData = 0; theLbdb->CollectStatsOff(); } else { numStepsToRun = stepsPerLdbCycle - firstLdbStep; takingLdbData = 
0; theLbdb->CollectStatsOff(); } } } /*-----------------------------------------------------------------------------* * --------------------------------------------------------------------------- * * Comments inserted by Abhinav to clarify relation between ldbCycleNum, * * load balancing step numbers (printed by the step() function) and * * tracing of the steps * * --------------------------------------------------------------------------- * * If trace is turned off in the beginning, then tracing is turned on * * at ldbCycleNum = 4 and turned off at ldbCycleNum = 8. ldbCycleNum can * * be adjusted by specifying firstLdbStep and ldbPeriod which are set by * * default to 5*stepspercycle and 200*stepspercycle if not specified. * * * * If we choose firstLdbStep = 20 and ldbPeriod = 100, we have the * * following timeline (for these particular numbers): * * * * Tracing : <------ off ------><------------- on -----------><-- off * * Ldb Step() No : 1 2 3 4 5 6 7 * * Iteration Steps : 00====20====40====60====80======160====180=====260====280 * * ldbCycleNum : 1 2 3 4 5 6 7 8 9 * * Instrumention : Inst Inst Inst Inst Inst * * LDB Strategy : TLB RLB RLB RLB RLB * * * * TLB = TorusLB * * RLB = RefineTorusLB * * Inst = Instrumentation Phase (no real load balancing) * * --------------------------------------------------------------------------- * *-----------------------------------------------------------------------------* */ #if 0 //replaced by traceBarrier at Controller and Sequencer if (traceAvailable()) { static int specialTracing = 0; // XXX static variables are unsafe for SMP if (ldbCycleNum == 1 && traceIsOn() == 0) specialTracing = 1; if (specialTracing) { if (ldbCycleNum == 4) traceBegin(); if (ldbCycleNum == 8) traceEnd(); } } #endif nPatchesReported = 0; nPatchesExpected = nLocalPatches; nComputesReported = 0; nComputesExpected = nLocalComputes * numStepsToRun; controllerReported = 0; controllerExpected = ! 
CkMyPe(); if (CkMyPe() == 0) { if (computeArray == NULL) computeArray = new computeInfo[numComputes]; if (patchArray == NULL) patchArray = new patchInfo[nPatches]; if (processorArray == NULL) processorArray = new processorInfo[CkNumPes()]; } theLbdb->ClearLoads(); }
/**
 * Handler that defers a message instead of processing it: the message
 * pointer is appended to the queue held in the Ckpv variable _buffQ.
 *
 * @param msg  the message to enqueue; it is not inspected here.
 */
static void _bufferHandler(void *msg)
{
  DEBUGF(("[%d] _bufferHandler called.\n", CkMyPe()));
  // Enqueue only; nothing in this function consumes or frees the message.
  CkpvAccess(_buffQ)->enq(msg);
}