Ejemplo n.º 1
0
/**
 * Step "5" of the updateLocalComputes sequence (numbering taken from the
 * function name).
 *
 * - On rank 0, runs checkMap() on both the ComputeMap and the PatchMap.
 * - If proxy spanning trees are in use for sending or receiving, rebuilds
 *   them through ProxyMgr.
 * - On PE 0, hands doneUpdateLocalComputes() to CkStartQD so that it is
 *   invoked on this object once quiescence is detected.
 */
void
ComputeMgr::updateLocalComputes5()
{
    // The map consistency checks are performed by rank 0 only.
    if (CkMyRank() == 0)
    {
        ComputeMap::Object()->checkMap();
        PatchMap::Object()->checkMap();
    }

    // The spanning tree is always built in the centralized fashion here;
    // the distributed construction is only invoked from Node.C.
    const bool usesSpanningTree = proxySendSpanning || proxyRecvSpanning;
    if (usesSpanningTree)
    {
        ProxyMgr::Object()->buildProxySpanningTree2();
    }

    // Enable the section below (and drop the unconditional build above) to
    // move spanning tree creation into the load balancer.

#if 0
    if (usesSpanningTree)
    {
        if (firstphase)
        {
            ProxyMgr::Object()->buildProxySpanningTree2();
        }
        else if (CkMyPe() == 0)
        {
            ProxyMgr::Object()->sendSpanningTrees();
        }

        firstphase = 0;
    }
#endif

    // Only PE 0 registers the completion callback.
    if (CkMyPe() == 0)
    {
        CkStartQD(CkIndex_ComputeMgr::doneUpdateLocalComputes(), &thishandle);
    }
}
Ejemplo n.º 2
0
void
ComputeMgr::updateLocalComputes3()
{
    ComputeMap *computeMap = ComputeMap::Object();
    CProxy_ProxyMgr pm(CkpvAccess(BOCclass_group).proxyMgr);
    ProxyMgr *proxyMgr = pm.ckLocalBranch();

    ProxyMgr::nodecount = 0;

    const int nc = computeMap->numComputes();

    if ( ! CkMyRank() ) {
      for (int i=0; i<nc; i++) {
        computeMap->setNewNumPartitions(i,0);
        if (computeMap->newNode(i) != -1) {
          computeMap->setNode(i,computeMap->newNode(i));
          computeMap->setNewNode(i,-1);
        }
      }
    }
 
    for(int i=0; i<computeFlag.size(); i++) createCompute(computeFlag[i], computeMap);
    computeFlag.clear();

    proxyMgr->removeUnusedProxies();

    if (!CkMyPe())
    {
        CkStartQD(CkIndex_ComputeMgr::updateLocalComputes4((CkQdMsg*)0), &thishandle);
    }
}
Ejemplo n.º 3
0
/**
 * Splits computes for which a new partition count has been requested.
 *
 * On rank 0, every compute whose newNumPartitions() is positive is examined:
 * - if it is already partitioned (numPartitions() != 1) a warning is printed,
 *   the request is cleared and the compute is left alone;
 * - otherwise its partition count is set to the requested value, its pending
 *   node defaults to its current node when none is set (-1), and one clone is
 *   created for each additional partition (1 .. nnp-1).
 * After the scan the map's pointer storage is extended via extendPtrs().
 *
 * On PE 0, splitComputes2() is handed to CkStartQD as the next step.
 */
void ComputeMgr::splitComputes()
{
  if ( CkMyRank() == 0 ) {
    ComputeMap *computeMap = ComputeMap::Object();
    // Captured once: the loop only visits computes that existed before any
    // cloning performed below.
    const int nc = computeMap->numComputes();

    for (int i=0; i<nc; i++) {
      const int nnp = computeMap->newNumPartitions(i);
      if ( nnp <= 0 ) continue;  // no split requested for this compute

      if ( computeMap->numPartitions(i) != 1 ) {
        CkPrintf("Warning: unable to partition compute %d\n", i);
        computeMap->setNewNumPartitions(i,0);
        continue;
      }

      computeMap->setNumPartitions(i,nnp);
      if (computeMap->newNode(i) == -1) {
        computeMap->setNewNode(i,computeMap->node(i));
      }
      // Partition 0 is the original compute; the value returned by
      // cloneCompute() is not needed here, so it is not stored.
      for ( int j=1; j<nnp; ++j ) {
        computeMap->cloneCompute(i,j);
      }
    }
    computeMap->extendPtrs();
  }

  // Only PE 0 schedules the follow-up step.
  if ( CkMyPe() == 0 )
  {
    CkStartQD(CkIndex_ComputeMgr::splitComputes2((CkQdMsg*)0), &thishandle);
  }
}
Ejemplo n.º 4
0
static void namdInitPapiCounters(){
	if(CkMyRank()==0){
		//only initialize per OS process (i.e. a charm node)
		int retval = PAPI_library_init(PAPI_VER_CURRENT);
		if(retval != PAPI_VER_CURRENT) {
			if(CkMyPe()==0){
				CkPrintf("ERROR: PAPI library is not compatitible!");
				CkExit();
			}
		}
	#if CMK_SMP
		//now only consider systems that are compatible with POSIX
		if(PAPI_thread_init(pthread_self)!=PAPI_OK) {
			if(CkMyPe()==0){
				CkPrintf("ERROR: multi-thread mode in PAPI could not be initialized!");
				CkExit();
			}
		}
	#endif
	}
	CkpvInitialize(int *, papiEvents);
	CkpvAccess(papiEvents) = new int[NUM_PAPI_EVENTS];

#if MEASURE_PAPI_CACHE
	if(PAPI_query_event(PAPI_L1_DCM)==PAPI_OK) {
		CkpvAccess(papiEvents)[0] = PAPI_L1_DCM;
	}else{
		if(CkMyPe()==0){
			CkPrintf("WARNING: PAPI_L1_DCM doesn't exsit on this platform!\n");			
		}
		//if not default to PAPI_TOT_INS
		CkpvAccess(papiEvents)[0] = PAPI_TOT_INS;
	}

	if(PAPI_query_event(PAPI_L2_DCM)==PAPI_OK) {
		CkpvAccess(papiEvents)[1] = PAPI_L2_DCM;
	}else{
		//if not default to PAPI_TOT_CYC
		CkpvAccess(papiEvents)[1] = PAPI_TOT_CYC;
	}	
#elif MEASURE_PAPI_FLOPS
	if(PAPI_query_event(PAPI_FP_INS)==PAPI_OK) {
		CkpvAccess(papiEvents)[0] = PAPI_FP_INS;
	}else{
		if(CkMyPe()==0){
			CkPrintf("WARNING: PAPI_FP_INS doesn't exsit on this platform!\n");
		}
		//if not default to PAPI_TOT_INS
		CkpvAccess(papiEvents)[0] = PAPI_TOT_INS;
	}

	if(PAPI_query_event(PAPI_FMA_INS)==PAPI_OK) {
		CkpvAccess(papiEvents)[1] = PAPI_FMA_INS;
	}else{
		//if not default to PAPI_TOT_CYC
		CkpvAccess(papiEvents)[1] = PAPI_TOT_CYC;
	}
#endif
}
Ejemplo n.º 5
0
// Initialization of the parallel trace module.
// Registers traceSimpleExitFunction as an exit function, but only from the
// rank-0 PE (BgNodeRank() in BigSim builds, CkMyRank() otherwise).
void initTraceSimpleBOC() {
#ifdef __BIGSIM__
  const bool isFirstRank = (BgNodeRank() == 0);
#else
  const bool isFirstRank = (CkMyRank() == 0);
#endif
  if (isFirstRank) {
    registerExitFn(traceSimpleExitFunction);
  }
}
Ejemplo n.º 6
0
/**
 * This function (not a handler) is called once and only once per processor.
 * It signals the processor that the initialization is done and regular messages
 * can be processed.
 *
 * On processor zero it is called by _initCharm, on all other processors either
 * by _initHandler or _triggerHandler (cannot be both).
 * When fault-tolerance is active, it is called by the fault-tolerance scheme itself.
 *
 * Sequence performed here (the order matters because of the node barriers):
 *  1. return immediately if _initdone is already set, otherwise set it;
 *  2. send the triggers unless _triggersSent is already set;
 *  3. rebind the trigger handler index to _discardHandler;
 *  4. barrier, then rank 0 processes the buffered node-BOC inits;
 *  5. barrier, then every caller processes its buffered BOC inits;
 *  6. barrier, then the buffered messages are processed;
 *  7. _charmEpoch is set to 1.
 */
void _initDone(void)
{
  // Guard against repeated calls: only the first call does any work.
  if (CkpvAccess(_initdone)) return;
  CkpvAccess(_initdone) ++;
  DEBUGF(("[%d] _initDone.\n", CkMyPe()));
  // Triggers are sent here only if nobody has sent them yet.
  if (!CksvAccess(_triggersSent)) _sendTriggers();
  // From now on anything delivered to the trigger handler index goes to
  // _discardHandler instead.
  CkNumberHandler(_triggerHandlerIdx, (CmiHandler)_discardHandler);
  CmiNodeBarrier();
  // Node-level BOC inits are handled by rank 0 only.
  if(CkMyRank() == 0) {
    _processBufferedNodeBocInits();
  }
  CmiNodeBarrier(); // wait for all nodegroups to be created
  _processBufferedBocInits();
  DEBUGF(("Reached CmiNodeBarrier(), pe = %d, rank = %d\n", CkMyPe(), CkMyRank()));
  // Buffered messages are processed only after every rank has passed this
  // barrier, i.e. after all ranks finished their BOC inits above.
  CmiNodeBarrier();
  DEBUGF(("Crossed CmiNodeBarrier(), pe = %d, rank = %d\n", CkMyPe(), CkMyRank()));
  _processBufferedMsgs();
  CkpvAccess(_charmEpoch)=1;
}
Ejemplo n.º 7
0
// Resets the reduction bookkeeping of this manager: counters go back to zero,
// all queued current and future messages are deleted, and the base class
// state is flushed as well. Only rank 0 performs the reset; every other rank
// returns without touching anything.
void CkArrayReductionMgr::flushStates(){
  if (CkMyRank() != 0) {
    return;
  }

  // CmiPrintf("[%d] CkArrayReductionMgr::flushState\n", CkMyPe());
  redNo = 0;
  count = 0;

  // Drain both queues, freeing every message still held.
  while (!my_msgs.isEmpty()) {
    delete my_msgs.deq();
  }
  while (!my_futureMsgs.isEmpty()) {
    delete my_futureMsgs.deq();
  }

  reductionInfo.redNo = 0;
  CkNodeReductionMgr::flushStates();
}
Ejemplo n.º 8
0
void immediatering_init(void)
{ 
  int setNum = 0;
  if (CkMyRank()==0 && numTests==0) setNum = 1;
#if 1
  // test Charm immediate messages
  if (setNum) numTests +=2;
  immring_nodegrp[0].start(new immMessage);
  immring_grp[0].start(new immMessage);
#endif
#if 1
  if (setNum) numTests ++;
  sendImmediate(0,0);
#endif
}
Ejemplo n.º 9
0
// Runs all registered init calls.
// The initnode calls are executed by rank 0 only (BgNodeRank() in BigSim
// builds, i.e. once per emulating node; CkMyRank() otherwise). After a node
// barrier, every caller runs the initproc calls.
void InitCallTable::enumerateInitCalls()
{
#ifdef __BIGSIM__
  const bool runsNodeInits = (BgNodeRank() == 0);
#else
  const bool runsNodeInits = (CkMyRank() == 0);
#endif

  if (runsNodeInits) {
    for (int n = 0; n < initNodeCalls.length(); ++n) {
      initNodeCalls[n]();
    }
  }

  // initproc may depend on initnode calls, so wait until those are done.
  CmiNodeAllBarrier();

  for (int p = 0; p < initProcCalls.length(); ++p) {
    initProcCalls[p]();
  }
}
Ejemplo n.º 10
0
// Entry point executed on the slave procs.
// Performs the common initialization and then enters the scheduler, except
// on the communication thread. In SMP builds, rank 0 runs the master
// initialization instead and returns right away.
void slave_init(int argc, char **argv)
{
#if CMK_SMP
  // In SMP mode the original main thread may have become a comm thread and
  // a slave thread may now be the main thread, so the master initialization
  // has to be done from here.
  const bool actsAsMaster = (CmiMyRank() == 0);
  if (actsAsMaster) {
    master_init(argc, argv);
    if (CmiMyPe() == 0) {
      after_backend_init(argc, argv);
    }
    return;
  }
#endif

  all_init(argc, argv);

  // Ranks at or beyond the node size belong to the communication thread,
  // which must not enter the scheduler.
  const bool isWorkerThread = CkMyRank() < CkMyNodeSize();
  if (isWorkerThread) {
    CsdScheduler(-1);
  }
}
Ejemplo n.º 11
0
void ComputeNonbondedUtil::select(void)
{
  if ( CkMyRank() ) return;

  // These defaults die cleanly if nothing appropriate is assigned.
  ComputeNonbondedUtil::calcPair = calc_error;
  ComputeNonbondedUtil::calcPairEnergy = calc_error;
  ComputeNonbondedUtil::calcSelf = calc_error;
  ComputeNonbondedUtil::calcSelfEnergy = calc_error;
  ComputeNonbondedUtil::calcFullPair = calc_error;
  ComputeNonbondedUtil::calcFullPairEnergy = calc_error;
  ComputeNonbondedUtil::calcFullSelf = calc_error;
  ComputeNonbondedUtil::calcFullSelfEnergy = calc_error;
  ComputeNonbondedUtil::calcMergePair = calc_error;
  ComputeNonbondedUtil::calcMergePairEnergy = calc_error;
  ComputeNonbondedUtil::calcMergeSelf = calc_error;
  ComputeNonbondedUtil::calcMergeSelfEnergy = calc_error;
  ComputeNonbondedUtil::calcSlowPair = calc_error;
  ComputeNonbondedUtil::calcSlowPairEnergy = calc_error;
  ComputeNonbondedUtil::calcSlowSelf = calc_error;
  ComputeNonbondedUtil::calcSlowSelfEnergy = calc_error;

  SimParameters * simParams = Node::Object()->simParameters;
  Parameters * params = Node::Object()->parameters;

  table_ener = params->table_ener;
  rowsize = params->rowsize;
  columnsize = params->columnsize;

  commOnly = simParams->commOnly;
  fixedAtomsOn = ( simParams->fixedAtomsOn && ! simParams->fixedAtomsForces );

  cutoff = simParams->cutoff;
  cutoff2 = cutoff*cutoff;

//fepb
  alchFepOn = simParams->alchFepOn;
  Fep_WCA_repuOn = simParams->alchFepWCARepuOn;
  Fep_WCA_dispOn = simParams->alchFepWCADispOn;
  alchThermIntOn = simParams->alchThermIntOn;
  alchLambda = alchLambda2 = 0;
  lesOn = simParams->lesOn;
  lesScaling = lesFactor = 0;
  Bool tabulatedEnergies = simParams->tabulatedEnergies;
  alchVdwShiftCoeff = simParams->alchVdwShiftCoeff;
  WCA_rcut1 = simParams->alchFepWCArcut1;
  WCA_rcut2 = simParams->alchFepWCArcut2;
  alchVdwLambdaEnd = simParams->alchVdwLambdaEnd;
  alchElecLambdaStart = simParams->alchElecLambdaStart;

  alchDecouple = simParams->alchDecouple;

  delete [] lambda_table;
  lambda_table = 0;

  pairInteractionOn = simParams->pairInteractionOn;
  pairInteractionSelf = simParams->pairInteractionSelf;
  pressureProfileOn = simParams->pressureProfileOn;

  // Ported by JLai -- Original JE - Go
  goForcesOn = simParams->goForcesOn;
  goMethod = simParams->goMethod; 
  // End of port

  accelMDOn = simParams->accelMDOn;

  drudeNbthole = simParams->drudeOn && (simParams->drudeNbtholeCut > 0.0);

  if ( drudeNbthole ) {
#ifdef NAMD_CUDA
    NAMD_die("drudeNbthole is not supported in CUDA version");
#endif
    if ( alchFepOn )
      NAMD_die("drudeNbthole is not supported with alchemical free-energy perturbation");
    if ( alchThermIntOn )
      NAMD_die("drudeNbthole is not supported with alchemical thermodynamic integration");
    if ( lesOn )
      NAMD_die("drudeNbthole is not supported with locally enhanced sampling");
    if ( pairInteractionOn )
      NAMD_die("drudeNbthole is not supported with pair interaction calculation");
    if ( pressureProfileOn )
      NAMD_die("drudeNbthole is not supported with pressure profile calculation");
  }

  if ( alchFepOn ) {
#ifdef NAMD_CUDA
    NAMD_die("Alchemical free-energy perturbation is not supported in CUDA version");
#endif
    alchLambda = simParams->alchLambda;
    alchLambda2 = simParams->alchLambda2;
    ComputeNonbondedUtil::calcPair = calc_pair_energy_fep;
    ComputeNonbondedUtil::calcPairEnergy = calc_pair_energy_fep;
    ComputeNonbondedUtil::calcSelf = calc_self_energy_fep;
    ComputeNonbondedUtil::calcSelfEnergy = calc_self_energy_fep;
    ComputeNonbondedUtil::calcFullPair = calc_pair_energy_fullelect_fep;
    ComputeNonbondedUtil::calcFullPairEnergy = calc_pair_energy_fullelect_fep;
    ComputeNonbondedUtil::calcFullSelf = calc_self_energy_fullelect_fep;
    ComputeNonbondedUtil::calcFullSelfEnergy = calc_self_energy_fullelect_fep;
    ComputeNonbondedUtil::calcMergePair = calc_pair_energy_merge_fullelect_fep;
    ComputeNonbondedUtil::calcMergePairEnergy = calc_pair_energy_merge_fullelect_fep;
    ComputeNonbondedUtil::calcMergeSelf = calc_self_energy_merge_fullelect_fep;
    ComputeNonbondedUtil::calcMergeSelfEnergy = calc_self_energy_merge_fullelect_fep;
    ComputeNonbondedUtil::calcSlowPair = calc_pair_energy_slow_fullelect_fep;
    ComputeNonbondedUtil::calcSlowPairEnergy = calc_pair_energy_slow_fullelect_fep;
    ComputeNonbondedUtil::calcSlowSelf = calc_self_energy_slow_fullelect_fep;
    ComputeNonbondedUtil::calcSlowSelfEnergy = calc_self_energy_slow_fullelect_fep;
  }  else if ( alchThermIntOn ) {
#ifdef NAMD_CUDA
    NAMD_die("Alchemical thermodynamic integration is not supported in CUDA version");
#endif
    alchLambda = simParams->alchLambda;
    ComputeNonbondedUtil::calcPair = calc_pair_ti;
    ComputeNonbondedUtil::calcPairEnergy = calc_pair_energy_ti;
    ComputeNonbondedUtil::calcSelf = calc_self_ti;
    ComputeNonbondedUtil::calcSelfEnergy = calc_self_energy_ti;
    ComputeNonbondedUtil::calcFullPair = calc_pair_fullelect_ti;
    ComputeNonbondedUtil::calcFullPairEnergy = calc_pair_energy_fullelect_ti;
    ComputeNonbondedUtil::calcFullSelf = calc_self_fullelect_ti;
    ComputeNonbondedUtil::calcFullSelfEnergy = calc_self_energy_fullelect_ti;
    ComputeNonbondedUtil::calcMergePair = calc_pair_merge_fullelect_ti;
    ComputeNonbondedUtil::calcMergePairEnergy = calc_pair_energy_merge_fullelect_ti;
    ComputeNonbondedUtil::calcMergeSelf = calc_self_merge_fullelect_ti;
    ComputeNonbondedUtil::calcMergeSelfEnergy = calc_self_energy_merge_fullelect_ti;
    ComputeNonbondedUtil::calcSlowPair = calc_pair_slow_fullelect_ti;
    ComputeNonbondedUtil::calcSlowPairEnergy = calc_pair_energy_slow_fullelect_ti;
    ComputeNonbondedUtil::calcSlowSelf = calc_self_slow_fullelect_ti;
    ComputeNonbondedUtil::calcSlowSelfEnergy = calc_self_energy_slow_fullelect_ti;
  } else if ( lesOn ) {
#ifdef NAMD_CUDA
    NAMD_die("Locally enhanced sampling is not supported in CUDA version");
#endif
    lesFactor = simParams->lesFactor;
    lesScaling = 1.0 / (double)lesFactor;
    lambda_table = new BigReal[(lesFactor+1)*(lesFactor+1)];
    for ( int ip=0; ip<=lesFactor; ++ip ) {
      for ( int jp=0; jp<=lesFactor; ++jp ) {
        BigReal lambda_pair = 1.0;
        if (ip || jp ) {
          if (ip && jp && ip != jp) {
            lambda_pair = 0.0;
          } else {
            lambda_pair = lesScaling;
          }
        }
        lambda_table[(lesFactor+1)*ip+jp] = lambda_pair;
      }
    }
    ComputeNonbondedUtil::calcPair = calc_pair_les;
    ComputeNonbondedUtil::calcPairEnergy = calc_pair_energy_les;
    ComputeNonbondedUtil::calcSelf = calc_self_les;
    ComputeNonbondedUtil::calcSelfEnergy = calc_self_energy_les;
    ComputeNonbondedUtil::calcFullPair = calc_pair_fullelect_les;
    ComputeNonbondedUtil::calcFullPairEnergy = calc_pair_energy_fullelect_les;
    ComputeNonbondedUtil::calcFullSelf = calc_self_fullelect_les;
    ComputeNonbondedUtil::calcFullSelfEnergy = calc_self_energy_fullelect_les;
    ComputeNonbondedUtil::calcMergePair = calc_pair_merge_fullelect_les;
    ComputeNonbondedUtil::calcMergePairEnergy = calc_pair_energy_merge_fullelect_les;
    ComputeNonbondedUtil::calcMergeSelf = calc_self_merge_fullelect_les;
    ComputeNonbondedUtil::calcMergeSelfEnergy = calc_self_energy_merge_fullelect_les;
    ComputeNonbondedUtil::calcSlowPair = calc_pair_slow_fullelect_les;
    ComputeNonbondedUtil::calcSlowPairEnergy = calc_pair_energy_slow_fullelect_les;
    ComputeNonbondedUtil::calcSlowSelf = calc_self_slow_fullelect_les;
    ComputeNonbondedUtil::calcSlowSelfEnergy = calc_self_energy_slow_fullelect_les;
  } else if ( pressureProfileOn) {
#ifdef NAMD_CUDA
    NAMD_die("Pressure profile calculation is not supported in CUDA version");
#endif
    pressureProfileSlabs = simParams->pressureProfileSlabs;
    pressureProfileAtomTypes = simParams->pressureProfileAtomTypes;

    ComputeNonbondedUtil::calcPair = calc_pair_pprof;
    ComputeNonbondedUtil::calcPairEnergy = calc_pair_energy_pprof;
    ComputeNonbondedUtil::calcSelf = calc_self_pprof;
    ComputeNonbondedUtil::calcSelfEnergy = calc_self_energy_pprof;
    ComputeNonbondedUtil::calcFullPair = calc_pair_fullelect_pprof;
    ComputeNonbondedUtil::calcFullPairEnergy = calc_pair_energy_fullelect_pprof;
    ComputeNonbondedUtil::calcFullSelf = calc_self_fullelect_pprof;
    ComputeNonbondedUtil::calcFullSelfEnergy = calc_self_energy_fullelect_pprof;
    ComputeNonbondedUtil::calcMergePair = calc_pair_merge_fullelect_pprof;
    ComputeNonbondedUtil::calcMergePairEnergy = calc_pair_energy_merge_fullelect_pprof;
    ComputeNonbondedUtil::calcMergeSelf = calc_self_merge_fullelect_pprof;
    ComputeNonbondedUtil::calcMergeSelfEnergy = calc_self_energy_merge_fullelect_pprof;
    ComputeNonbondedUtil::calcSlowPair = calc_pair_slow_fullelect_pprof;
    ComputeNonbondedUtil::calcSlowPairEnergy = calc_pair_energy_slow_fullelect_pprof;
    ComputeNonbondedUtil::calcSlowSelf = calc_self_slow_fullelect_pprof;
    ComputeNonbondedUtil::calcSlowSelfEnergy = calc_self_energy_slow_fullelect_pprof;
  } else if ( pairInteractionOn ) {
#ifdef NAMD_CUDA
    NAMD_die("Pair interaction calculation is not supported in CUDA version");
#endif
    ComputeNonbondedUtil::calcPairEnergy = calc_pair_energy_int;
    ComputeNonbondedUtil::calcSelfEnergy = calc_self_energy_int;
    ComputeNonbondedUtil::calcFullPairEnergy = calc_pair_energy_fullelect_int;
    ComputeNonbondedUtil::calcFullSelfEnergy = calc_self_energy_fullelect_int;
    ComputeNonbondedUtil::calcMergePairEnergy = calc_pair_energy_merge_fullelect_int;
    ComputeNonbondedUtil::calcMergeSelfEnergy = calc_self_energy_merge_fullelect_int;
  } else if ( tabulatedEnergies ) {
#ifdef NAMD_CUDA
    NAMD_die("Tabulated energies is not supported in CUDA version");
#endif
    ComputeNonbondedUtil::calcPair = calc_pair_tabener;
    ComputeNonbondedUtil::calcPairEnergy = calc_pair_energy_tabener;
    ComputeNonbondedUtil::calcSelf = calc_self_tabener;
    ComputeNonbondedUtil::calcSelfEnergy = calc_self_energy_tabener;
    ComputeNonbondedUtil::calcFullPair = calc_pair_fullelect_tabener;
    ComputeNonbondedUtil::calcFullPairEnergy = calc_pair_energy_fullelect_tabener;
    ComputeNonbondedUtil::calcFullSelf = calc_self_fullelect_tabener;
    ComputeNonbondedUtil::calcFullSelfEnergy = calc_self_energy_fullelect_tabener;
    ComputeNonbondedUtil::calcMergePair = calc_pair_merge_fullelect_tabener;
    ComputeNonbondedUtil::calcMergePairEnergy = calc_pair_energy_merge_fullelect_tabener;
    ComputeNonbondedUtil::calcMergeSelf = calc_self_merge_fullelect_tabener;
    ComputeNonbondedUtil::calcMergeSelfEnergy = calc_self_energy_merge_fullelect_tabener;
    ComputeNonbondedUtil::calcSlowPair = calc_pair_slow_fullelect_tabener;
    ComputeNonbondedUtil::calcSlowPairEnergy = calc_pair_energy_slow_fullelect_tabener;
    ComputeNonbondedUtil::calcSlowSelf = calc_self_slow_fullelect_tabener;
    ComputeNonbondedUtil::calcSlowSelfEnergy = calc_self_energy_slow_fullelect_tabener;
  } else if ( goForcesOn ) {
#ifdef NAMD_CUDA
    NAMD_die("Go forces is not supported in CUDA version");
#endif
    ComputeNonbondedUtil::calcPair = calc_pair_go;
    ComputeNonbondedUtil::calcPairEnergy = calc_pair_energy_go;
    ComputeNonbondedUtil::calcSelf = calc_self_go;
    ComputeNonbondedUtil::calcSelfEnergy = calc_self_energy_go;
    ComputeNonbondedUtil::calcFullPair = calc_pair_fullelect_go;
    ComputeNonbondedUtil::calcFullPairEnergy = calc_pair_energy_fullelect_go;
    ComputeNonbondedUtil::calcFullSelf = calc_self_fullelect_go;
    ComputeNonbondedUtil::calcFullSelfEnergy = calc_self_energy_fullelect_go;
    ComputeNonbondedUtil::calcMergePair = calc_pair_merge_fullelect_go;
    ComputeNonbondedUtil::calcMergePairEnergy = calc_pair_energy_merge_fullelect_go;
    ComputeNonbondedUtil::calcMergeSelf = calc_self_merge_fullelect_go;
    ComputeNonbondedUtil::calcMergeSelfEnergy = calc_self_energy_merge_fullelect_go;
    ComputeNonbondedUtil::calcSlowPair = calc_pair_slow_fullelect_go;
    ComputeNonbondedUtil::calcSlowPairEnergy = calc_pair_energy_slow_fullelect_go;
    ComputeNonbondedUtil::calcSlowSelf = calc_self_slow_fullelect_go;
    ComputeNonbondedUtil::calcSlowSelfEnergy = calc_self_energy_slow_fullelect_go;
  } else {
    ComputeNonbondedUtil::calcPair = calc_pair;
    ComputeNonbondedUtil::calcPairEnergy = calc_pair_energy;
    ComputeNonbondedUtil::calcSelf = calc_self;
    ComputeNonbondedUtil::calcSelfEnergy = calc_self_energy;
    ComputeNonbondedUtil::calcFullPair = calc_pair_fullelect;
    ComputeNonbondedUtil::calcFullPairEnergy = calc_pair_energy_fullelect;
    ComputeNonbondedUtil::calcFullSelf = calc_self_fullelect;
    ComputeNonbondedUtil::calcFullSelfEnergy = calc_self_energy_fullelect;
    ComputeNonbondedUtil::calcMergePair = calc_pair_merge_fullelect;
    ComputeNonbondedUtil::calcMergePairEnergy = calc_pair_energy_merge_fullelect;
    ComputeNonbondedUtil::calcMergeSelf = calc_self_merge_fullelect;
    ComputeNonbondedUtil::calcMergeSelfEnergy = calc_self_energy_merge_fullelect;
    ComputeNonbondedUtil::calcSlowPair = calc_pair_slow_fullelect;
    ComputeNonbondedUtil::calcSlowPairEnergy = calc_pair_energy_slow_fullelect;
    ComputeNonbondedUtil::calcSlowSelf = calc_self_slow_fullelect;
    ComputeNonbondedUtil::calcSlowSelfEnergy = calc_self_energy_slow_fullelect;
  }

//fepe

  dielectric_1 = 1.0/simParams->dielectric;
  if ( ! ljTable ) ljTable = new LJTable;
  mol = Node::Object()->molecule;
  scaling = simParams->nonbondedScaling;
  if ( simParams->exclude == SCALED14 )
  {
    scale14 = simParams->scale14;
  }
  else
  {
    scale14 = 1.;
  }
  if ( simParams->switchingActive )
  {
    switchOn = simParams->switchingDist;
    switchOn_1 = 1.0/switchOn;
    // d0 = 1.0/(cutoff-switchOn);
    switchOn2 = switchOn*switchOn;
    c0 = 1.0/(cutoff2-switchOn2);

    if ( simParams->vdwForceSwitching ) {
      double switchOn3 = switchOn * switchOn2;
      double cutoff3 = cutoff * cutoff2;
      double switchOn6 = switchOn3 * switchOn3;
      double cutoff6 = cutoff3 * cutoff3;
      v_vdwa = -1. / ( switchOn6 * cutoff6 );
      v_vdwb = -1. / ( switchOn3 * cutoff3 );
      k_vdwa = cutoff6 / ( cutoff6 - switchOn6 );
      k_vdwb = cutoff3 / ( cutoff3 - switchOn3 );
      cutoff_3 = 1. / cutoff3;
      cutoff_6 = 1. / cutoff6;
    }
  }
  else
  {
    switchOn = cutoff;
    switchOn_1 = 1.0/switchOn;
    // d0 = 0.;  // avoid division by zero
    switchOn2 = switchOn*switchOn;
    c0 = 0.;  // avoid division by zero
  }
  c1 = c0*c0*c0;
  c3 = 3.0 * (cutoff2 - switchOn2);
  c5 = 0;
  c6 = 0;
  c7 = 0;
  c8 = 0;

  const int PMEOn = simParams->PMEOn;
  const int MSMOn = simParams->MSMOn;
  const int MSMSplit = simParams->MSMSplit;

  if ( PMEOn ) {
    ewaldcof = simParams->PMEEwaldCoefficient;
    BigReal TwoBySqrtPi = 1.12837916709551;
    pi_ewaldcof = TwoBySqrtPi * ewaldcof;
  }

  int splitType = SPLIT_NONE;
  if ( simParams->switchingActive ) splitType = SPLIT_SHIFT;
  if ( simParams->martiniSwitching ) splitType = SPLIT_MARTINI;
  if ( simParams->fullDirectOn || simParams->FMAOn || PMEOn || MSMOn ) {
    switch ( simParams->longSplitting ) {
      case C2:
      splitType = SPLIT_C2;
      break;

      case C1:
      splitType = SPLIT_C1;
      break;

      case XPLOR:
      NAMD_die("Sorry, XPLOR splitting not supported.");
      break;

      case SHARP:
      NAMD_die("Sorry, SHARP splitting not supported.");
      break;

      default:
      NAMD_die("Unknown splitting type found!");

    }
  }

  BigReal r2_tol = 0.1;
  
  r2_delta = 1.0;
  r2_delta_exp = 0;
  while ( r2_delta > r2_tol ) { r2_delta /= 2.0; r2_delta_exp += 1; }
  r2_delta_1 = 1.0 / r2_delta;

  if ( ! CkMyPe() ) {
    iout << iINFO << "NONBONDED TABLE R-SQUARED SPACING: " <<
				r2_delta << "\n" << endi;
  }

  BigReal r2_tmp = 1.0;
  int cutoff2_exp = 0;
  while ( (cutoff2 + r2_delta) > r2_tmp ) { r2_tmp *= 2.0; cutoff2_exp += 1; }

  int i;
  int n = (r2_delta_exp + cutoff2_exp) * 64 + 1;

  if ( ! CkMyPe() ) {
    iout << iINFO << "NONBONDED TABLE SIZE: " <<
				n << " POINTS\n" << endi;
  }

  if ( table_alloc ) delete [] table_alloc;
  table_alloc = new BigReal[61*n+16];
  BigReal *table_align = table_alloc;
  while ( ((long)table_align) % 128 ) ++table_align;
  table_noshort = table_align;
  table_short = table_align + 16*n;
  slow_table = table_align + 32*n;
  fast_table = table_align + 36*n;
  scor_table = table_align + 40*n;
  corr_table = table_align + 44*n;
  full_table = table_align + 48*n;
  vdwa_table = table_align + 52*n;
  vdwb_table = table_align + 56*n;
  r2_table = table_align + 60*n;
  BigReal *fast_i = fast_table + 4;
  BigReal *scor_i = scor_table + 4;
  BigReal *slow_i = slow_table + 4;
  BigReal *vdwa_i = vdwa_table + 4;
  BigReal *vdwb_i = vdwb_table + 4;
  BigReal *r2_i = r2_table;  *(r2_i++) = r2_delta;
  BigReal r2_limit = simParams->limitDist * simParams->limitDist;
  if ( r2_limit < r2_delta ) r2_limit = r2_delta;
  int r2_delta_i = 0;  // entry for r2 == r2_delta

  // fill in the table, fix up i==0 (r2==0) below
  for ( i=1; i<n; ++i ) {

    const BigReal r2_base = r2_delta * ( 1 << (i/64) );
    const BigReal r2_del = r2_base / 64.0;
    const BigReal r2 = r2_base - r2_delta + r2_del * (i%64);

    if ( r2 <= r2_limit ) r2_delta_i = i;

    const BigReal r = sqrt(r2);
    const BigReal r_1 = 1.0/r;
    const BigReal r_2 = 1.0/r2;

    // fast_ is defined as (full_ - slow_)
    // corr_ and fast_ are both zero at the cutoff, full_ is not
    // all three are approx 1/r at short distances

    // for actual interpolation, we use fast_ for fast forces and
    // scor_ = slow_ + corr_ - full_ and slow_ for slow forces
    // since these last two are of small magnitude

    BigReal fast_energy, fast_gradient;
    BigReal scor_energy, scor_gradient;
    BigReal slow_energy, slow_gradient;

    // corr_ is PME direct sum, or similar correction term
    // corr_energy is multiplied by r until later
    // corr_gradient is multiplied by -r^2 until later
    BigReal corr_energy, corr_gradient;

    
    if ( PMEOn ) {
      BigReal tmp_a = r * ewaldcof;
      BigReal tmp_b = erfc(tmp_a);
      corr_energy = tmp_b;
      corr_gradient = pi_ewaldcof*exp(-(tmp_a*tmp_a))*r + tmp_b;
    } else if ( MSMOn ) {
      BigReal a_1 = 1.0/cutoff;
      BigReal r_a = r * a_1;
      BigReal g, dg;
      SPOLY(&g, &dg, r_a, MSMSplit);
      corr_energy = 1 - r_a * g;
      corr_gradient = 1 + r_a*r_a * dg;
    } else {
      corr_energy = corr_gradient = 0;
    }

    switch(splitType) {
      case SPLIT_NONE:
        fast_energy = 1.0/r;
        fast_gradient = -1.0/r2;
        scor_energy = scor_gradient = 0;
        slow_energy = slow_gradient = 0;
	break;
      case SPLIT_SHIFT: {
	BigReal shiftVal = r2/cutoff2 - 1.0;
	shiftVal *= shiftVal;
	BigReal dShiftVal = 2.0 * (r2/cutoff2 - 1.0) * 2.0*r/cutoff2;
        fast_energy = shiftVal/r;
        fast_gradient = dShiftVal/r - shiftVal/r2;
        scor_energy = scor_gradient = 0;
        slow_energy = slow_gradient = 0;
        } 
	break;
      case SPLIT_MARTINI: { 
        // in Martini, the Coulomb switching distance is zero
        const BigReal COUL_SWITCH = 0.;
        // Gromacs shifting function
        const BigReal p1 = 1.;
        BigReal A1 = p1 * ((p1+1)*COUL_SWITCH-(p1+4)*cutoff)/(pow(cutoff,p1+2)*pow(cutoff-COUL_SWITCH,2));
        BigReal B1 = -p1 * ((p1+1)*COUL_SWITCH-(p1+3)*cutoff)/(pow(cutoff,p1+2)*pow(cutoff-COUL_SWITCH,3));
        BigReal X1 = 1.0/pow(cutoff,p1)-A1/3.0*pow(cutoff-COUL_SWITCH,3)-B1/4.0*pow(cutoff-COUL_SWITCH,4);
        BigReal r12 = (r-COUL_SWITCH)*(r-COUL_SWITCH);
        BigReal r13 = (r-COUL_SWITCH)*(r-COUL_SWITCH)*(r-COUL_SWITCH);
        BigReal shiftVal = -(A1/3.0)*r13 - (B1/4.0)*r12*r12 - X1;
        BigReal dShiftVal = -A1*r12 - B1*r13;
        fast_energy = (1/r) + shiftVal;
        fast_gradient = -1/(r2) + dShiftVal;
        scor_energy = scor_gradient = 0;
        slow_energy = slow_gradient = 0;
        } 
	break;
      case SPLIT_C1:
	// calculate actual energy and gradient
	slow_energy = 0.5/cutoff * (3.0 - (r2/cutoff2));
	slow_gradient = -1.0/cutoff2 * (r/cutoff);
	// calculate scor from slow and corr
	scor_energy = slow_energy + (corr_energy - 1.0)/r;
	scor_gradient = slow_gradient - (corr_gradient - 1.0)/r2;
	// calculate fast from slow
	fast_energy = 1.0/r - slow_energy;
	fast_gradient = -1.0/r2 - slow_gradient;
	break;
      case SPLIT_C2:
        //
        // Quintic splitting function contributed by
        // Bruce Berne, Ruhong Zhou, and Joe Morrone
        //
	// calculate actual energy and gradient
        slow_energy = r2/(cutoff*cutoff2) * (6.0 * (r2/cutoff2)
            - 15.0*(r/cutoff) + 10.0);
        slow_gradient = r/(cutoff*cutoff2) * (24.0 * (r2/cutoff2)
            - 45.0 *(r/cutoff) + 20.0);
	// calculate scor from slow and corr
        scor_energy = slow_energy + (corr_energy - 1.0)/r;
        scor_gradient = slow_gradient - (corr_gradient - 1.0)/r2;
	// calculate fast from slow
	fast_energy = 1.0/r - slow_energy;
	fast_gradient = -1.0/r2 - slow_gradient;
	break;
    }

    // foo_gradient is calculated as ( d foo_energy / d r )
    // and now divided by 2r to get ( d foo_energy / d r2 )

    fast_gradient *= 0.5 * r_1;
    scor_gradient *= 0.5 * r_1;
    slow_gradient *= 0.5 * r_1;

    // let modf be 1 if excluded, 1-scale14 if modified, 0 otherwise,
    // add scor_ - modf * slow_ to slow terms and
    // add fast_ - modf * fast_ to fast terms.

    BigReal vdwa_energy, vdwa_gradient;
    BigReal vdwb_energy, vdwb_gradient;

    const BigReal r_6 = r_2*r_2*r_2;
    const BigReal r_12 = r_6*r_6;

    // Lennard-Jones switching function
  if ( simParams->vdwForceSwitching ) {  // switch force
    // from Steinbach & Brooks, JCC 15, pgs 667-683, 1994, eqns 10-13
    if ( r2 > switchOn2 ) {
      BigReal tmpa = r_6 - cutoff_6;
      vdwa_energy = k_vdwa * tmpa * tmpa;
      BigReal tmpb = r_1 * r_2 - cutoff_3;
      vdwb_energy = k_vdwb * tmpb * tmpb;
      vdwa_gradient = -6.0 * k_vdwa * tmpa * r_2 * r_6;
      vdwb_gradient = -3.0 * k_vdwb * tmpb * r_2 * r_2 * r_1;
    } else {
      vdwa_energy = r_12 + v_vdwa;
      vdwb_energy = r_6 + v_vdwb;
      vdwa_gradient = -6.0 * r_2 * r_12;
      vdwb_gradient = -3.0 * r_2 * r_6;
    }
  } else if ( simParams->martiniSwitching ) { // switching fxn for Martini RBCG

    BigReal r12 = (r-switchOn)*(r-switchOn);        BigReal r13 = (r-switchOn)*(r-switchOn)*(r-switchOn);

    BigReal p6 = 6;
    BigReal A6 = p6 * ((p6+1)*switchOn-(p6+4)*cutoff)/(pow(cutoff,p6+2)*pow(cutoff-switchOn,2));
    BigReal B6 = -p6 * ((p6+1)*switchOn-(p6+3)*cutoff)/(pow(cutoff,p6+2)*pow(cutoff-switchOn,3));        
    BigReal C6 = 1.0/pow(cutoff,p6)-A6/3.0*pow(cutoff-switchOn,3)-B6/4.0*pow(cutoff-switchOn,4);

    BigReal p12 = 12;
    BigReal A12 = p12 * ((p12+1)*switchOn-(p12+4)*cutoff)/(pow(cutoff,p12+2)*pow(cutoff-switchOn,2));
    BigReal B12 = -p12 * ((p12+1)*switchOn-(p12+3)*cutoff)/(pow(cutoff,p12+2)*pow(cutoff-switchOn,3));
    BigReal C12 = 1.0/pow(cutoff,p12)-A12/3.0*pow(cutoff-switchOn,3)-B12/4.0*pow(cutoff-switchOn,4);

    BigReal LJshifttempA = -(A12/3)*r13 - (B12/4)*r12*r12 - C12;
    BigReal LJshifttempB = -(A6/3)*r13 - (B6/4)*r12*r12 - C6;
    const BigReal shiftValA =         // used for Lennard-Jones
                        ( r2 > switchOn2 ? LJshifttempA : -C12);
    const BigReal shiftValB =         // used for Lennard-Jones
                        ( r2 > switchOn2 ? LJshifttempB : -C6);

    BigReal LJdshifttempA = -A12*r12 - B12*r13;
    BigReal LJdshifttempB = -A6*r12 - B6*r13;
    const BigReal dshiftValA =         // used for Lennard-Jones
                        ( r2 > switchOn2 ? LJdshifttempA*0.5*r_1 : 0 );
    const BigReal dshiftValB =         // used for Lennard-Jones
                        ( r2 > switchOn2 ? LJdshifttempB*0.5*r_1 : 0 );




    //have not addressed r > cutoff

    //  dshiftValA*= 0.5*r_1;
    //  dshiftValB*= 0.5*r_1;

    vdwa_energy = r_12 + shiftValA;
    vdwb_energy = r_6 + shiftValB;
   
    vdwa_gradient = -6/pow(r,14) + dshiftValA ;
    vdwb_gradient = -3/pow(r,8) + dshiftValB;

  } else {  // switch energy
    const BigReal c2 = cutoff2-r2;
    const BigReal c4 = c2*(c3-2.0*c2);
    const BigReal switchVal =         // used for Lennard-Jones
                        ( r2 > switchOn2 ? c2*c4*c1 : 1.0 );
    const BigReal dSwitchVal =        // d switchVal / d r2
                        ( r2 > switchOn2 ? 2*c1*(c2*c2-c4) : 0.0 );

    vdwa_energy = switchVal * r_12;
    vdwb_energy = switchVal * r_6;

    vdwa_gradient = ( dSwitchVal - 6.0 * switchVal * r_2 ) * r_12;
    vdwb_gradient = ( dSwitchVal - 3.0 * switchVal * r_2 ) * r_6;
  }


    *(fast_i++) = fast_energy;
    *(fast_i++) = fast_gradient;
    *(fast_i++) = 0;
    *(fast_i++) = 0;
    *(scor_i++) = scor_energy;
    *(scor_i++) = scor_gradient;
    *(scor_i++) = 0;
    *(scor_i++) = 0;
    *(slow_i++) = slow_energy;
    *(slow_i++) = slow_gradient;
    *(slow_i++) = 0;
    *(slow_i++) = 0;
    *(vdwa_i++) = vdwa_energy;
    *(vdwa_i++) = vdwa_gradient;
    *(vdwa_i++) = 0;
    *(vdwa_i++) = 0;
    *(vdwb_i++) = vdwb_energy;
    *(vdwb_i++) = vdwb_gradient;
    *(vdwb_i++) = 0;
    *(vdwb_i++) = 0;
    *(r2_i++) = r2 + r2_delta;

  }

  if ( ! r2_delta_i ) {
    NAMD_bug("Failed to find table entry for r2 == r2_limit\n");
  }
  if ( r2_table[r2_delta_i] > r2_limit + r2_delta ) {
    NAMD_bug("Found bad table entry for r2 == r2_limit\n");
  }

  int j;
  const char *table_name = "XXXX";
  int smooth_short = 0;
  for ( j=0; j<5; ++j ) {
    BigReal *t0 = 0;
    switch (j) {
      case 0: 
        t0 = fast_table;
        table_name = "FAST";
        smooth_short = 1;
      break;
      case 1: 
        t0 = scor_table;
        table_name = "SCOR";
        smooth_short = 0;
      break;
      case 2: 
        t0 = slow_table;
        table_name = "SLOW";
        smooth_short = 0;
      break;
      case 3: 
        t0 = vdwa_table;
        table_name = "VDWA";
        smooth_short = 1;
      break;
      case 4: 
        t0 = vdwb_table;
        table_name = "VDWB";
        smooth_short = 1;
      break;
    }
    // patch up data for i=0
    t0[0] = t0[4] - t0[5] * ( r2_delta / 64.0 );  // energy
    t0[1] = t0[5];  // gradient
    t0[2] = 0;
    t0[3] = 0;
    if ( smooth_short ) {
      BigReal energy0 = t0[4*r2_delta_i];
      BigReal gradient0 = t0[4*r2_delta_i+1];
      BigReal r20 = r2_table[r2_delta_i];
      t0[0] = energy0 - gradient0 * (r20 - r2_table[0]);  // energy
      t0[1] = gradient0;  // gradient
    }
    BigReal *t;
    for ( i=0,t=t0; i<(n-1); ++i,t+=4 ) {
      BigReal x = ( r2_delta * ( 1 << (i/64) ) ) / 64.0;
      if ( r2_table[i+1] != r2_table[i] + x ) {
        NAMD_bug("Bad table delta calculation.\n");
      }
      if ( smooth_short && i+1 < r2_delta_i ) {
        BigReal energy0 = t0[4*r2_delta_i];
        BigReal gradient0 = t0[4*r2_delta_i+1];
        BigReal r20 = r2_table[r2_delta_i];
        t[4] = energy0 - gradient0 * (r20 - r2_table[i+1]);  // energy
        t[5] = gradient0;  // gradient
      }
      BigReal v1 = t[0];
      BigReal g1 = t[1];
      BigReal v2 = t[4];
      BigReal g2 = t[5];
      // explicit formulas for v1 + g1 x + c x^2 + d x^3
      BigReal c = ( 3.0 * (v2 - v1) - x * (2.0 * g1 + g2) ) / ( x * x );
      BigReal d = ( -2.0 * (v2 - v1) + x * (g1 + g2) ) / ( x * x * x );
      // since v2 - v1 is imprecise, we refine c and d numerically
      // important because we need accurate forces (more than energies!)
      for ( int k=0; k < 2; ++k ) {
        BigReal dv = (v1 - v2) + ( ( d * x + c ) * x + g1 ) * x;
        BigReal dg = (g1 - g2) + ( 3.0 * d * x + 2.0 * c ) * x;
        c -= ( 3.0 * dv - x * dg ) / ( x * x );
        d -= ( -2.0 * dv + x * dg ) / ( x * x * x );
      }
      // store in the array;
      t[2] = c;  t[3] = d;
    }

    if ( ! CkMyPe() ) {
    BigReal dvmax = 0;
    BigReal dgmax = 0;
    BigReal dvmax_r = 0;
    BigReal dgmax_r = 0;
    BigReal fdvmax = 0;
    BigReal fdgmax = 0;
    BigReal fdvmax_r = 0;
    BigReal fdgmax_r = 0;
    BigReal dgcdamax = 0;
    BigReal dgcdimax = 0;
    BigReal dgcaimax = 0;
    BigReal dgcdamax_r = 0;
    BigReal dgcdimax_r = 0;
    BigReal dgcaimax_r = 0;
    BigReal fdgcdamax = 0;
    BigReal fdgcdimax = 0;
    BigReal fdgcaimax = 0;
    BigReal fdgcdamax_r = 0;
    BigReal fdgcdimax_r = 0;
    BigReal fdgcaimax_r = 0;
    BigReal gcm = fabs(t0[1]);  // gradient magnitude running average
    for ( i=0,t=t0; i<(n-1); ++i,t+=4 ) {
      const BigReal r2_base = r2_delta * ( 1 << (i/64) );
      const BigReal r2_del = r2_base / 64.0;
      const BigReal r2 = r2_base - r2_delta + r2_del * (i%64);
      const BigReal r = sqrt(r2);
      if ( r > cutoff ) break;
      BigReal x = r2_del;
      BigReal dv = ( ( t[3] * x + t[2] ) * x + t[1] ) * x + t[0] - t[4];
      BigReal dg = ( 3.0 * t[3] * x + 2.0 * t[2] ) * x + t[1] - t[5];
      if ( t[4] != 0. && fabs(dv/t[4]) > fdvmax ) {
        fdvmax = fabs(dv/t[4]); fdvmax_r = r;
      }
      if ( fabs(dv) > dvmax ) {
        dvmax = fabs(dv); dvmax_r = r;
      }
      if ( t[5] != 0. && fabs(dg/t[5]) > fdgmax ) {
        fdgmax = fabs(dg/t[5]); fdgmax_r = r;
      }
      if ( fabs(dg) > dgmax ) {
        dgmax = fabs(dg); dgmax_r = r;
      }
      BigReal gcd = (t[4] - t[0]) / x;  // centered difference gradient
      BigReal gcd_prec = (fabs(t[0]) + fabs(t[4])) * 1.e-15 / x;  // roundoff
      gcm = 0.9 * gcm + 0.1 * fabs(t[5]);  // magnitude running average
      BigReal gca = 0.5  * (t[1] + t[5]);  // centered average gradient
      BigReal gci = ( 0.75 * t[3] * x + t[2] ) * x + t[1];  // interpolated
      BigReal rc = sqrt(r2 + 0.5 * x);
      BigReal dgcda = gcd - gca;
      if ( dgcda != 0. && fabs(dgcda) < gcd_prec ) {
        // CkPrintf("ERROR %g < PREC %g AT %g AVG VAL %g\n", dgcda, gcd_prec, rc, gca);
        dgcda = 0.;
      }
      BigReal dgcdi = gcd - gci;
      if ( dgcdi != 0. && fabs(dgcdi) < gcd_prec ) {
        // CkPrintf("ERROR %g < PREC %g AT %g INT VAL %g\n", dgcdi, gcd_prec, rc, gci);
        dgcdi = 0.;
      }
      BigReal dgcai = gca - gci;
      if ( t[1]*t[5] > 0. && gcm != 0. && fabs(dgcda/gcm) > fdgcdamax ) {
        fdgcdamax = fabs(dgcda/gcm); fdgcdamax_r = rc;
      }
      if ( fabs(dgcda) > fdgcdamax ) {
        dgcdamax = fabs(dgcda); dgcdamax_r = rc;
      }
      if ( t[1]*t[5] > 0. && gcm != 0. && fabs(dgcdi/gcm) > fdgcdimax ) {
        fdgcdimax = fabs(dgcdi/gcm); fdgcdimax_r = rc;
      }
      if ( fabs(dgcdi) > fdgcdimax ) {
        dgcdimax = fabs(dgcdi); dgcdimax_r = rc;
      }
      if ( t[1]*t[5] > 0. && gcm != 0. && fabs(dgcai/gcm) > fdgcaimax ) {
        fdgcaimax = fabs(dgcai/gcm); fdgcaimax_r = rc;
      }
      if ( fabs(dgcai) > fdgcaimax ) {
        dgcaimax = fabs(dgcai); dgcaimax_r = rc;
      }
#if 0
      CkPrintf("TABLE %s %g %g %g %g\n",table_name,rc,dgcda/gcm,dgcda,gci);
      if (dv != 0.) CkPrintf("TABLE %d ENERGY ERROR %g AT %g (%d)\n",j,dv,r,i);
      if (dg != 0.) CkPrintf("TABLE %d FORCE ERROR %g AT %g (%d)\n",j,dg,r,i);
#endif
    }
    if ( dvmax != 0.0 ) {
      iout << iINFO << "ABSOLUTE IMPRECISION IN " << table_name <<
        " TABLE ENERGY: " << dvmax << " AT " << dvmax_r << "\n" << endi;
    }
    if ( fdvmax != 0.0 ) {
      iout << iINFO << "RELATIVE IMPRECISION IN " << table_name <<
        " TABLE ENERGY: " << fdvmax << " AT " << fdvmax_r << "\n" << endi;
    }
    if ( dgmax != 0.0 ) {
      iout << iINFO << "ABSOLUTE IMPRECISION IN " << table_name <<
        " TABLE FORCE: " << dgmax << " AT " << dgmax_r << "\n" << endi;
    }
    if ( fdgmax != 0.0 ) {
      iout << iINFO << "RELATIVE IMPRECISION IN " << table_name <<
        " TABLE FORCE: " << fdgmax << " AT " << fdgmax_r << "\n" << endi;
    }
    if (fdgcdamax != 0.0 ) {
      iout << iINFO << "INCONSISTENCY IN " << table_name <<
        " TABLE ENERGY VS FORCE: " << fdgcdamax << " AT " << fdgcdamax_r << "\n" << endi;
      if ( fdgcdamax > 0.1 ) {
        iout << iERROR << "\n";
        iout << iERROR << "CALCULATED " << table_name <<
          " FORCE MAY NOT MATCH ENERGY! POSSIBLE BUG!\n";
        iout << iERROR << "\n";
      }
    }
    if (0 && fdgcdimax != 0.0 ) {
      iout << iINFO << "INCONSISTENCY IN " << table_name <<
        " TABLE ENERGY VS FORCE: " << fdgcdimax << " AT " << fdgcdimax_r << "\n" << endi;
    }
    if ( 0 && fdgcaimax != 0.0 ) {
      iout << iINFO << "INCONSISTENCY IN " << table_name <<
        " TABLE AVG VS INT FORCE: " << fdgcaimax << " AT " << fdgcaimax_r << "\n" << endi;
    }
    }

  }

  for ( i=0; i<4*n; ++i ) {
    corr_table[i] = fast_table[i] + scor_table[i];
    full_table[i] = fast_table[i] + slow_table[i];
  }

#if 0  
  for ( i=0; i<n; ++i ) {
   for ( int j=0; j<4; ++j ) {
    table_short[16*i+6-2*j] = table_noshort[16*i+6-2*j] = vdwa_table[4*i+j];
    table_short[16*i+7-2*j] = table_noshort[16*i+7-2*j] = vdwb_table[4*i+j];
    table_short[16*i+8+3-j] = fast_table[4*i+j];
    table_short[16*i+12+3-j] = scor_table[4*i+j];
    table_noshort[16*i+8+3-j] = corr_table[4*i+j];
    table_noshort[16*i+12+3-j] = full_table[4*i+j];
   }
  }
#endif 

  for ( i=0; i<n; ++i ) {
    table_short[16*i+ 0] = table_noshort[16*i+0] = -6.*vdwa_table[4*i+3];
    table_short[16*i+ 2] = table_noshort[16*i+2] = -6.*vdwb_table[4*i+3];
    table_short[16*i+ 4] = table_noshort[16*i+4] = -2.*vdwa_table[4*i+1];
    table_short[16*i+ 6] = table_noshort[16*i+6] = -2.*vdwb_table[4*i+1];
    
    table_short[16*i+1] = table_noshort[16*i+1] = -4.*vdwa_table[4*i+2];
    table_short[16*i+3] = table_noshort[16*i+3] = -4.*vdwb_table[4*i+2];
    table_short[16*i+5] = table_noshort[16*i+5] = -1.*vdwa_table[4*i+0];
    table_short[16*i+7] = table_noshort[16*i+7] = -1.*vdwb_table[4*i+0];
    
    table_short[16*i+8]  = -6.*fast_table[4*i+3];
    table_short[16*i+9]  = -4.*fast_table[4*i+2];
    table_short[16*i+10] = -2.*fast_table[4*i+1];
    table_short[16*i+11] = -1.*fast_table[4*i+0];

    table_noshort[16*i+8]  = -6.*corr_table[4*i+3];
    table_noshort[16*i+9]  = -4.*corr_table[4*i+2];
    table_noshort[16*i+10] = -2.*corr_table[4*i+1];
    table_noshort[16*i+11] = -1.*corr_table[4*i+0];

    table_short[16*i+12] = -6.*scor_table[4*i+3];
    table_short[16*i+13] = -4.*scor_table[4*i+2];
    table_short[16*i+14] = -2.*scor_table[4*i+1];
    table_short[16*i+15] = -1.*scor_table[4*i+0];

    table_noshort[16*i+12] = -6.*full_table[4*i+3];
    table_noshort[16*i+13] = -4.*full_table[4*i+2];
    table_noshort[16*i+14] = -2.*full_table[4*i+1];
    table_noshort[16*i+15] = -1.*full_table[4*i+0];
  }

#if 0
  char fname[100];
  sprintf(fname,"/tmp/namd.table.pe%d.dat",CkMyPe());
  FILE *f = fopen(fname,"w");
  for ( i=0; i<(n-1); ++i ) {
    const BigReal r2_base = r2_delta * ( 1 << (i/64) );
    const BigReal r2_del = r2_base / 64.0;
    const BigReal r2 = r2_base - r2_delta + r2_del * (i%64);
    BigReal *t;
    if ( r2 + r2_delta != r2_table[i] ) fprintf(f,"r2 error! ");
    fprintf(f,"%g",r2);
    t = fast_table + 4*i;
    fprintf(f,"   %g %g %g %g", t[0], t[1], t[2], t[3]);
    t = scor_table + 4*i;
    fprintf(f,"   %g %g %g %g", t[0], t[1], t[2], t[3]);
    t = slow_table + 4*i;
    fprintf(f,"   %g %g %g %g", t[0], t[1], t[2], t[3]);
    t = corr_table + 4*i;
    fprintf(f,"   %g %g %g %g", t[0], t[1], t[2], t[3]);
    t = full_table + 4*i;
    fprintf(f,"   %g %g %g %g", t[0], t[1], t[2], t[3]);
    t = vdwa_table + 4*i;
    fprintf(f,"   %g %g %g %g", t[0], t[1], t[2], t[3]);
    t = vdwb_table + 4*i;
    fprintf(f,"   %g %g %g %g", t[0], t[1], t[2], t[3]);
    fprintf(f,"\n");
  }
  fclose(f);
#endif

#ifdef NAMD_CUDA
  send_build_cuda_force_table();
#endif

}
Ejemplo n.º 12
0
/**
  This is the main charm setup routine.  It's called
  on all processors after Converse initialization.
  This routine gets passed to Converse from "main.C".
  
  The main purpose of this routine is to set up the objects
  and Ckpv's used during a regular Charm run.  See the comment
  at the top of the file for overall flow.

  @param unused_argc  Not read by this routine; where an argument count is
                      needed it is obtained from CmiGetArgc(argv).
  @param argv         Command-line argument vector.  It is stored in Ck_argv,
                      handed to the various module init routines, and passed
                      to the mainchares (or the restart function) in a
                      CkArgMsg.
*/
void _initCharm(int unused_argc, char **argv)
{ 
	// True when this thread's rank equals the node size; such a thread skips
	// the mainchare/restart work below and only installs message handlers.
	int inCommThread = (CmiMyRank() == CmiMyNodeSize());

	DEBUGF(("[%d,%.6lf ] _initCharm started\n",CmiMyPe(),CmiWallTimer()));

	CkpvInitialize(size_t *, _offsets);
	CkpvAccess(_offsets) = new size_t[32];
	CkpvInitialize(PtrQ*,_buffQ);
	CkpvInitialize(PtrVec*,_bocInitVec);
	CkpvInitialize(void*, _currentChare);
	CkpvInitialize(int,   _currentChareType);
	CkpvInitialize(CkGroupID, _currentGroup);
	CkpvInitialize(void *, _currentNodeGroupObj);
	CkpvInitialize(CkGroupID, _currentGroupRednMgr);
	CkpvInitialize(GroupTable*, _groupTable);
	CkpvInitialize(GroupIDTable*, _groupIDTable);
	CkpvInitialize(CmiImmediateLockType, _groupTableImmLock);
        CkpvInitialize(bool, _destroyingNodeGroup);
        CkpvAccess(_destroyingNodeGroup) = false;
	CkpvInitialize(UInt, _numGroups);
	CkpvInitialize(int, _numInitsRecd);
	CkpvInitialize(int, _initdone);
	CkpvInitialize(char**, Ck_argv); CkpvAccess(Ck_argv)=argv;
	CkpvInitialize(MsgPool*, _msgPool);
	CkpvInitialize(CkCoreState *, _coreState);
	/*
		Added for evacuation-sayantan
	*/
#ifndef __BIGSIM__
	CpvInitialize(char *,_validProcessors);
#endif
	CkpvInitialize(char ,startedEvac);
	CpvInitialize(int,serializer);

	_initChareTables();            // for checkpointable plain chares

	CksvInitialize(UInt, _numNodeGroups);
	CksvInitialize(GroupTable*, _nodeGroupTable);
	CksvInitialize(GroupIDTable, _nodeGroupIDTable);
	CksvInitialize(CmiImmediateLockType, _nodeGroupTableImmLock);
	CksvInitialize(CmiNodeLock, _nodeLock);
	CksvInitialize(PtrVec*,_nodeBocInitVec);
	CksvInitialize(UInt,_numInitNodeMsgs);
	CkpvInitialize(int,_charmEpoch);
	CkpvAccess(_charmEpoch)=0;
	CksvInitialize(int, _triggersSent);
	CksvAccess(_triggersSent) = 0;

	CkpvInitialize(_CkOutStream*, _ckout);
	CkpvInitialize(_CkErrStream*, _ckerr);
	CkpvInitialize(Stats*, _myStats);

	CkpvAccess(_groupIDTable) = new GroupIDTable(0);
	CkpvAccess(_groupTable) = new GroupTable;
	CkpvAccess(_groupTable)->init();
	CkpvAccess(_groupTableImmLock) = CmiCreateImmediateLock();
	CkpvAccess(_numGroups) = 1; // make 0 an invalid group number
	CkpvAccess(_buffQ) = new PtrQ();
	CkpvAccess(_bocInitVec) = new PtrVec();

	CkpvAccess(_currentNodeGroupObj) = NULL;

	// Node-level (Cksv) state is set up once per node, by rank 0 only.
	if(CkMyRank()==0)
	{
	  	CksvAccess(_numNodeGroups) = 1; //make 0 an invalid group number
          	CksvAccess(_numInitNodeMsgs) = 0;
		CksvAccess(_nodeLock) = CmiCreateLock();
		CksvAccess(_nodeGroupTable) = new GroupTable();
		CksvAccess(_nodeGroupTable)->init();
		CksvAccess(_nodeGroupTableImmLock) = CmiCreateImmediateLock();
		CksvAccess(_nodeBocInitVec) = new PtrVec();
	}

	CkCallbackInit();
	
	CmiNodeAllBarrier();

#if ! CMK_BIGSIM_CHARM
	initQd(argv);         // bigsim calls it in ConverseCommonInit
#endif

	CkpvAccess(_coreState)=new CkCoreState();

	CkpvAccess(_numInitsRecd) = 0;
	CkpvAccess(_initdone) = 0;

	CkpvAccess(_ckout) = new _CkOutStream();
	CkpvAccess(_ckerr) = new _CkErrStream();

	_charmHandlerIdx = CkRegisterHandler((CmiHandler)_bufferHandler);
	_initHandlerIdx = CkRegisterHandler((CmiHandler)_initHandler);
	CkNumberHandlerEx(_initHandlerIdx, (CmiHandlerEx)_initHandler, CkpvAccess(_coreState));
	_roRestartHandlerIdx = CkRegisterHandler((CmiHandler)_roRestartHandler);
	_exitHandlerIdx = CkRegisterHandler((CmiHandler)_exitHandler);
	//added for interoperabilitY
	_libExitHandlerIdx = CkRegisterHandler((CmiHandler)_libExitHandler);
	_bocHandlerIdx = CkRegisterHandler((CmiHandler)_initHandler);
	CkNumberHandlerEx(_bocHandlerIdx, (CmiHandlerEx)_initHandler, CkpvAccess(_coreState));

#ifdef __BIGSIM__
	if(BgNodeRank()==0) 
#endif
	_infoIdx = CldRegisterInfoFn((CldInfoFn)_infoFn);

	_triggerHandlerIdx = CkRegisterHandler((CmiHandler)_triggerHandler);
	_ckModuleInit();

	CldRegisterEstimator((CldEstimator)_charmLoadEstimator);

	_futuresModuleInit(); // part of futures implementation is a converse module
	_loadbalancerInit();
        _metabalancerInit();
	
#if CMK_MEM_CHECKPOINT
        init_memcheckpt(argv);
#endif

	initCharmProjections();
#if CMK_TRACE_IN_CHARM
        // initialize trace module in ck
        traceCharmInit(argv);
#endif
 	
    CkpvInitialize(int, envelopeEventID);
    CkpvAccess(envelopeEventID) = 0;
	CkMessageWatcherInit(argv,CkpvAccess(_coreState));
	
	/**
	  The rank-0 processor of each node calls the 
	  translator-generated "_register" routines. 
	  
	  _register routines call the charm.h "CkRegister*" routines,
	  which record function pointers and class information for
	  all Charm entities, like Chares, Arrays, and readonlies.
	  
	  There's one _register routine generated for each
	  .ci file.  _register routines *must* be called in the 
	  same order on every node, and *must not* be called by 
	  multiple threads simultaneously.
	*/
#ifdef __BIGSIM__
	if(BgNodeRank()==0) 
#else
	if(CkMyRank()==0)
#endif
	{
		SDAG::registerPUPables();
		CmiArgGroup("Charm++",NULL);
		_parseCommandLineOpts(argv);
		_registerInit();
		CkRegisterMsg("System", 0, 0, CkFreeMsg, sizeof(int));
		CkRegisterChareInCharm(CkRegisterChare("null", 0, TypeChare));
		CkIndex_Chare::__idx=CkRegisterChare("Chare", sizeof(Chare), TypeChare);
		CkRegisterChareInCharm(CkIndex_Chare::__idx);
		CkIndex_Group::__idx=CkRegisterChare("Group", sizeof(Group), TypeGroup);
                CkRegisterChareInCharm(CkIndex_Group::__idx);
		CkRegisterEp("null", (CkCallFnPtr)_nullFn, 0, 0, 0+CK_EP_INTRINSIC);
		
		/**
		  These _register calls are for the built-in
		  Charm .ci files, like arrays and load balancing.
		  If you add a .ci file to charm, you'll have to 
		  add a call to the _register routine here, or make
		  your library into a "-module".
		*/
		_registerCkFutures();
		_registerCkArray();
		_registerLBDatabase();
    _registerMetaBalancer();
		_registerCkCallback();
		_registertempo();
		_registerwaitqd();
		_registerCkCheckpoint();
#if CMK_MEM_CHECKPOINT
		_registerCkMemCheckpoint();
#endif


		/*
		  Setup Control Point Automatic Tuning Framework.

		  By default it is enabled as a part of charm, 
		  however it won't enable its tracing module 
		  unless a +CPEnableMeasurements command line argument
		  is specified. See trace-common.C for more info

		  Thus there should be no noticeable overhead to 
		  always having the control point framework linked
		  in.
		  
		*/
#if CMK_WITH_CONTROLPOINT
		_registerPathHistory();
		_registerControlPoints();
		_registerTraceControlPoints();
#endif


		/**
		  CkRegisterMainModule is generated by the (unique)
		  "mainmodule" .ci file.  It will include calls to 
		  register all the .ci files.
		*/
		CkRegisterMainModule();

		/**
		  _registerExternalModules is actually generated by 
		  charmc at link time (as "moduleinit<pid>.C").  
		  
		  This generated routine calls the _register functions
		  for the .ci files of libraries linked using "-module".
		  This funny initialization is most useful for AMPI/FEM
		  programs, which don't have a .ci file and hence have
		  no other way to control the _register process.
		*/
		_registerExternalModules(argv);
		
		_registerDone();
	}
	/* The following will happen on every virtual processor in BigEmulator, not just on once per real processor */
	if (CkMyRank() == 0) {
	  CpdBreakPointInit();
	}
	CmiNodeAllBarrier();

	// Execute the initcalls registered in modules
	_initCallTable.enumerateInitCalls();

#if CMK_CHARMDEBUG
	CpdFinishInitialization();
#endif

	//CmiNodeAllBarrier();

	CkpvAccess(_myStats) = new Stats();
	CkpvAccess(_msgPool) = new MsgPool();

	CmiNodeAllBarrier();

#if !(__FAULT__)
	CmiBarrier();
	CmiBarrier();
	CmiBarrier();
#endif
#if CMK_SMP_TRACE_COMMTHREAD
	_TRACE_BEGIN_COMPUTATION();	
#else
 	if (!inCommThread) {
	  _TRACE_BEGIN_COMPUTATION();
	}
#endif

#ifdef ADAPT_SCHED_MEM
    if(CkMyRank()==0){
	memCriticalEntries = new int[numMemCriticalEntries];
	int memcnt=0;
	for(int i=0; i<_entryTable.size(); i++){
	    if(_entryTable[i]->isMemCritical){
		memCriticalEntries[memcnt++] = i;
	    }
	}
    }
#endif

#if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
    _messageLoggingInit();
#endif

#ifndef __BIGSIM__
	/*
		FAULT_EVAC
	*/
	// Every PE starts out marked as valid (1).
	CpvAccess(_validProcessors) = new char[CkNumPes()];
	for(int vProc=0;vProc<CkNumPes();vProc++){
		CpvAccess(_validProcessors)[vProc]=1;
	}
	_ckEvacBcastIdx = CkRegisterHandler((CmiHandler)_ckEvacBcast);
	_ckAckEvacIdx = CkRegisterHandler((CmiHandler)_ckAckEvac);
#endif
	CkpvAccess(startedEvac) = 0;
	CpvAccess(serializer) = 0;

	evacuate = 0;
	CcdCallOnCondition(CcdSIGUSR1,(CcdVoidFn)CkDecideEvacPe,0);
#if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_)) 
    CcdCallOnCondition(CcdSIGUSR2,(CcdVoidFn)CkMlogRestart,0);
#endif

	if(_raiseEvac){
		processRaiseEvacFile(_raiseEvacFile);
		/*
		if(CkMyPe() == 2){
		//	CcdCallOnConditionKeep(CcdPERIODIC_10s,(CcdVoidFn)CkDecideEvacPe,0);
			CcdCallFnAfter((CcdVoidFn)CkDecideEvacPe, 0, 10000);
		}
		if(CkMyPe() == 3){
			CcdCallFnAfter((CcdVoidFn)CkDecideEvacPe, 0, 10000);
		}*/
	}	
    
    if (CkMyRank() == 0) {
      TopoManager_init();
    }
    CmiNodeAllBarrier();

    if (!_replaySystem) {
        CkFtFn  faultFunc_restart = CkRestartMain;
        if (faultFunc == NULL || faultFunc == faultFunc_restart) {         // this is not restart from memory
            // these two are blocking calls for non-bigsim
#if ! CMK_BIGSIM_CHARM
	  CmiInitCPUAffinity(argv);
          CmiInitMemAffinity(argv);
#endif
        }
        CmiInitCPUTopology(argv);
#if CMK_SHARED_VARS_POSIX_THREADS_SMP
        if (CmiCpuTopologyEnabled()) {
            int *pelist;
            int num;
            CmiGetPesOnPhysicalNode(0, &pelist, &num);
#if !CMK_MULTICORE && !CMK_SMP_NO_COMMTHD
            // Count communication threads, if present
            // XXX: Assuming uniformity of node size here
            num += num/CmiMyNodeSize();
#endif
            if (!_Cmi_forceSpinOnIdle && num > CmiNumCores())
            {
              if (CmiMyPe() == 0)
                CmiPrintf("\nCharm++> Warning: the number of SMP threads (%d) is greater than the number of physical cores (%d), so threads will sleep while idling. Use +CmiSpinOnIdle or +CmiSleepOnIdle to control this directly.\n\n", num, CmiNumCores());
              CmiLock(CksvAccess(_nodeLock));
              if (! _Cmi_sleepOnIdle) _Cmi_sleepOnIdle = 1;
              CmiUnlock(CksvAccess(_nodeLock));
            }
        }
#endif
    }

    if(CmiMyPe() == 0) {
        char *topoFilename;
        if(CmiGetArgStringDesc(argv,"+printTopo",&topoFilename,"topo file name")) 
        {
            // The output file is "<topoFilename>.<partition>".
            std::stringstream sstm;
            sstm << topoFilename << "." << CmiMyPartition();
            std::string result = sstm.str();
            FILE *fp = fopen(result.c_str(), "w");
            const bool ownsFile = (fp != NULL);
            if (!ownsFile) {
              // Report the path that was actually attempted, then fall back.
              CkPrintf("Error opening %s file, writing to stdout\n", result.c_str());
              fp = stdout;
            }
	    TopoManager_printAllocation(fp);
            // Close only a file we opened ourselves: closing stdout here
            // would break all later output from this process.
            if (ownsFile) {
              fclose(fp);
            } else {
              fflush(fp);
            }
        }
    }

#if CMK_USE_PXSHM && ( CMK_CRAYXE || CMK_CRAYXC ) && CMK_SMP
      // for SMP on Cray XE6 (hopper) it seems pxshm has to be initialized
      // again after cpuaffinity is done
    if (CkMyRank() == 0) {
      CmiInitPxshm(argv);
    }
    CmiNodeAllBarrier();
#endif

    //CldCallback();
#if CMK_BIGSIM_CHARM && CMK_CHARMDEBUG
      // Register the BG handler for CCS. Notice that this is put into a variable shared by
      // the whole real processor. This because converse needs to find it. We check that all
      // virtual processors register the same index for this handler.
    CpdBgInit();
#endif

	if (faultFunc) {
		// Restart path: hand control to the registered restart function
		// instead of constructing the mainchares.
#if CMK_WITH_STATS
		if (CkMyPe()==0) _allStats = new Stats*[CkNumPes()];
#endif
		if (!inCommThread) {
                  CkArgMsg *msg = (CkArgMsg *)CkAllocMsg(0, sizeof(CkArgMsg), 0);
                  msg->argc = CmiGetArgc(argv);
                  msg->argv = argv;
                  faultFunc(_restartDir, msg);
                  CkFreeMsg(msg);
                }
	}else if(CkMyPe()==0){
#if CMK_WITH_STATS
		_allStats = new Stats*[CkNumPes()];
#endif
		size_t i, nMains=_mainTable.size();
		for(i=0;i<nMains;i++)  /* Create all mainchares */
		{
			int size = _chareTable[_mainTable[i]->chareIdx]->size;
			void *obj = malloc(size);
			_MEMCHECK(obj);
			_mainTable[i]->setObj(obj);
			CkpvAccess(_currentChare) = obj;
			CkpvAccess(_currentChareType) = _mainTable[i]->chareIdx;
			CkArgMsg *msg = (CkArgMsg *)CkAllocMsg(0, sizeof(CkArgMsg), 0);
			msg->argc = CmiGetArgc(argv);
			msg->argv = argv;
			_entryTable[_mainTable[i]->entryIdx]->call(msg, obj);
#if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
            CpvAccess(_currentObj) = (Chare *)obj;
#endif
		}
                _mainDone = 1;

		_STATS_RECORD_CREATE_CHARE_N(nMains);
		_STATS_RECORD_PROCESS_CHARE_N(nMains);




		for(i=0;i<_readonlyMsgs.size();i++) /* Send out readonly messages */
		{
			void *roMsg = (void *) *((char **)(_readonlyMsgs[i]->pMsg));
			if(roMsg==0)
				continue;
			//Pack the message and send it to all other processors
			envelope *env = UsrToEnv(roMsg);
			env->setSrcPe(CkMyPe());
			env->setMsgtype(ROMsgMsg);
			env->setRoIdx(i);
			CmiSetHandler(env, _initHandlerIdx);
			CkPackMessage(&env);
			CmiSyncBroadcast(env->getTotalsize(), (char *)env);
			CpvAccess(_qd)->create(CkNumPes()-1);

			//For processor 0, unpack and re-set the global
			CkUnpackMessage(&env);
			_processROMsgMsg(env);
			_numInitMsgs++;
		}

		//Determine the size of the RODataMessage
		PUP::sizer ps;
		for(i=0;i<_readonlyTable.size();i++) _readonlyTable[i]->pupData(ps);

		//Allocate and fill out the RODataMessage
		envelope *env = _allocEnv(RODataMsg, ps.size());
		PUP::toMem pp((char *)EnvToUsr(env));
		for(i=0;i<_readonlyTable.size();i++) _readonlyTable[i]->pupData(pp);

		env->setCount(++_numInitMsgs);
		env->setSrcPe(CkMyPe());
		CmiSetHandler(env, _initHandlerIdx);
		DEBUGF(("[%d,%.6lf] RODataMsg being sent of size %d \n",CmiMyPe(),CmiWallTimer(),env->getTotalsize()));
		CmiSyncBroadcastAndFree(env->getTotalsize(), (char *)env);
		CpvAccess(_qd)->create(CkNumPes()-1);
		_initDone();
	}

	DEBUGF(("[%d,%d%.6lf] inCommThread %d\n",CmiMyPe(),CmiMyRank(),CmiWallTimer(),inCommThread));
	// when I am a communication thread, I don't participate initDone.
        if (inCommThread) {
                CkNumberHandlerEx(_bocHandlerIdx,(CmiHandlerEx)_processHandler,
                                        CkpvAccess(_coreState));
                CkNumberHandlerEx(_charmHandlerIdx,(CmiHandlerEx)_processHandler,
                                        CkpvAccess(_coreState));
                _processBufferedMsgs();
        }

#if CMK_CHARMDEBUG
        // Should not use CpdFreeze inside a thread (since this processor is really a user-level thread)
       if (CpvAccess(cpdSuspendStartup))
       { 
          //CmiPrintf("In Parallel Debugging mode .....\n");
          CpdFreeze();
       }
#endif


#if __FAULT__
	if(killFlag){                                                  
                readKillFile();                                        
        }
#endif

}
Ejemplo n.º 13
0
// Formats "<dirname>/<stem>_<index>.dat" into buf (capacity bufSize).
// Aborts instead of overflowing or silently truncating the buffer when the
// resulting path does not fit, since a truncated path would name the wrong file.
static void ckFormatCheckpointPath(char *buf, size_t bufSize, const char *dirname,
                                   const char *stem, int index)
{
	const int len = snprintf(buf, bufSize, "%s/%s_%d.dat", dirname, stem, index);
	if (len < 0 || (size_t)len >= bufSize)
		CkAbort("Checkpoint file path is too long!");
}

// broadcast
//
// Writes this PE's share of a checkpoint into directory `dirname`:
//   Chares_<pe>.dat      (only when CMK_CHARE_USE_PTR is not defined)
//   Groups_<pe>.dat
//   NodeGroups_<node>.dat (written by rank 0 of each node only)
//   arr_<pe>.dat
// PE 0 additionally calls checkpointOne().  Afterwards `cb` is saved in
// restartCB and this PE contributes to a reduction whose target is
// SendRestartCB.  Any file that cannot be created aborts the run.
void CkCheckpointMgr::Checkpoint(const char *dirname, CkCallback& cb){
	chkptStartTimer = CmiWallTimer();
	// every body make dir in case it is local directory
	CmiMkdir(dirname);

	if (CkMyPe() == 0) {
          checkpointOne(dirname, cb);
 	}

	char fileName[1024];

#ifndef CMK_CHARE_USE_PTR
	// save groups into Chares.dat
	ckFormatCheckpointPath(fileName, sizeof(fileName), dirname, "Chares", CkMyPe());
	FILE* fChares = CmiFopen(fileName,"wb");
	if(!fChares) CkAbort("Failed to create checkpoint file for chares!");
	PUP::toDisk pChares(fChares);
	CkPupChareData(pChares);
	CmiFclose(fChares);
#endif

	// save groups into Groups.dat
	// content of the file: numGroups, GroupInfo[numGroups], _groupTable(PUP'ed), groups(PUP'ed)
	ckFormatCheckpointPath(fileName, sizeof(fileName), dirname, "Groups", CkMyPe());
	FILE* fGroups = CmiFopen(fileName,"wb");
	if(!fGroups) CkAbort("Failed to create checkpoint file for group table!");
	PUP::toDisk pGroups(fGroups);
#if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
    CkPupGroupData(pGroups,CmiTrue);
#else
    CkPupGroupData(pGroups);
#endif
	CmiFclose(fGroups);

	// save nodegroups into NodeGroups.dat
	// content of the file: numNodeGroups, GroupInfo[numNodeGroups], _nodeGroupTable(PUP'ed), nodegroups(PUP'ed)
	if (CkMyRank() == 0) {
	  ckFormatCheckpointPath(fileName, sizeof(fileName), dirname, "NodeGroups", CkMyNode());
	  FILE* fNodeGroups = CmiFopen(fileName,"wb");
	  if(!fNodeGroups) 
	    CkAbort("Failed to create checkpoint file for nodegroup table!");
	  PUP::toDisk pNodeGroups(fNodeGroups);
#if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
      CkPupNodeGroupData(pNodeGroups,CmiTrue);
#else
      CkPupNodeGroupData(pNodeGroups);
#endif
	  CmiFclose(fNodeGroups);
  	}

	//DEBCHK("[%d]CkCheckpointMgr::Checkpoint called dirname={%s}\n",CkMyPe(),dirname);
	ckFormatCheckpointPath(fileName, sizeof(fileName), dirname, "arr", CkMyPe());
	FILE *datFile=CmiFopen(fileName,"wb");
	if (datFile==NULL) CkAbort("Could not create data file");
	PUP::toDisk  p(datFile);
	CkPupArrayElementsData(p);
	CmiFclose(datFile);

#if CMK_HAS_SYNC && ! CMK_DISABLE_SYNC
	system("sync");
#endif

	// Remember the caller's callback, then join the reduction that targets
	// SendRestartCB.
	restartCB = cb;
	DEBCHK("[%d]restartCB installed\n",CkMyPe());
	CkCallback localcb(CkIndex_CkCheckpointMgr::SendRestartCB(NULL),0,thisgroup);
	contribute(0,NULL,CkReduction::sum_int,localcb);
}
Ejemplo n.º 14
0
// called from init.C
void _loadbalancerInit()
{
  CkpvInitialize(int, lbdatabaseInited);
  CkpvAccess(lbdatabaseInited) = 0;
  CkpvInitialize(int, numLoadBalancers);
  CkpvAccess(numLoadBalancers) = 0;
  CkpvInitialize(int, hasNullLB);
  CkpvAccess(hasNullLB) = 0;

  char **argv = CkGetArgv();
  char *balancer = NULL;
  CmiArgGroup("Charm++","Load Balancer");
  while (CmiGetArgStringDesc(argv, "+balancer", &balancer, "Use this load balancer")) {
    if (CkMyRank() == 0)                
      lbRegistry.addRuntimeBalancer(balancer);   /* lbRegistry is a static */
  }

  // set up init value for LBPeriod time in seconds
  // it can also be set by calling LDSetLBPeriod()
  CmiGetArgDoubleDesc(argv,"+LBPeriod", &_lb_args.lbperiod(),"the minimum time period in seconds allowed for two consecutive automatic load balancing");
  _lb_args.loop() = CmiGetArgFlagDesc(argv, "+LBLoop", "Use multiple load balancing strategies in loop");

  // now called in cldb.c: CldModuleGeneralInit()
  // registerLBTopos();
  CmiGetArgStringDesc(argv, "+LBTopo", &_lbtopo, "define load balancing topology");
  //Read the K parameter for RefineKLB
  CmiGetArgIntDesc(argv, "+LBNumMoves", &_lb_args.percentMovesAllowed() , "Percentage of chares to be moved (used by RefineKLB) [0-100]");

  /**************** FUTURE PREDICTOR ****************/
  _lb_predict = CmiGetArgFlagDesc(argv, "+LBPredictor", "Turn on LB future predictor");
  CmiGetArgIntDesc(argv, "+LBPredictorDelay", &_lb_predict_delay, "Number of balance steps before learning a model");
  CmiGetArgIntDesc(argv, "+LBPredictorWindow", &_lb_predict_window, "Number of steps to use to learn a model");
  if (_lb_predict_window < _lb_predict_delay) {
    CmiPrintf("LB> [%d] Argument LBPredictorWindow (%d) less than LBPredictorDelay (%d) , fixing\n", CkMyPe(), _lb_predict_window, _lb_predict_delay);
    _lb_predict_delay = _lb_predict_window;
  }

  /******************* SIMULATION *******************/
  // get the step number at which to dump the LB database
  CmiGetArgIntDesc(argv, "+LBVersion", &_lb_args.lbversion(), "LB database file version number");
  CmiGetArgIntDesc(argv, "+LBCentPE", &_lb_args.central_pe(), "CentralLB processor");
  int _lb_dump_activated = 0;
  if (CmiGetArgIntDesc(argv, "+LBDump", &LBSimulation::dumpStep, "Dump the LB state from this step"))
    _lb_dump_activated = 1;
  if (_lb_dump_activated && LBSimulation::dumpStep < 0) {
    CmiPrintf("LB> Argument LBDump (%d) negative, setting to 0\n",LBSimulation::dumpStep);
    LBSimulation::dumpStep = 0;
  }
  CmiGetArgIntDesc(argv, "+LBDumpSteps", &LBSimulation::dumpStepSize, "Dump the LB state for this amount of steps");
  if (LBSimulation::dumpStepSize <= 0) {
    CmiPrintf("LB> Argument LBDumpSteps (%d) too small, setting to 1\n",LBSimulation::dumpStepSize);
    LBSimulation::dumpStepSize = 1;
  }
  CmiGetArgStringDesc(argv, "+LBDumpFile", &LBSimulation::dumpFile, "Set the LB state file name");
  // get the simulation flag and number. Now the flag can also be avoided by the presence of the number
  LBSimulation::doSimulation = CmiGetArgIntDesc(argv, "+LBSim", &LBSimulation::simStep, "Read LB state from LBDumpFile since this step");
  // check for stupid LBSim parameter
  if (LBSimulation::doSimulation && LBSimulation::simStep < 0) {
    CmiPrintf("LB> Argument LBSim (%d) invalid, should be >= 0\n");
    CkExit();
    return;
  }
  CmiGetArgIntDesc(argv, "+LBSimSteps", &LBSimulation::simStepSize, "Read LB state for this number of steps");
  if (LBSimulation::simStepSize <= 0) {
    CmiPrintf("LB> Argument LBSimSteps (%d) too small, setting to 1\n",LBSimulation::simStepSize);
    LBSimulation::simStepSize = 1;
  }


  LBSimulation::simProcs = 0;
  CmiGetArgIntDesc(argv, "+LBSimProcs", &LBSimulation::simProcs, "Number of target processors.");

  LBSimulation::showDecisionsOnly = 
    CmiGetArgFlagDesc(argv, "+LBShowDecisions",
		      "Write to File: Load Balancing Object to Processor Map decisions during LB Simulation");

  // force a global barrier after migration done
  _lb_args.syncResume() = CmiGetArgFlagDesc(argv, "+LBSyncResume", 
                  "LB performs a barrier after migration is finished");

  // both +LBDebug and +LBDebug level should work
  if (!CmiGetArgIntDesc(argv, "+LBDebug", &_lb_args.debug(), 
                                          "Turn on LB debugging printouts"))
    _lb_args.debug() = CmiGetArgFlagDesc(argv, "+LBDebug", 
  					     "Turn on LB debugging printouts");

  // getting the size of the team with +teamSize
  if (!CmiGetArgIntDesc(argv, "+teamSize", &_lb_args.teamSize(), 
                                          "Team size"))
    _lb_args.teamSize() = 1;

  // ask to print summary/quality of load balancer
  _lb_args.printSummary() = CmiGetArgFlagDesc(argv, "+LBPrintSummary",
		"Print load balancing result summary");

  // to ignore baclground load
  _lb_args.ignoreBgLoad() = CmiGetArgFlagDesc(argv, "+LBNoBackground", 
                      "Load balancer ignores the background load.");
#ifdef __BIGSIM__
  _lb_args.ignoreBgLoad() = 1;
#endif
  _lb_args.migObjOnly() = CmiGetArgFlagDesc(argv, "+LBObjOnly", 
                      "Only load balancing migratable objects, ignoring all others.");
  if (_lb_args.migObjOnly()) _lb_args.ignoreBgLoad() = 1;

  // assume all CPUs are identical
  _lb_args.testPeSpeed() = CmiGetArgFlagDesc(argv, "+LBTestPESpeed", 
                      "Load balancer test all CPUs speed.");
  _lb_args.samePeSpeed() = CmiGetArgFlagDesc(argv, "+LBSameCpus", 
                      "Load balancer assumes all CPUs are of same speed.");
  if (!_lb_args.testPeSpeed()) _lb_args.samePeSpeed() = 1;

  _lb_args.useCpuTime() = CmiGetArgFlagDesc(argv, "+LBUseCpuTime", 
                      "Load balancer uses CPU time instead of wallclock time.");

  // turn instrumentation off at startup
  _lb_args.statsOn() = !CmiGetArgFlagDesc(argv, "+LBOff",
			"Turn load balancer instrumentation off");

  // turn instrumentation of communication off at startup
  _lb_args.traceComm() = !CmiGetArgFlagDesc(argv, "+LBCommOff",
		"Turn load balancer instrumentation of communication off");

  // set alpha and beeta
  _lb_args.alpha() = PER_MESSAGE_SEND_OVERHEAD_DEFAULT;
  _lb_args.beeta() = PER_BYTE_SEND_OVERHEAD_DEFAULT;
  CmiGetArgDoubleDesc(argv,"+LBAlpha", &_lb_args.alpha(),
                           "per message send overhead");
  CmiGetArgDoubleDesc(argv,"+LBBeta", &_lb_args.beeta(),
                           "per byte send overhead");

  if (CkMyPe() == 0) {
    if (_lb_args.debug()) {
      CmiPrintf("CharmLB> Verbose level %d, load balancing period: %g seconds\n", _lb_args.debug(), _lb_args.lbperiod());
    }
    if (_lb_args.debug() > 1) {
      CmiPrintf("CharmLB> Topology %s alpha: %es beta: %es.\n", _lbtopo, _lb_args.alpha(), _lb_args.beeta());
    }
    if (_lb_args.printSummary())
      CmiPrintf("CharmLB> Load balancer print summary of load balancing result.\n");
    if (_lb_args.ignoreBgLoad())
      CmiPrintf("CharmLB> Load balancer ignores processor background load.\n");
    if (_lb_args.samePeSpeed())
      CmiPrintf("CharmLB> Load balancer assumes all CPUs are same.\n");
    if (_lb_args.useCpuTime())
      CmiPrintf("CharmLB> Load balancer uses CPU time instead of wallclock time.\n");
    if (LBSimulation::doSimulation)
      CmiPrintf("CharmLB> Load balancer running in simulation mode on file '%s' version %d.\n", LBSimulation::dumpFile, _lb_args.lbversion());
    if (_lb_args.statsOn()==0)
      CkPrintf("CharmLB> Load balancing instrumentation is off.\n");
    if (_lb_args.traceComm()==0)
      CkPrintf("CharmLB> Load balancing instrumentation for communication is off.\n");
    if (_lb_args.migObjOnly())
      CkPrintf("LB> Load balancing strategy ignores non-migratable objects.\n");
  }
}