Example #1
CmiCommHandle CmiAsyncBroadcastFn(int size, char *msg) {
#if ENSURE_MSG_PAIRORDER
    /* Message sequence numbers are not yet supported for async
       broadcast messages, so abort here. --Chao Mei */
    CmiAssert(0);
    return 0;
#else
    int i, rank;
    int mype = CmiMyPe();
#if ENABLE_CONVERSE_QD
    CQdCreate(CpvAccess(cQdState), CmiNumPes()-1);
#endif
    MACHSTATE1(3,"[%d] Sending async broadcast message from {",CmiMyNode());
    CMI_BROADCAST_ROOT(msg) = 0;
    void *handle = malloc(sizeof(int));
    *((int *)handle) = CmiNumPes()-1;

    for (i=mype+1; i<CmiNumPes(); i++) {
        CMI_DEST_RANK(msg) = CmiRankOf(i);
        lapiSendFn(CmiNodeOf(i), size, msg, DeliveredMsg, handle);
    }
    for (i=0; i<mype; i++) {
        CMI_DEST_RANK(msg) = CmiRankOf(i);
        lapiSendFn(CmiNodeOf(i), size, msg, DeliveredMsg, handle);
    }

    MACHSTATE(3,"} Sending async broadcast message end");
    return handle;
#endif
}
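
The malloc'd handle above is just a countdown of outstanding sends. Below is a minimal sketch of the completion side, assuming a DeliveredMsg-style callback that receives the handle; the names and the real LAPI callback signature are illustrative, not the actual machine-layer API.

/* Hedged sketch: completion counting for the async broadcast above. */
static void deliveredSketch(void *handle) {
    (*(int *)handle)--;                 /* one outstanding send finished */
}

static int asyncMsgSentSketch(void *handle) {
    return *(int *)handle == 0;         /* all CmiNumPes()-1 sends done */
}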
Example #2
void CldEnqueue(int pe, void *msg, int infofn)
{
  int len, queueing, priobits; unsigned int *prioptr;
  CldInfoFn ifn; CldPackFn pfn;
  peinfo *pinf = &(CpvAccess(peinf));
  ifn = (CldInfoFn)CmiHandlerToFunction(infofn);
  ifn(msg, &pfn, &len, &queueing, &priobits, &prioptr);
  if (pe != CLD_ANYWHERE) {
    if (pfn && (CmiNodeOf(pe) != CmiMyNode())) {
      pfn(&msg);
      ifn(msg, &pfn, &len, &queueing, &priobits, &prioptr);
    }
    CmiSetInfo(msg, infofn);
    CmiSetXHandler(msg, CmiGetHandler(msg));
    CmiSetHandler(msg, pinf->EnqueueHandler);
    if (pe==CLD_BROADCAST) CmiSyncBroadcastAndFree(len, msg);
    else if (pe==CLD_BROADCAST_ALL) CmiSyncBroadcastAllAndFree(len, msg);
    else CmiSyncSendAndFree(pe, len, msg);
  } else {
    CmiSetInfo(msg, infofn);
    CmiSetXHandler(msg, CmiGetHandler(msg));
    CmiSetHandler(msg, pinf->HopHandler);
    CsdEnqueueGeneral(msg, queueing, priobits, prioptr);
  }
}
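
The pfn branch above packs a message only when it is about to leave the node, which is exactly the distinction CmiNodeOf draws. A minimal sketch of that predicate, under the usual Converse PE-to-node mapping:

/* Off-node destinations need packing; node-local delivery can stay unpacked. */
static int needsPacking(int pe) {
    return CmiNodeOf(pe) != CmiMyNode();
}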
Example #3
void BGQTorusManager::populateLocalNodes() {
  if(CmiNumPartitions() == 1) return;

  CmiLock(bgq_lock);
  if(bgq_isLocalSet) {
    CmiUnlock(bgq_lock);
    return;
  }

  if(bgq_localNodes == NULL)
    bgq_localNodes = (int *)malloc(CmiNumNodesGlobal()*sizeof(int));

  CmiAssert(bgq_localNodes != NULL);

  for(int i = 0; i < CmiNumNodesGlobal(); i++)
    bgq_localNodes[i] = -1;

  for(int i = 0; i < CmiNumNodes(); i++) {
    int a, b, c, d, e, t;
    int global;

    rankToCoordinates(CmiNodeFirst(i), a, b, c, d, e, t);
    global = CmiNodeOf(coordinatesToRank(a, b, c, d, e, t));

    bgq_localNodes[global] = i;
  }

  bgq_isLocalSet = 1;

  CmiUnlock(bgq_lock);
}
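
Once the table is built, translating a global node id back to a partition-local one is a single lookup, with -1 marking nodes owned by other partitions. A minimal sketch reusing the globals above; globalToLocalNode is an illustrative name:

static int globalToLocalNode(int global) {
    CmiAssert(bgq_localNodes != NULL && global < CmiNumNodesGlobal());
    return bgq_localNodes[global];      /* -1 if outside this partition */
}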
Example #4
/** \function pidtonid
 *  finds the nid for each pid from 0 to numpes-1 and stores them in the
 *  pid2nid array; correspondingly also creates an array mapping nids
 *  back to pids
 */
void pidtonid(int numpes) {
  CmiLock(cray_lock);
  if (pid2nid != NULL) {
      CmiUnlock(cray_lock);
      return;          /* did once already */
  }

  getDimension(&maxNID,&maxX,&maxY,&maxZ);
  int numCores = CmiNumCores();
  
  pid2nid = (int *)malloc(sizeof(int) * numpes);

#if XT4_TOPOLOGY || XT5_TOPOLOGY || XE6_TOPOLOGY
  int i, nid, ret;
  CmiAssert(rca_coords == NULL);
  rca_coords = (rca_mesh_coord_t *)malloc(sizeof(rca_mesh_coord_t)*(maxNID+1));
  for (i=0; i<=maxNID; i++) {   /* initialize all maxNID+1 allocated entries */
    rca_coords[i].mesh_x = rca_coords[i].mesh_y = rca_coords[i].mesh_z = -1;
  }
  for (i=0; i<numpes; i++) {
    PMI_Get_nid(CmiGetNodeGlobal(CmiNodeOf(i),CmiMyPartition()), &nid);
    pid2nid[i] = nid;
    CmiAssert(nid <= maxNID);   /* rca_coords holds maxNID+1 entries */
    ret = rca_get_meshcoord(nid, &rca_coords[nid]);
    CmiAssert(ret != -1);
  }
#endif
  CmiUnlock(cray_lock);
}
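
With pid2nid and rca_coords filled in, a PE's physical mesh coordinates come from two array lookups. A minimal sketch, assuming pidtonid() has already run; peToMeshCoord is an illustrative name:

#if XT4_TOPOLOGY || XT5_TOPOLOGY || XE6_TOPOLOGY
/* Hedged sketch: pe -> nid -> mesh coordinates via the tables built above. */
static rca_mesh_coord_t peToMeshCoord(int pe) {
    int nid = pid2nid[pe];
    return rca_coords[nid];
}
#endif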
Example #5
/* send msg along the hypercube in broadcast. (Sameer) */
static void SendHyperCubeProc(int size, char *msg) {
    int startpe = CMI_BROADCAST_ROOT(msg)-1;
    int startnode = CmiNodeOf(startpe);
#if CMK_SMP
    if (startpe >= CmiNumPes()) startnode = startpe - CmiNumPes();
#endif
    SendHyperCube(size, msg, 0, startnode);
#if CMK_SMP
    /* second send msgs to my peers on this node */
    SendToPeers(size, msg);
#endif
}
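
SendHyperCube itself is not shown here; the classic recursion it is assumed to implement forwards the message across one address bit per step. A minimal sketch of that neighbor computation (standard hypercube routing, not code from this machine layer):

/* Peer across dimension d of a (root-relative) hypercube node number. */
static int hypercubePeer(int relnode, int dim) {
    return relnode ^ (1 << dim);
}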
Example #6
void CmiDirect_manytomany_initialize_send ( void        * h,
                                            unsigned      tag,
                                            unsigned      idx,
                                            unsigned      displ,
                                            unsigned      bytes,
                                            unsigned      rank )
{
  BGPCmiDirectM2mHandle *handle = (BGPCmiDirectM2mHandle *) h;

  assert ( tag < MAX_CONN  );
  
  handle->m2m_sndlens    [tag][idx]   = bytes;
  handle->m2m_sdispls    [tag][idx]   = displ;
  handle->m2m_ranks      [tag][idx]   = CmiGetNodeGlobal(CmiNodeOf(rank),CmiMyPartition());
  handle->m2m_permutation[tag][idx]   = (idx+1)%handle->m2m_nsndranks[tag];
}
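
The permutation entry written last turns the destination list into a ring: following (idx+1) % nsndranks from any starting slot visits every destination exactly once, so participants begin their sends at different offsets instead of all hitting slot 0 first. A hedged sketch of how such a table might be consumed; walkSendOrder is an illustrative name, not part of the CmiDirect API:

static void walkSendOrder(BGPCmiDirectM2mHandle *h, unsigned tag,
                          unsigned start) {
    unsigned idx = start;
    do {
        /* issue the send for slot idx here */
        idx = h->m2m_permutation[tag][idx];   /* ring built above */
    } while (idx != start);
}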
Example #7
void CldHopHandler(char *msg)
{
  peinfo *pinf = &(CpvAccess(peinf));
  int len, queueing, priobits; unsigned int *prioptr;
  CldInfoFn ifn; CldPackFn pfn; int pe;

  if (pinf->rebalance) {
    /* do pe = ((lrand48()&0x7FFFFFFF)%CmiNumPes()); */
    do pe = ((CrnRand()&0x7FFFFFFF)%CmiNumPes());
    while (pe == pinf->mype);
    ifn = (CldInfoFn)CmiHandlerToFunction(CmiGetInfo(msg));
    ifn(msg, &pfn, &len, &queueing, &priobits, &prioptr);
    if (pfn && CmiNodeOf(pe) != CmiMyNode()) {
      pfn(&msg);
      ifn(msg, &pfn, &len, &queueing, &priobits, &prioptr);
    }
    CmiSyncSendAndFree(pe, len, msg);
    pinf->rebalance--;
  } else {
    CmiSetHandler(msg, CmiGetXHandler(msg));
    CmiHandleMessage(msg);
  }
}
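
The rebalance branch rerolls the destination until it lands away from the local PE. That selection is worth isolating; a minimal sketch, grounded directly in the loop above:

/* Pick a uniformly random PE other than mype (assumes CmiNumPes() > 1). */
static int randomOtherPe(int mype) {
    int pe;
    do pe = (CrnRand() & 0x7FFFFFFF) % CmiNumPes();
    while (pe == mype);
    return pe;
}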
Example #8
void CmiInitCPUAffinity(char **argv)
{
  static skt_ip_t myip;
  int ret, i, exclude;
  hostnameMsg  *msg;
  char *pemap = NULL;
  char *commap = NULL;
  char *pemapfile = NULL;
 
  int show_affinity_flag;
  int affinity_flag = CmiGetArgFlagDesc(argv,"+setcpuaffinity",
						"set cpu affinity");

  while (CmiGetArgIntDesc(argv,"+excludecore", &exclude, "avoid core when setting cpuaffinity"))  {
    if (CmiMyRank() == 0) add_exclude(exclude);
    affinity_flag = 1;
  }

  if (CmiGetArgStringDesc(argv, "+pemapfile", &pemapfile, "define pe to core mapping file")) {
    FILE *fp;
    char buf[128];
    pemap = (char*)malloc(1024);
    pemap[0] = '\0';              /* strcat below needs an initialized buffer */
    fp = fopen(pemapfile, "r");
    if (fp == NULL) CmiAbort("pemapfile does not exist");
    while (!feof(fp)) {
      if (fgets(buf, 128, fp)) {
        size_t len = strlen(buf);
        if (len > 0 && buf[len-1] == '\n') buf[len-1] = 0;
        strcat(pemap, buf);
      }
    }
    fclose(fp);
    if (CmiMyPe()==0) CmiPrintf("Charm++> read from pemap file '%s': %s\n", pemapfile, pemap);
  }

  CmiGetArgStringDesc(argv, "+pemap", &pemap, "define pe to core mapping");
  if (pemap!=NULL && excludecount>0)
    CmiAbort("Charm++> +pemap can not be used with +excludecore.\n");

  CmiGetArgStringDesc(argv, "+commap", &commap, "define comm threads to core mapping");

  if (pemap!=NULL || commap!=NULL) affinity_flag = 1;

  show_affinity_flag = CmiGetArgFlagDesc(argv,"+showcpuaffinity",
						"print cpu affinity");

  cpuAffinityHandlerIdx =
       CmiRegisterHandler((CmiHandler)cpuAffinityHandler);
  cpuAffinityRecvHandlerIdx =
       CmiRegisterHandler((CmiHandler)cpuAffinityRecvHandler);

  if (CmiMyRank() ==0) {
     affLock = CmiCreateLock();
  }

#if CMK_BLUEGENEP || CMK_BLUEGENEQ
  if(affinity_flag){
      affinity_flag = 0;
      if(CmiMyPe()==0) CmiPrintf("Charm++> cpu affinity setting is not needed on Blue Gene, thus ignored.\n");
  }
  if(show_affinity_flag){
      show_affinity_flag = 0;
      if(CmiMyPe()==0) CmiPrintf("Charm++> printing cpu affinity is not supported on Blue Gene.\n");
  }
#endif

  if (!affinity_flag) {
    if (show_affinity_flag) CmiPrintCPUAffinity();
    return;
  }

  if (CmiMyPe() == 0) {
     CmiPrintf("Charm++> cpu affinity enabled. \n");
     if (excludecount > 0) {
       CmiPrintf("Charm++> cpuaffinity excludes core: %d", excludecore[0]);
       for (i=1; i<excludecount; i++) CmiPrintf(" %d", excludecore[i]);
       CmiPrintf(".\n");
     }
     if (pemap!=NULL)
       CmiPrintf("Charm++> cpuaffinity PE-core map : %s\n", pemap);
  }

  if (CmiMyPe() >= CmiNumPes()) {         /* this is comm thread */
      /* the comm thread can either float or be pinned down to the last rank;
         however, floating has been reported to be slower */
    CmiNodeAllBarrier();
    if (commap != NULL) {
      int mycore = search_pemap(commap, CmiMyPeGlobal()-CmiNumPesGlobal());
      if(CmiMyPe()-CmiNumPes()==0) printf("Charm++> set comm %d on node %d to core #%d\n", CmiMyPe()-CmiNumPes(), CmiMyNode(), mycore); 
      if (-1 == CmiSetCPUAffinity(mycore))
        CmiAbort("set_cpu_affinity abort!");
      CmiNodeAllBarrier();
      if (show_affinity_flag) CmiPrintCPUAffinity();
      return;    /* comm thread return */
    }
    else {
    /* if (CmiSetCPUAffinity(CmiNumCores()-1) == -1) CmiAbort("set_cpu_affinity abort!"); */
#if !CMK_CRAYXT && !CMK_CRAYXE && !CMK_CRAYXC && !CMK_BLUEGENEQ
      if (pemap == NULL) {
#if CMK_MACHINE_PROGRESS_DEFINED
        while (affinity_doneflag < CmiMyNodeSize())  CmiNetworkProgress();
#else
#if CMK_SMP
       #error "Machine progress call needs to be implemented for cpu affinity!"
#endif
#endif
      }
#endif
#if CMK_CRAYXT || CMK_CRAYXE || CMK_CRAYXC
      /* if both pemap and commap are NULL, one will be computed */
      if (pemap != NULL)      
#endif
      {
      CmiNodeAllBarrier();
      if (show_affinity_flag) CmiPrintCPUAffinity();
      return;    /* comm thread return */
      }
    }
  }

  if (pemap != NULL && CmiMyPe()<CmiNumPes()) {    /* work thread */
    int mycore = search_pemap(pemap, CmiMyPeGlobal());
    if(show_affinity_flag) CmiPrintf("Charm++> set PE %d on node %d to core #%d\n", CmiMyPe(), CmiMyNode(), mycore); 
    if (mycore >= CmiNumCores()) {
      CmiPrintf("Error> Invalid core number %d, only have %d cores (0-%d) on the node. \n", mycore, CmiNumCores(), CmiNumCores()-1);
      CmiAbort("Invalid core number");
    }
    if (CmiSetCPUAffinity(mycore) == -1) CmiAbort("set_cpu_affinity abort!");
    CmiNodeAllBarrier();
    CmiNodeAllBarrier();
    /* if (show_affinity_flag) CmiPrintCPUAffinity(); */
    return;
  }

#if CMK_CRAYXT || CMK_CRAYXE || CMK_CRAYXC
  {
    int numCores = CmiNumCores();

    int myid = getXTNodeID(CmiMyNodeGlobal(), CmiNumNodesGlobal());
    int myrank;
    int pe, mype = CmiMyPeGlobal();
    int node = CmiMyNodeGlobal();
    int nnodes = 0;
#if CMK_SMP
    if (CmiMyPe() >= CmiNumPes()) {         /* this is comm thread */
      int node = CmiMyPe() - CmiNumPes();
      mype = CmiGetPeGlobal(CmiNodeFirst(node) + CmiMyNodeSize() - 1, CmiMyPartition()); /* last pe on SMP node */
      node = CmiGetNodeGlobal(node, CmiMyPartition());
    }
#endif
    pe = mype - 1;
    while (pe >= 0) {
      int n = CmiNodeOf(pe);
      if (n != node) { nnodes++; node = n; }
      if (getXTNodeID(n, CmiNumNodesGlobal()) != myid) break;
      pe --;
    }
    CmiAssert(numCores > 0);
    myrank = (mype - pe - 1 + nnodes)%numCores;
#if CMK_SMP
    if (CmiMyPe() >= CmiNumPes()) 
        myrank = (myrank + 1)%numCores;
#endif

    if (-1 != CmiSetCPUAffinity(myrank)) {
      DEBUGP(("Processor %d is bound to core #%d on node #%d\n", CmiMyPe(), myrank, mynode));
    }
    else{
      CmiPrintf("Processor %d set affinity failed!\n", CmiMyPe());
      CmiAbort("set cpu affinity abort!\n");
    }
  }
  if (CmiMyPe() < CmiNumPes())
    CmiNodeAllBarrier();
  CmiNodeAllBarrier();
#else
    /* get my ip address */
  if (CmiMyRank() == 0)
  {
#if CMK_HAS_GETHOSTNAME
    myip = skt_my_ip();        /* not thread safe, so only calls on rank 0 */
#else
    CmiAbort("Can not get unique name for the compute nodes. \n");
#endif
  }
  CmiNodeAllBarrier();

    /* prepare a msg to send */
  msg = (hostnameMsg *)CmiAlloc(sizeof(hostnameMsg));
  CmiSetHandler((char *)msg, cpuAffinityHandlerIdx);
  msg->pe = CmiMyPe();
  msg->ip = myip;
  msg->ncores = CmiNumCores();
  DEBUGP(("PE %d's node has %d number of cores. \n", CmiMyPe(), msg->ncores));
  msg->rank = 0;
  CmiSyncSendAndFree(0, sizeof(hostnameMsg), (void *)msg);

  if (CmiMyPe() == 0) {
    int i;
    hostTable = CmmNew();
    rankmsg = (rankMsg *)CmiAlloc(sizeof(rankMsg)+CmiNumPes()*sizeof(int)*2);
    CmiSetHandler((char *)rankmsg, cpuAffinityRecvHandlerIdx);
    rankmsg->ranks = (int *)((char*)rankmsg + sizeof(rankMsg));
    rankmsg->nodes = (int *)((char*)rankmsg + sizeof(rankMsg) + CmiNumPes()*sizeof(int));
    for (i=0; i<CmiNumPes(); i++) {
      rankmsg->ranks[i] = 0;
      rankmsg->nodes[i] = -1;
    }

    for (i=0; i<CmiNumPes(); i++) CmiDeliverSpecificMsg(cpuAffinityHandlerIdx);
  }

    /* receive broadcast from PE 0 */
  CmiDeliverSpecificMsg(cpuAffinityRecvHandlerIdx);
  CmiLock(affLock);
  affinity_doneflag++;
  CmiUnlock(affLock);
  CmiNodeAllBarrier();
#endif

  if (show_affinity_flag) CmiPrintCPUAffinity();
}
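
The flags parsed above combine on the command line roughly as follows; this is a hedged usage sketch, and exact charmrun syntax may vary by build:

/* Hedged usage sketch:
 *   ./charmrun +p8 ./pgm +setcpuaffinity +pemap 0-7
 *   ./charmrun +p8 ./pgm +setcpuaffinity +pemap 0-7 +commap 7
 * +excludecore avoids the listed cores and cannot be combined with +pemap.
 */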
Example #9
void TorusLB::strategy() {
  int index;
  // compute the average load by (compute load + background load) / numPesAvailable
  computeAverage();
  // two heaps of self and pair computes
  makeTwoHeaps();

  const int beginGroup = processors[0].Id;
  const int endGroup = beginGroup + P;
#define INGROUP(PROC) ((PROC) >= beginGroup && (PROC) < endGroup)

  computeInfo *c;
  processorInfo *p, *minp;
  Iterator nextP;
  overLoad = 1.2;

  for(int I=0; I<numComputes; I++) {

  c = (computeInfo *) computePairHeap->deleteMax();
  if ( ! c ) c = (computeInfo *) computeSelfHeap->deleteMax();
  if(!c) CkAbort("TorusLB: Compute Heap empty!\n");   // check before dereferencing

  if(c->processor != -1) continue; // go to the next compute

  for(int j=0; j<6; j++) {
    bestPe[j] = 0;
    goodPe[j] = 0;
    badPe[j] = 0;
  }

  // Look at pes which have the compute's patches

  // HYBRID check if processor is in local group
#define SELECT_REALPE(X) if INGROUP((X)) { \
  selectPes(&processors[(X) - beginGroup], c); \
  }

  const int realPe1 = patches[c->patch1].processor;
  SELECT_REALPE(realPe1)

  const int realPe2 = patches[c->patch2].processor;
  if ( realPe2 != realPe1 ) {
    SELECT_REALPE(realPe2)
  }

  // Try the processors which have the patches' proxies
  p = (processorInfo *)(patches[c->patch1].proxiesOn.iterator((Iterator *)&nextP));
  while(p) {						// patch 1
    if INGROUP(p->Id) selectPes(p, c);
    p = (processorInfo *)(patches[c->patch1].proxiesOn.next((Iterator *)&nextP));
  } 

  p = (processorInfo *)(patches[c->patch2].proxiesOn.iterator((Iterator *)&nextP));
  while(p) {						// patch 2
    if INGROUP(p->Id) selectPes(p, c);
    p = (processorInfo *)(patches[c->patch2].proxiesOn.next((Iterator *)&nextP));
  }

  // see if we have found a processor to place the compute on
  p = 0;
  if((p = bestPe[5])
#if USE_TOPOMAP
  || (p = goodPe[5])
#endif
  || (p = bestPe[4])
#if USE_TOPOMAP
  || (p = goodPe[4])
#endif
  || (p = bestPe[3])
#if USE_TOPOMAP
  || (p = goodPe[3])
#endif
  || (p = bestPe[1])
#if USE_TOPOMAP
  || (p = goodPe[1])
#endif
  || (p = bestPe[2])
#if USE_TOPOMAP
  || (p = goodPe[2])
#endif
  || (p = bestPe[0])
#if USE_TOPOMAP
  || (p = goodPe[0])
#endif
  ) {
    assign(c, p);
    continue;
  }

    // Try all pes on the nodes of the home patches
    if ( CmiNumNodes() > 1 ) {  // else not useful
      double minLoad = overLoad * averageLoad;
      minp = 0;
      int realNode1 = CmiNodeOf(realPe1);
      int nodeSize = CmiNodeSize(realNode1);
      if ( nodeSize > 1 ) {  // else did it already
        int firstpe = CmiNodeFirst(realNode1);
        for ( int rpe = firstpe; rpe < firstpe+nodeSize; ++rpe ) {
          if INGROUP(rpe) {
            p = &processors[rpe - beginGroup];
            if ( p->available && ( p->load + c->load < minLoad ) ) {
              minLoad = p->load + c->load;
              minp = p;
            }
          }
        }
      }
      if ( realPe2 != realPe1 ) {
        int realNode2 = CmiNodeOf(realPe2);
        if ( realNode2 != realNode1 ) {  // else did it already
          nodeSize = CmiNodeSize(realNode2);
          if ( nodeSize > 1 ) {
            int firstpe = CmiNodeFirst(realNode2);
            for ( int rpe = firstpe; rpe < firstpe+nodeSize; ++rpe ) {
              if INGROUP(rpe) {
                p = &processors[rpe - beginGroup];
                if ( p->available && ( p->load + c->load < minLoad ) ) {
                  minLoad = p->load + c->load;
                  minp = p;
                }
              }
            }
          }