/*
 * Platform-specific initialization.
 *
 * Loads the node personality into `mybgq`, prepares whichever timer backend
 * the TIMER macro selects, and records this process's location in
 * `torus_coord`: entries 0..4 receive the A, B, C, D and E network
 * coordinates from the personality, entry 5 receives Kernel_ProcessorID().
 */
void vt_pform_init() {
  int slot = 0;

  Kernel_GetPersonality(&mybgq, sizeof(Personality_t));

#if TIMER == TIMER_GET_TIMEBASE
  /* Scale the personality's FreqMHz value by 10^6 to get ticks per second. */
  vt_ticks_per_sec = 1000000LL * (uint64_t)mybgq.Kernel_Config.FreqMHz;
#elif TIMER == TIMER_PAPI_REAL_USEC
  /* Remember the current reading as the time base. */
  vt_time_base = vt_metric_real_usec();
#endif

  /* Network coordinates A..E, in that order. */
  torus_coord[slot++] = mybgq.Network_Config.Acoord;
  torus_coord[slot++] = mybgq.Network_Config.Bcoord;
  torus_coord[slot++] = mybgq.Network_Config.Ccoord;
  torus_coord[slot++] = mybgq.Network_Config.Dcoord;
  torus_coord[slot++] = mybgq.Network_Config.Ecoord;

  /* Last entry: the processor ID reported by the kernel. */
  torus_coord[slot] = Kernel_ProcessorID();
}
/* Example #2 */
/* Compute a node number from the torus coordinates in the personality.
 *
 * The A-E coordinates are linearized with A as the most significant
 * dimension:
 *   ((((A * Bnodes + B) * Cnodes + C) * Dnodes + D) * Enodes + E)
 * so every distinct (A,B,C,D,E) tuple yields a distinct value.
 *
 * When the `debug` stream is set, the torus coordinates and the
 * processor/core/hardware-thread IDs of the caller are written to it.
 *
 * Returns the linearized node number.
 */
static int bgq_nodenum(void)
{
    int           hostnum;
    Personality_t personality;
    Kernel_GetPersonality(&personality, sizeof(personality));
    /* Each MPI rank has a unique coordinate in a 6-dimensional space
       (A,B,C,D,E,T), with dimensions A-E corresponding to different
       physical nodes, and T within each node. Each node has sixteen
       physical cores, each of which can have up to four hardware
       threads, so 0 <= T <= 63 (but the maximum value of T depends on
       the configuration of ranks and OpenMP threads per
       node). However, T is irrelevant for computing a suitable return
       value for gmx_hostname_num().
     */
    hostnum  = personality.Network_Config.Acoord;
    hostnum *= personality.Network_Config.Bnodes;
    hostnum += personality.Network_Config.Bcoord;
    hostnum *= personality.Network_Config.Cnodes;
    hostnum += personality.Network_Config.Ccoord;
    hostnum *= personality.Network_Config.Dnodes;
    hostnum += personality.Network_Config.Dcoord;
    hostnum *= personality.Network_Config.Enodes;
    hostnum += personality.Network_Config.Ecoord;

    if (debug)
    {
        /* The argument pairs must follow the label order in the format
           string: T (processor ID, out of 64), then core (out of 16),
           then hardware thread (out of 4). */
        std::fprintf(debug,
                     "Torus ID A: %d / %d B: %d / %d C: %d / %d D: %d / %d E: %d / %d\n"
                     "Node ID T: %d / %d core: %d / %d hardware thread: %d / %d\n",
                     personality.Network_Config.Acoord,
                     personality.Network_Config.Anodes,
                     personality.Network_Config.Bcoord,
                     personality.Network_Config.Bnodes,
                     personality.Network_Config.Ccoord,
                     personality.Network_Config.Cnodes,
                     personality.Network_Config.Dcoord,
                     personality.Network_Config.Dnodes,
                     personality.Network_Config.Ecoord,
                     personality.Network_Config.Enodes,
                     Kernel_ProcessorID(),
                     64,
                     Kernel_ProcessorCoreID(),
                     16,
                     Kernel_ProcessorThreadID(),
                     4);
    }
    return hostnum;
}
/*
 * Per-process setup for the remote-get pacing test.
 *
 * Steps, in order:
 *  - Only on the process whose Kernel_ProcessorID() is 0: allocate,
 *    initialize and activate the remote-get injection fifo (subgroup 0,
 *    fifo 0), then allocate base address table entry 0 and set it to zero.
 *  - Allocate, initialize and enable one reception fifo in the subgroup
 *    numbered by Kernel_ProcessorID(), and derive _globalRecFifoId from it.
 *  - Allocate NUM_BUFS send/receive buffer pairs (pair i is (1<<i)*1024
 *    bytes), fill each send buffer with an incrementing byte pattern, and
 *    record the physical address of every buffer.
 *  - Copy the node's A-E coordinates from the personality into myCoords.
 *  - Build the remote-get, direct-put (data) and memory-fifo (completion)
 *    descriptor models.
 *  - Clear _requestStatus and sleep so process 0 has time to finish its
 *    fifo allocation.
 *
 * Every call that reports a status is asserted to return zero. This
 * includes Kernel_CreateMemoryRegion(): a failed region creation must not
 * silently feed uninitialized BaseVa/BasePa values into the address
 * arithmetic that follows it.
 */
static void init()
{
  int rc = 0;
  uint32_t fifoid=0;
  uint32_t subgroup, group;
  int i;

  /* If we are the 1st process, set up the rget inj fifo */
  if ( Kernel_ProcessorID() == 0 )
    {
      /* Set up an rget injection fifo to be used by all processes on this node.
       * It is at a well-known location...subgroup 0, fifo 0.
       * - Allocate storage for an injection fifo
       * - Allocate and initialize that injection fifo.
       * - Activate that injection fifo.
       */
      rc = posix_memalign( (void**)&_ififoPtr, 64, _injFifoSize );
      assert ( rc == 0 );

      /* Set user fifo attribute. */
      Kernel_InjFifoAttributes_t injFifoAttrs[1];
      injFifoAttrs[0].RemoteGet = 1;
      injFifoAttrs[0].System    = 0;
      injFifoAttrs[0].Priority  = 0;

      subgroup = 0;

      rc = Kernel_AllocateInjFifos (subgroup,
                                    &_ififo_subgroup,
                                    1,
                                    &fifoid,
                                    injFifoAttrs);
      assert ( rc == 0 );

      Kernel_MemoryRegion_t  injRegion;
      rc = Kernel_CreateMemoryRegion ( &injRegion,
                                       _ififoPtr,
                                       _injFifoSize );
      assert ( rc == 0 );

      /* The fifo start is passed as an offset from the region's base. */
      rc = Kernel_InjFifoInit( &_ififo_subgroup,
                               fifoid,
                               &injRegion,
                               (uint64_t)_ififoPtr -
                               (uint64_t)injRegion.BaseVa,
                               _injFifoSize-1 );
      assert ( rc == 0 );

      rc = Kernel_InjFifoActivate ( &_ififo_subgroup,
                                    1,
                                    &fifoid,
                                    KERNEL_INJ_FIFO_ACTIVATE );
      assert ( rc == 0 );

      /* Allocate a Base Address Table Entry for all processes on the node to use,
       * and set its value to zero.
       */
      uint32_t batId = 0;
      rc = Kernel_AllocateBaseAddressTable( 0, /* subgroup */
                                            &_batSubgroup,
                                            1,
                                            &batId,
                                            0 /* "User" access */);
      assert ( rc == 0 );

      MUHWI_BaseAddress_t baseAddress;
      baseAddress = 0;

      rc = MUSPI_SetBaseAddress ( &_batSubgroup,
                                  batId,
                                  baseAddress );
      assert ( rc == 0 );
    }

  /* Set up a reception fifo to receive packets.
   * - Allocate storage for a reception fifo
   * - Use the subgroup equal to our HW thread ID.
   * - Allocate and initialize that reception fifo.
   * - Enable that reception fifo.
   */
  rc = posix_memalign( (void**)&_rfifoPtr, 32, _recFifoSize );
  assert ( rc == 0 );

  Kernel_RecFifoAttributes_t recFifoAttrs[1];
  recFifoAttrs[0].System = 0;

  subgroup = Kernel_ProcessorID();
  group    = Kernel_ProcessorCoreID();

  rc = Kernel_AllocateRecFifos (subgroup,
                                &_rfifo_subgroup,
                                1,
                                &fifoid,
                                recFifoAttrs);
  assert ( rc == 0 );

  _rfifoShadowPtr = &_rfifo_subgroup._recfifos[fifoid];

  uint64_t recFifoEnableBits;

  Kernel_MemoryRegion_t  recRegion;
  rc = Kernel_CreateMemoryRegion ( &recRegion,
                                   _rfifoPtr,
                                   _recFifoSize );
  assert ( rc == 0 );

  rc = Kernel_RecFifoInit( &_rfifo_subgroup,
                           fifoid,
                           &recRegion,
                           (uint64_t)_rfifoPtr -
                           (uint64_t)recRegion.BaseVa,
                           _recFifoSize-1 );
  assert ( rc == 0 );

  /* One enable bit per fifo, counted down from bit 15; the fifo's position
   * is (hardware thread ID * fifos-per-subgroup) + fifoid.
   */
  recFifoEnableBits = ( 0x0000000000000001ULL <<
                        ( 15 -
                          ( ( (Kernel_ProcessorThreadID())*BGQ_MU_NUM_REC_FIFOS_PER_SUBGROUP) +
                            fifoid ) ) );

  rc = Kernel_RecFifoEnable ( group,
                              recFifoEnableBits );
  assert ( rc == 0 );

  _globalRecFifoId = subgroup * BGQ_MU_NUM_REC_FIFOS_PER_SUBGROUP;

  /* Allocate NUM_BUFS send and recv buffers */
  for (i=0; i<NUM_BUFS; i++)
    {
      int size = (1<<i)*1024;

      rc = posix_memalign( (void**)&_sBuff[i], 8, size );
      assert ( rc == 0 );

      /* Init the send buffer: byte j holds (i + j) modulo 256 */
      int j;
      unsigned char *fill = _sBuff[i];
      for (j=0; j<size; j++)
        {
          fill[j] = (unsigned char)(i + j);
        }

      /* Translate the send buffer's virtual address to a physical one */
      Kernel_MemoryRegion_t  bufRegion;
      rc = Kernel_CreateMemoryRegion ( &bufRegion,
                                       _sBuff[i],
                                       size );
      assert ( rc == 0 );

      _sBuffPA[i] =
        (uint64_t)_sBuff[i] -
        (uint64_t)bufRegion.BaseVa +
        (uint64_t)bufRegion.BasePa;

      rc = posix_memalign( (void**)&_rBuff[i], 8, size );
      assert ( rc == 0 );

      /* Same translation for the receive buffer */
      rc = Kernel_CreateMemoryRegion ( &bufRegion,
                                       _rBuff[i],
                                       size );
      assert ( rc == 0 );

      _rBuffPA[i] =
        (uint64_t)_rBuff[i] -
        (uint64_t)bufRegion.BaseVa +
        (uint64_t)bufRegion.BasePa;
    }

  /* Obtain our node coordinates */
  Personality_t personality;
  Kernel_GetPersonality(&personality, sizeof(personality));
  myCoords.Destination.A_Destination = personality.Network_Config.Acoord;
  myCoords.Destination.B_Destination = personality.Network_Config.Bcoord;
  myCoords.Destination.C_Destination = personality.Network_Config.Ccoord;
  myCoords.Destination.D_Destination = personality.Network_Config.Dcoord;
  myCoords.Destination.E_Destination = personality.Network_Config.Ecoord;

  /* Build the remote get descriptor model */
  {
    MUSPI_Pt2PtRemoteGetDescriptorInfo_t info;
    memset(&info, 0x00, sizeof(info));
    info.Base.Pre_Fetch_Only  = MUHWI_DESCRIPTOR_PRE_FETCH_ONLY_NO;
    info.Base.Payload_Address = 0; /* To be set by the agent */
    info.Base.Message_Length  = sizeof(MUHWI_Descriptor_t);
    info.Base.Torus_FIFO_Map  = MUHWI_DESCRIPTOR_TORUS_FIFO_MAP_PRIORITY;
    info.Base.Dest.Destination.Destination = myCoords.Destination.Destination;
    info.Pt2Pt.Hints_ABCD     = 0;
    info.Pt2Pt.Misc1          = MUHWI_PACKET_USE_DETERMINISTIC_ROUTING;
    info.Pt2Pt.Misc2          = MUHWI_PACKET_VIRTUAL_CHANNEL_HIGH_PRIORITY;
    info.Pt2Pt.Skip           = 0;
    info.RemoteGet.Type             = MUHWI_PACKET_TYPE_GET;
    info.RemoteGet.Rget_Inj_FIFO_Id = 0;
    rc = MUSPI_CreatePt2PtRemoteGetDescriptor( &_rgetDesc,
                                               &info
                                             );
    assert ( rc == 0 );
  }

  /* Build the data descriptor model */
  {
    MUSPI_Pt2PtDirectPutDescriptorInfo_t info;
    memset(&info, 0x00, sizeof(info));
    info.Base.Pre_Fetch_Only  = MUHWI_DESCRIPTOR_PRE_FETCH_ONLY_NO;
    info.Base.Payload_Address = 0; /* To be set at runtime */
    info.Base.Message_Length  = 0; /* To be set at runtime */
    info.Base.Torus_FIFO_Map  = MUHWI_DESCRIPTOR_TORUS_FIFO_MAP_LOCAL0;
    info.Base.Dest.Destination.Destination = myCoords.Destination.Destination;
    info.Pt2Pt.Hints_ABCD     = 0;
    info.Pt2Pt.Misc1          = MUHWI_PACKET_USE_DETERMINISTIC_ROUTING;
    info.Pt2Pt.Misc2          = MUHWI_PACKET_VIRTUAL_CHANNEL_DETERMINISTIC;
    info.Pt2Pt.Skip           = 0;
    info.DirectPut.Rec_Payload_Base_Address_Id = 0;
    info.DirectPut.Rec_Payload_Offset          = 0; /* To be set at runtime */
    info.DirectPut.Rec_Counter_Base_Address_Id = 0;
    info.DirectPut.Rec_Counter_Offset          = 0; /* Not used...agent uses its own */
    info.DirectPut.Pacing                      = MUHWI_PACKET_DIRECT_PUT_IS_NOT_PACED;
    rc = MUSPI_CreatePt2PtDirectPutDescriptor( &_dataDesc,
                                               &info
                                             );
    assert ( rc == 0 );
  }

  /* Build the completion descriptor model */
  {
    MUSPI_Pt2PtMemoryFIFODescriptorInfo_t info;
    memset(&info, 0x00, sizeof(info));
    info.Base.Pre_Fetch_Only  = MUHWI_DESCRIPTOR_PRE_FETCH_ONLY_NO;
    info.Base.Payload_Address = 0;
    info.Base.Message_Length  = 0;
    info.Base.Torus_FIFO_Map  = MUHWI_DESCRIPTOR_TORUS_FIFO_MAP_LOCAL0;
    info.Base.Dest.Destination.Destination = myCoords.Destination.Destination;
    info.Pt2Pt.Hints_ABCD     = 0;
    info.Pt2Pt.Misc1          = MUHWI_PACKET_USE_DETERMINISTIC_ROUTING;
    info.Pt2Pt.Misc2          = MUHWI_PACKET_VIRTUAL_CHANNEL_DETERMINISTIC;
    info.Pt2Pt.Skip           = 0;
    info.MemFIFO.Rec_FIFO_Id  = _globalRecFifoId;
    info.MemFIFO.Rec_Put_Offset = 0; /* Will contain the message number at runtime */
    rc = MUSPI_CreatePt2PtMemoryFIFODescriptor( &_completionDesc,
                                                &info
                                              );
    assert ( rc == 0 );
  }

  /* Initialize request data structures */
  memset(_requestStatus,0x00,sizeof(_requestStatus));

  /* Wait to ensure this fifo has been allocated on process 0 before proceeding. */
  sleep(10);

}
/*
 * Test driver for the comm agent's remote-get pacing path.
 *
 * Each process staggers its call to init() by its processor ID, then
 * initializes the comm agent, remote-get pacing and fence interfaces.
 * For mode 0 and then mode 1 it repeatedly submits, for every buffer
 * size, a direct-put request followed by a memory-fifo completion request,
 * polling the reception fifo between passes until _iter reaches NUM_ITER
 * and every outstanding request has completed.
 *
 * Every agent call, including both submit calls, is asserted to return 0.
 *
 * NOTE(review): rc is declared volatile. The disabled legacy code at the
 * end of this function mentions a compiler optimization problem where the
 * returned value did not appear to be set; that may be the reason, so
 * confirm before removing the qualifier.
 *
 * Returns 0.
 */
int main()
{
  volatile int rc;
  int i;
  uint64_t uniqueID=0;
  CommAgent_Control_t control;
  memset(&control, 0x00, sizeof(control));
  CommAgent_WorkRequest_t * workPtr=NULL;

  setbuf(stdout,NULL); setbuf(stderr,NULL);

  uint32_t tid = Kernel_ProcessorID(); // 0-63.

  if (tid == 0) fprintf(stderr,"%s() [%s:%d]: Hello from app test!\n",__FUNCTION__,__FILE__,__LINE__);

  /* Initialize one at a time to ensure integrity of MU registers that span subgroups */

  sleep(tid);

  init();

  sleep(64-tid);

  /* Pass stderr like every other TRACE call in this function does */
  TRACE((stderr,"Done Initializing tid %u\n",tid));

  rc = CommAgent_Init ( &control );

  TRACE((stderr,"%s() [%s:%d]: CommAgent_Init() returned errno %d\n",__FUNCTION__,__FILE__,__LINE__,rc));

  assert( rc == 0 );

  rc = CommAgent_RemoteGetPacing_Init ( control,
                                        (CommAgent_RemoteGetPacing_SharedMemoryInfo_t *)NULL );

  TRACE((stderr,"%s() [%s:%d]: RemoteGetPacing_Init() returned errno %d\n",__FUNCTION__,__FILE__,__LINE__,rc));

  assert( rc == 0 );

  rc = CommAgent_Fence_Init ( control );

  TRACE((stderr,"%s() [%s:%d]: Fence_Init() returned errno %d\n",__FUNCTION__,__FILE__,__LINE__,rc));

  assert( rc == 0 );

  /* Loop through the set of remote get requests, sending each request to the agent.
   * Each request contains a direct put data descriptor and a memfifo completion descriptor.
   * Each remote get request has a corresponding completion indicator.
   * When a particular remote get request is submitted to the agent, the completion indicator
   * is set to 1, indicating that the request is active.
   * Each pass through the loop, we poll the reception fifo for completion packets.
   * The dispatch routine clears the completion indicator, so on the next pass,
   * the particular remote get request is submitted again to the agent.
   * This continues until the rget request that is for the largest size has been
   * processed NUM_ITER times.
   */

  /* Mode 0 is when the completion descriptor travels to the remote node and back.
   * Mode 1 is when the completion descriptor is a local transfer.
   */
  int mode;
  for ( mode=0; mode<=1; mode++ )
    {
      fprintf(stderr,"Starting test for mode %d\n",mode);

      _iter = 0;
      while ( _iter < NUM_ITER )
        {
          for ( i=0; i<NUM_BUFS; i++ )
            {
              int size = (1<<i)*1024;
              /* 1. Find the next request to submit */
              if ( _requestStatus[i] == 0 )
                {
                  /* Init the recv buffer to all FFs */
                  memset(_rBuff[i],0xFF,size);

                  /*    - mark the request as "active" */
                  _requestStatus[i] = 1;

                  /* 2. Allocate a slot in the agent's queue for the DPut */
                  do
                    {
                      rc = CommAgent_AllocateWorkRequest( control,
                                                          &workPtr,
                                                          &uniqueID );

                      TRACE((stderr,"%s() [%s:%d]: AllocateWorkRequest for request %d returned errno %d, workPtr=%p, uniqueID=%lu\n",__FUNCTION__,__FILE__,__LINE__,i,rc,workPtr,uniqueID));
                    } while (rc==EAGAIN);
                  assert(rc==0);

                  /* Remember the unique ID */
                  uint64_t dputUniqueID = uniqueID;

                  /* 3. Set up the DPut request in the agent's queue */
                  memcpy ( &workPtr->request.rget.rgetDescriptor,
                           &_rgetDesc,
                           sizeof(_rgetDesc)
                           );
                  memcpy ( &workPtr->request.rget.payloadDescriptor,
                           &_dataDesc,
                           sizeof(_dataDesc)
                           );
                  MUSPI_SetPayload ( & workPtr->request.rget.payloadDescriptor,
                                     _sBuffPA[i],
                                     size );
                  MUSPI_SetRecPutOffset ( &workPtr->request.rget.payloadDescriptor,
                                          _rBuffPA[i] );
                  workPtr->request.rget.globalInjFifo = i; /* Make this unique for each DPut request */

                  /* Set the peer ID to match that of the memfifo request so these requests
                   * are done sequentially */
                  if ( mode == 1 ) workPtr->request.rget.peerID = dputUniqueID;
                  else workPtr->request.rget.peerID = 0;

                  /* 4. Submit the DPut request to the agent */
                  rc = CommAgent_RemoteGetPacing_SubmitWorkRequest( control,
                                                                    0, /* handle */
                                                                    &workPtr->request.rget );
                  /* Check now: rc is overwritten by the next allocation */
                  assert(rc==0);

                  /* 5. Allocate a slot in the agent's queue for the Memfifo completion */
                  do
                    {
                      rc = CommAgent_AllocateWorkRequest( control,
                                                          &workPtr,
                                                          &uniqueID );
                      TRACE((stderr,"%s() [%s:%d]: AllocateWorkRequest for request %d returned errno %d, workPtr=%p, uniqueID=%lu\n",__FUNCTION__,__FILE__,__LINE__,i,rc,workPtr,uniqueID));
                    } while (rc==EAGAIN);
                  assert(rc==0);

                  /* 6. Set up the Memfifo request in the agent's queue */
                  memcpy ( &workPtr->request.rget.rgetDescriptor,
                           &_rgetDesc,
                           sizeof(_rgetDesc)
                           );
                  workPtr->request.rget.globalInjFifo = i; /* Make this match the DPut request so they are ordered */

                  memcpy ( &workPtr->request.rget.payloadDescriptor,
                           &_completionDesc,
                           sizeof(_completionDesc)
                           );
                  /* Store the message number in the packet header.  The
                   * poll function will use it to mark that message complete.
                   */
                  MUSPI_SetRecPutOffset ( & workPtr->request.rget.payloadDescriptor,
                                          i );

                  /* Set the peer ID to match that of the rget request so these requests
                   * are done sequentially */
                  if ( mode == 1 ) workPtr->request.rget.peerID = dputUniqueID;
                  else workPtr->request.rget.peerID = 0;

                  /* 7. Submit the request to the agent */
                  rc = CommAgent_RemoteGetPacing_SubmitWorkRequest( control,
                                                                    0, /* handle */
                                                                    &workPtr->request.rget );
                  assert(rc==0);
                }
            }
          /* Poll the reception fifo, marking completed requests "complete".
           * When the largest request has completed, increment "iter"
           */
          poll();
        }

      /* Wait for everything to complete */
      for (i=0; i<NUM_BUFS; i++)
        {
          while ( _requestStatus[i] ) poll();
        }
    }

  printf("No errors\n");

  /***************************************************************************
   * The following was initial test code that is no longer used
   ***************************************************************************/

#if 0

  for (i=0; i<65; i++)
    {
      rc = CommAgent_AllocateWorkRequest( control,
					  (CommAgent_WorkRequest_t**)&workPtr[i] );
      fprintf(stderr,"%s() [%s:%d]: AllocateWorkRequest returned errno %d, workPtr[%d]=%p\n",__FUNCTION__,__FILE__,__LINE__,rc,i,workPtr[i]);
      if ( rc == EAGAIN ) break;
    }

  int limit = i;
  fprintf(stderr,"%s() [%s:%d]: Limit = %d\n",__FUNCTION__,__FILE__,__LINE__,limit);

  for (i=0; i<limit; i++)
    {
      //      memset(workPtr[i],0x00, sizeof(*workPtr[0]));

      rc = CommAgent_RemoteGetPacing_SubmitWorkRequest( control,
							0, /* handle */
							workPtr[i] );
      fprintf(stderr,"%s() [%s:%d]: SubmitWorkRequest %d returned errno %d\n",__FUNCTION__,__FILE__,__LINE__,i,rc);
    }
  for (i=0; i<65; i++) workPtr[i]=(CommAgent_RemoteGetPacing_WorkRequest_t *)((uint64_t)-1);

  for (i=0; i<256; i++)
    {
      int count=0, idx=i%64;

/*       This was the original code.  It exposes a compiler optimization bug */
/*       where the returned does not appear to be set. */
      
/*             while ( (rc = CommAgent_AllocateWorkRequest(  */
/*                                       control, */
/*       			         (CommAgent_WorkRequest_t**)&workPtr[idx] ) )  */
/*       	      == EAGAIN ) count++;  */

      CommAgent_WorkRequest_t *workRequestPtr;
      while ( (rc = CommAgent_AllocateWorkRequest( control,
						   &workRequestPtr ) )
	      == EAGAIN ) count++;
      assert ( rc == 0 );

      workPtr[idx] = (CommAgent_RemoteGetPacing_WorkRequest_t *)workRequestPtr;

/*       if ( ((uint64_t)(workPtr[idx])) == ((uint64_t)-1) )  */
/* 	{ */
/* 	  fprintf(stderr,"workPtr[%d] at %p = %p\n",idx,&workPtr[idx],workPtr[idx]); */
/* 	  Delay(1000000000); */
/* 	  fprintf(stderr,"workPtr[%d] at %p = %p\n",idx,&workPtr[idx],workPtr[idx]); */
/* 	  fprintf(stderr,"Count=%u, i=%d\n",count,i); */
/* 	  int j; */
/* 	  for (j=0; j<65; j++) fprintf(stderr,"workPtr[%d]=%p\n",j,workPtr[j]); */
/* 	  assert(0); */
/* 	} */

      //  memset(workPtr[idx],0x00, sizeof(CommAgent_RemoteGetPacing_WorkRequest_t));

      rc = CommAgent_RemoteGetPacing_SubmitWorkRequest( control,
							0, /* handle */
							workPtr[idx] );
      assert ( rc == 0 );

      fprintf(stderr,"%s() [%s:%d]: Allocated/Submitted work request %d, count=%d\n",__FUNCTION__,__FILE__,__LINE__,i,count);
      
    }

  // Wait for the agent to process everything.
  Sleep(10);

  fprintf(stderr,"Done with Delay\n");

#endif

  return 0;
}