示例#1
0
/*
 * Report the EMON power reading from one designated thread.
 *
 * Only the caller whose PhysicalThreadIndex() is 0 and whose
 * EMON_rank_on_card() is 0 calls EMON_ReportPower(); every other caller
 * returns without doing anything.  A negative value returned by
 * EMON_ReportPower() is treated as a failure: an error line is printed
 * and test_exit(1) is called.
 *
 * numthreads: not used by this function.
 */
void EMON_Print_t(int numthreads)
{
  unsigned int pid = PhysicalThreadIndex(); // between 0 and 67
  int node_id = EMON_rank_on_card();

  (void)numthreads;

  //  printf("node id = %d\n", node_id);

  // The report is produced only by one thread of one node
  if ((pid != 0) || (node_id != 0))
    return;

  double power = EMON_ReportPower();
  if (power < 0) {
    printf("ERROR : EMON_GetPower failed\n");
    test_exit(1);
  }
}
示例#2
0
/*===============================================================*/
/*
 * Close one measurement interval for the code block named this_label.
 *
 * Work is done only by the caller whose PhysicalThreadIndex() is 0, and
 * only while code_block is below MAX_CODE_BLOCKS; otherwise the call
 * returns immediately.  For the block index j returned by
 * index_from_label(this_label) the function:
 *   - adds (current reading - counter_in) to counter_sum for every
 *     enabled core (coremask[k] != 0) and every one of num_counters,
 *   - adds (current reading - L2_in) to the six L2_sum entries,
 *   - increments block_stops[j],
 *   - adds (timebase read on entry - timebase_in[j]) to timebase_sum[j].
 * With DEBUG defined, a set of UPC state dumps is emitted afterwards.
 */
void HPM_Stop(char * this_label)
{
   int i, j, k;
   long long counter_value, tb;

   // only the thread with physical index 0 records counters
   if (PhysicalThreadIndex() != 0) return;

   if (code_block >= MAX_CODE_BLOCKS) return;

   // read the timebase first, before the label lookup and counter reads
   tb = GetTimeBase();

   j = index_from_label(this_label);

   for (k=0; k<MAX_CORES; k++) {
      if (!coremask[k]) continue;
      for (i=0; i<num_counters; i++) {
         counter_value = Upci_Punit_Event_Read(&Punit[k], eventHandle[k][i]);
         counter_sum[k][j][i] += (counter_value - counter_in[k][j][i]);
      }
   }

   L2_sum[j][0] += UPC_L2_ReadCtr(0, UPC_L2_CTR_HITS)      - L2_in[j][0];
   L2_sum[j][1] += UPC_L2_ReadCtr(0, UPC_L2_CTR_MISSES)    - L2_in[j][1];
   L2_sum[j][2] += UPC_L2_ReadCtr(0, UPC_L2_PREFETCH)      - L2_in[j][2];
   L2_sum[j][3] += UPC_L2_ReadCtr(0, UPC_L2_FETCH_LINE)    - L2_in[j][3];
   L2_sum[j][4] += UPC_L2_ReadCtr(0, UPC_L2_STORE_LINE)    - L2_in[j][4];
   L2_sum[j][5] += UPC_L2_ReadCtr(0, UPC_L2_STORE_PARTIAL) - L2_in[j][5];

   block_stops[j] += 1;

   timebase_sum[j] += tb - timebase_in[j];

#ifdef DEBUG
   // diagnostic dumps of the UPC state for core 0
   Upci_A2PC_Val_t a2qry;
   Kernel_Upci_A2PC_GetRegs(&a2qry);
   Upci_A2PC_DumpRegs(&a2qry);
   UPC_P_Dump_State(Kernel_ProcessorCoreID());
   Upci_Punit_Dump(0, &Punit[0]);
   UPC_C_Dump_State();
   UPC_C_Dump_Counters(0,1);
   for(i=0; i<24; i++) {
      Upci_Punit_EventH_Dump(0, i, &eventHandle[0][i]);
   }
#endif

   return;
}
示例#3
0
/*================================================================*/
void HPM_Start(char * this_label)
{
   int i, j, k;
   long long tb;
   unsigned int tid, pid, cid;

   tid = PhysicalThreadID();    // between 0 and 3
   pid = PhysicalThreadIndex(); // between 0 and 67
   cid = pid/4;

   if (pid == 0)
     {

       tb = GetTimeBase();

       j = index_from_label(this_label);

       for (k=0; k<MAX_CORES; k++) {
	 if (coremask[k]) {
	   for (i=0; i<num_counters; i++) {
	     counter_in[k][j][i] = Upci_Punit_Event_Read(&Punit[k], eventHandle[k][i]);
	   }
	 }
       }

       L2_in[j][0] = UPC_L2_ReadCtr(0, UPC_L2_CTR_HITS);
       L2_in[j][1] = UPC_L2_ReadCtr(0, UPC_L2_CTR_MISSES);
       L2_in[j][2] = UPC_L2_ReadCtr(0, UPC_L2_PREFETCH);
       L2_in[j][3] = UPC_L2_ReadCtr(0, UPC_L2_FETCH_LINE);
       L2_in[j][4] = UPC_L2_ReadCtr(0, UPC_L2_STORE_LINE);
       L2_in[j][5] = UPC_L2_ReadCtr(0, UPC_L2_STORE_PARTIAL);

       // printf("entered start by %d\n", pid);

       block_starts[j] += 1;

       timebase_in[j] = tb;
     }

   return;
}
示例#4
0
/*
 * Run numloops rounds of a timebase/barrier handshake and then clear the
 * local timebase at a computed instant.
 *
 * Each round clears the timebase, lets the "root" caller wait until the
 * timebase reaches giSendTime and then call activateGIPulseThread(), and
 * has every caller go through barrier_wait() followed by a fixed
 * 10000-tick hold.  After the last round the timebase is cleared once
 * more when it reaches
 *     2 * FreqMHz * 1000 + (giSendTime - tb) - latencyFromRoot
 * where tb is the value returned by the final barrier_wait().
 *
 * NOTE(review): barrier_wait() presumably returns the timebase at which
 * the barrier released - confirm against its definition.
 * NOTE(review): if numloops <= 0 the loop body never runs, tb stays 0 and
 * delta equals giSendTime.
 */
static inline void doTimeSync(int numloops)
{
    Personality_t *pers = &FW_Personality;
    uint64_t tb = 0;
    int isRootNode = 0;
    // "root" = GlobIntUpPortOutputs of the primordial class route is 0 AND this is physical thread index 0
    if((pers->Network_Config.PrimordialClassRoute.GlobIntUpPortOutputs == 0) && (PhysicalThreadIndex() == 0))
        isRootNode = 1;
    else
        isRootNode = 0;
    
    int loop;
    // FreqMHz * 1000 ticks; presumably 1 ms if the timebase ticks at FreqMHz - TODO confirm
    uint64_t giSendTime = 1 * pers->Kernel_Config.FreqMHz * 1000;
    uint64_t giHoldTime;
    uint64_t fetch = 0;   // written inside the loop but never read afterwards
    for(loop = 0; loop<numloops; loop++)
    {
        Kernel_ClearTimeBase();
        if(isRootNode)
        {
            // root spins until the send time, then triggers the pulse
            while(Kernel_GetTimeBase() < giSendTime)
            {
            }
            activateGIPulseThread();
        }
        fetch = Kernel_GetTimeBase();
        tb = barrier_wait();
        // hold for a further 10000 ticks past the value returned by barrier_wait()
        giHoldTime = tb + 10000;
        while(Kernel_GetTimeBase() < giHoldTime)
        {
        }
    }
    // signed difference between the intended send time and the last barrier_wait() result
    int64_t delta = giSendTime - tb;
    uint64_t cleartime = 2ULL * pers->Kernel_Config.FreqMHz * 1000 + delta - (uint64_t)pers->Network_Config.latencyFromRoot;
    // spin until the computed instant, then reset the timebase
    while(Kernel_GetTimeBase() < cleartime)
    {
    }
    Kernel_ClearTimeBase();
}
示例#5
0
/*
 * Set up EMON power measurement and synchronise all callers.
 *
 * The caller whose PhysicalThreadIndex() is 0 calls
 * EMON_SetupPowerMeasurement(); a non-zero return code is reported and
 * passed to test_exit().  The restriction to EMON_rank_on_card() == 0 is
 * currently disabled, so the setup is not limited to a single node.
 * Every caller then waits in L2_Barrier(&id_barrier, numthreads).
 *
 * numthreads: number of participants passed to L2_Barrier.
 */
void EMON_Init_t(int numthreads)
{
  unsigned int pid = PhysicalThreadIndex(); // between 0 and 67

  // Result is unused while the node_id check below stays disabled; the
  // call is kept in case it has side effects - TODO confirm and drop.
  (void)EMON_rank_on_card();

  // EMON set up is done by the thread with physical index 0
  // (the additional "node_id == 0" condition is switched off)
  if (pid == 0)
    {
      int rc = EMON_SetupPowerMeasurement();

      if (rc) {
        printf("ERROR : EMON_SetupPowerMeasurement failed with rc=%d\n", rc);
        test_exit(rc);
      }
    }

  L2_Barrier(&id_barrier, numthreads);
}
示例#6
0
/*========================================================================*/
void HPM_Print_t(int numthreads)
{
   int i, j, k, nblocks;
   //   uint64_t counts, counts_0, counts_1, counts_2, counts_3;
   long long counts, counts_0, counts_1, counts_2, counts_3;
   //   uint64_t l1_misses, l1p_misses, node_l1p_misses, node_l2_misses, loads, node_loads, node_l1_misses;
   //   double cycles, ipc, stall_cycles, node_stall_cycles, node_cycles;
   //   double node_fxu_instructions, node_fpu_instructions, node_l1_hits, node_l1p_hits, ddr_hit_fraction;
   //   double fxu_fraction, fpu_fraction, fxu_instructions, fpu_instructions;
   //   double cores_per_process, max_fraction, percent_max_issue_rate;
   //   double l1_hits, l1_hit_fraction, l1p_hits, l1p_hit_fraction, node_l2_hits, l2_hit_fraction;
   //   double ld_bytes_per_cycle, st_bytes_per_cycle;
   //   uint64_t node_punit_counts[MAX_COUNTERS];
   //   long long node_timebase_sum;
   //   int Ax, Bx, Cx, Dx, Ex;
   //   Personality_t personality;
   //   char filename[132];
   //   FILE * fp;
   unsigned int tid, pid, cid;
   int node_id;
 

   tid = PhysicalThreadID();    // between 0 and 3
   pid = PhysicalThreadIndex(); // between 0 and 67
   cid = pid/4;
   node_id = EMON_rank_on_card();
   L2_Barrier(&id_barrier, numthreads);

   if ((pid != 0) || (node_id != 0))
	return;

   set_labels();

   set_aggregation_mask();  // sets a mask to aggregate by sum (0) or by max (1)

   if (code_block >= MAX_CODE_BLOCKS) nblocks = MAX_CODE_BLOCKS;
   else                               nblocks = code_block;

   //   fp = stderr;

   // print counts for each thread and the aggregate for every core
   printf( "\n");
   printf( "======================================================================\n");
   printf( "Hardware counter report for BGQ  - thread and core-specific values.\n");
   printf( "======================================================================\n");
   for (k=0; k<MAX_CORES; k++) {
      if (coremask[k]) {
         printf ("core %d\n", k);
         for (j=0; j<nblocks; j++) { 
            if (block_starts[j] == block_stops[j]) {
               printf( "----------------------------------------------------------------\n");
               printf( "%s, call count = %d, cycles = %lld :\n", 
                       code_block_label[j], block_starts[j], timebase_sum[j]);
               printf( "  -- Processor counters (thread specific) --------------\n");
               if (hpm_threads == 1) {
                  for (i=0; i<num_events; i++) {
                     counts = counter_sum[k][j][i];
                     printf( "%-d %14lld  %s\n", hpm_group, counts, label[counter_index[i]]);
                  }
               }
               else if (hpm_threads == 2) {
                  printf( "  thread0 counts  thread2 counts      net counts  label\n");
                  for (i=0; i<num_events; i++) {
                     counts_0 = counter_sum[k][j][i];
                     counts_2 = counter_sum[k][j][i+num_events];
                     if (mask[counter_index[i]]) {
                          counts = (counts_0 > counts_2) ? counts_0 : counts_2;
                     }
                     else counts = counts_0 + counts_2;
                     printf( "%-d %14lld  %14lld  %14lld  %s\n", 
                                   hpm_group, counts_0, counts_2, counts, label[counter_index[i]]);
                  }
               }
               else if (hpm_threads == 3) {
                  printf( "  thread0 counts  thread1 counts  thread2 counts      net counts  label\n");
                  for (i=0; i<num_events; i++) {
                     counts_0 = counter_sum[k][j][i];
                     counts_2 = counter_sum[k][j][i+num_events];
                     counts_1 = counter_sum[k][j][i+2*num_events];
                     if (mask[counter_index[i]]) {
                          counts = (counts_0 > counts_2) ? counts_0 : counts_2;
                          counts = (counts   > counts_1) ? counts   : counts_1;
                     }
                     else counts = counts_0 + counts_1 + counts_2;
                     printf( "%-d %14lld  %14lld  %14lld  %14lld  %s\n", 
                                   hpm_group, counts_0, counts_1, counts_2, counts, label[counter_index[i]]);
                  }
               }
               else if (hpm_threads == 4) {
                  printf( "  thread0 counts  thread1 counts  thread2 counts  thread3 counts      net counts  label\n");
                  for (i=0; i<num_events; i++) {
                     counts_0 = counter_sum[k][j][i];
                     counts_2 = counter_sum[k][j][i+num_events];
                     counts_1 = counter_sum[k][j][i+2*num_events];
                     counts_3 = counter_sum[k][j][i+3*num_events];
                     if (mask[counter_index[i]]) {
                          counts = (counts_0 > counts_2) ? counts_0 : counts_2;
                          counts = (counts   > counts_1) ? counts   : counts_1;
                          counts = (counts   > counts_3) ? counts   : counts_3;
                     }
                     else counts = counts_0 + counts_1 + counts_2 + counts_3;
                     printf( "%-d %14lld  %14lld  %14lld  %14lld  %14lld  %s\n", 
                                   hpm_group, counts_0, counts_1, counts_2, counts_3, counts, label[counter_index[i]]);
                  }
               }
               printf( "  -- L2 counters (shared for the node) -----------------\n");
               printf( "%-d %14lld  L2 Hits\n",                          100, L2_sum[j][0]);
               printf( "%-d %14lld  L2 Misses\n",                        100, L2_sum[j][1]);
               printf( "%-d %14lld  L2 lines prefetched\n",              100, L2_sum[j][2]);
               printf( "%-d %14lld  L2 lines loaded from memory\n",      100, L2_sum[j][3]);
               printf( "%-d %14lld  L2 full lines stored to mem\n",      100, L2_sum[j][4]);
               printf( "%-d %14lld  L2 partial lines stored to mem\n",   100, L2_sum[j][5]);
               printf( "\n");
            }
            else {
               printf( "mismatch in starts/stops for code block '%s'\n", code_block_label[j]);
               printf( "  starts = %d\n", block_starts[j]);
               printf( "  stops  = %d\n", block_stops[j]);
            }
         }
         printf( "\n");
      }
   }

   return;
}
示例#7
0
/*=================================================================*/
void HPM_Init_t(int numthreads)
{
  int i, j, k, core;
  // int threads_per_core;
  //  int * eventSet;
  char * ptr;
  unsigned int tid, pid, cid;
  unsigned int lock_status;

  int rc;
//  Upci_Mode_t Mode;

  tid = PhysicalThreadID();    // between 0 and 3
  pid = PhysicalThreadIndex(); // between 0 and 67
  cid = pid/4;

  if (pid == 0)
    {
      // set the initial cumulative counter values to zero 
      for (k=0; k<MAX_CORES; k++)
	for (j=0; j<MAX_CODE_BLOCKS; j++)
	  for (i=0; i<MAX_COUNTERS; i++)
	    counter_sum[k][j][i] = 0LL;
	
      for (j=0; j<MAX_CODE_BLOCKS; j++) timebase_sum[j] = 0LL;
	
      for (j=0; j<MAX_CODE_BLOCKS; j++) 
	for (i=0; i<6; i++)
	  L2_sum[j][i] = 0LL;
	
      // keep track of code block starts and stops 
      for (j=0; j<MAX_CODE_BLOCKS; j++) {
	block_starts[j] = 0;
	block_stops[j]  = 0;
      }
	
      // set mask used for thread and core aggregation
      for (i=0; i<MAX_EVENTS; i++) mask[i] = 0;
	
      // check env variables
      // fixme
      ptr = fwext_getenv("HPM_GROUP");
      if (ptr == NULL)  {
        hpm_group = 0;
      }
      else hpm_group = hpm_atoi(ptr);
      // printf("hpm_group = %d\n", hpm_group);
      // hpm_group = 82;
      if (hpm_group < -1) hpm_group = 0;
      if (hpm_group > 99) hpm_group = 0;
	
      // fixme
      // ptr = fwext_getenv("HPM_SCOPE");   if (pid !=0) return;
      
      // if (ptr != NULL) {
      //          if (strncasecmp(ptr,"process", 7) == 0) process_scope = 1;
      //          if (strncasecmp(ptr,"node", 4) == 0)    node_scope = 1;
      // }
	
      // fixme
      // ptr = fwext_getenv("HPM_METRICS");
      // if (ptr != NULL) {
      //          if (strncasecmp(ptr,"yes", 3) == 0) derived_metrics = 1;
      // }
	
      for (i=0; i<MAX_CORES; i++) coremask[i] = 1;
	
      // find the number of cores used by this process
      // fixme
      // numcores = 0;
      // for (i=0; i<MAX_CORES; i++) numcores += coremask[i];
      numcores = 17;
	
      // determine the number of threads per core
      // numthreads = BgGetNumThreads();
      // numthreads = 68;
      // threads_per_core = numthreads / numcores;
	
      // hpm_threads = threads_per_core;
      // fixme
      hpm_threads = 4;
      	
      // optionally reset the number of threads per core that will be counted
      // fixme
      // ptr = fwext_getenv("HPM_THREADS");
      // if (ptr != NULL) {
      //         hpm_threads = fwext_atoi(ptr);
      // if (hpm_threads < 1) hpm_threads = 1;
      // if (hpm_threads > 4) hpm_threads = 4;
      // }
	
      // set num_events and num_counters based on hpm_group and hpm_threads
      switch (hpm_group) {
      case -1:
	num_events = 6;
	eventSet = exptSet;
	break;
	
      case 0:
	num_events = 6;
	eventSet = mySet;
	break;
	
      case 1:
	num_events = 12;
	if (hpm_threads > 2) hpm_threads = 2;
	eventSet = ldSet;
	break;
	
      case 2:
	num_events = 24;
	if (hpm_threads > 1) hpm_threads = 1;
	eventSet = fpuSet;
	break;
	
      case 3:
	num_events = 12;
	if (hpm_threads > 2) hpm_threads = 2;
	eventSet = fpSet0;
	break;
	
      case 30:
	num_events = 6;
	eventSet = fpSet00;
	break;
	
      case 31:
	num_events = 6;
	eventSet = fpSet01;
	break;
	
      case 4:
	num_events = 12;
	if (hpm_threads > 2) hpm_threads = 2;
	eventSet = fpSet1;
	break;
	
      case 40:
	num_events = 6;
	eventSet = fpSet10;
	break;
	
      case 41:
	num_events = 6;
	eventSet = fpSet11;
	break;
	
      case 5:
	num_events = 24;
	if (hpm_threads > 1) hpm_threads = 1;
	eventSet = fxuSet;
	break;
	
      case 6:
	num_events = 12;
	if (hpm_threads > 2) hpm_threads = 2;
	eventSet = fxSet0;
	break;
	
      case 60:
	num_events = 6;
	eventSet = fxSet00;
	break;
	
      case 61:
	num_events = 6;
	eventSet = fxSet01;
	break;
	
      case 7:
	num_events = 12;
	if (hpm_threads > 2) hpm_threads = 2;
	eventSet = fxSet1;
	break;
	
      case 70:
	num_events = 6;
	eventSet = fxSet10;
	break;
	
      case 71:
	num_events = 6;
	eventSet = fxSet11;
	break;

      case 81:
	num_events = 6;
	eventSet = l1pset0;
	break;

      case 82:
	num_events = 6;
	eventSet = l1pset1;
	break;

      case 83:
	num_events = 6;
	eventSet = l1pset2;
	break;
	
      default:
	break;
	
      }
	
      num_counters =  num_events * hpm_threads;
      ppc_msync();
      Upci_Mode_Init(&Mode[0], UPC_DISTRIB_MODE, UPC_CM_INDEP, 0);
      initialized = 1;
      ppc_msync();
    }
      
  
  while ((initialized == 0) && (tid == 0))
    {
      ;	
    }

  if (tid == 0) {

    lock_status = 0;
    while (lock_status == 0)
      {
	lock_status = hpm_lock_acquire();
      }
       
    core = cid;

    // initialize hardware counters
    // Upci_Mode_Init(&Mode[core], UPC_DISTRIB_MODE, UPC_CM_INDEP, core);
    Upci_Punit_Init(&Punit[core], &Mode[core], core);

    // UPC_L1p_SetMode(core, L1P_CFG_UPC_SWITCH);
       
    // use one thread per core to enable 24 different punit counters
       
    // add events to count, save hwthread in one of the reserved event handle slots
    k = 0;
    for (i=0; i<num_events; i++) {                         // hwthread 0
      rc = Upci_Punit_AddEvent(&Punit[core], eventSet[i],  0, &eventHandle[core][k]);
      if (rc != 0) printf("failed to add event %d\n", eventSet[i]);
      if (pid == 0)
	counter_index[k] = eventSet[i];
      eventHandle[core][k].rsv[0] = 0;
      k++;
    }
    if (hpm_threads > 1) {
      for (i=0; i<num_events; i++) {                         // hwthread 2
	rc = Upci_Punit_AddEvent(&Punit[core], eventSet[i],  2, &eventHandle[core][k]);
	if (rc != 0) printf("failed to add event %d\n", eventSet[i]);
	if (pid == 0)
	  counter_index[k] = eventSet[i];
	eventHandle[core][k].rsv[0] = 2;
	k++;
      }
    }
    if (hpm_threads > 2) {
      for (i=0; i<num_events; i++) {                         // hwthread 1
	rc = Upci_Punit_AddEvent(&Punit[core], eventSet[i],  1, &eventHandle[core][k]);
	if (rc != 0) printf("failed to add event %d\n", eventSet[i]);
	if (pid == 0)
	  counter_index[k] = eventSet[i];
	eventHandle[core][k].rsv[0] = 1;
	k++;
      }
    }
    if (hpm_threads > 3) {
      for (i=0; i<num_events; i++) {                         // hwthread 3
	rc = Upci_Punit_AddEvent(&Punit[core], eventSet[i],  3, &eventHandle[core][k]);
	if (rc != 0) printf("failed to add event %d\n", eventSet[i]);
	if (pid == 0)
	  counter_index[k] = eventSet[i];
        eventHandle[core][k].rsv[0] = 3;
	k++;
      }
    }
    

    rc = Upci_Punit_Apply(&Punit[core]);
    if (rc != 0) printf("Upci_Punit_Apply failed\n");
    
    Upci_Punit_Start(&Punit[core], (UPCI_CTL_RESET | UPCI_CTL_DELAY));
    // printf("Initialised upc by core = %d\n", cid);
    // Upci_Punit_Dump(2, &Punit[core]);

    lock_val = 0;
    ppc_msync();
    
  }

  if (pid == 0)
    {
      UPC_L2_EnableUPC(1, 1);  
      UPC_L2_Start();
    }

  // PMPI_Barrier(local_comm);

  L2_Barrier(&id_barrier, numthreads);
     
  return;
}
示例#8
0
int test_main ( void ) {

    if (PhysicalThreadIndex() > 0)       // run  a single core test.
        test_exit(0);

    int rc=0;
    printf("Torus Remote Get Atomic Test\n");

    // Perform initialization of the network and mu
    Personality_t *pers;

    pers = fwext_getPersonality();
    uint64_t p1 = pers->Kernel_Config.NodeConfig & PERS_ENABLE_Mambo;
    if (p1) is_mambo = 1;

    // ND and MU init is done in firmware, but we disable it in svchost and
    // call it directly here because it performs much better.
    // #if 0
    fw_nd_set_verbose(0);  // if 1, prints all dcr commands, don't use on cycle sim
    // on cycle sim, can have DcrMonitory trace DCR commands

    rc = fw_nd_reset_release(pers);

    if(rc)
    {
        TRACE(("fw_nd_reset_release failed with rc=%d\n",rc));
        test_exit (rc);
    }

    fw_mu_set_verbose(0);  // if 1, prints all dcr commands, don't use on cycle sim
    // on cycle sim, can have DcrMonitory trace DCR commands

    rc = fw_mu_reset_release(pers);

    if(rc)
    {
        printf("fw_mu_reset_release failed with rc=%d\n",rc);
        test_exit (rc);
    }
    // #endif // if 0

    uint64_t max_value = ~0;

    fw_mu_set_sys_range(0, /* range_id */
                        0, /* min_value */
                        max_value);

    fw_mu_set_usr_range(0, /* range_id */
                        0, /* min_value */
                        max_value);

    /*   fw_mu_set_imfifo_rget   (1, 1); */
    /*   fw_mu_set_imfifo_system (1, 0); */

    TRACE(("Network and MU Initialization is complete\n"));

#else
int main(int argc, char **argv)
{
    int rc;
#endif

    uint i = 0;
    // Destination for Remote Get packet
    MUHWI_Destination_t dest;
    MUSPI_SetUpDestination ( &dest, 0, 0, 0, 0, 0 );

    MUSPI_InjFifoSubGroup_t   fifo_subgroup;

    uint64 message_size_in_bytes_remote_get = MESSAGE_SIZE_REMOTE_GET;
    uint64 message_size_in_bytes_direct_put = MESSAGE_SIZE_DIRECT_PUT;

    TRACE(("main(): Injection Memory FIFO (0,0,0), Send Remote Get Message with Atomic Increment\n"));


//#ifdef PRINT_DEBUG_MESSAGES
    printf("Start!\n");
//#endif

    // ------------------------------------------------------
    // allocates area for message_sent_remote_get[] buffer (RemoteGet)
    // ------------------------------------------------------
    uint64 *message_sent_remote_get = (uint64 *)malloc(message_size_in_bytes_remote_get);
    uint64 *message_sent_direct_put = (uint64 *)malloc(message_size_in_bytes_direct_put);

    TRACE(("message_sent_remote_get (address) = %p\n", message_sent_remote_get));
    TRACE(("message_size_in_bytes_remote_get = %lld\n", message_size_in_bytes_remote_get));

    TRACE(("message_sent_direct_put (address) = %p\n", message_sent_direct_put));
    TRACE(("message_size_in_bytes_direct_put = %lld\n", message_size_in_bytes_direct_put));

    // Initializes the message_sent_remote_get[] buffer
    for (i=0; i<message_size_in_bytes_remote_get/8; i++)
        message_sent_remote_get[i] = 0x00ull;  // 8-bytes

    Kernel_MemoryRegion_t  mregionSentRemoteGet;
    rc = Kernel_CreateMemoryRegion ( &mregionSentRemoteGet,
                                     message_sent_remote_get,
                                     message_size_in_bytes_remote_get );

    if ( rc != 0)
    {
        printf("Kernel_CreateMemoryRegion failed for message_sent_remote_get with rc=%d\n",rc);
#ifdef __FWEXT__
        test_exit(1);
#else
        exit(1);
#endif
    }


    // Initializes the message_sent[] buffer
    *message_sent_direct_put = (uint64)ATOMIC_COUNTER_INITIAL_VALUE;  // 8-bytes
    uint64_t expected_counter_value = ATOMIC_COUNTER_INITIAL_VALUE + RECEIVE_BUFFER_INITIAL_VALUE;

    Kernel_MemoryRegion_t  mregionSentDirectPut;
    rc = Kernel_CreateMemoryRegion ( &mregionSentDirectPut,
                                     message_sent_direct_put,
                                     message_size_in_bytes_direct_put );

    if ( rc != 0)
    {
        printf("Kernel_CreateMemoryRegion failed for message_sent_direct_put with rc=%d\n",rc);
#ifdef __FWEXT__
        test_exit(1);
#else
        exit(1);
#endif
    }

    // Get an atomic address for the message_sent buffer.

    uint64_t message_sent_atomic_address = MUSPI_GetAtomicAddress (
            (uint64_t)message_sent_direct_put -
            (uint64_t)mregionSentDirectPut.BaseVa +
            (uint64_t)mregionSentDirectPut.BasePa,
            MUHWI_ATOMIC_OPCODE_LOAD_INCREMENT );

    TRACE(("message_sent_direct_put (atomic address) = 0x%llx\n",
           (long long unsigned int)message_sent_atomic_address));

    /////////////////////////////////////////////////

    typedef struct recvArea
    {
        volatile uint64 counter;
        unsigned char   recvBuffer[MESSAGE_SIZE_DIRECT_PUT];
    } recvArea_t;

    // Allocate space for the reception counter and the receive buffer

    recvArea_t *recvAreaPtr = (recvArea_t*)malloc ( sizeof(recvArea_t) );
    if ( !recvAreaPtr )
    {
        printf("Allocating recvArea failed\n");
#ifdef __FWEXT__
        test_exit(1);
#else
        exit(1);
#endif
    }

    volatile uint64 *counterAddress    = (volatile uint64*)&(recvAreaPtr->counter);
    unsigned char   *recvBufferAddress = (unsigned char  *)&(recvAreaPtr->recvBuffer[0]);

    *((uint64*)recvBufferAddress) = RECEIVE_BUFFER_INITIAL_VALUE;

    // Get a memory region for the recvArea.

    Kernel_MemoryRegion_t  recvAreaMemRegion;
    rc = Kernel_CreateMemoryRegion ( &recvAreaMemRegion,
                                     recvAreaPtr,
                                     sizeof(recvArea_t) );

    if ( rc != 0)
    {
        printf("Kernel_CreateMemoryRegion failed for recvAreaMemRegion with rc=%d\n",rc);
#ifdef __FWEXT__
        test_exit(1);
#else
        exit(1);
#endif
    }

    // Calculate the offsets of the counter and receive buffer from the base address.

    uint64_t recvAreaBasePA   = (uint64_t)recvAreaMemRegion.BasePa;
    uint64_t counterOffset    = (uint64_t)counterAddress - (uint64_t)recvAreaMemRegion.BaseVa;
    uint64_t recvBufferOffset = (uint64_t)recvBufferAddress - (uint64_t)recvAreaMemRegion.BaseVa;

    TRACE(("counterAddress=%p, recvBufferAddress=%p, recvAreaBasePA=0x%llx, counterOffset=0x%llx, recvBufferOffset=0x%llx\n",counterAddress, recvBufferAddress,
           (long long unsigned int)recvAreaBasePA,
           (long long unsigned int)counterOffset,
           (long long unsigned int)recvBufferOffset));

    //////////////////////////////////////////////////////////////
    // Initialize base address table and atomic counter info
    //////////////////////////////////////////////////////////////
    /* Set up the base address table */
    uint32_t batids[1] = {0};
    MUSPI_BaseAddressTableSubGroup_t bat;
    rc = Kernel_AllocateBaseAddressTable ( 0,
                                           &bat,
                                           1,
                                           batids,
                                           0 /* "User" use */ );
    if (rc != 0)
    {
        printf("Kernel_AllocateBaseAddressTable failed with rc=%d\n",rc);
#ifdef __FWEXT__
        test_exit(1);
#else
        exit(1);
#endif
    }

    rc = MUSPI_SetBaseAddress ( &bat,
                                0,
                                (uint64_t)recvAreaMemRegion.BasePa );
    if (rc != 0)
    {
        printf("MUSPI_SetBaseAddress failed with rc=%d\n",rc);
#ifdef __FWEXT__
        test_exit(1);
#else
        exit(1);
#endif
    }

    TRACE(("Set BaseAddressTable entry slot 0 to 0x%llx\n",
           (long long unsigned int)recvAreaMemRegion.BasePa));

    uint64_t muAtomicCounterOffset = MUSPI_GetAtomicOffsetFromBaseAddress (
                                         &bat,
                                         0,
                                         recvAreaBasePA + counterOffset,
                                         MUHWI_ATOMIC_OPCODE_STORE_ADD );

    uint64_t muAtomicRecvBufferOffset = MUSPI_GetAtomicOffsetFromBaseAddress (
                                            &bat,
                                            0,
                                            recvAreaBasePA + recvBufferOffset,
                                            MUHWI_ATOMIC_OPCODE_STORE_ADD );

    TRACE(("main(): recvCounterVa=%p, recvAreaBasePA=0x%llx, muAtomicCounterOffset=0x%llx, muAtomicRecvBufferOffset=0x%llx\n",
           &(recvAreaPtr->counter),
           (long long unsigned int)recvAreaBasePA,
           (long long unsigned int)muAtomicCounterOffset,
           (long long unsigned int)muAtomicRecvBufferOffset));

    //////////////////////////////////////////////////////////////
    // Create a DirectPut Descriptor and copy it into the
    // message payload
    //////////////////////////////////////////////////////////////
    TRACE(("main(): Configures direct put descriptor\n"));

    MUSPI_Pt2PtDirectPutDescriptorInfo_t mu_iDirectPutDescriptorInfo;
    mu_iDirectPutDescriptorInfo.Base.Pre_Fetch_Only  = MUHWI_DESCRIPTOR_PRE_FETCH_ONLY_NO;
    mu_iDirectPutDescriptorInfo.Base.Payload_Address = message_sent_atomic_address;
    mu_iDirectPutDescriptorInfo.Base.Message_Length  = message_size_in_bytes_direct_put;
    mu_iDirectPutDescriptorInfo.Base.Torus_FIFO_Map  = MUHWI_DESCRIPTOR_TORUS_FIFO_MAP_AP;
    mu_iDirectPutDescriptorInfo.Base.Dest            = dest;
    mu_iDirectPutDescriptorInfo.Pt2Pt.Hints_ABCD = MUHWI_PACKET_HINT_AP;
    mu_iDirectPutDescriptorInfo.Pt2Pt.Misc1      = MUHWI_PACKET_HINT_E_NONE |
            MUHWI_PACKET_DO_NOT_ROUTE_TO_IO_NODE |
            MUHWI_PACKET_USE_DETERMINISTIC_ROUTING |
            MUHWI_PACKET_DO_NOT_DEPOSIT;
    mu_iDirectPutDescriptorInfo.Pt2Pt.Misc2      = MUHWI_PACKET_VIRTUAL_CHANNEL_DETERMINISTIC;
    mu_iDirectPutDescriptorInfo.Pt2Pt.Skip       = 0;

    mu_iDirectPutDescriptorInfo.DirectPut.Rec_Payload_Base_Address_Id = 0;
    mu_iDirectPutDescriptorInfo.DirectPut.Rec_Payload_Offset          = muAtomicRecvBufferOffset;
    mu_iDirectPutDescriptorInfo.DirectPut.Rec_Counter_Base_Address_Id = 0;
    mu_iDirectPutDescriptorInfo.DirectPut.Rec_Counter_Offset          = muAtomicCounterOffset;
    mu_iDirectPutDescriptorInfo.DirectPut.Pacing                      = MUHWI_PACKET_DIRECT_PUT_IS_NOT_PACED;

    rc = MUSPI_CreatePt2PtDirectPutDescriptor( &mu_iDirectPutDescriptor,
            &mu_iDirectPutDescriptorInfo
                                             );
    if (rc != 0)
    {
        printf("MUSPI_CreatePt2PtDirectPutDescriptor failed with rc=%d\n",rc);
#ifdef __FWEXT__
        test_exit(1);
#else
        exit(1);
#endif
    }

    //MUSPI_DescriptorDumpHex("Direct Put Descriptor",
    //		    &mu_iDirectPutDescriptor);

    // Copy Descriptor into RemoteGet message payload

    memcpy((char *)((void *)message_sent_remote_get), (char *)((void *)(&mu_iDirectPutDescriptor)), message_size_in_bytes_remote_get);


    /////////////////////////////////////////////////////////////
    // RemoteGet message
    // Create a remote get descriptor
    /////////////////////////////////////////////////////////////
    TRACE(("main(): Configures remote get descriptor\n"));

    MUSPI_Pt2PtRemoteGetDescriptorInfo_t mu_iRemoteGetDescriptorInfo;
    mu_iRemoteGetDescriptorInfo.Base.Pre_Fetch_Only  = MUHWI_DESCRIPTOR_PRE_FETCH_ONLY_NO;
    mu_iRemoteGetDescriptorInfo.Base.Payload_Address = (uint64_t)message_sent_remote_get - (uint64_t)mregionSentRemoteGet.BaseVa + (uint64_t)mregionSentRemoteGet.BasePa;
    mu_iRemoteGetDescriptorInfo.Base.Message_Length  = message_size_in_bytes_remote_get;
    mu_iRemoteGetDescriptorInfo.Base.Torus_FIFO_Map  = MUHWI_DESCRIPTOR_TORUS_FIFO_MAP_AP;
    mu_iRemoteGetDescriptorInfo.Base.Dest            = dest;
    mu_iRemoteGetDescriptorInfo.Pt2Pt.Hints_ABCD = MUHWI_PACKET_HINT_AP;
    mu_iRemoteGetDescriptorInfo.Pt2Pt.Misc1      = MUHWI_PACKET_HINT_E_NONE |
            MUHWI_PACKET_DO_NOT_ROUTE_TO_IO_NODE |
            MUHWI_PACKET_USE_DETERMINISTIC_ROUTING |
            MUHWI_PACKET_DO_NOT_DEPOSIT;
    mu_iRemoteGetDescriptorInfo.Pt2Pt.Misc2      = MUHWI_PACKET_VIRTUAL_CHANNEL_DETERMINISTIC;
    mu_iRemoteGetDescriptorInfo.Pt2Pt.Skip       = 0;
    mu_iRemoteGetDescriptorInfo.RemoteGet.Type             = MUHWI_PACKET_TYPE_GET;
    mu_iRemoteGetDescriptorInfo.RemoteGet.Rget_Inj_FIFO_Id = 1; // Fifo 1 is for remote get use

    // Prepares Injection Memory FIFO Descriptor (RemoteGet)
    rc = MUSPI_CreatePt2PtRemoteGetDescriptor( &mu_iRemoteGetDescriptor,
            &mu_iRemoteGetDescriptorInfo
                                             );
    if (rc != 0)
    {
        printf("MUSPI_CreatePt2PtRemoteGetDescriptor failed with rc=%d\n",rc);
#ifdef __FWEXT__
        test_exit(1);
#else
        exit(1);
#endif
    }

    //    MUSPI_DescriptorDumpHex("Remote Get Descriptor",
    //		    &mu_iRemoteGetDescriptor);

    /////////////////////////////////////////////////////////////////
    // Configures Injection Memory FIFO Registers
    // - fifo 0 that the core injects descriptors into
    // - fifo 1 that the MU injects remote get payload into
    /////////////////////////////////////////////////////////////////
    TRACE(("main(): Configures Injection Memory FIFO Registers\n"));

    void *injMemoryFifoPtr, *memoryForInjMemoryFifoPtr;
    rc = malloc_memalign ( &memoryForInjMemoryFifoPtr,
                           &injMemoryFifoPtr,
                           64,
                           INJ_MEMORY_FIFO_SIZE+1 );
    if (rc)
    {
        printf("inj_memory_fifo malloc failed with rc=%d\n",rc);
#ifdef __FWEXT__
        test_exit(1);
#else
        exit(1);
#endif
    }

    void *rgetMemoryFifoPtr, *memoryForRgetMemoryFifoPtr;
    rc = malloc_memalign ( &memoryForRgetMemoryFifoPtr,
                           &rgetMemoryFifoPtr,
                           64,
                           INJ_MEMORY_FIFO_SIZE+1 );
    if (rc)
    {
        printf("rget_memory_fifo malloc failed with rc=%d\n",rc);
#ifdef __FWEXT__
        test_exit(1);
#else
        exit(1);
#endif
    }

    uint32_t fifoid[2] = { 0, 1 };

    Kernel_InjFifoAttributes_t injFifoAttrs[2];
    injFifoAttrs[0].RemoteGet = 0;
    injFifoAttrs[0].System    = 0;
    injFifoAttrs[1].RemoteGet = 1;
    injFifoAttrs[1].System    = 0;

    rc = Kernel_AllocateInjFifos (0, &fifo_subgroup, 2,
                                  fifoid, injFifoAttrs);
    if ( rc != 0)
    {
        printf("Kernel_AllocateInjFifos failed with rc=%d\n",rc);
#ifdef __FWEXT__
        test_exit(1);
#else
        exit(1);
#endif
    }

    Kernel_MemoryRegion_t  mregionInj;
    rc = Kernel_CreateMemoryRegion ( &mregionInj,
                                     injMemoryFifoPtr,
                                     INJ_MEMORY_FIFO_SIZE + 1 );

    if ( rc != 0)
    {
        printf("Kernel_CreateMemoryRegion failed for injMemoryFifoPtr with rc=%d\n",rc);
#ifdef __FWEXT__
        test_exit(1);
#else
        exit(1);
#endif
    }

    Kernel_MemoryRegion_t  mregionRget;
    rc = Kernel_CreateMemoryRegion ( &mregionRget,
                                     rgetMemoryFifoPtr,
                                     INJ_MEMORY_FIFO_SIZE + 1 );

    if ( rc != 0)
    {
        printf("Kernel_CreateMemoryRegion failed for rgetMemoryFifoPtr with rc=%d\n",rc);
#ifdef __FWEXT__
        test_exit(1);
#else
        exit(1);
#endif
    }

    rc = Kernel_InjFifoInit (&fifo_subgroup, fifoid[0], &mregionInj,
                             (uint64_t)injMemoryFifoPtr -
                             (uint64_t)mregionInj.BaseVa,
                             INJ_MEMORY_FIFO_SIZE);
    if (rc != 0)
    {
        printf("Kernel_InjFifoInit Inj failed with rc=%d, errno=%d\n",rc,errno);
#ifdef __FWEXT__
        test_exit(1);
#else
        exit(1);
#endif
    }

    rc = Kernel_InjFifoInit (&fifo_subgroup, fifoid[1], &mregionRget,
                             (uint64_t)rgetMemoryFifoPtr -
                             (uint64_t)mregionRget.BaseVa,
                             INJ_MEMORY_FIFO_SIZE);
    if (rc != 0)
    {
        printf("Kernel_InjFifoInit Rget failed with rc=%d, errno=%d\n",rc,errno);
#ifdef __FWEXT__
        test_exit(1);
#else
        exit(1);
#endif
    }

    rc = Kernel_InjFifoActivate (&fifo_subgroup, 2, fifoid, KERNEL_INJ_FIFO_ACTIVATE);
    if (rc != 0)
    {
        printf("Kernel_InjFifoActivate Inj failed with rc=%d, errno=%d\n",rc,errno);
#ifdef __FWEXT__
        test_exit(1);
#else
        exit(1);
#endif
    }


    // ---------------------------------------------
    //    Reception Side
    // ---------------------------------------------


    /*     *data_counter_base_address = REC_PAYLOAD_BASE_ADDRESS; */
    /*     printf("data_counter_base_address = %p\n", data_counter_base_address); */

    // Loop, sending the remote get, waiting for the reception counter to hit zero,
    // and verifying the received counter's value.
    // for (i=0; i<num_iterations; i++)  /** disable loop **/
    {

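        // Initialize the reception counter for the corresponding counter id.
        // Note: the counter starts at the direct-put message size; the wait
        // loop below polls it until it reads zero (presumably the MU updates
        // it by the number of bytes transferred -- TODO confirm).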
        *counterAddress = MESSAGE_SIZE_DIRECT_PUT;

        // -----------------------------------------------------------
        // Processor Advances Tail pointer - Descriptor is 64-bytes
        // MU should Inject (RemoteGet) message into the Torus
        // -----------------------------------------------------------

        // Let's Inject the (RemoteGet) Descriptor into the Injection Memory FIFO
#if 1
        printf("main(): Inject Descriptor into Injection Memory FIFO\n");
#endif

        rc = MUSPI_InjFifoInject (MUSPI_IdToInjFifo(fifoid[0], &fifo_subgroup),
                                  (void *)(&mu_iRemoteGetDescriptor) );

        if (rc < 0) // Should have injected 1 descriptor
        {
            printf("MUSPI_InjFifoInject failed with rc=%d\n",rc);
#ifdef __FWEXT__
            test_exit(1);
#else
            exit(1);
#endif
        }

#ifndef __FWEXT__
        printf("main(): Successful injection of remote get descriptor\n");
#endif

        // //////////////////////////////////////////////////
        //      Reception side, check counter value
        // //////////////////////////////////////////////////

        uint64 volatile counter_value;

        // wait for the counter to reach ZERO
        while (1) {

            counter_value = *counterAddress;

            if (counter_value == 0)
            {
                //
#if 1
                printf("counter is now ZERO !!!!\n");
#endif

                break;
            }
        }
        _bgq_msync(); // Ensure data is available to all cores.


        // Let's print the Received Message contents

        //put_offset = (uint64)mu_pktHdrDirectPut.Put_Offset_LSB;

#ifndef __FWEXT__
        printf("recvBufferAddress = %p\n", recvBufferAddress);
        printf("---Prints Received Message contents\n");

        Print_Message((unsigned char *)recvBufferAddress, message_size_in_bytes_direct_put);

        printf("---Where Received Message is being stored: recvBufferAddress = %p\n", recvBufferAddress);
        printf("---Checks Received Message contents(size = %lld)\n", message_size_in_bytes_direct_put);
#endif

        uint64_t receivedCounterValue = *((uint64_t*)recvBufferAddress);

        if ( receivedCounterValue == expected_counter_value )
        {
            printf("---Received Counter Value = %llu\n",
                   (long long unsigned int)receivedCounterValue);
        }
        else
        {
            printf("ERROR: Received Counter Value = %llu, expected %llu\n",
                   (long long unsigned int)receivedCounterValue,
                   (long long unsigned int)expected_counter_value);
#ifdef __FWEXT__
            test_exit(1);
#else
            exit(1);
#endif
        }


        if ( *message_sent_direct_put == ATOMIC_COUNTER_INITIAL_VALUE+1 )
        {
            printf("---Sent Counter Value = %llu\n",
                   (long long unsigned int)*message_sent_direct_put);
        }
        else
        {
            printf("ERROR: Sent Counter Value = %llu, expected %llu\n",
                   (long long unsigned int)*message_sent_direct_put,
                   (long long unsigned int)(ATOMIC_COUNTER_INITIAL_VALUE+1));
#ifdef __FWEXT__
            test_exit(1);
#else
            exit(1);
#endif
        }

    }

    //printf("All counter values passed\n");

#ifdef __FWEXT__

    if ( is_mambo == 0 ) // Termination checks don't work in mambo.  ErrInt DCRs are not zero.
    {
        rc = fw_nd_term_check(pers);
        if (rc)
        {
            printf("ERROR: fw_nd_term_check failed with rc=%d\n",rc);
            test_exit(1);
        }

        rc = fw_mu_term_check(pers);
        if (rc)
        {
            printf("ERROR: fw_mu_term_check failed with rc=%d\n",rc);
            test_exit(1);
        }
    }

#endif

    printf("Done!\n");

#ifdef  __FWEXT__
    test_exit (0);
#endif

    return 0;
}