Пример #1
0
/* Thread worker: measures the average cost of an L2_Barrier() crossing.
 * Each thread performs `count` barrier calls between two timebase reads
 * and prints the mean cycle count per call.  Terminates via pthread_exit().
 *
 * input: pthread start argument (unused).
 * returns: NULL (never actually reached; pthread_exit() does not return).
 */
void * fight(void * input)
{
    (void) input;                /* start argument is not used */

    int tid = get_thread_id();

    int count = 100000;

    if (debug) 
        printf("%d: before L2_Barrier \n", tid);

    uint64_t t0 = GetTimeBase();
    for (int i=0; i<count; i++)
        L2_Barrier(&barrier, num_threads);
    uint64_t t1 = GetTimeBase();

    if (debug) {
        printf("%d: after  L2_Barrier \n", tid);
        fflush(stdout);
    }

    uint64_t dt = t1-t0;
    /* %llu requires unsigned long long; uint64_t may be unsigned long on
     * some ABIs, so cast explicitly to avoid undefined printf behavior. */
    printf("%2d: %d calls to %s took %llu cycles per call \n", 
           tid, count, "L2_Barrier", (unsigned long long)(dt/count));
    fflush(stdout);

    pthread_exit(NULL);

    return NULL;   /* not reached; keeps the non-void signature warning-free */
}
Пример #2
0
/* Thread worker: measures the average cost of a Fetch_and_Add() on the
 * shared `slowcounter` atom.  The timed loop is bracketed by two
 * L2_Barrier() calls so all threads contend simultaneously.
 * Terminates via pthread_exit().
 *
 * input: pthread start argument (unused).
 * returns: NULL (never actually reached; pthread_exit() does not return).
 */
void * slowfight(void * input)
{
    (void) input;                /* start argument is not used */

    int tid = get_thread_id();

    if (debug) 
        printf("%d: before L2_Barrier 1 \n", tid);
    L2_Barrier(&barrier, num_threads);
    if (debug) {
        printf("%d: after  L2_Barrier 1 \n", tid);
        fflush(stdout);
    }

    int count = 1000000;

    uint64_t rval;

    uint64_t t0 = GetTimeBase();
    for (int i=0; i<count; i++)
        rval = Fetch_and_Add(&(slowcounter.atom), 1);
    uint64_t t1 = GetTimeBase();
    /* The fetched value itself is irrelevant -- only the atomic side effect
     * is being timed.  Silence the set-but-unused warning. */
    (void) rval;

    if (debug) 
        printf("%d: before L2_Barrier 2 \n", tid);
    L2_Barrier(&barrier, num_threads);
    if (debug) {
        printf("%d: after  L2_Barrier 2 \n", tid);
        fflush(stdout);
    }
    
    uint64_t dt = t1-t0;
    /* %llu requires unsigned long long; cast since uint64_t may be
     * unsigned long on some ABIs (avoids undefined printf behavior). */
    printf("%2d: %d calls to %s took %llu cycles per call \n", 
           tid, count, "Fetch_and_Add", (unsigned long long)(dt/count));
    fflush(stdout);

    pthread_exit(NULL);

    return NULL;   /* not reached; keeps the non-void signature warning-free */
}
Пример #3
0
/* Thread worker: measures the average cost of an L2_AtomicLoadIncrement()
 * on the shared `counter` atom.  The timed loop is bracketed by two
 * L2_Barrier() calls so all threads contend simultaneously.
 * Terminates via pthread_exit().
 *
 * input: pthread start argument (unused).
 * returns: NULL (never actually reached; pthread_exit() does not return).
 */
void * fight(void * input)
{
    (void) input;                /* start argument is not used */

    int tid = get_thread_id();

    if (debug) 
        printf("%d: before L2_Barrier 1 \n", tid);
    L2_Barrier(&barrier, num_threads);
    if (debug) {
        printf("%d: after  L2_Barrier 1 \n", tid);
        fflush(stdout);
    }

    int count = 1000000;

    uint64_t t0 = GetTimeBase();
    for (int i=0; i<count; i++) {
        /* volatile keeps the returned value observable so the load-increment
         * cannot be optimized into something other than one op per pass */
        volatile uint64_t rval = L2_AtomicLoadIncrement(&(counter.atom));
        (void) rval;             /* value itself is irrelevant */
    }
    uint64_t t1 = GetTimeBase();

    if (debug) 
        printf("%d: before L2_Barrier 2 \n", tid);
    L2_Barrier(&barrier, num_threads);
    if (debug) {
        printf("%d: after  L2_Barrier 2 \n", tid);
        fflush(stdout);
    }
    
    uint64_t dt = t1-t0;
    /* %llu requires unsigned long long; cast since uint64_t may be
     * unsigned long on some ABIs (avoids undefined printf behavior). */
    printf("%2d: %d calls to %s took %llu cycles per call \n", 
           tid, count, "L2_AtomicLoadIncrement", (unsigned long long)(dt/count));
    fflush(stdout);

    pthread_exit(NULL);

    return NULL;   /* not reached; keeps the non-void signature warning-free */
}
Пример #4
0
/* Initialize EMON power measurement for the node.
 *
 * All hardware threads call this; only the thread with physical index 0
 * performs the one-time EMON setup (a setup failure is fatal via
 * test_exit()).  Every caller then synchronizes on id_barrier so no
 * thread proceeds before setup is complete.
 *
 * numthreads: total number of threads participating in the barrier.
 */
void EMON_Init_t(int numthreads)
{
  unsigned int tid, pid, cid,node_id;

  tid = PhysicalThreadID();    // between 0 and 3
  pid = PhysicalThreadIndex(); // between 0 and 67
  cid = pid/4;
  node_id = EMON_rank_on_card();
  //  printf("node id = %d\n", node_id);

  // tid/cid/node_id are currently informational only (node_id was used by
  // the commented-out node filter below); keep the calls in case the
  // helpers have side effects, but silence unused-variable warnings.
  (void) tid;
  (void) cid;
  (void) node_id;

  // EMON set up is done only by one thread of one node
  if (pid == 0)// && (node_id == 0) )
    {
      int rc = EMON_SetupPowerMeasurement();

      if (rc) {
      	printf("ERROR : EMON_SetupPowerMeasurement failed with rc=%d\n", rc);
      	test_exit(rc);
      }
    }
     L2_Barrier(&id_barrier, numthreads);
}
Пример #5
0
/*========================================================================*/
/*
 * HPM_Print_t - print the hardware performance counter report for BGQ.
 *
 * All threads call this; after identifying themselves and passing the
 * id_barrier, only the first physical thread of node 0
 * (pid == 0 && node_id == 0) produces the report -- everyone else
 * returns immediately.  For every enabled core (coremask[k]) and every
 * recorded code block the per-thread Punit counts and the node-shared
 * L2 counts are printed.
 *
 * numthreads: total number of threads participating in the barrier.
 */
void HPM_Print_t(int numthreads)
{
   int i, j, k, nblocks;
   //   uint64_t counts, counts_0, counts_1, counts_2, counts_3;
   long long counts, counts_0, counts_1, counts_2, counts_3;
   //   uint64_t l1_misses, l1p_misses, node_l1p_misses, node_l2_misses, loads, node_loads, node_l1_misses;
   //   double cycles, ipc, stall_cycles, node_stall_cycles, node_cycles;
   //   double node_fxu_instructions, node_fpu_instructions, node_l1_hits, node_l1p_hits, ddr_hit_fraction;
   //   double fxu_fraction, fpu_fraction, fxu_instructions, fpu_instructions;
   //   double cores_per_process, max_fraction, percent_max_issue_rate;
   //   double l1_hits, l1_hit_fraction, l1p_hits, l1p_hit_fraction, node_l2_hits, l2_hit_fraction;
   //   double ld_bytes_per_cycle, st_bytes_per_cycle;
   //   uint64_t node_punit_counts[MAX_COUNTERS];
   //   long long node_timebase_sum;
   //   int Ax, Bx, Cx, Dx, Ex;
   //   Personality_t personality;
   //   char filename[132];
   //   FILE * fp;
   unsigned int tid, pid, cid;
   int node_id;
 

   tid = PhysicalThreadID();    // between 0 and 3
   pid = PhysicalThreadIndex(); // between 0 and 67
   cid = pid/4;
   node_id = EMON_rank_on_card();
   L2_Barrier(&id_barrier, numthreads);

   // only one thread of one node prints; the rest have already synced above
   if ((pid != 0) || (node_id != 0))
	return;

   set_labels();

   set_aggregation_mask();  // sets a mask to aggregate by sum (0) or by max (1)

   // clamp in case more code blocks were started than the tables can hold
   if (code_block >= MAX_CODE_BLOCKS) nblocks = MAX_CODE_BLOCKS;
   else                               nblocks = code_block;

   //   fp = stderr;

   // print counts for each thread and the aggregate for every core
   printf( "\n");
   printf( "======================================================================\n");
   printf( "Hardware counter report for BGQ  - thread and core-specific values.\n");
   printf( "======================================================================\n");
   for (k=0; k<MAX_CORES; k++) {
      if (coremask[k]) {
         printf ("core %d\n", k);
         for (j=0; j<nblocks; j++) { 
            // a block is well-formed when every start has a matching stop
            if (block_starts[j] == block_stops[j]) {
               printf( "----------------------------------------------------------------\n");
               printf( "%s, call count = %d, cycles = %lld :\n", 
                       code_block_label[j], block_starts[j], timebase_sum[j]);
               printf( "  -- Processor counters (thread specific) --------------\n");
               if (hpm_threads == 1) {
                  for (i=0; i<num_events; i++) {
                     counts = counter_sum[k][j][i];
                     printf( "%-d %14lld  %s\n", hpm_group, counts, label[counter_index[i]]);
                  }
               }
               else if (hpm_threads == 2) {
                  // NOTE: with 2 hw threads, counter slots hold threads 0 and 2
                  // (not 0 and 1) -- presumably matching the event setup order;
                  // verify against the Punit_AddEvent calls in HPM_Init_t.
                  printf( "  thread0 counts  thread2 counts      net counts  label\n");
                  for (i=0; i<num_events; i++) {
                     counts_0 = counter_sum[k][j][i];
                     counts_2 = counter_sum[k][j][i+num_events];
                     // mask[] selects max-aggregation; otherwise sum the threads
                     if (mask[counter_index[i]]) {
                          counts = (counts_0 > counts_2) ? counts_0 : counts_2;
                     }
                     else counts = counts_0 + counts_2;
                     printf( "%-d %14lld  %14lld  %14lld  %s\n", 
                                   hpm_group, counts_0, counts_2, counts, label[counter_index[i]]);
                  }
               }
               else if (hpm_threads == 3) {
                  // counter slot order is thread 0, then 2, then 1
                  printf( "  thread0 counts  thread1 counts  thread2 counts      net counts  label\n");
                  for (i=0; i<num_events; i++) {
                     counts_0 = counter_sum[k][j][i];
                     counts_2 = counter_sum[k][j][i+num_events];
                     counts_1 = counter_sum[k][j][i+2*num_events];
                     if (mask[counter_index[i]]) {
                          counts = (counts_0 > counts_2) ? counts_0 : counts_2;
                          counts = (counts   > counts_1) ? counts   : counts_1;
                     }
                     else counts = counts_0 + counts_1 + counts_2;
                     printf( "%-d %14lld  %14lld  %14lld  %14lld  %s\n", 
                                   hpm_group, counts_0, counts_1, counts_2, counts, label[counter_index[i]]);
                  }
               }
               else if (hpm_threads == 4) {
                  // counter slot order is thread 0, then 2, then 1, then 3
                  printf( "  thread0 counts  thread1 counts  thread2 counts  thread3 counts      net counts  label\n");
                  for (i=0; i<num_events; i++) {
                     counts_0 = counter_sum[k][j][i];
                     counts_2 = counter_sum[k][j][i+num_events];
                     counts_1 = counter_sum[k][j][i+2*num_events];
                     counts_3 = counter_sum[k][j][i+3*num_events];
                     if (mask[counter_index[i]]) {
                          counts = (counts_0 > counts_2) ? counts_0 : counts_2;
                          counts = (counts   > counts_1) ? counts   : counts_1;
                          counts = (counts   > counts_3) ? counts   : counts_3;
                     }
                     else counts = counts_0 + counts_1 + counts_2 + counts_3;
                     printf( "%-d %14lld  %14lld  %14lld  %14lld  %14lld  %s\n", 
                                   hpm_group, counts_0, counts_1, counts_2, counts_3, counts, label[counter_index[i]]);
                  }
               }
               // L2 counters are shared per node; the leading "100" is a
               // fixed group tag in the report format, not a counter value
               printf( "  -- L2 counters (shared for the node) -----------------\n");
               printf( "%-d %14lld  L2 Hits\n",                          100, L2_sum[j][0]);
               printf( "%-d %14lld  L2 Misses\n",                        100, L2_sum[j][1]);
               printf( "%-d %14lld  L2 lines prefetched\n",              100, L2_sum[j][2]);
               printf( "%-d %14lld  L2 lines loaded from memory\n",      100, L2_sum[j][3]);
               printf( "%-d %14lld  L2 full lines stored to mem\n",      100, L2_sum[j][4]);
               printf( "%-d %14lld  L2 partial lines stored to mem\n",   100, L2_sum[j][5]);
               printf( "\n");
            }
            else {
               printf( "mismatch in starts/stops for code block '%s'\n", code_block_label[j]);
               printf( "  starts = %d\n", block_starts[j]);
               printf( "  stops  = %d\n", block_stops[j]);
            }
         }
         printf( "\n");
      }
   }

   return;
}
Пример #6
0
/*=================================================================*/
/*
 * HPM_Init_t - initialize BGQ hardware performance monitoring.
 *
 * All hardware threads call this.  Work is split by role:
 *   - pid == 0 (first physical thread on the node): zeroes all cumulative
 *     counter tables, reads HPM_GROUP from the firmware environment,
 *     selects the event set / thread count for that group, initializes
 *     the UPC mode, and publishes `initialized = 1` (with ppc_msync()
 *     fences around the shared writes).
 *   - tid == 0 (thread 0 of each core): under a spin lock, registers the
 *     chosen events with its core's Punit and starts counting.
 *   - pid == 0 additionally enables and starts the node-shared L2 counters.
 * Everyone finally meets at the id_barrier.
 *
 * numthreads: total number of threads participating in the barrier.
 */
void HPM_Init_t(int numthreads)
{
  int i, j, k, core;
  // int threads_per_core;
  //  int * eventSet;
  char * ptr;
  unsigned int tid, pid, cid;
  unsigned int lock_status;

  int rc;
//  Upci_Mode_t Mode;

  tid = PhysicalThreadID();    // between 0 and 3
  pid = PhysicalThreadIndex(); // between 0 and 67
  cid = pid/4;

  if (pid == 0)
    {
      // set the initial cumulative counter values to zero 
      for (k=0; k<MAX_CORES; k++)
	for (j=0; j<MAX_CODE_BLOCKS; j++)
	  for (i=0; i<MAX_COUNTERS; i++)
	    counter_sum[k][j][i] = 0LL;
	
      for (j=0; j<MAX_CODE_BLOCKS; j++) timebase_sum[j] = 0LL;
	
      for (j=0; j<MAX_CODE_BLOCKS; j++) 
	for (i=0; i<6; i++)
	  L2_sum[j][i] = 0LL;
	
      // keep track of code block starts and stops 
      for (j=0; j<MAX_CODE_BLOCKS; j++) {
	block_starts[j] = 0;
	block_stops[j]  = 0;
      }
	
      // set mask used for thread and core aggregation
      for (i=0; i<MAX_EVENTS; i++) mask[i] = 0;
	
      // check env variables
      // fixme
      ptr = fwext_getenv("HPM_GROUP");
      if (ptr == NULL)  {
        hpm_group = 0;
      }
      else hpm_group = hpm_atoi(ptr);
      // printf("hpm_group = %d\n", hpm_group);
      // hpm_group = 82;
      // valid groups are -1..99; anything else falls back to group 0
      if (hpm_group < -1) hpm_group = 0;
      if (hpm_group > 99) hpm_group = 0;
	
      // fixme
      // ptr = fwext_getenv("HPM_SCOPE");   if (pid !=0) return;
      
      // if (ptr != NULL) {
      //          if (strncasecmp(ptr,"process", 7) == 0) process_scope = 1;
      //          if (strncasecmp(ptr,"node", 4) == 0)    node_scope = 1;
      // }
	
      // fixme
      // ptr = fwext_getenv("HPM_METRICS");
      // if (ptr != NULL) {
      //          if (strncasecmp(ptr,"yes", 3) == 0) derived_metrics = 1;
      // }
	
      for (i=0; i<MAX_CORES; i++) coremask[i] = 1;
	
      // find the number of cores used by this process
      // fixme
      // numcores = 0;
      // for (i=0; i<MAX_CORES; i++) numcores += coremask[i];
      numcores = 17;
	
      // determine the number of threads per core
      // numthreads = BgGetNumThreads();
      // numthreads = 68;
      // threads_per_core = numthreads / numcores;
	
      // hpm_threads = threads_per_core;
      // fixme
      hpm_threads = 4;
      	
      // optionally reset the number of threads per core that will be counted
      // fixme
      // ptr = fwext_getenv("HPM_THREADS");
      // if (ptr != NULL) {
      //         hpm_threads = fwext_atoi(ptr);
      // if (hpm_threads < 1) hpm_threads = 1;
      // if (hpm_threads > 4) hpm_threads = 4;
      // }
	
      // set num_events and num_counters based on hpm_group and hpm_threads
      // groups needing 12 events limit hpm_threads to 2; 24-event groups to 1,
      // so that num_events * hpm_threads stays within the Punit capacity
      switch (hpm_group) {
      case -1:
	num_events = 6;
	eventSet = exptSet;
	break;
	
      case 0:
	num_events = 6;
	eventSet = mySet;
	break;
	
      case 1:
	num_events = 12;
	if (hpm_threads > 2) hpm_threads = 2;
	eventSet = ldSet;
	break;
	
      case 2:
	num_events = 24;
	if (hpm_threads > 1) hpm_threads = 1;
	eventSet = fpuSet;
	break;
	
      case 3:
	num_events = 12;
	if (hpm_threads > 2) hpm_threads = 2;
	eventSet = fpSet0;
	break;
	
      case 30:
	num_events = 6;
	eventSet = fpSet00;
	break;
	
      case 31:
	num_events = 6;
	eventSet = fpSet01;
	break;
	
      case 4:
	num_events = 12;
	if (hpm_threads > 2) hpm_threads = 2;
	eventSet = fpSet1;
	break;
	
      case 40:
	num_events = 6;
	eventSet = fpSet10;
	break;
	
      case 41:
	num_events = 6;
	eventSet = fpSet11;
	break;
	
      case 5:
	num_events = 24;
	if (hpm_threads > 1) hpm_threads = 1;
	eventSet = fxuSet;
	break;
	
      case 6:
	num_events = 12;
	if (hpm_threads > 2) hpm_threads = 2;
	eventSet = fxSet0;
	break;
	
      case 60:
	num_events = 6;
	eventSet = fxSet00;
	break;
	
      case 61:
	num_events = 6;
	eventSet = fxSet01;
	break;
	
      case 7:
	num_events = 12;
	if (hpm_threads > 2) hpm_threads = 2;
	eventSet = fxSet1;
	break;
	
      case 70:
	num_events = 6;
	eventSet = fxSet10;
	break;
	
      case 71:
	num_events = 6;
	eventSet = fxSet11;
	break;

      case 81:
	num_events = 6;
	eventSet = l1pset0;
	break;

      case 82:
	num_events = 6;
	eventSet = l1pset1;
	break;

      case 83:
	num_events = 6;
	eventSet = l1pset2;
	break;
	
      default:
	// NOTE(review): unknown groups fall through with num_events/eventSet
	// unchanged from a previous call or their initial values -- confirm
	// this is intended rather than defaulting to group 0.
	break;
	
      }
	
      num_counters =  num_events * hpm_threads;
      // publish the configuration before setting the `initialized` flag;
      // msync fences order the shared writes for the other threads
      ppc_msync();
      Upci_Mode_Init(&Mode[0], UPC_DISTRIB_MODE, UPC_CM_INDEP, 0);
      initialized = 1;
      ppc_msync();
    }
      
  
  // thread 0 of every other core spins until pid 0 has published the
  // configuration above.
  // NOTE(review): this relies on `initialized` being re-read each pass
  // (e.g. declared volatile) -- confirm at its declaration site.
  while ((initialized == 0) && (tid == 0))
    {
      ;	
    }

  if (tid == 0) {

    // serialize Punit setup across cores with a global spin lock
    lock_status = 0;
    while (lock_status == 0)
      {
	lock_status = hpm_lock_acquire();
      }
       
    core = cid;

    // initialize hardware counters
    // Upci_Mode_Init(&Mode[core], UPC_DISTRIB_MODE, UPC_CM_INDEP, core);
    Upci_Punit_Init(&Punit[core], &Mode[core], core);

    // UPC_L1p_SetMode(core, L1P_CFG_UPC_SWITCH);
       
    // use one thread per core to enable 24 different punit counters
       
    // add events to count, save hwthread in one of the reserved event handle slots
    // NOTE: hwthreads are registered in the order 0, 2, 1, 3 -- counter slot
    // layout in counter_sum[][][] follows this same order.
    k = 0;
    for (i=0; i<num_events; i++) {                         // hwthread 0
      rc = Upci_Punit_AddEvent(&Punit[core], eventSet[i],  0, &eventHandle[core][k]);
      if (rc != 0) printf("failed to add event %d\n", eventSet[i]);
      if (pid == 0)
	counter_index[k] = eventSet[i];
      eventHandle[core][k].rsv[0] = 0;
      k++;
    }
    if (hpm_threads > 1) {
      for (i=0; i<num_events; i++) {                         // hwthread 2
	rc = Upci_Punit_AddEvent(&Punit[core], eventSet[i],  2, &eventHandle[core][k]);
	if (rc != 0) printf("failed to add event %d\n", eventSet[i]);
	if (pid == 0)
	  counter_index[k] = eventSet[i];
	eventHandle[core][k].rsv[0] = 2;
	k++;
      }
    }
    if (hpm_threads > 2) {
      for (i=0; i<num_events; i++) {                         // hwthread 1
	rc = Upci_Punit_AddEvent(&Punit[core], eventSet[i],  1, &eventHandle[core][k]);
	if (rc != 0) printf("failed to add event %d\n", eventSet[i]);
	if (pid == 0)
	  counter_index[k] = eventSet[i];
	eventHandle[core][k].rsv[0] = 1;
	k++;
      }
    }
    if (hpm_threads > 3) {
      for (i=0; i<num_events; i++) {                         // hwthread 3
	rc = Upci_Punit_AddEvent(&Punit[core], eventSet[i],  3, &eventHandle[core][k]);
	if (rc != 0) printf("failed to add event %d\n", eventSet[i]);
	if (pid == 0)
	  counter_index[k] = eventSet[i];
        eventHandle[core][k].rsv[0] = 3;
	k++;
      }
    }
    

    rc = Upci_Punit_Apply(&Punit[core]);
    if (rc != 0) printf("Upci_Punit_Apply failed\n");
    
    Upci_Punit_Start(&Punit[core], (UPCI_CTL_RESET | UPCI_CTL_DELAY));
    // printf("Initialised upc by core = %d\n", cid);
    // Upci_Punit_Dump(2, &Punit[core]);

    // release the setup lock; msync makes the Punit writes visible first
    lock_val = 0;
    ppc_msync();
    
  }

  // node-shared L2 counters are enabled once, by the first physical thread
  if (pid == 0)
    {
      UPC_L2_EnableUPC(1, 1);  
      UPC_L2_Start();
    }

  // PMPI_Barrier(local_comm);

  L2_Barrier(&id_barrier, numthreads);
     
  return;
}