/*
 * Thread entry point that times back-to-back L2_Barrier() calls.
 *
 * The thread performs `count` barriers over `num_threads` participants,
 * reads GetTimeBase() immediately before and after the loop, and prints the
 * elapsed ticks divided by the call count (integer division). It then ends
 * through pthread_exit().
 *
 * input  - ignored.
 * return - not reached in practice: pthread_exit() does not return. The
 *          trailing return only satisfies the function signature.
 */
void * fight(void * input)
{
    (void)input;

    int tid = get_thread_id();
    int count = 100000;

    if (debug)
        printf("%d: before L2_Barrier \n", tid);

    uint64_t t0 = GetTimeBase();
    for (int i=0; i<count; i++)
        L2_Barrier(&barrier, num_threads);
    uint64_t t1 = GetTimeBase();

    if (debug) {
        printf("%d: after L2_Barrier \n", tid);
        fflush(stdout);
    }

    uint64_t dt = t1-t0;

    /* uint64_t need not be unsigned long long; cast so the argument type
     * matches the %llu conversion exactly. */
    printf("%2d: %d calls to %s took %llu cycles per call \n",
           tid, count, "L2_Barrier", (unsigned long long)(dt/count));
    fflush(stdout);

    pthread_exit(NULL);
    return NULL;
}
/*
 * Thread entry point that times Fetch_and_Add() on a shared counter.
 *
 * All threads meet at an L2_Barrier() first, then each one issues `count`
 * Fetch_and_Add(&slowcounter.atom, 1) calls bracketed by GetTimeBase()
 * reads. A second barrier follows the timed loop, after which the thread
 * prints elapsed ticks divided by the call count (integer division) and
 * ends through pthread_exit().
 *
 * input  - ignored.
 * return - not reached in practice: pthread_exit() does not return.
 */
void * slowfight(void * input)
{
    (void)input;

    int tid = get_thread_id();

    if (debug)
        printf("%d: before L2_Barrier 1 \n", tid);
    L2_Barrier(&barrier, num_threads);
    if (debug) {
        printf("%d: after L2_Barrier 1 \n", tid);
        fflush(stdout);
    }

    int count = 1000000;
    uint64_t rval = 0;

    uint64_t t0 = GetTimeBase();
    for (int i=0; i<count; i++)
        rval = Fetch_and_Add(&(slowcounter.atom), 1);
    uint64_t t1 = GetTimeBase();

    /* The fetched value is not needed; only the cost of the call is measured. */
    (void)rval;

    if (debug)
        printf("%d: before L2_Barrier 2 \n", tid);
    L2_Barrier(&barrier, num_threads);
    if (debug) {
        printf("%d: after L2_Barrier 2 \n", tid);
        fflush(stdout);
    }

    uint64_t dt = t1-t0;

    /* uint64_t need not be unsigned long long; cast so the argument type
     * matches the %llu conversion exactly. */
    printf("%2d: %d calls to %s took %llu cycles per call \n",
           tid, count, "Fetch_and_Add", (unsigned long long)(dt/count));
    fflush(stdout);

    pthread_exit(NULL);
    return NULL;
}
/*
 * Thread entry point that times L2_AtomicLoadIncrement() on a shared counter.
 *
 * All threads meet at an L2_Barrier() first, then each one issues `count`
 * L2_AtomicLoadIncrement(&counter.atom) calls bracketed by GetTimeBase()
 * reads. A second barrier follows the timed loop, after which the thread
 * prints elapsed ticks divided by the call count (integer division) and
 * ends through pthread_exit().
 *
 * input  - ignored.
 * return - not reached in practice: pthread_exit() does not return.
 */
void * fight(void * input)
{
    (void)input;

    int tid = get_thread_id();

    if (debug)
        printf("%d: before L2_Barrier 1 \n", tid);
    L2_Barrier(&barrier, num_threads);
    if (debug) {
        printf("%d: after L2_Barrier 1 \n", tid);
        fflush(stdout);
    }

    int count = 1000000;

    uint64_t t0 = GetTimeBase();
    for (int i=0; i<count; i++) {
        /* The result is stored into a volatile local, as in the original,
         * so each iteration's value is actually written somewhere. */
        volatile uint64_t rval = L2_AtomicLoadIncrement(&(counter.atom));
        (void)rval;
    }
    uint64_t t1 = GetTimeBase();

    if (debug)
        printf("%d: before L2_Barrier 2 \n", tid);
    L2_Barrier(&barrier, num_threads);
    if (debug) {
        printf("%d: after L2_Barrier 2 \n", tid);
        fflush(stdout);
    }

    uint64_t dt = t1-t0;

    /* uint64_t need not be unsigned long long; cast so the argument type
     * matches the %llu conversion exactly. */
    printf("%2d: %d calls to %s took %llu cycles per call \n",
           tid, count, "L2_AtomicLoadIncrement", (unsigned long long)(dt/count));
    fflush(stdout);

    pthread_exit(NULL);
    return NULL;
}
void EMON_Init_t(int numthreads) { unsigned int tid, pid, cid,node_id; tid = PhysicalThreadID(); // between 0 and 3 pid = PhysicalThreadIndex(); // between 0 and 67 cid = pid/4; node_id = EMON_rank_on_card(); // printf("node id = %d\n", node_id); // EMON set up is done only by one thread of one node if (pid == 0)// && (node_id == 0) ) { int rc = EMON_SetupPowerMeasurement(); if (rc) { printf("ERROR : EMON_SetupPowerMeasurement failed with rc=%d\n", rc); test_exit(rc); } } L2_Barrier(&id_barrier, numthreads); }
/*========================================================================*/ void HPM_Print_t(int numthreads) { int i, j, k, nblocks; // uint64_t counts, counts_0, counts_1, counts_2, counts_3; long long counts, counts_0, counts_1, counts_2, counts_3; // uint64_t l1_misses, l1p_misses, node_l1p_misses, node_l2_misses, loads, node_loads, node_l1_misses; // double cycles, ipc, stall_cycles, node_stall_cycles, node_cycles; // double node_fxu_instructions, node_fpu_instructions, node_l1_hits, node_l1p_hits, ddr_hit_fraction; // double fxu_fraction, fpu_fraction, fxu_instructions, fpu_instructions; // double cores_per_process, max_fraction, percent_max_issue_rate; // double l1_hits, l1_hit_fraction, l1p_hits, l1p_hit_fraction, node_l2_hits, l2_hit_fraction; // double ld_bytes_per_cycle, st_bytes_per_cycle; // uint64_t node_punit_counts[MAX_COUNTERS]; // long long node_timebase_sum; // int Ax, Bx, Cx, Dx, Ex; // Personality_t personality; // char filename[132]; // FILE * fp; unsigned int tid, pid, cid; int node_id; tid = PhysicalThreadID(); // between 0 and 3 pid = PhysicalThreadIndex(); // between 0 and 67 cid = pid/4; node_id = EMON_rank_on_card(); L2_Barrier(&id_barrier, numthreads); if ((pid != 0) || (node_id != 0)) return; set_labels(); set_aggregation_mask(); // sets a mask to aggregate by sum (0) or by max (1) if (code_block >= MAX_CODE_BLOCKS) nblocks = MAX_CODE_BLOCKS; else nblocks = code_block; // fp = stderr; // print counts for each thread and the aggregate for every core printf( "\n"); printf( "======================================================================\n"); printf( "Hardware counter report for BGQ - thread and core-specific values.\n"); printf( "======================================================================\n"); for (k=0; k<MAX_CORES; k++) { if (coremask[k]) { printf ("core %d\n", k); for (j=0; j<nblocks; j++) { if (block_starts[j] == block_stops[j]) { printf( "----------------------------------------------------------------\n"); printf( 
"%s, call count = %d, cycles = %lld :\n", code_block_label[j], block_starts[j], timebase_sum[j]); printf( " -- Processor counters (thread specific) --------------\n"); if (hpm_threads == 1) { for (i=0; i<num_events; i++) { counts = counter_sum[k][j][i]; printf( "%-d %14lld %s\n", hpm_group, counts, label[counter_index[i]]); } } else if (hpm_threads == 2) { printf( " thread0 counts thread2 counts net counts label\n"); for (i=0; i<num_events; i++) { counts_0 = counter_sum[k][j][i]; counts_2 = counter_sum[k][j][i+num_events]; if (mask[counter_index[i]]) { counts = (counts_0 > counts_2) ? counts_0 : counts_2; } else counts = counts_0 + counts_2; printf( "%-d %14lld %14lld %14lld %s\n", hpm_group, counts_0, counts_2, counts, label[counter_index[i]]); } } else if (hpm_threads == 3) { printf( " thread0 counts thread1 counts thread2 counts net counts label\n"); for (i=0; i<num_events; i++) { counts_0 = counter_sum[k][j][i]; counts_2 = counter_sum[k][j][i+num_events]; counts_1 = counter_sum[k][j][i+2*num_events]; if (mask[counter_index[i]]) { counts = (counts_0 > counts_2) ? counts_0 : counts_2; counts = (counts > counts_1) ? counts : counts_1; } else counts = counts_0 + counts_1 + counts_2; printf( "%-d %14lld %14lld %14lld %14lld %s\n", hpm_group, counts_0, counts_1, counts_2, counts, label[counter_index[i]]); } } else if (hpm_threads == 4) { printf( " thread0 counts thread1 counts thread2 counts thread3 counts net counts label\n"); for (i=0; i<num_events; i++) { counts_0 = counter_sum[k][j][i]; counts_2 = counter_sum[k][j][i+num_events]; counts_1 = counter_sum[k][j][i+2*num_events]; counts_3 = counter_sum[k][j][i+3*num_events]; if (mask[counter_index[i]]) { counts = (counts_0 > counts_2) ? counts_0 : counts_2; counts = (counts > counts_1) ? counts : counts_1; counts = (counts > counts_3) ? 
counts : counts_3; } else counts = counts_0 + counts_1 + counts_2 + counts_3; printf( "%-d %14lld %14lld %14lld %14lld %14lld %s\n", hpm_group, counts_0, counts_1, counts_2, counts_3, counts, label[counter_index[i]]); } } printf( " -- L2 counters (shared for the node) -----------------\n"); printf( "%-d %14lld L2 Hits\n", 100, L2_sum[j][0]); printf( "%-d %14lld L2 Misses\n", 100, L2_sum[j][1]); printf( "%-d %14lld L2 lines prefetched\n", 100, L2_sum[j][2]); printf( "%-d %14lld L2 lines loaded from memory\n", 100, L2_sum[j][3]); printf( "%-d %14lld L2 full lines stored to mem\n", 100, L2_sum[j][4]); printf( "%-d %14lld L2 partial lines stored to mem\n", 100, L2_sum[j][5]); printf( "\n"); } else { printf( "mismatch in starts/stops for code block '%s'\n", code_block_label[j]); printf( " starts = %d\n", block_starts[j]); printf( " stops = %d\n", block_stops[j]); } } printf( "\n"); } } return; }
/*=================================================================*/ void HPM_Init_t(int numthreads) { int i, j, k, core; // int threads_per_core; // int * eventSet; char * ptr; unsigned int tid, pid, cid; unsigned int lock_status; int rc; // Upci_Mode_t Mode; tid = PhysicalThreadID(); // between 0 and 3 pid = PhysicalThreadIndex(); // between 0 and 67 cid = pid/4; if (pid == 0) { // set the initial cumulative counter values to zero for (k=0; k<MAX_CORES; k++) for (j=0; j<MAX_CODE_BLOCKS; j++) for (i=0; i<MAX_COUNTERS; i++) counter_sum[k][j][i] = 0LL; for (j=0; j<MAX_CODE_BLOCKS; j++) timebase_sum[j] = 0LL; for (j=0; j<MAX_CODE_BLOCKS; j++) for (i=0; i<6; i++) L2_sum[j][i] = 0LL; // keep track of code block starts and stops for (j=0; j<MAX_CODE_BLOCKS; j++) { block_starts[j] = 0; block_stops[j] = 0; } // set mask used for thread and core aggregation for (i=0; i<MAX_EVENTS; i++) mask[i] = 0; // check env variables // fixme ptr = fwext_getenv("HPM_GROUP"); if (ptr == NULL) { hpm_group = 0; } else hpm_group = hpm_atoi(ptr); // printf("hpm_group = %d\n", hpm_group); // hpm_group = 82; if (hpm_group < -1) hpm_group = 0; if (hpm_group > 99) hpm_group = 0; // fixme // ptr = fwext_getenv("HPM_SCOPE"); if (pid !=0) return; // if (ptr != NULL) { // if (strncasecmp(ptr,"process", 7) == 0) process_scope = 1; // if (strncasecmp(ptr,"node", 4) == 0) node_scope = 1; // } // fixme // ptr = fwext_getenv("HPM_METRICS"); // if (ptr != NULL) { // if (strncasecmp(ptr,"yes", 3) == 0) derived_metrics = 1; // } for (i=0; i<MAX_CORES; i++) coremask[i] = 1; // find the number of cores used by this process // fixme // numcores = 0; // for (i=0; i<MAX_CORES; i++) numcores += coremask[i]; numcores = 17; // determine the number of threads per core // numthreads = BgGetNumThreads(); // numthreads = 68; // threads_per_core = numthreads / numcores; // hpm_threads = threads_per_core; // fixme hpm_threads = 4; // optionally reset the number of threads per core that will be counted // fixme // ptr = 
fwext_getenv("HPM_THREADS"); // if (ptr != NULL) { // hpm_threads = fwext_atoi(ptr); // if (hpm_threads < 1) hpm_threads = 1; // if (hpm_threads > 4) hpm_threads = 4; // } // set num_events and num_counters based on hpm_group and hpm_threads switch (hpm_group) { case -1: num_events = 6; eventSet = exptSet; break; case 0: num_events = 6; eventSet = mySet; break; case 1: num_events = 12; if (hpm_threads > 2) hpm_threads = 2; eventSet = ldSet; break; case 2: num_events = 24; if (hpm_threads > 1) hpm_threads = 1; eventSet = fpuSet; break; case 3: num_events = 12; if (hpm_threads > 2) hpm_threads = 2; eventSet = fpSet0; break; case 30: num_events = 6; eventSet = fpSet00; break; case 31: num_events = 6; eventSet = fpSet01; break; case 4: num_events = 12; if (hpm_threads > 2) hpm_threads = 2; eventSet = fpSet1; break; case 40: num_events = 6; eventSet = fpSet10; break; case 41: num_events = 6; eventSet = fpSet11; break; case 5: num_events = 24; if (hpm_threads > 1) hpm_threads = 1; eventSet = fxuSet; break; case 6: num_events = 12; if (hpm_threads > 2) hpm_threads = 2; eventSet = fxSet0; break; case 60: num_events = 6; eventSet = fxSet00; break; case 61: num_events = 6; eventSet = fxSet01; break; case 7: num_events = 12; if (hpm_threads > 2) hpm_threads = 2; eventSet = fxSet1; break; case 70: num_events = 6; eventSet = fxSet10; break; case 71: num_events = 6; eventSet = fxSet11; break; case 81: num_events = 6; eventSet = l1pset0; break; case 82: num_events = 6; eventSet = l1pset1; break; case 83: num_events = 6; eventSet = l1pset2; break; default: break; } num_counters = num_events * hpm_threads; ppc_msync(); Upci_Mode_Init(&Mode[0], UPC_DISTRIB_MODE, UPC_CM_INDEP, 0); initialized = 1; ppc_msync(); } while ((initialized == 0) && (tid == 0)) { ; } if (tid == 0) { lock_status = 0; while (lock_status == 0) { lock_status = hpm_lock_acquire(); } core = cid; // initialize hardware counters // Upci_Mode_Init(&Mode[core], UPC_DISTRIB_MODE, UPC_CM_INDEP, core); 
Upci_Punit_Init(&Punit[core], &Mode[core], core); // UPC_L1p_SetMode(core, L1P_CFG_UPC_SWITCH); // use one thread per core to enable 24 different punit counters // add events to count, save hwthread in one of the reserved event handle slots k = 0; for (i=0; i<num_events; i++) { // hwthread 0 rc = Upci_Punit_AddEvent(&Punit[core], eventSet[i], 0, &eventHandle[core][k]); if (rc != 0) printf("failed to add event %d\n", eventSet[i]); if (pid == 0) counter_index[k] = eventSet[i]; eventHandle[core][k].rsv[0] = 0; k++; } if (hpm_threads > 1) { for (i=0; i<num_events; i++) { // hwthread 2 rc = Upci_Punit_AddEvent(&Punit[core], eventSet[i], 2, &eventHandle[core][k]); if (rc != 0) printf("failed to add event %d\n", eventSet[i]); if (pid == 0) counter_index[k] = eventSet[i]; eventHandle[core][k].rsv[0] = 2; k++; } } if (hpm_threads > 2) { for (i=0; i<num_events; i++) { // hwthread 1 rc = Upci_Punit_AddEvent(&Punit[core], eventSet[i], 1, &eventHandle[core][k]); if (rc != 0) printf("failed to add event %d\n", eventSet[i]); if (pid == 0) counter_index[k] = eventSet[i]; eventHandle[core][k].rsv[0] = 1; k++; } } if (hpm_threads > 3) { for (i=0; i<num_events; i++) { // hwthread 3 rc = Upci_Punit_AddEvent(&Punit[core], eventSet[i], 3, &eventHandle[core][k]); if (rc != 0) printf("failed to add event %d\n", eventSet[i]); if (pid == 0) counter_index[k] = eventSet[i]; eventHandle[core][k].rsv[0] = 3; k++; } } rc = Upci_Punit_Apply(&Punit[core]); if (rc != 0) printf("Upci_Punit_Apply failed\n"); Upci_Punit_Start(&Punit[core], (UPCI_CTL_RESET | UPCI_CTL_DELAY)); // printf("Initialised upc by core = %d\n", cid); // Upci_Punit_Dump(2, &Punit[core]); lock_val = 0; ppc_msync(); } if (pid == 0) { UPC_L2_EnableUPC(1, 1); UPC_L2_Start(); } // PMPI_Barrier(local_comm); L2_Barrier(&id_barrier, numthreads); return; }