/*
 * EMON_Print_t - have one designated thread report power through EMON.
 *
 * Every caller looks up its hardware thread id, its thread index and the
 * rank of its node on the card.  Only the caller with thread index 0 on the
 * node with rank 0 calls EMON_ReportPower(); a negative result prints an
 * error message and ends the test with exit code 1.
 *
 * numthreads is not used by this function.
 */
void EMON_Print_t(int numthreads)
{
  unsigned int hw_thread;   /* between 0 and 3  */
  unsigned int thread_idx;  /* between 0 and 67 */
  unsigned int core_idx;
  int card_rank;

  hw_thread  = PhysicalThreadID();
  thread_idx = PhysicalThreadIndex();
  core_idx   = thread_idx / 4;
  card_rank  = EMON_rank_on_card();

  /* computed for parity with the other entry points, not consulted below */
  (void)hw_thread;
  (void)core_idx;

  // printf("node id = %d\n", card_rank);

  /* the report is produced only by one thread of one node */
  if (thread_idx != 0 || card_rank != 0)
    return;

  {
    const double reported = EMON_ReportPower();

    if (reported < 0)
    {
      printf("ERROR : EMON_GetPower failed\n");
      test_exit(1);
    }
  }
}
/*===============================================================*/ void HPM_Stop(char * this_label) { int i, j, k; long long counter_value, tb; unsigned int tid, pid, cid; tid = PhysicalThreadID(); // between 0 and 3 pid = PhysicalThreadIndex(); // between 0 and 67 cid = pid/4; if (pid == 0) { if (code_block >= MAX_CODE_BLOCKS) return; tb = GetTimeBase(); j = index_from_label(this_label); for (k=0; k<MAX_CORES; k++) { if (coremask[k]) { for (i=0; i<num_counters; i++) { counter_value = Upci_Punit_Event_Read(&Punit[k], eventHandle[k][i]); counter_sum[k][j][i] += (counter_value - counter_in[k][j][i]); } } } L2_sum[j][0] += UPC_L2_ReadCtr(0, UPC_L2_CTR_HITS) - L2_in[j][0]; L2_sum[j][1] += UPC_L2_ReadCtr(0, UPC_L2_CTR_MISSES) - L2_in[j][1]; L2_sum[j][2] += UPC_L2_ReadCtr(0, UPC_L2_PREFETCH) - L2_in[j][2]; L2_sum[j][3] += UPC_L2_ReadCtr(0, UPC_L2_FETCH_LINE) - L2_in[j][3]; L2_sum[j][4] += UPC_L2_ReadCtr(0, UPC_L2_STORE_LINE) - L2_in[j][4]; L2_sum[j][5] += UPC_L2_ReadCtr(0, UPC_L2_STORE_PARTIAL) - L2_in[j][5]; block_stops[j] += 1; timebase_sum[j] += tb - timebase_in[j]; // printf("entered stop by %d\n", pid); #ifdef DEBUG Upci_A2PC_Val_t a2qry; Kernel_Upci_A2PC_GetRegs(&a2qry); Upci_A2PC_DumpRegs(&a2qry); UPC_P_Dump_State(Kernel_ProcessorCoreID()); Upci_Punit_Dump(0, &Punit[0]); UPC_C_Dump_State(); UPC_C_Dump_Counters(0,1); for(i=0; i<24; i++) { Upci_Punit_EventH_Dump(0, i, &eventHandle[0][i]); } #endif } return; }
/*================================================================*/ void HPM_Start(char * this_label) { int i, j, k; long long tb; unsigned int tid, pid, cid; tid = PhysicalThreadID(); // between 0 and 3 pid = PhysicalThreadIndex(); // between 0 and 67 cid = pid/4; if (pid == 0) { tb = GetTimeBase(); j = index_from_label(this_label); for (k=0; k<MAX_CORES; k++) { if (coremask[k]) { for (i=0; i<num_counters; i++) { counter_in[k][j][i] = Upci_Punit_Event_Read(&Punit[k], eventHandle[k][i]); } } } L2_in[j][0] = UPC_L2_ReadCtr(0, UPC_L2_CTR_HITS); L2_in[j][1] = UPC_L2_ReadCtr(0, UPC_L2_CTR_MISSES); L2_in[j][2] = UPC_L2_ReadCtr(0, UPC_L2_PREFETCH); L2_in[j][3] = UPC_L2_ReadCtr(0, UPC_L2_FETCH_LINE); L2_in[j][4] = UPC_L2_ReadCtr(0, UPC_L2_STORE_LINE); L2_in[j][5] = UPC_L2_ReadCtr(0, UPC_L2_STORE_PARTIAL); // printf("entered start by %d\n", pid); block_starts[j] += 1; timebase_in[j] = tb; } return; }
/*
 * doTimeSync - run numloops rounds of a global-interrupt timing exchange
 * and then clear the time base at a computed moment.
 *
 * The caller is treated as the root when the personality's primordial class
 * route has GlobIntUpPortOutputs == 0 and PhysicalThreadIndex() is 0.
 *
 * Each round:
 *   - the time base is cleared;
 *   - the root spins until the time base reaches the send time
 *     (1 * FreqMHz * 1000) and then calls activateGIPulseThread();
 *   - every caller samples the time base, takes the value returned by
 *     barrier_wait(), and spins until the time base passes that value
 *     plus 10000.
 *
 * Afterwards the caller spins until the time base reaches
 *   2 * FreqMHz * 1000 + (send time - last barrier value) - latencyFromRoot
 * and clears the time base once more.
 *
 * numloops: number of rounds; with zero or fewer rounds the last barrier
 *           value is taken as 0.
 */
static inline void doTimeSync(int numloops)
{
  Personality_t *pers = &FW_Personality;
  const int root_node =
      (pers->Network_Config.PrimordialClassRoute.GlobIntUpPortOutputs == 0) &&
      (PhysicalThreadIndex() == 0);
  const uint64_t pulse_time = 1 * pers->Kernel_Config.FreqMHz * 1000;
  uint64_t barrier_tb = 0;
  uint64_t hold_until;
  uint64_t sampled = 0;
  int pass = 0;

  while (pass < numloops)
  {
    Kernel_ClearTimeBase();

    if (root_node)
    {
      /* wait for the agreed send time before raising the pulse */
      while (Kernel_GetTimeBase() < pulse_time)
      {
      }
      activateGIPulseThread();
    }

    sampled = Kernel_GetTimeBase();
    barrier_tb = barrier_wait();

    hold_until = barrier_tb + 10000;
    while (Kernel_GetTimeBase() < hold_until)
    {
    }

    pass++;
  }

  /* sampled is written every round but never read */
  (void)sampled;

  {
    int64_t delta = pulse_time - barrier_tb;
    uint64_t cleartime = 2ULL * pers->Kernel_Config.FreqMHz * 1000 + delta - (uint64_t)pers->Network_Config.latencyFromRoot;

    while (Kernel_GetTimeBase() < cleartime)
    {
    }
  }

  Kernel_ClearTimeBase();
}
void EMON_Init_t(int numthreads) { unsigned int tid, pid, cid,node_id; tid = PhysicalThreadID(); // between 0 and 3 pid = PhysicalThreadIndex(); // between 0 and 67 cid = pid/4; node_id = EMON_rank_on_card(); // printf("node id = %d\n", node_id); // EMON set up is done only by one thread of one node if (pid == 0)// && (node_id == 0) ) { int rc = EMON_SetupPowerMeasurement(); if (rc) { printf("ERROR : EMON_SetupPowerMeasurement failed with rc=%d\n", rc); test_exit(rc); } } L2_Barrier(&id_barrier, numthreads); }
/*========================================================================*/ void HPM_Print_t(int numthreads) { int i, j, k, nblocks; // uint64_t counts, counts_0, counts_1, counts_2, counts_3; long long counts, counts_0, counts_1, counts_2, counts_3; // uint64_t l1_misses, l1p_misses, node_l1p_misses, node_l2_misses, loads, node_loads, node_l1_misses; // double cycles, ipc, stall_cycles, node_stall_cycles, node_cycles; // double node_fxu_instructions, node_fpu_instructions, node_l1_hits, node_l1p_hits, ddr_hit_fraction; // double fxu_fraction, fpu_fraction, fxu_instructions, fpu_instructions; // double cores_per_process, max_fraction, percent_max_issue_rate; // double l1_hits, l1_hit_fraction, l1p_hits, l1p_hit_fraction, node_l2_hits, l2_hit_fraction; // double ld_bytes_per_cycle, st_bytes_per_cycle; // uint64_t node_punit_counts[MAX_COUNTERS]; // long long node_timebase_sum; // int Ax, Bx, Cx, Dx, Ex; // Personality_t personality; // char filename[132]; // FILE * fp; unsigned int tid, pid, cid; int node_id; tid = PhysicalThreadID(); // between 0 and 3 pid = PhysicalThreadIndex(); // between 0 and 67 cid = pid/4; node_id = EMON_rank_on_card(); L2_Barrier(&id_barrier, numthreads); if ((pid != 0) || (node_id != 0)) return; set_labels(); set_aggregation_mask(); // sets a mask to aggregate by sum (0) or by max (1) if (code_block >= MAX_CODE_BLOCKS) nblocks = MAX_CODE_BLOCKS; else nblocks = code_block; // fp = stderr; // print counts for each thread and the aggregate for every core printf( "\n"); printf( "======================================================================\n"); printf( "Hardware counter report for BGQ - thread and core-specific values.\n"); printf( "======================================================================\n"); for (k=0; k<MAX_CORES; k++) { if (coremask[k]) { printf ("core %d\n", k); for (j=0; j<nblocks; j++) { if (block_starts[j] == block_stops[j]) { printf( "----------------------------------------------------------------\n"); printf( 
"%s, call count = %d, cycles = %lld :\n", code_block_label[j], block_starts[j], timebase_sum[j]); printf( " -- Processor counters (thread specific) --------------\n"); if (hpm_threads == 1) { for (i=0; i<num_events; i++) { counts = counter_sum[k][j][i]; printf( "%-d %14lld %s\n", hpm_group, counts, label[counter_index[i]]); } } else if (hpm_threads == 2) { printf( " thread0 counts thread2 counts net counts label\n"); for (i=0; i<num_events; i++) { counts_0 = counter_sum[k][j][i]; counts_2 = counter_sum[k][j][i+num_events]; if (mask[counter_index[i]]) { counts = (counts_0 > counts_2) ? counts_0 : counts_2; } else counts = counts_0 + counts_2; printf( "%-d %14lld %14lld %14lld %s\n", hpm_group, counts_0, counts_2, counts, label[counter_index[i]]); } } else if (hpm_threads == 3) { printf( " thread0 counts thread1 counts thread2 counts net counts label\n"); for (i=0; i<num_events; i++) { counts_0 = counter_sum[k][j][i]; counts_2 = counter_sum[k][j][i+num_events]; counts_1 = counter_sum[k][j][i+2*num_events]; if (mask[counter_index[i]]) { counts = (counts_0 > counts_2) ? counts_0 : counts_2; counts = (counts > counts_1) ? counts : counts_1; } else counts = counts_0 + counts_1 + counts_2; printf( "%-d %14lld %14lld %14lld %14lld %s\n", hpm_group, counts_0, counts_1, counts_2, counts, label[counter_index[i]]); } } else if (hpm_threads == 4) { printf( " thread0 counts thread1 counts thread2 counts thread3 counts net counts label\n"); for (i=0; i<num_events; i++) { counts_0 = counter_sum[k][j][i]; counts_2 = counter_sum[k][j][i+num_events]; counts_1 = counter_sum[k][j][i+2*num_events]; counts_3 = counter_sum[k][j][i+3*num_events]; if (mask[counter_index[i]]) { counts = (counts_0 > counts_2) ? counts_0 : counts_2; counts = (counts > counts_1) ? counts : counts_1; counts = (counts > counts_3) ? 
counts : counts_3; } else counts = counts_0 + counts_1 + counts_2 + counts_3; printf( "%-d %14lld %14lld %14lld %14lld %14lld %s\n", hpm_group, counts_0, counts_1, counts_2, counts_3, counts, label[counter_index[i]]); } } printf( " -- L2 counters (shared for the node) -----------------\n"); printf( "%-d %14lld L2 Hits\n", 100, L2_sum[j][0]); printf( "%-d %14lld L2 Misses\n", 100, L2_sum[j][1]); printf( "%-d %14lld L2 lines prefetched\n", 100, L2_sum[j][2]); printf( "%-d %14lld L2 lines loaded from memory\n", 100, L2_sum[j][3]); printf( "%-d %14lld L2 full lines stored to mem\n", 100, L2_sum[j][4]); printf( "%-d %14lld L2 partial lines stored to mem\n", 100, L2_sum[j][5]); printf( "\n"); } else { printf( "mismatch in starts/stops for code block '%s'\n", code_block_label[j]); printf( " starts = %d\n", block_starts[j]); printf( " stops = %d\n", block_stops[j]); } } printf( "\n"); } } return; }
/*=================================================================*/ void HPM_Init_t(int numthreads) { int i, j, k, core; // int threads_per_core; // int * eventSet; char * ptr; unsigned int tid, pid, cid; unsigned int lock_status; int rc; // Upci_Mode_t Mode; tid = PhysicalThreadID(); // between 0 and 3 pid = PhysicalThreadIndex(); // between 0 and 67 cid = pid/4; if (pid == 0) { // set the initial cumulative counter values to zero for (k=0; k<MAX_CORES; k++) for (j=0; j<MAX_CODE_BLOCKS; j++) for (i=0; i<MAX_COUNTERS; i++) counter_sum[k][j][i] = 0LL; for (j=0; j<MAX_CODE_BLOCKS; j++) timebase_sum[j] = 0LL; for (j=0; j<MAX_CODE_BLOCKS; j++) for (i=0; i<6; i++) L2_sum[j][i] = 0LL; // keep track of code block starts and stops for (j=0; j<MAX_CODE_BLOCKS; j++) { block_starts[j] = 0; block_stops[j] = 0; } // set mask used for thread and core aggregation for (i=0; i<MAX_EVENTS; i++) mask[i] = 0; // check env variables // fixme ptr = fwext_getenv("HPM_GROUP"); if (ptr == NULL) { hpm_group = 0; } else hpm_group = hpm_atoi(ptr); // printf("hpm_group = %d\n", hpm_group); // hpm_group = 82; if (hpm_group < -1) hpm_group = 0; if (hpm_group > 99) hpm_group = 0; // fixme // ptr = fwext_getenv("HPM_SCOPE"); if (pid !=0) return; // if (ptr != NULL) { // if (strncasecmp(ptr,"process", 7) == 0) process_scope = 1; // if (strncasecmp(ptr,"node", 4) == 0) node_scope = 1; // } // fixme // ptr = fwext_getenv("HPM_METRICS"); // if (ptr != NULL) { // if (strncasecmp(ptr,"yes", 3) == 0) derived_metrics = 1; // } for (i=0; i<MAX_CORES; i++) coremask[i] = 1; // find the number of cores used by this process // fixme // numcores = 0; // for (i=0; i<MAX_CORES; i++) numcores += coremask[i]; numcores = 17; // determine the number of threads per core // numthreads = BgGetNumThreads(); // numthreads = 68; // threads_per_core = numthreads / numcores; // hpm_threads = threads_per_core; // fixme hpm_threads = 4; // optionally reset the number of threads per core that will be counted // fixme // ptr = 
fwext_getenv("HPM_THREADS"); // if (ptr != NULL) { // hpm_threads = fwext_atoi(ptr); // if (hpm_threads < 1) hpm_threads = 1; // if (hpm_threads > 4) hpm_threads = 4; // } // set num_events and num_counters based on hpm_group and hpm_threads switch (hpm_group) { case -1: num_events = 6; eventSet = exptSet; break; case 0: num_events = 6; eventSet = mySet; break; case 1: num_events = 12; if (hpm_threads > 2) hpm_threads = 2; eventSet = ldSet; break; case 2: num_events = 24; if (hpm_threads > 1) hpm_threads = 1; eventSet = fpuSet; break; case 3: num_events = 12; if (hpm_threads > 2) hpm_threads = 2; eventSet = fpSet0; break; case 30: num_events = 6; eventSet = fpSet00; break; case 31: num_events = 6; eventSet = fpSet01; break; case 4: num_events = 12; if (hpm_threads > 2) hpm_threads = 2; eventSet = fpSet1; break; case 40: num_events = 6; eventSet = fpSet10; break; case 41: num_events = 6; eventSet = fpSet11; break; case 5: num_events = 24; if (hpm_threads > 1) hpm_threads = 1; eventSet = fxuSet; break; case 6: num_events = 12; if (hpm_threads > 2) hpm_threads = 2; eventSet = fxSet0; break; case 60: num_events = 6; eventSet = fxSet00; break; case 61: num_events = 6; eventSet = fxSet01; break; case 7: num_events = 12; if (hpm_threads > 2) hpm_threads = 2; eventSet = fxSet1; break; case 70: num_events = 6; eventSet = fxSet10; break; case 71: num_events = 6; eventSet = fxSet11; break; case 81: num_events = 6; eventSet = l1pset0; break; case 82: num_events = 6; eventSet = l1pset1; break; case 83: num_events = 6; eventSet = l1pset2; break; default: break; } num_counters = num_events * hpm_threads; ppc_msync(); Upci_Mode_Init(&Mode[0], UPC_DISTRIB_MODE, UPC_CM_INDEP, 0); initialized = 1; ppc_msync(); } while ((initialized == 0) && (tid == 0)) { ; } if (tid == 0) { lock_status = 0; while (lock_status == 0) { lock_status = hpm_lock_acquire(); } core = cid; // initialize hardware counters // Upci_Mode_Init(&Mode[core], UPC_DISTRIB_MODE, UPC_CM_INDEP, core); 
Upci_Punit_Init(&Punit[core], &Mode[core], core); // UPC_L1p_SetMode(core, L1P_CFG_UPC_SWITCH); // use one thread per core to enable 24 different punit counters // add events to count, save hwthread in one of the reserved event handle slots k = 0; for (i=0; i<num_events; i++) { // hwthread 0 rc = Upci_Punit_AddEvent(&Punit[core], eventSet[i], 0, &eventHandle[core][k]); if (rc != 0) printf("failed to add event %d\n", eventSet[i]); if (pid == 0) counter_index[k] = eventSet[i]; eventHandle[core][k].rsv[0] = 0; k++; } if (hpm_threads > 1) { for (i=0; i<num_events; i++) { // hwthread 2 rc = Upci_Punit_AddEvent(&Punit[core], eventSet[i], 2, &eventHandle[core][k]); if (rc != 0) printf("failed to add event %d\n", eventSet[i]); if (pid == 0) counter_index[k] = eventSet[i]; eventHandle[core][k].rsv[0] = 2; k++; } } if (hpm_threads > 2) { for (i=0; i<num_events; i++) { // hwthread 1 rc = Upci_Punit_AddEvent(&Punit[core], eventSet[i], 1, &eventHandle[core][k]); if (rc != 0) printf("failed to add event %d\n", eventSet[i]); if (pid == 0) counter_index[k] = eventSet[i]; eventHandle[core][k].rsv[0] = 1; k++; } } if (hpm_threads > 3) { for (i=0; i<num_events; i++) { // hwthread 3 rc = Upci_Punit_AddEvent(&Punit[core], eventSet[i], 3, &eventHandle[core][k]); if (rc != 0) printf("failed to add event %d\n", eventSet[i]); if (pid == 0) counter_index[k] = eventSet[i]; eventHandle[core][k].rsv[0] = 3; k++; } } rc = Upci_Punit_Apply(&Punit[core]); if (rc != 0) printf("Upci_Punit_Apply failed\n"); Upci_Punit_Start(&Punit[core], (UPCI_CTL_RESET | UPCI_CTL_DELAY)); // printf("Initialised upc by core = %d\n", cid); // Upci_Punit_Dump(2, &Punit[core]); lock_val = 0; ppc_msync(); } if (pid == 0) { UPC_L2_EnableUPC(1, 1); UPC_L2_Start(); } // PMPI_Barrier(local_comm); L2_Barrier(&id_barrier, numthreads); return; }
/*
 * test_main / main - "Torus Remote Get Atomic Test".
 *
 * What the visible code does, in order:
 *   - firmware-extension entry (test_main): callers whose
 *     PhysicalThreadIndex() is greater than 0 exit at once; the personality
 *     is fetched, is_mambo is set when PERS_ENABLE_Mambo is configured, and
 *     the MU system and user ranges are opened to 0 .. ~0;
 *   - allocates the remote-get payload buffer and the direct-put source
 *     word (initialised to ATOMIC_COUNTER_INITIAL_VALUE) and creates a
 *     memory region for each;
 *   - derives an MU atomic load-increment address for the source word;
 *   - allocates a receive area (reception counter + receive buffer, the
 *     buffer's first word set to RECEIVE_BUFFER_INITIAL_VALUE), creates its
 *     memory region, installs its base physical address in base address
 *     table slot 0 and derives atomic store-add offsets for the counter and
 *     the buffer;
 *   - builds a direct-put descriptor from those values and copies it into
 *     the remote-get payload, then builds the remote-get descriptor
 *     (destination 0,0,0,0,0; remote-get injection FIFO id 1);
 *   - allocates, initialises and activates injection FIFOs 0 (normal) and
 *     1 (remote get);
 *   - sets the reception counter to MESSAGE_SIZE_DIRECT_PUT, injects the
 *     remote-get descriptor into FIFO 0 and spins until the counter reads 0;
 *   - passes only if the first word of the receive buffer equals
 *     ATOMIC_COUNTER_INITIAL_VALUE + RECEIVE_BUFFER_INITIAL_VALUE and the
 *     source word equals ATOMIC_COUNTER_INITIAL_VALUE + 1;
 *   - under __FWEXT__ and when not on mambo, runs fw_nd_term_check and
 *     fw_mu_term_check, then exits with 0.
 * Every failing step prints a message and exits with status 1 (or with rc
 * for the reset-release calls).
 *
 * NOTE(review): the line breaks of this function have been lost.  Each "//"
 * comment and each "#" directive below must be put back on its own line
 * before this can compile.
 * NOTE(review): the "#else ... #endif" pair around "int main" has no
 * opening conditional in view - presumably "#ifdef __FWEXT__" preceded
 * "int test_main"; confirm against the original file.
 * NOTE(review): it cannot be told from here whether "// #if 0",
 * "// #endif // if 0" and "// #if 1" are commented-out directives or an
 * empty "//" line followed by a live directive; the live "#endif" after the
 * "counter is now ZERO" printf suggests the latter for "#if 1".  Confirm
 * before reformatting, because the answer decides whether the ND/MU
 * reset-release calls are compiled in.
 * NOTE(review): the two malloc() results for the message buffers are used
 * without a NULL check, and memcpy copies message_size_in_bytes_remote_get
 * bytes out of mu_iDirectPutDescriptor - confirm that size does not exceed
 * the descriptor's size.
 */
int test_main ( void ) { if (PhysicalThreadIndex() > 0) // run a single core test. test_exit(0); int rc=0; printf("Torus Remote Get Atomic Test\n"); // Perform initialization of the network and mu Personality_t *pers; pers = fwext_getPersonality(); uint64_t p1 = pers->Kernel_Config.NodeConfig & PERS_ENABLE_Mambo; if (p1) is_mambo = 1; // ND and MU init is done in firmware, but we disable it in svchost and // call it directly here because it performs much better. // #if 0 fw_nd_set_verbose(0); // if 1, prints all dcr commands, don't use on cycle sim // on cycle sim, can have DcrMonitory trace DCR commands rc = fw_nd_reset_release(pers); if(rc) { TRACE(("fw_nd_reset_release failed with rc=%d\n",rc)); test_exit (rc); } fw_mu_set_verbose(0); // if 1, prints all dcr commands, don't use on cycle sim // on cycle sim, can have DcrMonitory trace DCR commands rc = fw_mu_reset_release(pers); if(rc) { printf("fw_mu_reset_release failed with rc=%d\n",rc); test_exit (rc); } // #endif // if 0 uint64_t max_value = ~0; fw_mu_set_sys_range(0, /* range_id */ 0, /* min_value */ max_value); fw_mu_set_usr_range(0, /* range_id */ 0, /* min_value */ max_value); /* fw_mu_set_imfifo_rget (1, 1); */ /* fw_mu_set_imfifo_system (1, 0); */ TRACE(("Network and MU Initialization is complete\n")); #else int main(int argc, char **argv) { int rc; #endif uint i = 0; // Destination for Remote Get packet MUHWI_Destination_t dest; MUSPI_SetUpDestination ( &dest, 0, 0, 0, 0, 0 ); MUSPI_InjFifoSubGroup_t fifo_subgroup; uint64 message_size_in_bytes_remote_get = MESSAGE_SIZE_REMOTE_GET; uint64 message_size_in_bytes_direct_put = MESSAGE_SIZE_DIRECT_PUT; TRACE(("main(): Injection Memory FIFO (0,0,0), Send Remote Get Message with Atomic Increment\n")); //#ifdef PRINT_DEBUG_MESSAGES printf("Start!\n"); //#endif // ------------------------------------------------------ // allocates area for message_sent_remote_get[] buffer (RemoteGet) // ------------------------------------------------------ uint64 
*message_sent_remote_get = (uint64 *)malloc(message_size_in_bytes_remote_get); uint64 *message_sent_direct_put = (uint64 *)malloc(message_size_in_bytes_direct_put); TRACE(("message_sent_remote_get (address) = %p\n", message_sent_remote_get)); TRACE(("message_size_in_bytes_remote_get = %lld\n", message_size_in_bytes_remote_get)); TRACE(("message_sent_direct_put (address) = %p\n", message_sent_direct_put)); TRACE(("message_size_in_bytes_direct_put = %lld\n", message_size_in_bytes_direct_put)); // Initializes the message_sent_remote_get[] buffer for (i=0; i<message_size_in_bytes_remote_get/8; i++) message_sent_remote_get[i] = 0x00ull; // 8-bytes Kernel_MemoryRegion_t mregionSentRemoteGet; rc = Kernel_CreateMemoryRegion ( &mregionSentRemoteGet, message_sent_remote_get, message_size_in_bytes_remote_get ); if ( rc != 0) { printf("Kernel_CreateMemoryRegion failed for message_sent_remote_get with rc=%d\n",rc); #ifdef __FWEXT__ test_exit(1); #else exit(1); #endif } // Initializes the message_sent[] buffer *message_sent_direct_put = (uint64)ATOMIC_COUNTER_INITIAL_VALUE; // 8-bytes uint64_t expected_counter_value = ATOMIC_COUNTER_INITIAL_VALUE + RECEIVE_BUFFER_INITIAL_VALUE; Kernel_MemoryRegion_t mregionSentDirectPut; rc = Kernel_CreateMemoryRegion ( &mregionSentDirectPut, message_sent_direct_put, message_size_in_bytes_direct_put ); if ( rc != 0) { printf("Kernel_CreateMemoryRegion failed for message_sent_direct_put with rc=%d\n",rc); #ifdef __FWEXT__ test_exit(1); #else exit(1); #endif } // Get an atomic address for the message_sent buffer. 
uint64_t message_sent_atomic_address = MUSPI_GetAtomicAddress ( (uint64_t)message_sent_direct_put - (uint64_t)mregionSentDirectPut.BaseVa + (uint64_t)mregionSentDirectPut.BasePa, MUHWI_ATOMIC_OPCODE_LOAD_INCREMENT ); TRACE(("message_sent_direct_put (atomic address) = 0x%llx\n", (long long unsigned int)message_sent_atomic_address)); ///////////////////////////////////////////////// typedef struct recvArea { volatile uint64 counter; unsigned char recvBuffer[MESSAGE_SIZE_DIRECT_PUT]; } recvArea_t; // Allocate space for the reception counter and the receive buffer recvArea_t *recvAreaPtr = (recvArea_t*)malloc ( sizeof(recvArea_t) ); if ( !recvAreaPtr ) { printf("Allocating recvArea failed\n"); #ifdef __FWEXT__ test_exit(1); #else exit(1); #endif } volatile uint64 *counterAddress = (volatile uint64*)&(recvAreaPtr->counter); unsigned char *recvBufferAddress = (unsigned char *)&(recvAreaPtr->recvBuffer[0]); *((uint64*)recvBufferAddress) = RECEIVE_BUFFER_INITIAL_VALUE; // Get a memory region for the recvArea. Kernel_MemoryRegion_t recvAreaMemRegion; rc = Kernel_CreateMemoryRegion ( &recvAreaMemRegion, recvAreaPtr, sizeof(recvArea_t) ); if ( rc != 0) { printf("Kernel_CreateMemoryRegion failed for recvAreaMemRegion with rc=%d\n",rc); #ifdef __FWEXT__ test_exit(1); #else exit(1); #endif } // Calculate the offsets of the counter and receive buffer from the base address. 
uint64_t recvAreaBasePA = (uint64_t)recvAreaMemRegion.BasePa; uint64_t counterOffset = (uint64_t)counterAddress - (uint64_t)recvAreaMemRegion.BaseVa; uint64_t recvBufferOffset = (uint64_t)recvBufferAddress - (uint64_t)recvAreaMemRegion.BaseVa; TRACE(("counterAddress=%p, recvBufferAddress=%p, recvAreaBasePA=0x%llx, counterOffset=0x%llx, recvBufferOffset=0x%llx\n",counterAddress, recvBufferAddress, (long long unsigned int)recvAreaBasePA, (long long unsigned int)counterOffset, (long long unsigned int)recvBufferOffset)); ////////////////////////////////////////////////////////////// // Initialize base address table and atomic counter info ////////////////////////////////////////////////////////////// /* Set up the base address table */ uint32_t batids[1] = {0}; MUSPI_BaseAddressTableSubGroup_t bat; rc = Kernel_AllocateBaseAddressTable ( 0, &bat, 1, batids, 0 /* "User" use */ ); if (rc != 0) { printf("Kernel_AllocateBaseAddressTable failed with rc=%d\n",rc); #ifdef __FWEXT__ test_exit(1); #else exit(1); #endif } rc = MUSPI_SetBaseAddress ( &bat, 0, (uint64_t)recvAreaMemRegion.BasePa ); if (rc != 0) { printf("MUSPI_SetBaseAddress failed with rc=%d\n",rc); #ifdef __FWEXT__ test_exit(1); #else exit(1); #endif } TRACE(("Set BaseAddressTable entry slot 0 to 0x%llx\n", (long long unsigned int)recvAreaMemRegion.BasePa)); uint64_t muAtomicCounterOffset = MUSPI_GetAtomicOffsetFromBaseAddress ( &bat, 0, recvAreaBasePA + counterOffset, MUHWI_ATOMIC_OPCODE_STORE_ADD ); uint64_t muAtomicRecvBufferOffset = MUSPI_GetAtomicOffsetFromBaseAddress ( &bat, 0, recvAreaBasePA + recvBufferOffset, MUHWI_ATOMIC_OPCODE_STORE_ADD ); TRACE(("main(): recvCounterVa=%p, recvAreaBasePA=0x%llx, muAtomicCounterOffset=0x%llx, muAtomicRecvBufferOffset=0x%llx\n", &(recvAreaPtr->counter), (long long unsigned int)recvAreaBasePA, (long long unsigned int)muAtomicCounterOffset, (long long unsigned int)muAtomicRecvBufferOffset)); ////////////////////////////////////////////////////////////// // Create a 
DirectPut Descriptor and copy it into the // message payload ////////////////////////////////////////////////////////////// TRACE(("main(): Configures direct put descriptor\n")); MUSPI_Pt2PtDirectPutDescriptorInfo_t mu_iDirectPutDescriptorInfo; mu_iDirectPutDescriptorInfo.Base.Pre_Fetch_Only = MUHWI_DESCRIPTOR_PRE_FETCH_ONLY_NO; mu_iDirectPutDescriptorInfo.Base.Payload_Address = message_sent_atomic_address; mu_iDirectPutDescriptorInfo.Base.Message_Length = message_size_in_bytes_direct_put; mu_iDirectPutDescriptorInfo.Base.Torus_FIFO_Map = MUHWI_DESCRIPTOR_TORUS_FIFO_MAP_AP; mu_iDirectPutDescriptorInfo.Base.Dest = dest; mu_iDirectPutDescriptorInfo.Pt2Pt.Hints_ABCD = MUHWI_PACKET_HINT_AP; mu_iDirectPutDescriptorInfo.Pt2Pt.Misc1 = MUHWI_PACKET_HINT_E_NONE | MUHWI_PACKET_DO_NOT_ROUTE_TO_IO_NODE | MUHWI_PACKET_USE_DETERMINISTIC_ROUTING | MUHWI_PACKET_DO_NOT_DEPOSIT; mu_iDirectPutDescriptorInfo.Pt2Pt.Misc2 = MUHWI_PACKET_VIRTUAL_CHANNEL_DETERMINISTIC; mu_iDirectPutDescriptorInfo.Pt2Pt.Skip = 0; mu_iDirectPutDescriptorInfo.DirectPut.Rec_Payload_Base_Address_Id = 0; mu_iDirectPutDescriptorInfo.DirectPut.Rec_Payload_Offset = muAtomicRecvBufferOffset; mu_iDirectPutDescriptorInfo.DirectPut.Rec_Counter_Base_Address_Id = 0; mu_iDirectPutDescriptorInfo.DirectPut.Rec_Counter_Offset = muAtomicCounterOffset; mu_iDirectPutDescriptorInfo.DirectPut.Pacing = MUHWI_PACKET_DIRECT_PUT_IS_NOT_PACED; rc = MUSPI_CreatePt2PtDirectPutDescriptor( &mu_iDirectPutDescriptor, &mu_iDirectPutDescriptorInfo ); if (rc != 0) { printf("MUSPI_CreatePt2PtDirectPutDescriptor failed with rc=%d\n",rc); #ifdef __FWEXT__ test_exit(1); #else exit(1); #endif } //MUSPI_DescriptorDumpHex("Direct Put Descriptor", // &mu_iDirectPutDescriptor); // Copy Descriptor into RemoteGet message payload memcpy((char *)((void *)message_sent_remote_get), (char *)((void *)(&mu_iDirectPutDescriptor)), message_size_in_bytes_remote_get); ///////////////////////////////////////////////////////////// // RemoteGet message // Create a 
remote get descriptor ///////////////////////////////////////////////////////////// TRACE(("main(): Configures remote get descriptor\n")); MUSPI_Pt2PtRemoteGetDescriptorInfo_t mu_iRemoteGetDescriptorInfo; mu_iRemoteGetDescriptorInfo.Base.Pre_Fetch_Only = MUHWI_DESCRIPTOR_PRE_FETCH_ONLY_NO; mu_iRemoteGetDescriptorInfo.Base.Payload_Address = (uint64_t)message_sent_remote_get - (uint64_t)mregionSentRemoteGet.BaseVa + (uint64_t)mregionSentRemoteGet.BasePa; mu_iRemoteGetDescriptorInfo.Base.Message_Length = message_size_in_bytes_remote_get; mu_iRemoteGetDescriptorInfo.Base.Torus_FIFO_Map = MUHWI_DESCRIPTOR_TORUS_FIFO_MAP_AP; mu_iRemoteGetDescriptorInfo.Base.Dest = dest; mu_iRemoteGetDescriptorInfo.Pt2Pt.Hints_ABCD = MUHWI_PACKET_HINT_AP; mu_iRemoteGetDescriptorInfo.Pt2Pt.Misc1 = MUHWI_PACKET_HINT_E_NONE | MUHWI_PACKET_DO_NOT_ROUTE_TO_IO_NODE | MUHWI_PACKET_USE_DETERMINISTIC_ROUTING | MUHWI_PACKET_DO_NOT_DEPOSIT; mu_iRemoteGetDescriptorInfo.Pt2Pt.Misc2 = MUHWI_PACKET_VIRTUAL_CHANNEL_DETERMINISTIC; mu_iRemoteGetDescriptorInfo.Pt2Pt.Skip = 0; mu_iRemoteGetDescriptorInfo.RemoteGet.Type = MUHWI_PACKET_TYPE_GET; mu_iRemoteGetDescriptorInfo.RemoteGet.Rget_Inj_FIFO_Id = 1; // Fifo 1 is for remote get use // Prepares Injection Memory FIFO Descriptor (RemoteGet) rc = MUSPI_CreatePt2PtRemoteGetDescriptor( &mu_iRemoteGetDescriptor, &mu_iRemoteGetDescriptorInfo ); if (rc != 0) { printf("MUSPI_CreatePt2PtRemoteGetDescriptor failed with rc=%d\n",rc); #ifdef __FWEXT__ test_exit(1); #else exit(1); #endif } // MUSPI_DescriptorDumpHex("Remote Get Descriptor", // &mu_iRemoteGetDescriptor); ///////////////////////////////////////////////////////////////// // Configures Injection Memory FIFO Registers // - fifo 0 that the core injects descriptors into // - fifo 1 that the MU injects remote get payload into ///////////////////////////////////////////////////////////////// TRACE(("main(): Configures Injection Memory FIFO Registers\n")); void *injMemoryFifoPtr, *memoryForInjMemoryFifoPtr; rc = 
malloc_memalign ( &memoryForInjMemoryFifoPtr, &injMemoryFifoPtr, 64, INJ_MEMORY_FIFO_SIZE+1 ); if (rc) { printf("inj_memory_fifo malloc failed with rc=%d\n",rc); #ifdef __FWEXT__ test_exit(1); #else exit(1); #endif } void *rgetMemoryFifoPtr, *memoryForRgetMemoryFifoPtr; rc = malloc_memalign ( &memoryForRgetMemoryFifoPtr, &rgetMemoryFifoPtr, 64, INJ_MEMORY_FIFO_SIZE+1 ); if (rc) { printf("rget_memory_fifo malloc failed with rc=%d\n",rc); #ifdef __FWEXT__ test_exit(1); #else exit(1); #endif } uint32_t fifoid[2] = { 0, 1 }; Kernel_InjFifoAttributes_t injFifoAttrs[2]; injFifoAttrs[0].RemoteGet = 0; injFifoAttrs[0].System = 0; injFifoAttrs[1].RemoteGet = 1; injFifoAttrs[1].System = 0; rc = Kernel_AllocateInjFifos (0, &fifo_subgroup, 2, fifoid, injFifoAttrs); if ( rc != 0) { printf("Kernel_AllocateInjFifos failed with rc=%d\n",rc); #ifdef __FWEXT__ test_exit(1); #else exit(1); #endif } Kernel_MemoryRegion_t mregionInj; rc = Kernel_CreateMemoryRegion ( &mregionInj, injMemoryFifoPtr, INJ_MEMORY_FIFO_SIZE + 1 ); if ( rc != 0) { printf("Kernel_CreateMemoryRegion failed for injMemoryFifoPtr with rc=%d\n",rc); #ifdef __FWEXT__ test_exit(1); #else exit(1); #endif } Kernel_MemoryRegion_t mregionRget; rc = Kernel_CreateMemoryRegion ( &mregionRget, rgetMemoryFifoPtr, INJ_MEMORY_FIFO_SIZE + 1 ); if ( rc != 0) { printf("Kernel_CreateMemoryRegion failed for rgetMemoryFifoPtr with rc=%d\n",rc); #ifdef __FWEXT__ test_exit(1); #else exit(1); #endif } rc = Kernel_InjFifoInit (&fifo_subgroup, fifoid[0], &mregionInj, (uint64_t)injMemoryFifoPtr - (uint64_t)mregionInj.BaseVa, INJ_MEMORY_FIFO_SIZE); if (rc != 0) { printf("Kernel_InjFifoInit Inj failed with rc=%d, errno=%d\n",rc,errno); #ifdef __FWEXT__ test_exit(1); #else exit(1); #endif } rc = Kernel_InjFifoInit (&fifo_subgroup, fifoid[1], &mregionRget, (uint64_t)rgetMemoryFifoPtr - (uint64_t)mregionRget.BaseVa, INJ_MEMORY_FIFO_SIZE); if (rc != 0) { printf("Kernel_InjFifoInit Rget failed with rc=%d, errno=%d\n",rc,errno); #ifdef __FWEXT__ 
test_exit(1); #else exit(1); #endif } rc = Kernel_InjFifoActivate (&fifo_subgroup, 2, fifoid, KERNEL_INJ_FIFO_ACTIVATE); if (rc != 0) { printf("Kernel_InjFifoActivate Inj failed with rc=%d, errno=%d\n",rc,errno); #ifdef __FWEXT__ test_exit(1); #else exit(1); #endif } // --------------------------------------------- // Reception Side // --------------------------------------------- /* *data_counter_base_address = REC_PAYLOAD_BASE_ADDRESS; */ /* printf("data_counter_base_address = %p\n", data_counter_base_address); */ // Loop, sending the remote get, waiting for the reception counter to hit zero, // and verifying the received counter's value. // for (i=0; i<num_iterations; i++) /** disable loop **/ { // Let's initialize the Counter for corresponding Counter Id // Note: counter is initialized with the message size // updates counter with number of bytes sent *counterAddress = MESSAGE_SIZE_DIRECT_PUT; // ----------------------------------------------------------- // Processor Advances Tail pointer - Descriptor is 64-bytes // MU should Inject (RemoteGet) message into the Torus // ----------------------------------------------------------- // Let's Inject the (RemoteGet) Descriptor into the Injection Memory FIFO #if 1 printf("main(): Inject Descriptor into Injection Memory FIFO\n"); #endif rc = MUSPI_InjFifoInject (MUSPI_IdToInjFifo(fifoid[0], &fifo_subgroup), (void *)(&mu_iRemoteGetDescriptor) ); if (rc < 0) // Should have injected 1 descriptor { printf("MUSPI_InjFifoInject failed with rc=%d\n",rc); #ifdef __FWEXT__ test_exit(1); #else exit(1); #endif } #ifndef __FWEXT__ printf("main(): Successful injection of remote get descriptor\n"); #endif // ////////////////////////////////////////////////// // Reception side, check counter value // ////////////////////////////////////////////////// uint64 volatile counter_value; // wait for the counter to reach ZERO while (1) { counter_value = *counterAddress; if (counter_value == 0) { // #if 1 printf("counter is now ZERO 
!!!!\n"); #endif break; } } _bgq_msync(); // Ensure data is available to all cores. // Let's print the Received Message contents //put_offset = (uint64)mu_pktHdrDirectPut.Put_Offset_LSB; #ifndef __FWEXT__ printf("recvBufferAddress = %p\n", recvBufferAddress); printf("---Prints Received Message contents\n"); Print_Message((unsigned char *)recvBufferAddress, message_size_in_bytes_direct_put); printf("---Where Received Message is being stored: recvBufferAddress = %p\n", recvBufferAddress); printf("---Checks Received Message contents(size = %lld)\n", message_size_in_bytes_direct_put); #endif uint64_t receivedCounterValue = *((uint64_t*)recvBufferAddress); if ( receivedCounterValue == expected_counter_value ) { printf("---Received Counter Value = %llu\n", (long long unsigned int)receivedCounterValue); } else { printf("ERROR: Received Counter Value = %llu, expected %llu\n", (long long unsigned int)receivedCounterValue, (long long unsigned int)expected_counter_value); #ifdef __FWEXT__ test_exit(1); #else exit(1); #endif } if ( *message_sent_direct_put == ATOMIC_COUNTER_INITIAL_VALUE+1 ) { printf("---Sent Counter Value = %llu\n", (long long unsigned int)*message_sent_direct_put); } else { printf("ERROR: Sent Counter Value = %llu, expected %llu\n", (long long unsigned int)*message_sent_direct_put, (long long unsigned int)(ATOMIC_COUNTER_INITIAL_VALUE+1)); #ifdef __FWEXT__ test_exit(1); #else exit(1); #endif } } //printf("All counter values passed\n"); #ifdef __FWEXT__ if ( is_mambo == 0 ) // Termination checks don't work in mambo. ErrInt DCRs are not zero. { rc = fw_nd_term_check(pers); if (rc) { printf("ERROR: fw_nd_term_check failed with rc=%d\n",rc); test_exit(1); } rc = fw_mu_term_check(pers); if (rc) { printf("ERROR: fw_mu_term_check failed with rc=%d\n",rc); test_exit(1); } } #endif printf("Done!\n"); #ifdef __FWEXT__ test_exit (0); #endif return 0; }