/* internal function */
void update_progress_thread (struct fi_bgq_progress * progress)
{
	struct l2atomic_fifo_consumer * consumer = &progress->consumer;
	uint64_t value_rsh3b = 0;

	/* Check if another endpoint should be managed by this progress thread */
	if (l2atomic_fifo_consume(consumer, &value_rsh3b) == 0) {
		struct fi_bgq_ep *bgq_ep = (struct fi_bgq_ep *)(value_rsh3b << 3);

		assert(L2_AtomicLoad(&bgq_ep->async.enabled) != 0);
		assert(L2_AtomicLoad(&bgq_ep->async.active) == 0);

		progress->all_ep[(progress->all_ep_count)++] = bgq_ep;
		if (bgq_ep->rx.caps & FI_TAGGED) {
			progress->tag_ep[(progress->tag_ep_count)++] = bgq_ep;
		}
		if (bgq_ep->rx.caps & FI_MSG) {
			progress->msg_ep[(progress->msg_ep_count)++] = bgq_ep;
		}

		L2_AtomicStore(&bgq_ep->async.active, 1);
	}

	/*
	 * Advance the control code path for each endpoint once, and check
	 * whether async progress has been disabled on any endpoint
	 */
	unsigned i = 0;
	while (i < progress->all_ep_count) {
		struct fi_bgq_ep *bgq_ep = progress->all_ep[i];
		poll_cfifo(bgq_ep, 0);

		if (L2_AtomicLoad(&bgq_ep->async.enabled) == 0) {
			L2_AtomicStore(&bgq_ep->async.active, 0);

			/* remove the endpoint from each list via swap-with-last */
			if (bgq_ep->rx.caps & FI_MSG) {
				unsigned n = 0;
				while (progress->msg_ep[n] != bgq_ep) ++n;
				progress->msg_ep[n] = progress->msg_ep[--(progress->msg_ep_count)];
			}
			if (bgq_ep->rx.caps & FI_TAGGED) {
				unsigned n = 0;
				while (progress->tag_ep[n] != bgq_ep) ++n;
				progress->tag_ep[n] = progress->tag_ep[--(progress->tag_ep_count)];
			}
			progress->all_ep[i] = progress->all_ep[--(progress->all_ep_count)];
		} else {
			++i;
		}
	}

	return;
}
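/*
 * For context, a minimal sketch of the producer side of this handoff,
 * assuming the l2atomic fifo API used above: endpoint pointers are 8-byte
 * aligned, so they travel through the 64-bit fifo right-shifted by 3 bits
 * (the "rsh3b" encoding that update_progress_thread() reverses with the
 * "<< 3"). The function name is hypothetical; this is illustrative, not
 * the provider's actual enqueue path.
 */
static inline int
sketch_hand_off_endpoint (struct fi_bgq_progress * progress,
			  struct fi_bgq_ep * bgq_ep)
{
	/* mark the endpoint as wanting async progress before enqueueing */
	L2_AtomicStore(&bgq_ep->async.enabled, 1);

	/* an 8-byte-aligned pointer fits in the fifo after a 3-bit shift */
	uint64_t value_rsh3b = ((uint64_t)bgq_ep) >> 3;

	/* returns 0 on success; non-zero if the fifo is disabled or full */
	return l2atomic_fifo_produce(&progress->producer, value_rsh3b);
}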
/*!
 * \brief Initializes speculation registers before the start of a job.
 */
int Speculation_Init()
{
    int slice;
    uint64_t scrub_rate;
    L2C_SPECID_t specid;

    /* Skip the speculation reset on DD1 hardware, or when the DD1
     * workarounds are explicitly enabled in the node configuration. */
    if (!TI_isDD1() &&
        ((GetPersonality()->Kernel_Config.NodeConfig & PERS_ENABLE_DD1_Workarounds) == 0)) {
        SPEC_SetNumberOfDomains(1);
        SPEC_SetPrivMap(
            L2C_PRIVMAP_DISABLEWRITEFNC(L2C_PRIVMAP_FUNC_NUMDOM) |
            L2C_PRIVMAP_DISABLEWRITEFNC(L2C_PRIVMAP_FUNC_PRIVMAP));
        ppc_msync();

        /* Invalidate every speculation ID and clear its conflict state */
        for (specid = 0; specid < 128; specid++) {
            SPEC_TryChangeState_priv(specid, L2C_IDSTATE_PRED_SPEC | L2C_IDSTATE_INVAL);
            SPEC_SetConflict_priv(specid, 0);
        }
        ppc_msync();
    }

    App_GetEnvValue("BG_SIMPLEROLLBACK", &SIMPLE_ROLLBACK);
    L2_AtomicStore(&SpecDomainsAllocated, 0);
    domainsConfigured = 0;

    // Reset the L2 scrub rate
    scrub_rate = 64;  // default scrub interval (not consulted in this excerpt)
    for (slice = 0; slice < L2_DCR_num; slice++) {
        // Read-modify-write the scrub interval field of each slice's REFCTRL register
        uint64_t l2_dcr_refctrl = DCRReadPriv(L2_DCR(slice, REFCTRL));
        if (default_l2_first_init)
            default_l2_scrub_rate[slice] = L2_DCR__REFCTRL__SCB_INTERVAL_get(l2_dcr_refctrl);
        L2_DCR__REFCTRL__SCB_INTERVAL_insert(l2_dcr_refctrl, default_l2_scrub_rate[slice]);
        DCRWritePriv(L2_DCR(slice, REFCTRL), l2_dcr_refctrl);
    }
    default_l2_first_init = 1;

    Speculation_ExitJailMode();
    return 0;
}
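/*
 * The SCB_INTERVAL _get/_insert calls above follow the standard BG/Q DCR
 * accessor pattern: read the whole 64-bit register, extract or replace a
 * single field, then write the register back. A self-contained sketch of
 * that field idiom (the real accessors are generated macros and update the
 * register variable in place; the shift/width parameterization here is
 * hypothetical and ignores PowerPC MSb0 bit numbering):
 */
#include <stdint.h>

static inline uint64_t dcr_field_get (uint64_t reg, unsigned shift, unsigned width)
{
    return (reg >> shift) & ((1ULL << width) - 1);        /* extract field */
}

static inline uint64_t dcr_field_insert (uint64_t reg, unsigned shift,
                                         unsigned width, uint64_t value)
{
    const uint64_t mask = ((1ULL << width) - 1) << shift;
    return (reg & ~mask) | ((value << shift) & mask);     /* replace field */
}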
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <omp.h>

/* BG/Q SPI headers providing Kernel_L2AtomicsAllocate, the L2_Atomic*
 * operations, and the GetTimeBase timebase read */
#include <spi/include/kernel/memory.h>
#include <spi/include/l2/atomic.h>
#include <hwi/include/bqc/A2_inlines.h>

int main(int argc, char * argv[])
{
    const int n = 1024;
    int count = (argc > 1) ? atoi(argv[1]) : 1000000;

    /* this "activates" the L2 atomic data structures */
    uint64_t * l2_counters = NULL;
    int rc = posix_memalign((void**)&l2_counters, 2*1024*1024, n * sizeof(uint64_t));
    assert(rc == 0 && l2_counters != NULL);
    uint64_t rc64 = Kernel_L2AtomicsAllocate(l2_counters, n * sizeof(uint64_t));
    assert(rc64 == 0);

    for (int i = 0; i < n; i++) {
        L2_AtomicStore(&(l2_counters[i]), 0);
    }

    #pragma omp parallel shared(l2_counters)
    {
        int me = omp_get_thread_num();
        int jmax = n / omp_get_num_threads();

        /* For stride j, thread 'me' increments l2_counters[j*me]: at j=0 all
         * threads contend on a single counter; larger j spreads them apart. */
        for (int j = 0; j < jmax; j++) {
            #pragma omp barrier
            uint64_t t0 = GetTimeBase();
            for (int i = 0; i < count; i++) {
                L2_AtomicLoadIncrement(&(l2_counters[j*me]));
            }
            #pragma omp barrier
            uint64_t t1 = GetTimeBase();
            printf("threads = %d, stride = %d, ticks = %llu \n",
                   omp_get_num_threads(), j, (unsigned long long)(t1-t0));
            fflush(stdout);
        }
    }

    for (int i = 0; i < n; i++) {
        uint64_t rval = L2_AtomicLoad(&(l2_counters[i]));
        printf("l2_counter[%d]=%llu\n", i, (unsigned long long)rval);
    }

    return 0;
}
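/*
 * Portability note: off BG/Q hardware the benchmark above can be
 * smoke-tested by mapping the L2 primitives onto compiler builtins. A
 * minimal sketch, assuming plain cache-coherent memory stands in for the
 * L2 atomic region (so the allocate call becomes a no-op) and clock()
 * stands in for the PowerPC timebase; absolute timings will not be
 * comparable to real L2 atomics. These definitions would replace the
 * BG/Q includes above main().
 */
#include <time.h>

#define Kernel_L2AtomicsAllocate(ptr, size) ((uint64_t)0)

static inline void L2_AtomicStore (volatile uint64_t * p, uint64_t v)
{
    __atomic_store_n(p, v, __ATOMIC_RELEASE);
}

static inline uint64_t L2_AtomicLoad (volatile uint64_t * p)
{
    return __atomic_load_n(p, __ATOMIC_ACQUIRE);
}

static inline uint64_t L2_AtomicLoadIncrement (volatile uint64_t * p)
{
    return __atomic_fetch_add(p, 1, __ATOMIC_RELAXED);  /* fetch-and-add */
}

static inline uint64_t GetTimeBase (void)
{
    return (uint64_t)clock();  /* crude stand-in for the A2 timebase register */
}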
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <pthread.h>
#include <unistd.h>

#include <spi/include/kernel/memory.h>  /* Kernel_L2AtomicsAllocate */
#include <spi/include/l2/atomic.h>      /* L2_AtomicStore, L2_AtomicLoad */

/* File-scope state referenced below; the excerpt did not include these
 * definitions, so the BGQ_Atomic64_t layout is an assumption (a single
 * 64-bit word, matching the .atom accesses). in64()/out64_sync() are the
 * BG/Q ordinary 64-bit load/store helpers from the system headers. */
typedef struct { volatile uint64_t atom; } BGQ_Atomic64_t;

static int num_threads;
static int debug = 0;
static pthread_t * pool;
static BGQ_Atomic64_t counter, slowcounter;

void * fight (void * input);
void * slowfight (void * input);

int main(int argc, char * argv[])
{
    num_threads = (argc > 1) ? atoi(argv[1]) : 1;
    printf("L2 counter test using %d threads \n", num_threads);
    //printf("sizeof(BGQ_Atomic64_t) = %zu \n", sizeof(BGQ_Atomic64_t) );

    /* this "activates" the L2 atomic data structures */
    Kernel_L2AtomicsAllocate(&counter, sizeof(BGQ_Atomic64_t));
    L2_AtomicStore(&(counter.atom), 0);
    out64_sync(&(slowcounter.atom), 0);  /* zero the slow counter with an ordinary store */

    pool = (pthread_t *) malloc(num_threads * sizeof(pthread_t));
    assert(pool != NULL);

    /**************************************************/
    for (int i = 0; i < num_threads; i++) {
        int rc = pthread_create(&(pool[i]), NULL, &fight, NULL);
        if (rc != 0) { printf("pthread error \n"); fflush(stdout); sleep(1); }
        assert(rc == 0);
    }
    if (debug) { printf("threads created \n"); fflush(stdout); }

    for (int i = 0; i < num_threads; i++) {
        void * junk;
        int rc = pthread_join(pool[i], &junk);
        if (rc != 0) { printf("pthread error \n"); fflush(stdout); sleep(1); }
        assert(rc == 0);
    }
    if (debug) { printf("threads joined \n"); fflush(stdout); }

    uint64_t rval = L2_AtomicLoad(&(counter.atom));
    printf("final value of counter is %llu \n", (unsigned long long)rval);

    /**************************************************/
    for (int i = 0; i < num_threads; i++) {
        int rc = pthread_create(&(pool[i]), NULL, &slowfight, NULL);
        if (rc != 0) { printf("pthread error \n"); fflush(stdout); sleep(1); }
        assert(rc == 0);
    }
    printf("threads created \n"); fflush(stdout);

    for (int i = 0; i < num_threads; i++) {
        void * junk;
        int rc = pthread_join(pool[i], &junk);
        if (rc != 0) { printf("pthread error \n"); fflush(stdout); sleep(1); }
        assert(rc == 0);
    }
    printf("threads joined \n"); fflush(stdout);

    rval = in64(&(slowcounter.atom));
    printf("final value of slowcounter is %llu \n", (unsigned long long)rval);

    /**************************************************/
    free(pool);
    return 0;
}
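/*
 * The fight/slowfight thread bodies are not part of this excerpt; what
 * follows is a hedged sketch of what they plausibly do, given how main()
 * uses them: every thread hammers a shared counter, 'fight' through the
 * L2 fetch-and-increment and 'slowfight' through ordinary loads and
 * stores on 'slowcounter' (which is why main() reads it back with
 * in64()). The iteration count is a guess.
 */
void * fight (void * input)
{
    for (int i = 0; i < 1000000; i++)
        L2_AtomicLoadIncrement(&(counter.atom));    /* one L2 atomic op per increment */
    return NULL;
}

void * slowfight (void * input)
{
    for (int i = 0; i < 1000000; i++)
        out64_sync(&(slowcounter.atom),
                   in64(&(slowcounter.atom)) + 1);  /* racy read-modify-write; lost
                                                     * updates leave the final count
                                                     * below num_threads * iterations */
    return NULL;
}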
/* internal function */
void * progress_fn (void *arg)
{
	struct fi_bgq_progress * progress = (struct fi_bgq_progress *)arg;

	struct fi_bgq_ep ** tag_ep = progress->tag_ep;
	struct fi_bgq_ep ** msg_ep = progress->msg_ep;
	struct fi_bgq_ep ** all_ep = progress->all_ep;

	struct l2atomic_fifo_consumer * consumer = &progress->consumer;
	struct l2atomic_fifo_producer * producer = &progress->producer;
	uint64_t value_rsh3b = 0;

	const unsigned tag_loop = 16;
	const unsigned msg_loop = 4;
	unsigned m, j, i;

	/* first, enable the progress thread control fifo by setting the
	 * HEAD and TAIL to zero and setting the BOUNDS to FIFO_SIZE-1 */
	l2atomic_fifo_enable(consumer, producer);

	progress->active = 1;
	fi_bgq_msync(FI_BGQ_MSYNC_TYPE_WO);

	while (progress->enabled) {

		/* advance the performance-critical code path for each endpoint multiple times */
		const unsigned tag_ep_count = progress->tag_ep_count;
		const unsigned msg_ep_count = progress->msg_ep_count;

		for (m = 0; m < msg_loop; ++m) {
			for (j = 0; j < tag_loop; ++j) {
				for (i = 0; i < tag_ep_count; ++i) {
					poll_mfifo(tag_ep[i], 0, 0, 0);
					poll_rfifo(tag_ep[i], 0);
				}
			}
			for (i = 0; i < msg_ep_count; ++i) {
				poll_noinline(msg_ep[i], 1, 0);
			}
		}

		update_progress_thread(progress);
	}

	/*
	 * This progress thread has been disabled. Before setting the thread to inactive:
	 * 1. disable the progress thread control fifo by setting the BOUNDS to zero
	 * 2. drain the progress thread control fifo of endpoints
	 * 3. attempt to transfer any endpoints managed by this progress thread to another progress thread
	 * 4. if there are no active progress threads, disable and deactivate the remaining endpoints
	 */
	l2atomic_fifo_disable(consumer, producer);

	while (0 == l2atomic_fifo_drain(consumer, producer, &value_rsh3b)) {
		progress->all_ep[(progress->all_ep_count)++] =
			(struct fi_bgq_ep *)(value_rsh3b << 3);
	}

	struct fi_bgq_domain *bgq_domain = progress->bgq_domain;
	const unsigned max_threads = bgq_domain->progress.max_threads;

	for (i = 0; i < progress->all_ep_count; ++i) {
		value_rsh3b = ((uint64_t)(all_ep[i])) >> 3;
		unsigned p;
		for (p = 0; p < max_threads; ++p) {
			if (0 == l2atomic_fifo_produce(&bgq_domain->progress.thread[p].producer, value_rsh3b)) {
				all_ep[i] = NULL;
				break;
			}
		}
		if (all_ep[i] != NULL) {
			/* No active progress threads; disable async progress on this endpoint */
			L2_AtomicStore(&all_ep[i]->async.enabled, 0);
			L2_AtomicStore(&all_ep[i]->async.active, 0);
			all_ep[i] = NULL;	/* TODO - is this an error or something? */
		}
	}

	/* deactivate this progress thread and exit */
	progress->active = 0;
	fi_bgq_msync(FI_BGQ_MSYNC_TYPE_WO);

	return NULL;
}
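/*
 * For context, a minimal sketch of the shutdown handshake from the
 * controlling thread's side, assuming the enabled/active flags used above:
 * clear 'enabled', publish the store, then wait for progress_fn() to
 * finish its teardown and clear 'active'. The function name is
 * hypothetical, and a real implementation would need an appropriate read
 * barrier or volatile access in the wait loop.
 */
static inline void
sketch_stop_progress_thread (struct fi_bgq_progress * progress)
{
	progress->enabled = 0;			/* ask progress_fn() to exit */
	fi_bgq_msync(FI_BGQ_MSYNC_TYPE_WO);	/* make the store visible */

	while (progress->active);		/* spin until teardown completes */
}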