/*
 * Sample all per-CPU performance counters, compute deltas since the previous
 * sample, aggregate them into a `struct metrics`, add memory-controller
 * (GBOX) read/write byte counts, and emit the result via sample().
 *
 * ctrs     - array of per-CPU counter snapshots, updated by another thread.
 * duration - length of the sampling interval (same time unit as the clock
 *            event deltas; presumably TSC/clock ticks — TODO confirm).
 *
 * NOTE(review): the loop skips cpu 0 and the top three CPU numbers
 * (cpu = 1 .. ncpus-4); combined with the `% 4` grouping below this looks
 * like the Xeon Phi convention of 4 hardware threads per core with the last
 * core reserved for the OS — TODO confirm against the topology setup code.
 */
void printcounters(struct counter *ctrs, uint64_t duration) {
    struct metrics s = {0};
    s.timestamp = _rdtsc();
    s.duration = duration;
    // We skip the last core
    int corethreads = 0;  // active-thread count within the current core
    for (int cpu = 1; cpu < gbl.ncpus-3; ++cpu) {
        double delta[NEVENTS];
        // volatile because another thread is changing it.
        volatile struct counter *p = &ctrs[cpu];
        for (int i = 0; i < NEVENTS; ++i) {
            // 64-byte vector load grabs one event's 8-word record atomically
            // (w.r.t. the writer's matching 64-byte store — TODO confirm the
            // writer uses a single vector store too).
            union { __m512d c; uint64_t values[8]; } t;
            t.c = _mm512_load_pd((void *)&p->counts[i][0]);
            // Scale the raw reading against the previous snapshot to get the
            // increment for this interval.
            delta[i] = perf_scale_delta(t.values, lastctr[cpu].counts[i]);
            // Save the new snapshot with a non-temporal store (no cache fill).
            _mm512_storenrngo_pd((void *)&lastctr[cpu].counts[i][0], t.c);
            // Clamp: a torn/reset counter can yield a negative delta.
            if (delta[i] < 0)
                delta[i] = 0;
            sevents[i] += delta[i];  // global running totals per event
        }
        // Thread counts as "active" if it was clocked for more than half the
        // interval (2*clocks > duration  <=>  clocks/duration > 0.5).
        if (2*delta[clocks1] > duration) {
            s.nthreads += 1;
            corethreads += 1;
        }
        if ((cpu % 4) == 0) // Last thread on this core
        {
            // A core is active if any of its threads was active.
            if (corethreads)
                s.ncores += 1;
            corethreads = 0;
        }
        s.vpu_ea += delta[vpu_ea];
        s.instrs += delta[instrs];
        s.vinstrs += delta[vpu_ie];
    }
    // Memory bandwidth: sum read/write counters over all GBOXes (2 channels
    // each), then convert the interval's delta to bytes (64 B per transaction
    // — cache-line sized, presumably; TODO confirm against PMU docs).
    uint64_t nreads = 0, nwrites = 0;
    for (int i = 0; i < NGBOXES; ++i)
        for (int j = 0; j < 2; ++j) {
            nreads += pmu_rdctr(i, j, 0);
            nwrites += pmu_rdctr(i, j, 1);
        }
    s.rbytes = (nreads - prevnreads) * 64;
    s.wbytes = (nwrites - prevnwrites)* 64;
    prevnreads = nreads;
    prevnwrites = nwrites;
    sample(&s);
}
/*
 * Copy `size` bytes from sbuf to rbuf in 64-byte blocks using OpenMP-parallel
 * non-temporal ("no read, no globally ordered") stores, which bypass the cache.
 *
 * rbuf, sbuf - destination / source addresses as integers; both are assumed
 *              64-byte aligned (required by _mm512_load_pd — TODO confirm
 *              alignment at the callers).
 * size       - transfer length in bytes. Only full 64-byte blocks are copied;
 *              a tail of size % 64 bytes is silently dropped, so callers are
 *              presumably expected to pass multiples of 64 — TODO confirm.
 * recv_req, send_req - unused here; kept so the signature matches the other
 *              transfer_* variants.
 */
inline void transfer_omp_loop_nontemp(uintptr_t rbuf, uintptr_t sbuf, size_t size,
                                      HMPI_Request recv_req, HMPI_Request send_req)
{
    (void)recv_req;   /* silence unused-parameter warnings */
    (void)send_req;

    /* One __m512d vector covers 64 bytes (8 doubles). Offsets below are in
     * bytes because rbuf/sbuf are uintptr_t. (The old N_DOUBLES_PER_BLOCK name
     * was misleading: its value, 64, is bytes per block, not doubles.) */
    const size_t BYTES_PER_BLOCK = 64;
    const size_t total = size / BYTES_PER_BLOCK;

    /* size_t loop index: the previous `int i` compared signed against the
     * unsigned bound and would truncate for very large transfers. Unsigned
     * loop variables are valid in OpenMP worksharing loops since OpenMP 3.0. */
#pragma omp parallel for
    for (size_t i = 0; i < total; i++) {
        /* Explicit casts: passing a uintptr_t where the intrinsics expect a
         * pointer is a constraint violation without them. */
        __m512d v = _mm512_load_pd((const void *)(sbuf + BYTES_PER_BLOCK * i));
        _mm512_storenrngo_pd((void *)(rbuf + BYTES_PER_BLOCK * i), v);
    }
}
/*
 * Refresh the cached counter block for one CPU: read every configured perf
 * event from the kernel and publish the new values into counters[cpu] with
 * non-temporal stores.
 */
static void readcounters(int cpu)
{
    struct counter *dst = &counters[cpu];

    for (int ev = 0; ev < NEVENTS; ++ev) {
        struct evinfo *info = &events[cpu][ev];

        /* Read the raw counter words into a vector-sized buffer; the union
         * lets the same 64 bytes feed the streaming store below. */
        union { __m512d vec; uint64_t raw[8]; } sample;
        readctr((perf_event_mmap_page *)info->buf, info->fd, sample.raw);

        /* Only publish when the first word is nonzero — presumably a
         * validity check; the previous value is kept otherwise. */
        if (!sample.raw[0])
            continue;

        /* Non-temporal store: update the shared slot without a cache fill. */
        _mm512_storenrngo_pd(&dst->counts[ev][0], sample.vec);
    }
}