/* Relax the CPU briefly inside a spin-wait on the reader/writer mutex.
 *
 * Fix: the delay constant was previously declared unconditionally but is
 * only consumed by the MIC branch, producing an unused-variable warning on
 * every non-MIC build. It is now scoped to the branch that uses it (and
 * made const). Emitted instructions are unchanged on both paths.
 */
void RWMutex::pause()
{
#if defined(__MIC__)
  // Xeon Phi (KNC): _mm_pause is ineffective there; use the MIC-specific
  // delay intrinsic to stall for a fixed cycle count instead.
  const unsigned int DELAY_CYCLES = 64;
  _mm_delay_32(DELAY_CYCLES);
#else
  // Plain x86: two PAUSE instructions to de-prioritize the spinning
  // hyperthread and reduce memory-order-violation penalties on exit.
  _mm_pause();
  _mm_pause();
#endif
}
/* Briefly relax the processor inside a runtime spin loop.
 *
 * Selects the cheapest available "pause" primitive per compiler/arch:
 *   - Intel compiler on MIC: a fixed 16-cycle stall (_mm_delay_32), since
 *     PAUSE is not useful on those cores;
 *   - Intel compiler elsewhere: the PAUSE intrinsic;
 *   - GCC/Clang on x86/x86-64: inline-asm PAUSE;
 *   - anything else: compiles to nothing (with a build-time warning so the
 *     port is not silently missing a pause).
 */
COMMON_SYSDEP void __cilkrts_short_pause(void)
{
#if __ICC >= 1110
# if __MIC__ || __MIC2__
    _mm_delay_32(16); // stall for 16 cycles
# else
    _mm_pause();
# endif
#elif defined __i386__ || defined __x86_64
    __asm__("pause"); // x86 PAUSE instruction via GNU inline asm
#else
# warning __cilkrts_short_pause empty
#endif
}
/* Block (by busy-waiting) until the cache's user counter has dropped to
 * at most `users`.
 *
 * NOTE(review): assumes t_state->counter is decremented concurrently by
 * other threads and is declared atomic/volatile at its definition site —
 * confirm, since this translation unit only reads it.
 */
void SharedLazyTessellationCache::waitForUsersLessEqual(ThreadWorkState *const t_state,
                                                        const unsigned int users)
{
  // !(counter <= users)  <=>  counter > users
  while (t_state->counter > users)
  {
#if defined(__MIC__)
    // Xeon Phi: PAUSE is a near no-op; use a fixed cycle-count stall.
    _mm_delay_32(128);
#else
    // x86: several PAUSEs per poll to keep the spin loop polite.
    _mm_pause();
    _mm_pause();
    _mm_pause();
    _mm_pause();
#endif
  }
}
/* Give up the remainder of this thread's scheduling quantum.
 *
 * Platform dispatch:
 *   - macOS / FreeBSD / VxWorks / Android: sched_yield();
 *   - MIC: do a fixed 1024-cycle delay instead of yielding (measured to
 *     behave better there than pthread_yield — see comment below);
 *   - otherwise (Linux): pthread_yield().
 *
 * NOTE(review): pthread_yield() is non-standard and deprecated in glibc in
 * favor of sched_yield() — worth confirming whether the Linux branch could
 * simply call sched_yield() like the others.
 */
COMMON_SYSDEP void __cilkrts_yield(void)
{
#if __APPLE__ || __FreeBSD__ || __VXWORKS__
    // On MacOS, call sched_yield to yield quantum. I'm not sure why we
    // don't do this on Linux also.
    sched_yield();
#elif defined(__MIC__)
    // On MIC, pthread_yield() really trashes things. Arch's measurements
    // showed that calling _mm_delay_32() (or doing nothing) was a better
    // option. Delaying 1024 clock cycles is a reasonable compromise between
    // giving up the processor and latency starting up when work becomes
    // available
    _mm_delay_32(1024);
#elif defined(ANDROID)
    // On Android, call sched_yield to yield quantum. I'm not sure why we
    // don't do this on Linux also.
    sched_yield();
#else
    // On Linux, call pthread_yield (which in turn will call sched_yield)
    // to yield quantum.
    pthread_yield();
#endif
}
/* GASPI ping-pong latency benchmark.
 *
 * Ranks 0 and 1 bounce a message back and forth for ITERATIONS rounds at
 * each message size (2 B .. 2^24 B, doubling), timestamping every round and
 * reporting the median half-round-trip time in microseconds.
 */
int main (int argc, char *argv[])
{
  int i, j, t;
  gaspi_rank_t myrank;
  char *ptr0;

  //on numa architectures you have to map this process to the numa node where nic is installed

  if (start_bench (2) != 0)
    {
      printf ("Initialization failed\n");
      exit (-1);
    }

  // BENCH //
  gaspi_proc_rank (&myrank);

  // Base pointer of segment 0; the local send buffer lives at offset 0 and
  // the peer writes its reply into [bytes, 2*bytes) of the same segment.
  if (gaspi_segment_ptr (0, (void **) &ptr0) != GASPI_SUCCESS)
    {
      printf ("gaspi_segment_ptr failed !\n");
      exit (-1);
    }

  // CPU frequency in cycles/usec, used to convert cycle deltas to usecs.
  gaspi_float cpu_freq;
  gaspi_cpu_frequency(&cpu_freq);

  // Only ranks 0 and 1 participate in the ping-pong; others idle past this.
  if (myrank < 2)
    {
      if(myrank == 0)
        {
          printf("-----------------------------------\n");
          printf ("%12s\t%5s\n", "Bytes", "Lat(usecs)");
          printf("-----------------------------------\n");
        }

      int bytes = 2;
      // Outgoing message buffer (segment offset 0); volatile so the
      // counter stores are not optimized away/reordered by the compiler.
      volatile char *postBuf = (volatile char *) ptr0;

      for (i = 1; i < 24; i++)
        {
          // Poll the LAST byte of the incoming region [bytes, 2*bytes):
          // the peer writes `bytes` bytes at remote offset `bytes`, so
          // seeing the final byte change means the whole message landed.
          volatile char *pollBuf = (volatile char *) (ptr0 + ( 2 * bytes -1 ));
          int rcnt = 0; // rounds received so far
          int cnt = 0;  // rounds sent so far (value stamped into the message)

          gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK);

          for (j = 0; j < ITERATIONS; j++)
            {
              // Rank 1 skips the receive on the very first round (cnt < 1)
              // so that rank 0 initiates the ping-pong; afterwards both
              // sides wait for the peer's message before replying.
              if (rcnt < ITERATIONS && !(cnt < 1 && myrank == 1))
                {
                  rcnt++;
                  // Spin until the expected round counter shows up in the
                  // last byte of the incoming message.
                  while (*pollBuf != (char) rcnt)
                    {
#ifdef MIC
                      _mm_delay_32(32);
#else
                      _mm_pause();
#endif
                    }
                }

              stamp[j] = get_mcycles ();

              // Stamp this round's counter into the last byte of our
              // outgoing buffer, then push it to the peer.
              postBuf[bytes - 1] = (char) ++cnt;

              // One-sided write of `bytes` bytes from local offset 0 to the
              // partner rank (myrank XOR 1) at remote offset `bytes`
              // (presumably: seg 0 -> seg 0 on queue 0 — per GASPI
              // gaspi_write parameter order; confirm against the spec).
              gaspi_write (0, 0, myrank ^ 0x1, 0, bytes, bytes, 0, GASPI_BLOCK);
              gaspi_wait (0, GASPI_BLOCK);
            }

          // Per-round cycle deltas; the median (after sorting) is robust
          // against outliers. Halved below because one delta covers a full
          // round trip.
          for (t = 0; t < (ITERATIONS - 1); t++)
            delta[t] = stamp[t + 1] - stamp[t];

          qsort (delta, (ITERATIONS - 1), sizeof *delta, mcycles_compare);

          const double div = 1.0 / cpu_freq;
          const double ts = (double) delta[ITERATIONS / 2] * div * 0.5;

          if(myrank == 0)
            printf ("%12d\t%4.2f\n", bytes, ts);

          bytes <<= 1;
        }
    }

  end_bench ();

  return 0;
}