Beispiel #1
0
  void RWMutex::pause()
   {
     // Spin-wait back-off hint: briefly stall this hardware thread so a
     // competing thread (e.g. the lock holder) can make progress, without
     // giving up the OS time slice.
     // Cycles to stall on MIC (Xeon Phi); the non-MIC path uses pause hints.
     const unsigned int DELAY_CYCLES = 64;

#if !defined(__MIC__)
     (void) DELAY_CYCLES;  // only used on MIC; silence unused-variable warning
     _mm_pause();
     _mm_pause();
#else
     _mm_delay_32(DELAY_CYCLES);
#endif
   }
Beispiel #2
0
COMMON_SYSDEP void __cilkrts_short_pause(void)
{
    // Emit a brief CPU pause hint for use inside spin-wait loops: it lowers
    // power draw and frees pipeline resources for a sibling hyperthread
    // without yielding the OS scheduling quantum.
#if __ICC >= 1110
#   if __MIC__ || __MIC2__
    _mm_delay_32(16); // stall for 16 cycles
#   else
    _mm_pause();
#   endif
#elif defined __i386__ || defined __x86_64
    // Non-ICC x86 compilers: emit the PAUSE instruction directly.
    __asm__("pause");
#else
    // Unknown platform/compiler: no pause available; warn at build time.
#   warning __cilkrts_short_pause empty
#endif
}
Beispiel #3
0
  void SharedLazyTessellationCache::waitForUsersLessEqual(ThreadWorkState *const t_state,
							  const unsigned int users)
   {
     /* Busy-wait until the per-thread user counter has dropped to 'users'
        or below, backing off between polls to reduce contention. */
     while (t_state->counter > users)
       {
#if defined(__MIC__)
	 /* Xeon Phi: a fixed cycle delay is the preferred back-off. */
	 _mm_delay_32(128);
#else
	 /* Issue four pause hints per poll to throttle the spin loop. */
	 for (int p = 0; p < 4; p++)
	   _mm_pause();
#endif
       }
   }
Beispiel #4
0
COMMON_SYSDEP void __cilkrts_yield(void)
{
    // Give up the remainder of the current scheduling quantum (or, on MIC,
    // just stall for a while) so that other runnable threads can be scheduled.
#if __APPLE__ || __FreeBSD__ || __VXWORKS__
    // On MacOS, call sched_yield to yield quantum.  I'm not sure why we
    // don't do this on Linux also.
    sched_yield();
#elif defined(__MIC__)
    // On MIC, pthread_yield() really trashes things.  Arch's measurements
    // showed that calling _mm_delay_32() (or doing nothing) was a better
    // option.  Delaying 1024 clock cycles is a reasonable compromise between
    // giving up the processor and latency starting up when work becomes
    // available
    _mm_delay_32(1024);
#elif defined(ANDROID)
    // On Android, call sched_yield to yield quantum.  I'm not sure why we
    // don't do this on Linux also.
    sched_yield();
#else
    // On Linux, call pthread_yield (which in turn will call sched_yield)
    // to yield quantum.
    pthread_yield();
#endif
}
Beispiel #5
0
int
main (int argc, char *argv[])
{
  /* GASPI one-sided-write ping-pong latency benchmark between ranks 0 and 1.
     For each message size (2 B up to 2^24 B, doubling), the two ranks bounce
     a sequence-numbered flag byte back and forth; the median inter-iteration
     cycle delta is converted to microseconds and halved to report the
     one-way latency. */
  int i, j, t;
  gaspi_rank_t myrank;
  char *ptr0;


  //on numa architectures you have to map this process to the numa node where nic is installed
  if (start_bench (2) != 0)
    {
      printf ("Initialization failed\n");
      exit (-1);
    }

  // BENCH //
  gaspi_proc_rank (&myrank);

  /* ptr0 = base of GASPI segment 0: the first 'bytes' bytes are the local
     send buffer, and the peer's writes land in the following 'bytes' bytes. */
  if (gaspi_segment_ptr (0, (void **) &ptr0) != GASPI_SUCCESS)
    {
      printf ("gaspi_segment_ptr failed !\n");
      exit (-1);
    }

  /* CPU frequency is needed to convert raw cycle stamps to microseconds. */
  gaspi_float cpu_freq;
  gaspi_cpu_frequency(&cpu_freq);

  /* Only the two communicating ranks take part; any others just wait at
     the barrier and finalize. */
  if (myrank < 2)
    {
      if(myrank == 0)
	{
	  printf("-----------------------------------\n");
	  printf ("%12s\t%5s\n", "Bytes", "Lat(usecs)");
	  printf("-----------------------------------\n");
	}

      int bytes = 2;
      volatile char *postBuf = (volatile char *) ptr0;

      /* Message sizes double each round: 2, 4, 8, ... for 23 rounds. */
      for (i = 1; i < 24; i++)
	{
	  /* The remote write targets offset 'bytes' with length 'bytes'
	     (see gaspi_write below), so the incoming flag is its last byte
	     at offset 2*bytes-1.  Poll that byte. */
	  volatile char *pollBuf = (volatile char *) (ptr0 + ( 2 * bytes -1 ));
	  int rcnt = 0;   /* receive counter: next expected flag value */
	  int cnt = 0;    /* send counter: flag value we write next */
	  gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK);

	  for (j = 0; j < ITERATIONS; j++)
	    {
	      /* Rank 1 skips the wait on its very first iteration so the
		 ping-pong can start (rank 0 sends the first message). */
	      if (rcnt < ITERATIONS && !(cnt < 1 && myrank == 1))
		{
		  rcnt++;
		  /* Spin until the peer's flag byte reaches the expected
		     sequence number. */
		  while (*pollBuf != (char) rcnt)
		    {
#ifdef MIC
		      _mm_delay_32(32);
#else
		      _mm_pause();
#endif
		    }
		}

	      stamp[j] = get_mcycles ();

	      /* Stamp our flag byte with the next sequence number, then push
		 the whole 'bytes'-sized message into the peer's segment. */
	      postBuf[bytes - 1] = (char) ++cnt;

	      gaspi_write (0, 0, myrank ^ 0x1,
			   0, bytes, bytes,
			   0, GASPI_BLOCK);
	      
	      gaspi_wait (0, GASPI_BLOCK);
	    }

	  /* Per-iteration cycle deltas; sorting lets us take the median,
	     which is robust against outliers. */
	  for (t = 0; t < (ITERATIONS - 1); t++)
	    delta[t] = stamp[t + 1] - stamp[t];

	  qsort (delta, (ITERATIONS - 1), sizeof *delta, mcycles_compare);

	  /* cycles -> microseconds via CPU frequency; x0.5 because one
	     delta covers a full round trip. */
	  const double div = 1.0 / cpu_freq;
	  const double ts = (double) delta[ITERATIONS / 2] * div * 0.5;

	  if(myrank == 0)
	    printf ("%12d\t%4.2f\n", bytes, ts);

	  bytes <<= 1;
	}
    }

  end_bench ();

  return 0;
}