Exemple #1
0
int main(int argc, char *argv[])
{
  unsigned long i;
  gaspi_pointer_t _vptr;
  gaspi_rank_t num_ranks, myrank;
  gaspi_number_t qmax ;
  gaspi_number_t queueSize;
  gaspi_rank_t left_rank, right_rank;
  const unsigned long N = (1 << 13);

  TSUITE_INIT(argc, argv);

  ASSERT (gaspi_proc_init(GASPI_BLOCK));
  ASSERT (gaspi_proc_num(&num_ranks));
  ASSERT (gaspi_proc_rank(&myrank));

  ASSERT(gaspi_segment_create(0,
			      _2MB,
			      GASPI_GROUP_ALL,
			      GASPI_BLOCK,
			      GASPI_MEM_INITIALIZED));

  ASSERT(gaspi_segment_ptr(0, &_vptr));

  ASSERT (gaspi_queue_size_max(&qmax));

  ASSERT (gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK));

  left_rank = (myrank + num_ranks - 1 ) % num_ranks;
  right_rank = (myrank + num_ranks + 1) % num_ranks;
  
  ASSERT( gaspi_write(0,          //seg
		      0,          //local off
		      left_rank,  //rank
		      0,          //seg rem
		      0,          //remote off
		      1,          //size 32KB
		      0,          //queue
		      GASPI_BLOCK));

  ASSERT( gaspi_write(0,          //seg
		      0,          //local off
		      right_rank,  //rank
		      0,          //seg rem
		      0,          //remote off
		      1,          //size 32KB
		      0,          //queue
		      GASPI_BLOCK));

  ASSERT (gaspi_wait(0, GASPI_BLOCK));
  ASSERT (gaspi_barrier(GASPI_GROUP_ALL, 5000));  
  ASSERT (gaspi_proc_term(GASPI_BLOCK));
   
  printf("Rank %d: Finish\n", myrank);
  fflush(stdout);

  return EXIT_SUCCESS;
}
void work(int tid)
{
  gaspi_rank_t rankSend;
  gaspi_offset_t localOff= 0;
  gaspi_offset_t remOff = 0;
  gaspi_number_t queueSize, qmax;
  gaspi_size_t commSize ;

  ASSERT (gaspi_queue_size_max(&qmax));

  for(commSize= 1; commSize < _500MB; commSize*=2 )
    for(rankSend = 0; rankSend < numranks; rankSend++)
      {
	gaspi_printf("thread %d rank to send: %d - %lu bytes\n", tid, rankSend, commSize);
	
	gaspi_queue_size(1, &queueSize);
	if (queueSize > qmax - 100)
  	  ASSERT (gaspi_wait(1, GASPI_BLOCK));
	
	ASSERT (gaspi_write(0, localOff, rankSend, 0,  remOff,  commSize, 1, GASPI_BLOCK));
	
      }
  
  ASSERT (gaspi_wait(1, GASPI_BLOCK));
  
  gaspi_threads_sync();
}
Exemple #3
0
void work(int tid)
{
  gaspi_rank_t rankSend;
  gaspi_offset_t localOff = 81478066;
  gaspi_offset_t remOff   = 81478246;
  gaspi_offset_t size = 1800;
  gaspi_number_t queueSize, qmax;

  ASSERT (gaspi_queue_size_max(&qmax));

  for(rankSend = 0; rankSend < numranks; rankSend++)
    {
      gaspi_printf("thread %d rank to send: %d\n", tid, rankSend);

      gaspi_queue_size(1, &queueSize);
      if (queueSize > qmax - 24)
  	  ASSERT (gaspi_wait(1, GASPI_BLOCK));

      ASSERT (gaspi_write(0, localOff, rankSend, 0,  remOff,  size, 1, GASPI_BLOCK));
      
    }
  ASSERT (gaspi_wait(1, GASPI_BLOCK));

  gaspi_threads_sync();
}
Exemple #4
0
int main(int argc, char *argv[])
{
  gaspi_rank_t numranks, myrank;
  gaspi_rank_t rankSend;
  gaspi_size_t segSize;

  TSUITE_INIT(argc, argv);

  ASSERT (gaspi_proc_init(GASPI_BLOCK));

  ASSERT (gaspi_proc_num(&numranks));
  ASSERT (gaspi_proc_rank(&myrank));

  ASSERT (gaspi_segment_create(0, _2GB, GASPI_GROUP_ALL, GASPI_BLOCK, GASPI_MEM_INITIALIZED));

  ASSERT( gaspi_segment_size(0, myrank, &segSize));

  gaspi_printf("seg size %lu MB \n", segSize/1024/1024);

  //  ASSERT (gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK));

  gaspi_offset_t localOff= 814780664;
  gaspi_offset_t remOff = 81478246;
  gaspi_offset_t size = 1800;
  gaspi_number_t queueSize, qmax;

  ASSERT (gaspi_queue_size_max(&qmax));

  for(rankSend = 0; rankSend < numranks; rankSend++)
    {
      gaspi_printf("rank to send: %d\n", rankSend);

      gaspi_queue_size(1, &queueSize);
      if (queueSize > qmax - 24)
  	  ASSERT (gaspi_wait(1, GASPI_BLOCK));

      ASSERT (gaspi_write(0, localOff, rankSend, 0,  remOff,  size, 1, GASPI_BLOCK));
      
    }
  ASSERT (gaspi_wait(1, GASPI_BLOCK));

  ASSERT (gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK));
  
  ASSERT (gaspi_proc_term(GASPI_BLOCK));

  return EXIT_SUCCESS;
}
Exemple #5
0
int main(int argc, char *argv[])
{
  gaspi_rank_t numranks, myrank;
  gaspi_rank_t rankSend;
  gaspi_size_t segSize;
  const  gaspi_offset_t localOff= 0;
  const gaspi_offset_t remOff = 0;
  gaspi_number_t queueSize, qmax;
  gaspi_size_t commSize ;

  TSUITE_INIT(argc, argv);

  ASSERT (gaspi_proc_init(GASPI_BLOCK));

  ASSERT (gaspi_proc_num(&numranks));
  ASSERT (gaspi_proc_rank(&myrank));

  ASSERT (gaspi_segment_create(0, _8MB, GASPI_GROUP_ALL, GASPI_BLOCK, GASPI_MEM_INITIALIZED));

  ASSERT (gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK));

  ASSERT (gaspi_queue_size_max(&qmax));

  for(commSize= 1; commSize <= _8MB; commSize*=2 )
    {
      for(rankSend = 0; rankSend < numranks; rankSend++)
	{
	  gaspi_printf("rank to send: %d - %lu bytes\n", rankSend, commSize);

	  gaspi_queue_size(1, &queueSize);
	  if (queueSize > qmax - 24)
	    ASSERT (gaspi_wait(1, GASPI_BLOCK));

	  ASSERT (gaspi_write(0, localOff, rankSend, 0,  remOff,  commSize, 1, GASPI_BLOCK));
	}
    }

  ASSERT (gaspi_wait(1, GASPI_BLOCK));

  ASSERT (gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK));

  ASSERT (gaspi_proc_term(GASPI_BLOCK));

  return EXIT_SUCCESS;
}
int main (int argc, char *argv[])
{
  gaspi_configuration_t  config = { 0 };
  //argc, argv, "", (1UL << 30)

  gaspi_proc_init (config, GASPI_BLOCK); // 1 GiB DMA enabled memory per node

  gaspi_rank_t iProc, nProc;
  gaspi_proc_rank (&iProc);
  gaspi_proc_num (&nProc);

  void* temp_ptr;
  gaspi_segment_ptr(GPI_SEGMENT, &temp_ptr);

  int *mem = (int *) temp_ptr;  // begin of DMA enabled memory
  int *src = mem;               // offset 0
  int *dst = mem + nProc;       // offset nProc * sizeof(int)

  for (gaspi_rank_t p = 0; p < nProc; ++p)
    {
      src[p] = iProc * nProc + p;

      const unsigned long locOff = p * sizeof (int);
      const unsigned long remOff = (nProc + iProc) * sizeof (int);

      gaspi_write(GPI_SEGMENT, locOff, p, GPI_SEGMENT, remOff, sizeof (int), 0, GASPI_BLOCK);
    }

  gaspi_wait (0, GASPI_BLOCK);
  gaspi_barrier (GASPI_GROUP_ALL, GASPI_BLOCK);

  dump (src, iProc, nProc, "src");
  dump (dst, iProc, nProc, "dst");

  gaspi_proc_term (GASPI_BLOCK);

  return EXIT_SUCCESS;
}
Exemple #7
0
void
send_global_msg_to_check_state(gaspi_state_vector_t health_vec, gaspi_rank_t *avoid_list)
{
  int i, j;
  int num_simultaneous_fail_checks = 1;
  gaspi_timeout_t HEALTH_CHECK_TIMEOUT_TIME = GASPI_BLOCK;	

  gaspi_printf("Checking global health state\n");

  /* in order to check multiple simultaneous fail, health check has to be performed multiple times */
  for(j = 0 ; j < num_simultaneous_fail_checks; ++j )
    {	
      for(i = 0; i < numprocs; ++i)
	{
	  if(avoid_list[i] != 1)
	    {
	      ASSERT(gaspi_write(gm_seg_health_chk_array_id, myrank, i,
				 gm_seg_health_chk_array_id, myrank, sizeof(int),
				 queue_id, HEALTH_CHECK_TIMEOUT_TIME));
	    }
	}

      gaspi_wait(queue_id, HEALTH_CHECK_TIMEOUT_TIME);
      ASSERT(gaspi_state_vec_get(health_vec));

      /* adding the dead processes to avoid_list */
      /* so that message for health test is not sent to them next time. */
      for(i = 0; i < numprocs; ++i)
	{
	  if(health_vec[i] == 1)
	    {
	      avoid_list[i] = 1;
	    }
	}
    }
  print_health_vec(health_vec);
}
Exemple #8
0
int main(int argc, char *argv[])
{
  TSUITE_INIT(argc, argv);

  ASSERT (gaspi_proc_init(GASPI_BLOCK));

  const unsigned long N = (1 << 13);
  gaspi_rank_t P, myrank;

  ASSERT (gaspi_proc_num(&P));
  ASSERT (gaspi_proc_rank(&myrank));

  gaspi_printf("P = %d N = %lu\n", P, N);
  
  gaspi_printf("Seg size: %lu MB\n",  MAX (_4GB, 2 * ((N/P) * N * 2 * sizeof (double)))/1024/1024);
  
  if(gaspi_segment_create(0, _1GB,
			  GASPI_GROUP_ALL, GASPI_BLOCK, GASPI_MEM_INITIALIZED) != GASPI_SUCCESS){
    gaspi_printf("Failed to create segment\n");
    return -1;
  }


  gaspi_pointer_t _vptr;
  if(gaspi_segment_ptr(0, &_vptr) != GASPI_SUCCESS)
    printf("gaspi_segment_ptr failed\n");

  gaspi_number_t qmax ;
  ASSERT (gaspi_queue_size_max(&qmax));

  gaspi_printf("Queue max: %lu\n", qmax);
 
  ASSERT (gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK));

  int i;
  gaspi_number_t queueSize;
  int rankSend = (myrank + 1) % P;
  gaspi_printf("rank to: %d\n", rankSend);

  for (i = 0; i < 2 * N; i ++)
    {
      gaspi_queue_size(1, &queueSize);
      if (queueSize > qmax - 24)
	{
	  gaspi_return_t ret;
	  do
	    {
	      ret = gaspi_wait(1, GASPI_TEST);
	      assert (ret != GASPI_ERROR);
	    }
	  while(ret != GASPI_SUCCESS);

	  gaspi_queue_size(1, &queueSize);
	  assert(queueSize == 0);
	}
      ASSERT (gaspi_write(0, 4, rankSend, 0, 6, 32768, 1, GASPI_TEST));
    }
  
  ASSERT (gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK));
  
  ASSERT (gaspi_proc_term(GASPI_BLOCK));


  return EXIT_SUCCESS;
}
Exemple #9
0
int main(int argc, char *argv[])
{
  gaspi_rank_t nprocs, myrank, i;
  int j, n;
  gaspi_rank_t *avoid_list;
  gaspi_group_t survivors;

  TSUITE_INIT(argc, argv);

  ASSERT (gaspi_proc_init(GASPI_BLOCK));
  ASSERT(gaspi_proc_num(&nprocs));
  ASSERT(gaspi_proc_rank(&myrank));

  ASSERT(gaspi_segment_create(0, 
			      _4MB,
			      GASPI_GROUP_ALL, 
			      GASPI_BLOCK, 
			      GASPI_MEM_INITIALIZED));


  avoid_list = (gaspi_rank_t *) malloc(nprocs * sizeof(gaspi_rank_t));

  assert (avoid_list != NULL);
  memset(avoid_list, 0, nprocs * sizeof(gaspi_rank_t));

  gaspi_state_vector_t vec = (gaspi_state_vector_t) malloc(nprocs);

  ASSERT(gaspi_state_vec_get(vec));

  //check that everyone is healthy
  for(i = 0; i < nprocs; i++)
    {
      assert(vec[i] == 0);
    }

  //sync
  ASSERT (gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK));

  //now last rank dies
  if(myrank == nprocs - 1)
    exit(-1);

  else
    {
      //create group of survivors 
      ASSERT(gaspi_group_create(&survivors));
      for(i = 0; i < nprocs - 1; i++)
	ASSERT(gaspi_group_add(survivors, i));
      
      ASSERT(gaspi_group_commit(survivors, GASPI_BLOCK));
      
      gaspi_printf("Done with groups\n");
      
      sleep(2);
    }
  //the others communicate
  gaspi_return_t retval;

  for(j = 0; j < 10; j++)
    {
      gaspi_printf("Iteration %d\n", j);
      for(i = 0; i < nprocs; i++)
	{
	  if( avoid_list[i] != 1 )
	    ASSERT(gaspi_write(0, 0, i,
			       0, 0, sizeof(int),
			       0,
			       GASPI_BLOCK));
	  
	  
	}
      retval = gaspi_wait(0, GASPI_BLOCK);
      
      //problem found -> recover
      if(retval != GASPI_SUCCESS)
	{
	  ASSERT(gaspi_state_vec_get(vec));
	  for(n = 0; n < nprocs; n++)
	    {
	      if(vec[n] != GASPI_STATE_HEALTHY)
		{
		  gaspi_printf("Problem with node %d detected\n", n);
		  assert(n == (nprocs - 1));
		  
		  ASSERT(recover());
		  
		  avoid_list[n] = 1;
		}
	    }
	}
    }	  

  ASSERT (gaspi_barrier(survivors, GASPI_BLOCK));

  ASSERT (gaspi_proc_term(GASPI_BLOCK));
  
  gaspi_printf("exiting\n");
  return EXIT_SUCCESS;
}
Exemple #10
0
int main(int argc, char *argv[])
{
  int i, iter;
  int ret = 0;
  
  gaspi_rank_t myrank, numranks;
  gaspi_size_t mem_size = 0UL, j;

  TSUITE_INIT(argc, argv);

  ASSERT (gaspi_proc_init(GASPI_BLOCK));
  ASSERT (gaspi_proc_rank(&myrank));
  ASSERT (gaspi_proc_num(&numranks));

  if( numranks < 2 )
    {
      return EXIT_SUCCESS;
    }
  
  mem_size = 2 * SLOT_SIZE * (numranks - 1);

  if(myrank == 0)
    {
      printf("Mem size: %lu (%.2f MB)\nProcs: %u Max Slot size %lu Iterations %d\n",
	     mem_size,
	     mem_size * 1.0f / 1024/ 1024,
	     numranks,
	     (gaspi_size_t) SLOT_SIZE,
	     MAX_ITERATIONS);
#ifdef WITH_SYNC
      printf("Using notifications only\n");
#endif      
    }
  
  ASSERT (gaspi_segment_create(0, mem_size, GASPI_GROUP_ALL, GASPI_BLOCK, GASPI_MEM_INITIALIZED));

  gaspi_pointer_t _vptr;
  ASSERT (gaspi_segment_ptr(0, &_vptr));

  float *mptr = (float *) _vptr;

  //generate random
  srand((unsigned)time(0)); 
      
  srand48((unsigned) time(0));

  gaspi_size_t cur_slot_size = SLOT_SIZE;
  for(cur_slot_size = SLOT_SIZE; cur_slot_size >= sizeof(float); cur_slot_size/=2)
    {
      if(myrank == 0)
	printf("===== Slot Size %lu ====\n", cur_slot_size);

      for(iter = 0; iter < MAX_ITERATIONS; iter++)
	{
	  if(myrank == 0)
	    printf("iteration %3d... ", iter);
      
	  ASSERT(gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK));

	  /* fill slots with randoms */
	  for(j = 0; j < (mem_size / sizeof(float) / 2); j++)
	    {
	      mptr[j]=  drand48() + (myrank*1.0);
	    }

	  ASSERT (gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK));

	  gaspi_offset_t offset_in = 0, offset_out = mem_size / 2;

	  /* rank 0 write to all others */
	  if(myrank == 0)
	    {
	      for (i = 1; i < numranks; i++)
		{
		  offset_in = (i - 1) * cur_slot_size;
#ifdef WITH_SYNC
		  ASSERT (gaspi_write_notify(0, offset_in, i,
					     0, 0, cur_slot_size,
					     0, 1,
					     0, GASPI_BLOCK));
#else
		  ASSERT (gaspi_write(0, offset_in, i,
				      0, 0, cur_slot_size,
				      0, GASPI_BLOCK));

#endif		  
		}

	      ASSERT(gaspi_wait(0, GASPI_BLOCK));     
	    }
#ifdef WITH_SYNC	  
	  else
	    {
	      gaspi_notification_id_t id;
	      gaspi_notification_t val;
	      
	      ASSERT(gaspi_notify_waitsome(0, 0, 1, &id, GASPI_BLOCK));
	      
	      ASSERT(gaspi_notify_reset(0, id, &val));
	      assert(val == 1);
	    }
#endif

#ifndef WITH_SYNC	  
	  ASSERT (gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK));
#endif
	  /* other ranks all write back to 0 */
	  if(myrank != 0)
	    {
	      offset_in = 0;
	      offset_out = (mem_size / 2) + (cur_slot_size * (myrank - 1));
#ifdef WITH_SYNC
	      ASSERT (gaspi_write_notify(0, offset_in, 0,
					 0, offset_out, cur_slot_size,
					 myrank, 1,
					 0, GASPI_BLOCK));
#else
	      ASSERT (gaspi_write(0, offset_in, i,
				  0, offset_out, cur_slot_size,
				  0, GASPI_BLOCK));
#endif
	      ASSERT(gaspi_wait(0, GASPI_BLOCK));     
	    }
#ifdef WITH_SYNC
	  else
	    {
	      gaspi_notification_id_t id;
	      gaspi_notification_t val;
	      int notification_counter = 0;
	      do
		{
		  
		  ASSERT(gaspi_notify_waitsome(0, 1, numranks - 1, &id, GASPI_BLOCK));
		  ASSERT(gaspi_notify_reset(0, id, &val));
		  assert(val == 1);
		  notification_counter++;
		}
	      while( notification_counter < numranks - 1); 
	    }
#endif
#ifndef WITH_SYNC	  	  
	  ASSERT (gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK));
#endif  
	  if(myrank == 0)
	    {
	      /* check correctness */
	      float *in = (float *) _vptr;
	      float *out = (float *) ((char *) _vptr + mem_size / 2);
	      const gaspi_size_t total_elems = (cur_slot_size * (numranks - 1) / sizeof(float));
      
	      for(j = 0; j < total_elems; j++)
		{
		  if(in[j] != out[j])
		    {
		      printf("Different values at pos %lu: %f %f (iterations %d)\n", j, in[j], out[j], iter);
		      ret = -1;
		      goto end;
		  
		    }
		}
	      printf("All fine!\n");
	    }
	}
    }
  
 end:
  ASSERT (gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK));
  ASSERT (gaspi_proc_term(GASPI_BLOCK));

  return ret; 
}
Exemple #11
0
int main(int argc, char *argv[])
{
  int k = 0;
  int ret = 0;
  unsigned long j;

  const gaspi_size_t size = 4096;

  const gaspi_size_t memSize = _4GB;

  gaspi_offset_t offset_write = 0;
  gaspi_offset_t offset_read = _2GB;
  gaspi_offset_t offset_check = 3221225472;
  gaspi_number_t qmax ;
  gaspi_number_t queueSize;

  TSUITE_INIT(argc, argv);

  ASSERT (gaspi_proc_init(GASPI_BLOCK));
  ASSERT (gaspi_queue_size_max(&qmax));
  ASSERT (gaspi_segment_create(0, memSize, GASPI_GROUP_ALL, GASPI_BLOCK, GASPI_MEM_INITIALIZED));

  gaspi_pointer_t _vptr;
  ASSERT (gaspi_segment_ptr(0, &_vptr));

  /* get memory area pointer */
  float *mptr_f = (float *) _vptr;
  char *mptr_c = (char *) _vptr;

  gaspi_rank_t myrank, highestnode;
  ASSERT (gaspi_proc_rank(&myrank));
  ASSERT (gaspi_proc_num(&highestnode));

  while(k <= RUNS)
    {
      //generate random
      srand((unsigned)time(0));
      srand48((unsigned) time(0));

      ASSERT(gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK));

      //clean
      for(j = 0; j < memSize; j++)
	mptr_c[j]= 0;

      /* fill randoms up to 1GB */
      for(j = 0; j < (GB / sizeof(float)); j++)
	{
	  mptr_f[j]=  drand48() + (myrank * 1.0);
	}

#ifdef DEBUG
      gaspi_printf("random value in pos 0 %f\n", mptr_f[0]);
#endif
      ASSERT (gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK));

      gaspi_printf("\n....Running iteration %d of %d...\n",k, RUNS);

      const unsigned long packets = (GB / size);
      for(j = 0; j < packets; j++)
	{
	  ASSERT(gaspi_queue_size(0, &queueSize));
	  if (queueSize > qmax - 24)
	    {
	      ASSERT(gaspi_wait(0, GASPI_BLOCK));
	    }

	  ASSERT (gaspi_write(0, offset_write, (myrank + 1) % highestnode,
			      0, offset_read, size,
			      0, GASPI_BLOCK));

	  offset_write += size;
	  offset_read += size;
	}

    offset_write=0;
    offset_read = _2GB;

#ifdef DEBUG
    gaspi_printf("%d bytes written!\n", packets * size);
#endif

    /* notify remote that data is written */
    ASSERT (gaspi_notify( 0, (myrank + 1) % highestnode, 0, 1, 0, GASPI_BLOCK));
    gaspi_notification_id_t recv_id;
    ASSERT(gaspi_notify_waitsome(0, 0, 1, &recv_id, GASPI_BLOCK));
    assert(recv_id == 0);
    gaspi_notification_t notification_val;
    ASSERT( gaspi_notify_reset(0, recv_id, &notification_val));

    /* notify remote that data has arrived */
    ASSERT (gaspi_notify( 0, (myrank + highestnode - 1) % highestnode, 1, 1, 0, GASPI_BLOCK));

    gaspi_notification_id_t ack_id;
    ASSERT(gaspi_notify_waitsome(0, 1, 1, &ack_id, GASPI_BLOCK));
    assert(ack_id == 1);
    ASSERT( gaspi_notify_reset(0, ack_id, &notification_val));

    /* check if data was written successfully */
    ASSERT (gaspi_read(0, offset_check, (myrank + 1) % highestnode,
		       0, offset_read, GB / 2,
		       0, GASPI_BLOCK));

    ASSERT (gaspi_read(0, offset_check + (GB / 2), (myrank + 1) % highestnode,
		       0, offset_read  + (GB / 2), GB / 2,
		       0, GASPI_BLOCK));

    ASSERT (gaspi_wait(0, GASPI_BLOCK));

#ifdef DEBUG
    gaspi_printf("Values %f %f %f \n", mptr_f[0], mptr_f[offset_read / sizeof(float)], mptr_f[offset_check / sizeof(float)]);
#endif

    j = 0;
    while(j < GB / sizeof(float) )
      {
	if(mptr_f[j] != mptr_f[offset_check / sizeof(float) + j]){
	  gaspi_printf("value incorrect %f-%f at %d \n",
		       mptr_f[j],
		       mptr_f[offset_check / sizeof(float) + j],
		       j);
	  ret = -1;
	  goto out;
	}
	j++;
      }

#ifdef DEBUG
    gaspi_printf("Check!\n");
#endif

    k++;
  }

 out:

  gaspi_printf("Waiting to finish...\n");
  ASSERT(gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK));
  ASSERT (gaspi_proc_term(GASPI_BLOCK));

  return ret;
}
Exemple #12
0
int main(int argc, char *argv[])
{

  TSUITE_INIT(argc, argv);

  ASSERT (gaspi_proc_init(GASPI_BLOCK));

  gaspi_rank_t numranks, myrank;

  ASSERT (gaspi_proc_num(&numranks));
  ASSERT (gaspi_proc_rank(&myrank));

  int rankSend = (myrank + 1) % numranks;

  ASSERT(gaspi_segment_create(0, _1MB, GASPI_GROUP_ALL, GASPI_BLOCK, GASPI_MEM_INITIALIZED));

  gaspi_size_t segSize;
  ASSERT( gaspi_segment_size(0, myrank, &segSize));

  unsigned char * pGlbMem;

  gaspi_pointer_t _vptr;
  ASSERT(gaspi_segment_ptr(0, &_vptr));

  pGlbMem = ( unsigned char *) _vptr;

  gaspi_number_t queueSize, qmax ;
  ASSERT (gaspi_queue_size_max(&qmax));

  ASSERT (gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK));

  const unsigned long localOff = 0;
  const unsigned long remOff = 0;

  /* write_notify */
  do
    {
      ASSERT(gaspi_write_notify(0, localOff, rankSend,
				0, remOff, 1,
				(gaspi_notification_id_t) myrank, 1,
				1, GASPI_BLOCK));
      gaspi_queue_size(1, &queueSize);
    }
  while(queueSize < qmax);

  EXPECT_FAIL_WITH(gaspi_write_notify(0, localOff, rankSend,
				      0, remOff, 1,
				      (gaspi_notification_id_t) myrank, 1,
				      1, GASPI_BLOCK),
		   GASPI_QUEUE_FULL);

  ASSERT (gaspi_wait(1, GASPI_BLOCK));

  /* write */
  do
    {
      ASSERT(gaspi_write(0, localOff, rankSend,
			 0, remOff, 1,
			 1, GASPI_BLOCK));

      gaspi_queue_size(1, &queueSize);
    }
  while(queueSize < qmax);

  EXPECT_FAIL_WITH(gaspi_write(0, localOff, rankSend,
			       0, remOff, 1,
			       1, GASPI_BLOCK),
		   GASPI_QUEUE_FULL);

  ASSERT (gaspi_wait(1, GASPI_BLOCK));

  ASSERT(gaspi_write(0, localOff, rankSend,
		     0, remOff, 1,
		     1, GASPI_BLOCK));

  /* write + write_notify */
  do
    {
      ASSERT(gaspi_write(0, localOff, rankSend,
			 0, remOff, 1,
			 1, GASPI_BLOCK));

      gaspi_queue_size(1, &queueSize);
    }
  while(queueSize < qmax - 1);

  EXPECT_FAIL_WITH(gaspi_write_notify(0, localOff, rankSend,
				      0, remOff, 1,
				      (gaspi_notification_id_t) myrank, 1,
				      1, GASPI_BLOCK),
		   GASPI_QUEUE_FULL);

  ASSERT (gaspi_wait(1, GASPI_BLOCK));

  ASSERT(gaspi_write_notify(0, localOff, rankSend,
			    0, remOff, 1,
			    (gaspi_notification_id_t) myrank, 1,
			    1, GASPI_BLOCK));

  /* read */
  do
    {
      ASSERT(gaspi_read(0, localOff, rankSend,
			 0, remOff, 1,
			 1, GASPI_BLOCK));

      gaspi_queue_size(1, &queueSize);
    }
  while(queueSize < qmax);

  EXPECT_FAIL_WITH(gaspi_read(0, localOff, rankSend,
			       0, remOff, 1,
			       1, GASPI_BLOCK),
		   GASPI_QUEUE_FULL);

  ASSERT (gaspi_wait(1, GASPI_BLOCK));

  ASSERT(gaspi_read(0, localOff, rankSend,
		    0, remOff, 1,
		    1, GASPI_BLOCK));

  /* write_list_notify */
  {
    const gaspi_number_t nListElems = 255;
    gaspi_number_t n;

    gaspi_segment_id_t localSegs[nListElems];
    gaspi_offset_t localOffs[nListElems];
    const gaspi_rank_t rank2send = (myrank + 1) % numranks;
    gaspi_segment_id_t remSegs[nListElems];
    gaspi_offset_t remOffs[nListElems];
    gaspi_size_t sizes[nListElems];

    const unsigned int bytes = sizeof(int);
    gaspi_offset_t initLocOff = 0;
    gaspi_offset_t initRemOff = (bytes * nListElems + 64);

    for(n = 0; n < nListElems; n++)
    {
      sizes[n] = bytes;

      localSegs[n] = 0;
      localOffs[n] = initLocOff;
      initLocOff += bytes;

      remSegs[n] = 0;
      remOffs[n] = initRemOff;
      initRemOff += bytes;
    }

    do
      {
	ASSERT( gaspi_write_list_notify( nListElems,
					 localSegs, localOffs, rank2send,
					 remSegs, remOffs, sizes,
					 0, myrank, 1,
					 0, GASPI_BLOCK));

	gaspi_queue_size(0, &queueSize);
      }
    while(queueSize < qmax);

    EXPECT_FAIL_WITH( gaspi_write_list_notify( nListElems,
					       localSegs, localOffs, rank2send,
					       remSegs, remOffs, sizes,
					       0, myrank, 1,
					       0, GASPI_BLOCK),
			  GASPI_QUEUE_FULL);
    ASSERT (gaspi_wait(0, GASPI_BLOCK));
    ASSERT( gaspi_write_list_notify( nListElems,
				     localSegs, localOffs, rank2send,
				     remSegs, remOffs, sizes,
				     0, myrank, 1,
				     0, GASPI_BLOCK));

  }

  ASSERT (gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK));

  ASSERT (gaspi_proc_term(GASPI_BLOCK));

  return EXIT_SUCCESS;
}
Exemple #13
0
int main(int argc, char* argv[])
{
  int i, j;
  gaspi_number_t gsize;
  int comm_state = WORKING;
  int num_failures = 0;
  int timesteps = 0;
  
  ASSERT (gaspi_proc_init(GASPI_BLOCK));
  ASSERT (gaspi_proc_rank(&myrank));
  ASSERT (gaspi_proc_num(&numprocs));

  read_params(argc, argv, &timesteps, &numprocs_idle);
	
  numprocs_working = numprocs - numprocs_idle;
  numprocs_working_and_idle = numprocs_working + numprocs_idle;
  gaspi_rank_t *comm_main_ranks = malloc( numprocs_idle * sizeof(gaspi_rank_t));
  init_array_2(comm_main_ranks, numprocs_working);

  /* contains info of all processes:
     which are working(0), broken(1) and idle(2).
     keeps updated all the time(iterations) */
  int * status_processes = (int *) malloc(numprocs * sizeof(int));
	
  init_array_3(status_processes, numprocs, WORKING);
  for(i = numprocs-1, j=0; j < numprocs_idle;--i,++j)
    {
      status_processes[i] = IDLE; // putting last processes to IDLE
    }
	
  // ===== GASPI group creation =====
  if(status_processes[myrank]==WORKING)
    {
      ASSERT(gaspi_group_create(&COMM_MAIN));

      gaspi_number_t i;
      for(i=0; i<numprocs; i++)
	{
	  if(status_processes[i]==WORKING)
	    {
	      ASSERT(gaspi_group_add(COMM_MAIN, i));
	      ASSERT(gaspi_group_size(COMM_MAIN, &gsize));
	    }
	}
      ASSERT(gaspi_group_ranks (COMM_MAIN, comm_main_ranks));
      ASSERT(gaspi_group_commit (COMM_MAIN, GASPI_BLOCK));
    }

  /* ====== Init a SYNC FLAGS Segment ====== */
  /* used to communicate the WORKING, BROKEN, or FINISHED_WORK status between the working and idle processes. */

  gaspi_size_t SYNC_global_mem_size;
  SYNC_global_mem_size = numprocs * sizeof(int);

  gaspi_pointer_t gm_ptr_sync=NULL;
  ASSERT(init_segment (gm_seg_sync_flags_id, SYNC_global_mem_size));
  ASSERT(gaspi_segment_ptr (gm_seg_sync_flags_id, &gm_ptr_sync));

  int * sync_flags = (int *) gm_ptr_sync;
  init_array_3(sync_flags, numprocs, WORKING);
	
  /* ====== Init a health check write FLAGS Segment ====== */
  /* This array is used to send the gaspi_write message write before health_chk routine,
     which will then update the gaspi internal health vector */

  gaspi_size_t health_chk_global_mem_size;
  health_chk_global_mem_size = numprocs*sizeof(int);
  gaspi_pointer_t gm_ptr_health_chk=NULL;
  ASSERT(init_segment (gm_seg_health_chk_array_id, health_chk_global_mem_size));
  ASSERT(gaspi_segment_ptr (gm_seg_health_chk_array_id, &gm_ptr_health_chk));
	
  gaspi_state_vector_t health_vec = (gaspi_state_vector_t) malloc(numprocs);
  ASSERT(gaspi_state_vec_get(health_vec));

  gaspi_rank_t * avoid_list= (gaspi_rank_t *) malloc(numprocs * sizeof(gaspi_rank_t));
  for(i = 0;i < numprocs; ++i)
    avoid_list[i] = (gaspi_rank_t) 0;
	
  gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK);

  /* ===== TIME-STEP LOOP =====  */
  if(status_processes[myrank]==IDLE)
    {
      /* IDLE processes remain in this loop */
      while(1)
	{
	  gaspi_printf("%d.", myrank);
	  if(sync_flags[0] == WORKING)
	    {
	      /*  NO FAILURE REPORTED  */
	      usleep(1000000);
	    }
	  if(sync_flags[0] == BROKEN)
	    {
	      /* FAILURE REPORTED */
	      gaspi_printf("myrank: %d Broken reported\n", myrank);
	      comm_state=BROKEN;
	      break;
	    }
	  if(sync_flags[0] == WORKFINISHED)
	    {
	      /* WORKFINISHED REPORTED */
	      gaspi_printf("myrank: %d WorkFinished reported\n", myrank);
	      comm_state = WORKFINISHED;
	      break;
	    }
	}
    }

  int time_step;
  for(time_step=1; time_step <= timesteps && comm_state!=WORKFINISHED; time_step++)
    {
      gaspi_printf("== time_step: %d ==\n", time_step);
      if(comm_state==WORKING && status_processes[myrank]==WORKING)
	{
	  gaspi_barrier(COMM_MAIN, GASPI_TIMEOUT_TIME);
	  sleep(1); // NOTE: this is the work section.
	  if(time_step == 5 && myrank== 1)
	    {
	      exit (-1);
	    }
	}
      
      if(time_step<timesteps )
	{
	  send_global_msg_to_check_state(health_vec, avoid_list);
	  num_failures = check_comm_health(status_processes, health_vec);

	  gaspi_printf("%d NUM_FAILURES at timestep %d = %d\n", myrank, time_step, num_failures);

	  if( num_failures != 0 )
	    {
	      rescue_process = numprocs_working;
	      if(myrank==0)
		{
		  // message the IDLE process
		  sync_flags[0]=BROKEN;
		  
		  for(i = 0 ; i < num_failures ; ++i)
		    {
		      /* TODO: multiple failures at the same time. */
		      gaspi_printf("messaging rescue_process: %d\n", rescue_process);
		      ASSERT(gaspi_write(gm_seg_sync_flags_id, 0, rescue_process, gm_seg_sync_flags_id, 0, sizeof(int), 0, GASPI_BLOCK));
		      rescue_process++;
		    }
		}

	      if(myrank==0 || myrank==rescue_process)
		gaspi_printf("%d REPAIRING COMM_MAIN FLAG 1\n", myrank);

	      update_status_processes_array(status_processes, health_vec);
	      numprocs_working_and_idle = refresh_numprocs_working_and_idle(status_processes);
	      
	      if(myrank != rescue_process)
		{
		  ASSERT(gaspi_group_delete(COMM_MAIN));
		  ASSERT(recover());
		}
	  
	      ASSERT(gaspi_group_create(&COMM_MAIN_NEW));

	      for(i = 0; i < numprocs; i++)
		{
		  if(status_processes[i]==WORKING)
		    {
		      ASSERT(gaspi_group_add(COMM_MAIN_NEW, i));
		      ASSERT(gaspi_group_size(COMM_MAIN_NEW, &gsize));
		      if(gsize == numprocs_working)
			break;
		    }
		}
	  
	      gaspi_printf("%d: COMM_MAIN_NEW size is: %hi\n", myrank, gsize);

	      ASSERT(gaspi_group_commit (COMM_MAIN_NEW, GASPI_BLOCK));
	  
	      init_array_2(comm_main_ranks, numprocs_working);
	  
	      ASSERT(gaspi_group_ranks (COMM_MAIN_NEW, comm_main_ranks));

	      gaspi_printf("printing group_ranks_main: \n");
	      gaspi_printf_array(comm_main_ranks, numprocs_working);

	      comm_state = WORKING;
	      gaspi_printf("%d REPAIRING COMM_MAIN_NEW FLAG 2\n", myrank);
				
	      if(status_processes[myrank] == WORKING)
		{
		  ASSERT(gaspi_barrier(COMM_MAIN_NEW, GASPI_BLOCK));
		  ASSERT(gaspi_barrier(COMM_MAIN_NEW, GASPI_BLOCK));
		}

	      /* set things to work again */
	      COMM_MAIN = COMM_MAIN_NEW;
	      time_step = 5;
	    }
	}
    }
  
  if(myrank == 0)
    {
      gaspi_printf("finished successfully\n");
    }
  
  gaspi_proc_term(10000);

  return EXIT_SUCCESS;
}
Exemple #14
0
int main(int argc, char *argv[])
{
  int j,i,k=0;
  int ret=0;

  const gaspi_size_t size=4096;//4k

  const gaspi_size_t memSize = 4294967296; //4GB

  gaspi_offset_t offset_write=0, offset_read = memSize / 2, offset_check = 3221225472 ;

  ASSERT (gaspi_proc_init(GASPI_BLOCK));

  ASSERT (gaspi_segment_create(0, memSize, GASPI_GROUP_ALL, GASPI_BLOCK, GASPI_MEM_INITIALIZED));

  gaspi_pointer_t _vptr;
  ASSERT (gaspi_segment_ptr(0, &_vptr));

  /* get memory area pointer */
#ifdef FLOAT
  float *mptr = (float *) _vptr;
#else
  int *mptr = (int *) _vptr;
#endif

  gaspi_rank_t myrank, highestnode;
  ASSERT (gaspi_proc_rank(&myrank));
  ASSERT (gaspi_proc_num(&highestnode));

  while(k <= RUNS)
    { 
      //generate random
      srand((unsigned)time(0)); 
      
#ifdef FLOAT
      srand48((unsigned) time(0));
#endif
      //clean
      for(j = 0; j < (memSize / 4); j++)
	mptr[j]= 0;

    //fill randoms up to 1GB
      for(j = 0; j < (memSize / 16); j++)
	{
#ifdef FLOAT
	  mptr[j]=  drand48() + (myrank*1.0);
#else
	  mptr[j]=  rand() + myrank;
#endif
	}

#ifdef DEBUG
#ifdef FLOAT
      gaspi_printf("random value in pos 0 %f\n", mptr[0]);
#else
      gaspi_printf("random value in pos 0 %d\n", mptr[0]);
#endif
#endif //DEBUG

      ASSERT (gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK));

      gaspi_printf("\n....Running iteration %d of %d...\n",k, RUNS);

    for(i = 0; i < ITERATIONS; i++)
      {
	for(j = 0; j < ITERATIONS; j++)
	  {
	    ASSERT (gaspi_write(0, offset_write, (myrank + 1) % highestnode,
				0, offset_read, size, 0, GASPI_BLOCK));

	    offset_write += size;
	    offset_read += size;
	  }
	ASSERT (gaspi_wait(0, GASPI_BLOCK));
      }
#ifdef DEBUG
    gaspi_printf("%d bytes written!\n", ITERATIONS * ITERATIONS * size);
#endif
    //check if data was written successfully
    ASSERT (gaspi_read(0, offset_check, (myrank + 1) % highestnode, 
		       0, memSize/2, GB, 0, GASPI_BLOCK));

    ASSERT (gaspi_wait(0, GASPI_BLOCK));
#ifdef DEBUG
    gaspi_printf("%d bytes read!\n",GB);
#endif
    j=0;

#ifdef DEBUG
#ifdef FLOAT
    gaspi_printf("Values  %f %f %f \n", mptr[0], mptr[memSize/8], mptr[offset_check/4]);
#else
    gaspi_printf("Values  %d %d %d \n", mptr[0], mptr[memSize/8], mptr[offset_check/4]);
#endif
#endif//DEBUG

    while(j < GB / 4 )
      {
	if(mptr[j] != mptr[offset_check / 4 + j]){
#ifdef FLOAT
	  gaspi_printf("value incorrect %f-%f at %d \n",mptr[j],mptr[offset_check / 4],j);
#else
	  gaspi_printf("value incorrect %d-%d at %d \n",mptr[j],mptr[offset_check / 4],j);
#endif
	  ret = -1;
	  goto out;
	}
	j++;
      }
    
    offset_write=0;
    offset_read = memSize / 2;

#ifdef DEBUG
    gaspi_printf("Check!\n");
#endif	

    k++;
  }

 out:

  gaspi_printf("Waiting to finish...\n");
  ASSERT(gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK));
  ASSERT (gaspi_proc_term(GASPI_BLOCK));

  return ret;
}
Exemple #15
0
int main(int argc, char *argv[])
{
  gaspi_rank_t numranks, myrank;
  gaspi_rank_t rankSend;
  gaspi_size_t segSize;
  const  gaspi_offset_t localOff_r= 0;
  const gaspi_offset_t remOff_r = 0;
  const  gaspi_offset_t localOff_w = _128MB / 2 ;
  const gaspi_offset_t remOff_w = _128MB / 2;
  gaspi_number_t queueSize, qmax;
  const gaspi_size_t commSize = _8MB;
  int i;
  gaspi_gpu_t gpus[8]; 
  gaspi_gpu_num nGPUs;


  TSUITE_INIT(argc, argv);

  ASSERT (gaspi_proc_init(GASPI_BLOCK));

  ASSERT (gaspi_proc_num(&numranks));
  ASSERT (gaspi_proc_rank(&myrank));
  ASSERT (gaspi_init_GPUs());
  ASSERT (gaspi_number_of_GPUs(&nGPUs));
  ASSERT (gaspi_GPU_ids(gpus));

  ASSERT (gaspi_segment_create(0, _128MB, GASPI_GROUP_ALL, GASPI_BLOCK, GASPI_MEM_INITIALIZED|GASPI_MEM_GPU));

  ASSERT( gaspi_segment_size(0, myrank, &segSize));

  ASSERT (gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK));

  ASSERT (gaspi_queue_size_max(&qmax));

  for(i = 0; i < 100; i++ )
  {
    for(rankSend = 0; rankSend < numranks; rankSend++)
    {
      if(rankSend == myrank)
        continue;

      gaspi_printf("partner rank: %d - %lu bytes (%d)\n", rankSend, commSize, i);

      ASSERT (gaspi_queue_size(1, &queueSize));
      if (queueSize > qmax - 24)
        ASSERT (gaspi_wait(1, GASPI_BLOCK));

      ASSERT (gaspi_read(0, localOff_r, rankSend, 0,  remOff_r,  commSize, 1, GASPI_BLOCK));
    }
  }
  for(i = 0; i < 100; i++ )
  {
    for(rankSend = 0; rankSend < numranks; rankSend++)
    {
      if(rankSend == myrank)
        continue;

      ASSERT (gaspi_queue_size(1, &queueSize));
      if (queueSize > qmax - 24)
        ASSERT (gaspi_wait(1, GASPI_BLOCK));
      
      ASSERT (gaspi_write(0, localOff_r, rankSend, 0,  remOff_r,  commSize, 1, GASPI_BLOCK));
    }
  }

  ASSERT (gaspi_wait(1, GASPI_BLOCK));

  ASSERT (gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK));

  ASSERT (gaspi_proc_term(GASPI_BLOCK));

  return EXIT_SUCCESS;
}
Exemple #16
0
int
main (int argc, char *argv[])
{
  int i, j, t;
  gaspi_rank_t myrank;
  char *ptr0;


  //on numa architectures you have to map this process to the numa node where nic is installed
  if (start_bench (2) != 0)
    {
      printf ("Initialization failed\n");
      exit (-1);
    }

  // BENCH //
  gaspi_proc_rank (&myrank);

  if (gaspi_segment_ptr (0, (void **) &ptr0) != GASPI_SUCCESS)
    {
      printf ("gaspi_segment_ptr failed !\n");
      exit (-1);
    }

  gaspi_float cpu_freq;
  gaspi_cpu_frequency(&cpu_freq);

  if (myrank < 2)
    {
      if(myrank == 0)
	{
	  printf("-----------------------------------\n");
	  printf ("%12s\t%5s\n", "Bytes", "Lat(usecs)");
	  printf("-----------------------------------\n");
	}

      int bytes = 2;
      volatile char *postBuf = (volatile char *) ptr0;

      for (i = 1; i < 24; i++)
	{
	  volatile char *pollBuf = (volatile char *) (ptr0 + ( 2 * bytes -1 ));
	  int rcnt = 0;
	  int cnt = 0;
	  gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK);

	  for (j = 0; j < ITERATIONS; j++)
	    {
	      if (rcnt < ITERATIONS && !(cnt < 1 && myrank == 1))
		{
		  rcnt++;
		  while (*pollBuf != (char) rcnt)
		    {
#ifdef MIC
		      _mm_delay_32(32);
#else
		      _mm_pause();
#endif
		    }
		}

	      stamp[j] = get_mcycles ();

	      postBuf[bytes - 1] = (char) ++cnt;

	      gaspi_write (0, 0, myrank ^ 0x1,
			   0, bytes, bytes,
			   0, GASPI_BLOCK);
	      
	      gaspi_wait (0, GASPI_BLOCK);
	    }

	  for (t = 0; t < (ITERATIONS - 1); t++)
	    delta[t] = stamp[t + 1] - stamp[t];

	  qsort (delta, (ITERATIONS - 1), sizeof *delta, mcycles_compare);

	  const double div = 1.0 / cpu_freq;
	  const double ts = (double) delta[ITERATIONS / 2] * div * 0.5;

	  if(myrank == 0)
	    printf ("%12d\t%4.2f\n", bytes, ts);

	  bytes <<= 1;
	}
    }

  end_bench ();

  return 0;
}
int main(int argc, char *argv[])
{
  int i;
  gaspi_rank_t rank, nprocs;
  gaspi_notification_id_t id;

  const int num_elems = 1024;

  TSUITE_INIT( argc, argv );

  ASSERT( gaspi_proc_init(GASPI_BLOCK) );

  ASSERT( gaspi_proc_num(&nprocs) );
  ASSERT( gaspi_proc_rank(&rank) );

  const gaspi_rank_t left = (rank + nprocs - 1 ) % nprocs;
  const gaspi_rank_t right = (rank + nprocs + 1) % nprocs;

  /* Create and fill buffer */
  int  * const buf = (int *) malloc(num_elems * sizeof(int));
  assert( buf != NULL);

  for (i = 0; i < num_elems; i++)
    {
      buf[i] = rank;
    }

  ASSERT( gaspi_segment_use( 0, buf, num_elems * sizeof(int),
			     GASPI_GROUP_ALL, GASPI_BLOCK,
			     0) );

  ASSERT( gaspi_segment_create( 1, num_elems * sizeof(int),
				GASPI_GROUP_ALL, GASPI_BLOCK,
				GASPI_MEM_INITIALIZED) );

  ASSERT(gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK));

  /* write data to neighbour */
  ASSERT( gaspi_write( 0, 0, right,
		       1, 0, num_elems * sizeof(int),
		       0, GASPI_BLOCK) );

  ASSERT( gaspi_notify( 1, right, 0, 1, 0, GASPI_BLOCK ) );
  ASSERT( gaspi_notify_waitsome( 1, 0, 1, &id, GASPI_BLOCK ) );
  ASSERT( gaspi_wait( 0, GASPI_BLOCK ) );

  /* Check data */
  gaspi_pointer_t seg1_ptr;
  ASSERT( gaspi_segment_ptr( 1, &seg1_ptr ) );
  int * recv_buf = (int *) seg1_ptr;

  for (i = 0; i < num_elems; i++)
    {
      assert(recv_buf[i] == left);
    }

  ASSERT( gaspi_segment_delete(0));
  ASSERT( gaspi_segment_delete(1));

  for (i = 0; i < num_elems; i++)
    {
      assert(buf[i] == rank);
    }

  ASSERT( gaspi_barrier( GASPI_GROUP_ALL, GASPI_BLOCK ) );

  ASSERT( gaspi_proc_term( GASPI_BLOCK ) );

  return EXIT_SUCCESS;
}
gaspi_return_t
pgaspi_gpu_write(const gaspi_segment_id_t segment_id_local,
		 const gaspi_offset_t offset_local,
		 const gaspi_rank_t rank,
		 const gaspi_segment_id_t segment_id_remote,
		 const gaspi_offset_t offset_remote,
		 const gaspi_size_t size,
		 const gaspi_queue_id_t queue,
		 const gaspi_timeout_t timeout_ms)
{
  if( glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].cudaDevId < 0 ||
      size <= GASPI_GPU_DIRECT_MAX )
    {
      return gaspi_write(segment_id_local, offset_local, rank, segment_id_remote, offset_remote, size, queue, timeout_ms);
    }

  if(lock_gaspi_tout (&glb_gaspi_ctx.lockC[queue], timeout_ms))
    return GASPI_TIMEOUT;

  char* host_ptr = (char*)(glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].host_ptr + NOTIFY_OFFSET + offset_local);
  char* device_ptr = (char*)(glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].addr + offset_local);

  gaspi_gpu* agpu =  _gaspi_find_gpu(glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].cudaDevId);
  if( !agpu )
    {
      gaspi_print_error("No GPU found or not initialized (gaspi_init_GPUs).");
      return GASPI_ERROR;
    }

  int size_left = size;
  int copy_size = 0;
  int gpu_offset = 0;
  const int BLOCK_SIZE = GASPI_GPU_BUFFERED;

  const gaspi_cycles_t s0 = gaspi_get_cycles ();

  while(size_left > 0)
    {
      int i;
      for(i = 0; i < GASPI_CUDA_EVENTS; i++)
	{
	  if(size_left > BLOCK_SIZE)
	    copy_size = BLOCK_SIZE;
	  else
	    copy_size = size_left;

	  if( cudaMemcpyAsync(host_ptr + gpu_offset, device_ptr + gpu_offset, copy_size, cudaMemcpyDeviceToHost, agpu->streams[queue]))
	    {
	      unlock_gaspi(&glb_gaspi_ctx.lockC[queue]);
	      return GASPI_ERROR;
	    }

	  glb_gaspi_ctx.ne_count_c[queue]++;

	  agpu->events[queue][i].segment_remote = segment_id_remote;
	  agpu->events[queue][i].segment_local = segment_id_local;
	  agpu->events[queue][i].size = copy_size;
	  agpu->events[queue][i].rank = rank;
	  agpu->events[queue][i].offset_local = offset_local+gpu_offset;
	  agpu->events[queue][i].offset_remote = offset_remote+gpu_offset;
	  agpu->events[queue][i].in_use =1;

	  cudaError_t err = cudaEventRecord(agpu->events[queue][i].event, agpu->streams[queue]);
	  if(err != cudaSuccess)
	    {
	      glb_gaspi_ctx.qp_state_vec[queue][rank] = GASPI_STATE_CORRUPT;
	      unlock_gaspi(&glb_gaspi_ctx.lockC[queue]);
	      return GASPI_ERROR;
	    }

	  gpu_offset += copy_size;
	  size_left -= copy_size;

	  if(size_left == 0)
	    break;

	  if(agpu->events[queue][i].ib_use)
	    {
	      struct ibv_wc wc;
	      int ne;
	      do
		{
		  ne = ibv_poll_cq (glb_gaspi_ctx_ib.scqC[queue], 1, &wc);
		  glb_gaspi_ctx.ne_count_c[queue] -= ne;
		  if (ne == 0)
		    {
		      const gaspi_cycles_t s1 = gaspi_get_cycles ();
		      const gaspi_cycles_t tdelta = s1 - s0;

		      const float ms = (float) tdelta * glb_gaspi_ctx.cycles_to_msecs;
		      if (ms > timeout_ms)
			{
			  unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
			  return GASPI_TIMEOUT;
			}
		    }
		} while(ne==0);
	      agpu->events[queue][i].ib_use = 0;
	    }
	}

      for(i = 0; i < GASPI_CUDA_EVENTS; i++)
	{
	  cudaError_t error;
	  if ( agpu->events[queue][i].in_use == 1 )
	    {
	      do
		{
		  error = cudaEventQuery(agpu->events[queue][i].event );
		  if( cudaSuccess == error )
		    {
		      if (_gaspi_event_send(&agpu->events[queue][i],queue))
			{
			  unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
			  return GASPI_ERROR;
			}

		      agpu->events[queue][i].in_use = 0;
		    }
		  else if(error == cudaErrorNotReady)
		    {
		      const gaspi_cycles_t s1 = gaspi_get_cycles ();
		      const gaspi_cycles_t tdelta = s1 - s0;

		      const float ms = (float) tdelta * glb_gaspi_ctx.cycles_to_msecs;
		      if (ms > timeout_ms)
			{
			  unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
			  return GASPI_TIMEOUT;
			}
		    }
		  else
		    {
		      unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
		      return GASPI_ERROR;
		    }
		} while(error != cudaSuccess);
	    }
	}
    }

  unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);

  return GASPI_SUCCESS;
}