/* MPI + OpenMP halo exchange with double buffering; all communication is
   funnelled through thread 0.
   Assumes project headers providing LEFT/RIGHT, VLEN, NITER, ASSERT, now(),
   the array_ELEM_* macros and the data_init/data_compute/data_verify routines. */
#include <mpi.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

int main (int argc, char *argv[])
{

  MPI_Init (&argc, &argv);
  
  int nProc, iProc;

  MPI_Comm_rank (MPI_COMM_WORLD, &iProc);
  MPI_Comm_size (MPI_COMM_WORLD, &nProc);

  // number of threads
  const int NTHREADS = 6;

  // number of buffers
  const int NWAY     = 2;
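  // NWAY buffers form a double buffer: each iteration reads the current
  // buffer and writes its result into the other one, then the roles swap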

  // left neighbour
  const int left  = LEFT(iProc, nProc);

  // right neighbour
  const int right = RIGHT(iProc, nProc);
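  // LEFT and RIGHT are assumed to wrap around, so the ranks form a ring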

  // allocate array for local vector, left halo and right halo
  double* array = malloc (NWAY * (NTHREADS+2) * 2 * VLEN * sizeof (double));
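  // layout: NWAY buffers x (NTHREADS local slices + 2 halo slices) x a left
  // and a right part per slice x VLEN doubles, addressed via the array_ELEM_* macros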
  ASSERT (array != 0);

  // initial buffer id
  int buffer_id = 0;

  // initialize data
  data_init (NTHREADS, iProc, buffer_id, array);
  
  omp_set_num_threads (NTHREADS);

  MPI_Barrier (MPI_COMM_WORLD);

  double time = -now();
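  // buffer_id is firstprivate in the parallel region below: every thread keeps
  // its own copy, but all copies are toggled identically, so the threads always
  // agree on which buffer is the current one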

#pragma omp parallel default (shared) firstprivate (buffer_id)
  {
    const int tid = omp_get_thread_num();

    for (int k = 0; k < NITER; ++k)
    {
      for ( int i = 0; i < nProc * NTHREADS; ++i )
      {

	const int slice_id    = tid + 1;
	const int left_halo   = 0;
	const int right_halo  = NTHREADS+1;
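	// slice 0 is the left halo, slices 1..NTHREADS belong to the threads,
	// slice NTHREADS+1 is the right halo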

	if (tid == 0)
	  {

	    MPI_Request send_req[2];
	    MPI_Request recv_req[2];

	    // post recv
	    MPI_Irecv ( &array_ELEM_right (buffer_id, left_halo, 0), VLEN, MPI_DOUBLE
		       , left, i, MPI_COMM_WORLD, &recv_req[0]);

	    // post recv
	    MPI_Irecv ( &array_ELEM_left (buffer_id, right_halo, 0), VLEN, MPI_DOUBLE
		       , right, i, MPI_COMM_WORLD, &recv_req[1]);

	    // issue send
	    MPI_Isend ( &array_ELEM_right (buffer_id, right_halo - 1, 0), VLEN, MPI_DOUBLE
			 , right, i, MPI_COMM_WORLD, &send_req[0]);

	    // issue send
	    MPI_Isend ( &array_ELEM_left (buffer_id, left_halo + 1, 0), VLEN, MPI_DOUBLE
			 , left, i, MPI_COMM_WORLD, &send_req[1]);

	    // free the send requests without waiting; the double buffering (NWAY)
	    // and the receives completed on the neighbours are relied upon to
	    // guarantee that the send buffers are not reused too early
	    MPI_Request_free(&send_req[0]);
	    
	    MPI_Request_free(&send_req[1]);

	    // wait for both receives to complete
	    MPI_Waitall (2, recv_req, MPI_STATUSES_IGNORE);

	  }

#pragma omp barrier
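	// the barrier guarantees that both halos received by thread 0 are
	// visible to all threads before they start computing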

	// compute data, read from id "buffer_id", write to id "1 - buffer_id"
	data_compute (NTHREADS, array, 1 - buffer_id, buffer_id, slice_id);

#pragma omp barrier
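	// the second barrier keeps thread 0 from communicating out of the newly
	// written buffer before every thread has finished its slice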

	// alternate the buffer
	buffer_id = 1 - buffer_id;

      }
    }
  }
  time += now();
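  // after NITER * nProc * NTHREADS buffer swaps the final result sits in
  // buffer (NITER * nProc * NTHREADS) % NWAY, which is verified below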

  data_verify (NTHREADS, iProc, ( NITER * nProc * NTHREADS ) % NWAY, array);

  printf ("# mpi %s nProc %d vlen %i niter %d nthreads %i nway %i time %g\n"
         , argv[0], nProc, VLEN, NITER, NTHREADS, NWAY, time
         );
  
  MPI_Finalize();

  free (array);

  return EXIT_SUCCESS;
}
/* GASPI + OpenMP halo exchange with double buffering; communication is again
   funnelled through thread 0 and synchronised via notifications.
   Assumes project headers providing LEFT/RIGHT, VLEN, NITER, now(), SUCCESS_OR_DIE,
   wait_for_queue_max_half, wait_or_die, the array_OFFSET_* macros and the
   data_init/data_compute/data_verify routines. */
#include <GASPI.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

int main (int argc, char *argv[])
{

  SUCCESS_OR_DIE (gaspi_proc_init (GASPI_BLOCK));

  gaspi_rank_t iProc, nProc;
  SUCCESS_OR_DIE (gaspi_proc_rank (&iProc));
  SUCCESS_OR_DIE (gaspi_proc_num (&nProc));

  // number of threads
  const int NTHREADS = 2;

  // number of buffers
  const int NWAY     = 2;

  gaspi_segment_id_t const segment_id = 0;

  // allocate a segment for the local vector, left halo and right halo
  SUCCESS_OR_DIE ( gaspi_segment_create
      ( segment_id, NWAY * (NTHREADS + 2) * 2 * VLEN * sizeof (double)
      , GASPI_GROUP_ALL, GASPI_BLOCK, GASPI_MEM_UNINITIALIZED));
  gaspi_pointer_t array;
  SUCCESS_OR_DIE ( gaspi_segment_ptr ( segment_id, &array) );
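  // the segment is registered with all ranks (GASPI_GROUP_ALL), so neighbours
  // can write halo data into it directly; locally it is accessed through "array"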

  // initial buffer id
  int buffer_id = 0;

  // set notification values
  gaspi_notification_id_t left_data_available[NWAY];
  gaspi_notification_id_t right_data_available[NWAY];
  for (gaspi_notification_id_t id = 0; id < NWAY; ++id)
  {
    left_data_available[id] = id;
    right_data_available[id] = NWAY + id;
  }
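  // one notification id per transfer direction and per buffer, so notifications
  // belonging to the two buffers can never be confused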

  // set queue id
  gaspi_queue_id_t queue_id = 0;

  // initialize data
  data_init (NTHREADS, iProc, buffer_id, array);

  omp_set_num_threads (NTHREADS);

  double time = -now();

#pragma omp parallel default (shared) firstprivate (buffer_id)
  {

    const int tid = omp_get_thread_num();

    for (int k = 0; k < NITER; ++k)
    {
      for ( int i = 0; i < nProc * NTHREADS; ++i )
      {

	const int left_halo   = 0;
	const int slice_id    = tid + 1;
	const int right_halo  = NTHREADS+1;
	
        if (tid == 0)
        {
	  // ensure there is room in the queue, then write our leftmost slice
	  // into the right halo of the left neighbour
          wait_for_queue_max_half (&queue_id);
          SUCCESS_OR_DIE ( gaspi_write_notify
              ( segment_id, array_OFFSET_left (buffer_id, left_halo + 1, 0), LEFT(iProc, nProc) 
              , segment_id, array_OFFSET_left (buffer_id, right_halo, 0), VLEN * sizeof (double)
              , right_data_available[buffer_id], 1 + i, queue_id, GASPI_BLOCK));

	  // ensure there is room in the queue, then write our rightmost slice
	  // into the left halo of the right neighbour
          wait_for_queue_max_half (&queue_id);
          SUCCESS_OR_DIE ( gaspi_write_notify
              ( segment_id, array_OFFSET_right (buffer_id, right_halo - 1, 0), RIGHT(iProc, nProc)
              , segment_id, array_OFFSET_right (buffer_id, left_halo, 0), VLEN * sizeof (double)
              , left_data_available[buffer_id], 1 + i, queue_id, GASPI_BLOCK));


	  // wait for data notification
          wait_or_die (segment_id, right_data_available[buffer_id], 1 + i);

	  // wait for data notification
          wait_or_die (segment_id, left_data_available[buffer_id], 1 + i);
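          // both notifications are set by the neighbours' writes; once they
          // have arrived, the halo slices for this buffer are in place in the
          // local segment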


        }
#pragma omp barrier

	// compute data, read from id "buffer_id", write to id "1 - buffer_id"
	data_compute ( NTHREADS, array, 1 - buffer_id, buffer_id, slice_id);

#pragma omp barrier

	// alternate the buffer
	buffer_id = 1 - buffer_id;

      }
    }
  }

  time += now();

  data_verify (NTHREADS, iProc, (NITER * nProc * NTHREADS) % NWAY, array);

  printf ("# gaspi %s nProc %d vlen %i niter %d nthreads %i nway %i time %g\n"
         , argv[0], nProc, VLEN, NITER, NTHREADS, NWAY, time
         );

  SUCCESS_OR_DIE (gaspi_proc_term (GASPI_BLOCK));

  return EXIT_SUCCESS;
}