示例#1
0
//--------------------------------------------------------------------------------------
// FUNCTION: RCCE_send
//--------------------------------------------------------------------------------------
// send function for simplified API; use library-maintained variables for synchronization
//
// Sends "size" bytes starting at "privbuf" to UE "dest". Synchronization uses the
// library-level flags RCCE_ready_flag[dest] and RCCE_sent_flag[RCCE_IAM]; bulk data
// goes through RCCE_send_general() using RCCE_buff_ptr / RCCE_chunk.
//
// Parameters:
//   privbuf  source buffer holding the message
//   size     message length in bytes
//   dest     ID of the receiving UE (not range-checked here)
//
// Returns:
//   RCCE_REJECTED  if the send queue consulted below is non-NULL
//   RCCE_SUCCESS   after the short-message path (USE_TAGGED_FOR_SHORT builds only)
//   otherwise the value returned by RCCE_send_general()
//
// Return values of the flag/wait calls on the short-message path are not checked.
//--------------------------------------------------------------------------------------
int RCCE_send(char *privbuf, size_t size, int dest) {

  // With USE_PROBE_FLAGS this UE's probe flag is used (written below on the short
  // path, or handed to RCCE_send_general); without it no probe flag is used.
#ifdef USE_PROBE_FLAGS
  RCCE_FLAG* probe = &RCCE_probe_flag[RCCE_IAM];
#else
  RCCE_FLAG* probe = NULL;
#endif

  // Refuse to send while the send queue is non-empty. The queue is a single
  // pointer in the standard build and one entry per destination in the
  // remote-put build.
  // NOTE(review): presumably the queue holds pending non-blocking sends -- confirm.
#ifndef USE_REMOTE_PUT_LOCAL_GET
  if(RCCE_send_queue != NULL)
#else
  if(RCCE_send_queue[dest] != NULL)
#endif
     return(RCCE_REJECTED);

#ifdef USE_TAGGED_FOR_SHORT
  // Short-message path: when the message is at most RCCE_LINE_SIZE - sizeof(int)
  // bytes, privbuf/size are passed as the tag arguments of a tagged flag write
  // and RCCE_send_general() is not called.
  if(size <= (RCCE_LINE_SIZE - sizeof(int)))
  {
#ifdef USE_PROBE_FLAGS
    // the probe flag written at dest carries the message as its tag
    RCCE_flag_write_tagged(probe, RCCE_FLAG_SET, dest, privbuf, size);
#endif

#ifdef USE_REMOTE_PUT_LOCAL_GET

    // remote put: wait for the ready flag first, reset it locally, then signal "sent"
    RCCE_wait_until(RCCE_ready_flag[dest], RCCE_FLAG_SET);
    RCCE_flag_write(&RCCE_ready_flag[dest], RCCE_FLAG_UNSET, RCCE_IAM);

    // With USE_PROBE_FLAGS_SHORTCUT defined no sent flag is written on this path.
    // NOTE(review): presumably the probe flag write above replaces it -- confirm
    // that the shortcut is only enabled together with USE_PROBE_FLAGS.
#ifndef USE_PROBE_FLAGS_SHORTCUT
#ifdef USE_PROBE_FLAGS
    RCCE_flag_write(&RCCE_sent_flag[RCCE_IAM], RCCE_FLAG_SET, dest);
#else
    RCCE_flag_write_tagged(&RCCE_sent_flag[RCCE_IAM], RCCE_FLAG_SET, dest, privbuf, size);
#endif
#endif

#else // LOCAL PUT / REMOTE GET: (standard)
  
    // standard order: signal "sent" first (tagged with the message unless the probe
    // flag already carried it), then wait for the ready flag and reset it locally
#ifdef USE_PROBE_FLAGS
    RCCE_flag_write(&RCCE_sent_flag[RCCE_IAM], RCCE_FLAG_SET, dest);
#else
    RCCE_flag_write_tagged(&RCCE_sent_flag[RCCE_IAM], RCCE_FLAG_SET, dest, privbuf, size);
#endif

    RCCE_wait_until(RCCE_ready_flag[dest], RCCE_FLAG_SET);
    RCCE_flag_write(&RCCE_ready_flag[dest], RCCE_FLAG_UNSET, RCCE_IAM);

#endif // !USE_REMOTE_PUT_LOCAL_GET

    return(RCCE_SUCCESS);
  }
  else
#endif

  // General path. In USE_TAGGED_FOR_SHORT builds the dangling "else" above binds
  // to this return statement; otherwise it is reached unconditionally.
  return(RCCE_send_general(privbuf, RCCE_buff_ptr, RCCE_chunk, 
			   &RCCE_ready_flag[dest], &RCCE_sent_flag[RCCE_IAM],
			   size, dest,
			   1, 0, 0,          // copy, pipe, mcast
			   NULL, 0, probe)); // tag, len
}
示例#2
0
//--------------------------------------------------------------------------------------
// FUNCTION: RCCE_barrier
//--------------------------------------------------------------------------------------
// very simple, linear barrier
//
// Every UE toggles the integer stored at the start of its own gather line. The UE
// that is member[0] of the communicator polls the gather lines of all other members
// until each one matches its own toggled value, then writes that value to the
// release flag of every other member. All other UEs wait on their release flag.
//
// Returns RCCE_SUCCESS, or the result of RCCE_error_return() for the first RCCE call
// that reports a non-zero error code.
//--------------------------------------------------------------------------------------
int RCCE_barrier(RCCE_COMM *comm) {

  volatile unsigned char own_line[RCCE_LINE_SIZE];   // copy of this UE's gather line
  volatile unsigned char peer_line[RCCE_LINE_SIZE];  // copy of another member's gather line
  volatile int *own_cycle  = (volatile int *)own_line;
  volatile int *peer_cycle = (volatile int *)peer_line;
  int root_slot = 0;   // index of the coordinating member within comm->member
  int arrived   = 0;   // members known to have reached the barrier (root included)
  int idx, error;

  if (RCCE_debug_synch)
    fprintf(STDERR,"UE %d has checked into barrier\n", RCCE_IAM);

  // toggle the local barrier variable: fetch the line, flip, write it back
  error = RCCE_get(own_line, (t_vcharp)(comm->gather), RCCE_LINE_SIZE, RCCE_IAM);
  if (error)
    return(RCCE_error_return(RCCE_debug_synch,error));

  *own_cycle = !(*own_cycle);

  error = RCCE_put((t_vcharp)(comm->gather), own_line, RCCE_LINE_SIZE, RCCE_IAM);
  if (error)
    return(RCCE_error_return(RCCE_debug_synch,error));

  if (RCCE_IAM != comm->member[root_slot]) {
    // non-root UEs block until the root writes the new cycle value to their release flag
    error = RCCE_wait_until(comm->release, *own_cycle);
    if (error)
      return(RCCE_error_return(RCCE_debug_synch,error));
  }
  else {
    // root: rescan the other members' gather values until all of them equal the
    // root's cycle value, i.e. until the count reaches comm->size
    while (arrived != comm->size) {
      arrived = 1;   // the root itself counts as arrived; member #0 is skipped below
      for (idx = 1; idx < comm->size; idx++) {
        error = RCCE_get(peer_line, (t_vcharp)(comm->gather), RCCE_LINE_SIZE,
                         comm->member[idx]);
        if (error)
          return(RCCE_error_return(RCCE_debug_synch,error));
        if (*peer_cycle == *own_cycle)
          arrived++;
      }
    }

    // everybody has arrived: release all other members
    for (idx = 1; idx < comm->size; idx++) {
      error = RCCE_flag_write(&(comm->release), *own_cycle, comm->member[idx]);
      if (error)
        return(RCCE_error_return(RCCE_debug_synch,error));
    }
  }

  if (RCCE_debug_synch)
    fprintf(STDERR,"UE %d has cleared barrier\n", RCCE_IAM);

  return(RCCE_SUCCESS);
}
示例#3
0
//--------------------------------------------------------------------------------------
// FUNCTION: RCCE_barrier
//--------------------------------------------------------------------------------------
// very simple, linear barrier
//
// Each UE flips the gather bit (at comm->gather.location) inside its own copy of the
// gather line. The UE that is member[0] of the communicator polls the gather line of
// every other member until its bit matches the root's new value, then writes that
// value to the release flag of every other member. Non-root UEs wait on their
// release flag.
//
// Returns RCCE_SUCCESS, or the result of RCCE_error_return() for the first RCCE call
// that reports a non-zero error code.
//--------------------------------------------------------------------------------------
int RCCE_barrier(RCCE_COMM *comm) {

  int               counter, i, error;
  int               ROOT =  0;
  t_vchar           cyclechar[RCCE_LINE_SIZE];   // copy of this UE's gather line
  t_vchar           valchar  [RCCE_LINE_SIZE];   // copy of another member's gather line
  t_vcharp          gatherp;                     // address of the line holding the gather flag
  RCCE_FLAG_STATUS  cycle;                       // flag value of the current barrier episode

  counter = 0;

  // read the line address once and use it for every access below
  gatherp = comm->gather.line_address;

  if (RCCE_debug_synch)
    fprintf(STDERR,"UE %d has checked into barrier\n", RCCE_IAM);

  // flip local barrier variable: fetch the line, flip the bit, write the line back
  error = RCCE_get(cyclechar, gatherp, RCCE_LINE_SIZE, RCCE_IAM);
  if (error)
    return(RCCE_error_return(RCCE_debug_synch,error));

  // NOTE(review): RCCE_flip_bit_value presumably toggles the bit in cyclechar and
  // returns the new value -- confirm against its definition.
  cycle = RCCE_flip_bit_value(cyclechar, comm->gather.location);

  error = RCCE_put(gatherp, cyclechar, RCCE_LINE_SIZE, RCCE_IAM);
  if (error)
    return(RCCE_error_return(RCCE_debug_synch,error));

  if (RCCE_IAM==comm->member[ROOT]) {
    // read "remote" gather flags; once all equal "cycle" (i.e counter==comm->size),
    // we know all UEs have reached the barrier
    while (counter != comm->size) {
      // skip the first member (#0), because that is the ROOT; it counts as arrived
      for (counter=i=1; i<comm->size; i++) {
        // copy flag values out of comm buffer
        error = RCCE_get(valchar, gatherp, RCCE_LINE_SIZE, comm->member[i]);
        if (error)
          return(RCCE_error_return(RCCE_debug_synch,error));
        if (RCCE_bit_value(valchar, comm->gather.location) == cycle) counter++;
      }
    }
    // set release flags
    for (i=1; i<comm->size; i++) {
      error = RCCE_flag_write(&(comm->release), cycle, comm->member[i]);
      if (error)
        return(RCCE_error_return(RCCE_debug_synch,error));
    }
  }
  else {
    // non-root UEs block until the root writes the new cycle value to their release flag
    error = RCCE_wait_until(comm->release, cycle);
    if (error)
      return(RCCE_error_return(RCCE_debug_synch,error));
  }

  if (RCCE_debug_synch) fprintf(STDERR,"UE %d has cleared barrier\n", RCCE_IAM);
  return(RCCE_SUCCESS);
}
示例#4
0
int RCCE_APP(int argc, char **argv){

  int ID, ID_nb, ID_donor, nrounds, error, strlength;
  RCCE_FLAG flag_sent, flag_ack;

  double *cbuffer, *buffer, sum;
  char msg[RCCE_MAX_ERROR_STRING];

  RCCE_init(&argc, &argv);

  ID = RCCE_ue();
  ID_nb = (ID+1)%RCCE_num_ues();
  ID_donor = (ID-1+RCCE_num_ues())%RCCE_num_ues();

  if (argc != 2) {
    if (ID==0) printf("Executable requires one parameter (number of rounds): %d\n",argc-1);
    return(1);
  }
  nrounds = atoi(*++argv);
  if (nrounds < 0) {
    if (ID==0) printf("Number of rounds should be non-negative: %d\n", nrounds);
    return(1);
  }

  /* allocate private memory and comm buffer space */
  buffer  = (double *) malloc(BUFSIZE*sizeof(double));
  if (!buffer) printf("Mark 01: Failed to allocate private buffer on proc %d\n", ID);
  cbuffer = (double *) RCCE_malloc(BUFSIZE*sizeof(double));
  if (!buffer) printf("Mark 02:RCCE failed to allocate %d doubles on proc %d\n",
      BUFSIZE, ID);

  /* initialize buffer with UE-specific data  */
  for (int i=0; i<BUFSIZE; i++) buffer[i] = (double)(ID+1+i);
  sum = 0.0;  
  for (int i=0; i<BUFSIZE; i++) sum += buffer[i];
  printf("Initial sum on UE %03d equals %f\n", ID, sum);

  /* create and initialize flag variables */
  if (error=RCCE_flag_alloc(&flag_sent))
    printf("Mark 03a: Could not allocate flag_sent on %d, error=%d\n", ID, error);
  if (error=RCCE_flag_alloc(&flag_ack))
    printf("Mark 03b: Could not allocate flag_ack on %d, error=%d\n", ID, error);

  if(error=RCCE_flag_write(&flag_sent, RCCE_FLAG_UNSET, ID)) 
    printf("Mark 04: Could not initialize flag_sent on %d, error=%d\n", ID, error);
  if(error=RCCE_flag_write(&flag_ack, RCCE_FLAG_SET, ID_donor)) 
    printf("Mark 05: Could not initialize flag_ack on %d, error=%d\n", ID_donor, error);

  for (int round=0; round<nrounds; round++) {

    int size = BUFSIZE*sizeof(double);
    RCCE_wait_until(flag_ack, RCCE_FLAG_SET);
    RCCE_flag_write(&flag_ack, RCCE_FLAG_UNSET, ID);
    RCCE_put((t_vcharp)cbuffer, (t_vcharp)buffer, size, ID_nb);
    RCCE_flag_write(&flag_sent, RCCE_FLAG_SET, ID_nb);

    RCCE_wait_until(flag_sent, RCCE_FLAG_SET);
    RCCE_flag_write(&flag_sent, RCCE_FLAG_UNSET, ID);
    RCCE_get((t_vcharp)buffer, (t_vcharp)cbuffer, size, ID);
    RCCE_flag_write(&flag_ack, RCCE_FLAG_SET, ID_donor);

  }

  /* compute local sum */
  sum = 0.0;
  for (int i=0; i<BUFSIZE; i++) sum += buffer[i];
  printf("Final sum on UE %03d equals %f\n", ID, sum);

  RCCE_finalize();

  return(0);
}
示例#5
0
//--------------------------------------------------------------------------------------
// FUNCTION: RCCE_send_general
//--------------------------------------------------------------------------------------
// Synchronized send function (gory and non-gory mode)
//
// The message is transferred in up to three stages:
//   1. size/chunk full chunks of "chunk" bytes each (optionally pipelined, in which
//      case every chunk is sent as two cache-line-aligned sub-chunks),
//   2. the whole cache lines of what is left over,
//   3. a final partial cache line, copied into a line-sized pad buffer first.
// Every stage performs one handshake with the receiver:
//   - standard build (local put / remote get): put data into the own comm buffer,
//     set "sent" at dest, wait for "ready", reset "ready" locally;
//   - USE_REMOTE_PUT_LOCAL_GET build: wait for "ready", reset it locally, put data
//     into the comm buffer of dest, set "sent" at dest.
// With mcast set (standard build, non-pipelined stages only) the flag handshake is
// replaced by two RCCE_TNS_barrier() calls on RCCE_COMM_WORLD.
//
// Returns RCCE_SUCCESS, or the result of RCCE_error_return() when multicast is
// requested in a USE_REMOTE_PUT_LOCAL_GET build. Return values of RCCE_put and of
// the flag/wait/barrier calls are not checked.
//
// NOTE(review): size/chunk and size%chunk are evaluated unconditionally (unless the
// zero-byte shortcut returns first), so chunk is assumed to be non-zero -- confirm
// with callers.
//--------------------------------------------------------------------------------------
static int RCCE_send_general(
  char *privbuf,    // source buffer in local private memory (send buffer)
  t_vcharp combuf,  // intermediate buffer in MPB
  size_t chunk,     // size of MPB available for this message (bytes)
  RCCE_FLAG *ready, // flag indicating whether receiver is ready
  RCCE_FLAG *sent,  // flag indicating whether message has been sent by source
  size_t size,      // size of message (bytes)
  int dest,         // UE that will receive the message
  int copy,         // set to 0 for synchronization only (no copying/sending)
  int pipe,         // use pipelining?
  int mcast,        // multicast?
  void* tag,        // additional tag?
  int len,          // length of additional tag
  RCCE_FLAG *probe  // flag for probing for incoming messages
  ) {

  char padline[RCCE_LINE_SIZE]; // copy buffer, used if message not multiple of line size
  size_t wsize,    // offset within send buffer when putting in "chunk" bytes
        remainder, // bytes remaining to be sent
        nbytes;    // number of bytes to be sent in single RCCE_put call
  char *bufptr;    // running pointer inside privbuf for current location

#ifdef USE_REMOTE_PUT_LOCAL_GET
  // multicast is rejected outright in the remote-put build
  if(mcast) return(RCCE_error_return(1, RCCE_ERROR_NO_MULTICAST_SUPPORT));
#endif

  // Announce the message on the probe flag at dest, if one was supplied. The
  // preprocessor selects the single statement controlled by this "if"; with
  // USE_TAGGED_FLAGS the tag travels with the probe flag.
  if(probe)
#ifdef USE_TAGGED_FLAGS
    RCCE_flag_write_tagged(probe, RCCE_FLAG_SET, dest, tag, len);
#else
    RCCE_flag_write(probe, RCCE_FLAG_SET, dest);
#endif

#ifdef USE_SYNCH_FOR_ZERO_BYTE
  // synchronize even in case of zero byte messages:
  // one handshake without any data transfer, then return. The tag is attached to
  // the "sent" flag only when no probe flag was used (a probe flag already carried it).
  if(size == 0) {
#ifdef USE_REMOTE_PUT_LOCAL_GET
    RCCE_wait_until(*ready, RCCE_FLAG_SET);
    RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);
#ifdef USE_TAGGED_FLAGS
    if(!probe)
      RCCE_flag_write_tagged(sent, RCCE_FLAG_SET, dest, tag, len);
    else
#endif
    RCCE_flag_write(sent, RCCE_FLAG_SET, dest);
#else // LOCAL PUT / REMOTE GET: (standard)
#ifdef USE_TAGGED_FLAGS
    if(!probe)
      RCCE_flag_write_tagged(sent, RCCE_FLAG_SET, dest, tag, len);
    else
#endif
    RCCE_flag_write(sent, RCCE_FLAG_SET, dest);
    RCCE_wait_until(*ready, RCCE_FLAG_SET);
    RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);
#endif // !USE_REMOTE_PUT_LOCAL_GET
    return(RCCE_SUCCESS);
  }
#endif // USE_SYNCH_FOR_ZERO_BYTE

  // ---- stage 1: full chunks -------------------------------------------------------
  // Both loops below leave wsize == (size/chunk)*chunk on exit; the later
  // "wsize == 0" tests therefore mean "no full chunk was sent".
  if(!pipe) {
    // send data in units of available chunk size of comm buffer 
    for (wsize=0; wsize< (size/chunk)*chunk; wsize+=chunk) {
      bufptr = privbuf + wsize;
      nbytes = chunk;

#ifdef USE_REMOTE_PUT_LOCAL_GET

      RCCE_wait_until(*ready, RCCE_FLAG_SET);
      RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);
      
      // copy private data to remote comm buffer
      if(copy) RCCE_put(combuf, (t_vcharp) bufptr, nbytes, dest);

      // only the first chunk carries the tag, and only if no probe flag was used;
      // the dangling "else" binds to the plain flag write after the #endif
#ifdef USE_TAGGED_FLAGS
      if( (wsize == 0) && (!probe) )
	RCCE_flag_write_tagged(sent, RCCE_FLAG_SET, dest, tag, len);
      else
#endif
      RCCE_flag_write(sent, RCCE_FLAG_SET, dest);

#else // LOCAL PUT / REMOTE GET: (standard)

      // copy private data to own comm buffer
      if(copy) RCCE_put(combuf, (t_vcharp) bufptr, nbytes, RCCE_IAM);

      if(!mcast) {
#ifdef USE_TAGGED_FLAGS
	if( (wsize == 0) && (!probe) )
	  RCCE_flag_write_tagged(sent, RCCE_FLAG_SET, dest, tag, len);
	else
#endif
	RCCE_flag_write(sent, RCCE_FLAG_SET, dest);

	// wait for the destination to be ready to receive a message          
	RCCE_wait_until(*ready, RCCE_FLAG_SET);
	RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);
      }
      else {
	// multicast: two barriers take the place of the sent/ready handshake
	RCCE_TNS_barrier(&RCCE_COMM_WORLD);
	RCCE_TNS_barrier(&RCCE_COMM_WORLD);
      }
#endif // !USE_REMOTE_PUT_LOCAL_GET

    } // for
  }
  else // if(!pipe) ->  if(pipe)
  {
    // pipelined version of send/recv:
    // every chunk is sent as two sub-chunks, each with its own handshake; the
    // second sub-chunk goes to combuf + subchunk1. mcast is not consulted here.
    size_t subchunk1, subchunk2;

    for(wsize = 0; wsize < (size/chunk)*chunk; wsize+=chunk) {

      // sub-chunk sizes are computed once, on the first iteration
      if(wsize == 0) {
	// allign sub-chunks to cache line granularity:
	subchunk1 = ( (chunk / 2) / RCCE_LINE_SIZE ) * RCCE_LINE_SIZE;
	subchunk2 = chunk - subchunk1;
      }

      bufptr = privbuf + wsize;
      nbytes = subchunk1;

#ifdef USE_REMOTE_PUT_LOCAL_GET

      RCCE_wait_until(*ready, RCCE_FLAG_SET);
      RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);

      // copy private data chunk 1 to remote comm buffer
      if(copy) RCCE_put(combuf, (t_vcharp) bufptr, nbytes, dest);

#ifdef USE_TAGGED_FLAGS
      if( (wsize == 0) && (!probe) )
	RCCE_flag_write_tagged(sent, RCCE_FLAG_SET, dest, tag, len);
      else
#endif
      RCCE_flag_write(sent, RCCE_FLAG_SET, dest);

#else // LOCAL PUT / REMOTE GET: (standard)
      
      // copy private data chunk 1 to own comm buffer
      if(copy) RCCE_put(combuf, (t_vcharp) bufptr, nbytes, RCCE_IAM);

#ifdef USE_TAGGED_FLAGS
      if( (wsize == 0) && (!probe) )
	RCCE_flag_write_tagged(sent, RCCE_FLAG_SET, dest, tag, len);
      else
#endif
      RCCE_flag_write(sent, RCCE_FLAG_SET, dest);
      
      RCCE_wait_until(*ready, RCCE_FLAG_SET);
      RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);

#endif // !USE_REMOTE_PUT_LOCAL_GET      
      
      // second half of the chunk; it never carries the tag
      bufptr = privbuf + wsize + subchunk1;
      nbytes = subchunk2;
      
#ifdef USE_REMOTE_PUT_LOCAL_GET

      RCCE_wait_until(*ready, RCCE_FLAG_SET);
      RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);
      
      // copy private data chunk 2 to remote comm buffer
      if(copy) RCCE_put(combuf + subchunk1, (t_vcharp) bufptr, nbytes, dest);

      RCCE_flag_write(sent, RCCE_FLAG_SET, dest);

#else // LOCAL PUT / REMOTE GET: (standard)

      // copy private data chunk 2 to own comm buffer
      if(copy) RCCE_put(combuf + subchunk1, (t_vcharp) bufptr, nbytes, RCCE_IAM);
      
      RCCE_flag_write(sent, RCCE_FLAG_SET, dest);
      
      RCCE_wait_until(*ready, RCCE_FLAG_SET);
      RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);

#endif // !USE_REMOTE_PUT_LOCAL_GET

    } //for

  } // if(pipe)

  // ---- stage 2: whole cache lines of the left-over --------------------------------
  remainder = size%chunk; 
  // if nothing is left over, we are done 
  if (!remainder) return(RCCE_SUCCESS);

  // send remainder of data--whole cache lines            
  bufptr = privbuf + (size/chunk)*chunk;
  nbytes = remainder - remainder%RCCE_LINE_SIZE;

  if (nbytes) {

#ifdef USE_REMOTE_PUT_LOCAL_GET

    RCCE_wait_until(*ready, RCCE_FLAG_SET);
    RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);
    
    // copy private data to remote comm buffer
    if(copy) RCCE_put(combuf, (t_vcharp) bufptr, nbytes, dest);

    // wsize == 0 here means the message was shorter than one chunk, so this is
    // the first transfer and carries the tag (when no probe flag was used)
#ifdef USE_TAGGED_FLAGS
    if( (wsize == 0) && (!probe) )
      RCCE_flag_write_tagged(sent, RCCE_FLAG_SET, dest, tag, len);
    else
#endif
    RCCE_flag_write(sent, RCCE_FLAG_SET, dest);
    
#else // LOCAL PUT / REMOTE GET: (standard)

    // copy private data to own comm buffer
    if(copy) RCCE_put(combuf, (t_vcharp)bufptr, nbytes, RCCE_IAM);

    if(!mcast) {
#ifdef USE_TAGGED_FLAGS
      if( (wsize == 0) && (!probe) )
	RCCE_flag_write_tagged(sent, RCCE_FLAG_SET, dest, tag, len);
      else
#endif
      RCCE_flag_write(sent, RCCE_FLAG_SET, dest);

      // wait for the destination to be ready to receive a message          
      RCCE_wait_until(*ready, RCCE_FLAG_SET);
      RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);
    }
    else {
      RCCE_TNS_barrier(&RCCE_COMM_WORLD);
      RCCE_TNS_barrier(&RCCE_COMM_WORLD);
    }
#endif // !USE_REMOTE_PUT_LOCAL_GET

  } // if(nbytes)
   
  // ---- stage 3: final partial cache line ------------------------------------------
  remainder = remainder%RCCE_LINE_SIZE;
  if (!remainder) return(RCCE_SUCCESS);
  
  // remainder is less than a cache line. This must be copied into appropriately sized 
  // intermediate space before it can be sent to the receiver 
  bufptr = privbuf + (size/chunk)*chunk + nbytes;
  nbytes = RCCE_LINE_SIZE;

  // Only the first "remainder" bytes of padline are filled; the rest of the line
  // is transferred with whatever the uninitialized buffer happens to contain.
  if(copy) {
#ifdef COPPERRIDGE
    memcpy_scc(padline,bufptr,remainder);
#else
    memcpy(padline,bufptr,remainder);
#endif
  }

#ifdef USE_REMOTE_PUT_LOCAL_GET

  RCCE_wait_until(*ready, RCCE_FLAG_SET);
  RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);
  
  // copy private data to remote comm buffer
  if(copy) RCCE_put(combuf, (t_vcharp) padline, nbytes, dest);

  // NOTE(review): wsize is still 0 here for messages shorter than one chunk, so
  // when stage 2 also ran the tagged write is issued a second time -- confirm
  // this is intended.
#ifdef USE_TAGGED_FLAGS
  if( (wsize == 0) && (!probe) )
    RCCE_flag_write_tagged(sent, RCCE_FLAG_SET, dest, tag, len);
  else
#endif
  RCCE_flag_write(sent, RCCE_FLAG_SET, dest);

#else // LOCAL PUT / REMOTE GET: (standard)

  // copy private data to own comm buffer 
  if(copy) RCCE_put(combuf, (t_vcharp)padline, nbytes, RCCE_IAM);
  
  if(!mcast) {
    // NOTE(review): same repeated-tag situation as in the remote-put variant above
#ifdef USE_TAGGED_FLAGS
    if( (wsize == 0) && (!probe) )
      RCCE_flag_write_tagged(sent, RCCE_FLAG_SET, dest, tag, len);
    else
#endif
    RCCE_flag_write(sent, RCCE_FLAG_SET, dest);

    // wait for the destination to be ready to receive a message          
    RCCE_wait_until(*ready, RCCE_FLAG_SET);
    RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);
  }
  else {
    RCCE_TNS_barrier(&RCCE_COMM_WORLD);
    RCCE_TNS_barrier(&RCCE_COMM_WORLD);
  }

#endif // !USE_REMOTE_PUT_LOCAL_GET

  return(RCCE_SUCCESS);
}
示例#6
0
/*
 * Stencil example: every UE owns a strip of a grid (array a), exchanges its
 * fringe rows with the neighbouring UEs through the comm buffer each
 * iteration, and then applies a five-point weighted update to its strip.
 *
 * argv[1] (optional) is the number of iterations; the default is 10.
 *
 * Returns 0 on success and 1 when the comm buffer or a flag cannot be
 * allocated; calls exit(1) when NX is not a multiple of 8.
 */
int RCCE_APP(int argc, char **argv) {

  /* statically allocated space sits in off-chip private memory          */
  float     a[NXNY], *buff;
  int       i, offset, iter=10;
  int       MY_ID;
  int       NTILES1;              /* ID of the last UE                    */
  double    time = 0.0;           /* only meaningful on UE 0              */
  RCCE_FLAG flag0, flag1;

  RCCE_init(&argc, &argv);

  NTILES1 = RCCE_num_ues()-1;
  MY_ID = RCCE_ue();

  if (NX%8) {
    printf("Grid width should be multiple of 8: %d\n", NX);
    exit(1);
  }
  if (argc>1) iter=atoi(*++argv);
  if (MY_ID==0) printf("Executing %d iterations\n", iter);

  /* allocate space on the comm buffer: two rows of NX floats, one per
     neighbour; the buffer is used unconditionally below, so bail out if
     the allocation failed                                               */
  buff = (float *) RCCE_malloc(sizeof(float)*2*NX);
  if (!buff) {
    printf("Failed to allocate comm buffer space on UE %d\n", MY_ID);
    return(1);
  }
  /* Allocate flags to coordinate comm.                                  */
  if (RCCE_flag_alloc(&flag0)) return(1);
  if (RCCE_flag_alloc(&flag1)) return(1);

  /* initialize array a on all tiles; this stuffs a into private caches  */
  for (offset=0,       i=0; i<NXNY; i++) a[i+offset] = 0.0;
  /* boundary rows: first row of the first UE, row at NXNY1 of the last  */
  if (MY_ID == 0)
     for (offset=0,    i=0; i<NX;   i++) a[i+offset] = 1.0;
  if (MY_ID == NTILES1)
     for (offset=NXNY1,i=0; i<NX;   i++) a[i+offset] = 2.0;

  /* put in a barrier so everybody can be sure to have initialized       */
  RCCE_barrier(&RCCE_COMM_WORLD);

  /* main loop */

  if (MY_ID==0) time = RCCE_wtime();

  /* NOTE(review): the receiving UE never resets flag0/flag1 itself; the
     sender unsets and then sets them on the neighbour every iteration. A
     receiver that starts waiting before the sender's unset arrives could
     see the SET left from the previous iteration -- confirm whether this
     ordering is acceptable for this example.                            */
  while ((iter--)>0){

    /* start with copying fringe data to neighboring tiles               */
    if (MY_ID!=NTILES1) {
      /* Initialize neighbor flag to zero                                */
      RCCE_flag_write(&flag0, RCCE_FLAG_UNSET, MY_ID+1);
      /* copy private data to shared comm buffer of neighbor             */
      RCCE_put((t_vcharp)(&buff[0]), (t_vcharp)(&a[NXNY2]), NX*sizeof(float), MY_ID+1);
      RCCE_flag_write(&flag0, RCCE_FLAG_SET, MY_ID+1);
    }
    if (MY_ID != 0) {
      /* Initialize neighbor flag to zero                                */
      RCCE_flag_write(&flag1, RCCE_FLAG_UNSET, MY_ID-1);
      /* copy private data to shared comm buffer of neighbor             */
      RCCE_put((t_vcharp)(&buff[NX]), (t_vcharp)(&a[NX]), NX*sizeof(float), MY_ID-1);
      RCCE_flag_write(&flag1, RCCE_FLAG_SET, MY_ID-1);
    }

    /* Make sure the data has been recvd and copy data out of buffer(s)  */
    if (MY_ID!=NTILES1) {
      RCCE_wait_until(flag1, RCCE_FLAG_SET);
      RCCE_get((t_vcharp)(&a[NXNY1]), (t_vcharp)(&buff[NX]), NX*sizeof(float),MY_ID);
    }

    if (MY_ID!=0) {
      RCCE_wait_until(flag0, RCCE_FLAG_SET);
      RCCE_get((t_vcharp)(&a[0]), (t_vcharp)(&buff[0]), NX*sizeof(float),MY_ID);
    }

    /* apply the stencil operation (in place; offsets O1..O5 and weights
       W1..W5 are defined outside this function)                         */
    for (i=0; i<NXNY2; i++) {
      a[i+O3] +=
         W1*a[i+O1] + W2*a[i+O2] + W3*a[i+O3] + W4*a[i+O4] + W5*a[i+O5];
    }
  }
  RCCE_barrier(&RCCE_COMM_WORLD);
  if (MY_ID==0) {
    time = RCCE_wtime()-time;
  }

  /* print result strip by strip; this would not be done on RC */
  for (int id=0; id<=NTILES1; id++) {
    /* the barrier serializes the output: one UE prints per pass         */
    RCCE_barrier(&RCCE_COMM_WORLD);
    if (MY_ID==id) {
      /* interior UEs skip their first and last row; the first UE also
         prints its first row and the last UE its last row               */
      int start = NX; int end = NXNY1;
      if (MY_ID==0) start = 0;
      if (MY_ID == NTILES1) end = NXNY;
      for (offset=0, i=start; i<end; i++) {
        if (!(i%NX)) printf("\n");
//        comment out next line and uncomment subsequent three to print error
        printf("%f ",a[i+offset]);
//        int jj=i/NX+(MY_ID*(NY-1));
//        double aexact=1.0+(double)jj/((NTILES1+1)*(NY-1));
//        printf("%f ",a[i+offset]-aexact);
      }
    }
  }
  RCCE_barrier(&RCCE_COMM_WORLD);
  if (MY_ID==0) {
    printf("\nTotal time: %lf\n", time);
  }

  RCCE_finalize();

  return(0);
}