Esempio n. 1
0
//--------------------------------------------------------------------------------------
// FUNCTION: RCCE_wait_until
//--------------------------------------------------------------------------------------
// Spin until the word pointed to by "flag" compares equal to "val". Each poll is
// preceded by an MPB cache invalidation (and an OpenMP flush in OpenMP builds) so the
// comparison is never made against a stale cached copy of the flag.
// Returns RCCE_SUCCESS once the values match. In GORY builds the arguments are
// validated first and problems are reported through RCCE_error_return.
//--------------------------------------------------------------------------------------
int RCCE_wait_until(RCCE_FLAG flag, RCCE_FLAG_STATUS val) {
  t_vcharp cflag = (t_vcharp) flag;

#ifdef GORY
  // only the two defined flag states may be waited for
  if (!(val == RCCE_FLAG_UNSET || val == RCCE_FLAG_SET))
    return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_STATUS_UNDEFINED));
  if (!cflag)
    return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_NOT_ALLOCATED));
  // the flag must start at or after the beginning of the local comm buffer, and its
  // line must end strictly before the end of that buffer
  if (cflag - RCCE_comm_buffer[RCCE_IAM] < 0 ||
      cflag+RCCE_LINE_SIZE - (RCCE_comm_buffer[RCCE_IAM] + RCCE_BUFF_SIZE) >= 0)
    return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_NOT_IN_COMM_BUFFER));
#endif

  // poll: invalidate first, then look at the flag; leave as soon as it matches
  for (;;) {
#ifdef _OPENMP
    #pragma omp flush
#endif
    RC_cache_invalidate();
    if ((*flag) == val) break;
  }

  return(RCCE_SUCCESS);
}
Esempio n. 2
0
//--------------------------------------------------------------------------------------
// RCCE_bcast
//--------------------------------------------------------------------------------------
// Broadcast "num" bytes from the UE with rank "root" in "comm" to every other member.
// The root sends the buffer to each member in rank order; every other UE posts a
// single receive from the root. Returns RCCE_SUCCESS, or the error reported by
// RCCE_error_return for a bad root or a failed send/receive. In GORY builds the
// function only prints a notice and returns 1.
//--------------------------------------------------------------------------------------
int RCCE_bcast(
  char *buf,     // private memory, used for sending (root) and receiving (other UEs)
  size_t num,    // number of bytes to be sent
  int root,      // source within "comm" of broadcast data
  RCCE_COMM comm // communication domain
  ) {

  int rank, status;
#ifdef GORY
  printf("Collectives only implemented for simplified API\n");
  return(1);
#else
  // root must be a valid rank inside the communicator
  if (root<0 || root >= comm.size) {
    return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_ID));
  }

  // everybody except the root just receives
  if (RCCE_IAM != comm.member[root]) {
    status = RCCE_recv(buf, num, comm.member[root]);
    if (status) {
      return(RCCE_error_return(RCCE_debug_comm,status));
    }
    return(RCCE_SUCCESS);
  }

  // the root sends to every member but itself
  for (rank=0; rank<comm.size; rank++) {
    if (rank == root) continue;
    status = RCCE_send(buf, num, comm.member[rank]);
    if (status) {
      return(RCCE_error_return(RCCE_debug_comm,status));
    }
  }

  return(RCCE_SUCCESS);
#endif
}
Esempio n. 3
0
//--------------------------------------------------------------------------------------
// FUNCTION: RCCE_wait_until
//--------------------------------------------------------------------------------------
// Spin until the flag described by "flag" (MPB line address plus location within that
// line) reads as "val". The MPB cache is invalidated before every read, including
// inside the spin loop, so a stale cached value is never compared.
// Returns RCCE_SUCCESS once the flag has the requested value. Argument checks are only
// compiled in for GORY builds.
//--------------------------------------------------------------------------------------
int RCCE_wait_until(RCCE_FLAG flag, RCCE_FLAG_STATUS val) {
  t_vcharp cflag = flag.line_address;

// the simplified API skips all argument checks
#ifdef GORY
  if (!(val == RCCE_FLAG_UNSET || val == RCCE_FLAG_SET))
    return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_STATUS_UNDEFINED));
  if (!cflag)
    return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_NOT_ALLOCATED));
  // the flag line must start at or after the beginning of the local comm buffer and
  // end strictly before the end of that buffer
  if (cflag - RCCE_comm_buffer[RCCE_IAM] < 0 ||
      cflag+RCCE_LINE_SIZE - (RCCE_comm_buffer[RCCE_IAM] + RCCE_BUFF_SIZE) >= 0)
    return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_NOT_IN_COMM_BUFFER));
#endif

  // poll: flush/invalidate, then test the flag; stop at the first match
  for (;;) {
#ifdef _OPENMP
    #pragma omp flush
#endif
    RC_cache_invalidate();
    if (RCCE_bit_value(cflag, flag.location) == val) break;
  }

  return(RCCE_SUCCESS);
}
Esempio n. 4
0
//--------------------------------------------------------------------------------------
// FUNCTION: RCCE_flag_write
//--------------------------------------------------------------------------------------
// This is the core flag manipulation routine. It requires locking to guarantee atomic
// access while updating one of a line of flags.
//
// flag : flag to update; its flag_addr and location fields select the byte and the bit
// val  : new status (RCCE_FLAG_SET or RCCE_FLAG_UNSET)
// ID   : rank of the UE whose MPB holds the flag
//
// Returns RCCE_SUCCESS. In GORY builds invalid arguments are reported through
// RCCE_error_return before anything is locked or written.
//--------------------------------------------------------------------------------------
int    RCCE_flag_write(RCCE_FLAG *flag, RCCE_FLAG_STATUS val, int ID) {
  t_vchar val_array[1];

#ifdef GORY
  // check input parameters; legal locations are 0 .. RCCE_FLAGS_PER_LINE-1, so a
  // location equal to RCCE_FLAGS_PER_LINE is already out of range
  if (!flag || flag->location < 0 || flag->location >= RCCE_FLAGS_PER_LINE)
    return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_UNDEFINED));
  if (val != RCCE_FLAG_UNSET && val != RCCE_FLAG_SET)
    return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_STATUS_UNDEFINED));
  if (ID<0 || ID>=RCCE_NP)
    return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_ID));
#endif

  // acquire lock to make sure nobody else fiddles with the flags on the target core
  RCCE_acquire_lock(ID);
  // copy the byte containing the flag from the MPB to private memory
  RCCE_get_char(val_array, flag->flag_addr, ID);

  // overwrite the single bit belonging to this flag within the local copy
  RCCE_write_bit_value(val_array, (flag->location)%RCCE_FLAGS_PER_BYTE, val);

  // write the modified copy back to the MPB
  RCCE_put_char(flag->flag_addr, val_array, ID);

  // release write lock for the flags on the target core
  RCCE_release_lock(ID);

  return(RCCE_SUCCESS);
}
Esempio n. 5
0
//--------------------------------------------------------------------------------------
// FUNCTION: RCCE_flag_alloc
//--------------------------------------------------------------------------------------
// allocate space for one flag. Since multiple fit on a single cache line, we only
// need to allocate new MPB space when all the existing lines are completely filled. A
// flag line is a bookkeeping structure holding an array ("flag") with one entry per
// flag slot; an entry is 1 when the slot is in use and 0 when it is free. The actual
// flag value lives in the MPB line pointed to by the field "line_address".
//
// flag : receives location, line_address and flag_addr of the newly claimed slot
//
// Returns RCCE_SUCCESS on success. RCCE_ERROR_FLAG_NOT_ALLOCATED is reported through
// RCCE_error_return when MPB or heap memory cannot be obtained, or when the
// bookkeeping claims a free slot that cannot be found.
//--------------------------------------------------------------------------------------
int    RCCE_flag_alloc(RCCE_FLAG *flag) {
  RCCE_FLAG_LINE *flagp, *newline;
  int c, loc;

  // walk the list until we hit a line with free slots, or the last line
  flagp = &RCCE_flags;
  while (flagp->members == RCCE_FLAGS_PER_LINE && flagp->next) {
    flagp = flagp->next;
  }

  // if this line has no MPB storage yet, need to allocate MPB for it
  if (!flagp->line_address) flagp->line_address = RCCE_malloc(RCCE_LINE_SIZE);
  if (!flagp->line_address) return(RCCE_error_return(RCCE_debug_synch,
                                   RCCE_ERROR_FLAG_NOT_ALLOCATED));

  if (flagp->members < RCCE_FLAGS_PER_LINE) {
    // there is space in this line for a new flag; find first open slot
    for (loc=0; loc<RCCE_FLAGS_PER_LINE; loc++) {
      if (!((int)(flagp->flag[loc]))) {
        flagp->flag[loc] = (char) ((unsigned int) 1);
        flagp->members++;
        flag->location = loc;
        flag->line_address = flagp->line_address;
        flag->flag_addr = flagp->line_address + loc/RCCE_FLAGS_PER_BYTE;
        return(RCCE_SUCCESS);
      }
    }
    // member count and slot table disagree; do not pretend a flag was allocated
    return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_NOT_ALLOCATED));
  }

  // last line is full: build a new flag line. The node is linked into the list only
  // after it is fully set up, so a failed allocation leaves the list untouched
  newline = (RCCE_FLAG_LINE *) malloc(sizeof(RCCE_FLAG_LINE));
  if (!newline) return(RCCE_error_return(RCCE_debug_synch,
                                   RCCE_ERROR_FLAG_NOT_ALLOCATED));
  newline->line_address = RCCE_malloc(RCCE_LINE_SIZE);
  if (!(newline->line_address)) {
    free(newline);
    return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_NOT_ALLOCATED));
  }

  // initialize the flag line; clear every slot the search loop above can inspect
  newline->members = 1;
  newline->next = NULL;
  for (c=1; c<RCCE_FLAGS_PER_LINE; c++) newline->flag[c] = (char)((unsigned int) 0);

  // set first flag field to indicate the corresponding flag is now in use
  newline->flag[0] = (char)((unsigned int) 1);
  flagp->next = newline;

  flag->location = 0;
  flag->line_address = newline->line_address;
  flag->flag_addr = flag->line_address;
  return(RCCE_SUCCESS);
}
Esempio n. 6
0
//--------------------------------------------------------------------------------------
// FUNCTION: RCCE_flag_write
//--------------------------------------------------------------------------------------
// This is the core flag manipulation routine. It writes one byte, taken from the
// address of "val", to the flag's address in the MPB of UE "ID".
//
// flag : flag to update (flag_addr selects the target byte)
// val  : new status (RCCE_FLAG_SET or RCCE_FLAG_UNSET)
// ID   : rank of the UE whose MPB holds the flag
//
// Returns RCCE_SUCCESS. In GORY builds invalid arguments are reported through
// RCCE_error_return and nothing is written.
//--------------------------------------------------------------------------------------
int    RCCE_flag_write(RCCE_FLAG *flag, RCCE_FLAG_STATUS val, int ID) {

#ifdef GORY
  // check input parameters; a location equal to RCCE_LINE_SIZE is already one past
  // the last valid slot of a line
  if (!flag || flag->location < 0 || flag->location >= RCCE_LINE_SIZE)
    return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_UNDEFINED));
  if (val != RCCE_FLAG_UNSET && val != RCCE_FLAG_SET)
     return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_STATUS_UNDEFINED));
  if (ID<0 || ID>=RCCE_NP)
    return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_ID));
#endif

  // NOTE(review): only the first byte of val is transferred; this presumably relies
  // on a little-endian layout of RCCE_FLAG_STATUS - confirm
  RCCE_put_char(flag->flag_addr, (t_vcharp) &val, ID);
  return(RCCE_SUCCESS);
}
Esempio n. 7
0
//--------------------------------------------------------------------------------------
// FUNCTION: RCCE_barrier
//--------------------------------------------------------------------------------------
// very simple, linear barrier over the members of "comm".
// Every UE flips its own gather value. The first member of the communicator then polls
// the gather values of all other members until they equal its own, and finally writes
// the release flag of each of them. All other UEs wait for their release flag.
// Returns RCCE_SUCCESS, or the first error reported by a get/put/flag operation.
//--------------------------------------------------------------------------------------
int RCCE_barrier(RCCE_COMM *comm) {

  volatile unsigned char cyclechar[RCCE_LINE_SIZE];
  volatile unsigned char   valchar[RCCE_LINE_SIZE];
  // the first int of each line buffer carries the barrier value
  volatile int *cycle = (volatile int *)cyclechar;
  volatile int *val   = (volatile int *)valchar;
  int   ROOT    = 0;
  int   arrived = 0;
  int   idx, error;

  if (RCCE_debug_synch) {
    fprintf(STDERR,"UE %d has checked into barrier\n", RCCE_IAM);
  }

  // flip local barrier variable: fetch own gather line, negate, store it back
  error = RCCE_get(cyclechar, (t_vcharp)(comm->gather), RCCE_LINE_SIZE, RCCE_IAM);
  if (error) {
    return(RCCE_error_return(RCCE_debug_synch,error));
  }
  *cycle = !(*cycle);
  error = RCCE_put((t_vcharp)(comm->gather), cyclechar, RCCE_LINE_SIZE, RCCE_IAM);
  if (error) {
    return(RCCE_error_return(RCCE_debug_synch,error));
  }

  if (RCCE_IAM != comm->member[ROOT]) {
    // non-root UEs simply wait to be released
    error = RCCE_wait_until(comm->release, *cycle);
    if (error) {
      return(RCCE_error_return(RCCE_debug_synch,error));
    }
  }
  else {
    // keep sweeping over the other members until every gather value matches ours
    while (arrived != comm->size) {
      // the root itself counts as arrived, so start at one and skip member #0
      arrived = 1;
      for (idx=1; idx<comm->size; idx++) {
        // copy flag values out of comm buffer
        error = RCCE_get(valchar, (t_vcharp)(comm->gather), RCCE_LINE_SIZE,
                         comm->member[idx]);
        if (error) {
          return(RCCE_error_return(RCCE_debug_synch,error));
        }
        if (*val == *cycle) arrived++;
      }
    }
    // everybody is here: set release flags
    for (idx=1; idx<comm->size; idx++) {
      error = RCCE_flag_write(&(comm->release), *cycle, comm->member[idx]);
      if (error) {
        return(RCCE_error_return(RCCE_debug_synch,error));
      }
    }
  }

  if (RCCE_debug_synch) {
    fprintf(STDERR,"UE %d has cleared barrier\n", RCCE_IAM);
  }
  return(RCCE_SUCCESS);
}
Esempio n. 8
0
//--------------------------------------------------------------------------------------
// FUNCTION: RCCE_barrier
//--------------------------------------------------------------------------------------
// very simple, linear barrier over the members of "comm".
// Every UE flips its own gather flag bit. The first member of the communicator then
// polls the gather bit of all other members until each equals its own, and finally
// writes the release flag of each of them. All other UEs wait for their release flag.
// Returns RCCE_SUCCESS, or the first error reported by a get/put/flag operation.
//--------------------------------------------------------------------------------------
int RCCE_barrier(RCCE_COMM *comm) {

  t_vchar           cyclechar[RCCE_LINE_SIZE];
  t_vchar           valchar  [RCCE_LINE_SIZE];
  t_vcharp          gatherp = comm->gather.line_address;
  RCCE_FLAG_STATUS  cycle;
  int               ROOT    = 0;
  int               arrived = 0;
  int               idx, error;

  if (RCCE_debug_synch) {
    fprintf(STDERR,"UE %d has checked into barrier\n", RCCE_IAM);
  }

  // flip local barrier variable: fetch own gather line, flip our bit, store it back
  error = RCCE_get(cyclechar, gatherp, RCCE_LINE_SIZE, RCCE_IAM);
  if (error) {
    return(RCCE_error_return(RCCE_debug_synch,error));
  }
  cycle = RCCE_flip_bit_value(cyclechar, comm->gather.location);
  error = RCCE_put(gatherp, cyclechar, RCCE_LINE_SIZE, RCCE_IAM);
  if (error) {
    return(RCCE_error_return(RCCE_debug_synch,error));
  }

  if (RCCE_IAM != comm->member[ROOT]) {
    // non-root UEs simply wait to be released
    error = RCCE_wait_until(comm->release, cycle);
    if (error) {
      return(RCCE_error_return(RCCE_debug_synch,error));
    }
  }
  else {
    // keep sweeping over the other members until every gather bit matches ours
    while (arrived != comm->size) {
      // the root itself counts as arrived, so start at one and skip member #0
      arrived = 1;
      for (idx=1; idx<comm->size; idx++) {
        // copy flag values out of comm buffer
        error = RCCE_get(valchar, gatherp, RCCE_LINE_SIZE, comm->member[idx]);
        if (error) {
          return(RCCE_error_return(RCCE_debug_synch,error));
        }
        if (RCCE_bit_value(valchar, comm->gather.location) == cycle) arrived++;
      }
    }
    // everybody is here: set release flags
    for (idx=1; idx<comm->size; idx++) {
      error = RCCE_flag_write(&(comm->release), cycle, comm->member[idx]);
      if (error) {
        return(RCCE_error_return(RCCE_debug_synch,error));
      }
    }
  }

  if (RCCE_debug_synch) {
    fprintf(STDERR,"UE %d has cleared barrier\n", RCCE_IAM);
  }
  return(RCCE_SUCCESS);
}
Esempio n. 9
0
//--------------------------------------------------------------------------------------
// FUNCTION: RCCE_flag_read
//--------------------------------------------------------------------------------------
// This routine is rarely needed. We typically only read a flag when we're waiting for
// it to change value (function RCCE_wait_until). Reading does not require locking. The
// moment the target flag we're trying to read changes value, it is OK to read and
// return that value
//
// flag : flag to read (flag_addr selects the byte in the MPB)
// val  : receives the byte read from the flag's address
// ID   : rank of the UE whose MPB holds the flag
//
// Returns RCCE_SUCCESS. In GORY builds invalid arguments are reported through
// RCCE_error_return and *val is left untouched.
//--------------------------------------------------------------------------------------
int    RCCE_flag_read(RCCE_FLAG flag, RCCE_FLAG_STATUS *val, int ID) {
  volatile unsigned char val_loc;

#ifdef GORY
  // a location equal to RCCE_LINE_SIZE is already one past the last slot of a line
  if (flag.location < 0 || flag.location >= RCCE_LINE_SIZE)
    return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_UNDEFINED));
  if (!val)   return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_VAL_UNDEFINED));
  if (ID<0 || ID>=RCCE_NP)
    return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_ID));
#endif

// Should be able to use same technique as in RCCE_wait_until, i.e., should not need
// to copy out of MPB first. However, this function is not time critical
  RCCE_get_char(&val_loc, flag.flag_addr, ID);
  *val = val_loc;
  return(RCCE_SUCCESS);
}
Esempio n. 10
0
//--------------------------------------------------------------------------------------
// FUNCTION: RCCE_flag_read
//--------------------------------------------------------------------------------------
// This routine is rarely needed. We typically only read a flag when we're waiting for
// it to change value (function RCCE_wait_until). Reading does not require locking. The
// moment the target flag we're trying to read changes value, it is OK to read and
// return that value
//
// flag : flag to read (flag_addr selects the byte, location the bit within it)
// val  : receives the value of the flag's bit
// ID   : rank of the UE whose MPB holds the flag
//
// Returns RCCE_SUCCESS. In GORY builds invalid arguments are reported through
// RCCE_error_return and *val is left untouched.
//--------------------------------------------------------------------------------------
int    RCCE_flag_read(RCCE_FLAG flag, RCCE_FLAG_STATUS *val, int ID) {
  t_vchar val_array[1];

#ifdef GORY
  // legal locations are 0 .. RCCE_FLAGS_PER_LINE-1, so a location equal to
  // RCCE_FLAGS_PER_LINE is already out of range
  if (flag.location < 0 || flag.location >= RCCE_FLAGS_PER_LINE)
    return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_UNDEFINED));
  if (!val)   return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_VAL_UNDEFINED));
  if (ID<0 || ID>=RCCE_NP)
    return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_ID));
#endif

// Should be able to use same technique as in RCCE_wait_until, i.e., should not need
// to copy out of MPB first. However, this function is not time critical
  RCCE_get_char(val_array, flag.flag_addr, ID);
  // pick this flag's bit out of the byte that was fetched
  *val = RCCE_bit_value(val_array, (flag.location)%RCCE_FLAGS_PER_BYTE);
  return(RCCE_SUCCESS);
}
Esempio n. 11
0
//---------------------------------------------------------------------------------------
// FUNCTION: RCCE_reduce
//---------------------------------------------------------------------------------------
// Reduction whose result is delivered to UE "root" only. After validating the root
// rank this forwards to RCCE_reduce_general with the "all" argument set to 0 and
// passes its status through RCCE_error_return.
//---------------------------------------------------------------------------------------
int RCCE_reduce(
  char *inbuf,   // source buffer for reduction data
  char *outbuf,  // target buffer for reduction data
  int num,       // number of data elements to be reduced
  int type,      // type of data elements
  int op,        // reduction operation
  int root,      // member of "comm" receiving reduction results
  RCCE_COMM comm // communication domain within which to reduce
  ){

  int deliver_to_all = 0;
  int status;

  // root must be a valid rank inside the communicator
  if (root<0 || root >= comm.size) {
    return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_ID));
  }

  status = RCCE_reduce_general(inbuf, outbuf, num, type, op, root, deliver_to_all, comm);
  return(RCCE_error_return(RCCE_debug_comm, status));
}
Esempio n. 12
0
// DO NOT USE THIS FUNCTION IN NON-GORY MODE UNTIL MALLOC_FREE HAS BEEN IMPLEMENTED
//
// Release the two synchronization flags (gather, release) of an initialized
// communicator and mark it as not initialized.
//
// comm : communicator to tear down
//
// Returns RCCE_SUCCESS on success. Reports RCCE_ERROR_COMM_UNDEFINED for a null
// pointer and RCCE_ERROR_COMM_INITIALIZED for a communicator that is not initialized.
// If freeing a flag fails, both frees are still attempted and the communicator is
// still marked uninitialized; the first flag error is then returned.
int RCCE_comm_free(RCCE_COMM *comm) {
  int gather_err, release_err;

  printf("DO NOT USE IN NON-GORY MODE UNTIL MALLOC_FREE HAS BEEN IMPLEMENTED\n");
  if (!comm) return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_COMM_UNDEFINED));
  if (comm->initialized != RCCE_COMM_INITIALIZED)
             return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_COMM_INITIALIZED));

  // attempt both frees before reporting, so one failure does not strand the other flag
  gather_err  = RCCE_flag_free(&(comm->gather));
  release_err = RCCE_flag_free(&(comm->release));
  comm->initialized = RCCE_COMM_NOT_INITIALIZED;

  if (gather_err)  return(RCCE_error_return(RCCE_debug_comm,gather_err));
  if (release_err) return(RCCE_error_return(RCCE_debug_comm,release_err));

  return(RCCE_SUCCESS);
}
Esempio n. 13
0
//--------------------------------------------------------------------------------------
// FUNCTION: RCCE_comm_size
// Stores the number of UEs in communicator "comm" into *size and returns RCCE_SUCCESS.
// If the communicator is not initialized, *size is left untouched and
// RCCE_ERROR_COMM_INITIALIZED is reported through RCCE_error_return.
//--------------------------------------------------------------------------------------
int RCCE_comm_size(
  RCCE_COMM comm, // communicator
  int *size       // return value (size)
  ) {

  // refuse to answer for a communicator that was never set up
  if (comm.initialized != RCCE_COMM_INITIALIZED) {
    return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_COMM_INITIALIZED));
  }

  *size = comm.size;
  return(RCCE_SUCCESS);
}
Esempio n. 14
0
//--------------------------------------------------------------------------------------
// FUNCTION: RCCE_comm_rank
// Stores the rank of the calling UE inside communicator "comm" into *rank and returns
// RCCE_SUCCESS. If the communicator is not initialized, *rank is left untouched and
// RCCE_ERROR_COMM_INITIALIZED is reported through RCCE_error_return.
//--------------------------------------------------------------------------------------
int RCCE_comm_rank(
  RCCE_COMM comm, // communicator
  int *rank       // return value (rank)
  ) {

  // refuse to answer for a communicator that was never set up
  if (comm.initialized != RCCE_COMM_INITIALIZED) {
    return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_COMM_INITIALIZED));
  }

  *rank = comm.my_rank;
  return(RCCE_SUCCESS);
}
Esempio n. 15
0
//--------------------------------------------------------------------------------------
// FUNCTION: RCCE_flag_free
//--------------------------------------------------------------------------------------
// Give back one flag. The flag's slot is marked free in the bookkeeping of the flag
// line it belongs to. When that leaves a line without members and the line has a
// predecessor in the list, the line's MPB storage and its list node are released; the
// head line is always kept. On success the flag's location and line_address are
// invalidated so it cannot be freed twice by mistake.
// Returns RCCE_SUCCESS, or reports RCCE_ERROR_FLAG_UNDEFINED for a null flag, a
// negative location, or a flag whose line is not in the list.
//--------------------------------------------------------------------------------------
int    RCCE_flag_free(RCCE_FLAG *flag) {

  RCCE_FLAG_LINE *line, *prev = NULL;
  int loc;

  // the flag must exist and carry a valid (non-negative) location
  if (!flag || flag->location < 0) {
    return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_UNDEFINED));
  }

  // search the global list for the line owning this flag, remembering the predecessor
  for (line = &RCCE_flags;
       line->next && flag->line_address != line->line_address;
       line = line->next) {
    prev = line;
  }
  if (flag->line_address != line->line_address) {
    return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_UNDEFINED));
  }

  // error checking is done; release the slot
  line->members--;
  loc = flag->location;
#ifdef SINGLEBITFLAGS
  RCCE_flip_bit_value(line->flag+loc/RCCE_FLAGS_PER_BYTE,loc%RCCE_FLAGS_PER_BYTE);
#else
  line->flag[loc] = (char) 0;
#endif

  // an emptied line with a predecessor is spliced out of the list and released;
  // an emptied line without predecessor (the head) stays as it is
  if (line->members==0 && prev) {
    RCCE_free(line->line_address);
    prev->next = line->next;
    free(line);
  }

  // invalidate location field to make sure we won't free again by mistake
  flag->location = -1;
  flag->line_address = NULL;

  return(RCCE_SUCCESS);
}
Esempio n. 16
0
// Non-blocking flag test: after one cache invalidation, store 1 in *result if the word
// pointed to by "flag" equals "val", and 0 otherwise. Always returns RCCE_SUCCESS
// unless a GORY build rejects the arguments through RCCE_error_return.
int RCCE_test_flag(RCCE_FLAG flag, RCCE_FLAG_STATUS val, int *result) {
  t_vcharp cflag = (t_vcharp) flag;

#ifdef GORY
  // only the two defined flag states may be tested for
  if (!(val == RCCE_FLAG_UNSET || val == RCCE_FLAG_SET))
    return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_STATUS_UNDEFINED));
  if (!cflag)
    return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_NOT_ALLOCATED));
  // the flag must start at or after the beginning of the local comm buffer, and its
  // line must end strictly before the end of that buffer
  if (cflag - RCCE_comm_buffer[RCCE_IAM] < 0 ||
      cflag+RCCE_LINE_SIZE - (RCCE_comm_buffer[RCCE_IAM] + RCCE_BUFF_SIZE) >= 0)
    return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_NOT_IN_COMM_BUFFER));
#endif

#ifdef USE_REVERTED_FLAGS
  // with reverted flags the value sits in the last int of the line, not the first
  flag = flag + RCCE_LINE_SIZE / sizeof(int) - 1;
#endif

  // flush/invalidate once so the single read below sees the most recent value
#ifdef _OPENMP
#pragma omp flush
#endif
#ifndef USE_FLAG_EXPERIMENTAL
  RC_cache_invalidate();
#endif

  (*result) = ((*flag) == val) ? 1 : 0;

  return(RCCE_SUCCESS);
}
Esempio n. 17
0
//--------------------------------------------------------------------------------------
// FUNCTION: RCCE_comm_split
// RCCE_comm_split works like MPI_Comm_split, but:
// 1. Always uses the default global communicator as the basis, not an
//    arbitrary communicator
// 2. Uses the rank of the UE in the global communicator as the key
// 3. Uses a function, operating on UE's global rank, to compute color
//
// All UEs whose color equals the caller's color become members of "comm", in order
// of increasing global rank. Returns RCCE_SUCCESS, or reports
// RCCE_ERROR_COMM_UNDEFINED for a null "comm" or the error of a failed flag allocation.
//--------------------------------------------------------------------------------------
int RCCE_comm_split(
  int (*color)(int, void *), // function returning a color value for given ue and aux
  void *aux,                 // optional user-supplied data structure
  RCCE_COMM *comm            // new communicator
  ) {

  int ue, my_color, error;

  if (!comm) {
    return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_COMM_UNDEFINED));
  }

  // synchronize all UEs first, except while the global communicator itself is being
  // built: that happens inside RCCE_init, which every core must call before any other
  // RCCE function, so skipping the barrier there is harmless
  if (comm != &RCCE_COMM_WORLD) {
    RCCE_barrier(&RCCE_COMM_WORLD);
  }

  // collect every UE sharing our color; our own position in that list is our rank
  my_color = color(RCCE_IAM, aux);
  comm->size = 0;
  for (ue=0; ue<RCCE_NP; ue++) {
    if (color(ue, aux) != my_color) continue;
    if (ue == RCCE_IAM) comm->my_rank = comm->size;
    comm->member[comm->size] = ue;
    comm->size++;
  }

  // synch flags are only needed the first time around. Re-splitting an initialized
  // communicator may change its membership but keeps using the same two flags
  if (comm->initialized != RCCE_COMM_INITIALIZED) {
    error = RCCE_flag_alloc(&(comm->gather));
    if (error) {
      return(RCCE_error_return(RCCE_debug_comm,error));
    }
    error = RCCE_flag_alloc(&(comm->release));
    if (error) {
      return(RCCE_error_return(RCCE_debug_comm,error));
    }
    comm->initialized = RCCE_COMM_INITIALIZED;
  }

  return(RCCE_SUCCESS);
}
Esempio n. 18
0
//---------------------------------------------------------------------------------------
// FUNCTION: RCCE_allreduce
//---------------------------------------------------------------------------------------
// Reduction whose result is delivered to all participating UEs. This forwards to
// RCCE_reduce_general with root 0 and the "all" argument set to 1, and passes its
// status through RCCE_error_return.
//---------------------------------------------------------------------------------------
int RCCE_allreduce(
  char *inbuf,   // source buffer for reduction data
  char *outbuf,  // target buffer for reduction data
  int num,       // number of data elements to be reduced
  int type,      // type of data elements
  int op,        // reduction operation
  RCCE_COMM comm // communication domain within which to reduce
  ){

  int root_rank      = 0;
  int deliver_to_all = 1;
  int status;

  status = RCCE_reduce_general(inbuf, outbuf, num, type, op, root_rank,
                               deliver_to_all, comm);
  return(RCCE_error_return(RCCE_debug_comm, status));
}
Esempio n. 19
0
// Send "size" bytes from "privbuf" to the UE with rank "dest".
// The rank is translated to a core ID through RC_COREID and the data is handed to
// send_message; a failure of that call trips an assertion. Returns RCCE_SUCCESS, or
// reports RCCE_ERROR_ID when "dest" is not a valid rank. Time and data volume are
// accumulated when MEASURE_TIME / MEASURE_DATA are defined.
int RCCE_send(char *privbuf, size_t size, int dest)
{
    errval_t err;
#ifdef MEASURE_TIME
    double send_start = RCCE_wtime();
#endif

    // only ranks 0 .. RCCE_NP-1 exist
    if (!(dest >= 0 && dest < RCCE_NP)) {
        return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_ID));
    }

    err = send_message(privbuf, size, RC_COREID[dest]);
    assert(err_is_ok(err));

#ifdef MEASURE_TIME
    measure_rcce_time += RCCE_wtime() - send_start;
#endif

#ifdef MEASURE_DATA
    measure_rcce_data[rcce_curphase][dest] += size;
#endif

    return (RCCE_SUCCESS);
}
Esempio n. 20
0
//--------------------------------------------------------------------------------------
// FUNCTION: RCCE_init
//--------------------------------------------------------------------------------------
// initialize the library and sanitize parameter list
//
// The first arguments after the executable name are consumed here, in this order:
// number of UEs, reference clock value, and one core ID per UE. Afterwards *argv
// points at the slot of the last consumed argument, which is overwritten with the
// executable name, and *argc is reduced by the number of consumed arguments, so the
// main program sees a regular argument list.
//
// The function then sorts the core IDs, determines the rank of the calling core, sets
// up the MPB and shared memory allocators, clears the global flag bookkeeping, creates
// the global communicator and, for the simplified (non-GORY) API, allocates one sent
// and one ready flag per UE.
//
// Returns RCCE_SUCCESS, or an error reported through RCCE_error_return.
//--------------------------------------------------------------------------------------
int RCCE_init(
  int *argc,   // pointer to argc, passed in from main program
  char ***argv // pointer to argv, passed in from main program
  ) {

  // NOTE(review): dummy_offset is never used in this function; i is only used in the
  // SHMADD section
  int i, ue, dummy_offset, loc, error;
#ifdef SCC
  int x, y, z;
  unsigned int physical_lockaddress;
#endif

#ifdef SHMADD
  unsigned int RCCE_SHM_BUFFER_offset ,result, rd_slot_nbr, wr_slot_nbr;
#endif
  // auxiliary argument handed to the color function of the global communicator
  void *nothing = NULL;
  
#ifdef SCC
  // Copperridge specific initialization...
  InitAPI(0);fflush(0);
#endif  

  // save pointer to executable name for later insertion into the argument list
  char *executable_name = (*argv)[0];

  // each read advances *argv by one before dereferencing it
  // NOTE(review): atoi/atof give no error indication and the presence of these
  // arguments is not checked here - confirm the launcher always supplies them
  RCCE_NP        = atoi(*(++(*argv)));  
  RC_REFCLOCKGHZ = atof(*(++(*argv)));  

  // put the participating core ids (unsorted) into an array             
  for (ue=0; ue<RCCE_NP; ue++) {
    RC_COREID[ue] = atoi(*(++(*argv)));
  }

#ifndef SCC
  // if using the functional emulator, must make sure to have read all command line 
  // parameters up to now before overwriting (shifted) first one with executable
  // name; even though argv is made firstprivate, that applies only the pointer to 
  // the arguments, not the actual data
  #pragma omp barrier
#endif
  // make sure executable name is as expected                 
  (*argv)[0] = executable_name;

  RC_MY_COREID = MYCOREID();

  // adjust apparent number of command line arguments, so it will appear to main 
  // program that number of UEs, clock frequency, and core ID list were not on
  // command line        
  *argc -= RCCE_NP+2;

  // sort array of participating physical core IDs to determine their ranks
  RCCE_qsort((char *)RC_COREID, RCCE_NP, sizeof(int), id_compare);

  // determine rank of calling core
  // NOTE(review): RCCE_IAM is only assigned on a match; the "RCCE_IAM<0" test further
  // down presumably relies on it holding a negative value before this loop - confirm
  for (ue=0; ue<RCCE_NP; ue++) {
    if (RC_COREID[ue] == RC_MY_COREID) RCCE_IAM = ue;
  }

#ifdef SHMADD
//   printf("Using SHMADD\n");
     RCCE_SHM_BUFFER_offset     = 0x00;
//   RCCE_SHM_BUFFER_offset     = 0x3FFFF80;
//   RCCE_SHM_BUFFER_offset   = 0x4000000;
//   RCCE_SHM_BUFFER_offset   = 0x181000;
   // for 60 consecutive slots starting at 0x80: read the LUT entry, decrement it, and
   // write the result to the slot four positions further on
   rd_slot_nbr=0x80;
   for(i=0; i<60; i++) {
     result  = readLUT(rd_slot_nbr);
     result -= 1;
     wr_slot_nbr = rd_slot_nbr + 4;
     writeLUT(wr_slot_nbr,result);
     rd_slot_nbr++;
   }
#endif

  // leave in one reassuring debug print
  if (DEBUG){
    printf("My rank is %d, physical core ID is %d\n", RCCE_IAM, RC_MY_COREID);
    fflush(0);
  }

  // the calling core was not found among the participating core IDs
  if (RCCE_IAM<0)
    return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_CORE_NOT_IN_HOSTFILE));

#ifdef SCC
  // compute and memory map addresses of test&set registers for all participating cores 
  for (ue=0; ue<RCCE_NP; ue++) { 
    z = Z_PID(RC_COREID[ue]);
    x = X_PID(RC_COREID[ue]);
    y = Y_PID(RC_COREID[ue]);
    // z selects which of the two lock registers at CRB_ADDR(x,y) is used
    physical_lockaddress = CRB_ADDR(x,y) + (z==0 ? LOCK0 : LOCK1);
    virtual_lockaddress[ue] = (t_vcharp) MallocConfigReg(physical_lockaddress);
  }
#endif

  // initialize MPB starting addresses for all participating cores; allow one
  // dummy cache line at front of MPB for fooling write combine buffer in case
  // of single-byte MPB access
  RCCE_fool_write_combine_buffer = RC_COMM_BUFFER_START(RCCE_IAM);

  for (ue=0; ue<RCCE_NP; ue++) 
    RCCE_comm_buffer[ue] = RC_COMM_BUFFER_START(ue) + RCCE_LINE_SIZE;

  // gross MPB size is set equal to maximum; the dummy line is subtracted
  RCCE_BUFF_SIZE = RCCE_BUFF_SIZE_MAX - RCCE_LINE_SIZE;

#ifdef RC_POWER_MANAGEMENT
#ifndef SCC
  // always store RPC queue data structure at beginning of usable MPB, so allocatable
  // storage needs to skip it. Only need to do this for functional emulator
  for (ue=0; ue<RCCE_NP; ue++) 
    RCCE_comm_buffer[ue] += REGULATOR_LENGTH;
  RCCE_BUFF_SIZE -= REGULATOR_LENGTH;
#endif
#endif

  // initialize RCCE_malloc
  RCCE_malloc_init(RCCE_comm_buffer[RCCE_IAM],RCCE_BUFF_SIZE);
#ifdef SHMADD

  // shared memory allocator starts at the configured offset into the shared buffer
  RCCE_shmalloc_init(RC_SHM_BUFFER_START()+RCCE_SHM_BUFFER_offset ,RCCE_SHM_SIZE_MAX);
#ifdef SHMDBG
  printf("\n%d:%s:%d: RCCE_SHM_BUFFER_offset, RCCE_SHM_SIZE_MAX: % x %x\n", RCCE_IAM, 
    __FILE__,__LINE__,RCCE_SHM_BUFFER_offset ,RCCE_SHM_SIZE_MAX);
#endif
#else
  RCCE_shmalloc_init(RC_SHM_BUFFER_START(),RCCE_SHM_SIZE_MAX);
#endif

  // initialize the (global) flag bookkeeping data structure
  for (loc=0; loc<RCCE_FLAGS_PER_LINE; loc++) 
    RCCE_flags.flag[loc] = (char)((unsigned int)0);
  RCCE_flags.line_address = NULL;
  RCCE_flags.members=0;
  RCCE_flags.next=NULL;

  // create global communicator (equivalent of MPI_COMM_WORLD); this will also allocate 
  // the two synchronization flags associated with the global barrier 
  // NOTE(review): the return value of RCCE_comm_split is not checked here
  RCCE_comm_split(RCCE_global_color, nothing, &RCCE_COMM_WORLD);

  // if power management is enabled, initialize more stuff; this includes two more 
  // communicators (for voltage and frequency domains), plus two synchronization flags
  // associated with the barrier for each communicator       
#ifdef RC_POWER_MANAGEMENT
  if (error=RCCE_init_RPC(RC_COREID, RCCE_IAM, RCCE_NP)) 
       return(RCCE_error_return(RCCE_debug_RPC,error));
#endif

#ifndef GORY
  // if we use the simplified API, we need to define more flags upfront  
  for (ue=0; ue<RCCE_NP; ue++) {
    if (error=RCCE_flag_alloc(&RCCE_sent_flag[ue]))
      return(RCCE_error_return(RCCE_debug_synch,error));
    if (error=RCCE_flag_alloc(&RCCE_ready_flag[ue]))
      return(RCCE_error_return(RCCE_debug_synch,error));
  }

#endif

  return (RCCE_SUCCESS);
}
Esempio n. 21
0
// Receive "size" bytes from the UE with rank "source" into "privbuf".
//
// The rank is translated to a core ID through RC_COREID, which selects the message
// buffer (msgbuf) and binding (barray) used for that sender. The function then
// processes incoming messages until the buffer is marked pending.
//
// Without BULK_TRANSFER_ENABLED the pending message must have exactly "size" bytes; it
// is copied into privbuf, its storage is freed and a reply is sent to the sender.
// With BULK_TRANSFER_ENABLED the message buffer is pointed at privbuf up front and
// the sender is told that the receiver is ready; no copy is made here.
//
// Returns RCCE_SUCCESS, or reports RCCE_ERROR_ID when "source" is not a valid rank.
// Failed transport calls and unexpected buffer states trip assertions.
int RCCE_recv(char *privbuf, size_t size, int source)
{
    errval_t err;
#ifdef MEASURE_TIME
    double recv_start = RCCE_wtime();
#endif

#ifdef RCCE_PERF_MEASURE
    // NOTE(review): d is not referenced directly in this function; presumably the PERF
    // macro uses it - confirm
    dispatcher_handle_t handle = curdispatcher();
    struct dispatcher_shared_generic* d =
        get_dispatcher_shared_generic(handle);
#endif

    // only ranks 0 .. RCCE_NP-1 exist
    if (source<0 || source >= RCCE_NP) {
        return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_ID));
    }

    // message buffers and bindings are indexed by core ID, not by rank
    int core_id = RC_COREID[source];
    struct msg_buf *mb = &msgbuf[core_id];
#ifdef BULK_TRANSFER_ENABLED
    // describe the destination buffer before announcing readiness to the sender
    mb->bulk_ready = true;
    mb->length = size;
    mb->current = 0;
    mb->msg = privbuf;
#endif

    dprintf("%d: R(%lu,%d,%p,%d,%p)\n", my_core_id, size, source, mb, mb->pending, privbuf);

#ifdef BULK_TRANSFER_ENABLED
    // tell the sender we are ready to receive "size" bytes
    err = barray[core_id]->tx_vtbl.bulk_recv_ready(barray[core_id], NOP_CONT,
            my_core_id, size);
    assert(err_is_ok(err));
#endif

    PERF(30);

    // handle incoming messages until one for this buffer has been flagged as pending
    while(!mb->pending) {
        messages_wait_and_handle_next();
    }

    PERF(31);

    dprintf("%d: msg arrived\n", my_core_id);

    /* if(size <= DEFAULT_UMP_BUFLEN) { */
#ifndef BULK_TRANSFER_ENABLED
    // the sender must have delivered exactly the number of bytes asked for
    assert(size == mb->length);
    memcpy(privbuf, mb->msg, size);
    /* } else { */
#else
    assert(mb->bulk);
#endif
    /* } */
    // the message has been consumed
    mb->pending = false;

#ifndef BULK_TRANSFER_ENABLED
    assert(!mb->bulk);
    // release the message storage, then reply to the sender
    free(mb->msg);
    PERF(32);
    err = barray[core_id]->tx_vtbl.message_reply(barray[core_id],
            NOP_CONT, my_core_id);
    PERF(33);
    assert(err_is_ok(err));
#else
    assert(mb->bulk);
#endif

#ifdef MEASURE_TIME
    measure_rcce_time += RCCE_wtime() - recv_start;
#endif

    return (RCCE_SUCCESS);
}
Esempio n. 22
0
//--------------------------------------------------------------------------------------
// FUNCTION: RCCE_nb_barrier
//--------------------------------------------------------------------------------------
// non-blocking version of the linear barrier 
//
// The function is resumable: instead of spinning it returns RCCE_PENDING and records
// in comm->label where to continue (1 = root still gathering, 2 = non-root waiting
// for release). The caller is expected to call it again with the same communicator
// until it returns RCCE_SUCCESS (or an error). State that must survive between calls
// is kept in comm->count, comm->cycle and comm->label; the local line buffers are
// not preserved across calls.
//
// comm    communicator whose members synchronize
// returns RCCE_SUCCESS once the barrier is cleared, RCCE_PENDING if it is not yet
//         cleared, or the value of RCCE_error_return() if a get/put/flag call fails
//--------------------------------------------------------------------------------------
int RCCE_nb_barrier(RCCE_COMM *comm) {
 
  // line-sized, line-aligned staging buffers for the flag cache lines
  volatile unsigned char cyclechar[RCCE_LINE_SIZE] __attribute__ ((aligned (RCCE_LINE_SIZE)));
  volatile unsigned char   valchar[RCCE_LINE_SIZE] __attribute__ ((aligned (RCCE_LINE_SIZE)));
  int   i, error;
  int   ROOT      =  0;
#ifdef USE_FLAG_EXPERIMENTAL
  volatile char *cycle;
  volatile char *val;
  cycle  = (volatile char *)cyclechar;
  val    = (volatile char *)valchar;
#else
  volatile int *cycle;
  volatile int *val;
  cycle  = (volatile int *)cyclechar;
  val    = (volatile int *)valchar;
#endif

  // resume a barrier that a previous call left pending
  if(comm->label == 1) goto label1;
  if(comm->label == 2) goto label2;

  // fresh entry: nobody has been counted yet
  comm->count = 0;

  if (RCCE_debug_synch) 
    fprintf(STDERR,"UE %d has checked into barrier\n", RCCE_IAM);

#ifdef USE_FAT_BARRIER

  // flip local barrier variable
#ifndef USE_FLAG_EXPERIMENTAL
  if ((error = RCCE_get(cyclechar, (t_vcharp)(comm->gather[RCCE_IAM]), RCCE_LINE_SIZE, RCCE_IAM)))
#else
  if ((error = RCCE_get_flag(cyclechar, (t_vcharp)(comm->gather[RCCE_IAM]), RCCE_LINE_SIZE, RCCE_IAM)))
#endif
    return(RCCE_error_return(RCCE_debug_synch,error));
  *cycle = !(*cycle);
#ifndef USE_FLAG_EXPERIMENTAL
  if ((error = RCCE_put((t_vcharp)(comm->gather[RCCE_IAM]), cyclechar, RCCE_LINE_SIZE, RCCE_IAM)))
#else
  if ((error = RCCE_put_flag((t_vcharp)(comm->gather[RCCE_IAM]), cyclechar, RCCE_LINE_SIZE, RCCE_IAM)))
#endif
    return(RCCE_error_return(RCCE_debug_synch,error));
  // also deposit the flipped value in the root's MPB, where the root polls it
  if ((error = RCCE_put((t_vcharp)(comm->gather[RCCE_IAM]), cyclechar, RCCE_LINE_SIZE, comm->member[ROOT])))
    return(RCCE_error_return(RCCE_debug_synch,error));
 
  if (RCCE_IAM==comm->member[ROOT]) {
    // read "remote" gather flags; once all equal "cycle" (i.e counter==comm->size),
    // we know all UEs have reached the barrier
    // remember the flipped value: *cycle is not valid after a resume via label1
    comm->cycle = *cycle;
label1:
    while (comm->count != comm->size) {
      // skip the first member (#0), because that is the ROOT
      for (comm->count=i=1; i<comm->size; i++) {
        /* copy flag values out of comm buffer */
#ifndef USE_FLAG_EXPERIMENTAL
        if ((error = RCCE_get(valchar, (t_vcharp)(comm->gather[i]), RCCE_LINE_SIZE, RCCE_IAM)))
#else
        if ((error = RCCE_get_flag(valchar, (t_vcharp)(comm->gather[i]), RCCE_LINE_SIZE, RCCE_IAM)))
#endif
          return(RCCE_error_return(RCCE_debug_synch,error));
        if (*val == comm->cycle) comm->count++;
      }
      // not everybody has arrived yet: hand control back instead of spinning
      if(comm->count != comm->size) {
        comm->label = 1;
        return(RCCE_PENDING);
      }
    }
    // set release flags
    for (i=1; i<comm->size; i++) {
      if ((error = RCCE_flag_write(&(comm->release), comm->cycle, comm->member[i])))
        return(RCCE_error_return(RCCE_debug_synch,error));
    }   
  }
  else {
    int test;
    // remember the flipped value: *cycle is not valid after a resume via label2
    comm->cycle = *cycle;
label2:
    // a failed test would leave "test" unset, so the result must be checked
    if ((error = RCCE_test_flag(comm->release, comm->cycle, &test)))
      return(RCCE_error_return(RCCE_debug_synch,error));
    if(!test) {
      comm->label = 2;
      return(RCCE_PENDING);
    }
  }

  // barrier cleared: the next call starts from the top again
  comm->label = 0;

#else // !USE_FAT_BARRIER

  // flip local barrier variable
#ifndef USE_FLAG_EXPERIMENTAL
  if ((error = RCCE_get(cyclechar, (t_vcharp)(comm->gather[0]), RCCE_LINE_SIZE, RCCE_IAM)))
#else
  if ((error = RCCE_get_flag(cyclechar, (t_vcharp)(comm->gather[0]), RCCE_LINE_SIZE, RCCE_IAM)))
#endif
    return(RCCE_error_return(RCCE_debug_synch,error));
  *cycle = !(*cycle);
#ifndef USE_FLAG_EXPERIMENTAL
  if ((error = RCCE_put((t_vcharp)(comm->gather[0]), cyclechar, RCCE_LINE_SIZE, RCCE_IAM)))
#else
  if ((error = RCCE_put_flag((t_vcharp)(comm->gather[0]), cyclechar, RCCE_LINE_SIZE, RCCE_IAM)))
#endif
    return(RCCE_error_return(RCCE_debug_synch,error));

  if (RCCE_IAM==comm->member[ROOT]) {
    // read "remote" gather flags; once all equal "cycle" (i.e counter==comm->size), 
    // we know all UEs have reached the barrier
    // remember the flipped value: *cycle is not valid after a resume via label1
    comm->cycle = *cycle;
label1:    
    while (comm->count != comm->size) {
      // skip the first member (#0), because that is the ROOT         
      for (comm->count=i=1; i<comm->size; i++) {
        /* copy flag values out of comm buffer                        */
#ifndef USE_FLAG_EXPERIMENTAL
        if ((error = RCCE_get(valchar, (t_vcharp)(comm->gather[0]), RCCE_LINE_SIZE, 
                             comm->member[i])))
#else
        if ((error = RCCE_get_flag(valchar, (t_vcharp)(comm->gather[0]), RCCE_LINE_SIZE, 
                             comm->member[i])))
#endif
          return(RCCE_error_return(RCCE_debug_synch,error));
        if (*val == comm->cycle) comm->count++;
      }
      // not everybody has arrived yet: hand control back instead of spinning
      if(comm->count != comm->size) {
        comm->label = 1;
        return(RCCE_PENDING);
      }
    }
    // set release flags                                              
    for (i=1; i<comm->size; i++) {
      if ((error = RCCE_flag_write(&(comm->release), comm->cycle, comm->member[i])))
        return(RCCE_error_return(RCCE_debug_synch,error));
    }
  }
  else {
    int test;
    // remember the flipped value: *cycle is not valid after a resume via label2
    comm->cycle = *cycle;
label2:
    // a failed test would leave "test" unset, so the result must be checked
    if ((error = RCCE_test_flag(comm->release, comm->cycle, &test)))
      return(RCCE_error_return(RCCE_debug_synch,error));
    if(!test) {
      comm->label = 2;
      return(RCCE_PENDING);
    }
  }

  // barrier cleared: the next call starts from the top again
  comm->label = 0;

#endif // !USE_FAT_BARRIER
  if (RCCE_debug_synch) fprintf(STDERR,"UE %d has cleared barrier\n", RCCE_IAM);  
  return(RCCE_SUCCESS);
}
Esempio n. 23
0
//--------------------------------------------------------------------------------------
// FUNCTION: RCCE_put
//--------------------------------------------------------------------------------------
// copy data from address "source" in the local MPB or the calling UE's private memory 
// to address "target" in the remote MPB. We do not test to see if a move from the 
// calling UE's private memory stays within allocated memory                        
//
// target     address inside the LOCAL comm buffer; it is translated to the same
//            offset inside the comm buffer of UE "ID" before copying
// source     data to copy, in the local comm buffer or in private memory
// num_bytes  number of bytes to copy; GORY mode requires a non-negative multiple of
//            RCCE_LINE_SIZE
// ID         rank of the UE whose comm buffer receives the data
//
// returns RCCE_SUCCESS, or (GORY mode only) the value of RCCE_error_return() for
// RCCE_ERROR_TARGET, RCCE_ERROR_SOURCE, RCCE_ERROR_ID, RCCE_ERROR_MESSAGE_LENGTH,
// RCCE_ERROR_DATA_OVERLAP or RCCE_ERROR_ALIGNMENT
//--------------------------------------------------------------------------------------
int RCCE_put(
  t_vcharp target, // target buffer, MPB
  t_vcharp source, // source buffer, MPB or private memory
  int num_bytes, 
  int ID
  ) {

#ifdef GORY
  // we only need to do tests in GORY mode; in non-GORY mode this function is never 
  // called by the user, but only by the library
  int copy_mode;

  // check validity of parameters                                        
  if (!target) return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_TARGET));
  if (!source) return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_SOURCE));
  if (ID<0 || 
      ID>=RCCE_NP) return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_ID));
  if (num_bytes < 0 || num_bytes%RCCE_LINE_SIZE!=0) 
     return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_MESSAGE_LENGTH));
  // determine if target data is in MPB; check using local buffer boundaries 
  if (target - RCCE_comm_buffer[RCCE_IAM]>=0 &&
      target+num_bytes - (RCCE_comm_buffer[RCCE_IAM] + RCCE_BUFF_SIZE)<=0)
    // shift target address to point to remote MPB                
    target = RCCE_comm_buffer[ID]+(target-RCCE_comm_buffer[RCCE_IAM]);    
  else  return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_TARGET));

  // source can be either local MPB or private memory            
  if (source - RCCE_comm_buffer[RCCE_IAM] >= 0 &&
      source+num_bytes - (RCCE_comm_buffer[RCCE_IAM] + RCCE_BUFF_SIZE)<=0)
    copy_mode = BOTH_IN_COMM_BUFFER;
  else 
    copy_mode = SOURCE_IN_PRIVATE_MEMORY;    

  // make sure that if the copy is between locations within the same MPB
  // there is no overlap between source and target address ranges: two ranges of
  // num_bytes bytes overlap when the distance between their start addresses is
  // smaller than num_bytes. Identical start addresses are not rejected, which
  // keeps the strict comparison the previous test used.
  if ( copy_mode == BOTH_IN_COMM_BUFFER) {
    if (((source-target)>0 && (source-target)<num_bytes) ||
        ((target-source)>0 && (target-source)<num_bytes)) {
      return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_DATA_OVERLAP));
    }
  }

  // ascertain that the start of the buffer is cache line aligned; target has
  // already been shifted into the comm buffer of UE "ID"
  int start_index = target-RCCE_comm_buffer[ID];
  if (start_index%RCCE_LINE_SIZE!=0) 
    return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_ALIGNMENT));

  // only verify alignment of the source if it is in the MPB; the source lives in
  // the LOCAL comm buffer, so its offset is taken relative to that buffer
  if (copy_mode == BOTH_IN_COMM_BUFFER) {
    start_index = source-RCCE_comm_buffer[RCCE_IAM];
    if (start_index%RCCE_LINE_SIZE!=0) 
      return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_ALIGNMENT));
  }
#else
  // in non-GORY mode we only need to retain the MPB target shift; we
  // already know the target is in the MPB, not private memory
  target = RCCE_comm_buffer[ID]+(target-RCCE_comm_buffer[RCCE_IAM]);    
#endif

  // make sure that any data that has been put in our MPB by another UE is visible 
#ifdef _OPENMP
  #pragma omp flush
#endif

  // do the actual copy 
  RC_cache_invalidate();
  memcpy((void *)target, (void *)source, num_bytes);

  // flush data to make it visible to all threads; cannot use flush list because it 
  // concerns malloced space                        
#ifdef _OPENMP
  #pragma omp flush
#endif
  return(RCCE_SUCCESS);
}
Esempio n. 24
0
//--------------------------------------------------------------------------------------
// FUNCTION: RCCE_send_general
//--------------------------------------------------------------------------------------
// Synchronized send function (gory and non-gory mode)
//
// The message is moved through the MPB buffer "combuf" in up to three stages:
//   1. as many full pieces of "chunk" bytes as fit into "size" (optionally pipelined
//      as two sub-chunks per piece when "pipe" is set),
//   2. the whole cache lines of what is left over,
//   3. a final partial cache line, staged through a line-sized local pad buffer.
// Every piece is handshaked with the "sent" and "ready" flags:
//   - USE_REMOTE_PUT_LOCAL_GET: wait for "ready", clear it locally, put the data into
//     the MPB of "dest", then set "sent" at "dest".
//   - otherwise (standard): put the data into the own MPB, set "sent" at "dest", then
//     wait for "ready" and clear it locally. With "mcast" set, the non-pipelined
//     stages replace the flag handshake by two RCCE_TNS_barrier calls on
//     RCCE_COMM_WORLD.
// "chunk" is used as a divisor and therefore must not be zero.
// The return values of RCCE_put, RCCE_flag_write*, RCCE_wait_until and
// RCCE_TNS_barrier are not checked here.
//
// returns RCCE_SUCCESS; with USE_REMOTE_PUT_LOCAL_GET a multicast request is rejected
// with the value of RCCE_error_return() for RCCE_ERROR_NO_MULTICAST_SUPPORT
//--------------------------------------------------------------------------------------
static int RCCE_send_general(
  char *privbuf,    // source buffer in local private memory (send buffer)
  t_vcharp combuf,  // intermediate buffer in MPB
  size_t chunk,     // size of MPB available for this message (bytes)
  RCCE_FLAG *ready, // flag indicating whether receiver is ready
  RCCE_FLAG *sent,  // flag indicating whether message has been sent by source
  size_t size,      // size of message (bytes)
  int dest,         // UE that will receive the message
  int copy,         // set to 0 for synchronization only (no copying/sending)
  int pipe,         // use pipelining?
  int mcast,        // multicast?
  void* tag,        // additional tag?
  int len,          // length of additional tag
  RCCE_FLAG *probe  // flag for probing for incoming messages
  ) {

  // NOTE: padline is not initialized; only the first "remainder" bytes are filled
  // before a full line is put into the MPB in the last stage
  char padline[RCCE_LINE_SIZE]; // copy buffer, used if message not multiple of line size
  size_t wsize,    // offset within send buffer when putting in "chunk" bytes
        remainder, // bytes remaining to be sent
        nbytes;    // number of bytes to be sent in single RCCE_put call
  char *bufptr;    // running pointer inside privbuf for current location

#ifdef USE_REMOTE_PUT_LOCAL_GET
  if(mcast) return(RCCE_error_return(1, RCCE_ERROR_NO_MULTICAST_SUPPORT));
#endif

  // if a probe flag was supplied, set it at the destination before anything else
  if(probe)
#ifdef USE_TAGGED_FLAGS
    RCCE_flag_write_tagged(probe, RCCE_FLAG_SET, dest, tag, len);
#else
    RCCE_flag_write(probe, RCCE_FLAG_SET, dest);
#endif

#ifdef USE_SYNCH_FOR_ZERO_BYTE
  // synchronize even in case of zero byte messages:
  if(size == 0) {
#ifdef USE_REMOTE_PUT_LOCAL_GET
    RCCE_wait_until(*ready, RCCE_FLAG_SET);
    RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);
#ifdef USE_TAGGED_FLAGS
    // the tag travels with the "sent" flag only when no probe flag carried it already
    if(!probe)
      RCCE_flag_write_tagged(sent, RCCE_FLAG_SET, dest, tag, len);
    else
#endif
    RCCE_flag_write(sent, RCCE_FLAG_SET, dest);
#else // LOCAL PUT / REMOTE GET: (standard)
#ifdef USE_TAGGED_FLAGS
    // the tag travels with the "sent" flag only when no probe flag carried it already
    if(!probe)
      RCCE_flag_write_tagged(sent, RCCE_FLAG_SET, dest, tag, len);
    else
#endif
    RCCE_flag_write(sent, RCCE_FLAG_SET, dest);
    RCCE_wait_until(*ready, RCCE_FLAG_SET);
    RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);
#endif // !USE_REMOTE_PUT_LOCAL_GET
    return(RCCE_SUCCESS);
  }
#endif // USE_SYNCH_FOR_ZERO_BYTE

  if(!pipe) {
    // send data in units of available chunk size of comm buffer 
    for (wsize=0; wsize< (size/chunk)*chunk; wsize+=chunk) {
      bufptr = privbuf + wsize;
      nbytes = chunk;

#ifdef USE_REMOTE_PUT_LOCAL_GET

      // the receiver must signal readiness before its MPB may be overwritten
      RCCE_wait_until(*ready, RCCE_FLAG_SET);
      RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);
      
      // copy private data to remote comm buffer
      if(copy) RCCE_put(combuf, (t_vcharp) bufptr, nbytes, dest);

#ifdef USE_TAGGED_FLAGS
      // only the first piece (wsize == 0) carries the tag, and only without probe
      if( (wsize == 0) && (!probe) )
	RCCE_flag_write_tagged(sent, RCCE_FLAG_SET, dest, tag, len);
      else
#endif
      RCCE_flag_write(sent, RCCE_FLAG_SET, dest);

#else // LOCAL PUT / REMOTE GET: (standard)

      // copy private data to own comm buffer
      if(copy) RCCE_put(combuf, (t_vcharp) bufptr, nbytes, RCCE_IAM);

      if(!mcast) {
#ifdef USE_TAGGED_FLAGS
	// only the first piece (wsize == 0) carries the tag, and only without probe
	if( (wsize == 0) && (!probe) )
	  RCCE_flag_write_tagged(sent, RCCE_FLAG_SET, dest, tag, len);
	else
#endif
	RCCE_flag_write(sent, RCCE_FLAG_SET, dest);

	// wait for the destination to be ready to receive a message          
	RCCE_wait_until(*ready, RCCE_FLAG_SET);
	RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);
      }
      else {
	// multicast: two barriers on the global communicator replace the flag handshake
	RCCE_TNS_barrier(&RCCE_COMM_WORLD);
	RCCE_TNS_barrier(&RCCE_COMM_WORLD);
      }
#endif // !USE_REMOTE_PUT_LOCAL_GET

    } // for
  }
  else // if(!pipe) ->  if(pipe)
  {
    // pipelined version of send/recv:
    // each chunk is sent as two sub-chunks, each with its own sent/ready handshake;
    // "mcast" is not consulted in this branch
    size_t subchunk1, subchunk2;

    for(wsize = 0; wsize < (size/chunk)*chunk; wsize+=chunk) {

      // sub-chunk sizes are computed once, in the first iteration, and then reused
      if(wsize == 0) {
	// align sub-chunks to cache line granularity:
	subchunk1 = ( (chunk / 2) / RCCE_LINE_SIZE ) * RCCE_LINE_SIZE;
	subchunk2 = chunk - subchunk1;
      }

      bufptr = privbuf + wsize;
      nbytes = subchunk1;

#ifdef USE_REMOTE_PUT_LOCAL_GET

      RCCE_wait_until(*ready, RCCE_FLAG_SET);
      RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);

      // copy private data chunk 1 to remote comm buffer
      if(copy) RCCE_put(combuf, (t_vcharp) bufptr, nbytes, dest);

#ifdef USE_TAGGED_FLAGS
      // only the first piece (wsize == 0) carries the tag, and only without probe
      if( (wsize == 0) && (!probe) )
	RCCE_flag_write_tagged(sent, RCCE_FLAG_SET, dest, tag, len);
      else
#endif
      RCCE_flag_write(sent, RCCE_FLAG_SET, dest);

#else // LOCAL PUT / REMOTE GET: (standard)
      
      // copy private data chunk 1 to own comm buffer
      if(copy) RCCE_put(combuf, (t_vcharp) bufptr, nbytes, RCCE_IAM);

#ifdef USE_TAGGED_FLAGS
      // only the first piece (wsize == 0) carries the tag, and only without probe
      if( (wsize == 0) && (!probe) )
	RCCE_flag_write_tagged(sent, RCCE_FLAG_SET, dest, tag, len);
      else
#endif
      RCCE_flag_write(sent, RCCE_FLAG_SET, dest);
      
      RCCE_wait_until(*ready, RCCE_FLAG_SET);
      RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);

#endif // !USE_REMOTE_PUT_LOCAL_GET      
      
      // second half of the chunk goes into the upper part of combuf
      bufptr = privbuf + wsize + subchunk1;
      nbytes = subchunk2;
      
#ifdef USE_REMOTE_PUT_LOCAL_GET

      RCCE_wait_until(*ready, RCCE_FLAG_SET);
      RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);
      
      // copy private data chunk 2 to remote comm buffer
      if(copy) RCCE_put(combuf + subchunk1, (t_vcharp) bufptr, nbytes, dest);

      RCCE_flag_write(sent, RCCE_FLAG_SET, dest);

#else // LOCAL PUT / REMOTE GET: (standard)

      // copy private data chunk 2 to own comm buffer
      if(copy) RCCE_put(combuf + subchunk1, (t_vcharp) bufptr, nbytes, RCCE_IAM);
      
      RCCE_flag_write(sent, RCCE_FLAG_SET, dest);
      
      RCCE_wait_until(*ready, RCCE_FLAG_SET);
      RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);

#endif // !USE_REMOTE_PUT_LOCAL_GET

    } //for

  } // if(pipe)

  // at this point wsize == (size/chunk)*chunk, so the "wsize == 0" tests below are
  // true only when no full chunk has been sent above
  remainder = size%chunk; 
  // if nothing is left over, we are done 
  if (!remainder) return(RCCE_SUCCESS);

  // send remainder of data--whole cache lines            
  bufptr = privbuf + (size/chunk)*chunk;
  nbytes = remainder - remainder%RCCE_LINE_SIZE;

  if (nbytes) {

#ifdef USE_REMOTE_PUT_LOCAL_GET

    RCCE_wait_until(*ready, RCCE_FLAG_SET);
    RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);
    
    // copy private data to remote comm buffer
    if(copy) RCCE_put(combuf, (t_vcharp) bufptr, nbytes, dest);

#ifdef USE_TAGGED_FLAGS
    if( (wsize == 0) && (!probe) )
      RCCE_flag_write_tagged(sent, RCCE_FLAG_SET, dest, tag, len);
    else
#endif
    RCCE_flag_write(sent, RCCE_FLAG_SET, dest);
    
#else // LOCAL PUT / REMOTE GET: (standard)

    // copy private data to own comm buffer
    if(copy) RCCE_put(combuf, (t_vcharp)bufptr, nbytes, RCCE_IAM);

    if(!mcast) {
#ifdef USE_TAGGED_FLAGS
      if( (wsize == 0) && (!probe) )
	RCCE_flag_write_tagged(sent, RCCE_FLAG_SET, dest, tag, len);
      else
#endif
      RCCE_flag_write(sent, RCCE_FLAG_SET, dest);

      // wait for the destination to be ready to receive a message          
      RCCE_wait_until(*ready, RCCE_FLAG_SET);
      RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);
    }
    else {
      // multicast: two barriers on the global communicator replace the flag handshake
      RCCE_TNS_barrier(&RCCE_COMM_WORLD);
      RCCE_TNS_barrier(&RCCE_COMM_WORLD);
    }
#endif // !USE_REMOTE_PUT_LOCAL_GET

  } // if(nbytes)
   
  remainder = remainder%RCCE_LINE_SIZE;
  if (!remainder) return(RCCE_SUCCESS);
  
  // remainder is less than a cache line. This must be copied into appropriately sized 
  // intermediate space before it can be sent to the receiver 
  // (nbytes still holds the number of whole-line bytes handled just above)
  bufptr = privbuf + (size/chunk)*chunk + nbytes;
  nbytes = RCCE_LINE_SIZE;

  if(copy) {
#ifdef COPPERRIDGE
    memcpy_scc(padline,bufptr,remainder);
#else
    memcpy(padline,bufptr,remainder);
#endif
  }

#ifdef USE_REMOTE_PUT_LOCAL_GET

  RCCE_wait_until(*ready, RCCE_FLAG_SET);
  RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);
  
  // copy private data to remote comm buffer
  if(copy) RCCE_put(combuf, (t_vcharp) padline, nbytes, dest);

#ifdef USE_TAGGED_FLAGS
  if( (wsize == 0) && (!probe) )
    RCCE_flag_write_tagged(sent, RCCE_FLAG_SET, dest, tag, len);
  else
#endif
  RCCE_flag_write(sent, RCCE_FLAG_SET, dest);

#else // LOCAL PUT / REMOTE GET: (standard)

  // copy private data to own comm buffer 
  if(copy) RCCE_put(combuf, (t_vcharp)padline, nbytes, RCCE_IAM);
  
  if(!mcast) {
#ifdef USE_TAGGED_FLAGS
    if( (wsize == 0) && (!probe) )
      RCCE_flag_write_tagged(sent, RCCE_FLAG_SET, dest, tag, len);
    else
#endif
    RCCE_flag_write(sent, RCCE_FLAG_SET, dest);

    // wait for the destination to be ready to receive a message          
    RCCE_wait_until(*ready, RCCE_FLAG_SET);
    RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);
  }
  else {
    // multicast: two barriers on the global communicator replace the flag handshake
    RCCE_TNS_barrier(&RCCE_COMM_WORLD);
    RCCE_TNS_barrier(&RCCE_COMM_WORLD);
  }

#endif // !USE_REMOTE_PUT_LOCAL_GET

  return(RCCE_SUCCESS);
}