//-------------------------------------------------------------------------------------- // FUNCTION: RCCE_wait_until //-------------------------------------------------------------------------------------- // wait until flag in local MPB becomes set or unset. To avoid reading stale data from // the cache instead of new flag value from the MPB, issue MPB cache invalidation before // each read, including within the spin cycle //-------------------------------------------------------------------------------------- int RCCE_wait_until(RCCE_FLAG flag, RCCE_FLAG_STATUS val) { t_vcharp cflag; cflag = (t_vcharp) flag; #ifdef GORY if (val != RCCE_FLAG_UNSET && val != RCCE_FLAG_SET) return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_STATUS_UNDEFINED)); if (!cflag) return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_NOT_ALLOCATED)); // check to see if flag is properly contained in the local comm buffer if (cflag - RCCE_comm_buffer[RCCE_IAM]>=0 && cflag+RCCE_LINE_SIZE - (RCCE_comm_buffer[RCCE_IAM] + RCCE_BUFF_SIZE)<0){} else { return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_NOT_IN_COMM_BUFFER)); } #endif // always flush/invalidate to ensure we read the most recent value of *flag // keep reading it until it has the required value. We only need to read the // first int of the MPB cache line containing the flag do { #ifdef _OPENMP #pragma omp flush #endif RC_cache_invalidate(); } while ((*flag) != val); return(RCCE_SUCCESS); }
//-------------------------------------------------------------------------------------- // RCCE_bcast //-------------------------------------------------------------------------------------- // function that sends data from UE root to all other UEs in the communicator //-------------------------------------------------------------------------------------- int RCCE_bcast( char *buf, // private memory, used for sending (root) and receiving (other UEs) size_t num, // number of bytes to be sent int root, // source within "comm" of broadcast data RCCE_COMM comm // communication domain ) { int ue, ierr; #ifdef GORY printf("Collectives only implemented for simplified API\n"); return(1); #else // check to make sure root is member of the communicator if (root<0 || root >= comm.size) return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_ID)); if (RCCE_IAM == comm.member[root]) { for (ue=0; ue<comm.size; ue++) if (ue != root) if((ierr=RCCE_send(buf, num, comm.member[ue]))) return(RCCE_error_return(RCCE_debug_comm,ierr)); } else if((ierr=RCCE_recv(buf, num, comm.member[root]))) return(RCCE_error_return(RCCE_debug_comm,ierr)); return(RCCE_SUCCESS); #endif }
//-------------------------------------------------------------------------------------- // FUNCTION: RCCE_wait_until //-------------------------------------------------------------------------------------- // wait until flag in local MPB becomes set or unset. To avoid reading stale data from // the cache instead of new flag value from the MPB, issue MPB cache invalidation before // each read, including within the spin cycle //-------------------------------------------------------------------------------------- int RCCE_wait_until(RCCE_FLAG flag, RCCE_FLAG_STATUS val) { t_vcharp cflag; cflag = flag.line_address; // avoid tests if we use the simplified API #ifdef GORY if (val != RCCE_FLAG_UNSET && val != RCCE_FLAG_SET) return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_STATUS_UNDEFINED)); if (!cflag) return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_NOT_ALLOCATED)); // check to see if flag is properly contained in the local comm buffer if (cflag - RCCE_comm_buffer[RCCE_IAM]>=0 && cflag+RCCE_LINE_SIZE - (RCCE_comm_buffer[RCCE_IAM] + RCCE_BUFF_SIZE)<0){} else { return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_NOT_IN_COMM_BUFFER)); } #endif // always flush/invalidate to ensure we read the most recent value of *flag // keep reading it until it has the required value do { #ifdef _OPENMP #pragma omp flush #endif RC_cache_invalidate(); } while ((RCCE_bit_value(cflag, flag.location) != val)); return(RCCE_SUCCESS); }
//-------------------------------------------------------------------------------------- // FUNCTION: RCCE_flag_write //-------------------------------------------------------------------------------------- // This is the core flag manipulation routine. It requires locking to guarantee atomic // access while updating one of a line of flags. //-------------------------------------------------------------------------------------- int RCCE_flag_write(RCCE_FLAG *flag, RCCE_FLAG_STATUS val, int ID) { t_vchar val_array[1]; #ifdef GORY // check input parameters if (!flag || flag->location < 0 || flag->location > RCCE_FLAGS_PER_LINE) return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_UNDEFINED)); if (val != RCCE_FLAG_UNSET && val != RCCE_FLAG_SET) return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_STATUS_UNDEFINED)); if (ID<0 || ID>=RCCE_NP) return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_ID)); #endif // acquire lock to make sure nobody else fiddles with the flags on the target core RCCE_acquire_lock(ID); // copy word from MPB containing flag to private memory RCCE_get_char(val_array, flag->flag_addr, ID); // overwrite single bit within local copy of flag word RCCE_write_bit_value(val_array, (flag->location)%RCCE_FLAGS_PER_BYTE, val); // write copy back to the MPB RCCE_put_char(flag->flag_addr, val_array, ID); // release write lock for the flags on the target core RCCE_release_lock(ID); return(RCCE_SUCCESS); }
//-------------------------------------------------------------------------------------- // FUNCTION: RCCE_flag_alloc //-------------------------------------------------------------------------------------- // allocate space for one flag. Since multiple fit on a single cache line, we only // need to allocate new MPB space when all the existing lines are completely filled. A // flag line is a data structure that contains an array ("flag") of size RCCE_LINE_SIZE // characters. Each element in "flag" corresponds to a flag being in use (value is 1) // or not (value is 0). The actual value of the flag is stored in the MPB line pointed // to be the field "line_address," at the corresponding bit/byte location as in field // "flag." //-------------------------------------------------------------------------------------- int RCCE_flag_alloc(RCCE_FLAG *flag) { RCCE_FLAG_LINE *flagp; t_vcharp flag_addr; int c, loc; // find the head of the data structure that administers the flag variables flagp = &RCCE_flags; while (flagp->members == RCCE_FLAGS_PER_LINE && flagp->next) { flagp = flagp->next; } // if this is a new flag line, need to allocate MPB for it if (!flagp->line_address) flagp->line_address = RCCE_malloc(RCCE_LINE_SIZE); if (!flagp->line_address) return(RCCE_error_return(RCCE_debug_synch, RCCE_ERROR_FLAG_NOT_ALLOCATED)); if (flagp->members < RCCE_FLAGS_PER_LINE) { // there is space in this line for a new flag; find first open slot for (loc=0; loc<RCCE_FLAGS_PER_LINE; loc++) { flag_addr = flagp->line_address + loc/RCCE_FLAGS_PER_BYTE; if (!((int)(flagp->flag[loc]))) { flagp->flag[loc] = (char) ((unsigned int) 1); flagp->members++; flag->location = loc; flag->line_address = flagp->line_address; flag->flag_addr = flag_addr; return(RCCE_SUCCESS); } } } else { // must create new flag line if last one was full flagp->next = (RCCE_FLAG_LINE *) malloc(sizeof(RCCE_FLAG_LINE)); if (!(flagp->next)) return(RCCE_error_return(RCCE_debug_synch, RCCE_ERROR_FLAG_NOT_ALLOCATED)); flagp = 
flagp->next; flagp->line_address = RCCE_malloc(RCCE_LINE_SIZE); if (!(flagp->line_address)) return(RCCE_error_return(RCCE_debug_synch, RCCE_ERROR_FLAG_NOT_ALLOCATED)); // initialize the flag line flagp->members=1; flagp->next = NULL; for (c=1; c<RCCE_LINE_SIZE; c++) flagp->flag[c] = (char)((unsigned int) 0); // set first flag field to indicate the corresponding flag is now in use flagp->flag[0] = (char)((unsigned int) 1); flag->location = 0; flag->line_address = flagp->line_address; flag->flag_addr = flag->line_address; } return(RCCE_SUCCESS); }
//-------------------------------------------------------------------------------------- // FUNCTION: RCCE_flag_write //-------------------------------------------------------------------------------------- // This is the core flag manipulation routine. //-------------------------------------------------------------------------------------- int RCCE_flag_write(RCCE_FLAG *flag, RCCE_FLAG_STATUS val, int ID) { #ifdef GORY // check input parameters if (!flag || flag->location < 0 || flag->location > RCCE_LINE_SIZE) return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_UNDEFINED)); if (val != RCCE_FLAG_UNSET && val != RCCE_FLAG_SET) return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_STATUS_UNDEFINED)); if (ID<0 || ID>=RCCE_NP) return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_ID)); #endif RCCE_put_char(flag->flag_addr, (t_vcharp) &val, ID); return(RCCE_SUCCESS); }
//-------------------------------------------------------------------------------------- // FUNCTION: RCCE_barrier //-------------------------------------------------------------------------------------- // very simple, linear barrier //-------------------------------------------------------------------------------------- int RCCE_barrier(RCCE_COMM *comm) { int counter, i, error; int ROOT = 0; volatile unsigned char cyclechar[RCCE_LINE_SIZE]; volatile unsigned char valchar[RCCE_LINE_SIZE]; volatile int *cycle; volatile int *val; counter = 0; cycle = (volatile int *)cyclechar; val = (volatile int *)valchar; if (RCCE_debug_synch) fprintf(STDERR,"UE %d has checked into barrier\n", RCCE_IAM); // flip local barrier variable if (error = RCCE_get(cyclechar, (t_vcharp)(comm->gather), RCCE_LINE_SIZE, RCCE_IAM)) return(RCCE_error_return(RCCE_debug_synch,error)); *cycle = !(*cycle); if (error = RCCE_put((t_vcharp)(comm->gather), cyclechar, RCCE_LINE_SIZE, RCCE_IAM)) return(RCCE_error_return(RCCE_debug_synch,error)); if (RCCE_IAM==comm->member[ROOT]) { // read "remote" gather flags; once all equal "cycle" (i.e counter==comm->size), // we know all UEs have reached the barrier while (counter != comm->size) { // skip the first member (#0), because that is the ROOT for (counter=i=1; i<comm->size; i++) { /* copy flag values out of comm buffer */ if (error = RCCE_get(valchar, (t_vcharp)(comm->gather), RCCE_LINE_SIZE, comm->member[i])) return(RCCE_error_return(RCCE_debug_synch,error)); if (*val == *cycle) counter++; } } // set release flags for (i=1; i<comm->size; i++) { if (error = RCCE_flag_write(&(comm->release), *cycle, comm->member[i])) return(RCCE_error_return(RCCE_debug_synch,error)); } } else { if (error = RCCE_wait_until(comm->release, *cycle)) return(RCCE_error_return(RCCE_debug_synch,error)); } if (RCCE_debug_synch) fprintf(STDERR,"UE %d has cleared barrier\n", RCCE_IAM); return(RCCE_SUCCESS); }
//-------------------------------------------------------------------------------------- // FUNCTION: RCCE_barrier //-------------------------------------------------------------------------------------- // very simple, linear barrier //-------------------------------------------------------------------------------------- int RCCE_barrier(RCCE_COMM *comm) { int counter, i, error; int ROOT = 0; t_vchar cyclechar[RCCE_LINE_SIZE]; t_vchar valchar [RCCE_LINE_SIZE]; t_vcharp gatherp, releasep; RCCE_FLAG_STATUS cycle; counter = 0; gatherp = comm->gather.line_address; if (RCCE_debug_synch) fprintf(STDERR,"UE %d has checked into barrier\n", RCCE_IAM); // flip local barrier variable if (error = RCCE_get(cyclechar, gatherp, RCCE_LINE_SIZE, RCCE_IAM)) return(RCCE_error_return(RCCE_debug_synch,error)); cycle = RCCE_flip_bit_value(cyclechar, comm->gather.location); if (error = RCCE_put(comm->gather.line_address, cyclechar, RCCE_LINE_SIZE, RCCE_IAM)) return(RCCE_error_return(RCCE_debug_synch,error)); if (RCCE_IAM==comm->member[ROOT]) { // read "remote" gather flags; once all equal "cycle" (i.e counter==comm->size), // we know all UEs have reached the barrier while (counter != comm->size) { // skip the first member (#0), because that is the ROOT for (counter=i=1; i<comm->size; i++) { // copy flag values out of comm buffer if (error = RCCE_get(valchar, comm->gather.line_address, RCCE_LINE_SIZE, comm->member[i])) return(RCCE_error_return(RCCE_debug_synch,error)); if (RCCE_bit_value(valchar, comm->gather.location) == cycle) counter++; } } // set release flags for (i=1; i<comm->size; i++) if (error = RCCE_flag_write(&(comm->release), cycle, comm->member[i])) return(RCCE_error_return(RCCE_debug_synch,error)); } else { if (error = RCCE_wait_until(comm->release, cycle)) return(RCCE_error_return(RCCE_debug_synch,error)); } if (RCCE_debug_synch) fprintf(STDERR,"UE %d has cleared barrier\n", RCCE_IAM); return(RCCE_SUCCESS); }
//-------------------------------------------------------------------------------------- // FUNCTION: RCCE_flag_read //-------------------------------------------------------------------------------------- // This routine is rarely needed. We typically only read a flag when we're waiting for // it to change value (function RCCE_wait_until). Reading does not require locking. The // moment the target flag we're trying to read changes value, it is OK to read and // return that value //-------------------------------------------------------------------------------------- int RCCE_flag_read(RCCE_FLAG flag, RCCE_FLAG_STATUS *val, int ID) { volatile unsigned char val_loc; #ifdef GORY if (flag.location < 0 || flag.location > RCCE_LINE_SIZE) return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_UNDEFINED)); if (!val) return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_VAL_UNDEFINED)); if (ID<0 || ID>=RCCE_NP) return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_ID)); #endif // Should be able to use same technique as in RCCE_wait_until, i.e., should not need // to copy out of MPB first. However, this function is not time critical RCCE_get_char(&val_loc, flag.flag_addr, ID); *val = val_loc; return(RCCE_SUCCESS); }
//-------------------------------------------------------------------------------------- // FUNCTION: RCCE_flag_read //-------------------------------------------------------------------------------------- // This routine is rarely needed. We typically only read a flag when we're waiting for // it to change value (function RCCE_wait_until). Reading does not require locking. The // moment the target flag we're trying to read changes value, it is OK to read and // return that value //-------------------------------------------------------------------------------------- int RCCE_flag_read(RCCE_FLAG flag, RCCE_FLAG_STATUS *val, int ID) { t_vchar val_array[1]; #ifdef GORY if (flag.location < 0 || flag.location > RCCE_FLAGS_PER_LINE) return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_UNDEFINED)); if (!val) return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_VAL_UNDEFINED)); if (ID<0 || ID>=RCCE_NP) return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_ID)); #endif // Should be able to use same technique as in RCCE_wait_until, i.e., should not need // to copy out of MPB first. However, this function is not time critical RCCE_get_char(val_array, flag.flag_addr, ID); *val = RCCE_bit_value(val_array, (flag.location)%RCCE_FLAGS_PER_BYTE); return(RCCE_SUCCESS); }
//--------------------------------------------------------------------------------------- // FUNCTION: RCCE_reduce //--------------------------------------------------------------------------------------- // Reduction function which delivers the reduction results to UE root //--------------------------------------------------------------------------------------- int RCCE_reduce( char *inbuf, // source buffer for reduction datan char *outbuf, // target buffer for reduction data int num, // number of data elements to be reduced int type, // type of data elements int op, // reduction operation int root, // member of "comm" receiving reduction results RCCE_COMM comm // communication domain within which to reduce ){ int ue, all = 0; // check to make sure root is member of the communicator if (root<0 || root >= comm.size) return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_ID)); return(RCCE_error_return(RCCE_debug_comm, RCCE_reduce_general(inbuf, outbuf, num, type, op, root, all, comm))); }
// DO NOT USE THIS FUNCTION IN NON-GORY MODE UNTIL MALLOC_FREE HAS BEEN IMPLEMENTED int RCCE_comm_free(RCCE_COMM *comm) { printf("DO NOT USE IN NON-GORY MODE UNTIL MALLOC_FREE HAS BEEN IMPLEMENTED\n"); if (comm->initialized != RCCE_COMM_INITIALIZED) return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_COMM_INITIALIZED)); RCCE_flag_free(&(comm->gather)); RCCE_flag_free(&(comm->release)); comm->initialized = RCCE_COMM_NOT_INITIALIZED; return(RCCE_SUCCESS); }
//-------------------------------------------------------------------------------------- // FUNCTION: RCCE_comm_size // returns the number of UEs inside the communicator //-------------------------------------------------------------------------------------- int RCCE_comm_size( RCCE_COMM comm, // communicator int *size // return value (size) ) { if (comm.initialized == RCCE_COMM_INITIALIZED) { *size = comm.size; return(RCCE_SUCCESS); } else return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_COMM_INITIALIZED)); }
//-------------------------------------------------------------------------------------- // FUNCTION: RCCE_comm_rank // returns the rank of the calling UE inside the communicator //-------------------------------------------------------------------------------------- int RCCE_comm_rank( RCCE_COMM comm, // communicator int *rank // return value (rank) ) { if (comm.initialized == RCCE_COMM_INITIALIZED) { *rank = comm.my_rank; return(RCCE_SUCCESS); } else return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_COMM_INITIALIZED)); }
//-------------------------------------------------------------------------------------- // FUNCTION: RCCE_flag_free //-------------------------------------------------------------------------------------- // free space for one flag. Since multiple fit on a single cache line, we only // need to free claimed MPB space when the all existing lines are completely emptied. //-------------------------------------------------------------------------------------- int RCCE_flag_free(RCCE_FLAG *flag) { RCCE_FLAG_LINE *flagp, *flagpminus1 = NULL; int loc; // check wether flag exists, and whether the location field is valid if (!flag || flag->location < 0) return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_UNDEFINED)); // find flag line in globally maintained structure flagp = &RCCE_flags; while (flagp->next && flag->line_address != flagp->line_address) { flagpminus1 = flagp; flagp = flagp->next; } if (flag->line_address != flagp->line_address) return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_UNDEFINED)); // error checking is done flagp->members--; loc = flag->location; #ifdef SINGLEBITFLAGS RCCE_flip_bit_value(flagp->flag+loc/RCCE_FLAGS_PER_BYTE,loc%RCCE_FLAGS_PER_BYTE); #else flagp->flag[flag->location] = (char) ((unsigned int) 0); #endif // something special happens if we've emptied an entire line if (flagp->members==0) { if (flagpminus1) { // there is a predecessor; splice out current flag line from linked list RCCE_free(flagp->line_address); flagpminus1->next = flagp->next; free(flagp); } // if there is a successor but no predecessor, do nothing } // invalidate location field to make sure we won't free again by mistake flag->location = -1; flag->line_address = NULL; return(RCCE_SUCCESS); }
//--------------------------------------------------------------------------------------
// FUNCTION: RCCE_test_flag
//--------------------------------------------------------------------------------------
// Non-blocking test of a flag in the local MPB: read the flag once and report whether
// it currently equals val.
//   flag   - flag to inspect (dereferenced directly)
//   val    - value to compare against
//   result - receives 1 when the flag equals val, 0 otherwise
// Returns RCCE_SUCCESS. With GORY defined, returns an error (through
// RCCE_error_return) for an undefined status value, a null flag, or a flag line that
// does not lie inside the calling UE's comm buffer.
//--------------------------------------------------------------------------------------
int RCCE_test_flag(RCCE_FLAG flag, RCCE_FLAG_STATUS val, int *result) {
  t_vcharp flag_line = (t_vcharp) flag;

#ifdef GORY
  if (val != RCCE_FLAG_UNSET && val != RCCE_FLAG_SET)
    return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_STATUS_UNDEFINED));
  if (!flag_line)
    return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_NOT_ALLOCATED));
  // the flag line must start at or after the beginning of the local comm buffer and
  // end before the end of that buffer
  // NOTE(review): the upper comparison is strict, so a line ending exactly at the end
  // of the buffer is rejected -- confirm that this is intended
  if (flag_line - RCCE_comm_buffer[RCCE_IAM] < 0 ||
      flag_line+RCCE_LINE_SIZE - (RCCE_comm_buffer[RCCE_IAM] + RCCE_BUFF_SIZE) >= 0) {
    return(RCCE_error_return(RCCE_debug_synch,RCCE_ERROR_FLAG_NOT_IN_COMM_BUFFER));
  }
#endif

#ifdef USE_REVERTED_FLAGS
  // with reverted flags the value sits in the last int of the line
  flag = flag + RCCE_LINE_SIZE / sizeof(int) - 1;
#endif

  // flush/invalidate so that the single read below sees the most recent value
#ifdef _OPENMP
  #pragma omp flush
#endif
#ifndef USE_FLAG_EXPERIMENTAL
  RC_cache_invalidate();
#endif

  (*result) = ((*flag) != val) ? 0 : 1;

  return(RCCE_SUCCESS);
}
//-------------------------------------------------------------------------------------- // FUNCTION: RCCE_comm_split // RCCE_comm_split works like MPI_Comm_split, but: // 1. Always uses the default global communicator as the basis, not an // arbitrary communicator // 2. Uses the rank of the UE in the global communicator as the key // 3. Uses a function, operating on UE's global rank, to compute color //-------------------------------------------------------------------------------------- int RCCE_comm_split( int (*color)(int, void *), // function returning a color value for given ue and aux void *aux, // optional user-supplied data structure RCCE_COMM *comm // new communicator ) { int i, my_color, error; if (!comm) return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_COMM_UNDEFINED)); // start with a barrier to make sure all UEs are participating, unless we are still // defining the global communicator; there is no danger in skipping the barrier in // that case, because the global communicator is defined in RCCE_init, which must be // called by all cores before any other RCCE calls if (comm != &RCCE_COMM_WORLD) RCCE_barrier(&RCCE_COMM_WORLD); // determine the size of the communicator my_color = color(RCCE_IAM, aux); comm->size = 0; for (i=0; i<RCCE_NP; i++) { if (color(i, aux) == my_color) { if (i == RCCE_IAM) comm->my_rank = comm->size; comm->member[comm->size++] = i; } } // note: we only need to allocate new synch flags if the communicator has not yet been // initialized. It is legal to overwrite an initialized communcator, in which case the // membership may change, but the same synchronization flags can be used if (comm->initialized == RCCE_COMM_INITIALIZED) return(RCCE_SUCCESS); if(error=RCCE_flag_alloc(&(comm->gather))) return(RCCE_error_return(RCCE_debug_comm,error)); if(error=RCCE_flag_alloc(&(comm->release))) return(RCCE_error_return(RCCE_debug_comm,error)); comm->initialized = RCCE_COMM_INITIALIZED; return(RCCE_SUCCESS); }
//--------------------------------------------------------------------------------------- // FUNCTION: RCCE_allreduce //--------------------------------------------------------------------------------------- // Reduction function which delivers the reduction results to all participating UEs //--------------------------------------------------------------------------------------- int RCCE_allreduce( char *inbuf, // source buffer for reduction datan char *outbuf, // target buffer for reduction data int num, // number of data elements to be reduced int type, // type of data elements int op, // reduction operation RCCE_COMM comm // communication domain within which to reduce ){ int root = 0, all = 1; return(RCCE_error_return(RCCE_debug_comm, RCCE_reduce_general(inbuf, outbuf, num, type, op, root, all, comm))); }
int RCCE_send(char *privbuf, size_t size, int dest) { #ifdef MEASURE_TIME double send_start = RCCE_wtime(); #endif if (dest<0 || dest >= RCCE_NP) { return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_ID)); } errval_t err = send_message(privbuf, size, RC_COREID[dest]); assert(err_is_ok(err)); #ifdef MEASURE_TIME measure_rcce_time += RCCE_wtime() - send_start; #endif #ifdef MEASURE_DATA measure_rcce_data[rcce_curphase][dest] += size; #endif return (RCCE_SUCCESS); }
//--------------------------------------------------------------------------------------
// FUNCTION: RCCE_init
//--------------------------------------------------------------------------------------
// initialize the library and sanitize parameter list
//
// The leading command line arguments are consumed in this order: number of UEs,
// reference clock in GHz, then one physical core ID per UE. argv is advanced past
// them, the executable name is stored in the new (*argv)[0], and *argc is reduced
// accordingly. After that the rank of the calling core is determined, the MPB and
// shared-memory allocators are initialized, the global flag bookkeeping is reset,
// the global communicator is created and, unless GORY is defined, the per-UE sent
// and ready flags are allocated.
// Returns RCCE_SUCCESS, or an error (through RCCE_error_return) when the calling core
// has no rank, or when RPC initialization or a flag allocation fails.
// NOTE(review): no check is made that argv actually holds RCCE_NP+2 extra entries,
// and atoi/atof give no error indication for malformed input.
//--------------------------------------------------------------------------------------
int RCCE_init(
  int *argc,   // pointer to argc, passed in from main program
  char ***argv // pointer to argv, passed in from main program
  ) {

  int i, ue, dummy_offset, loc, error;
#ifdef SCC
  int x, y, z;
  unsigned int physical_lockaddress;
#endif
#ifdef SHMADD
  unsigned int RCCE_SHM_BUFFER_offset ,result, rd_slot_nbr, wr_slot_nbr;
#endif
  // aux argument handed to RCCE_comm_split for the global communicator
  void *nothing = NULL;

#ifdef SCC
  // Copperridge specific initialization...
  InitAPI(0);fflush(0);
#endif

  // save pointer to executable name for later insertion into the argument list
  char *executable_name = (*argv)[0];

  // each ++(*argv) steps to the next command line argument before it is read
  RCCE_NP = atoi(*(++(*argv)));
  RC_REFCLOCKGHZ = atof(*(++(*argv)));

  // put the participating core ids (unsorted) into an array
  for (ue=0; ue<RCCE_NP; ue++) {
    RC_COREID[ue] = atoi(*(++(*argv)));
  }

#ifndef SCC
  // if using the functional emulator, must make sure to have read all command line
  // parameters up to now before overwriting (shifted) first one with executable
  // name; even though argv is made firstprivate, that applies only the pointer to
  // the arguments, not the actual data
  #pragma omp barrier
#endif
  // make sure executable name is as expected
  (*argv)[0] = executable_name;

  RC_MY_COREID = MYCOREID();

  // adjust apparent number of command line arguments, so it will appear to main
  // program that number of UEs, clock frequency, and core ID list were not on
  // command line
  *argc -= RCCE_NP+2;

  // sort array of participating physical core IDs to determine their ranks
  RCCE_qsort((char *)RC_COREID, RCCE_NP, sizeof(int), id_compare);

  // determine rank of calling core
  // NOTE(review): RCCE_IAM is assigned only when a match is found; the "<0" test
  // further down relies on the value it had before this loop -- confirm that it is
  // initialized to a negative value elsewhere
  for (ue=0; ue<RCCE_NP; ue++) {
    if (RC_COREID[ue] == RC_MY_COREID) RCCE_IAM = ue;
  }

#ifdef SHMADD
  // printf("Using SHMADD\n");
  RCCE_SHM_BUFFER_offset = 0x00;
  // RCCE_SHM_BUFFER_offset = 0x3FFFF80;
  // RCCE_SHM_BUFFER_offset = 0x4000000;
  // RCCE_SHM_BUFFER_offset = 0x181000;
  // for 60 consecutive LUT slots starting at 0x80: read the entry, subtract one, and
  // write the result to the slot four positions further on
  rd_slot_nbr=0x80;
  for(i=0; i<60; i++) {
    result = readLUT(rd_slot_nbr);
    result -= 1;
    wr_slot_nbr = rd_slot_nbr + 4;
    writeLUT(wr_slot_nbr,result);
    rd_slot_nbr++;
  }
#endif

  // leave in one reassuring debug print
  if (DEBUG){
    printf("My rank is %d, physical core ID is %d\n", RCCE_IAM, RC_MY_COREID);
    fflush(0);
  }

  if (RCCE_IAM<0)
    return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_CORE_NOT_IN_HOSTFILE));

#ifdef SCC
  // compute and memory map addresses of test&set registers for all participating cores
  for (ue=0; ue<RCCE_NP; ue++) {
    z = Z_PID(RC_COREID[ue]);
    x = X_PID(RC_COREID[ue]);
    y = Y_PID(RC_COREID[ue]);
    // z selects which of the two lock registers (LOCK0 or LOCK1) is used
    physical_lockaddress = CRB_ADDR(x,y) + (z==0 ? LOCK0 : LOCK1);
    virtual_lockaddress[ue] = (t_vcharp) MallocConfigReg(physical_lockaddress);
  }
#endif

  // initialize MPB starting addresses for all participating cores; allow one
  // dummy cache line at front of MPB for fooling write combine buffer in case
  // of single-byte MPB access
  RCCE_fool_write_combine_buffer = RC_COMM_BUFFER_START(RCCE_IAM);

  for (ue=0; ue<RCCE_NP; ue++)
    RCCE_comm_buffer[ue] = RC_COMM_BUFFER_START(ue) + RCCE_LINE_SIZE;

  // gross MPB size is set equal to maximum, less the dummy line reserved above
  RCCE_BUFF_SIZE = RCCE_BUFF_SIZE_MAX - RCCE_LINE_SIZE;

#ifdef RC_POWER_MANAGEMENT
#ifndef SCC
  // always store RPC queue data structure at beginning of usable MPB, so allocatable
  // storage needs to skip it. Only need to do this for functional emulator
  for (ue=0; ue<RCCE_NP; ue++)
    RCCE_comm_buffer[ue] += REGULATOR_LENGTH;
  RCCE_BUFF_SIZE -= REGULATOR_LENGTH;
#endif
#endif

  // initialize RCCE_malloc
  RCCE_malloc_init(RCCE_comm_buffer[RCCE_IAM],RCCE_BUFF_SIZE);
#ifdef SHMADD
  RCCE_shmalloc_init(RC_SHM_BUFFER_START()+RCCE_SHM_BUFFER_offset ,RCCE_SHM_SIZE_MAX);
#ifdef SHMDBG
  printf("\n%d:%s:%d: RCCE_SHM_BUFFER_offset, RCCE_SHM_SIZE_MAX: % x %x\n", RCCE_IAM,
    __FILE__,__LINE__,RCCE_SHM_BUFFER_offset ,RCCE_SHM_SIZE_MAX);
#endif
#else
  RCCE_shmalloc_init(RC_SHM_BUFFER_START(),RCCE_SHM_SIZE_MAX);
#endif

  // initialize the (global) flag bookkeeping data structure
  for (loc=0; loc<RCCE_FLAGS_PER_LINE; loc++)
    RCCE_flags.flag[loc] = (char)((unsigned int)0);
  RCCE_flags.line_address = NULL;
  RCCE_flags.members=0;
  RCCE_flags.next=NULL;

  // create global communicator (equivalent of MPI_COMM_WORLD); this will also allocate
  // the two synchronization flags associated with the global barrier
  // (the return value of RCCE_comm_split is not checked here)
  RCCE_comm_split(RCCE_global_color, nothing, &RCCE_COMM_WORLD);

  // if power management is enabled, initialize more stuff; this includes two more
  // communicators (for voltage and frequency domains), plus two synchronization flags
  // associated with the barrier for each communicator
#ifdef RC_POWER_MANAGEMENT
  if (error=RCCE_init_RPC(RC_COREID, RCCE_IAM, RCCE_NP))
    return(RCCE_error_return(RCCE_debug_RPC,error));
#endif

#ifndef GORY
  // if we use the simplified API, we need to define more flags upfront
  for (ue=0; ue<RCCE_NP; ue++) {
    if (error=RCCE_flag_alloc(&RCCE_sent_flag[ue]))
      return(RCCE_error_return(RCCE_debug_synch,error));
    if (error=RCCE_flag_alloc(&RCCE_ready_flag[ue]))
      return(RCCE_error_return(RCCE_debug_synch,error));
  }
#endif

  return (RCCE_SUCCESS);
}
//--------------------------------------------------------------------------------------
// FUNCTION: RCCE_recv
//--------------------------------------------------------------------------------------
// Receive "size" bytes from the UE with rank "source" into privbuf. The function
// handles incoming messages until the message buffer of the source core is marked
// pending, then consumes that message.
// Returns RCCE_SUCCESS, or RCCE_ERROR_ID (through RCCE_error_return) when source is
// outside [0, RCCE_NP). Failures of the underlying calls and unexpected buffer states
// trip assertions instead of being reported to the caller.
//--------------------------------------------------------------------------------------
int RCCE_recv(char *privbuf, size_t size, int source) {
  errval_t err;

#ifdef MEASURE_TIME
  double recv_start = RCCE_wtime();
#endif

#ifdef RCCE_PERF_MEASURE
  // NOTE(review): handle and d are not referenced by name below; presumably the PERF
  // macro uses them -- confirm against its definition
  dispatcher_handle_t handle = curdispatcher();
  struct dispatcher_shared_generic* d =
    get_dispatcher_shared_generic(handle);
#endif

  if (source<0 || source >= RCCE_NP) {
    return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_ID));
  }

  // message state is kept per physical core ID, not per rank
  int core_id = RC_COREID[source];
  struct msg_buf *mb = &msgbuf[core_id];
#ifdef BULK_TRANSFER_ENABLED
  // publish the destination buffer and its length in the message buffer record
  mb->bulk_ready = true;
  mb->length = size;
  mb->current = 0;
  mb->msg = privbuf;
#endif

  dprintf("%d: R(%lu,%d,%p,%d,%p)\n", my_core_id, size, source, mb, mb->pending, privbuf);

#ifdef BULK_TRANSFER_ENABLED
  // tell the source core that this core is ready to receive "size" bytes
  err = barray[core_id]->tx_vtbl.bulk_recv_ready(barray[core_id], NOP_CONT,
                                                 my_core_id, size);
  assert(err_is_ok(err));
#endif

  PERF(30);

  // keep handling messages until one from the source core is pending
  while(!mb->pending) {
    messages_wait_and_handle_next();
  }

  PERF(31);

  dprintf("%d: msg arrived\n", my_core_id);

  /* if(size <= DEFAULT_UMP_BUFLEN) { */
#ifndef BULK_TRANSFER_ENABLED
  // the received length must match the request exactly; copy the payload out
  assert(size == mb->length);
  memcpy(privbuf, mb->msg, size);
  /* } else { */
#else
  assert(mb->bulk);
#endif
  /* } */
  mb->pending = false;

#ifndef BULK_TRANSFER_ENABLED
  // release the received message storage and send a reply to the source core
  assert(!mb->bulk);
  free(mb->msg);
  PERF(32);
  err = barray[core_id]->tx_vtbl.message_reply(barray[core_id],
                                               NOP_CONT, my_core_id);
  PERF(33);
  assert(err_is_ok(err));
#else
  assert(mb->bulk);
#endif

#ifdef MEASURE_TIME
  measure_rcce_time += RCCE_wtime() - recv_start;
#endif

  return (RCCE_SUCCESS);
}
//--------------------------------------------------------------------------------------
// FUNCTION: RCCE_nb_barrier
//--------------------------------------------------------------------------------------
// non-blocking version of the linear barrier
//
// The function never spins: whenever the barrier cannot complete yet it records a
// resume point in comm->label and returns RCCE_PENDING. The caller is expected to call
// it again with the same communicator; the next call then jumps straight back to the
// recorded point instead of checking in a second time.
//
//   comm->label == 0 : fresh call, check into the barrier
//   comm->label == 1 : root UE resumes polling the gather flags
//   comm->label == 2 : non-root UE resumes polling its release flag
//
// comm->count and comm->cycle carry the polling state across calls.
//
// Returns RCCE_SUCCESS once the barrier has been cleared, RCCE_PENDING if it has to be
// called again, or the value of RCCE_error_return() if a get/put/flag write failed.
// NOTE(review): on such an error comm->label is left unchanged.
//--------------------------------------------------------------------------------------
int RCCE_nb_barrier(RCCE_COMM *comm) {

  // line-sized, line-aligned private scratch buffers for one flag line each
  volatile unsigned char cyclechar[RCCE_LINE_SIZE] __attribute__ ((aligned (RCCE_LINE_SIZE)));
  volatile unsigned char valchar[RCCE_LINE_SIZE] __attribute__ ((aligned (RCCE_LINE_SIZE)));
  int   i, error;
  int   ROOT = 0; // index (within comm->member) of the UE that coordinates the barrier
  // "cycle" and "val" view the first element of the scratch lines; the element type
  // depends on the flag implementation selected at build time
#ifdef USE_FLAG_EXPERIMENTAL
  volatile char *cycle;
  volatile char *val;
  cycle = (volatile char *)cyclechar;
  val   = (volatile char *)valchar;
#else
  volatile int *cycle;
  volatile int *val;
  cycle = (volatile int *)cyclechar;
  val   = (volatile int *)valchar;
#endif

  // resume a previously pending barrier at the point where it returned RCCE_PENDING
  if(comm->label == 1) goto label1;
  if(comm->label == 2) goto label2;

  // fresh call: reset the arrival counter
  comm->count = 0;

  if (RCCE_debug_synch)
    fprintf(STDERR,"UE %d has checked into barrier\n", RCCE_IAM);

#ifdef USE_FAT_BARRIER

  // flip local barrier variable
  // (read the own gather line, logically negate its first element, write it back)
#ifndef USE_FLAG_EXPERIMENTAL
  if ((error = RCCE_get(cyclechar, (t_vcharp)(comm->gather[RCCE_IAM]), RCCE_LINE_SIZE, RCCE_IAM)))
#else
  if ((error = RCCE_get_flag(cyclechar, (t_vcharp)(comm->gather[RCCE_IAM]), RCCE_LINE_SIZE, RCCE_IAM)))
#endif
    return(RCCE_error_return(RCCE_debug_synch,error));
  *cycle = !(*cycle);
#ifndef USE_FLAG_EXPERIMENTAL
  if ((error = RCCE_put((t_vcharp)(comm->gather[RCCE_IAM]), cyclechar, RCCE_LINE_SIZE, RCCE_IAM)))
#else
  if ((error = RCCE_put_flag((t_vcharp)(comm->gather[RCCE_IAM]), cyclechar, RCCE_LINE_SIZE, RCCE_IAM)))
#endif
    return(RCCE_error_return(RCCE_debug_synch,error));
  // fat barrier: additionally publish the flipped line in the root UE's comm buffer,
  // so that the root can poll all gather lines locally
  if ((error = RCCE_put((t_vcharp)(comm->gather[RCCE_IAM]), cyclechar, RCCE_LINE_SIZE, comm->member[ROOT])))
    return(RCCE_error_return(RCCE_debug_synch,error));

  if (RCCE_IAM==comm->member[ROOT]) {
    // read "remote" gather flags; once all equal "cycle" (i.e counter==comm->size),
    // we know all UEs have reached the barrier
    comm->cycle = *cycle; // remember the expected value across RCCE_PENDING returns
label1:
    while (comm->count != comm->size) {
      // skip the first member (#0), because that is the ROOT
      // (the counter restarts at 1 on every sweep, i.e. the root counts itself)
      for (comm->count=i=1; i<comm->size; i++) {
        /* copy flag values out of comm buffer */
#ifndef USE_FLAG_EXPERIMENTAL
        if ((error = RCCE_get(valchar, (t_vcharp)(comm->gather[i]), RCCE_LINE_SIZE, RCCE_IAM)))
#else
        if ((error = RCCE_get_flag(valchar, (t_vcharp)(comm->gather[i]), RCCE_LINE_SIZE, RCCE_IAM)))
#endif
          return(RCCE_error_return(RCCE_debug_synch,error));
        if (*val == comm->cycle) comm->count++;
      }
      // not everybody has arrived: hand control back instead of spinning
      if(comm->count != comm->size) {
        comm->label = 1;
        return(RCCE_PENDING);
      }
    }
    // set release flags
    for (i=1; i<comm->size; i++) {
      if ((error = RCCE_flag_write(&(comm->release), comm->cycle, comm->member[i])))
        return(RCCE_error_return(RCCE_debug_synch,error));
    }
  }
  else {
    int test;
    comm->cycle = *cycle; // remember the expected value across RCCE_PENDING returns
label2:
    // presumably "test" becomes non-zero once the release flag equals comm->cycle
    // NOTE(review): the return value of RCCE_test_flag is not checked
    RCCE_test_flag(comm->release, comm->cycle, &test);
    if(!test) {
      comm->label = 2;
      return(RCCE_PENDING);
    }
  }

  // barrier cleared: the next call starts a fresh barrier
  comm->label = 0;

#else // !USE_FAT_BARRIER

  // flip local barrier variable
  // (every UE uses gather[0] in its own comm buffer as its arrival flag)
#ifndef USE_FLAG_EXPERIMENTAL
  if ((error = RCCE_get(cyclechar, (t_vcharp)(comm->gather[0]), RCCE_LINE_SIZE, RCCE_IAM)))
#else
  if ((error = RCCE_get_flag(cyclechar, (t_vcharp)(comm->gather[0]), RCCE_LINE_SIZE, RCCE_IAM)))
#endif
    return(RCCE_error_return(RCCE_debug_synch,error));
  *cycle = !(*cycle);
#ifndef USE_FLAG_EXPERIMENTAL
  if ((error = RCCE_put((t_vcharp)(comm->gather[0]), cyclechar, RCCE_LINE_SIZE, RCCE_IAM)))
#else
  if ((error = RCCE_put_flag((t_vcharp)(comm->gather[0]), cyclechar, RCCE_LINE_SIZE, RCCE_IAM)))
#endif
    return(RCCE_error_return(RCCE_debug_synch,error));

  if (RCCE_IAM==comm->member[ROOT]) {
    // read "remote" gather flags; once all equal "cycle" (i.e counter==comm->size),
    // we know all UEs have reached the barrier
    comm->cycle = *cycle; // remember the expected value across RCCE_PENDING returns
label1:
    while (comm->count != comm->size) {
      // skip the first member (#0), because that is the ROOT
      // (the counter restarts at 1 on every sweep, i.e. the root counts itself)
      for (comm->count=i=1; i<comm->size; i++) {
        /* copy flag values out of comm buffer */
        // here the gather line is fetched with member i as the UE argument
#ifndef USE_FLAG_EXPERIMENTAL
        if ((error = RCCE_get(valchar, (t_vcharp)(comm->gather[0]), RCCE_LINE_SIZE, comm->member[i])))
#else
        if ((error = RCCE_get_flag(valchar, (t_vcharp)(comm->gather[0]), RCCE_LINE_SIZE, comm->member[i])))
#endif
          return(RCCE_error_return(RCCE_debug_synch,error));
        if (*val == comm->cycle) comm->count++;
      }
      // not everybody has arrived: hand control back instead of spinning
      if(comm->count != comm->size) {
        comm->label = 1;
        return(RCCE_PENDING);
      }
    }
    // set release flags
    for (i=1; i<comm->size; i++) {
      if ((error = RCCE_flag_write(&(comm->release), comm->cycle, comm->member[i])))
        return(RCCE_error_return(RCCE_debug_synch,error));
    }
  }
  else {
    int test;
    comm->cycle = *cycle; // remember the expected value across RCCE_PENDING returns
label2:
    // presumably "test" becomes non-zero once the release flag equals comm->cycle
    // NOTE(review): the return value of RCCE_test_flag is not checked
    RCCE_test_flag(comm->release, comm->cycle, &test);
    if(!test) {
      comm->label = 2;
      return(RCCE_PENDING);
    }
  }

  // barrier cleared: the next call starts a fresh barrier
  comm->label = 0;

#endif // !USE_FAT_BARRIER

  if (RCCE_debug_synch)
    fprintf(STDERR,"UE %d has cleared barrier\n", RCCE_IAM);
  return(RCCE_SUCCESS);
}
//-------------------------------------------------------------------------------------- // FUNCTION: RCCE_put //-------------------------------------------------------------------------------------- // copy data from address "source" in the local MPB or the calling UE's private memory // to address "target" in the remote MPB. We do not test to see if a move from the // calling UE's private memory stays within allocated memory //-------------------------------------------------------------------------------------- int RCCE_put( t_vcharp target, // target buffer, MPB t_vcharp source, // source buffer, MPB or private memory int num_bytes, int ID ) { #ifdef GORY // we only need to do tests in GORY mode; in non-GORY mode ths function is never // called by the user, but only be the library int copy_mode; // check validity of parameters if (!target) return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_TARGET)); if (!source) return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_SOURCE)); if (ID<0 || ID>=RCCE_NP) return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_ID)); if (num_bytes < 0 || num_bytes%RCCE_LINE_SIZE!=0) return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_MESSAGE_LENGTH)); // determine if target data is in MPB; check using local buffer boundaries if (target - RCCE_comm_buffer[RCCE_IAM]>=0 && target+num_bytes - (RCCE_comm_buffer[RCCE_IAM] + RCCE_BUFF_SIZE)<=0) // shift target address to point to remote MPB target = RCCE_comm_buffer[ID]+(target-RCCE_comm_buffer[RCCE_IAM]); else return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_TARGET)); // source can be either local MPB or private memory if (source - RCCE_comm_buffer[RCCE_IAM] >= 0 && source+num_bytes - (RCCE_comm_buffer[RCCE_IAM] + RCCE_BUFF_SIZE)<=0) copy_mode = BOTH_IN_COMM_BUFFER; else copy_mode = SOURCE_IN_PRIVATE_MEMORY; // make sure that if the copy is between locations within the same MPB // there is no overlap between source and target address ranges if ( copy_mode == BOTH_IN_COMM_BUFFER) { if 
(((source-target)>0 && (source+num_bytes-target)<0) || ((target-source)>0 && (target+num_bytes-source)<0)) { return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_DATA_OVERLAP)); } } // ascertain that the start of the buffer is cache line aligned int start_index = target-RCCE_comm_buffer[ID]; if (start_index%RCCE_LINE_SIZE!=0) return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_ALIGNMENT)); // only verify alignment of the target if it is in the MPB if (copy_mode == BOTH_IN_COMM_BUFFER) { start_index = source-RCCE_comm_buffer[ID]; if (start_index%RCCE_LINE_SIZE!=0) return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_ALIGNMENT)); } #else // in non-GORY mode we only need to retain the MPB target shift; we // already know the target is in the MPB, not private memory target = RCCE_comm_buffer[ID]+(target-RCCE_comm_buffer[RCCE_IAM]); #endif // make sure that any data that has been put in our MPB by another UE is visible #ifdef _OPENMP #pragma omp flush #endif // do the actual copy RC_cache_invalidate(); memcpy((void *)target, (void *)source, num_bytes); // flush data to make it visible to all threads; cannot use flush list because it // concerns malloced space #ifdef _OPENMP #pragma omp flush #endif return(RCCE_SUCCESS); }
//--------------------------------------------------------------------------------------
// FUNCTION: RCCE_send_general
//--------------------------------------------------------------------------------------
// Synchronized send function (gory and non-gory mode)
//
// The message is moved through the comm buffer "combuf" in up to three phases:
//   1. (size/chunk) full chunks of "chunk" bytes each; with "pipe" set, every chunk
//      is split into two sub-chunks that are handed over one after the other
//   2. the whole cache lines of what is left (size%chunk)
//   3. a final partial cache line, padded to RCCE_LINE_SIZE via "padline"
// Every transfer is bracketed by a handshake on the "sent" and "ready" flags. The
// order of put, flag write and wait depends on USE_REMOTE_PUT_LOCAL_GET:
//   - defined:   wait for "ready", clear it, put into the comm buffer of "dest",
//                then set "sent" at "dest"
//   - undefined: put into the own comm buffer, set "sent" at "dest", then wait for
//                "ready" and clear it (or, for multicast, run two barriers instead)
//
// Returns RCCE_SUCCESS; with USE_REMOTE_PUT_LOCAL_GET a multicast request is rejected
// through RCCE_error_return().
//
// NOTE(review): the return values of RCCE_put, RCCE_flag_write(_tagged) and
// RCCE_wait_until are not checked anywhere in this function.
// NOTE(review): "size/chunk" and "size%chunk" are evaluated for every non-zero-byte
// message, so "chunk" must not be 0 -- confirm that all callers guarantee this.
// NOTE(review): the pipelined branch does not look at "mcast".
//--------------------------------------------------------------------------------------
static int RCCE_send_general(
  char *privbuf,    // source buffer in local private memory (send buffer)
  t_vcharp combuf,  // intermediate buffer in MPB
  size_t chunk,     // size of MPB available for this message (bytes)
  RCCE_FLAG *ready, // flag indicating whether receiver is ready
  RCCE_FLAG *sent,  // flag indicating whether message has been sent by source
  size_t size,      // size of message (bytes)
  int dest,         // UE that will receive the message
  int copy,         // set to 0 for synchronization only (no copying/sending)
  int pipe,         // use pipelining?
  int mcast,        // multicast?
  void* tag,        // additional tag?
  int len,          // length of additional tag
  RCCE_FLAG *probe  // flag for probing for incoming messages
  ) {

  char padline[RCCE_LINE_SIZE]; // copy buffer, used if message not multiple of line size
  size_t wsize,   // offset within send buffer when putting in "chunk" bytes
       remainder, // bytes remaining to be sent
       nbytes;    // number of bytes to be sent in single RCCE_put call
  char *bufptr;   // running pointer inside privbuf for current location

#ifdef USE_REMOTE_PUT_LOCAL_GET
  // multicast is not available when data is put remotely
  if(mcast) return(RCCE_error_return(1, RCCE_ERROR_NO_MULTICAST_SUPPORT));
#endif

  // announce the message at the destination first, if a probe flag was supplied
  if(probe)
#ifdef USE_TAGGED_FLAGS
    RCCE_flag_write_tagged(probe, RCCE_FLAG_SET, dest, tag, len);
#else
    RCCE_flag_write(probe, RCCE_FLAG_SET, dest);
#endif

#ifdef USE_SYNCH_FOR_ZERO_BYTE
  // synchronize even in case of zero byte messages:
  // only the flag handshake is performed, no data is moved
  if(size == 0) {
#ifdef USE_REMOTE_PUT_LOCAL_GET
    RCCE_wait_until(*ready, RCCE_FLAG_SET);
    RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);
#ifdef USE_TAGGED_FLAGS
    // the tag travels with the "sent" flag unless it already went out with "probe"
    if(!probe)
      RCCE_flag_write_tagged(sent, RCCE_FLAG_SET, dest, tag, len);
    else
#endif
    RCCE_flag_write(sent, RCCE_FLAG_SET, dest);
#else // LOCAL PUT / REMOTE GET: (standard)
#ifdef USE_TAGGED_FLAGS
    // the tag travels with the "sent" flag unless it already went out with "probe"
    if(!probe)
      RCCE_flag_write_tagged(sent, RCCE_FLAG_SET, dest, tag, len);
    else
#endif
    RCCE_flag_write(sent, RCCE_FLAG_SET, dest);
    RCCE_wait_until(*ready, RCCE_FLAG_SET);
    RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);
#endif // !USE_REMOTE_PUT_LOCAL_GET
    return(RCCE_SUCCESS);
  }
#endif // USE_SYNCH_FOR_ZERO_BYTE

  if(!pipe) {
    // send data in units of available chunk size of comm buffer
    for (wsize=0; wsize< (size/chunk)*chunk; wsize+=chunk) {
      bufptr = privbuf + wsize;
      nbytes = chunk;
#ifdef USE_REMOTE_PUT_LOCAL_GET
      RCCE_wait_until(*ready, RCCE_FLAG_SET);
      RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);
      // copy private data to remote comm buffer
      if(copy) RCCE_put(combuf, (t_vcharp) bufptr, nbytes, dest);
#ifdef USE_TAGGED_FLAGS
      // only the first chunk carries the tag, and only if "probe" did not carry it
      if( (wsize == 0) && (!probe) )
        RCCE_flag_write_tagged(sent, RCCE_FLAG_SET, dest, tag, len);
      else
#endif
      RCCE_flag_write(sent, RCCE_FLAG_SET, dest);
#else // LOCAL PUT / REMOTE GET: (standard)
      // copy private data to own comm buffer
      if(copy) RCCE_put(combuf, (t_vcharp) bufptr, nbytes, RCCE_IAM);
      if(!mcast) {
#ifdef USE_TAGGED_FLAGS
        // only the first chunk carries the tag, and only if "probe" did not carry it
        if( (wsize == 0) && (!probe) )
          RCCE_flag_write_tagged(sent, RCCE_FLAG_SET, dest, tag, len);
        else
#endif
        RCCE_flag_write(sent, RCCE_FLAG_SET, dest);
        // wait for the destination to be ready to receive a message
        RCCE_wait_until(*ready, RCCE_FLAG_SET);
        RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);
      }
      else {
        // multicast: two barriers on the global communicator replace the flag
        // handshake (presumably one before and one after the receivers read the
        // data -- confirm against the matching receive function)
        RCCE_TNS_barrier(&RCCE_COMM_WORLD);
        RCCE_TNS_barrier(&RCCE_COMM_WORLD);
      }
#endif // !USE_REMOTE_PUT_LOCAL_GET
    } // for
  }
  else // if(!pipe) -> if(pipe)
  {
    // pipelined version of send/recv:
    // subchunk1/subchunk2 are assigned in the first loop iteration (wsize == 0) and
    // reused unchanged by all later iterations
    size_t subchunk1, subchunk2;

    for(wsize = 0; wsize < (size/chunk)*chunk; wsize+=chunk) {

      if(wsize == 0) {
        // align sub-chunks to cache line granularity:
        // the first one is half a chunk rounded down to whole lines, the second one
        // takes the rest of the chunk
        subchunk1 = ( (chunk / 2) / RCCE_LINE_SIZE ) * RCCE_LINE_SIZE;
        subchunk2 = chunk - subchunk1;
      }

      // first sub-chunk: goes to the start of combuf
      bufptr = privbuf + wsize;
      nbytes = subchunk1;
#ifdef USE_REMOTE_PUT_LOCAL_GET
      RCCE_wait_until(*ready, RCCE_FLAG_SET);
      RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);
      // copy private data chunk 1 to remote comm buffer
      if(copy) RCCE_put(combuf, (t_vcharp) bufptr, nbytes, dest);
#ifdef USE_TAGGED_FLAGS
      // only the very first sub-chunk carries the tag
      if( (wsize == 0) && (!probe) )
        RCCE_flag_write_tagged(sent, RCCE_FLAG_SET, dest, tag, len);
      else
#endif
      RCCE_flag_write(sent, RCCE_FLAG_SET, dest);
#else // LOCAL PUT / REMOTE GET: (standard)
      // copy private data chunk 1 to own comm buffer
      if(copy) RCCE_put(combuf, (t_vcharp) bufptr, nbytes, RCCE_IAM);
#ifdef USE_TAGGED_FLAGS
      // only the very first sub-chunk carries the tag
      if( (wsize == 0) && (!probe) )
        RCCE_flag_write_tagged(sent, RCCE_FLAG_SET, dest, tag, len);
      else
#endif
      RCCE_flag_write(sent, RCCE_FLAG_SET, dest);
      RCCE_wait_until(*ready, RCCE_FLAG_SET);
      RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);
#endif // !USE_REMOTE_PUT_LOCAL_GET

      // second sub-chunk: goes behind the first one in combuf
      bufptr = privbuf + wsize + subchunk1;
      nbytes = subchunk2;
#ifdef USE_REMOTE_PUT_LOCAL_GET
      RCCE_wait_until(*ready, RCCE_FLAG_SET);
      RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);
      // copy private data chunk 2 to remote comm buffer
      if(copy) RCCE_put(combuf + subchunk1, (t_vcharp) bufptr, nbytes, dest);
      RCCE_flag_write(sent, RCCE_FLAG_SET, dest);
#else // LOCAL PUT / REMOTE GET: (standard)
      // copy private data chunk 2 to own comm buffer
      if(copy) RCCE_put(combuf + subchunk1, (t_vcharp) bufptr, nbytes, RCCE_IAM);
      RCCE_flag_write(sent, RCCE_FLAG_SET, dest);
      RCCE_wait_until(*ready, RCCE_FLAG_SET);
      RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);
#endif // !USE_REMOTE_PUT_LOCAL_GET
    } //for
  } // if(pipe)

  // from here on wsize equals (size/chunk)*chunk, so "wsize == 0" below means that
  // no full chunk has been sent
  remainder = size%chunk;
  // if nothing is left over, we are done
  if (!remainder) return(RCCE_SUCCESS);

  // send remainder of data--whole cache lines
  bufptr = privbuf + (size/chunk)*chunk;
  nbytes = remainder - remainder%RCCE_LINE_SIZE;
  if (nbytes) {
#ifdef USE_REMOTE_PUT_LOCAL_GET
    RCCE_wait_until(*ready, RCCE_FLAG_SET);
    RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);
    // copy private data to remote comm buffer
    if(copy) RCCE_put(combuf, (t_vcharp) bufptr, nbytes, dest);
#ifdef USE_TAGGED_FLAGS
    if( (wsize == 0) && (!probe) )
      RCCE_flag_write_tagged(sent, RCCE_FLAG_SET, dest, tag, len);
    else
#endif
    RCCE_flag_write(sent, RCCE_FLAG_SET, dest);
#else // LOCAL PUT / REMOTE GET: (standard)
    // copy private data to own comm buffer
    if(copy) RCCE_put(combuf, (t_vcharp)bufptr, nbytes, RCCE_IAM);
    if(!mcast) {
#ifdef USE_TAGGED_FLAGS
      if( (wsize == 0) && (!probe) )
        RCCE_flag_write_tagged(sent, RCCE_FLAG_SET, dest, tag, len);
      else
#endif
      RCCE_flag_write(sent, RCCE_FLAG_SET, dest);
      // wait for the destination to be ready to receive a message
      RCCE_wait_until(*ready, RCCE_FLAG_SET);
      RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);
    }
    else {
      RCCE_TNS_barrier(&RCCE_COMM_WORLD);
      RCCE_TNS_barrier(&RCCE_COMM_WORLD);
    }
#endif // !USE_REMOTE_PUT_LOCAL_GET
  } // if(nbytes)

  remainder = remainder%RCCE_LINE_SIZE;
  if (!remainder) return(RCCE_SUCCESS);

  // remainder is less than a cache line. This must be copied into appropriately sized
  // intermediate space before it can be sent to the receiver
  // NOTE(review): only the first "remainder" bytes of padline are written; the rest of
  // the line is transferred uninitialized
  // NOTE(review): with USE_TAGGED_FLAGS, a message shorter than "chunk" that has both
  // whole lines and a partial line writes the tagged "sent" flag twice (wsize is still
  // 0 here) -- confirm that the receiver expects this
  bufptr = privbuf + (size/chunk)*chunk + nbytes;
  nbytes = RCCE_LINE_SIZE;
  if(copy) {
#ifdef COPPERRIDGE
    memcpy_scc(padline,bufptr,remainder);
#else
    memcpy(padline,bufptr,remainder);
#endif
  }

#ifdef USE_REMOTE_PUT_LOCAL_GET
  RCCE_wait_until(*ready, RCCE_FLAG_SET);
  RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);
  // copy private data to remote comm buffer
  if(copy) RCCE_put(combuf, (t_vcharp) padline, nbytes, dest);
#ifdef USE_TAGGED_FLAGS
  if( (wsize == 0) && (!probe) )
    RCCE_flag_write_tagged(sent, RCCE_FLAG_SET, dest, tag, len);
  else
#endif
  RCCE_flag_write(sent, RCCE_FLAG_SET, dest);
#else // LOCAL PUT / REMOTE GET: (standard)
  // copy private data to own comm buffer
  if(copy) RCCE_put(combuf, (t_vcharp)padline, nbytes, RCCE_IAM);
  if(!mcast) {
#ifdef USE_TAGGED_FLAGS
    if( (wsize == 0) && (!probe) )
      RCCE_flag_write_tagged(sent, RCCE_FLAG_SET, dest, tag, len);
    else
#endif
    RCCE_flag_write(sent, RCCE_FLAG_SET, dest);
    // wait for the destination to be ready to receive a message
    RCCE_wait_until(*ready, RCCE_FLAG_SET);
    RCCE_flag_write(ready, RCCE_FLAG_UNSET, RCCE_IAM);
  }
  else {
    RCCE_TNS_barrier(&RCCE_COMM_WORLD);
    RCCE_TNS_barrier(&RCCE_COMM_WORLD);
  }
#endif // !USE_REMOTE_PUT_LOCAL_GET

  return(RCCE_SUCCESS);
}