/*
 * Allocate a synchronization flag.
 *
 * The implementation is chosen at compile time: when SINGLEBITFLAGS is
 * defined the request is forwarded to RCCE_flag_alloc(); otherwise it is
 * forwarded to iRCCE_flag_alloc_tagged().
 *
 * flag: flag object handed through unchanged to the selected allocator.
 *
 * Returns whatever the selected allocator returns.
 */
int iRCCE_flag_alloc(RCCE_FLAG *flag)
{
#ifdef SINGLEBITFLAGS
  return RCCE_flag_alloc(flag);
#else
  return iRCCE_flag_alloc_tagged(flag);
#endif
}
//-------------------------------------------------------------------------------------- // FUNCTION: RCCE_comm_split // RCCE_comm_split works like MPI_Comm_split, but: // 1. Always uses the default global communicator as the basis, not an // arbitrary communicator // 2. Uses the rank of the UE in the global communicator as the key // 3. Uses a function, operating on UE's global rank, to compute color //-------------------------------------------------------------------------------------- int RCCE_comm_split( int (*color)(int, void *), // function returning a color value for given ue and aux void *aux, // optional user-supplied data structure RCCE_COMM *comm // new communicator ) { int i, my_color, error; if (!comm) return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_COMM_UNDEFINED)); // start with a barrier to make sure all UEs are participating, unless we are still // defining the global communicator; there is no danger in skipping the barrier in // that case, because the global communicator is defined in RCCE_init, which must be // called by all cores before any other RCCE calls if (comm != &RCCE_COMM_WORLD) RCCE_barrier(&RCCE_COMM_WORLD); // determine the size of the communicator my_color = color(RCCE_IAM, aux); comm->size = 0; for (i=0; i<RCCE_NP; i++) { if (color(i, aux) == my_color) { if (i == RCCE_IAM) comm->my_rank = comm->size; comm->member[comm->size++] = i; } } // note: we only need to allocate new synch flags if the communicator has not yet been // initialized. It is legal to overwrite an initialized communcator, in which case the // membership may change, but the same synchronization flags can be used if (comm->initialized == RCCE_COMM_INITIALIZED) return(RCCE_SUCCESS); if(error=RCCE_flag_alloc(&(comm->gather))) return(RCCE_error_return(RCCE_debug_comm,error)); if(error=RCCE_flag_alloc(&(comm->release))) return(RCCE_error_return(RCCE_debug_comm,error)); comm->initialized = RCCE_COMM_INITIALIZED; return(RCCE_SUCCESS); }
int RCCE_APP(int argc, char **argv) { /* statically allocated space sits in off-chip private memory */ float a[NXNY], *buff; int i, offset, iter=10, tile; int MY_ID; int NTILES1; double time; RCCE_FLAG flag0, flag1; RCCE_init(&argc, &argv); NTILES1 = RCCE_num_ues()-1; MY_ID = RCCE_ue(); if (NX%8) { printf("Grid width should be multiple of 8: %d\n", NX); exit(1); } if (argc>1) iter=atoi(*++argv); if (MY_ID==0) printf("Executing %d iterations\n", iter); /* allocate space on the comm buffer */ buff = (float *) RCCE_malloc(sizeof(float)*2*NX); /* Allocate flags to coordinate comm. */ if (RCCE_flag_alloc(&flag0)) return(1); if (RCCE_flag_alloc(&flag1)) return(1); /* initialize array a on all tiles; this stuffs a into private caches */ for (offset=0, i=0; i<NXNY; i++) a[i+offset] = 0.0; if (MY_ID == 0) for (offset=0, i=0; i<NX; i++) a[i+offset] = 1.0; if (MY_ID == NTILES1) for (offset=NXNY1,i=0; i<NX; i++) a[i+offset] = 2.0; /* put in a barrier so everybody can be sure to have initialized */ RCCE_barrier(&RCCE_COMM_WORLD); /* main loop */ if (MY_ID==0) time = RCCE_wtime(); while ((iter--)>0){ /* start with copying fringe data to neighboring tiles */ if (MY_ID!=NTILES1) { /* Initialize neighbor flag to zero */ RCCE_flag_write(&flag0, RCCE_FLAG_UNSET, MY_ID+1); /* copy private data to shared comm buffer of neighbor */ RCCE_put((t_vcharp)(&buff[0]), (t_vcharp)(&a[NXNY2]), NX*sizeof(float), MY_ID+1); RCCE_flag_write(&flag0, RCCE_FLAG_SET, MY_ID+1); } if (MY_ID != 0) { /* Initialize neighbor flag to zero */ RCCE_flag_write(&flag1, 0, MY_ID-1); /* copy private data to shared comm buffer of neighbor */ RCCE_put((t_vcharp)(&buff[NX]), (t_vcharp)(&a[NX]), NX*sizeof(float), MY_ID-1); RCCE_flag_write(&flag1, RCCE_FLAG_SET, MY_ID-1); } /* Make sure the data has been recvd and copy data out of buffer(s) */ if (MY_ID!=NTILES1) { RCCE_wait_until(flag1, RCCE_FLAG_SET); RCCE_get((t_vcharp)(&a[NXNY1]), (t_vcharp)(&buff[NX]), NX*sizeof(float),MY_ID); } if (MY_ID!=0) { 
RCCE_wait_until(flag0, RCCE_FLAG_SET); RCCE_get((t_vcharp)(&a[0]), (t_vcharp)(&buff[0]), NX*sizeof(float),MY_ID); } /* apply the stencil operation */ for (i=0; i<NXNY2; i++) { a[i+O3] += W1*a[i+O1] + W2*a[i+O2] + W3*a[i+O3] + W4*a[i+O4] + W5*a[i+O5]; } } RCCE_barrier(&RCCE_COMM_WORLD); if (MY_ID==0) { time = RCCE_wtime()-time; } /* print result strip by strip; this would not be done on RC */ for (int id=0; id<=NTILES1; id++) { RCCE_barrier(&RCCE_COMM_WORLD); if (MY_ID==id) { int start = NX; int end = NXNY1; if (MY_ID==0) start = 0; if (MY_ID == NTILES1) end = NXNY; for (offset=0, i=start; i<end; i++) { if (!(i%NX)) printf("\n"); // comment out next line and uncomment subsequent three to print error printf("%f ",a[i+offset]); // int jj=i/NX+(MY_ID*(NY-1)); // double aexact=1.0+(double)jj/((NTILES1+1)*(NY-1)); // printf("%f ",a[i+offset]-aexact); } } } RCCE_barrier(&RCCE_COMM_WORLD); if (MY_ID==0) { printf("\nTotal time: %lf\n", time); } RCCE_finalize(); return(0); }
int RCCE_APP(int argc, char **argv){ int ID, ID_nb, ID_donor, nrounds, error, strlength; RCCE_FLAG flag_sent, flag_ack; double *cbuffer, *buffer, sum; char msg[RCCE_MAX_ERROR_STRING]; RCCE_init(&argc, &argv); ID = RCCE_ue(); ID_nb = (ID+1)%RCCE_num_ues(); ID_donor = (ID-1+RCCE_num_ues())%RCCE_num_ues(); if (argc != 2) { if (ID==0) printf("Executable requires one parameter (number of rounds): %d\n",argc-1); return(1); } nrounds = atoi(*++argv); if (nrounds < 0) { if (ID==0) printf("Number of rounds should be non-negative: %d\n", nrounds); return(1); } /* allocate private memory and comm buffer space */ buffer = (double *) malloc(BUFSIZE*sizeof(double)); if (!buffer) printf("Mark 01: Failed to allocate private buffer on proc %d\n", ID); cbuffer = (double *) RCCE_malloc(BUFSIZE*sizeof(double)); if (!buffer) printf("Mark 02:RCCE failed to allocate %d doubles on proc %d\n", BUFSIZE, ID); /* initialize buffer with UE-specific data */ for (int i=0; i<BUFSIZE; i++) buffer[i] = (double)(ID+1+i); sum = 0.0; for (int i=0; i<BUFSIZE; i++) sum += buffer[i]; printf("Initial sum on UE %03d equals %f\n", ID, sum); /* create and initialize flag variables */ if (error=RCCE_flag_alloc(&flag_sent)) printf("Mark 03a: Could not allocate flag_sent on %d, error=%d\n", ID, error); if (error=RCCE_flag_alloc(&flag_ack)) printf("Mark 03b: Could not allocate flag_ack on %d, error=%d\n", ID, error); if(error=RCCE_flag_write(&flag_sent, RCCE_FLAG_UNSET, ID)) printf("Mark 04: Could not initialize flag_sent on %d, error=%d\n", ID, error); if(error=RCCE_flag_write(&flag_ack, RCCE_FLAG_SET, ID_donor)) printf("Mark 05: Could not initialize flag_ack on %d, error=%d\n", ID_donor, error); for (int round=0; round<nrounds; round++) { int size = BUFSIZE*sizeof(double); RCCE_wait_until(flag_ack, RCCE_FLAG_SET); RCCE_flag_write(&flag_ack, RCCE_FLAG_UNSET, ID); RCCE_put((t_vcharp)cbuffer, (t_vcharp)buffer, size, ID_nb); RCCE_flag_write(&flag_sent, RCCE_FLAG_SET, ID_nb); RCCE_wait_until(flag_sent, 
RCCE_FLAG_SET); RCCE_flag_write(&flag_sent, RCCE_FLAG_UNSET, ID); RCCE_get((t_vcharp)buffer, (t_vcharp)cbuffer, size, ID); RCCE_flag_write(&flag_ack, RCCE_FLAG_SET, ID_donor); } /* compute local sum */ sum = 0.0; for (int i=0; i<BUFSIZE; i++) sum += buffer[i]; printf("Final sum on UE %03d equals %f\n", ID, sum); RCCE_finalize(); return(0); }
//-------------------------------------------------------------------------------------- // FUNCTION: RCCE_init //-------------------------------------------------------------------------------------- // initialize the library and sanitize parameter list //-------------------------------------------------------------------------------------- int RCCE_init( int *argc, // pointer to argc, passed in from main program char ***argv // pointer to argv, passed in from main program ) { int i, ue, dummy_offset, loc, error; #ifdef SCC int x, y, z; unsigned int physical_lockaddress; #endif #ifdef SHMADD unsigned int RCCE_SHM_BUFFER_offset ,result, rd_slot_nbr, wr_slot_nbr; #endif void *nothing = NULL; #ifdef SCC // Copperridge specific initialization... InitAPI(0);fflush(0); #endif // save pointer to executable name for later insertion into the argument list char *executable_name = (*argv)[0]; RCCE_NP = atoi(*(++(*argv))); RC_REFCLOCKGHZ = atof(*(++(*argv))); // put the participating core ids (unsorted) into an array for (ue=0; ue<RCCE_NP; ue++) { RC_COREID[ue] = atoi(*(++(*argv))); } #ifndef SCC // if using the functional emulator, must make sure to have read all command line // parameters up to now before overwriting (shifted) first one with executable // name; even though argv is made firstprivate, that applies only the pointer to // the arguments, not the actual data #pragma omp barrier #endif // make sure executable name is as expected (*argv)[0] = executable_name; RC_MY_COREID = MYCOREID(); // adjust apparent number of command line arguments, so it will appear to main // program that number of UEs, clock frequency, and core ID list were not on // command line *argc -= RCCE_NP+2; // sort array of participating phyical core IDs to determine their ranks RCCE_qsort((char *)RC_COREID, RCCE_NP, sizeof(int), id_compare); // determine rank of calling core for (ue=0; ue<RCCE_NP; ue++) { if (RC_COREID[ue] == RC_MY_COREID) RCCE_IAM = ue; } #ifdef SHMADD // printf("Using 
SHMADD\n"); RCCE_SHM_BUFFER_offset = 0x00; // RCCE_SHM_BUFFER_offset = 0x3FFFF80; // RCCE_SHM_BUFFER_offset = 0x4000000; // RCCE_SHM_BUFFER_offset = 0x181000; rd_slot_nbr=0x80; for(i=0; i<60; i++) { result = readLUT(rd_slot_nbr); result -= 1; wr_slot_nbr = rd_slot_nbr + 4; writeLUT(wr_slot_nbr,result); rd_slot_nbr++; } #endif // leave in one reassuring debug print if (DEBUG){ printf("My rank is %d, physical core ID is %d\n", RCCE_IAM, RC_MY_COREID); fflush(0); } if (RCCE_IAM<0) return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_CORE_NOT_IN_HOSTFILE)); #ifdef SCC // compute and memory map addresses of test&set registers for all participating cores for (ue=0; ue<RCCE_NP; ue++) { z = Z_PID(RC_COREID[ue]); x = X_PID(RC_COREID[ue]); y = Y_PID(RC_COREID[ue]); physical_lockaddress = CRB_ADDR(x,y) + (z==0 ? LOCK0 : LOCK1); virtual_lockaddress[ue] = (t_vcharp) MallocConfigReg(physical_lockaddress); } #endif // initialize MPB starting addresses for all participating cores; allow one // dummy cache line at front of MPB for fooling write combine buffer in case // of single-byte MPB access RCCE_fool_write_combine_buffer = RC_COMM_BUFFER_START(RCCE_IAM); for (ue=0; ue<RCCE_NP; ue++) RCCE_comm_buffer[ue] = RC_COMM_BUFFER_START(ue) + RCCE_LINE_SIZE; // gross MPB size is set equal to maximum RCCE_BUFF_SIZE = RCCE_BUFF_SIZE_MAX - RCCE_LINE_SIZE; #ifdef RC_POWER_MANAGEMENT #ifndef SCC // always store RPC queue data structure at beginning of usable MPB, so allocatable // storage needs to skip it. 
Only need to do this for functional emulator for (ue=0; ue<RCCE_NP; ue++) RCCE_comm_buffer[ue] += REGULATOR_LENGTH; RCCE_BUFF_SIZE -= REGULATOR_LENGTH; #endif #endif // initialize RCCE_malloc RCCE_malloc_init(RCCE_comm_buffer[RCCE_IAM],RCCE_BUFF_SIZE); #ifdef SHMADD RCCE_shmalloc_init(RC_SHM_BUFFER_START()+RCCE_SHM_BUFFER_offset ,RCCE_SHM_SIZE_MAX); #ifdef SHMDBG printf("\n%d:%s:%d: RCCE_SHM_BUFFER_offset, RCCE_SHM_SIZE_MAX: % x %x\n", RCCE_IAM, __FILE__,__LINE__,RCCE_SHM_BUFFER_offset ,RCCE_SHM_SIZE_MAX); #endif #else RCCE_shmalloc_init(RC_SHM_BUFFER_START(),RCCE_SHM_SIZE_MAX); #endif // initialize the (global) flag bookkeeping data structure for (loc=0; loc<RCCE_FLAGS_PER_LINE; loc++) RCCE_flags.flag[loc] = (char)((unsigned int)0); RCCE_flags.line_address = NULL; RCCE_flags.members=0; RCCE_flags.next=NULL; // create global communicator (equivalent of MPI_COMM_WORLD); this will also allocate // the two synchronization flags associated with the global barrier RCCE_comm_split(RCCE_global_color, nothing, &RCCE_COMM_WORLD); // if power management is enabled, initialize more stuff; this includes two more // communicators (for voltage and frequency domains), plus two synchronization flags // associated with the barrier for each communicator #ifdef RC_POWER_MANAGEMENT if (error=RCCE_init_RPC(RC_COREID, RCCE_IAM, RCCE_NP)) return(RCCE_error_return(RCCE_debug_RPC,error)); #endif #ifndef GORY // if we use the simplified API, we need to define more flags upfront for (ue=0; ue<RCCE_NP; ue++) { if (error=RCCE_flag_alloc(&RCCE_sent_flag[ue])) return(RCCE_error_return(RCCE_debug_synch,error)); if (error=RCCE_flag_alloc(&RCCE_ready_flag[ue])) return(RCCE_error_return(RCCE_debug_synch,error)); } #endif return (RCCE_SUCCESS); }