Esempio n. 1
0
/*
 * Allocate a synchronization flag on behalf of iRCCE.
 *
 * The build configuration selects the allocator: with SINGLEBITFLAGS defined
 * the request is forwarded to RCCE_flag_alloc(); otherwise it is forwarded to
 * iRCCE_flag_alloc_tagged().
 *
 * flag: flag object handed through to the selected allocator
 * Returns whatever the selected allocator returns.
 */
int iRCCE_flag_alloc(RCCE_FLAG *flag)
{
#ifdef SINGLEBITFLAGS
  return RCCE_flag_alloc(flag);
#else
  return iRCCE_flag_alloc_tagged(flag);
#endif
}
Esempio n. 2
0
//--------------------------------------------------------------------------------------
// FUNCTION: RCCE_comm_split
// Builds a communicator out of all UEs that share the caller's color.
// Works like MPI_Comm_split, with these differences:
// 1. The default global communicator is always the basis, never an
//    arbitrary communicator
// 2. The rank of the UE in the global communicator serves as the key
// 3. The color is computed by a user function operating on a UE's global rank
//
// Returns RCCE_SUCCESS, or the value produced by RCCE_error_return when comm
// is a null pointer or when allocating one of the two synchronization flags
// fails.
//--------------------------------------------------------------------------------------
int RCCE_comm_split(
  int (*color)(int, void *), // function returning a color value for given ue and aux
  void *aux,                 // optional user-supplied data structure
  RCCE_COMM *comm            // new communicator
  ) {

  int ue, own_color, status;

  if (!comm)
    return RCCE_error_return(RCCE_debug_comm, RCCE_ERROR_COMM_UNDEFINED);

  // synchronize all UEs first so that everybody is known to participate. The
  // barrier is skipped while the global communicator itself is being defined:
  // that happens in RCCE_init, which all cores must call before any other
  // RCCE call, so nothing is lost by skipping it
  if (comm != &RCCE_COMM_WORLD)
    RCCE_barrier(&RCCE_COMM_WORLD);

  // color of the calling UE; every UE with the same color becomes a member
  own_color = color(RCCE_IAM, aux);

  // walk all UEs in global rank order, collecting members and noting the
  // position at which the calling UE is inserted (its rank in the new
  // communicator)
  comm->size = 0;
  for (ue = 0; ue < RCCE_NP; ue++) {
    if (color(ue, aux) != own_color)
      continue;
    if (ue == RCCE_IAM)
      comm->my_rank = comm->size;
    comm->member[comm->size] = ue;
    comm->size++;
  }

  // new synch flags are only needed for a communicator that has not been
  // initialized before. Overwriting an initialized communicator is legal; its
  // membership may change, but the existing synchronization flags are reused
  if (comm->initialized != RCCE_COMM_INITIALIZED) {
    status = RCCE_flag_alloc(&(comm->gather));
    if (status)
      return RCCE_error_return(RCCE_debug_comm, status);

    status = RCCE_flag_alloc(&(comm->release));
    if (status)
      return RCCE_error_return(RCCE_debug_comm, status);

    comm->initialized = RCCE_COMM_INITIALIZED;
  }

  return RCCE_SUCCESS;
}
Esempio n. 3
0
/*
 * Stencil example using one-sided RCCE_put/RCCE_get and flags.
 *
 * Every UE owns a strip `a` of NXNY floats. In each iteration a UE pushes one
 * row of NX floats into the comm buffer of its successor (MY_ID+1) and one
 * into the comm buffer of its predecessor (MY_ID-1), signals each neighbor
 * through a flag, waits for its own flags, copies the received rows out of
 * its comm buffer, and then applies the stencil update to its strip.
 *
 * argv[1] (optional): number of iterations, default 10.
 * Returns 0 on success, 1 if the comm buffer or a flag cannot be allocated;
 * exits with status 1 if NX is not a multiple of 8.
 *
 * NOTE(review): NX, NXNY, NXNY1, NXNY2, O1..O5 and W1..W5 are defined outside
 * this function; their exact values are not visible here.
 */
int RCCE_APP(int argc, char **argv) {

  /* statically allocated space sits in off-chip private memory          */
  float     a[NXNY], *buff;
  int       i, offset, iter=10;
  int       MY_ID;
  int       NTILES1;        /* rank of the last UE                        */
  double    time = 0.0;     /* only measured and printed by UE 0          */
  RCCE_FLAG flag0, flag1;

  RCCE_init(&argc, &argv);
  
  NTILES1 = RCCE_num_ues()-1;
  MY_ID = RCCE_ue();

  if (NX%8) {
    printf("Grid width should be multiple of 8: %d\n", NX);
    exit(1);
  }
  if (argc>1) iter=atoi(*++argv);
  if (MY_ID==0) printf("Executing %d iterations\n", iter);

  /* allocate space on the comm buffer: two rows of NX floats, one per    */
  /* neighbor; bail out like the flag allocations below if that fails     */
  buff = (float *) RCCE_malloc(sizeof(float)*2*NX); 
  if (!buff) return(1);
  /* Allocate flags to coordinate comm.                                  */
  if (RCCE_flag_alloc(&flag0)) return(1);
  if (RCCE_flag_alloc(&flag1)) return(1);

  /* initialize array a on all tiles; this stuffs a into private caches  */
  for (offset=0,       i=0; i<NXNY; i++) a[i+offset] = 0.0;
  /* first UE: first row set to 1; last UE: row at NXNY1 set to 2        */
  if (MY_ID == 0) 
     for (offset=0,    i=0; i<NX;   i++) a[i+offset] = 1.0;
  if (MY_ID == NTILES1) 
     for (offset=NXNY1,i=0; i<NX;   i++) a[i+offset] = 2.0;

  /* put in a barrier so everybody can be sure to have initialized       */
  RCCE_barrier(&RCCE_COMM_WORLD);

  /* main loop */

  if (MY_ID==0) time = RCCE_wtime();

  while ((iter--)>0){
  
    /* start with copying fringe data to neighboring tiles               */
    if (MY_ID!=NTILES1) {
      /* Initialize neighbor flag to zero                                */
      RCCE_flag_write(&flag0, RCCE_FLAG_UNSET, MY_ID+1); 
      /* copy private data to shared comm buffer of neighbor             */
      RCCE_put((t_vcharp)(&buff[0]), (t_vcharp)(&a[NXNY2]), NX*sizeof(float), MY_ID+1);
      RCCE_flag_write(&flag0, RCCE_FLAG_SET, MY_ID+1); 
    }
    if (MY_ID != 0) {
      /* Initialize neighbor flag to zero                                */
      RCCE_flag_write(&flag1, RCCE_FLAG_UNSET, MY_ID-1); 
      /* copy private data to shared comm buffer of neighbor             */
      RCCE_put((t_vcharp)(&buff[NX]), (t_vcharp)(&a[NX]), NX*sizeof(float), MY_ID-1);
      RCCE_flag_write(&flag1, RCCE_FLAG_SET, MY_ID-1); 
    }

    /* Make sure the data has been recvd and copy data out of buffer(s)  */
    if (MY_ID!=NTILES1) {
      /* row written by successor lands in the second half of buff       */
      RCCE_wait_until(flag1, RCCE_FLAG_SET);
      RCCE_get((t_vcharp)(&a[NXNY1]), (t_vcharp)(&buff[NX]), NX*sizeof(float),MY_ID);
    }

    if (MY_ID!=0) {
      /* row written by predecessor lands in the first half of buff      */
      RCCE_wait_until(flag0, RCCE_FLAG_SET);
      RCCE_get((t_vcharp)(&a[0]), (t_vcharp)(&buff[0]), NX*sizeof(float),MY_ID);
    }

    /* apply the stencil operation (in place)                            */
    for (i=0; i<NXNY2; i++) {
      a[i+O3] +=
         W1*a[i+O1] + W2*a[i+O2] + W3*a[i+O3] + W4*a[i+O4] + W5*a[i+O5];
    }
  }
  RCCE_barrier(&RCCE_COMM_WORLD);
  if (MY_ID==0) { 
    time = RCCE_wtime()-time;
  }

  /* print result strip by strip; this would not be done on RC */
  for (int id=0; id<=NTILES1; id++) {
    /* the barrier per round lets the UEs print one after another        */
    RCCE_barrier(&RCCE_COMM_WORLD);
    if (MY_ID==id) {
      /* the first UE starts at 0, the last UE ends at NXNY              */
      int start = NX; int end = NXNY1;
      if (MY_ID==0) start = 0;
      if (MY_ID == NTILES1) end = NXNY;
      for (offset=0, i=start; i<end; i++) {
        if (!(i%NX)) printf("\n");
//        comment out next line and uncomment subsequent three to print error
        printf("%f ",a[i+offset]);
//        int jj=i/NX+(MY_ID*(NY-1));
//        double aexact=1.0+(double)jj/((NTILES1+1)*(NY-1));
//        printf("%f ",a[i+offset]-aexact);
      }
    }
  }
  RCCE_barrier(&RCCE_COMM_WORLD);
  if (MY_ID==0) { 
    printf("\nTotal time: %lf\n", time);
  }

  RCCE_finalize();

  return(0);
}
Esempio n. 4
0
int RCCE_APP(int argc, char **argv){

  int ID, ID_nb, ID_donor, nrounds, error, strlength;
  RCCE_FLAG flag_sent, flag_ack;

  double *cbuffer, *buffer, sum;
  char msg[RCCE_MAX_ERROR_STRING];

  RCCE_init(&argc, &argv);

  ID = RCCE_ue();
  ID_nb = (ID+1)%RCCE_num_ues();
  ID_donor = (ID-1+RCCE_num_ues())%RCCE_num_ues();

  if (argc != 2) {
    if (ID==0) printf("Executable requires one parameter (number of rounds): %d\n",argc-1);
    return(1);
  }
  nrounds = atoi(*++argv);
  if (nrounds < 0) {
    if (ID==0) printf("Number of rounds should be non-negative: %d\n", nrounds);
    return(1);
  }

  /* allocate private memory and comm buffer space */
  buffer  = (double *) malloc(BUFSIZE*sizeof(double));
  if (!buffer) printf("Mark 01: Failed to allocate private buffer on proc %d\n", ID);
  cbuffer = (double *) RCCE_malloc(BUFSIZE*sizeof(double));
  if (!buffer) printf("Mark 02:RCCE failed to allocate %d doubles on proc %d\n",
      BUFSIZE, ID);

  /* initialize buffer with UE-specific data  */
  for (int i=0; i<BUFSIZE; i++) buffer[i] = (double)(ID+1+i);
  sum = 0.0;  
  for (int i=0; i<BUFSIZE; i++) sum += buffer[i];
  printf("Initial sum on UE %03d equals %f\n", ID, sum);

  /* create and initialize flag variables */
  if (error=RCCE_flag_alloc(&flag_sent))
    printf("Mark 03a: Could not allocate flag_sent on %d, error=%d\n", ID, error);
  if (error=RCCE_flag_alloc(&flag_ack))
    printf("Mark 03b: Could not allocate flag_ack on %d, error=%d\n", ID, error);

  if(error=RCCE_flag_write(&flag_sent, RCCE_FLAG_UNSET, ID)) 
    printf("Mark 04: Could not initialize flag_sent on %d, error=%d\n", ID, error);
  if(error=RCCE_flag_write(&flag_ack, RCCE_FLAG_SET, ID_donor)) 
    printf("Mark 05: Could not initialize flag_ack on %d, error=%d\n", ID_donor, error);

  for (int round=0; round<nrounds; round++) {

    int size = BUFSIZE*sizeof(double);
    RCCE_wait_until(flag_ack, RCCE_FLAG_SET);
    RCCE_flag_write(&flag_ack, RCCE_FLAG_UNSET, ID);
    RCCE_put((t_vcharp)cbuffer, (t_vcharp)buffer, size, ID_nb);
    RCCE_flag_write(&flag_sent, RCCE_FLAG_SET, ID_nb);

    RCCE_wait_until(flag_sent, RCCE_FLAG_SET);
    RCCE_flag_write(&flag_sent, RCCE_FLAG_UNSET, ID);
    RCCE_get((t_vcharp)buffer, (t_vcharp)cbuffer, size, ID);
    RCCE_flag_write(&flag_ack, RCCE_FLAG_SET, ID_donor);

  }

  /* compute local sum */
  sum = 0.0;
  for (int i=0; i<BUFSIZE; i++) sum += buffer[i];
  printf("Final sum on UE %03d equals %f\n", ID, sum);

  RCCE_finalize();

  return(0);
}
Esempio n. 5
0
//--------------------------------------------------------------------------------------
// FUNCTION: RCCE_init
//--------------------------------------------------------------------------------------
// initialize the library and sanitize parameter list
//--------------------------------------------------------------------------------------
int RCCE_init(
  int *argc,   // pointer to argc, passed in from main program
  char ***argv // pointer to argv, passed in from main program
  ) {

  int i, ue, dummy_offset, loc, error;
#ifdef SCC
  int x, y, z;
  unsigned int physical_lockaddress;
#endif

#ifdef SHMADD
  unsigned int RCCE_SHM_BUFFER_offset ,result, rd_slot_nbr, wr_slot_nbr;
#endif
  void *nothing = NULL;
  
#ifdef SCC
  // Copperridge specific initialization...
  InitAPI(0);fflush(0);
#endif  

  // save pointer to executable name for later insertion into the argument list
  char *executable_name = (*argv)[0];

  RCCE_NP        = atoi(*(++(*argv)));  
  RC_REFCLOCKGHZ = atof(*(++(*argv)));  

  // put the participating core ids (unsorted) into an array             
  for (ue=0; ue<RCCE_NP; ue++) {
    RC_COREID[ue] = atoi(*(++(*argv)));
  }

#ifndef SCC
  // if using the functional emulator, must make sure to have read all command line 
  // parameters up to now before overwriting (shifted) first one with executable
  // name; even though argv is made firstprivate, that applies only the pointer to 
  // the arguments, not the actual data
  #pragma omp barrier
#endif
  // make sure executable name is as expected                 
  (*argv)[0] = executable_name;

  RC_MY_COREID = MYCOREID();

  // adjust apparent number of command line arguments, so it will appear to main 
  // program that number of UEs, clock frequency, and core ID list were not on
  // command line        
  *argc -= RCCE_NP+2;

  // sort array of participating phyical core IDs to determine their ranks
  RCCE_qsort((char *)RC_COREID, RCCE_NP, sizeof(int), id_compare);

  // determine rank of calling core
  for (ue=0; ue<RCCE_NP; ue++) {
    if (RC_COREID[ue] == RC_MY_COREID) RCCE_IAM = ue;
  }

#ifdef SHMADD
//   printf("Using SHMADD\n");
     RCCE_SHM_BUFFER_offset     = 0x00;
//   RCCE_SHM_BUFFER_offset     = 0x3FFFF80;
//   RCCE_SHM_BUFFER_offset   = 0x4000000;
//   RCCE_SHM_BUFFER_offset   = 0x181000;
   rd_slot_nbr=0x80;
   for(i=0; i<60; i++) {
     result  = readLUT(rd_slot_nbr);
     result -= 1;
     wr_slot_nbr = rd_slot_nbr + 4;
     writeLUT(wr_slot_nbr,result);
     rd_slot_nbr++;
   }
#endif

  // leave in one reassuring debug print
  if (DEBUG){
    printf("My rank is %d, physical core ID is %d\n", RCCE_IAM, RC_MY_COREID);
    fflush(0);
  }

  if (RCCE_IAM<0)
    return(RCCE_error_return(RCCE_debug_comm,RCCE_ERROR_CORE_NOT_IN_HOSTFILE));

#ifdef SCC
  // compute and memory map addresses of test&set registers for all participating cores 
  for (ue=0; ue<RCCE_NP; ue++) { 
    z = Z_PID(RC_COREID[ue]);
    x = X_PID(RC_COREID[ue]);
    y = Y_PID(RC_COREID[ue]);
    physical_lockaddress = CRB_ADDR(x,y) + (z==0 ? LOCK0 : LOCK1);
    virtual_lockaddress[ue] = (t_vcharp) MallocConfigReg(physical_lockaddress);
  }
#endif

  // initialize MPB starting addresses for all participating cores; allow one
  // dummy cache line at front of MPB for fooling write combine buffer in case
  // of single-byte MPB access
  RCCE_fool_write_combine_buffer = RC_COMM_BUFFER_START(RCCE_IAM);

  for (ue=0; ue<RCCE_NP; ue++) 
    RCCE_comm_buffer[ue] = RC_COMM_BUFFER_START(ue) + RCCE_LINE_SIZE;

  // gross MPB size is set equal to maximum
  RCCE_BUFF_SIZE = RCCE_BUFF_SIZE_MAX - RCCE_LINE_SIZE;

#ifdef RC_POWER_MANAGEMENT
#ifndef SCC
  // always store RPC queue data structure at beginning of usable MPB, so allocatable
  // storage needs to skip it. Only need to do this for functional emulator
  for (ue=0; ue<RCCE_NP; ue++) 
    RCCE_comm_buffer[ue] += REGULATOR_LENGTH;
  RCCE_BUFF_SIZE -= REGULATOR_LENGTH;
#endif
#endif

  // initialize RCCE_malloc
  RCCE_malloc_init(RCCE_comm_buffer[RCCE_IAM],RCCE_BUFF_SIZE);
#ifdef SHMADD

  RCCE_shmalloc_init(RC_SHM_BUFFER_START()+RCCE_SHM_BUFFER_offset ,RCCE_SHM_SIZE_MAX);
#ifdef SHMDBG
  printf("\n%d:%s:%d: RCCE_SHM_BUFFER_offset, RCCE_SHM_SIZE_MAX: % x %x\n", RCCE_IAM, 
    __FILE__,__LINE__,RCCE_SHM_BUFFER_offset ,RCCE_SHM_SIZE_MAX);
#endif
#else
  RCCE_shmalloc_init(RC_SHM_BUFFER_START(),RCCE_SHM_SIZE_MAX);
#endif

  // initialize the (global) flag bookkeeping data structure
  for (loc=0; loc<RCCE_FLAGS_PER_LINE; loc++) 
    RCCE_flags.flag[loc] = (char)((unsigned int)0);
  RCCE_flags.line_address = NULL;
  RCCE_flags.members=0;
  RCCE_flags.next=NULL;

  // create global communicator (equivalent of MPI_COMM_WORLD); this will also allocate 
  // the two synchronization flags associated with the global barrier 
  RCCE_comm_split(RCCE_global_color, nothing, &RCCE_COMM_WORLD);

  // if power management is enabled, initialize more stuff; this includes two more 
  // communicators (for voltage and frequency domains), plus two synchronization flags
  // associated with the barrier for each communicator       
#ifdef RC_POWER_MANAGEMENT
  if (error=RCCE_init_RPC(RC_COREID, RCCE_IAM, RCCE_NP)) 
       return(RCCE_error_return(RCCE_debug_RPC,error));
#endif

#ifndef GORY
  // if we use the simplified API, we need to define more flags upfront  
  for (ue=0; ue<RCCE_NP; ue++) {
    if (error=RCCE_flag_alloc(&RCCE_sent_flag[ue]))
      return(RCCE_error_return(RCCE_debug_synch,error));
    if (error=RCCE_flag_alloc(&RCCE_ready_flag[ue]))
      return(RCCE_error_return(RCCE_debug_synch,error));
  }

#endif

  return (RCCE_SUCCESS);
}