Example #1
0
File: p2p.c Project: ParRes/Kernels
/*
 * UPC pipeline kernel driver.
 *
 * Command line: <# iterations> <first array dimension> <second array dimension>
 *
 * The first grid dimension (m) is split into contiguous ranges, one per
 * thread; every thread holds all n lines of its range.  For each line j the
 * thread computes
 *     ARRAY(i,j) = ARRAY(i-1,j) + ARRAY(i,j-1) - ARRAY(i-1,j-1)
 * over its range and then lets thread MYTHREAD+1 proceed with that line
 * (semaphores when USE_BUPC_EXT is set, the current_max_line[] counters
 * otherwise).  The sweep is repeated iterations+1 times; the first pass is a
 * warm-up and is not timed.
 *
 * Thread THREADS-1 prints all messages, verifies the top right corner value
 * and reports the rate.  Invalid arguments end the run through
 * upc_global_exit(EXIT_FAILURE); a failed verification ends thread THREADS-1
 * with exit(EXIT_FAILURE), success with exit(EXIT_SUCCESS).
 *
 * ARRAY, debug, wtime, in_arrays, current_max_line, times, allflags and the
 * shared_2d_array_* helpers are defined outside this function.
 */
int main(int argc, char ** argv) {

  long   m, n;            /* grid dimensions                                     */
  int    i, j, iter;      /* dummies                                             */
  int    iterations;      /* number of times to run the pipeline algorithm       */
  double pipeline_time,   /* timing parameters                                   */
         avgtime, max_time;
  double epsilon = 1.e-8; /* error tolerance                                     */
  double corner_val;      /* verification value at top right corner of grid      */
  double *vector;/* array holding grid values                           */
  long   total_length;    /* total required length to store grid values          */

  /*******************************************************************************
  ** process and test input parameters
  ********************************************************************************/

  if(MYTHREAD == THREADS-1){
    printf("Parallel Research Kernels version %s\n", PRKVERSION);
    printf("UPC pipeline execution on 2D grid\n");
  }

  if (argc != 4){
    if(MYTHREAD == THREADS-1){
      printf("Usage: %s <# iterations> <first array dimension> ", *argv);
      printf("<second array dimension>\n");
    }
    upc_global_exit(EXIT_FAILURE);
  }

  iterations  = atoi(*++argv);
  if (iterations < 1){
    if(MYTHREAD == THREADS-1)
      printf("ERROR: iterations must be >= 1 : %d \n",iterations);
    upc_global_exit(EXIT_FAILURE);
  }

  m  = atol(*++argv);
  n  = atol(*++argv);

  if (m < 1 || n < 1){
    if(MYTHREAD == THREADS-1)
      /* m and n are long, so they must be printed with %ld */
      printf("ERROR: grid dimensions must be positive: %ld, %ld \n", m, n);
    upc_global_exit(EXIT_FAILURE);
  }

  if(MYTHREAD == THREADS-1){
    printf("Number of threads         = %d\n", THREADS);
    printf("Grid sizes                = %ld, %ld\n", m, n);
    printf("Number of iterations      = %d\n", iterations);
#if USE_BUPC_EXT
    printf("Using Berkeley UPC extensions\n");
#endif
  }

  /*********************************************************************
  ** Allocate memory for input and output matrices
  *********************************************************************/
#if USE_BUPC_EXT
  /* publish this thread's semaphore, then look up the one of the next thread */
  bupc_sem_t *myflag = bupc_sem_alloc(BUPC_SEM_INTEGER | BUPC_SEM_MPRODUCER);
  upc_barrier;
  allflags[MYTHREAD] = myflag;
  upc_barrier;
  bupc_sem_t *mypeer = allflags[(MYTHREAD+1) % THREADS];
#endif

  /* split m over the threads; the first `leftover` threads get one extra entry */
  long segment_size = m / THREADS;
  int leftover = m % THREADS;
  int myoffsetx, sizex;

  if(MYTHREAD < leftover){
    myoffsetx = (segment_size + 1) * MYTHREAD;
    sizex = segment_size + 1;
  }else{
    myoffsetx = (segment_size + 1) * leftover + segment_size * (MYTHREAD - leftover);
    sizex = segment_size;
  }

#if USE_BUPC_EXT
  /* extend the range by one entry on the left so that the previous thread can
     deposit its last column directly into this thread's array               */
  if(MYTHREAD != 0){
    myoffsetx -= 1;
    sizex += 1;
  }
#endif

  int sizey = n;
  int myoffsety = 0;

  upc_barrier;

  debug("Allocating arrays (%d, %d), offset (%d, %d)", sizex, sizey, myoffsetx, myoffsety);
  local_shared_block_ptrs in_array  = shared_2d_array_alloc(sizex, sizey, myoffsetx, myoffsety);

  in_arrays[MYTHREAD] = in_array;

  double **in_array_private = shared_2d_array_to_private(in_array, sizex, sizey, myoffsetx, myoffsety);

  /* thread 0 never has to wait for a predecessor; all others start at line 0 */
  if(MYTHREAD == 0)
    current_max_line[MYTHREAD] = sizey;
  else
    current_max_line[MYTHREAD] = 0;

  upc_barrier;

  /*********************************************************************
  ** Initialize the matrices
  *********************************************************************/

  /* clear the array                                                             */
  for (j=0; j<n; j++)
    for (i=myoffsetx; i<myoffsetx + sizex; i++)
      ARRAY(i, j) = 0.0;

  /* set boundary values (bottom and left side of grid)                          */
  if(MYTHREAD == 0)
    for (j=0; j<n; j++)
      ARRAY(0, j) = (double) j;

  for (i=myoffsetx; i<myoffsetx + sizex; i++)
    ARRAY(i, 0) = (double) i;

  upc_barrier;

  for (iter = 0; iter<=iterations; iter++){
    /* start timer after a warmup iteration */
    if (iter == 1)
      pipeline_time = wtime();
    if(MYTHREAD == 0)
      debug("start it %d, %f", iter, ARRAY(0, 0));

    if(MYTHREAD != THREADS - 1)  // Send the element in line 0
      in_arrays[MYTHREAD + 1][0][myoffsetx + sizex -1] = ARRAY(myoffsetx + sizex - 1, 0);

    for (j=1; j<n; j++) {
#if USE_BUPC_EXT
      /* wait until the previous thread has delivered line j */
      if(MYTHREAD > 0){
        bupc_sem_wait(myflag);
      }

      for (i=myoffsetx+1; i<myoffsetx + sizex; i++)
        ARRAY(i, j) = ARRAY(i-1, j) + ARRAY(i, j-1) - ARRAY(i-1, j-1);

      /* deposit the last column of line j at the next thread and signal it */
      if(MYTHREAD != THREADS - 1){
        in_arrays[MYTHREAD + 1][j][myoffsetx + sizex -1] = ARRAY(myoffsetx + sizex - 1, j);

        bupc_sem_post(mypeer);
      }
#else
      /* spin until the previous thread has announced line j */
      while(j > current_max_line[MYTHREAD]) // Normally not necessary: bupc_poll();
        ;

      /* first column of the range needs the previous thread's last column */
      if(MYTHREAD > 0)
        ARRAY(myoffsetx, j) = in_arrays[MYTHREAD - 1][j][myoffsetx-1] + ARRAY(myoffsetx, j-1) - in_arrays[MYTHREAD-1][j-1][myoffsetx-1];

      for (i=myoffsetx+1; i<myoffsetx + sizex; i++)
        ARRAY(i, j) = ARRAY(i-1, j) + ARRAY(i, j-1) - ARRAY(i-1, j-1);

      /* announce line j to the next thread */
      if(MYTHREAD < THREADS - 1)
        current_max_line[MYTHREAD+1] = j;

#endif
    }

    /* copy top right corner value to bottom left corner to create dependency; we
       need a barrier to make sure the latest value is used. This also guarantees
       that the flags for the next iteration (if any) are not getting clobbered  */
    if(MYTHREAD == 0)
      current_max_line[MYTHREAD] = sizey;
    else
      current_max_line[MYTHREAD] = 0;

    if(MYTHREAD == THREADS - 1){
      in_arrays[0][0][0] = -ARRAY(m-1, n-1);
    }
    upc_barrier;
  }

  pipeline_time = wtime() - pipeline_time;
  times[MYTHREAD] = pipeline_time;

  upc_barrier;

  // Compute max_time over all threads (thread 0 included)
  if(MYTHREAD == THREADS - 1){
    max_time = times[0];
    for(i=1; i<THREADS; i++){
      if(max_time < times[i])
        max_time = times[i];
    }
  }

  /*******************************************************************************
  ** Analyze and output results.
  ********************************************************************************/

  /* verify correctness, using top right value;                                  */
  if( MYTHREAD == THREADS - 1){
    corner_val = (double)((iterations+1)*(n+m-2));
    if (fabs(ARRAY(m-1,n-1)-corner_val)/corner_val > epsilon) {
      printf("ERROR: checksum %lf does not match verification value %lf\n",
          ARRAY(m-1, n-1), corner_val);
      exit(EXIT_FAILURE);
    }
#if VERBOSE
    printf("checksum %lf verification value %lf\n",
        ARRAY(m-1, n-1), corner_val);
    printf("Solution validates; verification value = %lf\n", corner_val);
#else
    printf("Solution validates\n");
#endif
    avgtime = max_time/iterations;
    printf("Rate (MFlops/s): %lf Avg time (s): %lf\n",
           1.0E-06 * 2 * ((double)(m-1)*(double)(n-1))/avgtime, avgtime);
    exit(EXIT_SUCCESS);
  }
}
Example #2
0
/*
 * UPC matrix transpose kernel driver (B = A^T).
 *
 * Command line: <# iterations> <matrix order> [tile size]
 *
 * The order N must be a multiple of THREADS.  Every thread works on the
 * columns x in [MYTHREAD*sizex, (MYTHREAD+1)*sizex) of all N rows, with
 * sizex = N / THREADS.  In each pass a thread fetches one sizex*sizex block
 * from every thread with upc_memget into its buffer array and transposes it
 * into its output array, optionally in tiles of tile_size.  The transpose is
 * run num_iterations+1 times; the first pass is a warm-up and is not timed.
 *
 * Thread 0 prints all messages and the rate.  Invalid arguments end the run
 * through upc_global_exit(EXIT_FAILURE); data mismatches are reported through
 * die().
 *
 * debug, die, wtime, MIN, in_arrays, out_arrays, buf_arrays and the
 * shared_2d_array_* helpers are defined outside this function.
 * NOTE(review): die() and debug() are assumed to take printf-style
 * arguments -- confirm against their definitions.
 */
int main(int argc, char ** argv) {
  int    N;
  int    tile_size=32;  /* default tile size for tiling of local transpose */
  int    num_iterations;/* number of times to do the transpose             */
  int    tiling;        /* boolean: true if tiling is used                 */
  double total_bytes;   /* combined size of matrices                       */
  double start_time,    /* timing parameters                               */
         end_time, avgtime;

  /*********************************************************************
  ** read and test input parameters
  *********************************************************************/

  if(argc != 3 && argc != 4){
    if(MYTHREAD == 0)
      printf("Usage: %s <# iterations> <matrix order> [tile size]\n", *argv);
    upc_global_exit(EXIT_FAILURE);
  }

  num_iterations = atoi(*++argv);
  if(num_iterations < 1){
    if(MYTHREAD == 0)
      printf("ERROR: iterations must be >= 1 : %d \n", num_iterations);
    upc_global_exit(EXIT_FAILURE);
  }

  N = atoi(*++argv);
  /* reject 0 as well: the message promises an order greater than 0 */
  if(N < 1){
    if(MYTHREAD == 0)
      printf("ERROR: Matrix Order must be greater than 0 : %d \n", N);
    upc_global_exit(EXIT_FAILURE);
  }

  if (argc == 4)
    tile_size = atoi(*++argv);

  /*a non-positive tile size means no tiling of the local transpose */
  tiling = (tile_size > 0) && (tile_size < N);
  if(!tiling)
    tile_size = N;

  int sizex = N / THREADS;
  if(N % THREADS != 0) {
    if(MYTHREAD == 0)
      printf("N %% THREADS != 0\n");
    upc_global_exit(EXIT_FAILURE);
  }
  int sizey = N;

  if(MYTHREAD == 0) {
    printf("Parallel Research Kernels version %s\n", PRKVERSION);
    printf("UPC matrix transpose: B = A^T\n");
    printf("Number of threads    = %d\n", THREADS);
    printf("Matrix order         = %d\n", N);
    printf("Number of iterations = %d\n", num_iterations);
    if (tiling)
          printf("Tile size            = %d\n", tile_size);
    else  printf("Untiled\n");
  }

  /*********************************************************************
  ** Allocate memory for input and output matrices
  *********************************************************************/

  /* computed in double so that large N cannot overflow an int */
  total_bytes = 2.0 * sizeof(double) * N * N;

  int myoffsetx = MYTHREAD * sizex;
  int myoffsety = 0;

  upc_barrier;

  debug("Allocating arrays (%d, %d), offset (%d, %d)", sizex, sizey, myoffsetx, myoffsety);
  local_shared_block_ptrs in_array  = shared_2d_array_alloc(sizex, sizey, myoffsetx, myoffsety);
  local_shared_block_ptrs out_array = shared_2d_array_alloc(sizex, sizey, myoffsetx, myoffsety);
  local_shared_block_ptrs buf_array = shared_2d_array_alloc(sizex, sizey, myoffsetx, myoffsety);

  in_arrays[MYTHREAD] = in_array;
  out_arrays[MYTHREAD] = out_array;
  buf_arrays[MYTHREAD] = buf_array;

  double **in_array_private = shared_2d_array_to_private(in_array, sizex, sizey, myoffsetx, myoffsety);
  double **out_array_private = shared_2d_array_to_private(out_array, sizex, sizey, myoffsetx, myoffsety);
  double **buf_array_private = shared_2d_array_to_private(buf_array, sizex, sizey, myoffsetx, myoffsety);

  upc_barrier;

  /*********************************************************************
  ** Initialize the matrices
  *********************************************************************/
  for(int y=myoffsety; y<myoffsety + sizey; y++){
    for(int x=myoffsetx; x<myoffsetx + sizex; x++){
      in_array_private[y][x] = (double) (x+N*y);
      out_array[y][x] = -1.0;
    }
  }
  upc_barrier;

  /* sanity check of the initial values through the private pointers */
  for(int y=myoffsety; y<myoffsety + sizey; y++){
    for(int x=myoffsetx; x<myoffsetx + sizex; x++){
      if(in_array_private[y][x] !=(double) (x+N*y))
        /* the expected value is an int expression; %f needs a double */
        die("x=%d y=%d in_array=%f != %f", x, y, in_array[y][x], (double)(x+N*y));
      if(out_array_private[y][x] != -1.0)
        die("out_array_private error");
    }
  }

  /*********************************************************************
  ** Transpose
  *********************************************************************/
  /* byte count of one sizex*sizex block, computed in size_t to avoid int overflow */
  size_t transfer_size = (size_t)sizex * sizex * sizeof(double);
  if(MYTHREAD == 0)
    debug("transfer size = %zu", transfer_size);

  for(int iter=0; iter<=num_iterations; iter++){
    /* start timer after a warmup iteration */
    if(iter == 1){
      upc_barrier;
      start_time = wtime();
    }

    for(int i=0; i<THREADS; i++){
      /* start with the own block and walk round-robin over the threads */
      int local_blk_id = (MYTHREAD + i) % THREADS;
      int remote_blk_id = MYTHREAD;
      int remote_thread = local_blk_id;

      /* NOTE(review): one bulk get of sizex*sizex doubles presumes that the
         rows of a block are stored back to back by shared_2d_array_alloc --
         confirm against its definition                                      */
      upc_memget(&buf_array_private[local_blk_id * sizex][myoffsetx],
                  &in_arrays[remote_thread][remote_blk_id * sizex][remote_thread * sizex], transfer_size);

#define OUT_ARRAY(x,y) out_array_private[local_blk_id * sizex + x][myoffsetx + y]
#define BUF_ARRAY(x,y) buf_array_private[local_blk_id * sizex + x][myoffsetx + y]

      if(!tiling){
        for(int x=0; x<sizex; x++){
          for(int y=0; y<sizex; y++){
            OUT_ARRAY(x,y) = BUF_ARRAY(y,x);
          }
        }
      }
      else{
        for(int x=0; x<sizex; x+=tile_size){
          for(int y=0; y<sizex; y+=tile_size){
            for(int bx=x; bx<MIN(sizex, x+tile_size); bx++){
              for(int by=y; by<MIN(sizex, y+tile_size); by++){
                OUT_ARRAY(bx,by) = BUF_ARRAY(by,bx);
              }
            }
          }
        }
      }
    }
    upc_barrier;
  }

  upc_barrier;
  end_time = wtime();

  /*********************************************************************
  ** Analyze and output results.
  *********************************************************************/
  for(int y=myoffsety; y<myoffsety + sizey; y++){
    for(int x=myoffsetx; x<myoffsetx + sizex; x++){
      if(in_array_private[y][x] != (double)(x+ N*y))
        die("Error in input: x=%d y=%d", x, y);
      if(out_array_private[y][x] != (double)(y + N*x))
        die("x=%d y=%d out_array=%f != %f   %d %d", x, y, out_array[y][x], (double)(y + N*x), (int)(out_array[y][x]) % N, (int)(out_array[y][x]) / N);
    }
  }

  if(MYTHREAD == 0){
    printf("Solution validates\n");
    /* total_bytes already holds 2*N*N*sizeof(double) without int overflow */
    avgtime = (end_time - start_time) / num_iterations;
    double rate = total_bytes / avgtime * 1.0E-06;
    printf("Rate (MB/s): %lf Avg time (s): %lf\n",rate, avgtime);
  }
}
Example #3
0
/*
 * UPC stencil kernel driver.
 *
 * Command line: <# iterations> <array dimension> [x_tiles]
 *
 * The n x n grid is cut into Num_procsx x Num_procsy tiles, one per thread.
 * Every tile is allocated with RADIUS extra cells on each side.  In each of
 * the iterations+1 passes (the first one is a warm-up and is not timed) a
 * thread fetches the ghost cells from its four neighbours, applies the
 * stencil along both axes to its part of the grid and then adds 1.0 to its
 * own input cells.  Afterwards thread 0 sums the per-thread norms, compares
 * the result with the reference value and prints the rate.
 *
 * bail_out, prk_malloc, partially_privatize, wtime, IN, OUT, WEIGHT, ABS,
 * DTYPE, FSTR, RADIUS, COEFX, COEFY, EPSILON and the shared arrays
 * (in_arrays, out_arrays, thread_*, times, norms) are defined outside this
 * function.
 * NOTE(review): most argument checks call bail_out() on thread 0 only, so
 * bail_out() presumably ends the whole run -- confirm against its definition.
 */
int main(int argc, char ** argv) {

  int    n;               /* linear grid dimension */
  int    i, j, ii, jj, it, jt, iter;  /* dummies */
  double norm,            /* L1 norm of solution */
         reference_norm;
  double f_active_points; /* interior of grid with respect to stencil */
  DTYPE  flops;           /* floating point ops per iteration */
  int    iterations;      /* number of times to run the algorithm */
  double stencil_time,    /* timing parameters */
         avgtime, max_time;
  int    stencil_size;    /* number of points in stencil */
  DTYPE  weight[2*RADIUS+1][2*RADIUS+1]; /* weights of points in the stencil */
  int    istart;    /* first global x index of the tile of the calling thread */
  int    jstart;    /* first global y index of the tile of the calling thread */
  int    Num_procsx, Num_procsy; /* number of tiles in x and y direction */

  /*******************************************************************************
  ** process and test input parameters
  ********************************************************************************/
  if(MYTHREAD == 0){
    printf("Parallel Research Kernels version %s\n", PRKVERSION);
    printf("UPC stencil execution on 2D grid\n");
    fflush(stdout);
  }

  if (argc != 4 && argc != 3)
    if(MYTHREAD == 0)
      bail_out("Usage: %s <# iterations> <array dimension> [x_tiles]\n", *argv);

  iterations  = atoi(*++argv);
  if (iterations < 1)
    if(MYTHREAD == 0)
      bail_out("iterations must be >= 1 : %d", iterations);

  n  = atoi(*++argv);

  if (n < 1)
    if(MYTHREAD == 0)
      bail_out("grid dimension must be positive: %d", n);

  /* the optional third argument fixes the number of tiles in x */
  if (argc == 4)
    Num_procsx  = atoi(*++argv);
  else
    Num_procsx = 0;

  if(Num_procsx < 0)
    if(MYTHREAD == 0)
      bail_out("Number of tiles in the x-direction should be positive (got: %d)", Num_procsx);

  if(Num_procsx > THREADS)
    if(MYTHREAD == 0)
      bail_out("Number of tiles in the x-direction should be < THREADS (got: %d)", Num_procsx);

  /* Num_procsx=0 refers to automated calculation of division on each coordinates like MPI code */
  if(Num_procsx == 0){
    /* take the largest divisor of THREADS that does not exceed sqrt(THREADS+1) */
    for (Num_procsx=(int) (sqrt(THREADS+1)); Num_procsx>0; Num_procsx--) {
      if (!(THREADS%Num_procsx)) {
        Num_procsy = THREADS/Num_procsx;
        break;
      }
    }
  }
  else {
    Num_procsy = THREADS / Num_procsx;
  }

  if(RADIUS < 1)
    if(MYTHREAD == 0)
      bail_out("Stencil radius %d should be positive", RADIUS);

  if(2*RADIUS +1 > n)
    if(MYTHREAD == 0)
      bail_out("Stencil radius %d exceeds grid size %d", RADIUS, n);

  /* a user supplied Num_procsx has to divide THREADS exactly */
  if(Num_procsx * Num_procsy != THREADS){
    bail_out("Num_procsx * Num_procsy != THREADS");
  }

  /* compute amount of space required for input and solution arrays             */

  /* position of the calling thread in the Num_procsx x Num_procsy tile grid */
  int my_IDx = MYTHREAD % Num_procsx;
  int my_IDy = MYTHREAD / Num_procsx;

  /* split n over the tiles in x; the first `leftover` tiles get one extra column */
  int blockx = n / Num_procsx;
  int leftover = n % Num_procsx;
  if (my_IDx < leftover) {
    istart = (blockx + 1) * my_IDx;
    blockx += 1;
  }
  else {
    istart = (blockx+1) * leftover + blockx * (my_IDx-leftover);
  }

  if (blockx == 0)
    bail_out("No work to do on x-direction!");

  /* same split in y; the first `leftover` tiles get one extra row */
  int blocky = n / Num_procsy;
  leftover = n % Num_procsy;
  if (my_IDy < leftover) {
    jstart = (blocky+1) * my_IDy;
    blocky += 1;
  }
  else {
    jstart = (blocky+1) * leftover + blocky * (my_IDy-leftover);
  }

  if (blocky == 0)
    bail_out("No work to do on y-direction!");

  if(blockx < RADIUS || blocky < RADIUS) {
    bail_out("blockx < RADIUS || blocky < RADIUS");
  }

  /* the allocated tile starts RADIUS cells before the owned block ...        */
  int myoffsetx = istart - RADIUS;
  int myoffsety = jstart - RADIUS;
  thread_offsetx[MYTHREAD] = myoffsetx;
  thread_offsety[MYTHREAD] = myoffsety;

  /* ... and carries RADIUS extra cells on both sides in each direction       */
  int sizex = blockx + 2*RADIUS;
  int sizey = blocky + 2*RADIUS;
  thread_sizex[MYTHREAD] = sizex;
  thread_sizey[MYTHREAD] = sizey;

  upc_barrier;

  local_shared_block_ptrs in_array  = shared_2d_array_alloc(sizex, sizey, myoffsetx, myoffsety);
  local_shared_block_ptrs out_array = shared_2d_array_alloc(sizex, sizey, myoffsetx, myoffsety);

  in_arrays[MYTHREAD] = in_array;
  out_arrays[MYTHREAD] = out_array;

  DTYPE **in_array_private = shared_2d_array_to_private(in_array, sizex, sizey, myoffsetx, myoffsety);
  DTYPE **out_array_private = shared_2d_array_to_private(out_array, sizex, sizey, myoffsetx, myoffsety);

  /* every thread has published its arrays once this barrier is passed */
  upc_barrier;

  private_in_arrays = prk_malloc(sizeof(private_shared_block_ptrs) * THREADS);
  if(private_in_arrays == NULL)
    bail_out("Cannot allocate private_in_arrays");

  private_out_arrays = prk_malloc(sizeof(private_shared_block_ptrs) * THREADS);
  if(private_out_arrays == NULL)
    bail_out("Cannot allocate private_out_arrays");

  /* per-thread table of handles to the arrays of all threads */
  for(int thread=0; thread<THREADS; thread++){
    private_in_arrays[thread] = partially_privatize(in_arrays[thread], thread);
    private_out_arrays[thread] = partially_privatize(out_arrays[thread], thread);
  }

  /* initialize the input and output arrays */
  for(int y=myoffsety; y<myoffsety + sizey; y++){
    for(int x=myoffsetx; x<myoffsetx + sizex; x++){
      in_array_private[y][x] = COEFX*x + COEFY*y;
      out_array[y][x] = 0.;
    }
  }
  upc_barrier;

  /* sanity check of the initial input values */
  for(int y=myoffsety; y<myoffsety + sizey; y++){
    for(int x=myoffsetx; x<myoffsetx + sizex; x++){
      if(in_array_private[y][x] != COEFX*x + COEFY*y)
        bail_out("x=%d y=%d in_array=%f != %f", x, y, in_array[y][x], COEFX*x + COEFY*y);
    }
  }

  /* fill the stencil weights to reflect a discrete divergence operator */
  for (jj=-RADIUS; jj<=RADIUS; jj++)
    for (ii=-RADIUS; ii<=RADIUS; ii++)
      WEIGHT(ii, jj) = (DTYPE)0.0;

  /* only the entries on the two axes are non-zero; the centre stays 0 */
  stencil_size = 4*RADIUS+1;
  for (ii=1; ii<=RADIUS; ii++) {
    WEIGHT(0, ii) = WEIGHT( ii,0) =  (DTYPE) (1.0/(2.0*ii*RADIUS));
    WEIGHT(0,-ii) = WEIGHT(-ii,0) = -(DTYPE) (1.0/(2.0*ii*RADIUS));
  }

  if(MYTHREAD == 0){
    printf("Number of threads      = %d\n", THREADS);
    printf("Grid size              = %d\n", n);
    printf("Radius of stencil      = %d\n", RADIUS);
    printf("Tiles in x/y-direction = %d/%d\n", Num_procsx, Num_procsy);
#if DOUBLE
    printf("Data type              = double precision\n");
#else
    printf("Data type              = single precision\n");
#endif
#if LOOPGEN
    printf("Script used to expand stencil loop body\n");
#else
    printf("Compact representation of stencil loop body\n");
#endif
    printf("Number of iterations   = %d\n", iterations);
  }

  upc_barrier;

  /* index range of the owned block, i.e. the tile without its extra cells */
  int startx = myoffsetx + RADIUS;
  int endx = myoffsetx + sizex - RADIUS;

  int starty = myoffsety + RADIUS;
  int endy = myoffsety + sizey - RADIUS;

  /* tiles on the edge of the tile grid leave out their outermost RADIUS
     columns/rows, for which the stencil would reach beyond the grid     */
  if(my_IDx == 0)
    startx += RADIUS;

  if(my_IDx == Num_procsx - 1)
    endx -= RADIUS;

  if(my_IDy == 0)
    starty += RADIUS;

  if(my_IDy == Num_procsy - 1)
    endy -= RADIUS;

  upc_barrier;

  for (iter = 0; iter<=iterations; iter++){
    /* start timer after a warmup iteration */
    if (iter == 1) {
      upc_barrier;
      stencil_time = wtime();
    }

    /* Get ghost zones */
    /* NORTH: the RADIUS rows just before starty, one upc_memget per row */
    if(my_IDy != 0){
      int peer = (my_IDy - 1) * Num_procsx + my_IDx;
      for (int y=starty - RADIUS; y<starty; y++) {
        int transfer_size = (endx - startx) * sizeof(DTYPE);
        upc_memget(&in_array_private[y][startx], &private_in_arrays[peer][y][startx], transfer_size);
      }
    }
    /* SOUTH: the RADIUS rows starting at endy, one upc_memget per row */
    if(my_IDy != Num_procsy - 1){
      int peer = (my_IDy + 1) * Num_procsx + my_IDx;
      for (int y=endy; y<endy + RADIUS; y++) {
        int transfer_size = (endx - startx) * sizeof(DTYPE);
        upc_memget(&in_array_private[y][startx], &private_in_arrays[peer][y][startx], transfer_size);
      }
    }
    /* LEFT: the RADIUS columns just before startx, copied element by element */
    if(my_IDx != 0){
      int peer = my_IDy * Num_procsx + my_IDx - 1;
      for (int y=starty; y<endy; y++) {
        for (int x=startx - RADIUS; x<startx; x++) {
          in_array_private[y][x] = private_in_arrays[peer][y][x];
        }
      }
    }
    /* RIGHT: the RADIUS columns starting at endx, copied element by element */
    if(my_IDx != Num_procsx - 1){
      int peer = my_IDy * Num_procsx + my_IDx + 1;
      for (int y=starty; y<endy; y++) {
        for (int x=endx; x<endx + RADIUS; x++) {
          in_array_private[y][x] = private_in_arrays[peer][y][x];
        }
      }
    }

    /* Apply the stencil operator */
    for (j=starty; j<endy; j++) {
      for (i=startx; i<endx; i++) {
        #if LOOPGEN
          #include "loop_body_star.incl"
        #else
          /* accumulate along the y axis first, then along both halves of the x axis */
          for (jj=-RADIUS; jj<=RADIUS; jj++) OUT(i,j) += WEIGHT(0,jj)*IN(i,j+jj);
          for (ii=-RADIUS; ii<0; ii++)       OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j);
          for (ii=1; ii<=RADIUS; ii++)       OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j);
        #endif
      }
    }

    upc_barrier; /* <- Necessary barrier: some slow threads could use future data */

    /* add constant to solution to force refresh of neighbor data, if any */
    for(int y=myoffsety + RADIUS; y<myoffsety + sizey - RADIUS; y++)
      for(int x=myoffsetx + RADIUS; x<myoffsetx + sizex - RADIUS; x++)
        in_array_private[y][x] += 1.0;

    upc_barrier; /* <- Necessary barrier: some threads could start on old data */
  } /* end of iterations */

  stencil_time = wtime() - stencil_time;
  times[MYTHREAD] = stencil_time;

  upc_barrier;

  // Compute max_time
  if(MYTHREAD == 0){
    max_time = times[MYTHREAD];
    for(i=1; i<THREADS; i++){
      if(max_time < times[i])
        max_time = times[i];
    }
  }

  norm = (double) 0.0;
  /* number of grid points the stencil is applied to: (n-2*RADIUS)^2 */
  f_active_points = (double)(n-2*RADIUS) * (double)(n-2*RADIUS);

  /* compute L1 norm in parallel */
  for (int y=starty; y<endy; y++) {
    for (int x=startx; x<endx; x++) {
      norm += (double)ABS(out_array[y][x]);
    }
  }

  /* each thread stores its share already divided by the global point count */
  norm /= f_active_points;
  norms[MYTHREAD] = norm;

  upc_barrier;

  if(MYTHREAD == 0){
    /* sum the per-thread shares into the global norm */
    norm = 0.;
    for(int i=0; i<THREADS; i++) norm += norms[i];

    /*******************************************************************************
    ** Analyze and output results.
    ********************************************************************************/

    /* verify correctness */
    reference_norm = (double) (iterations+1) * (COEFX + COEFY);

    if (ABS(norm - reference_norm) > EPSILON)
      bail_out("L1 norm = "FSTR", Reference L1 norm = "FSTR"\n", norm, reference_norm);
    else {
      printf("Solution validates\n");
#if VERBOSE
      printf("Reference L1 norm = "FSTR", L1 norm = "FSTR"\n",
             reference_norm, norm);
#endif
    }

    flops = (DTYPE) (2*stencil_size+1) * f_active_points;
    avgtime = max_time/iterations;
    printf("Rate (MFlops/s): "FSTR"  Avg time (s): %lf\n",
           1.0E-06 * flops/avgtime, avgtime);

    exit(EXIT_SUCCESS);
  }
}