예제 #1
0
int main(int argc, char *argv[]) {
    int n = 100;
    // int n1=101;
    start_pes(0);
    int nn = (n-1) / _num_pes();
    int n_local0 = 1 + _my_pe() * nn;
    int n_local1 = 1 + (_my_pe()+1) * nn;
    // allocate only local part + ghost zone of the arrays x,y
    float *x, *y;

  
    x = (float*) malloc((n_local1 - n_local0 + 2)*sizeof(float));
    y = (float*) malloc((n_local1 - n_local0 + 2)*sizeof(float));  // forgot shmalloc

    shmem_barrier_all();

    //... // fill x, y

    // fill ghost zone
    if (_my_pe() > 0)
	shmem_float_get(x,y,n1,1); // extra code
    shmem_float_put(y,x, 1, _my_pe()-1);

    shmem_barrier_all();

    // do computation
    float e = 0;
    int i;
    for (i=n_local0; i<n_local1; ++i) {
	x[i] += ( y[i+1] + y[i-1] )*.5;
	e += y[i] * y[i];
    }

    static float work[_SHMEM_REDUCE_SYNC_SIZE];
    static long sync[_SHMEM_REDUCE_SYNC_SIZE];
    static float el, es;
    el = e;
    shmem_float_sum_to_all(&es, &el, 1,
			   0, 0, _num_pes(), work, sync);
    e = es;

    // ... // output x, e

    x += (n_local0 - 1);
    y += (n_local0 - 1);
    shfree(x);
    shfree(y);
    return 0;
}
예제 #2
0
파일: stencil.c 프로젝트: kempj/Kernels
/*
 * SHMEM driver for the PRK stencil kernel.
 *
 * Command line (parsed on the root rank and broadcast to all ranks):
 *   argv[1]  number of iterations (>= 1)
 *   argv[2]  linear dimension n of the global n x n grid
 *
 * The ranks are arranged in a Num_procsx x Num_procsy grid and each rank
 * owns one tile of the global grid.  Every iteration the tile edges are
 * packed into local buffers, written into the neighbors' symmetric receive
 * buffers with shmem_putmem, and announced by incrementing a counter on the
 * neighbor; the receiver waits on that counter before unpacking the halo and
 * applying the star-shaped stencil.  After the last iteration the L1 norm of
 * the output is reduced over all ranks and compared with a reference value
 * on the root rank, which also prints the achieved rate.
 *
 * The function does not return; it terminates through exit() (or through
 * bail_out() when an error was flagged).
 */
int main(int argc, char ** argv) {
 
  int    Num_procs;       /* number of ranks                                     */
  int    Num_procsx, Num_procsy; /* number of ranks in each coord direction      */
  int    my_ID;           /* SHMEM rank                                          */
  int    my_IDx, my_IDy;  /* coordinates of rank in rank grid                    */
  int    right_nbr;       /* global rank of right neighboring tile               */
  int    left_nbr;        /* global rank of left neighboring tile                */
  int    top_nbr;         /* global rank of top neighboring tile                 */
  int    bottom_nbr;      /* global rank of bottom neighboring tile              */
  DTYPE *top_buf_out;     /* communication buffer                                */
  DTYPE *top_buf_in[2];   /*       "         "                                   */
  DTYPE *bottom_buf_out;  /*       "         "                                   */
  DTYPE *bottom_buf_in[2];/*       "         "                                   */
  DTYPE *right_buf_out;   /*       "         "                                   */
  DTYPE *right_buf_in[2]; /*       "         "                                   */
  DTYPE *left_buf_out;    /*       "         "                                   */
  DTYPE *left_buf_in[2];  /*       "         "                                   */
  int    root = 0;
  int    n, width, height;/* linear global and local grid dimension              */
  int    i, j, ii, jj, kk, it, jt, iter, leftover;  /* dummies                   */
  int    istart, iend;    /* bounds of grid tile assigned to calling rank        */
  int    jstart, jend;    /* bounds of grid tile assigned to calling rank        */
  DTYPE  reference_norm;
  DTYPE  f_active_points; /* interior of grid with respect to stencil            */
  int    stencil_size;    /* number of points in the stencil                     */
  DTYPE  flops;           /* floating point ops per iteration                    */
  int    iterations;      /* number of times to run the algorithm                */
  double avgtime,         /* timing parameters                                   */
         *local_stencil_time, *stencil_time; 
  DTYPE  * RESTRICT in;   /* input grid values                                   */
  DTYPE  * RESTRICT out;  /* output grid values                                  */
  long   total_length_in; /* total required length to store input array          */
  long   total_length_out;/* total required length to store output array         */
  int    error=0;         /* error flag                                          */
  DTYPE  weight[2*RADIUS+1][2*RADIUS+1]; /* weights of points in the stencil     */
  int    *arguments;      /* command line parameters                             */
  int    count_case=4;    /* number of neighbors of a rank                       */
  long   *pSync_bcast;    /* work space for collectives                          */
  long   *pSync_reduce;   /* work space for collectives                          */
  double *pWrk_time;      /* work space for collectives                          */
  DTYPE  *pWrk_norm;      /* work space for collectives                          */
  int    *iterflag;       /* synchronization flags                               */
  int    sw;              /* double buffering switch                             */
  DTYPE  *local_norm, *norm; /* local and global error norms                     */

  /*******************************************************************************
  ** Initialize the SHMEM environment
  ********************************************************************************/
  prk_shmem_init();

  my_ID=prk_shmem_my_pe();
  Num_procs=prk_shmem_n_pes();

  /* everything that is a source or target of a SHMEM operation is taken from
     the symmetric heap; the broadcast buffer is allocated together with the
     other scalars so that it is covered by the same allocation check          */
  pSync_bcast        = (long *)   prk_shmem_malloc(PRK_SHMEM_BCAST_SYNC_SIZE*sizeof(long));
  pSync_reduce       = (long *)   prk_shmem_malloc(PRK_SHMEM_REDUCE_SYNC_SIZE*sizeof(long));
  pWrk_time          = (double *) prk_shmem_malloc(PRK_SHMEM_REDUCE_MIN_WRKDATA_SIZE*sizeof(double));
  pWrk_norm          = (DTYPE *)  prk_shmem_malloc(PRK_SHMEM_REDUCE_MIN_WRKDATA_SIZE*sizeof(DTYPE));
  local_stencil_time = (double *) prk_shmem_malloc(sizeof(double));
  stencil_time       = (double *) prk_shmem_malloc(sizeof(double));
  local_norm         = (DTYPE *)  prk_shmem_malloc(sizeof(DTYPE));
  norm               = (DTYPE *)  prk_shmem_malloc(sizeof(DTYPE));
  iterflag           = (int *)    prk_shmem_malloc(2*sizeof(int));
  arguments          = (int *)    prk_shmem_malloc(2*sizeof(int));
  if (!(pSync_bcast && pSync_reduce && pWrk_time && pWrk_norm && iterflag &&
        local_stencil_time && stencil_time && local_norm && norm && arguments))
  {
    printf("Could not allocate scalar variables on rank %d\n", my_ID);
    error = 1;
  }
  bail_out(error);

  for(i=0;i<PRK_SHMEM_BCAST_SYNC_SIZE;i++)
    pSync_bcast[i]=PRK_SHMEM_SYNC_VALUE;

  for(i=0;i<PRK_SHMEM_REDUCE_SYNC_SIZE;i++)
    pSync_reduce[i]=PRK_SHMEM_SYNC_VALUE;
 
  /*******************************************************************************
  ** process, test, and broadcast input parameters    
  ********************************************************************************/
 
  /* only the root rank parses and validates the command line; every failed
     test sets the error flag and jumps to the end of the test block           */
  if (my_ID == root) {
#ifndef STAR
    printf("ERROR: Compact stencil not supported\n");
    error = 1;
    goto ENDOFTESTS;
#endif
      
    if (argc != 3){
      printf("Usage: %s <# iterations> <array dimension> \n", 
             *argv);
      error = 1;
      goto ENDOFTESTS;
    }
 
    iterations  = atoi(*++argv); 
    arguments[0]=iterations;

    if (iterations < 1){
      printf("ERROR: iterations must be >= 1 : %d \n",iterations);
      error = 1;
      goto ENDOFTESTS;  
    }
 
    n  = atoi(*++argv);
    arguments[1]=n;
    long nsquare = (long)n * (long)n;

    if (nsquare < Num_procs){ 
      printf("ERROR: grid size must be at least # ranks: %ld\n", nsquare);
      error = 1;
      goto ENDOFTESTS;
    }
 
    if (RADIUS < 0) {
      printf("ERROR: Stencil radius %d should be non-negative\n", RADIUS);
      error = 1;
      goto ENDOFTESTS;  
    }
 
    if (2*RADIUS +1 > n) {
      printf("ERROR: Stencil radius %d exceeds grid size %d\n", RADIUS, n);
      error = 1;
      goto ENDOFTESTS;  
    }
 
    ENDOFTESTS:;  
  }
  bail_out(error);
 
  /* determine best way to create a 2D grid of ranks (closest to square, for 
     best surface/volume ratio); we do this brute force for now
  */
  for (Num_procsx=(int) (sqrt(Num_procs+1)); Num_procsx>0; Num_procsx--) {
    if (!(Num_procs%Num_procsx)) {
      Num_procsy = Num_procs/Num_procsx;
      break;
    }
  }      
  my_IDx = my_ID%Num_procsx;
  my_IDy = my_ID/Num_procsx;
  /* compute neighbors; don't worry about dropping off the edges of the grid */
  right_nbr  = my_ID+1;
  left_nbr   = my_ID-1;
  top_nbr    = my_ID+Num_procsx;
  bottom_nbr = my_ID-Num_procsx;

  iterflag[0] = iterflag[1] = 0;

  /* ranks on an edge of the rank grid have fewer neighbors to wait for        */
  if(my_IDx==0)            count_case--;
  if(my_IDx==Num_procsx-1) count_case--;
  if(my_IDy==0)            count_case--;
  if(my_IDy==Num_procsy-1) count_case--;
 
  if (my_ID == root) {
    printf("Parallel Research Kernels version %s\n", PRKVERSION);
    printf("SHMEM stencil execution on 2D grid\n");
    printf("Number of ranks        = %d\n", Num_procs);
    printf("Grid size              = %d\n", n);
    printf("Radius of stencil      = %d\n", RADIUS);
    printf("Tiles in x/y-direction = %d/%d\n", Num_procsx, Num_procsy);
    printf("Type of stencil        = star\n");
#ifdef DOUBLE
    printf("Data type              = double precision\n");
#else
    printf("Data type              = single precision\n");
#endif
#if LOOPGEN
    printf("Script used to expand stencil loop body\n");
#else
    printf("Compact representation of stencil loop body\n");
#endif
#if SPLITFENCE
    printf("Split fence            = ON\n");
#else
    printf("Split fence            = OFF\n");
#endif
    printf("Number of iterations   = %d\n", iterations);
  }

  shmem_barrier_all();
 
  /* distribute the two parsed parameters from the root to all ranks           */
  shmem_broadcast32(&arguments[0], &arguments[0], 2, root, 0, 0, Num_procs, pSync_bcast);

  iterations=arguments[0];
  n=arguments[1];

  shmem_barrier_all();
  prk_shmem_free(arguments);
 
  /* compute amount of space required for input and solution arrays             */
  
  /* the first "leftover" ranks in each direction get one extra grid line      */
  width = n/Num_procsx;
  leftover = n%Num_procsx;
  if (my_IDx<leftover) {
    istart = (width+1) * my_IDx; 
    iend = istart + width + 1;
  }
  else {
    istart = (width+1) * leftover + width * (my_IDx-leftover);
    iend = istart + width;
  }
  
  /* NOTE(review): iend as computed above is one past the last column assigned
     to this rank, so this width is one larger than the number of assigned
     columns and can never be 0; the loops below mix "i<iend" and "i<=iend".
     The same holds for jend/height. Confirm the intended convention.          */
  width = iend - istart + 1;
  if (width == 0) {
    printf("ERROR: rank %d has no work to do\n", my_ID);
    error = 1;
  }
  bail_out(error);
 
  height = n/Num_procsy;
  leftover = n%Num_procsy;
  if (my_IDy<leftover) {
    jstart = (height+1) * my_IDy; 
    jend = jstart + height + 1;
  }
  else {
    jstart = (height+1) * leftover + height * (my_IDy-leftover);
    jend = jstart + height;
  }
  
  height = jend - jstart + 1;
  if (height == 0) {
    printf("ERROR: rank %d has no work to do\n", my_ID);
    error = 1;
  }
  bail_out(error);
 
  if (width < RADIUS || height < RADIUS) {
    printf("ERROR: rank %d has work tile smaller then stencil radius\n",
           my_ID);
    error = 1;
  }
  bail_out(error);
 
  /* the input tile carries a halo of RADIUS points on every side; the sizes
     are accumulated in a long to avoid overflowing int arithmetic             */
  total_length_in = (width+2*RADIUS);
  total_length_in *= (height+2*RADIUS);
  total_length_in *= sizeof(DTYPE);

  total_length_out = width;
  total_length_out *= height;
  total_length_out *= sizeof(DTYPE);
 
  in  = (DTYPE *) malloc(total_length_in);
  out = (DTYPE *) malloc(total_length_out);
  if (!in || !out) {
    printf("ERROR: rank %d could not allocate space for input/output array\n",
            my_ID);
    error = 1;
  }
  bail_out(error);
 
  /* fill the stencil weights to reflect a discrete divergence operator         */
  for (jj=-RADIUS; jj<=RADIUS; jj++) for (ii=-RADIUS; ii<=RADIUS; ii++)
    WEIGHT(ii,jj) = (DTYPE) 0.0;
  stencil_size = 4*RADIUS+1;

  for (ii=1; ii<=RADIUS; ii++) {
    WEIGHT(0, ii) = WEIGHT( ii,0) =  (DTYPE) (1.0/(2.0*ii*RADIUS));
    WEIGHT(0,-ii) = WEIGHT(-ii,0) = -(DTYPE) (1.0/(2.0*ii*RADIUS));
  }
 
  norm[0] = (DTYPE) 0.0;
  f_active_points = (DTYPE) (n-2*RADIUS)*(DTYPE) (n-2*RADIUS);

  /* initialize the input and output arrays                                    */
  for (j=jstart; j<jend; j++) for (i=istart; i<iend; i++) {
    IN(i,j)  = COEFX*i+COEFY*j;
    OUT(i,j) = (DTYPE)0.0;
  }

  /* allocate communication buffers for halo values                            */
  /* outgoing buffers are only read locally, so ordinary heap memory suffices;
     one allocation is split into the top and the bottom half                  */
  top_buf_out=(DTYPE*)malloc(2*sizeof(DTYPE)*RADIUS*width);
  if (!top_buf_out) {
    printf("ERROR: Rank %d could not allocate output comm buffers for y-direction\n", my_ID);
    error = 1;
  }
  bail_out(error);
  bottom_buf_out = top_buf_out+RADIUS*width;

  /* incoming buffers are written by the neighbors and must be symmetric; one
     allocation holds two buffers (double buffering) for each of the two
     y-direction neighbors. The allocated pointer is top_buf_in[0]; the array
     top_buf_in itself is never null, so the element has to be tested.         */
  top_buf_in[0]=(DTYPE*)prk_shmem_malloc(4*sizeof(DTYPE)*RADIUS*width);
  if(!top_buf_in[0])
  {
    printf("ERROR: Rank %d could not allocate input comm buffers for y-direction\n", my_ID);
    error=1;
  }
  bail_out(error);
  top_buf_in[1]    = top_buf_in[0]    + RADIUS*width;
  bottom_buf_in[0] = top_buf_in[1]    + RADIUS*width;
  bottom_buf_in[1] = bottom_buf_in[0] + RADIUS*width;
 
  right_buf_out=(DTYPE*)malloc(2*sizeof(DTYPE)*RADIUS*height);
  if (!right_buf_out) {
    printf("ERROR: Rank %d could not allocate output comm buffers for x-direction\n", my_ID);
    error = 1;
  }
  bail_out(error);
  left_buf_out=right_buf_out+RADIUS*height;

  /* same layout as above for the two x-direction neighbors                    */
  right_buf_in[0]=(DTYPE*)prk_shmem_malloc(4*sizeof(DTYPE)*RADIUS*height);
  if(!right_buf_in[0])
  {
    printf("ERROR: Rank %d could not allocate input comm buffers for x-dimension\n", my_ID);
    error=1;
  }
  bail_out(error);
  right_buf_in[1] = right_buf_in[0] + RADIUS*height;
  left_buf_in[0]  = right_buf_in[1] + RADIUS*height;
  left_buf_in[1]  = left_buf_in[0]  + RADIUS*height;

  /* make sure all symmetric heaps are allocated before being used  */
  shmem_barrier_all();

  for (iter = 0; iter<=iterations; iter++){

    /* start timer after a warmup iteration */
    if (iter == 1) { 
      shmem_barrier_all();
      local_stencil_time[0] = wtime();
    }
    /* sw determines which incoming buffer to select */
    sw = iter%2;

    /* need to fetch ghost point data from neighbors */

    /* each section below packs one tile edge, writes it into the matching
       receive buffer of the neighbor and (with SPLITFENCE) signals the
       neighbor right away; the fence orders the data put before the flag
       increment                                                               */
    if (my_IDy < Num_procsy-1) {
      for (kk=0,j=jend-RADIUS; j<=jend-1; j++) for (i=istart; i<=iend; i++) {
          top_buf_out[kk++]= IN(i,j);
      }
      shmem_putmem(bottom_buf_in[sw], top_buf_out, RADIUS*width*sizeof(DTYPE), top_nbr);
#if SPLITFENCE
      shmem_fence();
      shmem_int_inc(&iterflag[sw], top_nbr);
#endif
    }
    if (my_IDy > 0) {
      for (kk=0,j=jstart; j<=jstart+RADIUS-1; j++) for (i=istart; i<=iend; i++) {
          bottom_buf_out[kk++]= IN(i,j);
      }
      shmem_putmem(top_buf_in[sw], bottom_buf_out, RADIUS*width*sizeof(DTYPE), bottom_nbr);
#if SPLITFENCE
      shmem_fence();
      shmem_int_inc(&iterflag[sw], bottom_nbr);
#endif
    }

    if(my_IDx < Num_procsx-1) {
      for(kk=0,j=jstart;j<=jend;j++) for(i=iend-RADIUS;i<=iend-1;i++) {
        right_buf_out[kk++]=IN(i,j);
      }
      shmem_putmem(left_buf_in[sw], right_buf_out, RADIUS*height*sizeof(DTYPE), right_nbr);
#if SPLITFENCE
      shmem_fence();
      shmem_int_inc(&iterflag[sw], right_nbr);
#endif
    }

    if(my_IDx>0) {
      for(kk=0,j=jstart;j<=jend;j++) for(i=istart;i<=istart+RADIUS-1;i++) {
        left_buf_out[kk++]=IN(i,j);
      }
      shmem_putmem(right_buf_in[sw], left_buf_out, RADIUS*height*sizeof(DTYPE), left_nbr);
#if SPLITFENCE
      shmem_fence();
      shmem_int_inc(&iterflag[sw], left_nbr);
#endif
    }

    /* without SPLITFENCE a single fence follows all puts and the neighbors
       are signalled afterwards                                                */
#if SPLITFENCE == 0
    shmem_fence();
    if(my_IDy<Num_procsy-1) shmem_int_inc(&iterflag[sw], top_nbr);
    if(my_IDy>0)            shmem_int_inc(&iterflag[sw], bottom_nbr);
    if(my_IDx<Num_procsx-1) shmem_int_inc(&iterflag[sw], right_nbr);
    if(my_IDx>0)            shmem_int_inc(&iterflag[sw], left_nbr);
#endif

    /* the flag for buffer sw is never reset: each neighbor increments it once
       per iteration that uses this buffer, i.e. iter/2+1 times so far         */
    shmem_int_wait_until(&iterflag[sw], SHMEM_CMP_EQ, count_case*(iter/2+1));

    /* unpack the received halo values into the ghost region of the input tile */
    if (my_IDy < Num_procsy-1) {
      for (kk=0,j=jend; j<=jend+RADIUS-1; j++) for (i=istart; i<=iend; i++) {
          IN(i,j) = top_buf_in[sw][kk++];
      }      
    }
    if (my_IDy > 0) {
      for (kk=0,j=jstart-RADIUS; j<=jstart-1; j++) for (i=istart; i<=iend; i++) {
          IN(i,j) = bottom_buf_in[sw][kk++];
      }      
    }

    if (my_IDx < Num_procsx-1) {
      for (kk=0,j=jstart; j<=jend; j++) for (i=iend; i<=iend+RADIUS-1; i++) {
          IN(i,j) = right_buf_in[sw][kk++];
      }      
    }
    if (my_IDx > 0) {
      for (kk=0,j=jstart; j<=jend; j++) for (i=istart-RADIUS; i<=istart-1; i++) {
          IN(i,j) = left_buf_in[sw][kk++];
      }      
    }
 
    /* Apply the stencil operator */
    for (j=MAX(jstart,RADIUS); j<=MIN(n-RADIUS-1,jend); j++) {
      for (i=MAX(istart,RADIUS); i<=MIN(n-RADIUS-1,iend); i++) {
        #if LOOPGEN
          #include "loop_body_star.incl"
        #else
          for (jj=-RADIUS; jj<=RADIUS; jj++) OUT(i,j) += WEIGHT(0,jj)*IN(i,j+jj);
          for (ii=-RADIUS; ii<0; ii++)       OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j);
          for (ii=1; ii<=RADIUS; ii++)       OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j);
        #endif
      }
    }
 
    /* add constant to solution to force refresh of neighbor data, if any */
    for (j=jstart; j<jend; j++) for (i=istart; i<iend; i++) IN(i,j)+= 1.0;
 
  }
 
  local_stencil_time[0] = wtime() - local_stencil_time[0];

  shmem_barrier_all();

  /* the reported time is the maximum over all ranks                           */
  shmem_double_max_to_all(&stencil_time[0], &local_stencil_time[0], 1, 0, 0,
                          Num_procs, pWrk_time, pSync_reduce);
  
  /* compute L1 norm in parallel                                                */
  local_norm[0] = (DTYPE) 0.0;
  for (j=MAX(jstart,RADIUS); j<MIN(n-RADIUS,jend); j++) {
    for (i=MAX(istart,RADIUS); i<MIN(n-RADIUS,iend); i++) {
      local_norm[0] += (DTYPE)ABS(OUT(i,j));
    }
  }

  /* both reductions use pSync_reduce; a barrier sits between them             */
  shmem_barrier_all();
 
#ifdef DOUBLE
  shmem_double_sum_to_all(&norm[0], &local_norm[0], 1, 0, 0, Num_procs, pWrk_norm, pSync_reduce);
#else
  shmem_float_sum_to_all(&norm[0], &local_norm[0], 1, 0, 0, Num_procs, pWrk_norm, pSync_reduce);
#endif
 
  /*******************************************************************************
  ** Analyze and output results.
  ********************************************************************************/
 
/* verify correctness                                                            */
  if (my_ID == root) {
    norm[0] /= f_active_points;
    if (RADIUS > 0) {
      reference_norm = (DTYPE) (iterations+1) * (COEFX + COEFY);
    }
    else {
      reference_norm = (DTYPE) 0.0;
    }
    if (ABS(norm[0]-reference_norm) > EPSILON) {
      printf("ERROR: L1 norm = "FSTR", Reference L1 norm = "FSTR"\n",
             norm[0], reference_norm);
      error = 1;
    }
    else {
      printf("Solution validates\n");
#ifdef VERBOSE
      printf("Reference L1 norm = "FSTR", L1 norm = "FSTR"\n", 
             reference_norm, norm[0]);
#endif
    }
  }
  bail_out(error);
 
  if (my_ID == root) {
    /* flops/stencil: 2 flops (fma) for each point in the stencil, 
       plus one flop for the update of the input of the array        */
    flops = (DTYPE) (2*stencil_size+1) * f_active_points;
    avgtime = stencil_time[0]/iterations;
    printf("Rate (MFlops/s): "FSTR"  Avg time (s): %lf\n",
           1.0E-06 * flops/avgtime, avgtime);
  }
 

  /* release the symmetric receive buffers through the pointers that were
     returned by the allocator (element 0), not through the local arrays that
     merely hold those pointers                                                */
  prk_shmem_free(top_buf_in[0]);
  prk_shmem_free(right_buf_in[0]);
  free(top_buf_out);
  free(right_buf_out);
  free(in);
  free(out);

  prk_shmem_free(pSync_bcast);
  prk_shmem_free(pSync_reduce);
  prk_shmem_free(pWrk_time);
  prk_shmem_free(pWrk_norm);
  prk_shmem_free(local_stencil_time);
  prk_shmem_free(stencil_time);
  prk_shmem_free(local_norm);
  prk_shmem_free(norm);
  prk_shmem_free(iterflag);

  prk_shmem_finalize();

  exit(EXIT_SUCCESS);
}
예제 #3
0
파일: to_all.c 프로젝트: caomw/SOS
/*
 * Runs the seven typed sum-to-all reductions over all PEs and checks the
 * results on PE 0.
 *
 * Every source element is set to the caller's PE number, so each destination
 * element is expected to equal npes*(npes-1)/2 after the reduction.
 *
 * me:   PE number of the caller
 * npes: number of PEs taking part in the reductions
 *
 * Returns 1 when all seven checks passed, 0 otherwise. The checks are only
 * performed when me == 0, so every other PE returns 0.
 */
int
sum_to_all(int me, int npes)
{
  int idx;
  int pass = 0;

  memset(ok,0,sizeof(ok));

  /* every source element carries the PE number */
  for (idx = 0; idx < N; idx++) {
    src0[idx] = src1[idx] = src2[idx] = src3[idx] = src4[idx] = src5[idx] = src6[idx] = me;
  }

  /* poison the destinations so that untouched elements are detected */
  for (idx = 0; idx < N; idx++) {
    dst0[idx] = -9;
    dst1[idx] = -9;
    dst2[idx] = -9;
    dst3[idx] = -9;
    dst4[idx] = -9;
    dst5[idx] = -9;
    dst6[idx] = -9;
  }

  shmem_barrier_all();

  /* consecutive reductions alternate between the two sync arrays */
  shmem_short_sum_to_all(dst0, src0, N, 0, 0, npes, pWrk0, pSync);
  shmem_int_sum_to_all(dst1, src1, N, 0, 0, npes, pWrk1, pSync1);
  shmem_long_sum_to_all(dst2, src2, N, 0, 0, npes, pWrk2, pSync);
  shmem_float_sum_to_all(dst3, src3, N, 0, 0, npes, pWrk3, pSync1);
  shmem_double_sum_to_all(dst4, src4, N, 0, 0, npes, pWrk4, pSync);
  shmem_longdouble_sum_to_all(dst5, src5, N, 0, 0, npes, pWrk5, pSync1);
  shmem_longlong_sum_to_all(dst6, src6, N, 0, 0, npes, pWrk6, pSync);

  if (me == 0) {
    /* 0 + 1 + ... + (npes-1) */
    const int expected = npes * (npes-1)/2;

    /* ok[k] is set to 1 as soon as one element of reduction k is wrong */
    for (idx = 0; idx < N; idx++) {
      if (dst0[idx] != (short) expected)       ok[0] = 1;
      if (dst1[idx] != (int) expected)         ok[1] = 1;
      if (dst2[idx] != (long) expected)        ok[2] = 1;
      if (dst3[idx] != (float) expected)       ok[3] = 1;
      if (dst4[idx] != (double) expected)      ok[4] = 1;
      if (dst5[idx] != (long double) expected) ok[5] = 1;
      if (dst6[idx] != (long long) expected)   ok[6] = 1;
    }

    if (ok[0] != 1) {
      Vprintf("Reduction operation shmem_short_sum_to_all: Passed\n");
      pass++;
    }
    else {
      printf("Reduction operation shmem_short_sum_to_all: Failed\n");
    }

    if (ok[1] != 1) {
      Vprintf("Reduction operation shmem_int_sum_to_all: Passed\n");
      pass++;
    }
    else {
      printf("Reduction operation shmem_int_sum_to_all: Failed\n");
    }

    if (ok[2] != 1) {
      Vprintf("Reduction operation shmem_long_sum_to_all: Passed\n");
      pass++;
    }
    else {
      printf("Reduction operation shmem_long_sum_to_all: Failed\n");
    }

    if (ok[3] != 1) {
      Vprintf("Reduction operation shmem_float_sum_to_all: Passed\n");
      pass++;
    }
    else {
      printf("Reduction operation shmem_float_sum_to_all: Failed\n");
    }

    if (ok[4] != 1) {
      Vprintf("Reduction operation shmem_double_sum_to_all: Passed\n");
      pass++;
    }
    else {
      printf("Reduction operation shmem_double_sum_to_all: Failed\n");
    }

    if (ok[5] != 1) {
      Vprintf("Reduction operation shmem_longdouble_sum_to_all: Passed\n");
      pass++;
    }
    else {
      printf("Reduction operation shmem_longdouble_sum_to_all: Failed\n");
    }

    if (ok[6] != 1) {
      Vprintf("Reduction operation shmem_longlong_sum_to_all: Passed\n");
      pass++;
    }
    else {
      printf("Reduction operation shmem_longlong_sum_to_all: Failed\n");
    }

    Vprintf("\n");
    fflush(stdout);
  }

  if (Serialize) {
    shmem_barrier_all();
  }

  return pass == 7;
}
예제 #4
0
int
main()
{
  int i,j;
  int me, npes;
  int success0, success1, success2, success3, success4, success5, success6;
  success0 = success1 = success2 = success3 = success4 = success5 = success6 = 0;

  start_pes(0);
  me = _my_pe();
  npes = _num_pes();

  for (i = 0; i < _SHMEM_REDUCE_SYNC_SIZE; i += 1) {
    pSync[i] = _SHMEM_SYNC_VALUE;
	pSync1[i] = _SHMEM_SYNC_VALUE;
  }

  for (i = 0; i < N; i += 1) {
    src0[i] = src1[i] = src2[i] = src3[i] = src4[i] = src5[i] = src6[i] = me + i;
 }
  
  /*Test MAX: shmem_double_max_to_all, shmem_float_max_to_all, shmem_int_max_to_all, shmem_long_max_to_all, shmem_longdouble_max_to_all, shmem_longlong_max_to_all, shmem_short_max_to_all */
  shmem_barrier_all();

  shmem_short_max_to_all(dst0, src0, N, 0, 0, npes, pWrk0, pSync);
  shmem_int_max_to_all(dst1, src1, N, 0, 0, npes, pWrk1, pSync1);
  shmem_long_max_to_all(dst2, src2, N, 0, 0, npes, pWrk2, pSync);
  shmem_float_max_to_all(dst3, src3, N, 0, 0, npes, pWrk3, pSync1);
  shmem_double_max_to_all(dst4, src4, N, 0, 0, npes, pWrk4, pSync);
  shmem_longdouble_max_to_all(dst5, src5, N, 0, 0, npes, pWrk5, pSync1);
  shmem_longlong_max_to_all(dst6, src6, N, 0, 0, npes, pWrk6, pSync);
  
  
  if(me == 0){
    for (i = 0,j=-1; i < N; i++,j++) {
      if(dst0[i] != npes+j)
        success0 =1;
	  if(dst1[i] != npes+j)
        success1 =1;
	  if(dst2[i] != npes+j)
        success2 =1;
	  if(dst3[i] != npes+j)
        success3 =1;
	  if(dst4[i] != npes+j)
        success4 =1;
	  if(dst5[i] != npes+j)
        success5 =1;
	  if(dst6[i] != npes+j)
        success6 =1;
    }
    if(success0==1){
      printf("Reduction operation shmem_short_max_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_short_max_to_all: Passed\n");
	}
	if(success1==1){
      printf("Reduction operation shmem_int_max_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_int_max_to_all: Passed\n");
	}
	if(success2==1){
      printf("Reduction operation shmem_long_max_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_long_max_to_all: Passed\n");
	}
	if(success3==1){
      printf("Reduction operation shmem_float_max_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_float_max_to_all: Passed\n");
	}
	if(success4==1){
      printf("Reduction operation shmem_double_max_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_double_max_to_all: Passed\n");
	}
	if(success5==1){
      printf("Reduction operation shmem_longdouble_max_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_longdouble_max_to_all: Passed\n");
	}
	if(success6==1){
      printf("Reduction operation shmem_longlong_max_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_longlong_max_to_all: Passed\n");
	}
	
  }
  
  
  /*Test MIN: shmem_double_min_to_all, shmem_float_min_to_all, shmem_int_min_to_all, shmem_long_min_to_all, shmem_longdouble_min_to_all, shmem_longlong_min_to_all, shmem_short_min_to_all*/
  success0 = success1 = success2 = success3 = success4 = success5 = success6 = 0;
  
  for (i = 0; i < N; i += 1) {
    src0[i] = src1[i] = src2[i] = src3[i] = src4[i] = src5[i] = src6[i] = me + i;
 }
 
  for (i = 0; i < N; i += 1) {
    dst0[i] = -9;
	dst1[i] = -9;
	dst2[i] = -9;
	dst3[i] = -9;
	dst4[i] = -9;
	dst5[i] = -9;
	dst6[i] = -9;
  }
   
  shmem_barrier_all();
  
  shmem_short_min_to_all(dst0, src0, N, 0, 0, npes, pWrk0, pSync);
  shmem_int_min_to_all(dst1, src1, N, 0, 0, npes, pWrk1, pSync1);
  shmem_long_min_to_all(dst2, src2, N, 0, 0, npes, pWrk2, pSync);
  shmem_float_min_to_all(dst3, src3, N, 0, 0, npes, pWrk3, pSync1);
  shmem_double_min_to_all(dst4, src4, N, 0, 0, npes, pWrk4, pSync);
  shmem_longdouble_min_to_all(dst5, src5, N, 0, 0, npes, pWrk5, pSync1);
  shmem_longlong_min_to_all(dst6, src6, N, 0, 0, npes, pWrk6, pSync);
  
  
  if(me == 0){
    for (i = 0; i < N; i++) {
      if(dst0[i] != i)
        success0 =1;
	  if(dst1[i] != i)
        success1 =1;
	  if(dst2[i] != i)
        success2 =1;
	  if(dst3[i] != i)
        success3 =1;
	  if(dst4[i] != i)
        success4 =1;
	  if(dst5[i] != i)
        success5 =1;
	  if(dst6[i] != i)
        success6 =1;
    }
    if(success0==1){
      printf("Reduction operation shmem_short_min_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_short_min_to_all: Passed\n");
	}
	if(success1==1){
      printf("Reduction operation shmem_int_min_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_int_min_to_all: Passed\n");
	}
	if(success2==1){
      printf("Reduction operation shmem_long_min_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_long_min_to_all: Passed\n");
	}
	if(success3==1){
      printf("Reduction operation shmem_float_min_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_float_min_to_all: Passed\n");
	}
	if(success4==1){
      printf("Reduction operation shmem_double_min_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_double_min_to_all: Passed\n");
	}
	if(success5==1){
      printf("Reduction operation shmem_longdouble_min_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_longdouble_min_to_all: Passed\n");
	}
	if(success6==1){
      printf("Reduction operation shmem_longlong_min_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_longlong_min_to_all: Passed\n");
	}
	
  }
  
  /*Test SUM: shmem_double_sum_to_all, shmem_float_sum_to_all, shmem_int_sum_to_all, shmem_long_sum_to_all, shmem_longdouble_sum_to_all, shmem_longlong_sum_to_all, shmem_short_sum_to_all*/
  success0 = success1 = success2 = success3 = success4 = success5 = success6 = 0;
  for (i = 0; i < N; i += 1) {
    src0[i] = src1[i] = src2[i] = src3[i] = src4[i] = src5[i] = src6[i] = me;
 }
  for (i = 0; i < N; i += 1) {
    dst0[i] = -9;
	dst1[i] = -9;
	dst2[i] = -9;
	dst3[i] = -9;
	dst4[i] = -9;
	dst5[i] = -9;
	dst6[i] = -9;
  }
  shmem_barrier_all();

  shmem_short_sum_to_all(dst0, src0, N, 0, 0, npes, pWrk0, pSync);
  shmem_int_sum_to_all(dst1, src1, N, 0, 0, npes, pWrk1, pSync1);
  shmem_long_sum_to_all(dst2, src2, N, 0, 0, npes, pWrk2, pSync);
  shmem_float_sum_to_all(dst3, src3, N, 0, 0, npes, pWrk3, pSync1);
  shmem_double_sum_to_all(dst4, src4, N, 0, 0, npes, pWrk4, pSync);
  shmem_longdouble_sum_to_all(dst5, src5, N, 0, 0, npes, pWrk5, pSync1);
  shmem_longlong_sum_to_all(dst6, src6, N, 0, 0, npes, pWrk6, pSync);

  
  if(me == 0){
    for (i = 0; i < N; i++) {
	  if(dst0[i] != (npes * (npes-1)/2))
        success0 =1;
	  if(dst1[i] != (npes * (npes-1)/2))
        success1 =1;
	  if(dst2[i] != (npes * (npes-1)/2))
        success2 =1;
	  if(dst3[i] != (npes * (npes-1)/2))
        success3 =1;
	  if(dst4[i] != (npes * (npes-1)/2))
        success4 =1;
	  if(dst5[i] != (npes * (npes-1)/2))
        success5 =1;
	  if(dst6[i] != (npes * (npes-1)/2))
        success6 =1;
    }
    if(success0==1){
      printf("Reduction operation shmem_short_sum_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_short_sum_to_all: Passed\n");
	}
	if(success1==1){
      printf("Reduction operation shmem_int_sum_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_int_sum_to_all: Passed\n");
	}
	if(success2==1){
      printf("Reduction operation shmem_long_sum_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_long_sum_to_all: Passed\n");
	}
	if(success3==1){
      printf("Reduction operation shmem_float_sum_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_float_sum_to_all: Passed\n");
	}
	if(success4==1){
      printf("Reduction operation shmem_double_sum_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_double_sum_to_all: Passed\n");
	}
	if(success5==1){
      printf("Reduction operation shmem_longdouble_sum_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_longdouble_sum_to_all: Passed\n");
	}
	if(success6==1){
      printf("Reduction operation shmem_longlong_sum_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_longlong_sum_to_all: Passed\n");
	}
	
  }
  
  /*Test AND: shmem_int_and_to_all, shmem_long_and_to_all, shmem_longlong_and_to_all, shmem_short_and_to_all,*/
  success0 = success1 = success2 = success6 = 0;
  for (i = 0; i < N; i += 1) {
    src0[i] = src1[i] = src2[i] = src6[i] = me;
 }
 for (i = 0; i < N; i += 1) {
    dst0[i] = -9;
	dst1[i] = -9;
	dst2[i] = -9;
	dst6[i] = -9;
  }
 
  shmem_barrier_all();
  
  shmem_short_and_to_all(dst0, src0, N, 0, 0, npes, pWrk0, pSync);
  shmem_int_and_to_all(dst1, src1, N, 0, 0, npes, pWrk1, pSync1);
  shmem_long_and_to_all(dst2, src2, N, 0, 0, npes, pWrk2, pSync);
  shmem_longlong_and_to_all(dst6, src6, N, 0, 0, npes, pWrk6, pSync1);
  
  
  if(me==0){
    for (i = 0; i < N; i++) {
	  if(dst0[i] != 0)
        success0 =1;
	  if(dst1[i] != 0)
        success1 =1;
	  if(dst2[i] != 0)
        success2 =1;
	 if(dst6[i] != 0)
        success6 =1;
    }
    if(success0==1){
      printf("Reduction operation shmem_short_and_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_short_and_to_all: Passed\n");
	}
	if(success1==1){
      printf("Reduction operation shmem_int_and_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_int_and_to_all: Passed\n");
	}
	if(success2==1){
      printf("Reduction operation shmem_long_and_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_long_and_to_all: Passed\n");
	}
	if(success6==1){
      printf("Reduction operation shmem_longlong_and_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_longlong_and_to_all: Passed\n");
	}
	
  }
  
 /*Test PROD: shmem_double_prod_to_all, shmem_float_prod_to_all, shmem_int_prod_to_all, shmem_long_prod_to_all, shmem_longdouble_prod_to_all, shmem_longlong_prod_to_all, shmem_short_prod_to_all, */
  
  success0 = success1 = success2 = success3 = success4 = success5 = success6 = 0;
  for (i = 0; i < N; i += 1) {
    src0[i] = src1[i] = src2[i] = src3[i] = src4[i] = src5[i] = src6[i] = me + 1;
 }
  for (i = 0; i < N; i += 1) {
    dst0[i] = -9;
	dst1[i] = -9;
	dst2[i] = -9;
	dst3[i] = -9;
	dst4[i] = -9;
	dst5[i] = -9;
	dst6[i] = -9;
  }
  
  expected_result0 = expected_result1 = expected_result2 = expected_result3 = expected_result4 = expected_result5 = expected_result6 =1;
  for(i=1;i<=npes;i++){
    expected_result0 = expected_result0 * i;
	expected_result1 = expected_result1 * i;
	expected_result2 = expected_result2 * i;
	expected_result3 = expected_result3 * i;
	expected_result4 = expected_result4 * i;
	expected_result5 = expected_result5 * i;
	expected_result6 = expected_result6 * i;
  }
   
  shmem_barrier_all();
 
  shmem_short_prod_to_all(dst0, src0, N, 0, 0, npes, pWrk0, pSync);
  shmem_int_prod_to_all(dst1, src1, N, 0, 0, npes, pWrk1, pSync1);
  shmem_long_prod_to_all(dst2, src2, N, 0, 0, npes, pWrk2, pSync);
  shmem_float_prod_to_all(dst3, src3, N, 0, 0, npes, pWrk3, pSync1);
  shmem_double_prod_to_all(dst4, src4, N, 0, 0, npes, pWrk4, pSync);
  shmem_longdouble_prod_to_all(dst5, src5, N, 0, 0, npes, pWrk5, pSync1);
  shmem_longlong_prod_to_all(dst6, src6, N, 0, 0, npes, pWrk6, pSync);

 
  if(me == 0){
    for (i = 0; i < N; i++) {
	 /*printf("dst2[%d]: %ld, expected val: %ld\n",i, dst2[i], (long)expected_result2);*/
      if(dst0[i] != expected_result0)
        success0 =1;
	  if(dst1[i] != expected_result1)
        success1 =1;
	  if(dst2[i] != expected_result2)
        success2 =1;
	  if(dst3[i] != expected_result3)
        success3 =1;
	  if(dst4[i] != expected_result4)
        success4 =1;
	  if(dst5[i] != expected_result5)
        success5 =1;
	  if(dst6[i] != expected_result6)
        success6 =1;
    }
    if(success0==1){
      printf("Reduction operation shmem_short_prod_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_short_prod_to_all: Passed\n");
	}
	if(success1==1){
      printf("Reduction operation shmem_int_prod_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_int_prod_to_all: Passed\n");
	}
	if(success2==1){
      printf("Reduction operation shmem_long_prod_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_long_prod_to_all: Passed\n");
	}
	if(success3==1){
      printf("Reduction operation shmem_float_prod_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_float_prod_to_all: Passed\n");
	}
	if(success4==1){
      printf("Reduction operation shmem_double_prod_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_double_prod_to_all: Passed\n");
	}
	if(success5==1){
      printf("Reduction operation shmem_longdouble_prod_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_longdouble_prod_to_all: Passed\n");
	}
	if(success6==1){
      printf("Reduction operation shmem_longlong_prod_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_longlong_prod_to_all: Passed\n");
	}
	
  }
 
 /*Test OR: shmem_int_or_to_all, shmem_long_or_to_all, shmem_longlong_or_to_all, shmem_short_or_to_all,*/
  
  success0 = success1 = success2 = success6 = 0;
  for (i = 0; i < N; i += 1) {
    src0[i] = src1[i] = src2[i] = src6[i] = (me + 1)%4;
 }
 for (i = 0; i < N; i += 1) {
    dst0[i] = -9;
	dst1[i] = -9;
	dst2[i] = -9;
	dst6[i] = -9;
  }
 
  shmem_barrier_all();
  
  shmem_short_or_to_all(dst0, src0, N, 0, 0, npes, pWrk0, pSync);
  shmem_int_or_to_all(dst1, src1, N, 0, 0, npes, pWrk1, pSync1);
  shmem_long_or_to_all(dst2, src2, N, 0, 0, npes, pWrk2, pSync);
  shmem_longlong_or_to_all(dst6, src6, N, 0, 0, npes, pWrk6, pSync1);
  
  
  if(me==0){
    for (i = 0; i < N; i++) {
      if(dst0[i] != 3)
        success0 =1;
	  if(dst1[i] != 3)
        success1 =1;
	  if(dst2[i] != 3)
        success2 =1;
	 if(dst6[i] != 3)
        success6 =1;
    }
    if(success0==1){
      printf("Reduction operation shmem_short_or_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_short_or_to_all: Passed\n");
	}
	if(success1==1){
      printf("Reduction operation shmem_int_or_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_int_or_to_all: Passed\n");
	}
	if(success2==1){
      printf("Reduction operation shmem_long_or_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_long_or_to_all: Passed\n");
	}
	if(success6==1){
      printf("Reduction operation shmem_longlong_or_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_longlong_or_to_all: Passed\n");
	}
	
  }
 
 /*Test XOR: shmem_int_xor_to_all, shmem_long_xor_to_all, shmem_longlong_xor_to_all, shmem_short_xor_to_all*/
  
  success0 = success1 = success2 = success6 = 0;
  for (i = 0; i < N; i += 1) {
    src0[i] = src1[i] = src2[i] = src6[i] = me%2;
 }
 for (i = 0; i < N; i += 1) {
    dst0[i] = -9;
	dst1[i] = -9;
	dst2[i] = -9;
	dst6[i] = -9;
  }
  int expected_result = ((int)(npes/2) % 2);
  
 
  shmem_barrier_all();
  
  shmem_short_xor_to_all(dst0, src0, N, 0, 0, npes, pWrk0, pSync);
  shmem_int_xor_to_all(dst1, src1, N, 0, 0, npes, pWrk1, pSync1);
  shmem_long_xor_to_all(dst2, src2, N, 0, 0, npes, pWrk2, pSync);
  shmem_longlong_xor_to_all(dst6, src6, N, 0, 0, npes, pWrk6, pSync1);
  
  if(me==0){
    for (i = 0; i < N; i++) {
      if(dst0[i] != expected_result)
        success0 =1;
	  if(dst1[i] != expected_result)
        success1 =1;
	  if(dst2[i] != expected_result)
        success2 =1;
	 if(dst6[i] != expected_result)
        success6 =1;
    }
    if(success0==1){
      printf("Reduction operation shmem_short_xor_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_short_xor_to_all: Passed\n");
	}
	if(success1==1){
      printf("Reduction operation shmem_int_xor_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_int_xor_to_all: Passed\n");
	}
	if(success2==1){
      printf("Reduction operation shmem_long_xor_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_long_xor_to_all: Passed\n");
	}
	if(success6==1){
      printf("Reduction operation shmem_longlong_xor_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_longlong_xor_to_all: Passed\n");
	}
	
  }

  return 0;
}
예제 #5
0
파일: shmem_2dheat.c 프로젝트: coti/oshmpi
/*
 * Entry point of the 2-D heat solver.
 *
 * Flow: start OpenSHMEM, parse the command line on every PE, broadcast the
 * selected numerical method from PE 0, allocate this PE's rows, then iterate
 * until the globally reduced convergence value drops to EPSILON or below.
 *
 * Options:
 *   -e <eps>     convergence threshold, stored in EPSILON
 *   -h <height>  stored in HEIGHT
 *   -w <width>   stored in WIDTH
 *   -m <1|2|3>   method: 1 = jacobi, 2 = gauss-seidel, 3 = sor
 *   -t           counted in show_time (not consulted inside this function)
 *   -v           verbose output
 *
 * Returns 77 without doing any work when run on more than 8 PEs.  Otherwise
 * the process ends through exit (EXIT_SUCCESS), or exit (EXIT_FAILURE) when
 * the grid storage cannot be allocated.
 */
int
main (int argc, char **argv)
{
  /* row-pointer tables for the rows owned by this PE */
  float **U_Curr;
  float **U_Next;
  /* contiguous storage behind each row-pointer table; kept separately so it
     can be released even if a row pointer is changed later */
  float *curr_cells;
  float *next_cells;
  int i, j, k;
  int my_start_row, my_end_row, my_num_rows;
  int cols;			/* number of columns per row */
  int verbose = 0;
  int show_time = 0;
  double t, tv[2];

  /* OpenSHMEM initialization */
  start_pes (0);
  p = _num_pes ();
  my_rank = _my_pe ();

  if (p > 8) {
      fprintf(stderr, "Ignoring test when run with more than 8 pes\n");
      return 77;
  }

  /* argument processing done by everyone */
  int c;
  int errflg = 0;		/* must start at zero: it is only ever incremented */
  extern char *optarg;
  extern int optind, optopt;

  /* The leading ':' makes getopt return ':' for a missing operand (instead
     of '?') and suppresses getopt's own diagnostics, so only ROOT reports
     bad arguments through the two cases below. */
  while ((c = getopt (argc, argv, ":e:h:m:tw:v")) != -1)
    {
      switch (c)
	{
	case 'e':
	  EPSILON = atof (optarg);
	  break;
	case 'h':
	  HEIGHT = atoi (optarg);
	  break;
	case 'm':
	  /* selects the numerical method; other values leave meth untouched */
	  switch (atoi (optarg))
	    {
	    case 1:		/* jacobi */
	      meth = 1;
	      break;
	    case 2:		/* gauss-seidel */
	      meth = 2;
	      break;
	    case 3:		/* sor */
	      meth = 3;
	      break;
	    }
	  break;
	case 't':
	  show_time++;		/* overridden by -v (verbose) */
	  break;
	case 'w':
	  WIDTH = atoi (optarg);
	  break;
	case 'v':
	  verbose++;
	  break;
	  /* handle bad arguments */
	case ':':		/* option given without its operand */
	  if (ROOT == my_rank)
	    fprintf (stderr, "Option -%c requires an operand\n", optopt);
	  errflg++;
	  break;
	case '?':
	  if (ROOT == my_rank)
	    fprintf (stderr, "Unrecognized option: -%c\n", optopt);
	  errflg++;
	  break;
	}
    }

  if (ROOT == my_rank && argc < 2)
    {
      printf ("Using defaults: -h 20 -w 20 -m 2\n");
    }

  /* errflg is only tallied: a bad option is reported but does not abort
     the run. */

  /* prepare the synchronization array used by the collectives below */
  for (i = 0; i < _SHMEM_REDUCE_SYNC_SIZE; i += 1)
    pSync[i] = _SHMEM_SYNC_VALUE;

  shmem_barrier_all ();

  /* broadcast method to use (root is PE 0, active set is all p PEs) */
  shmem_broadcast32 (&meth, &meth, 1, 0, 0, 0, p, pSync);
  switch (meth)
    {
    case 1:
      method = &jacobi;
      break;
    case 2:
      method = &gauss_seidel;
      break;
    case 3:
      method = &sor;
      break;
    }

  /* let each processor decide what rows(s) it owns */
  my_start_row = get_start (my_rank);
  my_end_row = get_end (my_rank);
  my_num_rows = get_num_rows (my_rank);

  if (0 < verbose)
    printf ("proc %d contains (%d) rows %d to %d\n", my_rank, my_num_rows,
	    my_start_row, my_end_row);
  fflush (stdout);

  /* row width, computed once now that the command line has been applied */
  cols = (int) floor (WIDTH / H);

  /* allocate both 2d arrays: a table of row pointers over one contiguous
     slab of cells */
  U_Curr = (float **) malloc (sizeof (float *) * my_num_rows);
  curr_cells = (float *) malloc (sizeof (float) * my_num_rows * cols);
  U_Next = (float **) malloc (sizeof (float *) * my_num_rows);
  next_cells = (float *) malloc (sizeof (float) * my_num_rows * cols);
  if (U_Curr == NULL || curr_cells == NULL
      || U_Next == NULL || next_cells == NULL)
    {
      fprintf (stderr, "proc %d: unable to allocate grid storage\n",
	       my_rank);
      exit (EXIT_FAILURE);
    }

  U_Curr[0] = curr_cells;
  U_Next[0] = next_cells;
  for (i = 1; i < my_num_rows; i++)
    {
      U_Curr[i] = U_Curr[i - 1] + cols;
      U_Next[i] = U_Next[i - 1] + cols;
    }

  /* initialize global grid */
  init_domain (U_Curr, my_rank);
  init_domain (U_Next, my_rank);

  /* iterate for solution; only ROOT keeps the start timestamp */
  if (my_rank == ROOT)
    {
      tv[0] = gettime ();
    }
  k = 1;
  while (1)
    {
      method (U_Curr, U_Next);

      local_convergence_sqd = get_convergence_sqd (U_Curr, U_Next, my_rank);

      /* sum the per-PE squared convergence values across all PEs */
      shmem_barrier_all ();
      shmem_float_sum_to_all (&convergence_sqd, &local_convergence_sqd, 1, 0,
			      0, p, pWrk, pSync);
      if (my_rank == ROOT)
	{
	  convergence = sqrt (convergence_sqd);
	  if (verbose == 1)
	    {
	      printf ("L2 = %f\n", convergence);
	    }
	}

      /* broadcast ROOT's convergence value so every PE takes the same
         exit decision */
      shmem_barrier_all ();
      shmem_broadcast32 (&convergence, &convergence, 1, 0, 0, 0, p, pSync);
      if (convergence <= EPSILON)
	{
	  break;
	}

      /* copy U_Next to U_Curr */
      for (j = my_start_row; j <= my_end_row; j++)
	{
	  for (i = 0; i < cols; i++)
	    {
	      U_Curr[j - my_start_row][i] = U_Next[j - my_start_row][i];
	    }
	}
      k++;
      shmem_barrier_all ();
    }

  /* say something at the end */
  if (my_rank == ROOT)
    {
      tv[1] = gettime ();
      t = dt (&tv[1], &tv[0]);
      printf
	("Estimated time to convergence in %d iterations using %d processors on a %dx%d grid is %f seconds\n",
	 k, p, cols, (int) floor (HEIGHT / H),
	 t / 1000000.0);
    }

  /* release the grid storage allocated above */
  free (next_cells);
  free (U_Next);
  free (curr_cells);
  free (U_Curr);

  exit (EXIT_SUCCESS);
  return 0;
}