/*
 * Time a stream of shmem_longlong_inc operations and report the result.
 *
 * PEs with v.me < v.pairs issue `iterations` increments, one per buffer
 * element, targeting PE v.nxtpe.  All PEs take part in the barrier and in
 * the two sum reductions over v.npes PEs; the summed figures are handed to
 * print_operation_rate().
 *
 * v          - per-PE settings; me, pairs, nxtpe and npes are read here
 * buffer     - array of at least ITERATIONS elements (the whole array is
 *              touched before timing); elements [0, iterations) are targeted
 * iterations - number of increments to time
 *
 * Always returns 0.
 */
double
benchmark_inc_longlong (struct pe_vars v, union data_types *buffer,
                        unsigned long iterations)
{
    int64_t begin, end;
    /* Same type as `iterations`: a signed int index would overflow for
     * counts above INT_MAX and forces a signed/unsigned comparison. */
    unsigned long i;
    /* Static storage so the objects can be used as reduction source/target. */
    static double rate = 0, sum_rate = 0, lat = 0, sum_lat = 0;

    /*
     * Touch memory
     */
    memset(buffer, CHAR_MAX * drand48(), sizeof(union data_types
                [ITERATIONS]));

    shmem_barrier_all();

    if (v.me < v.pairs) {
        begin = TIME();
        for (i = 0; i < iterations; i++) {
            shmem_longlong_inc(&(buffer[i].longlong_type), v.nxtpe);
        }
        end = TIME();

        /* NOTE(review): the 1e6 factor suggests TIME() counts microseconds
         * -- confirm against its definition. */
        rate = ((double)iterations * 1e6) / (end - begin);
        lat = (end - begin) / (double)iterations;
    }

    /* Non-timing PEs contribute their (static, initially zero) values. */
    shmem_double_sum_to_all(&sum_rate, &rate, 1, 0, 0, v.npes, pwrk1, psync1);
    shmem_double_sum_to_all(&sum_lat, &lat, 1, 0, 0, v.npes, pwrk2, psync2);
    print_operation_rate(v.me, "shmem_longlong_inc", sum_rate/1e6, sum_lat/v.pairs);

    return 0;
}
Пример #2
0
static void
test_prepost(void)
{
    int i, j, k;

    tmp = 0;
    total = 0;

    shmem_barrier_all();

    for (i = 0 ; i < niters - 1 ; ++i) {
        cache_invalidate();

        shmem_barrier_all();

        tmp = timer();
        for (j = 0 ; j < npeers ; ++j) {
            for (k = 0 ; k < nmsgs ; ++k) {
                shmem_putmem(recv_buf + (nbytes * (k + j * nmsgs)), 
                             send_buf + (nbytes * (k + j * nmsgs)), 
                             nbytes, send_peers[npeers - j - 1]);
            }
        }
        shmem_quiet();
        shmem_short_wait((short*) (recv_buf + (nbytes * ((nmsgs - 1) + (npeers - 1) * nmsgs))), 0);
        total += (timer() - tmp);
        memset(recv_buf, 0, npeers * nmsgs * nbytes);
    }

    shmem_double_sum_to_all(&tmp, &total, 1, 0, 0, world_size, reduce_pWrk, reduce_pSync);
    display_result("pre-post", (niters * npeers * nmsgs * 2) / (tmp / world_size));
}
Пример #3
0
/*
 * Run the message-rate benchmark for every power-of-two message size from
 * 1 up to MAX_MSG_SZ and print the rate summed over all v.npes PEs.
 *
 * v          - per-PE settings; me, pairs, nxtpe and npes are read here
 * msg_buffer - buffer used as both source and destination of the puts
 */
void
benchmark (struct pe_vars v, long * msg_buffer)
{
    /* NOTE(review): OpenSHMEM sizes the reduction work array with
     * _SHMEM_REDUCE_MIN_WRKDATA_SIZE, not the sync-size constant --
     * confirm this array is large enough on the target implementation. */
    static double pwrk[_SHMEM_REDUCE_SYNC_SIZE];
    /* psync is handed to a reduction, so it needs the reduction sync size. */
    static long psync[_SHMEM_REDUCE_SYNC_SIZE];
    static double mr, mr_sum;
    unsigned long size, i;

    /* Element-wise initialisation: memset() would replicate a single byte
     * and only yields _SHMEM_SYNC_VALUE if all of its bytes are equal. */
    for (i = 0; i < _SHMEM_REDUCE_SYNC_SIZE; i++) {
        psync[i] = _SHMEM_SYNC_VALUE;
    }

    /*
     * Warmup: each call puts MAX_MSG_SZ longs starting at element i.
     */
    if (v.me < v.pairs) {
        for (i = 0; i < ITERS_LARGE; i += 1) {
            shmem_long_put(&msg_buffer[i], &msg_buffer[i], MAX_MSG_SZ, v.nxtpe);
        }
    }

    /* Also guarantees psync is initialised on every PE before first use. */
    shmem_barrier_all();

    /*
     * Benchmark
     */
    for (size = 1; size <= MAX_MSG_SZ; size <<= 1) {
        /* Iteration count depends on whether the size is below the
         * large-message threshold. */
        i = size < LARGE_THRESHOLD ? ITERS_SMALL : ITERS_LARGE;

        mr = message_rate(v, msg_buffer, size, i);
        shmem_double_sum_to_all(&mr_sum, &mr, 1, 0, 0, v.npes, pwrk, psync);
        print_message_rate(v.me, size, mr_sum);
    }
}
Пример #4
0
/*inline*/ void globalSum_double(LSMSCommunication &comm,double &a, int n)
{
  // Sum-reduce `a` in place across the PEs described by comm.comm
  // (start_pe, logPE_stride, size).  The current value is first copied to a
  // static staging variable that serves as the reduction source.
  // NOTE(review): the staging variable is a single double, yet `n` is
  // forwarded as the element count -- confirm callers only pass n == 1.
  static double staged;

  shmem_barrier(comm.comm.start_pe,
                comm.comm.logPE_stride,
                comm.comm.size,
                pSync1);

  staged = a;

  shmem_double_sum_to_all(&a, &staged, n,
                          comm.comm.start_pe,
                          comm.comm.logPE_stride,
                          comm.comm.size,
                          pWrk_d, pSync2);
}
Пример #5
0
/*inline*/ void globalSum_real(LSMSCommunication &comm,double *a, int n)
{
  // Sum-reduce the n doubles at `a` in place across the PEs described by
  // comm.comm.  A temporary buffer obtained from shmalloc() holds a copy of
  // the input and acts as the reduction source; it is released afterwards.
  shmem_barrier(comm.comm.start_pe,
                comm.comm.logPE_stride,
                comm.comm.size,
                pSync1);

  // NOTE(review): the shmalloc() result is used without a NULL check.
  double *scratch = static_cast<double*>(shmalloc(n*sizeof(double)));
  memcpy(scratch, a, n*sizeof(double));

  shmem_double_sum_to_all(a, scratch, n,
                          comm.comm.start_pe,
                          comm.comm.logPE_stride,
                          comm.comm.size,
                          pWrk_d, pSync2);

  shfree(scratch);
}
/*
 * Time a stream of shmem_longlong_cswap operations and report the result.
 *
 * PEs with v.me < v.pairs issue `iterations` compare-and-swaps, one per
 * buffer element, targeting PE v.nxtpe with compare value v.nxtpe and a
 * random replacement value.  All PEs take part in the barrier and in the
 * two sum reductions over v.npes PEs; the summed figures are handed to
 * print_operation_rate().
 *
 * v          - per-PE settings; me, pairs, nxtpe and npes are read here
 * buffer     - array of at least ITERATIONS elements (all are initialised
 *              before timing); elements [0, iterations) are targeted
 * iterations - number of compare-and-swaps to time
 *
 * Always returns 0.
 */
double
benchmark_cswap_longlong (struct pe_vars v, union data_types *buffer,
                          unsigned long iterations)
{
    int64_t begin, end;
    /* Same type as `iterations`: a signed int index would overflow for
     * counts above INT_MAX and forces a signed/unsigned comparison. */
    unsigned long i;
    /* Static storage so the objects can be used as reduction source/target. */
    static double rate = 0, sum_rate = 0, lat = 0, sum_lat = 0;

    /*
     * Touch memory
     */
    for (i = 0; i < ITERATIONS; i++) {
        buffer[i].int_type = v.me;
    }

    shmem_barrier_all();

    if (v.me < v.pairs) {
        long long cond = v.nxtpe;
        long long value = INT_MAX * drand48();
        long long old_value = 0;

        begin = TIME();
        for (i = 0; i < iterations; i++) {
            old_value = shmem_longlong_cswap(&(buffer[i].longlong_type), cond, value, v.nxtpe);
        }
        end = TIME();

        /* The fetched value is not inspected; only the timing matters. */
        (void) old_value;

        /* NOTE(review): the 1e6 factor suggests TIME() counts microseconds
         * -- confirm against its definition. */
        rate = ((double)iterations * 1e6) / (end - begin);
        lat = (end - begin) / (double)iterations;
    }

    /* Non-timing PEs contribute their (static, initially zero) values. */
    shmem_double_sum_to_all(&sum_rate, &rate, 1, 0, 0, v.npes, pwrk1, psync1);
    shmem_double_sum_to_all(&sum_lat, &lat, 1, 0, 0, v.npes, pwrk2, psync2);
    print_operation_rate(v.me, "shmem_longlong_cswap", sum_rate/1e6, sum_lat/v.pairs);

    return 0;
}
Пример #7
0
static void
test_one_way(void)
{
    int i, k;
    int pe_size  = world_size;

    tmp = 0;
    total = 0;

    shmem_barrier_all();

    if (world_size % 2 == 1) {
        pe_size = world_size - 1;
    }

    if (!(world_size % 2 == 1 && rank == (world_size - 1))) {
        if (rank < world_size / 2) {
            for (i = 0 ; i < niters ; ++i) {
                cache_invalidate();

                shmem_barrier(0, 0, pe_size, barrier_pSync);

                tmp = timer();
                for (k = 0 ; k < nmsgs ; ++k) {
                    shmem_putmem(recv_buf + (nbytes * k), 
                                 send_buf + (nbytes * k), 
                                 nbytes, rank + (world_size / 2));
                }
                shmem_quiet();
                total += (timer() - tmp);
            }
        } else {
            for (i = 0 ; i < niters ; ++i) {
                cache_invalidate();

                shmem_barrier(0, 0, pe_size, barrier_pSync);

                tmp = timer();
                shmem_short_wait((short*) (recv_buf + (nbytes * (nmsgs - 1))), 0);
                total += (timer() - tmp);
                memset(recv_buf, 0, npeers * nmsgs * nbytes);
            }
        }

        shmem_double_sum_to_all(&tmp, &total, 1, 0, 0, pe_size, reduce_pWrk, reduce_pSync);
        display_result("single direction", (niters * nmsgs) / (tmp / world_size));
    }

    shmem_barrier_all();
}
Пример #8
0
/* Test function for type double.
 *
 * Runs shmem_double_sum_to_all over NREDUCE elements on the given active
 * set and checks the result.
 *
 *   Source, Target - reduction input and output arrays
 *   PE_start, logPE_stride, PE_size - active set passed to the reduction
 *   rstride - not needed for the check (every PE is expected to contribute
 *             SINIT); kept for interface compatibility
 *   SameST  - non-zero when Source and Target are the same array; the
 *             "Source unchanged" check is skipped in that case
 *   pWrk, pSync - work and sync arrays passed to the reduction
 *   Case    - label included in failure messages
 *
 * Returns the number of detected errors (0 on success): the value returned
 * by check_sval_notchanged() plus the number of wrong Target elements.
 */
int sum_double(double *Source, double *Target, int PE_start, int logPE_stride, int PE_size, int rstride, int SameST, double *pWrk, long *pSync, char *Case)
{
   int my_pe;
   int i,j,fail,n_err,ret_val;
   double chtarget,diff;

   (void)rstride;

   my_pe = shmem_my_pe();
   ret_val=0;

   shmem_double_sum_to_all(Target, Source, NREDUCE, PE_start, logPE_stride, PE_size, pWrk, pSync);

/* check that values of pSync have been restored to the original values */
/*
   fail=0;
   n_err=0;
   for(i=0;i<_SHMEM_REDUCE_SYNC_SIZE;i++)
      if(pSync[i]!=_SHMEM_SYNC_VALUE) {
	 if(!fail) printf("FAIL pSync[%d]=%f (was %f) in process %d (in the active set), Case: %s\n",i,pSync[i],_SHMEM_SYNC_VALUE,my_pe,Case);
	 fail=1;
	 n_err++;
      }
   if(fail) ret_val+=n_err;
*/
/* if Source is different from Target check that values of Source have not been changed */
   if(!SameST) ret_val+=check_sval_notchanged(Source,Case);

/* Expected value: SINIT accumulated once per PE of the active set.  It is
   the same for every element, so it is computed once (same summation order
   as a per-element recomputation). */
   chtarget=0.;
   for(j=0;j<PE_size;j++)
      chtarget+=SINIT;

/* Check the values of Target */
   fail=0;
   n_err=0;
   for(i=0;i<NREDUCE;i++)
   {
      /* Floating-point magnitude of the error.  The integer abs() must not
         be used here: it truncates the double difference to int, so any
         error smaller than 1.0 would go unnoticed. */
      diff=Target[i]-chtarget;
      if(diff<0.) diff=-diff;
      if(diff>1.e-8) {
         /* Only the first mismatch is printed; all of them are counted. */
         if(!fail) printf("FAIL Target[%d]=%f (should be %f) in process %d (in the active set), Case: %s\n",i,Target[i],chtarget,my_pe,Case);
         fail=1;
         n_err++;
      }
   }
   if(fail) ret_val+=n_err;

   return ret_val;
}
Пример #9
0
int main(int argc, char ** argv) {
 
  int    Num_procs;       /* number of ranks                                     */
  int    Num_procsx, Num_procsy; /* number of ranks in each coord direction      */
  int    my_ID;           /* SHMEM rank                                          */
  int    my_IDx, my_IDy;  /* coordinates of rank in rank grid                    */
  int    right_nbr;       /* global rank of right neighboring tile               */
  int    left_nbr;        /* global rank of left neighboring tile                */
  int    top_nbr;         /* global rank of top neighboring tile                 */
  int    bottom_nbr;      /* global rank of bottom neighboring tile              */
  DTYPE *top_buf_out;     /* communication buffer                                */
  DTYPE *top_buf_in[2];   /*       "         "                                   */
  DTYPE *bottom_buf_out;  /*       "         "                                   */
  DTYPE *bottom_buf_in[2];/*       "         "                                   */
  DTYPE *right_buf_out;   /*       "         "                                   */
  DTYPE *right_buf_in[2]; /*       "         "                                   */
  DTYPE *left_buf_out;    /*       "         "                                   */
  DTYPE *left_buf_in[2];  /*       "         "                                   */
  int    root = 0;
  int    n, width, height;/* linear global and local grid dimension              */
  int    i, j, ii, jj, kk, it, jt, iter, leftover;  /* dummies                   */
  int    istart, iend;    /* bounds of grid tile assigned to calling rank        */
  int    jstart, jend;    /* bounds of grid tile assigned to calling rank        */
  DTYPE  reference_norm;
  DTYPE  f_active_points; /* interior of grid with respect to stencil            */
  int    stencil_size;    /* number of points in the stencil                     */
  DTYPE  flops;           /* floating point ops per iteration                    */
  int    iterations;      /* number of times to run the algorithm                */
  double avgtime,         /* timing parameters                                   */
         *local_stencil_time, *stencil_time; 
  DTYPE  * RESTRICT in;   /* input grid values                                   */
  DTYPE  * RESTRICT out;  /* output grid values                                  */
  long   total_length_in; /* total required length to store input array          */
  long   total_length_out;/* total required length to store output array         */
  int    error=0;         /* error flag                                          */
  DTYPE  weight[2*RADIUS+1][2*RADIUS+1]; /* weights of points in the stencil     */
  int    *arguments;      /* command line parameters                             */
  int    count_case=4;    /* number of neighbors of a rank                       */
  long   *pSync_bcast;    /* work space for collectives                          */
  long   *pSync_reduce;   /* work space for collectives                          */
  double *pWrk_time;      /* work space for collectives                          */
  DTYPE  *pWrk_norm;      /* work space for collectives                          */
  int    *iterflag;       /* synchronization flags                               */
  int    sw;              /* double buffering switch                             */
  DTYPE  *local_norm, *norm; /* local and global error norms                     */

  /*******************************************************************************
  ** Initialize the SHMEM environment
  ********************************************************************************/
  prk_shmem_init();

  my_ID=prk_shmem_my_pe();
  Num_procs=prk_shmem_n_pes();

  pSync_bcast        = (long *)   prk_shmem_malloc(PRK_SHMEM_BCAST_SYNC_SIZE*sizeof(long));
  pSync_reduce       = (long *)   prk_shmem_malloc(PRK_SHMEM_REDUCE_SYNC_SIZE*sizeof(long));
  pWrk_time          = (double *) prk_shmem_malloc(PRK_SHMEM_REDUCE_MIN_WRKDATA_SIZE*sizeof(double));
  pWrk_norm          = (DTYPE *)  prk_shmem_malloc(PRK_SHMEM_REDUCE_MIN_WRKDATA_SIZE*sizeof(DTYPE));
  local_stencil_time = (double *) prk_shmem_malloc(sizeof(double));
  stencil_time       = (double *) prk_shmem_malloc(sizeof(double));
  local_norm         = (DTYPE *)  prk_shmem_malloc(sizeof(DTYPE));
  norm               = (DTYPE *)  prk_shmem_malloc(sizeof(DTYPE));
  iterflag           = (int *)    prk_shmem_malloc(2*sizeof(int));
  if (!(pSync_bcast && pSync_reduce && pWrk_time && pWrk_norm && iterflag &&
	local_stencil_time && stencil_time && local_norm && norm))
  {
    printf("Could not allocate scalar variables on rank %d\n", my_ID);
    error = 1;
  }
  bail_out(error);

  for(i=0;i<PRK_SHMEM_BCAST_SYNC_SIZE;i++)
    pSync_bcast[i]=PRK_SHMEM_SYNC_VALUE;

  for(i=0;i<PRK_SHMEM_REDUCE_SYNC_SIZE;i++)
    pSync_reduce[i]=PRK_SHMEM_SYNC_VALUE;

  arguments=(int*)prk_shmem_malloc(2*sizeof(int));
 
  /*******************************************************************************
  ** process, test, and broadcast input parameters    
  ********************************************************************************/
 
  if (my_ID == root) {
#ifndef STAR
    printf("ERROR: Compact stencil not supported\n");
    error = 1;
    goto ENDOFTESTS;
#endif
      
    if (argc != 3){
      printf("Usage: %s <# iterations> <array dimension> \n", 
             *argv);
      error = 1;
      goto ENDOFTESTS;
    }
 
    iterations  = atoi(*++argv); 
    arguments[0]=iterations;

    if (iterations < 1){
      printf("ERROR: iterations must be >= 1 : %d \n",iterations);
      error = 1;
      goto ENDOFTESTS;  
    }
 
    n  = atoi(*++argv);
    arguments[1]=n;
    long nsquare = (long)n * (long)n;

    if (nsquare < Num_procs){ 
      printf("ERROR: grid size must be at least # ranks: %ld\n", nsquare);
      error = 1;
      goto ENDOFTESTS;
    }
 
    if (RADIUS < 0) {
      printf("ERROR: Stencil radius %d should be non-negative\n", RADIUS);
      error = 1;
      goto ENDOFTESTS;  
    }
 
    if (2*RADIUS +1 > n) {
      printf("ERROR: Stencil radius %d exceeds grid size %d\n", RADIUS, n);
      error = 1;
      goto ENDOFTESTS;  
    }
 
    ENDOFTESTS:;  
  }
  bail_out(error);
 
  /* determine best way to create a 2D grid of ranks (closest to square, for 
     best surface/volume ratio); we do this brute force for now
  */
  for (Num_procsx=(int) (sqrt(Num_procs+1)); Num_procsx>0; Num_procsx--) {
    if (!(Num_procs%Num_procsx)) {
      Num_procsy = Num_procs/Num_procsx;
      break;
    }
  }      
  my_IDx = my_ID%Num_procsx;
  my_IDy = my_ID/Num_procsx;
  /* compute neighbors; don't worry about dropping off the edges of the grid */
  right_nbr  = my_ID+1;
  left_nbr   = my_ID-1;
  top_nbr    = my_ID+Num_procsx;
  bottom_nbr = my_ID-Num_procsx;

  iterflag[0] = iterflag[1] = 0;

  if(my_IDx==0)            count_case--;
  if(my_IDx==Num_procsx-1) count_case--;
  if(my_IDy==0)            count_case--;
  if(my_IDy==Num_procsy-1) count_case--;
 
  if (my_ID == root) {
    printf("Parallel Research Kernels version %s\n", PRKVERSION);
    printf("SHMEM stencil execution on 2D grid\n");
    printf("Number of ranks        = %d\n", Num_procs);
    printf("Grid size              = %d\n", n);
    printf("Radius of stencil      = %d\n", RADIUS);
    printf("Tiles in x/y-direction = %d/%d\n", Num_procsx, Num_procsy);
    printf("Type of stencil        = star\n");
#ifdef DOUBLE
    printf("Data type              = double precision\n");
#else
    printf("Data type              = single precision\n");
#endif
#if LOOPGEN
    printf("Script used to expand stencil loop body\n");
#else
    printf("Compact representation of stencil loop body\n");
#endif
#if SPLITFENCE
    printf("Split fence            = ON\n");
#else
    printf("Split fence            = OFF\n");
#endif
    printf("Number of iterations   = %d\n", iterations);
  }

  shmem_barrier_all();
 
  shmem_broadcast32(&arguments[0], &arguments[0], 2, root, 0, 0, Num_procs, pSync_bcast);

  iterations=arguments[0];
  n=arguments[1];

  shmem_barrier_all();
  prk_shmem_free(arguments);
 
  /* compute amount of space required for input and solution arrays             */
  
  width = n/Num_procsx;
  leftover = n%Num_procsx;
  if (my_IDx<leftover) {
    istart = (width+1) * my_IDx; 
    iend = istart + width + 1;
  }
  else {
    istart = (width+1) * leftover + width * (my_IDx-leftover);
    iend = istart + width;
  }
  
  width = iend - istart + 1;
  if (width == 0) {
    printf("ERROR: rank %d has no work to do\n", my_ID);
    error = 1;
  }
  bail_out(error);
 
  height = n/Num_procsy;
  leftover = n%Num_procsy;
  if (my_IDy<leftover) {
    jstart = (height+1) * my_IDy; 
    jend = jstart + height + 1;
  }
  else {
    jstart = (height+1) * leftover + height * (my_IDy-leftover);
    jend = jstart + height;
  }
  
  height = jend - jstart + 1;
  if (height == 0) {
    printf("ERROR: rank %d has no work to do\n", my_ID);
    error = 1;
  }
  bail_out(error);
 
  if (width < RADIUS || height < RADIUS) {
    printf("ERROR: rank %d has work tile smaller then stencil radius\n",
           my_ID);
    error = 1;
  }
  bail_out(error);
 
  total_length_in = (width+2*RADIUS);
  total_length_in *= (height+2*RADIUS);
  total_length_in *= sizeof(DTYPE);

  total_length_out = width;
  total_length_out *= height;
  total_length_out *= sizeof(DTYPE);
 
  in  = (DTYPE *) malloc(total_length_in);
  out = (DTYPE *) malloc(total_length_out);
  if (!in || !out) {
    printf("ERROR: rank %d could not allocate space for input/output array\n",
            my_ID);
    error = 1;
  }
  bail_out(error);
 
  /* fill the stencil weights to reflect a discrete divergence operator         */
  for (jj=-RADIUS; jj<=RADIUS; jj++) for (ii=-RADIUS; ii<=RADIUS; ii++)
    WEIGHT(ii,jj) = (DTYPE) 0.0;
  stencil_size = 4*RADIUS+1;

  for (ii=1; ii<=RADIUS; ii++) {
    WEIGHT(0, ii) = WEIGHT( ii,0) =  (DTYPE) (1.0/(2.0*ii*RADIUS));
    WEIGHT(0,-ii) = WEIGHT(-ii,0) = -(DTYPE) (1.0/(2.0*ii*RADIUS));
  }
 
  norm[0] = (DTYPE) 0.0;
  f_active_points = (DTYPE) (n-2*RADIUS)*(DTYPE) (n-2*RADIUS);

  /* intialize the input and output arrays                                     */
  for (j=jstart; j<jend; j++) for (i=istart; i<iend; i++) {
    IN(i,j)  = COEFX*i+COEFY*j;
    OUT(i,j) = (DTYPE)0.0;
  }

  /* allocate communication buffers for halo values                            */
  top_buf_out=(DTYPE*)malloc(2*sizeof(DTYPE)*RADIUS*width);
  if (!top_buf_out) {
    printf("ERROR: Rank %d could not allocate output comm buffers for y-direction\n", my_ID);
    error = 1;
  }
  bail_out(error);
  bottom_buf_out = top_buf_out+RADIUS*width;

  top_buf_in[0]=(DTYPE*)prk_shmem_malloc(4*sizeof(DTYPE)*RADIUS*width);
  if(!top_buf_in)
  {
    printf("ERROR: Rank %d could not allocate input comm buffers for y-direction\n", my_ID);
    error=1;
  }
  bail_out(error);
  top_buf_in[1]    = top_buf_in[0]    + RADIUS*width;
  bottom_buf_in[0] = top_buf_in[1]    + RADIUS*width;
  bottom_buf_in[1] = bottom_buf_in[0] + RADIUS*width;
 
  right_buf_out=(DTYPE*)malloc(2*sizeof(DTYPE)*RADIUS*height);
  if (!right_buf_out) {
    printf("ERROR: Rank %d could not allocate output comm buffers for x-direction\n", my_ID);
    error = 1;
  }
  bail_out(error);
  left_buf_out=right_buf_out+RADIUS*height;

  right_buf_in[0]=(DTYPE*)prk_shmem_malloc(4*sizeof(DTYPE)*RADIUS*height);
  if(!right_buf_in)
  {
    printf("ERROR: Rank %d could not allocate input comm buffers for x-dimension\n", my_ID);
    error=1;
  }
  bail_out(error);
  right_buf_in[1] = right_buf_in[0] + RADIUS*height;
  left_buf_in[0]  = right_buf_in[1] + RADIUS*height;
  left_buf_in[1]  = left_buf_in[0]  + RADIUS*height;

  /* make sure all symmetric heaps are allocated before being used  */
  shmem_barrier_all();

  for (iter = 0; iter<=iterations; iter++){

    /* start timer after a warmup iteration */
    if (iter == 1) { 
      shmem_barrier_all();
      local_stencil_time[0] = wtime();
    }
    /* sw determines which incoming buffer to select */
    sw = iter%2;

    /* need to fetch ghost point data from neighbors */

    if (my_IDy < Num_procsy-1) {
      for (kk=0,j=jend-RADIUS; j<=jend-1; j++) for (i=istart; i<=iend; i++) {
          top_buf_out[kk++]= IN(i,j);
      }
      shmem_putmem(bottom_buf_in[sw], top_buf_out, RADIUS*width*sizeof(DTYPE), top_nbr);
#if SPLITFENCE
      shmem_fence();
      shmem_int_inc(&iterflag[sw], top_nbr);
#endif
    }
    if (my_IDy > 0) {
      for (kk=0,j=jstart; j<=jstart+RADIUS-1; j++) for (i=istart; i<=iend; i++) {
          bottom_buf_out[kk++]= IN(i,j);
      }
      shmem_putmem(top_buf_in[sw], bottom_buf_out, RADIUS*width*sizeof(DTYPE), bottom_nbr);
#if SPLITFENCE
      shmem_fence();
      shmem_int_inc(&iterflag[sw], bottom_nbr);
#endif
    }

    if(my_IDx < Num_procsx-1) {
      for(kk=0,j=jstart;j<=jend;j++) for(i=iend-RADIUS;i<=iend-1;i++) {
	right_buf_out[kk++]=IN(i,j);
      }
      shmem_putmem(left_buf_in[sw], right_buf_out, RADIUS*height*sizeof(DTYPE), right_nbr);
#if SPLITFENCE
      shmem_fence();
      shmem_int_inc(&iterflag[sw], right_nbr);
#endif
    }

    if(my_IDx>0) {
      for(kk=0,j=jstart;j<=jend;j++) for(i=istart;i<=istart+RADIUS-1;i++) {
	left_buf_out[kk++]=IN(i,j);
      }
      shmem_putmem(right_buf_in[sw], left_buf_out, RADIUS*height*sizeof(DTYPE), left_nbr);
#if SPLITFENCE
      shmem_fence();
      shmem_int_inc(&iterflag[sw], left_nbr);
#endif
    }

#if SPLITFENCE == 0
    shmem_fence();
    if(my_IDy<Num_procsy-1) shmem_int_inc(&iterflag[sw], top_nbr);
    if(my_IDy>0)            shmem_int_inc(&iterflag[sw], bottom_nbr);
    if(my_IDx<Num_procsx-1) shmem_int_inc(&iterflag[sw], right_nbr);
    if(my_IDx>0)            shmem_int_inc(&iterflag[sw], left_nbr);
#endif

    shmem_int_wait_until(&iterflag[sw], SHMEM_CMP_EQ, count_case*(iter/2+1));

    if (my_IDy < Num_procsy-1) {
      for (kk=0,j=jend; j<=jend+RADIUS-1; j++) for (i=istart; i<=iend; i++) {
          IN(i,j) = top_buf_in[sw][kk++];
      }      
    }
    if (my_IDy > 0) {
      for (kk=0,j=jstart-RADIUS; j<=jstart-1; j++) for (i=istart; i<=iend; i++) {
          IN(i,j) = bottom_buf_in[sw][kk++];
      }      
    }

    if (my_IDx < Num_procsx-1) {
      for (kk=0,j=jstart; j<=jend; j++) for (i=iend; i<=iend+RADIUS-1; i++) {
          IN(i,j) = right_buf_in[sw][kk++];
      }      
    }
    if (my_IDx > 0) {
      for (kk=0,j=jstart; j<=jend; j++) for (i=istart-RADIUS; i<=istart-1; i++) {
          IN(i,j) = left_buf_in[sw][kk++];
      }      
    }
 
    /* Apply the stencil operator */
    for (j=MAX(jstart,RADIUS); j<=MIN(n-RADIUS-1,jend); j++) {
      for (i=MAX(istart,RADIUS); i<=MIN(n-RADIUS-1,iend); i++) {
        #if LOOPGEN
          #include "loop_body_star.incl"
        #else
          for (jj=-RADIUS; jj<=RADIUS; jj++) OUT(i,j) += WEIGHT(0,jj)*IN(i,j+jj);
          for (ii=-RADIUS; ii<0; ii++)       OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j);
          for (ii=1; ii<=RADIUS; ii++)       OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j);
        #endif
      }
    }
 
    /* add constant to solution to force refresh of neighbor data, if any */
    for (j=jstart; j<jend; j++) for (i=istart; i<iend; i++) IN(i,j)+= 1.0;
 
  }
 
  local_stencil_time[0] = wtime() - local_stencil_time[0];

  shmem_barrier_all();

  shmem_double_max_to_all(&stencil_time[0], &local_stencil_time[0], 1, 0, 0,
                          Num_procs, pWrk_time, pSync_reduce);
  
  /* compute L1 norm in parallel                                                */
  local_norm[0] = (DTYPE) 0.0;
  for (j=MAX(jstart,RADIUS); j<MIN(n-RADIUS,jend); j++) {
    for (i=MAX(istart,RADIUS); i<MIN(n-RADIUS,iend); i++) {
      local_norm[0] += (DTYPE)ABS(OUT(i,j));
    }
  }

  shmem_barrier_all();
 
#ifdef DOUBLE
  shmem_double_sum_to_all(&norm[0], &local_norm[0], 1, 0, 0, Num_procs, pWrk_norm, pSync_reduce);
#else
  shmem_float_sum_to_all(&norm[0], &local_norm[0], 1, 0, 0, Num_procs, pWrk_norm, pSync_reduce);
#endif
 
  /*******************************************************************************
  ** Analyze and output results.
  ********************************************************************************/
 
/* verify correctness                                                            */
  if (my_ID == root) {
    norm[0] /= f_active_points;
    if (RADIUS > 0) {
      reference_norm = (DTYPE) (iterations+1) * (COEFX + COEFY);
    }
    else {
      reference_norm = (DTYPE) 0.0;
    }
    if (ABS(norm[0]-reference_norm) > EPSILON) {
      printf("ERROR: L1 norm = "FSTR", Reference L1 norm = "FSTR"\n",
             norm[0], reference_norm);
      error = 1;
    }
    else {
      printf("Solution validates\n");
#ifdef VERBOSE
      printf("Reference L1 norm = "FSTR", L1 norm = "FSTR"\n", 
             reference_norm, norm[0]);
#endif
    }
  }
  bail_out(error);
 
  if (my_ID == root) {
    /* flops/stencil: 2 flops (fma) for each point in the stencil, 
       plus one flop for the update of the input of the array        */
    flops = (DTYPE) (2*stencil_size+1) * f_active_points;
    avgtime = stencil_time[0]/iterations;
    printf("Rate (MFlops/s): "FSTR"  Avg time (s): %lf\n",
           1.0E-06 * flops/avgtime, avgtime);
  }
 

  prk_shmem_free(top_buf_in);
  prk_shmem_free(right_buf_in);
  free(top_buf_out);
  free(right_buf_out);

  prk_shmem_free(pSync_bcast);
  prk_shmem_free(pSync_reduce);
  prk_shmem_free(pWrk_time);
  prk_shmem_free(pWrk_norm);

  prk_shmem_finalize();

  exit(EXIT_SUCCESS);
}
Пример #10
0
Файл: to_all.c Проект: caomw/SOS
/*
 * Exercise the seven typed sum_to_all reductions over all PEs.
 *
 * Every source element is set to the caller's PE number, so each result
 * element is expected to equal 0 + 1 + ... + (npes-1).  Only PE 0 checks
 * and reports the results; ok[t] is set to 1 when type t FAILED.
 *
 * Returns 1 when all seven reductions passed on the checking PE, else 0
 * (PEs other than 0 do not check and therefore return 0).
 */
int
sum_to_all(int me, int npes)
{
  int idx;
  int pass = 0;
  /* Expected value of every reduced element. */
  const int expected = npes * (npes-1)/2;

  memset(ok,0,sizeof(ok));

  for (idx = 0; idx < N; idx++) {
    src0[idx] = src1[idx] = src2[idx] = src3[idx] = src4[idx] = src5[idx] = src6[idx] = me;
    /* Sentinel that a correct reduction has to overwrite. */
    dst0[idx] = -9;
    dst1[idx] = -9;
    dst2[idx] = -9;
    dst3[idx] = -9;
    dst4[idx] = -9;
    dst5[idx] = -9;
    dst6[idx] = -9;
  }

  shmem_barrier_all();

  /* Consecutive reductions alternate between the two sync arrays. */
  shmem_short_sum_to_all(dst0, src0, N, 0, 0, npes, pWrk0, pSync);
  shmem_int_sum_to_all(dst1, src1, N, 0, 0, npes, pWrk1, pSync1);
  shmem_long_sum_to_all(dst2, src2, N, 0, 0, npes, pWrk2, pSync);
  shmem_float_sum_to_all(dst3, src3, N, 0, 0, npes, pWrk3, pSync1);
  shmem_double_sum_to_all(dst4, src4, N, 0, 0, npes, pWrk4, pSync);
  shmem_longdouble_sum_to_all(dst5, src5, N, 0, 0, npes, pWrk5, pSync1);
  shmem_longlong_sum_to_all(dst6, src6, N, 0, 0, npes, pWrk6, pSync);

  if (me == 0) {
    for (idx = 0; idx < N; idx++) {
      if (dst0[idx] != (short) expected) ok[0] = 1;
      if (dst1[idx] != (int) expected) ok[1] = 1;
      if (dst2[idx] != (long) expected) ok[2] = 1;
      if (dst3[idx] != (float) expected) ok[3] = 1;
      if (dst4[idx] != (double) expected) ok[4] = 1;
      if (dst5[idx] != (long double) expected) ok[5] = 1;
      if (dst6[idx] != (long long) expected) ok[6] = 1;
    }

    if (ok[0] != 1) {
      Vprintf("Reduction operation shmem_short_sum_to_all: Passed\n");
      pass++;
    }
    else {
      printf("Reduction operation shmem_short_sum_to_all: Failed\n");
    }

    if (ok[1] != 1) {
      Vprintf("Reduction operation shmem_int_sum_to_all: Passed\n");
      pass++;
    }
    else {
      printf("Reduction operation shmem_int_sum_to_all: Failed\n");
    }

    if (ok[2] != 1) {
      Vprintf("Reduction operation shmem_long_sum_to_all: Passed\n");
      pass++;
    }
    else {
      printf("Reduction operation shmem_long_sum_to_all: Failed\n");
    }

    if (ok[3] != 1) {
      Vprintf("Reduction operation shmem_float_sum_to_all: Passed\n");
      pass++;
    }
    else {
      printf("Reduction operation shmem_float_sum_to_all: Failed\n");
    }

    if (ok[4] != 1) {
      Vprintf("Reduction operation shmem_double_sum_to_all: Passed\n");
      pass++;
    }
    else {
      printf("Reduction operation shmem_double_sum_to_all: Failed\n");
    }

    if (ok[5] != 1) {
      Vprintf("Reduction operation shmem_longdouble_sum_to_all: Passed\n");
      pass++;
    }
    else {
      printf("Reduction operation shmem_longdouble_sum_to_all: Failed\n");
    }

    if (ok[6] != 1) {
      Vprintf("Reduction operation shmem_longlong_sum_to_all: Passed\n");
      pass++;
    }
    else {
      printf("Reduction operation shmem_longlong_sum_to_all: Failed\n");
    }

    Vprintf("\n"); fflush(stdout);
  }

  if (Serialize) shmem_barrier_all();

  return pass == 7;
}
Пример #11
0
int
main()
{
  int i,j;
  int me, npes;
  int success0, success1, success2, success3, success4, success5, success6;
  success0 = success1 = success2 = success3 = success4 = success5 = success6 = 0;

  start_pes(0);
  me = _my_pe();
  npes = _num_pes();

  for (i = 0; i < _SHMEM_REDUCE_SYNC_SIZE; i += 1) {
    pSync[i] = _SHMEM_SYNC_VALUE;
	pSync1[i] = _SHMEM_SYNC_VALUE;
  }

  for (i = 0; i < N; i += 1) {
    src0[i] = src1[i] = src2[i] = src3[i] = src4[i] = src5[i] = src6[i] = me + i;
 }
  
  /*Test MAX: shmem_double_max_to_all, shmem_float_max_to_all, shmem_int_max_to_all, shmem_long_max_to_all, shmem_longdouble_max_to_all, shmem_longlong_max_to_all, shmem_short_max_to_all */
  shmem_barrier_all();

  shmem_short_max_to_all(dst0, src0, N, 0, 0, npes, pWrk0, pSync);
  shmem_int_max_to_all(dst1, src1, N, 0, 0, npes, pWrk1, pSync1);
  shmem_long_max_to_all(dst2, src2, N, 0, 0, npes, pWrk2, pSync);
  shmem_float_max_to_all(dst3, src3, N, 0, 0, npes, pWrk3, pSync1);
  shmem_double_max_to_all(dst4, src4, N, 0, 0, npes, pWrk4, pSync);
  shmem_longdouble_max_to_all(dst5, src5, N, 0, 0, npes, pWrk5, pSync1);
  shmem_longlong_max_to_all(dst6, src6, N, 0, 0, npes, pWrk6, pSync);
  
  
  if(me == 0){
    for (i = 0,j=-1; i < N; i++,j++) {
      if(dst0[i] != npes+j)
        success0 =1;
	  if(dst1[i] != npes+j)
        success1 =1;
	  if(dst2[i] != npes+j)
        success2 =1;
	  if(dst3[i] != npes+j)
        success3 =1;
	  if(dst4[i] != npes+j)
        success4 =1;
	  if(dst5[i] != npes+j)
        success5 =1;
	  if(dst6[i] != npes+j)
        success6 =1;
    }
    if(success0==1){
      printf("Reduction operation shmem_short_max_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_short_max_to_all: Passed\n");
	}
	if(success1==1){
      printf("Reduction operation shmem_int_max_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_int_max_to_all: Passed\n");
	}
	if(success2==1){
      printf("Reduction operation shmem_long_max_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_long_max_to_all: Passed\n");
	}
	if(success3==1){
      printf("Reduction operation shmem_float_max_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_float_max_to_all: Passed\n");
	}
	if(success4==1){
      printf("Reduction operation shmem_double_max_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_double_max_to_all: Passed\n");
	}
	if(success5==1){
      printf("Reduction operation shmem_longdouble_max_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_longdouble_max_to_all: Passed\n");
	}
	if(success6==1){
      printf("Reduction operation shmem_longlong_max_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_longlong_max_to_all: Passed\n");
	}
	
  }
  
  
  /*Test MIN: shmem_double_min_to_all, shmem_float_min_to_all, shmem_int_min_to_all, shmem_long_min_to_all, shmem_longdouble_min_to_all, shmem_longlong_min_to_all, shmem_short_min_to_all*/
  success0 = success1 = success2 = success3 = success4 = success5 = success6 = 0;
  
  for (i = 0; i < N; i += 1) {
    src0[i] = src1[i] = src2[i] = src3[i] = src4[i] = src5[i] = src6[i] = me + i;
 }
 
  for (i = 0; i < N; i += 1) {
    dst0[i] = -9;
	dst1[i] = -9;
	dst2[i] = -9;
	dst3[i] = -9;
	dst4[i] = -9;
	dst5[i] = -9;
	dst6[i] = -9;
  }
   
  shmem_barrier_all();
  
  shmem_short_min_to_all(dst0, src0, N, 0, 0, npes, pWrk0, pSync);
  shmem_int_min_to_all(dst1, src1, N, 0, 0, npes, pWrk1, pSync1);
  shmem_long_min_to_all(dst2, src2, N, 0, 0, npes, pWrk2, pSync);
  shmem_float_min_to_all(dst3, src3, N, 0, 0, npes, pWrk3, pSync1);
  shmem_double_min_to_all(dst4, src4, N, 0, 0, npes, pWrk4, pSync);
  shmem_longdouble_min_to_all(dst5, src5, N, 0, 0, npes, pWrk5, pSync1);
  shmem_longlong_min_to_all(dst6, src6, N, 0, 0, npes, pWrk6, pSync);
  
  
  if(me == 0){
    for (i = 0; i < N; i++) {
      if(dst0[i] != i)
        success0 =1;
	  if(dst1[i] != i)
        success1 =1;
	  if(dst2[i] != i)
        success2 =1;
	  if(dst3[i] != i)
        success3 =1;
	  if(dst4[i] != i)
        success4 =1;
	  if(dst5[i] != i)
        success5 =1;
	  if(dst6[i] != i)
        success6 =1;
    }
    if(success0==1){
      printf("Reduction operation shmem_short_min_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_short_min_to_all: Passed\n");
	}
	if(success1==1){
      printf("Reduction operation shmem_int_min_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_int_min_to_all: Passed\n");
	}
	if(success2==1){
      printf("Reduction operation shmem_long_min_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_long_min_to_all: Passed\n");
	}
	if(success3==1){
      printf("Reduction operation shmem_float_min_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_float_min_to_all: Passed\n");
	}
	if(success4==1){
      printf("Reduction operation shmem_double_min_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_double_min_to_all: Passed\n");
	}
	if(success5==1){
      printf("Reduction operation shmem_longdouble_min_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_longdouble_min_to_all: Passed\n");
	}
	if(success6==1){
      printf("Reduction operation shmem_longlong_min_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_longlong_min_to_all: Passed\n");
	}
	
  }
  
  /*Test SUM: shmem_double_sum_to_all, shmem_float_sum_to_all, shmem_int_sum_to_all, shmem_long_sum_to_all, shmem_longdouble_sum_to_all, shmem_longlong_sum_to_all, shmem_short_sum_to_all*/
  success0 = success1 = success2 = success3 = success4 = success5 = success6 = 0;
  for (i = 0; i < N; i += 1) {
    src0[i] = src1[i] = src2[i] = src3[i] = src4[i] = src5[i] = src6[i] = me;
 }
  for (i = 0; i < N; i += 1) {
    dst0[i] = -9;
	dst1[i] = -9;
	dst2[i] = -9;
	dst3[i] = -9;
	dst4[i] = -9;
	dst5[i] = -9;
	dst6[i] = -9;
  }
  shmem_barrier_all();

  shmem_short_sum_to_all(dst0, src0, N, 0, 0, npes, pWrk0, pSync);
  shmem_int_sum_to_all(dst1, src1, N, 0, 0, npes, pWrk1, pSync1);
  shmem_long_sum_to_all(dst2, src2, N, 0, 0, npes, pWrk2, pSync);
  shmem_float_sum_to_all(dst3, src3, N, 0, 0, npes, pWrk3, pSync1);
  shmem_double_sum_to_all(dst4, src4, N, 0, 0, npes, pWrk4, pSync);
  shmem_longdouble_sum_to_all(dst5, src5, N, 0, 0, npes, pWrk5, pSync1);
  shmem_longlong_sum_to_all(dst6, src6, N, 0, 0, npes, pWrk6, pSync);

  
  if(me == 0){
    for (i = 0; i < N; i++) {
	  if(dst0[i] != (npes * (npes-1)/2))
        success0 =1;
	  if(dst1[i] != (npes * (npes-1)/2))
        success1 =1;
	  if(dst2[i] != (npes * (npes-1)/2))
        success2 =1;
	  if(dst3[i] != (npes * (npes-1)/2))
        success3 =1;
	  if(dst4[i] != (npes * (npes-1)/2))
        success4 =1;
	  if(dst5[i] != (npes * (npes-1)/2))
        success5 =1;
	  if(dst6[i] != (npes * (npes-1)/2))
        success6 =1;
    }
    if(success0==1){
      printf("Reduction operation shmem_short_sum_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_short_sum_to_all: Passed\n");
	}
	if(success1==1){
      printf("Reduction operation shmem_int_sum_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_int_sum_to_all: Passed\n");
	}
	if(success2==1){
      printf("Reduction operation shmem_long_sum_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_long_sum_to_all: Passed\n");
	}
	if(success3==1){
      printf("Reduction operation shmem_float_sum_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_float_sum_to_all: Passed\n");
	}
	if(success4==1){
      printf("Reduction operation shmem_double_sum_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_double_sum_to_all: Passed\n");
	}
	if(success5==1){
      printf("Reduction operation shmem_longdouble_sum_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_longdouble_sum_to_all: Passed\n");
	}
	if(success6==1){
      printf("Reduction operation shmem_longlong_sum_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_longlong_sum_to_all: Passed\n");
	}
	
  }
  
  /*Test AND: shmem_int_and_to_all, shmem_long_and_to_all, shmem_longlong_and_to_all, shmem_short_and_to_all,*/
  success0 = success1 = success2 = success6 = 0;
  for (i = 0; i < N; i += 1) {
    src0[i] = src1[i] = src2[i] = src6[i] = me;
 }
 for (i = 0; i < N; i += 1) {
    dst0[i] = -9;
	dst1[i] = -9;
	dst2[i] = -9;
	dst6[i] = -9;
  }
 
  shmem_barrier_all();
  
  shmem_short_and_to_all(dst0, src0, N, 0, 0, npes, pWrk0, pSync);
  shmem_int_and_to_all(dst1, src1, N, 0, 0, npes, pWrk1, pSync1);
  shmem_long_and_to_all(dst2, src2, N, 0, 0, npes, pWrk2, pSync);
  shmem_longlong_and_to_all(dst6, src6, N, 0, 0, npes, pWrk6, pSync1);
  
  
  if(me==0){
    for (i = 0; i < N; i++) {
	  if(dst0[i] != 0)
        success0 =1;
	  if(dst1[i] != 0)
        success1 =1;
	  if(dst2[i] != 0)
        success2 =1;
	 if(dst6[i] != 0)
        success6 =1;
    }
    if(success0==1){
      printf("Reduction operation shmem_short_and_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_short_and_to_all: Passed\n");
	}
	if(success1==1){
      printf("Reduction operation shmem_int_and_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_int_and_to_all: Passed\n");
	}
	if(success2==1){
      printf("Reduction operation shmem_long_and_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_long_and_to_all: Passed\n");
	}
	if(success6==1){
      printf("Reduction operation shmem_longlong_and_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_longlong_and_to_all: Passed\n");
	}
	
  }
  
 /*Test PROD: shmem_double_prod_to_all, shmem_float_prod_to_all, shmem_int_prod_to_all, shmem_long_prod_to_all, shmem_longdouble_prod_to_all, shmem_longlong_prod_to_all, shmem_short_prod_to_all, */
  
  success0 = success1 = success2 = success3 = success4 = success5 = success6 = 0;
  for (i = 0; i < N; i += 1) {
    src0[i] = src1[i] = src2[i] = src3[i] = src4[i] = src5[i] = src6[i] = me + 1;
 }
  for (i = 0; i < N; i += 1) {
    dst0[i] = -9;
	dst1[i] = -9;
	dst2[i] = -9;
	dst3[i] = -9;
	dst4[i] = -9;
	dst5[i] = -9;
	dst6[i] = -9;
  }
  
  expected_result0 = expected_result1 = expected_result2 = expected_result3 = expected_result4 = expected_result5 = expected_result6 =1;
  for(i=1;i<=npes;i++){
    expected_result0 = expected_result0 * i;
	expected_result1 = expected_result1 * i;
	expected_result2 = expected_result2 * i;
	expected_result3 = expected_result3 * i;
	expected_result4 = expected_result4 * i;
	expected_result5 = expected_result5 * i;
	expected_result6 = expected_result6 * i;
  }
   
  shmem_barrier_all();
 
  shmem_short_prod_to_all(dst0, src0, N, 0, 0, npes, pWrk0, pSync);
  shmem_int_prod_to_all(dst1, src1, N, 0, 0, npes, pWrk1, pSync1);
  shmem_long_prod_to_all(dst2, src2, N, 0, 0, npes, pWrk2, pSync);
  shmem_float_prod_to_all(dst3, src3, N, 0, 0, npes, pWrk3, pSync1);
  shmem_double_prod_to_all(dst4, src4, N, 0, 0, npes, pWrk4, pSync);
  shmem_longdouble_prod_to_all(dst5, src5, N, 0, 0, npes, pWrk5, pSync1);
  shmem_longlong_prod_to_all(dst6, src6, N, 0, 0, npes, pWrk6, pSync);

 
  if(me == 0){
    for (i = 0; i < N; i++) {
	 /*printf("dst2[%d]: %ld, expected val: %ld\n",i, dst2[i], (long)expected_result2);*/
      if(dst0[i] != expected_result0)
        success0 =1;
	  if(dst1[i] != expected_result1)
        success1 =1;
	  if(dst2[i] != expected_result2)
        success2 =1;
	  if(dst3[i] != expected_result3)
        success3 =1;
	  if(dst4[i] != expected_result4)
        success4 =1;
	  if(dst5[i] != expected_result5)
        success5 =1;
	  if(dst6[i] != expected_result6)
        success6 =1;
    }
    if(success0==1){
      printf("Reduction operation shmem_short_prod_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_short_prod_to_all: Passed\n");
	}
	if(success1==1){
      printf("Reduction operation shmem_int_prod_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_int_prod_to_all: Passed\n");
	}
	if(success2==1){
      printf("Reduction operation shmem_long_prod_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_long_prod_to_all: Passed\n");
	}
	if(success3==1){
      printf("Reduction operation shmem_float_prod_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_float_prod_to_all: Passed\n");
	}
	if(success4==1){
      printf("Reduction operation shmem_double_prod_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_double_prod_to_all: Passed\n");
	}
	if(success5==1){
      printf("Reduction operation shmem_longdouble_prod_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_longdouble_prod_to_all: Passed\n");
	}
	if(success6==1){
      printf("Reduction operation shmem_longlong_prod_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_longlong_prod_to_all: Passed\n");
	}
	
  }
 
 /*Test OR: shmem_int_or_to_all, shmem_long_or_to_all, shmem_longlong_or_to_all, shmem_short_or_to_all,*/
  
  success0 = success1 = success2 = success6 = 0;
  for (i = 0; i < N; i += 1) {
    src0[i] = src1[i] = src2[i] = src6[i] = (me + 1)%4;
 }
 for (i = 0; i < N; i += 1) {
    dst0[i] = -9;
	dst1[i] = -9;
	dst2[i] = -9;
	dst6[i] = -9;
  }
 
  shmem_barrier_all();
  
  shmem_short_or_to_all(dst0, src0, N, 0, 0, npes, pWrk0, pSync);
  shmem_int_or_to_all(dst1, src1, N, 0, 0, npes, pWrk1, pSync1);
  shmem_long_or_to_all(dst2, src2, N, 0, 0, npes, pWrk2, pSync);
  shmem_longlong_or_to_all(dst6, src6, N, 0, 0, npes, pWrk6, pSync1);
  
  
  if(me==0){
    for (i = 0; i < N; i++) {
      if(dst0[i] != 3)
        success0 =1;
	  if(dst1[i] != 3)
        success1 =1;
	  if(dst2[i] != 3)
        success2 =1;
	 if(dst6[i] != 3)
        success6 =1;
    }
    if(success0==1){
      printf("Reduction operation shmem_short_or_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_short_or_to_all: Passed\n");
	}
	if(success1==1){
      printf("Reduction operation shmem_int_or_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_int_or_to_all: Passed\n");
	}
	if(success2==1){
      printf("Reduction operation shmem_long_or_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_long_or_to_all: Passed\n");
	}
	if(success6==1){
      printf("Reduction operation shmem_longlong_or_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_longlong_or_to_all: Passed\n");
	}
	
  }
 
 /*Test XOR: shmem_int_xor_to_all, shmem_long_xor_to_all, shmem_longlong_xor_to_all, shmem_short_xor_to_all*/
  
  success0 = success1 = success2 = success6 = 0;
  for (i = 0; i < N; i += 1) {
    src0[i] = src1[i] = src2[i] = src6[i] = me%2;
 }
 for (i = 0; i < N; i += 1) {
    dst0[i] = -9;
	dst1[i] = -9;
	dst2[i] = -9;
	dst6[i] = -9;
  }
  int expected_result = ((int)(npes/2) % 2);
  
 
  shmem_barrier_all();
  
  shmem_short_xor_to_all(dst0, src0, N, 0, 0, npes, pWrk0, pSync);
  shmem_int_xor_to_all(dst1, src1, N, 0, 0, npes, pWrk1, pSync1);
  shmem_long_xor_to_all(dst2, src2, N, 0, 0, npes, pWrk2, pSync);
  shmem_longlong_xor_to_all(dst6, src6, N, 0, 0, npes, pWrk6, pSync1);
  
  if(me==0){
    for (i = 0; i < N; i++) {
      if(dst0[i] != expected_result)
        success0 =1;
	  if(dst1[i] != expected_result)
        success1 =1;
	  if(dst2[i] != expected_result)
        success2 =1;
	 if(dst6[i] != expected_result)
        success6 =1;
    }
    if(success0==1){
      printf("Reduction operation shmem_short_xor_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_short_xor_to_all: Passed\n");
	}
	if(success1==1){
      printf("Reduction operation shmem_int_xor_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_int_xor_to_all: Passed\n");
	}
	if(success2==1){
      printf("Reduction operation shmem_long_xor_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_long_xor_to_all: Passed\n");
	}
	if(success6==1){
      printf("Reduction operation shmem_longlong_xor_to_all: Failed\n");
	}  
    else{
      printf("Reduction operation shmem_longlong_xor_to_all: Passed\n");
	}
	
  }

  return 0;
}
Пример #12
0
/*
 * Broadcast latency benchmark.
 *
 * For every element count `size` (doubling from 1 up to
 * max_msg_size / sizeof(uint32_t)) PE 0 broadcasts `size` 32-bit words with
 * shmem_broadcast32.  Each broadcast is timed with TIME(); the first `skip`
 * iterations are excluded from the accumulated time.  The per-PE mean is then
 * reduced to a minimum, a maximum and an average over all PEs and handed to
 * print_data().
 *
 * Returns 0 when process_args() reports a non-zero result, -1 when fewer than
 * two PEs are available, and EXIT_SUCCESS after a full run.
 */
int main(int argc, char *argv[])
{
    /* static: these are the source/target objects of the reductions below */
    static double latency = 0.0;
    static double min_time = 0.0, max_time = 0.0, avg_time = 0.0;
    int max_msg_size = 1048576;
    int full = 0;
    char *buffer = NULL;
    int rank, numprocs;
    int size, idx;

    /* Fill every synchronisation array with the sync value up front. */
    for (idx = 0; idx < _SHMEM_BCAST_SYNC_SIZE; ++idx) {
        pSyncBcast1[idx] = _SHMEM_SYNC_VALUE;
        pSyncBcast2[idx] = _SHMEM_SYNC_VALUE;
    }
    for (idx = 0; idx < _SHMEM_REDUCE_SYNC_SIZE; ++idx) {
        pSyncRed1[idx] = _SHMEM_SYNC_VALUE;
        pSyncRed2[idx] = _SHMEM_SYNC_VALUE;
    }

    start_pes(0);
    rank = _my_pe();
    numprocs = _num_pes();

    if (process_args(argc, argv, rank, &max_msg_size, &full) != 0) {
        return 0;
    }

    if (numprocs < 2) {
        if (0 == rank) {
            fprintf(stderr, "This test requires at least two processes\n");
        }
        return -1;
    }

    print_header(rank, full);

    buffer = shmalloc(max_msg_size * sizeof(char));
    if (buffer == NULL) {
        fprintf(stderr, "malloc failed.\n");
        exit(1);
    }
    memset(buffer, 1, max_msg_size);

    for (size = 1; size <= max_msg_size / sizeof(uint32_t); size *= 2) {
        const int large = size > LARGE_MESSAGE_SIZE;
        const int skip = large ? SKIP_LARGE : SKIP;
        int64_t elapsed = 0;
        int iter;

        /* Once large sizes are reached the global iteration count is
         * switched to iterations_large. */
        if (large) {
            iterations = iterations_large;
        }

        for (iter = 0; iter < iterations + skip; ++iter) {
            int64_t t_begin, t_end;

            /* Even and odd iterations use different pSync arrays. */
            t_begin = TIME();
            if (iter % 2 == 0) {
                shmem_broadcast32(buffer, buffer, size, 0, 0, 0, numprocs, pSyncBcast2);
            } else {
                shmem_broadcast32(buffer, buffer, size, 0, 0, 0, numprocs, pSyncBcast1);
            }
            t_end = TIME();

            /* Warm-up iterations are not counted. */
            if (iter >= skip) {
                elapsed += t_end - t_begin;
            }
            shmem_barrier_all();
        }
        shmem_barrier_all();

        latency = (double) elapsed / iterations;

        shmem_double_min_to_all(&min_time, &latency, 1, 0, 0, numprocs, pWrk1, pSyncRed1);
        shmem_double_max_to_all(&max_time, &latency, 1, 0, 0, numprocs, pWrk2, pSyncRed2);
        shmem_double_sum_to_all(&avg_time, &latency, 1, 0, 0, numprocs, pWrk1, pSyncRed1);
        avg_time /= numprocs;

        print_data(rank, full, size * sizeof(uint32_t), avg_time, min_time, max_time, iterations);
    }

    shfree(buffer);
    return EXIT_SUCCESS;
}
Пример #13
0
/*
 * shmem_int_put() bandwidth test.
 *
 * Every PE puts `elements` ints to PE (me + 1) % npes, `loops` times, and
 * accumulates the time spent inside shmem_int_put().  The per-PE times are
 * gathered on PE 0 (one shmem_double_put per PE) and also summed with
 * shmem_double_sum_to_all; PE 0 reports the resulting rate when Verbose is
 * set.  After the final barrier every PE checks that Target[i] == i + 1.
 *
 * Options (parsed with getopt):
 *   -v     increment Verbose
 *   -e N   element count per put (atoi_scaled), must be > 0
 *   -l N   loop count (atoi_scaled), must be > 0
 *   -s     increment Sync: barrier after every put
 *   -t     increment Track: PE 0 prints a progress marker every 200 loops
 *   -h     print usage on PE 0 and exit
 *
 * Returns 0 on success or after -h, 1 on a bad option; allocation failures
 * and data mismatches end the job through shmem_global_exit(1).
 */
int
main(int argc, char **argv)
{
    int loops=DFLT_LOOPS;
    char *pgm;
    int *Target;
    int *Source;
    int i, me, npes;
    int target_PE;
    long bytes;
    double start_time, *total_time;

    shmem_init();
    me = shmem_my_pe();
    npes = shmem_n_pes();

    /* Program name without its directory part, used in messages. */
    if ((pgm=strrchr(argv[0],'/')))
        pgm++;
    else
        pgm = argv[0];

    while ((i = getopt (argc, argv, "hve:l:st")) != EOF) {
        switch (i)
        {
            case 'v':
                Verbose++;
                break;
            case 'e':
                if ((elements = atoi_scaled(optarg)) <= 0) {
                    fprintf(stderr,"ERR: Bad elements count %d\n",elements);
                    shmem_finalize();
                    return 1;
                }
                break;
            case 'l':
                if ((loops = atoi_scaled(optarg)) <= 0) {
                    fprintf(stderr,"ERR: Bad loop count %d\n",loops);
                    shmem_finalize();
                    return 1;
                }
                break;
            case 's':
                Sync++;
                break;
            case 't':
                Track++;
                break;
            case 'h':
                if (me == 0)
                    usage(pgm);
                /* Finalize here too, like every other early exit below. */
                shmem_finalize();
                return 0;
            default:
                if (me == 0) {
                    fprintf(stderr,"%s: unknown switch '-%c'?\n",pgm,i);
                    usage(pgm);
                }
                shmem_finalize();
                return 1;
        }
    }

    for(i=0; i < SHMEM_REDUCE_SYNC_SIZE; i++)
        pSync[i] = SHMEM_SYNC_VALUE;

    target_PE = (me+1) % npes;

    /* One slot per PE; slot `me` on PE 0 receives this PE's time. */
    total_time = (double *) shmem_malloc( npes * sizeof(double) );
    if (!total_time) {
        /* Report the size that was actually requested (npes doubles). */
        fprintf(stderr,"ERR: bad total_time shmem_malloc(%zu)\n",
                (npes * sizeof(double)));
        shmem_global_exit(1);
    }
    for(i=0; i < npes; i++)
        total_time[i] = -1.0;

    Source = (int *) shmem_malloc( elements * sizeof(*Source) );
    if (!Source) {
        fprintf(stderr,"ERR: bad Source shmem_malloc(%zu)\n",
                (elements * sizeof(*Source)));
        shmem_free(total_time);
        shmem_global_exit(1);
    }

    Target = (int *) shmem_malloc( elements * sizeof(*Target) );
    if (!Target) {
        fprintf(stderr,"ERR: bad Target shmem_malloc(%zu)\n",
                (elements * sizeof(*Target)));
        shmem_free(Source);
        shmem_free(total_time);
        shmem_global_exit(1);
    }

    /* Target starts with a sentinel; Source holds the pattern that the
     * verification loop below expects to find in Target. */
    for (i = 0; i < elements; i++) {
        Target[i] = -90;
        Source[i] = i + 1;
    }

    bytes = loops * sizeof(int) * elements;

    if (Verbose && me==0) {
        fprintf(stderr,
                "%s: INFO - %d loops, put %d (int) elements to PE+1 Max put ??\n",
                pgm, loops, elements);
    }
    shmem_barrier_all();

    for(i=0; i < loops; i++) {

        start_time = shmemx_wtime();

        shmem_int_put(Target, Source, elements, target_PE);

        /* Only the put call itself is timed. */
        time_taken += (shmemx_wtime() - start_time);

        if (me==0) {
            if ( Track && i > 0 && ((i % 200) == 0))
                fprintf(stderr,".%d",i);
        }
        if (Sync)
            shmem_barrier_all();
    }

    // collect time per node.
    shmem_double_put( &total_time[me], &time_taken, 1, 0 );
    shmem_double_sum_to_all(&sum_time, &time_taken, 1, 0, 0, npes, pWrk, pSync);

    shmem_barrier_all();

    /* Verify the data written into this PE's Target. */
    for (i = 0; i < elements; i++) {
        if (Target[i] != i + 1) {
            printf("%d: Error Target[%d] = %d, expected %d\n",
                   me, i, Target[i], i + 1);
            shmem_global_exit(1);
        }
    }

    if ( Track && me == 0 ) fprintf(stderr,"\n");

    if(Verbose && me == 0) {
        double rate, comp_time;

        if (Verbose > 1)
            fprintf(stdout,"Individule PE times: (seconds)\n");
        for(i=0,comp_time=0.0; i < npes; i++) {
            comp_time += total_time[i];
            if (Verbose > 1)
                fprintf(stdout,"  PE[%d] %8.6f\n",i,total_time[i]);
        }

        sum_time /= (double)npes;
        comp_time /= (double)npes;
        /* NOTE(review): exact floating-point comparison; the two averages
         * are accumulated by different means and may differ by rounding --
         * confirm whether a tolerance is wanted here. */
        if (sum_time != comp_time)
            printf("%s: computed_time %7.5f != sum_to_all_time %7.5f)\n",
                   pgm, comp_time, sum_time );

        rate = ((double)bytes/(1024.0*1024.0)) / comp_time;
        printf("%s: shmem_int_put() %7.4f MB/sec (bytes %ld secs %7.4f)\n",
               pgm, rate, bytes, sum_time);
    }

    shmem_free(total_time);
    shmem_free(Target);
    shmem_free(Source);

    shmem_finalize();

    return 0;
}
Пример #14
0
/*
 * SHMEM variant of the STREAM benchmark driver.
 *
 * Prints the configuration banner, initialises the arrays a, b and c, has
 * rank 0 estimate the clock granularity with checktick(), and then runs
 * NTIMES repetitions of four kernels (c = a, b = scalar * c, c = a + b,
 * a = b + scalar * c).  Rank 0 prints the timing summary and calls
 * checkSTREAMresults().
 *
 * Work is handed out in blocks of `blocksize` elements: a PE processes a
 * block when its local block counter count_p equals the ticket next_p it
 * last obtained from shmem_int_fadd on gcounter at PE ROOT, and then obtains
 * a new ticket.  count_p is never reset, so it keeps counting across all
 * sweeps of the arrays.
 *
 * Always returns 0.
 */
int
main ()
{
  int quantum = -1, checktick ();
  int BytesPerWord;
  int k;
  ssize_t j, i;
  STREAM_TYPE scalar;
  // process local counters
  int count_p = 0, next_p = 0;
  gcounter = 0;

  /* --- SETUP --- determine precision and check timing --- */

  /* These banner lines are printed before start_pes and are not guarded by
     a rank check. */
  printf (HLINE);
  printf ("STREAM version $Revision: 5.10 $\n");
  printf (HLINE);
  BytesPerWord = sizeof (STREAM_TYPE);
  printf ("This system uses %d bytes per array element.\n", BytesPerWord);
  /* SHMEM initialize */
  start_pes (0);
  _world_size = _num_pes ();
  _world_rank = _my_pe ();
  /* wait for user to input runtime params */
  /* Fill the three pSync arrays with the sync value.  The loop-scoped
     `int j` shadows the outer `ssize_t j` for this loop only.
     NOTE(review): the bound is _SHMEM_BARRIER_SYNC_SIZE although pSync0 and
     pSync1 are passed to reduction routines below -- confirm the declared
     size of these arrays and whether the reduce sync size was intended. */
  for (int j = 0; j < _SHMEM_BARRIER_SYNC_SIZE; j++)
    {
      pSync0[j] = pSync1[j] = pSync2[j] = _SHMEM_SYNC_VALUE;
    }

  /* Configuration report, rank 0 only. */
  if (_world_rank == 0)
    {
      printf (HLINE);
#ifdef N
      printf ("*****  WARNING: ******\n");
      printf
	("      It appears that you set the preprocessor variable N when compiling this code.\n");
      printf
	("      This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n");
      printf ("      Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",
	      (unsigned long long) STREAM_ARRAY_SIZE);
      printf ("*****  WARNING: ******\n");
#endif

      printf ("Array size = %llu (elements), Offset = %d (elements)\n",
	      (unsigned long long) STREAM_ARRAY_SIZE, OFFSET);
      printf ("Memory per array = %.1f MiB (= %.1f GiB).\n",
	      BytesPerWord * ((double) STREAM_ARRAY_SIZE / 1024.0 / 1024.0),
	      BytesPerWord * ((double) STREAM_ARRAY_SIZE / 1024.0 / 1024.0 /
			      1024.0));
      printf ("Total memory required = %.1f MiB (= %.1f GiB).\n",
	      (3.0 * BytesPerWord) * ((double) STREAM_ARRAY_SIZE / 1024.0 /
				      1024.),
	      (3.0 * BytesPerWord) * ((double) STREAM_ARRAY_SIZE / 1024.0 /
				      1024. / 1024.));
      printf ("Each kernel will be executed %d times.\n", NTIMES);
      printf
	(" The *best* time for each kernel (excluding the first iteration)\n");
      printf (" will be used to compute the reported bandwidth.\n");
      printf ("Number of SHMEM PEs requested = %i\n", _world_size);
    }

  /* Block size used for work distribution; the array size must be an exact
     multiple of it. */
  int blocksize = 10000;
  assert (STREAM_ARRAY_SIZE % blocksize == 0);

  // do something really minor
  /* Get initial value for system clock. */
  /* Every PE initialises its full copy of a, b and c. */
  for (j = 0; j < STREAM_ARRAY_SIZE; j++)
    {
      a[j] = 1.0;
      b[j] = 2.0;
      c[j] = 0.0;
    }

  printf (HLINE);

  /* Only rank 0 measures the clock granularity; quantum stays -1 on the
     other PEs and is read again only inside a rank-0 branch. */
  if (_world_rank == 0)
    {
      if ((quantum = checktick ()) >= 1)
	printf ("Your clock granularity/precision appears to be "
		"%d microseconds.\n", quantum);
      else
	{
	  printf ("Your clock granularity appears to be "
		  "less than one microsecond.\n");
	  quantum = 1;
	}
    }

  // assign fixed iterations per PE

  // since we know default STREAM array size
  // we are hardcoding this, but if the value
  // changes, then this blocking factor must
  // also change
  // basically, each PE works on this block
  // size at a time

  /* Timed trial sweep (a = 2 * a) used for the "each test will take"
     estimate printed below. */
  time_start = mysecond ();
  /* Initialize */
  /* First ticket: the value returned by the fetch-and-add on gcounter. */
  next_p = shmem_int_fadd (&gcounter, 1, ROOT);
  for (j = 0; j < STREAM_ARRAY_SIZE; j += blocksize)
    {
      /* This PE owns the block whose running index matches its ticket. */
      if (next_p == count_p)
	{
	  for (i = j; i < (j + blocksize); i++)
	    {
	      a[i] = 2.0E0 * a[i];
	    }
	  next_p = shmem_int_fadd (&gcounter, 1, ROOT);
	}
      count_p++;
    }
  time_end = mysecond ();
  clock_time_PE = time_end - time_start;
  /* Sum of the per-PE sweep times over all PEs. */
  shmem_double_sum_to_all (&total_clock_time, &clock_time_PE, 1,
			   0, 0, _world_size, pWrk0, pSync0);

  if (_world_rank == 0)
    {
      printf ("Each test below will take on the order"
	      " of %d microseconds.\n", (int) (total_clock_time * 1.0E6));
      printf ("   (= %d clock ticks)\n",
	      (int) ((1.0E6 * total_clock_time) / quantum));
      printf ("Increase the size of the arrays if this shows that\n");
      printf ("you are not getting at least 20 clock ticks per test.\n");

      printf (HLINE);

      printf ("WARNING -- The above is only a rough guideline.\n");
      printf ("For best results, please be sure you know the\n");
      printf ("precision of your system timer.\n");
      printf (HLINE);
    }
  /*      --- MAIN LOOP --- repeat test cases NTIMES times --- */

  // reduction required, as each PE only fills a,b,c partially
  scalar = 3.0;

  for (k = 0; k < NTIMES; k++)
    {
      // this is required for correctness
      // for NTIMES > 1 which is typically
      // the case
      /* Untimed reset sweep: the owning PE rewrites its block to the
         starting values (a ends up 2.0), then all PEs take part in a max
         reduction over that block of a. */
      for (j = 0; j < STREAM_ARRAY_SIZE; j += blocksize)
	{
	  if (next_p == count_p)
	    {
	      for (i = j; i < (j + blocksize); i++)
		{
		  a[i] = 1.0;
		  b[i] = 2.0;
		  c[i] = 0.0;
		  a[i] = 2.0E0 * a[i];

		}
	      next_p = shmem_int_fadd (&gcounter, 1, ROOT);
	    }
	  count_p++;
	  shmem_double_max_to_all (a + j, a + j, blocksize, 0,
				   0, _world_size, pWrk1, pSync1);
	}
      shmem_barrier_all ();

      /* Kernel 0: c = a.  The timed region includes the per-block
         reductions and the closing barrier. */
      time_start = mysecond ();
      for (j = 0; j < STREAM_ARRAY_SIZE; j += blocksize)
	{
	  if (next_p == count_p)
	    {
	      for (i = j; i < (j + blocksize); i++)
		{
		  c[i] = a[i];
		}
	      next_p = shmem_int_fadd (&gcounter, 1, ROOT);
	    }
	  count_p++;
	  shmem_double_max_to_all (c + j, c + j, blocksize, 0,
				   0, _world_size, pWrk1, pSync1);
	}
      shmem_barrier_all ();
      time_end = mysecond () - time_start;
      /* NOTE(review): this kernel's time is combined with a max reduction,
         while the three kernels below use a sum reduction -- confirm which
         one is intended. */
      shmem_double_max_to_all (&times[0][k], &time_end, 1,
			       0, 0, _world_size, pWrk0, pSync0);

      /* Kernel 1: b = scalar * c. */
      time_start = mysecond ();
      for (j = 0; j < STREAM_ARRAY_SIZE; j += blocksize)
	{
	  if (next_p == count_p)
	    {
	      for (i = j; i < (j + blocksize); i++)
		{
		  b[i] = scalar * c[i];
		}
	      next_p = shmem_int_fadd (&gcounter, 1, ROOT);
	    }
	  count_p++;
	  shmem_double_max_to_all (b + j, b + j, blocksize, 0,
				   0, _world_size, pWrk1, pSync1);
	}
      shmem_barrier_all ();
      time_end = mysecond () - time_start;
      shmem_double_sum_to_all (&times[1][k], &time_end, 1,
			       0, 0, _world_size, pWrk0, pSync0);

      /* Kernel 2: c = a + b. */
      time_start = mysecond ();
      for (j = 0; j < STREAM_ARRAY_SIZE; j += blocksize)
	{
	  if (next_p == count_p)
	    {
	      for (i = j; i < (j + blocksize); i++)
		{
		  c[i] = a[i] + b[i];
		}
	      next_p = shmem_int_fadd (&gcounter, 1, ROOT);
	    }
	  count_p++;
	  shmem_double_max_to_all (c + j, c + j, blocksize, 0,
				   0, _world_size, pWrk1, pSync1);
	}
      shmem_barrier_all ();
      time_end = mysecond () - time_start;
      shmem_double_sum_to_all (&times[2][k], &time_end, 1,
			       0, 0, _world_size, pWrk0, pSync0);

      /* Kernel 3: a = b + scalar * c. */
      time_start = mysecond ();
      for (j = 0; j < STREAM_ARRAY_SIZE; j += blocksize)
	{
	  if (next_p == count_p)
	    {
	      for (i = j; i < (j + blocksize); i++)
		{
		  a[i] = b[i] + scalar * c[i];
		}
	      next_p = shmem_int_fadd (&gcounter, 1, ROOT);
	    }
	  count_p++;
	  shmem_double_max_to_all (a + j, a + j, blocksize, 0,
				   0, _world_size, pWrk1, pSync1);
	}
      shmem_barrier_all ();
      time_end = mysecond () - time_start;
      shmem_double_sum_to_all (&times[3][k], &time_end, 1,
			       0, 0, _world_size, pWrk0, pSync0);
    }

  shmem_barrier_all ();

  /*      --- SUMMARY --- */

  /* Accumulate sum, minimum and maximum of the recorded times for each of
     the four kernels; this runs on every PE, only rank 0 prints. */
  for (k = 1; k < NTIMES; k++)	/* note -- skip first iteration */
    {
      for (j = 0; j < 4; j++)
	{
	  avgtime[j] = avgtime[j] + times[j][k];
	  mintime[j] = MIN (mintime[j], times[j][k]);
	  maxtime[j] = MAX (maxtime[j], times[j][k]);
	}
    }

  if (_world_rank == 0)
    {
      printf
	("Function    Best Rate MB/s  Avg time     Min time     Max time\n");
      for (j = 0; j < 4; j++)
	{
	  /* Average over the NTIMES - 1 iterations that were accumulated. */
	  avgtime[j] = avgtime[j] / (double) (NTIMES - 1);

	  printf ("%s%12.1f  %11.6f  %11.6f  %11.6f\n", label[j],
		  1.0E-06 * bytes[j] / mintime[j],
		  avgtime[j], mintime[j], maxtime[j]);
	}
      printf (HLINE);
    }
  /* --- Check Results --- */
  if (_world_rank == 0)
    {
      checkSTREAMresults ();
      printf (HLINE);
    }
  return 0;
}
Пример #15
0
int main(int argc, char ** argv)
{
  long Block_order;        /* number of columns owned by rank       */
  int Block_size;          /* size of a single block                */
  int Colblock_size;       /* size of column block                  */
  int Tile_order=32;       /* default Tile order                    */
  int tiling;              /* boolean: true if tiling is used       */
  int Num_procs;           /* number of ranks                       */
  int order;               /* order of overall matrix               */
  int bufferCount;         /* number of input buffers               */
  int targetBuffer;        /* buffer with which to communicate      */
  int send_to, recv_from;  /* ranks with which to communicate       */
  long bytes;              /* combined size of matrices             */
  int my_ID;               /* rank                                  */
  int root=0;              /* rank of root                          */
  int iterations;          /* number of times to do the transpose   */
  long i, j, it, jt, istart;/* dummies                              */
  int iter;                /* index of iteration                    */
  int phase;               /* phase inside staged communication     */
  int colstart;            /* starting column for owning rank       */
  int error;               /* error flag                            */
  double *A_p;             /* original matrix column block          */
  double *B_p;             /* transposed matrix column block        */
  double **Work_in_p;      /* workspace for the transpose function  */
  double *Work_out_p;      /* workspace for the transpose function  */
  double epsilon = 1.e-8;  /* error tolerance                       */
  double avgtime;          /* timing parameters                     */
  long   *pSync_bcast;     /* work space for collectives            */
  long   *pSync_reduce;    /* work space for collectives            */
  double *pWrk;            /* work space for SHMEM collectives      */
  double *local_trans_time, 
         *trans_time;      /* timing parameters                     */
  double *abserr, 
         *abserr_tot;      /* local and aggregate error             */
  int    *send_flag,
         *recv_flag;       /* synchronization flags                 */
  int    *arguments;       /* command line arguments                */

/*********************************************************************
** Initialize the SHMEM environment
*********************************************************************/

  prk_shmem_init();
  my_ID=prk_shmem_my_pe();
  Num_procs=prk_shmem_n_pes();

  if (my_ID == root) {
    printf("Parallel Research Kernels version %s\n", PRKVERSION);
    printf("SHMEM matrix transpose: B = A^T\n");
  }

// initialize sync variables for error checks
  pSync_bcast      = (long *)   prk_shmem_align(prk_get_alignment(),PRK_SHMEM_BCAST_SYNC_SIZE*sizeof(long));
  pSync_reduce     = (long *)   prk_shmem_align(prk_get_alignment(),PRK_SHMEM_REDUCE_SYNC_SIZE*sizeof(long));
  pWrk             = (double *) prk_shmem_align(prk_get_alignment(),sizeof(double) * PRK_SHMEM_REDUCE_MIN_WRKDATA_SIZE);
  local_trans_time = (double *) prk_shmem_align(prk_get_alignment(),sizeof(double));
  trans_time       = (double *) prk_shmem_align(prk_get_alignment(),sizeof(double));
  arguments        = (int *)    prk_shmem_align(prk_get_alignment(),4*sizeof(int));
  abserr           = (double *) prk_shmem_align(prk_get_alignment(),2*sizeof(double));
  abserr_tot       = abserr + 1;
  if (!pSync_bcast || !pSync_reduce || !pWrk || !local_trans_time ||
      !trans_time || !arguments || !abserr) {
    printf("Rank %d could not allocate scalar work space on symm heap\n", my_ID);
    error = 1;
    goto ENDOFTESTS;
  }

  for(i=0;i<PRK_SHMEM_BCAST_SYNC_SIZE;i++)
    pSync_bcast[i]=PRK_SHMEM_SYNC_VALUE;

  for(i=0;i<PRK_SHMEM_REDUCE_SYNC_SIZE;i++)
    pSync_reduce[i]=PRK_SHMEM_SYNC_VALUE;

/*********************************************************************
** process, test and broadcast input parameters
*********************************************************************/
  error = 0;
  if (my_ID == root) {
    if (argc != 4 && argc != 5){
      printf("Usage: %s <# iterations> <matrix order> <# buffers> [Tile size]\n",
                                                               *argv);
      error = 1; goto ENDOFTESTS;
    }

    iterations  = atoi(*++argv);
    arguments[0]=iterations;
    if(iterations < 1){
      printf("ERROR: iterations must be >= 1 : %d \n",iterations);
      error = 1; goto ENDOFTESTS;
    }

    order = atoi(*++argv);
    arguments[1]=order;
    if (order < Num_procs) {
      printf("ERROR: matrix order %d should at least # procs %d\n", 
             order, Num_procs);
      error = 1; goto ENDOFTESTS;
    }
    if (order%Num_procs) {
      printf("ERROR: matrix order %d should be divisible by # procs %d\n",
             order, Num_procs);
      error = 1; goto ENDOFTESTS;
    }

    bufferCount = atoi(*++argv);
    arguments[2]=bufferCount;
    if (Num_procs > 1) {
      if ((bufferCount < 1) || (bufferCount >= Num_procs)) {
        printf("ERROR: bufferCount must be >= 1 and < # procs : %d\n", bufferCount);
        error = 1; goto ENDOFTESTS;
      }
    }

    if (argc == 5) Tile_order = atoi(*++argv);
    arguments[3]=Tile_order;

    ENDOFTESTS:;
  }
  bail_out(error);

  if (my_ID == root) {
    printf("Number of ranks      = %d\n", Num_procs);
    printf("Matrix order         = %d\n", order);
    printf("Number of iterations = %d\n", iterations);
    printf("Number of buffers    = %d\n", bufferCount);
    if ((Tile_order > 0) && (Tile_order < order))
          printf("Tile size            = %d\n", Tile_order);
    else  printf("Untiled\n");
  }
  
  shmem_barrier_all();

  /*  Broadcast input data to all ranks */
  shmem_broadcast32(&arguments[0], &arguments[0], 4, root, 0, 0, Num_procs, pSync_bcast);

  iterations=arguments[0];
  order=arguments[1];
  bufferCount=arguments[2];
  Tile_order=arguments[3];

  shmem_barrier_all();
  prk_shmem_free(arguments);

  /* a non-positive tile size means no tiling of the local transpose */
  tiling = (Tile_order > 0) && (Tile_order < order);
  bytes = 2 * sizeof(double) * order * order;

/*********************************************************************
** The matrix is broken up into column blocks that are mapped one to a 
** rank.  Each column block is made up of Num_procs smaller square 
** blocks of order block_order.
*********************************************************************/

  Block_order    = order/Num_procs;
  colstart       = Block_order * my_ID;
  Colblock_size  = order * Block_order;
  Block_size     = Block_order * Block_order;

/*********************************************************************
** Create the column block of the test matrix, the row block of the 
** transposed matrix, and workspace (workspace only if #procs>1)
*********************************************************************/
  A_p = (double *)prk_malloc(Colblock_size*sizeof(double));
  if (A_p == NULL){
    printf(" Error allocating space for original matrix on node %d\n",my_ID);
    error = 1;
  }
  bail_out(error);

  B_p = (double *)prk_malloc(Colblock_size*sizeof(double));
  if (B_p == NULL){
    printf(" Error allocating space for transpose matrix on node %d\n",my_ID);
    error = 1;
  }
  bail_out(error);

  if (Num_procs>1) {
    Work_in_p   = (double**)prk_malloc(bufferCount*sizeof(double));
    Work_out_p = (double *) prk_malloc(Block_size*sizeof(double));
    recv_flag  = (int*)     prk_shmem_align(prk_get_alignment(),bufferCount*sizeof(int));
    if ((Work_in_p == NULL)||(Work_out_p==NULL) || (recv_flag == NULL)){
      printf(" Error allocating space for work or flags on node %d\n",my_ID);
      error = 1;
    }

    if (bufferCount < (Num_procs - 1)) {
      send_flag = (int*) prk_shmem_align(prk_get_alignment(), (Num_procs-1) * sizeof(int));

      if (send_flag == NULL) {
	printf("Error allocating space for flags on node %d\n", my_ID);
	error = 1;
      }
    }

    bail_out(error);

    for(i=0;i<bufferCount;i++) {
      Work_in_p[i]=(double *) prk_shmem_align(prk_get_alignment(),Block_size*sizeof(double));
      if (Work_in_p[i] == NULL) {
        printf(" Error allocating space for work on node %d\n",my_ID);
        error = 1;
      }
      bail_out(error);
    }

    if (bufferCount < (Num_procs - 1)) {
      for(i=0;i<(Num_procs-1);i++)
        send_flag[i]=0;
    }

    for(i=0;i<bufferCount;i++)
      recv_flag[i]=0;
  }
  
  /* Fill the original column matrices                                              */
  istart = 0;  
  for (j=0;j<Block_order;j++) 
    for (i=0;i<order; i++)  {
      A(i,j) = (double) (order*(j+colstart) + i);
      B(i,j) = 0.0;
  }

  shmem_barrier_all();

  if (bufferCount < (Num_procs - 1)) {
    if (Num_procs > 1) {
      for ( i = 0; i < bufferCount; i++) {
        recv_from = (my_ID + i + 1)%Num_procs;
        shmem_int_inc(&send_flag[i], recv_from);
      }
    }
  }

  shmem_barrier_all();

  for (iter = 0; iter<=iterations; iter++){

    /* start timer after a warmup iteration                                        */
    if (iter == 1) { 
      shmem_barrier_all();
      local_trans_time[0] = wtime();
    }

    /* do the local transpose                                                     */
    istart = colstart; 
    if (!tiling) {
      for (i=0; i<Block_order; i++) 
        for (j=0; j<Block_order; j++) {
          B(j,i) += A(i,j);
          A(i,j) += 1.0;
	}
    }
    else {
      for (i=0; i<Block_order; i+=Tile_order) 
        for (j=0; j<Block_order; j+=Tile_order) 
          for (it=i; it<MIN(Block_order,i+Tile_order); it++)
            for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) {
              B(jt,it) += A(it,jt); 
              A(it,jt) += 1.0;
            }
    }

    for (phase=1; phase<Num_procs; phase++){
      recv_from = (my_ID + phase            )%Num_procs;
      send_to   = (my_ID - phase + Num_procs)%Num_procs;

      targetBuffer = (iter * (Num_procs - 1) + (phase - 1)) % bufferCount;

      istart = send_to*Block_order; 
      if (!tiling) {
        for (i=0; i<Block_order; i++) 
          for (j=0; j<Block_order; j++){
	    Work_out(j,i) = A(i,j);
            A(i,j) += 1.0;
	  }
      }
      else {
        for (i=0; i<Block_order; i+=Tile_order) 
          for (j=0; j<Block_order; j+=Tile_order) 
            for (it=i; it<MIN(Block_order,i+Tile_order); it++)
              for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) {
                Work_out(jt,it) = A(it,jt); 
                A(it,jt) += 1.0;
	      }
      }

      if (bufferCount < (Num_procs - 1))
        shmem_int_wait_until(&send_flag[phase-1], SHMEM_CMP_EQ, iter+1);

      shmem_double_put(&Work_in_p[targetBuffer][0], &Work_out_p[0], Block_size, send_to);
      shmem_fence();
      shmem_int_inc(&recv_flag[targetBuffer], send_to);

      i = (iter * (Num_procs - 1) + phase) / bufferCount;

      if ((iter * (Num_procs - 1) + phase) % bufferCount)
	i++;

      shmem_int_wait_until(&recv_flag[targetBuffer], SHMEM_CMP_EQ, i);

      istart = recv_from*Block_order; 
      /* scatter received block to transposed matrix; no need to tile */
      for (j=0; j<Block_order; j++)
        for (i=0; i<Block_order; i++) 
          B(i,j) += Work_in(targetBuffer, i,j);

      if (bufferCount < (Num_procs - 1)) {
        if ((phase + bufferCount) < Num_procs)
	  recv_from = (my_ID + phase + bufferCount) % Num_procs;
        else
	  recv_from = (my_ID + phase + bufferCount + 1 - Num_procs) % Num_procs;

        shmem_int_inc(&send_flag[(phase+bufferCount-1)%(Num_procs-1)], recv_from);
      }
    }  /* end of phase loop  */
  } /* end of iterations */

  local_trans_time[0] = wtime() - local_trans_time[0];

  shmem_barrier_all();
  shmem_double_max_to_all(trans_time, local_trans_time, 1, 0, 0, Num_procs, pWrk, pSync_reduce);

  abserr[0] = 0.0;
  istart = 0;
  double addit = ((double)(iterations+1) * (double) (iterations))/2.0;
  for (j=0;j<Block_order;j++) for (i=0;i<order; i++) {
      abserr[0] += ABS(B(i,j) - (double)((order*i + j+colstart)*(iterations+1)+addit));
  }

  shmem_barrier_all();
  shmem_double_sum_to_all(abserr_tot, abserr, 1, 0, 0, Num_procs, pWrk, pSync_reduce);

  if (my_ID == root) {
    if (abserr_tot[0] <= epsilon) {
      printf("Solution validates\n");
      avgtime = trans_time[0]/(double)iterations;
      printf("Rate (MB/s): %lf Avg time (s): %lf\n",1.0E-06*bytes/avgtime, avgtime);
#ifdef VERBOSE
      printf("Summed errors: %f \n", abserr[0]);
#endif
    }
    else {
      printf("ERROR: Aggregate squared error %e exceeds threshold %e\n", abserr[0], epsilon);
      error = 1;
    }
  }

  bail_out(error);

  if (Num_procs>1) 
  {
    if (bufferCount < (Num_procs - 1))
      prk_shmem_free(send_flag);

    prk_shmem_free(recv_flag);
    prk_free(Work_out_p);

    for(i=0;i<bufferCount;i++)
      prk_shmem_free(Work_in_p[i]);

    prk_free(Work_in_p);
  }

  prk_shmem_free(pSync_bcast);
  prk_shmem_free(pSync_reduce);
  prk_shmem_free(pWrk);

  prk_shmem_finalize();
  exit(EXIT_SUCCESS);

}  /* end of main */
Пример #16
0
/*
 * Estimate an integral over [0,1] with the midpoint rule, spreading the
 * rectangles round-robin over all PEs and combining the partial sums with a
 * SHMEM reduction.  The result is reported as an approximation of pi, so f
 * is presumably 4/(1+x^2) -- it is defined elsewhere in this file.
 *
 * argv[1] (optional): number of rectangles, a positive integer.  When it is
 *                     absent or invalid the default of 10000 is used.
 *
 * Relies on the file-level objects n, pSync, pWrk, mypi and pi.
 * Returns 0.
 */
int
main (int argc, char *argv[])
{
  int myid, numprocs, i;
  long long rect;		/* wide counter: rect += numprocs must not overflow */
  double h, sum, x;
  struct timeval startwtime, endwtime;

  start_pes (0);
  numprocs = _num_pes ();
  myid = _my_pe ();

  if (myid == 0)
    {
      n = 10000;		/* default # of rectangles */

      if (argc > 1)
	{
	  /* # rectangles on command line.  Reject trailing junk and
	     non-positive values (n == 0 would make h infinite below);
	     the upper bound keeps the value inside the 32 bits that
	     shmem_broadcast32 transfers. */
	  char *end;
	  long requested = strtol (argv[1], &end, 10);

	  if (end != argv[1] && *end == '\0'
	      && requested >= 1 && requested <= 0x7fffffffL)
	    n = (int) requested;
	  else
	    fprintf (stderr,
		     "invalid # of rectangles \"%s\", using default %d\n",
		     argv[1], (int) n);
	}

      /* only PE 0 takes timestamps; it is also the only one to report */
      gettimeofday (&startwtime, NULL);
    }

  /* initialize sync array */
  /* NOTE(review): only _SHMEM_BCAST_SYNC_SIZE entries are initialised here,
     yet pSync is reused for the reduction below -- confirm its declaration
     covers the reduction's pSync size requirement as well. */
  for (i = 0; i < _SHMEM_BCAST_SYNC_SIZE; i += 1)
    pSync[i] = _SHMEM_SYNC_VALUE;
  shmem_barrier_all ();

  /* send "n" out to everyone */
  shmem_broadcast32 (&n, &n, 1, 0, 0, 0, numprocs, pSync);

  /* do partial computation: this PE handles rectangles myid+1,
     myid+1+numprocs, ... up to n, evaluating f at each midpoint */
  h = 1.0 / (double) n;
  sum = 0.0;
  /* A slightly better approach starts from large i and works back */
  for (rect = (long long) myid + 1; rect <= n; rect += numprocs)
    {
      x = h * ((double) rect - 0.5);
      sum += f (x);
    }
  mypi = h * sum;

  /* wait for everyone to finish */
  shmem_barrier_all ();

  /* add up partial pi computations into PI */
  shmem_double_sum_to_all (&pi, &mypi, 1, 0, 0, numprocs, pWrk, pSync);

  /* "master" PE summarizes */
  if (myid == 0)
    {
      double elapsed;
      gettimeofday (&endwtime, NULL);
      elapsed = (endwtime.tv_sec - startwtime.tv_sec) * 1000.0;	/* sec to ms */
      elapsed += (endwtime.tv_usec - startwtime.tv_usec) / 1000.0;	/* us to ms */
      printf ("pi is approximately %.16f, Error is %.16f\n",
	      pi, fabs (pi - PI25DT));
      printf ("run time = %f ms\n", elapsed);
      fflush (stdout);
    }

  return 0;
}