Ejemplo n.º 1
0
int
wb_tree_probe(wb_tree *tree, void *key, void **dat)
{
	int rv = 0;
	wb_node *node, *parent = NULL;
	float wbal;

	ASSERT(tree != NULL);

	node = tree->root;
	while (node) {
		rv = tree->key_cmp(key, node->key);
		if (rv < 0)
			parent = node, node = node->llink;
		else if (rv > 0)
			parent = node, node = node->rlink;
		else {
			*dat = node->dat;
			return 0;
		}
	}

	if ((node = node_new(key, *dat)) == NULL)
		return -1;
	if ((node->parent = parent) == NULL) {
		ASSERT(tree->count == 0);
		tree->root = node;
		tree->count = 1;
		return 0;
	}
	if (rv < 0)
		parent->llink = node;
	else
		parent->rlink = node;

	while ((node = parent) != NULL) {
		parent = node->parent;
		node->weight++;
		wbal = WEIGHT(node->llink) / (float)node->weight;
		if (wbal < ALPHA_0) {
			wbal = WEIGHT(node->rlink->llink) / (float)node->rlink->weight;
			if (wbal < ALPHA_3) {
				rot_left(tree, node);
			} else {
				rot_right(tree, node->rlink);
				rot_left(tree, node);
			}
		} else if (wbal > ALPHA_1) {
			wbal = WEIGHT(node->llink->llink) / (float)node->llink->weight;
			if (wbal > ALPHA_2) {
				rot_right(tree, node);
			} else {
				rot_left(tree, node->llink);
				rot_right(tree, node);
			}
		}
	}
	tree->count++;
	return 1;
}
Ejemplo n.º 2
0
Archivo: ft.c Proyecto: 8l/csolve
Vertices *
MST(Vertices * graph)
{
  HeapP * heap;
  Vertices * vertex;
  Edges * edge;
  ;

  InitFHeap();

  /*
   * key(s) = 0;
   * key(v) = infty for v != s;
   * init heap;
   * make a heap;
   * put s in heap;
   */
  vertex = graph;
  KEY(vertex) = 0;
  heap = MakeHeap();
  (void)Insert(&heap, (Item *)vertex);

  vertex = NEXT_VERTEX(vertex);
  while(vertex != graph)
  {
    KEY(vertex) = PLUS_INFINITY;
    vertex = NEXT_VERTEX(vertex);
  }
  while(vertex != graph);

  vertex = FindMin(heap);
  while(vertex != NULL_VERTEX)
  {
    heap = DeleteMin(heap);
    KEY(vertex) = MINUS_INFINITY;
    edge = EDGES(vertex);
    while(edge != NULL_EDGE)
    {
      if(WEIGHT(edge) < KEY(VERTEX(edge)))
      {
        KEY(VERTEX(edge)) = WEIGHT(edge);
        CHOSEN_EDGE(VERTEX(edge)) = edge;
        (void)Insert(&heap, VERTEX(edge));
      }
      edge = NEXT_EDGE(edge);
    }
    vertex = FindMin(heap);
  }
  ;
  return(graph);
}
Ejemplo n.º 3
0
Archivo: graph.c Proyecto: 8l/csolve
Vertices *
GenTree(int nVertex)
{
  int       i;
  int       weight;
  Vertices * vertex;
  Vertices * graph;
  Edges * edge;

  graph = NewVertex();
  NEXT_VERTEX(graph) = graph;

  for(i = 1; i < nVertex; i++)
  {
    vertex = NewVertex();
    edge = NewEdge();

    /*
     * The newly created vertex has one edge ...
     */
    EDGES(vertex) = edge;

    /*
     * ... which is connected to the graph so far generated.  The connection
     * point in the graph is picked at random.
     */
    VERTEX(edge) = PickVertex(graph, random() % i);
    weight = GET_WEIGHT;
    WEIGHT(edge) = weight;
    SOURCE(edge) = vertex;

    /*
     * Link the new vertex into the graph.
     */
    NEXT_VERTEX(vertex) = NEXT_VERTEX(graph);
    NEXT_VERTEX(graph) = vertex;

    /*
     * Add an edge to the vertex randomly picked as the connection point.
     */
    edge = NewEdge();
    WEIGHT(edge) = weight;
    SOURCE(edge) = VERTEX(EDGES(vertex));
    VERTEX(edge) = vertex;
    NEXT_EDGE(edge) = EDGES(VERTEX(EDGES(vertex)));
    EDGES(VERTEX(EDGES(vertex))) = edge;
   }

  return(graph);
}
Ejemplo n.º 4
0
    void compute() {
      double * RESTRICT in  = this->in;
      double * RESTRICT out = this->out;
      int ii, jj;

      for (int j=MAX(jstart,RADIUS); j<=MIN(n-1-RADIUS,jend); j++) {
        for (int i=MAX(istart,RADIUS); i<=MIN(n-1-RADIUS,iend); i++) {
          #if LOOPGEN
            #include "loop_body_star.incl"
          #else
            for (jj=-RADIUS; jj<=RADIUS; jj++) OUT(i,j) += WEIGHT(0,jj)*IN(i,j+jj);
            for (ii=-RADIUS; ii<0; ii++)       OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j);
            for (ii=1; ii<=RADIUS; ii++)       OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j);
          #endif
        }
      }
    }
Ejemplo n.º 5
0
float TThresholdCA::operator()(PClassifier classifier, PExampleGenerator data, const int &weightID, float &optCA, const int &targetValue, TFloatFloatList *CAs)
{
  if (!data->domain->classVar)
    raiseError("classless domain");
  if (data->domain->classVar != classifier->classVar)
    raiseError("classifier's class variables mismatches the given examples'");
  TEnumVariable *classVar = data->domain->classVar.AS(TEnumVariable);
  if (!classVar)
    raiseError("discrete class expected");

  int wtarget;
  if (targetValue >= 0)
    wtarget = targetValue;
  else if (classVar->baseValue >= 0)
    wtarget = classVar->baseValue;
  else if (classVar->values->size() == 2)
    wtarget = 1;
  else
    raiseError("cannot determine target class: none is given, class is not binary and its 'baseValue' is not set");

  typedef map<float, float> tmfpff;
  tmfpff dists;
  float N = 0.0, corr = 0.0;
  PEITERATE(ei, data) 
    if (!(*ei).getClass().isSpecial()) {
      float wei = WEIGHT(*ei);
      N += wei;
      if ((*ei).getClass().intV == wtarget) {
        corr += wei;
        wei = -wei;
      }

      const float prob = classifier->classDistribution(*ei)->atint(wtarget);
      pair<tmfpff::iterator, bool> elm = dists.insert(make_pair(prob, wei));
      if (!elm.second)
        (*elm.first).second += wei;
    }

  optCA = 0;

  if (dists.size() < 2)
    return 0.5;
    
  float optthresh;
  for(tmfpff::const_iterator ni(dists.begin()), ie(dists.end()), ii(ni++); ni != ie; ii = ni++) {
    corr += (*ii).second;
    if ((corr > optCA) || ((corr == optCA) && ((*ii).first < 0.5))) {
      optCA = corr;
      optthresh = ((*ii).first + (*ni).first) / 2.0;
    }
    if (CAs)
      CAs->push_back(make_pair(((*ii).first + (*ni).first) / 2.0, corr/N));
  }

  optCA /= N;
  return optthresh;
} 
Ejemplo n.º 6
0
    void compute() {
      double * RESTRICT in = this->in;
      double * RESTRICT out = this->out;

      for (int j=MAX(jstart,RADIUS); j<=MIN(n-1-RADIUS,jend); j++) {
        for (int i=MAX(istart,RADIUS); i<=MIN(n-1-RADIUS,iend); i++) {

          for (int jj=-RADIUS; jj<=RADIUS; jj++) {
            OUT(i-istart,j-jstart) += WEIGHT(0,jj)*IN(i-istart,j-jstart+jj);
	  }
          for (int ii=-RADIUS; ii<0; ii++) {
            OUT(i-istart,j-jstart) += WEIGHT(ii,0)*IN(i-istart+ii,j-jstart);
	  }
          for (int ii=1; ii<=RADIUS; ii++) {
            OUT(i-istart,j-jstart) += WEIGHT(ii,0)*IN(i-istart+ii,j-jstart);
	  }
        }
      }
    }
Ejemplo n.º 7
0
Archivo: graph.c Proyecto: 8l/csolve
void
PrintNeighbors(Vertices * vertex)
{
  Edges * edge;

  edge = EDGES(vertex);
  while(edge != NULL)
  {
    printf(" %d(%d)[%d]", ID(VERTEX(edge)), WEIGHT(edge), ID(SOURCE(edge)));
    edge = NEXT_EDGE(edge);
  }
}
Ejemplo n.º 8
0
Archivo: graph.c Proyecto: 8l/csolve
void
Connect(Vertices * vertex1, Vertices * vertex2)
{
  int    weight;
  Edges * edge;

  weight = GET_WEIGHT;

  edge = NewEdge();
  WEIGHT(edge) = weight;
  SOURCE(edge) = vertex1;
  VERTEX(edge) = vertex2;
  NEXT_EDGE(edge) = EDGES(vertex1);
  EDGES(vertex1) = edge;
  
  edge = NewEdge();
  WEIGHT(edge) = weight;
  SOURCE(edge) = vertex2;
  VERTEX(edge) = vertex1;
  NEXT_EDGE(edge) = EDGES(vertex2);
  EDGES(vertex2) = edge;
}
Ejemplo n.º 9
0
Archivo: graph.c Proyecto: 8l/csolve
Edges *
NewEdge()
{
  Edges * edge;

  edge = (Edges *)malloc(sizeof(Edges));

  if(edge == NULL)
  {
    fprintf(stderr, "Could not malloc\n");
    exit(1);
  }

  WEIGHT(edge) = 0;
  VERTEX(edge) = NULL;
  NEXT_EDGE(edge) = NULL;
  
  return(edge);
}
Ejemplo n.º 10
0
sm_row *
sm_minimum_cover(sm_matrix *A, int *weight, int heuristic, int debug_level)
  /* set to 1 for a heuristic covering */
  /* how deep in the recursion to provide info */
{
  stats_t stats;
  solution_t *best, *select;
  sm_row *prow, *sol;
  sm_col *pcol;
  sm_matrix *dup_A;
  int nelem, bound;
  double sparsity;

  /* Avoid sillyness */
  if (A->nrows <= 0) {
    return sm_row_alloc();    /* easy to cover */
  }

  /* Initialize debugging structure */
  stats.start_time = util_cpu_time();
  stats.debug = debug_level > 0;
  stats.max_print_depth = debug_level;
  stats.max_depth = -1;
  stats.nodes = 0;
  stats.component = stats.comp_count = 0;
  stats.gimpel = stats.gimpel_count = 0;
  stats.no_branching = heuristic != 0;
  stats.lower_bound = -1;

  /* Check the matrix sparsity */
  nelem = 0;
  sm_foreach_row(A, prow) {
    nelem += prow->length;
  }
  sparsity = (double) nelem / (double) (A->nrows * A->ncols);

  /* Determine an upper bound on the solution */
  bound = 1;
  sm_foreach_col(A, pcol) {
    bound += WEIGHT(weight, pcol->col_num);
  }
Ejemplo n.º 11
0
void survivals(TTimes &times, float &sow, PExampleGenerator gen, const int &outcomeIndex, TValue &failValue, const int &timeIndex, const int &weightID)
{
  const bool outcomemeta = outcomeIndex<0;
  const bool timemeta = timeIndex<0;

  if (!timemeta && (gen->domain->getVar(timeIndex)->varType != TValue::FLOATVAR))
    raiseError("continuous attribute expected for censoring time");
  if (!outcomemeta && (gen->domain->getVar(outcomeIndex)->varType != TValue::INTVAR))
    raiseError("discrete attribute expected for outcome");
  if (failValue.isSpecial() || (failValue.varType!=TValue::INTVAR))
    raiseError("discrete value needs to be specified for the 'failure'");

  const int &failIndex = failValue.intV;

  sow = 0.0;
  PEITERATE(ei, gen) {
    float wei = WEIGHT(*ei);

    TValue &timeval = (*ei)[timeIndex];
    if (timeval.isSpecial())
      continue;
    if (timemeta && timeval.varType != TValue::FLOATVAR)
      raiseError("continuous attribute expected for censoring time");

    TValue &outcomeval = (*ei)[outcomeIndex];
    if (outcomeval.isSpecial())
      continue;
    if (outcomemeta && outcomeval.varType != TValue::INTVAR)
      raiseError("discrete attribute expected for outcome");

    if (outcomeval.intV==failIndex)
      times[timeval.floatV].failed += wei;
    else
      times[timeval.floatV].censored += wei;

    sow += wei;
  }
Ejemplo n.º 12
0
int main(int argc, char ** argv) {
 
  int    Num_procs;       /* number of ranks                                     */
  int    Num_procsx, Num_procsy; /* number of ranks in each coord direction      */
  int    my_ID;           /* SHMEM rank                                          */
  int    my_IDx, my_IDy;  /* coordinates of rank in rank grid                    */
  int    right_nbr;       /* global rank of right neighboring tile               */
  int    left_nbr;        /* global rank of left neighboring tile                */
  int    top_nbr;         /* global rank of top neighboring tile                 */
  int    bottom_nbr;      /* global rank of bottom neighboring tile              */
  DTYPE *top_buf_out;     /* communication buffer                                */
  DTYPE *top_buf_in[2];   /*       "         "                                   */
  DTYPE *bottom_buf_out;  /*       "         "                                   */
  DTYPE *bottom_buf_in[2];/*       "         "                                   */
  DTYPE *right_buf_out;   /*       "         "                                   */
  DTYPE *right_buf_in[2]; /*       "         "                                   */
  DTYPE *left_buf_out;    /*       "         "                                   */
  DTYPE *left_buf_in[2];  /*       "         "                                   */
  int    root = 0;
  int    n, width, height;/* linear global and local grid dimension              */
  int    i, j, ii, jj, kk, it, jt, iter, leftover;  /* dummies                   */
  int    istart, iend;    /* bounds of grid tile assigned to calling rank        */
  int    jstart, jend;    /* bounds of grid tile assigned to calling rank        */
  DTYPE  reference_norm;
  DTYPE  f_active_points; /* interior of grid with respect to stencil            */
  int    stencil_size;    /* number of points in the stencil                     */
  DTYPE  flops;           /* floating point ops per iteration                    */
  int    iterations;      /* number of times to run the algorithm                */
  double avgtime,         /* timing parameters                                   */
         *local_stencil_time, *stencil_time; 
  DTYPE  * RESTRICT in;   /* input grid values                                   */
  DTYPE  * RESTRICT out;  /* output grid values                                  */
  long   total_length_in; /* total required length to store input array          */
  long   total_length_out;/* total required length to store output array         */
  int    error=0;         /* error flag                                          */
  DTYPE  weight[2*RADIUS+1][2*RADIUS+1]; /* weights of points in the stencil     */
  int    *arguments;      /* command line parameters                             */
  int    count_case=4;    /* number of neighbors of a rank                       */
  long   *pSync_bcast;    /* work space for collectives                          */
  long   *pSync_reduce;   /* work space for collectives                          */
  double *pWrk_time;      /* work space for collectives                          */
  DTYPE  *pWrk_norm;      /* work space for collectives                          */
  int    *iterflag;       /* synchronization flags                               */
  int    sw;              /* double buffering switch                             */
  DTYPE  *local_norm, *norm; /* local and global error norms                     */

  /*******************************************************************************
  ** Initialize the SHMEM environment
  ********************************************************************************/
  prk_shmem_init();

  my_ID=prk_shmem_my_pe();
  Num_procs=prk_shmem_n_pes();

  pSync_bcast        = (long *)   prk_shmem_malloc(PRK_SHMEM_BCAST_SYNC_SIZE*sizeof(long));
  pSync_reduce       = (long *)   prk_shmem_malloc(PRK_SHMEM_REDUCE_SYNC_SIZE*sizeof(long));
  pWrk_time          = (double *) prk_shmem_malloc(PRK_SHMEM_REDUCE_MIN_WRKDATA_SIZE*sizeof(double));
  pWrk_norm          = (DTYPE *)  prk_shmem_malloc(PRK_SHMEM_REDUCE_MIN_WRKDATA_SIZE*sizeof(DTYPE));
  local_stencil_time = (double *) prk_shmem_malloc(sizeof(double));
  stencil_time       = (double *) prk_shmem_malloc(sizeof(double));
  local_norm         = (DTYPE *)  prk_shmem_malloc(sizeof(DTYPE));
  norm               = (DTYPE *)  prk_shmem_malloc(sizeof(DTYPE));
  iterflag           = (int *)    prk_shmem_malloc(2*sizeof(int));
  if (!(pSync_bcast && pSync_reduce && pWrk_time && pWrk_norm && iterflag &&
	local_stencil_time && stencil_time && local_norm && norm))
  {
    printf("Could not allocate scalar variables on rank %d\n", my_ID);
    error = 1;
  }
  bail_out(error);

  for(i=0;i<PRK_SHMEM_BCAST_SYNC_SIZE;i++)
    pSync_bcast[i]=PRK_SHMEM_SYNC_VALUE;

  for(i=0;i<PRK_SHMEM_REDUCE_SYNC_SIZE;i++)
    pSync_reduce[i]=PRK_SHMEM_SYNC_VALUE;

  arguments=(int*)prk_shmem_malloc(2*sizeof(int));
 
  /*******************************************************************************
  ** process, test, and broadcast input parameters    
  ********************************************************************************/
 
  if (my_ID == root) {
#ifndef STAR
    printf("ERROR: Compact stencil not supported\n");
    error = 1;
    goto ENDOFTESTS;
#endif
      
    if (argc != 3){
      printf("Usage: %s <# iterations> <array dimension> \n", 
             *argv);
      error = 1;
      goto ENDOFTESTS;
    }
 
    iterations  = atoi(*++argv); 
    arguments[0]=iterations;

    if (iterations < 1){
      printf("ERROR: iterations must be >= 1 : %d \n",iterations);
      error = 1;
      goto ENDOFTESTS;  
    }
 
    n  = atoi(*++argv);
    arguments[1]=n;
    long nsquare = (long)n * (long)n;

    if (nsquare < Num_procs){ 
      printf("ERROR: grid size must be at least # ranks: %ld\n", nsquare);
      error = 1;
      goto ENDOFTESTS;
    }
 
    if (RADIUS < 0) {
      printf("ERROR: Stencil radius %d should be non-negative\n", RADIUS);
      error = 1;
      goto ENDOFTESTS;  
    }
 
    if (2*RADIUS +1 > n) {
      printf("ERROR: Stencil radius %d exceeds grid size %d\n", RADIUS, n);
      error = 1;
      goto ENDOFTESTS;  
    }
 
    ENDOFTESTS:;  
  }
  bail_out(error);
 
  /* determine best way to create a 2D grid of ranks (closest to square, for 
     best surface/volume ratio); we do this brute force for now
  */
  for (Num_procsx=(int) (sqrt(Num_procs+1)); Num_procsx>0; Num_procsx--) {
    if (!(Num_procs%Num_procsx)) {
      Num_procsy = Num_procs/Num_procsx;
      break;
    }
  }      
  my_IDx = my_ID%Num_procsx;
  my_IDy = my_ID/Num_procsx;
  /* compute neighbors; don't worry about dropping off the edges of the grid */
  right_nbr  = my_ID+1;
  left_nbr   = my_ID-1;
  top_nbr    = my_ID+Num_procsx;
  bottom_nbr = my_ID-Num_procsx;

  iterflag[0] = iterflag[1] = 0;

  if(my_IDx==0)            count_case--;
  if(my_IDx==Num_procsx-1) count_case--;
  if(my_IDy==0)            count_case--;
  if(my_IDy==Num_procsy-1) count_case--;
 
  if (my_ID == root) {
    printf("Parallel Research Kernels version %s\n", PRKVERSION);
    printf("SHMEM stencil execution on 2D grid\n");
    printf("Number of ranks        = %d\n", Num_procs);
    printf("Grid size              = %d\n", n);
    printf("Radius of stencil      = %d\n", RADIUS);
    printf("Tiles in x/y-direction = %d/%d\n", Num_procsx, Num_procsy);
    printf("Type of stencil        = star\n");
#ifdef DOUBLE
    printf("Data type              = double precision\n");
#else
    printf("Data type              = single precision\n");
#endif
#if LOOPGEN
    printf("Script used to expand stencil loop body\n");
#else
    printf("Compact representation of stencil loop body\n");
#endif
#if SPLITFENCE
    printf("Split fence            = ON\n");
#else
    printf("Split fence            = OFF\n");
#endif
    printf("Number of iterations   = %d\n", iterations);
  }

  shmem_barrier_all();
 
  shmem_broadcast32(&arguments[0], &arguments[0], 2, root, 0, 0, Num_procs, pSync_bcast);

  iterations=arguments[0];
  n=arguments[1];

  shmem_barrier_all();
  prk_shmem_free(arguments);
 
  /* compute amount of space required for input and solution arrays             */
  
  width = n/Num_procsx;
  leftover = n%Num_procsx;
  if (my_IDx<leftover) {
    istart = (width+1) * my_IDx; 
    iend = istart + width + 1;
  }
  else {
    istart = (width+1) * leftover + width * (my_IDx-leftover);
    iend = istart + width;
  }
  
  width = iend - istart + 1;
  if (width == 0) {
    printf("ERROR: rank %d has no work to do\n", my_ID);
    error = 1;
  }
  bail_out(error);
 
  height = n/Num_procsy;
  leftover = n%Num_procsy;
  if (my_IDy<leftover) {
    jstart = (height+1) * my_IDy; 
    jend = jstart + height + 1;
  }
  else {
    jstart = (height+1) * leftover + height * (my_IDy-leftover);
    jend = jstart + height;
  }
  
  height = jend - jstart + 1;
  if (height == 0) {
    printf("ERROR: rank %d has no work to do\n", my_ID);
    error = 1;
  }
  bail_out(error);
 
  if (width < RADIUS || height < RADIUS) {
    printf("ERROR: rank %d has work tile smaller then stencil radius\n",
           my_ID);
    error = 1;
  }
  bail_out(error);
 
  total_length_in = (width+2*RADIUS);
  total_length_in *= (height+2*RADIUS);
  total_length_in *= sizeof(DTYPE);

  total_length_out = width;
  total_length_out *= height;
  total_length_out *= sizeof(DTYPE);
 
  in  = (DTYPE *) malloc(total_length_in);
  out = (DTYPE *) malloc(total_length_out);
  if (!in || !out) {
    printf("ERROR: rank %d could not allocate space for input/output array\n",
            my_ID);
    error = 1;
  }
  bail_out(error);
 
  /* fill the stencil weights to reflect a discrete divergence operator         */
  for (jj=-RADIUS; jj<=RADIUS; jj++) for (ii=-RADIUS; ii<=RADIUS; ii++)
    WEIGHT(ii,jj) = (DTYPE) 0.0;
  stencil_size = 4*RADIUS+1;

  for (ii=1; ii<=RADIUS; ii++) {
    WEIGHT(0, ii) = WEIGHT( ii,0) =  (DTYPE) (1.0/(2.0*ii*RADIUS));
    WEIGHT(0,-ii) = WEIGHT(-ii,0) = -(DTYPE) (1.0/(2.0*ii*RADIUS));
  }
 
  norm[0] = (DTYPE) 0.0;
  f_active_points = (DTYPE) (n-2*RADIUS)*(DTYPE) (n-2*RADIUS);

  /* intialize the input and output arrays                                     */
  for (j=jstart; j<jend; j++) for (i=istart; i<iend; i++) {
    IN(i,j)  = COEFX*i+COEFY*j;
    OUT(i,j) = (DTYPE)0.0;
  }

  /* allocate communication buffers for halo values                            */
  top_buf_out=(DTYPE*)malloc(2*sizeof(DTYPE)*RADIUS*width);
  if (!top_buf_out) {
    printf("ERROR: Rank %d could not allocate output comm buffers for y-direction\n", my_ID);
    error = 1;
  }
  bail_out(error);
  bottom_buf_out = top_buf_out+RADIUS*width;

  top_buf_in[0]=(DTYPE*)prk_shmem_malloc(4*sizeof(DTYPE)*RADIUS*width);
  if(!top_buf_in)
  {
    printf("ERROR: Rank %d could not allocate input comm buffers for y-direction\n", my_ID);
    error=1;
  }
  bail_out(error);
  top_buf_in[1]    = top_buf_in[0]    + RADIUS*width;
  bottom_buf_in[0] = top_buf_in[1]    + RADIUS*width;
  bottom_buf_in[1] = bottom_buf_in[0] + RADIUS*width;
 
  right_buf_out=(DTYPE*)malloc(2*sizeof(DTYPE)*RADIUS*height);
  if (!right_buf_out) {
    printf("ERROR: Rank %d could not allocate output comm buffers for x-direction\n", my_ID);
    error = 1;
  }
  bail_out(error);
  left_buf_out=right_buf_out+RADIUS*height;

  right_buf_in[0]=(DTYPE*)prk_shmem_malloc(4*sizeof(DTYPE)*RADIUS*height);
  if(!right_buf_in)
  {
    printf("ERROR: Rank %d could not allocate input comm buffers for x-dimension\n", my_ID);
    error=1;
  }
  bail_out(error);
  right_buf_in[1] = right_buf_in[0] + RADIUS*height;
  left_buf_in[0]  = right_buf_in[1] + RADIUS*height;
  left_buf_in[1]  = left_buf_in[0]  + RADIUS*height;

  /* make sure all symmetric heaps are allocated before being used  */
  shmem_barrier_all();

  for (iter = 0; iter<=iterations; iter++){

    /* start timer after a warmup iteration */
    if (iter == 1) { 
      shmem_barrier_all();
      local_stencil_time[0] = wtime();
    }
    /* sw determines which incoming buffer to select */
    sw = iter%2;

    /* need to fetch ghost point data from neighbors */

    if (my_IDy < Num_procsy-1) {
      for (kk=0,j=jend-RADIUS; j<=jend-1; j++) for (i=istart; i<=iend; i++) {
          top_buf_out[kk++]= IN(i,j);
      }
      shmem_putmem(bottom_buf_in[sw], top_buf_out, RADIUS*width*sizeof(DTYPE), top_nbr);
#if SPLITFENCE
      shmem_fence();
      shmem_int_inc(&iterflag[sw], top_nbr);
#endif
    }
    if (my_IDy > 0) {
      for (kk=0,j=jstart; j<=jstart+RADIUS-1; j++) for (i=istart; i<=iend; i++) {
          bottom_buf_out[kk++]= IN(i,j);
      }
      shmem_putmem(top_buf_in[sw], bottom_buf_out, RADIUS*width*sizeof(DTYPE), bottom_nbr);
#if SPLITFENCE
      shmem_fence();
      shmem_int_inc(&iterflag[sw], bottom_nbr);
#endif
    }

    if(my_IDx < Num_procsx-1) {
      for(kk=0,j=jstart;j<=jend;j++) for(i=iend-RADIUS;i<=iend-1;i++) {
	right_buf_out[kk++]=IN(i,j);
      }
      shmem_putmem(left_buf_in[sw], right_buf_out, RADIUS*height*sizeof(DTYPE), right_nbr);
#if SPLITFENCE
      shmem_fence();
      shmem_int_inc(&iterflag[sw], right_nbr);
#endif
    }

    if(my_IDx>0) {
      for(kk=0,j=jstart;j<=jend;j++) for(i=istart;i<=istart+RADIUS-1;i++) {
	left_buf_out[kk++]=IN(i,j);
      }
      shmem_putmem(right_buf_in[sw], left_buf_out, RADIUS*height*sizeof(DTYPE), left_nbr);
#if SPLITFENCE
      shmem_fence();
      shmem_int_inc(&iterflag[sw], left_nbr);
#endif
    }

#if SPLITFENCE == 0
    shmem_fence();
    if(my_IDy<Num_procsy-1) shmem_int_inc(&iterflag[sw], top_nbr);
    if(my_IDy>0)            shmem_int_inc(&iterflag[sw], bottom_nbr);
    if(my_IDx<Num_procsx-1) shmem_int_inc(&iterflag[sw], right_nbr);
    if(my_IDx>0)            shmem_int_inc(&iterflag[sw], left_nbr);
#endif

    shmem_int_wait_until(&iterflag[sw], SHMEM_CMP_EQ, count_case*(iter/2+1));

    if (my_IDy < Num_procsy-1) {
      for (kk=0,j=jend; j<=jend+RADIUS-1; j++) for (i=istart; i<=iend; i++) {
          IN(i,j) = top_buf_in[sw][kk++];
      }      
    }
    if (my_IDy > 0) {
      for (kk=0,j=jstart-RADIUS; j<=jstart-1; j++) for (i=istart; i<=iend; i++) {
          IN(i,j) = bottom_buf_in[sw][kk++];
      }      
    }

    if (my_IDx < Num_procsx-1) {
      for (kk=0,j=jstart; j<=jend; j++) for (i=iend; i<=iend+RADIUS-1; i++) {
          IN(i,j) = right_buf_in[sw][kk++];
      }      
    }
    if (my_IDx > 0) {
      for (kk=0,j=jstart; j<=jend; j++) for (i=istart-RADIUS; i<=istart-1; i++) {
          IN(i,j) = left_buf_in[sw][kk++];
      }      
    }
 
    /* Apply the stencil operator */
    for (j=MAX(jstart,RADIUS); j<=MIN(n-RADIUS-1,jend); j++) {
      for (i=MAX(istart,RADIUS); i<=MIN(n-RADIUS-1,iend); i++) {
        #if LOOPGEN
          #include "loop_body_star.incl"
        #else
          for (jj=-RADIUS; jj<=RADIUS; jj++) OUT(i,j) += WEIGHT(0,jj)*IN(i,j+jj);
          for (ii=-RADIUS; ii<0; ii++)       OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j);
          for (ii=1; ii<=RADIUS; ii++)       OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j);
        #endif
      }
    }
 
    /* add constant to solution to force refresh of neighbor data, if any */
    for (j=jstart; j<jend; j++) for (i=istart; i<iend; i++) IN(i,j)+= 1.0;
 
  }
 
  local_stencil_time[0] = wtime() - local_stencil_time[0];

  shmem_barrier_all();

  shmem_double_max_to_all(&stencil_time[0], &local_stencil_time[0], 1, 0, 0,
                          Num_procs, pWrk_time, pSync_reduce);
  
  /* compute L1 norm in parallel                                                */
  local_norm[0] = (DTYPE) 0.0;
  for (j=MAX(jstart,RADIUS); j<MIN(n-RADIUS,jend); j++) {
    for (i=MAX(istart,RADIUS); i<MIN(n-RADIUS,iend); i++) {
      local_norm[0] += (DTYPE)ABS(OUT(i,j));
    }
  }

  shmem_barrier_all();
 
#ifdef DOUBLE
  shmem_double_sum_to_all(&norm[0], &local_norm[0], 1, 0, 0, Num_procs, pWrk_norm, pSync_reduce);
#else
  shmem_float_sum_to_all(&norm[0], &local_norm[0], 1, 0, 0, Num_procs, pWrk_norm, pSync_reduce);
#endif
 
  /*******************************************************************************
  ** Analyze and output results.
  ********************************************************************************/
 
/* verify correctness                                                            */
  if (my_ID == root) {
    norm[0] /= f_active_points;
    if (RADIUS > 0) {
      reference_norm = (DTYPE) (iterations+1) * (COEFX + COEFY);
    }
    else {
      reference_norm = (DTYPE) 0.0;
    }
    if (ABS(norm[0]-reference_norm) > EPSILON) {
      printf("ERROR: L1 norm = "FSTR", Reference L1 norm = "FSTR"\n",
             norm[0], reference_norm);
      error = 1;
    }
    else {
      printf("Solution validates\n");
#ifdef VERBOSE
      printf("Reference L1 norm = "FSTR", L1 norm = "FSTR"\n", 
             reference_norm, norm[0]);
#endif
    }
  }
  bail_out(error);
 
  if (my_ID == root) {
    /* flops/stencil: 2 flops (fma) for each point in the stencil, 
       plus one flop for the update of the input of the array        */
    flops = (DTYPE) (2*stencil_size+1) * f_active_points;
    avgtime = stencil_time[0]/iterations;
    printf("Rate (MFlops/s): "FSTR"  Avg time (s): %lf\n",
           1.0E-06 * flops/avgtime, avgtime);
  }
 

  prk_shmem_free(top_buf_in);
  prk_shmem_free(right_buf_in);
  free(top_buf_out);
  free(right_buf_out);

  prk_shmem_free(pSync_bcast);
  prk_shmem_free(pSync_reduce);
  prk_shmem_free(pWrk_time);
  prk_shmem_free(pWrk_norm);

  prk_shmem_finalize();

  exit(EXIT_SUCCESS);
}
Ejemplo n.º 13
0
    Main(CkArgMsg* m) {

      int num_chares, min_size;
      long nsquare;         

      CkPrintf("Parallel Research Kernels Version %s\n", PRKVERSION);
      CkPrintf("Charm++ stencil execution on 2D grid\n");

      if (m->argc != 4) {
        CkPrintf("%s <maxiterations> <grid_size> <overdecomposition factor>\n", m->argv[0]);
        CkExit();
      }

      // store the main proxy
      mainProxy = thisProxy;

      maxiterations = atoi(m->argv[1]);
      if (maxiterations < 1) {
        CkPrintf("ERROR: maxiterations must be positive: %d", maxiterations);
        CkExit();
      }

      n = atoi(m->argv[2]);
      nsquare = n * n;
      if (nsquare < CkNumPes()) {
        CkPrintf("ERROR: Grid size %ld must be larger than  #PEs %d", 
                 nsquare, CkNumPes());
        CkExit();
      }

      overdecomposition = atoi(m->argv[3]);
      if (n < overdecomposition) {
        CkPrintf("ERROR: Grid size %d must be larger than overdecomposition %d",
                 n, overdecomposition);
	CkExit();
      }

      if (RADIUS < 0) {
        CkPrintf("ERROR: Stencil radius %d should be non-negative\n", RADIUS);
        CkExit();
      }
  
      if (2*RADIUS +1 > n) {
        CkPrintf("ERROR: Stencil diameter %d exceeds grid size %d\n", 2*RADIUS +1 , n);
        CkExit();
      }

      // compute decomposition that has smallest surface/volume ratio
      num_chares = CkNumPes()*overdecomposition;
      for (num_chare_cols= (int) (sqrt(num_chares+1)); num_chare_cols>0; num_chare_cols--) {
        if (!(num_chares%num_chare_cols)) {
          num_chare_rows = num_chares/num_chare_cols;
          break;
        }
      }
      // determine best way to create a 2D grid of ranks (closest to square)     */
      factor(num_chares, &num_chare_cols, &num_chare_rows);
      
      min_size = (n+num_chare_cols-1)/num_chare_cols;
      if (min_size<RADIUS) {
        CkPrintf("ERROR: Some tiles smaller than radius of difference stencil\n");
        CkExit();
      }

      // print info
      CkPrintf("Number of Charm++ PEs   = %d\n", CkNumPes());
      CkPrintf("Overdecomposition       = %d\n", overdecomposition);
      CkPrintf("Grid size               = %d\n", n);
      CkPrintf("Radius of stencil       = %d\n", RADIUS);
      CkPrintf("Chares in x/y-direction = %d/%d\n", num_chare_cols, num_chare_rows);
#if STAR
      CkPrintf("Type of stencil         = star\n");
#else
      CkPrintf("Type of stencil         = compact\n");
      CkPrintf("ERROR: Compact stencil not (yet) supported\n");
      CkExit();
#endif
#if LOOPGEN
      CkPrintf("Script used to expand stencil loop body\n");
#else
      CkPrintf("Compact representation of stencil loop body\n");
#endif
      CkPrintf("Number of iterations    = %d\n", maxiterations);

      // Create new array of worker chares
      array = CProxy_Stencil::ckNew(num_chare_cols, num_chare_rows);

      /* fill the stencil weights to reflect a discrete divergence operator         */
      for (int j=-RADIUS; j<=RADIUS; j++) for (int i=-RADIUS; i<=RADIUS; i++)
					      WEIGHT(i,j) = 0.0;
      #if STAR
        for (int i=1; i<=RADIUS; i++) {
          WEIGHT(0, i) = WEIGHT( i,0) =  (1.0/(2.0*i*RADIUS));
          WEIGHT(0,-i) = WEIGHT(-i,0) = -(1.0/(2.0*i*RADIUS));
        }
      #else
        stencil_size = (2*RADIUS+1)*(2*RADIUS+1);
        for (int j=1; j<=RADIUS; j++) {
          for (int i=-j+1; i<j; i++) {
            WEIGHT(i,j)  =  (1.0/(4.0*j*(2.0*j-1)*RADIUS));
            WEIGHT(i,-j) = -(1.0/(4.0*j*(2.0*j-1)*RADIUS));
            WEIGHT(j,i)  =  (1.0/(4.0*j*(2.0*j-1)*RADIUS));
            WEIGHT(-j,i) = -(1.0/(4.0*j*(2.0*j-1)*RADIUS));      
          }
          WEIGHT(j,j)    =  (1.0/(4.0*j*RADIUS));
          WEIGHT(-j,-j)  = -(1.0/(4.0*j*RADIUS));
        }
      #endif  

      //Start the computation
      array.run();
    }
Ejemplo n.º 14
0
int main(int argc, char ** argv) {

  int    Num_procs;       /* number of ranks                                     */
  int    Num_procsx,
         Num_procsy;      /* number of ranks in each coord direction             */
  int    Num_groupsx,
         Num_groupsy;     /* number of blocks in each coord direction            */
  int    my_group;        /* sequence number of shared memory block              */
  int    my_group_IDx,
         my_group_IDy;    /* coordinates of block within block grid              */
  int    group_size;      /* number of ranks in shared memory group              */
  int    group_sizex,
         group_sizey;     /* number of ranks in block in each coord direction    */
  int    my_ID;           /* MPI rank                                            */
  int    my_global_IDx,
         my_global_IDy;   /* coordinates of rank in overall rank grid            */
  int    my_local_IDx,
         my_local_IDy;    /* coordinates of rank within shared memory block      */
  int    right_nbr;       /* global rank of right neighboring tile               */
  int    left_nbr;        /* global rank of left neighboring tile                */
  int    top_nbr;         /* global rank of top neighboring tile                 */
  int    bottom_nbr;      /* global rank of bottom neighboring tile              */
  int    local_nbr[4];    /* list of synchronizing local neighbors               */
  int    num_local_nbrs;  /* number of synchronizing local neighbors             */
  int    dummy;
  DTYPE *top_buf_out;     /* communication buffer                                */
  DTYPE *top_buf_in;      /*       "         "                                   */
  DTYPE *bottom_buf_out;  /*       "         "                                   */
  DTYPE *bottom_buf_in;   /*       "         "                                   */
  DTYPE *right_buf_out;   /*       "         "                                   */
  DTYPE *right_buf_in;    /*       "         "                                   */
  DTYPE *left_buf_out;    /*       "         "                                   */
  DTYPE *left_buf_in;     /*       "         "                                   */
  int    root = 0;
  long   n, width, height;/* linear global and block grid dimension              */
  int    width_rank,
         height_rank;     /* linear local dimension                              */
  int    iter, leftover;  /* dummies                   */
  int    istart_rank,
         iend_rank;       /* bounds of grid tile assigned to calling rank        */
  int    jstart_rank,
         jend_rank;       /* bounds of grid tile assigned to calling rank        */
  int    istart, iend;    /* bounds of grid block containing tile                */
  int    jstart, jend;    /* bounds of grid block containing tile                */
  DTYPE  norm,            /* L1 norm of solution                                 */
         local_norm,      /* contribution of calling rank to L1 norm             */
         reference_norm;  /* value to be matched by computed norm                */
  DTYPE  f_active_points; /* interior of grid with respect to stencil            */
  DTYPE  flops;           /* floating point ops per iteration                    */
  int    iterations;      /* number of times to run the algorithm                */
  double local_stencil_time,/* timing parameters                                 */
         stencil_time,
         avgtime;
  int    stencil_size;    /* number of points in stencil                         */
  DTYPE  * RESTRICT in;   /* input grid values                                   */
  DTYPE  * RESTRICT out;  /* output grid values                                  */
  long   total_length_in; /* total required length to store input array          */
  long   total_length_out;/* total required length to store output array         */
  int    error=0;         /* error flag                                          */
  DTYPE  weight[2*RADIUS+1][2*RADIUS+1]; /* weights of points in the stencil     */
  MPI_Request request[8]; /* requests for sends & receives in 4 coord directions */
  MPI_Win shm_win_in;     /* shared memory window object for IN array            */
  MPI_Win shm_win_out;    /* shared memory window object for OUT array           */
  MPI_Comm shm_comm_prep; /* preparatory shared memory communicator              */
  MPI_Comm shm_comm;      /* Shared Memory Communicator                          */
  int shm_procs;          /* # of rankes in shared domain                        */
  int shm_ID;             /* MPI rank in shared memory domain                    */
  MPI_Aint size_in;       /* size of the IN array in shared memory window        */
  MPI_Aint size_out;      /* size of the OUT array in shared memory window       */
  int size_mul;           /* one for shm_comm root, zero for the other ranks     */
  int disp_unit;          /* ignored                                             */

  /*******************************************************************************
  ** Initialize the MPI environment
  ********************************************************************************/
  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_ID);
  MPI_Comm_size(MPI_COMM_WORLD, &Num_procs);

  /*******************************************************************************
  ** process, test, and broadcast input parameters
  ********************************************************************************/

  if (my_ID == root) {
    printf("Parallel Research Kernels version %s\n", PRKVERSION);
    printf("MPI+SHM stencil execution on 2D grid\n");

#if !STAR
      printf("ERROR: Compact stencil not supported\n");
      error = 1;
      goto ENDOFTESTS;
#endif

    if (argc != 4){
      printf("Usage: %s  <#ranks per coherence domain><# iterations> <array dimension> \n",
             *argv);
      error = 1;
      goto ENDOFTESTS;
    }

    group_size = atoi(*++argv);
    if (group_size < 1) {
      printf("ERROR: # ranks per coherence domain must be >= 1 : %d \n",group_size);
      error = 1;
      goto ENDOFTESTS;
    }
    if (Num_procs%group_size) {
      printf("ERROR: total # %d ranks not divisible by ranks per coherence domain %d\n",
	     Num_procs, group_size);
      error = 1;
      goto ENDOFTESTS;
    }

    iterations  = atoi(*++argv);
    if (iterations < 0){
      printf("ERROR: iterations must be >= 0 : %d \n",iterations);
      error = 1;
      goto ENDOFTESTS;
    }

    n  = atol(*++argv);
    long nsquare = n * n;
    if (nsquare < Num_procs){
      printf("ERROR: grid size must be at least # ranks: %ld\n", nsquare);
      error = 1;
      goto ENDOFTESTS;
    }

    if (RADIUS < 0) {
      printf("ERROR: Stencil radius %d should be non-negative\n", RADIUS);
      error = 1;
      goto ENDOFTESTS;
    }

    if (2*RADIUS +1 > n) {
      printf("ERROR: Stencil radius %d exceeds grid size %ld\n", RADIUS, n);
      error = 1;
      goto ENDOFTESTS;
    }

    ENDOFTESTS:;
  }
  bail_out(error);

  MPI_Bcast(&n,          1, MPI_LONG, root, MPI_COMM_WORLD);
  MPI_Bcast(&iterations, 1, MPI_INT, root, MPI_COMM_WORLD);
  MPI_Bcast(&group_size, 1, MPI_INT, root, MPI_COMM_WORLD);

  /* determine best way to create a 2D grid of ranks (closest to square, for
     best surface/volume ratio); we do this brute force for now. The
     decomposition needs to be such that shared memory groups can evenly
     tessellate the rank grid
  */
  for (Num_procsx=(int) (sqrt(Num_procs+1)); Num_procsx>0; Num_procsx--) {
    if (!(Num_procs%Num_procsx)) {
      Num_procsy = Num_procs/Num_procsx;
      for (group_sizex=(int)(sqrt(group_size+1)); group_sizex>0; group_sizex--) {
        if (!(group_size%group_sizex) && !(Num_procsx%group_sizex)) {
          group_sizey=group_size/group_sizex;
          break;
        }
      }
      if (!(Num_procsy%group_sizey)) break;
    }
  }


  if (my_ID == root) {
    printf("Number of ranks                 = %d\n", Num_procs);
    printf("Grid size                       = %ld\n", n);
    printf("Radius of stencil               = %d\n", RADIUS);
    printf("Tiles in x/y-direction          = %d/%d\n", Num_procsx, Num_procsy);
    printf("Tiles per shared memory domain  = %d\n", group_size);
    printf("Tiles in x/y-direction in group = %d/%d\n", group_sizex,  group_sizey);
    printf("Type of stencil                 = star\n");
#if LOCAL_BARRIER_SYNCH
    printf("Local synchronization           = barrier\n");
#else
    printf("Local synchronization           = point to point\n");
#endif
#if DOUBLE
    printf("Data type                       = double precision\n");
#else
    printf("Data type                       = single precision\n");
#endif
#if LOOPGEN
    printf("Script used to expand stencil loop body\n");
#else
    printf("Compact representation of stencil loop body\n");
#endif
    printf("Number of iterations            = %d\n", iterations);
  }

  /* Setup for Shared memory regions */

  /* first divide WORLD in groups of size group_size */
  MPI_Comm_split(MPI_COMM_WORLD, my_ID/group_size, my_ID%group_size, &shm_comm_prep);
  /* derive from that an SHM communicator */
  MPI_Comm_split_type(shm_comm_prep, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shm_comm);
  MPI_Comm_rank(shm_comm, &shm_ID);
  MPI_Comm_size(shm_comm, &shm_procs);
  /* do sanity check, making sure groups did not shrink in second comm split */
  if (shm_procs != group_size) MPI_Abort(MPI_COMM_WORLD, 666);

  Num_groupsx = Num_procsx/group_sizex;
  Num_groupsy = Num_procsy/group_sizey;

  my_group = my_ID/group_size;
  my_group_IDx = my_group%Num_groupsx;
  my_group_IDy = my_group/Num_groupsx;
  my_local_IDx = my_ID%group_sizex;
  my_local_IDy = (my_ID%group_size)/group_sizex;
  my_global_IDx = my_group_IDx*group_sizex+my_local_IDx;
  my_global_IDy = my_group_IDy*group_sizey+my_local_IDy;

  /* set all neighboring ranks to -1 (no communication with those ranks) */
  left_nbr = right_nbr = top_nbr = bottom_nbr = -1;
  /* keep track of local neighbors for local synchronization             */
  num_local_nbrs = 0;

  if (my_local_IDx == group_sizex-1 && my_group_IDx != (Num_groupsx-1)) {
    right_nbr = (my_group+1)*group_size+shm_ID-group_sizex+1;
  }
  if (my_local_IDx != group_sizex-1) {
    local_nbr[num_local_nbrs++] = shm_ID + 1;
  }

  if (my_local_IDx == 0 && my_group_IDx != 0) {
    left_nbr = (my_group-1)*group_size+shm_ID+group_sizex-1;
  }
  if (my_local_IDx != 0) {
    local_nbr[num_local_nbrs++] = shm_ID - 1;
  }

  if (my_local_IDy == group_sizey-1 && my_group_IDy != (Num_groupsy-1)) {
    top_nbr = (my_group+Num_groupsx)*group_size + my_local_IDx;
  }
  if (my_local_IDy != group_sizey-1) {
    local_nbr[num_local_nbrs++] = shm_ID + group_sizex;
  }

  if (my_local_IDy == 0 && my_group_IDy != 0) {
    bottom_nbr = (my_group-Num_groupsx)*group_size + group_sizex*(group_sizey-1)+my_local_IDx;
  }
  if (my_local_IDy != 0) {
    local_nbr[num_local_nbrs++] = shm_ID - group_sizex;
  }

  /* compute amount of space required for input and solution arrays for the block,
     and also compute index sets                                                  */

  width = n/Num_groupsx;
  leftover = n%Num_groupsx;
  if (my_group_IDx<leftover) {
    istart = (width+1) * my_group_IDx;
    iend = istart + width;
  }
  else {
    istart = (width+1) * leftover + width * (my_group_IDx-leftover);
    iend = istart + width - 1;
  }

  width = iend - istart + 1;
  if (width == 0) {
    printf("ERROR: rank %d has no work to do\n", my_ID);
    error = 1;
  }
  bail_out(error);

  height = n/Num_groupsy;
  leftover = n%Num_groupsy;
  if (my_group_IDy<leftover) {
    jstart = (height+1) * my_group_IDy;
    jend = jstart + height;
  }
  else {
    jstart = (height+1) * leftover + height * (my_group_IDy-leftover);
    jend = jstart + height - 1;
  }

  height = jend - jstart + 1;
  if (height == 0) {
    printf("ERROR: rank %d has no work to do\n", my_ID);
    error = 1;
  }
  bail_out(error);

  if (width < RADIUS || height < RADIUS) {
    printf("ERROR: rank %d has work tile smaller then stencil radius; w=%ld,h=%ld\n",
           my_ID, width, height);
    error = 1;
  }
  bail_out(error);

  total_length_in = (width+2*RADIUS)*(height+2*RADIUS)*sizeof(DTYPE);
  total_length_out = width*height*sizeof(DTYPE);

  /* only the root of each SHM domain specifies window of nonzero size */
  size_mul = (shm_ID==0);
  size_in= total_length_in*size_mul;
  MPI_Win_allocate_shared(size_in, sizeof(double), MPI_INFO_NULL, shm_comm,
                          (void *) &in, &shm_win_in);
  MPI_Win_lock_all(MPI_MODE_NOCHECK, shm_win_in);
  MPI_Win_shared_query(shm_win_in, MPI_PROC_NULL, &size_in, &disp_unit, (void *)&in);
  if (in == NULL){
    printf("Error allocating space for input array by group %d\n",my_group);
    error = 1;
  }
  bail_out(error);

  size_out= total_length_out*size_mul;
  MPI_Win_allocate_shared(size_out, sizeof(double), MPI_INFO_NULL, shm_comm,
                          (void *) &out, &shm_win_out);
  MPI_Win_lock_all(MPI_MODE_NOCHECK, shm_win_out);
  MPI_Win_shared_query(shm_win_out, MPI_PROC_NULL, &size_out, &disp_unit, (void *)&out);
  if (out == NULL){
    printf("Error allocating space for output array by group %d\n", my_group);
    error = 1;
  }
  bail_out(error);

  /* determine index set assigned to each rank                         */

  width_rank = width/group_sizex;
  leftover = width%group_sizex;
  if (my_local_IDx<leftover) {
    istart_rank = (width_rank+1) * my_local_IDx;
    iend_rank = istart_rank + width_rank;
  }
  else {
    istart_rank = (width_rank+1) * leftover + width_rank * (my_local_IDx-leftover);
    iend_rank = istart_rank + width_rank - 1;
  }
  istart_rank += istart;
  iend_rank += istart;
  width_rank = iend_rank - istart_rank + 1;

  height_rank = height/group_sizey;
  leftover = height%group_sizey;
  if (my_local_IDy<leftover) {
    jstart_rank = (height_rank+1) * my_local_IDy;
    jend_rank = jstart_rank + height_rank;
  }
  else {
    jstart_rank = (height_rank+1) * leftover + height_rank * (my_local_IDy-leftover);
    jend_rank = jstart_rank + height_rank - 1;
  }
  jstart_rank+=jstart;
  jend_rank+=jstart;
  height_rank = jend_rank - jstart_rank + 1;

  if (height_rank*width_rank==0) {
    error = 1;
    printf("Rank %d has no work to do\n", my_ID);
  }
  bail_out(error);

  /* allocate communication buffers for halo values                            */
  top_buf_out = (DTYPE *) prk_malloc(4*sizeof(DTYPE)*RADIUS*width_rank);
  if (!top_buf_out) {
    printf("ERROR: Rank %d could not allocated comm buffers for y-direction\n", my_ID);
    error = 1;
  }
  bail_out(error);
  top_buf_in     = top_buf_out +   RADIUS*width_rank;
  bottom_buf_out = top_buf_out + 2*RADIUS*width_rank;
  bottom_buf_in  = top_buf_out + 3*RADIUS*width_rank;

  right_buf_out = (DTYPE *) prk_malloc(4*sizeof(DTYPE)*RADIUS*height_rank);
  if (!right_buf_out) {
    printf("ERROR: Rank %d could not allocated comm buffers for x-direction\n", my_ID);
    error = 1;
  }
  bail_out(error);
  right_buf_in   = right_buf_out +   RADIUS*height_rank;
  left_buf_out   = right_buf_out + 2*RADIUS*height_rank;
  left_buf_in    = right_buf_out + 3*RADIUS*height_rank;

    /* fill the stencil weights to reflect a discrete divergence operator         */
  for (int jj=-RADIUS; jj<=RADIUS; jj++) for (int ii=-RADIUS; ii<=RADIUS; ii++)
    WEIGHT(ii,jj) = (DTYPE) 0.0;
  stencil_size = 4*RADIUS+1;
  for (int ii=1; ii<=RADIUS; ii++) {
    WEIGHT(0, ii) = WEIGHT( ii,0) =  (DTYPE) (1.0/(2.0*ii*RADIUS));
    WEIGHT(0,-ii) = WEIGHT(-ii,0) = -(DTYPE) (1.0/(2.0*ii*RADIUS));
  }

  norm = (DTYPE) 0.0;
  f_active_points = (DTYPE) (n-2*RADIUS)*(DTYPE) (n-2*RADIUS);
  /* intialize the input and output arrays                                     */
  for (int j=jstart_rank; j<=jend_rank; j++) for (int i=istart_rank; i<=iend_rank; i++) {
    IN(i,j)  = COEFX*i+COEFY*j;
    OUT(i,j) = (DTYPE)0.0;
  }

  /* LOAD/STORE FENCE */
  MPI_Win_sync(shm_win_in);
  MPI_Win_sync(shm_win_out);
  MPI_Barrier(shm_comm);

  for (iter = 0; iter<=iterations; iter++){

    /* start timer after a warmup iteration */
    if (iter == 1) {
      MPI_Barrier(MPI_COMM_WORLD);
      local_stencil_time = wtime();
    }

    /* need to fetch ghost point data from neighbors in y-direction                 */
    if (top_nbr != -1) {
      MPI_Irecv(top_buf_in, RADIUS*width_rank, MPI_DTYPE, top_nbr, 101,
                MPI_COMM_WORLD, &(request[1]));
      for (int kk=0,j=jend_rank-RADIUS+1; j<=jend_rank; j++)
      for (int i=istart_rank; i<=iend_rank; i++) {
        top_buf_out[kk++]= IN(i,j);
      }
      MPI_Isend(top_buf_out, RADIUS*width_rank,MPI_DTYPE, top_nbr, 99,
                MPI_COMM_WORLD, &(request[0]));
    }

    if (bottom_nbr != -1) {
      MPI_Irecv(bottom_buf_in,RADIUS*width_rank, MPI_DTYPE, bottom_nbr, 99,
                MPI_COMM_WORLD, &(request[3]));
      for (int kk=0,j=jstart_rank; j<=jstart_rank+RADIUS-1; j++)
      for (int i=istart_rank; i<=iend_rank; i++) {
        bottom_buf_out[kk++]= IN(i,j);
      }
      MPI_Isend(bottom_buf_out, RADIUS*width_rank,MPI_DTYPE, bottom_nbr, 101,
 	  MPI_COMM_WORLD, &(request[2]));
      }

    if (top_nbr != -1) {
      MPI_Wait(&(request[0]), MPI_STATUS_IGNORE);
      MPI_Wait(&(request[1]), MPI_STATUS_IGNORE);
      for (int kk=0,j=jend_rank+1; j<=jend_rank+RADIUS; j++)
      for (int i=istart_rank; i<=iend_rank; i++) {
        IN(i,j) = top_buf_in[kk++];
      }
    }

    if (bottom_nbr != -1) {
      MPI_Wait(&(request[2]), MPI_STATUS_IGNORE);
      MPI_Wait(&(request[3]), MPI_STATUS_IGNORE);
      for (int kk=0,j=jstart_rank-RADIUS; j<=jstart_rank-1; j++)
      for (int i=istart_rank; i<=iend_rank; i++) {
        IN(i,j) = bottom_buf_in[kk++];
      }
    }

    /* LOAD/STORE FENCE */
    MPI_Win_sync(shm_win_in);

    /* need to fetch ghost point data from neighbors in x-direction                 */
    if (right_nbr != -1) {
      MPI_Irecv(right_buf_in, RADIUS*height_rank, MPI_DTYPE, right_nbr, 1010,
                MPI_COMM_WORLD, &(request[1+4]));
      for (int kk=0,j=jstart_rank; j<=jend_rank; j++)
      for (int i=iend_rank-RADIUS+1; i<=iend_rank; i++) {
        right_buf_out[kk++]= IN(i,j);
      }
      MPI_Isend(right_buf_out, RADIUS*height_rank, MPI_DTYPE, right_nbr, 990,
                MPI_COMM_WORLD, &(request[0+4]));
    }

    if (left_nbr != -1) {
      MPI_Irecv(left_buf_in, RADIUS*height_rank, MPI_DTYPE, left_nbr, 990,
                MPI_COMM_WORLD, &(request[3+4]));
      for (int kk=0,j=jstart_rank; j<=jend_rank; j++)
      for (int i=istart_rank; i<=istart_rank+RADIUS-1; i++) {
        left_buf_out[kk++]= IN(i,j);
      }
      MPI_Isend(left_buf_out, RADIUS*height_rank, MPI_DTYPE, left_nbr, 1010,
                MPI_COMM_WORLD, &(request[2+4]));
    }

    if (right_nbr != -1) {
      MPI_Wait(&(request[0+4]), MPI_STATUS_IGNORE);
      MPI_Wait(&(request[1+4]), MPI_STATUS_IGNORE);
      for (int kk=0,j=jstart_rank; j<=jend_rank; j++)
      for (int i=iend_rank+1; i<=iend_rank+RADIUS; i++) {
        IN(i,j) = right_buf_in[kk++];
      }
    }

    if (left_nbr != -1) {
      MPI_Wait(&(request[2+4]), MPI_STATUS_IGNORE);
      MPI_Wait(&(request[3+4]), MPI_STATUS_IGNORE);
      for (int kk=0,j=jstart_rank; j<=jend_rank; j++)
      for (int i=istart_rank-RADIUS; i<=istart_rank-1; i++) {
        IN(i,j) = left_buf_in[kk++];
      }
    }

    /* LOAD/STORE FENCE */
    MPI_Win_sync(shm_win_in);

    /* Apply the stencil operator */
    for (int j=MAX(jstart_rank,RADIUS); j<=MIN(n-RADIUS-1,jend_rank); j++) {
      for (int i=MAX(istart_rank,RADIUS); i<=MIN(n-RADIUS-1,iend_rank); i++) {
        #if LOOPGEN
          #include "loop_body_star.incl"
        #else
          for (int jj=-RADIUS; jj<=RADIUS; jj++) OUT(i,j) += WEIGHT(0,jj)*IN(i,j+jj);
          for (int ii=-RADIUS; ii<0; ii++)       OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j);
          for (int ii=1; ii<=RADIUS; ii++)       OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j);
        #endif
      }
    }

    /* LOAD/STORE FENCE */
    MPI_Win_sync(shm_win_out);

#if LOCAL_BARRIER_SYNCH
    MPI_Barrier(shm_comm); // needed to avoid writing IN while other ranks are reading it
#else
    for (int i=0; i<num_local_nbrs; i++) {
      MPI_Irecv(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm, &(request[i]));
      MPI_Send(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm);
    }
    MPI_Waitall(num_local_nbrs, request, MPI_STATUSES_IGNORE);
#endif

    /* add constant to solution to force refresh of neighbor data, if any */
    for (int j=jstart_rank; j<=jend_rank; j++)
    for (int i=istart_rank; i<=iend_rank; i++) IN(i,j)+= 1.0;

    /* LOAD/STORE FENCE */
    MPI_Win_sync(shm_win_in);

#if LOCAL_BARRIER_SYNCH
    MPI_Barrier(shm_comm); // needed to avoid reading IN while other ranks are writing it
#else
    for (int i=0; i<num_local_nbrs; i++) {
      MPI_Irecv(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm, &(request[i]));
      MPI_Send(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm);
    }
    MPI_Waitall(num_local_nbrs, request, MPI_STATUSES_IGNORE);
#endif

  } /* end of iterations                                                   */

  local_stencil_time = wtime() - local_stencil_time;
  MPI_Reduce(&local_stencil_time, &stencil_time, 1, MPI_DOUBLE, MPI_MAX, root,
             MPI_COMM_WORLD);

  /* compute L1 norm in parallel                                                */
  local_norm = (DTYPE) 0.0;
  for (int j=MAX(jstart_rank,RADIUS); j<=MIN(n-RADIUS-1,jend_rank); j++) {
    for (int i=MAX(istart_rank,RADIUS); i<=MIN(n-RADIUS-1,iend_rank); i++) {
      local_norm += (DTYPE)ABS(OUT(i,j));
    }
  }

  MPI_Reduce(&local_norm, &norm, 1, MPI_DTYPE, MPI_SUM, root, MPI_COMM_WORLD);

  /*******************************************************************************
  ** Analyze and output results.
  ********************************************************************************/

/* verify correctness                                                            */
  if (my_ID == root) {
    norm /= f_active_points;
    if (RADIUS > 0) {
      reference_norm = (DTYPE) (iterations+1) * (COEFX + COEFY);
    }
    else {
      reference_norm = (DTYPE) 0.0;
    }
    if (ABS(norm-reference_norm) > EPSILON) {
      printf("ERROR: L1 norm = "FSTR", Reference L1 norm = "FSTR"\n",
             norm, reference_norm);
      error = 1;
    }
    else {
      printf("Solution validates\n");
#if VERBOSE
      printf("Reference L1 norm = "FSTR", L1 norm = "FSTR"\n",
             reference_norm, norm);
#endif
    }
  }
  bail_out(error);

  MPI_Win_unlock_all(shm_win_in);
  MPI_Win_unlock_all(shm_win_out);
  MPI_Win_free(&shm_win_in);
  MPI_Win_free(&shm_win_out);

  if (my_ID == root) {
    /* flops/stencil: 2 flops (fma) for each point in the stencil,
       plus one flop for the update of the input of the array        */
    flops = (DTYPE) (2*stencil_size+1) * f_active_points;
    avgtime = stencil_time/iterations;
    printf("Rate (MFlops/s): "FSTR"  Avg time (s): %lf\n",
           1.0E-06 * flops/avgtime, avgtime);
  }

  MPI_Finalize();
  exit(EXIT_SUCCESS);
}
Ejemplo n.º 15
0
int main(int argc, char ** argv) {

  int    Num_procs;       /* number of ranks                                     */
  int    Num_procsx, Num_procsy; /* number of ranks in each coord direction      */
  int    my_ID;           /* MPI rank                                            */
  int    my_IDx, my_IDy;  /* coordinates of rank in rank grid                    */
  int    right_nbr;       /* global rank of right neighboring tile               */
  int    left_nbr;        /* global rank of left neighboring tile                */
  int    top_nbr;         /* global rank of top neighboring tile                 */
  int    bottom_nbr;      /* global rank of bottom neighboring tile              */
  DTYPE *top_buf_out;     /* communication buffer                                */
  DTYPE *top_buf_in;      /*       "         "                                   */
  DTYPE *bottom_buf_out;  /*       "         "                                   */
  DTYPE *bottom_buf_in;   /*       "         "                                   */
  DTYPE *right_buf_out;   /*       "         "                                   */
  DTYPE *right_buf_in;    /*       "         "                                   */
  DTYPE *left_buf_out;    /*       "         "                                   */
  DTYPE *left_buf_in;     /*       "         "                                   */
  int    root = 0;
  int    n, width, height;/* linear global and local grid dimension              */
  long   nsquare;         /* total number of grid points                         */
  int    i, j, ii, jj, kk, it, jt, iter, leftover;  /* dummies                   */
  int    istart, iend;    /* bounds of grid tile assigned to calling rank        */
  int    jstart, jend;    /* bounds of grid tile assigned to calling rank        */
  DTYPE  norm,            /* L1 norm of solution                                 */
         local_norm,      /* contribution of calling rank to L1 norm             */
         reference_norm;
  DTYPE  f_active_points; /* interior of grid with respect to stencil            */
  DTYPE  flops;           /* floating point ops per iteration                    */
  int    iterations;      /* number of times to run the algorithm                */
  double local_stencil_time,/* timing parameters                                 */
         stencil_time,
         avgtime;
  int    stencil_size;    /* number of points in stencil                         */
  int    nthread_input,   /* thread parameters                                   */
         nthread;
  DTYPE  * RESTRICT in;   /* input grid values                                   */
  DTYPE  * RESTRICT out;  /* output grid values                                  */
  long   total_length_in; /* total required length to store input array          */
  long   total_length_out;/* total required length to store output array         */
  int    error=0;         /* error flag                                          */
  DTYPE  weight[2*RADIUS+1][2*RADIUS+1]; /* weights of points in the stencil     */
  MPI_Request request[8];

  /*******************************************************************************
  ** Initialize the MPI environment
  ********************************************************************************/
  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_ID);
  MPI_Comm_size(MPI_COMM_WORLD, &Num_procs);

  /*******************************************************************************
  ** process, test, and broadcast input parameters
  ********************************************************************************/

  if (my_ID == root) {
    printf("Parallel Research Kernels version %s\n", PRKVERSION);
    printf("MPI+OPENMP stencil execution on 2D grid\n");

#ifndef STAR
      printf("ERROR: Compact stencil not supported\n");
      error = 1;
      goto ENDOFTESTS;
#endif

    if (argc != 4){
      printf("Usage: %s <#threads><#iterations> <array dimension> \n",
             *argv);
      error = 1;
      goto ENDOFTESTS;
    }

    /* Take number of threads to request from command line */
    nthread_input = atoi(*++argv);
    if ((nthread_input < 1) || (nthread_input > MAX_THREADS)) {
      printf("ERROR: Invalid number of threads: %d\n", nthread_input);
      error = 1;
      goto ENDOFTESTS;
    }

    iterations  = atoi(*++argv);
    if (iterations < 1){
      printf("ERROR: iterations must be >= 1 : %d \n",iterations);
      error = 1;
      goto ENDOFTESTS;
    }

    n       = atoi(*++argv);
    nsquare = (long) n * (long) n;
    if (nsquare < Num_procs){
      printf("ERROR: grid size %ld must be at least # ranks: %d\n",
	     nsquare, Num_procs);
      error = 1;
      goto ENDOFTESTS;
    }

    if (RADIUS < 0) {
      printf("ERROR: Stencil radius %d should be non-negative\n", RADIUS);
      error = 1;
      goto ENDOFTESTS;
    }

    if (2*RADIUS +1 > n) {
      printf("ERROR: Stencil radius %d exceeds grid size %d\n", RADIUS, n);
      error = 1;
      goto ENDOFTESTS;
    }

    ENDOFTESTS:;
  }
  bail_out(error);

  /* determine best way to create a 2D grid of ranks (closest to square, for
     best surface/volume ratio); we do this brute force for now
  */
  for (Num_procsx=(int) (sqrt(Num_procs+1)); Num_procsx>0; Num_procsx--) {
    if (!(Num_procs%Num_procsx)) {
      Num_procsy = Num_procs/Num_procsx;
      break;
    }
  }
  my_IDx = my_ID%Num_procsx;
  my_IDy = my_ID/Num_procsx;
  /* compute neighbors; don't worry about dropping off the edges of the grid */
  right_nbr  = my_ID+1;
  left_nbr   = my_ID-1;
  top_nbr    = my_ID+Num_procsx;
  bottom_nbr = my_ID-Num_procsx;


  MPI_Bcast(&n,             1, MPI_INT, root, MPI_COMM_WORLD);
  MPI_Bcast(&iterations,    1, MPI_INT, root, MPI_COMM_WORLD);
  MPI_Bcast(&nthread_input, 1, MPI_INT, root, MPI_COMM_WORLD);

  omp_set_num_threads(nthread_input);

  if (my_ID == root) {
    printf("Number of ranks        = %d\n", Num_procs);
    printf("Number of threads      = %d\n", omp_get_max_threads());
    printf("Grid size              = %d\n", n);
    printf("Radius of stencil      = %d\n", RADIUS);
    printf("Tiles in x/y-direction = %d/%d\n", Num_procsx, Num_procsy);
    printf("Type of stencil        = star\n");
#if DOUBLE
    printf("Data type              = double precision\n");
#else
    printf("Data type              = single precision\n");
#endif
#if LOOPGEN
    printf("Script used to expand stencil loop body\n");
#else
    printf("Compact representation of stencil loop body\n");
#endif
    printf("Number of iterations   = %d\n", iterations);
  }

  /* compute amount of space required for input and solution arrays             */

  width = n/Num_procsx;
  leftover = n%Num_procsx;
  if (my_IDx<leftover) {
    istart = (width+1) * my_IDx;
    iend = istart + width;
  }
  else {
    istart = (width+1) * leftover + width * (my_IDx-leftover);
    iend = istart + width - 1;
  }

  width = iend - istart + 1;
  if (width == 0) {
    printf("ERROR: rank %d has no work to do\n", my_ID);
    error = 1;
  }
  bail_out(error);

  height = n/Num_procsy;
  leftover = n%Num_procsy;
  if (my_IDy<leftover) {
    jstart = (height+1) * my_IDy;
    jend = jstart + height;
  }
  else {
    jstart = (height+1) * leftover + height * (my_IDy-leftover);
    jend = jstart + height - 1;
  }

  height = jend - jstart + 1;
  if (height == 0) {
    printf("ERROR: rank %d has no work to do\n", my_ID);
    error = 1;
  }
  bail_out(error);

  if (width < RADIUS || height < RADIUS) {
    printf("ERROR: rank %d has work tile smaller then stencil radius\n",
           my_ID);
    error = 1;
  }
  bail_out(error);

  total_length_in = (width+2*RADIUS)*(height+2*RADIUS)*sizeof(DTYPE);
  if (total_length_in/(height+2*RADIUS) != (width+2*RADIUS)*sizeof(DTYPE)) {
    printf("ERROR: Space for %d x %d input array cannot be represented\n",
           width+2*RADIUS, height+2*RADIUS);
    error = 1;
  }
  bail_out(error);

  total_length_out = width*height*sizeof(DTYPE);

  in  = (DTYPE *) prk_malloc(total_length_in);
  out = (DTYPE *) prk_malloc(total_length_out);
  if (!in || !out) {
    printf("ERROR: rank %d could not allocate space for input/output array\n",
            my_ID);
    error = 1;
  }
  bail_out(error);

  /* fill the stencil weights to reflect a discrete divergence operator         */
  for (jj=-RADIUS; jj<=RADIUS; jj++) for (ii=-RADIUS; ii<=RADIUS; ii++)
    WEIGHT(ii,jj) = (DTYPE) 0.0;
  stencil_size = 4*RADIUS+1;
  for (ii=1; ii<=RADIUS; ii++) {
    WEIGHT(0, ii) = WEIGHT( ii,0) =  (DTYPE) (1.0/(2.0*ii*RADIUS));
    WEIGHT(0,-ii) = WEIGHT(-ii,0) = -(DTYPE) (1.0/(2.0*ii*RADIUS));
  }

  norm = (DTYPE) 0.0;
  f_active_points = (DTYPE) (n-2*RADIUS)*(DTYPE) (n-2*RADIUS);
  /* intialize the input and output arrays                                     */
  #pragma omp parallel for private (i)
  for (j=jstart; j<=jend; j++) for (i=istart; i<=iend; i++) {
    IN(i,j)  = COEFX*i+COEFY*j;
    OUT(i,j) = (DTYPE)0.0;
  }

  /* allocate communication buffers for halo values                            */
  top_buf_out = (DTYPE *) prk_malloc(4*sizeof(DTYPE)*RADIUS*width);
  if (!top_buf_out) {
    printf("ERROR: Rank %d could not allocated comm buffers for y-direction\n", my_ID);
    error = 1;
  }
  bail_out(error);
  top_buf_in     = top_buf_out +   RADIUS*width;
  bottom_buf_out = top_buf_out + 2*RADIUS*width;
  bottom_buf_in  = top_buf_out + 3*RADIUS*width;

  right_buf_out  = (DTYPE *) prk_malloc(4*sizeof(DTYPE)*RADIUS*height);
  if (!right_buf_out) {
    printf("ERROR: Rank %d could not allocated comm buffers for x-direction\n", my_ID);
    error = 1;
  }
  bail_out(error);
  right_buf_in   = right_buf_out +   RADIUS*height;
  left_buf_out   = right_buf_out + 2*RADIUS*height;
  left_buf_in    = right_buf_out + 3*RADIUS*height;

  for (iter = 0; iter<=iterations; iter++){

    /* start timer after a warmup iteration */
    if (iter == 1) {
      MPI_Barrier(MPI_COMM_WORLD);
      local_stencil_time = wtime();
    }

    /* need to fetch ghost point data from neighbors in y-direction                 */
    if (my_IDy < Num_procsy-1) {
      MPI_Irecv(top_buf_in, RADIUS*width, MPI_DTYPE, top_nbr, 101,
                MPI_COMM_WORLD, &(request[1]));
      for (kk=0,j=jend-RADIUS+1; j<=jend; j++) for (i=istart; i<=iend; i++) {
          top_buf_out[kk++]= IN(i,j);
      }
      MPI_Isend(top_buf_out, RADIUS*width,MPI_DTYPE, top_nbr, 99,
                MPI_COMM_WORLD, &(request[0]));
    }
    if (my_IDy > 0) {
      MPI_Irecv(bottom_buf_in,RADIUS*width, MPI_DTYPE, bottom_nbr, 99,
                MPI_COMM_WORLD, &(request[3]));
      for (kk=0,j=jstart; j<=jstart+RADIUS-1; j++) for (i=istart; i<=iend; i++) {
          bottom_buf_out[kk++]= IN(i,j);
      }
      MPI_Isend(bottom_buf_out, RADIUS*width,MPI_DTYPE, bottom_nbr, 101,
                MPI_COMM_WORLD, &(request[2]));
    }
    if (my_IDy < Num_procsy-1) {
      MPI_Wait(&(request[0]), MPI_STATUS_IGNORE);
      MPI_Wait(&(request[1]), MPI_STATUS_IGNORE);
      for (kk=0,j=jend+1; j<=jend+RADIUS; j++) for (i=istart; i<=iend; i++) {
          IN(i,j) = top_buf_in[kk++];
      }
    }
    if (my_IDy > 0) {
      MPI_Wait(&(request[2]), MPI_STATUS_IGNORE);
      MPI_Wait(&(request[3]), MPI_STATUS_IGNORE);
      for (kk=0,j=jstart-RADIUS; j<=jstart-1; j++) for (i=istart; i<=iend; i++) {
          IN(i,j) = bottom_buf_in[kk++];
      }
    }

    /* need to fetch ghost point data from neighbors in x-direction                 */
    if (my_IDx < Num_procsx-1) {
      MPI_Irecv(right_buf_in, RADIUS*height, MPI_DTYPE, right_nbr, 1010,
                MPI_COMM_WORLD, &(request[1+4]));
      for (kk=0,j=jstart; j<=jend; j++) for (i=iend-RADIUS+1; i<=iend; i++) {
          right_buf_out[kk++]= IN(i,j);
      }
      MPI_Isend(right_buf_out, RADIUS*height, MPI_DTYPE, right_nbr, 990,
              MPI_COMM_WORLD, &(request[0+4]));
    }
    if (my_IDx > 0) {
      MPI_Irecv(left_buf_in, RADIUS*height, MPI_DTYPE, left_nbr, 990,
                MPI_COMM_WORLD, &(request[3+4]));
      for (kk=0,j=jstart; j<=jend; j++) for (i=istart; i<=istart+RADIUS-1; i++) {
          left_buf_out[kk++]= IN(i,j);
      }
      MPI_Isend(left_buf_out, RADIUS*height, MPI_DTYPE, left_nbr, 1010,
                MPI_COMM_WORLD, &(request[2+4]));
    }
    if (my_IDx < Num_procsx-1) {
      MPI_Wait(&(request[0+4]), MPI_STATUS_IGNORE);
      MPI_Wait(&(request[1+4]), MPI_STATUS_IGNORE);
      for (kk=0,j=jstart; j<=jend; j++) for (i=iend+1; i<=iend+RADIUS; i++) {
          IN(i,j) = right_buf_in[kk++];
      }
    }
    if (my_IDx > 0) {
      MPI_Wait(&(request[2+4]), MPI_STATUS_IGNORE);
      MPI_Wait(&(request[3+4]), MPI_STATUS_IGNORE);
      for (kk=0,j=jstart; j<=jend; j++) for (i=istart-RADIUS; i<=istart-1; i++) {
          IN(i,j) = left_buf_in[kk++];
      }
    }

    /* Apply the stencil operator */
    #pragma omp parallel for  private (i, j, ii, jj)
    for (j=MAX(jstart,RADIUS); j<=MIN(n-RADIUS-1,jend); j++) {
      for (i=MAX(istart,RADIUS); i<=MIN(n-RADIUS-1,iend); i++) {
        #if LOOPGEN
          #include "loop_body_star.incl"
        #else
          for (jj=-RADIUS; jj<=RADIUS; jj++) OUT(i,j) += WEIGHT(0,jj)*IN(i,j+jj);
          for (ii=-RADIUS; ii<0; ii++)       OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j);
          for (ii=1; ii<=RADIUS; ii++)       OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j);
        #endif
      }
    }

    #pragma omp parallel for private (i)
    /* add constant to solution to force refresh of neighbor data, if any */
    for (j=jstart; j<=jend; j++) for (i=istart; i<=iend; i++) IN(i,j)+= 1.0;

  }

  local_stencil_time = wtime() - local_stencil_time;
  MPI_Reduce(&local_stencil_time, &stencil_time, 1, MPI_DOUBLE, MPI_MAX, root,
             MPI_COMM_WORLD);

  /* compute L1 norm in parallel                                                */
  local_norm = (DTYPE) 0.0;
#pragma omp parallel for reduction(+:local_norm) private (i)
  for (j=MAX(jstart,RADIUS); j<=MIN(n-RADIUS-1,jend); j++) {
    for (i=MAX(istart,RADIUS); i<=MIN(n-RADIUS-1,iend); i++) {
      local_norm += (DTYPE)ABS(OUT(i,j));
    }
  }

  MPI_Reduce(&local_norm, &norm, 1, MPI_DTYPE, MPI_SUM, root, MPI_COMM_WORLD);

  /*******************************************************************************
  ** Analyze and output results.
  ********************************************************************************/

/* verify correctness                                                            */
  if (my_ID == root) {
    norm /= f_active_points;
    if (RADIUS > 0) {
      reference_norm = (DTYPE) (iterations+1) * (COEFX + COEFY);
    }
    else {
      reference_norm = (DTYPE) 0.0;
    }
    if (ABS(norm-reference_norm) > EPSILON) {
      printf("ERROR: L1 norm = "FSTR", Reference L1 norm = "FSTR"\n",
             norm, reference_norm);
      error = 1;
    }
    else {
      printf("Solution validates\n");
#if VERBOSE
      printf("Reference L1 norm = "FSTR", L1 norm = "FSTR"\n",
             reference_norm, norm);
#endif
    }
  }
  bail_out(error);

  if (my_ID == root) {
    /* flops/stencil: 2 flops (fma) for each point in the stencil,
       plus one flop for the update of the input of the array        */
    flops = (DTYPE) (2*stencil_size+1) * f_active_points;
    avgtime = stencil_time/iterations;
    printf("Rate (MFlops/s): "FSTR"  Avg time (s): %lf\n",
           1.0E-06 * flops/avgtime, avgtime);
  }

  MPI_Finalize();
  exit(EXIT_SUCCESS);
}
Ejemplo n.º 16
0
int
wb_tree_remove(wb_tree *tree, const void *key, int del)
{
	int rv;
	wb_node *node, *temp, *out = NULL; /* ergh @ GCC unitializated warning */

	ASSERT(tree != NULL);
	ASSERT(key != NULL);

	node = tree->root;
	while (node) {
		rv = tree->key_cmp(key, node->key);
		if (rv) {
			node = rv < 0 ? node->llink : node->rlink;
			continue;
		}
		if (node->llink == NULL) {
			temp = node;
			out = node->rlink;
			if (out)
				out->parent = node->parent;
			if (del) {
				if (tree->key_del)
					tree->key_del(node->key);
				if (tree->dat_del)
					tree->dat_del(node->dat);
			}
			if (node->parent) {
				if (node->parent->llink == node)
					node->parent->llink = out;
				else
					node->parent->rlink = out;
			} else {
				tree->root = out;
			}
			FREE(node);
			out = temp;
		} else if (node->rlink == NULL) {
			temp = node;
			out = node->llink;
			if (out)
				out->parent = node->parent;
			if (del) {
				if (tree->key_del)
					tree->key_del(node->key);
				if (tree->dat_del)
					tree->dat_del(node->dat);
			}
			if (node->parent) {
				if (node->parent->llink == node)
					node->parent->llink = out;
				else
					node->parent->rlink = out;
			} else {
				tree->root = out;
			}
			FREE(node);
			out = temp;
		} else if (WEIGHT(node->llink) > WEIGHT(node->rlink)) {
			if (WEIGHT(node->llink->llink) < WEIGHT(node->llink->rlink))
				rot_left(tree, node->llink);
			out = node->llink;
			rot_right(tree, node);
			node = out->rlink;
			continue;
		} else {
			if (WEIGHT(node->rlink->rlink) < WEIGHT(node->rlink->llink))
				rot_right(tree, node->rlink);
			out = node->rlink;
			rot_left(tree, node);
			node = out->llink;
			continue;
		}

		if (--tree->count) {
			while (out) {
				out->weight--;
				out = out->parent;
			}
		}
		return 0;
	}
	return -1;
}
Ejemplo n.º 17
0
int
wb_tree_insert(wb_tree *tree, void *key, void *dat, int overwrite)
{
	int rv = 0;
	wb_node *node, *parent = NULL;
	float wbal;

	ASSERT(tree != NULL);

	node = tree->root;
	while (node) {
		rv = tree->key_cmp(key, node->key);
		if (rv < 0)
			parent = node, node = node->llink;
		else if (rv > 0)
			parent = node, node = node->rlink;
		else {
			if (overwrite == 0)
				return 1;
			if (tree->key_del)
				tree->key_del(node->key);
			if (tree->dat_del)
				tree->dat_del(node->dat);
			node->key = key;
			node->dat = dat;
			return 0;
		}
	}

	if ((node = node_new(key, dat)) == NULL)
		return -1;
	if ((node->parent = parent) == NULL) {
		ASSERT(tree->count == 0);
		tree->root = node;
		tree->count = 1;
		return 0;
	}
	if (rv < 0)
		parent->llink = node;
	else
		parent->rlink = node;

	while ((node = parent) != NULL) {
		parent = node->parent;
		node->weight++;
		wbal = WEIGHT(node->llink) / (float)node->weight;
		if (wbal < ALPHA_0) {
			wbal = WEIGHT(node->rlink->llink) / (float)node->rlink->weight;
			if (wbal < ALPHA_3) {		/* LL */
				rot_left(tree, node);
			} else {					/* RL */
				rot_right(tree, node->rlink);
				rot_left(tree, node);
			}
		} else if (wbal > ALPHA_1) {
			wbal = WEIGHT(node->llink->llink) / (float)node->llink->weight;
			if (wbal > ALPHA_2) {		/* RR */
				rot_right(tree, node);
			} else {					/* LR */
				rot_left(tree, node->llink);
				rot_right(tree, node);
			}
		}
	}
	tree->count++;
	return 0;
}
Ejemplo n.º 18
0
      /* profile 13 : 31 dBm | p00,p01,p02,p03,p04,p05,p06,p07,p08,p09,p10,p11,p12,p13,p14,p15  */
      {  /* ramp up   */ {  {   0,0,0,0,0,0,0,15,48,53,150,204,242,250,255,255 },
         /* ramp down */    { 255,217,124,67,14,0,0,0,0,0,0,0,0,0,0,0 }  }
      }, /*-------------------------------------------------------------------------------------*/
      /* profile 14 : 33 dBm | p00,p01,p02,p03,p04,p05,p06,p07,p08,p09,p10,p11,p12,p13,p14,p15  */
      {  /* ramp up   */ {  {   0,0,0,0,0,0,0,25,36,80,144,195,242,248,255,255 },
         /* ramp down */    { 255,209,145,71,0,0,0,0,0,0,0,0,0,0,0,0 }  }
      }, /*-------------------------------------------------------------------------------------*/
      /* profile 15 : 35 dBm | p00,p01,p02,p03,p04,p05,p06,p07,p08,p09,p10,p11,p12,p13,p14,p15  */
      {  /* ramp up   */ {  {   0,0,0,0,0,0,54,59,60,83,116,151,190,228,255,255 },
         /* ramp down */    { 255,230,200,171,131,78,55,43,27,18,0,0,0,0,0,0 }  }
      }, /*-------------------------------------------------------------------------------------*/
   },
   /* ARFCN WEIGHT */
   {  /* max arfcn , mid_level ,  hi_weight   ,  lo_weight   */
      {     160    ,    11     , WEIGHT(1.030), WEIGHT(1.026) },
      {     190    ,    11     , WEIGHT(1.005), WEIGHT(1.013) },
      {     220    ,    11     , WEIGHT(0.986), WEIGHT(1.013) },
      {     190    ,    11     , WEIGHT(1.005), WEIGHT(1.013) },
      {     220    ,    11     , WEIGHT(0.986), WEIGHT(1.013) },
      {     251    ,    11     , WEIGHT(0.965), WEIGHT(1.000) },
      /*------------------------------------------------------*/
      { TABLE_END }
   },
   /* Battery WEIGHT */
   {  /*      low temp,       mid temp,        hi temp */
       {  WEIGHT(1.000),  WEIGHT(1.000),  WEIGHT(1.000)  },  /* low volt */
       {  WEIGHT(1.000),  WEIGHT(1.000),  WEIGHT(1.000)  },  /* mid volt */
       {  WEIGHT(1.000),  WEIGHT(1.000),  WEIGHT(1.000)  },  /*  hi volt */
   },
};
Ejemplo n.º 19
0
int main(int argc, char ** argv) {

  int    n;               /* linear grid dimension */
  int    i, j, ii, jj, it, jt, iter;  /* dummies */
  double norm,            /* L1 norm of solution */
         reference_norm;
  double f_active_points; /* interior of grid with respect to stencil */
  DTYPE  flops;           /* floating point ops per iteration */
  int    iterations;      /* number of times to run the algorithm */
  double stencil_time,    /* timing parameters */
         avgtime, max_time;
  int    stencil_size;    /* number of points in stencil */
  DTYPE  weight[2*RADIUS+1][2*RADIUS+1]; /* weights of points in the stencil */
  int    istart;    /* bounds of grid tile assigned to calling rank        */
  int    jstart;    /* bounds of grid tile assigned to calling rank        */
  int    Num_procsx, Num_procsy;

  /*******************************************************************************
  ** process and test input parameters
  ********************************************************************************/
  if(MYTHREAD == 0){
    printf("Parallel Research Kernels version %s\n", PRKVERSION);
    printf("UPC stencil execution on 2D grid\n");
    fflush(stdout);
  }

  if (argc != 4 && argc != 3)
    if(MYTHREAD == 0)
      bail_out("Usage: %s <# iterations> <array dimension> [x_tiles]\n", *argv);

  iterations  = atoi(*++argv);
  if (iterations < 1)
    if(MYTHREAD == 0)
      bail_out("iterations must be >= 1 : %d", iterations);

  n  = atoi(*++argv);

  if (n < 1)
    if(MYTHREAD == 0)
      bail_out("grid dimension must be positive: %d", n);

  if (argc == 4)
    Num_procsx  = atoi(*++argv);
  else
    Num_procsx = 0;

  if(Num_procsx < 0)
    if(MYTHREAD == 0)
      bail_out("Number of tiles in the x-direction should be positive (got: %d)", Num_procsx);

  if(Num_procsx > THREADS)
    if(MYTHREAD == 0)
      bail_out("Number of tiles in the x-direction should be < THREADS (got: %d)", Num_procsx);

  /* Num_procsx=0 refers to automated calculation of division on each coordinates like MPI code */
  if(Num_procsx == 0){
    for (Num_procsx=(int) (sqrt(THREADS+1)); Num_procsx>0; Num_procsx--) {
      if (!(THREADS%Num_procsx)) {
        Num_procsy = THREADS/Num_procsx;
        break;
      }
    }
  }
  else {
    Num_procsy = THREADS / Num_procsx;
  }

  if(RADIUS < 1)
    if(MYTHREAD == 0)
      bail_out("Stencil radius %d should be positive", RADIUS);

  if(2*RADIUS +1 > n)
    if(MYTHREAD == 0)
      bail_out("Stencil radius %d exceeds grid size %d", RADIUS, n);

  if(Num_procsx * Num_procsy != THREADS){
    bail_out("Num_procsx * Num_procsy != THREADS");
  }

  /* compute amount of space required for input and solution arrays             */

  int my_IDx = MYTHREAD % Num_procsx;
  int my_IDy = MYTHREAD / Num_procsx;

  int blockx = n / Num_procsx;
  int leftover = n % Num_procsx;
  if (my_IDx < leftover) {
    istart = (blockx + 1) * my_IDx;
    blockx += 1;
  }
  else {
    istart = (blockx+1) * leftover + blockx * (my_IDx-leftover);
  }

  if (blockx == 0)
    bail_out("No work to do on x-direction!");

  int blocky = n / Num_procsy;
  leftover = n % Num_procsy;
  if (my_IDy < leftover) {
    jstart = (blocky+1) * my_IDy;
    blocky += 1;
  }
  else {
    jstart = (blocky+1) * leftover + blocky * (my_IDy-leftover);
  }

  if (blocky == 0)
    bail_out("No work to do on y-direction!");

  if(blockx < RADIUS || blocky < RADIUS) {
    bail_out("blockx < RADIUS || blocky < RADIUS");
  }

  int myoffsetx = istart - RADIUS;
  int myoffsety = jstart - RADIUS;
  thread_offsetx[MYTHREAD] = myoffsetx;
  thread_offsety[MYTHREAD] = myoffsety;

  int sizex = blockx + 2*RADIUS;
  int sizey = blocky + 2*RADIUS;
  thread_sizex[MYTHREAD] = sizex;
  thread_sizey[MYTHREAD] = sizey;

  upc_barrier;

  local_shared_block_ptrs in_array  = shared_2d_array_alloc(sizex, sizey, myoffsetx, myoffsety);
  local_shared_block_ptrs out_array = shared_2d_array_alloc(sizex, sizey, myoffsetx, myoffsety);

  in_arrays[MYTHREAD] = in_array;
  out_arrays[MYTHREAD] = out_array;

  DTYPE **in_array_private = shared_2d_array_to_private(in_array, sizex, sizey, myoffsetx, myoffsety);
  DTYPE **out_array_private = shared_2d_array_to_private(out_array, sizex, sizey, myoffsetx, myoffsety);

  upc_barrier;

  private_in_arrays = prk_malloc(sizeof(private_shared_block_ptrs) * THREADS);
  if(private_in_arrays == NULL)
    bail_out("Cannot allocate private_in_arrays");

  private_out_arrays = prk_malloc(sizeof(private_shared_block_ptrs) * THREADS);
  if(private_out_arrays == NULL)
    bail_out("Cannot allocate private_out_arrays");

  for(int thread=0; thread<THREADS; thread++){
    private_in_arrays[thread] = partially_privatize(in_arrays[thread], thread);
    private_out_arrays[thread] = partially_privatize(out_arrays[thread], thread);
  }

  /* intialize the input and output arrays */
  for(int y=myoffsety; y<myoffsety + sizey; y++){
    for(int x=myoffsetx; x<myoffsetx + sizex; x++){
      in_array_private[y][x] = COEFX*x + COEFY*y;
      out_array[y][x] = 0.;
    }
  }
  upc_barrier;

  for(int y=myoffsety; y<myoffsety + sizey; y++){
    for(int x=myoffsetx; x<myoffsetx + sizex; x++){
      if(in_array_private[y][x] != COEFX*x + COEFY*y)
        bail_out("x=%d y=%d in_array=%f != %f", x, y, in_array[y][x], COEFX*x + COEFY*y);
    }
  }

  /* fill the stencil weights to reflect a discrete divergence operator */
  for (jj=-RADIUS; jj<=RADIUS; jj++)
    for (ii=-RADIUS; ii<=RADIUS; ii++)
      WEIGHT(ii, jj) = (DTYPE)0.0;

  stencil_size = 4*RADIUS+1;
  for (ii=1; ii<=RADIUS; ii++) {
    WEIGHT(0, ii) = WEIGHT( ii,0) =  (DTYPE) (1.0/(2.0*ii*RADIUS));
    WEIGHT(0,-ii) = WEIGHT(-ii,0) = -(DTYPE) (1.0/(2.0*ii*RADIUS));
  }

  if(MYTHREAD == 0){
    printf("Number of threads      = %d\n", THREADS);
    printf("Grid size              = %d\n", n);
    printf("Radius of stencil      = %d\n", RADIUS);
    printf("Tiles in x/y-direction = %d/%d\n", Num_procsx, Num_procsy);
#if DOUBLE
    printf("Data type              = double precision\n");
#else
    printf("Data type              = single precision\n");
#endif
#if LOOPGEN
    printf("Script used to expand stencil loop body\n");
#else
    printf("Compact representation of stencil loop body\n");
#endif
    printf("Number of iterations   = %d\n", iterations);
  }

  upc_barrier;

  int startx = myoffsetx + RADIUS;
  int endx = myoffsetx + sizex - RADIUS;

  int starty = myoffsety + RADIUS;
  int endy = myoffsety + sizey - RADIUS;

  if(my_IDx == 0)
    startx += RADIUS;

  if(my_IDx == Num_procsx - 1)
    endx -= RADIUS;

  if(my_IDy == 0)
    starty += RADIUS;

  if(my_IDy == Num_procsy - 1)
    endy -= RADIUS;

  upc_barrier;

  for (iter = 0; iter<=iterations; iter++){
    /* start timer after a warmup iteration */
    if (iter == 1) {
      upc_barrier;
      stencil_time = wtime();
    }

    /* Get ghost zones */
    /* NORTH */
    if(my_IDy != 0){
      int peer = (my_IDy - 1) * Num_procsx + my_IDx;
      for (int y=starty - RADIUS; y<starty; y++) {
        int transfer_size = (endx - startx) * sizeof(DTYPE);
        upc_memget(&in_array_private[y][startx], &private_in_arrays[peer][y][startx], transfer_size);
      }
    }
    /* SOUTH */
    if(my_IDy != Num_procsy - 1){
      int peer = (my_IDy + 1) * Num_procsx + my_IDx;
      for (int y=endy; y<endy + RADIUS; y++) {
        int transfer_size = (endx - startx) * sizeof(DTYPE);
        upc_memget(&in_array_private[y][startx], &private_in_arrays[peer][y][startx], transfer_size);
      }
    }
    /* LEFT */
    if(my_IDx != 0){
      int peer = my_IDy * Num_procsx + my_IDx - 1;
      for (int y=starty; y<endy; y++) {
        for (int x=startx - RADIUS; x<startx; x++) {
          in_array_private[y][x] = private_in_arrays[peer][y][x];
        }
      }
    }
    /* RIGHT*/
    if(my_IDx != Num_procsx - 1){
      int peer = my_IDy * Num_procsx + my_IDx + 1;
      for (int y=starty; y<endy; y++) {
        for (int x=endx; x<endx + RADIUS; x++) {
          in_array_private[y][x] = private_in_arrays[peer][y][x];
        }
      }
    }

    /* Apply the stencil operator */
    for (j=starty; j<endy; j++) {
      for (i=startx; i<endx; i++) {
        #if LOOPGEN
          #include "loop_body_star.incl"
        #else
          for (jj=-RADIUS; jj<=RADIUS; jj++) OUT(i,j) += WEIGHT(0,jj)*IN(i,j+jj);
          for (ii=-RADIUS; ii<0; ii++)       OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j);
          for (ii=1; ii<=RADIUS; ii++)       OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j);
        #endif
      }
    }

    upc_barrier; /* <- Necessary barrier: some slow threads could use future data */

    /* add constant to solution to force refresh of neighbor data, if any */
    for(int y=myoffsety + RADIUS; y<myoffsety + sizey - RADIUS; y++)
      for(int x=myoffsetx + RADIUS; x<myoffsetx + sizex - RADIUS; x++)
        in_array_private[y][x] += 1.0;

    upc_barrier; /* <- Necessary barrier: some threads could start on old data */
  } /* end of iterations */

  stencil_time = wtime() - stencil_time;
  times[MYTHREAD] = stencil_time;

  upc_barrier;

  // Compute max_time
  if(MYTHREAD == 0){
    max_time = times[MYTHREAD];
    for(i=1; i<THREADS; i++){
      if(max_time < times[i])
        max_time = times[i];
    }
  }

  norm = (double) 0.0;
  f_active_points = (double)(n-2*RADIUS) * (double)(n-2*RADIUS);

  /* compute L1 norm in parallel */
  for (int y=starty; y<endy; y++) {
    for (int x=startx; x<endx; x++) {
      norm += (double)ABS(out_array[y][x]);
    }
  }

  norm /= f_active_points;
  norms[MYTHREAD] = norm;

  upc_barrier;

  if(MYTHREAD == 0){
    norm = 0.;
    for(int i=0; i<THREADS; i++) norm += norms[i];

    /*******************************************************************************
    ** Analyze and output results.
    ********************************************************************************/

    /* verify correctness */
    reference_norm = (double) (iterations+1) * (COEFX + COEFY);

    if (ABS(norm - reference_norm) > EPSILON)
      bail_out("L1 norm = "FSTR", Reference L1 norm = "FSTR"\n", norm, reference_norm);
    else {
      printf("Solution validates\n");
#if VERBOSE
      printf("Reference L1 norm = "FSTR", L1 norm = "FSTR"\n",
             reference_norm, norm);
#endif
    }

    flops = (DTYPE) (2*stencil_size+1) * f_active_points;
    avgtime = max_time/iterations;
    printf("Rate (MFlops/s): "FSTR"  Avg time (s): %lf\n",
           1.0E-06 * flops/avgtime, avgtime);

    exit(EXIT_SUCCESS);
  }
}
Ejemplo n.º 20
0
int main(int argc, char ** argv) {

  long   n;                 /* linear grid dimension                               */
  int    i, j, ii, jj, iter;/* dummies                                             */
  double norm = 0.0,        /* L1 norm of solution                                 */
         reference_norm;
  double f_active_points; /* interior of grid with respect to stencil            */
  double flops;           /* floating point ops per iteration                    */
  int    iterations=25;   /* number of times to run the algorithm                */
  double stencil_time,    /* timing parameters                                   */
         avgtime;
  int    stencil_size;    /* number of points in stencil                         */
  double  * RESTRICT in;  /* input grid values                                   */
  double  * RESTRICT out; /* output grid values                                  */
  long   total_length;    /* total required length to store grid values          */
  double weight[2*RADIUS+1][2*RADIUS+1]; /* weights of points in the stencil     */

   if(argc ==2){
      n = atoi(argv[1]);
   }
   else{
      n = DEF_SIZE;
   }

  if (2*RADIUS +1 > n) {
    printf("ERROR: Stencil radius %d exceeds grid size %d\n", RADIUS, n);
    exit(EXIT_FAIL);
  }

  /*  allocate the required space                                               */
  total_length = n*n*sizeof(double);
  in  = (double *) malloc(total_length);
  out = (double *) malloc(total_length);
  if (!in || !out) {
    printf("ERROR: could not allocate space for input or output array\n");
    exit(EXIT_FAIL);
  }

  /* fill the stencil weights to reflect a discrete divergence operator         */
  stencil_size = (2*RADIUS+1)*(2*RADIUS+1);
  for (jj=-RADIUS; jj<= RADIUS; jj++) for (ii=-RADIUS; ii<= RADIUS; ii++)
    WEIGHT(ii,jj)=0.0;
  for (jj=1; jj<=RADIUS; jj++) {
    for (ii=-jj+1; ii<jj; ii++) {
      WEIGHT(ii,jj)  =  (double) (1.0/(4.0*jj*(2.0*jj-1)*RADIUS));
      WEIGHT(ii,-jj) = -(double) (1.0/(4.0*jj*(2.0*jj-1)*RADIUS));
      WEIGHT(jj,ii)  =  (double) (1.0/(4.0*jj*(2.0*jj-1)*RADIUS));
      WEIGHT(-jj,ii) = -(double) (1.0/(4.0*jj*(2.0*jj-1)*RADIUS));      
    }
    WEIGHT(jj,jj)    =  (double) (1.0/(4.0*jj*RADIUS));
    WEIGHT(-jj,-jj)  = -(double) (1.0/(4.0*jj*RADIUS));
  }

  f_active_points = (double) (n-2*RADIUS)*(double) (n-2*RADIUS);

  printf("Serial stencil execution on 2D grid\n");
  printf("Grid size            = %d\n", n);
  printf("Radius of stencil    = %d\n", RADIUS);
  printf("Type of stencil      = compact\n");
  printf("Number of iterations = %d\n", iterations);

  /* intialize the input and output arrays                                     */
  for (j=0; j<n; j++) for (i=0; i<n; i++) 
    IN(i,j) = COEFX*i+COEFY*j;
  for (j=RADIUS; j<n-RADIUS; j++) for (i=RADIUS; i<n-RADIUS; i++) 
    OUT(i,j) = 0.0;

  for (iter = 0; iter<=iterations; iter++){

    /* start timer after a warmup iteration */
    if (iter == 1)  stencil_time = wtime();

    /* Apply the stencil operator                                               */
    for (j=RADIUS; j<n-RADIUS; j++) {
      for (i=RADIUS; i<n-RADIUS; i++) {
        /* would like to be able to unroll this loop, but compiler will ignore  */
        for (jj=-RADIUS; jj<=RADIUS; jj++) 
        for (ii=-RADIUS; ii<=RADIUS; ii++)  OUT(i,j) += WEIGHT(ii,jj)*IN(i+ii,j+jj);
      }
    }

    /* add constant to solution to force refresh of input data                  */
    for (j=0; j<n; j++) for (i=0; i<n; i++) IN(i,j)+= 1.0;

  } /* end of iterations                                                        */

  stencil_time = wtime() - stencil_time;

  /* compute L1 norm                                                            */
  for (j=RADIUS; j<n-RADIUS; j++) for (i=RADIUS; i<n-RADIUS; i++) {
    norm += (double)ABS(OUT(i,j));
  }

  norm /= f_active_points;

  /******************************************************************************
  ** Analyze and output results.
  *******************************************************************************/

/* verify correctness                                                           */
  reference_norm = (double) (iterations+1) * (COEFX + COEFY);
  if (ABS(norm-reference_norm) > EPSILON) {
    printf("ERROR: L1 norm = %lf, Reference L1 norm = %lf\n",
           norm, reference_norm);
    exit(EXIT_FAIL);
  }
  else {
    printf("Solution validates\n");
  }

  flops = (double) (2*stencil_size+1) * f_active_points;
  avgtime = stencil_time/iterations;
  printf("Serial Rate (MFlops/s): %lf  Avg time (s): %lf\n",
         1.0E-06 * flops/avgtime, avgtime);

  exit(EXIT_SUCCESS);
}
Ejemplo n.º 21
0
void 
solution_add(solution_t *sol, int *weight, int col)
{
    (void) sm_row_insert(sol->row, col);
    sol->cost += WEIGHT(weight, col);
}