/*
 * wb_tree_probe -- find `key' in `tree', inserting it when it is absent.
 *
 * On entry *dat holds the datum to store if an insertion is needed.
 *
 * Returns:
 *    0  the key was already present; *dat is overwritten with the datum
 *       stored in the tree and the tree is left unchanged.
 *    1  a new node holding (key, *dat) was inserted.
 *   -1  node_new() failed; the tree is left unchanged.
 */
int
wb_tree_probe(wb_tree *tree, void *key, void **dat)
{
	int rv = 0;
	wb_node *node, *parent = NULL;
	float wbal;

	ASSERT(tree != NULL);

	/* Walk down from the root, remembering the last node visited. */
	node = tree->root;
	while (node) {
		rv = tree->key_cmp(key, node->key);
		if (rv < 0) {
			parent = node;
			node = node->llink;
		} else if (rv > 0) {
			parent = node;
			node = node->rlink;
		} else {
			/* Found: hand the stored datum back to the caller. */
			*dat = node->dat;
			return 0;
		}
	}

	if ((node = node_new(key, *dat)) == NULL)
		return -1;

	if ((node->parent = parent) == NULL) {
		/*
		 * First node of an empty tree.  This is an insertion like any
		 * other, so report 1; returning 0 here would tell the caller
		 * that the key already existed.
		 */
		ASSERT(tree->count == 0);
		tree->root = node;
		tree->count = 1;
		return 1;
	}

	/* rv still holds the result of the last comparison against parent. */
	if (rv < 0)
		parent->llink = node;
	else
		parent->rlink = node;

	/*
	 * Walk back up to the root, bumping each ancestor's weight and
	 * rotating wherever the left-subtree share of the weight leaves the
	 * [ALPHA_0, ALPHA_1] band.  The parent pointer is captured before any
	 * rotation because a rotation re-parents `node'.
	 */
	while ((node = parent) != NULL) {
		parent = node->parent;
		node->weight++;
		wbal = WEIGHT(node->llink) / (float)node->weight;
		if (wbal < ALPHA_0) {
			/* Right side too heavy. */
			wbal = WEIGHT(node->rlink->llink) / (float)node->rlink->weight;
			if (wbal < ALPHA_3) {
				rot_left(tree, node);
			} else {
				rot_right(tree, node->rlink);
				rot_left(tree, node);
			}
		} else if (wbal > ALPHA_1) {
			/* Left side too heavy. */
			wbal = WEIGHT(node->llink->llink) / (float)node->llink->weight;
			if (wbal > ALPHA_2) {
				rot_right(tree, node);
			} else {
				rot_left(tree, node->llink);
				rot_right(tree, node);
			}
		}
	}

	tree->count++;
	return 1;
}
/*
 * MST -- grow a minimum spanning tree over the circular vertex list `graph'.
 *
 * The first vertex gets key 0 and seeds the heap; every other vertex starts
 * at PLUS_INFINITY.  Each vertex taken from the heap is marked with
 * MINUS_INFINITY, and every edge leaving it that is lighter than the key of
 * the vertex it leads to lowers that key, records the edge as the chosen
 * edge of that vertex, and inserts the vertex into the heap.
 *
 * Returns `graph'.
 */
Vertices *
MST(Vertices * graph)
{
  HeapP    * heap;
  Vertices * current;
  Edges    * link;

  InitFHeap();

  /* key(s) = 0 for the start vertex, which is the only initial heap entry. */
  current = graph;
  KEY(current) = 0;
  heap = MakeHeap();
  (void)Insert(&heap, (Item *)current);

  /* key(v) = infinity for every v != s; the list is circular. */
  for(current = NEXT_VERTEX(current); current != graph; current = NEXT_VERTEX(current))
  {
    KEY(current) = PLUS_INFINITY;
  }

  /* Repeatedly take the cheapest vertex and relax the edges leaving it. */
  for(current = FindMin(heap); current != NULL_VERTEX; current = FindMin(heap))
  {
    heap = DeleteMin(heap);
    KEY(current) = MINUS_INFINITY;

    for(link = EDGES(current); link != NULL_EDGE; link = NEXT_EDGE(link))
    {
      if(WEIGHT(link) < KEY(VERTEX(link)))
      {
        KEY(VERTEX(link)) = WEIGHT(link);
        CHOSEN_EDGE(VERTEX(link)) = link;
        (void)Insert(&heap, VERTEX(link));
      }
    }
  }

  return(graph);
}
Vertices * GenTree(int nVertex) { int i; int weight; Vertices * vertex; Vertices * graph; Edges * edge; graph = NewVertex(); NEXT_VERTEX(graph) = graph; for(i = 1; i < nVertex; i++) { vertex = NewVertex(); edge = NewEdge(); /* * The newly created vertex has one edge ... */ EDGES(vertex) = edge; /* * ... which is connected to the graph so far generated. The connection * point in the graph is picked at random. */ VERTEX(edge) = PickVertex(graph, random() % i); weight = GET_WEIGHT; WEIGHT(edge) = weight; SOURCE(edge) = vertex; /* * Link the new vertex into the graph. */ NEXT_VERTEX(vertex) = NEXT_VERTEX(graph); NEXT_VERTEX(graph) = vertex; /* * Add an edge to the vertex randomly picked as the connection point. */ edge = NewEdge(); WEIGHT(edge) = weight; SOURCE(edge) = VERTEX(EDGES(vertex)); VERTEX(edge) = vertex; NEXT_EDGE(edge) = EDGES(VERTEX(EDGES(vertex))); EDGES(VERTEX(EDGES(vertex))) = edge; } return(graph); }
void compute() { double * RESTRICT in = this->in; double * RESTRICT out = this->out; int ii, jj; for (int j=MAX(jstart,RADIUS); j<=MIN(n-1-RADIUS,jend); j++) { for (int i=MAX(istart,RADIUS); i<=MIN(n-1-RADIUS,iend); i++) { #if LOOPGEN #include "loop_body_star.incl" #else for (jj=-RADIUS; jj<=RADIUS; jj++) OUT(i,j) += WEIGHT(0,jj)*IN(i,j+jj); for (ii=-RADIUS; ii<0; ii++) OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j); for (ii=1; ii<=RADIUS; ii++) OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j); #endif } } }
float TThresholdCA::operator()(PClassifier classifier, PExampleGenerator data, const int &weightID, float &optCA, const int &targetValue, TFloatFloatList *CAs) { if (!data->domain->classVar) raiseError("classless domain"); if (data->domain->classVar != classifier->classVar) raiseError("classifier's class variables mismatches the given examples'"); TEnumVariable *classVar = data->domain->classVar.AS(TEnumVariable); if (!classVar) raiseError("discrete class expected"); int wtarget; if (targetValue >= 0) wtarget = targetValue; else if (classVar->baseValue >= 0) wtarget = classVar->baseValue; else if (classVar->values->size() == 2) wtarget = 1; else raiseError("cannot determine target class: none is given, class is not binary and its 'baseValue' is not set"); typedef map<float, float> tmfpff; tmfpff dists; float N = 0.0, corr = 0.0; PEITERATE(ei, data) if (!(*ei).getClass().isSpecial()) { float wei = WEIGHT(*ei); N += wei; if ((*ei).getClass().intV == wtarget) { corr += wei; wei = -wei; } const float prob = classifier->classDistribution(*ei)->atint(wtarget); pair<tmfpff::iterator, bool> elm = dists.insert(make_pair(prob, wei)); if (!elm.second) (*elm.first).second += wei; } optCA = 0; if (dists.size() < 2) return 0.5; float optthresh; for(tmfpff::const_iterator ni(dists.begin()), ie(dists.end()), ii(ni++); ni != ie; ii = ni++) { corr += (*ii).second; if ((corr > optCA) || ((corr == optCA) && ((*ii).first < 0.5))) { optCA = corr; optthresh = ((*ii).first + (*ni).first) / 2.0; } if (CAs) CAs->push_back(make_pair(((*ii).first + (*ni).first) / 2.0, corr/N)); } optCA /= N; return optthresh; }
void compute() { double * RESTRICT in = this->in; double * RESTRICT out = this->out; for (int j=MAX(jstart,RADIUS); j<=MIN(n-1-RADIUS,jend); j++) { for (int i=MAX(istart,RADIUS); i<=MIN(n-1-RADIUS,iend); i++) { for (int jj=-RADIUS; jj<=RADIUS; jj++) { OUT(i-istart,j-jstart) += WEIGHT(0,jj)*IN(i-istart,j-jstart+jj); } for (int ii=-RADIUS; ii<0; ii++) { OUT(i-istart,j-jstart) += WEIGHT(ii,0)*IN(i-istart+ii,j-jstart); } for (int ii=1; ii<=RADIUS; ii++) { OUT(i-istart,j-jstart) += WEIGHT(ii,0)*IN(i-istart+ii,j-jstart); } } } }
/*
 * PrintNeighbors -- print every edge in the edge list of `vertex'.
 *
 * Each edge is written to stdout as " <target id>(<weight>)[<source id>]".
 * No newline is emitted.
 */
void
PrintNeighbors(Vertices * vertex)
{
  Edges * cursor;

  for(cursor = EDGES(vertex); cursor != NULL; cursor = NEXT_EDGE(cursor))
  {
    printf(" %d(%d)[%d]", ID(VERTEX(cursor)), WEIGHT(cursor), ID(SOURCE(cursor)));
  }
}
/*
 * Connect -- join vertex1 and vertex2 with a pair of directed edges.
 *
 * One weight is obtained from GET_WEIGHT and shared by both edges.  Each new
 * edge is pushed onto the front of its source vertex's edge list.
 */
void
Connect(Vertices * vertex1, Vertices * vertex2)
{
  int     cost;
  Edges * link;

  cost = GET_WEIGHT;

  /* vertex1 -> vertex2 */
  link = NewEdge();
  WEIGHT(link) = cost;
  SOURCE(link) = vertex1;
  VERTEX(link) = vertex2;
  NEXT_EDGE(link) = EDGES(vertex1);
  EDGES(vertex1) = link;

  /* vertex2 -> vertex1 */
  link = NewEdge();
  WEIGHT(link) = cost;
  SOURCE(link) = vertex2;
  VERTEX(link) = vertex1;
  NEXT_EDGE(link) = EDGES(vertex2);
  EDGES(vertex2) = link;
}
/*
 * NewEdge -- allocate an edge with zero weight, no target vertex and no
 * successor in the edge list.
 *
 * Never returns NULL: if malloc fails a message is written to stderr and the
 * process exits with status 1.
 */
Edges *
NewEdge()
{
  Edges * fresh = (Edges *)malloc(sizeof(Edges));

  if(!fresh)
  {
    fprintf(stderr, "Could not malloc\n");
    exit(1);
  }

  WEIGHT(fresh) = 0;
  VERTEX(fresh) = NULL;
  NEXT_EDGE(fresh) = NULL;

  return fresh;
}
/*
 * sm_minimum_cover -- entry point of the covering computation on sparse
 * matrix A.
 *
 *   A            matrix to cover; a matrix with no rows yields a freshly
 *                allocated empty row straight away
 *   weight       per-column weights, read through the WEIGHT macro
 *                (how a NULL pointer is treated is not visible here -- TODO
 *                confirm against the macro definition)
 *   heuristic    non-zero sets stats.no_branching
 *   debug_level  values > 0 switch statistics output on; also stored as the
 *                maximum depth at which to print
 */
sm_row *
sm_minimum_cover(sm_matrix *A, int *weight, int heuristic, int debug_level)
/* heuristic: set to 1 for a heuristic covering */
/* debug_level: how deep in the recursion to provide info */
{
    stats_t stats;
    solution_t *best, *select;
    sm_row *prow, *sol;
    sm_col *pcol;
    sm_matrix *dup_A;
    int nelem, bound;
    double sparsity;

    /* Avoid silliness: nothing to cover when there are no rows */
    if (A->nrows <= 0) {
	return sm_row_alloc();		/* easy to cover */
    }

    /* Initialize debugging structure */
    stats.start_time = util_cpu_time();
    stats.debug = debug_level > 0;
    stats.max_print_depth = debug_level;
    stats.max_depth = -1;
    stats.nodes = 0;
    stats.component = stats.comp_count = 0;
    stats.gimpel = stats.gimpel_count = 0;
    stats.no_branching = heuristic != 0;
    stats.lower_bound = -1;

    /* Check the matrix sparsity: fraction of the nrows x ncols positions
       that hold an element */
    nelem = 0;
    sm_foreach_row(A, prow) {
	nelem += prow->length;
    }
    /* NOTE(review): the product nrows * ncols is formed before the
       conversion to double; if both fields are plain ints it could overflow
       for very large matrices -- confirm the field types */
    sparsity = (double) nelem / (double) (A->nrows * A->ncols);

    /* Determine an upper bound on the solution: one more than the sum of
       the weights of all columns */
    bound = 1;
    sm_foreach_col(A, pcol) {
	bound += WEIGHT(weight, pcol->col_num);
    }
void survivals(TTimes ×, float &sow, PExampleGenerator gen, const int &outcomeIndex, TValue &failValue, const int &timeIndex, const int &weightID) { const bool outcomemeta = outcomeIndex<0; const bool timemeta = timeIndex<0; if (!timemeta && (gen->domain->getVar(timeIndex)->varType != TValue::FLOATVAR)) raiseError("continuous attribute expected for censoring time"); if (!outcomemeta && (gen->domain->getVar(outcomeIndex)->varType != TValue::INTVAR)) raiseError("discrete attribute expected for outcome"); if (failValue.isSpecial() || (failValue.varType!=TValue::INTVAR)) raiseError("discrete value needs to be specified for the 'failure'"); const int &failIndex = failValue.intV; sow = 0.0; PEITERATE(ei, gen) { float wei = WEIGHT(*ei); TValue &timeval = (*ei)[timeIndex]; if (timeval.isSpecial()) continue; if (timemeta && timeval.varType != TValue::FLOATVAR) raiseError("continuous attribute expected for censoring time"); TValue &outcomeval = (*ei)[outcomeIndex]; if (outcomeval.isSpecial()) continue; if (outcomemeta && outcomeval.varType != TValue::INTVAR) raiseError("discrete attribute expected for outcome"); if (outcomeval.intV==failIndex) times[timeval.floatV].failed += wei; else times[timeval.floatV].censored += wei; sow += wei; }
int main(int argc, char ** argv) { int Num_procs; /* number of ranks */ int Num_procsx, Num_procsy; /* number of ranks in each coord direction */ int my_ID; /* SHMEM rank */ int my_IDx, my_IDy; /* coordinates of rank in rank grid */ int right_nbr; /* global rank of right neighboring tile */ int left_nbr; /* global rank of left neighboring tile */ int top_nbr; /* global rank of top neighboring tile */ int bottom_nbr; /* global rank of bottom neighboring tile */ DTYPE *top_buf_out; /* communication buffer */ DTYPE *top_buf_in[2]; /* " " */ DTYPE *bottom_buf_out; /* " " */ DTYPE *bottom_buf_in[2];/* " " */ DTYPE *right_buf_out; /* " " */ DTYPE *right_buf_in[2]; /* " " */ DTYPE *left_buf_out; /* " " */ DTYPE *left_buf_in[2]; /* " " */ int root = 0; int n, width, height;/* linear global and local grid dimension */ int i, j, ii, jj, kk, it, jt, iter, leftover; /* dummies */ int istart, iend; /* bounds of grid tile assigned to calling rank */ int jstart, jend; /* bounds of grid tile assigned to calling rank */ DTYPE reference_norm; DTYPE f_active_points; /* interior of grid with respect to stencil */ int stencil_size; /* number of points in the stencil */ DTYPE flops; /* floating point ops per iteration */ int iterations; /* number of times to run the algorithm */ double avgtime, /* timing parameters */ *local_stencil_time, *stencil_time; DTYPE * RESTRICT in; /* input grid values */ DTYPE * RESTRICT out; /* output grid values */ long total_length_in; /* total required length to store input array */ long total_length_out;/* total required length to store output array */ int error=0; /* error flag */ DTYPE weight[2*RADIUS+1][2*RADIUS+1]; /* weights of points in the stencil */ int *arguments; /* command line parameters */ int count_case=4; /* number of neighbors of a rank */ long *pSync_bcast; /* work space for collectives */ long *pSync_reduce; /* work space for collectives */ double *pWrk_time; /* work space for collectives */ DTYPE *pWrk_norm; /* work space for 
collectives */ int *iterflag; /* synchronization flags */ int sw; /* double buffering switch */ DTYPE *local_norm, *norm; /* local and global error norms */ /******************************************************************************* ** Initialize the SHMEM environment ********************************************************************************/ prk_shmem_init(); my_ID=prk_shmem_my_pe(); Num_procs=prk_shmem_n_pes(); pSync_bcast = (long *) prk_shmem_malloc(PRK_SHMEM_BCAST_SYNC_SIZE*sizeof(long)); pSync_reduce = (long *) prk_shmem_malloc(PRK_SHMEM_REDUCE_SYNC_SIZE*sizeof(long)); pWrk_time = (double *) prk_shmem_malloc(PRK_SHMEM_REDUCE_MIN_WRKDATA_SIZE*sizeof(double)); pWrk_norm = (DTYPE *) prk_shmem_malloc(PRK_SHMEM_REDUCE_MIN_WRKDATA_SIZE*sizeof(DTYPE)); local_stencil_time = (double *) prk_shmem_malloc(sizeof(double)); stencil_time = (double *) prk_shmem_malloc(sizeof(double)); local_norm = (DTYPE *) prk_shmem_malloc(sizeof(DTYPE)); norm = (DTYPE *) prk_shmem_malloc(sizeof(DTYPE)); iterflag = (int *) prk_shmem_malloc(2*sizeof(int)); if (!(pSync_bcast && pSync_reduce && pWrk_time && pWrk_norm && iterflag && local_stencil_time && stencil_time && local_norm && norm)) { printf("Could not allocate scalar variables on rank %d\n", my_ID); error = 1; } bail_out(error); for(i=0;i<PRK_SHMEM_BCAST_SYNC_SIZE;i++) pSync_bcast[i]=PRK_SHMEM_SYNC_VALUE; for(i=0;i<PRK_SHMEM_REDUCE_SYNC_SIZE;i++) pSync_reduce[i]=PRK_SHMEM_SYNC_VALUE; arguments=(int*)prk_shmem_malloc(2*sizeof(int)); /******************************************************************************* ** process, test, and broadcast input parameters ********************************************************************************/ if (my_ID == root) { #ifndef STAR printf("ERROR: Compact stencil not supported\n"); error = 1; goto ENDOFTESTS; #endif if (argc != 3){ printf("Usage: %s <# iterations> <array dimension> \n", *argv); error = 1; goto ENDOFTESTS; } iterations = atoi(*++argv); arguments[0]=iterations; if 
(iterations < 1){ printf("ERROR: iterations must be >= 1 : %d \n",iterations); error = 1; goto ENDOFTESTS; } n = atoi(*++argv); arguments[1]=n; long nsquare = (long)n * (long)n; if (nsquare < Num_procs){ printf("ERROR: grid size must be at least # ranks: %ld\n", nsquare); error = 1; goto ENDOFTESTS; } if (RADIUS < 0) { printf("ERROR: Stencil radius %d should be non-negative\n", RADIUS); error = 1; goto ENDOFTESTS; } if (2*RADIUS +1 > n) { printf("ERROR: Stencil radius %d exceeds grid size %d\n", RADIUS, n); error = 1; goto ENDOFTESTS; } ENDOFTESTS:; } bail_out(error); /* determine best way to create a 2D grid of ranks (closest to square, for best surface/volume ratio); we do this brute force for now */ for (Num_procsx=(int) (sqrt(Num_procs+1)); Num_procsx>0; Num_procsx--) { if (!(Num_procs%Num_procsx)) { Num_procsy = Num_procs/Num_procsx; break; } } my_IDx = my_ID%Num_procsx; my_IDy = my_ID/Num_procsx; /* compute neighbors; don't worry about dropping off the edges of the grid */ right_nbr = my_ID+1; left_nbr = my_ID-1; top_nbr = my_ID+Num_procsx; bottom_nbr = my_ID-Num_procsx; iterflag[0] = iterflag[1] = 0; if(my_IDx==0) count_case--; if(my_IDx==Num_procsx-1) count_case--; if(my_IDy==0) count_case--; if(my_IDy==Num_procsy-1) count_case--; if (my_ID == root) { printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("SHMEM stencil execution on 2D grid\n"); printf("Number of ranks = %d\n", Num_procs); printf("Grid size = %d\n", n); printf("Radius of stencil = %d\n", RADIUS); printf("Tiles in x/y-direction = %d/%d\n", Num_procsx, Num_procsy); printf("Type of stencil = star\n"); #ifdef DOUBLE printf("Data type = double precision\n"); #else printf("Data type = single precision\n"); #endif #if LOOPGEN printf("Script used to expand stencil loop body\n"); #else printf("Compact representation of stencil loop body\n"); #endif #if SPLITFENCE printf("Split fence = ON\n"); #else printf("Split fence = OFF\n"); #endif printf("Number of iterations = %d\n", iterations); 
} shmem_barrier_all(); shmem_broadcast32(&arguments[0], &arguments[0], 2, root, 0, 0, Num_procs, pSync_bcast); iterations=arguments[0]; n=arguments[1]; shmem_barrier_all(); prk_shmem_free(arguments); /* compute amount of space required for input and solution arrays */ width = n/Num_procsx; leftover = n%Num_procsx; if (my_IDx<leftover) { istart = (width+1) * my_IDx; iend = istart + width + 1; } else { istart = (width+1) * leftover + width * (my_IDx-leftover); iend = istart + width; } width = iend - istart + 1; if (width == 0) { printf("ERROR: rank %d has no work to do\n", my_ID); error = 1; } bail_out(error); height = n/Num_procsy; leftover = n%Num_procsy; if (my_IDy<leftover) { jstart = (height+1) * my_IDy; jend = jstart + height + 1; } else { jstart = (height+1) * leftover + height * (my_IDy-leftover); jend = jstart + height; } height = jend - jstart + 1; if (height == 0) { printf("ERROR: rank %d has no work to do\n", my_ID); error = 1; } bail_out(error); if (width < RADIUS || height < RADIUS) { printf("ERROR: rank %d has work tile smaller then stencil radius\n", my_ID); error = 1; } bail_out(error); total_length_in = (width+2*RADIUS); total_length_in *= (height+2*RADIUS); total_length_in *= sizeof(DTYPE); total_length_out = width; total_length_out *= height; total_length_out *= sizeof(DTYPE); in = (DTYPE *) malloc(total_length_in); out = (DTYPE *) malloc(total_length_out); if (!in || !out) { printf("ERROR: rank %d could not allocate space for input/output array\n", my_ID); error = 1; } bail_out(error); /* fill the stencil weights to reflect a discrete divergence operator */ for (jj=-RADIUS; jj<=RADIUS; jj++) for (ii=-RADIUS; ii<=RADIUS; ii++) WEIGHT(ii,jj) = (DTYPE) 0.0; stencil_size = 4*RADIUS+1; for (ii=1; ii<=RADIUS; ii++) { WEIGHT(0, ii) = WEIGHT( ii,0) = (DTYPE) (1.0/(2.0*ii*RADIUS)); WEIGHT(0,-ii) = WEIGHT(-ii,0) = -(DTYPE) (1.0/(2.0*ii*RADIUS)); } norm[0] = (DTYPE) 0.0; f_active_points = (DTYPE) (n-2*RADIUS)*(DTYPE) (n-2*RADIUS); /* intialize the input and 
output arrays */ for (j=jstart; j<jend; j++) for (i=istart; i<iend; i++) { IN(i,j) = COEFX*i+COEFY*j; OUT(i,j) = (DTYPE)0.0; } /* allocate communication buffers for halo values */ top_buf_out=(DTYPE*)malloc(2*sizeof(DTYPE)*RADIUS*width); if (!top_buf_out) { printf("ERROR: Rank %d could not allocate output comm buffers for y-direction\n", my_ID); error = 1; } bail_out(error); bottom_buf_out = top_buf_out+RADIUS*width; top_buf_in[0]=(DTYPE*)prk_shmem_malloc(4*sizeof(DTYPE)*RADIUS*width); if(!top_buf_in) { printf("ERROR: Rank %d could not allocate input comm buffers for y-direction\n", my_ID); error=1; } bail_out(error); top_buf_in[1] = top_buf_in[0] + RADIUS*width; bottom_buf_in[0] = top_buf_in[1] + RADIUS*width; bottom_buf_in[1] = bottom_buf_in[0] + RADIUS*width; right_buf_out=(DTYPE*)malloc(2*sizeof(DTYPE)*RADIUS*height); if (!right_buf_out) { printf("ERROR: Rank %d could not allocate output comm buffers for x-direction\n", my_ID); error = 1; } bail_out(error); left_buf_out=right_buf_out+RADIUS*height; right_buf_in[0]=(DTYPE*)prk_shmem_malloc(4*sizeof(DTYPE)*RADIUS*height); if(!right_buf_in) { printf("ERROR: Rank %d could not allocate input comm buffers for x-dimension\n", my_ID); error=1; } bail_out(error); right_buf_in[1] = right_buf_in[0] + RADIUS*height; left_buf_in[0] = right_buf_in[1] + RADIUS*height; left_buf_in[1] = left_buf_in[0] + RADIUS*height; /* make sure all symmetric heaps are allocated before being used */ shmem_barrier_all(); for (iter = 0; iter<=iterations; iter++){ /* start timer after a warmup iteration */ if (iter == 1) { shmem_barrier_all(); local_stencil_time[0] = wtime(); } /* sw determines which incoming buffer to select */ sw = iter%2; /* need to fetch ghost point data from neighbors */ if (my_IDy < Num_procsy-1) { for (kk=0,j=jend-RADIUS; j<=jend-1; j++) for (i=istart; i<=iend; i++) { top_buf_out[kk++]= IN(i,j); } shmem_putmem(bottom_buf_in[sw], top_buf_out, RADIUS*width*sizeof(DTYPE), top_nbr); #if SPLITFENCE shmem_fence(); 
shmem_int_inc(&iterflag[sw], top_nbr); #endif } if (my_IDy > 0) { for (kk=0,j=jstart; j<=jstart+RADIUS-1; j++) for (i=istart; i<=iend; i++) { bottom_buf_out[kk++]= IN(i,j); } shmem_putmem(top_buf_in[sw], bottom_buf_out, RADIUS*width*sizeof(DTYPE), bottom_nbr); #if SPLITFENCE shmem_fence(); shmem_int_inc(&iterflag[sw], bottom_nbr); #endif } if(my_IDx < Num_procsx-1) { for(kk=0,j=jstart;j<=jend;j++) for(i=iend-RADIUS;i<=iend-1;i++) { right_buf_out[kk++]=IN(i,j); } shmem_putmem(left_buf_in[sw], right_buf_out, RADIUS*height*sizeof(DTYPE), right_nbr); #if SPLITFENCE shmem_fence(); shmem_int_inc(&iterflag[sw], right_nbr); #endif } if(my_IDx>0) { for(kk=0,j=jstart;j<=jend;j++) for(i=istart;i<=istart+RADIUS-1;i++) { left_buf_out[kk++]=IN(i,j); } shmem_putmem(right_buf_in[sw], left_buf_out, RADIUS*height*sizeof(DTYPE), left_nbr); #if SPLITFENCE shmem_fence(); shmem_int_inc(&iterflag[sw], left_nbr); #endif } #if SPLITFENCE == 0 shmem_fence(); if(my_IDy<Num_procsy-1) shmem_int_inc(&iterflag[sw], top_nbr); if(my_IDy>0) shmem_int_inc(&iterflag[sw], bottom_nbr); if(my_IDx<Num_procsx-1) shmem_int_inc(&iterflag[sw], right_nbr); if(my_IDx>0) shmem_int_inc(&iterflag[sw], left_nbr); #endif shmem_int_wait_until(&iterflag[sw], SHMEM_CMP_EQ, count_case*(iter/2+1)); if (my_IDy < Num_procsy-1) { for (kk=0,j=jend; j<=jend+RADIUS-1; j++) for (i=istart; i<=iend; i++) { IN(i,j) = top_buf_in[sw][kk++]; } } if (my_IDy > 0) { for (kk=0,j=jstart-RADIUS; j<=jstart-1; j++) for (i=istart; i<=iend; i++) { IN(i,j) = bottom_buf_in[sw][kk++]; } } if (my_IDx < Num_procsx-1) { for (kk=0,j=jstart; j<=jend; j++) for (i=iend; i<=iend+RADIUS-1; i++) { IN(i,j) = right_buf_in[sw][kk++]; } } if (my_IDx > 0) { for (kk=0,j=jstart; j<=jend; j++) for (i=istart-RADIUS; i<=istart-1; i++) { IN(i,j) = left_buf_in[sw][kk++]; } } /* Apply the stencil operator */ for (j=MAX(jstart,RADIUS); j<=MIN(n-RADIUS-1,jend); j++) { for (i=MAX(istart,RADIUS); i<=MIN(n-RADIUS-1,iend); i++) { #if LOOPGEN #include "loop_body_star.incl" 
#else for (jj=-RADIUS; jj<=RADIUS; jj++) OUT(i,j) += WEIGHT(0,jj)*IN(i,j+jj); for (ii=-RADIUS; ii<0; ii++) OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j); for (ii=1; ii<=RADIUS; ii++) OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j); #endif } } /* add constant to solution to force refresh of neighbor data, if any */ for (j=jstart; j<jend; j++) for (i=istart; i<iend; i++) IN(i,j)+= 1.0; } local_stencil_time[0] = wtime() - local_stencil_time[0]; shmem_barrier_all(); shmem_double_max_to_all(&stencil_time[0], &local_stencil_time[0], 1, 0, 0, Num_procs, pWrk_time, pSync_reduce); /* compute L1 norm in parallel */ local_norm[0] = (DTYPE) 0.0; for (j=MAX(jstart,RADIUS); j<MIN(n-RADIUS,jend); j++) { for (i=MAX(istart,RADIUS); i<MIN(n-RADIUS,iend); i++) { local_norm[0] += (DTYPE)ABS(OUT(i,j)); } } shmem_barrier_all(); #ifdef DOUBLE shmem_double_sum_to_all(&norm[0], &local_norm[0], 1, 0, 0, Num_procs, pWrk_norm, pSync_reduce); #else shmem_float_sum_to_all(&norm[0], &local_norm[0], 1, 0, 0, Num_procs, pWrk_norm, pSync_reduce); #endif /******************************************************************************* ** Analyze and output results. 
********************************************************************************/ /* verify correctness */ if (my_ID == root) { norm[0] /= f_active_points; if (RADIUS > 0) { reference_norm = (DTYPE) (iterations+1) * (COEFX + COEFY); } else { reference_norm = (DTYPE) 0.0; } if (ABS(norm[0]-reference_norm) > EPSILON) { printf("ERROR: L1 norm = "FSTR", Reference L1 norm = "FSTR"\n", norm[0], reference_norm); error = 1; } else { printf("Solution validates\n"); #ifdef VERBOSE printf("Reference L1 norm = "FSTR", L1 norm = "FSTR"\n", reference_norm, norm[0]); #endif } } bail_out(error); if (my_ID == root) { /* flops/stencil: 2 flops (fma) for each point in the stencil, plus one flop for the update of the input of the array */ flops = (DTYPE) (2*stencil_size+1) * f_active_points; avgtime = stencil_time[0]/iterations; printf("Rate (MFlops/s): "FSTR" Avg time (s): %lf\n", 1.0E-06 * flops/avgtime, avgtime); } prk_shmem_free(top_buf_in); prk_shmem_free(right_buf_in); free(top_buf_out); free(right_buf_out); prk_shmem_free(pSync_bcast); prk_shmem_free(pSync_reduce); prk_shmem_free(pWrk_time); prk_shmem_free(pWrk_norm); prk_shmem_finalize(); exit(EXIT_SUCCESS); }
Main(CkArgMsg* m) { int num_chares, min_size; long nsquare; CkPrintf("Parallel Research Kernels Version %s\n", PRKVERSION); CkPrintf("Charm++ stencil execution on 2D grid\n"); if (m->argc != 4) { CkPrintf("%s <maxiterations> <grid_size> <overdecomposition factor>\n", m->argv[0]); CkExit(); } // store the main proxy mainProxy = thisProxy; maxiterations = atoi(m->argv[1]); if (maxiterations < 1) { CkPrintf("ERROR: maxiterations must be positive: %d", maxiterations); CkExit(); } n = atoi(m->argv[2]); nsquare = n * n; if (nsquare < CkNumPes()) { CkPrintf("ERROR: Grid size %ld must be larger than #PEs %d", nsquare, CkNumPes()); CkExit(); } overdecomposition = atoi(m->argv[3]); if (n < overdecomposition) { CkPrintf("ERROR: Grid size %d must be larger than overdecomposition %d", n, overdecomposition); CkExit(); } if (RADIUS < 0) { CkPrintf("ERROR: Stencil radius %d should be non-negative\n", RADIUS); CkExit(); } if (2*RADIUS +1 > n) { CkPrintf("ERROR: Stencil diameter %d exceeds grid size %d\n", 2*RADIUS +1 , n); CkExit(); } // compute decomposition that has smallest surface/volume ratio num_chares = CkNumPes()*overdecomposition; for (num_chare_cols= (int) (sqrt(num_chares+1)); num_chare_cols>0; num_chare_cols--) { if (!(num_chares%num_chare_cols)) { num_chare_rows = num_chares/num_chare_cols; break; } } // determine best way to create a 2D grid of ranks (closest to square) */ factor(num_chares, &num_chare_cols, &num_chare_rows); min_size = (n+num_chare_cols-1)/num_chare_cols; if (min_size<RADIUS) { CkPrintf("ERROR: Some tiles smaller than radius of difference stencil\n"); CkExit(); } // print info CkPrintf("Number of Charm++ PEs = %d\n", CkNumPes()); CkPrintf("Overdecomposition = %d\n", overdecomposition); CkPrintf("Grid size = %d\n", n); CkPrintf("Radius of stencil = %d\n", RADIUS); CkPrintf("Chares in x/y-direction = %d/%d\n", num_chare_cols, num_chare_rows); #if STAR CkPrintf("Type of stencil = star\n"); #else CkPrintf("Type of stencil = compact\n"); CkPrintf("ERROR: 
Compact stencil not (yet) supported\n"); CkExit(); #endif #if LOOPGEN CkPrintf("Script used to expand stencil loop body\n"); #else CkPrintf("Compact representation of stencil loop body\n"); #endif CkPrintf("Number of iterations = %d\n", maxiterations); // Create new array of worker chares array = CProxy_Stencil::ckNew(num_chare_cols, num_chare_rows); /* fill the stencil weights to reflect a discrete divergence operator */ for (int j=-RADIUS; j<=RADIUS; j++) for (int i=-RADIUS; i<=RADIUS; i++) WEIGHT(i,j) = 0.0; #if STAR for (int i=1; i<=RADIUS; i++) { WEIGHT(0, i) = WEIGHT( i,0) = (1.0/(2.0*i*RADIUS)); WEIGHT(0,-i) = WEIGHT(-i,0) = -(1.0/(2.0*i*RADIUS)); } #else stencil_size = (2*RADIUS+1)*(2*RADIUS+1); for (int j=1; j<=RADIUS; j++) { for (int i=-j+1; i<j; i++) { WEIGHT(i,j) = (1.0/(4.0*j*(2.0*j-1)*RADIUS)); WEIGHT(i,-j) = -(1.0/(4.0*j*(2.0*j-1)*RADIUS)); WEIGHT(j,i) = (1.0/(4.0*j*(2.0*j-1)*RADIUS)); WEIGHT(-j,i) = -(1.0/(4.0*j*(2.0*j-1)*RADIUS)); } WEIGHT(j,j) = (1.0/(4.0*j*RADIUS)); WEIGHT(-j,-j) = -(1.0/(4.0*j*RADIUS)); } #endif //Start the computation array.run(); }
int main(int argc, char ** argv) { int Num_procs; /* number of ranks */ int Num_procsx, Num_procsy; /* number of ranks in each coord direction */ int Num_groupsx, Num_groupsy; /* number of blocks in each coord direction */ int my_group; /* sequence number of shared memory block */ int my_group_IDx, my_group_IDy; /* coordinates of block within block grid */ int group_size; /* number of ranks in shared memory group */ int group_sizex, group_sizey; /* number of ranks in block in each coord direction */ int my_ID; /* MPI rank */ int my_global_IDx, my_global_IDy; /* coordinates of rank in overall rank grid */ int my_local_IDx, my_local_IDy; /* coordinates of rank within shared memory block */ int right_nbr; /* global rank of right neighboring tile */ int left_nbr; /* global rank of left neighboring tile */ int top_nbr; /* global rank of top neighboring tile */ int bottom_nbr; /* global rank of bottom neighboring tile */ int local_nbr[4]; /* list of synchronizing local neighbors */ int num_local_nbrs; /* number of synchronizing local neighbors */ int dummy; DTYPE *top_buf_out; /* communication buffer */ DTYPE *top_buf_in; /* " " */ DTYPE *bottom_buf_out; /* " " */ DTYPE *bottom_buf_in; /* " " */ DTYPE *right_buf_out; /* " " */ DTYPE *right_buf_in; /* " " */ DTYPE *left_buf_out; /* " " */ DTYPE *left_buf_in; /* " " */ int root = 0; long n, width, height;/* linear global and block grid dimension */ int width_rank, height_rank; /* linear local dimension */ int iter, leftover; /* dummies */ int istart_rank, iend_rank; /* bounds of grid tile assigned to calling rank */ int jstart_rank, jend_rank; /* bounds of grid tile assigned to calling rank */ int istart, iend; /* bounds of grid block containing tile */ int jstart, jend; /* bounds of grid block containing tile */ DTYPE norm, /* L1 norm of solution */ local_norm, /* contribution of calling rank to L1 norm */ reference_norm; /* value to be matched by computed norm */ DTYPE f_active_points; /* interior of grid with respect to 
stencil */ DTYPE flops; /* floating point ops per iteration */ int iterations; /* number of times to run the algorithm */ double local_stencil_time,/* timing parameters */ stencil_time, avgtime; int stencil_size; /* number of points in stencil */ DTYPE * RESTRICT in; /* input grid values */ DTYPE * RESTRICT out; /* output grid values */ long total_length_in; /* total required length to store input array */ long total_length_out;/* total required length to store output array */ int error=0; /* error flag */ DTYPE weight[2*RADIUS+1][2*RADIUS+1]; /* weights of points in the stencil */ MPI_Request request[8]; /* requests for sends & receives in 4 coord directions */ MPI_Win shm_win_in; /* shared memory window object for IN array */ MPI_Win shm_win_out; /* shared memory window object for OUT array */ MPI_Comm shm_comm_prep; /* preparatory shared memory communicator */ MPI_Comm shm_comm; /* Shared Memory Communicator */ int shm_procs; /* # of rankes in shared domain */ int shm_ID; /* MPI rank in shared memory domain */ MPI_Aint size_in; /* size of the IN array in shared memory window */ MPI_Aint size_out; /* size of the OUT array in shared memory window */ int size_mul; /* one for shm_comm root, zero for the other ranks */ int disp_unit; /* ignored */ /******************************************************************************* ** Initialize the MPI environment ********************************************************************************/ MPI_Init(&argc,&argv); MPI_Comm_rank(MPI_COMM_WORLD, &my_ID); MPI_Comm_size(MPI_COMM_WORLD, &Num_procs); /******************************************************************************* ** process, test, and broadcast input parameters ********************************************************************************/ if (my_ID == root) { printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("MPI+SHM stencil execution on 2D grid\n"); #if !STAR printf("ERROR: Compact stencil not supported\n"); error = 1; goto 
ENDOFTESTS; #endif if (argc != 4){ printf("Usage: %s <#ranks per coherence domain><# iterations> <array dimension> \n", *argv); error = 1; goto ENDOFTESTS; } group_size = atoi(*++argv); if (group_size < 1) { printf("ERROR: # ranks per coherence domain must be >= 1 : %d \n",group_size); error = 1; goto ENDOFTESTS; } if (Num_procs%group_size) { printf("ERROR: total # %d ranks not divisible by ranks per coherence domain %d\n", Num_procs, group_size); error = 1; goto ENDOFTESTS; } iterations = atoi(*++argv); if (iterations < 0){ printf("ERROR: iterations must be >= 0 : %d \n",iterations); error = 1; goto ENDOFTESTS; } n = atol(*++argv); long nsquare = n * n; if (nsquare < Num_procs){ printf("ERROR: grid size must be at least # ranks: %ld\n", nsquare); error = 1; goto ENDOFTESTS; } if (RADIUS < 0) { printf("ERROR: Stencil radius %d should be non-negative\n", RADIUS); error = 1; goto ENDOFTESTS; } if (2*RADIUS +1 > n) { printf("ERROR: Stencil radius %d exceeds grid size %ld\n", RADIUS, n); error = 1; goto ENDOFTESTS; } ENDOFTESTS:; } bail_out(error); MPI_Bcast(&n, 1, MPI_LONG, root, MPI_COMM_WORLD); MPI_Bcast(&iterations, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&group_size, 1, MPI_INT, root, MPI_COMM_WORLD); /* determine best way to create a 2D grid of ranks (closest to square, for best surface/volume ratio); we do this brute force for now. 
The decomposition needs to be such that shared memory groups can evenly tessellate the rank grid */ for (Num_procsx=(int) (sqrt(Num_procs+1)); Num_procsx>0; Num_procsx--) { if (!(Num_procs%Num_procsx)) { Num_procsy = Num_procs/Num_procsx; for (group_sizex=(int)(sqrt(group_size+1)); group_sizex>0; group_sizex--) { if (!(group_size%group_sizex) && !(Num_procsx%group_sizex)) { group_sizey=group_size/group_sizex; break; } } if (!(Num_procsy%group_sizey)) break; } } if (my_ID == root) { printf("Number of ranks = %d\n", Num_procs); printf("Grid size = %ld\n", n); printf("Radius of stencil = %d\n", RADIUS); printf("Tiles in x/y-direction = %d/%d\n", Num_procsx, Num_procsy); printf("Tiles per shared memory domain = %d\n", group_size); printf("Tiles in x/y-direction in group = %d/%d\n", group_sizex, group_sizey); printf("Type of stencil = star\n"); #if LOCAL_BARRIER_SYNCH printf("Local synchronization = barrier\n"); #else printf("Local synchronization = point to point\n"); #endif #if DOUBLE printf("Data type = double precision\n"); #else printf("Data type = single precision\n"); #endif #if LOOPGEN printf("Script used to expand stencil loop body\n"); #else printf("Compact representation of stencil loop body\n"); #endif printf("Number of iterations = %d\n", iterations); } /* Setup for Shared memory regions */ /* first divide WORLD in groups of size group_size */ MPI_Comm_split(MPI_COMM_WORLD, my_ID/group_size, my_ID%group_size, &shm_comm_prep); /* derive from that an SHM communicator */ MPI_Comm_split_type(shm_comm_prep, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shm_comm); MPI_Comm_rank(shm_comm, &shm_ID); MPI_Comm_size(shm_comm, &shm_procs); /* do sanity check, making sure groups did not shrink in second comm split */ if (shm_procs != group_size) MPI_Abort(MPI_COMM_WORLD, 666); Num_groupsx = Num_procsx/group_sizex; Num_groupsy = Num_procsy/group_sizey; my_group = my_ID/group_size; my_group_IDx = my_group%Num_groupsx; my_group_IDy = my_group/Num_groupsx; my_local_IDx = 
my_ID%group_sizex; my_local_IDy = (my_ID%group_size)/group_sizex; my_global_IDx = my_group_IDx*group_sizex+my_local_IDx; my_global_IDy = my_group_IDy*group_sizey+my_local_IDy; /* set all neighboring ranks to -1 (no communication with those ranks) */ left_nbr = right_nbr = top_nbr = bottom_nbr = -1; /* keep track of local neighbors for local synchronization */ num_local_nbrs = 0; if (my_local_IDx == group_sizex-1 && my_group_IDx != (Num_groupsx-1)) { right_nbr = (my_group+1)*group_size+shm_ID-group_sizex+1; } if (my_local_IDx != group_sizex-1) { local_nbr[num_local_nbrs++] = shm_ID + 1; } if (my_local_IDx == 0 && my_group_IDx != 0) { left_nbr = (my_group-1)*group_size+shm_ID+group_sizex-1; } if (my_local_IDx != 0) { local_nbr[num_local_nbrs++] = shm_ID - 1; } if (my_local_IDy == group_sizey-1 && my_group_IDy != (Num_groupsy-1)) { top_nbr = (my_group+Num_groupsx)*group_size + my_local_IDx; } if (my_local_IDy != group_sizey-1) { local_nbr[num_local_nbrs++] = shm_ID + group_sizex; } if (my_local_IDy == 0 && my_group_IDy != 0) { bottom_nbr = (my_group-Num_groupsx)*group_size + group_sizex*(group_sizey-1)+my_local_IDx; } if (my_local_IDy != 0) { local_nbr[num_local_nbrs++] = shm_ID - group_sizex; } /* compute amount of space required for input and solution arrays for the block, and also compute index sets */ width = n/Num_groupsx; leftover = n%Num_groupsx; if (my_group_IDx<leftover) { istart = (width+1) * my_group_IDx; iend = istart + width; } else { istart = (width+1) * leftover + width * (my_group_IDx-leftover); iend = istart + width - 1; } width = iend - istart + 1; if (width == 0) { printf("ERROR: rank %d has no work to do\n", my_ID); error = 1; } bail_out(error); height = n/Num_groupsy; leftover = n%Num_groupsy; if (my_group_IDy<leftover) { jstart = (height+1) * my_group_IDy; jend = jstart + height; } else { jstart = (height+1) * leftover + height * (my_group_IDy-leftover); jend = jstart + height - 1; } height = jend - jstart + 1; if (height == 0) { printf("ERROR: 
rank %d has no work to do\n", my_ID); error = 1; } bail_out(error); if (width < RADIUS || height < RADIUS) { printf("ERROR: rank %d has work tile smaller then stencil radius; w=%ld,h=%ld\n", my_ID, width, height); error = 1; } bail_out(error); total_length_in = (width+2*RADIUS)*(height+2*RADIUS)*sizeof(DTYPE); total_length_out = width*height*sizeof(DTYPE); /* only the root of each SHM domain specifies window of nonzero size */ size_mul = (shm_ID==0); size_in= total_length_in*size_mul; MPI_Win_allocate_shared(size_in, sizeof(double), MPI_INFO_NULL, shm_comm, (void *) &in, &shm_win_in); MPI_Win_lock_all(MPI_MODE_NOCHECK, shm_win_in); MPI_Win_shared_query(shm_win_in, MPI_PROC_NULL, &size_in, &disp_unit, (void *)&in); if (in == NULL){ printf("Error allocating space for input array by group %d\n",my_group); error = 1; } bail_out(error); size_out= total_length_out*size_mul; MPI_Win_allocate_shared(size_out, sizeof(double), MPI_INFO_NULL, shm_comm, (void *) &out, &shm_win_out); MPI_Win_lock_all(MPI_MODE_NOCHECK, shm_win_out); MPI_Win_shared_query(shm_win_out, MPI_PROC_NULL, &size_out, &disp_unit, (void *)&out); if (out == NULL){ printf("Error allocating space for output array by group %d\n", my_group); error = 1; } bail_out(error); /* determine index set assigned to each rank */ width_rank = width/group_sizex; leftover = width%group_sizex; if (my_local_IDx<leftover) { istart_rank = (width_rank+1) * my_local_IDx; iend_rank = istart_rank + width_rank; } else { istart_rank = (width_rank+1) * leftover + width_rank * (my_local_IDx-leftover); iend_rank = istart_rank + width_rank - 1; } istart_rank += istart; iend_rank += istart; width_rank = iend_rank - istart_rank + 1; height_rank = height/group_sizey; leftover = height%group_sizey; if (my_local_IDy<leftover) { jstart_rank = (height_rank+1) * my_local_IDy; jend_rank = jstart_rank + height_rank; } else { jstart_rank = (height_rank+1) * leftover + height_rank * (my_local_IDy-leftover); jend_rank = jstart_rank + height_rank - 1; 
} jstart_rank+=jstart; jend_rank+=jstart; height_rank = jend_rank - jstart_rank + 1; if (height_rank*width_rank==0) { error = 1; printf("Rank %d has no work to do\n", my_ID); } bail_out(error); /* allocate communication buffers for halo values */ top_buf_out = (DTYPE *) prk_malloc(4*sizeof(DTYPE)*RADIUS*width_rank); if (!top_buf_out) { printf("ERROR: Rank %d could not allocated comm buffers for y-direction\n", my_ID); error = 1; } bail_out(error); top_buf_in = top_buf_out + RADIUS*width_rank; bottom_buf_out = top_buf_out + 2*RADIUS*width_rank; bottom_buf_in = top_buf_out + 3*RADIUS*width_rank; right_buf_out = (DTYPE *) prk_malloc(4*sizeof(DTYPE)*RADIUS*height_rank); if (!right_buf_out) { printf("ERROR: Rank %d could not allocated comm buffers for x-direction\n", my_ID); error = 1; } bail_out(error); right_buf_in = right_buf_out + RADIUS*height_rank; left_buf_out = right_buf_out + 2*RADIUS*height_rank; left_buf_in = right_buf_out + 3*RADIUS*height_rank; /* fill the stencil weights to reflect a discrete divergence operator */ for (int jj=-RADIUS; jj<=RADIUS; jj++) for (int ii=-RADIUS; ii<=RADIUS; ii++) WEIGHT(ii,jj) = (DTYPE) 0.0; stencil_size = 4*RADIUS+1; for (int ii=1; ii<=RADIUS; ii++) { WEIGHT(0, ii) = WEIGHT( ii,0) = (DTYPE) (1.0/(2.0*ii*RADIUS)); WEIGHT(0,-ii) = WEIGHT(-ii,0) = -(DTYPE) (1.0/(2.0*ii*RADIUS)); } norm = (DTYPE) 0.0; f_active_points = (DTYPE) (n-2*RADIUS)*(DTYPE) (n-2*RADIUS); /* intialize the input and output arrays */ for (int j=jstart_rank; j<=jend_rank; j++) for (int i=istart_rank; i<=iend_rank; i++) { IN(i,j) = COEFX*i+COEFY*j; OUT(i,j) = (DTYPE)0.0; } /* LOAD/STORE FENCE */ MPI_Win_sync(shm_win_in); MPI_Win_sync(shm_win_out); MPI_Barrier(shm_comm); for (iter = 0; iter<=iterations; iter++){ /* start timer after a warmup iteration */ if (iter == 1) { MPI_Barrier(MPI_COMM_WORLD); local_stencil_time = wtime(); } /* need to fetch ghost point data from neighbors in y-direction */ if (top_nbr != -1) { MPI_Irecv(top_buf_in, RADIUS*width_rank, 
MPI_DTYPE, top_nbr, 101, MPI_COMM_WORLD, &(request[1])); for (int kk=0,j=jend_rank-RADIUS+1; j<=jend_rank; j++) for (int i=istart_rank; i<=iend_rank; i++) { top_buf_out[kk++]= IN(i,j); } MPI_Isend(top_buf_out, RADIUS*width_rank,MPI_DTYPE, top_nbr, 99, MPI_COMM_WORLD, &(request[0])); } if (bottom_nbr != -1) { MPI_Irecv(bottom_buf_in,RADIUS*width_rank, MPI_DTYPE, bottom_nbr, 99, MPI_COMM_WORLD, &(request[3])); for (int kk=0,j=jstart_rank; j<=jstart_rank+RADIUS-1; j++) for (int i=istart_rank; i<=iend_rank; i++) { bottom_buf_out[kk++]= IN(i,j); } MPI_Isend(bottom_buf_out, RADIUS*width_rank,MPI_DTYPE, bottom_nbr, 101, MPI_COMM_WORLD, &(request[2])); } if (top_nbr != -1) { MPI_Wait(&(request[0]), MPI_STATUS_IGNORE); MPI_Wait(&(request[1]), MPI_STATUS_IGNORE); for (int kk=0,j=jend_rank+1; j<=jend_rank+RADIUS; j++) for (int i=istart_rank; i<=iend_rank; i++) { IN(i,j) = top_buf_in[kk++]; } } if (bottom_nbr != -1) { MPI_Wait(&(request[2]), MPI_STATUS_IGNORE); MPI_Wait(&(request[3]), MPI_STATUS_IGNORE); for (int kk=0,j=jstart_rank-RADIUS; j<=jstart_rank-1; j++) for (int i=istart_rank; i<=iend_rank; i++) { IN(i,j) = bottom_buf_in[kk++]; } } /* LOAD/STORE FENCE */ MPI_Win_sync(shm_win_in); /* need to fetch ghost point data from neighbors in x-direction */ if (right_nbr != -1) { MPI_Irecv(right_buf_in, RADIUS*height_rank, MPI_DTYPE, right_nbr, 1010, MPI_COMM_WORLD, &(request[1+4])); for (int kk=0,j=jstart_rank; j<=jend_rank; j++) for (int i=iend_rank-RADIUS+1; i<=iend_rank; i++) { right_buf_out[kk++]= IN(i,j); } MPI_Isend(right_buf_out, RADIUS*height_rank, MPI_DTYPE, right_nbr, 990, MPI_COMM_WORLD, &(request[0+4])); } if (left_nbr != -1) { MPI_Irecv(left_buf_in, RADIUS*height_rank, MPI_DTYPE, left_nbr, 990, MPI_COMM_WORLD, &(request[3+4])); for (int kk=0,j=jstart_rank; j<=jend_rank; j++) for (int i=istart_rank; i<=istart_rank+RADIUS-1; i++) { left_buf_out[kk++]= IN(i,j); } MPI_Isend(left_buf_out, RADIUS*height_rank, MPI_DTYPE, left_nbr, 1010, MPI_COMM_WORLD, &(request[2+4])); } 
if (right_nbr != -1) { MPI_Wait(&(request[0+4]), MPI_STATUS_IGNORE); MPI_Wait(&(request[1+4]), MPI_STATUS_IGNORE); for (int kk=0,j=jstart_rank; j<=jend_rank; j++) for (int i=iend_rank+1; i<=iend_rank+RADIUS; i++) { IN(i,j) = right_buf_in[kk++]; } } if (left_nbr != -1) { MPI_Wait(&(request[2+4]), MPI_STATUS_IGNORE); MPI_Wait(&(request[3+4]), MPI_STATUS_IGNORE); for (int kk=0,j=jstart_rank; j<=jend_rank; j++) for (int i=istart_rank-RADIUS; i<=istart_rank-1; i++) { IN(i,j) = left_buf_in[kk++]; } } /* LOAD/STORE FENCE */ MPI_Win_sync(shm_win_in); /* Apply the stencil operator */ for (int j=MAX(jstart_rank,RADIUS); j<=MIN(n-RADIUS-1,jend_rank); j++) { for (int i=MAX(istart_rank,RADIUS); i<=MIN(n-RADIUS-1,iend_rank); i++) { #if LOOPGEN #include "loop_body_star.incl" #else for (int jj=-RADIUS; jj<=RADIUS; jj++) OUT(i,j) += WEIGHT(0,jj)*IN(i,j+jj); for (int ii=-RADIUS; ii<0; ii++) OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j); for (int ii=1; ii<=RADIUS; ii++) OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j); #endif } } /* LOAD/STORE FENCE */ MPI_Win_sync(shm_win_out); #if LOCAL_BARRIER_SYNCH MPI_Barrier(shm_comm); // needed to avoid writing IN while other ranks are reading it #else for (int i=0; i<num_local_nbrs; i++) { MPI_Irecv(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm, &(request[i])); MPI_Send(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm); } MPI_Waitall(num_local_nbrs, request, MPI_STATUSES_IGNORE); #endif /* add constant to solution to force refresh of neighbor data, if any */ for (int j=jstart_rank; j<=jend_rank; j++) for (int i=istart_rank; i<=iend_rank; i++) IN(i,j)+= 1.0; /* LOAD/STORE FENCE */ MPI_Win_sync(shm_win_in); #if LOCAL_BARRIER_SYNCH MPI_Barrier(shm_comm); // needed to avoid reading IN while other ranks are writing it #else for (int i=0; i<num_local_nbrs; i++) { MPI_Irecv(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm, &(request[i])); MPI_Send(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm); } MPI_Waitall(num_local_nbrs, request, MPI_STATUSES_IGNORE); #endif } /* end 
of iterations */ local_stencil_time = wtime() - local_stencil_time; MPI_Reduce(&local_stencil_time, &stencil_time, 1, MPI_DOUBLE, MPI_MAX, root, MPI_COMM_WORLD); /* compute L1 norm in parallel */ local_norm = (DTYPE) 0.0; for (int j=MAX(jstart_rank,RADIUS); j<=MIN(n-RADIUS-1,jend_rank); j++) { for (int i=MAX(istart_rank,RADIUS); i<=MIN(n-RADIUS-1,iend_rank); i++) { local_norm += (DTYPE)ABS(OUT(i,j)); } } MPI_Reduce(&local_norm, &norm, 1, MPI_DTYPE, MPI_SUM, root, MPI_COMM_WORLD); /******************************************************************************* ** Analyze and output results. ********************************************************************************/ /* verify correctness */ if (my_ID == root) { norm /= f_active_points; if (RADIUS > 0) { reference_norm = (DTYPE) (iterations+1) * (COEFX + COEFY); } else { reference_norm = (DTYPE) 0.0; } if (ABS(norm-reference_norm) > EPSILON) { printf("ERROR: L1 norm = "FSTR", Reference L1 norm = "FSTR"\n", norm, reference_norm); error = 1; } else { printf("Solution validates\n"); #if VERBOSE printf("Reference L1 norm = "FSTR", L1 norm = "FSTR"\n", reference_norm, norm); #endif } } bail_out(error); MPI_Win_unlock_all(shm_win_in); MPI_Win_unlock_all(shm_win_out); MPI_Win_free(&shm_win_in); MPI_Win_free(&shm_win_out); if (my_ID == root) { /* flops/stencil: 2 flops (fma) for each point in the stencil, plus one flop for the update of the input of the array */ flops = (DTYPE) (2*stencil_size+1) * f_active_points; avgtime = stencil_time/iterations; printf("Rate (MFlops/s): "FSTR" Avg time (s): %lf\n", 1.0E-06 * flops/avgtime, avgtime); } MPI_Finalize(); exit(EXIT_SUCCESS); }
int main(int argc, char ** argv) { int Num_procs; /* number of ranks */ int Num_procsx, Num_procsy; /* number of ranks in each coord direction */ int my_ID; /* MPI rank */ int my_IDx, my_IDy; /* coordinates of rank in rank grid */ int right_nbr; /* global rank of right neighboring tile */ int left_nbr; /* global rank of left neighboring tile */ int top_nbr; /* global rank of top neighboring tile */ int bottom_nbr; /* global rank of bottom neighboring tile */ DTYPE *top_buf_out; /* communication buffer */ DTYPE *top_buf_in; /* " " */ DTYPE *bottom_buf_out; /* " " */ DTYPE *bottom_buf_in; /* " " */ DTYPE *right_buf_out; /* " " */ DTYPE *right_buf_in; /* " " */ DTYPE *left_buf_out; /* " " */ DTYPE *left_buf_in; /* " " */ int root = 0; int n, width, height;/* linear global and local grid dimension */ long nsquare; /* total number of grid points */ int i, j, ii, jj, kk, it, jt, iter, leftover; /* dummies */ int istart, iend; /* bounds of grid tile assigned to calling rank */ int jstart, jend; /* bounds of grid tile assigned to calling rank */ DTYPE norm, /* L1 norm of solution */ local_norm, /* contribution of calling rank to L1 norm */ reference_norm; DTYPE f_active_points; /* interior of grid with respect to stencil */ DTYPE flops; /* floating point ops per iteration */ int iterations; /* number of times to run the algorithm */ double local_stencil_time,/* timing parameters */ stencil_time, avgtime; int stencil_size; /* number of points in stencil */ int nthread_input, /* thread parameters */ nthread; DTYPE * RESTRICT in; /* input grid values */ DTYPE * RESTRICT out; /* output grid values */ long total_length_in; /* total required length to store input array */ long total_length_out;/* total required length to store output array */ int error=0; /* error flag */ DTYPE weight[2*RADIUS+1][2*RADIUS+1]; /* weights of points in the stencil */ MPI_Request request[8]; /******************************************************************************* ** Initialize the MPI 
environment ********************************************************************************/ MPI_Init(&argc,&argv); MPI_Comm_rank(MPI_COMM_WORLD, &my_ID); MPI_Comm_size(MPI_COMM_WORLD, &Num_procs); /******************************************************************************* ** process, test, and broadcast input parameters ********************************************************************************/ if (my_ID == root) { printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("MPI+OPENMP stencil execution on 2D grid\n"); #ifndef STAR printf("ERROR: Compact stencil not supported\n"); error = 1; goto ENDOFTESTS; #endif if (argc != 4){ printf("Usage: %s <#threads><#iterations> <array dimension> \n", *argv); error = 1; goto ENDOFTESTS; } /* Take number of threads to request from command line */ nthread_input = atoi(*++argv); if ((nthread_input < 1) || (nthread_input > MAX_THREADS)) { printf("ERROR: Invalid number of threads: %d\n", nthread_input); error = 1; goto ENDOFTESTS; } iterations = atoi(*++argv); if (iterations < 1){ printf("ERROR: iterations must be >= 1 : %d \n",iterations); error = 1; goto ENDOFTESTS; } n = atoi(*++argv); nsquare = (long) n * (long) n; if (nsquare < Num_procs){ printf("ERROR: grid size %ld must be at least # ranks: %d\n", nsquare, Num_procs); error = 1; goto ENDOFTESTS; } if (RADIUS < 0) { printf("ERROR: Stencil radius %d should be non-negative\n", RADIUS); error = 1; goto ENDOFTESTS; } if (2*RADIUS +1 > n) { printf("ERROR: Stencil radius %d exceeds grid size %d\n", RADIUS, n); error = 1; goto ENDOFTESTS; } ENDOFTESTS:; } bail_out(error); /* determine best way to create a 2D grid of ranks (closest to square, for best surface/volume ratio); we do this brute force for now */ for (Num_procsx=(int) (sqrt(Num_procs+1)); Num_procsx>0; Num_procsx--) { if (!(Num_procs%Num_procsx)) { Num_procsy = Num_procs/Num_procsx; break; } } my_IDx = my_ID%Num_procsx; my_IDy = my_ID/Num_procsx; /* compute neighbors; don't worry about dropping 
off the edges of the grid */ right_nbr = my_ID+1; left_nbr = my_ID-1; top_nbr = my_ID+Num_procsx; bottom_nbr = my_ID-Num_procsx; MPI_Bcast(&n, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&iterations, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&nthread_input, 1, MPI_INT, root, MPI_COMM_WORLD); omp_set_num_threads(nthread_input); if (my_ID == root) { printf("Number of ranks = %d\n", Num_procs); printf("Number of threads = %d\n", omp_get_max_threads()); printf("Grid size = %d\n", n); printf("Radius of stencil = %d\n", RADIUS); printf("Tiles in x/y-direction = %d/%d\n", Num_procsx, Num_procsy); printf("Type of stencil = star\n"); #if DOUBLE printf("Data type = double precision\n"); #else printf("Data type = single precision\n"); #endif #if LOOPGEN printf("Script used to expand stencil loop body\n"); #else printf("Compact representation of stencil loop body\n"); #endif printf("Number of iterations = %d\n", iterations); } /* compute amount of space required for input and solution arrays */ width = n/Num_procsx; leftover = n%Num_procsx; if (my_IDx<leftover) { istart = (width+1) * my_IDx; iend = istart + width; } else { istart = (width+1) * leftover + width * (my_IDx-leftover); iend = istart + width - 1; } width = iend - istart + 1; if (width == 0) { printf("ERROR: rank %d has no work to do\n", my_ID); error = 1; } bail_out(error); height = n/Num_procsy; leftover = n%Num_procsy; if (my_IDy<leftover) { jstart = (height+1) * my_IDy; jend = jstart + height; } else { jstart = (height+1) * leftover + height * (my_IDy-leftover); jend = jstart + height - 1; } height = jend - jstart + 1; if (height == 0) { printf("ERROR: rank %d has no work to do\n", my_ID); error = 1; } bail_out(error); if (width < RADIUS || height < RADIUS) { printf("ERROR: rank %d has work tile smaller then stencil radius\n", my_ID); error = 1; } bail_out(error); total_length_in = (width+2*RADIUS)*(height+2*RADIUS)*sizeof(DTYPE); if (total_length_in/(height+2*RADIUS) != (width+2*RADIUS)*sizeof(DTYPE)) { 
printf("ERROR: Space for %d x %d input array cannot be represented\n", width+2*RADIUS, height+2*RADIUS); error = 1; } bail_out(error); total_length_out = width*height*sizeof(DTYPE); in = (DTYPE *) prk_malloc(total_length_in); out = (DTYPE *) prk_malloc(total_length_out); if (!in || !out) { printf("ERROR: rank %d could not allocate space for input/output array\n", my_ID); error = 1; } bail_out(error); /* fill the stencil weights to reflect a discrete divergence operator */ for (jj=-RADIUS; jj<=RADIUS; jj++) for (ii=-RADIUS; ii<=RADIUS; ii++) WEIGHT(ii,jj) = (DTYPE) 0.0; stencil_size = 4*RADIUS+1; for (ii=1; ii<=RADIUS; ii++) { WEIGHT(0, ii) = WEIGHT( ii,0) = (DTYPE) (1.0/(2.0*ii*RADIUS)); WEIGHT(0,-ii) = WEIGHT(-ii,0) = -(DTYPE) (1.0/(2.0*ii*RADIUS)); } norm = (DTYPE) 0.0; f_active_points = (DTYPE) (n-2*RADIUS)*(DTYPE) (n-2*RADIUS); /* intialize the input and output arrays */ #pragma omp parallel for private (i) for (j=jstart; j<=jend; j++) for (i=istart; i<=iend; i++) { IN(i,j) = COEFX*i+COEFY*j; OUT(i,j) = (DTYPE)0.0; } /* allocate communication buffers for halo values */ top_buf_out = (DTYPE *) prk_malloc(4*sizeof(DTYPE)*RADIUS*width); if (!top_buf_out) { printf("ERROR: Rank %d could not allocated comm buffers for y-direction\n", my_ID); error = 1; } bail_out(error); top_buf_in = top_buf_out + RADIUS*width; bottom_buf_out = top_buf_out + 2*RADIUS*width; bottom_buf_in = top_buf_out + 3*RADIUS*width; right_buf_out = (DTYPE *) prk_malloc(4*sizeof(DTYPE)*RADIUS*height); if (!right_buf_out) { printf("ERROR: Rank %d could not allocated comm buffers for x-direction\n", my_ID); error = 1; } bail_out(error); right_buf_in = right_buf_out + RADIUS*height; left_buf_out = right_buf_out + 2*RADIUS*height; left_buf_in = right_buf_out + 3*RADIUS*height; for (iter = 0; iter<=iterations; iter++){ /* start timer after a warmup iteration */ if (iter == 1) { MPI_Barrier(MPI_COMM_WORLD); local_stencil_time = wtime(); } /* need to fetch ghost point data from neighbors in y-direction */ 
if (my_IDy < Num_procsy-1) { MPI_Irecv(top_buf_in, RADIUS*width, MPI_DTYPE, top_nbr, 101, MPI_COMM_WORLD, &(request[1])); for (kk=0,j=jend-RADIUS+1; j<=jend; j++) for (i=istart; i<=iend; i++) { top_buf_out[kk++]= IN(i,j); } MPI_Isend(top_buf_out, RADIUS*width,MPI_DTYPE, top_nbr, 99, MPI_COMM_WORLD, &(request[0])); } if (my_IDy > 0) { MPI_Irecv(bottom_buf_in,RADIUS*width, MPI_DTYPE, bottom_nbr, 99, MPI_COMM_WORLD, &(request[3])); for (kk=0,j=jstart; j<=jstart+RADIUS-1; j++) for (i=istart; i<=iend; i++) { bottom_buf_out[kk++]= IN(i,j); } MPI_Isend(bottom_buf_out, RADIUS*width,MPI_DTYPE, bottom_nbr, 101, MPI_COMM_WORLD, &(request[2])); } if (my_IDy < Num_procsy-1) { MPI_Wait(&(request[0]), MPI_STATUS_IGNORE); MPI_Wait(&(request[1]), MPI_STATUS_IGNORE); for (kk=0,j=jend+1; j<=jend+RADIUS; j++) for (i=istart; i<=iend; i++) { IN(i,j) = top_buf_in[kk++]; } } if (my_IDy > 0) { MPI_Wait(&(request[2]), MPI_STATUS_IGNORE); MPI_Wait(&(request[3]), MPI_STATUS_IGNORE); for (kk=0,j=jstart-RADIUS; j<=jstart-1; j++) for (i=istart; i<=iend; i++) { IN(i,j) = bottom_buf_in[kk++]; } } /* need to fetch ghost point data from neighbors in x-direction */ if (my_IDx < Num_procsx-1) { MPI_Irecv(right_buf_in, RADIUS*height, MPI_DTYPE, right_nbr, 1010, MPI_COMM_WORLD, &(request[1+4])); for (kk=0,j=jstart; j<=jend; j++) for (i=iend-RADIUS+1; i<=iend; i++) { right_buf_out[kk++]= IN(i,j); } MPI_Isend(right_buf_out, RADIUS*height, MPI_DTYPE, right_nbr, 990, MPI_COMM_WORLD, &(request[0+4])); } if (my_IDx > 0) { MPI_Irecv(left_buf_in, RADIUS*height, MPI_DTYPE, left_nbr, 990, MPI_COMM_WORLD, &(request[3+4])); for (kk=0,j=jstart; j<=jend; j++) for (i=istart; i<=istart+RADIUS-1; i++) { left_buf_out[kk++]= IN(i,j); } MPI_Isend(left_buf_out, RADIUS*height, MPI_DTYPE, left_nbr, 1010, MPI_COMM_WORLD, &(request[2+4])); } if (my_IDx < Num_procsx-1) { MPI_Wait(&(request[0+4]), MPI_STATUS_IGNORE); MPI_Wait(&(request[1+4]), MPI_STATUS_IGNORE); for (kk=0,j=jstart; j<=jend; j++) for (i=iend+1; i<=iend+RADIUS; 
i++) { IN(i,j) = right_buf_in[kk++]; } } if (my_IDx > 0) { MPI_Wait(&(request[2+4]), MPI_STATUS_IGNORE); MPI_Wait(&(request[3+4]), MPI_STATUS_IGNORE); for (kk=0,j=jstart; j<=jend; j++) for (i=istart-RADIUS; i<=istart-1; i++) { IN(i,j) = left_buf_in[kk++]; } } /* Apply the stencil operator */ #pragma omp parallel for private (i, j, ii, jj) for (j=MAX(jstart,RADIUS); j<=MIN(n-RADIUS-1,jend); j++) { for (i=MAX(istart,RADIUS); i<=MIN(n-RADIUS-1,iend); i++) { #if LOOPGEN #include "loop_body_star.incl" #else for (jj=-RADIUS; jj<=RADIUS; jj++) OUT(i,j) += WEIGHT(0,jj)*IN(i,j+jj); for (ii=-RADIUS; ii<0; ii++) OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j); for (ii=1; ii<=RADIUS; ii++) OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j); #endif } } #pragma omp parallel for private (i) /* add constant to solution to force refresh of neighbor data, if any */ for (j=jstart; j<=jend; j++) for (i=istart; i<=iend; i++) IN(i,j)+= 1.0; } local_stencil_time = wtime() - local_stencil_time; MPI_Reduce(&local_stencil_time, &stencil_time, 1, MPI_DOUBLE, MPI_MAX, root, MPI_COMM_WORLD); /* compute L1 norm in parallel */ local_norm = (DTYPE) 0.0; #pragma omp parallel for reduction(+:local_norm) private (i) for (j=MAX(jstart,RADIUS); j<=MIN(n-RADIUS-1,jend); j++) { for (i=MAX(istart,RADIUS); i<=MIN(n-RADIUS-1,iend); i++) { local_norm += (DTYPE)ABS(OUT(i,j)); } } MPI_Reduce(&local_norm, &norm, 1, MPI_DTYPE, MPI_SUM, root, MPI_COMM_WORLD); /******************************************************************************* ** Analyze and output results. 
********************************************************************************/ /* verify correctness */ if (my_ID == root) { norm /= f_active_points; if (RADIUS > 0) { reference_norm = (DTYPE) (iterations+1) * (COEFX + COEFY); } else { reference_norm = (DTYPE) 0.0; } if (ABS(norm-reference_norm) > EPSILON) { printf("ERROR: L1 norm = "FSTR", Reference L1 norm = "FSTR"\n", norm, reference_norm); error = 1; } else { printf("Solution validates\n"); #if VERBOSE printf("Reference L1 norm = "FSTR", L1 norm = "FSTR"\n", reference_norm, norm); #endif } } bail_out(error); if (my_ID == root) { /* flops/stencil: 2 flops (fma) for each point in the stencil, plus one flop for the update of the input of the array */ flops = (DTYPE) (2*stencil_size+1) * f_active_points; avgtime = stencil_time/iterations; printf("Rate (MFlops/s): "FSTR" Avg time (s): %lf\n", 1.0E-06 * flops/avgtime, avgtime); } MPI_Finalize(); exit(EXIT_SUCCESS); }
/*
 * Remove the node whose key compares equal to `key` from `tree`.
 *
 * A node with two children is first rotated downwards (towards the side of
 * its heavier subtree) until it has at most one child; it is then spliced
 * out of the tree and freed, and the weight of each of its former ancestors
 * is decremented.
 *
 * tree  tree to modify; must not be NULL
 * key   key to look up with tree->key_cmp; must not be NULL
 * del   if non-zero, tree->key_del / tree->dat_del (when set) are called on
 *       the removed node's key and datum
 *
 * Returns 0 if a node was removed, -1 if no node matches `key`.
 */
int
wb_tree_remove(wb_tree *tree, const void *key, int del)
{
	int rv;
	wb_node *node, *child, *parent, *out;

	ASSERT(tree != NULL);
	ASSERT(key != NULL);

	node = tree->root;
	while (node) {
		rv = tree->key_cmp(key, node->key);
		if (rv) {
			node = rv < 0 ? node->llink : node->rlink;
			continue;
		}

		if (node->llink != NULL && node->rlink != NULL) {
			/*
			 * Two children: rotate the node one level down and look
			 * at it again on the next pass of the loop.
			 */
			if (WEIGHT(node->llink) > WEIGHT(node->rlink)) {
				if (WEIGHT(node->llink->llink) < WEIGHT(node->llink->rlink))
					rot_left(tree, node->llink);
				out = node->llink;
				rot_right(tree, node);
				node = out->rlink;
			} else {
				if (WEIGHT(node->rlink->rlink) < WEIGHT(node->rlink->llink))
					rot_right(tree, node->rlink);
				out = node->rlink;
				rot_left(tree, node);
				node = out->llink;
			}
			continue;
		}

		/* At most one child: splice the node out of the tree. */
		child = node->llink == NULL ? node->rlink : node->llink;
		/*
		 * Remember the parent before the node is freed; the weight
		 * update below must not touch the freed node.
		 */
		parent = node->parent;

		if (child)
			child->parent = parent;
		if (del) {
			if (tree->key_del)
				tree->key_del(node->key);
			if (tree->dat_del)
				tree->dat_del(node->dat);
		}
		if (parent) {
			if (parent->llink == node)
				parent->llink = child;
			else
				parent->rlink = child;
		} else {
			tree->root = child;
		}
		FREE(node);

		/* Every former ancestor has lost one node from its subtree. */
		if (--tree->count) {
			for (out = parent; out; out = out->parent)
				out->weight--;
		}
		return 0;
	}
	return -1;
}
/*
 * wb_tree_insert -- store the pair (key, dat) in the tree.
 *
 * tree       tree to modify (must not be NULL)
 * key, dat   key and datum to store
 * overwrite  governs what happens when an equal key is already present:
 *            zero leaves the tree untouched; non-zero releases the old
 *            key/datum through tree->key_del / tree->dat_del (when set)
 *            and stores the new pair in the existing node
 *
 * Returns  1 when the key was already present and overwrite is zero,
 *          0 when the pair was stored (new node or overwritten node),
 *         -1 when node_new() returned NULL.
 */
int
wb_tree_insert(wb_tree *tree, void *key, void *dat, int overwrite)
{
	int cmp = 0;
	wb_node *cur;
	wb_node *up = NULL;
	float ratio;

	ASSERT(tree != NULL);

	/* Walk down to the insertion point, remembering the last node seen. */
	cur = tree->root;
	while (cur != NULL) {
		cmp = tree->key_cmp(key, cur->key);
		if (cmp == 0) {
			/* Equal key already stored. */
			if (overwrite == 0)
				return 1;
			if (tree->key_del)
				tree->key_del(cur->key);
			if (tree->dat_del)
				tree->dat_del(cur->dat);
			cur->key = key;
			cur->dat = dat;
			return 0;
		}
		up = cur;
		cur = (cmp < 0) ? cur->llink : cur->rlink;
	}

	cur = node_new(key, dat);
	if (cur == NULL)
		return -1;

	cur->parent = up;
	if (up == NULL) {
		/* First node of an empty tree becomes the root. */
		ASSERT(tree->count == 0);
		tree->root = cur;
		tree->count = 1;
		return 0;
	}

	/* `cmp' still holds the result of the comparison against `up'. */
	if (cmp < 0)
		up->llink = cur;
	else
		up->rlink = cur;

	/*
	 * Climb back to the root.  The parent pointer is fetched before any
	 * rotation because rotating changes cur->parent.
	 */
	for (cur = up; cur != NULL; cur = up) {
		up = cur->parent;
		cur->weight++;

		ratio = WEIGHT(cur->llink) / (float)cur->weight;
		if (ratio < ALPHA_0) {
			/* Left side too light: inspect the right child. */
			ratio = WEIGHT(cur->rlink->llink) / (float)cur->rlink->weight;
			if (ratio < ALPHA_3) {
				/* single left rotation */
				rot_left(tree, cur);
			} else {
				/* double rotation: right on the child, then left */
				rot_right(tree, cur->rlink);
				rot_left(tree, cur);
			}
		} else if (ratio > ALPHA_1) {
			/* Left side too heavy: inspect the left child. */
			ratio = WEIGHT(cur->llink->llink) / (float)cur->llink->weight;
			if (ratio > ALPHA_2) {
				/* single right rotation */
				rot_right(tree, cur);
			} else {
				/* double rotation: left on the child, then right */
				rot_left(tree, cur->llink);
				rot_right(tree, cur);
			}
		}
	}

	tree->count++;
	return 0;
}
/* profile 13 : 31 dBm | p00,p01,p02,p03,p04,p05,p06,p07,p08,p09,p10,p11,p12,p13,p14,p15 */ { /* ramp up */ { { 0,0,0,0,0,0,0,15,48,53,150,204,242,250,255,255 }, /* ramp down */ { 255,217,124,67,14,0,0,0,0,0,0,0,0,0,0,0 } } }, /*-------------------------------------------------------------------------------------*/ /* profile 14 : 33 dBm | p00,p01,p02,p03,p04,p05,p06,p07,p08,p09,p10,p11,p12,p13,p14,p15 */ { /* ramp up */ { { 0,0,0,0,0,0,0,25,36,80,144,195,242,248,255,255 }, /* ramp down */ { 255,209,145,71,0,0,0,0,0,0,0,0,0,0,0,0 } } }, /*-------------------------------------------------------------------------------------*/ /* profile 15 : 35 dBm | p00,p01,p02,p03,p04,p05,p06,p07,p08,p09,p10,p11,p12,p13,p14,p15 */ { /* ramp up */ { { 0,0,0,0,0,0,54,59,60,83,116,151,190,228,255,255 }, /* ramp down */ { 255,230,200,171,131,78,55,43,27,18,0,0,0,0,0,0 } } }, /*-------------------------------------------------------------------------------------*/ }, /* ARFCN WEIGHT */ { /* max arfcn , mid_level , hi_weight , lo_weight */ { 160 , 11 , WEIGHT(1.030), WEIGHT(1.026) }, { 190 , 11 , WEIGHT(1.005), WEIGHT(1.013) }, { 220 , 11 , WEIGHT(0.986), WEIGHT(1.013) }, { 190 , 11 , WEIGHT(1.005), WEIGHT(1.013) }, { 220 , 11 , WEIGHT(0.986), WEIGHT(1.013) }, { 251 , 11 , WEIGHT(0.965), WEIGHT(1.000) }, /*------------------------------------------------------*/ { TABLE_END } }, /* Battery WEIGHT */ { /* low temp, mid temp, hi temp */ { WEIGHT(1.000), WEIGHT(1.000), WEIGHT(1.000) }, /* low volt */ { WEIGHT(1.000), WEIGHT(1.000), WEIGHT(1.000) }, /* mid volt */ { WEIGHT(1.000), WEIGHT(1.000), WEIGHT(1.000) }, /* hi volt */ }, };
int main(int argc, char ** argv) { int n; /* linear grid dimension */ int i, j, ii, jj, it, jt, iter; /* dummies */ double norm, /* L1 norm of solution */ reference_norm; double f_active_points; /* interior of grid with respect to stencil */ DTYPE flops; /* floating point ops per iteration */ int iterations; /* number of times to run the algorithm */ double stencil_time, /* timing parameters */ avgtime, max_time; int stencil_size; /* number of points in stencil */ DTYPE weight[2*RADIUS+1][2*RADIUS+1]; /* weights of points in the stencil */ int istart; /* bounds of grid tile assigned to calling rank */ int jstart; /* bounds of grid tile assigned to calling rank */ int Num_procsx, Num_procsy; /******************************************************************************* ** process and test input parameters ********************************************************************************/ if(MYTHREAD == 0){ printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("UPC stencil execution on 2D grid\n"); fflush(stdout); } if (argc != 4 && argc != 3) if(MYTHREAD == 0) bail_out("Usage: %s <# iterations> <array dimension> [x_tiles]\n", *argv); iterations = atoi(*++argv); if (iterations < 1) if(MYTHREAD == 0) bail_out("iterations must be >= 1 : %d", iterations); n = atoi(*++argv); if (n < 1) if(MYTHREAD == 0) bail_out("grid dimension must be positive: %d", n); if (argc == 4) Num_procsx = atoi(*++argv); else Num_procsx = 0; if(Num_procsx < 0) if(MYTHREAD == 0) bail_out("Number of tiles in the x-direction should be positive (got: %d)", Num_procsx); if(Num_procsx > THREADS) if(MYTHREAD == 0) bail_out("Number of tiles in the x-direction should be < THREADS (got: %d)", Num_procsx); /* Num_procsx=0 refers to automated calculation of division on each coordinates like MPI code */ if(Num_procsx == 0){ for (Num_procsx=(int) (sqrt(THREADS+1)); Num_procsx>0; Num_procsx--) { if (!(THREADS%Num_procsx)) { Num_procsy = THREADS/Num_procsx; break; } } } else { Num_procsy = 
THREADS / Num_procsx; } if(RADIUS < 1) if(MYTHREAD == 0) bail_out("Stencil radius %d should be positive", RADIUS); if(2*RADIUS +1 > n) if(MYTHREAD == 0) bail_out("Stencil radius %d exceeds grid size %d", RADIUS, n); if(Num_procsx * Num_procsy != THREADS){ bail_out("Num_procsx * Num_procsy != THREADS"); } /* compute amount of space required for input and solution arrays */ int my_IDx = MYTHREAD % Num_procsx; int my_IDy = MYTHREAD / Num_procsx; int blockx = n / Num_procsx; int leftover = n % Num_procsx; if (my_IDx < leftover) { istart = (blockx + 1) * my_IDx; blockx += 1; } else { istart = (blockx+1) * leftover + blockx * (my_IDx-leftover); } if (blockx == 0) bail_out("No work to do on x-direction!"); int blocky = n / Num_procsy; leftover = n % Num_procsy; if (my_IDy < leftover) { jstart = (blocky+1) * my_IDy; blocky += 1; } else { jstart = (blocky+1) * leftover + blocky * (my_IDy-leftover); } if (blocky == 0) bail_out("No work to do on y-direction!"); if(blockx < RADIUS || blocky < RADIUS) { bail_out("blockx < RADIUS || blocky < RADIUS"); } int myoffsetx = istart - RADIUS; int myoffsety = jstart - RADIUS; thread_offsetx[MYTHREAD] = myoffsetx; thread_offsety[MYTHREAD] = myoffsety; int sizex = blockx + 2*RADIUS; int sizey = blocky + 2*RADIUS; thread_sizex[MYTHREAD] = sizex; thread_sizey[MYTHREAD] = sizey; upc_barrier; local_shared_block_ptrs in_array = shared_2d_array_alloc(sizex, sizey, myoffsetx, myoffsety); local_shared_block_ptrs out_array = shared_2d_array_alloc(sizex, sizey, myoffsetx, myoffsety); in_arrays[MYTHREAD] = in_array; out_arrays[MYTHREAD] = out_array; DTYPE **in_array_private = shared_2d_array_to_private(in_array, sizex, sizey, myoffsetx, myoffsety); DTYPE **out_array_private = shared_2d_array_to_private(out_array, sizex, sizey, myoffsetx, myoffsety); upc_barrier; private_in_arrays = prk_malloc(sizeof(private_shared_block_ptrs) * THREADS); if(private_in_arrays == NULL) bail_out("Cannot allocate private_in_arrays"); private_out_arrays = 
prk_malloc(sizeof(private_shared_block_ptrs) * THREADS); if(private_out_arrays == NULL) bail_out("Cannot allocate private_out_arrays"); for(int thread=0; thread<THREADS; thread++){ private_in_arrays[thread] = partially_privatize(in_arrays[thread], thread); private_out_arrays[thread] = partially_privatize(out_arrays[thread], thread); } /* intialize the input and output arrays */ for(int y=myoffsety; y<myoffsety + sizey; y++){ for(int x=myoffsetx; x<myoffsetx + sizex; x++){ in_array_private[y][x] = COEFX*x + COEFY*y; out_array[y][x] = 0.; } } upc_barrier; for(int y=myoffsety; y<myoffsety + sizey; y++){ for(int x=myoffsetx; x<myoffsetx + sizex; x++){ if(in_array_private[y][x] != COEFX*x + COEFY*y) bail_out("x=%d y=%d in_array=%f != %f", x, y, in_array[y][x], COEFX*x + COEFY*y); } } /* fill the stencil weights to reflect a discrete divergence operator */ for (jj=-RADIUS; jj<=RADIUS; jj++) for (ii=-RADIUS; ii<=RADIUS; ii++) WEIGHT(ii, jj) = (DTYPE)0.0; stencil_size = 4*RADIUS+1; for (ii=1; ii<=RADIUS; ii++) { WEIGHT(0, ii) = WEIGHT( ii,0) = (DTYPE) (1.0/(2.0*ii*RADIUS)); WEIGHT(0,-ii) = WEIGHT(-ii,0) = -(DTYPE) (1.0/(2.0*ii*RADIUS)); } if(MYTHREAD == 0){ printf("Number of threads = %d\n", THREADS); printf("Grid size = %d\n", n); printf("Radius of stencil = %d\n", RADIUS); printf("Tiles in x/y-direction = %d/%d\n", Num_procsx, Num_procsy); #if DOUBLE printf("Data type = double precision\n"); #else printf("Data type = single precision\n"); #endif #if LOOPGEN printf("Script used to expand stencil loop body\n"); #else printf("Compact representation of stencil loop body\n"); #endif printf("Number of iterations = %d\n", iterations); } upc_barrier; int startx = myoffsetx + RADIUS; int endx = myoffsetx + sizex - RADIUS; int starty = myoffsety + RADIUS; int endy = myoffsety + sizey - RADIUS; if(my_IDx == 0) startx += RADIUS; if(my_IDx == Num_procsx - 1) endx -= RADIUS; if(my_IDy == 0) starty += RADIUS; if(my_IDy == Num_procsy - 1) endy -= RADIUS; upc_barrier; for (iter = 0; 
iter<=iterations; iter++){ /* start timer after a warmup iteration */ if (iter == 1) { upc_barrier; stencil_time = wtime(); } /* Get ghost zones */ /* NORTH */ if(my_IDy != 0){ int peer = (my_IDy - 1) * Num_procsx + my_IDx; for (int y=starty - RADIUS; y<starty; y++) { int transfer_size = (endx - startx) * sizeof(DTYPE); upc_memget(&in_array_private[y][startx], &private_in_arrays[peer][y][startx], transfer_size); } } /* SOUTH */ if(my_IDy != Num_procsy - 1){ int peer = (my_IDy + 1) * Num_procsx + my_IDx; for (int y=endy; y<endy + RADIUS; y++) { int transfer_size = (endx - startx) * sizeof(DTYPE); upc_memget(&in_array_private[y][startx], &private_in_arrays[peer][y][startx], transfer_size); } } /* LEFT */ if(my_IDx != 0){ int peer = my_IDy * Num_procsx + my_IDx - 1; for (int y=starty; y<endy; y++) { for (int x=startx - RADIUS; x<startx; x++) { in_array_private[y][x] = private_in_arrays[peer][y][x]; } } } /* RIGHT*/ if(my_IDx != Num_procsx - 1){ int peer = my_IDy * Num_procsx + my_IDx + 1; for (int y=starty; y<endy; y++) { for (int x=endx; x<endx + RADIUS; x++) { in_array_private[y][x] = private_in_arrays[peer][y][x]; } } } /* Apply the stencil operator */ for (j=starty; j<endy; j++) { for (i=startx; i<endx; i++) { #if LOOPGEN #include "loop_body_star.incl" #else for (jj=-RADIUS; jj<=RADIUS; jj++) OUT(i,j) += WEIGHT(0,jj)*IN(i,j+jj); for (ii=-RADIUS; ii<0; ii++) OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j); for (ii=1; ii<=RADIUS; ii++) OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j); #endif } } upc_barrier; /* <- Necessary barrier: some slow threads could use future data */ /* add constant to solution to force refresh of neighbor data, if any */ for(int y=myoffsety + RADIUS; y<myoffsety + sizey - RADIUS; y++) for(int x=myoffsetx + RADIUS; x<myoffsetx + sizex - RADIUS; x++) in_array_private[y][x] += 1.0; upc_barrier; /* <- Necessary barrier: some threads could start on old data */ } /* end of iterations */ stencil_time = wtime() - stencil_time; times[MYTHREAD] = stencil_time; upc_barrier; 
// Compute max_time if(MYTHREAD == 0){ max_time = times[MYTHREAD]; for(i=1; i<THREADS; i++){ if(max_time < times[i]) max_time = times[i]; } } norm = (double) 0.0; f_active_points = (double)(n-2*RADIUS) * (double)(n-2*RADIUS); /* compute L1 norm in parallel */ for (int y=starty; y<endy; y++) { for (int x=startx; x<endx; x++) { norm += (double)ABS(out_array[y][x]); } } norm /= f_active_points; norms[MYTHREAD] = norm; upc_barrier; if(MYTHREAD == 0){ norm = 0.; for(int i=0; i<THREADS; i++) norm += norms[i]; /******************************************************************************* ** Analyze and output results. ********************************************************************************/ /* verify correctness */ reference_norm = (double) (iterations+1) * (COEFX + COEFY); if (ABS(norm - reference_norm) > EPSILON) bail_out("L1 norm = "FSTR", Reference L1 norm = "FSTR"\n", norm, reference_norm); else { printf("Solution validates\n"); #if VERBOSE printf("Reference L1 norm = "FSTR", L1 norm = "FSTR"\n", reference_norm, norm); #endif } flops = (DTYPE) (2*stencil_size+1) * f_active_points; avgtime = max_time/iterations; printf("Rate (MFlops/s): "FSTR" Avg time (s): %lf\n", 1.0E-06 * flops/avgtime, avgtime); exit(EXIT_SUCCESS); } }
int main(int argc, char ** argv) { long n; /* linear grid dimension */ int i, j, ii, jj, iter;/* dummies */ double norm = 0.0, /* L1 norm of solution */ reference_norm; double f_active_points; /* interior of grid with respect to stencil */ double flops; /* floating point ops per iteration */ int iterations=25; /* number of times to run the algorithm */ double stencil_time, /* timing parameters */ avgtime; int stencil_size; /* number of points in stencil */ double * RESTRICT in; /* input grid values */ double * RESTRICT out; /* output grid values */ long total_length; /* total required length to store grid values */ double weight[2*RADIUS+1][2*RADIUS+1]; /* weights of points in the stencil */ if(argc ==2){ n = atoi(argv[1]); } else{ n = DEF_SIZE; } if (2*RADIUS +1 > n) { printf("ERROR: Stencil radius %d exceeds grid size %d\n", RADIUS, n); exit(EXIT_FAIL); } /* allocate the required space */ total_length = n*n*sizeof(double); in = (double *) malloc(total_length); out = (double *) malloc(total_length); if (!in || !out) { printf("ERROR: could not allocate space for input or output array\n"); exit(EXIT_FAIL); } /* fill the stencil weights to reflect a discrete divergence operator */ stencil_size = (2*RADIUS+1)*(2*RADIUS+1); for (jj=-RADIUS; jj<= RADIUS; jj++) for (ii=-RADIUS; ii<= RADIUS; ii++) WEIGHT(ii,jj)=0.0; for (jj=1; jj<=RADIUS; jj++) { for (ii=-jj+1; ii<jj; ii++) { WEIGHT(ii,jj) = (double) (1.0/(4.0*jj*(2.0*jj-1)*RADIUS)); WEIGHT(ii,-jj) = -(double) (1.0/(4.0*jj*(2.0*jj-1)*RADIUS)); WEIGHT(jj,ii) = (double) (1.0/(4.0*jj*(2.0*jj-1)*RADIUS)); WEIGHT(-jj,ii) = -(double) (1.0/(4.0*jj*(2.0*jj-1)*RADIUS)); } WEIGHT(jj,jj) = (double) (1.0/(4.0*jj*RADIUS)); WEIGHT(-jj,-jj) = -(double) (1.0/(4.0*jj*RADIUS)); } f_active_points = (double) (n-2*RADIUS)*(double) (n-2*RADIUS); printf("Serial stencil execution on 2D grid\n"); printf("Grid size = %d\n", n); printf("Radius of stencil = %d\n", RADIUS); printf("Type of stencil = compact\n"); printf("Number of iterations = %d\n", 
iterations); /* intialize the input and output arrays */ for (j=0; j<n; j++) for (i=0; i<n; i++) IN(i,j) = COEFX*i+COEFY*j; for (j=RADIUS; j<n-RADIUS; j++) for (i=RADIUS; i<n-RADIUS; i++) OUT(i,j) = 0.0; for (iter = 0; iter<=iterations; iter++){ /* start timer after a warmup iteration */ if (iter == 1) stencil_time = wtime(); /* Apply the stencil operator */ for (j=RADIUS; j<n-RADIUS; j++) { for (i=RADIUS; i<n-RADIUS; i++) { /* would like to be able to unroll this loop, but compiler will ignore */ for (jj=-RADIUS; jj<=RADIUS; jj++) for (ii=-RADIUS; ii<=RADIUS; ii++) OUT(i,j) += WEIGHT(ii,jj)*IN(i+ii,j+jj); } } /* add constant to solution to force refresh of input data */ for (j=0; j<n; j++) for (i=0; i<n; i++) IN(i,j)+= 1.0; } /* end of iterations */ stencil_time = wtime() - stencil_time; /* compute L1 norm */ for (j=RADIUS; j<n-RADIUS; j++) for (i=RADIUS; i<n-RADIUS; i++) { norm += (double)ABS(OUT(i,j)); } norm /= f_active_points; /****************************************************************************** ** Analyze and output results. *******************************************************************************/ /* verify correctness */ reference_norm = (double) (iterations+1) * (COEFX + COEFY); if (ABS(norm-reference_norm) > EPSILON) { printf("ERROR: L1 norm = %lf, Reference L1 norm = %lf\n", norm, reference_norm); exit(EXIT_FAIL); } else { printf("Solution validates\n"); } flops = (double) (2*stencil_size+1) * f_active_points; avgtime = stencil_time/iterations; printf("Serial Rate (MFlops/s): %lf Avg time (s): %lf\n", 1.0E-06 * flops/avgtime, avgtime); exit(EXIT_SUCCESS); }
/*
 * solution_add -- add column `col' to the solution `sol'.
 *
 * The column is inserted into sol->row (the result of sm_row_insert is
 * deliberately ignored) and sol->cost grows by WEIGHT(weight, col).
 */
void
solution_add(solution_t *sol, int *weight, int col)
{
    /* record the column first, then account for its cost */
    (void) sm_row_insert(sol->row, col);

    sol->cost = sol->cost + (WEIGHT(weight, col));
}