/*
 * Flat-tree max reduction.
 *
 * The PE whose _world_rank is 0 fetches `source` from every PE, folds the
 * element-wise REDUCE_MAX into a private accumulator and then writes the
 * accumulated result into `target` on every PE.
 *
 * target  - destination of the reduced values (nreduce elements, written on
 *           every PE through shmem_putmem)
 * source  - input values (nreduce elements, read from every PE through
 *           shmem_getmem)
 * nreduce - number of elements to reduce
 *
 * Every PE must call this routine: it ends with shmem_barrier_all().
 * NOTE(review): nothing synchronizes the PEs before the gather, so callers
 * presumably make sure all sources are final before entering -- confirm.
 */
void
flat_tree (STREAM_TYPE * target, STREAM_TYPE * source, int nreduce)
{
    // only one PE needs to access this section
    if (_world_rank == 0 && nreduce > 0) {
        const size_t nbytes = nreduce * sizeof (STREAM_TYPE);
        /* One allocation holds both the accumulator and the fetch buffer.
         * They must be distinct: fetching into the accumulator would throw
         * away everything reduced so far. */
        STREAM_TYPE *acc = (STREAM_TYPE *) malloc (2 * nbytes);
        STREAM_TYPE *fetched;

        if (acc == NULL) {
            /* Returning here would leave the other PEs stuck in the barrier. */
            abort ();
        }
        fetched = acc + nreduce;

        /* First, finish gathering */
        for (int n = 0; n < _world_size; n++) {
            shmem_getmem (fetched, source, nbytes, n);

            /* Compute max; the first contribution seeds the accumulator */
            for (int k = 0; k < nreduce; k++) {
                acc[k] = (n == 0) ? fetched[k]
                                  : REDUCE_MAX (acc[k], fetched[k]);
            }
        }

        /* Then, broadcast results */
        for (int n = 0; n < _world_size; n++) {
            shmem_putmem (target, acc, nbytes, n);
        }

        free (acc);
    }

    shmem_barrier_all ();
    return;
}
static void test_prepost(void) { int i, j, k; tmp = 0; total = 0; shmem_barrier_all(); for (i = 0 ; i < niters - 1 ; ++i) { cache_invalidate(); shmem_barrier_all(); tmp = timer(); for (j = 0 ; j < npeers ; ++j) { for (k = 0 ; k < nmsgs ; ++k) { shmem_putmem(recv_buf + (nbytes * (k + j * nmsgs)), send_buf + (nbytes * (k + j * nmsgs)), nbytes, send_peers[npeers - j - 1]); } } shmem_quiet(); shmem_short_wait((short*) (recv_buf + (nbytes * ((nmsgs - 1) + (npeers - 1) * nmsgs))), 0); total += (timer() - tmp); memset(recv_buf, 0, npeers * nmsgs * nbytes); } shmem_double_sum_to_all(&tmp, &total, 1, 0, 0, world_size, reduce_pWrk, reduce_pSync); display_result("pre-post", (niters * npeers * nmsgs * 2) / (tmp / world_size)); }
/*
 * Put p->bufflen bytes starting at p->s_ptr to the same address on the
 * neighbor PE p->prot.nbor.  Lengths that are a multiple of 8 go through
 * shmem_put64 (as bufflen / 8 elements); anything else uses shmem_putmem.
 */
void SendData(ArgStruct *p)
{
    if (p->bufflen % 8 != 0) {
        shmem_putmem(p->s_ptr, p->s_ptr, p->bufflen, p->prot.nbor);
        return;
    }

    shmem_put64(p->s_ptr, p->s_ptr, p->bufflen / 8, p->prot.nbor);
}
void benchmark (struct pe_vars v, char * msg_buffer) { static double pwrk[_SHMEM_REDUCE_SYNC_SIZE]; static long psync[_SHMEM_BCAST_SYNC_SIZE]; static double mr, mr_sum; unsigned long size, i; memset(psync, _SHMEM_SYNC_VALUE, sizeof(long[_SHMEM_BCAST_SYNC_SIZE])); /* * Warmup */ if (v.me < v.pairs) { for (i = 0; i < (ITERS_LARGE * MAX_MSG_SZ); i += MAX_MSG_SZ) { shmem_putmem(&msg_buffer[i], &msg_buffer[i], MAX_MSG_SZ, v.nxtpe); } } shmem_barrier_all(); /* * Benchmark */ for (size = 1; size <= MAX_MSG_SZ; size <<= 1) { i = size < LARGE_THRESHOLD ? ITERS_SMALL : ITERS_LARGE; mr = message_rate(v, msg_buffer, size, i); shmem_double_sum_to_all(&mr_sum, &mr, 1, 0, 0, v.npes, pwrk, psync); print_message_rate(v.me, size, mr_sum); } }
/*
 * Time `iterations` back-to-back puts of `size` bytes to v.nxtpe.
 *
 * The whole buffer (MAX_MSG_SZ * ITERS_LARGE bytes) is touched first and all
 * PEs meet in a barrier.  PEs with v.me >= v.pairs then return 0; the others
 * issue the puts, each from and to the next `size`-byte chunk of `buffer`,
 * and return iterations * 1e6 divided by the elapsed TIME() ticks.
 */
double
message_rate (struct pe_vars v, char * buffer, int size, int iterations)
{
    int64_t t_start, t_stop;
    char *chunk;
    int sent;

    /*
     * Touch memory
     */
    memset(buffer, size, MAX_MSG_SZ * ITERS_LARGE);

    shmem_barrier_all();

    if (!(v.me < v.pairs)) {
        return 0;
    }

    t_start = TIME();

    chunk = buffer;
    sent = 0;
    while (sent < iterations) {
        shmem_putmem(chunk, chunk, size, v.nxtpe);
        chunk += size;
        sent++;
    }

    t_stop = TIME();

    return ((double)iterations * 1e6) / ((double)t_stop - (double)t_start);
}
/*
 * Atomics benchmark driver: seeds rand() with the PE number, warms up with
 * ITERATIONS int-sized puts to v.nxtpe (only on PEs with v.me < v.pairs), and
 * then runs each atomic benchmark in turn, first the default-width variants
 * and then the long long ones.
 */
void
benchmark (struct pe_vars v, union data_types *msg_buffer)
{
    srand(v.me);

    /*
     * Warmup with puts
     */
    if (v.me < v.pairs) {
        unsigned long n = 0;

        while (n < ITERATIONS) {
            void *slot = &msg_buffer[n].int_type;

            shmem_putmem(slot, slot, sizeof(int), v.nxtpe);
            n++;
        }
    }

    /*
     * Performance with atomics
     */
    benchmark_fadd(v, msg_buffer, ITERATIONS);
    benchmark_finc(v, msg_buffer, ITERATIONS);
    benchmark_add(v, msg_buffer, ITERATIONS);
    benchmark_inc(v, msg_buffer, ITERATIONS);
    benchmark_cswap(v, msg_buffer, ITERATIONS);
    benchmark_swap(v, msg_buffer, ITERATIONS);

    benchmark_fadd_longlong(v, msg_buffer, ITERATIONS);
    benchmark_finc_longlong(v, msg_buffer, ITERATIONS);
    benchmark_add_longlong(v, msg_buffer, ITERATIONS);
    benchmark_inc_longlong(v, msg_buffer, ITERATIONS);
    benchmark_cswap_longlong(v, msg_buffer, ITERATIONS);
    benchmark_swap_longlong(v, msg_buffer, ITERATIONS);
}
// Push every t-matrix listed in comm.tmatTo to its remote node with
// shmem_putmem (this replaces an MPI_Isend of the same 2*lDimTmatStore
// doubles, which was tagged with comm.tmatTo[i].globalIdx[j]).
//
// A symmetric scratch buffer is allocated with shmalloc on entry and is the
// destination address of every put; it is now released with shfree before
// returning, where it previously leaked on every call.  shmalloc and shfree
// are collective, so all PEs have to call this routine together.
//
// NOTE(review): the remote copy lands in the peer's scratch buffer, which is
// local to this function -- confirm how the receiver is meant to consume it.
void sendTmats(LSMSCommunication &comm, LocalTypeInfo &local)
{
  const size_t tmatBytes=2*local.lDimTmatStore*double_size;
  void *temp_buff=(void*)shmalloc(tmatBytes);

  for(int i=0; i<comm.numTmatTo; i++)
  {
    int to=comm.tmatTo[i].remoteNode;
    for(int j=0; j<comm.tmatTo[i].numTmats; j++)
    {
      // Assuming comm.tmatTo[i].numTmats == comm.tmatFrom[i].numTmats
      // If not ... check the iteration space mapping between
      // the receivers and the senders (here)
      memcpy(temp_buff,&local.tmatStore(0,comm.tmatFrom[i].tmatStoreIdx[j]),tmatBytes);

      // Note: there is a trade-off here: either reuse the same
      // buffer and enjoy reduced cache misses (and potential
      // power savings due to less data xfers between memories)
      // or enjoy non-blocking communication with different buffers
      // but at the cost of multiple cache misses (and potentially
      // high power consumption)
      shmem_putmem(temp_buff, &local.tmatStore(0,comm.tmatTo[i].tmatStoreIdx[j]),
                   tmatBytes, to);
    }
  }

  // Collective release of the scratch buffer allocated above.
  shfree(temp_buff);
}
static int test_item1(void) { int rc = TC_PASS; int num_proc = 0; int my_proc = 0; int peer; int size; char *buf; int test_byte; int max_heap_size_per_proc; num_proc = _num_pes(); my_proc = _my_pe(); peer = (my_proc + 1) % num_proc; max_heap_size_per_proc = 1L << (sys_log2((memheap_size() * HEAP_USAGE_PERCENT)/ num_proc) - 1); max_heap_size_per_proc = (max_heap_size_per_proc > MAX_SIZE) ? MAX_SIZE : max_heap_size_per_proc; buf = (char *)shmalloc(max_heap_size_per_proc * num_proc); if (!buf) { log_error(OSH_TC, "shmalloc(%d)\n", max_heap_size_per_proc * num_proc); return TC_SETUP_FAIL; } size = 1L << sys_log2(num_proc); size = ((size - 2) > 0) ? size : 4; log_debug(OSH_TC, "%d: buf = %p size=%d\n", my_proc, buf, size); for (; size <= max_heap_size_per_proc; size *=2) { memset(buf + max_heap_size_per_proc * my_proc, 1 + my_proc % (size - 2), max_heap_size_per_proc); log_debug(OSH_TC, "\n%d: b4 barrier size = %d\n", my_proc, size); shmem_barrier_all(); log_debug(OSH_TC, "%d: b4 putmem size = %d %p -> %p\n", my_proc, size, buf+max_heap_size_per_proc*my_proc, buf + max_heap_size_per_proc * my_proc); shmem_putmem(buf+max_heap_size_per_proc*my_proc, buf+max_heap_size_per_proc*my_proc, size, peer); shmem_fence(); test_byte = 0; log_debug(OSH_TC, "%d: b4 getmem size = %d\n %p <- %p ", my_proc, size, &test_byte, buf+max_heap_size_per_proc*peer + size - 1 ); shmem_getmem(&test_byte, buf+max_heap_size_per_proc*my_proc + size - 1, 1, peer); log_debug(OSH_TC, "%d: after getmem size = %d result=%x\n", my_proc, size, test_byte); if (test_byte != 1 + my_proc % (size-2)) { log_error(OSH_TC, "fence failed at size %d got = %x expected = %x\n", size, test_byte, 1 + my_proc % (size-2)); rc = TC_FAIL; } } shfree(buf); log_debug(OSH_TC, rc == TC_PASS? "passed" : "failed"); return rc; }
static void test_one_way(void) { int i, k; int pe_size = world_size; tmp = 0; total = 0; shmem_barrier_all(); if (world_size % 2 == 1) { pe_size = world_size - 1; } if (!(world_size % 2 == 1 && rank == (world_size - 1))) { if (rank < world_size / 2) { for (i = 0 ; i < niters ; ++i) { cache_invalidate(); shmem_barrier(0, 0, pe_size, barrier_pSync); tmp = timer(); for (k = 0 ; k < nmsgs ; ++k) { shmem_putmem(recv_buf + (nbytes * k), send_buf + (nbytes * k), nbytes, rank + (world_size / 2)); } shmem_quiet(); total += (timer() - tmp); } } else { for (i = 0 ; i < niters ; ++i) { cache_invalidate(); shmem_barrier(0, 0, pe_size, barrier_pSync); tmp = timer(); shmem_short_wait((short*) (recv_buf + (nbytes * (nmsgs - 1))), 0); total += (timer() - tmp); memset(recv_buf, 0, npeers * nmsgs * nbytes); } } shmem_double_sum_to_all(&tmp, &total, 1, 0, 0, pe_size, reduce_pWrk, reduce_pSync); display_result("single direction", (niters * nmsgs) / (tmp / world_size)); } shmem_barrier_all(); }
/*
 * Flat-tree max reduction.
 *
 * After an entry barrier, the PE whose _world_rank is 0 seeds an accumulator
 * with its own `source`, fetches `source` from every PE, folds the
 * element-wise REDUCE_MAX into the accumulator and writes the result into
 * `target` on every PE.  A closing barrier follows.
 *
 * target  - destination of the reduced values (nreduce elements, written on
 *           every PE through shmem_putmem)
 * source  - input values (nreduce elements, read from every PE through
 *           shmem_getmem)
 * nreduce - number of elements to reduce
 *
 * Every PE must call this routine (two shmem_barrier_all() calls).
 */
void
flat_tree (STREAM_TYPE * target, STREAM_TYPE * source, int nreduce)
{
    shmem_barrier_all ();

    // only one PE needs to access this section
    if (_world_rank == 0 && nreduce > 0) {
        const size_t nbytes = nreduce * sizeof (STREAM_TYPE);
        /*
         * use temp target in case source/target overlap/same.
         * The accumulator and the fetch buffer share one allocation but are
         * distinct regions: fetching into the accumulator itself would
         * discard everything reduced so far.
         */
        STREAM_TYPE *acc = (STREAM_TYPE *) malloc (2 * nbytes);
        STREAM_TYPE *fetched;

        if (acc == NULL) {
            /* Returning here would leave the other PEs stuck in the barrier. */
            abort ();
        }
        fetched = acc + nreduce;

        for (int j = 0; j < nreduce; j += 1) {
            acc[j] = source[j];
        }

        /* First, finish gathering */
        for (int n = 0; n < _world_size; n++) {
            shmem_getmem (fetched, source, nbytes, n);

            /* Compute max */
            for (int k = 0; k < nreduce; k++) {
                acc[k] = REDUCE_MAX (acc[k], fetched[k]);
            }
        }

        /* Then, broadcast results */
        for (int n = 0; n < _world_size; n++) {
            shmem_putmem (target, acc, nbytes, n);
        }

        free (acc);
    }

    shmem_barrier_all ();
    return;
}
/**
 * \brief Check to make sure the test is correct, SHMEM
 *
 * For every cycle and stage, pairs this rank with (my_rank ^ istage) and, for
 * each of the 256 byte values, fills a buffer with that value, puts it into
 * the partner's buffer, reads it back and reports every byte that differs.
 * The whole body is compiled only when SHMEM is defined.
 *
 * \param tst Struct that tells the number of cycles and stages to run the test.
 * \param m   Struct that holds the results of the test (only buflen is read here).
 */
void bit_SHMEM_test(test_p tst, measurement_p m)
{
#ifdef SHMEM
  buffer_t *abuf, *bbuf, *cbuf;
  int i, k, icycle, istage, partner_rank;
  unsigned char pattern;

  abuf = comm_newbuffer(m->buflen);                 /* set up exchange buffers */
  bbuf = comm_newbuffer(m->buflen);
  cbuf = comm_newbuffer(m->buflen);

  for (icycle = 0; icycle < tst->num_cycles; icycle++) {       /* multiple cycles repeat the test */
    for (istage = 0; istage < tst->num_stages; istage++) {     /* step through the stage schedule */
      partner_rank = my_rank ^ istage;                         /* who's my partner for this stage? */
      shmem_barrier_all();
      if ((partner_rank < num_ranks) && (partner_rank != my_rank)) {  /* valid pair? proceed with test */
        for (k = 0x00; k < 0x100; k++) {                       /* try each byte pattern */
          pattern = k;
          for (i = 0; i < m->buflen; i++)
            ((unsigned char *)(abuf->data))[i] = pattern;

          shmem_putmem(bbuf->data, abuf->data, m->buflen, partner_rank);
          /* The put must be complete at the partner before it is read back;
           * shmem_fence() only orders puts relative to later puts, so wait
           * for completion with shmem_quiet(). */
          shmem_quiet();
          shmem_getmem(cbuf->data, bbuf->data, m->buflen, partner_rank);

          for (i = 0; i < m->buflen; i++) {
            if (((unsigned char *)(cbuf->data))[i] != pattern) {
              printf("DATA ERROR DETECTED:   node:%20s   rank:%10d"
                     "   pattern:0x%2x   buflen:%10d   position:%10d\n",
                     nodename, my_rank, (int)pattern, m->buflen, i);
            } /* if mismatch */
          } /* for buflen */
        } /* for pattern */
      } /* if valid pairing */
    } /* for istage */
  } /* for icycle */

  shmem_barrier_all();
  comm_freebuffer(cbuf);
  comm_freebuffer(bbuf);
  comm_freebuffer(abuf);
#endif
  return;
}
int main(int argc, char **argv) { const int ITER_CNT = 100; const long int MAX_MSG_SIZE = 1048576; int* source_addr; int peer; long int i=0, buff_size; int j=0; long long int start_time, stop_time, res; double time; shmem_init(); int pe_id = shmem_my_pe(); source_addr = (int*) shmem_malloc(MAX_MSG_SIZE); if(pe_id == 1) { if(shmem_n_pes()!=4) fprintf(stderr,"Num PEs should be ==4"); printf("#Message Cnt;Time(s);MR(msgs/sec)\n"); } if (pe_id==1) peer = 3; else if(pe_id==3) peer = 1; get_rtc_res_(&res); for (i = 0; i < SHMEM_BARRIER_SYNC_SIZE; i += 1){ pSync[i] = SHMEM_SYNC_VALUE; } /* Collective operation: Implicit barrier on return from attach */ shmem_barrier_all(); if(pe_id == 1 || pe_id == 3) { for(buff_size=1; buff_size<=MAX_MSG_SIZE; buff_size*=2) { isdone=0; shmem_barrier(1,1,2,pSync); get_rtc_(&start_time); for(j=1;j<=ITER_CNT;j++) { shmem_putmem(source_addr, source_addr, buff_size, peer); shmem_quiet(); shmem_int_put(&isdone, &j, 1, peer); shmem_quiet(); shmem_int_wait(&isdone,j-1); shmem_putmem(source_addr, source_addr, buff_size, peer); shmem_quiet(); } shmem_barrier(1,1,2,pSync); get_rtc_(&stop_time); time = (stop_time - start_time)*1.0/(double)res/ITER_CNT; if(pe_id == 1) { printf("%20ld;%20.12f;%20.12f\n", buff_size, time, (double)buff_size/time); } fflush(stdout); } } shmem_barrier_all(); shmem_finalize(); }
/* Print the verdict line of one routine: "Test <name>: Passed" or "... Failed". */
static void
report_put_test (const char *name, int failed)
{
    printf ("Test %s: %s\n", name, failed ? "Failed" : "Passed");
}

/*
 * Exercises the put family: every PE writes arrays (and scalars) filled with
 * its own PE number to the next PE, and PE 0 checks that what it received
 * equals npes - 1.  Covered: typed puts and shmem_putmem, sized puts
 * (32/64/128), sized and typed strided puts, and the single-element _p
 * routines.  Needs more than one PE; otherwise the test is skipped.
 */
int
main (int argc, char **argv)
{
    int i;
    int nextpe;
    int me, npes;
    /* Despite the names these are failure flags: 0 = no mismatch seen. */
    int success1, success2, success3, success4, success5, success6, success7,
        success8;

    short src1[N];
    int src2[N];
    long src3[N];
    long double src4[N];
    long long src5[N];
    double src6[N];
    float src7[N];
    char src8[N];                /* on the stack like the others: nothing to free */
    short src9;
    int src10;
    long src11;
    double src12;
    float src13;

    short *dest1;
    int *dest2;
    long *dest3;
    long double *dest4;
    long long *dest5;
    double *dest6;
    float *dest7;
    char *dest8;
    short *dest9;
    int *dest10;
    long *dest11;
    double *dest12;
    float *dest13;

    shmem_init ();
    me = shmem_my_pe ();
    npes = shmem_n_pes ();

    if (npes <= 1) {
        printf ("Number of PEs must be > 1 to test shmem put, test skipped\n");
        shmem_finalize ();
        return 0;
    }

    success1 = 0;
    success2 = 0;
    success3 = 0;
    success4 = 0;
    success5 = 0;
    success6 = 0;
    success7 = 0;
    success8 = 0;

    for (i = 0; i < N; i += 1) {
        src1[i] = (short) me;
        src2[i] = me;
        src3[i] = (long) me;
        src4[i] = (long double) me;
        src5[i] = (long long) me;
        src6[i] = (double) me;
        src7[i] = (float) me;
        src8[i] = (char) me;
    }
    src9 = (short) me;
    src10 = me;
    src11 = (long) me;
    src12 = (double) me;
    src13 = (float) me;

    dest1 = (short *) shmem_malloc (N * sizeof (*dest1));
    dest2 = (int *) shmem_malloc (N * sizeof (*dest2));
    dest3 = (long *) shmem_malloc (N * sizeof (*dest3));
    dest4 = (long double *) shmem_malloc (N * sizeof (*dest4));
    dest5 = (long long *) shmem_malloc (N * sizeof (*dest5));
    dest6 = (double *) shmem_malloc (N * sizeof (*dest6));
    dest7 = (float *) shmem_malloc (N * sizeof (*dest7));
    /* N elements are initialized and received below, so allocate N (the
     * buffer used to hold only 4 bytes). */
    dest8 = (char *) shmem_malloc (N * sizeof (*dest8));
    dest9 = (short *) shmem_malloc (sizeof (*dest9));
    dest10 = (int *) shmem_malloc (sizeof (*dest10));
    dest11 = (long *) shmem_malloc (sizeof (*dest11));
    dest12 = (double *) shmem_malloc (sizeof (*dest12));
    dest13 = (float *) shmem_malloc (sizeof (*dest13));

    for (i = 0; i < N; i += 1) {
        dest1[i] = -9;
        dest2[i] = -9;
        dest3[i] = -9;
        dest4[i] = -9;
        dest5[i] = -9;
        dest6[i] = -9;
        dest7[i] = -9.0;
        dest8[i] = -9;
    }
    *dest9 = -9;
    *dest10 = -9;
    *dest11 = -9;
    *dest12 = -9;
    *dest13 = -9.0;

    nextpe = (me + 1) % npes;

    /* Testing shmem_short_put, shmem_int_put, shmem_long_put,
       shmem_longdouble_put, shmem_longlong_put, shmem_double_put,
       shmem_float_put, shmem_putmem */
    shmem_barrier_all ();

    shmem_short_put (dest1, src1, N, nextpe);
    shmem_int_put (dest2, src2, N, nextpe);
    shmem_long_put (dest3, src3, N, nextpe);
    shmem_longdouble_put (dest4, src4, N, nextpe);
    shmem_longlong_put (dest5, src5, N, nextpe);
    shmem_double_put (dest6, src6, N, nextpe);
    shmem_float_put (dest7, src7, N, nextpe);
    shmem_putmem (dest8, src8, N * sizeof (char), nextpe);

    shmem_barrier_all ();

    if (me == 0) {
        for (i = 0; i < N; i += 1) {
            if (dest1[i] != (npes - 1)) {
                success1 = 1;
            }
            if (dest2[i] != (npes - 1)) {
                success2 = 1;
            }
            if (dest3[i] != (npes - 1)) {
                success3 = 1;
            }
            if (dest4[i] != (npes - 1)) {
                success4 = 1;
            }
            if (dest5[i] != (npes - 1)) {
                success5 = 1;
            }
            if (dest6[i] != (npes - 1)) {
                success6 = 1;
            }
            if (dest7[i] != (npes - 1)) {
                success7 = 1;
            }
            if (dest8[i] != (npes - 1)) {
                success8 = 1;
            }
        }

        report_put_test ("shmem_short_put", success1);
        report_put_test ("shmem_int_put", success2);
        report_put_test ("shmem_long_put", success3);
        report_put_test ("shmem_longdouble_put", success4);
        report_put_test ("shmem_longlong_put", success5);
        report_put_test ("shmem_double_put", success6);
        report_put_test ("shmem_float_put", success7);
        report_put_test ("shmem_putmem", success8);
    }
    shmem_barrier_all ();

    /* Testing shmem_put32, shmem_put64, shmem_put128 */
    if (sizeof (int) == 4) {
        for (i = 0; i < N; i += 1) {
            dest2[i] = -9;
            dest3[i] = -9;
            dest4[i] = -9;
        }
        success2 = 0;
        success3 = 0;
        success4 = 0;

        shmem_barrier_all ();

        shmem_put32 (dest2, src2, N, nextpe);
        shmem_put64 (dest3, src3, N, nextpe);
        shmem_put128 (dest4, src4, N, nextpe);

        shmem_barrier_all ();

        if (me == 0) {
            for (i = 0; i < N; i += 1) {
                if (dest2[i] != (npes - 1)) {
                    success2 = 1;
                }
                if (dest3[i] != (npes - 1)) {
                    success3 = 1;
                }
                if (dest4[i] != (npes - 1)) {
                    success4 = 1;
                }
            }
            report_put_test ("shmem_put32", success2);
            report_put_test ("shmem_put64", success3);
            report_put_test ("shmem_put128", success4);
        }
    }
    else if (sizeof (int) == 8) {
        for (i = 0; i < N; i += 1) {
            dest1[i] = -9;
            dest2[i] = -9;
            dest3[i] = -9;
        }
        success1 = 0;
        success2 = 0;
        success3 = 0;

        shmem_barrier_all ();

        shmem_put32 (dest1, src1, N, nextpe);
        shmem_put64 (dest2, src2, N, nextpe);
        shmem_put128 (dest3, src3, N, nextpe);

        shmem_barrier_all ();

        if (me == 0) {
            for (i = 0; i < N; i += 1) {
                if (dest1[i] != (npes - 1)) {
                    success1 = 1;
                }
                if (dest2[i] != (npes - 1)) {
                    success2 = 1;
                }
                if (dest3[i] != (npes - 1)) {
                    success3 = 1;
                }
            }
            report_put_test ("shmem_put32", success1);
            report_put_test ("shmem_put64", success2);
            report_put_test ("shmem_put128", success3);
        }
    }

    /* Testing shmem_iput32, shmem_iput64, shmem_iput128.
       The source stride is 2, so only N / 2 elements can be taken from an
       N-element source without reading past its end; the checks below look
       at exactly those N / 2 destination elements. */
    shmem_barrier_all ();
    if (sizeof (int) == 4) {
        for (i = 0; i < N; i += 1) {
            dest2[i] = -9;
            dest3[i] = -9;
            dest4[i] = -9;
        }
        success2 = 0;
        success3 = 0;
        success4 = 0;

        shmem_barrier_all ();

        shmem_iput32 (dest2, src2, 1, 2, N / 2, nextpe);
        shmem_iput64 (dest3, src3, 1, 2, N / 2, nextpe);
        shmem_iput128 (dest4, src4, 1, 2, N / 2, nextpe);

        shmem_barrier_all ();

        if (me == 0) {
            for (i = 0; i < N / 2; i += 1) {
                if (dest2[i] != (npes - 1)) {
                    success2 = 1;
                }
                if (dest3[i] != (npes - 1)) {
                    success3 = 1;
                }
                if (dest4[i] != (npes - 1)) {
                    success4 = 1;
                }
            }
            report_put_test ("shmem_iput32", success2);
            report_put_test ("shmem_iput64", success3);
            report_put_test ("shmem_iput128", success4);
        }
    }
    else if (sizeof (int) == 8) {
        for (i = 0; i < N; i += 1) {
            dest1[i] = -9;
            dest2[i] = -9;
            dest3[i] = -9;
        }
        success1 = 0;
        success2 = 0;
        success3 = 0;

        shmem_barrier_all ();

        shmem_iput32 (dest1, src1, 1, 2, N / 2, nextpe);
        shmem_iput64 (dest2, src2, 1, 2, N / 2, nextpe);
        shmem_iput128 (dest3, src3, 1, 2, N / 2, nextpe);

        shmem_barrier_all ();

        if (me == 0) {
            for (i = 0; i < N / 2; i += 1) {
                if (dest1[i] != (npes - 1)) {
                    success1 = 1;
                }
                if (dest2[i] != (npes - 1)) {
                    success2 = 1;
                }
                if (dest3[i] != (npes - 1)) {
                    success3 = 1;
                }
            }
            report_put_test ("shmem_iput32", success1);
            report_put_test ("shmem_iput64", success2);
            report_put_test ("shmem_iput128", success3);
        }
    }

    /* Testing shmem_short_iput, shmem_int_iput, shmem_long_iput,
       shmem_double_iput, shmem_float_iput (same N / 2 limit as above) */
    for (i = 0; i < N; i += 1) {
        dest1[i] = -9;
        dest2[i] = -9;
        dest3[i] = -9;
        dest6[i] = -9;
        dest7[i] = -9;
    }
    success1 = 0;
    success2 = 0;
    success3 = 0;
    success6 = 0;
    success7 = 0;

    shmem_barrier_all ();

    shmem_short_iput (dest1, src1, 1, 2, N / 2, nextpe);
    shmem_int_iput (dest2, src2, 1, 2, N / 2, nextpe);
    shmem_long_iput (dest3, src3, 1, 2, N / 2, nextpe);
    shmem_double_iput (dest6, src6, 1, 2, N / 2, nextpe);
    shmem_float_iput (dest7, src7, 1, 2, N / 2, nextpe);

    shmem_barrier_all ();

    if (me == 0) {
        for (i = 0; i < N / 2; i += 1) {
            if (dest1[i] != (npes - 1)) {
                success1 = 1;
            }
            if (dest2[i] != (npes - 1)) {
                success2 = 1;
            }
            if (dest3[i] != (npes - 1)) {
                success3 = 1;
            }
            if (dest6[i] != (npes - 1)) {
                success6 = 1;
            }
            if (dest7[i] != (npes - 1)) {
                success7 = 1;
            }
        }
        report_put_test ("shmem_short_iput", success1);
        report_put_test ("shmem_int_iput", success2);
        report_put_test ("shmem_long_iput", success3);
        report_put_test ("shmem_double_iput", success6);
        report_put_test ("shmem_float_iput", success7);
    }

    /* Testing shmem_double_p, shmem_float_p, shmem_int_p, shmem_long_p,
       shmem_short_p */
    shmem_barrier_all ();

    shmem_short_p (dest9, src9, nextpe);
    shmem_int_p (dest10, src10, nextpe);
    shmem_long_p (dest11, src11, nextpe);
    shmem_double_p (dest12, src12, nextpe);
    shmem_float_p (dest13, src13, nextpe);

    shmem_barrier_all ();

    if (me == 0) {
        report_put_test ("shmem_short_p", *dest9 != (npes - 1));
        report_put_test ("shmem_int_p", *dest10 != (npes - 1));
        report_put_test ("shmem_long_p", *dest11 != (npes - 1));
        report_put_test ("shmem_double_p", *dest12 != (npes - 1));
        report_put_test ("shmem_float_p", *dest13 != (npes - 1));
    }

    shmem_barrier_all ();

    shmem_free (dest1);
    shmem_free (dest2);
    shmem_free (dest3);
    shmem_free (dest4);
    shmem_free (dest5);
    shmem_free (dest6);
    shmem_free (dest7);
    shmem_free (dest8);
    shmem_free (dest9);
    shmem_free (dest10);
    shmem_free (dest11);
    shmem_free (dest12);
    shmem_free (dest13);

    shmem_finalize ();

    return 0;
}
/*
 * Data-exchange step of a permutation.
 *
 * 1. Derive per-processor byte counts from _PERM_LCNT / _PERM_RCNT times the
 *    element size; `scatter` selects which of the two is the outgoing
 *    (ldecnt) and which the incoming (rdecnt) count.
 * 2. For every other processor data is expected from: clear its rflag entry
 *    and put the content of rdata[i] (one element) into that processor's
 *    rptr[_INDEX].
 * 3. For every other processor data goes to: wait until rptr[i] is non-zero,
 *    take it as the destination address, clear it, and put ldecnt[i] bytes
 *    of ldata[i] there.
 * 4. After shmem_fence(), put 1 into rflag[_INDEX] on each of those
 *    processors.
 * 5. Copy this processor's own share locally and mark pd->count as -1.
 *
 * dst and src are not used by this routine.
 */
void _PERM_DR(const _permmap* const pm, _permdata* const pd, const int scatter, _array_fnc dst, _array_fnc src)
{
  const int one = 1;
  const int eltsize = pd->eltsize;
  int i;
  int * const restrict ldecnt = (int*)_zmalloc(_PROCESSORS*sizeof(int), "perm dr lcnt");
  int * const restrict rdecnt = (int*)_zmalloc(_PROCESSORS*sizeof(int), "perm dr rcnt");
  char * const restrict * const restrict ldata = pd->ldata;
  char * const restrict * const restrict rdata = pd->rdata;
  char * restrict * const restrict rptr = pm->rptr;
  int * restrict rflag = pm->rflag;
  char* addr;

#ifdef _SHMEM_PERMUTE_DEBUG
  printf("DR start %d\n", _INDEX);
  fflush(stdout);
  sleep(5);
#endif

  for (i=0; i<_PROCESSORS; i++) {
    ldecnt[i] = (scatter ? _PERM_LCNT(pm, i) : _PERM_RCNT(pm, i)) * eltsize;
    rdecnt[i] = (scatter ? _PERM_RCNT(pm, i) : _PERM_LCNT(pm, i)) * eltsize;
  }

  /* Walk the other processors upwards, wrapping around.  The step uses
   * i + 1 rather than i++: "i = cond ? 0 : i++" modifies i twice without a
   * sequence point, which is undefined and typically never advances. */
  for (i = (_INDEX == _PROCESSORS - 1) ? 0 : _INDEX+1;
       i != _INDEX;
       i = (i == _PROCESSORS - 1) ? 0 : i + 1) {
    if (rdecnt[i] > 0) {
#ifdef _SHMEM_PERMUTE_DEBUG
      printf("DR %d sending addr, %d, to %d\n", _INDEX, (int)&(rdata[i]), i);
      fflush(stdout);
      sleep(5);
#endif
      rflag[i] = 0;
      shmem_put((void*)&(rptr[_INDEX]), (void*)&(rdata[i]), 1, i);
    }
  }

  /* Walk the other processors downwards, wrapping around (i - 1, not i--,
   * for the same reason as above). */
  for (i = (_INDEX == 0) ? _PROCESSORS-1 : _INDEX-1;
       i != _INDEX;
       i = (i == 0) ? _PROCESSORS-1 : i - 1) {
    if (ldecnt[i] > 0) {
      /* block until processor i has published where its data should go */
      shmem_wait((long*)&(rptr[i]), 0);
      addr = rptr[i];
      rptr[i] = 0;
#ifdef _SHMEM_PERMUTE_DEBUG
      printf("DR %d waiting addr, %d, from %d\n", _INDEX, (int)addr, i);
      fflush(stdout);
      sleep(5);
#endif
      shmem_putmem(addr, ldata[i], ldecnt[i], i);
    }
  }

  /* order the data puts above before the flag puts below */
  shmem_fence();

  for (i = (_INDEX == 0) ? _PROCESSORS-1 : _INDEX-1;
       i != _INDEX;
       i = (i == 0) ? _PROCESSORS-1 : i - 1) {
    if (ldecnt[i] > 0) {
#ifdef _SHMEM_PERMUTE_DEBUG
      printf("DR %d sending flag to %d\n", _INDEX, i);
      fflush(stdout);
      sleep(5);
#endif
      shmem_int_put(&(rflag[_INDEX]), &one, 1, i);
    }
  }

  /* this processor's own share needs no communication */
  if (ldecnt[_INDEX] > 0) {
    memcpy(rdata[_INDEX], ldata[_INDEX], ldecnt[_INDEX]);
  }

  _zfree(ldecnt, "perm dr lcnt");
  _zfree(rdecnt, "perm dr rcnt");
  pd->count = -1;
}
void communicateSingleAtomData(LSMSCommunication &comm, int from, int to, int &local_id, AtomData &atom, int tag) { //The buffers used in this func are pre-allocated within initializeCommunication() of size 's' below //int s=sizeof(AtomData)+sizeof(Real)*(2*3*MAXPTS+2*MAXCORE)+sizeof(int)*3*2*MAXCORE+sizeof(int); // 304 bytes transferred in each of the ITER_MAX iterations const int maxPts=MAXPTS; const int maxCore=MAXCORE; int t,i; static int count=0; const int ITER_MAX=1; int sec_id; if(comm.comm.rank==from) { for (i=0;i<ITER_MAX;i++){ int pos=0; memcpy(&p2p_buf[pos],&local_id,int_size); pos+=int_size; memcpy(&p2p_buf[pos],&atom.jmt,int_size); pos+=int_size; memcpy(&p2p_buf[pos],&atom.jws,int_size); pos+=int_size; memcpy(&p2p_buf[pos],&atom.xstart,double_size); pos+=double_size; memcpy(&p2p_buf[pos],&atom.rmt,double_size); pos+=double_size; memcpy(&p2p_buf[pos],atom.header,80*char_size); pos+=80*char_size; memcpy(&p2p_buf[pos],&atom.alat,double_size); pos+=double_size; memcpy(&p2p_buf[pos],&atom.efermi,double_size); pos+=double_size; memcpy(&p2p_buf[pos],&atom.vdif,double_size); pos+=double_size; memcpy(&p2p_buf[pos],&atom.ztotss,double_size); pos+=double_size; memcpy(&p2p_buf[pos],&atom.zcorss,double_size); pos+=double_size; memcpy(&p2p_buf[pos],atom.evec,3*double_size); pos+=3*double_size; memcpy(&p2p_buf[pos],&atom.nspin,int_size); pos+=int_size; memcpy(&p2p_buf[pos],&atom.numc,int_size); pos+=int_size; t=atom.vr.n_row(); memcpy(&p2p_buf[pos],&t,int_size); pos+=int_size; memcpy(&p2p_buf[pos],&atom.vr(0,0),2*t*double_size); pos+=2*t*double_size; memcpy(&p2p_buf[pos],&atom.rhotot(0,0),2*t*double_size); pos+=2*t*double_size; memcpy(&p2p_buf[pos],&atom.corden(0,0),2*t*double_size); pos+=2*t*double_size; t=atom.ec.n_row(); memcpy(&p2p_buf[pos],&t,int_size); pos+=int_size; memcpy(&p2p_buf[pos],&atom.ec(0,0),2*t*double_size); pos+=2*t*double_size; memcpy(&p2p_buf[pos],&atom.nc(0,0),2*t*int_size); pos+=2*t*int_size; memcpy(&p2p_buf[pos],&atom.lc(0,0),2*t*int_size); 
pos+=2*t*int_size; memcpy(&p2p_buf[pos],&atom.kc(0,0),2*t*int_size); pos+=2*t*int_size; shmem_int_wait_until((sync_send_flag+to),_SHMEM_CMP_EQ,1); shmem_putmem(p2p_buf, p2p_buf, 1048576, to); shmem_int_add((sync_send_flag+to),-1,comm.comm.rank); shmem_int_add((sync_recv_flag+comm.comm.rank),1,to); shmem_quiet(); }// end of false for loop } if(comm.comm.rank==to) { for(i=0;i<ITER_MAX;i++) { int pos=0; sync_recv_flag[from]=0; shmem_int_add((sync_send_flag+comm.comm.rank),1,from); shmem_quiet(); shmem_int_wait_until((sync_recv_flag+from),_SHMEM_CMP_EQ,1); memcpy(&local_id,&p2p_buf[pos],int_size); pos+=int_size; memcpy(&atom.jmt,&p2p_buf[pos],int_size); pos+=int_size; memcpy(&atom.jws,&p2p_buf[pos],int_size); pos+=int_size; memcpy(&atom.xstart,&p2p_buf[pos],double_size); pos+=double_size; memcpy(&atom.rmt,&p2p_buf[pos],double_size); pos+=double_size; memcpy(atom.header,&p2p_buf[pos],80*char_size); pos+=80*char_size; memcpy(&atom.alat,&p2p_buf[pos],double_size); pos+=double_size; memcpy(&atom.efermi,&p2p_buf[pos],double_size); pos+=double_size; memcpy(&atom.vdif,&p2p_buf[pos],double_size); pos+=double_size; memcpy(&atom.ztotss,&p2p_buf[pos],double_size); pos+=double_size; memcpy(&atom.zcorss,&p2p_buf[pos],double_size); pos+=double_size; memcpy(atom.evec,&p2p_buf[pos],3*double_size); pos+=3*double_size; memcpy(&atom.nspin,&p2p_buf[pos],int_size); pos+=int_size; memcpy(&atom.numc,&p2p_buf[pos],int_size); pos+=int_size; memcpy(&t,&p2p_buf[pos],int_size); pos+=int_size; if(t!=atom.vr.n_row()) atom.resizePotential(t); memcpy(&atom.vr(0,0),&p2p_buf[pos],2*t*double_size); pos+=2*t*double_size; memcpy(&atom.rhotot(0,0),&p2p_buf[pos],2*t*double_size); pos+=2*t*double_size; memcpy(&atom.corden(0,0),&p2p_buf[pos],2*t*double_size); pos+=2*t*double_size; memcpy(&t,&p2p_buf[pos],int_size); pos+=int_size; if(t!=atom.nc.n_row()) atom.resizeCore(t); memcpy(&atom.ec(0,0),&p2p_buf[pos],2*t*double_size); pos+=2*t*double_size; memcpy(&atom.nc(0,0),&p2p_buf[pos],2*t*int_size); 
pos+=2*t*int_size; memcpy(&atom.lc(0,0),&p2p_buf[pos],2*t*int_size); pos+=2*t*int_size; memcpy(&atom.kc(0,0),&p2p_buf[pos],2*t*int_size); pos+=2*t*int_size; shmem_int_add((sync_recv_flag+from),-1,comm.comm.rank); shmem_quiet(); } } }
int main(int argc, char **argv) { int j; int my_pe,n_pes; int *flag,*one; size_t max_elements,max_elements_bytes; char *srce_char,*targ_char; short *srce_short,*targ_short; int *srce_int,*targ_int; long *srce_long,*targ_long; start_pes(0); my_pe = shmem_my_pe(); n_pes = shmem_n_pes(); flag = shmalloc((size_t) sizeof(int)); one = shmalloc((size_t) sizeof(int)); *one = 1; /* fail if trying to use odd number of processors */ if ( (n_pes % 2) != 0 ){ fprintf(stderr, "FAIL - test requires even number of PEs\n"); exit(1); } if(my_pe == 0) fprintf(stderr, "shmem_num_put(%s)\n", argv[0]); /* shmem_putmem test */ *flag = 0; max_elements = (size_t) (MAX_SIZE / sizeof(char)); max_elements_bytes = (size_t) (sizeof(char)*max_elements); if(my_pe == 0) fprintf(stderr,"shmem_putmem max_elements = %d\n",max_elements); srce_char = shmalloc(max_elements_bytes); targ_char = shmalloc(max_elements_bytes); if((srce_char == NULL) || (targ_char == NULL)) shmalloc_error(); if ( (my_pe % 2) == 0 ) for(j = 0; j < max_elements; j++) srce_char[j] = (char)(my_pe+j); else for(j = 0; j < max_elements; j++) targ_char[j] = (char)(my_pe+j); shmem_barrier_all(); if ( (my_pe % 2) == 0 ) { shmem_putmem(targ_char,srce_char,max_elements,my_pe+1); shmem_quiet(); shmem_int_put(flag,one,(size_t)1,my_pe+1); } else { shmem_int_wait(flag,0); for(j = 0; j < max_elements; j++) if ( targ_char[j] != (char)(my_pe+j-1) ) fprintf(stderr, "FAIL: PE [%d] targ_char[%d]=%d my_pe+j-1=%d\n", my_pe,j,targ_char[j],my_pe+j-1); } shfree(srce_char); shfree(targ_char); /* shmem_put16 test */ *flag = 0; max_elements = (size_t) (MAX_SIZE / sizeof(short)); if(max_elements > 20000) max_elements=20000; max_elements_bytes = (size_t) (sizeof(short)*max_elements); if(my_pe == 0) fprintf(stderr,"shmem_put16 max_elements = %d\n",max_elements); srce_short = shmalloc(max_elements_bytes); targ_short = shmalloc(max_elements_bytes); if((srce_short == NULL) || (targ_short == NULL)) shmalloc_error(); if ( (my_pe % 2) == 0 ) for(j = 0; j < 
max_elements; j++) srce_short[j] = (short)(my_pe+j); else for(j = 0; j < max_elements; j++) targ_short[j] = (short)(my_pe+j); shmem_barrier_all(); if ( (my_pe % 2) == 0 ) { shmem_put16(targ_short,srce_short,max_elements,my_pe+1); shmem_quiet(); shmem_int_put(flag,one,(size_t)1,my_pe+1); } else { shmem_int_wait(flag,0); for(j = 0; j < max_elements; j++) if ( targ_short[j] != (short)(my_pe+j-1) ) fprintf(stderr, "FAIL: PE [%d] targ_short[%d]=%d my_pe+j-1=%d\n", my_pe,j,targ_short[j],my_pe+j-1); } shfree(srce_short); shfree(targ_short); /* shmem_put32 test */ *flag = 0; max_elements = (size_t) (MAX_SIZE / sizeof(int)); max_elements_bytes = (size_t) (sizeof(int)*max_elements); if(my_pe == 0) fprintf(stderr,"shmem_put32 max_elements = %d\n",max_elements); srce_int = shmalloc(max_elements_bytes); targ_int = shmalloc(max_elements_bytes); if((srce_int == NULL) || (targ_int == NULL)) shmalloc_error(); if ( (my_pe % 2) == 0 ) for(j = 0; j < max_elements; j++) srce_int[j] = (int)(my_pe+j); else for(j = 0; j < max_elements; j++) targ_int[j] = (int)(my_pe+j); shmem_barrier_all(); if ( (my_pe % 2) == 0 ) { shmem_put32(targ_int,srce_int,max_elements,my_pe+1); shmem_quiet(); shmem_int_put(flag,one,(size_t)1,my_pe+1); } else { shmem_int_wait(flag,0); for(j = 0; j < max_elements; j++) if ( targ_int[j] != (int)(my_pe+j-1) ) fprintf(stderr, "FAIL: PE [%d] targ_int[%d]=%d my_pe+j-1=%d\n", my_pe,j,targ_int[j],my_pe+j-1); } shfree(srce_int); shfree(targ_int); /* shmem_put64 test */ *flag = 0; max_elements = (size_t) (MAX_SIZE / sizeof(long)); max_elements_bytes = (size_t) (sizeof(long)*max_elements); if(my_pe == 0) fprintf(stderr,"shmem_put64 max_elements = %d\n",max_elements); srce_long = shmalloc(max_elements_bytes); targ_long = shmalloc(max_elements_bytes); if((srce_long == NULL) || (targ_long == NULL)) shmalloc_error(); if ( (my_pe % 2) == 0 ) for(j = 0; j < max_elements; j++) srce_long[j] = (long)(my_pe+j); else for(j = 0; j < max_elements; j++) targ_long[j] = (long)(my_pe+j); 
shmem_barrier_all(); if ( (my_pe % 2) == 0 ) { shmem_put64(targ_long,srce_long,max_elements,my_pe+1); shmem_quiet(); shmem_int_put(flag,one,(size_t)1,my_pe+1); } else { shmem_int_wait(flag,0); for(j = 0; j < max_elements; j++) if ( targ_long[j] != (long)(my_pe+j-1) ) fprintf(stderr, "FAIL: PE [%d] targ_long[%d]=%d my_pe+j-1=%d\n", my_pe,j,targ_long[j],my_pe+j-1); } shfree(srce_long); shfree(targ_long); /* shmem_put128 test */ *flag = 0; max_elements = (size_t) (MAX_SIZE / sizeof(long)); if ( (max_elements % 2) != 0) max_elements = max_elements-1; max_elements_bytes = (size_t) (sizeof(long)*max_elements); max_elements = max_elements/2; if(my_pe == 0) fprintf(stderr,"shmem_put128 max_elements = %d\n",max_elements); srce_long = shmalloc(max_elements_bytes); targ_long = shmalloc(max_elements_bytes); if((srce_long == NULL) || (targ_long == NULL)) shmalloc_error(); if ( (my_pe % 2) == 0 ) for(j = 0; j < 2*max_elements; j++) srce_long[j] = (long)(my_pe+j); else for(j = 0; j < 2*max_elements; j++) targ_long[j] = (long)(my_pe+j); shmem_barrier_all(); if ( (my_pe % 2) == 0 ) { shmem_put128(targ_long,srce_long,max_elements,my_pe+1); shmem_quiet(); shmem_int_put(flag,one,(size_t)1,my_pe+1); } else { shmem_int_wait(flag,0); for(j = 0; j < 2*max_elements; j++) if ( targ_long[j] != (long)(my_pe+j-1) ) fprintf(stderr, "FAIL: PE [%d] targ_long[%d]=%d my_pe+j-1=%d\n", my_pe,j,targ_long[j],my_pe+j-1); } shfree(srce_long); shfree(targ_long); #ifdef SHMEM_C_GENERIC_32 /* shmem_put (GENERIC 32) test */ *flag = 0; max_elements = (size_t) (MAX_SIZE / sizeof(int)); max_elements_bytes = (size_t) (sizeof(int)*max_elements); if(my_pe == 0) fprintf(stderr,"shmem_put (GENERIC 32) max_elements = %d\n",max_elements); srce_int = shmalloc(max_elements_bytes); targ_int = shmalloc(max_elements_bytes); if((srce_int == NULL) || (targ_int == NULL)) shmalloc_error(); if ( (my_pe % 2) == 0 ) for(j = 0; j < max_elements; j++) srce_int[j] = (int)(my_pe+j); else for(j = 0; j < max_elements; j++) targ_int[j] 
= (int)(my_pe+j); shmem_barrier_all(); if ( (my_pe % 2) == 0 ) { shmem_put(targ_int,srce_int,max_elements,my_pe+1); shmem_quiet(); shmem_int_put(flag,one,(size_t)1,my_pe+1); } else { shmem_int_wait(flag,0); for(j = 0; j < max_elements; j++) if ( targ_int[j] != (int)(my_pe+j-1) ) fprintf(stderr, "FAIL: PE [%d] targ_int[%d]=%d my_pe+j-1=%d\n", my_pe,j,targ_int[j],my_pe+j-1); } shfree(srce_int); shfree(targ_int); #else /* shmem_put (GENERIC 64) test */ *flag = 0; max_elements = (size_t) (MAX_SIZE / sizeof(long)); max_elements_bytes = (size_t) (sizeof(long)*max_elements); if(my_pe == 0) fprintf(stderr,"shmem_put (GENERIC 64) max_elements = %d\n",max_elements); srce_long = shmalloc(max_elements_bytes); targ_long = shmalloc(max_elements_bytes); if((srce_long == NULL) || (targ_long == NULL)) shmalloc_error(); if ( (my_pe % 2) == 0 ) for(j = 0; j < max_elements; j++) srce_long[j] = (long)(my_pe+j); else for(j = 0; j < max_elements; j++) targ_long[j] = (long)(my_pe+j); shmem_barrier_all(); if ( (my_pe % 2) == 0 ) { shmem_put(targ_long,srce_long,max_elements,my_pe+1); shmem_quiet(); shmem_int_put(flag,one,(size_t)1,my_pe+1); } else { shmem_int_wait(flag,0); for(j = 0; j < max_elements; j++) if ( targ_long[j] != (long)(my_pe+j-1) ) fprintf(stderr, "FAIL: PE [%d] targ_long[%d]=%d my_pe+j-1=%d\n", my_pe,j,targ_long[j],my_pe+j-1); } shfree(srce_long); shfree(targ_long); #endif #ifdef NEEDS_FINALIZE shmem_finalize(); #endif return 0; }
/*
 * By-reference wrapper around shmem_putmem(): the byte count and the
 * destination PE arrive as pointers to int and are dereferenced here before
 * being forwarded.  The exported symbol name is whatever the FORTRANIFY macro
 * (defined elsewhere) produces -- presumably a Fortran-callable name; confirm
 * against the macro definition.
 *
 *   target  destination address handed straight to shmem_putmem
 *   source  local source address handed straight to shmem_putmem
 *   size    pointer to the number of bytes to transfer
 *   pe      pointer to the destination PE number
 */
void
FORTRANIFY (shmem_putmem) (void *target, const void *source, int *size, int *pe)
{
  const int nbytes = *size;
  const int dest_pe = *pe;

  shmem_putmem (target, source, nbytes, dest_pe);
}
int main(int argc, char ** argv) { int Num_procs; /* number of ranks */ int Num_procsx, Num_procsy; /* number of ranks in each coord direction */ int my_ID; /* SHMEM rank */ int my_IDx, my_IDy; /* coordinates of rank in rank grid */ int right_nbr; /* global rank of right neighboring tile */ int left_nbr; /* global rank of left neighboring tile */ int top_nbr; /* global rank of top neighboring tile */ int bottom_nbr; /* global rank of bottom neighboring tile */ DTYPE *top_buf_out; /* communication buffer */ DTYPE *top_buf_in[2]; /* " " */ DTYPE *bottom_buf_out; /* " " */ DTYPE *bottom_buf_in[2];/* " " */ DTYPE *right_buf_out; /* " " */ DTYPE *right_buf_in[2]; /* " " */ DTYPE *left_buf_out; /* " " */ DTYPE *left_buf_in[2]; /* " " */ int root = 0; int n, width, height;/* linear global and local grid dimension */ int i, j, ii, jj, kk, it, jt, iter, leftover; /* dummies */ int istart, iend; /* bounds of grid tile assigned to calling rank */ int jstart, jend; /* bounds of grid tile assigned to calling rank */ DTYPE reference_norm; DTYPE f_active_points; /* interior of grid with respect to stencil */ int stencil_size; /* number of points in the stencil */ DTYPE flops; /* floating point ops per iteration */ int iterations; /* number of times to run the algorithm */ double avgtime, /* timing parameters */ *local_stencil_time, *stencil_time; DTYPE * RESTRICT in; /* input grid values */ DTYPE * RESTRICT out; /* output grid values */ long total_length_in; /* total required length to store input array */ long total_length_out;/* total required length to store output array */ int error=0; /* error flag */ DTYPE weight[2*RADIUS+1][2*RADIUS+1]; /* weights of points in the stencil */ int *arguments; /* command line parameters */ int count_case=4; /* number of neighbors of a rank */ long *pSync_bcast; /* work space for collectives */ long *pSync_reduce; /* work space for collectives */ double *pWrk_time; /* work space for collectives */ DTYPE *pWrk_norm; /* work space for 
collectives */ int *iterflag; /* synchronization flags */ int sw; /* double buffering switch */ DTYPE *local_norm, *norm; /* local and global error norms */ /******************************************************************************* ** Initialize the SHMEM environment ********************************************************************************/ prk_shmem_init(); my_ID=prk_shmem_my_pe(); Num_procs=prk_shmem_n_pes(); pSync_bcast = (long *) prk_shmem_malloc(PRK_SHMEM_BCAST_SYNC_SIZE*sizeof(long)); pSync_reduce = (long *) prk_shmem_malloc(PRK_SHMEM_REDUCE_SYNC_SIZE*sizeof(long)); pWrk_time = (double *) prk_shmem_malloc(PRK_SHMEM_REDUCE_MIN_WRKDATA_SIZE*sizeof(double)); pWrk_norm = (DTYPE *) prk_shmem_malloc(PRK_SHMEM_REDUCE_MIN_WRKDATA_SIZE*sizeof(DTYPE)); local_stencil_time = (double *) prk_shmem_malloc(sizeof(double)); stencil_time = (double *) prk_shmem_malloc(sizeof(double)); local_norm = (DTYPE *) prk_shmem_malloc(sizeof(DTYPE)); norm = (DTYPE *) prk_shmem_malloc(sizeof(DTYPE)); iterflag = (int *) prk_shmem_malloc(2*sizeof(int)); if (!(pSync_bcast && pSync_reduce && pWrk_time && pWrk_norm && iterflag && local_stencil_time && stencil_time && local_norm && norm)) { printf("Could not allocate scalar variables on rank %d\n", my_ID); error = 1; } bail_out(error); for(i=0;i<PRK_SHMEM_BCAST_SYNC_SIZE;i++) pSync_bcast[i]=PRK_SHMEM_SYNC_VALUE; for(i=0;i<PRK_SHMEM_REDUCE_SYNC_SIZE;i++) pSync_reduce[i]=PRK_SHMEM_SYNC_VALUE; arguments=(int*)prk_shmem_malloc(2*sizeof(int)); /******************************************************************************* ** process, test, and broadcast input parameters ********************************************************************************/ if (my_ID == root) { #ifndef STAR printf("ERROR: Compact stencil not supported\n"); error = 1; goto ENDOFTESTS; #endif if (argc != 3){ printf("Usage: %s <# iterations> <array dimension> \n", *argv); error = 1; goto ENDOFTESTS; } iterations = atoi(*++argv); arguments[0]=iterations; if 
(iterations < 1){ printf("ERROR: iterations must be >= 1 : %d \n",iterations); error = 1; goto ENDOFTESTS; } n = atoi(*++argv); arguments[1]=n; long nsquare = (long)n * (long)n; if (nsquare < Num_procs){ printf("ERROR: grid size must be at least # ranks: %ld\n", nsquare); error = 1; goto ENDOFTESTS; } if (RADIUS < 0) { printf("ERROR: Stencil radius %d should be non-negative\n", RADIUS); error = 1; goto ENDOFTESTS; } if (2*RADIUS +1 > n) { printf("ERROR: Stencil radius %d exceeds grid size %d\n", RADIUS, n); error = 1; goto ENDOFTESTS; } ENDOFTESTS:; } bail_out(error); /* determine best way to create a 2D grid of ranks (closest to square, for best surface/volume ratio); we do this brute force for now */ for (Num_procsx=(int) (sqrt(Num_procs+1)); Num_procsx>0; Num_procsx--) { if (!(Num_procs%Num_procsx)) { Num_procsy = Num_procs/Num_procsx; break; } } my_IDx = my_ID%Num_procsx; my_IDy = my_ID/Num_procsx; /* compute neighbors; don't worry about dropping off the edges of the grid */ right_nbr = my_ID+1; left_nbr = my_ID-1; top_nbr = my_ID+Num_procsx; bottom_nbr = my_ID-Num_procsx; iterflag[0] = iterflag[1] = 0; if(my_IDx==0) count_case--; if(my_IDx==Num_procsx-1) count_case--; if(my_IDy==0) count_case--; if(my_IDy==Num_procsy-1) count_case--; if (my_ID == root) { printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("SHMEM stencil execution on 2D grid\n"); printf("Number of ranks = %d\n", Num_procs); printf("Grid size = %d\n", n); printf("Radius of stencil = %d\n", RADIUS); printf("Tiles in x/y-direction = %d/%d\n", Num_procsx, Num_procsy); printf("Type of stencil = star\n"); #ifdef DOUBLE printf("Data type = double precision\n"); #else printf("Data type = single precision\n"); #endif #if LOOPGEN printf("Script used to expand stencil loop body\n"); #else printf("Compact representation of stencil loop body\n"); #endif #if SPLITFENCE printf("Split fence = ON\n"); #else printf("Split fence = OFF\n"); #endif printf("Number of iterations = %d\n", iterations); 
} shmem_barrier_all(); shmem_broadcast32(&arguments[0], &arguments[0], 2, root, 0, 0, Num_procs, pSync_bcast); iterations=arguments[0]; n=arguments[1]; shmem_barrier_all(); prk_shmem_free(arguments); /* compute amount of space required for input and solution arrays */ width = n/Num_procsx; leftover = n%Num_procsx; if (my_IDx<leftover) { istart = (width+1) * my_IDx; iend = istart + width + 1; } else { istart = (width+1) * leftover + width * (my_IDx-leftover); iend = istart + width; } width = iend - istart + 1; if (width == 0) { printf("ERROR: rank %d has no work to do\n", my_ID); error = 1; } bail_out(error); height = n/Num_procsy; leftover = n%Num_procsy; if (my_IDy<leftover) { jstart = (height+1) * my_IDy; jend = jstart + height + 1; } else { jstart = (height+1) * leftover + height * (my_IDy-leftover); jend = jstart + height; } height = jend - jstart + 1; if (height == 0) { printf("ERROR: rank %d has no work to do\n", my_ID); error = 1; } bail_out(error); if (width < RADIUS || height < RADIUS) { printf("ERROR: rank %d has work tile smaller then stencil radius\n", my_ID); error = 1; } bail_out(error); total_length_in = (width+2*RADIUS); total_length_in *= (height+2*RADIUS); total_length_in *= sizeof(DTYPE); total_length_out = width; total_length_out *= height; total_length_out *= sizeof(DTYPE); in = (DTYPE *) malloc(total_length_in); out = (DTYPE *) malloc(total_length_out); if (!in || !out) { printf("ERROR: rank %d could not allocate space for input/output array\n", my_ID); error = 1; } bail_out(error); /* fill the stencil weights to reflect a discrete divergence operator */ for (jj=-RADIUS; jj<=RADIUS; jj++) for (ii=-RADIUS; ii<=RADIUS; ii++) WEIGHT(ii,jj) = (DTYPE) 0.0; stencil_size = 4*RADIUS+1; for (ii=1; ii<=RADIUS; ii++) { WEIGHT(0, ii) = WEIGHT( ii,0) = (DTYPE) (1.0/(2.0*ii*RADIUS)); WEIGHT(0,-ii) = WEIGHT(-ii,0) = -(DTYPE) (1.0/(2.0*ii*RADIUS)); } norm[0] = (DTYPE) 0.0; f_active_points = (DTYPE) (n-2*RADIUS)*(DTYPE) (n-2*RADIUS); /* intialize the input and 
output arrays */ for (j=jstart; j<jend; j++) for (i=istart; i<iend; i++) { IN(i,j) = COEFX*i+COEFY*j; OUT(i,j) = (DTYPE)0.0; } /* allocate communication buffers for halo values */ top_buf_out=(DTYPE*)malloc(2*sizeof(DTYPE)*RADIUS*width); if (!top_buf_out) { printf("ERROR: Rank %d could not allocate output comm buffers for y-direction\n", my_ID); error = 1; } bail_out(error); bottom_buf_out = top_buf_out+RADIUS*width; top_buf_in[0]=(DTYPE*)prk_shmem_malloc(4*sizeof(DTYPE)*RADIUS*width); if(!top_buf_in) { printf("ERROR: Rank %d could not allocate input comm buffers for y-direction\n", my_ID); error=1; } bail_out(error); top_buf_in[1] = top_buf_in[0] + RADIUS*width; bottom_buf_in[0] = top_buf_in[1] + RADIUS*width; bottom_buf_in[1] = bottom_buf_in[0] + RADIUS*width; right_buf_out=(DTYPE*)malloc(2*sizeof(DTYPE)*RADIUS*height); if (!right_buf_out) { printf("ERROR: Rank %d could not allocate output comm buffers for x-direction\n", my_ID); error = 1; } bail_out(error); left_buf_out=right_buf_out+RADIUS*height; right_buf_in[0]=(DTYPE*)prk_shmem_malloc(4*sizeof(DTYPE)*RADIUS*height); if(!right_buf_in) { printf("ERROR: Rank %d could not allocate input comm buffers for x-dimension\n", my_ID); error=1; } bail_out(error); right_buf_in[1] = right_buf_in[0] + RADIUS*height; left_buf_in[0] = right_buf_in[1] + RADIUS*height; left_buf_in[1] = left_buf_in[0] + RADIUS*height; /* make sure all symmetric heaps are allocated before being used */ shmem_barrier_all(); for (iter = 0; iter<=iterations; iter++){ /* start timer after a warmup iteration */ if (iter == 1) { shmem_barrier_all(); local_stencil_time[0] = wtime(); } /* sw determines which incoming buffer to select */ sw = iter%2; /* need to fetch ghost point data from neighbors */ if (my_IDy < Num_procsy-1) { for (kk=0,j=jend-RADIUS; j<=jend-1; j++) for (i=istart; i<=iend; i++) { top_buf_out[kk++]= IN(i,j); } shmem_putmem(bottom_buf_in[sw], top_buf_out, RADIUS*width*sizeof(DTYPE), top_nbr); #if SPLITFENCE shmem_fence(); 
shmem_int_inc(&iterflag[sw], top_nbr); #endif } if (my_IDy > 0) { for (kk=0,j=jstart; j<=jstart+RADIUS-1; j++) for (i=istart; i<=iend; i++) { bottom_buf_out[kk++]= IN(i,j); } shmem_putmem(top_buf_in[sw], bottom_buf_out, RADIUS*width*sizeof(DTYPE), bottom_nbr); #if SPLITFENCE shmem_fence(); shmem_int_inc(&iterflag[sw], bottom_nbr); #endif } if(my_IDx < Num_procsx-1) { for(kk=0,j=jstart;j<=jend;j++) for(i=iend-RADIUS;i<=iend-1;i++) { right_buf_out[kk++]=IN(i,j); } shmem_putmem(left_buf_in[sw], right_buf_out, RADIUS*height*sizeof(DTYPE), right_nbr); #if SPLITFENCE shmem_fence(); shmem_int_inc(&iterflag[sw], right_nbr); #endif } if(my_IDx>0) { for(kk=0,j=jstart;j<=jend;j++) for(i=istart;i<=istart+RADIUS-1;i++) { left_buf_out[kk++]=IN(i,j); } shmem_putmem(right_buf_in[sw], left_buf_out, RADIUS*height*sizeof(DTYPE), left_nbr); #if SPLITFENCE shmem_fence(); shmem_int_inc(&iterflag[sw], left_nbr); #endif } #if SPLITFENCE == 0 shmem_fence(); if(my_IDy<Num_procsy-1) shmem_int_inc(&iterflag[sw], top_nbr); if(my_IDy>0) shmem_int_inc(&iterflag[sw], bottom_nbr); if(my_IDx<Num_procsx-1) shmem_int_inc(&iterflag[sw], right_nbr); if(my_IDx>0) shmem_int_inc(&iterflag[sw], left_nbr); #endif shmem_int_wait_until(&iterflag[sw], SHMEM_CMP_EQ, count_case*(iter/2+1)); if (my_IDy < Num_procsy-1) { for (kk=0,j=jend; j<=jend+RADIUS-1; j++) for (i=istart; i<=iend; i++) { IN(i,j) = top_buf_in[sw][kk++]; } } if (my_IDy > 0) { for (kk=0,j=jstart-RADIUS; j<=jstart-1; j++) for (i=istart; i<=iend; i++) { IN(i,j) = bottom_buf_in[sw][kk++]; } } if (my_IDx < Num_procsx-1) { for (kk=0,j=jstart; j<=jend; j++) for (i=iend; i<=iend+RADIUS-1; i++) { IN(i,j) = right_buf_in[sw][kk++]; } } if (my_IDx > 0) { for (kk=0,j=jstart; j<=jend; j++) for (i=istart-RADIUS; i<=istart-1; i++) { IN(i,j) = left_buf_in[sw][kk++]; } } /* Apply the stencil operator */ for (j=MAX(jstart,RADIUS); j<=MIN(n-RADIUS-1,jend); j++) { for (i=MAX(istart,RADIUS); i<=MIN(n-RADIUS-1,iend); i++) { #if LOOPGEN #include "loop_body_star.incl" 
#else for (jj=-RADIUS; jj<=RADIUS; jj++) OUT(i,j) += WEIGHT(0,jj)*IN(i,j+jj); for (ii=-RADIUS; ii<0; ii++) OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j); for (ii=1; ii<=RADIUS; ii++) OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j); #endif } } /* add constant to solution to force refresh of neighbor data, if any */ for (j=jstart; j<jend; j++) for (i=istart; i<iend; i++) IN(i,j)+= 1.0; } local_stencil_time[0] = wtime() - local_stencil_time[0]; shmem_barrier_all(); shmem_double_max_to_all(&stencil_time[0], &local_stencil_time[0], 1, 0, 0, Num_procs, pWrk_time, pSync_reduce); /* compute L1 norm in parallel */ local_norm[0] = (DTYPE) 0.0; for (j=MAX(jstart,RADIUS); j<MIN(n-RADIUS,jend); j++) { for (i=MAX(istart,RADIUS); i<MIN(n-RADIUS,iend); i++) { local_norm[0] += (DTYPE)ABS(OUT(i,j)); } } shmem_barrier_all(); #ifdef DOUBLE shmem_double_sum_to_all(&norm[0], &local_norm[0], 1, 0, 0, Num_procs, pWrk_norm, pSync_reduce); #else shmem_float_sum_to_all(&norm[0], &local_norm[0], 1, 0, 0, Num_procs, pWrk_norm, pSync_reduce); #endif /******************************************************************************* ** Analyze and output results. 
********************************************************************************/ /* verify correctness */ if (my_ID == root) { norm[0] /= f_active_points; if (RADIUS > 0) { reference_norm = (DTYPE) (iterations+1) * (COEFX + COEFY); } else { reference_norm = (DTYPE) 0.0; } if (ABS(norm[0]-reference_norm) > EPSILON) { printf("ERROR: L1 norm = "FSTR", Reference L1 norm = "FSTR"\n", norm[0], reference_norm); error = 1; } else { printf("Solution validates\n"); #ifdef VERBOSE printf("Reference L1 norm = "FSTR", L1 norm = "FSTR"\n", reference_norm, norm[0]); #endif } } bail_out(error); if (my_ID == root) { /* flops/stencil: 2 flops (fma) for each point in the stencil, plus one flop for the update of the input of the array */ flops = (DTYPE) (2*stencil_size+1) * f_active_points; avgtime = stencil_time[0]/iterations; printf("Rate (MFlops/s): "FSTR" Avg time (s): %lf\n", 1.0E-06 * flops/avgtime, avgtime); } prk_shmem_free(top_buf_in); prk_shmem_free(right_buf_in); free(top_buf_out); free(right_buf_out); prk_shmem_free(pSync_bcast); prk_shmem_free(pSync_reduce); prk_shmem_free(pWrk_time); prk_shmem_free(pWrk_norm); prk_shmem_finalize(); exit(EXIT_SUCCESS); }
/*
 * Put `len` bytes from `source` to `target` on PE `pe`, then add 1 to the
 * counter `ct` on that same PE.
 *
 * The three steps are issued strictly in this order:
 *   1. shmem_putmem()           -- transfer the payload
 *   2. oshmpi_remote_sync_pe()  -- defined elsewhere; presumably waits for the
 *                                  put to be complete at `pe` (confirm)
 *   3. shmem_long_add()         -- increment the counter at `pe`
 */
void
shmemx_putmem_ct(shmemx_ct_t ct, void *target, const void *source, size_t len, int pe)
{
    const long ct_increment = 1;

    shmem_putmem(target, source, len, pe);

    oshmpi_remote_sync_pe(pe);

    shmem_long_add(ct, ct_increment, pe);
}