int main(int argc, char* argv[]) { int c, j, loops, k, l; int my_pe, nProcs, nWorkers; int nWords=1; int failures=0; char *prog_name; long *wp,work_sz; for(j=0; j < SHMEM_BARRIER_SYNC_SIZE; j++) { pSync0[j] = pSync1[j] = pSync2[j] = pSync3[j] = pSync4[j] = SHMEM_SYNC_VALUE; } shmem_init(); my_pe = shmem_my_pe(); nProcs = shmem_n_pes(); nWorkers = nProcs - 1; if (nProcs == 1) { Rfprintf(stderr, "ERR - Requires > 1 PEs\n"); shmem_finalize(); return 0; } for(j=0; j < nProcs; j++) if ( shmem_pe_accessible(j) != 1 ) { fprintf(stderr, "ERR - pe %d not accessible from pe %d\n", j, my_pe); } prog_name = strrchr(argv[0],'/'); if ( prog_name ) prog_name++; else prog_name = argv[0]; while((c=getopt(argc,argv,"hvM:s")) != -1) { switch(c) { case 's': Slow++; break; case 'v': Verbose++; break; case 'M': output_mod = atoi(optarg); if (output_mod <= 0) { Rfprintf(stderr, "ERR - output modulo arg out of " "bounds '%d'?\n", output_mod); shmem_finalize(); return 1; } Rfprintf(stderr,"%s: output modulo %d\n", prog_name,output_mod); break; case 'h': Rfprintf(stderr, "usage: %s {nWords-2-put(%d)K/M} {Loop-count(%d)K/M}\n", prog_name, DFLT_NWORDS, DFLT_LOOPS); shmem_finalize(); return 1; default: shmem_finalize(); return 1; } } if (optind == argc) nWords = DFLT_NWORDS; else { nWords = atoi_scaled(argv[optind++]); if (nWords <= 0) { Rfprintf(stderr, "ERR - Bad nWords arg '%d'?\n", nWords); shmem_finalize(); return 1; } } if (optind == argc) loops = DFLT_LOOPS; else { loops = atoi_scaled(argv[optind++]); if (loops <= 0 || loops > 1000000) { Rfprintf(stderr, "ERR - loops arg out of bounds '%d'?\n", loops); shmem_finalize(); return 1; } } work_sz = (nProcs*nWords) * sizeof(long); work = shmem_malloc( work_sz ); if ( !work ) { fprintf(stderr,"[%d] ERR - work = shmem_malloc(%ld) ?\n",my_pe,work_sz); shmem_global_exit(1); } Target = shmem_malloc( 2 * nWords * sizeof(long) ); if ( !Target ) { fprintf(stderr,"[%d] ERR - Target = shmem_malloc(%ld) ?\n", my_pe, (nWords * sizeof(long))); 
shmem_global_exit(1); } src = &Target[nWords]; #if _DEBUG Rprintf("%s: %d loops of %d longs per put\n",prog_name,loops,nWords); #endif for(j=0; j < nWords; j++) src[j] = VAL; for(j=0; j < loops; j++) { #if _DEBUG if ( Verbose && (j==0 || (j % output_mod) == 0) ) fprintf(stderr,"[%d] +(%d)\n", my_pe,j); #endif shmem_barrier(0, 0, nProcs, pSync0); if ( my_pe == 0 ) { int p; for(p=1; p < nProcs; p++) shmem_long_put(Target, src, nWords, p); } else { if (Slow) { /* wait for each put to complete */ for(k=0; k < nWords; k++) shmem_wait(&Target[k],my_pe); } else { /* wait for last word to be written */ shmem_wait(&Target[nWords-1],my_pe); } } #if _DEBUG if ( Verbose && (j==0 || (j % output_mod) == 0) ) fprintf(stderr,"[%d] -(%d)\n", shmem_my_pe(),j); #endif shmem_barrier(0, 0, nProcs, pSync1); RDprintf("Workers[1 ... %d] verify Target data put by proc0\n", nWorkers); /* workers verify put data is expected */ if ( my_pe != 0 ) { for(k=0; k < nWords; k++) { if (Target[k] != VAL) { fprintf(stderr, "[%d] Target[%d] %#lx " "!= %#x?\n", my_pe,k,Target[k],VAL); failures++; } assert(Target[k] == VAL); Target[k] = my_pe; } } else /* clear results buffer, workers will put here */ memset(work, 0, work_sz); shmem_barrier(0, 0, nProcs, pSync2); RDprintf("Workers[1 ... %d] put Target data to PE0 work " "vector\n",nWorkers); if ( my_pe != 0 ) { /* push nWords of val my_pe back to PE zero */ shmem_long_put(&work[my_pe * nWords], Target, nWords, 0); } else { /* wait for procs 1 ... 
nProcs to complete put()s */ for(l=1; l < nProcs; l++) { wp = &work[ l*nWords ]; // procs nWords chunk #if 1 /* wait for last long to be written from each PE */ shmem_wait(&wp[nWords-1],0); #else for(k=0; k < nWords; k++) shmem_wait(&wp[k],0); #endif } } shmem_barrier(0, 0, nProcs, pSync3); if ( my_pe == 0 ) { RDprintf("Loop(%d) PE0 verifing work data.\n",j); for(l=1; l < nProcs; l++) { wp = &work[ l*nWords ]; // procs nWords chunk for(k=0; k < nWords; k++) { if (wp[k] != l) { fprintf(stderr, "[0] PE(%d)_work[%d] %ld " "!= %d?\n", l,k,work[k],l); failures++; } assert(wp[k] == l); break; } if (failures) break; } } shmem_barrier(0, 0, nProcs, pSync4); #if _DEBUG if (loops > 1) { Rfprintf(stderr,"."); RDprintf("Loop(%d) Pass.\n",j); } #endif } shmem_free( work ); shmem_free( Target ); #if _DEBUG Rfprintf(stderr,"\n");fflush(stderr); shmem_barrier_all(); RDprintf("%d(%d) Exit(%d)\n", my_pe, nProcs, failures); #endif shmem_finalize(); return failures; }
/*
 * Exchange the index lists of a permute map between processors.
 *
 * Steps, as performed below:
 *   1. lsize[i] is taken from lind[i][0] (0 when lind[i] is null) and put
 *      into rsize[_INDEX] on processor i.
 *   2. _PERM_CleanIndices() is called with the sizes and index arrays and
 *      yields lindbase / rindbase.
 *   3. For every processor i with rsize[i] > 0, the value of rind[i] is put
 *      into rptr[_INDEX] on processor i.
 *   4. For every processor i with lsize[i] > 0, wait until rptr[i] becomes
 *      non-zero, then put lsize[i] ints from lind[i] to that address on
 *      processor i.
 *   5. After a fence, put 1 into rflag[_INDEX] on each such processor i.
 *   6. Copy the local portion (lind[_INDEX] -> rind[_INDEX]) with memcpy.
 *
 * pm: permute map; its lind, rind, rptr and rflag members are read, and its
 *     lindbase / rindbase members are set before returning.
 */
void _PERM_IR(_permmap* const pm) {
  const int one = 1;
  int * rindbase;
  int * lindbase;
  int * const restrict lsize = (int *)shmalloc(_PROCESSORS * sizeof(int));
  int * const restrict rsize = (int *)shmalloc(_PROCESSORS * sizeof(int));
  int * restrict * const restrict lind = pm->lind;
  int * restrict * const restrict rind = pm->rind;
  char * restrict * const restrict rptr = pm->rptr;
  int * const restrict rflag = pm->rflag;
  int* addr;
  int i, j;

  for (i = 0; i < _PROCESSORS; i++) {
    lsize[i] = lind[i] ? lind[i][0] : 0;
    rsize[i] = 0;
  }
  /* every processor must have zeroed rsize before remote counts arrive */
  shmem_barrier_all();

  /*
   * Visit every other processor once, starting at our successor and
   * wrapping around.  The step is written as i + 1 (not i++): assigning the
   * result of i++ back to i modifies i twice without sequencing, which is
   * undefined behavior and may leave i unchanged.
   */
  for (i = (_INDEX == _PROCESSORS - 1) ? 0 : _INDEX+1;
       i != _INDEX;
       i = (i == _PROCESSORS - 1) ? 0 : i + 1) {
    if (lsize[i] > 0) {
#ifdef _SHMEM_PERMUTE_DEBUG
      printf("%d sending count to %d\n", _INDEX, i);
      fflush(stdout);
#endif
      shmem_int_put(&(rsize[_INDEX]), &(lsize[i]), 1, i);
    }
  }
  rsize[_INDEX] = lsize[_INDEX];
  shmem_barrier_all();

#ifdef _SHMEM_PERMUTE_DEBUG
  sleep(_PROCESSORS);
#endif

  _PERM_CleanIndices(lsize, rsize, lind, rind, &lindbase, &rindbase);

#ifdef _SHMEM_PERMUTE_DEBUG
  sleep(_INDEX);
  printf("FROM PROCESSOR %d\n", _INDEX);
  printf(" LSIZE = ");
  for (i = 0; i < _PROCESSORS; i++) {
    printf("%d ", lsize[i]);
  }
  printf("\n");
  printf(" RSIZE = ");
  for (i = 0; i < _PROCESSORS; i++) {
    printf("%d ", rsize[i]);
  }
  printf("\n");
  printf(" PROCMAP: size = %d, # elts = %d, encoded = %d :: ",
         pm->procmap[0], pm->procmap[1], pm->procmap[2]);
  for (j = 3; j < pm->procmap[0]; j++) {
    printf("%d ", pm->procmap[j]);
  }
  printf("\n");
  for (i = 0; i < _PROCESSORS; i++) {
    if (lind[i] != 0) {
      printf(" TO PROCESSOR %d: ", i);
      printf("size = %d, # elts = %d, encoded = %d :: ",
             lind[i][0], lind[i][1], lind[i][2]);
      for (j = 3; j < lind[i][0]; j++) {
        printf("%d ", lind[i][j]);
      }
      printf("\n");
    }
  }
  printf("\n");
  fflush(stdout);
  sleep(_PROCESSORS-_INDEX);
#endif

  /* tell each processor that will send to us where to deposit its indices */
  for (i = (_INDEX == _PROCESSORS - 1) ? 0 : _INDEX+1;
       i != _INDEX;
       i = (i == _PROCESSORS - 1) ? 0 : i + 1) {
    if (rsize[i] > 0) {
#ifdef _SHMEM_PERMUTE_DEBUG
      printf("%d sending rind address to %d\n", _INDEX, i);
      fflush(stdout);
#endif
      /* NOTE(review): _PERM_DR clears rflag[i] at this point, while this
         clears rflag[_INDEX]; confirm which slot the receiver waits on. */
      rflag[_INDEX] = 0;
      shmem_put((void*)&(rptr[_INDEX]), (void*)&(rind[i]), 1, i);
    }
  }

#ifdef _SHMEM_PERMUTE_DEBUG
  sleep(_PROCESSORS);
#endif

  /* walk the ring in the opposite direction; i - 1 avoids the same
     unsequenced-modification problem as above */
  for (i = (_INDEX == 0) ? _PROCESSORS-1 : _INDEX-1;
       i != _INDEX;
       i = (i == 0) ? _PROCESSORS-1 : i - 1) {
    if (lsize[i] > 0) {
#ifdef _SHMEM_PERMUTE_DEBUG
      printf("%d waiting for rind address from %d, sending lind\n", _INDEX, i);
      fflush(stdout);
#endif
      /* block until processor i has published its destination address */
      shmem_wait((long*)&(rptr[i]), 0);
      addr = (int*)rptr[i];
      rptr[i] = 0;   /* reset the slot so a later wait on it blocks again */
      shmem_int_put(addr, lind[i], lsize[i], i);
    }
  }

#ifdef _SHMEM_PERMUTE_DEBUG
  sleep(_PROCESSORS);
#endif

  /* order the index puts ahead of the flag puts that follow */
  shmem_fence();

  for (i = (_INDEX == 0) ? _PROCESSORS-1 : _INDEX-1;
       i != _INDEX;
       i = (i == 0) ? _PROCESSORS-1 : i - 1) {
    if (lsize[i] > 0) {
#ifdef _SHMEM_PERMUTE_DEBUG
      printf("IR %d sending one to %d\n", _INDEX, i);
      fflush(stdout);
#endif
      shmem_int_put(&(rflag[_INDEX]), &one, 1, i);
    }
  }

  /* the local-to-local portion needs no communication */
  if (lsize[_INDEX] > 0) {
    memcpy(rind[_INDEX], lind[_INDEX], lsize[_INDEX]*sizeof(int));
  }

  pm->lindbase = lindbase;
  pm->rindbase = rindbase;
  shfree(lsize);
  shfree(rsize);
}
int main (int argc, char *argv[]) { double t,tv[2]; int reps = DFLT_REPS; int doprint = 1/*0*/; char *progName; int minWords; int maxWords; int incWords, nwords, nproc, proc, peer, c, r, i; long *rbuf; /* remote buffer - sink */ long *tbuf; /* transmit buffer - src */ start_pes(0); proc = _my_pe(); nproc = _num_pes(); if (nproc == 1) { fprintf(stderr, "ERR - Requires > 1 Processing Elements\n"); return 1; } for (progName = argv[0] + strlen(argv[0]); progName > argv[0] && *(progName - 1) != '/'; progName--) ; while ((c = getopt (argc, argv, "n:evh")) != -1) switch (c) { case 'n': if ((reps = getSize (optarg)) <= 0) usage (progName); break; case 'e': doprint++; break; case 'v': Verbose++; break; case 'h': help (progName); default: usage (progName); } if (optind == argc) minWords = DFLT_MIN_WORDS; else if ((minWords = getSize (argv[optind++])) <= 0) usage (progName); if (optind == argc) maxWords = minWords; else if ((maxWords = getSize (argv[optind++])) < minWords) usage (progName); if (optind == argc) incWords = 0; else if ((incWords = getSize (argv[optind++])) < 0) usage (progName); if (!(rbuf = (long *)shmalloc(maxWords * sizeof(long)))) { perror ("Failed memory allocation"); exit (1); } memset (rbuf, 0, maxWords * sizeof (long)); if (!(tbuf = (long *)shmalloc(maxWords * sizeof(long)))) { perror ("Failed memory allocation"); exit (1); } for (i = 0; i < maxWords; i++) tbuf[i] = 1000 + (i & 255); if (doprint) printf ("%d(%d): Shmem PING reps %d minWords %d maxWords %d " "incWords %d\n", proc, nproc, reps, minWords, maxWords, incWords); dprint("[%d] rbuf: %ld\n", proc, (unsigned long) rbuf); shmem_barrier_all(); peer = proc ^ 1; if (peer >= nproc) doprint = 0; for (nwords = minWords; nwords <= maxWords; nwords = incWords ? nwords + incWords : nwords ? 
2 * nwords : 1) { r = reps; shmem_barrier_all(); tv[0] = gettime(); if (peer < nproc) { if (proc & 1) { r--; shmem_wait(&rbuf[nwords-1], 0); rbuf[nwords-1] = 0; } while (r-- > 0) { shmem_long_put(rbuf, tbuf, nwords, peer); shmem_wait(&rbuf[nwords-1], 0); rbuf[nwords-1] = 0; } if (proc & 1) { shmem_long_put(rbuf, tbuf, nwords, peer); } } tv[1] = gettime(); t = dt (&tv[1], &tv[0]) / (2 * reps); shmem_barrier_all(); printStats (proc, peer, doprint, nwords, t); } shfree(rbuf); shfree(tbuf); shmem_barrier_all(); return 0; }
/*
 * Exchange permute data between processors.
 *
 * Steps, as performed below:
 *   1. Per-processor byte counts are computed from _PERM_LCNT / _PERM_RCNT
 *      times pd->eltsize; `scatter` selects which count is the send side
 *      (ldecnt) and which is the receive side (rdecnt).
 *   2. For every processor i with rdecnt[i] > 0, rflag[i] is cleared and the
 *      value of rdata[i] is put into rptr[_INDEX] on processor i.
 *   3. For every processor i with ldecnt[i] > 0, wait until rptr[i] becomes
 *      non-zero, then put ldecnt[i] bytes of ldata[i] to that address on
 *      processor i.
 *   4. After a fence, put 1 into rflag[_INDEX] on each such processor i.
 *   5. Copy the local portion (ldata[_INDEX] -> rdata[_INDEX]) with memcpy.
 *
 * pm:      permute map supplying rptr, rflag and the per-processor counts.
 * pd:      permute data; eltsize, ldata and rdata are read, count is set
 *          to -1 on return.
 * scatter: non-zero to send by the L counts and receive by the R counts;
 *          zero for the reverse.
 * dst/src: not used by this implementation.
 */
void _PERM_DR(const _permmap* const pm, _permdata* const pd, const int scatter, _array_fnc dst, _array_fnc src) {
  const int one = 1;
  const int eltsize = pd->eltsize;
  int i;
  int * const restrict ldecnt = (int*)_zmalloc(_PROCESSORS*sizeof(int), "perm dr lcnt");
  int * const restrict rdecnt = (int*)_zmalloc(_PROCESSORS*sizeof(int), "perm dr rcnt");
  char * const restrict * const restrict ldata = pd->ldata;
  char * const restrict * const restrict rdata = pd->rdata;
  char * restrict * const restrict rptr = pm->rptr;
  int * restrict rflag = pm->rflag;
  char* addr;

#ifdef _SHMEM_PERMUTE_DEBUG
  printf("DR start %d\n", _INDEX);
  fflush(stdout);
  sleep(5);
#endif

  for (i=0; i<_PROCESSORS; i++) {
    ldecnt[i] = (scatter ? _PERM_LCNT(pm, i) : _PERM_RCNT(pm, i)) * eltsize;
    rdecnt[i] = (scatter ? _PERM_RCNT(pm, i) : _PERM_LCNT(pm, i)) * eltsize;
  }

  /*
   * Visit every other processor once, starting at our successor and
   * wrapping around.  The step is written as i + 1 (not i++): assigning the
   * result of i++ back to i modifies i twice without sequencing, which is
   * undefined behavior and may leave i unchanged.
   */
  for (i = (_INDEX == _PROCESSORS - 1) ? 0 : _INDEX+1;
       i != _INDEX;
       i = (i == _PROCESSORS - 1) ? 0 : i + 1) {
    if (rdecnt[i] > 0) {
#ifdef _SHMEM_PERMUTE_DEBUG
      printf("DR %d sending addr, %d, to %d\n", _INDEX, (int)&(rdata[i]), i);
      fflush(stdout);
      sleep(5);
#endif
      /* clear the completion flag for i before telling i where to write */
      rflag[i] = 0;
      shmem_put((void*)&(rptr[_INDEX]), (void*)&(rdata[i]), 1, i);
    }
  }

  /* walk the ring in the opposite direction; i - 1 avoids the same
     unsequenced-modification problem as above */
  for (i = (_INDEX == 0) ? _PROCESSORS-1 : _INDEX-1;
       i != _INDEX;
       i = (i == 0) ? _PROCESSORS-1 : i - 1) {
    if (ldecnt[i] > 0) {
      /* block until processor i has published its destination address */
      shmem_wait((long*)&(rptr[i]), 0);
      addr = rptr[i];
      rptr[i] = 0;   /* reset the slot so a later wait on it blocks again */
#ifdef _SHMEM_PERMUTE_DEBUG
      printf("DR %d waiting addr, %d, from %d\n", _INDEX, (int)addr, i);
      fflush(stdout);
      sleep(5);
#endif
      shmem_putmem(addr, ldata[i], ldecnt[i], i);
    }
  }

  /* order the data puts ahead of the flag puts that follow */
  shmem_fence();

  for (i = (_INDEX == 0) ? _PROCESSORS-1 : _INDEX-1;
       i != _INDEX;
       i = (i == 0) ? _PROCESSORS-1 : i - 1) {
    if (ldecnt[i] > 0) {
#ifdef _SHMEM_PERMUTE_DEBUG
      printf("DR %d sending flag to %d\n", _INDEX, i);
      fflush(stdout);
      sleep(5);
#endif
      shmem_int_put(&(rflag[_INDEX]), &one, 1, i);
    }
  }

  /* the local-to-local portion needs no communication */
  if (ldecnt[_INDEX] > 0) {
    memcpy(rdata[_INDEX], ldata[_INDEX], ldecnt[_INDEX]);
  }

  _zfree(ldecnt, "perm dr lcnt");
  _zfree(rdecnt, "perm dr rcnt");

  pd->count = -1;
}
int main (int argc, char *argv[]) { double t, tv[2]; int reps = 10000; int doprint = 0; char *progName; int minWords = 1; int maxWords = 1; int incWords; int nwords; int nproc; int proc; int peer; int c; int r; int i; long *rbuf; long *tbuf; start_pes (0); proc = _my_pe (); nproc = _num_pes (); for (progName = argv[0] + strlen (argv[0]); progName > argv[0] && *(progName - 1) != '/'; progName--) ; while ((c = getopt (argc, argv, "n:eh")) != -1) switch (c) { case 'n': if ((reps = getSize (optarg)) <= 0) usage (progName); break; case 'e': doprint++; break; case 'h': help (progName); default: usage (progName); } if (optind == argc) minWords = 1; else if ((minWords = getSize (argv[optind++])) <= 0) usage (progName); if (optind == argc) maxWords = minWords; else if ((maxWords = getSize (argv[optind++])) < minWords) usage (progName); if (optind == argc) incWords = 0; else if ((incWords = getSize (argv[optind++])) < 0) usage (progName); if (!(rbuf = (long *) shmalloc (maxWords * sizeof (long)))) { perror ("Failed memory allocation"); exit (1); } memset (rbuf, 0, maxWords * sizeof (long)); shmem_barrier_all (); if (!(tbuf = (long *) malloc (maxWords * sizeof (long)))) { perror ("Failed memory allocation"); exit (1); } if (nproc == 1) return 0; for (i = 0; i < maxWords; i++) tbuf[i] = 1000 + (i & 255); if (doprint) printf ("%d(%d): Shmem PING reps %d minWords %d maxWords %d incWords %d\n", proc, nproc, reps, minWords, maxWords, incWords); shmem_barrier_all (); peer = proc ^ 1; if (peer >= nproc) doprint = 0; for (nwords = minWords; nwords <= maxWords; nwords = incWords ? nwords + incWords : nwords ? 
2 * nwords : 1) { r = reps; shmem_barrier_all (); tv[0] = gettime (); if (peer < nproc) { if (proc & 1) { r--; shmem_wait (&rbuf[nwords - 1], 0); rbuf[nwords - 1] = 0; } while (r-- > 0) { shmem_long_put (rbuf, tbuf, nwords, peer); shmem_wait (&rbuf[nwords - 1], 0); rbuf[nwords - 1] = 0; } if (proc & 1) shmem_long_put (rbuf, tbuf, nwords, peer); } tv[1] = gettime (); t = dt (&tv[1], &tv[0]) / (2 * reps); shmem_barrier_all (); printStats (proc, peer, doprint, nwords, t); } shmem_barrier_all (); free (tbuf); shfree (rbuf); return 0; }