void FORTRANIFY (shmem_fcollect4) (void *target, const void *source, int *nelems, int *PE_start, int *logPE_stride, int *PE_size, int *pSync) { shmem_fcollect32 (target, source, *nelems, *PE_start, *logPE_stride, *PE_size, (long *) pSync); }
/*
 * Time `loops` back-to-back shmem_fcollect32() calls over `npes` PEs and,
 * on PE 0 when Verbose is set, report the elapsed time and derived rates.
 *
 * target   - ignored on entry; the function allocates its own receive
 *            buffer (elements * npes ints) with shmem_malloc() and frees
 *            it before returning.
 * src      - source buffer of `elements` ints handed to shmem_fcollect32().
 * elements - number of ints contributed by each PE.
 * me       - this PE's number; only PE 0 prints.
 * npes     - number of PEs in the collective.
 * loops    - number of fcollect calls to time.
 *
 * The function calls shmem_malloc()/shmem_barrier_all(), so every PE must
 * call it with the same sizes.
 */
void fcollect(int *target, int *src, int elements, int me, int npes, int loops)
{
    int i;
    double start_time, elapsed_time;
    /* Widen first: loops * elements evaluated in int could overflow. */
    long bytes_per_call = (long) elements * (long) sizeof(*src);
    long total_bytes = (long) loops * bytes_per_call;
    long *ps, *pSync, *pSync1;

    /* One allocation holding two disjoint pSync work arrays. */
    pSync = (long*) shmem_malloc( 2 * sizeof(long) * _SHMEM_COLLECT_SYNC_SIZE );
    pSync1 = &pSync[_SHMEM_COLLECT_SYNC_SIZE];
    for (i = 0; i < _SHMEM_COLLECT_SYNC_SIZE; i++) {
        pSync[i] = pSync1[i] = _SHMEM_SYNC_VALUE;
    }

    /* The caller's pointer is replaced by a locally owned receive buffer. */
    target = (int *) shmem_malloc( elements * sizeof(*target) * npes );

    if (me==0 && Verbose) {
        /* Argument cast to long so it matches the %ld conversion. */
        fprintf(stdout,"%s: %d loops of fcollect32(%ld bytes) over %d PEs: ",
                __FUNCTION__,loops,bytes_per_call,npes);
        fflush(stdout);
    }

    /* Make sure every PE has initialised its pSync arrays before use. */
    shmem_barrier_all();

    start_time = shmemx_wtime();
    for(i = 0; i < loops; i++) {
        /*
         * Alternate between the two non-overlapping pSync arrays on even
         * and odd iterations.  (Indexing pSync by (i & 1) would only shift
         * the window by one element, leaving the two arrays overlapping.)
         */
        ps = (i & 1) ? pSync1 : pSync;
        shmem_fcollect32( target, src, elements, 0, 0, npes, ps );
    }
    elapsed_time = shmemx_wtime() - start_time;

    if (me==0 && Verbose) {
        printf("%7.3f secs\n", elapsed_time);
        printf("  %7.5f usecs / fcollect32(), %ld Kbytes @ %7.4f MB/sec\n\n",
               (elapsed_time/((double)loops*npes))*1000000.0,
               (total_bytes/1024),
               ((double)total_bytes/(1024.0*1024.0)) / elapsed_time );
    }

    /* Nobody may free until all PEs have finished their last collective. */
    shmem_barrier_all();
    shmem_free(target);
    shmem_free( pSync );
    shmem_barrier_all();
}
int main (void) { int i; int *target; int *source; int me, npes; struct timeval start, end; long time_taken, start_time, end_time; shmem_init (); me = shmem_my_pe (); npes = shmem_n_pes (); source = (int *) shmem_malloc (N_ELEMENTS * sizeof (*source)); time_taken = 0; for (i = 0; i < N_ELEMENTS; i += 1) { source[i] = (i + 1) * 10 + me; } target = (int *) shmem_malloc (N_ELEMENTS * sizeof (*target) * npes); for (i = 0; i < N_ELEMENTS * npes; i += 1) { target[i] = -90; } for (i = 0; i < _SHMEM_COLLECT_SYNC_SIZE; i += 1) { pSyncA[i] = _SHMEM_SYNC_VALUE; pSyncB[i] = _SHMEM_SYNC_VALUE; } shmem_barrier_all (); for (i = 0; i < 10000; i++) { gettimeofday (&start, NULL); start_time = (start.tv_sec * 1000000.0) + start.tv_usec; /* alternate between 2 pSync arrays to synchronize consequent collectives of even and odd iterations */ if (i % 2) { shmem_fcollect32 (target, source, N_ELEMENTS, 0, 0, npes, pSyncA); } else { shmem_fcollect32 (target, source, N_ELEMENTS, 0, 0, npes, pSyncB); } gettimeofday (&end, NULL); end_time = (end.tv_sec * 1000000.0) + end.tv_usec; if (me == 0) { time_taken = time_taken + (end_time - start_time); } } if (me == 0) { printf ("Time required to collect %d bytes of data, with %d PEs is %ld microseconds\n", (4 * N_ELEMENTS * npes), npes, time_taken / 10000); } shmem_barrier_all (); shmem_free (target); shmem_free (source); shmem_finalize (); return 0; }
int main (int argc, char *argv[]) { int *sray, *rray; int *sdisp, *scounts, *rdisp, *rcounts, *rcounts_full; int ssize, rsize, i, k, j; float z; init_it (&argc, &argv); scounts = (int *) shmem_malloc (sizeof (int) * numnodes); rcounts = (int *) shmem_malloc (sizeof (int) * numnodes); rcounts_full = (int *) shmem_malloc (sizeof (int) * numnodes * numnodes); sdisp = (int *) shmem_malloc (sizeof (int) * numnodes); rdisp = (int *) shmem_malloc (sizeof (int) * numnodes); /* ! seed the random number generator with a ! different number on each processor */ seed_random (myid); /* find out how much data to send */ for (i = 0; i < numnodes; i++) { random_number (&z); scounts[i] = (int) (5.0 * z) + 1; } printf ("myid= %d scounts=%d %d %d %d\n", myid, scounts[0], scounts[1], scounts[2], scounts[3]); printf ("\n"); /* tell the other processors how much data is coming */ // mpi_err = MPI_Alltoall(scounts,1,MPI_INT, rcounts,1,MPI_INT, // MPI_COMM_WORLD); static long psync[_SHMEM_COLLECT_SYNC_SIZE]; for (i = 0; i < _SHMEM_COLLECT_SYNC_SIZE; i++) psync[i] = _SHMEM_SYNC_VALUE; shmem_barrier_all (); int other, j1; shmem_fcollect32 (rcounts_full, scounts, 4, 0, 0, numnodes, psync); for (i = 0; i < numnodes; i++) { rcounts[i] = rcounts_full[i * numnodes + myid]; } printf ("-----myid= %d rcounts=", myid); for (i = 0; i < numnodes; i++) printf ("%d ", rcounts[i]); printf ("\n"); /* write(*,*)"myid= ",myid," rcounts= ",rcounts */ /* calculate displacements and the size of the arrays */ sdisp[0] = 0; for (i = 1; i < numnodes; i++) { sdisp[i] = scounts[i - 1] + sdisp[i - 1]; } rdisp[0] = 0; for (i = 1; i < numnodes; i++) { rdisp[i] = rcounts[i - 1] + rdisp[i - 1]; } ssize = 0; rsize = 0; for (i = 0; i < numnodes; i++) { ssize = ssize + scounts[i]; rsize = rsize + rcounts[i]; } /* allocate send and rec arrays */ sray = (int *) shmem_malloc (sizeof (int) * 20); rray = (int *) shmem_malloc (sizeof (int) * 20); for (i = 0; i < ssize; i++) { sray[i] = myid; } /* send/rec different amounts of 
data to/from each processor */ // mpi_err = MPI_Alltoallv(sray,scounts,sdisp,MPI_INT, // rray,rcounts,rdisp,MPI_INT, MPI_COMM_WORLD); shmem_barrier_all (); for (j1 = 0; j1 < numnodes; j1++) { int k1 = sdisp[j1]; static int k2; shmem_int_get (&k2, &rdisp[myid], 1, j1); shmem_int_put (rray + k2, sray + k1, scounts[j1], j1); } shmem_barrier_all (); // not possible, coz even though the rcounts[myid] will be different on // each PE, the elements collected // by PE0 from other PE's will be constant. // shmem_collect32(rray_full,sray,rcounts[myid],0,0,numnodes,psync); printf ("myid= %d rray=", myid); for (i = 0; i < rsize; i++) printf ("%d ", rray[i]); printf ("\n"); // mpi_err = MPI_Finalize(); shmem_finalize (); return 0; }