/*
 * Broadcast example: the PE at index 1 of the active set {0, 1, 2, 3}
 * broadcasts the eight values of `source` into `target` on the other
 * members of the set; every PE then prints its own `target`.
 *
 * NOTE(review): the active set is hard-coded to four PEs starting at PE 0,
 * so this presumably expects to be launched on exactly 4 PEs -- confirm.
 */
int
main (void)
{
  int i;
  long *target;
  static long source[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  int nlong = 8;
  int me;

  start_pes (0);
  me = shmem_my_pe ();

  target = (long *) shmalloc (8 * sizeof (*target));

  /*
   * The broadcast does not write the root PE's own target, and shmalloc
   * does not clear memory.  Give every element a recognisable sentinel so
   * the print loop below never reads indeterminate values.
   */
  for (i = 0; i < 8; i += 1)
    {
      target[i] = -999;
    }

  for (i = 0; i < _SHMEM_BCAST_SYNC_SIZE; i += 1)
    {
      pSync[i] = _SHMEM_SYNC_VALUE;
    }
  /* Make sure pSync and target are initialised everywhere before use. */
  shmem_barrier_all ();

  shmem_broadcast64 (target, source, nlong, 1, 0, 0, 4, pSync);

  for (i = 0; i < 8; i++)
    {
      printf ("%d: target[%d] = %ld\n", me, i, target[i]);
    }

  /* Nobody may free target while another PE could still be using it. */
  shmem_barrier_all ();

  shfree (target);

  return 0;
}
void FORTRANIFY (shmem_broadcast64) (void *target, const void *source, int *nelems, int *PE_root, int *PE_start, int *logPE_stride, int *PE_size, int *pSync) { shmem_broadcast64 (target, source, *nelems, *PE_root, *PE_start, *logPE_stride, *PE_size, (long *) pSync); }
/*
 * Broadcast example over all PEs: PE 0 sends the sequence 1..npes from
 * `source` into `target` on every other PE.  `target` is pre-filled with
 * -999, and each PE prints its PE number next to every element of its
 * `target` after the broadcast.
 */
int
main (void)
{
  long *source;
  long *target;
  int me, npes;
  int idx;

  start_pes (0);
  me = shmem_my_pe ();
  npes = shmem_n_pes ();

  /* Symmetric source buffer holding 1, 2, ..., npes. */
  source = (long *) shmalloc (npes * sizeof (*source));
  idx = 0;
  while (idx < npes)
    {
      source[idx] = idx + 1;
      idx += 1;
    }

  /* Symmetric destination buffer, filled with a sentinel value. */
  target = (long *) shmalloc (npes * sizeof (*target));
  idx = 0;
  while (idx < npes)
    {
      target[idx] = -999;
      idx += 1;
    }

  idx = 0;
  while (idx < _SHMEM_BCAST_SYNC_SIZE)
    {
      pSync[idx] = _SHMEM_SYNC_VALUE;
      idx += 1;
    }
  /* All buffers and pSync must be ready on every PE before the collective. */
  shmem_barrier_all ();

  shmem_broadcast64 (target, source, npes, 0, 0, 0, npes, pSync);

  for (idx = 0; idx < npes; idx++)
    {
      printf ("%-8d %ld\n", me, target[idx]);
    }

  shmem_barrier_all ();

  shfree (target);
  shfree (source);

  return 0;
}
/*
 * Broadcast over a fixed active set: PE 0 fills `source` with 0..NUM_ELEMS-1,
 * every PE initialises pSync, and then every PE calls shmem_broadcast64
 * with PE_root = 4, PE_start = 5, logPE_stride = 0 and PE_size = 3 before
 * printing its `dest` array on one line.
 *
 * NOTE(review): PE_root (4) is not smaller than PE_size (3), and PEs outside
 * the set starting at PE 5 also make the collective call.  Presumably this
 * is an intentional negative test -- confirm before relying on the output.
 */
int main(void)
{
    int me, npes;
    int k;

    shmem_init();
    me = shmem_my_pe();
    npes = shmem_n_pes();

    /* Only PE 0 populates the source buffer. */
    if (me == 0) {
        k = 0;
        while (k < NUM_ELEMS) {
            source[k] = k;
            k++;
        }
    }

    k = 0;
    while (k < _SHMEM_BCAST_SYNC_SIZE) {
        pSync[k] = _SHMEM_SYNC_VALUE;
        k++;
    }

    shmem_barrier_all(); /* Wait for all PEs to initialize pSync */

    shmem_broadcast64(dest, source, NUM_ELEMS, 4, 5, 0, 3, pSync);

    /* Emit "<pe>: v0, v1, ..." followed by a newline. */
    printf("%d: %ld", me, dest[0]);
    k = 1;
    while (k < NUM_ELEMS) {
        printf(", %ld", dest[k]);
        k++;
    }
    printf("\n");

    return 0;
}
int main(void) { int i, me, npes; int errors = 0; shmem_init(); me = shmem_my_pe(); npes = shmem_n_pes(); for (i = 0; i < NELEM; i++) { src[i] = me; dst[i] = -1; } for (i = 0; i < SHMEM_BCAST_SYNC_SIZE; i++) bcast_psync[i] = SHMEM_SYNC_VALUE; for (i = 0; i < SHMEM_BARRIER_SYNC_SIZE; i++) { barrier_psync0[i] = SHMEM_SYNC_VALUE; barrier_psync1[i] = SHMEM_SYNC_VALUE; } if (me == 0) printf("Shrinking active set test\n"); shmem_barrier_all(); /* A total of npes tests are performed, where the active set in each test * includes PEs i..npes-1 */ for (i = 0; i <= me; i++) { int j; if (me == i) printf(" + active set size %d\n", npes-i); shmem_broadcast64(dst, src, NELEM, 0, i, 0, npes-i, bcast_psync); /* Validate broadcasted data */ for (j = 0; j < NELEM; j++) { int64_t expected = (me == i) ? i-1 : i; if (dst[j] != expected) { printf("%d: Expected dst[%d] = %"PRId64", got dst[%d] = %"PRId64", iteration %d\n", me, j, expected, j, dst[j], i); errors++; } } shmem_barrier(i, 0, npes-i, (i % 2) ? barrier_psync0 : barrier_psync1); } shmem_barrier_all(); for (i = 0; i < NELEM; i++) dst[i] = -1; if (me == 0) printf("Changing root test\n"); shmem_barrier_all(); /* A total of npes tests are performed, where the root changes each time */ for (i = 0; i < npes; i++) { int j; if (me == i) printf(" + root %d\n", i); shmem_broadcast64(dst, src, NELEM, i, 0, 0, npes, bcast_psync); /* Validate broadcasted data */ for (j = 0; j < NELEM; j++) { int64_t expected = (me == i) ? i-1 : i; if (dst[j] != expected) { printf("%d: Expected dst[%d] = %"PRId64", got dst[%d] = %"PRId64", iteration %d\n", me, j, expected, j, dst[j], i); errors++; } } shmem_barrier(0, 0, npes, barrier_psync0); } shmem_finalize(); return errors != 0; }
int main(int argc, char **argv) { int i,j; int my_pe,n_pes,PE_root; size_t max_elements,max_elements_bytes; int *srce_int,*targ_int,ans_int; long *srce_long,*targ_long,ans_long; float *srce_float,*targ_float,ans_float; double *srce_double,*targ_double,ans_double; shmem_init(); my_pe = shmem_my_pe(); n_pes = shmem_n_pes(); /* fail if trying to use only one processor */ if ( n_pes <= 1 ){ fprintf(stderr, "FAIL - test requires at least two PEs\n"); exit(1); } if(my_pe == 0) fprintf(stderr, "shmem_broadcast(%s) n_pes=%d\n", argv[0],n_pes); /* initialize the pSync arrays */ for (i=0; i < _SHMEM_BCAST_SYNC_SIZE; i++) { pSync1[i] = _SHMEM_SYNC_VALUE; pSync2[i] = _SHMEM_SYNC_VALUE; } shmem_barrier_all(); /* Wait for all PEs to initialize pSync1 & pSync2 */ PE_root=1; /* we'll broadcast from this PE */ /* shmem_broadcast32 test */ max_elements = (size_t) (MAX_SIZE / sizeof(int)); max_elements_bytes = (size_t) (sizeof(int)*max_elements); if(my_pe == 0) fprintf(stderr,"shmem_broadcast32 max_elements = %d\n", max_elements); srce_int = shmem_malloc(max_elements_bytes); targ_int = shmem_malloc(max_elements_bytes); srce_float = shmem_malloc(max_elements_bytes); targ_float = shmem_malloc(max_elements_bytes); if((srce_int == NULL) || (targ_int == NULL) || (srce_float == NULL) || (targ_float == NULL)) shmalloc_error(); for(j = 0; j < max_elements; j++) { srce_int[j] = (int)(my_pe+j); srce_float[j] = (float)(my_pe+j); targ_int[j] = (int)(100*my_pe+j); targ_float[j] = (float)(100*my_pe+j); } shmem_barrier_all(); for(i = 0; i < IMAX; i+=2) { /* i is even -- using int */ if (my_pe == PE_root) for(j = 0; j < max_elements; j++) { srce_int[j] = (int)(my_pe+i+j); } /* broadcast from PE_root to all PEs using pSync1 */ shmem_broadcast32(targ_int,srce_int,max_elements,PE_root,0,0,n_pes,pSync1); for(j = 0; j < max_elements; j++) { if (my_pe == PE_root) { ans_int= (int)(100*my_pe+j); } else { ans_int= (int)(PE_root+i+j); } if ( targ_int[j] != ans_int ) fprintf(stderr, "FAIL: PE [%d] 
targ_int[%d]=%d ans_int=%d\n", my_pe,j,targ_int[j],ans_int); } /* i+1 is odd -- using float */ if (my_pe == PE_root) for(j = 0; j < max_elements; j++) { srce_float[j] = (float)(PE_root+i+1+j); } /* broadcast from PE_root to all PEs using pSync2 */ shmem_broadcast32(targ_float,srce_float,max_elements,PE_root,0,0,n_pes,pSync2); for(j = 0; j < max_elements; j++) { if (my_pe == PE_root) { ans_float= (float)(100*my_pe+j); } else { ans_float= (float)(PE_root+i+1+j); } if ( targ_float[j] != ans_float ) fprintf(stderr, "FAIL: PE [%d] targ_float[%d]=%10.0f ans_float=%10.0f\n", my_pe,j,targ_float[j],ans_float); } } shmem_free(srce_int); shmem_free(targ_int); shmem_free(srce_float); shmem_free(targ_float); /* shmem_broadcast64 test */ max_elements = (size_t) (MAX_SIZE / sizeof(long)); max_elements_bytes = (size_t) (sizeof(long)*max_elements); if(my_pe == 0) fprintf(stderr,"shmem_broadcast64 max_elements = %d\n", max_elements); srce_long = shmem_malloc(max_elements_bytes); targ_long = shmem_malloc(max_elements_bytes); srce_double = shmem_malloc(max_elements_bytes); targ_double = shmem_malloc(max_elements_bytes); if((srce_long == NULL) || (targ_long == NULL) || (srce_double == NULL) || (targ_double == NULL)) shmalloc_error(); for(j = 0; j < max_elements; j++) { srce_long[j] = (long)(my_pe+j); srce_double[j] = (double)(my_pe+j); targ_long[j] = (long)(100*my_pe+j); targ_double[j] = (double)(100*my_pe+j); } shmem_barrier_all(); for(i = 0; i < IMAX; i+=2) { /* i is even -- using long */ if (my_pe == PE_root) for(j = 0; j < max_elements; j++) { srce_long[j] = (long)(my_pe+i+j); } /* broadcast from PE_root to all PEs using pSync1 */ shmem_broadcast64(targ_long,srce_long,max_elements,PE_root,0,0,n_pes,pSync1); for(j = 0; j < max_elements; j++) { if (my_pe == PE_root) { ans_long= (long)(100*my_pe+j); } else { ans_long= (long)(PE_root+i+j); } if ( targ_long[j] != ans_long ) fprintf(stderr, "FAIL: PE [%d] targ_long[%d]=%d ans_long=%d\n", my_pe,j,targ_long[j],ans_long); } /* i+1 is odd -- 
using double */ if (my_pe == PE_root) for(j = 0; j < max_elements; j++) { srce_double[j] = (double)(PE_root+i+1+j); } /* broadcast from PE_root to all PEs using pSync2 */ shmem_broadcast64(targ_double,srce_double,max_elements,PE_root,0,0,n_pes,pSync2); for(j = 0; j < max_elements; j++) { if (my_pe == PE_root) { ans_double= (double)(100*my_pe+j); } else { ans_double= (double)(PE_root+i+1+j); } if ( targ_double[j] != ans_double ) fprintf(stderr, "FAIL: PE [%d] targ_double[%d]=%10.0f ans_double=%10.0f\n", my_pe,j,targ_double[j],ans_double); } } shmem_free(srce_long); shmem_free(targ_long); shmem_free(srce_double); shmem_free(targ_double); #ifndef OPENSHMEM #ifdef SHMEM_C_GENERIC_32 /* shmemx_broadcast (GENERIC 32) test */ max_elements = (size_t) (MAX_SIZE / sizeof(int)); max_elements_bytes = (size_t) (sizeof(int)*max_elements); if(my_pe == 0) fprintf(stderr,"shmemx_broadcast (GENERIC 32) max_elements = %d\n", max_elements); srce_int = shmem_malloc(max_elements_bytes); targ_int = shmem_malloc(max_elements_bytes); if((srce_int == NULL) || (targ_int == NULL)) shmalloc_error(); for(j = 0; j < max_elements; j++) { srce_int[j] = (int)(my_pe+j); targ_int[j] = (int)(2*my_pe+j); } shmem_barrier_all(); /* broadcast from PE 1 to all PEs */ shmemx_broadcast(targ_int,srce_int,max_elements,1,0,0,n_pes,pSync1); for(j = 0; j < max_elements; j++) { if (my_pe == 1) { ans_int= (int)(j+2); } else { ans_int= (int)(j+1); } if ( targ_int[j] != ans_int ) fprintf(stderr, "FAIL: PE [%d] targ_int[%d]=%d ans_int=%d\n", my_pe,j,targ_int[j],ans_int); } shmem_free(srce_int); shmem_free(targ_int); #else /* shmemx_broadcast (GENERIC 64) test */ max_elements = (size_t) (MAX_SIZE / sizeof(long)); max_elements_bytes = (size_t) (sizeof(long)*max_elements); if(my_pe == 0) fprintf(stderr,"shmemx_broadcast (GENERIC 64) max_elements = %d\n", max_elements); srce_long = shmem_malloc(max_elements_bytes); targ_long = shmem_malloc(max_elements_bytes); if((srce_long == NULL) || (targ_long == NULL)) 
shmalloc_error(); for(j = 0; j < max_elements; j++) { srce_long[j] = (long)(my_pe+j); targ_long[j] = (long)(2*my_pe+j); } shmem_barrier_all(); /* broadcast from PE 1 to all PEs */ shmemx_broadcast(targ_long,srce_long,max_elements,1,0,0,n_pes,pSync1); for(j = 0; j < max_elements; j++) { if (my_pe == 1) { ans_long = (long)(j+2); } else { ans_long = (long)(j+1); } if ( targ_long[j] != ans_long ) fprintf(stderr, "FAIL: PE [%d] targ_long[%d]=%d ans_long=%d\n", my_pe,j,targ_long[j],ans_long); } shmem_free(srce_long); shmem_free(targ_long); #endif #endif #ifdef NEEDS_FINALIZE shmem_finalize(); #endif return 0; }
int main (int argc, char *argv[]) { /**** Initialising ****/ const unsigned long long full_program_start = current_time_ns(); { shmem_init (); /* Variable Declarations */ int Numprocs,MyRank, Root = 0; int i,j,k, NoofElements, NoofElements_Bloc, NoElementsToSort; int count, temp; TYPE *Input, *InputData; TYPE *Splitter, *AllSplitter; TYPE *Buckets, *BucketBuffer, *LocalBucket; TYPE *OutputBuffer, *Output; MyRank = shmem_my_pe (); Numprocs = shmem_n_pes (); NoofElements = SIZE; if(( NoofElements % Numprocs) != 0){ if(MyRank == Root) printf("Number of Elements are not divisible by Numprocs \n"); shmem_finalize (); exit(0); } /**** Reading Input ****/ Input = (TYPE *) shmem_malloc (NoofElements*sizeof(*Input)); if(Input == NULL) { printf("Error : Can not allocate memory \n"); } if (MyRank == Root){ /* Initialise random number generator */ printf ("Generating input Array for Sorting %d uint64_t numbers\n",SIZE); srand48((TYPE)NoofElements); for(i=0; i< NoofElements; i++) { Input[i] = rand(); } } /**** Sending Data ****/ NoofElements_Bloc = NoofElements / Numprocs; InputData = (TYPE *) shmem_malloc (NoofElements_Bloc * sizeof (*InputData)); if(InputData == NULL) { printf("Error : Can not allocate memory \n"); } //MPI_Scatter(Input, NoofElements_Bloc, TYPE_MPI, InputData, // NoofElements_Bloc, TYPE_MPI, Root, MPI_COMM_WORLD); shmem_barrier_all(); if(MyRank == Root) { for(i=0; i<Numprocs; i++) { TYPE* start = &Input[i * NoofElements_Bloc]; shmem_put64(InputData, start, NoofElements_Bloc, i); } } shmem_barrier_all(); /**** Sorting Locally ****/ sorting(InputData, NoofElements_Bloc); /**** Choosing Local Splitters ****/ Splitter = (TYPE *) shmem_malloc (sizeof (TYPE) * (Numprocs-1)); if(Splitter == NULL) { printf("Error : Can not allocate memory \n"); } for (i=0; i< (Numprocs-1); i++){ Splitter[i] = InputData[NoofElements/(Numprocs*Numprocs) * (i+1)]; } /**** Gathering Local Splitters at Root ****/ AllSplitter = (TYPE *) shmem_malloc (sizeof (TYPE) * Numprocs * 
(Numprocs-1)); if(AllSplitter == NULL) { printf("Error : Can not allocate memory \n"); } //MPI_Gather (Splitter, Numprocs-1, TYPE_MPI, AllSplitter, Numprocs-1, // TYPE_MPI, Root, MPI_COMM_WORLD); shmem_barrier_all(); TYPE* target_index = &AllSplitter[MyRank * (Numprocs-1)]; shmem_put64(target_index, Splitter, Numprocs-1, Root); shmem_barrier_all(); /**** Choosing Global Splitters ****/ if (MyRank == Root){ sorting (AllSplitter, Numprocs*(Numprocs-1)); for (i=0; i<Numprocs-1; i++) Splitter[i] = AllSplitter[(Numprocs-1)*(i+1)]; } /**** Broadcasting Global Splitters ****/ //MPI_Bcast (Splitter, Numprocs-1, TYPE_MPI, 0, MPI_COMM_WORLD); { int _i; for(_i=0; _i<_SHMEM_BCAST_SYNC_SIZE; _i++) { pSync[_i] = _SHMEM_SYNC_VALUE; } shmem_barrier_all(); } shmem_broadcast64(Splitter, Splitter, Numprocs-1, 0, 0, 0, Numprocs, pSync); shmem_barrier_all(); /**** Creating Numprocs Buckets locally ****/ Buckets = (TYPE *) shmem_malloc (sizeof (TYPE) * (NoofElements + Numprocs)); if(Buckets == NULL) { printf("Error : Can not allocate memory \n"); } j = 0; k = 1; for (i=0; i<NoofElements_Bloc; i++){ if(j < (Numprocs-1)){ if (InputData[i] < Splitter[j]) Buckets[((NoofElements_Bloc + 1) * j) + k++] = InputData[i]; else{ Buckets[(NoofElements_Bloc + 1) * j] = k-1; k=1; j++; i--; } } else Buckets[((NoofElements_Bloc + 1) * j) + k++] = InputData[i]; } Buckets[(NoofElements_Bloc + 1) * j] = k - 1; shmem_free(Splitter); shmem_free(AllSplitter); /**** Sending buckets to respective processors ****/ BucketBuffer = (TYPE *) shmem_malloc (sizeof (TYPE) * (NoofElements + Numprocs)); if(BucketBuffer == NULL) { printf("Error : Can not allocate memory \n"); } //MPI_Alltoall (Buckets, NoofElements_Bloc + 1, TYPE_MPI, BucketBuffer, // NoofElements_Bloc + 1, TYPE_MPI, MPI_COMM_WORLD); shmem_barrier_all(); for(i=0; i<Numprocs; i++) { shmem_put64(&BucketBuffer[MyRank*(NoofElements_Bloc + 1)], &Buckets[i*(NoofElements_Bloc + 1)], NoofElements_Bloc + 1, i); } shmem_barrier_all(); /**** Rearranging BucketBuffer 
****/ LocalBucket = (TYPE *) shmem_malloc (sizeof (TYPE) * 2 * NoofElements / Numprocs); if(LocalBucket == NULL) { printf("Error : Can not allocate memory \n"); } count = 1; for (j=0; j<Numprocs; j++) { k = 1; for (i=0; i<BucketBuffer[(NoofElements/Numprocs + 1) * j]; i++) LocalBucket[count++] = BucketBuffer[(NoofElements/Numprocs + 1) * j + k++]; } LocalBucket[0] = count-1; /**** Sorting Local Buckets using Bubble Sort ****/ /*sorting (InputData, NoofElements_Bloc, sizeof(int), intcompare); */ NoElementsToSort = LocalBucket[0]; sorting (&LocalBucket[1], NoElementsToSort); /**** Gathering sorted sub blocks at root ****/ OutputBuffer = (TYPE *) shmem_malloc (sizeof(TYPE) * 2 * NoofElements); if(OutputBuffer == NULL) { printf("Error : Can not allocate memory \n"); } //MPI_Gather (LocalBucket, 2*NoofElements_Bloc, TYPE_MPI, OutputBuffer, // 2*NoofElements_Bloc, TYPE_MPI, Root, MPI_COMM_WORLD); shmem_barrier_all(); target_index = &OutputBuffer[MyRank * (2*NoofElements_Bloc)]; shmem_put64(target_index, LocalBucket, 2*NoofElements_Bloc, Root); shmem_barrier_all(); /**** Rearranging output buffer ****/ if (MyRank == Root){ Output = (TYPE *) malloc (sizeof (TYPE) * NoofElements); count = 0; for(j=0; j<Numprocs; j++){ k = 1; for(i=0; i<OutputBuffer[(2 * NoofElements/Numprocs) * j]; i++) Output[count++] = OutputBuffer[(2*NoofElements/Numprocs) * j + k++]; } printf ( "Number of Elements to be sorted : %d \n", NoofElements); TYPE prev = 0; int fail = 0; for (i=0; i<NoofElements; i++){ if(Output[i] < prev) { printf("Failed at index %d\n",i); fail = 1; } prev = Output[i]; } if(fail) printf("Sorting FAILED\n"); else printf("Sorting PASSED\n"); free(Output); }/* MyRank==0*/ shmem_free(Input); shmem_free(OutputBuffer); shmem_free(InputData); shmem_free(Buckets); shmem_free(BucketBuffer); shmem_free(LocalBucket); /**** Finalize ****/ shmem_finalize(); } ; const unsigned long long full_program_end = current_time_ns(); printf("full_program %llu ns\n", full_program_end - 
full_program_start); }
void communicateParameters(LSMSCommunication &comm, LSMSSystemParameters &lsms, CrystalParameters &crystal, MixingParameters &mix) { int const s=sizeof(LSMSSystemParameters)+9*sizeof(Real)+sizeof(int)+10 +sizeof(MixingParameters)+5*sizeof(int); int rem=0,ele=0; int tot_bufsize=s; rem=s%32; ele=s/32; if (rem!=0) { tot_bufsize=s-rem+32; ele++; } // TODO fine-tune this size tot_bufsize=65536; char* buf=(char*)shmalloc(tot_bufsize); int pos=0; int sec_id; if(comm.comm.rank==0) { //MPI_Pack(lsms.systemid,80,MPI_CHAR,buf,s,&pos,comm.comm); //MPI_Pack(lsms.title,80,MPI_CHAR,buf,s,&pos,comm.comm); //MPI_Pack(lsms.potential_file_in,128,MPI_CHAR,buf,s,&pos,comm.comm); //MPI_Pack(lsms.potential_file_out,128,MPI_CHAR,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.pot_in_type,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.pot_out_type,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.num_atoms,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.nspin,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.nrel_rel,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.nrelc,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.nrelv,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.n_spin_cant,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.n_spin_pola,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.mtasa,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.fixRMT,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.nscf,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.writeSteps,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.clight,1,MPI_DOUBLE,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.energyContour.grid,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.energyContour.npts,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.energyContour.ebot,1,MPI_DOUBLE,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.energyContour.etop,1,MPI_DOUBLE,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.energyContour.eibot,1,MPI_DOUBLE,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.energyContour.eitop,1,MPI_DOUBLE,buf,s,&pos,comm.comm); 
//MPI_Pack(&lsms.energyContour.maxGroupSize,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.mixing,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.alphaDV,1,MPI_DOUBLE,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.global.iprint,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.global.print_node,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.global.default_iprint,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.global.istop,32,MPI_CHAR,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.global.GPUThreads,32,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&crystal.num_types,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&crystal.bravais(0,0),9,MPI_DOUBLE,buf,s,&pos,comm.comm); //************ MemCpying *************** memcpy(&buf[pos],&lsms.systemid,80*char_size); pos = pos+80*char_size; memcpy(&buf[pos],&lsms.title,80*char_size); pos = pos+80*char_size; memcpy(&buf[pos],&lsms.potential_file_in,128*char_size); pos = pos+128*char_size; memcpy(&buf[pos],&lsms.potential_file_out,128*char_size); pos = pos+128*char_size; memcpy(&buf[pos],&lsms.pot_in_type,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.pot_out_type ,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.num_atoms,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.nspin,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.nrel_rel,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.nrelc,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.nrelv,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.n_spin_cant,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.n_spin_pola,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.mtasa,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.fixRMT,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.nscf,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.writeSteps,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.clight,double_size); pos = pos+double_size; memcpy(&buf[pos],&lsms.energyContour.grid,int_size); pos = pos+int_size; 
memcpy(&buf[pos],&lsms.energyContour.npts,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.energyContour.ebot,double_size); pos = pos+double_size; memcpy(&buf[pos],&lsms.energyContour.etop,double_size); pos = pos+double_size; memcpy(&buf[pos],&lsms.energyContour.eibot,double_size); pos = pos+double_size; memcpy(&buf[pos],&lsms.energyContour.eitop,double_size); pos = pos+double_size; memcpy(&buf[pos],&lsms.energyContour.maxGroupSize,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.mixing,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.alphaDV,double_size); pos = pos+double_size; memcpy(&buf[pos],&lsms.global.iprint,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.global.print_node,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.global.default_iprint,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.global.istop,32*char_size); pos = pos+32*char_size; memcpy(&buf[pos],&lsms.global.GPUThreads,32*int_size); pos = pos+32*int_size; memcpy(&buf[pos],&crystal.num_types,int_size); pos = pos+int_size; memcpy(&buf[pos],&crystal.bravais(0,0),9*double_size); pos = pos+9*double_size; // MixingParameters // MPI_CXX_BOOL is not always available // MPI_Pack(&mix.quantity[0],mix.numQuantities,MPI_CXX_BOOL,buf,s,&pos,comm.comm); // copy to temporary int array and send this int tmpQuantity[mix.numQuantities]; for(int i=0; i<mix.numQuantities; i++) if(mix.quantity[i]) tmpQuantity[i] = 1; else tmpQuantity[i] = 0; //MPI_Pack(&tmpQuantity[0],mix.numQuantities,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&mix.algorithm[0],mix.numQuantities,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&mix.mixingParameter[0],mix.numQuantities,MPI_DOUBLE,buf,s,&pos,comm.comm); memcpy(&buf[pos],&tmpQuantity[0],mix.numQuantities*int_size); pos = pos+mix.numQuantities*int_size; memcpy(&buf[pos],&mix.algorithm[0],mix.numQuantities*int_size); pos = pos+mix.numQuantities*int_size; memcpy(&buf[pos],&mix.mixingParameter[0],mix.numQuantities*double_size); pos = 
pos+mix.numQuantities*double_size; } //MPI_Bcast(buf,s,MPI_PACKED,0,comm.comm); shmem_barrier(0, 0, comm.comm.size,pSync2); shmem_broadcast32(&buf[0], &buf[0], tot_bufsize, 0, 0, 0, comm.comm.size,pSync1); shmem_barrier(0, 0, comm.comm.size,pSync2); if(comm.comm.rank!=0) { int pos=0; //MPI_Unpack(buf,s,&pos,lsms.systemid,80,MPI_CHAR,comm.comm); //MPI_Unpack(buf,s,&pos,lsms.title,80,MPI_CHAR,comm.comm); //MPI_Unpack(buf,s,&pos,lsms.potential_file_in,128,MPI_CHAR,comm.comm); //MPI_Unpack(buf,s,&pos,lsms.potential_file_out,128,MPI_CHAR,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.pot_in_type,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.pot_out_type,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.num_atoms,1,MPI_INT,comm.comm); memcpy(&lsms.systemid,&buf[pos],80*char_size); pos = pos+80*char_size; memcpy(&lsms.title,&buf[pos],80*char_size); pos = pos+80*char_size; memcpy(&lsms.potential_file_in,&buf[pos],128*char_size); pos = pos+128*char_size; memcpy(&lsms.potential_file_out,&buf[pos],128*char_size); pos = pos+128*char_size; memcpy(&lsms.pot_in_type,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.pot_out_type,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.num_atoms,&buf[pos],int_size); pos = pos+int_size; crystal.num_atoms=lsms.num_atoms; //MPI_Unpack(buf,s,&pos,&lsms.nspin,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.nrel_rel,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.nrelc,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.nrelv,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.n_spin_cant,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.n_spin_pola,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.mtasa,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.fixRMT,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.nscf,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.writeSteps,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.clight,1,MPI_DOUBLE,comm.comm); 
//MPI_Unpack(buf,s,&pos,&lsms.energyContour.grid,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.energyContour.npts,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.energyContour.ebot,1,MPI_DOUBLE,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.energyContour.etop,1,MPI_DOUBLE,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.energyContour.eibot,1,MPI_DOUBLE,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.energyContour.eitop,1,MPI_DOUBLE,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.energyContour.maxGroupSize,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.mixing,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.alphaDV,1,MPI_DOUBLE,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.global.iprint,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.global.print_node,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.global.default_iprint,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.global.istop,32,MPI_CHAR,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.global.GPUThreads,32,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&crystal.num_types,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&crystal.bravais(0,0),9,MPI_DOUBLE,comm.comm); memcpy(&lsms.nspin,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.nrel_rel,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.nrelc,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.nrelv,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.n_spin_cant,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.n_spin_pola,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.mtasa,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.fixRMT,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.nscf,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.writeSteps,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.clight,&buf[pos],double_size); pos = pos+double_size; memcpy(&lsms.energyContour.grid,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.energyContour.npts,&buf[pos],int_size); pos = pos+int_size; 
memcpy(&lsms.energyContour.ebot,&buf[pos],double_size); pos = pos+double_size; memcpy(&lsms.energyContour.etop,&buf[pos],double_size); pos = pos+double_size; memcpy(&lsms.energyContour.eibot,&buf[pos],double_size); pos = pos+double_size; memcpy(&lsms.energyContour.eitop,&buf[pos],double_size); pos = pos+double_size; memcpy(&lsms.energyContour.maxGroupSize,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.mixing,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.alphaDV,&buf[pos],double_size); pos = pos+double_size; memcpy(&lsms.global.iprint,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.global.print_node,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.global.default_iprint,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.global.istop,&buf[pos],32*char_size); pos = pos+32*char_size; memcpy(&lsms.global.GPUThreads,&buf[pos],32*int_size); pos = pos+32*int_size; memcpy(&crystal.num_types,&buf[pos],int_size); pos = pos+int_size; memcpy(&crystal.bravais(0,0),&buf[pos],9*double_size); pos = pos+9*double_size; crystal.resize(crystal.num_atoms); crystal.resizeTypes(crystal.num_types); // MixingParameters // MPI_CXX_BOOL is not always available // MPI_Unpack(buf,s,&pos,&mix.quantity[0],mix.numQuantities,MPI_CXX_BOOL,comm.comm); // recieve temporary int array and copy int tmpQuantity[mix.numQuantities]; //MPI_Unpack(buf,s,&pos,&tmpQuantity[0],mix.numQuantities,MPI_INT,comm.comm); memcpy(&tmpQuantity[0],&buf[pos],mix.numQuantities*int_size); pos = pos+mix.numQuantities*int_size; for(int i=0; i<mix.numQuantities; i++) if(tmpQuantity[i]==1) mix.quantity[i] = true; else mix.quantity[i] = false; //MPI_Unpack(buf,s,&pos,&mix.algorithm[0],mix.numQuantities,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&mix.mixingParameter[0],mix.numQuantities,MPI_DOUBLE,comm.comm); memcpy(&mix.algorithm[0],&buf[pos],mix.numQuantities*int_size); pos = pos+mix.numQuantities*int_size; memcpy(&mix.mixingParameter[0],&buf[pos],mix.numQuantities*double_size); pos = 
pos+mix.numQuantities*double_size; } for(int i=0; i<mix.numQuantities; i++) printf("mix.quantity[%d]=%d\n", i,mix.quantity[i]); // Allocate buffer for transmitting Crystal params int buff_size; if((crystal.num_types*sizeof(AtomType)) > (3*crystal.num_atoms*double_size)) buff_size = crystal.num_types*sizeof(AtomType); else buff_size = 3*crystal.num_atoms*double_size; shfree(buf); // TODO finetune buff-size buff_size=1048576; //sizeof(LSMSSystemParameters)+9*sizeof(Real); rem=buff_size%64; ele=buff_size/64; if(rem != 0) { buff_size=buff_size-rem+64; ele++; } double* temp_buff=(double*) shmalloc(buff_size); int* temp_intbuff=(int*) shmalloc(buff_size); //MPI_Bcast(&crystal.position(0,0),3*crystal.num_atoms,MPI_DOUBLE,0,comm.comm); //TODO check if a barrier is neededa after broadcast ... data not updated otherwise if(comm.comm.rank == 0) memcpy(temp_buff,&crystal.position(0,0),3*crystal.num_atoms*double_size); shmem_barrier(0, 0, comm.comm.size,pSync2); shmem_broadcast64(temp_buff, temp_buff,3*crystal.num_atoms, 0, 0, 0, comm.comm.size,pSync1); shmem_barrier(0, 0, comm.comm.size,pSync2); if(comm.comm.rank != 0) memcpy(&crystal.position(0,0),temp_buff,3*crystal.num_atoms*double_size); //MPI_Bcast(&crystal.evecs(0,0),3*crystal.num_atoms,MPI_DOUBLE,0,comm.comm); if(comm.comm.rank == 0){ memcpy(temp_buff,&crystal.evecs(0,0),3*crystal.num_atoms*double_size); } shmem_barrier(0, 0, comm.comm.size,pSync2); shmem_broadcast64(temp_buff, temp_buff, 3*crystal.num_atoms, 0, 0, 0, comm.comm.size,pSync1); shmem_barrier(0, 0, comm.comm.size,pSync2); if(comm.comm.rank != 0){ memcpy(&crystal.evecs(0,0),temp_buff,3*crystal.num_atoms*double_size); } //MPI_Bcast(&crystal.type[0],crystal.num_atoms,MPI_INT,0,comm.comm); if(comm.comm.rank == 0){ memcpy(temp_intbuff,&crystal.type[0],crystal.num_atoms*int_size); } shmem_barrier(0, 0, comm.comm.size,pSync2); shmem_broadcast32(temp_intbuff, temp_intbuff, crystal.num_atoms, 0, 0, 0, comm.comm.size,pSync1); shmem_barrier(0, 0, 
comm.comm.size,pSync2); if(comm.comm.rank != 0){ memcpy(&crystal.type[0],temp_intbuff,crystal.num_atoms*int_size); } // This is dangerous and assumes homogeneous nodes: //MPI_Bcast(&crystal.types[0],crystal.num_types*sizeof(AtomType),MPI_BYTE,0,comm.comm); if(comm.comm.rank == 0) memcpy(temp_buff,&crystal.types[0],crystal.num_types*sizeof(AtomType)); // having to use the smallest possible broadcast:"32"-type shmem_barrier(0, 0, comm.comm.size,pSync2); shmem_broadcast32(temp_buff,temp_buff,crystal.num_types*sizeof(AtomType)/4,0,0,0,comm.comm.size,pSync1); shmem_barrier(0, 0, comm.comm.size,pSync2); if(comm.comm.rank != 0) memcpy(&crystal.types[0],temp_buff,crystal.num_types*sizeof(AtomType)); shmem_barrier(0, 0, comm.comm.size,pSync1); shfree(temp_buff); shfree(temp_intbuff); // get maximum lmax crystal.maxlmax=0; for(int i=0; i<crystal.num_types; i++) if(crystal.types[i].lmax>crystal.maxlmax) crystal.maxlmax=crystal.types[i].lmax; lsms.maxlmax=crystal.maxlmax; }
/*
 * Run the SHMEM RandomAccess benchmark and record its results in *params.
 *
 * Steps, as performed below:
 *   1. Size the global table as the largest power of two that fits in
 *      (HPLMaxProcMem * NumProcs) / sizeof(u64Int) words. HPLMaxProcMem is
 *      overwritten with a hard-coded 200000 here.
 *   2. Require a power-of-two number of PEs; otherwise PE 0 prints a message
 *      and every PE returns immediately.
 *   3. Allocate and initialise the local slice of the table, run the timed
 *      update kernel, and report GUP/s on PE 0.
 *   4. Broadcast the GUP/s figure, run the verification kernel, sum the
 *      per-PE error counts, and let PE 0 record the verification outcome.
 *
 * params: benchmark parameter/result record; written, never NULL-checked.
 * Returns 0 on every path (failures are reported through params->Failure
 * and the printed messages, not through the return value).
 */
int HPCC_SHMEMRandomAccess(HPCC_Params *params) {
  s64Int i;
  static s64Int NumErrors, GlbNumErrors;

  int NumProcs, logNumProcs, MyProc;
  int procs;                   /* scratch copy of NumProcs for log2 */
  u64Int GlobalStartMyProc;
  int Remainder;               /* Number of processors with (LocalTableSize + 1) entries */
  u64Int Top;                  /* Number of table entries in top of Table */
  s64Int LocalTableSize;       /* Local table width */
  u64Int MinLocalTableSize;    /* Integer ratio TableSize/NumProcs */
  u64Int logTableSize, TableSize;

  double RealTime;             /* Real time to update table */
  double TotalMem;
  static int sAbort, rAbort;
  int PowerofTwo;

  u64Int NumUpdates_Default;   /* Number of updates to table (suggested: 4x number of table entries) */
  u64Int NumUpdates;           /* actual number of updates to table */
  s64Int ProcNumUpdates;       /* number of updates per processor */

  /*
   * The same pSync arrays serve both the broadcast and the reductions, so
   * they must be large enough for whichever collective needs more room.
   */
  enum {
    RA_SYNC_LEN = (_SHMEM_BCAST_SYNC_SIZE > _SHMEM_REDUCE_SYNC_SIZE)
                      ? _SHMEM_BCAST_SYNC_SIZE
                      : _SHMEM_REDUCE_SYNC_SIZE
  };
  static long llpSync[RA_SYNC_LEN];
  /* NOTE(review): OpenSHMEM sizes pWrk by _SHMEM_REDUCE_MIN_WRKDATA_SIZE,
   * not by the sync size - confirm these work arrays are large enough. */
  static long long int llpWrk[_SHMEM_REDUCE_SYNC_SIZE];
  static long ipSync[RA_SYNC_LEN];
  static int ipWrk[_SHMEM_REDUCE_SYNC_SIZE];

  FILE *outFile = NULL;
  double *GUPs;
  double *temp_GUPs;

  for (i = 0; i < RA_SYNC_LEN; i += 1) {
    ipSync[i] = _SHMEM_SYNC_VALUE;
    llpSync[i] = _SHMEM_SYNC_VALUE;
  }

  params->SHMEMGUPs = -1;
  GUPs = &params->SHMEMGUPs;

  NumProcs = shmem_n_pes();
  MyProc = shmem_my_pe();

  if (0 == MyProc) {
    outFile = stdout;
    setbuf(outFile, NULL);
  }

  params->HPLMaxProcMem = 200000;

  TotalMem = params->HPLMaxProcMem; /* max single node memory */
  TotalMem *= NumProcs;             /* max memory in NumProcs nodes */
  TotalMem /= sizeof(u64Int);

  /* calculate TableSize --- the size of update array (must be a power of 2) */
  for (TotalMem *= 0.5, logTableSize = 0, TableSize = 1;
       TotalMem >= 1.0;
       TotalMem *= 0.5, logTableSize++, TableSize <<= 1)
    ; /* EMPTY */

  /* determine whether the number of processors is a power of 2 */
  if ((NumProcs & (NumProcs - 1)) == 0) {
    PowerofTwo = HPCC_TRUE;
    Remainder = 0;
    Top = 0;
    MinLocalTableSize = (TableSize / NumProcs);
    LocalTableSize = MinLocalTableSize;
    GlobalStartMyProc = (MinLocalTableSize * MyProc);

    /* log2(NumProcs): needed by the report below and by both kernels. */
    logNumProcs = 0;
    for (procs = NumProcs; procs > 1; procs >>= 1)
      logNumProcs++;
  } else {
    if (MyProc == 0) {
      printf("Number of processes must be power of 2\n");
    }
    return 0;
  }

  sAbort = 0;

  HPCC_Table = HPCC_XMALLOC( s64Int, LocalTableSize );
  if (! HPCC_Table) sAbort = 1;

  /* Agree across all PEs on whether any allocation failed. */
  shmem_barrier_all();
  shmem_int_sum_to_all(&rAbort, &sAbort, 1, 0, 0, NumProcs, ipWrk, ipSync);
  shmem_barrier_all();

  if (rAbort > 0) {
    if (MyProc == 0) fprintf(outFile, "Failed to allocate memory for the main table.\n");
    /* check all allocations in case there are new added and their order changes */
    if (HPCC_Table) HPCC_free( HPCC_Table );
    goto failed_table;
  }

  params->SHMEMRandomAccess_N = (s64Int)TableSize;

  /* Default number of global updates to table: 4x number of table entries */
  NumUpdates_Default = 4 * TableSize;
  ProcNumUpdates = 4*LocalTableSize;
  NumUpdates = NumUpdates_Default;

  if (MyProc == 0) {
    fprintf( outFile, "Running on %d processors%s\n", NumProcs, PowerofTwo ? " (PowerofTwo)" : "");
    fprintf( outFile, "Total Main table size = 2^" FSTR64 " = " FSTR64 " words\n",
             logTableSize, TableSize );
    if (PowerofTwo)
      fprintf( outFile, "PE Main table size = 2^" FSTR64 " = " FSTR64 " words/PE\n",
               (logTableSize - logNumProcs), TableSize/NumProcs );
    else
      fprintf( outFile, "PE Main table size = (2^" FSTR64 ")/%d = " FSTR64 " words/PE MAX\n",
               logTableSize, NumProcs, LocalTableSize);

    fprintf( outFile, "Default number of updates (RECOMMENDED) = " FSTR64 "\n", NumUpdates_Default);
    params->SHMEMRandomAccess_ExeUpdates = NumUpdates;
  }

  /* Initialize main table */
  for (i=0; i<LocalTableSize; i++)
    HPCC_Table[i] = i + GlobalStartMyProc;

  shmem_barrier_all();

  /* Begin timed section */
  RealTime = -RTSEC();

  Power2NodesRandomAccessUpdate(logTableSize, TableSize, LocalTableSize,
                                MinLocalTableSize, GlobalStartMyProc, Top,
                                logNumProcs, NumProcs, Remainder,
                                MyProc, ProcNumUpdates);

  shmem_barrier_all();

  /* End timed section */
  RealTime += RTSEC();

  /* Print timing results */
  if (MyProc == 0){
    params->SHMEMRandomAccess_time = RealTime;
    *GUPs = 1e-9*NumUpdates / RealTime;
    fprintf( outFile, "Real time used = %.6f seconds\n", RealTime );
    fprintf( outFile, "%.9f Billion(10^9) Updates per second [GUP/s]\n",
             *GUPs );
    fprintf( outFile, "%.9f Billion(10^9) Updates/PE per second [GUP/s]\n",
             *GUPs / NumProcs );
    /* No longer reporting per CPU number */
    /* *GUPs /= NumProcs; */
  }

  /* distribute result to all nodes */
  /* NOTE(review): source and target are the same address inside *params;
   * shmem_broadcast64 expects symmetric data objects - confirm that params
   * lives in symmetric memory on every PE. */
  temp_GUPs = GUPs;
  shmem_barrier_all();
  shmem_broadcast64(GUPs,temp_GUPs,1,0,0,0,NumProcs,llpSync);
  shmem_barrier_all();

  /* Verification phase */

  /* Begin timing here */
  RealTime = -RTSEC();

  HPCC_Power2NodesSHMEMRandomAccessCheck(logTableSize, TableSize, LocalTableSize,
                                         GlobalStartMyProc, logNumProcs, NumProcs,
                                         MyProc, ProcNumUpdates, &NumErrors);

  shmem_barrier_all();
  shmem_longlong_sum_to_all( &GlbNumErrors, &NumErrors, 1, 0,0, NumProcs,llpWrk, llpSync);
  shmem_barrier_all();

  /* End timed section */
  RealTime += RTSEC();

  if(MyProc == 0){
    params->SHMEMRandomAccess_CheckTime = RealTime;

    fprintf( outFile, "Verification: Real time used = %.6f seconds\n", RealTime);
    fprintf( outFile, "Found " FSTR64 " errors in " FSTR64 " locations (%s).\n",
             GlbNumErrors, TableSize, (GlbNumErrors <= 0.01*TableSize) ?
             "passed" : "failed");
    /* More than 1% wrong entries counts as a failed run. */
    if (GlbNumErrors > 0.01*TableSize) params->Failure = 1;
    params->SHMEMRandomAccess_Errors = (s64Int)GlbNumErrors;
    params->SHMEMRandomAccess_ErrorsFraction = (double)GlbNumErrors / (double)TableSize;
    params->SHMEMRandomAccess_Algorithm = 1;
  }
  /* End verification phase */

  /* Deallocate memory (in reverse order of allocation which should help fragmentation) */
  HPCC_free( HPCC_Table );

failed_table:

  /* outFile is stdout on PE 0; never close a standard stream, since later
   * output from the process would be lost. */
  if (0 == MyProc && outFile != NULL && outFile != stdout && outFile != stderr)
    fclose( outFile );

  shmem_barrier_all();

  return 0;
}
int main(int argc, char* argv[]) { int i, Verbose=0; int mpe, num_pes, loops=10, cloop; char *pgm; long *dst, *src; int nBytes = START_BCAST_SIZE; int nLongs=0; shmem_init(); mpe = shmem_my_pe(); num_pes = shmem_n_pes(); if (num_pes == 1) { printf("%s: Requires number of PEs > 1\n", argv[0]); shmem_finalize(); return 0; } if (sizeof(long) != 8) { printf("Test assumes 64-bit long (%zd)\n", sizeof(long)); shmem_global_exit(1); return 0; } if ((pgm=strrchr(argv[0],'/'))) { pgm++; } else { pgm = argv[0]; } if (argc > 1) { if (strncmp(argv[1],"-v",3) == 0) { Verbose=1; } else if (strncmp(argv[1],"-h",3) == 0) { fprintf(stderr,"usage: %s {-v(verbose)|h(help)}\n",pgm); shmem_finalize(); exit(1); } } for (i = 0; i < SHMEM_BCAST_SYNC_SIZE; i += 1) { pSync[i] = SHMEM_SYNC_VALUE; } if ( mpe == 0 && Verbose ) { fprintf(stderr,"%d loops\n",loops); } for(cloop=1; cloop <= loops; cloop++) { nLongs = nBytes / sizeof(long); dst = (long *)shmem_malloc(nBytes*2); if ( !dst ) { fprintf(stderr,"[%d] shmem_malloc(%d) failed %s\n", mpe,nBytes,strerror(errno)); return 0; } memset( (void*)dst, 0, nBytes ); src = &dst[nLongs]; for (i = 1; i < nLongs; i++) { src[i] = i+1; } shmem_barrier_all(); shmem_broadcast64(dst, src, nLongs, 1, 0, 0, num_pes, pSync); for(i=0; i < nLongs; i++) { /* the root node shouldn't have the result into dst (cf specification).*/ if (1 != mpe && dst[i] != src[i]) { fprintf(stderr,"[%d] dst[%d] %ld != expected %ld\n", mpe, i, dst[i],src[i]); shmem_global_exit(1); } else if (1 == mpe && dst[i] != 0) { fprintf(stderr,"[%d] dst[%d] %ld != expected 0\n", mpe, i, dst[i]); shmem_global_exit(1); } } shmem_barrier_all(); shmem_free (dst); if (Verbose && mpe ==0) fprintf(stderr,"loop %2d Bcast %d, Done.\n",cloop,nBytes); nBytes += BCAST_INCR; } shmem_finalize(); return 0; }