Beispiel #1
0
/*
 * Broadcast an 8-element array of longs and print what each PE holds in
 * its target buffer afterwards.
 *
 * The active set passed to shmem_broadcast64 is PEs 0..3 (PE_start 0,
 * stride 2^0, PE_size 4) and the root is the PE at index 1 of that set,
 * so the program needs to be launched on at least 4 PEs.
 *
 * Returns 0 on success, 1 if the symmetric allocation fails.
 */
int
main (void)
{
  int i;
  long *target;
  static long source[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  int nlong = 8;
  int me;

  start_pes (0);
  me = shmem_my_pe ();

  target = (long *) shmalloc (nlong * sizeof (*target));
  if (target == NULL)
    {
      fprintf (stderr, "%d: unable to allocate target buffer\n", me);
      return 1;
    }

  /*
   * shmalloc does not initialise the memory it returns, and the broadcast
   * does not copy data into the root PE's own target.  Fill the buffer
   * with a sentinel so the root prints a defined value instead of
   * indeterminate memory.
   */
  for (i = 0; i < nlong; i += 1)
    {
      target[i] = -999;
    }

  for (i = 0; i < _SHMEM_BCAST_SYNC_SIZE; i += 1)
    {
      pSync[i] = _SHMEM_SYNC_VALUE;
    }
  /* Make sure every PE has initialised pSync and target before the broadcast. */
  shmem_barrier_all ();

  shmem_broadcast64 (target, source, nlong, 1, 0, 0, 4, pSync);

  for (i = 0; i < nlong; i++)
    {
      printf ("%d: target[%d] = %ld\n", me, i, target[i]);
    }

  /* All PEs must be done reading target before it is released. */
  shmem_barrier_all ();

  shfree (target);

  return 0;
}
Beispiel #2
0
void
FORTRANIFY (shmem_broadcast64) (void *target, const void *source,
                                int *nelems, int *PE_root, int *PE_start,
                                int *logPE_stride, int *PE_size, int *pSync)
{
    shmem_broadcast64 (target, source,
                       *nelems, *PE_root, *PE_start, *logPE_stride, *PE_size,
                       (long *) pSync);
}
Beispiel #3
0
/*
 * Broadcast an npes-element array of longs from PE 0 to all PEs, then
 * print each PE's target array (one "<pe> <value>" line per element).
 */
int
main (void)
{
  long *source;
  long *target;
  int me, npes;
  int idx;

  start_pes (0);
  me = shmem_my_pe ();
  npes = shmem_n_pes ();

  /* Every PE fills its source with 1..npes. */
  source = (long *) shmalloc (npes * sizeof (*source));
  idx = 0;
  while (idx < npes)
    {
      source[idx] = idx + 1;
      idx++;
    }

  /* Sentinel values make it visible which PEs actually received data. */
  target = (long *) shmalloc (npes * sizeof (*target));
  idx = 0;
  while (idx < npes)
    {
      target[idx] = -999;
      idx++;
    }

  idx = 0;
  while (idx < _SHMEM_BCAST_SYNC_SIZE)
    {
      pSync[idx] = _SHMEM_SYNC_VALUE;
      idx++;
    }
  /* pSync must be initialised on every PE before the collective starts. */
  shmem_barrier_all ();

  /* Root is index 0 of an active set that starts at PE 0 and spans all PEs. */
  shmem_broadcast64 (target, source, npes, 0, 0, 0, npes, pSync);

  for (idx = 0; idx < npes; idx += 1)
    {
      printf ("%-8d %ld\n", me, target[idx]);
    }

  shmem_barrier_all ();

  shfree (target);
  shfree (source);

  return 0;
}
/*
 * Broadcast NUM_ELEMS 64-bit values from PE 0 to every PE and print each
 * PE's dest array on a single line.
 *
 * The active set is all PEs (PE_start 0, stride 2^0, PE_size npes) and the
 * root is index 0 of that set, i.e. PE 0 -- the only PE whose source is
 * filled in below.  The root index passed to shmem_broadcast64 must be
 * smaller than PE_size, and only members of the active set may make the
 * call.
 */
int main(void)
{
   int i, me, npes;

   shmem_init();
   me = shmem_my_pe();
   npes = shmem_n_pes();

   /* Only the root's source is read by the broadcast. */
   if (me == 0)
      for (i = 0; i < NUM_ELEMS; i++)
         source[i] = i;
   for (i=0; i < _SHMEM_BCAST_SYNC_SIZE; i++) {
      pSync[i] = _SHMEM_SYNC_VALUE;
   }
   shmem_barrier_all(); /* Wait for all PEs to initialize pSync */

   /* PE_root = 0, PE_start = 0, logPE_stride = 0, PE_size = npes */
   shmem_broadcast64(dest, source, NUM_ELEMS, 0, 0, 0, npes, pSync);
   printf("%d: %ld", me, dest[0]);
   for (i = 1; i < NUM_ELEMS; i++)
      printf(", %ld", dest[i]);
   printf("\n");
   return 0;
}
/*
 * Broadcast regression test with two phases:
 *
 *   1. "Shrinking active set": iteration i broadcasts from PE i to the
 *      active set i..npes-1; a PE stops participating once i exceeds its
 *      own rank.
 *   2. "Changing root": every iteration spans all PEs and iteration i uses
 *      PE i as the root.
 *
 * In both phases src holds the PE's own rank, so non-root PEs expect to
 * receive i.  The root's dst is expected to be unchanged by its own
 * broadcast, so it still holds the value from the previous iteration
 * (i-1, or the initial -1 when i is 0).
 *
 * Returns non-zero if any element mismatched.
 */
int main(void)
{
    int me, npes;
    int iter;
    int errors = 0;

    shmem_init();

    me = shmem_my_pe();
    npes = shmem_n_pes();

    for (iter = 0; iter < NELEM; iter++) {
        src[iter] = me;
        dst[iter] = -1;
    }

    for (iter = 0; iter < SHMEM_BCAST_SYNC_SIZE; iter++) {
        bcast_psync[iter] = SHMEM_SYNC_VALUE;
    }

    for (iter = 0; iter < SHMEM_BARRIER_SYNC_SIZE; iter++) {
        barrier_psync0[iter] = SHMEM_SYNC_VALUE;
        barrier_psync1[iter] = SHMEM_SYNC_VALUE;
    }

    if (me == 0) {
        printf("Shrinking active set test\n");
    }

    shmem_barrier_all();

    /* Phase 1: npes tests in total; test number `iter` uses PEs iter..npes-1. */
    for (iter = 0; iter <= me; iter++) {
        const int active = npes - iter;
        int64_t expected;
        int elem;

        if (me == iter) {
            printf(" + active set size %d\n", active);
        }

        shmem_broadcast64(dst, src, NELEM, 0, iter, 0, active, bcast_psync);

        /* The expected value is the same for every element of this iteration. */
        if (me == iter) {
            expected = iter - 1;
        } else {
            expected = iter;
        }

        for (elem = 0; elem < NELEM; elem++) {
            if (dst[elem] == expected) {
                continue;
            }
            printf("%d: Expected dst[%d] = %"PRId64", got dst[%d] = %"PRId64", iteration %d\n",
                   me, elem, expected, elem, dst[elem], iter);
            errors++;
        }

        /* Alternate the barrier pSync arrays between consecutive active sets. */
        if (iter % 2) {
            shmem_barrier(iter, 0, active, barrier_psync0);
        } else {
            shmem_barrier(iter, 0, active, barrier_psync1);
        }
    }

    shmem_barrier_all();

    /* Reset the destination so phase 2 starts from the same state as phase 1. */
    for (iter = 0; iter < NELEM; iter++) {
        dst[iter] = -1;
    }

    if (me == 0) {
        printf("Changing root test\n");
    }

    shmem_barrier_all();

    /* Phase 2: npes tests in total; the root moves to the next PE each time. */
    for (iter = 0; iter < npes; iter++) {
        int64_t expected;
        int elem;

        if (me == iter) {
            printf(" + root %d\n", iter);
        }

        shmem_broadcast64(dst, src, NELEM, iter, 0, 0, npes, bcast_psync);

        if (me == iter) {
            expected = iter - 1;
        } else {
            expected = iter;
        }

        for (elem = 0; elem < NELEM; elem++) {
            if (dst[elem] == expected) {
                continue;
            }
            printf("%d: Expected dst[%d] = %"PRId64", got dst[%d] = %"PRId64", iteration %d\n",
                   me, elem, expected, elem, dst[elem], iter);
            errors++;
        }

        shmem_barrier(0, 0, npes, barrier_psync0);
    }
    shmem_finalize();

    return errors ? 1 : 0;
}
/*
 * Test driver for shmem_broadcast32 / shmem_broadcast64 and, in builds
 * where OPENSHMEM is not defined, the generic shmemx_broadcast.
 *
 * For each element width the test allocates symmetric source/target
 * buffers of MAX_SIZE bytes, then runs IMAX/2 rounds.  Each round does two
 * broadcasts from PE_root (PE 1) to all PEs: one on the integer buffers
 * using pSync1 and one on the floating-point buffers using pSync2.
 * After every broadcast each PE checks its target:
 *   - the root expects its target to be untouched (100*my_pe + j),
 *   - every other PE expects the root's freshly written source values.
 * Mismatches are reported on stderr with a "FAIL" prefix.
 *
 * The test needs at least two PEs and exits with status 1 otherwise.
 * argv[0] is echoed in the banner; argc is unused.
 */
int main(int argc, char **argv)
{
  int i,j;
  int my_pe,n_pes,PE_root;
  size_t max_elements,max_elements_bytes;

  int *srce_int,*targ_int,ans_int;
  long *srce_long,*targ_long,ans_long;
  float *srce_float,*targ_float,ans_float;
  double *srce_double,*targ_double,ans_double;

  shmem_init();
  my_pe = shmem_my_pe();
  n_pes = shmem_n_pes();

/*  fail if trying to use only one processor  */
  if ( n_pes  <= 1 ){
        fprintf(stderr, "FAIL - test requires at least two PEs\n");
        exit(1);
  }

  if(my_pe == 0)
    fprintf(stderr, "shmem_broadcast(%s) n_pes=%d\n", argv[0],n_pes);
  /* initialize the pSync arrays */
  for (i=0; i < _SHMEM_BCAST_SYNC_SIZE; i++) {
    pSync1[i] = _SHMEM_SYNC_VALUE;
    pSync2[i] = _SHMEM_SYNC_VALUE;
  }
  shmem_barrier_all();  /* Wait for all PEs to initialize pSync1 & pSync2 */
  PE_root=1;  /* we'll broadcast from this PE */

/*  shmem_broadcast32 test   */
  max_elements = (size_t) (MAX_SIZE / sizeof(int));
  max_elements_bytes = (size_t) (sizeof(int)*max_elements);
  /* max_elements is a size_t, so it must be printed with %zu */
  if(my_pe == 0)
    fprintf(stderr,"shmem_broadcast32             max_elements = %zu\n",
                                                  max_elements);
  srce_int = shmem_malloc(max_elements_bytes);
  targ_int = shmem_malloc(max_elements_bytes);
  srce_float = shmem_malloc(max_elements_bytes);
  targ_float = shmem_malloc(max_elements_bytes);
  if((srce_int == NULL) || (targ_int == NULL) ||
     (srce_float == NULL) || (targ_float == NULL))
     shmalloc_error();
  for(j = 0; j < max_elements; j++) {
    srce_int[j] = (int)(my_pe+j);
    srce_float[j] = (float)(my_pe+j);
    targ_int[j] = (int)(100*my_pe+j);
    targ_float[j] = (float)(100*my_pe+j);
  }
  shmem_barrier_all();
  for(i = 0; i < IMAX; i+=2) {
    /* i is even -- using int */
    if (my_pe == PE_root)
      for(j = 0; j < max_elements; j++) {
        srce_int[j] = (int)(my_pe+i+j);
      }
    /* broadcast from PE_root to all PEs using pSync1 */
    shmem_broadcast32(targ_int,srce_int,max_elements,PE_root,0,0,n_pes,pSync1);
    for(j = 0; j < max_elements; j++) {
      /* the root's own target keeps its initial contents */
      if (my_pe == PE_root) {
        ans_int= (int)(100*my_pe+j);
      } else {
        ans_int= (int)(PE_root+i+j);
      }
      if ( targ_int[j] != ans_int )
        fprintf(stderr, "FAIL: PE [%d] targ_int[%d]=%d ans_int=%d\n",
                               my_pe,j,targ_int[j],ans_int);
    }
    /* i+1 is odd -- using float */
    if (my_pe == PE_root)
      for(j = 0; j < max_elements; j++) {
        srce_float[j] = (float)(PE_root+i+1+j);
      }
    /* broadcast from PE_root to all PEs using pSync2 */
    shmem_broadcast32(targ_float,srce_float,max_elements,PE_root,0,0,n_pes,pSync2);
    for(j = 0; j < max_elements; j++) {
      if (my_pe == PE_root) {
        ans_float= (float)(100*my_pe+j);
      } else {
        ans_float= (float)(PE_root+i+1+j);
      }
      if ( targ_float[j] != ans_float )
        fprintf(stderr, "FAIL: PE [%d] targ_float[%d]=%10.0f ans_float=%10.0f\n",
                               my_pe,j,targ_float[j],ans_float);
    }
  }
  shmem_free(srce_int);    shmem_free(targ_int);
  shmem_free(srce_float);  shmem_free(targ_float);

/*  shmem_broadcast64 test   */
  max_elements = (size_t) (MAX_SIZE / sizeof(long));
  max_elements_bytes = (size_t) (sizeof(long)*max_elements);
  if(my_pe == 0)
    fprintf(stderr,"shmem_broadcast64             max_elements = %zu\n",
                                                  max_elements);
  srce_long = shmem_malloc(max_elements_bytes);
  targ_long = shmem_malloc(max_elements_bytes);
  srce_double = shmem_malloc(max_elements_bytes);
  targ_double = shmem_malloc(max_elements_bytes);
  if((srce_long == NULL) || (targ_long == NULL) ||
     (srce_double == NULL) || (targ_double == NULL))
     shmalloc_error();
  for(j = 0; j < max_elements; j++) {
    srce_long[j] = (long)(my_pe+j);
    srce_double[j] = (double)(my_pe+j);
    targ_long[j] = (long)(100*my_pe+j);
    targ_double[j] = (double)(100*my_pe+j);
  }
  shmem_barrier_all();
  for(i = 0; i < IMAX; i+=2) {
    /* i is even -- using long */
    if (my_pe == PE_root)
      for(j = 0; j < max_elements; j++) {
        srce_long[j] = (long)(my_pe+i+j);
      }
    /* broadcast from PE_root to all PEs using pSync1 */
    shmem_broadcast64(targ_long,srce_long,max_elements,PE_root,0,0,n_pes,pSync1);
    for(j = 0; j < max_elements; j++) {
      if (my_pe == PE_root) {
        ans_long= (long)(100*my_pe+j);
      } else {
        ans_long= (long)(PE_root+i+j);
      }
      /* long values need %ld, not %d */
      if ( targ_long[j] != ans_long )
        fprintf(stderr, "FAIL: PE [%d] targ_long[%d]=%ld ans_long=%ld\n",
                               my_pe,j,targ_long[j],ans_long);
    }
    /* i+1 is odd -- using double */
    if (my_pe == PE_root)
      for(j = 0; j < max_elements; j++) {
        srce_double[j] = (double)(PE_root+i+1+j);
      }
    /* broadcast from PE_root to all PEs using pSync2 */
    shmem_broadcast64(targ_double,srce_double,max_elements,PE_root,0,0,n_pes,pSync2);
    for(j = 0; j < max_elements; j++) {
      if (my_pe == PE_root) {
        ans_double= (double)(100*my_pe+j);
      } else {
        ans_double= (double)(PE_root+i+1+j);
      }
      if ( targ_double[j] != ans_double )
        fprintf(stderr, "FAIL: PE [%d] targ_double[%d]=%10.0f ans_double=%10.0f\n",
                               my_pe,j,targ_double[j],ans_double);
    }
  }
  shmem_free(srce_long);  shmem_free(targ_long);
  shmem_free(srce_double);  shmem_free(targ_double);

#ifndef OPENSHMEM
#ifdef SHMEM_C_GENERIC_32

/*  shmemx_broadcast (GENERIC 32) test   */
  max_elements = (size_t) (MAX_SIZE / sizeof(int));
  max_elements_bytes = (size_t) (sizeof(int)*max_elements);
  if(my_pe == 0)
    fprintf(stderr,"shmemx_broadcast (GENERIC 32)  max_elements = %zu\n",
                                                  max_elements);
  srce_int = shmem_malloc(max_elements_bytes);
  targ_int = shmem_malloc(max_elements_bytes);
  if((srce_int == NULL) || (targ_int == NULL))
    shmalloc_error();
  for(j = 0; j < max_elements; j++) {
    srce_int[j] = (int)(my_pe+j);
    targ_int[j] = (int)(2*my_pe+j);
  }
  shmem_barrier_all();
  /* broadcast from PE 1 to all PEs */
  shmemx_broadcast(targ_int,srce_int,max_elements,1,0,0,n_pes,pSync1);
  for(j = 0; j < max_elements; j++) {
    /* PE 1 keeps its initial target (2*1 + j); others get PE 1's source (1 + j) */
    if (my_pe == 1) {
      ans_int= (int)(j+2);
    } else {
      ans_int= (int)(j+1);
    }
    if ( targ_int[j] != ans_int )
      fprintf(stderr, "FAIL: PE [%d] targ_int[%d]=%d ans_int=%d\n",
                             my_pe,j,targ_int[j],ans_int);
  }
  shmem_free(srce_int);  shmem_free(targ_int);

#else

/*  shmemx_broadcast (GENERIC 64) test   */
  max_elements = (size_t) (MAX_SIZE / sizeof(long));
  max_elements_bytes = (size_t) (sizeof(long)*max_elements);
  if(my_pe == 0)
    fprintf(stderr,"shmemx_broadcast (GENERIC 64)  max_elements = %zu\n",
                                                  max_elements);
  srce_long = shmem_malloc(max_elements_bytes);
  targ_long = shmem_malloc(max_elements_bytes);
  if((srce_long == NULL) || (targ_long == NULL))
    shmalloc_error();
  for(j = 0; j < max_elements; j++) {
    srce_long[j] = (long)(my_pe+j);
    targ_long[j] = (long)(2*my_pe+j);
  }
  shmem_barrier_all();
  /* broadcast from PE 1 to all PEs */
  shmemx_broadcast(targ_long,srce_long,max_elements,1,0,0,n_pes,pSync1);
  for(j = 0; j < max_elements; j++) {
    /* PE 1 keeps its initial target (2*1 + j); others get PE 1's source (1 + j) */
    if (my_pe == 1) {
      ans_long = (long)(j+2);
    } else {
      ans_long = (long)(j+1);
    }
    if ( targ_long[j] != ans_long )
      fprintf(stderr, "FAIL: PE [%d] targ_long[%d]=%ld ans_long=%ld\n",
                             my_pe,j,targ_long[j],ans_long);
  }
  shmem_free(srce_long);  shmem_free(targ_long);

#endif
#endif

#ifdef NEEDS_FINALIZE
  shmem_finalize(); 
#endif
  return 0;
}
Beispiel #7
0
/*
 * Distributed sort of SIZE elements of type TYPE over all PEs, written
 * against OpenSHMEM.  The commented-out MPI_* calls show the MPI collective
 * each put/barrier sequence stands in for.
 *
 * Phases, in order:
 *   1. Root generates the input and puts one block of
 *      NoofElements/Numprocs elements into every PE's InputData.
 *   2. Each PE sorts its block with sorting() and picks Numprocs-1 local
 *      splitters.
 *   3. Local splitters are put into Root's AllSplitter; Root sorts them and
 *      picks Numprocs-1 global splitters, which are broadcast to all PEs.
 *   4. Each PE partitions its block into Numprocs buckets and puts bucket i
 *      into PE i's BucketBuffer.
 *   5. Each PE merges the buckets it received into LocalBucket, sorts it,
 *      and puts it into Root's OutputBuffer.
 *   6. Root flattens OutputBuffer into Output and checks that it is
 *      non-decreasing, printing "Sorting PASSED" or "Sorting FAILED".
 *
 * The whole run is timed with current_time_ns() and the elapsed time is
 * printed by every PE.  The program exits early (status 0) when SIZE is not
 * divisible by the number of PEs.
 *
 * NOTE(review): every allocation failure below only prints a message;
 * execution continues and would dereference the NULL pointer.
 * NOTE(review): argc, argv and the local `temp` are not used in this block.
 */
int main (int argc, char *argv[]) {
  /**** Initialising ****/
const unsigned long long full_program_start = current_time_ns();
{
  shmem_init (); 
  /* Variable Declarations */

  int 	     Numprocs,MyRank, Root = 0;
  int 	     i,j,k, NoofElements, NoofElements_Bloc,
				  NoElementsToSort;
  int 	     count, temp;
  TYPE 	     *Input, *InputData;
  TYPE 	     *Splitter, *AllSplitter;
  TYPE 	     *Buckets, *BucketBuffer, *LocalBucket;
  TYPE 	     *OutputBuffer, *Output;
  
  MyRank = shmem_my_pe ();
  Numprocs = shmem_n_pes ();
  NoofElements = SIZE;

  /* The algorithm hands every PE an equally sized block, so SIZE must divide evenly. */
  if(( NoofElements % Numprocs) != 0){
    if(MyRank == Root)
      printf("Number of Elements are not divisible by Numprocs \n");
    shmem_finalize ();
    exit(0);
  }
  /**** Reading Input ****/
  
  /* Input is allocated on every PE, but only Root fills and reads it. */
  Input = (TYPE *) shmem_malloc (NoofElements*sizeof(*Input));
  if(Input == NULL) {
    printf("Error : Can not allocate memory \n");
  }

  if (MyRank == Root){
    /* Initialise random number generator  */ 
    printf ("Generating input Array for Sorting %d uint64_t numbers\n",SIZE);
    /* NOTE(review): srand48() seeds the drand48 family, but the values below
     * come from rand(), which is a separate generator -- confirm whether
     * srand() or lrand48() was intended. */
    srand48((TYPE)NoofElements);
    for(i=0; i< NoofElements; i++) {
      Input[i] = rand();
    }
  }

  /**** Sending Data ****/

  NoofElements_Bloc = NoofElements / Numprocs;
  InputData = (TYPE *) shmem_malloc (NoofElements_Bloc * sizeof (*InputData));
  if(InputData == NULL) {
    printf("Error : Can not allocate memory \n");
  }
  //MPI_Scatter(Input, NoofElements_Bloc, TYPE_MPI, InputData, 
  //				  NoofElements_Bloc, TYPE_MPI, Root, MPI_COMM_WORLD);

  /* Scatter: Root writes block i of Input into PE i's InputData.  The
   * barriers bracket the puts so that all PEs have allocated InputData
   * before it is written and do not read it before the puts complete. */
  shmem_barrier_all();
  if(MyRank == Root) {
    for(i=0; i<Numprocs; i++) {
      TYPE* start = &Input[i * NoofElements_Bloc];
      shmem_put64(InputData, start, NoofElements_Bloc, i);
    }
  }
  shmem_barrier_all();

  /**** Sorting Locally ****/
  sorting(InputData, NoofElements_Bloc);

  /**** Choosing Local Splitters ****/
  Splitter = (TYPE *) shmem_malloc (sizeof (TYPE) * (Numprocs-1));
  if(Splitter == NULL) {
    printf("Error : Can not allocate memory \n");
  }
  /* Take Numprocs-1 samples from the sorted block, spaced
   * NoofElements/(Numprocs*Numprocs) elements apart. */
  for (i=0; i< (Numprocs-1); i++){
        Splitter[i] = InputData[NoofElements/(Numprocs*Numprocs) * (i+1)];
  } 

  /**** Gathering Local Splitters at Root ****/
  AllSplitter = (TYPE *) shmem_malloc (sizeof (TYPE) * Numprocs * (Numprocs-1));
  if(AllSplitter == NULL) {
    printf("Error : Can not allocate memory \n");
  }
  //MPI_Gather (Splitter, Numprocs-1, TYPE_MPI, AllSplitter, Numprocs-1, 
  //				  TYPE_MPI, Root, MPI_COMM_WORLD);
  /* Gather: every PE writes its splitters into its own slot of Root's AllSplitter. */
  shmem_barrier_all();
  TYPE* target_index = &AllSplitter[MyRank * (Numprocs-1)];
  shmem_put64(target_index, Splitter, Numprocs-1, Root);
  shmem_barrier_all();

  /**** Choosing Global Splitters ****/
  if (MyRank == Root){
    sorting (AllSplitter, Numprocs*(Numprocs-1));

    /* Pick every (Numprocs-1)-th sample as a global splitter; Root's own
     * Splitter array is reused to hold them. */
    for (i=0; i<Numprocs-1; i++)
      Splitter[i] = AllSplitter[(Numprocs-1)*(i+1)];
  }
  
  /**** Broadcasting Global Splitters ****/
  //MPI_Bcast (Splitter, Numprocs-1, TYPE_MPI, 0, MPI_COMM_WORLD);
  /* Reset pSync (declared outside this block) on every PE and synchronise
   * before using it in the broadcast. */
  { int _i; for(_i=0; _i<_SHMEM_BCAST_SYNC_SIZE; _i++) { pSync[_i] = _SHMEM_SYNC_VALUE; } shmem_barrier_all(); }
  /* Source and destination are the same symmetric array: Root already holds
   * the global splitters and the other PEs receive them in place. */
  shmem_broadcast64(Splitter, Splitter, Numprocs-1, 0, 0, 0, Numprocs, pSync);
  shmem_barrier_all();

  /**** Creating Numprocs Buckets locally ****/
  Buckets = (TYPE *) shmem_malloc (sizeof (TYPE) * (NoofElements + Numprocs));  
  if(Buckets == NULL) {
    printf("Error : Can not allocate memory \n");
  }
  
  /* Bucket layout: bucket j starts at index (NoofElements_Bloc + 1) * j.
   * Slot 0 of each bucket holds its element count, the elements follow.
   * j is the current bucket, k the next free slot inside it. */
  j = 0;
  k = 1;

  for (i=0; i<NoofElements_Bloc; i++){
    if(j < (Numprocs-1)){
       if (InputData[i] < Splitter[j]) 
			 Buckets[((NoofElements_Bloc + 1) * j) + k++] = InputData[i]; 
       else{
	       /* Element belongs to a later bucket: close bucket j by storing
	        * its count, advance to the next bucket, and step i back so the
	        * same element is tested against the next splitter. */
	       Buckets[(NoofElements_Bloc + 1) * j] = k-1;
		    k=1;
			 j++;
		    i--;
       }
    }
    else 
       /* Past the last splitter: everything left goes into the final bucket. */
       Buckets[((NoofElements_Bloc + 1) * j) + k++] = InputData[i];
  }
  /* Store the count of the bucket that was still open when the loop ended. */
  Buckets[(NoofElements_Bloc + 1) * j] = k - 1;
  shmem_free(Splitter);
  shmem_free(AllSplitter);
      
  /**** Sending buckets to respective processors ****/

  BucketBuffer = (TYPE *) shmem_malloc (sizeof (TYPE) * (NoofElements + Numprocs));
  if(BucketBuffer == NULL) {
    printf("Error : Can not allocate memory \n");
  }

  //MPI_Alltoall (Buckets, NoofElements_Bloc + 1, TYPE_MPI, BucketBuffer, 
  //					 NoofElements_Bloc + 1, TYPE_MPI, MPI_COMM_WORLD);
  /* All-to-all: local bucket i (count slot included) is written into slot
   * MyRank of PE i's BucketBuffer. */
  shmem_barrier_all();
  for(i=0; i<Numprocs; i++) {
    shmem_put64(&BucketBuffer[MyRank*(NoofElements_Bloc + 1)], &Buckets[i*(NoofElements_Bloc + 1)],  NoofElements_Bloc + 1, i);   
  }
  shmem_barrier_all();

  /**** Rearranging BucketBuffer ****/
  LocalBucket = (TYPE *) shmem_malloc (sizeof (TYPE) * 2 * NoofElements / Numprocs);
  if(LocalBucket == NULL) {
    printf("Error : Can not allocate memory \n");
  }

  /* Concatenate the received buckets into LocalBucket[1..]; LocalBucket[0]
   * is reserved for the total count.
   * NOTE(review): LocalBucket has room for 2*NoofElements/Numprocs entries;
   * nothing here checks that the received elements fit -- confirm the
   * splitter choice bounds this. */
  count = 1;

  for (j=0; j<Numprocs; j++) {
  k = 1;
    for (i=0; i<BucketBuffer[(NoofElements/Numprocs + 1) * j]; i++) 
      LocalBucket[count++] = BucketBuffer[(NoofElements/Numprocs + 1) * j + k++];
  }
  LocalBucket[0] = count-1;
    
  /**** Sorting Local Buckets using Bubble Sort ****/
  /*sorting (InputData, NoofElements_Bloc, sizeof(int), intcompare); */

  NoElementsToSort = LocalBucket[0];
  sorting (&LocalBucket[1], NoElementsToSort); 

  /**** Gathering sorted sub blocks at root ****/
  OutputBuffer = (TYPE *) shmem_malloc (sizeof(TYPE) * 2 * NoofElements);
  if(OutputBuffer == NULL) {
    printf("Error : Can not allocate memory \n");
  }

  //MPI_Gather (LocalBucket, 2*NoofElements_Bloc, TYPE_MPI, OutputBuffer, 
  //				  2*NoofElements_Bloc, TYPE_MPI, Root, MPI_COMM_WORLD);
  /* Gather: each PE writes its whole LocalBucket (count slot first) into its
   * own 2*NoofElements_Bloc-sized region of Root's OutputBuffer. */
  shmem_barrier_all();
  target_index = &OutputBuffer[MyRank * (2*NoofElements_Bloc)];
  shmem_put64(target_index, LocalBucket, 2*NoofElements_Bloc, Root);
  shmem_barrier_all();

  /**** Rearranging output buffer ****/
  if (MyRank == Root){
    /* Output is private to Root, so it comes from the ordinary heap. */
    Output = (TYPE *) malloc (sizeof (TYPE) * NoofElements);
    count = 0;
    /* Region j starts with PE j's element count, followed by its sorted elements. */
    for(j=0; j<Numprocs; j++){
      k = 1;
      for(i=0; i<OutputBuffer[(2 * NoofElements/Numprocs) * j]; i++) 
        Output[count++] = OutputBuffer[(2*NoofElements/Numprocs) * j + k++];
      }
       printf ( "Number of Elements to be sorted : %d \n", NoofElements);
       /* Verify the result is non-decreasing. */
       TYPE prev = 0;
       int fail = 0;
       for (i=0; i<NoofElements; i++){
         if(Output[i] < prev) { printf("Failed at index %d\n",i); fail = 1; }
         prev = Output[i];
       }
       if(fail) printf("Sorting FAILED\n");  
       else  printf("Sorting PASSED\n");
  	free(Output);
  }/* MyRank==0*/

  shmem_free(Input);
  shmem_free(OutputBuffer);
  shmem_free(InputData);
  shmem_free(Buckets);
  shmem_free(BucketBuffer);
  shmem_free(LocalBucket);

   /**** Finalize ****/
  shmem_finalize();
  } ; 
/* Timing covers everything from before shmem_init() to after shmem_finalize(). */
const unsigned long long full_program_end = current_time_ns();
printf("full_program %llu ns\n", full_program_end - full_program_start);

}
void communicateParameters(LSMSCommunication &comm, LSMSSystemParameters &lsms, 
                           CrystalParameters &crystal, MixingParameters &mix)
{
  int const s=sizeof(LSMSSystemParameters)+9*sizeof(Real)+sizeof(int)+10
    +sizeof(MixingParameters)+5*sizeof(int);
  int rem=0,ele=0;
  int tot_bufsize=s;
  rem=s%32;
  ele=s/32;
  if  (rem!=0)
  {
    tot_bufsize=s-rem+32;
    ele++;
  }
  // TODO fine-tune this size
  tot_bufsize=65536;
  char* buf=(char*)shmalloc(tot_bufsize);
  int pos=0;
  int sec_id;

  if(comm.comm.rank==0)
  {
    
    //MPI_Pack(lsms.systemid,80,MPI_CHAR,buf,s,&pos,comm.comm);
    //MPI_Pack(lsms.title,80,MPI_CHAR,buf,s,&pos,comm.comm);
    //MPI_Pack(lsms.potential_file_in,128,MPI_CHAR,buf,s,&pos,comm.comm);
    //MPI_Pack(lsms.potential_file_out,128,MPI_CHAR,buf,s,&pos,comm.comm);
    //MPI_Pack(&lsms.pot_in_type,1,MPI_INT,buf,s,&pos,comm.comm);
    //MPI_Pack(&lsms.pot_out_type,1,MPI_INT,buf,s,&pos,comm.comm);
    //MPI_Pack(&lsms.num_atoms,1,MPI_INT,buf,s,&pos,comm.comm);
    //MPI_Pack(&lsms.nspin,1,MPI_INT,buf,s,&pos,comm.comm);
    //MPI_Pack(&lsms.nrel_rel,1,MPI_INT,buf,s,&pos,comm.comm);
    //MPI_Pack(&lsms.nrelc,1,MPI_INT,buf,s,&pos,comm.comm);
    //MPI_Pack(&lsms.nrelv,1,MPI_INT,buf,s,&pos,comm.comm);
    //MPI_Pack(&lsms.n_spin_cant,1,MPI_INT,buf,s,&pos,comm.comm);
    //MPI_Pack(&lsms.n_spin_pola,1,MPI_INT,buf,s,&pos,comm.comm);
    //MPI_Pack(&lsms.mtasa,1,MPI_INT,buf,s,&pos,comm.comm);
    //MPI_Pack(&lsms.fixRMT,1,MPI_INT,buf,s,&pos,comm.comm);
    //MPI_Pack(&lsms.nscf,1,MPI_INT,buf,s,&pos,comm.comm);
    //MPI_Pack(&lsms.writeSteps,1,MPI_INT,buf,s,&pos,comm.comm);
    //MPI_Pack(&lsms.clight,1,MPI_DOUBLE,buf,s,&pos,comm.comm);

    //MPI_Pack(&lsms.energyContour.grid,1,MPI_INT,buf,s,&pos,comm.comm);
    //MPI_Pack(&lsms.energyContour.npts,1,MPI_INT,buf,s,&pos,comm.comm);
    //MPI_Pack(&lsms.energyContour.ebot,1,MPI_DOUBLE,buf,s,&pos,comm.comm);
    //MPI_Pack(&lsms.energyContour.etop,1,MPI_DOUBLE,buf,s,&pos,comm.comm);
    //MPI_Pack(&lsms.energyContour.eibot,1,MPI_DOUBLE,buf,s,&pos,comm.comm);
    //MPI_Pack(&lsms.energyContour.eitop,1,MPI_DOUBLE,buf,s,&pos,comm.comm);
    //MPI_Pack(&lsms.energyContour.maxGroupSize,1,MPI_INT,buf,s,&pos,comm.comm);

    //MPI_Pack(&lsms.mixing,1,MPI_INT,buf,s,&pos,comm.comm);
    //MPI_Pack(&lsms.alphaDV,1,MPI_DOUBLE,buf,s,&pos,comm.comm);

    //MPI_Pack(&lsms.global.iprint,1,MPI_INT,buf,s,&pos,comm.comm);
    //MPI_Pack(&lsms.global.print_node,1,MPI_INT,buf,s,&pos,comm.comm);
    //MPI_Pack(&lsms.global.default_iprint,1,MPI_INT,buf,s,&pos,comm.comm);
    //MPI_Pack(&lsms.global.istop,32,MPI_CHAR,buf,s,&pos,comm.comm);
    //MPI_Pack(&lsms.global.GPUThreads,32,MPI_INT,buf,s,&pos,comm.comm);

    //MPI_Pack(&crystal.num_types,1,MPI_INT,buf,s,&pos,comm.comm);
    //MPI_Pack(&crystal.bravais(0,0),9,MPI_DOUBLE,buf,s,&pos,comm.comm);

    //************  MemCpying  ***************
    memcpy(&buf[pos],&lsms.systemid,80*char_size); pos = pos+80*char_size;
    memcpy(&buf[pos],&lsms.title,80*char_size); pos = pos+80*char_size;
    memcpy(&buf[pos],&lsms.potential_file_in,128*char_size); pos = pos+128*char_size;
    memcpy(&buf[pos],&lsms.potential_file_out,128*char_size); pos = pos+128*char_size;
    memcpy(&buf[pos],&lsms.pot_in_type,int_size); pos = pos+int_size;
    memcpy(&buf[pos],&lsms.pot_out_type ,int_size); pos = pos+int_size;
    memcpy(&buf[pos],&lsms.num_atoms,int_size); pos = pos+int_size;

    memcpy(&buf[pos],&lsms.nspin,int_size); pos = pos+int_size;
    memcpy(&buf[pos],&lsms.nrel_rel,int_size); pos = pos+int_size;
    memcpy(&buf[pos],&lsms.nrelc,int_size); pos = pos+int_size;
    memcpy(&buf[pos],&lsms.nrelv,int_size); pos = pos+int_size;
    memcpy(&buf[pos],&lsms.n_spin_cant,int_size); pos = pos+int_size;
    memcpy(&buf[pos],&lsms.n_spin_pola,int_size); pos = pos+int_size;
    memcpy(&buf[pos],&lsms.mtasa,int_size); pos = pos+int_size;
    memcpy(&buf[pos],&lsms.fixRMT,int_size); pos = pos+int_size;
    memcpy(&buf[pos],&lsms.nscf,int_size); pos = pos+int_size;
    memcpy(&buf[pos],&lsms.writeSteps,int_size); pos = pos+int_size;
    memcpy(&buf[pos],&lsms.clight,double_size); pos = pos+double_size;

    memcpy(&buf[pos],&lsms.energyContour.grid,int_size); pos = pos+int_size;
    memcpy(&buf[pos],&lsms.energyContour.npts,int_size); pos = pos+int_size;
    memcpy(&buf[pos],&lsms.energyContour.ebot,double_size); pos = pos+double_size;
    memcpy(&buf[pos],&lsms.energyContour.etop,double_size); pos = pos+double_size;
    memcpy(&buf[pos],&lsms.energyContour.eibot,double_size); pos = pos+double_size;
    memcpy(&buf[pos],&lsms.energyContour.eitop,double_size); pos = pos+double_size;
    memcpy(&buf[pos],&lsms.energyContour.maxGroupSize,int_size); pos = pos+int_size;

    memcpy(&buf[pos],&lsms.mixing,int_size); pos = pos+int_size;
    memcpy(&buf[pos],&lsms.alphaDV,double_size); pos = pos+double_size;

    memcpy(&buf[pos],&lsms.global.iprint,int_size); pos = pos+int_size;
    memcpy(&buf[pos],&lsms.global.print_node,int_size); pos = pos+int_size;
    memcpy(&buf[pos],&lsms.global.default_iprint,int_size); pos = pos+int_size;
    memcpy(&buf[pos],&lsms.global.istop,32*char_size); pos = pos+32*char_size;
    memcpy(&buf[pos],&lsms.global.GPUThreads,32*int_size); pos = pos+32*int_size;

    memcpy(&buf[pos],&crystal.num_types,int_size); pos = pos+int_size;
    memcpy(&buf[pos],&crystal.bravais(0,0),9*double_size); pos = pos+9*double_size;


// MixingParameters
    // MPI_CXX_BOOL is not always available
    // MPI_Pack(&mix.quantity[0],mix.numQuantities,MPI_CXX_BOOL,buf,s,&pos,comm.comm);
    // copy to temporary int array and send this
    int tmpQuantity[mix.numQuantities];
    for(int i=0; i<mix.numQuantities; i++)
      if(mix.quantity[i])
        tmpQuantity[i] = 1;
      else
        tmpQuantity[i] = 0; 
    //MPI_Pack(&tmpQuantity[0],mix.numQuantities,MPI_INT,buf,s,&pos,comm.comm);
    //MPI_Pack(&mix.algorithm[0],mix.numQuantities,MPI_INT,buf,s,&pos,comm.comm);
    //MPI_Pack(&mix.mixingParameter[0],mix.numQuantities,MPI_DOUBLE,buf,s,&pos,comm.comm);
    memcpy(&buf[pos],&tmpQuantity[0],mix.numQuantities*int_size); pos = pos+mix.numQuantities*int_size;
    memcpy(&buf[pos],&mix.algorithm[0],mix.numQuantities*int_size); pos = pos+mix.numQuantities*int_size;
    memcpy(&buf[pos],&mix.mixingParameter[0],mix.numQuantities*double_size); pos = pos+mix.numQuantities*double_size;

  }
  //MPI_Bcast(buf,s,MPI_PACKED,0,comm.comm);
  shmem_barrier(0, 0, comm.comm.size,pSync2);
  shmem_broadcast32(&buf[0], &buf[0], tot_bufsize, 0, 0, 0, comm.comm.size,pSync1);
  shmem_barrier(0, 0, comm.comm.size,pSync2);
  if(comm.comm.rank!=0)
  {
    int pos=0;
    //MPI_Unpack(buf,s,&pos,lsms.systemid,80,MPI_CHAR,comm.comm);
    //MPI_Unpack(buf,s,&pos,lsms.title,80,MPI_CHAR,comm.comm);
    //MPI_Unpack(buf,s,&pos,lsms.potential_file_in,128,MPI_CHAR,comm.comm);
    //MPI_Unpack(buf,s,&pos,lsms.potential_file_out,128,MPI_CHAR,comm.comm);
    //MPI_Unpack(buf,s,&pos,&lsms.pot_in_type,1,MPI_INT,comm.comm);
    //MPI_Unpack(buf,s,&pos,&lsms.pot_out_type,1,MPI_INT,comm.comm);
    //MPI_Unpack(buf,s,&pos,&lsms.num_atoms,1,MPI_INT,comm.comm);
    memcpy(&lsms.systemid,&buf[pos],80*char_size); pos = pos+80*char_size;
    memcpy(&lsms.title,&buf[pos],80*char_size); pos = pos+80*char_size;
    memcpy(&lsms.potential_file_in,&buf[pos],128*char_size); pos = pos+128*char_size;
    memcpy(&lsms.potential_file_out,&buf[pos],128*char_size); pos = pos+128*char_size;
    memcpy(&lsms.pot_in_type,&buf[pos],int_size); pos = pos+int_size;
    memcpy(&lsms.pot_out_type,&buf[pos],int_size); pos = pos+int_size;
    memcpy(&lsms.num_atoms,&buf[pos],int_size); pos = pos+int_size;
    crystal.num_atoms=lsms.num_atoms;
    //MPI_Unpack(buf,s,&pos,&lsms.nspin,1,MPI_INT,comm.comm);
    //MPI_Unpack(buf,s,&pos,&lsms.nrel_rel,1,MPI_INT,comm.comm);
    //MPI_Unpack(buf,s,&pos,&lsms.nrelc,1,MPI_INT,comm.comm);
    //MPI_Unpack(buf,s,&pos,&lsms.nrelv,1,MPI_INT,comm.comm);
    //MPI_Unpack(buf,s,&pos,&lsms.n_spin_cant,1,MPI_INT,comm.comm);
    //MPI_Unpack(buf,s,&pos,&lsms.n_spin_pola,1,MPI_INT,comm.comm);
    //MPI_Unpack(buf,s,&pos,&lsms.mtasa,1,MPI_INT,comm.comm);
    //MPI_Unpack(buf,s,&pos,&lsms.fixRMT,1,MPI_INT,comm.comm);
    //MPI_Unpack(buf,s,&pos,&lsms.nscf,1,MPI_INT,comm.comm);
    //MPI_Unpack(buf,s,&pos,&lsms.writeSteps,1,MPI_INT,comm.comm);
    //MPI_Unpack(buf,s,&pos,&lsms.clight,1,MPI_DOUBLE,comm.comm);

    //MPI_Unpack(buf,s,&pos,&lsms.energyContour.grid,1,MPI_INT,comm.comm);
    //MPI_Unpack(buf,s,&pos,&lsms.energyContour.npts,1,MPI_INT,comm.comm);
    //MPI_Unpack(buf,s,&pos,&lsms.energyContour.ebot,1,MPI_DOUBLE,comm.comm);
    //MPI_Unpack(buf,s,&pos,&lsms.energyContour.etop,1,MPI_DOUBLE,comm.comm);
    //MPI_Unpack(buf,s,&pos,&lsms.energyContour.eibot,1,MPI_DOUBLE,comm.comm);
    //MPI_Unpack(buf,s,&pos,&lsms.energyContour.eitop,1,MPI_DOUBLE,comm.comm);
    //MPI_Unpack(buf,s,&pos,&lsms.energyContour.maxGroupSize,1,MPI_INT,comm.comm);

    //MPI_Unpack(buf,s,&pos,&lsms.mixing,1,MPI_INT,comm.comm);
    //MPI_Unpack(buf,s,&pos,&lsms.alphaDV,1,MPI_DOUBLE,comm.comm);

    //MPI_Unpack(buf,s,&pos,&lsms.global.iprint,1,MPI_INT,comm.comm);
    //MPI_Unpack(buf,s,&pos,&lsms.global.print_node,1,MPI_INT,comm.comm);
    //MPI_Unpack(buf,s,&pos,&lsms.global.default_iprint,1,MPI_INT,comm.comm);
    //MPI_Unpack(buf,s,&pos,&lsms.global.istop,32,MPI_CHAR,comm.comm);
    //MPI_Unpack(buf,s,&pos,&lsms.global.GPUThreads,32,MPI_INT,comm.comm);

    //MPI_Unpack(buf,s,&pos,&crystal.num_types,1,MPI_INT,comm.comm);
    //MPI_Unpack(buf,s,&pos,&crystal.bravais(0,0),9,MPI_DOUBLE,comm.comm);

    memcpy(&lsms.nspin,&buf[pos],int_size); pos = pos+int_size;
    memcpy(&lsms.nrel_rel,&buf[pos],int_size); pos = pos+int_size;
    memcpy(&lsms.nrelc,&buf[pos],int_size); pos = pos+int_size;
    memcpy(&lsms.nrelv,&buf[pos],int_size); pos = pos+int_size;
    memcpy(&lsms.n_spin_cant,&buf[pos],int_size); pos = pos+int_size;
    memcpy(&lsms.n_spin_pola,&buf[pos],int_size); pos = pos+int_size;
    memcpy(&lsms.mtasa,&buf[pos],int_size); pos = pos+int_size;
    memcpy(&lsms.fixRMT,&buf[pos],int_size); pos = pos+int_size;
    memcpy(&lsms.nscf,&buf[pos],int_size); pos = pos+int_size;
    memcpy(&lsms.writeSteps,&buf[pos],int_size); pos = pos+int_size;
    memcpy(&lsms.clight,&buf[pos],double_size); pos = pos+double_size;

    memcpy(&lsms.energyContour.grid,&buf[pos],int_size); pos = pos+int_size;
    memcpy(&lsms.energyContour.npts,&buf[pos],int_size); pos = pos+int_size;
    memcpy(&lsms.energyContour.ebot,&buf[pos],double_size); pos = pos+double_size;
    memcpy(&lsms.energyContour.etop,&buf[pos],double_size); pos = pos+double_size;
    memcpy(&lsms.energyContour.eibot,&buf[pos],double_size); pos = pos+double_size;
    memcpy(&lsms.energyContour.eitop,&buf[pos],double_size); pos = pos+double_size;
    memcpy(&lsms.energyContour.maxGroupSize,&buf[pos],int_size); pos = pos+int_size;

    memcpy(&lsms.mixing,&buf[pos],int_size); pos = pos+int_size;
    memcpy(&lsms.alphaDV,&buf[pos],double_size); pos = pos+double_size;

    memcpy(&lsms.global.iprint,&buf[pos],int_size); pos = pos+int_size;
    memcpy(&lsms.global.print_node,&buf[pos],int_size); pos = pos+int_size;
    memcpy(&lsms.global.default_iprint,&buf[pos],int_size); pos = pos+int_size;
    memcpy(&lsms.global.istop,&buf[pos],32*char_size); pos = pos+32*char_size;
    memcpy(&lsms.global.GPUThreads,&buf[pos],32*int_size); pos = pos+32*int_size;

    memcpy(&crystal.num_types,&buf[pos],int_size); pos = pos+int_size;
    memcpy(&crystal.bravais(0,0),&buf[pos],9*double_size); pos = pos+9*double_size;

    crystal.resize(crystal.num_atoms);
    crystal.resizeTypes(crystal.num_types);


// MixingParameters
    // MPI_CXX_BOOL is not always available
    // MPI_Unpack(buf,s,&pos,&mix.quantity[0],mix.numQuantities,MPI_CXX_BOOL,comm.comm);
    // recieve temporary int array and copy
    int tmpQuantity[mix.numQuantities];
    //MPI_Unpack(buf,s,&pos,&tmpQuantity[0],mix.numQuantities,MPI_INT,comm.comm);
    memcpy(&tmpQuantity[0],&buf[pos],mix.numQuantities*int_size); pos = pos+mix.numQuantities*int_size;

    for(int i=0; i<mix.numQuantities; i++)
      if(tmpQuantity[i]==1)
        mix.quantity[i] = true;
      else
        mix.quantity[i] = false; 
    //MPI_Unpack(buf,s,&pos,&mix.algorithm[0],mix.numQuantities,MPI_INT,comm.comm);
    //MPI_Unpack(buf,s,&pos,&mix.mixingParameter[0],mix.numQuantities,MPI_DOUBLE,comm.comm);
    memcpy(&mix.algorithm[0],&buf[pos],mix.numQuantities*int_size); pos = pos+mix.numQuantities*int_size;
    memcpy(&mix.mixingParameter[0],&buf[pos],mix.numQuantities*double_size); pos = pos+mix.numQuantities*double_size;
  }

 for(int i=0; i<mix.numQuantities; i++)
      printf("mix.quantity[%d]=%d\n", i,mix.quantity[i]);

  // Allocate buffer for transmitting Crystal params
  int buff_size;

  if((crystal.num_types*sizeof(AtomType)) > (3*crystal.num_atoms*double_size))
     buff_size = crystal.num_types*sizeof(AtomType);
  else 
     buff_size = 3*crystal.num_atoms*double_size;  
 
  shfree(buf);
  // TODO finetune buff-size
  buff_size=1048576; //sizeof(LSMSSystemParameters)+9*sizeof(Real);
  rem=buff_size%64;
  ele=buff_size/64;
  if(rem != 0)
  {
     buff_size=buff_size-rem+64;
     ele++;
  }

  double* temp_buff=(double*) shmalloc(buff_size);
  int*    temp_intbuff=(int*) shmalloc(buff_size);

  //MPI_Bcast(&crystal.position(0,0),3*crystal.num_atoms,MPI_DOUBLE,0,comm.comm);
//TODO check whether a barrier is needed after each broadcast ... data is not updated otherwise
  if(comm.comm.rank == 0)
      memcpy(temp_buff,&crystal.position(0,0),3*crystal.num_atoms*double_size);
  shmem_barrier(0, 0, comm.comm.size,pSync2);
  shmem_broadcast64(temp_buff, temp_buff,3*crystal.num_atoms, 0, 0, 0, comm.comm.size,pSync1);
  shmem_barrier(0, 0, comm.comm.size,pSync2);
  if(comm.comm.rank != 0)
      memcpy(&crystal.position(0,0),temp_buff,3*crystal.num_atoms*double_size);

  //MPI_Bcast(&crystal.evecs(0,0),3*crystal.num_atoms,MPI_DOUBLE,0,comm.comm);
  if(comm.comm.rank == 0){
      memcpy(temp_buff,&crystal.evecs(0,0),3*crystal.num_atoms*double_size);
}
  shmem_barrier(0, 0, comm.comm.size,pSync2);
  shmem_broadcast64(temp_buff, temp_buff, 3*crystal.num_atoms, 0, 0, 0, comm.comm.size,pSync1);
  shmem_barrier(0, 0, comm.comm.size,pSync2);
  if(comm.comm.rank != 0){
      memcpy(&crystal.evecs(0,0),temp_buff,3*crystal.num_atoms*double_size);
}

  //MPI_Bcast(&crystal.type[0],crystal.num_atoms,MPI_INT,0,comm.comm);
  if(comm.comm.rank == 0){
      memcpy(temp_intbuff,&crystal.type[0],crystal.num_atoms*int_size);
  }
  shmem_barrier(0, 0, comm.comm.size,pSync2);
  shmem_broadcast32(temp_intbuff, temp_intbuff, crystal.num_atoms, 0, 0, 0, comm.comm.size,pSync1);
  shmem_barrier(0, 0, comm.comm.size,pSync2);
  if(comm.comm.rank != 0){
      memcpy(&crystal.type[0],temp_intbuff,crystal.num_atoms*int_size);
  }

// This is dangerous and assumes homogeneous nodes:
  //MPI_Bcast(&crystal.types[0],crystal.num_types*sizeof(AtomType),MPI_BYTE,0,comm.comm);
  if(comm.comm.rank == 0)
      memcpy(temp_buff,&crystal.types[0],crystal.num_types*sizeof(AtomType));
  // having to use the smallest possible broadcast:"32"-type
  shmem_barrier(0, 0, comm.comm.size,pSync2);
  shmem_broadcast32(temp_buff,temp_buff,crystal.num_types*sizeof(AtomType)/4,0,0,0,comm.comm.size,pSync1);
  shmem_barrier(0, 0, comm.comm.size,pSync2);
  if(comm.comm.rank != 0)
      memcpy(&crystal.types[0],temp_buff,crystal.num_types*sizeof(AtomType));

  shmem_barrier(0, 0, comm.comm.size,pSync1);
  shfree(temp_buff);
  shfree(temp_intbuff);

// get maximum lmax
  crystal.maxlmax=0;
  for(int i=0; i<crystal.num_types; i++)
    if(crystal.types[i].lmax>crystal.maxlmax) crystal.maxlmax=crystal.types[i].lmax; 
  lsms.maxlmax=crystal.maxlmax;
}
Beispiel #9
0
/*
 * Run the SHMEM RandomAccess (GUPS) benchmark on a power-of-two number of PEs.
 *
 * Steps, as performed below:
 *   1. size the global table (a power of two) from params->HPLMaxProcMem,
 *      which this function overwrites with a fixed value of 200000;
 *   2. allocate and initialise the local slice of the table;
 *   3. time Power2NodesRandomAccessUpdate() and derive the GUP/s figure;
 *   4. broadcast the GUP/s figure from PE 0;
 *   5. time HPCC_Power2NodesSHMEMRandomAccessCheck() and sum the per-PE
 *      error counts.
 *
 * params: benchmark parameter block. The SHMEMGUPs, HPLMaxProcMem and
 *         SHMEMRandomAccess_N fields are written on every PE; the
 *         ExeUpdates, time, CheckTime, Errors, ErrorsFraction, Algorithm and
 *         Failure fields are written on PE 0 only.
 *
 * Returns 0 in all cases, including when the PE count is not a power of two
 * and when the table allocation fails (both are reported on PE 0 only).
 */
int
HPCC_SHMEMRandomAccess(HPCC_Params *params) {
  s64Int i;
  int p;
  static s64Int NumErrors, GlbNumErrors;

  int NumProcs, logNumProcs, MyProc;
  u64Int GlobalStartMyProc;
  int Remainder;            /* Number of processors with (LocalTableSize + 1) entries */
  u64Int Top;               /* Number of table entries in top of Table */
  s64Int LocalTableSize;    /* Local table width */
  u64Int MinLocalTableSize; /* Integer ratio TableSize/NumProcs */
  u64Int logTableSize, TableSize;

  double RealTime;              /* Real time to update table */

  double TotalMem;
  static int sAbort, rAbort;
  int PowerofTwo;

  u64Int NumUpdates_Default; /* Number of updates to table (suggested: 4x number of table entries) */
  u64Int NumUpdates;  /* actual number of updates to table */
  s64Int ProcNumUpdates; /* number of updates per processor */

  /* Sync array for the broadcast of the GUP/s figure. */
  static long llpSync[_SHMEM_BCAST_SYNC_SIZE];

  /* Sync arrays for the two reductions.  Reductions require
   * _SHMEM_REDUCE_SYNC_SIZE elements; the broadcast-sized arrays used
   * previously are not guaranteed to be large enough. */
  static long llpReduceSync[_SHMEM_REDUCE_SYNC_SIZE];
  static long ipSync[_SHMEM_REDUCE_SYNC_SIZE];

  /* NOTE(review): the OpenSHMEM reduction routines document the work array
   * size as max(nreduce/2 + 1, _SHMEM_REDUCE_MIN_WRKDATA_SIZE) -- confirm
   * that _SHMEM_REDUCE_SYNC_SIZE elements are sufficient on the target
   * implementation. */
  static long long int llpWrk[_SHMEM_REDUCE_SYNC_SIZE];
  static int ipWrk[_SHMEM_REDUCE_SYNC_SIZE];

  FILE *outFile = NULL;
  double *GUPs;
  double *temp_GUPs;

  for (i = 0; i < _SHMEM_BCAST_SYNC_SIZE; i += 1){
        llpSync[i] = _SHMEM_SYNC_VALUE;
  }
  for (i = 0; i < _SHMEM_REDUCE_SYNC_SIZE; i += 1){
        ipSync[i] = _SHMEM_SYNC_VALUE;
        llpReduceSync[i] = _SHMEM_SYNC_VALUE;
  }

  params->SHMEMGUPs = -1;
  GUPs = &params->SHMEMGUPs;

  NumProcs = shmem_n_pes();
  MyProc = shmem_my_pe();

  /* Only PE 0 reports; output is unbuffered so it interleaves predictably. */
  if (0 == MyProc) {
    outFile = stdout;
    setbuf(outFile, NULL);
  }

  /* Hard-coded per-PE memory budget; overrides whatever the caller supplied. */
  params->HPLMaxProcMem = 200000;

  TotalMem = params->HPLMaxProcMem; /* max single node memory */
  TotalMem *= NumProcs;             /* max memory in NumProcs nodes */

  TotalMem /= sizeof(u64Int);

  /* calculate TableSize --- the size of update array (must be a power of 2) */
  for (TotalMem *= 0.5, logTableSize = 0, TableSize = 1;
       TotalMem >= 1.0;
       TotalMem *= 0.5, logTableSize++, TableSize <<= 1)
    ; /* EMPTY */


  /* determine whether the number of processors is a power of 2 */
  if ( (NumProcs & (NumProcs -1)) == 0) {
    PowerofTwo = HPCC_TRUE;
    Remainder = 0;
    Top = 0;
    MinLocalTableSize = (TableSize / NumProcs);
    LocalTableSize = MinLocalTableSize;
    GlobalStartMyProc = (MinLocalTableSize * MyProc);

    /* log2(NumProcs): it is printed below and passed to both the update and
     * the verification kernels, so it must not be left uninitialised. */
    logNumProcs = 0;
    for (p = NumProcs; p > 1; p >>= 1)
      logNumProcs++;
  }
  else {
    /* Every PE sees the same NumProcs, so all of them return here together
     * and no collective is left half-entered. */
    if(MyProc == 0) {
        printf("Number of processes must be power of 2\n");

    }
    return 0;
  }
  sAbort = 0;
  HPCC_Table = HPCC_XMALLOC( s64Int, LocalTableSize );

  if (! HPCC_Table) sAbort = 1;

  /* Agree globally on whether any PE failed to allocate its table slice. */
  shmem_barrier_all();
  shmem_int_sum_to_all(&rAbort, &sAbort, 1, 0, 0, NumProcs, ipWrk, ipSync);
  shmem_barrier_all();

  if (rAbort > 0) {
    if (MyProc == 0) fprintf(outFile, "Failed to allocate memory for the main table.\n");
    /* check all allocations in case there are new added and their order changes */
    if (HPCC_Table) HPCC_free( HPCC_Table );
    goto failed_table;
  }

  params->SHMEMRandomAccess_N = (s64Int)TableSize;

  /* Default number of global updates to table: 4x number of table entries */
  NumUpdates_Default = 4 * TableSize;
  ProcNumUpdates = 4*LocalTableSize;
  NumUpdates = NumUpdates_Default;

  if (MyProc == 0) {
    fprintf( outFile, "Running on %d processors%s\n", NumProcs, PowerofTwo ? " (PowerofTwo)" : "");
    fprintf( outFile, "Total Main table size = 2^" FSTR64 " = " FSTR64 " words\n",
             logTableSize, TableSize );
    if (PowerofTwo)
        fprintf( outFile, "PE Main table size = 2^" FSTR64 " = " FSTR64 " words/PE\n",
                 (logTableSize - logNumProcs), TableSize/NumProcs );
      else
        fprintf( outFile, "PE Main table size = (2^" FSTR64 ")/%d  = " FSTR64 " words/PE MAX\n",
                 logTableSize, NumProcs, LocalTableSize);

    fprintf( outFile, "Default number of updates (RECOMMENDED) = " FSTR64 "\n", NumUpdates_Default);
    params->SHMEMRandomAccess_ExeUpdates = NumUpdates;
  }

  /* Initialize main table */
  for (i=0; i<LocalTableSize; i++)
    HPCC_Table[i] = i + GlobalStartMyProc;

  shmem_barrier_all();

  /* Begin timed section */

  RealTime = -RTSEC();

  Power2NodesRandomAccessUpdate(logTableSize, TableSize, LocalTableSize,
                                     MinLocalTableSize, GlobalStartMyProc, Top,
                                     logNumProcs, NumProcs, Remainder,
                                     MyProc, ProcNumUpdates);

  shmem_barrier_all();

  /* End timed section */

  RealTime += RTSEC();

  /* Print timing results */
  if (MyProc == 0){
    params->SHMEMRandomAccess_time = RealTime;
    *GUPs = 1e-9*NumUpdates / RealTime;
    fprintf( outFile, "Real time used = %.6f seconds\n", RealTime );
    fprintf( outFile, "%.9f Billion(10^9) Updates    per second [GUP/s]\n",
             *GUPs );
    fprintf( outFile, "%.9f Billion(10^9) Updates/PE per second [GUP/s]\n",
             *GUPs / NumProcs );
    /* No longer reporting per CPU number */
    /* *GUPs /= NumProcs; */
  }
  /* distribute result to all nodes */
  /* NOTE(review): source and target are the same address inside *params;
   * this presumes params lives in symmetric memory -- confirm against the
   * caller. */
  temp_GUPs = GUPs;
  shmem_barrier_all();
  shmem_broadcast64(GUPs,temp_GUPs,1,0,0,0,NumProcs,llpSync);
  shmem_barrier_all();

  /* Verification phase */

  /* Begin timing here */

  RealTime = -RTSEC();


  HPCC_Power2NodesSHMEMRandomAccessCheck(logTableSize, TableSize, LocalTableSize,
                                    GlobalStartMyProc,
                                    logNumProcs, NumProcs,
                                    MyProc, ProcNumUpdates,
                                    &NumErrors);

  shmem_barrier_all();
  shmem_longlong_sum_to_all( &GlbNumErrors,  &NumErrors, 1, 0,0, NumProcs,llpWrk, llpReduceSync);
  shmem_barrier_all();

  /* End timed section */

  RealTime += RTSEC();

  if(MyProc == 0){
    params->SHMEMRandomAccess_CheckTime = RealTime;

    /* Up to 1% of the table entries may be wrong before the run is failed. */
    fprintf( outFile, "Verification:  Real time used = %.6f seconds\n", RealTime);
    fprintf( outFile, "Found " FSTR64 " errors in " FSTR64 " locations (%s).\n",
             GlbNumErrors, TableSize, (GlbNumErrors <= 0.01*TableSize) ?
             "passed" : "failed");
    if (GlbNumErrors > 0.01*TableSize) params->Failure = 1;
    params->SHMEMRandomAccess_Errors = (s64Int)GlbNumErrors;
    params->SHMEMRandomAccess_ErrorsFraction = (double)GlbNumErrors / (double)TableSize;
    params->SHMEMRandomAccess_Algorithm = 1;
  }
  /* End verification phase */


  /* Deallocate memory (in reverse order of allocation which should
     help fragmentation) */

  HPCC_free( HPCC_Table );
  failed_table:

  /* outFile is stdout on PE 0; never close the process-wide standard
   * streams, otherwise later output from the caller is silently lost. */
  if (0 == MyProc && outFile != NULL && outFile != stdout && outFile != stderr)
    fclose( outFile );

  shmem_barrier_all();

  return 0;
}
Beispiel #10
0
/*
 * shmem_broadcast64 test: PE 1 is the broadcast root.  For `loops`
 * iterations a buffer of growing size is allocated; its first half (dst) is
 * zeroed and its second half (src) is filled with 1..nLongs.  After the
 * broadcast every non-root PE must see dst == src, while the root's dst must
 * still be all zeroes.
 *
 * Options: -v (verbose progress on PE 0), -h (usage).
 * Any data mismatch or allocation failure aborts the whole job through
 * shmem_global_exit(1).
 */
int
main(int argc, char* argv[])
{
    int i, Verbose=0;
    int mpe, num_pes, loops=10, cloop;
    char *pgm;
    long *dst, *src;
    int nBytes = START_BCAST_SIZE;
    int nLongs=0;

    shmem_init();
    mpe = shmem_my_pe();
    num_pes = shmem_n_pes();

    /* The broadcast root is PE 1, so at least two PEs are needed. */
    if (num_pes == 1) {
        printf("%s: Requires number of PEs > 1\n", argv[0]);
        shmem_finalize();
        return 0;
    }

    if (sizeof(long) != 8) {
        /* sizeof yields size_t, which is printed with %zu. */
        printf("Test assumes 64-bit long (%zu)\n", sizeof(long));
        shmem_global_exit(1);
        return 0;
    }

    /* Program name without its directory part, for the usage message. */
    if ((pgm=strrchr(argv[0],'/'))) {
        pgm++;
    } else {
        pgm = argv[0];
    }

    if (argc > 1) {
        if (strncmp(argv[1],"-v",3) == 0) {
            Verbose=1;
        } else if (strncmp(argv[1],"-h",3) == 0) {
            fprintf(stderr,"usage: %s {-v(verbose)|h(help)}\n",pgm);
            shmem_finalize();
            exit(1);
        }
    }

    for (i = 0; i < SHMEM_BCAST_SYNC_SIZE; i += 1) {
        pSync[i] = SHMEM_SYNC_VALUE;
    }

    if ( mpe == 0 && Verbose ) {
        fprintf(stderr,"%d loops\n",loops);
    }

    for(cloop=1; cloop <= loops; cloop++) {

        nLongs = nBytes / sizeof(long);
        /* One allocation holds both halves: dst first, src second. */
        dst = (long *)shmem_malloc(nBytes*2);
        if ( !dst ) {
            fprintf(stderr,"[%d] shmem_malloc(%d) failed %s\n",
                            mpe,nBytes*2,strerror(errno));
            /* Abort the whole job: a plain return would leave the other
             * PEs blocked in the collectives below and report success. */
            shmem_global_exit(1);
            return 1;
        }
        memset( (void*)dst, 0, nBytes );
        src = &dst[nLongs];
        /* Fill every element, including src[0]: it is broadcast and
         * compared below, so it must not be left uninitialised. */
        for (i = 0; i < nLongs; i++) {
            src[i] = i+1;
        }

        /* Also ensures pSync is initialised everywhere before first use. */
        shmem_barrier_all();

        shmem_broadcast64(dst, src, nLongs, 1, 0, 0, num_pes, pSync);

        for(i=0; i < nLongs; i++) {
            /* the root node shouldn't have the result into dst (cf specification).*/
            if (1 != mpe && dst[i] != src[i]) {
                fprintf(stderr,"[%d] dst[%d] %ld != expected %ld\n",
                        mpe, i, dst[i],src[i]);
                shmem_global_exit(1);
            } else if (1 == mpe && dst[i] != 0) {
                fprintf(stderr,"[%d] dst[%d] %ld != expected 0\n",
                        mpe, i, dst[i]);
                shmem_global_exit(1);
            }
        }
        shmem_barrier_all();

        shmem_free (dst);
        if (Verbose && mpe ==0)
            fprintf(stderr,"loop %2d Bcast %d, Done.\n",cloop,nBytes);
        nBytes += BCAST_INCR;
    }

    shmem_finalize();

    return 0;
}